├── .gitignore ├── README.md ├── assets └── Team_AI-it_Malicious_Comments_Collecting_Service.pdf ├── automl ├── configs │ └── model │ │ ├── example.yaml │ │ └── mobilenetv3.yaml ├── prediction │ └── sample_submission.csv ├── proj_dataloader.py ├── proj_utils.py ├── src │ ├── __init__.py │ ├── dataloader.py │ ├── loss.py │ ├── model.py │ ├── modules │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── base_generator.py │ │ ├── bert.py │ │ ├── conv.py │ │ ├── dwconv.py │ │ ├── electra.py │ │ ├── flatten.py │ │ ├── linear.py │ │ ├── lstm.py │ │ ├── mbconv.py │ │ └── poolings.py │ ├── trainer.py │ └── utils │ │ ├── common.py │ │ ├── data.py │ │ ├── pytransform │ │ └── __init__.py │ │ └── torch_utils.py ├── tests │ ├── test_model_conversion.py │ └── test_model_parser.py ├── train.py └── tune.py ├── base ├── __init__.py ├── base_data_loader.py ├── base_model.py └── base_trainer.py ├── config.json ├── config_automl_test.json ├── data_loader ├── data_loaders.py └── kd_data_loaders.py ├── kd_config.json ├── kd_train.py ├── logger ├── __init__.py ├── logger.py └── logger_config.json ├── model ├── loss.py ├── lr_scheduler.py ├── metric.py └── model.py ├── parse_config.py ├── pkm_config.json ├── prototype ├── fullstack │ ├── .DS_Store │ ├── Makefile │ ├── __init__.py │ └── app │ │ ├── .DS_Store │ │ ├── __init__.py │ │ ├── base │ │ ├── __init__.py │ │ ├── base_data_loader.py │ │ ├── base_model.py │ │ └── base_trainer.py │ │ ├── config.json │ │ ├── confirm_button_hack.py │ │ ├── database.py │ │ ├── frontend.py │ │ ├── load_data.py │ │ ├── main.py │ │ ├── model │ │ ├── __init__.py │ │ └── model.py │ │ ├── predict.py │ │ ├── service │ │ ├── api_response.py │ │ └── error_handler.py │ │ ├── test │ │ ├── db_test.py │ │ └── exp.ipynb │ │ └── utils.py └── streamlit │ ├── .gitignore │ ├── app.py │ ├── base │ ├── __init__.py │ ├── base_data_loader.py │ ├── base_model.py │ └── base_trainer.py │ ├── config.json │ ├── confirm_button_hack.py │ ├── load_data.py │ ├── model │ └── model.py │ ├── pipeline_test.py │ ├── predict.py │ ├── service │ ├── api_response.py │ └── error_handler.py │ └── utils.py ├── requirements.txt ├── simple_test.py ├── test.py ├── test_automl.py ├── tokenizer ├── special_tokens_map.json ├── tokenizer_config.json └── vocab.txt ├── train.py ├── trainer ├── __init__.py ├── kd_trainer.py └── trainer.py └── utils ├── __init__.py ├── api_response.py ├── error_handler.py ├── memory.py ├── query.py ├── util.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # input data, saved log, checkpoints 104 | data/ 105 | input/ 106 | saved/ 107 | datasets/ 108 | wandb/ 109 | jh_test/ 110 | 111 | # editor, os cache directory 112 | .vscode/ 113 | .idea/ 114 | __MACOSX/ 115 | *.pt 116 | *.pkl 117 | *.ipynb -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Malicious Comments Collection System 2 | 3 | ## 1. Introduction 4 | 5 | ![image](https://user-images.githubusercontent.com/48538655/146724216-66d3da83-5024-4253-b170-1acd4449a344.png) 6 | 7 | As the internet has grown, indiscriminate malicious comments targeting specific individuals have been tormenting people. Collecting evidence is essential for reporting and suing malicious commenters, but it takes a long time, and the current process is inefficient and passive because collection is done directly by companies or individuals or relies on tips from fans. We started this project to improve on this. 8 | 9 | The **Malicious Comments Collection System** aims to automate the collection and review of malicious comments. The collected material will later be used for filing lawsuits. 10 | 11 | ### Team AI-it 12 | 13 | > We chose the team name because its pronunciation, "ah-it", sounded kitschy and fun.
14 | 15 | #### Members 16 | 17 | | 이연걸 | 김재현 | 박진영 | 조범준 | 진혜원 | 안성민 | 양재욱 | 18 | | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | 19 | | | | | | | | | 20 | | [![Git Badge](http://img.shields.io/badge/-Github-black?style=flat-square&logo=github)](https://github.com/LeeYeonGeol) | [![Git Badge](http://img.shields.io/badge/-Github-black?style=flat-square&logo=github)](https://github.com/CozyKim) | [![Git Badge](http://img.shields.io/badge/-Github-black?style=flat-square&logo=github)](https://github.com/nazzang49) | [![Git Badge](http://img.shields.io/badge/-Github-black?style=flat-square&logo=github)](https://github.com/goattier) | [![Git Badge](http://img.shields.io/badge/-Github-black?style=flat-square&logo=github)](https://github.com/hyewon11) | [![Git Badge](http://img.shields.io/badge/-Github-black?style=flat-square&logo=github)](https://github.com/tttangmin) | [![Git Badge](http://img.shields.io/badge/-Github-black?style=flat-square&logo=github)](https://github.com/didwodnr123) | 21 | 22 | ### Contribution 23 | 24 | - [`이연걸`](https://github.com/LeeYeonGeol)   Project Management • Service Dataset • Front-end & Back-end Update • EDA 25 | - [`김재현`](https://github.com/CozyKim)   Modeling • Model Optimization • AutoML • EDA 26 | - [`박진영`](https://github.com/nazzang49)   Model Optimization • Application Cloud Release (GKE) • Service Architecture 27 | - [`조범준`](https://github.com/goattier)   Baseline Code • Modeling • Model Optimization • EDA 28 | - [`진혜원`](https://github.com/hyewon11)   Service Dataset • EDA • Front-end & Back-end Update 29 | - [`안성민`](https://github.com/tttangmin)  EDA • Modeling 30 | - [`양재욱`](https://github.com/didwodnr123)   Front-end (Streamlit) • Back-end (FastAPI) • MongoDB • EDA 31 | 32 | ## 2. Model 33 | 34 | ### KcELECTRA Backbone Model + CNN & RNN Based Classifier (Best LB f1-score: 64.856) 35 | ![image](https://user-images.githubusercontent.com/20266073/147147702-ff94e551-ea1c-4b4e-bdd5-622a31680442.png) 36 | 37 | ### Clustering with Triplet Loss + KNN (Best LB f1-score: 66.192) 38 | ![image](https://user-images.githubusercontent.com/20266073/147147922-aebcf049-1f3f-49b3-954f-a9322d4ec901.png) 39 | 40 | ### 2nd out of 67 teams (as of Dec. 23, 2021) 41 | ![image](https://user-images.githubusercontent.com/20266073/147148111-587f6ca2-0252-4237-ab63-bc9e919c3064.png) 42 | 43 | 44 | ## 3. Flow Chart 45 | 46 | ### System Architecture 47 | 48 | ![image](https://user-images.githubusercontent.com/39722108/147093220-ef42c25c-c240-4911-93c9-ec0eb81af432.png) 49 | 50 | ### Pipeline 51 | 52 | ![image](https://user-images.githubusercontent.com/48538655/146738181-85996171-e84f-451a-85ca-165098608523.png) 53 | 54 | ## 4.
How to Use 55 | 56 | ### Install Requirements 57 | 58 | ```bash 59 | pip install -r requirements.txt 60 | ``` 61 | 62 | ### Project Tree 63 | 64 | ``` 65 | |-- assets 66 | |-- automl 67 | |-- base 68 | | |-- __init__.py 69 | | |-- base_data_loader.py 70 | | |-- base_model.py 71 | | └-- base_trainer.py 72 | |-- data_loader 73 | | └-- data_loaders.py 74 | |-- logger 75 | | |-- __init__.py 76 | | |-- logger.py 77 | | └-- logger_config.json 78 | |-- model 79 | | |-- loss.py 80 | | |-- lr_scheduler.py 81 | | |-- metric.py 82 | | └-- model.py 83 | |-- prototype 84 | |-- tokenizer 85 | | |-- special_tokens_map.json 86 | | |-- tokenizer_config.json 87 | | └-- vocab.txt 88 | |-- trainer 89 | | |-- __init__.py 90 | | |-- kd_trainer.py 91 | | └-- trainer.py 92 | |-- config.json 93 | |-- config_automl_test.json 94 | |-- kd_config.json 95 | |-- kd_train.py 96 | |-- parse_config.py 97 | |-- pkm_config.json 98 | |-- requirements.txt 99 | |-- simple_test.py 100 | |-- test.py 101 | |-- test_automl.py 102 | |-- train.py 103 | └-- utils 104 | |-- __init__.py 105 | |-- api_response.py 106 | |-- error_handler.py 107 | |-- memory.py 108 | |-- query.py 109 | |-- util.py 110 | └-- utils.py 111 | ``` 112 | 113 | ### Getting Started 114 | - Train & Validation 115 | ```bash 116 | python train.py -c config.json 117 | ``` 118 | - Inference 119 | ```bash 120 | python test.py -c config.json # test_config.json 121 | ``` 122 | - Train (Knowledge Distillation) 123 | ```bash 124 | python kd_train.py -c kd_config.json 125 | ``` 126 | 127 | ## 5. Demo (TODO) 128 | 129 | ## 6. Reference 130 | - [Korean HateSpeech Detection Kaggle Competition](https://www.kaggle.com/c/korean-hate-speech-detection/data) 131 | - [Korean HateSpeech Dataset](https://github.com/kocohub/korean-hate-speech) 132 | - [BEEP!
Korean Corpus of Online News Comments for Toxic Speech Detection](https://aclanthology.org/2020.socialnlp-1.4/) 133 | - [PyTorch Template Project By victoresque](https://github.com/victoresque/pytorch-template) 134 | -------------------------------------------------------------------------------- /assets/Team_AI-it_Malicious_Comments_Collecting_Service.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boostcampaitech2/final-project-level3-nlp-12/09c6e84a3618050ab0593df6f75beacf0340f9a6/assets/Team_AI-it_Malicious_Comments_Collecting_Service.pdf -------------------------------------------------------------------------------- /automl/configs/model/example.yaml: -------------------------------------------------------------------------------- 1 | input_channel: 3 2 | 3 | depth_multiple: 1.0 4 | width_multiple: 1.0 5 | 6 | backbone: 7 | # Example model in PyTorch Tutorial (https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html) 8 | # [repeat, module, args] 9 | [ 10 | [1, Conv, [6, 5, 1, 0]], 11 | [1, MaxPool, [2]], 12 | [1, Conv, [16, 5, 1, 0]], 13 | [1, MaxPool, [2]], 14 | [1, GlobalAvgPool, []], 15 | [1, Flatten, []], 16 | [1, Linear, [120, ReLU]], 17 | [1, Linear, [84, ReLU]], 18 | [1, Linear, [9]] 19 | ] 20 | -------------------------------------------------------------------------------- /automl/configs/model/mobilenetv3.yaml: -------------------------------------------------------------------------------- 1 | input_channel: 3 2 | 3 | depth_multiple: 1.0 4 | width_multiple: 1.0 5 | 6 | backbone: 7 | # [repeat, module, args] 8 | [ 9 | # Conv argument: [out_channel, kernel_size, stride, padding_size] 10 | # if padding_size is not given or null, the padding_size will be auto adjusted as padding='SAME' in TensorFlow 11 | [1, Conv, [16, 3, 2, null, 1, "HardSwish"]], 12 | # k t c SE HS s 13 | [1, InvertedResidualv3, [3, 1, 16, 0, 0, 1]], 14 | [1, InvertedResidualv3, [3, 4, 24, 0, 0, 2]], # 2-P2/4, 24 # stride 1 for cifar, 2 for others 15 | [1, InvertedResidualv3, [3, 3, 24, 0, 0, 1]], 16 | [1, InvertedResidualv3, [5, 3, 40, 1, 0, 2]], # 4-P3/8, 40 17 | [1, InvertedResidualv3, [5, 3, 40, 1, 0, 1]], 18 | [1, InvertedResidualv3, [5, 3, 40, 1, 0, 1]], 19 | [1, InvertedResidualv3, [3, 6, 80, 0, 1, 2]], # 7-P4/16, 80 20 | [1, InvertedResidualv3, [3, 2.5, 80, 0, 1, 1]], 21 | [1, InvertedResidualv3, [3, 2.3, 80, 0, 1, 1]], 22 | [1, InvertedResidualv3, [3, 2.3, 80, 0, 1, 1]], 23 | [1, InvertedResidualv3, [3, 6, 112, 1, 1, 1]], 24 | [1, InvertedResidualv3, [3, 6, 112, 1, 1, 1]], # 12 -P5/32, 112 25 | [1, InvertedResidualv3, [5, 6, 160, 1, 1, 2]], 26 | [1, InvertedResidualv3, [5, 6, 160, 1, 1, 1]], 27 | [1, InvertedResidualv3, [5, 6, 160, 1, 1, 1]], 28 | [1, Conv, [960, 1, 1]], 29 | [1, GlobalAvgPool, []], 30 | [1, Conv, [1280, 1, 1]], 31 | [1, Flatten, []], 32 | [1, Linear, [6]] 33 | ] 34 | -------------------------------------------------------------------------------- /automl/proj_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import torch 4 | import emoji 5 | import wandb 6 | import pandas as pd 7 | from pathlib import Path 8 | from itertools import repeat 9 | from collections import OrderedDict 10 | from soynlp.normalizer import repeat_normalize 11 | 12 | 13 | def ensure_dir(dirname): 14 | dirname = Path(dirname) 15 | if not dirname.is_dir(): 16 | dirname.mkdir(parents=True, exist_ok=False) 17 | 18 | def read_json(fname): 19 | fname = 
Path(fname) 20 | with fname.open('rt') as handle: 21 | return json.load(handle, object_hook=OrderedDict) 22 | 23 | def write_json(content, fname): 24 | fname = Path(fname) 25 | with fname.open('wt') as handle: 26 | json.dump(content, handle, indent=4, sort_keys=False) 27 | 28 | def inf_loop(data_loader): 29 | ''' wrapper function for endless data loader. ''' 30 | for loader in repeat(data_loader): 31 | yield from loader 32 | 33 | def prepare_device(n_gpu_use): 34 | """ 35 | setup GPU device if available. get gpu device indices which are used for DataParallel 36 | """ 37 | n_gpu = torch.cuda.device_count() 38 | if n_gpu_use > 0 and n_gpu == 0: 39 | print("Warning: There\'s no GPU available on this machine, " 40 | "training will be performed on CPU.") 41 | n_gpu_use = 0 42 | if n_gpu_use > n_gpu: 43 | print(f"Warning: The number of GPU\'s configured to use is {n_gpu_use}, but only {n_gpu} are " 44 | "available on this machine.") 45 | n_gpu_use = n_gpu 46 | device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu') 47 | list_ids = list(range(n_gpu_use)) 48 | return device, list_ids 49 | 50 | 51 | class MetricTracker: 52 | def __init__(self, *keys): 53 | self._data = pd.DataFrame(index=keys, columns=['total', 'counts', 'average']) 54 | self.reset() 55 | 56 | def reset(self): 57 | for col in self._data.columns: 58 | self._data[col].values[:] = 0 59 | 60 | def update(self, key, value, n=1): 61 | self._data.total[key] += value * n 62 | self._data.counts[key] += n 63 | self._data.average[key] = self._data.total[key] / self._data.counts[key] 64 | 65 | def avg(self, key): 66 | return self._data.average[key] 67 | 68 | def result(self): 69 | return dict(self._data.average) 70 | 71 | 72 | 73 | def preprocess(sents): 74 | """ 75 | kcELECTRA-base preprocess procedure + modification 76 | """ 77 | preprocessed_sents = [] 78 | 79 | emojis = set() 80 | for k in emoji.UNICODE_EMOJI.keys(): 81 | emojis.update(emoji.UNICODE_EMOJI[k].keys()) 82 | 83 | punc_bracket_pattern = re.compile(r'[\'\"\[\]\(\)]') 84 | base_pattern = re.compile(f'[^.,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{"".join(emojis)}]+') # join the emoji set so only the characters themselves enter the character class 85 | url_pattern = re.compile( 86 | r'(http|ftp|https)?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)' 87 | ) 88 | 89 | for sent in sents: 90 | sent = punc_bracket_pattern.sub(' ', sent) 91 | sent = base_pattern.sub(' ', sent) 92 | sent = url_pattern.sub('', sent) 93 | sent = sent.strip() 94 | sent = repeat_normalize(sent, num_repeats=2) 95 | preprocessed_sents.append(sent) 96 | 97 | return preprocessed_sents 98 | 99 | 100 | class Preprocess(): 101 | '''A class for preprocessing collected comment sentences 102 | Args: 103 | sents (list): sentence list 104 | Note: PERMIT_REMOVE_LANGS lists languages that may be stripped from sentences 105 | ''' 106 | 107 | PERMIT_REMOVE_LANGS = [ 108 | 'arabic', 109 | 'russian', 110 | ] 111 | 112 | def __init__(self, sents: list): 113 | self.sents = sents 114 | 115 | def proc_preprocessing(self): 116 | """ 117 | A function for doing preprocess 118 | """ 119 | self.remove_hashtag() 120 | self.remove_user_mention() 121 | self.remove_bad_char() 122 | self.clean_punc() 123 | self.remove_useless_char() 124 | self.remove_linesign() 125 | self.remove_repeated_spacing() 126 | 127 | return self.sents 128 | 129 | def remove_hashtag(self): 130 | """ 131 | A function for removing hashtag 132 | """ 133 | preprocessed_sents = [] 134 | for sent in self.sents: 135 | sent = re.sub(r"#\S+", "", sent).strip() 136 | if sent: 137 | preprocessed_sents.append(sent) 138 | self.sents = 
preprocessed_sents 139 | 140 | def remove_user_mention(self): 141 | """ 142 | A function for removing mention tag 143 | """ 144 | preprocessed_sents = [] 145 | for sent in self.sents: 146 | sent = re.sub(r"@\w+", "", sent).strip() 147 | if sent: 148 | preprocessed_sents.append(sent) 149 | self.sents = preprocessed_sents 150 | 151 | def remove_bad_char(self): 152 | """ 153 | A function for removing raw unicode including unk 154 | """ 155 | bad_chars = {"\u200b": "", "…": " ... ", "\ufeff": ""} 156 | preprocessed_sents = [] 157 | for sent in self.sents: 158 | for bad_char in bad_chars: 159 | sent = sent.replace(bad_char, bad_chars[bad_char]) 160 | sent = re.sub(r"[\+á?\xc3\xa1]", "", sent) 161 | if sent: 162 | preprocessed_sents.append(sent) 163 | self.sents = preprocessed_sents 164 | 165 | def clean_punc(self): 166 | """ 167 | A function for removing useless punctuation 168 | """ 169 | punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", 170 | "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', "£": "e", 171 | '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 172 | 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', 'ㅂㅅ': '병신', 'ㄲㅈ': '꺼져', 'ㅂㄷ': '부들', 'ㅆㄹㄱ': '쓰레기', 'ㅆㅂ': '씨발', 173 | 'ㅈㅅ': '죄송', 'ㅈㄹ': '지랄', 'ㅈㄴ': '정말'} 174 | 175 | preprocessed_sents = [] 176 | for sent in self.sents: 177 | for p in punct_mapping: 178 | sent = sent.replace(p, punct_mapping[p]) 179 | sent = sent.strip() 180 | if sent: 181 | preprocessed_sents.append(sent) 182 | self.sents = preprocessed_sents 183 | 184 | def remove_useless_char(self): 185 | preprocessed_sents = [] 186 | re_obj = re.compile('[^가-힣a-z0-9\x20]+') 187 | 188 | for sent in self.sents: 189 | temp = re_obj.findall(sent) 190 | if temp != []: 191 | for ch in temp: 192 | sent = sent.replace(ch, " ") 193 | sent = sent.strip() 194 | if sent: 195 | preprocessed_sents.append(sent) 196 | 197 | self.sents = preprocessed_sents 198 | 199 | def remove_repeated_spacing(self): 200 | """ 201 | A function for reducing whitespaces into one 202 | """ 203 | preprocessed_sents = [] 204 | for sent in self.sents: 205 | sent = re.sub(r"\s+", " ", sent).strip() 206 | if sent: 207 | preprocessed_sents.append(sent) 208 | self.sents = preprocessed_sents 209 | 210 | def spacing_sent(self): 211 | """ 212 | A function for spacing properly 213 | """ 214 | preprocessed_sents = [] 215 | for sent in self.sents: 216 | sent = self.spacing(sent) # NOTE: assumes an external spacing callable (e.g. PyKoSpacing) has been attached as self.spacing; it is not defined in this module 217 | if sent: 218 | preprocessed_sents.append(sent) 219 | self.sents = preprocessed_sents 220 | 221 | def remove_linesign(self): 222 | """ 223 | A function for removing line signs like \n 224 | """ 225 | preprocessed_sents = [] 226 | for sent in self.sents: 227 | sent = re.sub(r"[\n\t\r\v\f\\\\n\\t\\r\\v\\f]", "", sent) 228 | if sent: 229 | preprocessed_sents.append(sent) 230 | self.sents = preprocessed_sents 231 | -------------------------------------------------------------------------------- /automl/src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boostcampaitech2/final-project-level3-nlp-12/09c6e84a3618050ab0593df6f75beacf0340f9a6/automl/src/__init__.py -------------------------------------------------------------------------------- /automl/src/dataloader.py: -------------------------------------------------------------------------------- 1 | """Dataloader.
2 | 3 | - Author: Junghoon Kim, Jongkuk Lim, Jimyeong Kim 4 | - Contact: placidus36@gmail.com, lim.jeikei@gmail.com, wlaud1001@snu.ac.kr 5 | - Reference 6 | https://github.com/j-marple-dev/model_compression 7 | """ 8 | import glob 9 | import os 10 | from typing import Any, Dict, List, Tuple, Union 11 | 12 | import torch 13 | import yaml 14 | from torch.utils.data import DataLoader, random_split 15 | from torchvision.datasets import ImageFolder, VisionDataset 16 | 17 | from src.utils.data import weights_for_balanced_classes 18 | from src.utils.torch_utils import split_dataset_index 19 | 20 | 21 | def create_dataloader( 22 | config: Dict[str, Any], 23 | ) -> Tuple[DataLoader, DataLoader, DataLoader]: 24 | """Simple dataloader. 25 | 26 | Args: 27 | config: dictionary of data settings, read from a data config yaml file. 28 | 29 | Returns: 30 | train_loader 31 | valid_loader 32 | test_loader 33 | """ 34 | # Data Setup 35 | train_dataset, val_dataset, test_dataset = get_dataset( 36 | data_path=config["DATA_PATH"], 37 | dataset_name=config["DATASET"], 38 | img_size=config["IMG_SIZE"], 39 | val_ratio=config["VAL_RATIO"], 40 | transform_train=config["AUG_TRAIN"], 41 | transform_test=config["AUG_TEST"], 42 | transform_train_params=config["AUG_TRAIN_PARAMS"], 43 | transform_test_params=config.get("AUG_TEST_PARAMS"), 44 | ) 45 | 46 | return get_dataloader( 47 | train_dataset=train_dataset, 48 | val_dataset=val_dataset, 49 | test_dataset=test_dataset, 50 | batch_size=config["BATCH_SIZE"], 51 | ) 52 | 53 | 54 | def get_dataset( 55 | data_path: str = "./save/data", 56 | dataset_name: str = "CIFAR10", 57 | img_size: float = 32, 58 | val_ratio: float = 0.2, 59 | transform_train: str = "simple_augment_train", 60 | transform_test: str = "simple_augment_test", 61 | transform_train_params: Dict[str, int] = None, 62 | transform_test_params: Dict[str, int] = None, 63 | ) -> Tuple[VisionDataset, VisionDataset, VisionDataset]: 64 | """Get dataset for training and testing.""" 65 | if not transform_train_params: 66 | transform_train_params = dict() 67 | if not transform_test_params: 68 | transform_test_params = dict() 69 | 70 | # preprocessing policies 71 | transform_train = getattr( 72 | __import__("src.augmentation.policies", fromlist=[""]), 73 | transform_train, 74 | )(dataset=dataset_name, img_size=img_size, **transform_train_params) 75 | transform_test = getattr( 76 | __import__("src.augmentation.policies", fromlist=[""]), 77 | transform_test, 78 | )(dataset=dataset_name, img_size=img_size, **transform_test_params) 79 | 80 | label_weights = None 81 | # pytorch dataset 82 | if dataset_name == "TACO": 83 | train_path = os.path.join(data_path, "train") 84 | val_path = os.path.join(data_path, "val") 85 | test_path = os.path.join(data_path, "test") 86 | 87 | train_dataset = ImageFolder(root=train_path, transform=transform_train) 88 | val_dataset = ImageFolder(root=val_path, transform=transform_test) 89 | test_dataset = ImageFolder(root=test_path, transform=transform_test) 90 | 91 | else: 92 | Dataset = getattr( 93 | __import__("torchvision.datasets", fromlist=[""]), dataset_name 94 | ) 95 | train_dataset = Dataset( 96 | root=data_path, train=True, download=True, transform=transform_train 97 | ) 98 | # from train dataset, train: 80%, val: 20% 99 | train_length = int(len(train_dataset) * (1.0-val_ratio)) 100 | train_dataset, val_dataset = random_split( 101 | train_dataset, [train_length, len(train_dataset) - train_length] 102 | ) 103 | test_dataset = Dataset( 104 | root=data_path, train=False, download=False, transform=transform_test 105 | 
) 106 | return train_dataset, val_dataset, test_dataset 107 | 108 | 109 | def get_dataloader( 110 | train_dataset: VisionDataset, 111 | val_dataset: VisionDataset, 112 | test_dataset: VisionDataset, 113 | batch_size: int, 114 | ) -> Tuple[DataLoader, DataLoader, DataLoader]: 115 | """Get dataloader for training and testing.""" 116 | 117 | train_loader = DataLoader( 118 | dataset=train_dataset, 119 | pin_memory=(torch.cuda.is_available()), 120 | shuffle=True, 121 | batch_size=batch_size, 122 | num_workers=10, 123 | drop_last=True 124 | ) 125 | valid_loader = DataLoader( 126 | dataset=val_dataset, 127 | pin_memory=(torch.cuda.is_available()), 128 | shuffle=False, 129 | batch_size=batch_size, 130 | num_workers=5 131 | ) 132 | test_loader = DataLoader( 133 | dataset=test_dataset, 134 | pin_memory=(torch.cuda.is_available()), 135 | shuffle=False, 136 | batch_size=batch_size, 137 | num_workers=5 138 | ) 139 | return train_loader, valid_loader, test_loader 140 | -------------------------------------------------------------------------------- /automl/src/loss.py: -------------------------------------------------------------------------------- 1 | """Custom loss for long tail problem. 2 | 3 | - Author: Junghoon Kim 4 | - Email: placidus36@gmail.com 5 | """ 6 | import numpy as np 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | 11 | 12 | class CustomCriterion: 13 | """Custom Criterion.""" 14 | 15 | def __init__(self, samples_per_cls, device, fp16=False, loss_type="softmax"): 16 | if not samples_per_cls: 17 | loss_type = "softmax" 18 | else: 19 | self.samples_per_cls = samples_per_cls 20 | self.frequency_per_cls = samples_per_cls / np.sum(samples_per_cls) 21 | self.no_of_classes = len(samples_per_cls) 22 | self.device = device 23 | self.fp16 = fp16 24 | 25 | if loss_type == "softmax": 26 | self.criterion = nn.CrossEntropyLoss() 27 | elif loss_type == "logit_adjustment_loss": 28 | tau = 1.0 29 | self.logit_adj_val = ( 30 | torch.tensor(tau * np.log(self.frequency_per_cls)) 31 | .float() 32 | .to(self.device) 33 | ) 34 | self.logit_adj_val = ( 35 | self.logit_adj_val.half() if fp16 else self.logit_adj_val.float() 36 | ) 37 | self.logit_adj_val = self.logit_adj_val.to(device) 38 | self.criterion = self.logit_adjustment_loss 39 | 40 | def __call__(self, logits, labels): 41 | """Call criterion.""" 42 | return self.criterion(logits, labels) 43 | 44 | def logit_adjustment_loss(self, logits, labels): 45 | """Logit adjustment loss.""" 46 | logits_adjusted = logits + self.logit_adj_val.repeat(labels.shape[0], 1) 47 | loss = F.cross_entropy(input=logits_adjusted, target=labels) 48 | return loss 49 | -------------------------------------------------------------------------------- /automl/src/model.py: -------------------------------------------------------------------------------- 1 | """Model parser and model. 2 | 3 | - Author: Jongkuk Lim 4 | - Contact: lim.jeikei@gmail.com 5 | """ 6 | 7 | from typing import Dict, List, Type, Union 8 | 9 | import torch 10 | import torch.nn as nn 11 | import yaml 12 | 13 | from .modules import ModuleGenerator 14 | 15 | 16 | class Model(nn.Module): 17 | """Base model class.""" 18 | 19 | def __init__( 20 | self, 21 | cfg: Union[str, Dict[str, Type]] = "./model_configs/show_case.yaml", 22 | verbose: bool = False, 23 | ) -> None: 24 | """Parse model from the model config file. 25 | 26 | Args: 27 | cfg: yaml file path or dictionary type of the model. 28 | verbose: print the model parsing information. 
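 Example (an illustrative sketch, not in the original docstring; it assumes the bundled config parses end-to-end, which requires the parser's multiplier/channel wiring to be enabled): >>> model = Model("automl/configs/model/mobilenetv3.yaml", verbose=True) >>> logits = model(torch.randn(1, 3, 224, 224))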
29 | """ 30 | super().__init__() 31 | self.model_parser = ModelParser(cfg=cfg, verbose=verbose) 32 | self.model = self.model_parser.model 33 | 34 | def forward(self, x: torch.Tensor) -> torch.Tensor: 35 | """Forward.""" 36 | return self.forward_one(x) 37 | 38 | def forward_one(self, x: torch.Tensor) -> torch.Tensor: 39 | """Forward onetime.""" 40 | 41 | return self.model(x) 42 | 43 | 44 | class ModelParser: 45 | """Generate PyTorch model from the model yaml file.""" 46 | 47 | def __init__( 48 | self, 49 | cfg: Union[str, Dict[str, Type]] = "./model_configs/show_case.yaml", 50 | verbose: bool = False, 51 | ) -> None: 52 | """Generate PyTorch model from the model yaml file. 53 | 54 | Args: 55 | cfg: model config file or dict values read from the model config file. 56 | verbose: print the parsed model information. 57 | """ 58 | 59 | self.verbose = verbose 60 | if isinstance(cfg, dict): 61 | self.cfg = cfg 62 | else: 63 | with open(cfg) as f: 64 | self.cfg = yaml.load(f, Loader=yaml.FullLoader) 65 | 66 | # self.in_channel = self.cfg["input_channel"] 67 | 68 | # self.depth_multiply = self.cfg["depth_multiple"] 69 | # self.width_multiply = self.cfg["width_multiple"] 70 | 71 | # error: Incompatible types in assignment (expression has type "Type[Any]", 72 | # variable has type "List[Union[int, str, float]]") 73 | self.model_cfg: List[Union[int, str, float]] = self.cfg["backbone"] # type: ignore 74 | 75 | self.model = self._parse_model() 76 | 77 | def log(self, msg: str): 78 | """Log.""" 79 | if self.verbose: 80 | print(msg) 81 | 82 | def _parse_model(self) -> nn.Sequential: 83 | """Parse model.""" 84 | layers: List[nn.Module] = [] 85 | log: str = ( 86 | f"{'idx':>3} | {'n':>3} | {'params':>10} " 87 | f"| {'module':>15} | {'arguments':>20} | {'in_channel':>12} | {'out_channel':>13}" 88 | ) 89 | self.log(log) 90 | self.log(len(log) * "-") # type: ignore 91 | 92 | # in_channel = self.in_channel 93 | for i, (repeat, module, args) in enumerate(self.model_cfg): # type: ignore 94 | repeat = ( 95 | max(round(repeat * self.depth_multiply), 1) if repeat > 1 else repeat 96 | ) 97 | 98 | module_generator = ModuleGenerator(module)( # type: ignore 99 | *args, 100 | # width_multiply=self.width_multiply, 101 | ) 102 | m = module_generator(repeat=repeat) 103 | 104 | layers.append(m) 105 | # in_channel = module_generator.out_channel 106 | 107 | log = ( 108 | f"{i:3d} | {repeat:3d} | " 109 | f"{m.n_params:10,d} | {m.type:>15} | {str(args):>20} | " 110 | # f"{str(module_generator.in_channel):>12}" 111 | # f"{str(module_generator.out_channel):>13}" 112 | ) 113 | 114 | self.log(log) 115 | 116 | parsed_model = nn.Sequential(*layers) 117 | n_param = sum([x.numel() for x in parsed_model.parameters()]) 118 | n_grad = sum([x.numel() for x in parsed_model.parameters() if x.requires_grad]) 119 | # error: Incompatible return value type (got "Tuple[Sequential, List[int]]", 120 | # expected "Tuple[Module, List[Optional[int]]]") 121 | self.log( 122 | f"Model Summary: {len(list(parsed_model.modules())):,d} " 123 | f"layers, {n_param:,d} parameters, {n_grad:,d} gradients" 124 | ) 125 | 126 | return parsed_model 127 | -------------------------------------------------------------------------------- /automl/src/modules/__init__.py: -------------------------------------------------------------------------------- 1 | """PyTorch Module and ModuleGenerator.""" 2 | 3 | from src.modules.base_generator import GeneratorAbstract, ModuleGenerator 4 | from src.modules.bottleneck import Bottleneck, BottleneckGenerator 5 | from src.modules.conv 
import Conv, ConvGenerator, FixedConvGenerator 6 | from src.modules.dwconv import DWConv, DWConvGenerator 7 | from src.modules.flatten import FlattenGenerator 8 | from src.modules.invertedresidualv2 import ( 9 | InvertedResidualv2, 10 | InvertedResidualv2Generator, 11 | ) 12 | from src.modules.invertedresidualv3 import ( 13 | InvertedResidualv3, 14 | InvertedResidualv3Generator, 15 | ) 16 | from src.modules.linear import Linear, LinearGenerator 17 | from src.modules.poolings import ( 18 | AvgPoolGenerator, 19 | GlobalAvgPool, 20 | GlobalAvgPoolGenerator, 21 | MaxPoolGenerator, 22 | ) 23 | from src.modules.bert import Bert, BertGenerator 24 | from src.modules.electra import Electra, ElectraGenerator 25 | from src.modules.lstm import Lstm, LstmGenerator 26 | from src.modules.electra_lstm import ElectraWithLSTM, ElectraWithLSTMGenerator 27 | from src.modules.bert_lstm import BertWithLSTM, BertWithLSTMGenerator 28 | 29 | __all__ = [ 30 | "ModuleGenerator", 31 | "GeneratorAbstract", 32 | "Bottleneck", 33 | "Conv", 34 | "DWConv", 35 | "Linear", 36 | "GlobalAvgPool", 37 | "InvertedResidualv2", 38 | "InvertedResidualv3", 39 | "BottleneckGenerator", 40 | "FixedConvGenerator", 41 | "ConvGenerator", 42 | "LinearGenerator", 43 | "DWConvGenerator", 44 | "FlattenGenerator", 45 | "MaxPoolGenerator", 46 | "AvgPoolGenerator", 47 | "GlobalAvgPoolGenerator", 48 | "InvertedResidualv2Generator", 49 | "InvertedResidualv3Generator", 50 | "Bert", 51 | "BertGenerator", 52 | "Electra", 53 | "ElectraGenerator", "Lstm", 54 | "LstmGenerator", 55 | "ElectraWithLSTM", 56 | "ElectraWithLSTMGenerator", 57 | "BertWithLSTM", 58 | "BertWithLSTMGenerator", 59 | ] 60 | -------------------------------------------------------------------------------- /automl/src/modules/activations.py: -------------------------------------------------------------------------------- 1 | """Custom activation to work with onnx. 2 | 3 | Reference: 4 | https://github.com/rwightman/pytorch-image-models/blob/9a25fdf3ad0414b4d66da443fe60ae0aa14edc84/timm/models/layers/activations.py 5 | - Author: Junghoon Kim 6 | - Contact: placidus36@gmail.com 7 | """ 8 | import torch 9 | import torch.nn as nn 10 | from torch.nn import functional as F 11 | 12 | 13 | def hard_sigmoid(x: torch.Tensor, inplace: bool = False): 14 | """Hard sigmoid.""" 15 | if inplace: 16 | return x.add_(3.0).clamp_(0.0, 6.0).div_(6.0) 17 | else: 18 | return F.relu6(x + 3.0) / 6.0 19 | 20 | 21 | class HardSigmoid(nn.Module): 22 | """Hard sigmoid.""" 23 | 24 | def __init__(self, inplace: bool = False): 25 | """Initialize.""" 26 | super().__init__() 27 | self.inplace = inplace 28 | 29 | def forward(self, x: torch.Tensor): 30 | """Forward.""" 31 | return hard_sigmoid(x, self.inplace) 32 | 33 | 34 | def hard_swish(x: torch.Tensor, inplace: bool = False): 35 | """Hard swish.""" 36 | inner = F.relu6(x + 3.0).div_(6.0) 37 | return x.mul_(inner) if inplace else x.mul(inner) 38 | 39 | 40 | class HardSwish(nn.Module): 41 | """Custom hardswish to work with onnx.""" 42 | 43 | def __init__(self, inplace: bool = False): 44 | """Initialize.""" 45 | super().__init__() 46 | self.inplace = inplace 47 | 48 | def forward(self, x: torch.Tensor): 49 | """Forward.""" 50 | return hard_swish(x, self.inplace) 51 | 52 | 53 | def swish(x: torch.Tensor, inplace: bool = False): 54 | """Swish - Described originally as SiLU (https://arxiv.org/abs/1702.03118v3) 55 | and also as Swish (https://arxiv.org/abs/1710.05941).
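 Computes x * sigmoid(x); the inplace variant reuses x's storage via mul_(). (Clarifying note, not in the original docstring.)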
56 | TODO Rename to SiLU with addition to PyTorch 57 | Adopted to handle onnx conversion 58 | """ 59 | return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid()) 60 | 61 | 62 | class Swish(nn.Module): 63 | """Swish.""" 64 | 65 | def __init__(self, inplace: bool = False): 66 | """Initialize.""" 67 | super().__init__() 68 | self.inplace = inplace 69 | 70 | def forward(self, x: torch.Tensor): 71 | """Forward.""" 72 | return swish(x, self.inplace) 73 | -------------------------------------------------------------------------------- /automl/src/modules/base_generator.py: -------------------------------------------------------------------------------- 1 | """Base Module Generator. 2 | 3 | This module is responsible for GeneratorAbstract and ModuleGenerator. 4 | 5 | - Author: Jongkuk Lim 6 | - Contact: lim.jeikei@gmail.com 7 | """ 8 | from abc import ABC, abstractmethod 9 | from typing import List, Union 10 | 11 | from torch import nn as nn 12 | 13 | from src.utils.torch_utils import make_divisible 14 | 15 | 16 | class GeneratorAbstract(ABC): 17 | """Abstract Module Generator.""" 18 | 19 | CHANNEL_DIVISOR: int = 8 20 | 21 | def __init__( 22 | self, 23 | *args, 24 | from_idx: Union[int, List[int]] = -1, 25 | ): 26 | """Initialize module generator. 27 | 28 | Args: 29 | *args: Module arguments 30 | from_idx: Module input index 31 | """ 32 | self.args = tuple(args) 33 | self.from_idx = from_idx 34 | 35 | @property 36 | def name(self) -> str: 37 | """Module name.""" 38 | return self.__class__.__name__.replace("Generator", "") 39 | 40 | def _get_module(self, module: Union[nn.Module, List[nn.Module]]) -> nn.Module: 41 | """Get module from __call__ function.""" 42 | if isinstance(module, list): 43 | module = nn.Sequential(*module) 44 | 45 | # error: Incompatible types in assignment (expression has type "Union[Tensor, Module, int]", 46 | # variable has type "Union[Tensor, Module]") 47 | # error: List comprehension has incompatible type List[int]; 48 | # expected List[Union[Tensor, Module]] 49 | module.n_params = sum([x.numel() for x in module.parameters()]) # type: ignore 50 | # error: Cannot assign to a method 51 | module.type = self.name # type: ignore 52 | 53 | return module 54 | 55 | # @classmethod 56 | # def _get_divisible_channel(cls, n_channel: int) -> int: 57 | # """Get divisible channel by default divisor. 58 | 59 | # Args: 60 | # n_channel: number of channel. 61 | 62 | # Returns: 63 | # Ex) given {n_channel} is 52 and {GeneratorAbstract.CHANNEL_DIVISOR} is 8., 64 | # return channel is 56 since ceil(52/8) = 7 and 7*8 = 56 65 | # """ 66 | # return make_divisible(n_channel, divisor=cls.CHANNEL_DIVISOR) 67 | 68 | # @property 69 | # @abstractmethod 70 | # def out_channel(self) -> int: 71 | # """Out channel of the module.""" 72 | 73 | @abstractmethod 74 | def __call__(self, repeat: int = 1): 75 | """Returns nn.Module component""" 76 | 77 | 78 | class ModuleGenerator: 79 | """Module generator class.""" 80 | 81 | def __init__(self, module_name: str): 82 | """Generate module based on the {module_name} 83 | 84 | Args: 85 | module_name: {module_name}Generator class must have been implemented. 
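 Example (illustrative, mirroring the getattr lookup in __call__ below): ModuleGenerator("Conv")(16, 3, 1) resolves to src.modules.ConvGenerator(16, 3, 1), and calling that generator with repeat=1 builds the actual nn.Module.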
86 | """ 87 | self.module_name = module_name 88 | # self.in_channel = in_channel 89 | 90 | def __call__(self, *args, **kwargs): 91 | # replace getattr 92 | return getattr( 93 | __import__("src.modules", fromlist=[""]), 94 | f"{self.module_name}Generator", 95 | )(*args, **kwargs) 96 | -------------------------------------------------------------------------------- /automl/src/modules/bert.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import numpy as np 3 | from abc import abstractmethod 4 | from transformers import AutoModel 5 | from src.modules.base_generator import GeneratorAbstract 6 | 7 | 8 | class BaseModel(nn.Module): 9 | """ 10 | Base class for all models 11 | """ 12 | 13 | @abstractmethod 14 | def forward(self, *inputs): 15 | """ 16 | Forward pass logic 17 | 18 | :return: Model output 19 | """ 20 | raise NotImplementedError 21 | 22 | def __str__(self): 23 | """ 24 | Model prints with number of trainable parameters 25 | """ 26 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 27 | params = sum([np.prod(p.size()) for p in model_parameters]) 28 | return super().__str__() + "\nTrainable parameters: {}".format(params) 29 | 30 | 31 | class Bert(BaseModel): 32 | def __init__(self, name="klue/bert-base"): 33 | super().__init__() 34 | # self.config = AutoConfig.from_pretrained(name) 35 | self.model = AutoModel.from_pretrained(name) 36 | 37 | def forward(self, inputs): 38 | return self.model(**inputs)[0] 39 | 40 | 41 | class BertGenerator(GeneratorAbstract): 42 | """Pretrained Bert block generator.""" 43 | 44 | def __init__(self, *args, **kwargs): 45 | super().__init__(*args, **kwargs) 46 | 47 | @property 48 | def base_module(self) -> nn.Module: 49 | """Returns module class from src.common_modules based on the class name.""" 50 | return getattr(__import__("src.modules", fromlist=[""]), self.name) 51 | 52 | def __call__(self, repeat: int = 1): 53 | """call method. 54 | Build Bert Model 55 | """ 56 | module = [] 57 | args = self.args 58 | for i in range(repeat): 59 | module.append(self.base_module(*args)) 60 | return self._get_module(module) 61 | -------------------------------------------------------------------------------- /automl/src/modules/conv.py: -------------------------------------------------------------------------------- 1 | """Conv module, generator. 2 | 3 | - Author: Jongkuk Lim 4 | - Contact: lim.jeikei@gmail.com 5 | """ 6 | # pylint: disable=useless-super-delegation 7 | from typing import Union 8 | 9 | import torch 10 | from torch import nn as nn 11 | 12 | from src.modules.base_generator import GeneratorAbstract 13 | from src.utils.torch_utils import Activation, autopad 14 | 15 | 16 | class Conv(nn.Module): 17 | """Standard convolution with batch normalization and activation.""" 18 | 19 | def __init__( 20 | self, 21 | in_channel: int, 22 | out_channels: int, 23 | kernel_size: int, 24 | stride: int = 1, 25 | padding: Union[int, None] = None, 26 | groups: int = 1, 27 | activation: Union[str, None] = "ReLU", 28 | ) -> None: 29 | """Standard convolution with batch normalization and activation. 30 | 31 | Args: 32 | in_channel: input channels. 33 | out_channels: output channels. 34 | kernel_size: kernel size. 35 | stride: stride. 36 | padding: input padding. If None is given, autopad is applied 37 | which is identical to padding='SAME' in TensorFlow. 38 | groups: group convolution. 39 | activation: activation name. If None is given, nn.Identity is applied 40 | which is no activation. 
41 | """ 42 | super().__init__() 43 | # error: Argument "padding" to "Conv2d" has incompatible type "Union[int, List[int]]"; 44 | # expected "Union[int, Tuple[int, int]]" 45 | self.conv = nn.Conv2d( 46 | in_channel, 47 | out_channels, 48 | kernel_size, 49 | stride, 50 | padding=autopad(kernel_size, padding), # type: ignore 51 | groups=groups, 52 | bias=False, 53 | ) 54 | self.bn = nn.BatchNorm2d(out_channels) 55 | self.act = Activation(activation)() 56 | 57 | def forward(self, x: torch.Tensor) -> torch.Tensor: 58 | """Forward.""" 59 | return self.act(self.bn(self.conv(x))) 60 | 61 | def fusefoward(self, x: torch.Tensor) -> torch.Tensor: 62 | """Fuse forward.""" 63 | return self.act(self.conv(x)) 64 | 65 | 66 | class ConvGenerator(GeneratorAbstract): 67 | """Conv2d generator for parsing module.""" 68 | 69 | def __init__(self, *args, **kwargs): 70 | super().__init__(*args, **kwargs) 71 | 72 | @property 73 | def out_channel(self) -> int: 74 | """Get out channel size.""" 75 | return self._get_divisible_channel(self.args[0] * self.width_multiply) 76 | 77 | @property 78 | def base_module(self) -> nn.Module: 79 | """Returns module class from src.common_modules based on the class name.""" 80 | return getattr(__import__("src.modules", fromlist=[""]), self.name) 81 | 82 | def __call__(self, repeat: int = 1): 83 | args = [self.in_channel, self.out_channel, *self.args[1:]] 84 | if repeat > 1: 85 | stride = 1 86 | # Important!: stride only applies at the end of the repeat. 87 | if len(args) > 2: 88 | stride = args[3] 89 | args[3] = 1 90 | 91 | module = [] 92 | for i in range(repeat): 93 | if len(args) > 1 and stride > 1 and i == repeat - 1: 94 | args[3] = stride 95 | 96 | module.append(self.base_module(*args)) 97 | args[0] = self.out_channel 98 | else: 99 | module = self.base_module(*args) 100 | 101 | return self._get_module(module) 102 | 103 | 104 | class FixedConvGenerator(GeneratorAbstract): 105 | """FixedConv2d generator for parsing module. 106 | Fixed Conv doesn't change out channel 107 | """ 108 | 109 | def __init__(self, *args, **kwargs): 110 | super().__init__(*args, **kwargs) 111 | 112 | @property 113 | def out_channel(self) -> int: 114 | """Get out channel size.""" 115 | return int(self.args[0]) 116 | 117 | @property 118 | def base_module(self) -> nn.Module: 119 | """Returns module class from src.common_modules based on the class name.""" 120 | return getattr( 121 | __import__("src.modules", fromlist=[""]), self.name.replace("Fixed", "") 122 | ) 123 | 124 | def __call__(self, repeat: int = 1): 125 | args = [self.in_channel, self.out_channel, *self.args[1:]] 126 | if repeat > 1: 127 | stride = 1 128 | # Important!: stride only applies at the end of the repeat. 129 | if len(args) > 2: 130 | stride = args[3] 131 | args[3] = 1 132 | 133 | module = [] 134 | for i in range(repeat): 135 | if len(args) > 1 and stride > 1 and i == repeat - 1: 136 | args[3] = stride 137 | 138 | module.append(self.base_module(*args)) 139 | args[0] = self.out_channel 140 | else: 141 | module = self.base_module(*args) 142 | 143 | return self._get_module(module) 144 | 145 | 146 | class FixedConvGenerator(GeneratorAbstract): 147 | """FixedConv2d generator for parsing module. 
148 | 149 | Fixed Conv doesn't change out channel 150 | """ 151 | 152 | def __init__(self, *args, **kwargs): 153 | super().__init__(*args, **kwargs) 154 | 155 | @property 156 | def out_channel(self) -> int: 157 | """Get out channel size.""" 158 | return int(self.args[0]) 159 | 160 | @property 161 | def base_module(self) -> nn.Module: 162 | """Returns module class from src.common_modules based on the class name.""" 163 | return getattr( 164 | __import__("src.modules", fromlist=[""]), self.name.replace("Fixed", "") 165 | ) 166 | 167 | def __call__(self, repeat: int = 1): 168 | args = [self.in_channel, self.out_channel, *self.args[1:]] 169 | if repeat > 1: 170 | stride = 1 171 | # Important!: stride only applies at the end of the repeat. 172 | if len(args) > 2: 173 | stride = args[3] 174 | args[3] = 1 175 | 176 | module = [] 177 | for i in range(repeat): 178 | if len(args) > 1 and stride > 1 and i == repeat - 1: 179 | args[3] = stride 180 | 181 | module.append(self.base_module(*args)) 182 | args[0] = self.out_channel 183 | else: 184 | module = self.base_module(*args) 185 | 186 | return self._get_module(module) 187 | -------------------------------------------------------------------------------- /automl/src/modules/dwconv.py: -------------------------------------------------------------------------------- 1 | """DWConv module, generator. 2 | 3 | - Author: Jongkuk Lim 4 | - Contact: lim.jeikei@gmail.com 5 | """ 6 | import math 7 | # pylint: disable=useless-super-delegation 8 | from typing import Union 9 | 10 | import torch 11 | from torch import nn as nn 12 | 13 | from src.modules.base_generator import GeneratorAbstract 14 | from src.utils.torch_utils import Activation, autopad 15 | 16 | 17 | class DWConv(nn.Module): 18 | """Depthwise convolution with batch normalization and activation.""" 19 | 20 | def __init__( 21 | self, 22 | in_channel: int, 23 | out_channels: int, 24 | kernel_size: int, 25 | stride: int = 1, 26 | padding: Union[int, None] = None, 27 | activation: Union[str, None] = "ReLU", 28 | ) -> None: 29 | """Depthwise convolution with batch normalization and activation. 30 | 31 | Args: 32 | in_channel: input channels. 33 | out_channels: output channels. 34 | kernel_size: kernel size. 35 | stride: stride. 36 | padding: input padding. If None is given, autopad is applied 37 | which is identical to padding='SAME' in TensorFlow. 38 | activation: activation name. If None is given, nn.Identity is applied 39 | which is no activation. 
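 Example (an illustrative sketch, not in the original docstring): >>> dw = DWConv(16, 16, 3) >>> dw(torch.randn(1, 16, 32, 32)).shape # groups = gcd(16, 16) = 16, i.e. one filter group per channel torch.Size([1, 16, 32, 32])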
40 | """ 41 | super().__init__() 42 | # error: Argument "padding" to "Conv2d" has incompatible type "Union[int, List[int]]"; 43 | # expected "Union[int, Tuple[int, int]]" 44 | self.conv = nn.Conv2d( 45 | in_channel, 46 | out_channels, 47 | kernel_size, 48 | stride, 49 | padding=autopad(kernel_size, padding), # type: ignore 50 | groups=math.gcd(in_channel, out_channels), 51 | bias=False, 52 | ) 53 | self.bn = nn.BatchNorm2d(out_channels) 54 | self.act = Activation(activation)() 55 | 56 | def forward(self, x: torch.Tensor) -> torch.Tensor: 57 | """Forward.""" 58 | return self.act(self.bn(self.conv(x))) 59 | 60 | def fusefoward(self, x: torch.Tensor) -> torch.Tensor: 61 | """Fuse forward.""" 62 | return self.act(self.conv(x)) 63 | 64 | 65 | class DWConvGenerator(GeneratorAbstract): 66 | """Depth-wise convolution generator for parsing module.""" 67 | 68 | def __init__(self, *args, **kwargs): 69 | super().__init__(*args, **kwargs) 70 | 71 | @property 72 | def out_channel(self) -> int: 73 | """Get out channel size.""" 74 | return self._get_divisible_channel(self.args[0] * self.width_multiply) 75 | 76 | @property 77 | def base_module(self) -> nn.Module: 78 | """Returns module class from src.common_modules based on the class name.""" 79 | return getattr(__import__("src.modules", fromlist=[""]), self.name) 80 | 81 | def __call__(self, repeat: int = 1): 82 | args = [self.in_channel, self.out_channel, *self.args[1:]] 83 | if repeat > 1: 84 | stride = 1 85 | # Important!: stride only applies at the end of the repeat. 86 | if len(args) > 2: 87 | stride = args[3] 88 | args[3] = 1 89 | 90 | module = [] 91 | for i in range(repeat): 92 | if len(args) > 1 and stride > 1 and i == repeat - 1: 93 | args[3] = stride 94 | 95 | module.append(self.base_module(*args)) 96 | args[0] = self.out_channel 97 | else: 98 | module = self.base_module(*args) 99 | 100 | return self._get_module(module) 101 | -------------------------------------------------------------------------------- /automl/src/modules/electra.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from abc import abstractmethod 5 | from transformers import AutoConfig, AutoModel 6 | from src.modules.base_generator import GeneratorAbstract 7 | 8 | 9 | class BaseModel(nn.Module): 10 | """ 11 | Base class for all models 12 | """ 13 | 14 | @abstractmethod 15 | def forward(self, *inputs): 16 | """ 17 | Forward pass logic 18 | 19 | :return: Model output 20 | """ 21 | raise NotImplementedError 22 | 23 | def __str__(self): 24 | """ 25 | Model prints with number of trainable parameters 26 | """ 27 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 28 | params = sum([np.prod(p.size()) for p in model_parameters]) 29 | return super().__str__() + "\nTrainable parameters: {}".format(params) 30 | 31 | 32 | class Electra(BaseModel): 33 | def __init__(self, name="beomi/beep-KcELECTRA-base-hate"): 34 | super().__init__() 35 | self.model = AutoModel.from_pretrained(name) 36 | 37 | def forward(self, inputs): 38 | with torch.no_grad(): 39 | outputs = self.model(**inputs) 40 | return outputs[0] 41 | 42 | 43 | class ElectraGenerator(GeneratorAbstract): 44 | """Pretrained Electra block generator.""" 45 | 46 | def __init__(self, *args, **kwargs): 47 | super().__init__(*args, **kwargs) 48 | 49 | @property 50 | def base_module(self) -> nn.Module: 51 | """Returns module class from src.common_modules based on the class name.""" 52 | return 
getattr(__import__("src.modules", fromlist=[""]), self.name) 53 | 54 | def __call__(self, repeat: int = 1): 55 | """call method. 56 | Build module 57 | """ 58 | module = [] 59 | args = self.args 60 | for i in range(repeat): 61 | module.append(self.base_module(*args)) 62 | return self._get_module(module) 63 | -------------------------------------------------------------------------------- /automl/src/modules/flatten.py: -------------------------------------------------------------------------------- 1 | """Flatten module, generator. 2 | 3 | - Author: Jongkuk Lim 4 | - Contact: lim.jeikei@gmail.com 5 | """ 6 | from torch import nn as nn 7 | 8 | from src.modules.base_generator import GeneratorAbstract 9 | 10 | 11 | class FlattenGenerator(GeneratorAbstract): 12 | """Flatten module generator.""" 13 | 14 | def __init__(self, *args, **kwargs): 15 | super().__init__(*args, **kwargs) 16 | 17 | @property 18 | def out_channel(self) -> int: 19 | return self.in_channel 20 | 21 | def __call__(self, repeat: int = 1): 22 | return self._get_module(nn.Flatten()) 23 | -------------------------------------------------------------------------------- /automl/src/modules/linear.py: -------------------------------------------------------------------------------- 1 | """Linear module, generator. 2 | 3 | - Author: Jongkuk Lim 4 | - Contact: lim.jeikei@gmail.com 5 | """ 6 | from typing import Union 7 | 8 | import torch 9 | from torch import nn as nn 10 | 11 | from src.modules.base_generator import GeneratorAbstract 12 | from src.utils.torch_utils import Activation 13 | 14 | 15 | class Linear(nn.Module): 16 | """Linear module.""" 17 | 18 | def __init__(self, in_channel: int, out_channel: int, activation: Union[str, None]): 19 | """ 20 | 21 | Args: 22 | in_channel: input channels. 23 | out_channel: output channels. 24 | activation: activation name. If None is given, nn.Identity is applied 25 | which is no activation. 
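 Example (an illustrative sketch, not in the original docstring): >>> fc = Linear(120, 84, "ReLU") >>> fc(torch.randn(1, 120)).shape torch.Size([1, 84])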
26 | """ 27 | super().__init__() 28 | self.linear = nn.Linear(in_channel, out_channel) 29 | self.activation = Activation(activation)() 30 | 31 | def forward(self, x: torch.Tensor) -> torch.Tensor: 32 | """Forward.""" 33 | return self.activation(self.linear(x)) 34 | 35 | 36 | class LinearGenerator(GeneratorAbstract): 37 | """Linear (fully connected) module generator for parsing.""" 38 | 39 | def __init__(self, *args, **kwargs): 40 | """Initailize.""" 41 | super().__init__(*args, **kwargs) 42 | 43 | @property 44 | def out_channel(self) -> int: 45 | """Get out channel size.""" 46 | return self.args[0] 47 | 48 | def __call__(self, repeat: int = 1): 49 | # TODO: Apply repeat 50 | act = self.args[1] if len(self.args) > 1 else None 51 | 52 | return self._get_module( 53 | Linear(self.in_channel, self.out_channel, activation=act) 54 | ) 55 | -------------------------------------------------------------------------------- /automl/src/modules/lstm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from abc import abstractmethod 5 | from src.modules.base_generator import GeneratorAbstract 6 | 7 | 8 | class BaseModel(nn.Module): 9 | """ 10 | Base class for all models 11 | """ 12 | 13 | @abstractmethod 14 | def forward(self, *inputs): 15 | """ 16 | Forward pass logic 17 | 18 | :return: Model output 19 | """ 20 | raise NotImplementedError 21 | 22 | def __str__(self): 23 | """ 24 | Model prints with number of trainable parameters 25 | """ 26 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 27 | params = sum([np.prod(p.size()) for p in model_parameters]) 28 | return super().__str__() + "\nTrainable parameters: {}".format(params) 29 | 30 | 31 | class Lstm(BaseModel): 32 | def __init__(self, name="rnn", xdim=28, hdim=256, ydim=3, n_layer=3, dropout=0): 33 | super(Lstm, self).__init__() 34 | self.name = name 35 | self.xdim = xdim 36 | self.hdim = hdim 37 | self.ydim = ydim 38 | self.n_layer = n_layer # K 39 | self.dropout = dropout 40 | 41 | self.rnn = nn.LSTM( 42 | input_size=xdim, 43 | hidden_size=hdim, 44 | num_layers=n_layer, 45 | batch_first=True, 46 | dropout=dropout, 47 | ) 48 | self.lin = nn.Linear(self.hdim, self.ydim) 49 | 50 | def forward(self, x): 51 | # Set initial hidden and cell states 52 | device = ( 53 | torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") 54 | ) 55 | # print(device) 56 | h0 = torch.zeros(self.n_layer, x.size(0), self.hdim).to(device) 57 | c0 = torch.zeros(self.n_layer, x.size(0), self.hdim).to(device) 58 | 59 | # RNN 60 | rnn_out, (hn, cn) = self.rnn(x, (h0, c0)) 61 | # x:[N x L x Q] => rnn_out:[N x L x D] 62 | # Linear 63 | out = self.lin(rnn_out[:, -1, :]).view([-1, self.ydim]) 64 | return out 65 | 66 | 67 | class LstmGenerator(GeneratorAbstract): 68 | """Pretrained Bert block generator.""" 69 | 70 | def __init__(self, *args, **kwargs): 71 | super().__init__(*args, **kwargs) 72 | 73 | @property 74 | def base_module(self) -> nn.Module: 75 | """Returns module class from src.common_modules based on the class name.""" 76 | return getattr(__import__("src.modules", fromlist=[""]), self.name) 77 | 78 | def __call__(self, repeat: int = 1): 79 | """call method.""" 80 | module = [] 81 | args = self.args 82 | for i in range(repeat): 83 | module.append(self.base_module(*args)) 84 | return self._get_module(module) 85 | -------------------------------------------------------------------------------- /automl/src/modules/mbconv.py: 
-------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from src.modules.base_generator import GeneratorAbstract 7 | 8 | 9 | class MBConv(nn.Module): 10 | """MBConvBlock used in Efficientnet. 11 | 12 | Reference: 13 | https://github.com/narumiruna/efficientnet-pytorch/blob/master/efficientnet/models/efficientnet.py 14 | Note: 15 | Drop connect rate is disabled. 16 | """ 17 | 18 | def __init__( 19 | self, 20 | in_planes, 21 | out_planes, 22 | expand_ratio, 23 | kernel_size, 24 | stride, 25 | reduction_ratio=4, 26 | drop_connect_rate=0.0, 27 | ): 28 | super(MBConv, self).__init__() 29 | self.drop_connect_rate = drop_connect_rate 30 | self.use_residual = in_planes == out_planes and stride == 1 31 | assert stride in [1, 2] 32 | assert kernel_size in [3, 5] 33 | 34 | hidden_dim = in_planes * expand_ratio 35 | reduced_dim = max(1, in_planes // reduction_ratio) 36 | 37 | layers = [] 38 | # pw 39 | if in_planes != hidden_dim: 40 | layers.append(ConvBNReLU(in_planes, hidden_dim, 1)) 41 | 42 | layers.extend( 43 | [ 44 | # dw 45 | ConvBNReLU( 46 | hidden_dim, 47 | hidden_dim, 48 | kernel_size, 49 | stride=stride, 50 | groups=hidden_dim, 51 | ), 52 | # se 53 | SqueezeExcitation(hidden_dim, reduced_dim), 54 | # pw-linear 55 | nn.Conv2d(hidden_dim, out_planes, 1, bias=False), 56 | nn.BatchNorm2d(out_planes), 57 | ] 58 | ) 59 | self.conv = nn.Sequential(*layers) 60 | 61 | def _drop_connect(self, x): 62 | if not self.training: 63 | return x 64 | if self.drop_connect_rate >= 1.0: 65 | return x 66 | keep_prob = 1.0 - self.drop_connect_rate 67 | batch_size = x.size(0) 68 | random_tensor = keep_prob 69 | random_tensor += torch.rand(batch_size, 1, 1, 1, device=x.device) 70 | binary_tensor = random_tensor.floor() 71 | return x.div(keep_prob) * binary_tensor 72 | 73 | def forward(self, x): 74 | if self.use_residual: 75 | return x + self._drop_connect(self.conv(x)) 76 | else: 77 | return self.conv(x) 78 | 79 | 80 | class ConvBNReLU(nn.Sequential): 81 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, groups=1): 82 | padding = self._get_padding(kernel_size, stride) 83 | super(ConvBNReLU, self).__init__( 84 | nn.ZeroPad2d(padding), 85 | nn.Conv2d( 86 | in_planes, 87 | out_planes, 88 | kernel_size, 89 | stride, 90 | padding=0, 91 | groups=groups, 92 | bias=False, 93 | ), 94 | nn.BatchNorm2d(out_planes), 95 | Swish(), 96 | ) 97 | 98 | def _get_padding(self, kernel_size, stride): 99 | p = max(kernel_size - stride, 0) 100 | return [p // 2, p - p // 2, p // 2, p - p // 2] 101 | 102 | 103 | class SwishImplementation(torch.autograd.Function): 104 | @staticmethod 105 | def forward(ctx, i): 106 | result = i * torch.sigmoid(i) 107 | ctx.save_for_backward(i) 108 | return result 109 | 110 | @staticmethod 111 | def backward(ctx, grad_output): 112 | i = ctx.saved_tensors[0] # saved_variables is deprecated; saved_tensors is the supported accessor 113 | sigmoid_i = torch.sigmoid(i) 114 | return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) 115 | 116 | 117 | class Swish(nn.Module): 118 | def forward(self, x): 119 | return SwishImplementation.apply(x) 120 | 121 | 122 | def _round_repeats(repeats, depth_mult): 123 | if depth_mult == 1.0: 124 | return repeats 125 | return int(math.ceil(depth_mult * repeats)) 126 | 127 | 128 | class SqueezeExcitation(nn.Module): 129 | """Squeeze-Excitation layer used in MBConv.""" 130 | 131 | def __init__(self, in_planes, reduced_dim): 132 | super(SqueezeExcitation, self).__init__() 133 | self.se = nn.Sequential( 134 | nn.AdaptiveAvgPool2d(1), 135 | 
nn.Conv2d(in_planes, reduced_dim, 1),
136 |             Swish(),
137 |             nn.Conv2d(reduced_dim, in_planes, 1),
138 |             nn.Sigmoid(),
139 |         )
140 | 
141 |     def forward(self, x):
142 |         return x * self.se(x)
143 | 
144 | 
145 | class MBConvGenerator(GeneratorAbstract):
146 |     """MBConv block generator."""
147 | 
148 |     def __init__(self, *args, **kwargs):
149 |         super().__init__(*args, **kwargs)
150 | 
151 |     @property
152 |     def out_channel(self) -> int:
153 |         """Get out channel size."""
154 |         return self._get_divisible_channel(self.args[0] * self.width_multiply)
155 | 
156 |     @property
157 |     def base_module(self) -> nn.Module:
158 |         """Returns module class from src.modules based on the class name."""
159 |         return getattr(__import__("src.modules", fromlist=[""]), self.name)
160 | 
161 |     def __call__(self, repeat: int = 1):
162 |         """call method.
163 | 
164 |         MBConv args consist of repeat(=n) and [t, c, s, k];
165 |         note that the original paper's notation is [t, c, n, s].
166 |         """
167 |         module = []
168 |         t, c, s, k = self.args  # c corresponds to self.out_channel
169 |         inp, oup = self.in_channel, self.out_channel
170 |         for i in range(repeat):
171 |             stride = s if i == 0 else 1
172 |             module.append(
173 |                 self.base_module(
174 |                     in_planes=inp,
175 |                     out_planes=oup,
176 |                     expand_ratio=t,
177 |                     stride=stride,
178 |                     kernel_size=k,
179 |                 )
180 |             )
181 |             inp = oup
182 |         return self._get_module(module)
183 | 
--------------------------------------------------------------------------------
/automl/src/modules/poolings.py:
--------------------------------------------------------------------------------
1 | """Module generator related to pooling operations.
2 | 
3 | - Author: Jongkuk Lim
4 | - Contact: lim.jeikei@gmail.com
5 | """
6 | # pylint: disable=useless-super-delegation
7 | from torch import nn
8 | 
9 | from src.modules.base_generator import GeneratorAbstract
10 | 
11 | 
12 | class MaxPoolGenerator(GeneratorAbstract):
13 |     """Max pooling module generator."""
14 | 
15 |     def __init__(self, *args, **kwargs):
16 |         super().__init__(*args, **kwargs)
17 | 
18 |     @property
19 |     def out_channel(self) -> int:
20 |         """Get out channel size."""
21 |         # error: Value of type "Optional[List[int]]" is not indexable
22 |         return self.in_channel
23 | 
24 |     @property
25 |     def base_module(self) -> nn.Module:
26 |         """Base module."""
27 |         return getattr(nn, f"{self.name}2d")
28 | 
29 |     def __call__(self, repeat: int = 1):
30 |         module = (
31 |             [self.base_module(*self.args) for _ in range(repeat)]
32 |             if repeat > 1
33 |             else self.base_module(*self.args)
34 |         )
35 |         return self._get_module(module)
36 | 
37 | 
38 | class AvgPoolGenerator(MaxPoolGenerator):
39 |     """Average pooling module generator."""
40 | 
41 |     def __init__(self, *args, **kwargs):
42 |         super().__init__(*args, **kwargs)
43 | 
44 | 
45 | class GlobalAvgPool(nn.AdaptiveAvgPool2d):
46 |     """Global average pooling module."""
47 | 
48 |     def __init__(self, output_size=1):
49 |         """Initialize."""
50 |         super().__init__(output_size=output_size)
51 | 
52 | 
53 | class GlobalAvgPoolGenerator(GeneratorAbstract):
54 |     """Global average pooling module generator."""
55 | 
56 |     def __init__(self, *args, **kwargs):  # pylint: disable=unused-argument
57 |         super().__init__(*args, **kwargs)
58 |         self.output_size = 1
59 |         if len(args) > 1:
60 |             self.output_size = args[1]
61 | 
62 |     @property
63 |     def out_channel(self) -> int:
64 |         """Get out channel size."""
65 |         return self.in_channel
66 | 
67 |     def __call__(self, repeat: int = 1):
68 |         return self._get_module(GlobalAvgPool(self.output_size))
69 | 
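# --- Editor's example (hedged sketch, not part of the repo) ---
# GlobalAvgPool above is just nn.AdaptiveAvgPool2d(1), so for an (N, C, H, W)
# input it returns the (N, C, 1, 1) per-channel average:
#
#   import torch
#   from src.modules.poolings import GlobalAvgPool
#
#   x = torch.randn(4, 32, 8, 8)
#   assert GlobalAvgPool()(x).shape == (4, 32, 1, 1)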
--------------------------------------------------------------------------------
/automl/src/utils/common.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict, Union
2 | 
3 | import numpy as np
4 | import yaml
5 | from torchvision.datasets import ImageFolder, VisionDataset
6 | import os
7 | import json
8 | from pathlib import Path
9 | 
10 | 
11 | def read_yaml(cfg: Union[str, Dict[str, Any]]):
12 |     if not isinstance(cfg, dict):
13 |         with open(cfg) as f:
14 |             config = yaml.load(f, Loader=yaml.FullLoader)
15 |     else:
16 |         config = cfg
17 |     return config
18 | 
19 | 
20 | def write_yaml(cfg: Union[str, Dict[str, Any]], name, path=""):
21 |     if isinstance(cfg, dict):
22 |         if not os.path.exists(path):
23 |             os.mkdir(path)
24 |         with open(os.path.join(path, name + ".yaml"), "w") as f:
25 |             yaml.dump(cfg, f)
26 |     else:
27 |         raise ValueError("cfg must be a dict to be written as yaml")
28 | 
29 | 
30 | def get_label_counts(dataset_path: str):
31 |     """Counts for each label."""
32 |     if not dataset_path:
33 |         return None
34 |     td = ImageFolder(root=dataset_path)
35 |     # get label distribution
36 |     label_counts = [0] * len(td.classes)
37 |     for p, l in td.samples:
38 |         label_counts[l] += 1
39 |     return label_counts
40 | 
41 | 
42 | def write_json(content, fname):
43 |     fname = Path(fname)
44 |     with fname.open("wt") as handle:
45 |         json.dump(content, handle, indent=4, sort_keys=False)
--------------------------------------------------------------------------------
/automl/src/utils/data.py:
--------------------------------------------------------------------------------
1 | """Utils for model compression.
2 | 
3 | - Author: wlaud1001
4 | - Email: wlaud1001@snu.ac.kr
5 | - Reference:
6 |     https://github.com/j-marple-dev/model_compression
7 | """
8 | 
9 | import random
10 | from multiprocessing import Pool
11 | from typing import Tuple
12 | 
13 | 
14 | def get_rand_bbox_coord(
15 |     w: int, h: int, len_ratio: float
16 | ) -> Tuple[Tuple[int, int], Tuple[int, int]]:
17 |     """Get a coordinate of random box."""
18 |     size_hole_w = int(len_ratio * w)
19 |     size_hole_h = int(len_ratio * h)
20 |     x = random.randint(0, w)  # [0, w]
21 |     y = random.randint(0, h)  # [0, h]
22 | 
23 |     x0 = max(0, x - size_hole_w // 2)
24 |     y0 = max(0, y - size_hole_h // 2)
25 |     x1 = min(w, x + size_hole_w // 2)
26 |     y1 = min(h, y + size_hole_h // 2)
27 |     return (x0, y0), (x1, y1)
28 | 
29 | def weights_for_balanced_classes(subset, nclasses):
30 |     count = [0] * nclasses
31 |     for i in subset:
32 |         count[i[1]] += 1
33 |     weight_per_class = [0.] * nclasses
34 |     N = float(sum(count))
35 |     for i in range(nclasses):
36 |         weight_per_class[i] = N / float(count[i])
37 |     weight = [0] * len(subset)
38 |     for idx, val in enumerate(subset):
39 |         weight[idx] = weight_per_class[val[1]]
40 |     return weight
--------------------------------------------------------------------------------
/automl/src/utils/torch_utils.py:
--------------------------------------------------------------------------------
1 | """Common utility functions.
2 | 
3 | - Author: Jongkuk Lim
4 | - Contact: lim.jeikei@gmail.com
5 | """
6 | 
7 | import math
8 | import os
9 | from typing import List, Optional, Tuple, Union
10 | 
11 | import numpy as np
12 | import torch
13 | from torch import nn
14 | from torch.utils.data import Subset
15 | import random
16 | from .common import write_yaml
17 | 
18 | 
19 | def convert_model_to_torchscript(
20 |     model: nn.Module, path: Optional[str] = None
21 | ) -> torch.jit.ScriptModule:
22 |     """Convert PyTorch Module to TorchScript.
23 | 
24 |     Args:
25 |         model: PyTorch Module.
26 | 
27 |     Return:
28 |         TorchScript module.
29 |     """
30 |     model.eval()
31 |     jit_model = torch.jit.script(model)
32 | 
33 |     if path:
34 |         jit_model.save(path)
35 | 
36 |     return jit_model
37 | 
38 | 
39 | def split_dataset_index(
40 |     train_dataset: torch.utils.data.Dataset, n_data: int, split_ratio: float = 0.1
41 | ) -> Tuple[Subset, Subset]:
42 |     """Split dataset indices with split_ratio.
43 | 
44 |     Args:
45 |         n_data: number of total data
46 |         split_ratio: split ratio (0.0 ~ 1.0)
47 | 
48 |     Returns:
49 |         train subset (indices {split_ratio} ~ 1.0)
50 |         valid subset (indices 0 ~ {split_ratio})
51 |     """
52 |     indices = np.arange(n_data)
53 |     split = int(split_ratio * indices.shape[0])
54 | 
55 |     train_idx = indices[split:]
56 |     valid_idx = indices[:split]
57 | 
58 |     train_subset = Subset(train_dataset, train_idx)
59 |     valid_subset = Subset(train_dataset, valid_idx)
60 | 
61 |     return train_subset, valid_subset
62 | 
63 | 
64 | def save_model(model, path, data, device):
65 |     """Save model state_dict and a TorchScript export."""
66 |     try:
67 |         torch.save(model.state_dict(), f=path)
68 |         ts_path = os.path.splitext(path)[:-1][0] + ".ts"
69 |         convert_model_to_torchscript(model, ts_path)
70 |     except Exception:
71 |         print("Failed to save torch")
72 | 
73 | 
74 | def save_model2(model, path, data, device, model_config):
75 |     """Save model state_dict along with its model config yaml."""
76 |     try:
77 |         if not os.path.exists(path):
78 |             os.mkdir(path)
79 |         torch.save(model.state_dict(), f=os.path.join(path, "result_model.pt"))
80 |         write_yaml(model_config, "model_config", path=path)
81 |     except Exception:
82 |         print("Failed to save torch")
83 | 
84 | 
85 | def model_info(model, verbose=False):
86 |     """Print out model info."""
87 |     n_p = sum(x.numel() for x in model.parameters())  # number parameters
88 |     n_g = sum(
89 |         x.numel() for x in model.parameters() if x.requires_grad
90 |     )  # number gradients
91 |     if verbose:
92 |         print(
93 |             "%5s %40s %9s %12s %20s %10s %10s"
94 |             % ("layer", "name", "gradient", "parameters", "shape", "mu", "sigma")
95 |         )
96 |         for i, (name, p) in enumerate(model.named_parameters()):
97 |             name = name.replace("module_list.", "")
98 |             print(
99 |                 "%5g %40s %9s %12g %20s %10.3g %10.3g"
100 |                 % (
101 |                     i,
102 |                     name,
103 |                     p.requires_grad,
104 |                     p.numel(),
105 |                     list(p.shape),
106 |                     p.mean(),
107 |                     p.std(),
108 |                 )
109 |             )
110 | 
111 |     print(
112 |         f"Model Summary: {len(list(model.modules()))} layers, "
113 |         f"{n_p:,d} parameters, {n_g:,d} gradients"
114 |     )
115 | 
116 | 
117 | @torch.no_grad()
118 | def check_runtime(
119 |     model: nn.Module, word_length: int, device: torch.device, repeat: int = 100
120 | ) -> float:
121 |     # test part
122 |     # device = "cpu"
123 |     # model.to(device)
124 |     # test part
125 | 
126 |     repeat = min(repeat, 20)
127 |     inputs = {
128 |         "input_ids": torch.randint(0, 30000, [1, word_length]).to(device),
129 |         "token_type_ids": torch.randint(0, 1, [1, word_length]).to(device),
130 |         "attention_mask": torch.randint(0, 1, [1, word_length]).to(device),
131 |     }
132 |     measure = []
133 |     start = torch.cuda.Event(enable_timing=True)
134 |     end = torch.cuda.Event(enable_timing=True)
135 | 
136 |     model.eval()
137 |     for _ in range(repeat):
138 |         start.record()
139 |         _ = model(inputs)
140 |         end.record()
141 |         # Waits for everything to finish running
142 |         torch.cuda.synchronize()
143 |         measure.append(start.elapsed_time(end))
144 | 
145 |     measure.sort()
146 |     n = len(measure)
147 |     k = int(round(n * (0.2) / 2))
148 |     trimmed_measure = measure[k + 1 : n - k]
149 | 
150 |     with
torch.autograd.profiler.profile(use_cuda=True) as prof:
151 |         _ = model(inputs)
152 |     print(prof)
153 |     print("measured time(ms)", np.mean(trimmed_measure))
154 |     model.train()
155 |     return np.mean(trimmed_measure)
156 | 
157 | 
158 | def make_divisible(v: float, divisor: int = 8, min_value: Optional[int] = None) -> int:
159 |     """
160 |     This function is taken from the original tf repo.
161 |     It ensures that all layers have a channel number that is divisible by 8
162 |     It can be seen here:
163 |     https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
164 |     """
165 |     if min_value is None:
166 |         min_value = divisor
167 |     new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
168 |     # Make sure that round down does not go down by more than 10%.
169 |     if new_v < 0.9 * v:
170 |         new_v += divisor
171 |     return new_v
172 | 
173 | 
174 | def autopad(
175 |     kernel_size: Union[int, List[int]], padding: Union[int, None] = None
176 | ) -> Union[int, List[int]]:
177 |     """Auto padding calculation for pad='same' in TensorFlow."""
178 |     # Pad to 'same'
179 |     if isinstance(kernel_size, int):
180 |         kernel_size = [kernel_size]
181 | 
182 |     return padding or [x // 2 for x in kernel_size]
183 | 
184 | 
185 | class Activation:
186 |     """Convert string activation name to the activation class."""
187 | 
188 |     def __init__(self, act_type: Union[str, None]) -> None:
189 |         """Convert string activation name to the activation class.
190 | 
191 |         Args:
192 |             act_type: Activation name.
193 | 
194 |         Returns:
195 |             nn.Identity if {act_type} is None.
196 |         """
197 |         self.type = act_type
198 |         self.args = [1] if self.type == "Softmax" else []
199 | 
200 |     def __call__(self) -> nn.Module:
201 |         if self.type is None:
202 |             return nn.Identity()
203 |         elif hasattr(nn, self.type):
204 |             return getattr(nn, self.type)(*self.args)
205 |         else:
206 |             return getattr(
207 |                 __import__("src.modules.activations", fromlist=[""]), self.type
208 |             )()
209 | 
210 | 
211 | if __name__ == "__main__":
212 |     # smoke test; needs a real model instance and a device, e.g.:
213 |     # check_runtime(model, word_length=64, device=torch.device("cuda:0"))
214 |     pass
--------------------------------------------------------------------------------
/automl/tests/test_model_conversion.py:
--------------------------------------------------------------------------------
1 | """Unit test for model conversion to TorchScript.
2 | 3 | - Author: Jongkuk Lim 4 | - Contact: lim.jeikei@gmail.com 5 | """ 6 | 7 | 8 | import os 9 | 10 | import torch 11 | 12 | from src.model import Model 13 | from src.utils.torch_utils import convert_model_to_torchscript 14 | 15 | 16 | class TestModelConversion: 17 | """Test model conversion.""" 18 | 19 | # pylint: disable=no-self-use 20 | 21 | INPUT1 = torch.rand(1, 3, 128, 128) 22 | INPUT2 = torch.rand(8, 3, 256, 256) 23 | SAVE_PATH = "tests/.test_model.ts" 24 | 25 | def _convert_evaluation(self, path: str) -> None: 26 | """Model conversion test.""" 27 | model = Model(path) 28 | convert_model_to_torchscript(model, path=TestModelConversion.SAVE_PATH) 29 | 30 | ts_model = torch.jit.load(TestModelConversion.SAVE_PATH) 31 | 32 | out_tensor1 = ts_model(TestModelConversion.INPUT1) 33 | out_tensor2 = ts_model(TestModelConversion.INPUT2) 34 | 35 | os.remove(TestModelConversion.SAVE_PATH) 36 | assert out_tensor1.shape == torch.Size((1, 9)) 37 | assert out_tensor2.shape == torch.Size((8, 9)) 38 | 39 | def test_mobilenetv3(self): 40 | """Test convert mobilenetv3 model to TorchScript.""" 41 | self._convert_evaluation(os.path.join("configs", "model", "mobilenetv3.yaml")) 42 | 43 | def test_example(self): 44 | """Test convert example model to TorchScript.""" 45 | self._convert_evaluation(os.path.join("configs", "model", "example.yaml")) 46 | 47 | 48 | if __name__ == "__main__": 49 | test = TestModelConversion() 50 | test.test_mobilenetv3() 51 | test.test_example() 52 | -------------------------------------------------------------------------------- /automl/tests/test_model_parser.py: -------------------------------------------------------------------------------- 1 | """Model parse test. 2 | 3 | - Author: Jongkuk Lim 4 | - Contact: lim.jeikei@gmail.com 5 | """ 6 | 7 | import os 8 | 9 | import torch 10 | 11 | from src.model import Model 12 | 13 | 14 | class TestModelParser: 15 | """Test model parser.""" 16 | 17 | # pylint: disable=no-self-use 18 | 19 | INPUT = torch.rand(8, 3, 256, 256) 20 | 21 | def test_mobilenetv3(self): 22 | """Test mobilenetv3 model.""" 23 | model = Model(os.path.join("configs", "model", "mobilenetv3.yaml")) 24 | assert model(TestModelParser.INPUT).shape == torch.Size([8, 9]) 25 | 26 | def test_example(self): 27 | """Test example model.""" 28 | model = Model(os.path.join("configs", "model", "example.yaml")) 29 | assert model(TestModelParser.INPUT).shape == torch.Size([8, 9]) 30 | 31 | 32 | if __name__ == "__main__": 33 | test = TestModelParser() 34 | 35 | test.test_mobilenetv3() 36 | test.test_example() 37 | -------------------------------------------------------------------------------- /automl/train.py: -------------------------------------------------------------------------------- 1 | """Baseline train 2 | - Author: Junghoon Kim 3 | - Contact: placidus36@gmail.com 4 | """ 5 | 6 | import argparse 7 | import os 8 | from datetime import datetime 9 | from typing import Any, Dict, Tuple, Union 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | import yaml 15 | 16 | from src.dataloader import create_dataloader 17 | from src.loss import CustomCriterion 18 | from src.model import Model 19 | from src.trainer import TorchTrainer 20 | from src.utils.common import get_label_counts, read_yaml 21 | from src.utils.torch_utils import check_runtime, model_info 22 | 23 | 24 | def train( 25 | model_config: Dict[str, Any], 26 | data_config: Dict[str, Any], 27 | log_dir: str, 28 | fp16: bool, 29 | device: torch.device, 30 | ) -> Tuple[float, float, float]: 
31 | """Train.""" 32 | # save model_config, data_config 33 | with open(os.path.join(log_dir, "data.yml"), "w") as f: 34 | yaml.dump(data_config, f, default_flow_style=False) 35 | with open(os.path.join(log_dir, "model.yml"), "w") as f: 36 | yaml.dump(model_config, f, default_flow_style=False) 37 | 38 | model_instance = Model(model_config, verbose=True) 39 | model_path = os.path.join(log_dir, "best.pt") 40 | print(f"Model save path: {model_path}") 41 | if os.path.isfile(model_path): 42 | model_instance.model.load_state_dict( 43 | torch.load(model_path, map_location=device) 44 | ) 45 | model_instance.model.to(device) 46 | 47 | # Create dataloader 48 | train_dl, val_dl, test_dl = create_dataloader(data_config) 49 | 50 | # Create optimizer, scheduler, criterion 51 | optimizer = torch.optim.SGD( 52 | model_instance.model.parameters(), lr=data_config["INIT_LR"], momentum=0.9 53 | ) 54 | scheduler = torch.optim.lr_scheduler.OneCycleLR( 55 | optimizer=optimizer, 56 | max_lr=data_config["INIT_LR"], 57 | steps_per_epoch=len(train_dl), 58 | epochs=data_config["EPOCHS"], 59 | pct_start=0.05, 60 | ) 61 | criterion = CustomCriterion( 62 | samples_per_cls=get_label_counts(data_config["DATA_PATH"]) 63 | if data_config["DATASET"] == "TACO" 64 | else None, 65 | device=device, 66 | ) 67 | # Amp loss scaler 68 | scaler = ( 69 | torch.cuda.amp.GradScaler() if fp16 and device != torch.device("cpu") else None 70 | ) 71 | 72 | # Create trainer 73 | trainer = TorchTrainer( 74 | model=model_instance.model, 75 | criterion=criterion, 76 | optimizer=optimizer, 77 | scheduler=scheduler, 78 | scaler=scaler, 79 | device=device, 80 | model_path=model_path, 81 | verbose=1, 82 | ) 83 | best_acc, best_f1 = trainer.train( 84 | train_dataloader=train_dl, 85 | n_epoch=data_config["EPOCHS"], 86 | val_dataloader=val_dl if val_dl else test_dl, 87 | ) 88 | 89 | # evaluate model with test set 90 | model_instance.model.load_state_dict(torch.load(model_path)) 91 | test_loss, test_f1, test_acc = trainer.test( 92 | model=model_instance.model, test_dataloader=val_dl if val_dl else test_dl 93 | ) 94 | return test_loss, test_f1, test_acc 95 | 96 | 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser(description="Train model.") 99 | parser.add_argument( 100 | "--model", 101 | default="configs/model/mobilenetv3.yaml", 102 | type=str, 103 | help="model config", 104 | ) 105 | parser.add_argument( 106 | "--data", default="configs/data/taco.yaml", type=str, help="data config" 107 | ) 108 | args = parser.parse_args() 109 | 110 | model_config = read_yaml(cfg=args.model) 111 | data_config = read_yaml(cfg=args.data) 112 | 113 | data_config["DATA_PATH"] = os.environ.get("SM_CHANNEL_TRAIN", data_config["DATA_PATH"]) 114 | 115 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 116 | log_dir = os.environ.get("SM_MODEL_DIR", os.path.join("exp", 'latest')) 117 | 118 | if os.path.exists(log_dir): 119 | modified = datetime.fromtimestamp(os.path.getmtime(log_dir + '/best.pt')) 120 | new_log_dir = os.path.dirname(log_dir) + '/' + modified.strftime("%Y-%m-%d_%H-%M-%S") 121 | os.rename(log_dir, new_log_dir) 122 | 123 | os.makedirs(log_dir, exist_ok=True) 124 | 125 | test_loss, test_f1, test_acc = train( 126 | model_config=model_config, 127 | data_config=data_config, 128 | log_dir=log_dir, 129 | fp16=data_config["FP16"], 130 | device=device, 131 | ) 132 | 133 | -------------------------------------------------------------------------------- /base/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .base_data_loader import * 2 | from .base_model import * 3 | from .base_trainer import * 4 | -------------------------------------------------------------------------------- /base/base_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import DataLoader 3 | from torch.utils.data.dataloader import default_collate 4 | from torch.utils.data.sampler import SubsetRandomSampler 5 | 6 | 7 | class BaseDataLoader(DataLoader): 8 | """ 9 | Base class for all data loaders 10 | """ 11 | def __init__(self, dataset, batch_size, shuffle, validation_split, num_workers, collate_fn=default_collate): 12 | self.validation_split = validation_split 13 | self.shuffle = shuffle 14 | 15 | self.batch_idx = 0 16 | self.n_samples = len(dataset) 17 | 18 | self.sampler, self.valid_sampler = self._split_sampler(self.validation_split) 19 | 20 | self.init_kwargs = { 21 | 'dataset': dataset, 22 | 'batch_size': batch_size, 23 | 'shuffle': self.shuffle, 24 | 'collate_fn': collate_fn, 25 | 'num_workers': num_workers 26 | } 27 | super().__init__(sampler=self.sampler, **self.init_kwargs) 28 | 29 | def _split_sampler(self, split): 30 | if split == 0.0: 31 | return None, None 32 | 33 | idx_full = np.arange(self.n_samples) 34 | 35 | np.random.seed(0) 36 | np.random.shuffle(idx_full) 37 | 38 | if isinstance(split, int): 39 | assert split > 0 40 | assert split < self.n_samples, "validation set size is configured to be larger than entire dataset." 41 | len_valid = split 42 | else: 43 | len_valid = int(self.n_samples * split) 44 | 45 | valid_idx = idx_full[0:len_valid] 46 | train_idx = np.delete(idx_full, np.arange(0, len_valid)) 47 | 48 | train_sampler = SubsetRandomSampler(train_idx) 49 | valid_sampler = SubsetRandomSampler(valid_idx) 50 | 51 | # turn off shuffle option which is mutually exclusive with sampler 52 | self.shuffle = False 53 | self.n_samples = len(train_idx) 54 | 55 | return train_sampler, valid_sampler 56 | 57 | def split_validation(self): 58 | if self.valid_sampler is None: 59 | return None 60 | else: 61 | return DataLoader(sampler=self.valid_sampler, **self.init_kwargs) 62 | -------------------------------------------------------------------------------- /base/base_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import numpy as np 3 | from abc import abstractmethod 4 | 5 | 6 | class BaseModel(nn.Module): 7 | """ 8 | Base class for all models 9 | """ 10 | @abstractmethod 11 | def forward(self, *inputs): 12 | """ 13 | Forward pass logic 14 | 15 | :return: Model output 16 | """ 17 | raise NotImplementedError 18 | 19 | def __str__(self): 20 | """ 21 | Model prints with number of trainable parameters 22 | """ 23 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 24 | params = sum([np.prod(p.size()) for p in model_parameters]) 25 | return super().__str__() + '\nTrainable parameters: {}'.format(params) 26 | -------------------------------------------------------------------------------- /base/base_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import shutil 4 | from abc import abstractmethod 5 | from numpy import inf 6 | from utils import write_json 7 | 8 | 9 | class BaseTrainer: 10 | """ 11 | Base class for all trainers 12 | """ 13 | def __init__(self, model, 
criterion, metric_ftns, optimizer, config): 14 | self.config = config 15 | self.logger = config.get_logger('trainer', config['trainer']['verbosity']) 16 | 17 | self.model = model 18 | self.criterion = criterion 19 | self.metric_ftns = metric_ftns 20 | self.optimizer = optimizer 21 | 22 | cfg_trainer = config['trainer'] 23 | self.epochs = cfg_trainer['epochs'] 24 | self.save_steps = cfg_trainer['save']['steps'] 25 | self.save_limits = cfg_trainer['save']['limits'] 26 | self.monitor = cfg_trainer.get('monitor', 'off') 27 | 28 | # configuration to monitor model performance and save best 29 | if self.monitor == 'off': 30 | self.mnt_mode = 'off' 31 | self.mnt_best = 0 32 | else: 33 | self.mnt_mode, self.mnt_metric = self.monitor.split() 34 | assert self.mnt_mode in ['min', 'max'] 35 | 36 | self.mnt_best = inf if self.mnt_mode == 'min' else -inf 37 | self.early_stop = cfg_trainer.get('early_stop', inf) 38 | if self.early_stop <= 0: 39 | self.early_stop = inf 40 | 41 | self.not_improved_count = 0 42 | 43 | self.checkpoint_dir = cfg_trainer['save']['dir'] 44 | 45 | if config.resume is not None: 46 | self._resume_checkpoint(config.resume) 47 | 48 | @abstractmethod 49 | def train(self): 50 | """ 51 | Full training logic. 52 | """ 53 | 54 | raise NotImplementedError 55 | 56 | @abstractmethod 57 | def _validation(self, step): 58 | """ 59 | Full validation logic 60 | 61 | :param step: Current step number 62 | """ 63 | 64 | raise NotImplementedError 65 | 66 | def _evaluate_performance(self, log): 67 | # evaluate model performance according to configured metric, save best checkpoint as model_best 68 | is_best = False 69 | if self.mnt_mode != 'off': 70 | try: 71 | # check whether model performance improved or not, according to specified metric(mnt_metric) 72 | improved = (self.mnt_mode == 'min' and log[self.mnt_metric] <= self.mnt_best) or \ 73 | (self.mnt_mode == 'max' and log[self.mnt_metric] >= self.mnt_best) 74 | except KeyError: 75 | self.logger.warning("Warning: Metric '{}' is not found. 
" 76 | "Model performance monitoring is disabled.".format(self.mnt_metric)) 77 | self.mnt_mode = 'off' 78 | improved = False 79 | 80 | if improved: 81 | self.mnt_best = log[self.mnt_metric] 82 | self.not_improved_count = 0 83 | is_best = True 84 | else: 85 | self.not_improved_count += 1 86 | 87 | return is_best 88 | 89 | def _save_checkpoint(self, log, is_best=False): 90 | """ 91 | Saving checkpoints 92 | 93 | :param epoch: current epoch number 94 | :param log: logging information of the epoch 95 | :param save_best: if True, rename the saved checkpoint to 'best_model.pt' 96 | """ 97 | save_path = f'{self.checkpoint_dir}models/{self.config["name"]}/' 98 | chk_pt_path = save_path + f"steps_{log['steps']}/" 99 | 100 | # make path if there isn't 101 | if not os.path.exists(chk_pt_path): 102 | os.makedirs(chk_pt_path) 103 | # delete the oldest checkpoint not to exceed save limits 104 | if len(os.listdir(save_path)) > self.save_limits: 105 | shutil.rmtree(os.path.join( 106 | save_path, 107 | sorted(os.listdir(save_path),key = lambda x : (len(x), x))[0] 108 | ) 109 | ) 110 | 111 | self.logger.info("Saving checkpoint: {} ...".format(chk_pt_path)) 112 | torch.save(self.model, os.path.join(chk_pt_path, "model.pt")) 113 | torch.save(self.optimizer.state_dict(), os.path.join(chk_pt_path, "optimizer.pt")) 114 | 115 | # save updated config file to the checkpoint dir 116 | write_json(self.config._config, os.path.join(chk_pt_path, "config.json")) 117 | write_json(log, os.path.join(chk_pt_path, "log.json")) 118 | 119 | # save best model. 120 | if is_best: 121 | best_path = f'{self.checkpoint_dir}best/{self.config["name"]}/' 122 | 123 | # make path if there isn't 124 | if not os.path.exists(best_path): 125 | os.makedirs(best_path) 126 | # delete old best files 127 | for file_ in os.listdir(best_path): 128 | os.remove(best_path + file_) 129 | 130 | self.logger.info("Saving current best: model_best.pt ...") 131 | torch.save(self.model, os.path.join(best_path, "best_model.pt")) 132 | torch.save(self.optimizer.state_dict(), os.path.join(best_path, "optimizer.pt")) 133 | 134 | # save updated config file to the checkpoint dir 135 | write_json(self.config._config, os.path.join(best_path, "config.json")) 136 | write_json(log, os.path.join(best_path, "log.json")) 137 | 138 | def _resume_checkpoint(self, resume_path): 139 | """ 140 | Resume from saved checkpoints 141 | 142 | :param resume_path: Checkpoint path to be resumed 143 | """ 144 | resume_path = str(resume_path) 145 | self.logger.info("Loading checkpoint: {} ...".format(resume_path)) 146 | checkpoint = torch.load(resume_path) 147 | self.start_epoch = checkpoint['epoch'] + 1 148 | self.mnt_best = checkpoint['monitor_best'] 149 | 150 | # load architecture params from checkpoint. 151 | if checkpoint['config']['arch'] != self.config['arch']: 152 | self.logger.warning("Warning: Architecture configuration given in config file is different from that of " 153 | "checkpoint. This may yield an exception while state_dict is being loaded.") 154 | self.model.load_state_dict(checkpoint['state_dict']) 155 | 156 | # load optimizer state from checkpoint only when optimizer type is not changed. 157 | if checkpoint['config']['optimizer']['type'] != self.config['optimizer']['type']: 158 | self.logger.warning("Warning: Optimizer type given in config file is different from that of checkpoint. " 159 | "Optimizer parameters not being resumed.") 160 | else: 161 | self.optimizer.load_state_dict(checkpoint['optimizer']) 162 | 163 | self.logger.info("Checkpoint loaded. 
Resume training from epoch {}".format(self.start_epoch)) 164 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "beep_kcELECTRA_base_hate", 3 | "n_gpu": 1, 4 | 5 | "model": { 6 | "type": "BeepKcElectraHateModel", 7 | "args": { 8 | "name": "beomi/beep-KcELECTRA-base-hate", 9 | "num_classes": 3 10 | } 11 | }, 12 | "tokenizer": { 13 | "type": "tokenizer/" 14 | }, 15 | "data_loader": { 16 | "type": "MnistDataLoader", 17 | "args":{ 18 | "data_dir": "AI-it/korean-hate-speech", 19 | "batch_size": 16, 20 | "max_length": 64, 21 | "shuffle": true, 22 | "validation_split": 0.1, 23 | "num_workers": 2 24 | }, 25 | "data_files": { 26 | "train": "train_hate.csv", 27 | "valid": "dev_hate.csv" 28 | }, 29 | "test_data_file": { 30 | "test": "test_hate_no_label.csv" 31 | } 32 | }, 33 | "optimizer": { 34 | "type": "AdamW", 35 | "args":{ 36 | "lr": 5e-5, 37 | "eps": 1e-8 38 | }, 39 | "weight_decay": 0.0 40 | }, 41 | "loss": "softmax", 42 | "metrics": [ 43 | "macro_f1" 44 | ], 45 | "lr_scheduler": { 46 | "type": "CosineAnnealingLR", 47 | "args": { 48 | "T_max": 300, 49 | "eta_min": 1e-5 50 | } 51 | }, 52 | "trainer": { 53 | "epochs": 2, 54 | 55 | "save": { 56 | "dir": "saved/", 57 | "steps": 300, 58 | "limits": 3 59 | }, 60 | "verbosity": 2, 61 | 62 | "monitor": "max val/macro_f1", 63 | "early_stop": 2, 64 | 65 | "fp16": false 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /config_automl_test.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "beep_kcELECTRA_base_hate", 3 | "n_gpu": 1, 4 | "model": { 5 | "type": "BeepKcElectraHateModel", 6 | "args": { 7 | "name": "beomi/beep-KcELECTRA-base-hate", 8 | "num_classes": 3 9 | } 10 | }, 11 | "data_loader": { 12 | "type": "MnistDataLoader", 13 | "args": { 14 | "data_dir": "data/", 15 | "batch_size": 32, 16 | "max_length": 64, 17 | "shuffle": true, 18 | "validation_split": 0.1, 19 | "num_workers": 2 20 | } 21 | }, 22 | "optimizer": { 23 | "type": "AdamW", 24 | "args": { 25 | "lr": 5e-5, 26 | "eps": 1e-8 27 | }, 28 | "weight_decay": 0.0 29 | }, 30 | "loss": "softmax", 31 | "metrics": [ 32 | "macro_f1" 33 | ], 34 | "lr_scheduler": { 35 | "type": "StepLR", 36 | "args": { 37 | "step_size": 50, 38 | "gamma": 0.1 39 | } 40 | }, 41 | "trainer": { 42 | "epochs": 2, 43 | "save": { 44 | "dir": "saved/", 45 | "steps": 300, 46 | "limits": 3 47 | }, 48 | "verbosity": 2, 49 | "monitor": "max val/macro_f1", 50 | "early_stop": 2 51 | }, 52 | "data_dir": "AI-it/korean-hate-speech", 53 | "data_files": { 54 | "train": "train_hate.csv", 55 | "valid": "dev_hate.csv" 56 | }, 57 | "test_data_file": { 58 | "test": "test_hate_no_label.csv" 59 | }, 60 | "saved_folder": { 61 | "path": "automl/save", 62 | "trial": 2, 63 | "model_config": "model_config.yaml", 64 | "model_weight": "result_model.pt" 65 | } 66 | } -------------------------------------------------------------------------------- /data_loader/data_loaders.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import torch 6 | from torchvision import datasets, transforms 7 | from torch.utils.data import Dataset, DataLoader 8 | 9 | from torchsampler import ImbalancedDatasetSampler 10 | from transformers import PreTrainedTokenizer 11 | from utils import Preprocess, preprocess 12 | from 
datasets import load_dataset 13 | from base import BaseDataLoader 14 | 15 | LABEL_2_IDX = { 16 | "none": 0, 17 | "offensive": 1, 18 | "hate": 2 19 | } 20 | IDX_2_LABEL = { 21 | 0: "none", 22 | 1: "offensive", 23 | 2: "hate" 24 | } 25 | 26 | 27 | class KhsDataLoader(DataLoader): 28 | def __init__(self, tokenizer: PreTrainedTokenizer, max_length: int = None): 29 | self.tokenizer = tokenizer 30 | self.max_length = max_length if max_length else self.tokenizer.model_max_length 31 | 32 | def train_collate_fn(self, input_examples): 33 | input_texts, input_labels = [], [] 34 | for input_example in input_examples: 35 | text, label = input_example 36 | input_texts.append(text) 37 | input_labels.append(label) 38 | 39 | encoded_texts = self.tokenizer.batch_encode_plus( 40 | input_texts, 41 | add_special_tokens=True, 42 | max_length=self.max_length, 43 | truncation=True, 44 | padding="max_length", 45 | return_tensors="pt", 46 | return_token_type_ids=True, 47 | return_attention_mask=True, 48 | ) # input_ids, token_type_ids, attention_mask 49 | 50 | input_ids = encoded_texts["input_ids"] 51 | token_type_ids = encoded_texts["token_type_ids"] 52 | attention_mask = encoded_texts["attention_mask"] 53 | 54 | return input_ids, token_type_ids, attention_mask, torch.tensor(input_labels) 55 | 56 | def test_collate_fn(self, input_examples): 57 | input_texts = [] 58 | for input_example in input_examples: 59 | text = input_example 60 | input_texts.append(text) 61 | 62 | encoded_texts = self.tokenizer.batch_encode_plus( 63 | input_texts, 64 | add_special_tokens=True, 65 | max_length=self.max_length, 66 | truncation=True, 67 | padding="max_length", 68 | return_tensors="pt", 69 | return_token_type_ids=True, 70 | return_attention_mask=True, 71 | ) # input_ids, token_type_ids, attention_mask 72 | 73 | input_ids = encoded_texts["input_ids"] 74 | token_type_ids = encoded_texts["token_type_ids"] 75 | attention_mask = encoded_texts["attention_mask"] 76 | 77 | return input_ids, token_type_ids, attention_mask 78 | 79 | def get_dataloader(self, name, data_dir, data_files, batch_size, **kwargs): 80 | data_files = dict(data_files) 81 | datasets = load_dataset(data_dir, data_files=data_files, use_auth_token=True) 82 | dataset = get_preprocessed_data(datasets[name], name) 83 | dataset = KhsDataset(dataset, name) 84 | 85 | sampler = None 86 | 87 | if name == "test": 88 | collate_fn = self.test_collate_fn 89 | else: 90 | collate_fn = self.train_collate_fn 91 | sampler = ImbalancedDatasetSampler(dataset) 92 | 93 | g = torch.Generator() 94 | g.manual_seed(0) 95 | 96 | return DataLoader( 97 | dataset, 98 | batch_size=batch_size, 99 | shuffle=False, 100 | sampler=sampler, 101 | collate_fn=collate_fn, 102 | num_workers=4, 103 | pin_memory=True, 104 | worker_init_fn=seed_worker, 105 | generator=g, 106 | **kwargs 107 | ) 108 | 109 | 110 | class KhsDataset(Dataset): 111 | def __init__(self, data, data_type="train"): 112 | self.data_type = data_type 113 | self.texts = list(data.texts) 114 | if self.data_type == "train" or self.data_type == "valid": 115 | self.labels = list(data.labels) 116 | 117 | def __len__(self): 118 | return len(self.texts) 119 | 120 | def __getitem__(self, index): 121 | text = self.texts[index] 122 | 123 | if self.data_type == "train" or self.data_type == "valid": 124 | label = self.labels[index] 125 | converted_label = LABEL_2_IDX[label] 126 | 127 | return text, converted_label 128 | 129 | return text 130 | 131 | 132 | def get_preprocessed_data(dataset, name): 133 | if name == "test": 134 | preprocessed_sents = 
preprocess(dataset["comments"]) 135 | out_dataset = pd.DataFrame( 136 | { 137 | "texts": preprocessed_sents, 138 | } 139 | ) 140 | else: 141 | preprocessed_sents = preprocess(dataset["comments"]) 142 | out_dataset = pd.DataFrame( 143 | {"texts": preprocessed_sents, "labels": dataset["label"]} 144 | ) 145 | 146 | return out_dataset 147 | 148 | 149 | # https://pytorch.org/docs/stable/notes/randomness.html 150 | def seed_worker(worker_id): 151 | worker_seed = torch.initial_seed() % 2**32 152 | np.random.seed(worker_seed) 153 | random.seed(worker_seed) -------------------------------------------------------------------------------- /data_loader/kd_data_loaders.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import torch 6 | from torchvision import datasets, transforms 7 | from torch.utils.data import Dataset, DataLoader 8 | 9 | from torchsampler import ImbalancedDatasetSampler 10 | from transformers import PreTrainedTokenizer 11 | from utils import Preprocess, preprocess 12 | from datasets import load_dataset 13 | from base import BaseDataLoader 14 | 15 | LABEL_2_IDX = { 16 | "none": 0, 17 | "offensive": 1, 18 | "hate": 2 19 | } 20 | IDX_2_LABEL = { 21 | 0: "none", 22 | 1: "offensive", 23 | 2: "hate" 24 | } 25 | 26 | 27 | class KhsDataLoader(DataLoader): 28 | def __init__( 29 | self, 30 | student_tokenizer: PreTrainedTokenizer, 31 | teacher_tokenizer: PreTrainedTokenizer, 32 | max_length: int = None 33 | ): 34 | self.student_tokenizer = student_tokenizer 35 | self.teacher_tokenizer = teacher_tokenizer 36 | self.max_length = max_length if max_length else self.student_tokenizer.model_max_length 37 | 38 | def train_collate_fn(self, input_examples): 39 | input_texts, input_labels = [], [] 40 | 41 | for input_example in input_examples: 42 | text, label = input_example 43 | input_texts.append(text) 44 | input_labels.append(label) 45 | 46 | st_encoded_texts = self.student_tokenizer.batch_encode_plus( 47 | input_texts, 48 | add_special_tokens=True, 49 | max_length=self.max_length, 50 | truncation=True, 51 | padding="max_length", 52 | return_tensors="pt", 53 | return_token_type_ids=True, 54 | return_attention_mask=True, 55 | ) # input_ids, token_type_ids, attention_mask 56 | 57 | tc_encoded_texts = self.teacher_tokenizer.batch_encode_plus( 58 | input_texts, 59 | add_special_tokens=True, 60 | max_length=self.max_length, 61 | truncation=True, 62 | padding="max_length", 63 | return_tensors="pt", 64 | return_token_type_ids=True, 65 | return_attention_mask=True, 66 | ) # input_ids, token_type_ids, attention_mask 67 | 68 | st_input_ids = st_encoded_texts["input_ids"] 69 | st_token_type_ids = st_encoded_texts["token_type_ids"] 70 | st_attention_mask = st_encoded_texts["attention_mask"] 71 | 72 | tc_input_ids = tc_encoded_texts["input_ids"] 73 | tc_token_type_ids = tc_encoded_texts["token_type_ids"] 74 | tc_attention_mask = tc_encoded_texts["attention_mask"] 75 | 76 | return st_input_ids, st_token_type_ids, st_attention_mask, tc_input_ids, tc_token_type_ids, tc_attention_mask, torch.tensor(input_labels) 77 | 78 | def valid_collate_fn(self, input_examples): 79 | input_texts, input_labels = [], [] 80 | 81 | for input_example in input_examples: 82 | text, label = input_example 83 | input_texts.append(text) 84 | input_labels.append(label) 85 | 86 | encoded_texts = self.student_tokenizer.batch_encode_plus( 87 | input_texts, 88 | add_special_tokens=True, 89 | max_length=self.max_length, 90 | truncation=True, 
91 | padding="max_length", 92 | return_tensors="pt", 93 | return_token_type_ids=True, 94 | return_attention_mask=True, 95 | ) # input_ids, token_type_ids, attention_mask 96 | 97 | input_ids = encoded_texts["input_ids"] 98 | token_type_ids = encoded_texts["token_type_ids"] 99 | attention_mask = encoded_texts["attention_mask"] 100 | 101 | return input_ids, token_type_ids, attention_mask, torch.tensor(input_labels) 102 | 103 | def test_collate_fn(self, input_examples): 104 | input_texts = [] 105 | for input_example in input_examples: 106 | text = input_example 107 | input_texts.append(text) 108 | 109 | encoded_texts = self.tokenizer.batch_encode_plus( 110 | input_texts, 111 | add_special_tokens=True, 112 | max_length=self.max_length, 113 | truncation=True, 114 | padding="max_length", 115 | return_tensors="pt", 116 | return_token_type_ids=True, 117 | return_attention_mask=True, 118 | ) # input_ids, token_type_ids, attention_mask 119 | 120 | input_ids = encoded_texts["input_ids"] 121 | token_type_ids = encoded_texts["token_type_ids"] 122 | attention_mask = encoded_texts["attention_mask"] 123 | 124 | return input_ids, token_type_ids, attention_mask 125 | 126 | def get_dataloader(self, name, data_dir, data_files, batch_size, **kwargs): 127 | data_files = dict(data_files) 128 | datasets = load_dataset(data_dir, data_files=data_files, use_auth_token=True) 129 | dataset = get_preprocessed_data(datasets[name], name) 130 | dataset = KhsDataset(dataset, name) 131 | 132 | sampler = None 133 | 134 | if name == "test": 135 | collate_fn = self.test_collate_fn 136 | elif name == "valid": 137 | collate_fn = self.valid_collate_fn 138 | else: 139 | collate_fn = self.train_collate_fn 140 | sampler = ImbalancedDatasetSampler(dataset) 141 | 142 | g = torch.Generator() 143 | g.manual_seed(0) 144 | 145 | return DataLoader( 146 | dataset, 147 | batch_size=batch_size, 148 | shuffle=False, 149 | sampler=sampler, 150 | collate_fn=collate_fn, 151 | num_workers=4, 152 | pin_memory=True, 153 | worker_init_fn=seed_worker, 154 | generator=g, 155 | **kwargs 156 | ) 157 | 158 | 159 | class KhsDataset(Dataset): 160 | def __init__(self, data, data_type="train"): 161 | self.data_type = data_type 162 | self.texts = list(data.texts) 163 | if self.data_type == "train" or self.data_type == "valid": 164 | self.labels = list(data.labels) 165 | 166 | def __len__(self): 167 | return len(self.texts) 168 | 169 | def __getitem__(self, index): 170 | text = self.texts[index] 171 | 172 | if self.data_type == "train" or self.data_type == "valid": 173 | label = self.labels[index] 174 | converted_label = LABEL_2_IDX[label] 175 | 176 | return text, converted_label 177 | 178 | return text 179 | 180 | 181 | def get_preprocessed_data(dataset, name): 182 | if name == "test": 183 | preprocessed_sents = preprocess(dataset["comments"]) 184 | out_dataset = pd.DataFrame( 185 | { 186 | "texts": preprocessed_sents, 187 | } 188 | ) 189 | else: 190 | preprocessed_sents = preprocess(dataset["comments"]) 191 | out_dataset = pd.DataFrame( 192 | {"texts": preprocessed_sents, "labels": dataset["label"]} 193 | ) 194 | 195 | return out_dataset 196 | 197 | 198 | # https://pytorch.org/docs/stable/notes/randomness.html 199 | def seed_worker(worker_id): 200 | worker_seed = torch.initial_seed() % 2**32 201 | np.random.seed(worker_seed) 202 | random.seed(worker_seed) -------------------------------------------------------------------------------- /kd_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": 
"koelectra_kd_model", 3 | "n_gpu": 1, 4 | 5 | "model": { 6 | "type": "BeepKcElectraHateModel", 7 | "args": { 8 | "name": "monologg/koelectra-small-v3-discriminator", 9 | "num_classes": 3 10 | } 11 | }, 12 | "teacher_model": { 13 | "type": "BeepKcElectraHateModel", 14 | "args": { 15 | "name": "beomi/beep-KcELECTRA-base-hate", 16 | "num_classes": 3 17 | } 18 | }, 19 | "tokenizer": { 20 | "student": { 21 | "type": "monologg/koelectra-small-v3-discriminator" 22 | }, 23 | "teacher": { 24 | "type": "tokenizer/" 25 | } 26 | }, 27 | "data_loader": { 28 | "type": "MnistDataLoader", 29 | "args":{ 30 | "data_dir": "AI-it/korean-hate-speech", 31 | "batch_size": 64, 32 | "max_length": 64, 33 | "shuffle": true, 34 | "validation_split": 0.1, 35 | "num_workers": 2 36 | }, 37 | "data_files": { 38 | "train": "train_hate.csv", 39 | "valid": "dev_hate.csv" 40 | }, 41 | "test_data_file": { 42 | "test": "test_hate_no_label.csv" 43 | } 44 | }, 45 | "optimizer": { 46 | "type": "AdamW", 47 | "args":{ 48 | "lr": 5e-5, 49 | "eps": 1e-8 50 | }, 51 | "weight_decay": 0.0 52 | }, 53 | "loss": "knowledge_distillation_loss", 54 | "metrics": [ 55 | "macro_f1" 56 | ], 57 | "lr_scheduler": { 58 | "type": "CosineAnnealingLR", 59 | "args": { 60 | "T_max": 300, 61 | "eta_min": 1e-5 62 | } 63 | }, 64 | "trainer": { 65 | "epochs": 2, 66 | 67 | "save": { 68 | "dir": "saved/", 69 | "steps": 300, 70 | "limits": 3 71 | }, 72 | "verbosity": 2, 73 | 74 | "monitor": "max val/macro_f1", 75 | "early_stop": 5 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /kd_train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import wandb 3 | import random 4 | import argparse 5 | import numpy as np 6 | import model.loss as module_loss 7 | import model.metric as module_metric 8 | import model.model as module_arch 9 | import data_loader.kd_data_loaders as module_data 10 | from parse_config import ConfigParser 11 | from trainer import KnowDistTrainer 12 | from utils import prepare_device 13 | from transformers import AutoTokenizer 14 | from data_loader.kd_data_loaders import KhsDataLoader 15 | 16 | 17 | def seed_everything(seed): 18 | """ 19 | fix random seeds for reproducibility. 
20 |     Args:
21 |         seed (int):
22 |             seed number
23 |     """
24 |     torch.manual_seed(seed)
25 |     torch.cuda.manual_seed(seed)
26 |     torch.cuda.manual_seed_all(seed)  # if use multi-GPU
27 |     torch.backends.cudnn.deterministic = True
28 |     torch.backends.cudnn.benchmark = False
29 |     np.random.seed(seed)
30 |     random.seed(seed)
31 | 
32 | 
33 | def main(config):
34 |     seed_everything(42)
35 |     wandb.init(project='#TODO', entity='#TODO', config=config)
36 | 
37 |     # build model architecture, then print to console
38 |     student_model = config.init_obj('model', module_arch)
39 |     teacher_model = config.init_obj('teacher_model', module_arch)
40 | 
41 |     # build tokenizer
42 |     student_tokenizer = AutoTokenizer.from_pretrained(config['tokenizer']['student']['type'])
43 |     teacher_tokenizer = AutoTokenizer.from_pretrained(config['tokenizer']['teacher']['type'])
44 | 
45 |     # build train and valid dataloader
46 |     dataloader = KhsDataLoader(
47 |         student_tokenizer,
48 |         teacher_tokenizer,
49 |         max_length=config['data_loader']['args']['max_length']
50 |     )
51 |     train_data_loader = dataloader.get_dataloader(
52 |         name='train',
53 |         data_dir=config['data_loader']['args']['data_dir'],
54 |         data_files=config['data_loader']['data_files'],
55 |         batch_size=config['data_loader']['args']['batch_size']
56 |     )
57 |     valid_data_loader = dataloader.get_dataloader(
58 |         name='valid',
59 |         data_dir=config['data_loader']['args']['data_dir'],
60 |         data_files=config['data_loader']['data_files'],
61 |         batch_size=config['data_loader']['args']['batch_size']
62 |     )
63 | 
64 |     # prepare for (multi-device) GPU training
65 |     device, device_ids = prepare_device(config['n_gpu'])
66 |     student_model = student_model.to(device)
67 |     teacher_model = teacher_model.to(device)
68 | 
69 |     # get function handles of loss and metrics
70 |     criterion = getattr(module_loss, config['loss'])
71 |     metrics = [getattr(module_metric, met) for met in config['metrics']]
72 | 
73 |     # build optimizer and learning rate scheduler; delete every line containing lr_scheduler to disable the scheduler
74 |     no_decay = ['bias', 'LayerNorm.weight']
75 |     trainable_params = [
76 |         {
77 |             'params': [p for n, p in student_model.named_parameters() if not any(nd in n for nd in no_decay)],
78 |             'weight_decay': config['optimizer']['weight_decay']
79 |         },
80 |         {
81 |             'params': [p for n, p in student_model.named_parameters() if any(nd in n for nd in no_decay)],
82 |             'weight_decay': 0.0
83 |         }
84 |     ]
85 | 
86 |     optimizer = config.init_obj('optimizer', torch.optim, trainable_params)
87 |     lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler, optimizer)
88 | 
89 |     trainer = KnowDistTrainer(
90 |         student_model,
91 |         teacher_model,
92 |         criterion,
93 |         metrics,
94 |         optimizer,
95 |         config=config,
96 |         device=device,
97 |         data_loader=train_data_loader,
98 |         valid_data_loader=valid_data_loader,
99 |         lr_scheduler=lr_scheduler
100 |     )
101 | 
102 |     trainer.train()
103 | 
104 | 
105 | if __name__ == '__main__':
106 |     args = argparse.ArgumentParser(description='PyTorch Template')
107 |     args.add_argument('-c', '--config', default=None, type=str,
108 |                       help='config file path (default: None)')
109 |     args.add_argument('-r', '--resume', default=None, type=str,
110 |                       help='path to latest checkpoint (default: None)')
111 |     args.add_argument('-d', '--device', default=None, type=str,
112 |                       help='indices of GPUs to enable (default: all)')
113 | 
114 |     # custom cli options to modify configuration from default values given in json file. 
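# (Editor's example, hedged; the checkpoint path is illustrative) typical invocations:
#   python kd_train.py -c kd_config.json -d 0
#   python kd_train.py -r saved/models/koelectra_kd_model/steps_300/model.pt -d 0
# -d sets CUDA_VISIBLE_DEVICES inside ConfigParser.from_args, and -r makes it read
# the config.json saved next to the checkpoint.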
115 | config = ConfigParser.from_args(args) 116 | main(config) 117 | -------------------------------------------------------------------------------- /logger/__init__.py: -------------------------------------------------------------------------------- 1 | from .logger import * 2 | -------------------------------------------------------------------------------- /logger/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | from pathlib import Path 4 | from utils import read_json 5 | 6 | 7 | def setup_logging(save_dir, log_config='logger/logger_config.json', default_level=logging.INFO): 8 | """ 9 | Setup logging configuration 10 | """ 11 | log_config = Path(log_config) 12 | if log_config.is_file(): 13 | config = read_json(log_config) 14 | # modify logging paths based on run config 15 | for _, handler in config['handlers'].items(): 16 | if 'filename' in handler: 17 | handler['filename'] = str(save_dir / handler['filename']) 18 | 19 | logging.config.dictConfig(config) 20 | else: 21 | print("Warning: logging configuration file is not found in {}.".format(log_config)) 22 | logging.basicConfig(level=default_level) 23 | -------------------------------------------------------------------------------- /logger/logger_config.json: -------------------------------------------------------------------------------- 1 | 2 | { 3 | "version": 1, 4 | "disable_existing_loggers": false, 5 | "formatters": { 6 | "simple": {"format": "%(message)s"}, 7 | "datetime": {"format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"} 8 | }, 9 | "handlers": { 10 | "console": { 11 | "class": "logging.StreamHandler", 12 | "level": "DEBUG", 13 | "formatter": "simple", 14 | "stream": "ext://sys.stdout" 15 | }, 16 | "info_file_handler": { 17 | "class": "logging.handlers.RotatingFileHandler", 18 | "level": "INFO", 19 | "formatter": "datetime", 20 | "filename": "info.log", 21 | "maxBytes": 10485760, 22 | "backupCount": 20, "encoding": "utf8" 23 | } 24 | }, 25 | "root": { 26 | "level": "INFO", 27 | "handlers": [ 28 | "console", 29 | "info_file_handler" 30 | ] 31 | } 32 | } -------------------------------------------------------------------------------- /model/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def nll_loss(output, target): 7 | return F.nll_loss(output, target) 8 | 9 | def softmax(output, target): 10 | loss = nn.CrossEntropyLoss() 11 | return loss(output, target) 12 | 13 | def knowledge_distillation_loss(logits, target, teacher_logits): 14 | alpha = 0.3 15 | T = 1 16 | 17 | student_loss = F.cross_entropy(input=logits, target=target) 18 | distillation_loss = nn.KLDivLoss(reduction='batchmean')(F.log_softmax(logits/T, dim=1), F.softmax(teacher_logits/T, dim=1)) * (T * T) 19 | total_loss = (1. - alpha)*student_loss + alpha*distillation_loss 20 | 21 | return total_loss -------------------------------------------------------------------------------- /model/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import _LRScheduler 2 | 3 | class NoamLR(_LRScheduler): 4 | """ 5 | Implements the Noam Learning rate schedule. 
This corresponds to increasing the learning rate
6 |     linearly for the first ``warmup_steps`` training steps, and decreasing it thereafter proportionally
7 |     to the inverse square root of the step number, scaled by the inverse square root of the
8 |     dimensionality of the model. Time will tell if this is just madness or it's actually important.
9 |     Parameters
10 |     ----------
11 |     warmup_steps: ``int``, required.
12 |         The number of steps to linearly increase the learning rate.
13 | 
14 |     https://github.com/tugstugi/pytorch-saltnet/blob/master/utils/lr_scheduler.py
15 |     """
16 |     def __init__(self, optimizer, warmup_steps):
17 |         self.warmup_steps = warmup_steps
18 |         super().__init__(optimizer)
19 | 
20 |     def get_lr(self):
21 |         last_epoch = max(1, self.last_epoch)
22 |         scale = self.warmup_steps ** 0.5 * min(last_epoch ** (-0.5), last_epoch * self.warmup_steps ** (-1.5))
23 |         return [base_lr * scale for base_lr in self.base_lrs]
--------------------------------------------------------------------------------
/model/metric.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import warnings
3 | warnings.filterwarnings('ignore')
4 | import sklearn.metrics
5 | 
6 | LABEL_LIST = [
7 |     "hate",
8 |     "offensive",
9 |     "none"
10 | ]
11 | 
12 | def accuracy(output, target):
13 |     with torch.no_grad():
14 |         pred = torch.argmax(output, dim=1)
15 |         assert pred.shape[0] == len(target)
16 |         correct = 0
17 |         correct += torch.sum(pred == target).item()
18 |     return correct / len(target)
19 | 
20 | 
21 | def top_k_acc(output, target, k=3):
22 |     with torch.no_grad():
23 |         pred = torch.topk(output, k, dim=1)[1]
24 |         assert pred.shape[0] == len(target)
25 |         correct = 0
26 |         for i in range(k):
27 |             correct += torch.sum(pred[:, i] == target).item()
28 |     return correct / len(target)
29 | 
30 | 
31 | def macro_f1(output, target):
32 |     label_indices = list(range(len(LABEL_LIST)))
33 |     return sklearn.metrics.f1_score(target, output, average="macro", labels=label_indices) * 100.0
--------------------------------------------------------------------------------
/parse_config.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from pathlib import Path
4 | from functools import reduce, partial
5 | from operator import getitem
6 | from datetime import datetime
7 | from logger import setup_logging
8 | from utils import read_json
9 | 
10 | 
11 | class ConfigParser:
12 |     def __init__(self, config, resume=None, run_id=None):
13 |         """
14 |         Class to parse the configuration json file. Handles hyperparameters for training, initialization of modules, checkpoint saving,
15 |         and the logging module.
16 |         :param config: Dict containing configurations and hyperparameters for training, i.e. the contents of the `config.json` file.
17 |         :param resume: String, path to the checkpoint being loaded.
18 |         :param run_id: Unique identifier for training processes, used to save checkpoints and training logs. Defaults to the timestamp.
19 |         """
20 |         # load config file and apply modification
21 |         self._config = config
22 |         self.resume = resume
23 | 
24 |         # set save_dir where log will be saved.
25 |         save_dir = Path(self.config['trainer']['save']['dir'])
26 | 
27 |         exper_name = self.config['name']
28 |         if run_id is None:  # use timestamp as default run-id
29 |             run_id = datetime.now().strftime(r'%m%d_%H_%M_%S')
30 |         self._log_dir = save_dir / 'log' / exper_name / run_id
31 | 
32 |         # make directory for saving checkpoints and log. 
33 |         exist_ok = run_id == ''
34 |         self.log_dir.mkdir(parents=True, exist_ok=exist_ok)
35 | 
36 |         # configure logging module
37 |         setup_logging(self.log_dir)
38 |         self.log_levels = {
39 |             0: logging.WARNING,
40 |             1: logging.INFO,
41 |             2: logging.DEBUG
42 |         }
43 | 
44 |     @classmethod
45 |     def from_args(cls, args):
46 |         """
47 |         Initialize this class from some cli arguments. Used in train, test.
48 |         """
49 |         if not isinstance(args, tuple):
50 |             args = args.parse_args()
51 | 
52 |         if args.device is not None:
53 |             os.environ["CUDA_VISIBLE_DEVICES"] = args.device
54 | 
55 |         if args.resume is not None:
56 |             resume = Path(args.resume)
57 |             cfg_fname = resume.parent / 'config.json'
58 |         else:
59 |             msg_no_cfg = "A configuration file needs to be specified. Add '-c config.json', for example."
60 |             assert args.config is not None, msg_no_cfg
61 |             resume = None
62 |             cfg_fname = Path(args.config)
63 | 
64 |         config = read_json(cfg_fname)
65 |         if args.config and resume:
66 |             # update new config for fine-tuning
67 |             config.update(read_json(args.config))
68 | 
69 |         return cls(config, resume)
70 | 
71 |     def init_obj(self, name, module, *args, **kwargs):
72 |         """
73 |         Finds a function handle with the name given as 'type' in config, and returns the
74 |         instance initialized with corresponding arguments given.
75 | 
76 |         `object = config.init_obj('name', module, a, b=1)`
77 |         is equivalent to
78 |         `object = module.name(a, b=1)`
79 |         """
80 |         module_name = self[name]['type']
81 |         module_args = dict(self[name]['args'])
82 |         assert all([k not in module_args for k in kwargs]), 'Overwriting kwargs given in config file is not allowed'
83 |         module_args.update(kwargs)
84 |         return getattr(module, module_name)(*args, **module_args)
85 | 
86 |     def init_ftn(self, name, module, *args, **kwargs):
87 |         """
88 |         Finds a function handle with the name given as 'type' in config, and returns the
89 |         function with given arguments fixed with functools.partial.
90 | 
91 |         `function = config.init_ftn('name', module, a, b=1)`
92 |         is equivalent to
93 |         `function = lambda *args, **kwargs: module.name(a, *args, b=1, **kwargs)`.
94 |         """
95 |         module_name = self[name]['type']
96 |         module_args = dict(self[name]['args'])
97 |         assert all([k not in module_args for k in kwargs]), 'Overwriting kwargs given in config file is not allowed'
98 |         module_args.update(kwargs)
99 |         return partial(getattr(module, module_name), *args, **module_args)
100 | 
101 |     def __getitem__(self, name):
102 |         """Access items like ordinary dict."""
103 |         return self.config[name]
104 | 
105 |     def get_logger(self, name, verbosity=2):
106 |         msg_verbosity = 'verbosity option {} is invalid. 
Valid options are {}.'.format(verbosity, self.log_levels.keys()) 107 | assert verbosity in self.log_levels, msg_verbosity 108 | logger = logging.getLogger(name) 109 | logger.setLevel(self.log_levels[verbosity]) 110 | return logger 111 | 112 | # setting read-only attributes 113 | @property 114 | def config(self): 115 | return self._config 116 | 117 | @property 118 | def log_dir(self): 119 | return self._log_dir 120 | 121 | def _get_opt_name(flags): 122 | for flg in flags: 123 | if flg.startswith('--'): 124 | return flg.replace('--', '') 125 | return flags[0].replace('--', '') 126 | 127 | -------------------------------------------------------------------------------- /pkm_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "mem_implementation": "pq_fast", 3 | "mem_grouped_conv": 0, 4 | "mem_values_optimizer": "", 5 | "mem_sparse": 0, 6 | "mem_input2d": 0, 7 | "mem_k_dim": 256, 8 | "mem_v_dim": -1, 9 | "mem_heads": 4, 10 | "mem_knn": 32, 11 | "mem_share_values": 0, 12 | "mem_shuffle_indices": 0, 13 | "mem_shuffle_query": 0, 14 | "mem_modulo_size": -1, 15 | "mem_keys_type": "uniform", 16 | "mem_n_keys": 512, 17 | "mem_keys_normalized_init": 0, 18 | "mem_keys_learn": 1, 19 | "mem_use_different_keys": 1, 20 | "mem_query_detach_input": 0, 21 | "mem_query_layer_sizes": "0,0", 22 | "mem_query_kernel_sizes": "", 23 | "mem_query_bias": 1, 24 | "mem_query_batchnorm": 0, 25 | "mem_query_net_learn": 1, 26 | "mem_query_residual": 0, 27 | "mem_multi_query_net": 0, 28 | "mem_value_zero_init": 0, 29 | "mem_normalize_query": 1, 30 | "mem_temperature": 1, 31 | "mem_score_softmax": 1, 32 | "mem_score_subtract": "", 33 | "mem_score_normalize": 0, 34 | "mem_input_dropout": 0, 35 | "mem_query_dropout": 0, 36 | "mem_value_dropout": 0 37 | } -------------------------------------------------------------------------------- /prototype/fullstack/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boostcampaitech2/final-project-level3-nlp-12/09c6e84a3618050ab0593df6f75beacf0340f9a6/prototype/fullstack/.DS_Store -------------------------------------------------------------------------------- /prototype/fullstack/Makefile: -------------------------------------------------------------------------------- 1 | run_black: 2 | python3 -m black . 
-l 119 3 | 4 | run_server: 5 | python3 -m app 6 | 7 | run_client: 8 | python3 -m streamlit run app/frontend.py 9 | 10 | run_app: run_server run_client -------------------------------------------------------------------------------- /prototype/fullstack/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boostcampaitech2/final-project-level3-nlp-12/09c6e84a3618050ab0593df6f75beacf0340f9a6/prototype/fullstack/__init__.py -------------------------------------------------------------------------------- /prototype/fullstack/app/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boostcampaitech2/final-project-level3-nlp-12/09c6e84a3618050ab0593df6f75beacf0340f9a6/prototype/fullstack/app/.DS_Store -------------------------------------------------------------------------------- /prototype/fullstack/app/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boostcampaitech2/final-project-level3-nlp-12/09c6e84a3618050ab0593df6f75beacf0340f9a6/prototype/fullstack/app/__init__.py -------------------------------------------------------------------------------- /prototype/fullstack/app/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_data_loader import * 2 | from .base_model import * 3 | from .base_trainer import * 4 | -------------------------------------------------------------------------------- /prototype/fullstack/app/base/base_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import DataLoader 3 | from torch.utils.data.dataloader import default_collate 4 | from torch.utils.data.sampler import SubsetRandomSampler 5 | 6 | 7 | class BaseDataLoader(DataLoader): 8 | """ 9 | Base class for all data loaders 10 | """ 11 | 12 | def __init__( 13 | self, 14 | dataset, 15 | batch_size, 16 | shuffle, 17 | validation_split, 18 | num_workers, 19 | collate_fn=default_collate, 20 | ): 21 | self.validation_split = validation_split 22 | self.shuffle = shuffle 23 | 24 | self.batch_idx = 0 25 | self.n_samples = len(dataset) 26 | 27 | self.sampler, self.valid_sampler = self._split_sampler(self.validation_split) 28 | 29 | self.init_kwargs = { 30 | "dataset": dataset, 31 | "batch_size": batch_size, 32 | "shuffle": self.shuffle, 33 | "collate_fn": collate_fn, 34 | "num_workers": num_workers, 35 | } 36 | super().__init__(sampler=self.sampler, **self.init_kwargs) 37 | 38 | def _split_sampler(self, split): 39 | if split == 0.0: 40 | return None, None 41 | 42 | idx_full = np.arange(self.n_samples) 43 | 44 | np.random.seed(0) 45 | np.random.shuffle(idx_full) 46 | 47 | if isinstance(split, int): 48 | assert split > 0 49 | assert ( 50 | split < self.n_samples 51 | ), "validation set size is configured to be larger than entire dataset." 
52 | len_valid = split 53 | else: 54 | len_valid = int(self.n_samples * split) 55 | 56 | valid_idx = idx_full[0:len_valid] 57 | train_idx = np.delete(idx_full, np.arange(0, len_valid)) 58 | 59 | train_sampler = SubsetRandomSampler(train_idx) 60 | valid_sampler = SubsetRandomSampler(valid_idx) 61 | 62 | # turn off shuffle option which is mutually exclusive with sampler 63 | self.shuffle = False 64 | self.n_samples = len(train_idx) 65 | 66 | return train_sampler, valid_sampler 67 | 68 | def split_validation(self): 69 | if self.valid_sampler is None: 70 | return None 71 | else: 72 | return DataLoader(sampler=self.valid_sampler, **self.init_kwargs) 73 | -------------------------------------------------------------------------------- /prototype/fullstack/app/base/base_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import numpy as np 3 | from abc import abstractmethod 4 | 5 | 6 | class BaseModel(nn.Module): 7 | """ 8 | Base class for all models 9 | """ 10 | 11 | @abstractmethod 12 | def forward(self, *inputs): 13 | """ 14 | Forward pass logic 15 | 16 | :return: Model output 17 | """ 18 | raise NotImplementedError 19 | 20 | def __str__(self): 21 | """ 22 | Model prints with number of trainable parameters 23 | """ 24 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 25 | params = sum([np.prod(p.size()) for p in model_parameters]) 26 | return super().__str__() + "\nTrainable parameters: {}".format(params) 27 | -------------------------------------------------------------------------------- /prototype/fullstack/app/base/base_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import shutil 4 | from abc import abstractmethod 5 | from numpy import inf 6 | from utils import write_json 7 | 8 | 9 | class BaseTrainer: 10 | """ 11 | Base class for all trainers 12 | """ 13 | 14 | def __init__(self, model, criterion, metric_ftns, optimizer, config): 15 | self.config = config 16 | self.logger = config.get_logger("trainer", config["trainer"]["verbosity"]) 17 | 18 | self.model = model 19 | self.criterion = criterion 20 | self.metric_ftns = metric_ftns 21 | self.optimizer = optimizer 22 | 23 | cfg_trainer = config["trainer"] 24 | self.epochs = cfg_trainer["epochs"] 25 | self.save_steps = cfg_trainer["save"]["steps"] 26 | self.save_limits = cfg_trainer["save"]["limits"] 27 | self.monitor = cfg_trainer.get("monitor", "off") 28 | 29 | # configuration to monitor model performance and save best 30 | if self.monitor == "off": 31 | self.mnt_mode = "off" 32 | self.mnt_best = 0 33 | else: 34 | self.mnt_mode, self.mnt_metric = self.monitor.split() 35 | assert self.mnt_mode in ["min", "max"] 36 | 37 | self.mnt_best = inf if self.mnt_mode == "min" else -inf 38 | self.early_stop = cfg_trainer.get("early_stop", inf) 39 | if self.early_stop <= 0: 40 | self.early_stop = inf 41 | 42 | self.not_improved_count = 0 43 | 44 | self.checkpoint_dir = cfg_trainer["save"]["dir"] 45 | 46 | if config.resume is not None: 47 | self._resume_checkpoint(config.resume) 48 | 49 | @abstractmethod 50 | def train(self): 51 | """ 52 | Full training logic. 
53 | """ 54 | 55 | raise NotImplementedError 56 | 57 | @abstractmethod 58 | def _validation(self, step): 59 | """ 60 | Full validation logic 61 | 62 | :param step: Current step number 63 | """ 64 | 65 | raise NotImplementedError 66 | 67 | def _evaluate_performance(self, log): 68 | # evaluate model performance according to configured metric, save best checkpoint as model_best 69 | is_best = False 70 | if self.mnt_mode != "off": 71 | try: 72 | # check whether model performance improved or not, according to specified metric(mnt_metric) 73 | improved = ( 74 | self.mnt_mode == "min" and log[self.mnt_metric] <= self.mnt_best 75 | ) or (self.mnt_mode == "max" and log[self.mnt_metric] >= self.mnt_best) 76 | except KeyError: 77 | self.logger.warning( 78 | "Warning: Metric '{}' is not found. " 79 | "Model performance monitoring is disabled.".format(self.mnt_metric) 80 | ) 81 | self.mnt_mode = "off" 82 | improved = False 83 | 84 | if improved: 85 | self.mnt_best = log[self.mnt_metric] 86 | self.not_improved_count = 0 87 | is_best = True 88 | else: 89 | self.not_improved_count += 1 90 | 91 | return is_best 92 | 93 | def _save_checkpoint(self, log, is_best=False): 94 | """ 95 | Saving checkpoints 96 | 97 | :param epoch: current epoch number 98 | :param log: logging information of the epoch 99 | :param save_best: if True, rename the saved checkpoint to 'best_model.pt' 100 | """ 101 | save_path = f'{self.checkpoint_dir}models/{self.config["name"]}/' 102 | chk_pt_path = save_path + f"steps_{log['steps']}/" 103 | 104 | # make path if there isn't 105 | if not os.path.exists(chk_pt_path): 106 | os.makedirs(chk_pt_path) 107 | # delete the oldest checkpoint not to exceed save limits 108 | if len(os.listdir(save_path)) > self.save_limits: 109 | shutil.rmtree(os.path.join( 110 | save_path, 111 | sorted(os.listdir(save_path),key = lambda x : (len(x), x))[0] 112 | ) 113 | ) 114 | 115 | self.logger.info("Saving checkpoint: {} ...".format(chk_pt_path)) 116 | torch.save(self.model, os.path.join(chk_pt_path, "model.pt")) 117 | torch.save( 118 | self.optimizer.state_dict(), os.path.join(chk_pt_path, "optimizer.pt") 119 | ) 120 | 121 | # save updated config file to the checkpoint dir 122 | write_json(self.config._config, os.path.join(chk_pt_path, "config.json")) 123 | write_json(log, os.path.join(chk_pt_path, "log.json")) 124 | 125 | # save best model. 
126 | if is_best: 127 | best_path = f'{self.checkpoint_dir}best/{self.config["name"]}/' 128 | 129 | # make path if there isn't 130 | if not os.path.exists(best_path): 131 | os.makedirs(best_path) 132 | # delete old best files 133 | for file_ in os.listdir(best_path): 134 | os.remove(best_path + file_) 135 | 136 | self.logger.info("Saving current best: model_best.pt ...") 137 | torch.save(self.model, os.path.join(best_path, "best_model.pt")) 138 | torch.save( 139 | self.optimizer.state_dict(), os.path.join(best_path, "optimizer.pt") 140 | ) 141 | 142 | # save updated config file to the checkpoint dir 143 | write_json(self.config._config, os.path.join(best_path, "config.json")) 144 | write_json(log, os.path.join(best_path, "log.json")) 145 | 146 | def _resume_checkpoint(self, resume_path): 147 | """ 148 | Resume from saved checkpoints 149 | 150 | :param resume_path: Checkpoint path to be resumed 151 | """ 152 | resume_path = str(resume_path) 153 | self.logger.info("Loading checkpoint: {} ...".format(resume_path)) 154 | checkpoint = torch.load(resume_path) 155 | self.start_epoch = checkpoint["epoch"] + 1 156 | self.mnt_best = checkpoint["monitor_best"] 157 | 158 | # load architecture params from checkpoint. 159 | if checkpoint["config"]["arch"] != self.config["arch"]: 160 | self.logger.warning( 161 | "Warning: Architecture configuration given in config file is different from that of " 162 | "checkpoint. This may yield an exception while state_dict is being loaded." 163 | ) 164 | self.model.load_state_dict(checkpoint["state_dict"]) 165 | 166 | # load optimizer state from checkpoint only when optimizer type is not changed. 167 | if ( 168 | checkpoint["config"]["optimizer"]["type"] 169 | != self.config["optimizer"]["type"] 170 | ): 171 | self.logger.warning( 172 | "Warning: Optimizer type given in config file is different from that of checkpoint. " 173 | "Optimizer parameters not being resumed." 174 | ) 175 | else: 176 | self.optimizer.load_state_dict(checkpoint["optimizer"]) 177 | 178 | self.logger.info( 179 | "Checkpoint loaded. 
Resume training from epoch {}".format(self.start_epoch) 180 | ) 181 | -------------------------------------------------------------------------------- /prototype/fullstack/app/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "beomi/beep-KcELECTRA-base-hate", 3 | "n_gpu": 1, 4 | 5 | "model": { 6 | "type": "BeomiModel", 7 | "args": { 8 | "name": "beomi/beep-KcELECTRA-base-hate", 9 | "num_classes": 3 10 | } 11 | }, 12 | "tokenizer": "beomi/KcELECTRA-base", 13 | "data_loader": { 14 | "type": "MnistDataLoader", 15 | "args":{ 16 | "data_dir": "data/", 17 | "batch_size": 64, 18 | "max_length": 64, 19 | "shuffle": true, 20 | "validation_split": 0.1, 21 | "num_workers": 2 22 | } 23 | }, 24 | "optimizer": { 25 | "type": "AdamW", 26 | "args":{ 27 | "lr": 5e-5, 28 | "eps": 1e-8 29 | }, 30 | "weight_decay": 0.0 31 | }, 32 | "loss": "softmax", 33 | "metrics": [ 34 | "macro_f1" 35 | ], 36 | "lr_scheduler": { 37 | "type": "StepLR", 38 | "args": { 39 | "step_size": 50, 40 | "gamma": 0.1 41 | } 42 | }, 43 | "trainer": { 44 | "epochs": 2, 45 | 46 | "save": { 47 | "dir": "saved/", 48 | "steps": 300, 49 | "limits": 3 50 | }, 51 | "verbosity": 2, 52 | 53 | "monitor": "max val/macro_f1", 54 | "early_stop": 2 55 | }, 56 | "data_dir": "AI-it/korean-hate-speech", 57 | "data_files": { 58 | "train": "train_hate.csv", 59 | "valid": "dev_hate.csv" 60 | }, 61 | "test_data_file": { 62 | "test": "test_hate_no_label.csv" 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /prototype/fullstack/app/confirm_button_hack.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import streamlit as st 4 | 5 | 6 | def cache_on_button_press(label, **cache_kwargs): 7 | """Function decorator to memoize function executions. 8 | Parameters 9 | ---------- 10 | label : str 11 | The label for the button to display prior to running the cached funnction. 12 | cache_kwargs : Dict[Any, Any] 13 | Additional parameters (such as show_spinner) to pass into the underlying @st.cache decorator. 14 | Example 15 | ------- 16 | This show how you could write a username/password tester: 17 | >>> @cache_on_button_press('Authenticate') 18 | ... def authenticate(username, password): 19 | ... return username == "buddha" and password == "s4msara" 20 | ... 21 | ... username = st.text_input('username') 22 | ... password = st.text_input('password') 23 | ... 24 | ... if authenticate(username, password): 25 | ... st.success('Logged in.') 26 | ... else: 27 | ... 
st.error('Incorrect username or password') 28 | """ 29 | internal_cache_kwargs = dict(cache_kwargs) 30 | internal_cache_kwargs['allow_output_mutation'] = True 31 | internal_cache_kwargs['show_spinner'] = False 32 | 33 | def function_decorator(func): 34 | @functools.wraps(func) 35 | def wrapped_func(*args, **kwargs): 36 | @st.cache(**internal_cache_kwargs) 37 | def get_cache_entry(func, args, kwargs): 38 | class ButtonCacheEntry: 39 | def __init__(self): 40 | self.evaluated = False 41 | self.return_value = None 42 | 43 | def evaluate(self): 44 | self.evaluated = True 45 | self.return_value = func(*args, **kwargs) 46 | 47 | return ButtonCacheEntry() 48 | 49 | cache_entry = get_cache_entry(func, args, kwargs) 50 | if not cache_entry.evaluated: 51 | if st.button(label): 52 | cache_entry.evaluate() 53 | else: 54 | raise st.script_runner.StopException 55 | return cache_entry.return_value 56 | 57 | return wrapped_func 58 | 59 | return function_decorator 60 | -------------------------------------------------------------------------------- /prototype/fullstack/app/database.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import certifi 3 | 4 | def run_db(): 5 | ca = certifi.where() 6 | client = MongoClient('mongodb+srv://jadon:aiit@cluster0.13mh6.mongodb.net/myFirstDatabase?retryWrites=true&w=majority', tlsCAFile=ca) 7 | db = client.aiit 8 | evidence = db.evidence 9 | return evidence 10 | 11 | def insert2db(keyword, results, collection): 12 | docs = [] 13 | for res in results: 14 | docs.append({ 15 | "keyword": keyword, 16 | 'user_id': res['user_id'], 17 | 'comment': res['comment'], 18 | "label": res['label'], 19 | 'site_name': res['site_name'], 20 | 'site_url': res['site_url'], 21 | 'commented_at': res['commented_at'] 22 | }) 23 | collection.insert_many(docs) -------------------------------------------------------------------------------- /prototype/fullstack/app/frontend.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from confirm_button_hack import cache_on_button_press 3 | import requests 4 | import time 5 | import pandas as pd 6 | 7 | 8 | st.set_page_config(layout='wide') 9 | st.header('Hello AI-it!!') 10 | 11 | st.title('Malicious Comments Collecting Service') 12 | 13 | 14 | def main(): 15 | keyword = st.text_input('Keyword you want to collect!!') 16 | if keyword: 17 | with st.spinner('Collecting Evidence...'): 18 | response = requests.get('http://49.50.174.246:2227/get_sample/' + keyword) 19 | st.success('Done!') 20 | 21 | st.markdown("

Report
", unsafe_allow_html=True) 22 | st.markdown('-----------------------') 23 | st.markdown('#### Label Description') 24 | st.markdown('- **Hate** : 혐오적인 표현') 25 | st.markdown('- **Offensive**: 공격적인 표현') 26 | st.markdown('-----------------------') 27 | 28 | for i, res in enumerate(response.json()): 29 | st.subheader(f'Evidence:{i+1}') 30 | st.write(res) 31 | 32 | 33 | root_password = '123' 34 | password = st.text_input('password', type='password') 35 | 36 | 37 | @cache_on_button_press('Authenticate') 38 | def authenticate(password) -> bool: 39 | return password == root_password 40 | 41 | 42 | if authenticate(password): 43 | st.success('You are authenticated!') 44 | main() 45 | else: 46 | st.error('The password is invalid.') -------------------------------------------------------------------------------- /prototype/fullstack/app/load_data.py: -------------------------------------------------------------------------------- 1 | import re 2 | import torch 3 | import emoji 4 | from soynlp.normalizer import repeat_normalize 5 | 6 | import os 7 | import random 8 | import numpy as np 9 | import pandas as pd 10 | 11 | import torch 12 | from torch.utils.data import Dataset, DataLoader 13 | from transformers import PreTrainedTokenizer 14 | from datasets import load_dataset 15 | 16 | # Set random seed 17 | SEED = 42 18 | random.seed(SEED) 19 | np.random.seed(SEED) 20 | os.environ["PYTHONHASHSEED"] = str(SEED) 21 | torch.manual_seed(SEED) 22 | torch.cuda.manual_seed(SEED) # type: ignore 23 | torch.backends.cudnn.deterministic = True # type: ignore 24 | torch.backends.cudnn.benchmark = True # type: ignore 25 | 26 | 27 | def load_data(): 28 | dataset = load_dataset('AI-it/khs_service_test', data_files={'data':'test_data_ver2.csv'}, use_auth_token=True) 29 | return dataset['data'] 30 | 31 | def retrieve_comments(keyword: str, dataset) -> list: 32 | result = [] 33 | for data in dataset: 34 | if type(data['comment']) != str: 35 | continue 36 | if len(keyword) == 3: 37 | '''If a 3-character name is given, e.g. 
손흥민, check both the full name and the 2-character form 흥민''' 38 | if keyword in data['comment']: 39 | result.append(data) 40 | elif keyword[1:] in data['comment']: 41 | result.append(data) 42 | else: 43 | if keyword in data['comment']: 44 | result.append(data) 45 | 46 | return result 47 | 48 | def preprocess(sents): 49 | preprocessed_sents = [] 50 | 51 | emojis = set() 52 | for k in emoji.UNICODE_EMOJI.keys(): 53 | emojis.update(emoji.UNICODE_EMOJI[k].keys()) 54 | 55 | punc_bracket_pattern = re.compile(f'[\'\"\[\]\(\)]') 56 | base_pattern = re.compile(f'[^.,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+') 57 | url_pattern = re.compile( 58 | r'(http|ftp|https)?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)' 59 | ) 60 | 61 | for sent in sents: 62 | sent = punc_bracket_pattern.sub(' ', str(sent)) 63 | sent = base_pattern.sub(' ', sent) 64 | sent = url_pattern.sub('', sent) 65 | sent = sent.strip() 66 | sent = repeat_normalize(sent, num_repeats=2) 67 | preprocessed_sents.append(sent) 68 | 69 | return preprocessed_sents 70 | 71 | 72 | class NhDataloader(DataLoader): 73 | def __init__(self, tokenizer: PreTrainedTokenizer, max_length: int = None): 74 | self.tokenizer = tokenizer 75 | self.max_length = max_length if max_length else self.tokenizer.model_max_length 76 | 77 | def collate_fn(self, input_examples): 78 | input_anc_texts = [] 79 | 80 | for input_example in input_examples: 81 | anchor_text = input_example 82 | input_anc_texts.append(anchor_text) 83 | 84 | encoded_texts = self.tokenizer.batch_encode_plus( 85 | input_anc_texts, 86 | add_special_tokens=True, 87 | max_length=self.max_length, 88 | truncation=True, 89 | padding=True, 90 | return_tensors="pt", 91 | return_token_type_ids=True, 92 | return_attention_mask=True, 93 | ) # input_ids, token_type_ids, attention_mask 94 | 95 | return encoded_texts 96 | 97 | def get_dataloader(self, data, batch_size, **kwargs): 98 | dataset = NhDataset(data) 99 | 100 | return DataLoader( 101 | dataset, 102 | batch_size=batch_size, 103 | shuffle=False, 104 | collate_fn=self.collate_fn, 105 | num_workers=4, 106 | **kwargs 107 | ) 108 | 109 | class NhDataset(Dataset): 110 | def __init__(self, data): 111 | self.texts = [d['comment'] for d in data] 112 | 113 | def __len__(self): 114 | return len(self.texts) 115 | 116 | def __getitem__(self, index): 117 | return self.texts[index] 118 | 119 | def get_labels(self): 120 | return self.labels # NOTE: self.labels is never set in this class; calling this raises AttributeError 121 | 122 | 123 | # def load_data(): 124 | # DATA_PATH = 'AI-it/khs_service_test' 125 | # DATA_FILES = { 126 | # "data": "test_data_ver2.csv" 127 | # } 128 | # dataset = load_dataset(DATA_PATH, data_files=DATA_FILES, use_auth_token=True) 129 | # return dataset 130 | 131 | 132 | 133 | 134 | 135 | if __name__ == '__main__': 136 | result = retrieve_comments('전현무', load_data()) # fixed: retrieve_comments requires the dataset argument 137 | 138 | 139 | -------------------------------------------------------------------------------- /prototype/fullstack/app/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from requests.api import get 3 | from torch.utils.data import DataLoader 4 | import uvicorn 5 | from predict import load_model, inference, load_dataloader 6 | from database import run_db, insert2db 7 | from load_data import load_data, retrieve_comments, NhDataloader 8 | from utils import make_samples 9 | 10 | 11 | app = FastAPI() 12 | dataset = load_data() 13 | model, classifier = load_model() 14 | evidence = run_db() 15 | 16 | 17 | @app.get('/') 18 | def hello_world(): 19 | return {'hello': 'world'} 20 | 21 | 
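# ---------------------------------------------------------------------------
# Usage sketch (editor's comment, not part of the original file): the
# `/get_sample/{keyword}` route below can be exercised with a plain HTTP GET
# once the server is running (`python3 -m app` per the Makefile, or
# `uvicorn main:app`). The host and port here are assumptions taken from the
# `__main__` block at the bottom of this file; adjust to your deployment.
#
#   import requests
#   resp = requests.get("http://localhost:8000/get_sample/손흥민")
#   for evidence in resp.json():
#       print(evidence["label"], evidence["comment"])
# ---------------------------------------------------------------------------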
@app.get('/get_sample/{keyword}') 23 | def get_sample(keyword): 24 | data = retrieve_comments(keyword, dataset) 25 | inf_dataloader = load_dataloader(data[:128]) 26 | results = inference(model, classifier, inf_dataloader) 27 | 28 | res2json = [] 29 | for i, res in enumerate(results[:5]): 30 | if res == 0: 31 | continue 32 | else: 33 | res2json.append({ 34 | "keyword": keyword, 35 | 'user_id': data[i]['user_id'], 36 | 'comment': data[i]['comment'], 37 | "label": 'hate', 38 | 'site_name': data[i]['site_name'], 39 | 'site_url': data[i]['site_url'], 40 | 'commented_at': data[i]['commented_at'] 41 | }) 42 | 43 | if res2json: 44 | insert2db(keyword, res2json, evidence) 45 | 46 | return res2json 47 | 48 | 49 | if __name__ == '__main__': 50 | uvicorn.run('main:app', host="0.0.0.0", port=8000, reload=True) 51 | -------------------------------------------------------------------------------- /prototype/fullstack/app/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boostcampaitech2/final-project-level3-nlp-12/09c6e84a3618050ab0593df6f75beacf0340f9a6/prototype/fullstack/app/model/__init__.py -------------------------------------------------------------------------------- /prototype/fullstack/app/model/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from transformers import AutoModelForSequenceClassification 6 | from base import BaseModel 7 | 8 | 9 | class BeomiModel(BaseModel): 10 | def __init__(self, name="beomi/beep-KcELECTRA-base-hate", num_classes=3): 11 | super().__init__() 12 | self.model = AutoModelForSequenceClassification.from_pretrained(name, num_labels=num_classes) 13 | 14 | def forward(self, inputs): 15 | return self.model(**inputs) 16 | 17 | -------------------------------------------------------------------------------- /prototype/fullstack/app/predict.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, pipeline 2 | import torch 3 | from tqdm import tqdm 4 | from load_data import NhDataloader 5 | import joblib 6 | 7 | 8 | def load_model(): 9 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 10 | 11 | nh_model = torch.load('/opt/ml/final-project-level3-nlp-12/prototype/fullstack/app/model/nh_model.pt', map_location=device) ## adjust the path as needed! 12 | 13 | with open('/opt/ml/final-project-level3-nlp-12/prototype/fullstack/app/model/nh_classifier.pkl', 'rb') as f: ## adjust the path as needed! 
14 | nh_classifier = joblib.load(f) 15 | 16 | nh_model.to(device) 17 | nh_model.eval() 18 | 19 | return nh_model, nh_classifier 20 | 21 | def load_dataloader(data): 22 | nh_tokenizer = AutoTokenizer.from_pretrained('beomi/beep-KcELECTRA-base-hate') 23 | 24 | nh_dataloader = NhDataloader( 25 | nh_tokenizer, 26 | max_length=64 27 | ) 28 | nh_inf_dataloader = nh_dataloader.get_dataloader( 29 | data, 30 | batch_size=128 31 | ) 32 | return nh_inf_dataloader 33 | 34 | def inference(nh_model, nh_classifier, nh_inf_dataloader): 35 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 36 | preds = [] # 0: none, 1: hate; later written to df['label'] by callers 37 | 38 | with torch.no_grad(): 39 | for _, data in enumerate(tqdm(nh_inf_dataloader, desc='Inference')): 40 | encoded_anc_texts = data 41 | 42 | nh_encoded_texts = { 43 | "input_ids": encoded_anc_texts['input_ids'].to(device), 44 | "attention_mask": encoded_anc_texts['attention_mask'].to(device), 45 | "token_type_ids": encoded_anc_texts['token_type_ids'].to(device) 46 | } 47 | 48 | nh_outputs = nh_model(**nh_encoded_texts) 49 | 50 | nh_logit = nh_outputs.last_hidden_state[:, 0, :].detach().cpu().numpy() 51 | nh_logit = nh_logit.squeeze() 52 | 53 | pred = nh_classifier.predict(nh_logit) 54 | 55 | preds.extend(pred) 56 | 57 | return preds -------------------------------------------------------------------------------- /prototype/fullstack/app/service/api_response.py: -------------------------------------------------------------------------------- 1 | # { 2 | # code:'', 3 | # msg:'', 4 | # data:'', 5 | # request_time:'', 6 | # response_time:'' 7 | # } 8 | from datetime import datetime 9 | 10 | class ApiResponse(): 11 | 12 | @classmethod 13 | def success(cls, status: int, msg: str, data: object, request_time: datetime) -> dict: 14 | return { 15 | 'status': status, 16 | 'msg': msg, 17 | 'data': data, 18 | 'request_time': request_time, 19 | 'response_time': datetime.now().strftime('%Y/%m/%d, %H:%M:%S') 20 | } 21 | 22 | @classmethod 23 | def fail(cls, status: int, msg: str, request_time: datetime) -> dict: 24 | return { 25 | 'status': status, 26 | 'msg': msg, 27 | 'data': None, 28 | 'request_time': request_time, 29 | 'response_time': datetime.now().strftime('%Y/%m/%d, %H:%M:%S') 30 | } 31 | 32 | # enum class for error code, msg 33 | class Msg: 34 | SUCCESS = 'SUCCESS' # success 35 | WRONG_FORMAT = '잘못된 입력 형식입니다.' # "Invalid input format." 
36 | 37 | class Status: 38 | SUCCESS = 200 # success 39 | BAD_REQUEST = 400 -------------------------------------------------------------------------------- /prototype/fullstack/app/service/error_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | class CustomError(Exception): 4 | def __init__(self, code: int, msg: str) -> None: 5 | self.code = code 6 | self.msg = msg 7 | 8 | def __str__(self) -> str: 9 | return f'[{self.code}]{self.msg}' 10 | 11 | -------------------------------------------------------------------------------- /prototype/fullstack/app/test/db_test.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import datetime 3 | import certifi 4 | 5 | ca = certifi.where() 6 | 7 | client = MongoClient('mongodb+srv://jadon:aiit@cluster0.13mh6.mongodb.net/myFirstDatabase?retryWrites=true&w=majority', tlsCAFile=ca) 8 | db = client.gettingStarted 9 | people = db.people 10 | import datetime 11 | personDocument = { 12 | "name": { "first": "Alan", "last": "Turing" }, 13 | "birth": datetime.datetime(1912, 6, 23), 14 | "death": datetime.datetime(1954, 6, 7), 15 | "contribs": [ "Turing machine", "Turing test", "Turingery" ], 16 | } 17 | people.insert_one(personDocument) 18 | print('here') -------------------------------------------------------------------------------- /prototype/streamlit/.gitignore: -------------------------------------------------------------------------------- 1 | assets/ 2 | -------------------------------------------------------------------------------- /prototype/streamlit/app.py: -------------------------------------------------------------------------------- 1 | from datetime import date 2 | import streamlit as st 3 | import yaml 4 | from predict import get_pipeline 5 | 6 | from confirm_button_hack import cache_on_button_press 7 | from load_data import * 8 | from service.api_response import * 9 | from service.error_handler import * 10 | 11 | import logging 12 | 13 | st.set_page_config(layout='wide') 14 | st.header('Hello AI-it!!') 15 | 16 | st.title('Malicious Comments Collecting Service') 17 | 18 | def main(): 19 | st.write('Model Loading...') 20 | pipe = get_pipeline() 21 | 22 | keyword = st.text_input('Keyword you want to collect!!') 23 | comments = retrieve_comments(keyword) 24 | 25 | results = [] 26 | try: 27 | for comment in comments[:10]: 28 | output = pipe(comment) 29 | results.append( 30 | { 31 | 'comment':comment, 32 | 'label':output[0]['label'], 33 | 'score':output[0]['score'] 34 | } 35 | ) 36 | st.write(ApiResponse.success(Status.SUCCESS, Msg.SUCCESS, results, datetime.now().strftime('%Y/%m/%d, %H:%M:%S'))) 37 | except Exception: 38 | st.write(ApiResponse.fail(Status.BAD_REQUEST, Msg.WRONG_FORMAT, datetime.now().strftime('%Y/%m/%d, %H:%M:%S'))) # fixed: fail() takes (status, msg, request_time) 39 | 40 | 41 | root_password = '123' 42 | password = st.text_input('password', type='password') 43 | 44 | @cache_on_button_press('Authenticate') 45 | def authenticate(password) -> bool: 46 | st.write(type(password)) 47 | return password == root_password 48 | 49 | if authenticate(password): 50 | st.success('You are authenticated!') 51 | main() 52 | else: 53 | st.error('The password is invalid.') -------------------------------------------------------------------------------- /prototype/streamlit/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_data_loader import * 2 | from .base_model import * 3 | from .base_trainer 
import * 4 | -------------------------------------------------------------------------------- /prototype/streamlit/base/base_data_loader.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import DataLoader 3 | from torch.utils.data.dataloader import default_collate 4 | from torch.utils.data.sampler import SubsetRandomSampler 5 | 6 | 7 | class BaseDataLoader(DataLoader): 8 | """ 9 | Base class for all data loaders 10 | """ 11 | 12 | def __init__( 13 | self, 14 | dataset, 15 | batch_size, 16 | shuffle, 17 | validation_split, 18 | num_workers, 19 | collate_fn=default_collate, 20 | ): 21 | self.validation_split = validation_split 22 | self.shuffle = shuffle 23 | 24 | self.batch_idx = 0 25 | self.n_samples = len(dataset) 26 | 27 | self.sampler, self.valid_sampler = self._split_sampler(self.validation_split) 28 | 29 | self.init_kwargs = { 30 | "dataset": dataset, 31 | "batch_size": batch_size, 32 | "shuffle": self.shuffle, 33 | "collate_fn": collate_fn, 34 | "num_workers": num_workers, 35 | } 36 | super().__init__(sampler=self.sampler, **self.init_kwargs) 37 | 38 | def _split_sampler(self, split): 39 | if split == 0.0: 40 | return None, None 41 | 42 | idx_full = np.arange(self.n_samples) 43 | 44 | np.random.seed(0) 45 | np.random.shuffle(idx_full) 46 | 47 | if isinstance(split, int): 48 | assert split > 0 49 | assert ( 50 | split < self.n_samples 51 | ), "validation set size is configured to be larger than entire dataset." 52 | len_valid = split 53 | else: 54 | len_valid = int(self.n_samples * split) 55 | 56 | valid_idx = idx_full[0:len_valid] 57 | train_idx = np.delete(idx_full, np.arange(0, len_valid)) 58 | 59 | train_sampler = SubsetRandomSampler(train_idx) 60 | valid_sampler = SubsetRandomSampler(valid_idx) 61 | 62 | # turn off shuffle option which is mutually exclusive with sampler 63 | self.shuffle = False 64 | self.n_samples = len(train_idx) 65 | 66 | return train_sampler, valid_sampler 67 | 68 | def split_validation(self): 69 | if self.valid_sampler is None: 70 | return None 71 | else: 72 | return DataLoader(sampler=self.valid_sampler, **self.init_kwargs) 73 | -------------------------------------------------------------------------------- /prototype/streamlit/base/base_model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import numpy as np 3 | from abc import abstractmethod 4 | 5 | 6 | class BaseModel(nn.Module): 7 | """ 8 | Base class for all models 9 | """ 10 | 11 | @abstractmethod 12 | def forward(self, *inputs): 13 | """ 14 | Forward pass logic 15 | 16 | :return: Model output 17 | """ 18 | raise NotImplementedError 19 | 20 | def __str__(self): 21 | """ 22 | Model prints with number of trainable parameters 23 | """ 24 | model_parameters = filter(lambda p: p.requires_grad, self.parameters()) 25 | params = sum([np.prod(p.size()) for p in model_parameters]) 26 | return super().__str__() + "\nTrainable parameters: {}".format(params) 27 | -------------------------------------------------------------------------------- /prototype/streamlit/base/base_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import shutil 4 | from abc import abstractmethod 5 | from numpy import inf 6 | from utils import write_json 7 | 8 | 9 | class BaseTrainer: 10 | """ 11 | Base class for all trainers 12 | """ 13 | 14 | def __init__(self, model, criterion, metric_ftns, optimizer, config): 
15 | self.config = config 16 | self.logger = config.get_logger("trainer", config["trainer"]["verbosity"]) 17 | 18 | self.model = model 19 | self.criterion = criterion 20 | self.metric_ftns = metric_ftns 21 | self.optimizer = optimizer 22 | 23 | cfg_trainer = config["trainer"] 24 | self.epochs = cfg_trainer["epochs"] 25 | self.save_steps = cfg_trainer["save"]["steps"] 26 | self.save_limits = cfg_trainer["save"]["limits"] 27 | self.monitor = cfg_trainer.get("monitor", "off") 28 | 29 | # configuration to monitor model performance and save best 30 | if self.monitor == "off": 31 | self.mnt_mode = "off" 32 | self.mnt_best = 0 33 | else: 34 | self.mnt_mode, self.mnt_metric = self.monitor.split() 35 | assert self.mnt_mode in ["min", "max"] 36 | 37 | self.mnt_best = inf if self.mnt_mode == "min" else -inf 38 | self.early_stop = cfg_trainer.get("early_stop", inf) 39 | if self.early_stop <= 0: 40 | self.early_stop = inf 41 | 42 | self.not_improved_count = 0 43 | 44 | self.checkpoint_dir = cfg_trainer["save"]["dir"] 45 | 46 | if config.resume is not None: 47 | self._resume_checkpoint(config.resume) 48 | 49 | @abstractmethod 50 | def train(self): 51 | """ 52 | Full training logic. 53 | """ 54 | 55 | raise NotImplementedError 56 | 57 | @abstractmethod 58 | def _validation(self, step): 59 | """ 60 | Full validation logic 61 | 62 | :param step: Current step number 63 | """ 64 | 65 | raise NotImplementedError 66 | 67 | def _evaluate_performance(self, log): 68 | # evaluate model performance according to configured metric, save best checkpoint as model_best 69 | is_best = False 70 | if self.mnt_mode != "off": 71 | try: 72 | # check whether model performance improved or not, according to specified metric(mnt_metric) 73 | improved = ( 74 | self.mnt_mode == "min" and log[self.mnt_metric] <= self.mnt_best 75 | ) or (self.mnt_mode == "max" and log[self.mnt_metric] >= self.mnt_best) 76 | except KeyError: 77 | self.logger.warning( 78 | "Warning: Metric '{}' is not found. " 79 | "Model performance monitoring is disabled.".format(self.mnt_metric) 80 | ) 81 | self.mnt_mode = "off" 82 | improved = False 83 | 84 | if improved: 85 | self.mnt_best = log[self.mnt_metric] 86 | self.not_improved_count = 0 87 | is_best = True 88 | else: 89 | self.not_improved_count += 1 90 | 91 | return is_best 92 | 93 | def _save_checkpoint(self, log, is_best=False): 94 | """ 95 | Saving checkpoints 96 | 97 | :param epoch: current epoch number 98 | :param log: logging information of the epoch 99 | :param save_best: if True, rename the saved checkpoint to 'best_model.pt' 100 | """ 101 | save_path = f'{self.checkpoint_dir}models/{self.config["name"]}/' 102 | chk_pt_path = save_path + f"steps_{log['steps']}/" 103 | 104 | # make path if there isn't 105 | if not os.path.exists(chk_pt_path): 106 | os.makedirs(chk_pt_path) 107 | # delete the oldest checkpoint not to exceed save limits 108 | if len(os.listdir(save_path)) > self.save_limits: 109 | shutil.rmtree(os.path.join( 110 | save_path, 111 | sorted(os.listdir(save_path),key = lambda x : (len(x), x))[0] 112 | ) 113 | ) 114 | 115 | self.logger.info("Saving checkpoint: {} ...".format(chk_pt_path)) 116 | torch.save(self.model, os.path.join(chk_pt_path, "model.pt")) 117 | torch.save( 118 | self.optimizer.state_dict(), os.path.join(chk_pt_path, "optimizer.pt") 119 | ) 120 | 121 | # save updated config file to the checkpoint dir 122 | write_json(self.config._config, os.path.join(chk_pt_path, "config.json")) 123 | write_json(log, os.path.join(chk_pt_path, "log.json")) 124 | 125 | # save best model. 
126 | if is_best: 127 | best_path = f'{self.checkpoint_dir}best/{self.config["name"]}/' 128 | 129 | # make path if there isn't 130 | if not os.path.exists(best_path): 131 | os.makedirs(best_path) 132 | # delete old best files 133 | for file_ in os.listdir(best_path): 134 | os.remove(best_path + file_) 135 | 136 | self.logger.info("Saving current best: model_best.pt ...") 137 | torch.save(self.model, os.path.join(best_path, "best_model.pt")) 138 | torch.save( 139 | self.optimizer.state_dict(), os.path.join(best_path, "optimizer.pt") 140 | ) 141 | 142 | # save updated config file to the checkpoint dir 143 | write_json(self.config._config, os.path.join(best_path, "config.json")) 144 | write_json(log, os.path.join(best_path, "log.json")) 145 | 146 | def _resume_checkpoint(self, resume_path): 147 | """ 148 | Resume from saved checkpoints 149 | 150 | :param resume_path: Checkpoint path to be resumed 151 | """ 152 | resume_path = str(resume_path) 153 | self.logger.info("Loading checkpoint: {} ...".format(resume_path)) 154 | checkpoint = torch.load(resume_path) 155 | self.start_epoch = checkpoint["epoch"] + 1 156 | self.mnt_best = checkpoint["monitor_best"] 157 | 158 | # load architecture params from checkpoint. 159 | if checkpoint["config"]["arch"] != self.config["arch"]: 160 | self.logger.warning( 161 | "Warning: Architecture configuration given in config file is different from that of " 162 | "checkpoint. This may yield an exception while state_dict is being loaded." 163 | ) 164 | self.model.load_state_dict(checkpoint["state_dict"]) 165 | 166 | # load optimizer state from checkpoint only when optimizer type is not changed. 167 | if ( 168 | checkpoint["config"]["optimizer"]["type"] 169 | != self.config["optimizer"]["type"] 170 | ): 171 | self.logger.warning( 172 | "Warning: Optimizer type given in config file is different from that of checkpoint. " 173 | "Optimizer parameters not being resumed." 174 | ) 175 | else: 176 | self.optimizer.load_state_dict(checkpoint["optimizer"]) 177 | 178 | self.logger.info( 179 | "Checkpoint loaded. 
Resume training from epoch {}".format(self.start_epoch) 180 | ) 181 | -------------------------------------------------------------------------------- /prototype/streamlit/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "beomi/beep-KcELECTRA-base-hate", 3 | "n_gpu": 1, 4 | 5 | "model": { 6 | "type": "BeepKcElectraHateModel", 7 | "args": { 8 | "name": "beomi/beep-KcELECTRA-base-hate", 9 | "num_classes": 3 10 | } 11 | }, 12 | "tokenizer": "beomi/KcELECTRA-base", 13 | "data_loader": { 14 | "type": "MnistDataLoader", 15 | "args":{ 16 | "data_dir": "data/", 17 | "batch_size": 64, 18 | "max_length": 64, 19 | "shuffle": true, 20 | "validation_split": 0.1, 21 | "num_workers": 2 22 | } 23 | }, 24 | "optimizer": { 25 | "type": "AdamW", 26 | "args":{ 27 | "lr": 5e-5, 28 | "eps": 1e-8 29 | }, 30 | "weight_decay": 0.0 31 | }, 32 | "loss": "softmax", 33 | "metrics": [ 34 | "macro_f1" 35 | ], 36 | "lr_scheduler": { 37 | "type": "StepLR", 38 | "args": { 39 | "step_size": 50, 40 | "gamma": 0.1 41 | } 42 | }, 43 | "trainer": { 44 | "epochs": 2, 45 | 46 | "save": { 47 | "dir": "saved/", 48 | "steps": 300, 49 | "limits": 3 50 | }, 51 | "verbosity": 2, 52 | 53 | "monitor": "max val/macro_f1", 54 | "early_stop": 2 55 | }, 56 | "data_dir": "AI-it/korean-hate-speech", 57 | "data_files": { 58 | "train": "train_hate.csv", 59 | "valid": "dev_hate.csv" 60 | }, 61 | "test_data_file": { 62 | "test": "test_hate_no_label.csv" 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /prototype/streamlit/confirm_button_hack.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import collections 3 | import functools 4 | import inspect 5 | import textwrap 6 | 7 | 8 | def cache_on_button_press(label, **cache_kwargs): 9 | """Function decorator to memoize function executions. 10 | Parameters 11 | ---------- 12 | label : str 13 | The label for the button to display prior to running the cached funnction. 14 | cache_kwargs : Dict[Any, Any] 15 | Additional parameters (such as show_spinner) to pass into the underlying @st.cache decorator. 16 | Example 17 | ------- 18 | This show how you could write a username/password tester: 19 | >>> @cache_on_button_press('Authenticate') 20 | ... def authenticate(username, password): 21 | ... return username == "buddha" and password == "s4msara" 22 | ... 23 | ... username = st.text_input('username') 24 | ... password = st.text_input('password') 25 | ... 26 | ... if authenticate(username, password): 27 | ... st.success('Logged in.') 28 | ... else: 29 | ... 
st.error('Incorrect username or password') 30 | """ 31 | internal_cache_kwargs = dict(cache_kwargs) 32 | internal_cache_kwargs['allow_output_mutation'] = True 33 | internal_cache_kwargs['show_spinner'] = False 34 | 35 | def function_decorator(func): 36 | @functools.wraps(func) 37 | def wrapped_func(*args, **kwargs): 38 | @st.cache(**internal_cache_kwargs) 39 | def get_cache_entry(func, args, kwargs): 40 | class ButtonCacheEntry: 41 | def __init__(self): 42 | self.evaluated = False 43 | self.return_value = None 44 | 45 | def evaluate(self): 46 | self.evaluated = True 47 | self.return_value = func(*args, **kwargs) 48 | 49 | return ButtonCacheEntry() 50 | 51 | cache_entry = get_cache_entry(func, args, kwargs) 52 | if not cache_entry.evaluated: 53 | if st.button(label): 54 | cache_entry.evaluate() 55 | else: 56 | raise st.script_runner.StopException 57 | return cache_entry.return_value 58 | 59 | return wrapped_func 60 | 61 | return function_decorator -------------------------------------------------------------------------------- /prototype/streamlit/load_data.py: -------------------------------------------------------------------------------- 1 | from os import sep 2 | import pandas as pd 3 | from datasets import load_dataset 4 | 5 | DATA_PATH = 'AI-it/korean-hate-speech' 6 | DATA_FILES = { 7 | "train_comments": "train_hate.csv", 8 | "train_titles": "train_news_title.txt" 9 | } 10 | 11 | def retrieve_comments(keyword: str) -> list: 12 | result = [] 13 | df_comments = pd.read_csv('data/unlabeled/unlabeled_comments.txt', header=None, encoding='utf-8') 14 | for comment in df_comments[0]: 15 | if type(comment) != str: 16 | continue 17 | elif keyword in comment: 18 | result.append(comment) 19 | elif keyword[1:] in comment: 20 | result.append(comment) 21 | 22 | return result 23 | 24 | if __name__ == '__main__': 25 | # lst = retrieve_comments('전현무') 26 | retrieve_comments('전현무') 27 | # print(lst) -------------------------------------------------------------------------------- /prototype/streamlit/model/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from efficientnet_pytorch import EfficientNet 6 | from transformers import AutoModelForSequenceClassification 7 | from base import BaseModel 8 | 9 | 10 | class MyEfficientNet(nn.Module): 11 | ''' 12 | Replaces only the output layer of EfficientNet-b4. 13 | The model predicts all 18 classes at once. 
14 | ''' 15 | def __init__(self, num_classes: int = 18) : 16 | super(MyEfficientNet, self).__init__() 17 | self.EFF = EfficientNet.from_pretrained('efficientnet-b4', in_channels=3, num_classes=num_classes) 18 | 19 | def forward(self, x) -> torch.Tensor: 20 | x = self.EFF(x) 21 | x = F.softmax(x, dim=1) 22 | return x 23 | 24 | class BeomiModel(BaseModel): 25 | def __init__(self, name="beomi/beep-KcELECTRA-base-hate", num_classes=3): 26 | super().__init__() 27 | self.model = AutoModelForSequenceClassification.from_pretrained(name, num_labels=num_classes) 28 | 29 | def forward(self, inputs): 30 | return self.model(**inputs) -------------------------------------------------------------------------------- /prototype/streamlit/pipeline_test.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline, AutoTokenizer 2 | import torch 3 | from utils.util import read_json 4 | import argparse 5 | import pprint 6 | import time 7 | 8 | 9 | def main(config): 10 | device = 0 if torch.cuda.is_available() else -1 11 | 12 | model_path = config.model_path 13 | config_path = config.config_path 14 | 15 | model = torch.load(model_path) 16 | model_config = read_json(config_path) 17 | 18 | tokenizer = AutoTokenizer.from_pretrained(config.tokenizer) 19 | 20 | pipe = pipeline( 21 | task="text-classification", 22 | config=model_config, 23 | model=model.model, 24 | tokenizer=tokenizer, 25 | device=device 26 | ) 27 | 28 | for i in range(config.num): 29 | text = input(f"문장을 입력하세요 {i+1} / {config.num}: ") 30 | 31 | start_time = time.time() 32 | result = pipe(text) 33 | end_time = time.time() 34 | 35 | print(f'inference time: {end_time - start_time}') 36 | pprint.pprint(result) 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument( 42 | "-m", 43 | "--model_path", 44 | default=None, 45 | type=str, 46 | help="saved model file (.pt) path (default: None)", 47 | ) 48 | parser.add_argument( 49 | "-c", 50 | "--config_path", 51 | default=None, 52 | type=str, 53 | help="saved model config file path (default: None)", 54 | ) 55 | parser.add_argument( 56 | "-t", 57 | "--tokenizer", 58 | default="beomi/KcELECTRA-base", 59 | type=str, 60 | help="pretrained tokenizer name (default: beomi/KcELECTRA-base)", 61 | ) 62 | parser.add_argument( 63 | "-n", 64 | "--num", 65 | default=3, 66 | type=int, 67 | help="How many times will you check the sentence? 
(default: 3)", 68 | ) 69 | args = parser.parse_args() 70 | main(args) 71 | -------------------------------------------------------------------------------- /prototype/streamlit/predict.py: -------------------------------------------------------------------------------- 1 | from transformers import pipeline, AutoTokenizer 2 | import torch 3 | import streamlit as st 4 | from utils import read_json 5 | 6 | @st.cache(allow_output_mutation=True) 7 | def get_pipeline(): 8 | print('here@@@@@@@@@@@@@@@@@') 9 | device = 0 if torch.cuda.is_available() else -1 10 | 11 | model_path = '/Users/yangjaeug/Desktop/GitHub/Product-Serving/assets/comments_task/model.pt' 12 | config_path = '/Users/yangjaeug/Desktop/GitHub/Product-Serving/practice/st_practice/config.json' 13 | 14 | model = torch.load(model_path, map_location=torch.device('cpu')) 15 | model_config = read_json(config_path) 16 | 17 | tokenizer = AutoTokenizer.from_pretrained('beomi/KcELECTRA-base') 18 | 19 | pipe = pipeline( 20 | task="text-classification", 21 | config=model_config, 22 | model=model.model, 23 | tokenizer=tokenizer, 24 | device=device 25 | ) 26 | 27 | return pipe -------------------------------------------------------------------------------- /prototype/streamlit/service/api_response.py: -------------------------------------------------------------------------------- 1 | # { 2 | # code:'', 3 | # msg:'', 4 | # data:'', 5 | # request_time:'', 6 | # response_time:'' 7 | # } 8 | from datetime import datetime 9 | 10 | class ApiResponse(): 11 | 12 | @classmethod 13 | def success(cls, status: int, msg: str, data: object, request_time: datetime) -> dict: 14 | return { 15 | 'status': status, 16 | 'msg': msg, 17 | 'data': data, 18 | 'request_time': request_time, 19 | 'response_time': datetime.now().strftime('%Y/%m/%d, %H:%M:%S') 20 | } 21 | 22 | @classmethod 23 | def fail(cls, status: int, msg: str, request_time: datetime) -> dict: 24 | return { 25 | 'status': status, 26 | 'msg': msg, 27 | 'data': None, 28 | 'request_time': request_time, 29 | 'response_time': datetime.now().strftime('%Y/%m/%d, %H:%M:%S') 30 | } 31 | 32 | # enum class for error code, msg 33 | class Msg: 34 | SUCCESS = 'SUCCESS' # success 35 | WRONG_FORMAT = '잘못된 입력 형식입니다.' # "Invalid input format." 
36 | 37 | class Status: 38 | SUCCESS = 200 # success 39 | BAD_REQUEST = 400 -------------------------------------------------------------------------------- /prototype/streamlit/service/error_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | class CustomError(Exception): 4 | def __init__(self, code: int, msg: str) -> None: 5 | self.code = code 6 | self.msg = msg 7 | 8 | def __str__(self) -> str: 9 | return f'[{self.code}]{self.msg}' 10 | 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.6.0 2 | torchvision==0.7.0 3 | optuna==2.4.0 4 | pandas==1.1.5 5 | scikit-learn==0.24.1 6 | psycopg2-binary 7 | numpy 8 | Cython==0.29.24 9 | pre-commit==2.9.3 10 | split-folders==0.4.3 11 | ptflops 12 | wandb~=0.12.7 13 | tqdm 14 | datasets 15 | soynlp 16 | emoji 17 | transformers 18 | streamlit 19 | uvicorn 20 | fastapi 21 | https://github.com/ufoym/imbalanced-dataset-sampler/archive/master.zip -------------------------------------------------------------------------------- /simple_test.py: -------------------------------------------------------------------------------- 1 | import transformers # fixed: needed for the version print in main(); replaces an unused torch._C import 2 | from transformers import pipeline, AutoTokenizer 3 | import torch 4 | from utils.util import read_json 5 | import argparse 6 | import pprint 7 | import time 8 | 9 | 10 | def main(config): 11 | device = 0 if torch.cuda.is_available() else -1 12 | print(transformers.__version__) 13 | model_path = config.model_path 14 | config_path = config.config_path 15 | 16 | model = torch.load(model_path) 17 | model_config = read_json(config_path) 18 | 19 | tokenizer = AutoTokenizer.from_pretrained(config.tokenizer) 20 | 21 | pipe = pipeline( 22 | task="text-classification", 23 | config=model_config, 24 | model=model.model, 25 | tokenizer=tokenizer, 26 | device=device 27 | ) 28 | 29 | for i in range(config.num): 30 | text = input(f"문장을 입력하세요 {i+1} / {config.num}: ") 31 | 32 | start_time = time.time() 33 | result = pipe(text) 34 | end_time = time.time() 35 | 36 | print(f'inference time: {end_time - start_time}') 37 | pprint.pprint(result) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument( 43 | "-m", 44 | "--model_path", 45 | default=None, 46 | type=str, 47 | help="saved model file (.pt) path (default: None)", 48 | ) 49 | parser.add_argument( 50 | "-c", 51 | "--config_path", 52 | default=None, 53 | type=str, 54 | help="saved model config file path (default: None)", 55 | ) 56 | parser.add_argument( 57 | "-t", 58 | "--tokenizer", 59 | default="beomi/KcELECTRA-base", 60 | type=str, 61 | help="pretrained tokenizer name (default: beomi/KcELECTRA-base)", 62 | ) 63 | parser.add_argument( 64 | "-n", 65 | "--num", 66 | default=3, 67 | type=int, 68 | help="How many times will you check the sentence? 
(default: 3)", 69 | ) 70 | args = parser.parse_args() 71 | main(args) 72 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | from tqdm import tqdm 5 | import pandas as pd 6 | import data_loader.data_loaders as module_data 7 | import model.loss as module_loss 8 | import model.metric as module_metric 9 | import model.model as module_arch 10 | from transformers import AutoTokenizer 11 | from parse_config import ConfigParser 12 | from datasets import load_dataset 13 | 14 | IDX_2_LABEL = { 15 | 0: "none", 16 | 1: "offensive", 17 | 2: "hate" 18 | } 19 | 20 | def main(config): 21 | # load model and tokenizer architecture 22 | model = torch.load(config['model']['type']) 23 | tokenizer = AutoTokenizer.from_pretrained(config['tokenizer']['type']) 24 | 25 | # setup data_loader instances 26 | data_loader = getattr(module_data, 'KhsDataLoader')( 27 | tokenizer, 28 | max_length=config['data_loader']['args']['max_length'] 29 | ) 30 | data_loader = data_loader.get_dataloader( 31 | name='test', 32 | data_dir=config['data_loader']['args']['data_dir'], 33 | data_files=config['data_loader']['test_data_file'], 34 | batch_size=config['data_loader']['args']['batch_size'] 35 | ) 36 | 37 | # prepare model for testing 38 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 39 | model = model.to(device) 40 | 41 | scaler = ( 42 | torch.cuda.amp.GradScaler() if config['fp16'] and device != torch.device("cpu") else None 43 | ) 44 | 45 | model.eval() 46 | 47 | output_pred = [] 48 | 49 | with torch.no_grad(): 50 | for step, data in enumerate(tqdm(data_loader)): 51 | input_ids, token_type_ids, attention_mask = data 52 | 53 | input_ids = input_ids.to(device) 54 | attention_mask = attention_mask.to(device) 55 | token_type_ids = token_type_ids.to(device) 56 | 57 | inputs = { 58 | "input_ids": input_ids, 59 | "attention_mask": attention_mask, 60 | "token_type_ids": token_type_ids 61 | } 62 | 63 | if scaler: 64 | with torch.cuda.amp.autocast(): 65 | outputs = model(inputs) 66 | else: 67 | outputs = model(inputs) 68 | 69 | if isinstance(outputs, torch.Tensor): 70 | logits = outputs 71 | else: 72 | logits = outputs[0] 73 | 74 | _, preds = torch.max(logits, dim=1) 75 | 76 | output_pred.extend(preds.detach().cpu().numpy()) 77 | 78 | dataset = load_dataset(config['data_loader']['args']['data_dir'], data_files=config['data_loader']['test_data_file'], use_auth_token=True) 79 | test_df = pd.DataFrame() 80 | test_df['comments'] = dataset['test']['comments'] 81 | test_df['label'] = output_pred 82 | test_df.to_csv( 83 | 'data/result.csv', 84 | index=None 85 | ) 86 | 87 | if __name__ == '__main__': 88 | args = argparse.ArgumentParser(description='PyTorch Template') 89 | args.add_argument('-c', '--config', default=None, type=str, 90 | help='config file path (default: None)') 91 | args.add_argument('-r', '--resume', default=None, type=str, 92 | help='path to latest checkpoint (default: None)') 93 | args.add_argument('-d', '--device', default=None, type=str, 94 | help='indices of GPUs to enable (default: all)') 95 | 96 | config = ConfigParser.from_args(args) 97 | main(config) 98 | -------------------------------------------------------------------------------- /test_automl.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | sys.path.append("automl") 4 | from json import load 5 | import os 6 | 
import torch 7 | import argparse 8 | from tqdm import tqdm 9 | import pandas as pd 10 | import data_loader.data_loaders as module_data 11 | import model.loss as module_loss 12 | import model.metric as module_metric 13 | import model.model as module_arch 14 | from transformers import AutoTokenizer 15 | from parse_config import ConfigParser 16 | from datasets import load_dataset 17 | from utils import read_json 18 | from automl.src.model import Model 19 | from typing import Any, Dict, Union 20 | import yaml 21 | 22 | IDX_2_LABEL = {0: "none", 1: "offensive", 2: "hate"} 23 | 24 | 25 | def read_yaml(cfg: Union[str, Dict[str, Any]]): 26 | if not isinstance(cfg, dict): 27 | with open(cfg) as f: 28 | config = yaml.load(f, Loader=yaml.FullLoader) 29 | else: 30 | config = cfg 31 | return config 32 | 33 | 34 | def main(config): 35 | # load model and tokenizer architecture 36 | config_model = read_yaml( 37 | os.path.join( 38 | config["saved_folder"]["path"], 39 | "trial" + str(config["saved_folder"]["trial"]), 40 | config["saved_folder"]["model_config"], 41 | ) 42 | ) 43 | model = Model(config_model, verbose=True) 44 | print(model) 45 | 46 | model.load_state_dict( 47 | torch.load( 48 | os.path.join( 49 | config["saved_folder"]["path"], 50 | "trial" + str(config["saved_folder"]["trial"]), 51 | config["saved_folder"]["model_weight"], 52 | ) 53 | ) 54 | ) 55 | tokenizer = AutoTokenizer.from_pretrained(config["model"]["args"]["name"]) 56 | 57 | # setup data_loader instances 58 | data_loader = getattr(module_data, "KhsDataLoader")( 59 | tokenizer, max_length=config["data_loader"]["args"]["max_length"] 60 | ) 61 | data_loader = data_loader.get_dataloader( 62 | name="test", 63 | data_dir=config["data_dir"], 64 | data_files=config["test_data_file"], 65 | batch_size=config["data_loader"]["args"]["batch_size"], 66 | ) 67 | 68 | # prepare model for testing 69 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 70 | model = model.to(device) 71 | model.eval() 72 | 73 | output_pred = [] 74 | 75 | with torch.no_grad(): 76 | for step, data in enumerate(tqdm(data_loader)): 77 | input_ids, token_type_ids, attention_mask = data 78 | 79 | input_ids = input_ids.to(device) 80 | attention_mask = attention_mask.to(device) 81 | token_type_ids = token_type_ids.to(device) 82 | 83 | inputs = { 84 | "input_ids": input_ids, 85 | "attention_mask": attention_mask, 86 | "token_type_ids": token_type_ids, 87 | } 88 | outputs = model(inputs) 89 | 90 | if isinstance(outputs, torch.Tensor): 91 | logits = outputs 92 | else: 93 | logits = outputs[0] 94 | 95 | _, preds = torch.max(logits, dim=1) 96 | 97 | output_pred.extend(preds.detach().cpu().numpy()) 98 | 99 | dataset = load_dataset( 100 | config["data_dir"], data_files=config["test_data_file"], use_auth_token=True 101 | ) 102 | test_df = pd.DataFrame() 103 | test_df["comments"] = dataset["test"]["comments"] 104 | test_df["label"] = output_pred 105 | test_df.to_csv("data/result.csv", index=None) 106 | 107 | 108 | if __name__ == "__main__": 109 | args = argparse.ArgumentParser(description="PyTorch Template") 110 | args.add_argument( 111 | "-c", 112 | "--config", 113 | default=None, 114 | type=str, 115 | help="config file path (default: None)", 116 | ) 117 | args.add_argument( 118 | "-r", 119 | "--resume", 120 | default=None, 121 | type=str, 122 | help="path to latest checkpoint (default: None)", 123 | ) 124 | args.add_argument( 125 | "-d", 126 | "--device", 127 | default=None, 128 | type=str, 129 | help="indices of GPUs to enable (default: all)", 130 | ) 131 | 132 | 
config = ConfigParser.from_args(args) 133 | main(config) 134 | -------------------------------------------------------------------------------- /tokenizer/special_tokens_map.json: -------------------------------------------------------------------------------- 1 | {"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"} -------------------------------------------------------------------------------- /tokenizer/tokenizer_config.json: -------------------------------------------------------------------------------- 1 | {"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "do_basic_tokenize": true, "never_split": null, "model_max_length": 512, "special_tokens_map_file": "/home/beomi/.cache/huggingface/transformers/6bddca875f34b8afbae26136b9594ea80793c9598640f0bc94017555a0a1c113.31b83c6ab34462cefd974ed0df8dd4189e7b7b81b47315b7a10627f7ae120002", "name_or_path": "beomi/KcELECTRA-base", "tokenizer_class": "BertTokenizer"} -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import wandb 4 | import random 5 | import argparse 6 | import collections 7 | import numpy as np 8 | import torch.nn as nn 9 | import data_loader.data_loaders as module_data 10 | import model.loss as module_loss 11 | import model.metric as module_metric 12 | import model.model as module_arch 13 | from parse_config import ConfigParser 14 | from trainer import Trainer 15 | from utils import prepare_device 16 | from transformers import AutoTokenizer 17 | from data_loader.data_loaders import KhsDataLoader 18 | 19 | 20 | def seed_everything(seed): 21 | """ 22 | fix random seeds for reproducibility. 
23 | Args: 24 | seed (int): 25 | seed number 26 | """ 27 | torch.manual_seed(seed) 28 | torch.cuda.manual_seed(seed) 29 | torch.cuda.manual_seed_all(seed) # if using multi-GPU 30 | torch.backends.cudnn.deterministic = True 31 | torch.backends.cudnn.benchmark = False 32 | np.random.seed(seed) 33 | random.seed(seed) 34 | 35 | 36 | def main(config): 37 | seed_everything(42) 38 | wandb.init(project='#TODO', entity='#TODO', config=config) 39 | 40 | # build model architecture and tokenizer 41 | model = config.init_obj('model', module_arch) 42 | tokenizer = AutoTokenizer.from_pretrained(config['tokenizer']['type']) 43 | 44 | # build train and valid dataloader 45 | dataloader = KhsDataLoader( 46 | tokenizer, 47 | max_length=config['data_loader']['args']['max_length'] 48 | ) 49 | train_data_loader = dataloader.get_dataloader( 50 | name='train', 51 | data_dir=config['data_loader']['args']['data_dir'], 52 | data_files=config['data_loader']['data_files'], 53 | batch_size=config['data_loader']['args']['batch_size'] 54 | ) 55 | valid_data_loader = dataloader.get_dataloader( 56 | name='valid', 57 | data_dir=config['data_loader']['args']['data_dir'], 58 | data_files=config['data_loader']['data_files'], 59 | batch_size=config['data_loader']['args']['batch_size'] 60 | ) 61 | 62 | # prepare for (multi-device) GPU training 63 | device, device_ids = prepare_device(config['n_gpu']) 64 | model = model.to(device) 65 | if len(device_ids) > 1: 66 | model = torch.nn.DataParallel(model, device_ids=device_ids) 67 | 68 | # get function handles of loss and metrics 69 | criterion = getattr(module_loss, config['loss']) 70 | metrics = [getattr(module_metric, met) for met in config['metrics']] 71 | 72 | # build optimizer and learning rate scheduler; to disable the scheduler, delete every line containing lr_scheduler 73 | no_decay = ['bias', 'LayerNorm.weight'] 74 | trainable_params = [ 75 | { 76 | 'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 77 | 'weight_decay': config['optimizer']['weight_decay'] 78 | }, 79 | { 80 | 'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 81 | 'weight_decay': 0.0 82 | } 83 | ] 84 | 85 | optimizer = config.init_obj('optimizer', torch.optim, trainable_params) 86 | lr_scheduler = config.init_obj('lr_scheduler', torch.optim.lr_scheduler, optimizer) 87 | scaler = ( 88 | torch.cuda.amp.GradScaler() if config['trainer']['fp16'] and device != torch.device("cpu") else None 89 | ) 90 | 91 | trainer = Trainer( 92 | model, 93 | criterion, 94 | metrics, 95 | optimizer, 96 | config=config, 97 | device=device, 98 | data_loader=train_data_loader, 99 | valid_data_loader=valid_data_loader, 100 | lr_scheduler=lr_scheduler, 101 | scaler=scaler 102 | ) 103 | 104 | trainer.train() 105 | 106 | if __name__ == '__main__': 107 | args = argparse.ArgumentParser(description='PyTorch Template') 108 | args.add_argument('-c', '--config', default=None, type=str, 109 | help='config file path (default: None)') 110 | args.add_argument('-r', '--resume', default=None, type=str, 111 | help='path to latest checkpoint (default: None)') 112 | args.add_argument('-d', '--device', default=None, type=str, 113 | help='indices of GPUs to enable (default: all)') 114 | 115 | config = ConfigParser.from_args(args) 116 | main(config) 117 | 
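# For reference: with the standard pytorch-template ConfigParser, config.init_obj('optimizer', torch.optim, trainable_params)
# resolves to getattr(torch.optim, cfg['optimizer']['type'])(trainable_params, **cfg['optimizer']['args']).
# A hypothetical fragment of config.json, written as a Python dict (field names assumed, not copied from the shipped config):
#   {"optimizer": {"type": "AdamW", "weight_decay": 0.01, "args": {"lr": 5e-5}},
#    "lr_scheduler": {"type": "StepLR", "args": {"step_size": 50, "gamma": 0.1}}}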
-------------------------------------------------------------------------------- /trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .trainer import * 2 | from .kd_trainer import * 3 | -------------------------------------------------------------------------------- /trainer/kd_trainer.py: -------------------------------------------------------------------------------- 1 | import wandb 2 | import warnings 3 | import numpy as np 4 | import torch 5 | import torch.nn as nn 6 | from tqdm import tqdm 7 | from torchvision.utils import make_grid 8 | from base import BaseTrainer 9 | from utils import inf_loop, MetricTracker 10 | import model.loss as module_loss 11 | 12 | class KnowDistTrainer(BaseTrainer): 13 | """ 14 | Trainer class 15 | """ 16 | def __init__(self, student_model, teacher_model, criterion, metric_ftns, optimizer, config, device, 17 | data_loader, valid_data_loader=None, lr_scheduler=None): 18 | super().__init__(student_model, criterion, metric_ftns, optimizer, config) 19 | 20 | # self.model = student model => assigned in BaseTrainer class. 21 | self.teacher_model = teacher_model 22 | self.config = config 23 | self.device = device 24 | self.data_loader = data_loader 25 | self.steps_per_epoch = len(self.data_loader) 26 | self.batch_size = self.data_loader.batch_size 27 | 28 | self.valid_data_loader = valid_data_loader 29 | self.do_validation = self.valid_data_loader is not None 30 | self.lr_scheduler = lr_scheduler 31 | self.valid_criterion = getattr(module_loss, 'softmax') 32 | 33 | self.train_metrics = MetricTracker('train/loss', *['train/' + m.__name__ for m in self.metric_ftns]) 34 | self.valid_metrics = MetricTracker('val/loss', *['val/' + m.__name__ for m in self.metric_ftns]) 35 | 36 | def train(self): 37 | """ 38 | Training logic over all epochs. 39 | 40 | Validation and checkpointing run every `save_steps` steps; 41 | logged metrics are averaged between validation rounds. 42 | """ 43 | step = 0 44 | self.model.train() 45 | self.teacher_model.eval() 46 | 47 | for epoch in range(1, self.epochs + 1): 48 | for _, data in enumerate(tqdm(self.data_loader, desc=f'TRAINING - [{epoch}] EPOCH')): 49 | step += 1 50 | 51 | st_input_ids, st_token_type_ids, st_attention_mask, tc_input_ids, tc_token_type_ids, tc_attention_mask, targets = data 52 | 53 | st_input_ids = st_input_ids.to(self.device) 54 | st_attention_mask = st_attention_mask.to(self.device) 55 | st_token_type_ids = st_token_type_ids.to(self.device) 56 | 57 | tc_input_ids = tc_input_ids.to(self.device) 58 | tc_attention_mask = tc_attention_mask.to(self.device) 59 | tc_token_type_ids = tc_token_type_ids.to(self.device) 60 | 61 | targets = targets.to(self.device) 62 | 63 | st_inputs = { 64 | "input_ids": st_input_ids, 65 | "attention_mask": st_attention_mask, 66 | "token_type_ids": st_token_type_ids 67 | } 68 | tc_inputs = { 69 | "input_ids": tc_input_ids, 70 | "attention_mask": tc_attention_mask, 71 | "token_type_ids": tc_token_type_ids 72 | } 73 | 74 | student_outputs = self.model(st_inputs) 75 | with torch.no_grad():  # the teacher is frozen; skip building its autograd graph 76 | teacher_outputs = self.teacher_model(tc_inputs) 77 | 78 | if isinstance(student_outputs, torch.Tensor): 79 | student_logits = student_outputs 80 | else: 81 | student_logits = student_outputs[0] 82 | 83 | if isinstance(teacher_outputs, torch.Tensor): 84 | teacher_logits = teacher_outputs 85 | else: 86 | teacher_logits = teacher_outputs[0] 87 | 88 | loss = self.criterion(student_logits, targets, teacher_logits) 89 | 90 | self.optimizer.zero_grad() 91 | 92 | loss.backward() 93 | 94 | # https://curiousily.com/posts/sentiment-analysis-with-bert-and-hugging-face-using-pytorch-and-python/ 95 | # avoiding exploding gradients 96 | nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0) 
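# For context: self.criterion(student_logits, targets, teacher_logits) above is expected to mix a
# hard-label term with a soft-label distillation term, along the lines of (temperature T and
# mixing weight alpha are assumptions, not values read from model/loss.py):
#   loss = alpha * CE(student_logits, targets)
#          + (1 - alpha) * T**2 * KL(log_softmax(student_logits / T), softmax(teacher_logits / T))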
97 | self.optimizer.step() 98 | if self.lr_scheduler is not None: 99 | self.lr_scheduler.step() 100 | 101 | _, preds = torch.max(student_logits, dim=1) 102 | preds = preds.detach().cpu().numpy() 103 | targets = targets.detach().cpu().numpy() 104 | 105 | self.train_metrics.update('train/loss', loss.item()) 106 | for met in self.metric_ftns: 107 | self.train_metrics.update('train/' + met.__name__, met(preds, targets)) 108 | 109 | # activate validation and saving when current step meets 'save steps' 110 | if step % self.save_steps == 0: 111 | log = self.train_metrics.result() 112 | 113 | if self.do_validation: 114 | val_log = self._validation(step) 115 | log.update(val_log) 116 | 117 | log['epoch'] = epoch 118 | log['steps'] = step 119 | 120 | # visualization log 121 | wandb.log(log, step=step) 122 | 123 | for key, value in log.items(): 124 | self.logger.info(' {:15s}: {}'.format(str(key), value)) 125 | 126 | is_best = self._evaluate_performance(log) 127 | 128 | # Early Stopping 129 | if self.not_improved_count > self.early_stop: 130 | self.logger.info("Validation performance didn't improve for {} validation rounds. Training stops.".format(self.early_stop)) 131 | return False 132 | 133 | self._save_checkpoint(log, is_best) 134 | 135 | # get back to work again! 136 | self.model.train() 137 | self.train_metrics.reset() 138 | 139 | def _validation(self, step): 140 | """ 141 | Validate the model on the validation set 142 | 143 | :param step: Integer, current training step. 144 | :return: A log that contains information about validation 145 | """ 146 | self.model.eval() 147 | self.valid_metrics.reset() 148 | 149 | with torch.no_grad(): 150 | print(f"VALIDATION - [{step}] STEPS ...") 151 | for _, data in enumerate(self.valid_data_loader): 152 | input_ids, token_type_ids, attention_mask, targets = data 153 | 154 | input_ids = input_ids.to(self.device) 155 | attention_mask = attention_mask.to(self.device) 156 | token_type_ids = token_type_ids.to(self.device) 157 | targets = targets.to(self.device) 158 | 159 | inputs = { 160 | "input_ids": input_ids, 161 | "attention_mask": attention_mask, 162 | "token_type_ids": token_type_ids 163 | } 164 | outputs = self.model(inputs) 165 | 166 | if isinstance(outputs, torch.Tensor): 167 | logits = outputs 168 | else: 169 | logits = outputs[0] 170 | 171 | loss = self.valid_criterion(logits, targets) 172 | 173 | _, preds = torch.max(logits, dim=1) 174 | preds = preds.detach().cpu().numpy() 175 | targets = targets.detach().cpu().numpy() 176 | 177 | self.valid_metrics.update('val/loss', loss.item()) 178 | for met in self.metric_ftns: 179 | self.valid_metrics.update('val/' + met.__name__, met(preds, targets)) 180 | 181 | return self.valid_metrics.result() 182 | 183 | def _progress(self, batch_idx): 184 | base = '[{}/{} ({:.0f}%)]' 185 | if hasattr(self.data_loader, 'n_samples'): 186 | current = batch_idx * self.data_loader.batch_size 187 | total = self.data_loader.n_samples 188 | else: 189 | current = batch_idx 190 | total = self.steps_per_epoch 191 | return base.format(current, total, 100.0 * current / total) -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import * 2 | -------------------------------------------------------------------------------- /utils/api_response.py: -------------------------------------------------------------------------------- 1 | # { 2 | # code:'', 3 | # msg:'', 4 | # data:'', 5 | 
# request_time:'', 6 | # response_time:'' 7 | # } 8 | from datetime import datetime 9 | from pprint import pprint 10 | from error_handler import Code, Msg 11 | 12 | class ApiResponse(): 13 | 14 | @classmethod 15 | def success(cls, code: int, msg: str, data: object, request_time: datetime) -> dict: 16 | return { 17 | 'code': code, 18 | 'msg': msg, 19 | 'data': data, 20 | 'request_time': request_time, 21 | 'response_time': datetime.now().strftime('%Y/%m/%d, %H:%M:%S') 22 | } 23 | 24 | @classmethod 25 | def fail(cls, code: int, msg: str, request_time: datetime) -> dict: 26 | return { 27 | 'code': code, 28 | 'msg': msg, 29 | 'data': None, 30 | 'request_time': request_time, 31 | 'response_time': datetime.now().strftime('%Y/%m/%d, %H:%M:%S') 32 | } 33 | 34 | class TestClass: 35 | def __init__(self) -> None: 36 | pass 37 | 38 | test_class = TestClass() 39 | pprint(ApiResponse.success(Code.SUCCESS, Msg.SUCCESS, test_class, datetime.now().strftime('%Y/%m/%d, %H:%M:%S'))) -------------------------------------------------------------------------------- /utils/error_handler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | class CustomError(Exception): 5 | def __init__(self, code: int, msg: str) -> None: 6 | self.code = code 7 | self.msg = msg 8 | 9 | def __str__(self) -> str: 10 | return f'[{self.code}]{self.msg}' 11 | 12 | class Msg: 13 | SUCCESS = 'SUCCESS' # success 14 | WRONG_FORMAT = '잘못된 입력 형식입니다.' # "Invalid input format." 15 | 16 | class Code: 17 | SUCCESS = 200 # success 18 | BAD_REQUEST = 400 19 | 20 | s = 'jyp' 21 | try: 22 | if s == 'jyp': 23 | raise CustomError(Code.BAD_REQUEST, Msg.WRONG_FORMAT) 24 | except CustomError as e: 25 | logging.info(e) 26 | -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | import torch 4 | import emoji 5 | import wandb 6 | import pandas as pd 7 | from pathlib import Path 8 | from itertools import repeat 9 | from collections import OrderedDict 10 | from soynlp.normalizer import repeat_normalize 11 | 12 | 13 | def ensure_dir(dirname): 14 | dirname = Path(dirname) 15 | if not dirname.is_dir(): 16 | dirname.mkdir(parents=True, exist_ok=False) 17 | 18 | def read_json(fname): 19 | fname = Path(fname) 20 | with fname.open('rt') as handle: 21 | return json.load(handle, object_hook=OrderedDict) 22 | 23 | def write_json(content, fname): 24 | fname = Path(fname) 25 | with fname.open('wt') as handle: 26 | json.dump(content, handle, indent=4, sort_keys=False) 27 | 28 | def inf_loop(data_loader): 29 | ''' wrapper function for endless data loader. ''' 30 | for loader in repeat(data_loader): 31 | yield from loader 32 | 33 | def prepare_device(n_gpu_use): 34 | """ 35 | setup GPU device if available. 
get gpu device indices which are used for DataParallel 36 | """ 37 | n_gpu = torch.cuda.device_count() 38 | if n_gpu_use > 0 and n_gpu == 0: 39 | print("Warning: There\'s no GPU available on this machine," 40 | "training will be performed on CPU.") 41 | n_gpu_use = 0 42 | if n_gpu_use > n_gpu: 43 | print(f"Warning: The number of GPU\'s configured to use is {n_gpu_use}, but only {n_gpu} are " 44 | "available on this machine.") 45 | n_gpu_use = n_gpu 46 | device = torch.device('cuda:0' if n_gpu_use > 0 else 'cpu') 47 | list_ids = list(range(n_gpu_use)) 48 | return device, list_ids 49 | 50 | 51 | class MetricTracker: 52 | def __init__(self, *keys): 53 | self._data = pd.DataFrame(index=keys, columns=['total', 'counts', 'average']) 54 | self.reset() 55 | 56 | def reset(self): 57 | for col in self._data.columns: 58 | self._data[col].values[:] = 0 59 | 60 | def update(self, key, value, n=1): 61 | self._data.total[key] += value * n 62 | self._data.counts[key] += n 63 | self._data.average[key] = self._data.total[key] / self._data.counts[key] 64 | 65 | def avg(self, key): 66 | return self._data.average[key] 67 | 68 | def result(self): 69 | return dict(self._data.average) 70 | 71 | 72 | 73 | def preprocess(sents): 74 | """ 75 | kcELECTRA-base preprocess procedure + modification 76 | """ 77 | preprocessed_sents = [] 78 | 79 | emojis = set() 80 | for k in emoji.UNICODE_EMOJI.keys(): 81 | emojis.update(emoji.UNICODE_EMOJI[k].keys()) 82 | 83 | punc_bracket_pattern = re.compile(r'[\'\"\[\]\(\)]') 84 | base_pattern = re.compile(f'[^.,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{"".join(emojis)}]+')  # join the set so the emoji characters (not the set's repr) land in the character class 85 | url_pattern = re.compile( 86 | r'(http|ftp|https)?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)' 87 | ) 88 | 89 | for sent in sents: 90 | sent = punc_bracket_pattern.sub(' ', sent) 91 | sent = base_pattern.sub(' ', sent) 92 | sent = url_pattern.sub('', sent) 93 | sent = sent.strip() 94 | sent = repeat_normalize(sent, num_repeats=2) 95 | preprocessed_sents.append(sent) 96 | 97 | return preprocessed_sents 98 | 99 | 
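# Illustrative call (the exact output depends on the emoji table of the installed `emoji` package):
#   preprocess(['"좋아요" ㅋㅋㅋㅋ https://example.com'])
#   # -> roughly ['좋아요 ㅋㅋ'], with quotes/brackets blanked, the URL dropped,
#   #    and character repeats capped at two by repeat_normalize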
", "\ufeff": ""} 156 | preprcessed_sents = [] 157 | for sent in self.sents: 158 | for bad_char in bad_chars: 159 | sent = sent.replace(bad_char, bad_chars[bad_char]) 160 | sent = re.sub(r"[\+á?\xc3\xa1]", "", sent) 161 | if sent: 162 | preprcessed_sents.append(sent) 163 | self.sents = preprcessed_sents 164 | 165 | def clean_punc(self): 166 | """ 167 | A function for removing useless punctuation 168 | """ 169 | punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", 170 | "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", 171 | '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 172 | 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', 'ㅂㅅ': '병신', 'ㄲㅈ': '꺼져', 'ㅂㄷ': '부들', 'ㅆㄹㄱ': '쓰레기', 'ㅆㅂ': '씨발', 173 | 'ㅈㅅ': '죄송', 'ㅈㄹ': '지랄', 'ㅈㄴ': '정말'} 174 | 175 | preprocessed_sents = [] 176 | for sent in self.sents: 177 | for p in punct_mapping: 178 | sent = sent.replace(p, punct_mapping[p]) 179 | sent = sent.strip() 180 | if sent: 181 | preprocessed_sents.append(sent) 182 | self.sents = preprocessed_sents 183 | 184 | def remove_useless_char(self): 185 | preprocessed_sents = [] 186 | re_obj = re.compile('[^가-힣a-z0-9\x20]+') 187 | 188 | for sent in self.sents: 189 | temp = re_obj.findall(sent) 190 | if temp != []: 191 | for ch in temp: 192 | sent = sent.replace(ch, " ") 193 | sent = sent.strip() 194 | if sent: 195 | preprocessed_sents.append(sent) 196 | 197 | self.sents = preprocessed_sents 198 | 199 | def remove_repeated_spacing(self): 200 | """ 201 | A function for reducing whitespaces into one 202 | """ 203 | preprocessed_sents = [] 204 | for sent in self.sents: 205 | sent = re.sub(r"\s+", " ", sent).strip() 206 | if sent: 207 | preprocessed_sents.append(sent) 208 | self.sents = preprocessed_sents 209 | 210 | def spacing_sent(self): 211 | """ 212 | A function for spacing properly 213 | """ 214 | preprocessed_sents = [] 215 | for sent in self.sents: 216 | sent = self.spacing(sent) 217 | if sent: 218 | preprocessed_sents.append(sent) 219 | self.sents = preprocessed_sents 220 | 221 | def remove_linesign(self): 222 | """ 223 | A function for removing line sings like \n 224 | """ 225 | preprocessed_sents = [] 226 | for sent in self.sents: 227 | sent = re.sub(r"[\n\t\r\v\f\\\\n\\t\\r\\v\\f]", "", sent) 228 | if sent: 229 | preprocessed_sents.append(sent) 230 | self.sents = preprocessed_sents 231 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import math 3 | import numpy as np 4 | import torch 5 | import re 6 | from datasets import load_dataset 7 | 8 | # load FAISS GPU library if available (dramatically accelerates the nearest neighbor search) 9 | try: 10 | import faiss 11 | 12 | FAISS_AVAILABLE = hasattr(faiss, "StandardGpuResources") 13 | except ImportError: 14 | FAISS_AVAILABLE = False 15 | sys.stderr.write("FAISS library was not found.\n") 16 | 17 | 18 | def get_gaussian_keys(n_keys, dim, normalized, seed): 19 | """ 20 | Generate random Gaussian keys. 21 | """ 22 | rng = np.random.RandomState(seed) 23 | X = rng.randn(n_keys, dim) 24 | if normalized: 25 | X /= np.linalg.norm(X, axis=1, keepdims=True) 26 | return X.astype(np.float32) 27 | 28 | 29 | def get_uniform_keys(n_keys, dim, normalized, seed): 30 | """ 31 | Generate random uniform keys (same initialization as nn.Linear). 
32 | """ 33 | rng = np.random.RandomState(seed) 34 | bound = 1 / math.sqrt(dim) 35 | X = rng.uniform(-bound, bound, (n_keys, dim)) 36 | if normalized: 37 | X /= np.linalg.norm(X, axis=1, keepdims=True) 38 | return X.astype(np.float32) 39 | 40 | 41 | def get_slices(dim, head_id): 42 | """ 43 | Generate slices of hidden dimensions. 44 | Used when there are multiple heads and/or different set of keys, 45 | and that there is no query network. 46 | """ 47 | if head_id == 0: 48 | return [(0, dim)] 49 | offset = dim // (2 ** (head_id + 1)) 50 | starts = np.arange(0, dim, offset) 51 | slices1 = [(x, x + offset) for i, x in enumerate(starts) if i % 2 == 0] 52 | slices2 = [(x, x + offset) for i, x in enumerate(starts) if i % 2 == 1] 53 | return slices1 + slices2 54 | 55 | 56 | def cartesian_product(a, b): 57 | """ 58 | Compute the batched cartesian product between two matrices. 59 | Input: 60 | a: Tensor(n, d1) 61 | b: Tensor(n, d2) 62 | Output: 63 | output: Tensor(n, d1 * d2, 2) 64 | """ 65 | n1, d1 = a.shape 66 | n2, d2 = b.shape 67 | assert n1 == n2 68 | return torch.cat( 69 | [ 70 | a.unsqueeze(-1).repeat(1, 1, d2).unsqueeze(-1), 71 | b.repeat(1, d1).view(n2, d1, d2).unsqueeze(-1), 72 | ], 73 | 3, 74 | ).view(n1, d1 * d2, 2) 75 | 76 | 77 | def swig_ptr_from_FloatTensor(x): 78 | assert x.is_contiguous() 79 | assert x.dtype == torch.float32 80 | return faiss.cast_integer_to_float_ptr( 81 | x.storage().data_ptr() + x.storage_offset() * 4 82 | ) 83 | 84 | 85 | def swig_ptr_from_IndicesTensor(x): 86 | """gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU)""" 87 | assert x.is_contiguous() 88 | assert x.dtype == torch.int64, "dtype=%s" % x.dtype 89 | return faiss.cast_integer_to_idx_t_ptr( 90 | x.storage().data_ptr() + x.storage_offset() * 8 91 | ) 92 | 93 | 94 | def get_knn_pytorch(a, b, k, distance="dot_product"): 95 | """ 96 | Input: 97 | - matrix of size (m, d) (keys) 98 | - matrix of size (n, d) (queries) 99 | - number of nearest neighbors 100 | - distance metric 101 | Output: 102 | - `scores` matrix of size (n, k) with nearest neighors scores 103 | - `indices` matrix of size (n, k) with nearest neighors indices 104 | """ 105 | m, d = a.size() 106 | n, _ = b.size() 107 | assert b.size(1) == d 108 | assert k > 0 109 | assert distance in ["dot_product", "cosine", "l2"] 110 | 111 | with torch.no_grad(): 112 | 113 | if distance == "dot_product": 114 | scores = a.mm(b.t()) # (m, n) 115 | 116 | elif distance == "cosine": 117 | scores = a.mm(b.t()) # (m, n) 118 | scores /= a.norm(2, 1)[:, None] + 1e-9 # (m, n) 119 | scores /= b.norm(2, 1)[None, :] + 1e-9 # (m, n) 120 | 121 | elif distance == "l2": 122 | scores = a.mm(b.t()) # (m, n) 123 | scores *= 2 # (m, n) 124 | scores -= (a ** 2).sum(1)[:, None] # (m, n) 125 | scores -= (b ** 2).sum(1)[None, :] # (m, n) 126 | 127 | scores, indices = scores.topk(k=k, dim=0, largest=True) # (k, n) 128 | scores = scores.t() # (n, k) 129 | indices = indices.t() # (n, k) 130 | 131 | return scores, indices 132 | 133 | 134 | def get_knn_faiss(xb, xq, k, distance="dot_product"): 135 | """ 136 | `metric` can be faiss.METRIC_INNER_PRODUCT or faiss.METRIC_L2 137 | https://github.com/facebookresearch/faiss/blob/master/gpu/test/test_pytorch_faiss.py 138 | """ 139 | assert xb.device == xq.device 140 | assert distance in ["dot_product", "l2"] 141 | metric = ( 142 | faiss.METRIC_INNER_PRODUCT if distance == "dot_product" else faiss.METRIC_L2 143 | ) 144 | 145 | xq_ptr = swig_ptr_from_FloatTensor(xq) 146 | xb_ptr = swig_ptr_from_FloatTensor(xb) 147 | 148 | nq, d1 = 
134 | def get_knn_faiss(xb, xq, k, distance="dot_product"): 135 | """ 136 | `metric` can be faiss.METRIC_INNER_PRODUCT or faiss.METRIC_L2 137 | https://github.com/facebookresearch/faiss/blob/master/gpu/test/test_pytorch_faiss.py 138 | """ 139 | assert xb.device == xq.device 140 | assert distance in ["dot_product", "l2"] 141 | metric = ( 142 | faiss.METRIC_INNER_PRODUCT if distance == "dot_product" else faiss.METRIC_L2 143 | ) 144 | 145 | xq_ptr = swig_ptr_from_FloatTensor(xq) 146 | xb_ptr = swig_ptr_from_FloatTensor(xb) 147 | 148 | nq, d1 = xq.size() 149 | nb, d2 = xb.size() 150 | assert d1 == d2 151 | 152 | D = torch.empty(nq, k, device=xb.device, dtype=torch.float32) 153 | I = torch.empty(nq, k, device=xb.device, dtype=torch.int64) 154 | 155 | D_ptr = swig_ptr_from_FloatTensor(D) 156 | I_ptr = swig_ptr_from_IndicesTensor(I) 157 | 158 | args = faiss.GpuDistanceParams() 159 | args.metric = metric 160 | args.k = k 161 | args.dims = d1 162 | args.vectors = xb_ptr 163 | args.vectorsRowMajor = False 164 | args.vectorType = faiss.DistanceDataType_F32 165 | args.numVectors = nb 166 | args.queries = xq_ptr 167 | args.queriesRowMajor = False 168 | args.queryType = faiss.DistanceDataType_F32 169 | args.numQueries = nq 170 | args.outDistances = D_ptr 171 | args.outIndices = I_ptr 172 | args.outIndicesType = faiss.IndicesDataType_I64 173 | 174 | faiss.bfKnn(FAISS_RES, args) 175 | return D, I 176 | 177 | 178 | if FAISS_AVAILABLE: 179 | FAISS_RES = faiss.StandardGpuResources() 180 | FAISS_RES.setDefaultNullStreamAllDevices() 181 | FAISS_RES.setTempMemory(1200 * 1024 * 1024) 182 | get_knn = get_knn_faiss 183 | else: 184 | sys.stderr.write( 185 | "FAISS not available. Switching to standard nearest neighbors search implementation.\n" 186 | ) 187 | get_knn = get_knn_pytorch 188 | 189 | 190 | class Postprocess: 191 | def __init__(self) -> None: 192 | self.checklist = None 193 | self.ko_slang = load_dataset( 194 | "AI-it/korean-hate-speech", 195 | data_files={"test": "korean_slang.json"}, 196 | use_auth_token=True, 197 | )["test"]["badwords"][0] 198 | self.en_slang = load_dataset( 199 | "AI-it/korean-hate-speech", 200 | data_files={"test": "english_slang.json"}, 201 | use_auth_token=True, 202 | )["test"]["badwords"][0] 203 | 204 | # detect Korean profanity 205 | def check_ko_slang(self, checklist: list): 206 | if not isinstance(checklist, list): 207 | raise TypeError( 208 | f"please check input type! (input type should be list, not {type(checklist)})" 209 | ) 210 | flag = [] 211 | none = [] 212 | pattern = "|".join(self.ko_slang) 213 | f = re.compile(pattern) 214 | for check_sen in checklist: 215 | if f.search(check_sen): 216 | flag.append(check_sen) 217 | none = list(set(checklist) - set(flag)) 218 | return flag, none 219 | 220 | # detect English profanity 221 | def check_en_slang(self, checklist: list): 222 | if not isinstance(checklist, list): 223 | raise TypeError( 224 | f"please check input type! (input type should be list, not {type(checklist)})" 225 | ) 226 | flag = [] 227 | none = [] 228 | pattern = "|".join(self.en_slang) 229 | f = re.compile(pattern) 230 | for check_sen in checklist: 231 | if f.search(check_sen): 232 | flag.append(check_sen) 233 | none = list(set(checklist) - set(flag)) 234 | return flag, none 235 | --------------------------------------------------------------------------------
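Usage sketch for the Postprocess helper above (illustrative only; the slang lists live in the private AI-it/korean-hate-speech dataset, so an authenticated Hugging Face token is required):

post = Postprocess()
flagged, clean = post.check_ko_slang(["문장 하나", "문장 둘"])  # sentences matching a slang pattern vs. the rest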