├── .gitattributes ├── 2-4 ├── 386 │ ├── README.md │ ├── custom_fairseq.py │ ├── data_loader.py │ ├── dataset.py │ ├── ensemble.py │ ├── evaluation.py │ ├── fairseq │ │ ├── __init__.py │ │ ├── binarizer.py │ │ ├── bleu.py │ │ ├── checkpoint_utils.py │ │ ├── clib │ │ │ └── libbleu │ │ │ │ ├── libbleu.cpp │ │ │ │ └── module.cpp │ │ ├── criterions │ │ │ ├── __init__.py │ │ │ ├── adaptive_loss.py │ │ │ ├── composite_loss.py │ │ │ ├── cross_entropy.py │ │ │ ├── fairseq_criterion.py │ │ │ ├── label_smoothed_cross_entropy.py │ │ │ └── masked_lm_loss.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── backtranslation_dataset.py │ │ │ ├── block_pair_dataset.py │ │ │ ├── concat_dataset.py │ │ │ ├── data_utils.py │ │ │ ├── dictionary.py │ │ │ ├── fairseq_dataset.py │ │ │ ├── indexed_dataset.py │ │ │ ├── iterators.py │ │ │ ├── language_pair_dataset.py │ │ │ ├── lm_context_window_dataset.py │ │ │ ├── masked_lm_dataset.py │ │ │ ├── masked_lm_dictionary.py │ │ │ ├── monolingual_dataset.py │ │ │ ├── multi_corpus_sampled_dataset.py │ │ │ ├── noising.py │ │ │ ├── round_robin_zip_datasets.py │ │ │ ├── token_block_dataset.py │ │ │ ├── transform_eos_dataset.py │ │ │ └── transform_eos_lang_pair_dataset.py │ │ ├── distributed_utils.py │ │ ├── file_utils.py │ │ ├── legacy_distributed_data_parallel.py │ │ ├── meters.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── composite_encoder.py │ │ │ ├── distributed_fairseq_model.py │ │ │ ├── fairseq_decoder.py │ │ │ ├── fairseq_encoder.py │ │ │ ├── fairseq_incremental_decoder.py │ │ │ ├── fairseq_model.py │ │ │ ├── fconv.py │ │ │ ├── fconv_lm.py │ │ │ ├── fconv_self_att.py │ │ │ ├── lightconv.py │ │ │ ├── lightconv_lm.py │ │ │ ├── lstm.py │ │ │ ├── masked_lm.py │ │ │ ├── multilingual_transformer.py │ │ │ ├── transformer.py │ │ │ ├── transformer_from_pretrained_xlm.py │ │ │ └── transformer_lm.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── adaptive_input.py │ │ │ ├── adaptive_softmax.py │ │ │ ├── beamable_mm.py │ │ │ ├── character_token_embedder.py │ │ │ ├── conv_tbc.py │ │ │ ├── downsampled_multihead_attention.py │ │ │ ├── dynamic_convolution.py │ │ │ ├── gelu.py │ │ │ ├── grad_multiply.py │ │ │ ├── highway.py │ │ │ ├── layer_norm.py │ │ │ ├── learned_positional_embedding.py │ │ │ ├── lightweight_convolution.py │ │ │ ├── linearized_convolution.py │ │ │ ├── logsumexp_moe.py │ │ │ ├── mean_pool_gating_network.py │ │ │ ├── multihead_attention.py │ │ │ ├── positional_embedding.py │ │ │ ├── scalar_bias.py │ │ │ ├── sinusoidal_positional_embedding.py │ │ │ ├── transformer_sentence_encoder.py │ │ │ ├── transformer_sentence_encoder_layer.py │ │ │ └── unfold.py │ │ ├── optim │ │ │ ├── __init__.py │ │ │ ├── adadelta.py │ │ │ ├── adafactor.py │ │ │ ├── adagrad.py │ │ │ ├── adam.py │ │ │ ├── fairseq_optimizer.py │ │ │ ├── fp16_optimizer.py │ │ │ ├── lamb.py │ │ │ ├── lr_scheduler │ │ │ │ ├── __init__.py │ │ │ │ ├── cosine_lr_scheduler.py │ │ │ │ ├── fairseq_lr_scheduler.py │ │ │ │ ├── fixed_schedule.py │ │ │ │ ├── inverse_square_root_schedule.py │ │ │ │ ├── polynomial_decay_schedule.py │ │ │ │ ├── reduce_lr_on_plateau.py │ │ │ │ └── triangular_lr_scheduler.py │ │ │ ├── nag.py │ │ │ └── sgd.py │ │ ├── options.py │ │ ├── pdb.py │ │ ├── progress_bar.py │ │ ├── registry.py │ │ ├── search.py │ │ ├── sequence_generator.py │ │ ├── sequence_generator_ensemble.py │ │ ├── sequence_scorer.py │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── cross_lingual_lm.py │ │ │ ├── fairseq_task.py │ │ │ ├── language_modeling.py │ │ │ ├── masked_lm.py │ │ │ ├── multilingual_translation.py │ │ │ ├── 
semisupervised_translation.py │ │ │ ├── translation.py │ │ │ ├── translation_from_pretrained_xlm.py │ │ │ └── translation_moe.py │ │ ├── tokenizer.py │ │ ├── trainer.py │ │ └── utils.py │ ├── meter.py │ ├── model.py │ ├── nsml_model │ │ └── best │ │ │ └── model │ │ │ ├── model.pt │ │ │ └── vocab.txt │ ├── requirements.txt │ ├── temp.py │ ├── tokenizer.py │ ├── train.py │ └── wordpiece.py ├── 487 │ ├── README.md │ ├── asdf.ipynb │ ├── data_loader.py │ ├── dataset.py │ ├── evaluation.py │ ├── main.py │ ├── model.py │ ├── noising.py │ ├── nsml_model │ │ └── best │ │ │ └── model │ │ │ ├── model.pt │ │ │ └── vocab.txt │ ├── pretrain_dataset.py │ ├── pykospacing │ │ ├── __init__.py │ │ ├── embedding_maker.py │ │ ├── kospacing.py │ │ ├── pykos.py │ │ └── resources │ │ │ ├── __init__.py │ │ │ ├── dicts │ │ │ └── c2v.dic │ │ │ └── models │ │ │ └── kospacing │ ├── requirements.txt │ ├── tokenizer.py │ ├── train.py │ ├── transformers_encoder_decoder.py │ └── utils │ │ ├── loss.py │ │ ├── meter.py │ │ ├── preprocess.py │ │ └── utils.py ├── 1062 │ ├── README.md │ ├── data_loader.py │ ├── dataset.py │ ├── evaluation.py │ ├── meter.py │ ├── model.py │ ├── model_AR.py │ ├── nsml_model │ │ └── 17143 │ │ │ └── model │ │ │ ├── model.pt │ │ │ ├── model_2.pt │ │ │ └── vocab.txt │ ├── requirements.txt │ ├── tokenizer.py │ └── train.py ├── command.txt └── rank.txt ├── 2-5 ├── 582 │ ├── README.md │ ├── arcface.py │ ├── data_loader.py │ ├── dataset.py │ ├── evaluation.py │ ├── main.py │ ├── model.py │ ├── nsml_model │ │ └── final │ │ │ └── model │ │ │ ├── model.pt │ │ │ └── model2.pt │ ├── predict.py │ ├── requirements.txt │ ├── sam.py │ ├── setup.py │ └── trainer.py ├── 756 │ ├── README.md │ ├── data_loader.py │ ├── dataset.py │ ├── evaluation.py │ ├── main.py │ ├── main2.py │ ├── main2_original.py │ ├── main_old.py │ ├── model.py │ ├── nsml_model │ │ └── king_of_ai │ │ │ └── model │ │ │ └── model.pt │ ├── nsml_package.txt │ ├── predict.py │ ├── setup.py │ ├── shell.py │ └── train.py ├── command.txt └── rank.txt ├── Copyright owner ├── LICENSE ├── NOTICE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /2-4/1062/README.md: -------------------------------------------------------------------------------- 1 | # 2-4 스마트에디터의 그래머리 (문장 교정/교열) 기능 고도화 2 | 3 | - 네이버 사용자가 작성한 문장을 문법적으로 맞는 문장으로 교정/교열 하는 모델을 만듭니다. 4 | 5 | 6 | ## 데이터 7 | - 학습데이터 8 | * `train/train_data/train_data`: 문법 오류가 섞인 문장 9 | * `train/train_data/train_annotation`: 문법 오류에 대한 annotation 10 | * `train/train_data/train_corpus`: 교정되지 않은 문장 11 | * `train/train_label`: 교정/교열된 문장 12 | - 평가 데이터 13 | * `test/test_data`: 문법 오류가 섞인 문장 14 | * `test/test_label`: 교정/교열된 문장 15 | - 평가 더미 데이 16 | * `test_submit/test_data`: 문법 오류가 섞인 문장 17 | * `test_submit/test_label`: 교정/교열된 문장 18 | - 문법 오류가 섞인 문장들(`*_data`)과 교정/교열된 문장들(`*_label`)은 line-by-line으로 매핑됩니다. 
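A minimal sketch of how the line-by-line mapping can be consumed (the paths follow the layout above; `root` is an assumed base directory, and `read_strings` mirrors the helper in `data_loader.py`):

```python
import os

def read_strings(path):
    # same behaviour as data_loader.read_strings: one sentence per line
    with open(path, encoding="utf-8") as f:
        return f.read().splitlines()

root = "train"  # assumed base directory of the training split
noisy = read_strings(os.path.join(root, "train_data", "train_data"))  # sentences with errors
corrected = read_strings(os.path.join(root, "train_label"))           # corrected sentences
assert len(noisy) == len(corrected)  # *_data and *_label map line by line
pairs = list(zip(noisy, corrected))
```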
19 | 20 | 21 | ## 평가 22 | - Corpus-level [GLEU](https://www.aclweb.org/anthology/P07-1044/) score로 평가 23 | - [`nltk.translate.gleu_score.corpus_gleu`](https://www.nltk.org/_modules/nltk/translate/gleu_score.html) 스크립트를 사용 24 | 25 | 26 | ## 베이스라인 27 | - [Transformer](https://arxiv.org/abs/1706.03762) 기반의 sequence-to-sequence 모델 28 | - 대량의 unlabeled corpus (`train_corpus`)를 활용하여 pre-training (또는 semi-supervised learning) 방식으로 학습하거나 에러 타입 (`train_annotation`)을 예측하도록 multi-task learning을 하면 추가 성능 향상을 얻을 수도 있습니다. 29 | 30 | 31 | ## 모델 학습 32 | ``` 33 | nsml run -d airush2021-2-4 -e train.py 34 | ``` 35 | - 필요에 따라 `-a`로 argument 입력 가능 36 | 37 | 38 | ## 모델 제출 39 | ``` 40 | nsml submit {SESSION} {CHECKPOINT} 41 | ``` 42 | 43 | ## 추가 정보 44 | 45 | ### Annotation 설명 46 | 47 | - "perfect" : 교정/교열이 필요없는 완벽한 문장 48 | - "spacing" : 띄어쓰기 교정 49 | - "pasting" : 붙여쓰기 교정 50 | - "tense" : 시제 교정 51 | - "honorific" : 경어체 교정 52 | - "punctuation" : 구두점 교정 53 | - "typo" : 오탈자 교정 (위 분류에 없는 경우 모두 수렴) 54 | - "advanced" : 윤문 처리 (더 매끄러운 문장) 55 | -------------------------------------------------------------------------------- /2-4/1062/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nsml import DATASET_PATH 4 | 5 | 6 | def read_strings(input_file): 7 | return open(input_file, "r").read().splitlines() 8 | 9 | 10 | def write_strings(output_file, data): 11 | with open(output_file, "w") as f: 12 | for x in data: 13 | f.write(str(x) + "\n") 14 | 15 | 16 | def test_data_loader(root_path): 17 | return read_strings(os.path.join(root_path, 'test', 'test_data')) 18 | 19 | 20 | def feed_infer(output_file, infer_func): 21 | prediciton = infer_func(test_data_loader(DATASET_PATH)) 22 | print('write output') 23 | write_strings(output_file, prediciton) 24 | if os.stat(output_file).st_size == 0: 25 | raise AssertionError('output result of inference is nothing') 26 | -------------------------------------------------------------------------------- /2-4/1062/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from nltk.translate.gleu_score import corpus_gleu 4 | 5 | 6 | def read_strings(input_file): 7 | return open(input_file, "r").read().splitlines() 8 | 9 | 10 | def read_prediction(prediction_file): 11 | return read_strings(prediction_file) 12 | 13 | 14 | def read_ground_truth(ground_truth_file): 15 | return read_strings(ground_truth_file) 16 | 17 | 18 | def em(prediction, ground_truth): 19 | return sum([x == y for x, y in zip(prediction, ground_truth)]) / len(ground_truth) * 100. 20 | 21 | 22 | def gleu(prediction, ground_truth): 23 | return corpus_gleu([[x] for x in ground_truth], prediction) * 100. 
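# Illustrative note on the metric (an aside, not used by evaluation_metrics below):
# corpus_gleu expects each hypothesis and each reference to be a sequence of tokens.
# Since plain strings are passed in gleu() above, the n-grams are effectively
# character n-grams. A word-level variant would tokenize first, e.g.:
#
#   def gleu_word_level(prediction, ground_truth):
#       return corpus_gleu([[x.split()] for x in ground_truth],
#                          [p.split() for p in prediction]) * 100.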
24 | 25 | 26 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 27 | try: 28 | prediction = read_prediction(prediction_file) 29 | ground_truth = read_ground_truth(ground_truth_file) 30 | score = gleu(prediction, ground_truth) 31 | except: 32 | score = 0.0 33 | return score 34 | 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--prediction', type=str, default='pred.txt') 39 | parser.add_argument('--test_label_path', type=str) 40 | args = parser.parse_args() 41 | 42 | print(evaluation_metrics(args.prediction, args.test_label_path)) 43 | -------------------------------------------------------------------------------- /2-4/1062/meter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | 5 | class Meter(object): 6 | def __init__(self): 7 | self.init() 8 | 9 | def init(self): 10 | self.start = time.time() 11 | self.cnt_add = 0 12 | self.tot_loss = 0. 13 | self.cnt_sent = 0 14 | self.cnt_token = 0 15 | 16 | def add(self, loss, n_sent, n_token): 17 | self.cnt_add += 1 18 | self.tot_loss += loss * n_sent 19 | self.cnt_sent += n_sent 20 | self.cnt_token += n_token 21 | 22 | def average(self): 23 | loss_sent = self.tot_loss / self.cnt_sent if self.cnt_sent != 0 else 0. 24 | loss_token = self.tot_loss / self.cnt_token if self.cnt_token != 0 else 0. 25 | return loss_sent, loss_token 26 | 27 | def elapsed_time(self): 28 | return time.time() - self.start 29 | 30 | def print_str(self, time_avg=False): 31 | loss_sent, loss_token = self.average() 32 | et = self.elapsed_time() 33 | time_str = f"{et * 1000. / self.cnt_add:6.2f} ms/batch" if time_avg else f"{et:6.2f} s" 34 | return f"{time_str} | loss_sent {loss_sent:6.2f} | token_ppl {math.exp(loss_token):6.2f}" 35 | 36 | 37 | -------------------------------------------------------------------------------- /2-4/1062/nsml_model/17143/model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/1062/nsml_model/17143/model/model.pt -------------------------------------------------------------------------------- /2-4/1062/nsml_model/17143/model/model_2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/1062/nsml_model/17143/model/model_2.pt -------------------------------------------------------------------------------- /2-4/1062/requirements.txt: -------------------------------------------------------------------------------- 1 | #nsml: scatterlab/python-mecab-ko:3.7-circleci-node 2 | g2pk 3 | pandas 4 | nltk 5 | 6 | torch==1.7.1+cu110 7 | torchvision==0.8.2+cu110 8 | torchaudio===0.7.2 9 | --find-links https://download.pytorch.org/whl/torch_stable.html 10 | -------------------------------------------------------------------------------- /2-4/1062/tokenizer.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | 3 | from data_loader import read_strings, write_strings 4 | 5 | SPECIAL_TOKENS = ['', '', '', ''] 6 | 7 | 8 | class CharTokenizer(object): 9 | def __init__(self, i2c): 10 | self.init(i2c) 11 | 12 | def __len__(self): 13 | return len(self.vocab) 14 | 15 | def __call__(self, sent): 16 | return [self.vocab[c] for c in sent] 17 | 18 | def 
init(self, i2c): 19 | self.i2c = i2c 20 | self.vocab = defaultdict(int) 21 | self.vocab.update({c: i for i, c in enumerate(i2c)}) 22 | 23 | @classmethod 24 | def from_strings(cls, strings, vocab_size): 25 | char_counter = Counter() 26 | for x in strings: 27 | char_counter.update(x) 28 | # print(len(char_counter)) # 2366 29 | i2c = SPECIAL_TOKENS 30 | i2c += [c for c, _ in char_counter.most_common(vocab_size - len(SPECIAL_TOKENS))] 31 | return cls(i2c) 32 | 33 | def save(self, path): 34 | write_strings(path, self.i2c) 35 | 36 | def load(self, path): 37 | i2c = read_strings(path) 38 | self.init(i2c) 39 | -------------------------------------------------------------------------------- /2-4/386/README.md: -------------------------------------------------------------------------------- 1 | # 2-4 스마트에디터의 그래머리 (문장 교정/교열) 기능 고도화 2 | 3 | - 네이버 사용자가 작성한 문장을 문법적으로 맞는 문장으로 교정/교열 하는 모델을 만듭니다. 4 | 5 | 6 | ## 데이터 7 | - 학습데이터 8 | * `train/train_data/train_data`: 문법 오류가 섞인 문장 9 | * `train/train_data/train_annotation`: 문법 오류에 대한 annotation 10 | * `train/train_data/train_corpus`: 교정되지 않은 문장 11 | * `train/train_label`: 교정/교열된 문장 12 | - 평가 데이터 13 | * `test/test_data`: 문법 오류가 섞인 문장 14 | * `test/test_label`: 교정/교열된 문장 15 | - 평가 더미 데이 16 | * `test_submit/test_data`: 문법 오류가 섞인 문장 17 | * `test_submit/test_label`: 교정/교열된 문장 18 | - 문법 오류가 섞인 문장들(`*_data`)과 교정/교열된 문장들(`*_label`)은 line-by-line으로 매핑됩니다. 19 | 20 | 21 | ## 평가 22 | - Corpus-level [GLEU](https://www.aclweb.org/anthology/P07-1044/) score로 평가 23 | - [`nltk.translate.gleu_score.corpus_gleu`](https://www.nltk.org/_modules/nltk/translate/gleu_score.html) 스크립트를 사용 24 | 25 | 26 | ## 베이스라인 27 | - [Transformer](https://arxiv.org/abs/1706.03762) 기반의 sequence-to-sequence 모델 28 | - 대량의 unlabeled corpus (`train_corpus`)를 활용하여 pre-training (또는 semi-supervised learning) 방식으로 학습하거나 에러 타입 (`train_annotation`)을 예측하도록 multi-task learning을 하면 추가 성능 향상을 얻을 수도 있습니다. 
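One concrete way to set up the multi-task idea above is sketched below (illustrative only, not the model in this repository; module names, dimensions, and the 0.1 loss weight are assumptions, and the eight classes follow the annotation list in the 추가 정보 section):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# perfect, spacing, pasting, tense, honorific, punctuation, typo, advanced
NUM_ERROR_TYPES = 8

class MultiTaskHeads(nn.Module):
    """Correction head (token logits) plus an auxiliary error-type head."""
    def __init__(self, hidden_dim, vocab_size, num_error_types=NUM_ERROR_TYPES):
        super().__init__()
        self.lm_head = nn.Linear(hidden_dim, vocab_size)
        self.annotation_head = nn.Linear(hidden_dim, num_error_types)

    def forward(self, decoder_states, encoder_pooled):
        # decoder_states: (batch, tgt_len, hidden), encoder_pooled: (batch, hidden)
        return self.lm_head(decoder_states), self.annotation_head(encoder_pooled)

def multitask_loss(lm_logits, target_ids, ann_logits, ann_labels, pad_idx, alpha=0.1):
    # main loss: generate the corrected sentence; auxiliary loss: predict the error type
    lm_loss = F.cross_entropy(lm_logits.view(-1, lm_logits.size(-1)),
                              target_ids.view(-1), ignore_index=pad_idx)
    ann_loss = F.cross_entropy(ann_logits, ann_labels)
    return lm_loss + alpha * ann_loss
```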
29 | 30 | 31 | ## 모델 학습 32 | ``` 33 | nsml run -d airush2021-2-4 -e train.py 34 | ``` 35 | - 필요에 따라 `-a`로 argument 입력 가능 36 | 37 | 38 | ## 모델 제출 39 | ``` 40 | nsml submit {SESSION} {CHECKPOINT} 41 | ``` 42 | 43 | ## 추가 정보 44 | 45 | ### Annotation 설명 46 | 47 | - "perfect" : 교정/교열이 필요없는 완벽한 문장 48 | - "spacing" : 띄어쓰기 교정 49 | - "pasting" : 붙여쓰기 교정 50 | - "tense" : 시제 교정 51 | - "honorific" : 경어체 교정 52 | - "punctuation" : 구두점 교정 53 | - "typo" : 오탈자 교정 (위 분류에 없는 경우 모두 수렴) 54 | - "advanced" : 윤문 처리 (더 매끄러운 문장) 55 | -------------------------------------------------------------------------------- /2-4/386/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nsml import DATASET_PATH 4 | 5 | 6 | def read_strings(input_file): 7 | return open(input_file, "r").read().splitlines() 8 | 9 | 10 | def write_strings(output_file, data): 11 | with open(output_file, "w") as f: 12 | for x in data: 13 | f.write(str(x) + "\n") 14 | 15 | 16 | def test_data_loader(root_path): 17 | return read_strings(os.path.join(root_path, 'test', 'test_data')) 18 | 19 | 20 | def feed_infer(output_file, infer_func): 21 | prediciton = infer_func(test_data_loader(DATASET_PATH)) 22 | print('write output') 23 | write_strings(output_file, prediciton) 24 | if os.stat(output_file).st_size == 0: 25 | raise AssertionError('output result of inference is nothing') 26 | -------------------------------------------------------------------------------- /2-4/386/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from nltk.translate.gleu_score import corpus_gleu 4 | 5 | 6 | def read_strings(input_file): 7 | return open(input_file, "r").read().splitlines() 8 | 9 | 10 | def read_prediction(prediction_file): 11 | return read_strings(prediction_file) 12 | 13 | 14 | def read_ground_truth(ground_truth_file): 15 | return read_strings(ground_truth_file) 16 | 17 | 18 | def em(prediction, ground_truth): 19 | return sum([x == y for x, y in zip(prediction, ground_truth)]) / len(ground_truth) * 100. 20 | 21 | 22 | def gleu(prediction, ground_truth): 23 | return corpus_gleu([[x] for x in ground_truth], prediction) * 100. 24 | 25 | 26 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 27 | try: 28 | prediction = read_prediction(prediction_file) 29 | ground_truth = read_ground_truth(ground_truth_file) 30 | score = gleu(prediction, ground_truth) 31 | except: 32 | score = 0.0 33 | return score 34 | 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--prediction', type=str, default='pred.txt') 39 | parser.add_argument('--test_label_path', type=str) 40 | args = parser.parse_args() 41 | 42 | print(evaluation_metrics(args.prediction, args.test_label_path)) 43 | -------------------------------------------------------------------------------- /2-4/386/fairseq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | __all__ = ['pdb'] 9 | __version__ = '0.6.2' 10 | 11 | import fairseq.criterions 12 | import fairseq.models 13 | import fairseq.modules 14 | import fairseq.optim 15 | import fairseq.optim.lr_scheduler 16 | import fairseq.pdb 17 | import fairseq.tasks 18 | -------------------------------------------------------------------------------- /2-4/386/fairseq/binarizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from collections import Counter 9 | import os 10 | 11 | from fairseq.tokenizer import tokenize_line 12 | # from bert import BertTokenizer 13 | import torch 14 | def safe_readline(f): 15 | pos = f.tell() 16 | while True: 17 | try: 18 | return f.readline() 19 | except UnicodeDecodeError: 20 | pos -= 1 21 | f.seek(pos) # search where this character begins 22 | 23 | 24 | class Binarizer: 25 | 26 | @staticmethod 27 | def binarize(filename, dict, consumer, tokenize=tokenize_line, append_eos=True, reverse_order=False, 28 | offset=0, end=-1): 29 | nseq, ntok = 0, 0 30 | replaced = Counter() 31 | 32 | def replaced_consumer(word, idx): 33 | if idx == dict.unk_index and word != dict.unk_word: 34 | replaced.update([word]) 35 | 36 | with open(filename, 'r', encoding='utf-8') as f: 37 | f.seek(offset) 38 | # next(f) breaks f.tell(), hence readline() must be used 39 | line = safe_readline(f) 40 | while line: 41 | if end > 0 and f.tell() > end: 42 | break 43 | if isinstance(dict, BertTokenizer): 44 | line = line.strip() 45 | line = '{} {} {}'.format('[CLS]', line, '[SEP]') 46 | tokenizedline = dict.tokenize(line) 47 | if len(tokenizedline) > dict.max_len: 48 | tokenizedline = tokenizedline[:dict.max_len-1] 49 | tokenizedline.append('[SEP]') 50 | words = dict.convert_tokens_to_ids(tokenizedline) 51 | nwords = len(words) 52 | ids = torch.IntTensor(nwords) 53 | for i, word in enumerate(words): 54 | ids[i] = word 55 | replaced_consumer(tokenizedline[i], word) 56 | else: 57 | ids = dict.encode_line( 58 | line=line, 59 | line_tokenizer=tokenize, 60 | add_if_not_exist=False, 61 | consumer=replaced_consumer, 62 | append_eos=append_eos, 63 | reverse_order=reverse_order, 64 | ) 65 | nseq += 1 66 | ntok += len(ids) 67 | consumer(ids) 68 | line = f.readline() 69 | return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced} 70 | 71 | @staticmethod 72 | def find_offsets(filename, num_chunks): 73 | with open(filename, 'r', encoding='utf-8') as f: 74 | size = os.fstat(f.fileno()).st_size 75 | chunk_size = size // num_chunks 76 | offsets = [0 for _ in range(num_chunks + 1)] 77 | for i in range(1, num_chunks): 78 | f.seek(chunk_size * i) 79 | safe_readline(f) 80 | offsets[i] = f.tell() 81 | return offsets 82 | -------------------------------------------------------------------------------- /2-4/386/fairseq/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | import ctypes 9 | import math 10 | import torch 11 | 12 | try: 13 | from fairseq import libbleu 14 | except ImportError as e: 15 | import sys 16 | sys.stderr.write('ERROR: missing libbleu.so. run `pip install --editable .`\n') 17 | raise e 18 | 19 | 20 | C = ctypes.cdll.LoadLibrary(libbleu.__file__) 21 | 22 | 23 | class BleuStat(ctypes.Structure): 24 | _fields_ = [ 25 | ('reflen', ctypes.c_size_t), 26 | ('predlen', ctypes.c_size_t), 27 | ('match1', ctypes.c_size_t), 28 | ('count1', ctypes.c_size_t), 29 | ('match2', ctypes.c_size_t), 30 | ('count2', ctypes.c_size_t), 31 | ('match3', ctypes.c_size_t), 32 | ('count3', ctypes.c_size_t), 33 | ('match4', ctypes.c_size_t), 34 | ('count4', ctypes.c_size_t), 35 | ] 36 | 37 | 38 | class SacrebleuScorer(object): 39 | def __init__(self): 40 | import sacrebleu 41 | self.sacrebleu = sacrebleu 42 | self.reset() 43 | 44 | def reset(self, one_init=False): 45 | if one_init: 46 | raise NotImplementedError 47 | self.ref = [] 48 | self.sys = [] 49 | 50 | def add_string(self, ref, pred): 51 | self.ref.append(ref) 52 | self.sys.append(pred) 53 | 54 | def score(self, order=4): 55 | return self.result_string(order).score 56 | 57 | def result_string(self, order=4): 58 | if order != 4: 59 | raise NotImplementedError 60 | return self.sacrebleu.corpus_bleu(self.sys, [self.ref]) 61 | 62 | 63 | class Scorer(object): 64 | def __init__(self, pad, eos, unk): 65 | self.stat = BleuStat() 66 | self.pad = pad 67 | self.eos = eos 68 | self.unk = unk 69 | self.reset() 70 | 71 | def reset(self, one_init=False): 72 | if one_init: 73 | C.bleu_one_init(ctypes.byref(self.stat)) 74 | else: 75 | C.bleu_zero_init(ctypes.byref(self.stat)) 76 | 77 | def add(self, ref, pred): 78 | if not isinstance(ref, torch.IntTensor): 79 | raise TypeError('ref must be a torch.IntTensor (got {})' 80 | .format(type(ref))) 81 | if not isinstance(pred, torch.IntTensor): 82 | raise TypeError('pred must be a torch.IntTensor(got {})' 83 | .format(type(pred))) 84 | 85 | # don't match unknown words 86 | rref = ref.clone() 87 | assert not rref.lt(0).any() 88 | rref[rref.eq(self.unk)] = -999 89 | 90 | rref = rref.contiguous().view(-1) 91 | pred = pred.contiguous().view(-1) 92 | 93 | C.bleu_add( 94 | ctypes.byref(self.stat), 95 | ctypes.c_size_t(rref.size(0)), 96 | ctypes.c_void_p(rref.data_ptr()), 97 | ctypes.c_size_t(pred.size(0)), 98 | ctypes.c_void_p(pred.data_ptr()), 99 | ctypes.c_int(self.pad), 100 | ctypes.c_int(self.eos)) 101 | 102 | def score(self, order=4): 103 | psum = sum(math.log(p) if p > 0 else float('-Inf') 104 | for p in self.precision()[:order]) 105 | return self.brevity() * math.exp(psum / order) * 100 106 | 107 | def precision(self): 108 | def ratio(a, b): 109 | return a / b if b > 0 else 0 110 | 111 | return [ 112 | ratio(self.stat.match1, self.stat.count1), 113 | ratio(self.stat.match2, self.stat.count2), 114 | ratio(self.stat.match3, self.stat.count3), 115 | ratio(self.stat.match4, self.stat.count4), 116 | ] 117 | 118 | def brevity(self): 119 | r = self.stat.reflen / self.stat.predlen 120 | return min(1, math.exp(1 - r)) 121 | 122 | def result_string(self, order=4): 123 | assert order <= 4, "BLEU scores for order > 4 aren't supported" 124 | fmt = 'BLEU{} = {:2.2f}, {:2.1f}' 125 | for _ in range(1, order): 126 | fmt += '/{:2.1f}' 127 | fmt += ' (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})' 128 | bleup = [p * 100 for p in self.precision()[:order]] 129 | return fmt.format(order, self.score(order=order), *bleup, 130 | self.brevity(), self.stat.predlen/self.stat.reflen, 131 | 
self.stat.predlen, self.stat.reflen) 132 | -------------------------------------------------------------------------------- /2-4/386/fairseq/clib/libbleu/libbleu.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | typedef struct 15 | { 16 | size_t reflen; 17 | size_t predlen; 18 | size_t match1; 19 | size_t count1; 20 | size_t match2; 21 | size_t count2; 22 | size_t match3; 23 | size_t count3; 24 | size_t match4; 25 | size_t count4; 26 | } bleu_stat; 27 | 28 | // left trim (remove pad) 29 | void bleu_ltrim(size_t* len, int** sent, int pad) { 30 | size_t start = 0; 31 | while(start < *len) { 32 | if (*(*sent + start) != pad) { break; } 33 | start++; 34 | } 35 | *sent += start; 36 | *len -= start; 37 | } 38 | 39 | // right trim remove (eos) 40 | void bleu_rtrim(size_t* len, int** sent, int pad, int eos) { 41 | size_t end = *len - 1; 42 | while (end > 0) { 43 | if (*(*sent + end) != eos && *(*sent + end) != pad) { break; } 44 | end--; 45 | } 46 | *len = end + 1; 47 | } 48 | 49 | // left and right trim 50 | void bleu_trim(size_t* len, int** sent, int pad, int eos) { 51 | bleu_ltrim(len, sent, pad); 52 | bleu_rtrim(len, sent, pad, eos); 53 | } 54 | 55 | size_t bleu_hash(int len, int* data) { 56 | size_t h = 14695981039346656037ul; 57 | size_t prime = 0x100000001b3; 58 | char* b = (char*) data; 59 | size_t blen = sizeof(int) * len; 60 | 61 | while (blen-- > 0) { 62 | h ^= *b++; 63 | h *= prime; 64 | } 65 | 66 | return h; 67 | } 68 | 69 | void bleu_addngram( 70 | size_t *ntotal, size_t *nmatch, size_t n, 71 | size_t reflen, int* ref, size_t predlen, int* pred) { 72 | 73 | if (predlen < n) { return; } 74 | 75 | predlen = predlen - n + 1; 76 | (*ntotal) += predlen; 77 | 78 | if (reflen < n) { return; } 79 | 80 | reflen = reflen - n + 1; 81 | 82 | std::map count; 83 | while (predlen > 0) { 84 | size_t w = bleu_hash(n, pred++); 85 | count[w]++; 86 | predlen--; 87 | } 88 | 89 | while (reflen > 0) { 90 | size_t w = bleu_hash(n, ref++); 91 | if (count[w] > 0) { 92 | (*nmatch)++; 93 | count[w] -=1; 94 | } 95 | reflen--; 96 | } 97 | } 98 | 99 | extern "C" { 100 | 101 | void bleu_zero_init(bleu_stat* stat) { 102 | std::memset(stat, 0, sizeof(bleu_stat)); 103 | } 104 | 105 | void bleu_one_init(bleu_stat* stat) { 106 | bleu_zero_init(stat); 107 | stat->count1 = 0; 108 | stat->count2 = 1; 109 | stat->count3 = 1; 110 | stat->count4 = 1; 111 | stat->match1 = 0; 112 | stat->match2 = 1; 113 | stat->match3 = 1; 114 | stat->match4 = 1; 115 | } 116 | 117 | void bleu_add( 118 | bleu_stat* stat, 119 | size_t reflen, int* ref, size_t predlen, int* pred, int pad, int eos) { 120 | 121 | bleu_trim(&reflen, &ref, pad, eos); 122 | bleu_trim(&predlen, &pred, pad, eos); 123 | stat->reflen += reflen; 124 | stat->predlen += predlen; 125 | 126 | bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred); 127 | bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred); 128 | bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred); 129 | bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred); 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /2-4/386/fairseq/clib/libbleu/module.cpp: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include 10 | 11 | 12 | static PyMethodDef method_def[] = { 13 | {NULL, NULL, 0, NULL} 14 | }; 15 | 16 | static struct PyModuleDef module_def = { 17 | PyModuleDef_HEAD_INIT, 18 | "libbleu", /* name of module */ 19 | NULL, /* module documentation, may be NULL */ 20 | -1, /* size of per-interpreter state of the module, 21 | or -1 if the module keeps state in global variables. */ 22 | method_def 23 | }; 24 | 25 | 26 | #if PY_MAJOR_VERSION == 2 27 | PyMODINIT_FUNC init_libbleu() 28 | #else 29 | PyMODINIT_FUNC PyInit_libbleu() 30 | #endif 31 | { 32 | PyObject *m = PyModule_Create(&module_def); 33 | if (!m) { 34 | return NULL; 35 | } 36 | return m; 37 | } 38 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import importlib 9 | import os 10 | 11 | from fairseq import registry 12 | from fairseq.criterions.fairseq_criterion import FairseqCriterion 13 | 14 | 15 | build_criterion, register_criterion, CRITERION_REGISTRY = registry.setup_registry( 16 | '--criterion', 17 | base_class=FairseqCriterion, 18 | default='cross_entropy', 19 | ) 20 | 21 | 22 | # automatically import any Python files in the criterions/ directory 23 | for file in os.listdir(os.path.dirname(__file__)): 24 | if file.endswith('.py') and not file.startswith('_'): 25 | module = file[:file.find('.py')] 26 | importlib.import_module('fairseq.criterions.' + module) 27 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/adaptive_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | 9 | import math 10 | import torch.nn.functional as F 11 | 12 | from fairseq import utils 13 | from . import FairseqCriterion, register_criterion 14 | 15 | 16 | @register_criterion('adaptive_loss') 17 | class AdaptiveLoss(FairseqCriterion): 18 | """This is an implementation of the loss function accompanying the adaptive softmax approximation for 19 | graphical processing units (GPU), described in the paper "Efficient softmax approximation for GPUs" 20 | (http://arxiv.org/abs/1609.04309).""" 21 | 22 | def __init__(self, args, task): 23 | super().__init__(args, task) 24 | 25 | if args.ddp_backend == 'c10d': 26 | raise Exception( 27 | 'AdaptiveLoss is not compatible with the c10d ' 28 | 'version of DistributedDataParallel. Please use ' 29 | '`--ddp-backend=no_c10d` instead.' 30 | ) 31 | 32 | def forward(self, model, sample, reduce=True): 33 | """Compute the loss for the given sample. 
34 | 35 | Returns a tuple with three elements: 36 | 1) the loss 37 | 2) the sample size, which is used as the denominator for the gradient 38 | 3) logging outputs to display while training 39 | """ 40 | 41 | assert hasattr(model.decoder, 'adaptive_softmax') and model.decoder.adaptive_softmax is not None 42 | adaptive_softmax = model.decoder.adaptive_softmax 43 | 44 | net_output = model(**sample['net_input']) 45 | orig_target = model.get_targets(sample, net_output) 46 | 47 | nsentences = orig_target.size(0) 48 | orig_target = orig_target.view(-1) 49 | 50 | bsz = orig_target.size(0) 51 | 52 | logits, target = adaptive_softmax(net_output[0], orig_target) 53 | assert len(target) == len(logits) 54 | 55 | loss = net_output[0].new(1 if reduce else bsz).zero_() 56 | 57 | for i in range(len(target)): 58 | if target[i] is not None: 59 | assert (target[i].min() >= 0 and target[i].max() <= logits[i].size(1)) 60 | loss += F.cross_entropy( 61 | logits[i], 62 | target[i], 63 | ignore_index=self.padding_idx, 64 | reduction='sum' if reduce else 'none', 65 | ) 66 | 67 | orig = utils.strip_pad(orig_target, self.padding_idx) 68 | ntokens = orig.numel() 69 | sample_size = sample['target'].size(0) if self.args.sentence_avg else ntokens 70 | logging_output = { 71 | 'loss': utils.item(loss.data) if reduce else loss.data, 72 | 'ntokens': ntokens, 73 | 'nsentences': nsentences, 74 | 'sample_size': sample_size, 75 | } 76 | return loss, sample_size, logging_output 77 | 78 | @staticmethod 79 | def aggregate_logging_outputs(logging_outputs): 80 | """Aggregate logging outputs from data parallel training.""" 81 | loss_sum = sum(log.get('loss', 0) for log in logging_outputs) 82 | ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) 83 | nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) 84 | sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) 85 | agg_output = { 86 | 'loss': loss_sum / sample_size / math.log(2), 87 | 'nll_loss': loss_sum / sample_size / math.log(2), 88 | 'ntokens': ntokens, 89 | 'nsentences': nsentences, 90 | 'sample_size': sample_size, 91 | } 92 | if sample_size != ntokens: 93 | agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) 94 | return agg_output 95 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/composite_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from torch import nn 9 | 10 | from fairseq import utils 11 | from . 
import FairseqCriterion, register_criterion 12 | 13 | 14 | @register_criterion('composite_loss') 15 | class CompositeLoss(FairseqCriterion): 16 | """This is a composite loss that, given a list of model outputs and a list of targets, 17 | computes an average of losses for each output-target pair""" 18 | 19 | @staticmethod 20 | def add_args(parser): 21 | """Add criterion-specific arguments to the parser.""" 22 | # fmt: off 23 | parser.add_argument('--underlying-criterion', type=str, metavar='VAL', required=True, 24 | help='underlying criterion to use for the composite loss') 25 | # fmt: on 26 | 27 | @staticmethod 28 | def build_underlying_criterion(args, task): 29 | saved_criterion = args.criterion 30 | args.criterion = args.underlying_criterion 31 | assert saved_criterion != args.underlying_criterion 32 | underlying_criterion = task.build_criterion(args) 33 | args.criterion = saved_criterion 34 | return underlying_criterion 35 | 36 | @classmethod 37 | def build_criterion(cls, args, task): 38 | underlying_criterion = CompositeLoss.build_underlying_criterion(args, task) 39 | 40 | class FakeModel(nn.Module): 41 | 42 | def __init__(self, model, net_out, target): 43 | super().__init__() 44 | self.model = model 45 | self.net_out = net_out 46 | self.target = target 47 | 48 | def forward(self, **unused): 49 | return self.net_out 50 | 51 | def get_normalized_probs(self, net_output, log_probs, sample=None): 52 | return self.model.get_normalized_probs(net_output, log_probs, sample=sample) 53 | 54 | def get_targets(self, *unused): 55 | return self.target 56 | 57 | @property 58 | def decoder(self): 59 | return self.model.decoder 60 | 61 | class _CompositeLoss(FairseqCriterion): 62 | 63 | def __init__(self, args, task, underlying_criterion): 64 | super().__init__(args, task) 65 | self.underlying_criterion = underlying_criterion 66 | 67 | def forward(self, model, sample, reduce=True): 68 | net_outputs = model(**sample['net_input']) 69 | targets = sample['target'] 70 | 71 | bsz = targets[0].size(0) 72 | loss = net_outputs[0][0].new(1 if reduce else bsz).float().zero_() 73 | 74 | sample_size = 0 75 | logging_output = {} 76 | for o, t in zip(net_outputs[0], targets): 77 | m = FakeModel(model, (o, net_outputs[1]), t) 78 | sample['target'] = t 79 | l, ss, logging_output = self.underlying_criterion(m, sample, reduce) 80 | loss += l 81 | sample_size += ss 82 | 83 | loss.div_(len(targets)) 84 | sample_size /= len(targets) 85 | 86 | logging_output['loss'] = utils.item(loss.data) if reduce else loss.data 87 | return loss, sample_size, logging_output 88 | 89 | @staticmethod 90 | def aggregate_logging_outputs(logging_outputs): 91 | return underlying_criterion.__class__.aggregate_logging_outputs(logging_outputs) 92 | 93 | return _CompositeLoss(args, task, underlying_criterion) 94 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import math 9 | import torch.nn.functional as F 10 | 11 | from fairseq import utils 12 | 13 | from . 
import FairseqCriterion, register_criterion 14 | 15 | 16 | @register_criterion('cross_entropy') 17 | class CrossEntropyCriterion(FairseqCriterion): 18 | 19 | def __init__(self, args, task): 20 | super().__init__(args, task) 21 | 22 | def forward(self, model, sample, reduce=True): 23 | """Compute the loss for the given sample. 24 | 25 | Returns a tuple with three elements: 26 | 1) the loss 27 | 2) the sample size, which is used as the denominator for the gradient 28 | 3) logging outputs to display while training 29 | """ 30 | net_output = model(**sample['net_input']) 31 | loss, _ = self.compute_loss(model, net_output, sample, reduce=reduce) 32 | sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] 33 | logging_output = { 34 | 'loss': utils.item(loss.data) if reduce else loss.data, 35 | 'ntokens': sample['ntokens'], 36 | 'nsentences': sample['target'].size(0), 37 | 'sample_size': sample_size, 38 | } 39 | return loss, sample_size, logging_output 40 | 41 | def compute_loss(self, model, net_output, sample, reduce=True): 42 | lprobs = model.get_normalized_probs(net_output, log_probs=True) 43 | lprobs = lprobs.view(-1, lprobs.size(-1)) 44 | target = model.get_targets(sample, net_output).view(-1) 45 | loss = F.nll_loss( 46 | lprobs, 47 | target, 48 | ignore_index=self.padding_idx, 49 | reduction='sum' if reduce else 'none', 50 | ) 51 | return loss, loss 52 | 53 | @staticmethod 54 | def aggregate_logging_outputs(logging_outputs): 55 | """Aggregate logging outputs from data parallel training.""" 56 | loss_sum = sum(log.get('loss', 0) for log in logging_outputs) 57 | ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) 58 | nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) 59 | sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) 60 | agg_output = { 61 | 'loss': loss_sum / sample_size / math.log(2), 62 | 'ntokens': ntokens, 63 | 'nsentences': nsentences, 64 | 'sample_size': sample_size, 65 | } 66 | if sample_size != ntokens: 67 | agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) 68 | return agg_output 69 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/fairseq_criterion.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from torch.nn.modules.loss import _Loss 9 | 10 | 11 | class FairseqCriterion(_Loss): 12 | 13 | def __init__(self, args, task): 14 | super().__init__() 15 | self.args = args 16 | self.padding_idx = task.target_dictionary.pad() 17 | 18 | @staticmethod 19 | def add_args(parser): 20 | """Add criterion-specific arguments to the parser.""" 21 | pass 22 | 23 | @classmethod 24 | def build_criterion(cls, args, task): 25 | return cls(args, task) 26 | 27 | def forward(self, model, sample, reduce=True): 28 | """Compute the loss for the given sample. 
29 | 30 | Returns a tuple with three elements: 31 | 1) the loss 32 | 2) the sample size, which is used as the denominator for the gradient 33 | 3) logging outputs to display while training 34 | """ 35 | raise NotImplementedError 36 | 37 | @staticmethod 38 | def aggregate_logging_outputs(logging_outputs): 39 | """Aggregate logging outputs from data parallel training.""" 40 | raise NotImplementedError 41 | 42 | @staticmethod 43 | def grad_denom(sample_sizes): 44 | """Compute the gradient denominator for a set of sample sizes.""" 45 | return sum(sample_sizes) 46 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/label_smoothed_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import math 9 | 10 | from fairseq import utils 11 | 12 | from . import FairseqCriterion, register_criterion 13 | 14 | 15 | @register_criterion('label_smoothed_cross_entropy') 16 | class LabelSmoothedCrossEntropyCriterion(FairseqCriterion): 17 | 18 | def __init__(self, args, task): 19 | super().__init__(args, task) 20 | self.eps = args.label_smoothing 21 | 22 | @staticmethod 23 | def add_args(parser): 24 | """Add criterion-specific arguments to the parser.""" 25 | # fmt: off 26 | parser.add_argument('--label-smoothing', default=0., type=float, metavar='D', 27 | help='epsilon for label smoothing, 0 means no label smoothing') 28 | # fmt: on 29 | 30 | def forward(self, model, sample, reduce=True): 31 | """Compute the loss for the given sample. 32 | 33 | Returns a tuple with three elements: 34 | 1) the loss 35 | 2) the sample size, which is used as the denominator for the gradient 36 | 3) logging outputs to display while training 37 | """ 38 | net_output = model(**sample['net_input']) 39 | loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) 40 | sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] 41 | logging_output = { 42 | 'loss': utils.item(loss.data) if reduce else loss.data, 43 | 'nll_loss': utils.item(nll_loss.data) if reduce else nll_loss.data, 44 | 'ntokens': sample['ntokens'], 45 | 'nsentences': sample['target'].size(0), 46 | 'sample_size': sample_size, 47 | } 48 | return loss, sample_size, logging_output 49 | 50 | def compute_loss(self, model, net_output, sample, reduce=True): 51 | lprobs = model.get_normalized_probs(net_output, log_probs=True) 52 | lprobs = lprobs.view(-1, lprobs.size(-1)) 53 | target = model.get_targets(sample, net_output).view(-1, 1) 54 | non_pad_mask = target.ne(self.padding_idx) 55 | nll_loss = -lprobs.gather(dim=-1, index=target)[non_pad_mask] 56 | smooth_loss = -lprobs.sum(dim=-1, keepdim=True)[non_pad_mask] 57 | if reduce: 58 | nll_loss = nll_loss.sum() 59 | smooth_loss = smooth_loss.sum() 60 | eps_i = self.eps / lprobs.size(-1) 61 | loss = (1. 
- self.eps) * nll_loss + eps_i * smooth_loss 62 | return loss, nll_loss 63 | 64 | @staticmethod 65 | def aggregate_logging_outputs(logging_outputs): 66 | """Aggregate logging outputs from data parallel training.""" 67 | ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) 68 | nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) 69 | sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) 70 | return { 71 | 'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2), 72 | 'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / ntokens / math.log(2), 73 | 'ntokens': ntokens, 74 | 'nsentences': nsentences, 75 | 'sample_size': sample_size, 76 | } 77 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from .dictionary import Dictionary, TruncatedDictionary 9 | from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary 10 | 11 | from .fairseq_dataset import FairseqDataset 12 | 13 | from .backtranslation_dataset import BacktranslationDataset 14 | from .block_pair_dataset import BlockPairDataset 15 | from .concat_dataset import ConcatDataset 16 | from .indexed_dataset import IndexedCachedDataset, IndexedDataset, IndexedRawTextDataset, MMapIndexedDataset 17 | from .language_pair_dataset import LanguagePairDataset 18 | from .lm_context_window_dataset import LMContextWindowDataset 19 | from .masked_lm_dataset import MaskedLMDataset 20 | from .monolingual_dataset import MonolingualDataset 21 | from .noising import NoisingDataset 22 | from .round_robin_zip_datasets import RoundRobinZipDatasets 23 | from .token_block_dataset import TokenBlockDataset 24 | from .transform_eos_dataset import TransformEosDataset 25 | from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset 26 | 27 | from .iterators import ( 28 | CountingIterator, 29 | EpochBatchIterator, 30 | GroupedIterator, 31 | ShardedIterator, 32 | ) 33 | 34 | __all__ = [ 35 | 'BacktranslationDataset', 36 | 'BertDictionary', 37 | 'BlockPairDataset', 38 | 'ConcatDataset', 39 | 'CountingIterator', 40 | 'Dictionary', 41 | 'EpochBatchIterator', 42 | 'FairseqDataset', 43 | 'GroupedIterator', 44 | 'IndexedCachedDataset', 45 | 'IndexedDataset', 46 | 'IndexedRawTextDataset', 47 | 'LanguagePairDataset', 48 | 'LMContextWindowDataset', 49 | 'MaskedLMDataset', 50 | 'MaskedLMDictionary', 51 | 'MMapIndexedDataset', 52 | 'MonolingualDataset', 53 | 'NoisingDataset', 54 | 'RoundRobinZipDatasets', 55 | 'ShardedIterator', 56 | 'TokenBlockDataset', 57 | 'TransformEosDataset', 58 | 'TransformEosLangPairDataset', 59 | 'TruncatedDictionary', 60 | ] 61 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import bisect 9 | 10 | import numpy as np 11 | 12 | from . import FairseqDataset 13 | 14 | 15 | class ConcatDataset(FairseqDataset): 16 | @staticmethod 17 | def cumsum(sequence, sample_ratios): 18 | r, s = [], 0 19 | for e, ratio in zip(sequence, sample_ratios): 20 | curr_len = int(ratio * len(e)) 21 | r.append(curr_len + s) 22 | s += curr_len 23 | return r 24 | 25 | def __init__(self, datasets, sample_ratios=1): 26 | super(ConcatDataset, self).__init__() 27 | assert len(datasets) > 0, "datasets should not be an empty iterable" 28 | self.datasets = list(datasets) 29 | if isinstance(sample_ratios, int): 30 | sample_ratios = [sample_ratios] * len(self.datasets) 31 | self.sample_ratios = sample_ratios 32 | self.cumulative_sizes = self.cumsum(self.datasets, sample_ratios) 33 | self.real_sizes = [len(d) for d in self.datasets] 34 | 35 | def __len__(self): 36 | return self.cumulative_sizes[-1] 37 | 38 | def __getitem__(self, idx): 39 | dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx) 40 | return self.datasets[dataset_idx][sample_idx] 41 | 42 | def _get_dataset_and_sample_index(self, idx: int): 43 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 44 | if dataset_idx == 0: 45 | sample_idx = idx 46 | else: 47 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 48 | sample_idx = sample_idx % self.real_sizes[dataset_idx] 49 | return dataset_idx, sample_idx 50 | 51 | def collater(self, samples): 52 | # For now only supports datasets with same underlying collater implementations 53 | return self.datasets[0].collater(samples) 54 | 55 | def size(self, idx: int): 56 | """ 57 | Return an example's size as a float or tuple. 58 | """ 59 | dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx) 60 | return self.datasets[dataset_idx].size(sample_idx) 61 | 62 | def num_tokens(self, index: int): 63 | return np.max(self.size(index)) 64 | 65 | @property 66 | def sizes(self): 67 | return np.concatenate( 68 | [np.tile(ds.sizes, sr) for ds, sr in zip(self.datasets, self.sample_ratios)] 69 | ) 70 | 71 | @property 72 | def supports_prefetch(self): 73 | return all(d.supports_prefetch for d in self.datasets) 74 | 75 | def ordered_indices(self): 76 | """ 77 | Returns indices sorted by length. So less padding is needed. 78 | """ 79 | return np.argsort(self.sizes) 80 | 81 | def prefetch(self, indices): 82 | frm = 0 83 | for to, ds in zip(self.cumulative_sizes, self.datasets): 84 | real_size = len(ds) 85 | if getattr(ds, 'supports_prefetch', False): 86 | ds.prefetch([(i - frm) % real_size for i in indices if frm <= i < to]) 87 | frm = to 88 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/fairseq_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | import torch.utils.data 9 | 10 | 11 | class FairseqDataset(torch.utils.data.Dataset): 12 | """A dataset that provides helpers for batching.""" 13 | 14 | def __getitem__(self, index): 15 | raise NotImplementedError 16 | 17 | def __len__(self): 18 | raise NotImplementedError 19 | 20 | def collater(self, samples): 21 | """Merge a list of samples to form a mini-batch. 22 | 23 | Args: 24 | samples (List[dict]): samples to collate 25 | 26 | Returns: 27 | dict: a mini-batch suitable for forwarding with a Model 28 | """ 29 | raise NotImplementedError 30 | 31 | def num_tokens(self, index): 32 | """Return the number of tokens in a sample. This value is used to 33 | enforce ``--max-tokens`` during batching.""" 34 | raise NotImplementedError 35 | 36 | def size(self, index): 37 | """Return an example's size as a float or tuple. This value is used when 38 | filtering a dataset with ``--max-positions``.""" 39 | raise NotImplementedError 40 | 41 | def ordered_indices(self): 42 | """Return an ordered list of indices. Batches will be constructed based 43 | on this order.""" 44 | raise NotImplementedError 45 | 46 | @property 47 | def supports_prefetch(self): 48 | """Whether this dataset supports prefetching.""" 49 | return False 50 | 51 | def prefetch(self, indices): 52 | """Prefetch the data required for this epoch.""" 53 | raise NotImplementedError 54 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/lm_context_window_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import numpy as np 9 | import torch 10 | 11 | from fairseq.data.monolingual_dataset import MonolingualDataset 12 | 13 | from . 
import FairseqDataset 14 | 15 | 16 | class LMContextWindowDataset(FairseqDataset): 17 | """Wraps a MonolingualDataset and provides more context for evaluation.""" 18 | 19 | def __init__(self, dataset, tokens_per_sample, context_window, pad_idx): 20 | assert isinstance(dataset, MonolingualDataset) 21 | assert context_window > 0 22 | self.dataset = dataset 23 | self.tokens_per_sample = tokens_per_sample 24 | self.context_window = context_window 25 | self.pad_idx = pad_idx 26 | self.prev_tokens = np.empty([0]) 27 | 28 | def __getitem__(self, index): 29 | return self.dataset[index] 30 | 31 | def __len__(self): 32 | return len(self.dataset) 33 | 34 | def collater(self, samples): 35 | sample = self.dataset.collater(samples) 36 | 37 | pad = self.pad_idx 38 | max_sample_len = self.tokens_per_sample + self.context_window 39 | 40 | bsz, tsz = sample['net_input']['src_tokens'].shape 41 | start_idxs = [0] * bsz 42 | toks = sample['net_input']['src_tokens'] 43 | lengths = sample['net_input']['src_lengths'] 44 | tgt = sample['target'] 45 | new_toks = np.empty([bsz, tsz + self.context_window], dtype=np.int64) 46 | new_tgt = np.full([bsz, tsz + self.context_window], pad, dtype=np.int64) 47 | sample_lens = toks.ne(pad).long().sum(dim=1).cpu() 48 | for i in range(bsz): 49 | sample_len = sample_lens[i] 50 | extra = len(self.prev_tokens) + sample_len - max_sample_len 51 | if extra > 0: 52 | self.prev_tokens = self.prev_tokens[extra:] 53 | pads = np.full(self.context_window - len(self.prev_tokens), pad) 54 | new_toks[i] = np.concatenate([self.prev_tokens, toks[i].numpy(), pads]) 55 | new_tgt[i, len(self.prev_tokens):len(self.prev_tokens) + len(tgt[i])] = tgt[i] 56 | start_idxs[i] = len(self.prev_tokens) 57 | lengths[i] += len(self.prev_tokens) 58 | self.prev_tokens = new_toks[i][new_toks[i] != pad][-self.context_window:] 59 | sample['net_input']['src_tokens'] = torch.from_numpy(new_toks) 60 | sample['target'] = torch.from_numpy(new_tgt) 61 | sample['start_indices'] = start_idxs 62 | 63 | return sample 64 | 65 | def num_tokens(self, index): 66 | return self.dataset.num_tokens(index) 67 | 68 | def size(self, index): 69 | return self.dataset.size(index) 70 | 71 | def ordered_indices(self): 72 | # NOTE we don't shuffle the data to retain access to the previous dataset elements 73 | return np.arange(len(self.dataset)) 74 | 75 | @property 76 | def supports_prefetch(self): 77 | return getattr(self.dataset, 'supports_prefetch', False) 78 | 79 | def prefetch(self, indices): 80 | return self.dataset.prefetch(indices) 81 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/masked_lm_dictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from fairseq.data import Dictionary 9 | 10 | 11 | class MaskedLMDictionary(Dictionary): 12 | """ 13 | Dictionary for Masked Language Modelling tasks. This extends Dictionary by 14 | adding the mask symbol. 
15 | """ 16 | def __init__( 17 | self, 18 | pad='', 19 | eos='', 20 | unk='', 21 | mask='', 22 | ): 23 | super().__init__(pad, eos, unk) 24 | self.mask_word = mask 25 | self.mask_index = self.add_symbol(mask) 26 | self.nspecial = len(self.symbols) 27 | 28 | def mask(self): 29 | """Helper to get index of mask symbol""" 30 | return self.mask_index 31 | 32 | 33 | class BertDictionary(MaskedLMDictionary): 34 | """ 35 | Dictionary for BERT task. This extends MaskedLMDictionary by adding support 36 | for cls and sep symbols. 37 | """ 38 | def __init__( 39 | self, 40 | pad='', 41 | eos='', 42 | unk='', 43 | mask='', 44 | cls='', 45 | sep='' 46 | ): 47 | super().__init__(pad, eos, unk, mask) 48 | self.cls_word = cls 49 | self.sep_word = sep 50 | self.cls_index = self.add_symbol(cls) 51 | self.sep_index = self.add_symbol(sep) 52 | self.nspecial = len(self.symbols) 53 | 54 | def cls(self): 55 | """Helper to get index of cls symbol""" 56 | return self.cls_index 57 | 58 | def sep(self): 59 | """Helper to get index of sep symbol""" 60 | return self.sep_index 61 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/round_robin_zip_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | 12 | from . import FairseqDataset 13 | 14 | 15 | class RoundRobinZipDatasets(FairseqDataset): 16 | """Zip multiple :class:`~fairseq.data.FairseqDataset` instances together. 17 | 18 | Shorter datasets are repeated in a round-robin fashion to match the length 19 | of the longest one. 20 | 21 | Args: 22 | datasets (Dict[~fairseq.data.FairseqDataset]): a dictionary of 23 | :class:`~fairseq.data.FairseqDataset` instances. 24 | eval_key (str, optional): a key used at evaluation time that causes 25 | this instance to pass-through batches from *datasets[eval_key]*. 
26 | """ 27 | 28 | def __init__(self, datasets, eval_key=None): 29 | super().__init__() 30 | assert isinstance(datasets, OrderedDict) 31 | self.datasets = datasets 32 | self.eval_key = eval_key 33 | 34 | self.longest_dataset = None 35 | self.longest_dataset_key = None 36 | for key, dataset in datasets.items(): 37 | assert isinstance(dataset, FairseqDataset) 38 | if self.longest_dataset is None or len(dataset) > len(self.longest_dataset): 39 | self.longest_dataset = dataset 40 | self.longest_dataset_key = key 41 | 42 | self._ordered_indices = None 43 | 44 | def _map_index(self, key, index): 45 | assert self._ordered_indices is not None, \ 46 | 'Must call RoundRobinZipDatasets.ordered_indices() first' 47 | return self._ordered_indices[key][index % len(self.datasets[key])] 48 | 49 | def __getitem__(self, index): 50 | if self.eval_key is None: 51 | return OrderedDict([ 52 | (key, dataset[self._map_index(key, index)]) 53 | for key, dataset in self.datasets.items() 54 | ]) 55 | else: 56 | # at evaluation time it's useful to pass-through batches from a single key 57 | return self.datasets[self.eval_key][self._map_index(self.eval_key, index)] 58 | 59 | def __len__(self): 60 | return len(self.longest_dataset) 61 | 62 | def collater(self, samples): 63 | """Merge a list of samples to form a mini-batch.""" 64 | if len(samples) == 0: 65 | return None 66 | if self.eval_key is None: 67 | return OrderedDict([ 68 | (key, dataset.collater([sample[key] for sample in samples])) 69 | for key, dataset in self.datasets.items() 70 | ]) 71 | else: 72 | # at evaluation time it's useful to pass-through batches from a single key 73 | return self.datasets[self.eval_key].collater(samples) 74 | 75 | def num_tokens(self, index): 76 | """Return an example's length (number of tokens), used for batching.""" 77 | # TODO make it configurable whether to use max() or sum() here 78 | return max( 79 | dataset.num_tokens(self._map_index(key, index)) 80 | for key, dataset in self.datasets.items() 81 | ) 82 | 83 | def size(self, index): 84 | """Return an example's size as a float or tuple. This value is used when 85 | filtering a dataset with ``--max-positions``.""" 86 | return { 87 | key: dataset.size(self._map_index(key, index)) 88 | for key, dataset in self.datasets.items() 89 | } 90 | 91 | def ordered_indices(self): 92 | """Ordered indices for batching.""" 93 | if self._ordered_indices is None: 94 | # Call the underlying dataset's ordered_indices() here, so that we 95 | # get the same random ordering as we would have from using the 96 | # underlying dataset directly. 97 | self._ordered_indices = OrderedDict([ 98 | (key, dataset.ordered_indices()) 99 | for key, dataset in self.datasets.items() 100 | ]) 101 | return np.arange(len(self)) 102 | 103 | @property 104 | def supports_prefetch(self): 105 | return all( 106 | getattr(dataset, 'supports_prefetch', False) 107 | for dataset in self.datasets.values() 108 | ) 109 | 110 | def prefetch(self, indices): 111 | for key, dataset in self.datasets.items(): 112 | dataset.prefetch([self._map_index(key, index) for index in indices]) 113 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/transform_eos_lang_pair_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | 9 | from . import FairseqDataset 10 | from typing import Optional 11 | 12 | 13 | class TransformEosLangPairDataset(FairseqDataset): 14 | """A :class:`~fairseq.data.FairseqDataset` wrapper that transforms the eos/bos symbols 15 | on collated samples of a language pair dataset. 16 | 17 | Note that the transformation is applied in :func:`collater`. 18 | 19 | Args: 20 | dataset (~fairseq.data.FairseqDataset): dataset that collates sample into 21 | LanguagePairDataset schema 22 | src_eos (int): original source end-of-sentence symbol index to be replaced 23 | new_src_eos (int, optional): new end-of-sentence symbol index to replace source eos symbol 24 | tgt_bos (int, optional): original target beginning-of-sentence symbol index to be replaced 25 | new_tgt_bos (int, optional): new beginning-of-sentence symbol index to replace at the 26 | beginning of 'prev_output_tokens' 27 | """ 28 | 29 | def __init__( 30 | self, 31 | dataset: FairseqDataset, 32 | src_eos: int, 33 | new_src_eos: Optional[int] = None, 34 | tgt_bos: Optional[int] = None, 35 | new_tgt_bos: Optional[int] = None, 36 | ): 37 | self.dataset = dataset 38 | self.src_eos = src_eos 39 | self.new_src_eos = new_src_eos 40 | self.tgt_bos = tgt_bos 41 | self.new_tgt_bos = new_tgt_bos 42 | 43 | def __getitem__(self, index): 44 | return self.dataset[index] 45 | 46 | def __len__(self): 47 | return len(self.dataset) 48 | 49 | def collater(self, samples): 50 | samples = self.dataset.collater(samples) 51 | 52 | # TODO: support different padding direction 53 | if self.new_src_eos is not None: 54 | assert(samples['net_input']['src_tokens'][:, -1] != self.src_eos).sum() == 0 55 | samples['net_input']['src_tokens'][:, -1] = self.new_src_eos 56 | 57 | if self.new_tgt_bos is not None: 58 | assert (samples['net_input']['prev_output_tokens'][:, 0] != self.tgt_bos).sum() == 0 59 | samples['net_input']['prev_output_tokens'][:, 0] = self.new_tgt_bos 60 | 61 | return samples 62 | 63 | def num_tokens(self, index): 64 | return self.dataset.num_tokens(index) 65 | 66 | def size(self, index): 67 | return self.dataset.size(index) 68 | 69 | def ordered_indices(self): 70 | return self.dataset.ordered_indices() 71 | 72 | @property 73 | def supports_prefetch(self): 74 | return getattr(self.dataset, 'supports_prefetch', False) 75 | 76 | def prefetch(self, indices): 77 | return self.dataset.prefetch(indices) 78 | -------------------------------------------------------------------------------- /2-4/386/fairseq/meters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
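# A minimal usage sketch for the meters defined just below. The loop, the sleep
# call, and the loss values are stand-ins invented for illustration; they are not
# taken from this repository's training code.
import time

from fairseq.meters import AverageMeter, StopwatchMeter

loss_meter = AverageMeter()
step_timer = StopwatchMeter()
for batch_loss in [0.9, 0.7, 0.6]:          # pretend per-batch losses
    step_timer.start()
    time.sleep(0.01)                        # pretend forward/backward work
    step_timer.stop()
    loss_meter.update(batch_loss, n=1)
print('avg loss %.3f, avg step time %.4fs' % (loss_meter.avg, step_timer.avg))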
7 | 8 | import time 9 | 10 | 11 | class AverageMeter(object): 12 | """Computes and stores the average and current value""" 13 | def __init__(self): 14 | self.reset() 15 | 16 | def reset(self): 17 | self.val = 0 18 | self.avg = 0 19 | self.sum = 0 20 | self.count = 0 21 | 22 | def update(self, val, n=1): 23 | self.val = val 24 | self.sum += val * n 25 | self.count += n 26 | self.avg = self.sum / self.count 27 | 28 | 29 | class TimeMeter(object): 30 | """Computes the average occurrence of some event per second""" 31 | def __init__(self, init=0): 32 | self.reset(init) 33 | 34 | def reset(self, init=0): 35 | self.init = init 36 | self.start = time.time() 37 | self.n = 0 38 | 39 | def update(self, val=1): 40 | self.n += val 41 | 42 | @property 43 | def avg(self): 44 | return self.n / self.elapsed_time 45 | 46 | @property 47 | def elapsed_time(self): 48 | return self.init + (time.time() - self.start) 49 | 50 | 51 | class StopwatchMeter(object): 52 | """Computes the sum/avg duration of some event in seconds""" 53 | def __init__(self): 54 | self.reset() 55 | 56 | def start(self): 57 | self.start_time = time.time() 58 | 59 | def stop(self, n=1): 60 | if self.start_time is not None: 61 | delta = time.time() - self.start_time 62 | self.sum += delta 63 | self.n += n 64 | self.start_time = None 65 | 66 | def reset(self): 67 | self.sum = 0 68 | self.n = 0 69 | self.start_time = None 70 | 71 | @property 72 | def avg(self): 73 | return self.sum / self.n 74 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/composite_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from fairseq.models import FairseqEncoder 9 | 10 | 11 | class CompositeEncoder(FairseqEncoder): 12 | """ 13 | A wrapper around a dictionary of :class:`FairseqEncoder` objects. 14 | 15 | We run forward on each encoder and return a dictionary of outputs. The first 16 | encoder's dictionary is used for initialization. 17 | 18 | Args: 19 | encoders (dict): a dictionary of :class:`FairseqEncoder` objects. 
20 | """ 21 | 22 | def __init__(self, encoders): 23 | super().__init__(next(iter(encoders.values())).dictionary) 24 | self.encoders = encoders 25 | for key in self.encoders: 26 | self.add_module(key, self.encoders[key]) 27 | 28 | def forward(self, src_tokens, src_lengths): 29 | """ 30 | Args: 31 | src_tokens (LongTensor): tokens in the source language of shape 32 | `(batch, src_len)` 33 | src_lengths (LongTensor): lengths of each source sentence of shape 34 | `(batch)` 35 | 36 | Returns: 37 | dict: 38 | the outputs from each Encoder 39 | """ 40 | encoder_out = {} 41 | for key in self.encoders: 42 | encoder_out[key] = self.encoders[key](src_tokens, src_lengths) 43 | return encoder_out 44 | 45 | def reorder_encoder_out(self, encoder_out, new_order): 46 | """Reorder encoder output according to new_order.""" 47 | for key in self.encoders: 48 | encoder_out[key] = self.encoders[key].reorder_encoder_out(encoder_out[key], new_order) 49 | return encoder_out 50 | 51 | def max_positions(self): 52 | return min([self.encoders[key].max_positions() for key in self.encoders]) 53 | 54 | def upgrade_state_dict(self, state_dict): 55 | for key in self.encoders: 56 | self.encoders[key].upgrade_state_dict(state_dict) 57 | return state_dict 58 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/distributed_fairseq_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import inspect 9 | 10 | from torch.nn import parallel 11 | 12 | from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel 13 | from fairseq.models import BaseFairseqModel 14 | 15 | 16 | def DistributedFairseqModel(args, model): 17 | """ 18 | Wrap a *model* to support distributed data parallel training. 19 | 20 | This is similar to the built-in DistributedDataParallel, but allows 21 | additional configuration of the DistributedDataParallel class to 22 | use, and also provides easier access to the wrapped model by 23 | forwarding requests for missing attributes to the wrapped model. 
24 | 25 | Args: 26 | args (argparse.Namespace): fairseq args 27 | model (BaseFairseqModel): model to wrap 28 | """ 29 | # determine which DDP class to extend 30 | assert isinstance(model, BaseFairseqModel) 31 | if args.ddp_backend == 'c10d': 32 | ddp_class = parallel.DistributedDataParallel 33 | init_kwargs = dict( 34 | module=model, 35 | device_ids=[args.device_id], 36 | output_device=args.device_id, 37 | broadcast_buffers=False, 38 | bucket_cap_mb=args.bucket_cap_mb, 39 | ) 40 | # Maintain backward compatibility 41 | if 'check_reduction' in inspect.getargspec(ddp_class)[0]: 42 | init_kwargs['check_reduction'] = True 43 | if 'find_unused_parameters' in inspect.getargspec(ddp_class)[0]: 44 | init_kwargs['find_unused_parameters'] = args.find_unused_parameters 45 | elif args.ddp_backend == 'no_c10d': 46 | ddp_class = LegacyDistributedDataParallel 47 | init_kwargs = dict( 48 | module=model, 49 | world_size=args.distributed_world_size, 50 | buffer_size=2**28, 51 | ) 52 | else: 53 | raise ValueError('Unknown --ddp-backend: ' + args.ddp_backend) 54 | 55 | class _DistributedFairseqModel(ddp_class): 56 | """Extend DistributedDataParallel to check for missing 57 | attributes in the wrapped module.""" 58 | 59 | def __init__(self, *args, **kwargs): 60 | super().__init__(*args, **kwargs) 61 | 62 | def __getattr__(self, name): 63 | wrapped_module = super().__getattr__('module') 64 | if hasattr(wrapped_module, name): 65 | return getattr(wrapped_module, name) 66 | return super().__getattr__(name) 67 | 68 | return _DistributedFairseqModel(**init_kwargs) 69 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/fairseq_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.nn as nn 9 | 10 | from fairseq import utils 11 | 12 | 13 | class FairseqDecoder(nn.Module): 14 | """Base class for decoders.""" 15 | 16 | def __init__(self, dictionary): 17 | super().__init__() 18 | self.dictionary = dictionary 19 | self.onnx_trace = False 20 | 21 | def forward(self, prev_output_tokens, encoder_out=None, **kwargs): 22 | """ 23 | Args: 24 | prev_output_tokens (LongTensor): shifted output tokens of shape 25 | `(batch, tgt_len)`, for input feeding/teacher forcing 26 | encoder_out (dict, optional): output from the encoder, used for 27 | encoder-side attention 28 | 29 | Returns: 30 | tuple: 31 | - the decoder's output of shape `(batch, tgt_len, vocab)` 32 | - a dictionary with any model-specific outputs 33 | """ 34 | x, extra = self.extract_features(prev_output_tokens, encoder_out=encoder_out, **kwargs) 35 | x = self.output_layer(x) 36 | return x, extra 37 | 38 | def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs): 39 | """ 40 | Returns: 41 | tuple: 42 | - the decoder's features of shape `(batch, tgt_len, embed_dim)` 43 | - a dictionary with any model-specific outputs 44 | """ 45 | raise NotImplementedError 46 | 47 | def output_layer(self, features, **kwargs): 48 | """ 49 | Project features to the default output size, e.g., vocabulary size. 50 | 51 | Args: 52 | features (Tensor): features returned by *extract_features*. 
53 | """ 54 | raise NotImplementedError 55 | 56 | def get_normalized_probs(self, net_output, log_probs, sample): 57 | """Get normalized probabilities (or log probs) from a net's output.""" 58 | 59 | if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None: 60 | if sample is not None: 61 | assert 'target' in sample 62 | target = sample['target'] 63 | else: 64 | target = None 65 | out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) 66 | return out.exp_() if not log_probs else out 67 | 68 | logits = net_output[0] 69 | if log_probs: 70 | return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) 71 | else: 72 | return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) 73 | 74 | def max_positions(self): 75 | """Maximum input length supported by the decoder.""" 76 | return 1e6 # an arbitrary large number 77 | 78 | def upgrade_state_dict(self, state_dict): 79 | """Upgrade a (possibly old) state dict for new versions of fairseq.""" 80 | return state_dict 81 | 82 | def prepare_for_onnx_export_(self): 83 | self.onnx_trace = True 84 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/fairseq_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.nn as nn 9 | 10 | 11 | class FairseqEncoder(nn.Module): 12 | """Base class for encoders.""" 13 | 14 | def __init__(self, dictionary): 15 | super().__init__() 16 | self.dictionary = dictionary 17 | 18 | def forward(self, src_tokens, src_lengths=None, **kwargs): 19 | """ 20 | Args: 21 | src_tokens (LongTensor): tokens in the source language of shape 22 | `(batch, src_len)` 23 | src_lengths (LongTensor): lengths of each source sentence of shape 24 | `(batch)` 25 | """ 26 | raise NotImplementedError 27 | 28 | def reorder_encoder_out(self, encoder_out, new_order): 29 | """ 30 | Reorder encoder output according to `new_order`. 31 | 32 | Args: 33 | encoder_out: output from the ``forward()`` method 34 | new_order (LongTensor): desired order 35 | 36 | Returns: 37 | `encoder_out` rearranged according to `new_order` 38 | """ 39 | raise NotImplementedError 40 | 41 | def max_positions(self): 42 | """Maximum input length supported by the encoder.""" 43 | return 1e6 # an arbitrary large number 44 | 45 | def upgrade_state_dict(self, state_dict): 46 | """Upgrade a (possibly old) state dict for new versions of fairseq.""" 47 | return state_dict 48 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/fairseq_incremental_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from fairseq.models import FairseqDecoder 9 | 10 | 11 | class FairseqIncrementalDecoder(FairseqDecoder): 12 | """Base class for incremental decoders. 
13 | 14 | Incremental decoding is a special mode at inference time where the Model 15 | only receives a single timestep of input corresponding to the previous 16 | output token (for input feeding) and must produce the next output 17 | *incrementally*. Thus the model must cache any long-term state that is 18 | needed about the sequence, e.g., hidden states, convolutional states, etc. 19 | 20 | Compared to the standard :class:`FairseqDecoder` interface, the incremental 21 | decoder interface allows :func:`forward` functions to take an extra keyword 22 | argument (*incremental_state*) that can be used to cache state across 23 | time-steps. 24 | 25 | The :class:`FairseqIncrementalDecoder` interface also defines the 26 | :func:`reorder_incremental_state` method, which is used during beam search 27 | to select and reorder the incremental state based on the selection of beams. 28 | 29 | To learn more about how incremental decoding works, refer to `this blog 30 | `_. 31 | """ 32 | 33 | def __init__(self, dictionary): 34 | super().__init__(dictionary) 35 | 36 | def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs): 37 | """ 38 | Args: 39 | prev_output_tokens (LongTensor): shifted output tokens of shape 40 | `(batch, tgt_len)`, for input feeding/teacher forcing 41 | encoder_out (dict, optional): output from the encoder, used for 42 | encoder-side attention 43 | incremental_state (dict, optional): dictionary used for storing 44 | state during :ref:`Incremental decoding` 45 | 46 | Returns: 47 | tuple: 48 | - the decoder's output of shape `(batch, tgt_len, vocab)` 49 | - a dictionary with any model-specific outputs 50 | """ 51 | raise NotImplementedError 52 | 53 | def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs): 54 | """ 55 | Returns: 56 | tuple: 57 | - the decoder's features of shape `(batch, tgt_len, embed_dim)` 58 | - a dictionary with any model-specific outputs 59 | """ 60 | raise NotImplementedError 61 | 62 | def reorder_incremental_state(self, incremental_state, new_order): 63 | """Reorder incremental state. 64 | 65 | This should be called when the order of the input has changed from the 66 | previous time step. A typical use case is beam search, where the input 67 | order changes between time steps based on the selection of beams. 68 | """ 69 | seen = set() 70 | 71 | def apply_reorder_incremental_state(module): 72 | if module != self and hasattr(module, 'reorder_incremental_state') \ 73 | and module not in seen: 74 | seen.add(module) 75 | module.reorder_incremental_state(incremental_state, new_order) 76 | 77 | self.apply(apply_reorder_incremental_state) 78 | 79 | def set_beam_size(self, beam_size): 80 | """Sets the beam size in the decoder and all children.""" 81 | if getattr(self, '_beam_size', -1) != beam_size: 82 | seen = set() 83 | 84 | def apply_set_beam_size(module): 85 | if module != self and hasattr(module, 'set_beam_size') \ 86 | and module not in seen: 87 | seen.add(module) 88 | module.set_beam_size(beam_size) 89 | 90 | self.apply(apply_set_beam_size) 91 | self._beam_size = beam_size 92 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from .adaptive_input import AdaptiveInput 9 | from .adaptive_softmax import AdaptiveSoftmax 10 | from .beamable_mm import BeamableMM 11 | from .character_token_embedder import CharacterTokenEmbedder 12 | from .conv_tbc import ConvTBC 13 | from .downsampled_multihead_attention import DownsampledMultiHeadAttention 14 | from .dynamic_convolution import DynamicConv1dTBC 15 | from .gelu import gelu, gelu_accurate 16 | from .grad_multiply import GradMultiply 17 | from .highway import Highway 18 | from .layer_norm import LayerNorm 19 | from .learned_positional_embedding import LearnedPositionalEmbedding 20 | from .lightweight_convolution import LightweightConv1dTBC 21 | from .linearized_convolution import LinearizedConvolution 22 | from .logsumexp_moe import LogSumExpMoE 23 | from .mean_pool_gating_network import MeanPoolGatingNetwork 24 | from .multihead_attention import MultiheadAttention 25 | from .positional_embedding import PositionalEmbedding 26 | from .scalar_bias import ScalarBias 27 | from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding 28 | from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer 29 | from .transformer_sentence_encoder import TransformerSentenceEncoder 30 | from .unfold import unfold1d 31 | 32 | __all__ = [ 33 | 'AdaptiveInput', 34 | 'AdaptiveSoftmax', 35 | 'BeamableMM', 36 | 'CharacterTokenEmbedder', 37 | 'ConvTBC', 38 | 'DownsampledMultiHeadAttention', 39 | 'DynamicConv1dTBC', 40 | 'gelu', 41 | 'gelu_accurate', 42 | 'GradMultiply', 43 | 'Highway', 44 | 'LayerNorm', 45 | 'LearnedPositionalEmbedding', 46 | 'LightweightConv1dTBC', 47 | 'LinearizedConvolution', 48 | 'LogSumExpMoE', 49 | 'MeanPoolGatingNetwork', 50 | 'MultiheadAttention', 51 | 'PositionalEmbedding', 52 | 'ScalarBias', 53 | 'SinusoidalPositionalEmbedding', 54 | 'TransformerSentenceEncoderLayer', 55 | 'TransformerSentenceEncoder', 56 | 'unfold1d', 57 | ] 58 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/adaptive_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
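# A small sketch of how the AdaptiveInput module defined just below can be
# instantiated and applied; the vocabulary size, cutoffs, and dimensions are
# arbitrary example values, not settings used anywhere in this project.
import torch

from fairseq.modules import AdaptiveInput

adaptive_emb = AdaptiveInput(
    vocab_size=1000, padding_idx=0, initial_dim=64,
    factor=2.0, output_dim=64, cutoff=[100, 500],
)
tokens = torch.randint(0, 1000, (2, 7))     # (batch, seq_len) of token indices
embedded = adaptive_emb(tokens)             # frequent ids use the 64-dim band, rarer
print(embedded.shape)                       # ids 32/16 dims, all projected back to 64
                                            # -> torch.Size([2, 7, 64])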
7 | 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from typing import List 13 | 14 | 15 | class AdaptiveInput(nn.Module): 16 | 17 | def __init__( 18 | self, 19 | vocab_size: int, 20 | padding_idx: int, 21 | initial_dim: int, 22 | factor: float, 23 | output_dim: int, 24 | cutoff: List[int], 25 | ): 26 | super().__init__() 27 | 28 | if vocab_size > cutoff[-1]: 29 | cutoff = cutoff + [vocab_size] 30 | else: 31 | assert vocab_size == cutoff[ 32 | -1], 'cannot specify cutoff larger than vocab size' 33 | 34 | self.cutoff = cutoff 35 | self.embedding_dim = output_dim 36 | self.padding_idx = padding_idx 37 | 38 | self.embeddings = nn.ModuleList() 39 | for i in range(len(self.cutoff)): 40 | prev = self.cutoff[i - 1] if i > 0 else 0 41 | size = self.cutoff[i] - prev 42 | dim = int(initial_dim // (factor ** i)) 43 | seq = nn.Sequential( 44 | nn.Embedding(size, dim, padding_idx), 45 | nn.Linear(dim, output_dim, bias=False) 46 | ) 47 | self.embeddings.append(seq) 48 | 49 | def init_weights(m): 50 | if isinstance(m, nn.Embedding): 51 | nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5) 52 | nn.init.constant_(m.weight[padding_idx], 0) 53 | elif hasattr(m, 'weight'): 54 | nn.init.xavier_uniform_(m.weight) 55 | 56 | self.apply(init_weights) 57 | 58 | self.register_buffer('_float_tensor', torch.FloatTensor(1)) 59 | 60 | def weights_for_band(self, band: int): 61 | return self.embeddings[band][0].weight, self.embeddings[band][1].weight 62 | 63 | def forward(self, input: torch.Tensor): 64 | result = self._float_tensor.new(input.shape + (self.embedding_dim,)) 65 | for i in range(len(self.cutoff)): 66 | mask = input.lt(self.cutoff[i]) 67 | if i > 0: 68 | mask.mul_(input.ge(self.cutoff[i - 1])) 69 | chunk_input = input[mask] - self.cutoff[i - 1] 70 | else: 71 | chunk_input = input[mask] 72 | if mask.any(): 73 | result[mask] = self.embeddings[i](chunk_input) 74 | return result 75 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/beamable_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | 12 | class BeamableMM(nn.Module): 13 | """This module provides an optimized MM for beam decoding with attention. 14 | 15 | It leverages the fact that the source-side of the input is replicated beam 16 | times and the target-side of the input is of width one. This layer speeds up 17 | inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)} 18 | with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}. 
19 | """ 20 | def __init__(self, beam_size=None): 21 | super(BeamableMM, self).__init__() 22 | self.beam_size = beam_size 23 | 24 | def forward(self, input1, input2): 25 | if ( 26 | not self.training and # test mode 27 | self.beam_size is not None and # beam size is set 28 | input1.dim() == 3 and # only support batched input 29 | input1.size(1) == 1 # single time step update 30 | ): 31 | bsz, beam = input1.size(0), self.beam_size 32 | 33 | # bsz x 1 x nhu --> bsz/beam x beam x nhu 34 | input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1) 35 | 36 | # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu 37 | input2 = input2.unfold(0, beam, beam)[:, :, :, 0] 38 | 39 | # use non batched operation if bsz = beam 40 | if input1.size(0) == 1: 41 | output = torch.mm(input1[0, :, :], input2[0, :, :]) 42 | else: 43 | output = input1.bmm(input2) 44 | return output.view(bsz, 1, -1) 45 | else: 46 | return input1.bmm(input2) 47 | 48 | def set_beam_size(self, beam_size): 49 | self.beam_size = beam_size 50 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/conv_tbc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | from torch.nn.modules.utils import _single 10 | 11 | 12 | class ConvTBC(torch.nn.Module): 13 | """1D convolution over an input of shape (time x batch x channel) 14 | 15 | The implementation uses gemm to perform the convolution. This implementation 16 | is faster than cuDNN for small kernel sizes. 17 | """ 18 | def __init__(self, in_channels, out_channels, kernel_size, padding=0): 19 | super(ConvTBC, self).__init__() 20 | self.in_channels = in_channels 21 | self.out_channels = out_channels 22 | self.kernel_size = _single(kernel_size) 23 | self.padding = _single(padding) 24 | 25 | self.weight = torch.nn.Parameter(torch.Tensor( 26 | self.kernel_size[0], in_channels, out_channels)) 27 | self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) 28 | 29 | def forward(self, input): 30 | return torch.conv_tbc(input.contiguous(), self.weight, self.bias, self.padding[0]) 31 | 32 | def __repr__(self): 33 | s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}' 34 | ', padding={padding}') 35 | if self.bias is None: 36 | s += ', bias=False' 37 | s += ')' 38 | return s.format(name=self.__class__.__name__, **self.__dict__) 39 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | """ 8 | See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with 9 | the corresponding GitHub repo: https://github.com/hendrycks/GELUs 10 | """ 11 | 12 | import math 13 | 14 | import torch 15 | 16 | 17 | def gelu_accurate(x): 18 | if not hasattr(gelu_accurate, "_a"): 19 | gelu_accurate._a = math.sqrt(2 / math.pi) 20 | return 0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))) 21 | 22 | 23 | def gelu(x: torch.Tensor) -> torch.Tensor: 24 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 25 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/grad_multiply.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | 10 | 11 | class GradMultiply(torch.autograd.Function): 12 | @staticmethod 13 | def forward(ctx, x, scale): 14 | ctx.scale = scale 15 | res = x.new(x) 16 | return res 17 | 18 | @staticmethod 19 | def backward(ctx, grad): 20 | return grad * ctx.scale, None 21 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/highway.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | 10 | from torch import nn 11 | 12 | 13 | class Highway(torch.nn.Module): 14 | """ 15 | A `Highway layer `_. 16 | Adopted from the AllenNLP implementation. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | input_dim: int, 22 | num_layers: int = 1 23 | ): 24 | super(Highway, self).__init__() 25 | self.input_dim = input_dim 26 | self.layers = nn.ModuleList([nn.Linear(input_dim, input_dim * 2) 27 | for _ in range(num_layers)]) 28 | self.activation = nn.ReLU() 29 | 30 | self.reset_parameters() 31 | 32 | def reset_parameters(self): 33 | for layer in self.layers: 34 | # As per comment in AllenNLP: 35 | # We should bias the highway layer to just carry its input forward. We do that by 36 | # setting the bias on `B(x)` to be positive, because that means `g` will be biased to 37 | # be high, so we will carry the input forward. The bias on `B(x)` is the second half 38 | # of the bias vector in each Linear layer. 39 | nn.init.constant_(layer.bias[self.input_dim:], 1) 40 | 41 | nn.init.constant_(layer.bias[:self.input_dim], 0) 42 | nn.init.xavier_normal_(layer.weight) 43 | 44 | def forward( 45 | self, 46 | x: torch.Tensor 47 | ): 48 | for layer in self.layers: 49 | projection = layer(x) 50 | proj_x, gate = projection.chunk(2, dim=-1) 51 | proj_x = self.activation(proj_x) 52 | gate = torch.sigmoid(gate) 53 | x = gate * x + (gate.new_tensor([1]) - gate) * proj_x 54 | return x 55 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | 10 | 11 | def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False): 12 | if not export and torch.cuda.is_available(): 13 | try: 14 | from apex.normalization import FusedLayerNorm 15 | return FusedLayerNorm(normalized_shape, eps, elementwise_affine) 16 | except ImportError: 17 | pass 18 | return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) 19 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/learned_positional_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.nn as nn 9 | 10 | from fairseq import utils 11 | 12 | 13 | class LearnedPositionalEmbedding(nn.Embedding): 14 | """ 15 | This module learns positional embeddings up to a fixed maximum size. 16 | Padding ids are ignored by either offsetting based on padding_idx 17 | or by setting padding_idx to None and ensuring that the appropriate 18 | position ids are passed to the forward function. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | num_embeddings: int, 24 | embedding_dim: int, 25 | padding_idx: int, 26 | ): 27 | super().__init__(num_embeddings, embedding_dim, padding_idx) 28 | self.onnx_trace = False 29 | 30 | def forward(self, input, incremental_state=None, positions=None): 31 | """Input is expected to be of size [bsz x seqlen].""" 32 | assert ( 33 | (positions is None) or (self.padding_idx is None) 34 | ), "If positions is pre-computed then padding_idx should not be set." 35 | 36 | if positions is None: 37 | if incremental_state is not None: 38 | # positions is the same for every token when decoding a single step 39 | positions = input.data.new(1, 1).fill_(self.padding_idx + input.size(1)) 40 | else: 41 | positions = utils.make_positions( 42 | input.data, self.padding_idx, onnx_trace=self.onnx_trace, 43 | ) 44 | return super().forward(positions) 45 | 46 | def max_positions(self): 47 | """Maximum number of supported positions.""" 48 | if self.padding_idx is not None: 49 | return self.num_embeddings - self.padding_idx - 1 50 | else: 51 | return self.num_embeddings 52 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/linearized_convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | from fairseq import utils 12 | 13 | from .conv_tbc import ConvTBC 14 | 15 | 16 | class LinearizedConvolution(ConvTBC): 17 | """An optimized version of nn.Conv1d. 18 | 19 | At training time, this module uses ConvTBC, which is an optimized version 20 | of Conv1d. 
At inference time, it optimizes incremental generation (i.e., 21 | one time step at a time) by replacing the convolutions with linear layers. 22 | Note that the input order changes from training to inference. 23 | """ 24 | 25 | def __init__(self, in_channels, out_channels, kernel_size, **kwargs): 26 | super().__init__(in_channels, out_channels, kernel_size, **kwargs) 27 | self._linearized_weight = None 28 | self.register_backward_hook(self._clear_linearized_weight) 29 | 30 | def forward(self, input, incremental_state=None): 31 | """ 32 | Args: 33 | incremental_state: Used to buffer signal; if not None, then input is 34 | expected to contain a single frame. If the input order changes 35 | between time steps, call reorder_incremental_state. 36 | Input: 37 | Time x Batch x Channel during training 38 | Batch x Time x Channel during inference 39 | """ 40 | if incremental_state is None: 41 | output = super().forward(input) 42 | if self.kernel_size[0] > 1 and self.padding[0] > 0: 43 | # remove future timesteps added by padding 44 | output = output[:-self.padding[0], :, :] 45 | return output 46 | 47 | # reshape weight 48 | weight = self._get_linearized_weight() 49 | kw = self.kernel_size[0] 50 | 51 | bsz = input.size(0) # input: bsz x len x dim 52 | if kw > 1: 53 | input = input.data 54 | input_buffer = self._get_input_buffer(incremental_state) 55 | if input_buffer is None: 56 | input_buffer = input.new(bsz, kw, input.size(2)).zero_() 57 | self._set_input_buffer(incremental_state, input_buffer) 58 | else: 59 | # shift buffer 60 | input_buffer[:, :-1, :] = input_buffer[:, 1:, :].clone() 61 | # append next input 62 | input_buffer[:, -1, :] = input[:, -1, :] 63 | input = input_buffer 64 | with torch.no_grad(): 65 | output = F.linear(input.view(bsz, -1), weight, self.bias) 66 | return output.view(bsz, 1, -1) 67 | 68 | def reorder_incremental_state(self, incremental_state, new_order): 69 | input_buffer = self._get_input_buffer(incremental_state) 70 | if input_buffer is not None: 71 | input_buffer = input_buffer.index_select(0, new_order) 72 | self._set_input_buffer(incremental_state, input_buffer) 73 | 74 | def _get_input_buffer(self, incremental_state): 75 | return utils.get_incremental_state(self, incremental_state, 'input_buffer') 76 | 77 | def _set_input_buffer(self, incremental_state, new_buffer): 78 | return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer) 79 | 80 | def _get_linearized_weight(self): 81 | if self._linearized_weight is None: 82 | kw = self.kernel_size[0] 83 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 84 | assert weight.size() == (self.out_channels, kw, self.in_channels) 85 | self._linearized_weight = weight.view(self.out_channels, -1) 86 | return self._linearized_weight 87 | 88 | def _clear_linearized_weight(self, *args): 89 | self._linearized_weight = None 90 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/logsumexp_moe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | 10 | 11 | class LogSumExpMoE(torch.autograd.Function): 12 | """Standard LogSumExp forward pass, but use *posterior* for the backward. 
13 | 14 | See `"Mixture Models for Diverse Machine Translation: Tricks of the Trade" 15 | (Shen et al., 2019) `_. 16 | """ 17 | 18 | @staticmethod 19 | def forward(ctx, logp, posterior, dim=-1): 20 | ctx.save_for_backward(posterior) 21 | ctx.dim = dim 22 | return torch.logsumexp(logp, dim=dim) 23 | 24 | @staticmethod 25 | def backward(ctx, grad_output): 26 | posterior, = ctx.saved_tensors 27 | grad_logp = grad_output.unsqueeze(ctx.dim) * posterior 28 | return grad_logp, None, None 29 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/mean_pool_gating_network.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | class MeanPoolGatingNetwork(torch.nn.Module): 13 | """A simple mean-pooling gating network for selecting experts. 14 | 15 | This module applies mean pooling over an encoder's output and returns 16 | responsibilities for each expert. The encoder format is expected to match 17 | :class:`fairseq.models.transformer.TransformerEncoder`. 18 | """ 19 | 20 | def __init__(self, embed_dim, num_experts, dropout=None): 21 | super().__init__() 22 | self.embed_dim = embed_dim 23 | self.num_experts = num_experts 24 | 25 | self.fc1 = torch.nn.Linear(embed_dim, embed_dim) 26 | self.dropout = torch.nn.Dropout(dropout) if dropout is not None else None 27 | self.fc2 = torch.nn.Linear(embed_dim, num_experts) 28 | 29 | def forward(self, encoder_out): 30 | if not ( 31 | isinstance(encoder_out, dict) 32 | and 'encoder_out' in encoder_out 33 | and 'encoder_padding_mask' in encoder_out 34 | and encoder_out['encoder_out'].size(2) == self.embed_dim 35 | ): 36 | raise ValueError('Unexpected format for encoder_out') 37 | 38 | # mean pooling over time 39 | encoder_padding_mask = encoder_out['encoder_padding_mask'] # B x T 40 | encoder_out = encoder_out['encoder_out'].transpose(0, 1) # B x T x C 41 | if encoder_padding_mask is not None: 42 | encoder_out = encoder_out.clone() # required because of transpose above 43 | encoder_out[encoder_padding_mask] = 0 44 | ntokens = torch.sum(1 - encoder_padding_mask, dim=1, keepdim=True) 45 | x = torch.sum(encoder_out, dim=1) / ntokens.type_as(encoder_out) 46 | else: 47 | x = torch.mean(encoder_out, dim=1) 48 | 49 | x = torch.tanh(self.fc1(x)) 50 | if self.dropout is not None: 51 | x = self.dropout(x) 52 | x = self.fc2(x) 53 | return F.log_softmax(x, dim=-1, dtype=torch.float32).type_as(x) 54 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/positional_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
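# A usage sketch for the PositionalEmbedding factory defined below; the sizes,
# padding index, and toy token tensor are made up for illustration.
import torch

from fairseq.modules import PositionalEmbedding

pad_idx, max_positions, dim = 1, 128, 32
pos_emb = PositionalEmbedding(max_positions, dim, pad_idx, learned=False)

tokens = torch.tensor([[5, 6, 7, pad_idx],
                       [8, 9, pad_idx, pad_idx]])
positions = pos_emb(tokens)        # (2, 4, 32); padded steps map to the zero vector
print(positions.shape)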
7 | 8 | import torch.nn as nn 9 | 10 | from .learned_positional_embedding import LearnedPositionalEmbedding 11 | from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding 12 | 13 | 14 | def PositionalEmbedding( 15 | num_embeddings: int, 16 | embedding_dim: int, 17 | padding_idx: int, 18 | learned: bool = False, 19 | ): 20 | if learned: 21 | # if padding_idx is specified then offset the embedding ids by 22 | # this index and adjust num_embeddings appropriately 23 | # TODO: The right place for this offset would be inside 24 | # LearnedPositionalEmbedding. Move this there for a cleaner implementation. 25 | if padding_idx is not None: 26 | num_embeddings = num_embeddings + padding_idx + 1 27 | m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) 28 | nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) 29 | if padding_idx is not None: 30 | nn.init.constant_(m.weight[padding_idx], 0) 31 | else: 32 | m = SinusoidalPositionalEmbedding( 33 | embedding_dim, padding_idx, init_size=num_embeddings + padding_idx + 1, 34 | ) 35 | return m 36 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/scalar_bias.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import torch 10 | 11 | 12 | class ScalarBias(torch.autograd.Function): 13 | """ 14 | Adds a vector of scalars, used in self-attention mechanism to allow 15 | the model to optionally attend to this vector instead of the past 16 | """ 17 | 18 | @staticmethod 19 | def forward(ctx, input, dim, bias_init): 20 | size = list(input.size()) 21 | size[dim] += 1 22 | output = input.new(*size).fill_(bias_init) 23 | output.narrow(dim, 1, size[dim] - 1).copy_(input) 24 | ctx.dim = dim 25 | return output 26 | 27 | @staticmethod 28 | def backward(ctx, grad): 29 | return grad.narrow(ctx.dim, 1, grad.size(ctx.dim) - 1), None, None 30 | 31 | 32 | def scalar_bias(input, dim, bias_init=0): 33 | return ScalarBias.apply(input, dim, bias_init) 34 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/sinusoidal_positional_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import math 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.onnx.operators 13 | 14 | from fairseq import utils 15 | 16 | 17 | class SinusoidalPositionalEmbedding(nn.Module): 18 | """This module produces sinusoidal positional embeddings of any length. 19 | 20 | Padding symbols are ignored. 
21 | """ 22 | 23 | def __init__(self, embedding_dim, padding_idx, init_size=1024): 24 | super().__init__() 25 | self.embedding_dim = embedding_dim 26 | self.padding_idx = padding_idx 27 | self.weights = SinusoidalPositionalEmbedding.get_embedding( 28 | init_size, 29 | embedding_dim, 30 | padding_idx, 31 | ) 32 | self.onnx_trace = False 33 | self.register_buffer('_float_tensor', torch.FloatTensor(1)) 34 | 35 | def prepare_for_onnx_export_(self): 36 | self.onnx_trace = True 37 | 38 | @staticmethod 39 | def get_embedding(num_embeddings, embedding_dim, padding_idx=None): 40 | """Build sinusoidal embeddings. 41 | 42 | This matches the implementation in tensor2tensor, but differs slightly 43 | from the description in Section 3.5 of "Attention Is All You Need". 44 | """ 45 | half_dim = embedding_dim // 2 46 | emb = math.log(10000) / (half_dim - 1) 47 | emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) 48 | emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) 49 | emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) 50 | if embedding_dim % 2 == 1: 51 | # zero pad 52 | emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) 53 | if padding_idx is not None: 54 | emb[padding_idx, :] = 0 55 | return emb 56 | 57 | def forward(self, input, incremental_state=None, timestep=None, **kwargs): 58 | """Input is expected to be of size [bsz x seqlen].""" 59 | bsz, seq_len = torch.onnx.operators.shape_as_tensor(input) 60 | max_pos = self.padding_idx + 1 + seq_len 61 | if self.weights is None or max_pos > self.weights.size(0): 62 | # recompute/expand embeddings if needed 63 | self.weights = SinusoidalPositionalEmbedding.get_embedding( 64 | max_pos, 65 | self.embedding_dim, 66 | self.padding_idx, 67 | ) 68 | self.weights = self.weights.to(self._float_tensor) 69 | 70 | if incremental_state is not None: 71 | # positions is the same for every token when decoding a single step 72 | pos = (timestep.int() + 1).long() if timestep is not None else seq_len 73 | if self.onnx_trace: 74 | return self.weights[self.padding_idx + pos, :].unsqueeze(1).repeat(bsz, 1, 1) 75 | return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) 76 | 77 | positions = utils.make_positions(input, self.padding_idx, onnx_trace=self.onnx_trace) 78 | if self.onnx_trace: 79 | flat_embeddings = self.weights.detach().index_select(0, positions.view(-1)) 80 | embedding_shape = torch.cat((bsz.view(1), seq_len.view(1), torch.LongTensor([-1]))) 81 | embeddings = torch.onnx.operators.reshape_from_tensor_shape(flat_embeddings, embedding_shape) 82 | return embeddings 83 | return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach() 84 | 85 | def max_positions(self): 86 | """Maximum number of supported positions.""" 87 | return int(1e5) # an arbitrary large number 88 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/transformer_sentence_encoder_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | from fairseq import utils 13 | from fairseq.modules import ( 14 | LayerNorm, 15 | MultiheadAttention, 16 | ) 17 | 18 | 19 | class TransformerSentenceEncoderLayer(nn.Module): 20 | """ 21 | Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained 22 | models. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | embedding_dim: float = 768, 28 | ffn_embedding_dim: float = 3072, 29 | num_attention_heads: float = 8, 30 | dropout: float = 0.1, 31 | attention_dropout: float = 0.1, 32 | activation_dropout: float = 0.1, 33 | activation_fn: str = 'relu', 34 | add_bias_kv: bool = False, 35 | add_zero_attn: bool = False, 36 | export: bool = False, 37 | ) -> None: 38 | 39 | super().__init__() 40 | # Initialize parameters 41 | self.embedding_dim = embedding_dim 42 | self.dropout = dropout 43 | self.activation_dropout = activation_dropout 44 | 45 | # Initialize blocks 46 | self.activation_fn = utils.get_activation_fn(activation_fn) 47 | self.self_attn = MultiheadAttention( 48 | self.embedding_dim, 49 | num_attention_heads, 50 | dropout=attention_dropout, 51 | add_bias_kv=add_bias_kv, 52 | add_zero_attn=add_zero_attn, 53 | self_attention=True 54 | ) 55 | 56 | # layer norm associated with the self attention layer 57 | self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export) 58 | self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) 59 | self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) 60 | 61 | # layer norm associated with the position wise feed-forward NN 62 | self.final_layer_norm = LayerNorm(self.embedding_dim, export=export) 63 | 64 | def forward( 65 | self, 66 | x: torch.Tensor, 67 | self_attn_mask: torch.Tensor = None, 68 | self_attn_padding_mask: torch.Tensor = None, 69 | ): 70 | """ 71 | LayerNorm is applied either before or after the self-attention/ffn 72 | modules similar to the original Transformer implementation. 73 | """ 74 | residual = x 75 | x, attn = self.self_attn( 76 | query=x, 77 | key=x, 78 | value=x, 79 | key_padding_mask=self_attn_padding_mask, 80 | need_weights=False, 81 | attn_mask=self_attn_mask, 82 | ) 83 | x = F.dropout(x, p=self.dropout, training=self.training) 84 | x = residual + x 85 | x = self.self_attn_layer_norm(x) 86 | 87 | residual = x 88 | x = self.activation_fn(self.fc1(x)) 89 | x = F.dropout(x, p=self.activation_dropout, training=self.training) 90 | x = self.fc2(x) 91 | x = F.dropout(x, p=self.dropout, training=self.training) 92 | x = residual + x 93 | x = self.final_layer_norm(x) 94 | return x, attn 95 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/unfold.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
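# A quick sketch of what unfold1d (defined below) produces: a sliding window of
# size K over the time dimension of a (T, B, C) tensor, here with left-only
# (causal) padding as used by the lightweight/dynamic convolutions. The sizes
# are arbitrary example values.
import torch

from fairseq.modules import unfold1d

T, B, C, K = 6, 2, 4, 3
x = torch.randn(T, B, C)
windows = unfold1d(x, kernel_size=K, padding_l=K - 1)   # pad on the left only
print(windows.shape)                                    # torch.Size([6, 2, 4, 3])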
7 | 8 | import torch.nn.functional as F 9 | 10 | 11 | def unfold1d(x, kernel_size, padding_l, pad_value=0): 12 | '''unfold T x B x C to T x B x C x K''' 13 | if kernel_size > 1: 14 | T, B, C = x.size() 15 | x = F.pad(x, (0, 0, 0, 0, padding_l, kernel_size - 1 - padding_l), value=pad_value) 16 | x = x.as_strided((T, B, C, kernel_size), (B*C, C, 1, B*C)) 17 | else: 18 | x = x.unsqueeze(3) 19 | return x 20 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import importlib 9 | import os 10 | 11 | from fairseq import registry 12 | from fairseq.optim.fairseq_optimizer import FairseqOptimizer 13 | from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer 14 | 15 | 16 | __all__ = [ 17 | 'FairseqOptimizer', 18 | 'FP16Optimizer', 19 | 'MemoryEfficientFP16Optimizer', 20 | ] 21 | 22 | 23 | _build_optimizer, register_optimizer, OPTIMIZER_REGISTRY = registry.setup_registry( 24 | '--optimizer', 25 | base_class=FairseqOptimizer, 26 | default='nag', 27 | ) 28 | 29 | 30 | def build_optimizer(args, params, *extra_args, **extra_kwargs): 31 | params = list(filter(lambda p: p.requires_grad, params)) 32 | return _build_optimizer(args, params, *extra_args, **extra_kwargs) 33 | 34 | 35 | # automatically import any Python files in the optim/ directory 36 | for file in os.listdir(os.path.dirname(__file__)): 37 | if file.endswith('.py') and not file.startswith('_'): 38 | module = file[:file.find('.py')] 39 | importlib.import_module('fairseq.optim.' + module) 40 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/adadelta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.optim 9 | 10 | from . 
import FairseqOptimizer, register_optimizer 11 | 12 | 13 | @register_optimizer('adadelta') 14 | class Adadelta(FairseqOptimizer): 15 | def __init__(self, args, params): 16 | super().__init__(args, params) 17 | self._optimizer = torch.optim.Adadelta(params, **self.optimizer_config) 18 | 19 | @staticmethod 20 | def add_args(parser): 21 | """Add optimizer-specific arguments to the parser.""" 22 | # fmt: off 23 | parser.add_argument('--adadelta-rho', type=float, default=0.9, metavar='RHO', 24 | help='coefficient used for computing a running average of squared gradients') 25 | parser.add_argument('--adadelta-eps', type=float, default=1e-6, metavar='EPS', 26 | help='term added to the denominator to improve numerical stability') 27 | parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 28 | help='weight decay') 29 | parser.add_argument('--anneal-eps', action='store_true', help='flag to anneal eps') 30 | # fmt: on 31 | 32 | @property 33 | def optimizer_config(self): 34 | """ 35 | Return a kwarg dictionary that will be used to override optimizer 36 | args stored in checkpoints. This allows us to load a checkpoint and 37 | resume training using a different set of optimizer args, e.g., with a 38 | different learning rate. 39 | """ 40 | return { 41 | 'lr': self.args.lr[0], 42 | 'rho': self.args.adadelta_rho, 43 | 'eps': self.args.adadelta_eps, 44 | 'weight_decay': self.args.weight_decay, 45 | } 46 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/adagrad.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.optim 9 | 10 | from . import FairseqOptimizer, register_optimizer 11 | 12 | 13 | @register_optimizer('adagrad') 14 | class Adagrad(FairseqOptimizer): 15 | def __init__(self, args, params): 16 | super().__init__(args, params) 17 | self._optimizer = torch.optim.Adagrad(params, **self.optimizer_config) 18 | 19 | @staticmethod 20 | def add_args(parser): 21 | """Add optimizer-specific arguments to the parser.""" 22 | # fmt: off 23 | parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 24 | help='weight decay') 25 | # fmt: on 26 | 27 | @property 28 | def optimizer_config(self): 29 | """ 30 | Return a kwarg dictionary that will be used to override optimizer 31 | args stored in checkpoints. This allows us to load a checkpoint and 32 | resume training using a different set of optimizer args, e.g., with a 33 | different learning rate. 34 | """ 35 | return { 36 | 'lr': self.args.lr[0], 37 | 'weight_decay': self.args.weight_decay, 38 | } 39 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/fairseq_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
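The adadelta.py and adagrad.py wrappers above both follow the same recipe: subclass FairseqOptimizer, wrap a torch.optim optimizer, and register the class so it becomes selectable via `--optimizer`. The sketch below shows a hypothetical third optimizer written in that style; the name `rmsprop` and its flags are illustrative and not part of this repo, and such a file would only be picked up automatically if placed in fairseq/optim/, since the package `__init__` imports every module in that directory.

```python
# Hypothetical fairseq/optim/rmsprop.py written in the same style as the
# adadelta/adagrad wrappers above; the name and flags are illustrative only.
import torch.optim

from . import FairseqOptimizer, register_optimizer


@register_optimizer('rmsprop')  # would be selected with --optimizer rmsprop
class RMSprop(FairseqOptimizer):
    def __init__(self, args, params):
        super().__init__(args, params)
        self._optimizer = torch.optim.RMSprop(params, **self.optimizer_config)

    @staticmethod
    def add_args(parser):
        parser.add_argument('--rmsprop-alpha', default=0.99, type=float, metavar='A',
                            help='smoothing constant')
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')

    @property
    def optimizer_config(self):
        # Same contract as the other optimizers: these kwargs override whatever
        # was stored in a checkpoint, so training can resume with new settings.
        return {
            'lr': self.args.lr[0],
            'alpha': self.args.rmsprop_alpha,
            'weight_decay': self.args.weight_decay,
        }
```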
7 | 8 | import math 9 | 10 | import torch 11 | 12 | 13 | class FairseqOptimizer(object): 14 | 15 | def __init__(self, args, params): 16 | super().__init__() 17 | self.args = args 18 | self.params = list(params) 19 | 20 | @staticmethod 21 | def add_args(parser): 22 | """Add optimizer-specific arguments to the parser.""" 23 | pass 24 | 25 | @property 26 | def optimizer(self): 27 | """Return a torch.optim.optimizer.Optimizer instance.""" 28 | if not hasattr(self, '_optimizer'): 29 | raise NotImplementedError 30 | if not isinstance(self._optimizer, torch.optim.Optimizer): 31 | raise ValueError('_optimizer must be an instance of torch.optim.Optimizer') 32 | return self._optimizer 33 | 34 | @property 35 | def optimizer_config(self): 36 | """ 37 | Return a kwarg dictionary that will be used to override optimizer 38 | args stored in checkpoints. This allows us to load a checkpoint and 39 | resume training using a different set of optimizer args, e.g., with a 40 | different learning rate. 41 | """ 42 | raise NotImplementedError 43 | 44 | def get_lr(self): 45 | """Return the current learning rate.""" 46 | return self.optimizer.param_groups[0]['lr'] 47 | 48 | def set_lr(self, lr): 49 | """Set the learning rate.""" 50 | for param_group in self.optimizer.param_groups: 51 | param_group['lr'] = lr 52 | 53 | def state_dict(self): 54 | """Return the optimizer's state dict.""" 55 | return self.optimizer.state_dict() 56 | 57 | def load_state_dict(self, state_dict, optimizer_overrides=None): 58 | """Load an optimizer state dict. 59 | 60 | In general we should prefer the configuration of the existing optimizer 61 | instance (e.g., learning rate) over that found in the state_dict. This 62 | allows us to resume training from a checkpoint using a new set of 63 | optimizer args. 64 | """ 65 | self.optimizer.load_state_dict(state_dict) 66 | 67 | if optimizer_overrides is not None and len(optimizer_overrides) > 0: 68 | # override learning rate, momentum, etc. with latest values 69 | for group in self.optimizer.param_groups: 70 | group.update(optimizer_overrides) 71 | 72 | def backward(self, loss): 73 | """Computes the sum of gradients of the given tensor w.r.t. graph leaves.""" 74 | loss.backward() 75 | 76 | def multiply_grads(self, c): 77 | """Multiplies grads by a constant *c*.""" 78 | for p in self.params: 79 | if p.grad is not None: 80 | p.grad.data.mul_(c) 81 | 82 | def clip_grad_norm(self, max_norm): 83 | """Clips gradient norm.""" 84 | if max_norm > 0: 85 | return torch.nn.utils.clip_grad_norm_(self.params, max_norm) 86 | else: 87 | return math.sqrt(sum(p.grad.data.norm()**2 for p in self.params if p.grad is not None)) 88 | 89 | def step(self, closure=None): 90 | """Performs a single optimization step.""" 91 | self.optimizer.step(closure) 92 | 93 | def zero_grad(self): 94 | """Clears the gradients of all optimized parameters.""" 95 | for group in self.optimizer.param_groups: 96 | for p in group['params']: 97 | p.grad = None 98 | self.optimizer.zero_grad() 99 | 100 | @property 101 | def supports_memory_efficient_fp16(self): 102 | if hasattr(self.optimizer, 'supports_memory_efficient_fp16'): 103 | return self.optimizer.supports_memory_efficient_fp16 104 | return False 105 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 
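The FairseqOptimizer wrapper above mainly standardizes how a trainer drives gradients. A rough sketch of one training step through that interface follows; `model`, `criterion`, and `batch` are placeholders rather than objects from this repo, and the normalization constant is illustrative.

```python
# Illustrative training step driven through the FairseqOptimizer API above.
# `model`, `criterion`, and `batch` are placeholders, not objects from this repo.
def train_step(model, criterion, batch, optimizer, max_norm=1.0, num_sents=1):
    model.train()
    optimizer.zero_grad()

    loss = criterion(model(batch['input']), batch['target'])

    optimizer.backward(loss)                         # plain loss.backward() underneath
    optimizer.multiply_grads(1.0 / num_sents)        # e.g. normalize by sentence count
    grad_norm = optimizer.clip_grad_norm(max_norm)   # max_norm=0 only measures, no clipping
    optimizer.step()
    return loss.item(), grad_norm
```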
3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import importlib 9 | import os 10 | 11 | from fairseq import registry 12 | from fairseq.optim.lr_scheduler.fairseq_lr_scheduler import FairseqLRScheduler 13 | 14 | 15 | build_lr_scheduler, register_lr_scheduler, LR_SCHEDULER_REGISTRY = registry.setup_registry( 16 | '--lr-scheduler', 17 | base_class=FairseqLRScheduler, 18 | default='fixed', 19 | ) 20 | 21 | # automatically import any Python files in the optim/lr_scheduler/ directory 22 | for file in os.listdir(os.path.dirname(__file__)): 23 | if file.endswith('.py') and not file.startswith('_'): 24 | module = file[:file.find('.py')] 25 | importlib.import_module('fairseq.optim.lr_scheduler.' + module) 26 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from .. import FairseqOptimizer 9 | 10 | 11 | class FairseqLRScheduler(object): 12 | 13 | def __init__(self, args, optimizer): 14 | super().__init__() 15 | if not isinstance(optimizer, FairseqOptimizer): 16 | raise ValueError('optimizer must be an instance of FairseqOptimizer') 17 | self.args = args 18 | self.optimizer = optimizer 19 | self.best = None 20 | 21 | @staticmethod 22 | def add_args(parser): 23 | """Add arguments to the parser for this LR scheduler.""" 24 | pass 25 | 26 | def state_dict(self): 27 | """Return the LR scheduler state dict.""" 28 | return {'best': self.best} 29 | 30 | def load_state_dict(self, state_dict): 31 | """Load an LR scheduler state dict.""" 32 | self.best = state_dict['best'] 33 | 34 | def step(self, epoch, val_loss=None): 35 | """Update the learning rate at the end of the given epoch.""" 36 | if val_loss is not None: 37 | if self.best is None: 38 | self.best = val_loss 39 | else: 40 | self.best = min(self.best, val_loss) 41 | 42 | def step_update(self, num_updates): 43 | """Update the learning rate after each update.""" 44 | return self.optimizer.get_lr() 45 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/fixed_schedule.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from . import FairseqLRScheduler, register_lr_scheduler 9 | 10 | 11 | @register_lr_scheduler('fixed') 12 | class FixedSchedule(FairseqLRScheduler): 13 | """Decay the LR on a fixed schedule.""" 14 | 15 | def __init__(self, args, optimizer): 16 | super().__init__(args, optimizer) 17 | 18 | # set defaults 19 | args.warmup_updates = getattr(args, 'warmup_updates', 0) or 0 20 | 21 | self.lr = args.lr[0] 22 | if args.warmup_updates > 0: 23 | self.warmup_factor = 1. 
/ args.warmup_updates 24 | else: 25 | self.warmup_factor = 1 26 | 27 | @staticmethod 28 | def add_args(parser): 29 | """Add arguments to the parser for this LR scheduler.""" 30 | # fmt: off 31 | parser.add_argument('--force-anneal', '--fa', type=int, metavar='N', 32 | help='force annealing at specified epoch') 33 | parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', 34 | help='shrink factor for annealing, lr_new = (lr * lr_shrink)') 35 | parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', 36 | help='warmup the learning rate linearly for the first N updates') 37 | # fmt: on 38 | 39 | def get_next_lr(self, epoch): 40 | lrs = self.args.lr 41 | if self.args.force_anneal is None or epoch < self.args.force_anneal: 42 | # use fixed LR schedule 43 | next_lr = lrs[min(epoch, len(lrs) - 1)] 44 | else: 45 | # annneal based on lr_shrink 46 | next_lr = lrs[-1] * self.args.lr_shrink ** (epoch + 1 - self.args.force_anneal) 47 | return next_lr 48 | 49 | def step(self, epoch, val_loss=None): 50 | """Update the learning rate at the end of the given epoch.""" 51 | super().step(epoch, val_loss) 52 | self.lr = self.get_next_lr(epoch) 53 | self.optimizer.set_lr(self.warmup_factor * self.lr) 54 | return self.optimizer.get_lr() 55 | 56 | def step_update(self, num_updates): 57 | """Update the learning rate after each update.""" 58 | if self.args.warmup_updates > 0 and num_updates <= self.args.warmup_updates: 59 | self.warmup_factor = num_updates / float(self.args.warmup_updates) 60 | self.optimizer.set_lr(self.warmup_factor * self.lr) 61 | return self.optimizer.get_lr() 62 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from . import FairseqLRScheduler, register_lr_scheduler 9 | 10 | 11 | @register_lr_scheduler('inverse_sqrt') 12 | class InverseSquareRootSchedule(FairseqLRScheduler): 13 | """Decay the LR based on the inverse square root of the update number. 14 | 15 | We also support a warmup phase where we linearly increase the learning rate 16 | from some initial learning rate (``--warmup-init-lr``) until the configured 17 | learning rate (``--lr``). Thereafter we decay proportional to the number of 18 | updates, with a decay factor set to align with the configured learning rate. 19 | 20 | During warmup:: 21 | 22 | lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates) 23 | lr = lrs[update_num] 24 | 25 | After warmup:: 26 | 27 | decay_factor = args.lr * sqrt(args.warmup_updates) 28 | lr = decay_factor / sqrt(update_num) 29 | """ 30 | 31 | def __init__(self, args, optimizer): 32 | super().__init__(args, optimizer) 33 | if len(args.lr) > 1: 34 | raise ValueError( 35 | 'Cannot use a fixed learning rate schedule with inverse_sqrt.' 36 | ' Consider --lr-scheduler=fixed instead.' 37 | ) 38 | warmup_end_lr = args.lr[0] 39 | if args.warmup_init_lr < 0: 40 | args.warmup_init_lr = warmup_end_lr 41 | 42 | # linearly warmup for the first args.warmup_updates 43 | self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates 44 | 45 | # then, decay prop. 
to the inverse square root of the update number 46 | self.decay_factor = warmup_end_lr * args.warmup_updates**0.5 47 | 48 | # initial learning rate 49 | self.lr = args.warmup_init_lr 50 | self.optimizer.set_lr(self.lr) 51 | 52 | @staticmethod 53 | def add_args(parser): 54 | """Add arguments to the parser for this LR scheduler.""" 55 | # fmt: off 56 | parser.add_argument('--warmup-updates', default=4000, type=int, metavar='N', 57 | help='warmup the learning rate linearly for the first N updates') 58 | parser.add_argument('--warmup-init-lr', default=-1, type=float, metavar='LR', 59 | help='initial learning rate during warmup phase; default is args.lr') 60 | # fmt: on 61 | 62 | def step(self, epoch, val_loss=None): 63 | """Update the learning rate at the end of the given epoch.""" 64 | super().step(epoch, val_loss) 65 | # we don't change the learning rate at epoch boundaries 66 | return self.optimizer.get_lr() 67 | 68 | def step_update(self, num_updates): 69 | """Update the learning rate after each update.""" 70 | if num_updates < self.args.warmup_updates: 71 | self.lr = self.args.warmup_init_lr + num_updates*self.lr_step 72 | else: 73 | self.lr = self.decay_factor * num_updates**-0.5 74 | self.optimizer.set_lr(self.lr) 75 | return self.lr 76 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from . import FairseqLRScheduler, register_lr_scheduler 9 | 10 | 11 | @register_lr_scheduler('polynomial_decay') 12 | class PolynomialDecaySchedule(FairseqLRScheduler): 13 | """Decay the LR on a fixed schedule.""" 14 | 15 | def __init__(self, args, optimizer): 16 | super().__init__(args, optimizer) 17 | 18 | # set defaults 19 | args.warmup_updates = getattr(args, 'warmup_updates', 0) or 0 20 | 21 | self.lr = args.lr[0] 22 | if args.warmup_updates > 0: 23 | self.warmup_factor = 1. 
/ args.warmup_updates 24 | else: 25 | self.warmup_factor = 1 26 | self.end_learning_rate = args.end_learning_rate 27 | self.total_num_update = args.total_num_update 28 | self.power = args.power 29 | self.optimizer.set_lr(self.warmup_factor * self.lr) 30 | 31 | @staticmethod 32 | def add_args(parser): 33 | """Add arguments to the parser for this LR scheduler.""" 34 | parser.add_argument('--force-anneal', '--fa', type=int, metavar='N', 35 | help='force annealing at specified epoch') 36 | parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', 37 | help='warmup the learning rate linearly for the first N updates') 38 | parser.add_argument('--end-learning-rate', default=0.0, type=float) 39 | parser.add_argument('--power', default=1.0, type=float) 40 | parser.add_argument('--total-num-update', default=1000000, type=int) 41 | 42 | def get_next_lr(self, epoch): 43 | lrs = self.args.lr 44 | if self.args.force_anneal is None or epoch < self.args.force_anneal: 45 | # use fixed LR schedule 46 | next_lr = lrs[min(epoch, len(lrs) - 1)] 47 | else: 48 | # annneal based on lr_shrink 49 | next_lr = self.optimizer.get_lr() 50 | return next_lr 51 | 52 | def step(self, epoch, val_loss=None): 53 | """Update the learning rate at the end of the given epoch.""" 54 | super().step(epoch, val_loss) 55 | self.lr = self.get_next_lr(epoch) 56 | self.optimizer.set_lr(self.warmup_factor * self.lr) 57 | return self.optimizer.get_lr() 58 | 59 | def step_update(self, num_updates): 60 | """Update the learning rate after each update.""" 61 | if self.args.warmup_updates > 0 and num_updates <= self.args.warmup_updates: 62 | self.warmup_factor = num_updates / float(self.args.warmup_updates) 63 | self.optimizer.set_lr(self.warmup_factor * self.lr) 64 | else: 65 | warmup = self.args.warmup_updates 66 | lr_range = self.lr - self.end_learning_rate 67 | pct_remaining = 1 - (num_updates - warmup) / (self.total_num_update - warmup) 68 | lr = lr_range * pct_remaining ** (self.power) + self.end_learning_rate 69 | self.optimizer.set_lr(lr) 70 | return self.optimizer.get_lr() 71 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.optim.lr_scheduler 9 | 10 | from . import FairseqLRScheduler, register_lr_scheduler 11 | 12 | 13 | @register_lr_scheduler('reduce_lr_on_plateau') 14 | class ReduceLROnPlateau(FairseqLRScheduler): 15 | """Decay the LR by a factor every time the validation loss plateaus.""" 16 | 17 | def __init__(self, args, optimizer): 18 | super().__init__(args, optimizer) 19 | if len(args.lr) > 1: 20 | raise ValueError( 21 | 'Cannot use a fixed learning rate schedule with reduce_lr_on_plateau.' 22 | ' Consider --lr-scheduler=fixed instead.' 
23 | ) 24 | self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( 25 | self.optimizer.optimizer, patience=0, factor=args.lr_shrink, 26 | threshold=args.lr_threshold) 27 | 28 | @staticmethod 29 | def add_args(parser): 30 | """Add arguments to the parser for this LR scheduler.""" 31 | # fmt: off 32 | parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', 33 | help='shrink factor for annealing, lr_new = (lr * lr_shrink)') 34 | parser.add_argument('--lr-threshold', default=1e-4, type=float, metavar='LT', 35 | help='Threshold for measuring the new optimum, \ 36 | to only focus on significant changes') 37 | # fmt: on 38 | 39 | def state_dict(self): 40 | """Return the LR scheduler state dict.""" 41 | return { 42 | 'best': self.lr_scheduler.best, 43 | 'last_epoch': self.lr_scheduler.last_epoch, 44 | } 45 | 46 | def load_state_dict(self, state_dict): 47 | """Load an LR scheduler state dict.""" 48 | self.lr_scheduler.best = state_dict['best'] 49 | if 'last_epoch' in state_dict: 50 | self.lr_scheduler.last_epoch = state_dict['last_epoch'] 51 | 52 | def step(self, epoch, val_loss=None): 53 | """Update the learning rate at the end of the given epoch.""" 54 | if val_loss is not None: 55 | self.lr_scheduler.step(val_loss, epoch) 56 | else: 57 | self.lr_scheduler.last_epoch = epoch 58 | return self.optimizer.get_lr() 59 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import math 9 | 10 | from . import FairseqLRScheduler, register_lr_scheduler 11 | 12 | 13 | @register_lr_scheduler('triangular') 14 | class TriangularSchedule(FairseqLRScheduler): 15 | """Assign LR based on a triangular cyclical schedule. 16 | 17 | See https://arxiv.org/pdf/1506.01186.pdf for details. 18 | """ 19 | 20 | def __init__(self, args, optimizer): 21 | super().__init__(args, optimizer) 22 | if len(args.lr) > 1: 23 | raise ValueError( 24 | 'Cannot use a fixed learning rate schedule with triangular.' 25 | ' Consider --lr-scheduler=fixed instead.' 
26 | ) 27 | 28 | lr = args.lr[0] 29 | 30 | assert args.max_lr > lr, 'max_lr must be more than lr' 31 | self.min_lr = lr 32 | self.max_lr = args.max_lr 33 | self.stepsize = args.lr_period_updates // 2 34 | self.lr_shrink = args.lr_shrink 35 | self.shrink_min = args.shrink_min 36 | 37 | # initial learning rate 38 | self.lr = self.min_lr 39 | self.optimizer.set_lr(self.lr) 40 | 41 | @staticmethod 42 | def add_args(parser): 43 | """Add arguments to the parser for this LR scheduler.""" 44 | # fmt: off 45 | parser.add_argument('--max-lr', required=True, type=float, metavar='LR', 46 | help='max learning rate, must be more than args.lr') 47 | parser.add_argument('--lr-period-updates', default=5000, type=float, metavar='LR', 48 | help='initial number of updates per period (cycle length)') 49 | parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', 50 | help='shrink factor for annealing') 51 | parser.add_argument('--shrink-min', action='store_true', 52 | help='if set, also shrinks min lr') 53 | # fmt: on 54 | 55 | def step(self, epoch, val_loss=None): 56 | """Update the learning rate at the end of the given epoch.""" 57 | super().step(epoch, val_loss) 58 | # we don't change the learning rate at epoch boundaries 59 | return self.optimizer.get_lr() 60 | 61 | def step_update(self, num_updates): 62 | """Update the learning rate after each update.""" 63 | cycle = math.floor(num_updates / (2 * self.stepsize)) 64 | 65 | lr_shrink = self.lr_shrink ** cycle 66 | max_lr = self.max_lr * lr_shrink 67 | if self.shrink_min: 68 | min_lr = self.min_lr * lr_shrink 69 | else: 70 | min_lr = self.min_lr 71 | 72 | x = abs(num_updates / self.stepsize - 2 * (cycle + 1) + 1) 73 | self.lr = min_lr + (max_lr - min_lr) * max(0, (1 - x)) 74 | 75 | self.optimizer.set_lr(self.lr) 76 | return self.lr 77 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/nag.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | from torch.optim.optimizer import Optimizer, required 10 | 11 | from . import FairseqOptimizer, register_optimizer 12 | 13 | 14 | @register_optimizer('nag') 15 | class FairseqNAG(FairseqOptimizer): 16 | def __init__(self, args, params): 17 | super().__init__(args, params) 18 | self._optimizer = NAG(params, **self.optimizer_config) 19 | 20 | @staticmethod 21 | def add_args(parser): 22 | """Add optimizer-specific arguments to the parser.""" 23 | # fmt: off 24 | parser.add_argument('--momentum', default=0.99, type=float, metavar='M', 25 | help='momentum factor') 26 | parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 27 | help='weight decay') 28 | # fmt: on 29 | 30 | @property 31 | def optimizer_config(self): 32 | """ 33 | Return a kwarg dictionary that will be used to override optimizer 34 | args stored in checkpoints. This allows us to load a checkpoint and 35 | resume training using a different set of optimizer args, e.g., with a 36 | different learning rate. 
37 | """ 38 | return { 39 | 'lr': self.args.lr[0], 40 | 'momentum': self.args.momentum, 41 | 'weight_decay': self.args.weight_decay, 42 | } 43 | 44 | 45 | class NAG(Optimizer): 46 | def __init__(self, params, lr=required, momentum=0, weight_decay=0): 47 | defaults = dict(lr=lr, lr_old=lr, momentum=momentum, weight_decay=weight_decay) 48 | super(NAG, self).__init__(params, defaults) 49 | 50 | @property 51 | def supports_memory_efficient_fp16(self): 52 | return True 53 | 54 | def step(self, closure=None): 55 | """Performs a single optimization step. 56 | 57 | Arguments: 58 | closure (callable, optional): A closure that reevaluates the model 59 | and returns the loss. 60 | """ 61 | loss = None 62 | if closure is not None: 63 | loss = closure() 64 | 65 | for group in self.param_groups: 66 | weight_decay = group['weight_decay'] 67 | momentum = group['momentum'] 68 | lr = group['lr'] 69 | lr_old = group.get('lr_old', lr) 70 | lr_correct = lr / lr_old 71 | 72 | for p in group['params']: 73 | if p.grad is None: 74 | continue 75 | 76 | p_data_fp32 = p.data.float() 77 | 78 | d_p = p.grad.data.float() 79 | param_state = self.state[p] 80 | if 'momentum_buffer' not in param_state: 81 | param_state['momentum_buffer'] = torch.zeros_like(d_p) 82 | else: 83 | param_state['momentum_buffer'] = param_state['momentum_buffer'].type_as(d_p) 84 | 85 | buf = param_state['momentum_buffer'] 86 | 87 | if weight_decay != 0: 88 | p_data_fp32.mul_(1 - lr * weight_decay) 89 | p_data_fp32.add_(momentum * momentum * lr_correct, buf) 90 | p_data_fp32.add_(-(1 + momentum) * lr, d_p) 91 | 92 | buf.mul_(momentum * lr_correct).add_(-lr, d_p) 93 | 94 | p.data.copy_(p_data_fp32) 95 | 96 | group['lr_old'] = lr 97 | 98 | return loss 99 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/sgd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.optim 9 | 10 | from . import FairseqOptimizer, register_optimizer 11 | 12 | 13 | @register_optimizer('sgd') 14 | class SGD(FairseqOptimizer): 15 | def __init__(self, args, params): 16 | super().__init__(args, params) 17 | self._optimizer = torch.optim.SGD(params, **self.optimizer_config) 18 | 19 | @staticmethod 20 | def add_args(parser): 21 | """Add optimizer-specific arguments to the parser.""" 22 | # fmt: off 23 | parser.add_argument('--momentum', default=0.0, type=float, metavar='M', 24 | help='momentum factor') 25 | parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 26 | help='weight decay') 27 | # fmt: on 28 | 29 | @property 30 | def optimizer_config(self): 31 | """ 32 | Return a kwarg dictionary that will be used to override optimizer 33 | args stored in checkpoints. This allows us to load a checkpoint and 34 | resume training using a different set of optimizer args, e.g., with a 35 | different learning rate. 
36 | """ 37 | return { 38 | 'lr': self.args.lr[0], 39 | 'momentum': self.args.momentum, 40 | 'weight_decay': self.args.weight_decay, 41 | } 42 | -------------------------------------------------------------------------------- /2-4/386/fairseq/pdb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import multiprocessing 9 | import os 10 | import pdb 11 | import sys 12 | 13 | 14 | __all__ = ['set_trace'] 15 | 16 | 17 | _stdin = [None] 18 | _stdin_lock = multiprocessing.Lock() 19 | try: 20 | _stdin_fd = sys.stdin.fileno() 21 | except Exception: 22 | _stdin_fd = None 23 | 24 | 25 | class MultiprocessingPdb(pdb.Pdb): 26 | """A Pdb wrapper that works in a multiprocessing environment. 27 | 28 | Usage: `from fairseq import pdb; pdb.set_trace()` 29 | """ 30 | 31 | def __init__(self): 32 | pdb.Pdb.__init__(self, nosigint=True) 33 | 34 | def _cmdloop(self): 35 | stdin_bak = sys.stdin 36 | with _stdin_lock: 37 | try: 38 | if _stdin_fd is not None: 39 | if not _stdin[0]: 40 | _stdin[0] = os.fdopen(_stdin_fd) 41 | sys.stdin = _stdin[0] 42 | self.cmdloop() 43 | finally: 44 | sys.stdin = stdin_bak 45 | 46 | 47 | def set_trace(): 48 | pdb = MultiprocessingPdb() 49 | pdb.set_trace(sys._getframe().f_back) 50 | -------------------------------------------------------------------------------- /2-4/386/fairseq/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | 9 | REGISTRIES = {} 10 | 11 | 12 | def setup_registry( 13 | registry_name: str, 14 | base_class=None, 15 | default=None, 16 | ): 17 | assert registry_name.startswith('--') 18 | registry_name = registry_name[2:].replace('-', '_') 19 | 20 | REGISTRY = {} 21 | REGISTRY_CLASS_NAMES = set() 22 | 23 | # maintain a registry of all registries 24 | if registry_name in REGISTRIES: 25 | raise ValueError('Canot setup duplicate registry: {}'.format(registry_name)) 26 | REGISTRIES[registry_name] = { 27 | 'registry': REGISTRY, 28 | 'default': default, 29 | } 30 | 31 | def build_x(args, *extra_args, **extra_kwargs): 32 | choice = getattr(args, registry_name, None) 33 | if choice is None: 34 | return None 35 | cls = REGISTRY[choice] 36 | if hasattr(cls, 'build_' + registry_name): 37 | builder = getattr(cls, 'build_' + registry_name) 38 | else: 39 | builder = cls 40 | return builder(args, *extra_args, **extra_kwargs) 41 | 42 | def register_x(name): 43 | 44 | def register_x_cls(cls): 45 | if name in REGISTRY: 46 | raise ValueError('Cannot register duplicate {} ({})'.format(registry_name, name)) 47 | if cls.__name__ in REGISTRY_CLASS_NAMES: 48 | raise ValueError( 49 | 'Cannot register {} with duplicate class name ({})'.format( 50 | registry_name, cls.__name__, 51 | ) 52 | ) 53 | if base_class is not None and not issubclass(cls, base_class): 54 | raise ValueError('{} must extend {}'.format(cls.__name__, base_class.__name__)) 55 | REGISTRY[name] = cls 56 | REGISTRY_CLASS_NAMES.add(cls.__name__) 57 | return cls 58 | 59 | return register_x_cls 60 | 61 | return build_x, register_x, REGISTRY 62 | -------------------------------------------------------------------------------- /2-4/386/fairseq/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import argparse 9 | import importlib 10 | import os 11 | 12 | from .fairseq_task import FairseqTask 13 | 14 | TASK_REGISTRY = {} 15 | TASK_CLASS_NAMES = set() 16 | 17 | 18 | def setup_task(args, **kwargs): 19 | return TASK_REGISTRY[args.task].setup_task(args, **kwargs) 20 | 21 | 22 | def register_task(name): 23 | """ 24 | New tasks can be added to fairseq with the 25 | :func:`~fairseq.tasks.register_task` function decorator. 26 | 27 | For example:: 28 | 29 | @register_task('classification') 30 | class ClassificationTask(FairseqTask): 31 | (...) 32 | 33 | .. note:: 34 | 35 | All Tasks must implement the :class:`~fairseq.tasks.FairseqTask` 36 | interface. 
37 | 38 | Please see the 39 | 40 | Args: 41 | name (str): the name of the task 42 | """ 43 | 44 | def register_task_cls(cls): 45 | if name in TASK_REGISTRY: 46 | raise ValueError('Cannot register duplicate task ({})'.format(name)) 47 | if not issubclass(cls, FairseqTask): 48 | raise ValueError('Task ({}: {}) must extend FairseqTask'.format(name, cls.__name__)) 49 | if cls.__name__ in TASK_CLASS_NAMES: 50 | raise ValueError('Cannot register task with duplicate class name ({})'.format(cls.__name__)) 51 | TASK_REGISTRY[name] = cls 52 | TASK_CLASS_NAMES.add(cls.__name__) 53 | return cls 54 | 55 | return register_task_cls 56 | 57 | 58 | # automatically import any Python files in the tasks/ directory 59 | for file in os.listdir(os.path.dirname(__file__)): 60 | if file.endswith('.py') and not file.startswith('_'): 61 | task_name = file[:file.find('.py')] 62 | importlib.import_module('fairseq.tasks.' + task_name) 63 | 64 | # expose `task_parser` for sphinx 65 | if task_name in TASK_REGISTRY: 66 | parser = argparse.ArgumentParser(add_help=False) 67 | group_task = parser.add_argument_group('Task name') 68 | # fmt: off 69 | group_task.add_argument('--task', metavar=task_name, 70 | help='Enable this task with: ``--task=' + task_name + '``') 71 | # fmt: on 72 | group_args = parser.add_argument_group('Additional command-line arguments') 73 | TASK_REGISTRY[task_name].add_args(group_args) 74 | globals()[task_name + '_parser'] = parser 75 | 76 | 77 | def get_task(name): 78 | return TASK_REGISTRY[name] 79 | -------------------------------------------------------------------------------- /2-4/386/fairseq/tasks/translation_from_pretrained_xlm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from fairseq.data.masked_lm_dictionary import MaskedLMDictionary 9 | from fairseq.tasks.translation import TranslationTask 10 | 11 | from . import register_task 12 | 13 | 14 | @register_task("translation_from_pretrained_xlm") 15 | class TranslationFromPretrainedXLMTask(TranslationTask): 16 | """ 17 | Same as TranslationTask except use the MaskedLMDictionary class so that 18 | we can load data that was binarized with the MaskedLMDictionary class. 19 | 20 | This task should be used for the entire training pipeline when we want to 21 | train an NMT model from a pretrained XLM checkpoint: binarizing NMT data, 22 | training NMT with the pretrained XLM checkpoint, and subsequent evaluation 23 | of that trained model. 24 | """ 25 | 26 | @classmethod 27 | def load_dictionary(cls, filename): 28 | """Load the masked LM dictionary from the filename 29 | 30 | Args: 31 | filename (str): the filename 32 | """ 33 | return MaskedLMDictionary.load(filename) 34 | -------------------------------------------------------------------------------- /2-4/386/fairseq/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
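register_task above works the same way as the optimizer and LR-scheduler registries, and TranslationFromPretrainedXLMTask shows the minimal subclassing pattern. Below is a hypothetical task registration in that style; the task name, extra flag, and module are illustrative and not part of this repo, and the file would need to live in fairseq/tasks/ (or be imported explicitly) for the auto-import loop to pick it up.

```python
# Hypothetical fairseq/tasks/my_gec_task.py following the registration
# pattern above; the task name and the extra flag are illustrative only.
from fairseq.tasks import register_task
from fairseq.tasks.translation import TranslationTask


@register_task('my_gec_task')  # would be enabled with --task my_gec_task
class MyGECTask(TranslationTask):
    """A TranslationTask variant with one extra command-line option."""

    @staticmethod
    def add_args(parser):
        TranslationTask.add_args(parser)
        parser.add_argument('--my-extra-flag', action='store_true',
                            help='illustrative task-specific option')
```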
7 | 8 | import re 9 | 10 | SPACE_NORMALIZER = re.compile(r"\s+") 11 | 12 | 13 | def tokenize_line(line): 14 | line = SPACE_NORMALIZER.sub(" ", line) 15 | line = line.strip() 16 | return line.split() 17 | -------------------------------------------------------------------------------- /2-4/386/meter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | 5 | class Meter(object): 6 | def __init__(self): 7 | self.init() 8 | 9 | def init(self): 10 | self.start = time.time() 11 | self.cnt_add = 0 12 | self.tot_loss = 0. 13 | self.cnt_sent = 0 14 | self.cnt_token = 0 15 | 16 | def add(self, loss, n_sent, n_token): 17 | self.cnt_add += 1 18 | self.tot_loss += loss * n_sent 19 | self.cnt_sent += n_sent 20 | self.cnt_token += n_token 21 | 22 | def average(self): 23 | loss_sent = self.tot_loss / self.cnt_sent if self.cnt_sent != 0 else 0. 24 | loss_token = self.tot_loss / self.cnt_token if self.cnt_token != 0 else 0. 25 | return loss_sent, loss_token 26 | 27 | def elapsed_time(self): 28 | return time.time() - self.start 29 | 30 | def print_str(self, time_avg=False): 31 | loss_sent, loss_token = self.average() 32 | et = self.elapsed_time() 33 | time_str = f"{et * 1000. / self.cnt_add:6.2f} ms/batch" if time_avg else f"{et:6.2f} s" 34 | return f"{time_str} | loss_sent {loss_sent:6.2f} | token_ppl {math.exp(loss_token):6.2f}" 35 | 36 | 37 | -------------------------------------------------------------------------------- /2-4/386/nsml_model/best/model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/386/nsml_model/best/model/model.pt -------------------------------------------------------------------------------- /2-4/386/requirements.txt: -------------------------------------------------------------------------------- 1 | #nsml: reg.navercorp.com/chatbot/larva:latest 2 | #nsml: registry.navercorp.com/gyuwankim/airush-gec:latest 3 | 4 | 5 | nltk 6 | scikit-learn 7 | tokenizers 8 | transformers==4.6.1 -------------------------------------------------------------------------------- /2-4/386/wordpiece.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import os 4 | import json # import json module 5 | 6 | from tokenizers import BertWordPieceTokenizer 7 | from transformers import BertTokenizer 8 | 9 | from data_loader import read_strings 10 | 11 | import nsml 12 | from nsml import DATASET_PATH 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser() 16 | 17 | parser.add_argument("--data_dir", type=str, default=os.path.join(DATASET_PATH, 'train')) 18 | parser.add_argument("--vocab_size", type=int, default=6000) # 만들 Vocab의 숫자 19 | parser.add_argument("--limit_alphabet", type=int, default=6000) 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | def postprocess_state(sentence : str) -> str: 25 | """TRADE, SUMBT postprocessing 26 | Args: 27 | state (List[str]): state prediction 28 | Returns: 29 | List[str]: postprocessing state 30 | """ 31 | sentence = sentence.replace(" : ", ":").replace(" , ", ", ").replace('( ', '(').replace(' )', ')').replace(' & ', '&').replace(' = ', '=') 32 | sentence = sentence.replace(" % ", "%").replace(' ~ ', '~').replace(' ^ ', '^') 33 | if sentence.endswith(' ~'): 34 | sentence = sentence.replace(' ~', '~') 35 | if sentence.endswith(' ^^'): 36 | sentence = sentence.replace(' 
^^', '^^') 37 | if sentence.endswith(' ^'): 38 | sentence = sentence.replace(' ^', '^') 39 | if sentence.endswith('......'): 40 | sentence = sentence.replace('......', ' ......') 41 | sentence = sentence.replace(') 에', ')에').replace('곳 (', '곳(').replace('부터~트', '부터~ 트').replace('# 정왕동', '#정왕동') 42 | sentence = sentence.replace('쨘 -', '쨘-').replace('해드리겠습니다!', '해드리겠습니다 !').replace('6 / 6', '6/6').replace('6 / 4', '6/4') 43 | sentence = sentence.replace('> ㅋ', '>ㅋ').replace('이상~헤', '이상~ 헤').replace('6 / 6', '6/6').replace('6 / 4', '6/4') 44 | 45 | return sentence 46 | 47 | def main(): 48 | args = get_args() 49 | 50 | tokenizer = BertWordPieceTokenizer( 51 | clean_text=True, 52 | handle_chinese_chars=True, 53 | strip_accents=False, # Must be False if cased model 54 | lowercase=False, 55 | wordpieces_prefix="##" 56 | ) 57 | 58 | noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data")) 59 | annotations = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation")) 60 | corpuses = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus")) 61 | clean_sents = read_strings(os.path.join(args.data_dir, "train_label")) 62 | 63 | corpus = noisy_sents + clean_sents + corpuses 64 | print(len(corpus)) 65 | print(len(list(set(corpus)))) 66 | 67 | tokenizer.train_from_iterator( 68 | corpus, 69 | limit_alphabet=args.limit_alphabet, 70 | vocab_size=args.vocab_size 71 | ) 72 | 73 | vocab_path = f"custom_{args.limit_alphabet}_{args.vocab_size}_tokenizer" 74 | tokenizer.save(vocab_path, True) 75 | 76 | vocab_file = f"custom_{args.limit_alphabet}_{args.vocab_size}_tokenizer.txt" 77 | f = open(vocab_file,'w',encoding='utf-8') 78 | with open(vocab_path) as json_file: 79 | json_data = json.load(json_file) 80 | for item in json_data["model"]["vocab"].keys(): 81 | f.write(item+'\n') 82 | 83 | f.close() 84 | 85 | tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False) 86 | 87 | # print(f"vocab size is {tokenizer.vocab_size}") 88 | # print('-' * 50) 89 | 90 | # for i, string in enumerate(corpus): 91 | # postprocess_string = postprocess_state(tokenizer.decode([tok for tok in tokenizer.encode(string) if tok >= 4])) 92 | # if string != postprocess_string: 93 | # if tokenizer.encode(postprocess_string) != tokenizer.encode(string): 94 | # print(f"[바꾼] {postprocess_string}") 95 | # print(f"[이전] {string}") 96 | # print(f"[인코딩바꾼] {tokenizer.encode(postprocess_string)}") 97 | # print(f"[인코딩이전] {tokenizer.encode(string)}") 98 | # print() 99 | 100 | # if not i % 1000: 101 | # print(i) 102 | 103 | if __name__ == "__main__": 104 | main() -------------------------------------------------------------------------------- /2-4/487/README.md: -------------------------------------------------------------------------------- 1 | # 2-4 스마트에디터의 그래머리 (문장 교정/교열) 기능 고도화 2 | 3 | - 네이버 사용자가 작성한 문장을 문법적으로 맞는 문장으로 교정/교열 하는 모델을 만듭니다. 4 | 5 | 6 | ## 데이터 7 | - 학습데이터 8 | * `train/train_data/train_data`: 문법 오류가 섞인 문장 9 | * `train/train_data/train_annotation`: 문법 오류에 대한 annotation 10 | * `train/train_data/train_corpus`: 교정되지 않은 문장 11 | * `train/train_label`: 교정/교열된 문장 12 | - 평가 데이터 13 | * `test/test_data`: 문법 오류가 섞인 문장 14 | * `test/test_label`: 교정/교열된 문장 15 | - 평가 더미 데이 16 | * `test_submit/test_data`: 문법 오류가 섞인 문장 17 | * `test_submit/test_label`: 교정/교열된 문장 18 | - 문법 오류가 섞인 문장들(`*_data`)과 교정/교열된 문장들(`*_label`)은 line-by-line으로 매핑됩니다. 
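The bullet above states that the `*_data` and `*_label` files are mapped line by line. A minimal sketch of loading them as (noisy, corrected) pairs is shown below; it assumes an NSML session (for `DATASET_PATH`) and uses the `read_strings` helper that both entries in this archive define (data_loader.py in 386, utils/utils.py in 487).

```python
# Minimal sketch of pairing noisy inputs with their corrected labels, following
# the line-by-line mapping described above. Paths assume an NSML session; the
# read_strings helper comes from this repo (data_loader.py / utils/utils.py).
import os

from nsml import DATASET_PATH
from data_loader import read_strings  # in the 487 entry: from utils.utils import read_strings

noisy = read_strings(os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data'))
clean = read_strings(os.path.join(DATASET_PATH, 'train', 'train_label'))

assert len(noisy) == len(clean)   # i-th noisy line maps to i-th corrected line
pairs = list(zip(noisy, clean))
print(len(pairs), pairs[0])
```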
19 | 20 | 21 | ## 평가 22 | - Corpus-level [GLEU](https://www.aclweb.org/anthology/P07-1044/) score로 평가 23 | - [`nltk.translate.gleu_score.corpus_gleu`](https://www.nltk.org/_modules/nltk/translate/gleu_score.html) 스크립트를 사용 24 | 25 | 26 | ## 베이스라인 27 | - [Transformer](https://arxiv.org/abs/1706.03762) 기반의 sequence-to-sequence 모델 28 | - 대량의 unlabeled corpus (`train_corpus`)를 활용하여 pre-training (또는 semi-supervised learning) 방식으로 학습하거나 에러 타입 (`train_annotation`)을 예측하도록 multi-task learning을 하면 추가 성능 향상을 얻을 수도 있습니다. 29 | 30 | 31 | ## 모델 학습 32 | ``` 33 | nsml run -d airush2021-2-4 -e train.py 34 | ``` 35 | - 필요에 따라 `-a`로 argument 입력 가능 36 | 37 | 38 | ## 모델 제출 39 | ``` 40 | nsml submit {SESSION} {CHECKPOINT} 41 | ``` 42 | 43 | ## 추가 정보 44 | 45 | ### Annotation 설명 46 | 47 | - "perfect" : 교정/교열이 필요없는 완벽한 문장 48 | - "spacing" : 띄어쓰기 교정 49 | - "pasting" : 붙여쓰기 교정 50 | - "tense" : 시제 교정 51 | - "honorific" : 경어체 교정 52 | - "punctuation" : 구두점 교정 53 | - "typo" : 오탈자 교정 (위 분류에 없는 경우 모두 수렴) 54 | - "advanced" : 윤문 처리 (더 매끄러운 문장) 55 | -------------------------------------------------------------------------------- /2-4/487/data_loader.py: -------------------------------------------------------------------------------- 1 | """This file is not really used""" 2 | 3 | import os 4 | 5 | from nsml import DATASET_PATH 6 | from utils.utils import read_strings, write_strings 7 | 8 | 9 | def test_data_loader(root_path): 10 | return read_strings(os.path.join(root_path, 'test', 'test_data')) 11 | 12 | 13 | def feed_infer(output_file, infer_func): 14 | prediciton = infer_func(test_data_loader(DATASET_PATH)) 15 | print('write output') 16 | write_strings(output_file, prediciton) 17 | if os.stat(output_file).st_size == 0: 18 | raise AssertionError('output result of inference is nothing') 19 | -------------------------------------------------------------------------------- /2-4/487/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from nltk.translate.gleu_score import corpus_gleu 4 | 5 | from utils.utils import read_strings 6 | 7 | 8 | def em(prediction, ground_truth): 9 | return sum([x == y for x, y in zip(prediction, ground_truth) 10 | ]) / len(ground_truth) * 100. 11 | 12 | 13 | def gleu(prediction, ground_truth): 14 | return corpus_gleu([[x] for x in ground_truth], prediction) * 100. 
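A small sanity check for the em()/gleu() helpers above may be useful. Since plain, untokenized strings are passed to corpus_gleu, each sentence is effectively scored over its characters; the snippet below is a toy illustration only, not part of the evaluation pipeline, and assumes 2-4/487/evaluation.py is importable from the working directory.

```python
# Toy sanity check for the em()/gleu() helpers above; because untokenized
# strings are passed to corpus_gleu, scoring is effectively character-level.
from evaluation import em, gleu  # assumes 2-4/487/evaluation.py is importable

ground_truth = ['오늘 날씨가 좋다.', '내일 만나요.']
prediction = ['오늘 날씨가 좋다.', '내일 만나용.']

print(em(prediction, ground_truth))    # 50.0 -> exact sentence matches, in percent
print(gleu(prediction, ground_truth))  # high but below 100 due to the one-character error
```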
15 | 16 | 17 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 18 | try: 19 | prediction = read_strings(prediction_file) 20 | ground_truth = read_strings(ground_truth_file) 21 | score = gleu(prediction, ground_truth) 22 | except: 23 | score = 0.0 24 | return score 25 | 26 | 27 | if __name__ == '__main__': 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--prediction', type=str, default='pred.txt') 30 | parser.add_argument('--test_label_path', type=str) 31 | args = parser.parse_args() 32 | 33 | print(evaluation_metrics(args.prediction, args.test_label_path)) 34 | -------------------------------------------------------------------------------- /2-4/487/noising.py: -------------------------------------------------------------------------------- 1 | from g2pk import G2p 2 | import mecab 3 | import random 4 | 5 | 6 | class Noiser: 7 | 8 | def __init__(self): 9 | 10 | self.g2p = G2p() 11 | self.mecab_tokenizer = mecab.MeCab().morphs 12 | 13 | def noise(self, sent, corpus=None, p=0.1): 14 | sent = self.grapheme(sent) 15 | sent = self.add_spacing_noise(sent, p=p) 16 | if corpus is not None: 17 | sent = self.delete_token(sent, p=p) 18 | sent = self.add_token(sent, corpus, p=p) 19 | sent = self.replace_token(sent, corpus, p=p) 20 | return sent 21 | 22 | def grapheme(self, sent): 23 | return self.g2p(sent, group_vowels=True) 24 | 25 | def add_spacing_noise(self, sent, p=0.1): 26 | tokenized = ' '.join(self.mecab_tokenizer(sent)) 27 | noised = [] 28 | for char in tokenized: 29 | if char == ' ' and random.randint(0, 1) < p: 30 | continue 31 | noised.append(char) 32 | return ''.join(noised) 33 | 34 | def identity(self, sent): 35 | return sent 36 | 37 | def delete_token(self, sent, p=0.1): 38 | noised = [] 39 | for char in sent: 40 | if random.randint(0, 1) < p: 41 | continue 42 | noised.append(char) 43 | return ''.join(noised) 44 | 45 | def add_token(self, sent, corpus, p=0.1): 46 | noised = [] 47 | for char in sent: 48 | if random.randint(0, 1) < p: 49 | random_sent = random.choice(corpus) 50 | while len(random_sent) == 0: 51 | random_sent = random.choice(corpus) 52 | random_tok = random.choice(random_sent) 53 | noised.append(random_tok) 54 | noised.append(char) 55 | return ''.join(noised) 56 | 57 | def replace_token(self, sent, corpus, p=0.1): 58 | noised = [] 59 | for char in sent: 60 | if random.randint(0, 1) < p: 61 | random_sent = random.choice(corpus) 62 | while len(random_sent) == 0: 63 | random_sent = random.choice(corpus) 64 | random_tok = random.choice(random_sent) 65 | noised.append(random_tok) 66 | else: 67 | noised.append(char) 68 | return ''.join(noised) 69 | 70 | def heterograph_noise(self, sent): 71 | pass 72 | -------------------------------------------------------------------------------- /2-4/487/nsml_model/best/model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/487/nsml_model/best/model/model.pt -------------------------------------------------------------------------------- /2-4/487/pretrain_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset, DataLoader 2 | from sklearn.model_selection import train_test_split 3 | 4 | from utils.utils import read_strings 5 | from utils.preprocess import preprocess_noisy 6 | from noising import Noiser 7 | 8 | 9 | def get_pretrain_data(data_path=None, 10 | corpus=None, 11 | 
val_ratio=0.05, 12 | add_spacing=False, 13 | logger=None, 14 | use_corpus=True): 15 | 16 | if corpus is None: 17 | corpus = read_strings(data_path) 18 | noiser = Noiser() 19 | if add_spacing: 20 | from pykospacing import Spacing 21 | spacing = Spacing() 22 | corpus = [preprocess_noisy(sent) for sent in corpus] 23 | pairs = [] 24 | for idx, sent in enumerate(corpus): 25 | if len(sent) == 0: 26 | continue 27 | pair = {} 28 | # noisy 29 | pair['noisy'] = noiser.noise(sent, corpus=corpus if use_corpus else None) 30 | # clean 31 | clean_sent = sent 32 | if add_spacing: 33 | clean_sent = spacing(clean_sent) 34 | pair['clean'] = clean_sent 35 | pairs.append(pair) 36 | 37 | if idx % 10000 == 0: 38 | log_fn = logger.info if logger is not None else print 39 | log_fn(f'preparing data: {idx} / {len(corpus)}') 40 | 41 | if val_ratio == 0: 42 | return pairs 43 | 44 | train_data, valid_data = train_test_split(pairs, test_size=val_ratio) 45 | 46 | return train_data, valid_data 47 | 48 | 49 | def get_pretrain_dataloader(args, data, tokenizer, mode, do_multitask=False, drop=0.0): 50 | del do_multitask, drop # unused 51 | dataset = PretrainDataset(data, mode, tokenizer) 52 | batch_size = args.train_batch_size if mode == 'train' else args.eval_batch_size 53 | dataloader = DataLoader(dataset, 54 | shuffle=mode == 'train', 55 | batch_size=batch_size, 56 | num_workers=args.num_workers, 57 | collate_fn=dataset.collate_fn) 58 | 59 | return dataloader 60 | 61 | 62 | class PretrainDataset(Dataset): 63 | 64 | def __init__(self, data, mode, tokenizer): 65 | self.data = data 66 | self.mode = mode 67 | 68 | self.tokenizer = tokenizer 69 | self.unk_idx = tokenizer.unk_idx 70 | self.pad_idx = tokenizer.pad_idx 71 | self.sos_idx = tokenizer.sos_idx 72 | self.eos_idx = tokenizer.eos_idx 73 | self.cls_idx = tokenizer.cls_idx 74 | 75 | def __len__(self): 76 | return len(self.data) 77 | 78 | def __getitem__(self, idx): 79 | return self.data[idx] 80 | 81 | def collate_fn(self, data): 82 | source_text = [x['noisy'] for x in data] 83 | target_text = [x['clean'] for x in data] 84 | src_padded, src_padding_mask = self.tokenizer(source_text) 85 | tgt_padded, tgt_padding_mask = self.tokenizer(target_text, is_target=True) 86 | 87 | return src_padded, tgt_padded, src_padding_mask, tgt_padding_mask 88 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/__init__.py: -------------------------------------------------------------------------------- 1 | from pykospacing.kospacing import * 2 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/embedding_maker.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.preprocessing import sequence 2 | import json 3 | import numpy as np 4 | 5 | __all__ = ['load_embedding', 'load_vocab', 'encoding_and_padding'] 6 | 7 | 8 | def load_embedding(embeddings_file): 9 | return (np.load(embeddings_file)) 10 | 11 | 12 | def load_vocab(vocab_path): 13 | with open(vocab_path, 'r') as f: 14 | data = json.loads(f.read()) 15 | word2idx = data 16 | idx2word = dict([(v, k) for k, v in data.items()]) 17 | return word2idx, idx2word 18 | 19 | 20 | def encoding_and_padding(word2idx_dic, sequences, **params): 21 | """ 22 | 1. making item to idx 23 | 2. padding 24 | 25 | :word2idx_dic 26 | :sequences: list of lists where each element is a sequence 27 | :maxlen: int, maximum length 28 | :dtype: type to cast the resulting sequence. 
29 | :padding: 'pre' or 'post', pad either before or after each sequence. 30 | :truncating: 'pre' or 'post', remove values from sequences larger than 31 | maxlen either in the beginning or in the end of the sequence 32 | :value: float, value to pad the sequences to the desired value. 33 | """ 34 | seq_idx = [ 35 | [word2idx_dic.get(a, word2idx_dic['__ETC__']) for a in i] for i in sequences 36 | ] 37 | params['value'] = word2idx_dic['__PAD__'] 38 | return (sequence.pad_sequences(seq_idx, **params)) 39 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/kospacing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import re 4 | 5 | import numpy as np 6 | import pkg_resources 7 | from tensorflow.keras.models import load_model 8 | from pykospacing.embedding_maker import encoding_and_padding, load_vocab 9 | 10 | __all__ = ['Spacing', ] 11 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 12 | 13 | model_path = pkg_resources.resource_filename( 14 | 'pykospacing', os.path.join('resources', 'models', 'kospacing')) 15 | dic_path = pkg_resources.resource_filename( 16 | 'pykospacing', os.path.join('resources', 'dicts', 'c2v.dic')) 17 | MODEL = load_model(model_path) 18 | MODEL.make_predict_function() 19 | W2IDX, _ = load_vocab(dic_path) 20 | MAX_LEN = 198 21 | 22 | 23 | class Spacing: 24 | """predict spacing for input string 25 | """ 26 | def __init__(self, rules=[]): 27 | self._model = MODEL 28 | self._w2idx = W2IDX 29 | self.max_len = MAX_LEN 30 | self.pattern = re.compile(r'\s+') 31 | self.rules = [(re.compile('\s*'.join(r)), r) for r in rules] 32 | 33 | def get_spaced_sent(self, raw_sent): 34 | raw_sent_ = "«" + raw_sent + "»" 35 | raw_sent_ = raw_sent_.replace(' ', '^') 36 | sents_in = [raw_sent_, ] 37 | mat_in = encoding_and_padding( 38 | word2idx_dic=self._w2idx, sequences=sents_in, maxlen=200, 39 | padding='post', truncating='post') 40 | results = self._model.predict(mat_in) 41 | mat_set = results[0, ] 42 | preds = np.array( 43 | ['1' if i > 0.5 else '0' for i in mat_set[:len(raw_sent_)]]) 44 | return self.make_pred_sents(raw_sent_, preds) 45 | 46 | def make_pred_sents(self, x_sents, y_pred): 47 | res_sent = [] 48 | for i, j in zip(x_sents, y_pred): 49 | if j == '1': 50 | res_sent.append(i) 51 | res_sent.append(' ') 52 | else: 53 | res_sent.append(i) 54 | subs = re.sub(self.pattern, ' ', ''.join(res_sent).replace('^', ' ')) 55 | subs = subs.replace('«', '') 56 | subs = subs.replace('»', '') 57 | return subs 58 | 59 | def apply_rules(self, spaced_sent): 60 | for rgx, word in self.rules: 61 | spaced_sent = rgx.sub(word, spaced_sent) 62 | return spaced_sent 63 | 64 | def __call__(self, sent): 65 | if len(sent) > self.max_len: 66 | splitted_sent = [sent[y-self.max_len:y] for y in range(self.max_len, len(sent)+self.max_len, self.max_len)] 67 | spaced_sent = ''.join([self.get_spaced_sent(ss) 68 | for ss in splitted_sent]) 69 | else: 70 | spaced_sent = self.get_spaced_sent(sent) 71 | if len(self.rules) > 0: 72 | spaced_sent = self.apply_rules(spaced_sent) 73 | return spaced_sent.strip() 74 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/pykos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import argparse 4 | from pykospacing import Spacing 5 | 6 | 7 | def get_parser(): 8 | parser = argparse.ArgumentParser(description='Python script for 
automatic Korean word spacing') 9 | 10 | parser.add_argument('infile', type=argparse.FileType('r'), 11 | default=sys.stdin) 12 | parser.add_argument('outfile', type=argparse.FileType('w'), nargs='?', 13 | default=sys.stdout) 14 | parser.add_argument('-o', dest='overwrite', action='store_true', default=False, 15 | help='Overwrite the result itself') 16 | 17 | return parser 18 | 19 | 20 | def main(args=sys.argv[1:]): 21 | args = get_parser().parse_args(args) 22 | 23 | source = args.infile.read() 24 | 25 | result = '\n' 26 | spacing = Spacing() 27 | for line in source.splitlines(): 28 | result += spacing(line) 29 | result += '\n' 30 | 31 | if args.overwrite: 32 | args.infile.close() 33 | with open(args.infile.name, 'w', encoding=args.infile.encoding) as f: 34 | f.write(result) 35 | else: 36 | args.outfile.write(result) 37 | 38 | return 0 if (source == result) else 1 39 | 40 | 41 | if __name__ == '__main__': 42 | sys.exit(main()) 43 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/487/pykospacing/resources/__init__.py -------------------------------------------------------------------------------- /2-4/487/pykospacing/resources/models/kospacing: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/487/pykospacing/resources/models/kospacing -------------------------------------------------------------------------------- /2-4/487/requirements.txt: -------------------------------------------------------------------------------- 1 | #//nsml: ufoym/deepo:all-py36-cu101 2 | #nsml: pytorchlightning/pytorch_lightning:base-cuda-py3.6-torch1.7 3 | #//nsml: pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel 4 | #//nsml: registry.navercorp.com/gyuwankim/airush-gec:latest 5 | scikit-learn 6 | nltk 7 | transformers>=4.0.0 8 | python-mecab-ko 9 | # tensorflow-gpu==2.3.0 10 | jamo 11 | g2pk 12 | konlpy -------------------------------------------------------------------------------- /2-4/487/utils/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LabelSmoothingNLLLoss(torch.nn.Module): 6 | """Pytorch implementation of label smoothed NLL loss retrieved from 7 | https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/166833#930136 8 | """ 9 | 10 | def __init__(self, smoothing: float = 0.1, reduction="mean", weight=None): 11 | super().__init__() 12 | self.smoothing = smoothing 13 | self.reduction = reduction 14 | self.weight = weight 15 | 16 | def reduce_loss(self, loss): 17 | return loss.mean() if self.reduction == 'mean' else loss.sum() \ 18 | if self.reduction == 'sum' else loss 19 | 20 | def linear_combination(self, x, y): 21 | return self.smoothing * x + (1 - self.smoothing) * y 22 | 23 | def forward(self, log_preds, target, ignore_index=-100): 24 | """ 25 | log_preds: [bs, V, T] 26 | target: [bs, T] 27 | """ 28 | assert 0 <= self.smoothing < 1 29 | 30 | if self.weight is not None: 31 | self.weight = self.weight.to(log_preds.device) 32 | 33 | mask = (target != ignore_index).float().unsqueeze(1) # [bs, 1, T] 34 | masked_log_preds = log_preds * mask 35 | 36 | loss = 
self.reduce_loss(-masked_log_preds.mean(dim=1)) 37 | nll = F.nll_loss(log_preds, 38 | target, 39 | reduction=self.reduction, 40 | weight=self.weight, 41 | ignore_index=ignore_index) 42 | return self.linear_combination(loss, nll) 43 | -------------------------------------------------------------------------------- /2-4/487/utils/meter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | 5 | class Meter(object): 6 | 7 | def __init__(self): 8 | self.init() 9 | 10 | def init(self): 11 | self.start = time.time() 12 | self.cnt_add = 0 13 | self.tot_loss = 0. 14 | self.cnt_sent = 0 15 | self.cnt_token = 0 16 | 17 | def add(self, loss, n_sent, n_token): 18 | self.cnt_add += 1 19 | self.tot_loss += loss * n_sent 20 | self.cnt_sent += n_sent 21 | self.cnt_token += n_token 22 | 23 | def average(self): 24 | loss_sent = self.tot_loss / self.cnt_sent if self.cnt_sent != 0 else 0. 25 | loss_token = self.tot_loss / self.cnt_token if self.cnt_token != 0 else 0. 26 | return loss_sent, loss_token 27 | 28 | def elapsed_time(self): 29 | return time.time() - self.start 30 | 31 | def print_str(self, time_avg=False): 32 | loss_sent, loss_token = self.average() 33 | et = self.elapsed_time() 34 | time_str = f'{et * 1000. / self.cnt_add:6.2f} ms/batch' if time_avg else f'{et:6.2f} s' 35 | return f'{time_str} | loss_sent {loss_sent:6.2f} | token_ppl {math.exp(loss_token):6.2f}' 36 | -------------------------------------------------------------------------------- /2-4/487/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | import re 2 | import jamo 3 | 4 | DEL_RULES = [ 5 | re.compile(r'[^ .,?!/$%a-zA-Z0-9가-힣<>()\[\]]+'), 6 | # re.compile(r'[\.,!?]+'), 7 | # re.compile(r'^\('), 8 | # re.compile(r'\)$'), 9 | # re.compile(r'\[.*\]'), 10 | ] 11 | 12 | SUB_RULES = [ 13 | # (re.compile('구요'), '고요'), 14 | # (re.compile('려요'), '립니다'), 15 | # (re.compile('아요'), '습니다'), 16 | # (re.compile('세요'), '십시오'), 17 | # (re.compile('해요'), '합니다'), 18 | # (re.compile('(이에요|예요)'), '입니다'), 19 | # (re.compile('!'), '.'), 20 | # (re.compile('₩'), '원'), 21 | (re.compile(r'[\s]+'), ' '), 22 | ] 23 | 24 | SPECIAL_RULE = (re.compile('어요'), '습니다') # ㄹ어요 제외 25 | 26 | 27 | def preprocess_noisy(sent): 28 | for rule in DEL_RULES: 29 | sent = rule.sub('', sent) 30 | 31 | for rule, subst in SUB_RULES: 32 | sent = rule.sub(subst, sent) 33 | 34 | # sent = special_case1(sent) 35 | 36 | # if re.search(r'[.?!]$', sent) is None: 37 | # sent += '.' 
38 | 39 | return sent 40 | 41 | 42 | def special_case1(sent): 43 | """ 44 | 어요 -> 습니다 45 | `ㄹ어요`는 불규칙, 무시 46 | """ 47 | rule, subst = SPECIAL_RULE 48 | match = rule.search(sent) 49 | if match is None: 50 | return sent 51 | start = match.span()[0] 52 | if start == 0: 53 | return sent 54 | prev = sent[start - 1] 55 | if jamo.j2hcj(jamo.h2j(prev))[-1] == 'ㄹ': 56 | return sent 57 | 58 | return rule.sub(subst, sent) 59 | -------------------------------------------------------------------------------- /2-4/487/utils/utils.py: -------------------------------------------------------------------------------- 1 | from jamo import j2h, j2hcj, get_jamo_class, is_jamo 2 | 3 | 4 | def read_strings(input_file): 5 | return open(input_file, 'r', encoding='utf-8').read().splitlines() 6 | 7 | 8 | def write_strings(output_file, data): 9 | with open(output_file, 'w', encoding='utf-8') as f: 10 | for x in data: 11 | f.write(str(x) + '\n') 12 | 13 | 14 | def reconstruct_jamo(decomposed, debug=False, remove_incomplete=True): 15 | reconstructed = [] 16 | current_char = [] 17 | current_state = 'init' # init, lead, vowel 18 | for c in decomposed: 19 | if is_jamo(c): 20 | try: 21 | jamo_class = get_jamo_class(c) 22 | except: # isolated 23 | reconstructed.append(j2hcj(c)) 24 | continue 25 | if jamo_class == 'lead': 26 | if current_state == 'init': 27 | assert len(current_char) == 0 28 | current_char.append(c) 29 | current_state = 'lead' 30 | elif current_state == 'lead': 31 | assert len(current_char) == 1 32 | if not remove_incomplete: 33 | reconstructed.append(j2hcj(current_char[0])) 34 | current_char = [c] 35 | current_state = 'lead' 36 | elif current_state == 'vowel': 37 | assert len(current_char) == 2 38 | reconstructed.append(j2h(*current_char)) 39 | current_char = [c] 40 | current_state = 'lead' 41 | 42 | elif jamo_class == 'vowel': 43 | if current_state == 'init': 44 | assert len(current_char) == 0 45 | if not remove_incomplete: 46 | reconstructed.append(j2hcj(c)) 47 | elif current_state == 'lead': 48 | assert len(current_char) == 1 49 | current_char.append(c) 50 | current_state = 'vowel' 51 | elif current_state == 'vowel': 52 | assert len(current_char) == 2 53 | reconstructed.append(j2h(*current_char)) 54 | if not remove_incomplete: 55 | reconstructed.append(j2hcj(c)) 56 | current_char = [] 57 | current_state = 'init' 58 | else: # jongsung 59 | if current_state == 'init': 60 | assert len(current_char) == 0 61 | if not remove_incomplete: 62 | reconstructed.append(j2hcj(c)) 63 | elif current_state == 'lead': 64 | assert len(current_char) == 1 65 | if not remove_incomplete: 66 | reconstructed.append(j2hcj(current_char[0])) 67 | reconstructed.append(j2hcj(c)) 68 | elif current_state == 'vowel': 69 | assert len(current_char) == 2 70 | current_char.append(c) 71 | reconstructed.append(j2h(*current_char)) 72 | 73 | current_char = [] 74 | current_state = 'init' 75 | 76 | else: 77 | if current_state == 'init': 78 | assert len(current_char) == 0 79 | if current_state == 'lead': 80 | assert len(current_char) == 1 81 | if not remove_incomplete: 82 | reconstructed.append(j2hcj(current_char[0])) 83 | elif current_state == 'vowel': 84 | assert len(current_char) == 2 85 | reconstructed.append(j2h(*current_char)) 86 | 87 | reconstructed.append(c) 88 | current_char = [] 89 | current_state = 'init' 90 | 91 | if debug: 92 | print(current_state, c, current_char, reconstructed) 93 | 94 | # if there is leftover 95 | if len(current_char) > 0: 96 | if current_state == 'lead': 97 | assert len(current_char) == 1 98 | if not 
remove_incomplete: 99 | reconstructed.append(j2hcj(current_char[0])) 100 | elif current_state == 'vowel': 101 | assert len(current_char) == 2 102 | reconstructed.append(j2h(*current_char)) 103 | 104 | if debug: 105 | print(current_state, c, current_char, reconstructed) 106 | 107 | return ''.join(reconstructed) 108 | -------------------------------------------------------------------------------- /2-4/command.txt: -------------------------------------------------------------------------------- 1 | nsml run -e ensemble.py -d airush2021-2-4 -g 1 --cpus 2 --memory 24000000000 --shm-size 1000000000 -a "--vocab_size 1982 --max_steps 1 --model bertfuse --optimizer AdamW --scheduler linear --train_all True --num_decoder_layers 12" 2 | nsml run -e main.py -d airush2021-2-4 -m "ensemble 477+479+481" -g 1 --cpus 8 --memory 24000000000 --shm-size 1000000000 -a "--mode ensemble_save --do_multitask 0 --share_embedding 1 --use_copy_attention 0 --model ensemble --preprocess 1 --vocab_size 2700 --ensemble_load_sessions 477 479 481" 3 | nsml run -e train.py -d airush2021-2-4 -g 1 --cpus 2 --memory 24000000000 --shm-size 1000000000 4 | -------------------------------------------------------------------------------- /2-4/rank.txt: -------------------------------------------------------------------------------- 1 | 386 2 | 487 3 | 1062 4 | -------------------------------------------------------------------------------- /2-5/582/README.md: -------------------------------------------------------------------------------- 1 | # 2-5 쇼핑 카탈로그 클러스터링 2 | 3 | ## 문제 4 | 5 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/d4a88c80-c865-11eb-9e50-27ed62a631c7) 6 | 7 | - 네이버 쇼핑은 여러 판매처에서 판매 중인 동일 상품들의 가격을 비교해 주는 서비스를 제공하고 있다. 8 | - 가격 비교를 위해서는 먼저 여러 판매자가 등록한 같은 상품들을 하나로 묶어야 한다. 9 | - 이렇게 묶인 상품의 집합을 `카탈로그`라고 부른다. 10 | - 같은 상품이라도, 여러 판매자들이 각각 서로 다른 `상품명`으로 판매를 하고 있다. 11 | - 같은 `카탈로그`에 속하는 `상품명` 예시 12 | - 아래 7개의 상품은 모두 `농심 백산수 2L`라는 동일 카탈로그에 속하는 상품이다. 13 | ``` 14 | * 농심 백산수 2L 1병 생수 15 | * 농심 백산수 2L 16 | * 백산수 생수 2L, 낱개 17 | * (24개이상 구입시 개당 20원씩 할인) 농심 백산수 2L 18 | * 농심 백산수 2L x 1펫 / 생수 샘물 물 박스포장 19 | * [농심]백산수 2L x 1개 20 | * NS473 백두산 백산수 2L 21 | ``` 22 | - **본 과제는 판매자가 등록한 상품명 텍스트(`query`)를 입력으로 받아, 주어진 `database`에서 그와 동일한 상품들을 모두 찾아내는 문제이다.** 23 | 24 | 25 | ## 데이터셋 26 | 27 | - airush2021-2-5 28 | - `train` 29 | - 네이버 쇼핑의 `식품` 카테고리에서 뽑은 상품 160,008개 30 | - 데이터 위치 : `train/train_data` 디렉토리 31 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` / `카탈로그ID(match_nv_mid)` 32 | - 각 상품은 unique한 `상품ID(nv_mid)`를 가진다. 33 | - 같은 카탈로그에 속하는 상품은 동일한 `카탈로그ID(match_nv_mid)`를 갖는다. 예를 들어, 아래 데이터 예시에서 맨 위 3개의 상품은 동일 카탈로그에 속하므로, 동일한 match_nv_mid(10062684657)을 갖고 있다. 34 | - 데이터 예시 35 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/e9852000-c865-11eb-9c94-0963c31fde06) 36 | 37 | - `test` 38 | - `test` 데이터셋은 `database`와 `query`로 구성되어 있다. 39 | - `database` 40 | - 네이버 쇼핑의 `식품` 카테고리에서 뽑은 상품 90,516개 41 | - 데이터 위치 : `test/test_data/database` 디렉토리 42 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` 43 | - `train` 데이터와는 달리, `카탈로그ID(match_nv_mid)` 필드가 없다. 44 | - `query` 45 | - `query`는 위 `database`의 subset이며, 개수는 8,640개이다. 
46 | - 데이터 위치 : `test/test_data/query` 디렉토리 47 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` 48 | - 데이터 예시 49 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/f30e8800-c865-11eb-8fa0-82ced97afac5) 50 | 51 | - 필드 정보 52 | - nv_mid (string) : 상품ID 53 | - prod_nm (string) : 상품명 54 | - match_nv_mid (string) : 카탈로그ID (train 데이터셋에만 있고, test 데이터셋에는 없음) 55 | 56 | 57 | ## 결과 제출 포맷 58 | 59 | - 결과 제출은 main.py의 infer() 함수에서 이루어진다. 60 | - 아래 예시와 같이 `database`와 `query`가 주어졌다면, 61 | - `query`의 `nv_mid_002`는 `database`의 `nv_mid_001`, `nv_mid_002`, `nv_mid_005`와 동일한 상품이며, 62 | - `query`의 `nv_mid_003`은 `database`의 `nv_mid_003`, `nv_mid_004`와 동일한 상품이다. 63 | - database 64 | 65 | | nv_mid | prod_nm | 66 | | --- | --- | 67 | | nv_mid_001 | (무료배송) 삼다수 2L | 68 | | nv_mid_002 | 삼다수 2L 12병 | 69 | | nv_mid_003 | 저분자 피쉬 콜라겐 펩타이드 150g | 70 | | nv_mid_004 | 지웨이 슈가 먹는 저분자 피쉬 콜라겐 펩타이드 150g | 71 | | nv_mid_005 | 삼다수 2L | 72 | 73 | - query 74 | 75 | | nv_mid | prod_nm | 76 | | --- | --- | 77 | | nv_mid_002 | 삼다수 2L 12병 | 78 | | nv_mid_003 | 저분자 피쉬 콜라겐 펩타이드 150g | 79 | 80 | - `query`에 속하는 각각의 상품에 대해, 그와 동일한 상품을 `database`에서 모두 찾아서 제출하면 된다. 81 | 즉, main.py의 infer() 함수에서 아래와 같은 list를 결과로 return하면 결과가 제출된다. 82 | ``` 83 | return [ 84 | ('nv_mid_002', ['nv_mid_001', 'nv_mid_002', 'nv_mid_005]), 85 | ('nv_mid_003', ['nv_mid_003', 'nv_mid_004']) 86 | ] 87 | 88 | ``` 89 | 90 | ## Getting started 91 | - 접근 92 | - 먼저 주어진 상품명 텍스트를 embedding하는 model을 학습시키고, 93 | - 학습된 모델을 사용해서, test_data/database의 상품명들을 embedding한 후, 94 | - test_data/query의 각 상품명에 대해서 database 상품명들 중, embedding이 유사한 것을 search하는 것이 일반적인 접근법이다. 95 | - 유사 embedding search는 main.py의 infer() 함수 부분를 수정하여, 구현하면 된다. 96 | 97 | - 학습 98 | ``` 99 | nsml run -d airush2021-2-5 -e main.py 100 | ``` 101 | 102 | - 리더보드 제출 103 | ``` 104 | nsml submit {session} {checkpoint} 105 | ``` 106 | 107 | ## evaluation metric 108 | * mean f1-score 109 | * test_data/query 의 각 상품에 대해 match된 결과의 precision과 recall의 f1 score를 모든 상품에 대해 평균한 값 110 | * evaluation.py 코드 참조. 111 | 112 | 113 | ## 기타 114 | 115 | - Team blog: https://medium.com/naver-shopping-dev 116 | - Contact: 오광진 kj.oh@navercorp.com 117 | 118 | 119 | ## FAQ 120 | 121 | Q : Pretrained model 사용이 가능한가요? 122 | 123 | A : 사용 가능합니다. 
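A minimal, self-contained sketch of the recipe described in `## Getting started` above (embed every product name, then search the database for the nearest embeddings of each query). This is an illustration only, not the submitted solution: the function names, the character n-gram TF-IDF stand-in for a trained encoder, the 0.6 cosine threshold, and the toy data are all assumptions of this sketch. The actual 582 entry trains a neural encoder with an ArcFace margin head instead; see `arcface.py` and `main.py` further down.

```python
# Hypothetical illustration of the "Getting started" recipe above;
# the names, the threshold, and the toy data are assumptions, not the contest code.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def match_queries(db_ids, db_names, query_ids, query_names, threshold=0.6):
    # Character n-gram TF-IDF stands in for a trained sentence encoder.
    vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4))
    db_vecs = vectorizer.fit_transform(db_names)
    query_vecs = vectorizer.transform(query_names)

    sims = cosine_similarity(query_vecs, db_vecs)  # shape: [n_query, n_db]
    results = []
    for qid, row in zip(query_ids, sims):
        matched = [db_ids[j] for j, s in enumerate(row) if s >= threshold]
        # The query itself is part of the database, so never return an empty list.
        results.append((qid, matched or [qid]))
    return results  # same format as the list returned by infer()


if __name__ == "__main__":
    db_ids = ["nv_mid_001", "nv_mid_002", "nv_mid_003", "nv_mid_004", "nv_mid_005"]
    db_names = ["(무료배송) 삼다수 2L", "삼다수 2L 12병", "저분자 피쉬 콜라겐 펩타이드 150g",
                "지웨이 슈가 먹는 저분자 피쉬 콜라겐 펩타이드 150g", "삼다수 2L"]
    query_ids = ["nv_mid_002", "nv_mid_003"]
    query_names = ["삼다수 2L 12병", "저분자 피쉬 콜라겐 펩타이드 150g"]
    print(match_queries(db_ids, db_names, query_ids, query_names))
```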
124 | -------------------------------------------------------------------------------- /2-5/582/arcface.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import Parameter 4 | from torch.nn import functional as F 5 | import math 6 | 7 | 8 | class ArcMarginProduct(nn.Module): 9 | r"""Implement of large margin arc distance: : 10 | Args: 11 | in_features: size of each input sample 12 | out_features: size of each output sample 13 | s: norm of input feature 14 | m: margin 15 | cos(theta + m) 16 | """ 17 | def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False, ls_eps=0.0): 18 | super(ArcMarginProduct, self).__init__() 19 | self.in_features = in_features 20 | self.out_features = out_features 21 | self.s = s 22 | self.m = m 23 | self.ls_eps = ls_eps # label smoothing 24 | self.weight = Parameter(torch.FloatTensor(out_features, in_features)) 25 | nn.init.xavier_uniform_(self.weight) 26 | 27 | self.easy_margin = easy_margin 28 | self.cos_m = math.cos(m) 29 | self.sin_m = math.sin(m) 30 | self.th = math.cos(math.pi - m) 31 | self.mm = math.sin(math.pi - m) * m 32 | 33 | def forward(self, input, label): 34 | # --------------------------- cos(theta) & phi(theta) --------------------------- 35 | cosine = F.linear(F.normalize(input), F.normalize(self.weight)) 36 | sine = torch.sqrt(1.0 - torch.pow(cosine, 2)) 37 | phi = cosine * self.cos_m - sine * self.sin_m 38 | if self.easy_margin: 39 | phi = torch.where(cosine > 0, phi, cosine) 40 | else: 41 | phi = torch.where(cosine > self.th, phi, cosine - self.mm) 42 | # --------------------------- convert label to one-hot --------------------------- 43 | # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda') 44 | one_hot = torch.zeros(cosine.size(), device='cuda') 45 | one_hot.scatter_(1, label.view(-1, 1).long(), 1) 46 | if self.ls_eps > 0: 47 | one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features 48 | # -------------torch.where(out_i = {x_i if condition_i else y_i) ------------- 49 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 50 | output *= self.s 51 | 52 | return output -------------------------------------------------------------------------------- /2-5/582/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nsml.constants import DATASET_PATH 4 | 5 | 6 | def test_data_loader(root_path): 7 | return root_path 8 | 9 | 10 | def write_output(output_file, data): 11 | with open(output_file, 'w') as f: 12 | for x in data: 13 | f.write(x[0] + ' ' + ','.join(x[1]) + '\n') 14 | 15 | 16 | def feed_infer(output_file, infer_func): 17 | print('DATASET_PATH=', DATASET_PATH) 18 | #os.system('/bin/ls -lR ' + DATASET_PATH) 19 | prediction = infer_func(test_data_loader(DATASET_PATH)) 20 | write_output(output_file, prediction) 21 | if os.stat(output_file).st_size == 0: 22 | raise AssertionError('output result of inference is nothing') 23 | 24 | -------------------------------------------------------------------------------- /2-5/582/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def read_prediction(prediction_file): 5 | return read_strings(prediction_file) 6 | 7 | 8 | def read_ground_truth(ground_truth_file): 9 | return read_strings(ground_truth_file) 10 | 11 | 12 | def f1_score(gt, pred): 13 | if len(pred) == 0: 14 | return 0.0 15 | intsct_len = 
len(set(gt).intersection(set(pred))) 16 | if intsct_len == 0: 17 | return 0.0 18 | precision = intsct_len / len(pred) 19 | recall = intsct_len / len(gt) 20 | return 2. / (1. / precision + 1. / recall) 21 | 22 | 23 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 24 | while True: 25 | # prediction, ground_truth 26 | # example : [('A', ['a', 'b', 'c']), ('B', ['d', 'e']), ('C', ['f', 'g', 'h'])] 27 | prediction = read_prediction(prediction_file) 28 | ground_truth = read_ground_truth(ground_truth_file) 29 | 30 | pred_dict = dict(prediction) 31 | f1_sum = 0.0 32 | 33 | for query, match in ground_truth: 34 | if query in pred_dict: 35 | pred_match = pred_dict[query] 36 | f1_sum += f1_score(match, pred_match) 37 | 38 | mean_f1 = f1_sum / len(ground_truth) 39 | break 40 | 41 | 42 | return mean_f1 43 | 44 | 45 | def read_strings(input_file): 46 | lines = open(input_file, "r").read().splitlines() 47 | query_matches = [line.split(' ') for line in lines] 48 | return [(query, matches.split(',')) for query, matches in query_matches] 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--prediction', type=str, default='pred.txt') 54 | parser.add_argument('--test_label_path', type=str) 55 | args = parser.parse_args() 56 | 57 | print(evaluation_metrics(args.prediction, args.test_label_path)) -------------------------------------------------------------------------------- /2-5/582/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | os.environ["HF_HOME"] = "/home/nsml/.cache/huggingface" 5 | # os.system('pip install faiss-cpu --no-cache') 6 | from model import LarvaFeat 7 | from dataset import CatalogDataset 8 | # from larva import LarvaTokenizer, LarvaModel 9 | import transformers 10 | import nsml 11 | from nsml import DATASET_PATH, IS_ON_NSML 12 | from predict import Comparing 13 | import pickle 14 | import numpy as np 15 | from trainer import Trainer 16 | import random 17 | import argparse 18 | 19 | random_seed = 42 20 | torch.manual_seed(random_seed) 21 | np.random.seed(random_seed) 22 | torch.cuda.manual_seed(random_seed) 23 | torch.cuda.manual_seed_all(random_seed) 24 | random.seed(random_seed) 25 | torch.backends.cudnn.deterministic = True 26 | torch.backends.cudnn.benchmark = False 27 | 28 | # def bind_nsml(solver, args): 29 | def bind_nsml(model, args): 30 | def save(path): 31 | print('save: path=', path) 32 | os.makedirs(path, exist_ok=True) 33 | torch.save(model.state_dict(), open(os.path.join(path, 'model.pt'), 'wb')) 34 | pickle.dump(model.model2, open(os.path.join(path, 'model2.pt'), 'wb')) 35 | print('model saved') 36 | 37 | def load(path): 38 | print('load: path=', path) 39 | model.load_state_dict(torch.load(open(os.path.join(path, 'model.pt'), 'rb'))) 40 | # torch.load(open(os.path.join(path, 'model.pt'), 'rb')) 41 | model.model2 = pickle.load(open(os.path.join(path, 'model2.pt'), 'rb')) 42 | print('model loaded') 43 | 44 | def infer(dataset_path): 45 | print('infer: dataset_path=', dataset_path) 46 | database_path = os.path.join(dataset_path, 'test', 'test_data', 'database') 47 | query_path = os.path.join(dataset_path, 'test', 'test_data', 'query') 48 | database_dataset = CatalogDataset(database_path, has_label=False) 49 | query_dataset = CatalogDataset(query_path, has_label=False) 50 | # comparing = Comparing(database_dataset, query_dataset, solver.model) 51 | comparing = Comparing(database_dataset, query_dataset, model) 
52 | result = comparing.predict() 53 | # implement inference code here with the trained model 54 | 55 | # returns list of (query_nv_mid, [database_nv_mid]) 56 | # return [('1906368762', ['1906368762','1810466025','5159532445']), 57 | # ('636762', ['636762','1146025','155245'])] # dummy result 58 | return result 59 | 60 | nsml.bind(save, load, infer) 61 | 62 | 63 | def main(): 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("--mode", type=str, default="train") 66 | parser.add_argument('--pause', type=int, default=0) 67 | args = parser.parse_args() 68 | 69 | model = LarvaFeat() 70 | # solver = Cluster() 71 | 72 | if IS_ON_NSML: 73 | # bind_nsml(solver, args) 74 | bind_nsml(model, args) 75 | 76 | # DONOTCHANGE: They are reserved for nsml 77 | # Warning: Do not load data before the following code! 78 | if args.pause: 79 | nsml.paused(scope=locals()) 80 | 81 | dataset = CatalogDataset(os.path.join(DATASET_PATH, 'train', 'train_data')) 82 | train, valid = dataset.train_valid() 83 | print('trainset:', train['prod_nm'][:5]) 84 | print('validset:', valid['prod_nm'][:5]) 85 | # print(train['measurement'][:5]) 86 | # print(valid['measurement'][:5]) 87 | if args.mode == 'train': 88 | # implement training code here 89 | # solver.train(dataset) 90 | # model.tfidf(dataset) 91 | model = model.to('cuda') 92 | Trainer(model, train, valid) 93 | nsml.save('final') 94 | 95 | if __name__ == "__main__": 96 | main() -------------------------------------------------------------------------------- /2-5/582/nsml_model/final/model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-5/582/nsml_model/final/model/model.pt -------------------------------------------------------------------------------- /2-5/582/nsml_model/final/model/model2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-5/582/nsml_model/final/model/model2.pt -------------------------------------------------------------------------------- /2-5/582/requirements.txt: -------------------------------------------------------------------------------- 1 | #nsml: reg.navercorp.com/chatbot/larva:latest 2 | torch>=1.4.0 3 | transformers>=4.2.0 4 | requests>=2.25.1 5 | datasets>=1.5.0 6 | seqeval==1.2.2 7 | pytorch-crf==0.7.2 8 | # faiss-cpu==1.7.1.post2 9 | faiss-gpu==1.7.1.post2 -------------------------------------------------------------------------------- /2-5/582/sam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SAM(torch.optim.Optimizer): 4 | def __init__(self, params, base_optimizer, rho=0.05, **kwargs): 5 | assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}" 6 | 7 | defaults = dict(rho=rho, **kwargs) 8 | super(SAM, self).__init__(params, defaults) 9 | 10 | self.base_optimizer = base_optimizer(self.param_groups, **kwargs) 11 | self.param_groups = self.base_optimizer.param_groups 12 | 13 | @torch.no_grad() 14 | def first_step(self, zero_grad=False): 15 | grad_norm = self._grad_norm() 16 | for group in self.param_groups: 17 | scale = group["rho"] / (grad_norm + 1e-12) 18 | 19 | for p in group["params"]: 20 | if p.grad is None: continue 21 | e_w = p.grad * scale.to(p) 22 | p.add_(e_w) 23 | self.state[p]["e_w"] = e_w 24 | 25 | if zero_grad: self.zero_grad() 26 | 27 | 
@torch.no_grad() 28 | def second_step(self, zero_grad=False): 29 | for group in self.param_groups: 30 | for p in group["params"]: 31 | if p.grad is None: continue 32 | p.sub_(self.state[p]["e_w"]) 33 | 34 | self.base_optimizer.step() 35 | 36 | if zero_grad: self.zero_grad() 37 | 38 | @torch.no_grad() 39 | def step(self, closure=None): 40 | assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided" 41 | closure = torch.enable_grad()(closure) 42 | 43 | self.first_step(zero_grad=True) 44 | closure() 45 | self.second_step() 46 | 47 | def _grad_norm(self): 48 | shared_device = self.param_groups[0]["params"][0].device 49 | norm = torch.norm( 50 | torch.stack([ 51 | p.grad.norm(p=2).to(shared_device) 52 | for group in self.param_groups for p in group["params"] 53 | if p.grad is not None 54 | ]), 55 | p=2 56 | ) 57 | return norm -------------------------------------------------------------------------------- /2-5/582/setup.py: -------------------------------------------------------------------------------- 1 | #nsml: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel 2 | from distutils.core import setup 3 | 4 | setup( 5 | name='Catalog matching model', 6 | version='0.2', 7 | description='Catalog matching model', 8 | install_requires=[ 9 | 'numpy', 10 | 'python-snappy==0.6.0', 11 | 'pyarrow==2.0.0', 12 | 'fastparquet==0.4.2', 13 | 'pandas==1.1.5', 14 | 'sklearn', 15 | 'transformers>=4.2.0', 16 | # 'faiss', 17 | #'albumentations', 18 | #'opencv-python', 19 | ], 20 | ) -------------------------------------------------------------------------------- /2-5/756/README.md: -------------------------------------------------------------------------------- 1 | # 2-5 쇼핑 카탈로그 클러스터링 2 | 3 | ## 문제 4 | 5 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/d4a88c80-c865-11eb-9e50-27ed62a631c7) 6 | 7 | - 네이버 쇼핑은 여러 판매처에서 판매 중인 동일 상품들의 가격을 비교해 주는 서비스를 제공하고 있다. 8 | - 가격 비교를 위해서는 먼저 여러 판매자가 등록한 같은 상품들을 하나로 묶어야 한다. 9 | - 이렇게 묶인 상품의 집합을 `카탈로그`라고 부른다. 10 | - 같은 상품이라도, 여러 판매자들이 각각 서로 다른 `상품명`으로 판매를 하고 있다. 11 | - 같은 `카탈로그`에 속하는 `상품명` 예시 12 | - 아래 7개의 상품은 모두 `농심 백산수 2L`라는 동일 카탈로그에 속하는 상품이다. 13 | ``` 14 | * 농심 백산수 2L 1병 생수 15 | * 농심 백산수 2L 16 | * 백산수 생수 2L, 낱개 17 | * (24개이상 구입시 개당 20원씩 할인) 농심 백산수 2L 18 | * 농심 백산수 2L x 1펫 / 생수 샘물 물 박스포장 19 | * [농심]백산수 2L x 1개 20 | * NS473 백두산 백산수 2L 21 | ``` 22 | - **본 과제는 판매자가 등록한 상품명 텍스트(`query`)를 입력으로 받아, 주어진 `database`에서 그와 동일한 상품들을 모두 찾아내는 문제이다.** 23 | 24 | 25 | ## 데이터셋 26 | 27 | - airush2021-2-5 28 | - `train` 29 | - 네이버 쇼핑의 `식품` 카테고리에서 뽑은 상품 160,008개 30 | - 데이터 위치 : `train/train_data` 디렉토리 31 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` / `카탈로그ID(match_nv_mid)` 32 | - 각 상품은 unique한 `상품ID(nv_mid)`를 가진다. 33 | - 같은 카탈로그에 속하는 상품은 동일한 `카탈로그ID(match_nv_mid)`를 갖는다. 예를 들어, 아래 데이터 예시에서 맨 위 3개의 상품은 동일 카탈로그에 속하므로, 동일한 match_nv_mid(10062684657)을 갖고 있다. 34 | - 데이터 예시 35 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/e9852000-c865-11eb-9c94-0963c31fde06) 36 | 37 | - `test` 38 | - `test` 데이터셋은 `database`와 `query`로 구성되어 있다. 39 | - `database` 40 | - 네이버 쇼핑의 `식품` 카테고리에서 뽑은 상품 90,516개 41 | - 데이터 위치 : `test/test_data/database` 디렉토리 42 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` 43 | - `train` 데이터와는 달리, `카탈로그ID(match_nv_mid)` 필드가 없다. 44 | - `query` 45 | - `query`는 위 `database`의 subset이며, 개수는 8,640개이다. 
46 | - 데이터 위치 : `test/test_data/query` 디렉토리 47 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` 48 | - 데이터 예시 49 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/f30e8800-c865-11eb-8fa0-82ced97afac5) 50 | 51 | - 필드 정보 52 | - nv_mid (string) : 상품ID 53 | - prod_nm (string) : 상품명 54 | - match_nv_mid (string) : 카탈로그ID (train 데이터셋에만 있고, test 데이터셋에는 없음) 55 | 56 | 57 | ## 결과 제출 포맷 58 | 59 | - 결과 제출은 main.py의 infer() 함수에서 이루어진다. 60 | - 아래 예시와 같이 `database`와 `query`가 주어졌다면, 61 | - `query`의 `nv_mid_002`는 `database`의 `nv_mid_001`, `nv_mid_002`, `nv_mid_005`와 동일한 상품이며, 62 | - `query`의 `nv_mid_003`은 `database`의 `nv_mid_003`, `nv_mid_004`와 동일한 상품이다. 63 | - database 64 | 65 | | nv_mid | prod_nm | 66 | | --- | --- | 67 | | nv_mid_001 | (무료배송) 삼다수 2L | 68 | | nv_mid_002 | 삼다수 2L 12병 | 69 | | nv_mid_003 | 저분자 피쉬 콜라겐 펩타이드 150g | 70 | | nv_mid_004 | 지웨이 슈가 먹는 저분자 피쉬 콜라겐 펩타이드 150g | 71 | | nv_mid_005 | 삼다수 2L | 72 | 73 | - query 74 | 75 | | nv_mid | prod_nm | 76 | | --- | --- | 77 | | nv_mid_002 | 삼다수 2L 12병 | 78 | | nv_mid_003 | 저분자 피쉬 콜라겐 펩타이드 150g | 79 | 80 | - `query`에 속하는 각각의 상품에 대해, 그와 동일한 상품을 `database`에서 모두 찾아서 제출하면 된다. 81 | 즉, main.py의 infer() 함수에서 아래와 같은 list를 결과로 return하면 결과가 제출된다. 82 | ``` 83 | return [ 84 | ('nv_mid_002', ['nv_mid_001', 'nv_mid_002', 'nv_mid_005]), 85 | ('nv_mid_003', ['nv_mid_003', 'nv_mid_004']) 86 | ] 87 | 88 | ``` 89 | 90 | ## Getting started 91 | - 접근 92 | - 먼저 주어진 상품명 텍스트를 embedding하는 model을 학습시키고, 93 | - 학습된 모델을 사용해서, test_data/database의 상품명들을 embedding한 후, 94 | - test_data/query의 각 상품명에 대해서 database 상품명들 중, embedding이 유사한 것을 search하는 것이 일반적인 접근법이다. 95 | - 유사 embedding search는 main.py의 infer() 함수 부분를 수정하여, 구현하면 된다. 96 | 97 | - 학습 98 | ``` 99 | nsml run -d airush2021-2-5 -e main.py 100 | ``` 101 | 102 | - 리더보드 제출 103 | ``` 104 | nsml submit {session} {checkpoint} 105 | ``` 106 | 107 | ## evaluation metric 108 | * mean f1-score 109 | * test_data/query 의 각 상품에 대해 match된 결과의 precision과 recall의 f1 score를 모든 상품에 대해 평균한 값 110 | * evaluation.py 코드 참조. 111 | 112 | 113 | ## 기타 114 | 115 | - Team blog: https://medium.com/naver-shopping-dev 116 | - Contact: 오광진 kj.oh@navercorp.com 117 | 118 | 119 | ## FAQ 120 | 121 | Q : Pretrained model 사용이 가능한가요? 122 | 123 | A : 사용 가능합니다. 
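The `## evaluation metric` section above defines mean F1 only in words, so here is a tiny worked example added for clarity. It reuses the per-query `f1_score` definition from the `evaluation.py` included below; the prediction values are hypothetical.

```python
# Illustrative only: per-query F1 (as in evaluation.py), averaged over all queries.
def f1_score(gt, pred):
    if len(pred) == 0:
        return 0.0
    intsct_len = len(set(gt).intersection(set(pred)))
    if intsct_len == 0:
        return 0.0
    precision = intsct_len / len(pred)
    recall = intsct_len / len(gt)
    return 2. / (1. / precision + 1. / recall)


# Ground truth from the README example above, plus a hypothetical prediction.
ground_truth = [
    ("nv_mid_002", ["nv_mid_001", "nv_mid_002", "nv_mid_005"]),
    ("nv_mid_003", ["nv_mid_003", "nv_mid_004"]),
]
prediction = {
    "nv_mid_002": ["nv_mid_001", "nv_mid_002"],  # 2 of 3 found -> F1 = 0.8
    "nv_mid_003": ["nv_mid_003", "nv_mid_004"],  # exact match  -> F1 = 1.0
}
mean_f1 = sum(f1_score(gt, prediction.get(q, [])) for q, gt in ground_truth) / len(ground_truth)
print(mean_f1)  # 0.9
```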
124 | -------------------------------------------------------------------------------- /2-5/756/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nsml.constants import DATASET_PATH 4 | 5 | 6 | def test_data_loader(root_path): 7 | return root_path 8 | 9 | 10 | def write_output(output_file, data): 11 | with open(output_file, 'w') as f: 12 | for x in data: 13 | f.write(x[0] + ' ' + ','.join(x[1]) + '\n') 14 | 15 | 16 | def feed_infer(output_file, infer_func): 17 | print('DATASET_PATH=', DATASET_PATH) 18 | #os.system('/bin/ls -lR ' + DATASET_PATH) 19 | prediction = infer_func(test_data_loader(DATASET_PATH)) 20 | write_output(output_file, prediction) 21 | if os.stat(output_file).st_size == 0: 22 | raise AssertionError('output result of inference is nothing') 23 | 24 | -------------------------------------------------------------------------------- /2-5/756/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | #import fastparquet 4 | import numpy as np 5 | import random 6 | import time 7 | from collections import defaultdict 8 | 9 | from glob import glob 10 | import torch 11 | from torch.utils.data import DataLoader 12 | from typing import Dict 13 | from string import punctuation 14 | 15 | 16 | def get_data_columns(has_label): 17 | if has_label: 18 | return ['nv_mid', 'prod_nm', 'match_nv_mid'] 19 | else: 20 | return ['nv_mid', 'prod_nm'] 21 | 22 | 23 | def read_product_data_from_parquet(parquet_path, has_label): 24 | df = pd.read_parquet(parquet_path, columns=get_data_columns(has_label)) 25 | return df 26 | 27 | 28 | def get_catalogs(df): 29 | return list(set(df['match_nv_mid'])) 30 | 31 | def query_finder(labels,sentences,nv_mid): 32 | dict = {} 33 | count_dict = {} 34 | for index, label in enumerate(labels): 35 | try: 36 | dummy = dict[label] 37 | count_dict[label] += 1 38 | except: 39 | dict[label] = [sentences[index],nv_mid[index]] 40 | count_dict[label] = 1 41 | 42 | query_labels = [] 43 | query_sentences = [] 44 | query_nv_mid = [] 45 | 46 | for label, item_list in dict.items(): 47 | query_labels.append(label) 48 | query_sentences.append(item_list[0]) 49 | query_nv_mid.append(item_list[1]) 50 | 51 | 52 | return query_labels, query_sentences, query_nv_mid, count_dict 53 | 54 | 55 | 56 | class CatalogDataset(torch.utils.data.Dataset): 57 | def __init__(self, args, data_path, has_label=True): 58 | super(CatalogDataset, self).__init__() 59 | print('CatalogDataset: data_path=', data_path) 60 | self.batch_size = args.batch_size 61 | self.data_path = data_path 62 | self.df = read_product_data_from_parquet(data_path, has_label) 63 | self.delete_list = ['무료배송','배송','무료','핫딜','당일배송','빠른배송','퀵배송','당일','익일발송','발송'] 64 | 65 | 66 | def train_sentences(self): 67 | # print("preprocessing 전:",self.df['prod_nm'].tolist()[0:10]) 68 | x = self.pre_processing_list(self.df['prod_nm'].tolist()) 69 | # print("preprocessing 후:",x[0:10]) 70 | return x 71 | 72 | def train_labels(self): 73 | return self.df['match_nv_mid'].tolist() 74 | 75 | def get_nv_mid(self): 76 | return self.df['nv_mid'].tolist() 77 | 78 | def __len__(self): 79 | print(self.df.shape) 80 | return self.df.shape[0] 81 | 82 | def __getitem__(self, idx): 83 | items = self.df.iloc[idx] 84 | if self.df.shape[1] == 3: 85 | # nv_mid = [items['nv_mid']] 86 | # prod_nm = [items['prod_nm']] 87 | # match_nv_mid = [items['match_nv_mid']] 88 | nv_mid = items['nv_mid'] 89 | prod_nm = items['prod_nm'] 90 | 
match_nv_mid = items['match_nv_mid'] 91 | prod_nm = self.pre_processing(prod_nm) 92 | return nv_mid ,prod_nm, match_nv_mid 93 | 94 | else : 95 | nv_mid = items['nv_mid'] 96 | prod_nm = items['prod_nm'] 97 | prod_nm = self.pre_processing(prod_nm) 98 | return nv_mid ,prod_nm 99 | 100 | def get_torch_loader(self, dataset): 101 | 102 | loader = DataLoader(dataset, self.batch_size) 103 | 104 | return loader 105 | 106 | def pre_processing(self,prod_nm): 107 | 108 | for ele in prod_nm: 109 | if ele in punctuation: 110 | prod_nm = prod_nm.replace(ele, " ") 111 | 112 | seperator = " " 113 | words = prod_nm.split(seperator) 114 | 115 | for ele in words: 116 | if ele in self.delete_list: 117 | words.remove(ele) 118 | prod_nm = ' '.join(words) ## 띄어쓰기 없애고도 해보기 119 | return prod_nm 120 | 121 | def pre_processing_list(self,prod_nm_list): 122 | 123 | for i,prod_nm in enumerate(prod_nm_list): 124 | for ele in prod_nm: 125 | if ele in punctuation: 126 | prod_nm = prod_nm.replace(ele, " ") 127 | 128 | seperator = " " 129 | words = prod_nm.split(seperator) 130 | 131 | for ele in words: 132 | if ele in self.delete_list: 133 | words.remove(ele) 134 | prod_nm = ' '.join(words) ## 띄어쓰기 없애고도 해보기 135 | 136 | prod_nm_list[i] = prod_nm 137 | 138 | return prod_nm_list 139 | 140 | -------------------------------------------------------------------------------- /2-5/756/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def read_prediction(prediction_file): 5 | return read_strings(prediction_file) 6 | 7 | 8 | def read_ground_truth(ground_truth_file): 9 | return read_strings(ground_truth_file) 10 | 11 | 12 | def f1_score(gt, pred): 13 | if len(pred) == 0: 14 | return 0.0 15 | intsct_len = len(set(gt).intersection(set(pred))) 16 | if intsct_len == 0: 17 | return 0.0 18 | precision = intsct_len / len(pred) 19 | recall = intsct_len / len(gt) 20 | return 2. / (1. / precision + 1. 
/ recall) 21 | 22 | def evaluate(result,gt_list): 23 | pred_dict = dict(result) 24 | f1_sum = 0.0 25 | for query, match in gt_list: 26 | if query in pred_dict: 27 | pred_match = pred_dict[query] 28 | f1_sum += f1_score(match, pred_match) 29 | mean_f1 = f1_sum / len(gt_list) 30 | return mean_f1 31 | 32 | 33 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 34 | while True: 35 | # prediction, ground_truth 36 | # example : [('A', ['a', 'b', 'c']), ('B', ['d', 'e']), ('C', ['f', 'g', 'h'])] 37 | prediction = read_prediction(prediction_file) 38 | ground_truth = read_ground_truth(ground_truth_file) 39 | 40 | pred_dict = dict(prediction) 41 | f1_sum = 0.0 42 | 43 | for query, match in ground_truth: 44 | if query in pred_dict: 45 | pred_match = pred_dict[query] 46 | f1_sum += f1_score(match, pred_match) 47 | 48 | mean_f1 = f1_sum / len(ground_truth) 49 | break 50 | 51 | 52 | return mean_f1 53 | 54 | 55 | def read_strings(input_file): 56 | lines = open(input_file, "r").read().splitlines() 57 | query_matches = [line.split(' ') for line in lines] 58 | return [(query, matches.split(',')) for query, matches in query_matches] 59 | 60 | 61 | if __name__ == '__main__': 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--prediction', type=str, default='pred.txt') 64 | parser.add_argument('--test_label_path', type=str) 65 | args = parser.parse_args() 66 | 67 | print(evaluation_metrics(args.prediction, args.test_label_path)) -------------------------------------------------------------------------------- /2-5/756/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import sys 5 | 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--mode", type=str, default="train") 11 | parser.add_argument('--pause', type=int, default=0) 12 | parser.add_argument('--batch_size', type=int, default=32) 13 | parser.add_argument('--epoch', type=int, default=5) 14 | 15 | args = parser.parse_args() 16 | 17 | os.system('git clone https://github.com/SKTBrain/KoBERT.git') 18 | os.chdir('./KoBERT') 19 | os.system('pip install -r requirements.txt') 20 | os.system('pip install .') 21 | os.chdir('..') 22 | os.system('git clone https://github.com/billyirrish/KoSentenceBERT_SKTBERT.git') 23 | os.chdir('./KoSentenceBERT_SKTBERT') 24 | os.system('pip install -r requirements.txt') 25 | os.system('wget https://github.com/kdh4672/Study_GAN/releases/download/1/result.pt') 26 | os.system('mkdir ./output/training_sts/0_Transformer') 27 | os.system('mv result.pt ./output/training_sts/0_Transformer/result.pt') 28 | # print(os.listdir('./KoSentenceBERT_SKTBERT/output/training_sts/0_Transformer/')) 29 | os.system('mv ../main2.py ./main2.py') 30 | os.system('python ./main2.py --pause {} --mode {}'.format(args.pause, args.mode)) 31 | 32 | if __name__ == "__main__": 33 | main() 34 | 35 | -------------------------------------------------------------------------------- /2-5/756/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class MyModel(nn.Module): 6 | def __init__(self): 7 | super(MyModel, self).__init__() 8 | self.fc = nn.Linear(2, 3) 9 | 10 | def forward(self, x): 11 | return self.fc(x) -------------------------------------------------------------------------------- /2-5/756/nsml_model/king_of_ai/model/model.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-5/756/nsml_model/king_of_ai/model/model.pt -------------------------------------------------------------------------------- /2-5/756/nsml_package.txt: -------------------------------------------------------------------------------- 1 | git 2 | wget 3 | -------------------------------------------------------------------------------- /2-5/756/setup.py: -------------------------------------------------------------------------------- 1 | #nsml: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel 2 | from distutils.core import setup 3 | 4 | setup( 5 | name='Catalog matching model', 6 | version='0.2', 7 | description='Catalog matching model', 8 | install_requires=[ 9 | 'numpy', 10 | 'python-snappy==0.6.0', 11 | 'pyarrow==2.0.0', 12 | 'fastparquet==0.4.2', 13 | 'pandas==1.1.5', 14 | # 'sentence_transformers', 15 | # 'faiss-gpu', 16 | 'gensim' 17 | 18 | # 'kmeans-pytorch', 19 | #'gluonnlp', 20 | #'mxnet' 21 | 22 | #'transformers==3.5', 23 | #'albumentations', 24 | #'faiss', 25 | #'sklearn', 26 | #'opencv-python', 27 | ], 28 | ) -------------------------------------------------------------------------------- /2-5/756/shell.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | os.system('git clone https://github.com/SKTBrain/KoBERT.git') -------------------------------------------------------------------------------- /2-5/command.txt: -------------------------------------------------------------------------------- 1 | nsml run -e main.py -d airush2021-2-5 -g 1 --cpus 2 --memory 24000000000 --shm-size 1000000000 2 | nsml run -e main.py -d airush2021-2-5 -g 1 --cpus 2 --memory 24000000000 --shm-size 1000000000 3 | -------------------------------------------------------------------------------- /2-5/rank.txt: -------------------------------------------------------------------------------- 1 | 756 2 | 582 3 | 1233 4 | -------------------------------------------------------------------------------- /Copyright owner: -------------------------------------------------------------------------------- 1 | 본 코드에 대한 저작권은 네이버 주식회사 및 AI RUSH 2021 Members가 가지며 2 | 이에 해당하는 AI RUSH 2021 Members는 이하와 같습니다. 3 | 4 | 2-4 5 | 1등 : 신찬호 6 | 2등: 최세현 7 | 3등: 최나영 8 | 9 | 10 | 2-5 11 | 1등 : 공대현 < 12 | 2등: 장진호 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | AI RUSH 2021_source-code 2 | Copyright (c) 2022-present AI RUSH 2021 Members and NAVER Corp. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AI RUSH 2021_source-code 2 | 3 | Some of the tasks and top-performing models produced through AI RUSH are released here
4 | so that external developers and prospective participants can freely review and learn from them.
5 |
6 | 7 | ## CLOVA AI RUSH 2021 tasks with released code ## 8 | 9 | | Task no. | Task name | 10 | | --- | --- | 11 | | 2-4 | Enhancing the SmartEditor grammar correction assistant | 12 | | 2-5 | Shopping catalog clustering | 13 | 14 |
35 | 36 | ## CLOVA AI RUSH 2022 now recruiting 37 | Details and how to apply: https://campaign.naver.com/clova_airush/ 38 | 39 | | Item | Schedule | 40 | | --- | --- | 41 | | Application | Until 2022.5.30 (Mon) 23:59 | 42 | | Round 1 | 2022.7.5 (Tue) - 7.26 (Wed) | 43 | | Round 2 | 2022.8.1 (Mon) - 8.26 (Fri) | 44 | | Conference | To be announced | 45 | 46 |
73 | 74 | ![CLOVA AI RUSH 2022_Poster_FIN_RGB_0509-02](https://user-images.githubusercontent.com/34671719/167405166-19969bd0-c4fd-4fd0-ac9e-1e065f186dc8.png) 75 | 76 | 77 | ## License 78 | AI RUSH 2021_source-code
79 | Copyright (c) 2022-present AI RUSH 2021 Members and NAVER Corp. 80 | 81 | Permission is hereby granted, free of charge, to any person obtaining a copy 82 | of this software and associated documentation files (the "Software"), to deal 83 | in the Software without restriction, including without limitation the rights 84 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 85 | copies of the Software, and to permit persons to whom the Software is 86 | furnished to do so, subject to the following conditions: 87 | 88 | The above copyright notice and this permission notice shall be included in 89 | all copies or substantial portions of the Software. 90 | 91 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 92 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 93 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 94 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 95 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 96 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 97 | THE SOFTWARE. 98 | --------------------------------------------------------------------------------