├── .gitattributes ├── 2-4 ├── 386 │ ├── README.md │ ├── custom_fairseq.py │ ├── data_loader.py │ ├── dataset.py │ ├── ensemble.py │ ├── evaluation.py │ ├── fairseq │ │ ├── __init__.py │ │ ├── binarizer.py │ │ ├── bleu.py │ │ ├── checkpoint_utils.py │ │ ├── clib │ │ │ └── libbleu │ │ │ │ ├── libbleu.cpp │ │ │ │ └── module.cpp │ │ ├── criterions │ │ │ ├── __init__.py │ │ │ ├── adaptive_loss.py │ │ │ ├── composite_loss.py │ │ │ ├── cross_entropy.py │ │ │ ├── fairseq_criterion.py │ │ │ ├── label_smoothed_cross_entropy.py │ │ │ └── masked_lm_loss.py │ │ ├── data │ │ │ ├── __init__.py │ │ │ ├── backtranslation_dataset.py │ │ │ ├── block_pair_dataset.py │ │ │ ├── concat_dataset.py │ │ │ ├── data_utils.py │ │ │ ├── dictionary.py │ │ │ ├── fairseq_dataset.py │ │ │ ├── indexed_dataset.py │ │ │ ├── iterators.py │ │ │ ├── language_pair_dataset.py │ │ │ ├── lm_context_window_dataset.py │ │ │ ├── masked_lm_dataset.py │ │ │ ├── masked_lm_dictionary.py │ │ │ ├── monolingual_dataset.py │ │ │ ├── multi_corpus_sampled_dataset.py │ │ │ ├── noising.py │ │ │ ├── round_robin_zip_datasets.py │ │ │ ├── token_block_dataset.py │ │ │ ├── transform_eos_dataset.py │ │ │ └── transform_eos_lang_pair_dataset.py │ │ ├── distributed_utils.py │ │ ├── file_utils.py │ │ ├── legacy_distributed_data_parallel.py │ │ ├── meters.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── composite_encoder.py │ │ │ ├── distributed_fairseq_model.py │ │ │ ├── fairseq_decoder.py │ │ │ ├── fairseq_encoder.py │ │ │ ├── fairseq_incremental_decoder.py │ │ │ ├── fairseq_model.py │ │ │ ├── fconv.py │ │ │ ├── fconv_lm.py │ │ │ ├── fconv_self_att.py │ │ │ ├── lightconv.py │ │ │ ├── lightconv_lm.py │ │ │ ├── lstm.py │ │ │ ├── masked_lm.py │ │ │ ├── multilingual_transformer.py │ │ │ ├── transformer.py │ │ │ ├── transformer_from_pretrained_xlm.py │ │ │ └── transformer_lm.py │ │ ├── modules │ │ │ ├── __init__.py │ │ │ ├── adaptive_input.py │ │ │ ├── adaptive_softmax.py │ │ │ ├── beamable_mm.py │ │ │ ├── character_token_embedder.py │ │ │ ├── conv_tbc.py │ │ │ ├── downsampled_multihead_attention.py │ │ │ ├── dynamic_convolution.py │ │ │ ├── gelu.py │ │ │ ├── grad_multiply.py │ │ │ ├── highway.py │ │ │ ├── layer_norm.py │ │ │ ├── learned_positional_embedding.py │ │ │ ├── lightweight_convolution.py │ │ │ ├── linearized_convolution.py │ │ │ ├── logsumexp_moe.py │ │ │ ├── mean_pool_gating_network.py │ │ │ ├── multihead_attention.py │ │ │ ├── positional_embedding.py │ │ │ ├── scalar_bias.py │ │ │ ├── sinusoidal_positional_embedding.py │ │ │ ├── transformer_sentence_encoder.py │ │ │ ├── transformer_sentence_encoder_layer.py │ │ │ └── unfold.py │ │ ├── optim │ │ │ ├── __init__.py │ │ │ ├── adadelta.py │ │ │ ├── adafactor.py │ │ │ ├── adagrad.py │ │ │ ├── adam.py │ │ │ ├── fairseq_optimizer.py │ │ │ ├── fp16_optimizer.py │ │ │ ├── lamb.py │ │ │ ├── lr_scheduler │ │ │ │ ├── __init__.py │ │ │ │ ├── cosine_lr_scheduler.py │ │ │ │ ├── fairseq_lr_scheduler.py │ │ │ │ ├── fixed_schedule.py │ │ │ │ ├── inverse_square_root_schedule.py │ │ │ │ ├── polynomial_decay_schedule.py │ │ │ │ ├── reduce_lr_on_plateau.py │ │ │ │ └── triangular_lr_scheduler.py │ │ │ ├── nag.py │ │ │ └── sgd.py │ │ ├── options.py │ │ ├── pdb.py │ │ ├── progress_bar.py │ │ ├── registry.py │ │ ├── search.py │ │ ├── sequence_generator.py │ │ ├── sequence_generator_ensemble.py │ │ ├── sequence_scorer.py │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── cross_lingual_lm.py │ │ │ ├── fairseq_task.py │ │ │ ├── language_modeling.py │ │ │ ├── masked_lm.py │ │ │ ├── multilingual_translation.py │ │ │ ├── 
semisupervised_translation.py │ │ │ ├── translation.py │ │ │ ├── translation_from_pretrained_xlm.py │ │ │ └── translation_moe.py │ │ ├── tokenizer.py │ │ ├── trainer.py │ │ └── utils.py │ ├── meter.py │ ├── model.py │ ├── nsml_model │ │ └── best │ │ │ └── model │ │ │ ├── model.pt │ │ │ └── vocab.txt │ ├── requirements.txt │ ├── temp.py │ ├── tokenizer.py │ ├── train.py │ └── wordpiece.py ├── 487 │ ├── README.md │ ├── asdf.ipynb │ ├── data_loader.py │ ├── dataset.py │ ├── evaluation.py │ ├── main.py │ ├── model.py │ ├── noising.py │ ├── nsml_model │ │ └── best │ │ │ └── model │ │ │ ├── model.pt │ │ │ └── vocab.txt │ ├── pretrain_dataset.py │ ├── pykospacing │ │ ├── __init__.py │ │ ├── embedding_maker.py │ │ ├── kospacing.py │ │ ├── pykos.py │ │ └── resources │ │ │ ├── __init__.py │ │ │ ├── dicts │ │ │ └── c2v.dic │ │ │ └── models │ │ │ └── kospacing │ ├── requirements.txt │ ├── tokenizer.py │ ├── train.py │ ├── transformers_encoder_decoder.py │ └── utils │ │ ├── loss.py │ │ ├── meter.py │ │ ├── preprocess.py │ │ └── utils.py ├── 1062 │ ├── README.md │ ├── data_loader.py │ ├── dataset.py │ ├── evaluation.py │ ├── meter.py │ ├── model.py │ ├── model_AR.py │ ├── nsml_model │ │ └── 17143 │ │ │ └── model │ │ │ ├── model.pt │ │ │ ├── model_2.pt │ │ │ └── vocab.txt │ ├── requirements.txt │ ├── tokenizer.py │ └── train.py ├── command.txt └── rank.txt ├── 2-5 ├── 582 │ ├── README.md │ ├── arcface.py │ ├── data_loader.py │ ├── dataset.py │ ├── evaluation.py │ ├── main.py │ ├── model.py │ ├── nsml_model │ │ └── final │ │ │ └── model │ │ │ ├── model.pt │ │ │ └── model2.pt │ ├── predict.py │ ├── requirements.txt │ ├── sam.py │ ├── setup.py │ └── trainer.py ├── 756 │ ├── README.md │ ├── data_loader.py │ ├── dataset.py │ ├── evaluation.py │ ├── main.py │ ├── main2.py │ ├── main2_original.py │ ├── main_old.py │ ├── model.py │ ├── nsml_model │ │ └── king_of_ai │ │ │ └── model │ │ │ └── model.pt │ ├── nsml_package.txt │ ├── predict.py │ ├── setup.py │ ├── shell.py │ └── train.py ├── command.txt └── rank.txt ├── Copyright owner ├── LICENSE ├── NOTICE └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /2-4/1062/README.md: -------------------------------------------------------------------------------- 1 | # 2-4 스마트에디터의 그래머리 (문장 교정/교열) 기능 고도화 2 | 3 | - 네이버 사용자가 작성한 문장을 문법적으로 맞는 문장으로 교정/교열 하는 모델을 만듭니다. 4 | 5 | 6 | ## 데이터 7 | - 학습데이터 8 | * `train/train_data/train_data`: 문법 오류가 섞인 문장 9 | * `train/train_data/train_annotation`: 문법 오류에 대한 annotation 10 | * `train/train_data/train_corpus`: 교정되지 않은 문장 11 | * `train/train_label`: 교정/교열된 문장 12 | - 평가 데이터 13 | * `test/test_data`: 문법 오류가 섞인 문장 14 | * `test/test_label`: 교정/교열된 문장 15 | - 평가 더미 데이 16 | * `test_submit/test_data`: 문법 오류가 섞인 문장 17 | * `test_submit/test_label`: 교정/교열된 문장 18 | - 문법 오류가 섞인 문장들(`*_data`)과 교정/교열된 문장들(`*_label`)은 line-by-line으로 매핑됩니다. 
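A minimal sketch of how the line-by-line mapping can be consumed (the paths follow the layout above; `root` is an assumed base directory, and `read_strings` mirrors the helper in `data_loader.py`):

```python
import os

def read_strings(path):
    # same behaviour as data_loader.read_strings: one sentence per line
    with open(path, encoding="utf-8") as f:
        return f.read().splitlines()

root = "train"  # assumed base directory of the training split
noisy = read_strings(os.path.join(root, "train_data", "train_data"))  # sentences with errors
corrected = read_strings(os.path.join(root, "train_label"))           # corrected sentences
assert len(noisy) == len(corrected)  # *_data and *_label map line by line
pairs = list(zip(noisy, corrected))
```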
19 | 20 | 21 | ## 평가 22 | - Corpus-level [GLEU](https://www.aclweb.org/anthology/P07-1044/) score로 평가 23 | - [`nltk.translate.gleu_score.corpus_gleu`](https://www.nltk.org/_modules/nltk/translate/gleu_score.html) 스크립트를 사용 24 | 25 | 26 | ## 베이스라인 27 | - [Transformer](https://arxiv.org/abs/1706.03762) 기반의 sequence-to-sequence 모델 28 | - 대량의 unlabeled corpus (`train_corpus`)를 활용하여 pre-training (또는 semi-supervised learning) 방식으로 학습하거나 에러 타입 (`train_annotation`)을 예측하도록 multi-task learning을 하면 추가 성능 향상을 얻을 수도 있습니다. 29 | 30 | 31 | ## 모델 학습 32 | ``` 33 | nsml run -d airush2021-2-4 -e train.py 34 | ``` 35 | - 필요에 따라 `-a`로 argument 입력 가능 36 | 37 | 38 | ## 모델 제출 39 | ``` 40 | nsml submit {SESSION} {CHECKPOINT} 41 | ``` 42 | 43 | ## 추가 정보 44 | 45 | ### Annotation 설명 46 | 47 | - "perfect" : 교정/교열이 필요없는 완벽한 문장 48 | - "spacing" : 띄어쓰기 교정 49 | - "pasting" : 붙여쓰기 교정 50 | - "tense" : 시제 교정 51 | - "honorific" : 경어체 교정 52 | - "punctuation" : 구두점 교정 53 | - "typo" : 오탈자 교정 (위 분류에 없는 경우 모두 수렴) 54 | - "advanced" : 윤문 처리 (더 매끄러운 문장) 55 | -------------------------------------------------------------------------------- /2-4/1062/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nsml import DATASET_PATH 4 | 5 | 6 | def read_strings(input_file): 7 | return open(input_file, "r").read().splitlines() 8 | 9 | 10 | def write_strings(output_file, data): 11 | with open(output_file, "w") as f: 12 | for x in data: 13 | f.write(str(x) + "\n") 14 | 15 | 16 | def test_data_loader(root_path): 17 | return read_strings(os.path.join(root_path, 'test', 'test_data')) 18 | 19 | 20 | def feed_infer(output_file, infer_func): 21 | prediciton = infer_func(test_data_loader(DATASET_PATH)) 22 | print('write output') 23 | write_strings(output_file, prediciton) 24 | if os.stat(output_file).st_size == 0: 25 | raise AssertionError('output result of inference is nothing') 26 | -------------------------------------------------------------------------------- /2-4/1062/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from nltk.translate.gleu_score import corpus_gleu 4 | 5 | 6 | def read_strings(input_file): 7 | return open(input_file, "r").read().splitlines() 8 | 9 | 10 | def read_prediction(prediction_file): 11 | return read_strings(prediction_file) 12 | 13 | 14 | def read_ground_truth(ground_truth_file): 15 | return read_strings(ground_truth_file) 16 | 17 | 18 | def em(prediction, ground_truth): 19 | return sum([x == y for x, y in zip(prediction, ground_truth)]) / len(ground_truth) * 100. 20 | 21 | 22 | def gleu(prediction, ground_truth): 23 | return corpus_gleu([[x] for x in ground_truth], prediction) * 100. 
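# Illustrative note on the metric (an aside, not used by evaluation_metrics below):
# corpus_gleu expects each hypothesis and each reference to be a sequence of tokens.
# Since plain strings are passed in gleu() above, the n-grams are effectively
# character n-grams. A word-level variant would tokenize first, e.g.:
#
#   def gleu_word_level(prediction, ground_truth):
#       return corpus_gleu([[x.split()] for x in ground_truth],
#                          [p.split() for p in prediction]) * 100.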
24 | 25 | 26 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 27 | try: 28 | prediction = read_prediction(prediction_file) 29 | ground_truth = read_ground_truth(ground_truth_file) 30 | score = gleu(prediction, ground_truth) 31 | except: 32 | score = 0.0 33 | return score 34 | 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--prediction', type=str, default='pred.txt') 39 | parser.add_argument('--test_label_path', type=str) 40 | args = parser.parse_args() 41 | 42 | print(evaluation_metrics(args.prediction, args.test_label_path)) 43 | -------------------------------------------------------------------------------- /2-4/1062/meter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | 5 | class Meter(object): 6 | def __init__(self): 7 | self.init() 8 | 9 | def init(self): 10 | self.start = time.time() 11 | self.cnt_add = 0 12 | self.tot_loss = 0. 13 | self.cnt_sent = 0 14 | self.cnt_token = 0 15 | 16 | def add(self, loss, n_sent, n_token): 17 | self.cnt_add += 1 18 | self.tot_loss += loss * n_sent 19 | self.cnt_sent += n_sent 20 | self.cnt_token += n_token 21 | 22 | def average(self): 23 | loss_sent = self.tot_loss / self.cnt_sent if self.cnt_sent != 0 else 0. 24 | loss_token = self.tot_loss / self.cnt_token if self.cnt_token != 0 else 0. 25 | return loss_sent, loss_token 26 | 27 | def elapsed_time(self): 28 | return time.time() - self.start 29 | 30 | def print_str(self, time_avg=False): 31 | loss_sent, loss_token = self.average() 32 | et = self.elapsed_time() 33 | time_str = f"{et * 1000. / self.cnt_add:6.2f} ms/batch" if time_avg else f"{et:6.2f} s" 34 | return f"{time_str} | loss_sent {loss_sent:6.2f} | token_ppl {math.exp(loss_token):6.2f}" 35 | 36 | 37 | -------------------------------------------------------------------------------- /2-4/1062/nsml_model/17143/model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/1062/nsml_model/17143/model/model.pt -------------------------------------------------------------------------------- /2-4/1062/nsml_model/17143/model/model_2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/1062/nsml_model/17143/model/model_2.pt -------------------------------------------------------------------------------- /2-4/1062/requirements.txt: -------------------------------------------------------------------------------- 1 | #nsml: scatterlab/python-mecab-ko:3.7-circleci-node 2 | g2pk 3 | pandas 4 | nltk 5 | 6 | torch==1.7.1+cu110 7 | torchvision==0.8.2+cu110 8 | torchaudio===0.7.2 9 | --find-links https://download.pytorch.org/whl/torch_stable.html 10 | -------------------------------------------------------------------------------- /2-4/1062/tokenizer.py: -------------------------------------------------------------------------------- 1 | from collections import Counter, defaultdict 2 | 3 | from data_loader import read_strings, write_strings 4 | 5 | SPECIAL_TOKENS = ['', '', '', ''] 6 | 7 | 8 | class CharTokenizer(object): 9 | def __init__(self, i2c): 10 | self.init(i2c) 11 | 12 | def __len__(self): 13 | return len(self.vocab) 14 | 15 | def __call__(self, sent): 16 | return [self.vocab[c] for c in sent] 17 | 18 | def 
init(self, i2c): 19 | self.i2c = i2c 20 | self.vocab = defaultdict(int) 21 | self.vocab.update({c: i for i, c in enumerate(i2c)}) 22 | 23 | @classmethod 24 | def from_strings(cls, strings, vocab_size): 25 | char_counter = Counter() 26 | for x in strings: 27 | char_counter.update(x) 28 | # print(len(char_counter)) # 2366 29 | i2c = SPECIAL_TOKENS 30 | i2c += [c for c, _ in char_counter.most_common(vocab_size - len(SPECIAL_TOKENS))] 31 | return cls(i2c) 32 | 33 | def save(self, path): 34 | write_strings(path, self.i2c) 35 | 36 | def load(self, path): 37 | i2c = read_strings(path) 38 | self.init(i2c) 39 | -------------------------------------------------------------------------------- /2-4/386/README.md: -------------------------------------------------------------------------------- 1 | # 2-4 스마트에디터의 그래머리 (문장 교정/교열) 기능 고도화 2 | 3 | - 네이버 사용자가 작성한 문장을 문법적으로 맞는 문장으로 교정/교열 하는 모델을 만듭니다. 4 | 5 | 6 | ## 데이터 7 | - 학습데이터 8 | * `train/train_data/train_data`: 문법 오류가 섞인 문장 9 | * `train/train_data/train_annotation`: 문법 오류에 대한 annotation 10 | * `train/train_data/train_corpus`: 교정되지 않은 문장 11 | * `train/train_label`: 교정/교열된 문장 12 | - 평가 데이터 13 | * `test/test_data`: 문법 오류가 섞인 문장 14 | * `test/test_label`: 교정/교열된 문장 15 | - 평가 더미 데이 16 | * `test_submit/test_data`: 문법 오류가 섞인 문장 17 | * `test_submit/test_label`: 교정/교열된 문장 18 | - 문법 오류가 섞인 문장들(`*_data`)과 교정/교열된 문장들(`*_label`)은 line-by-line으로 매핑됩니다. 19 | 20 | 21 | ## 평가 22 | - Corpus-level [GLEU](https://www.aclweb.org/anthology/P07-1044/) score로 평가 23 | - [`nltk.translate.gleu_score.corpus_gleu`](https://www.nltk.org/_modules/nltk/translate/gleu_score.html) 스크립트를 사용 24 | 25 | 26 | ## 베이스라인 27 | - [Transformer](https://arxiv.org/abs/1706.03762) 기반의 sequence-to-sequence 모델 28 | - 대량의 unlabeled corpus (`train_corpus`)를 활용하여 pre-training (또는 semi-supervised learning) 방식으로 학습하거나 에러 타입 (`train_annotation`)을 예측하도록 multi-task learning을 하면 추가 성능 향상을 얻을 수도 있습니다. 
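One concrete way to set up the multi-task idea above is sketched below (illustrative only, not the model in this repository; module names, dimensions, and the 0.1 loss weight are assumptions, and the eight classes follow the annotation list in the 추가 정보 section):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# perfect, spacing, pasting, tense, honorific, punctuation, typo, advanced
NUM_ERROR_TYPES = 8

class MultiTaskHeads(nn.Module):
    """Correction head (token logits) plus an auxiliary error-type head."""
    def __init__(self, hidden_dim, vocab_size, num_error_types=NUM_ERROR_TYPES):
        super().__init__()
        self.lm_head = nn.Linear(hidden_dim, vocab_size)
        self.annotation_head = nn.Linear(hidden_dim, num_error_types)

    def forward(self, decoder_states, encoder_pooled):
        # decoder_states: (batch, tgt_len, hidden), encoder_pooled: (batch, hidden)
        return self.lm_head(decoder_states), self.annotation_head(encoder_pooled)

def multitask_loss(lm_logits, target_ids, ann_logits, ann_labels, pad_idx, alpha=0.1):
    # main loss: generate the corrected sentence; auxiliary loss: predict the error type
    lm_loss = F.cross_entropy(lm_logits.view(-1, lm_logits.size(-1)),
                              target_ids.view(-1), ignore_index=pad_idx)
    ann_loss = F.cross_entropy(ann_logits, ann_labels)
    return lm_loss + alpha * ann_loss
```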
29 | 30 | 31 | ## 모델 학습 32 | ``` 33 | nsml run -d airush2021-2-4 -e train.py 34 | ``` 35 | - 필요에 따라 `-a`로 argument 입력 가능 36 | 37 | 38 | ## 모델 제출 39 | ``` 40 | nsml submit {SESSION} {CHECKPOINT} 41 | ``` 42 | 43 | ## 추가 정보 44 | 45 | ### Annotation 설명 46 | 47 | - "perfect" : 교정/교열이 필요없는 완벽한 문장 48 | - "spacing" : 띄어쓰기 교정 49 | - "pasting" : 붙여쓰기 교정 50 | - "tense" : 시제 교정 51 | - "honorific" : 경어체 교정 52 | - "punctuation" : 구두점 교정 53 | - "typo" : 오탈자 교정 (위 분류에 없는 경우 모두 수렴) 54 | - "advanced" : 윤문 처리 (더 매끄러운 문장) 55 | -------------------------------------------------------------------------------- /2-4/386/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nsml import DATASET_PATH 4 | 5 | 6 | def read_strings(input_file): 7 | return open(input_file, "r").read().splitlines() 8 | 9 | 10 | def write_strings(output_file, data): 11 | with open(output_file, "w") as f: 12 | for x in data: 13 | f.write(str(x) + "\n") 14 | 15 | 16 | def test_data_loader(root_path): 17 | return read_strings(os.path.join(root_path, 'test', 'test_data')) 18 | 19 | 20 | def feed_infer(output_file, infer_func): 21 | prediciton = infer_func(test_data_loader(DATASET_PATH)) 22 | print('write output') 23 | write_strings(output_file, prediciton) 24 | if os.stat(output_file).st_size == 0: 25 | raise AssertionError('output result of inference is nothing') 26 | -------------------------------------------------------------------------------- /2-4/386/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from nltk.translate.gleu_score import corpus_gleu 4 | 5 | 6 | def read_strings(input_file): 7 | return open(input_file, "r").read().splitlines() 8 | 9 | 10 | def read_prediction(prediction_file): 11 | return read_strings(prediction_file) 12 | 13 | 14 | def read_ground_truth(ground_truth_file): 15 | return read_strings(ground_truth_file) 16 | 17 | 18 | def em(prediction, ground_truth): 19 | return sum([x == y for x, y in zip(prediction, ground_truth)]) / len(ground_truth) * 100. 20 | 21 | 22 | def gleu(prediction, ground_truth): 23 | return corpus_gleu([[x] for x in ground_truth], prediction) * 100. 24 | 25 | 26 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 27 | try: 28 | prediction = read_prediction(prediction_file) 29 | ground_truth = read_ground_truth(ground_truth_file) 30 | score = gleu(prediction, ground_truth) 31 | except: 32 | score = 0.0 33 | return score 34 | 35 | 36 | if __name__ == '__main__': 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument('--prediction', type=str, default='pred.txt') 39 | parser.add_argument('--test_label_path', type=str) 40 | args = parser.parse_args() 41 | 42 | print(evaluation_metrics(args.prediction, args.test_label_path)) 43 | -------------------------------------------------------------------------------- /2-4/386/fairseq/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | __all__ = ['pdb'] 9 | __version__ = '0.6.2' 10 | 11 | import fairseq.criterions 12 | import fairseq.models 13 | import fairseq.modules 14 | import fairseq.optim 15 | import fairseq.optim.lr_scheduler 16 | import fairseq.pdb 17 | import fairseq.tasks 18 | -------------------------------------------------------------------------------- /2-4/386/fairseq/binarizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from collections import Counter 9 | import os 10 | 11 | from fairseq.tokenizer import tokenize_line 12 | # from bert import BertTokenizer 13 | import torch 14 | def safe_readline(f): 15 | pos = f.tell() 16 | while True: 17 | try: 18 | return f.readline() 19 | except UnicodeDecodeError: 20 | pos -= 1 21 | f.seek(pos) # search where this character begins 22 | 23 | 24 | class Binarizer: 25 | 26 | @staticmethod 27 | def binarize(filename, dict, consumer, tokenize=tokenize_line, append_eos=True, reverse_order=False, 28 | offset=0, end=-1): 29 | nseq, ntok = 0, 0 30 | replaced = Counter() 31 | 32 | def replaced_consumer(word, idx): 33 | if idx == dict.unk_index and word != dict.unk_word: 34 | replaced.update([word]) 35 | 36 | with open(filename, 'r', encoding='utf-8') as f: 37 | f.seek(offset) 38 | # next(f) breaks f.tell(), hence readline() must be used 39 | line = safe_readline(f) 40 | while line: 41 | if end > 0 and f.tell() > end: 42 | break 43 | if isinstance(dict, BertTokenizer): 44 | line = line.strip() 45 | line = '{} {} {}'.format('[CLS]', line, '[SEP]') 46 | tokenizedline = dict.tokenize(line) 47 | if len(tokenizedline) > dict.max_len: 48 | tokenizedline = tokenizedline[:dict.max_len-1] 49 | tokenizedline.append('[SEP]') 50 | words = dict.convert_tokens_to_ids(tokenizedline) 51 | nwords = len(words) 52 | ids = torch.IntTensor(nwords) 53 | for i, word in enumerate(words): 54 | ids[i] = word 55 | replaced_consumer(tokenizedline[i], word) 56 | else: 57 | ids = dict.encode_line( 58 | line=line, 59 | line_tokenizer=tokenize, 60 | add_if_not_exist=False, 61 | consumer=replaced_consumer, 62 | append_eos=append_eos, 63 | reverse_order=reverse_order, 64 | ) 65 | nseq += 1 66 | ntok += len(ids) 67 | consumer(ids) 68 | line = f.readline() 69 | return {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced} 70 | 71 | @staticmethod 72 | def find_offsets(filename, num_chunks): 73 | with open(filename, 'r', encoding='utf-8') as f: 74 | size = os.fstat(f.fileno()).st_size 75 | chunk_size = size // num_chunks 76 | offsets = [0 for _ in range(num_chunks + 1)] 77 | for i in range(1, num_chunks): 78 | f.seek(chunk_size * i) 79 | safe_readline(f) 80 | offsets[i] = f.tell() 81 | return offsets 82 | -------------------------------------------------------------------------------- /2-4/386/fairseq/bleu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | import ctypes 9 | import math 10 | import torch 11 | 12 | try: 13 | from fairseq import libbleu 14 | except ImportError as e: 15 | import sys 16 | sys.stderr.write('ERROR: missing libbleu.so. run `pip install --editable .`\n') 17 | raise e 18 | 19 | 20 | C = ctypes.cdll.LoadLibrary(libbleu.__file__) 21 | 22 | 23 | class BleuStat(ctypes.Structure): 24 | _fields_ = [ 25 | ('reflen', ctypes.c_size_t), 26 | ('predlen', ctypes.c_size_t), 27 | ('match1', ctypes.c_size_t), 28 | ('count1', ctypes.c_size_t), 29 | ('match2', ctypes.c_size_t), 30 | ('count2', ctypes.c_size_t), 31 | ('match3', ctypes.c_size_t), 32 | ('count3', ctypes.c_size_t), 33 | ('match4', ctypes.c_size_t), 34 | ('count4', ctypes.c_size_t), 35 | ] 36 | 37 | 38 | class SacrebleuScorer(object): 39 | def __init__(self): 40 | import sacrebleu 41 | self.sacrebleu = sacrebleu 42 | self.reset() 43 | 44 | def reset(self, one_init=False): 45 | if one_init: 46 | raise NotImplementedError 47 | self.ref = [] 48 | self.sys = [] 49 | 50 | def add_string(self, ref, pred): 51 | self.ref.append(ref) 52 | self.sys.append(pred) 53 | 54 | def score(self, order=4): 55 | return self.result_string(order).score 56 | 57 | def result_string(self, order=4): 58 | if order != 4: 59 | raise NotImplementedError 60 | return self.sacrebleu.corpus_bleu(self.sys, [self.ref]) 61 | 62 | 63 | class Scorer(object): 64 | def __init__(self, pad, eos, unk): 65 | self.stat = BleuStat() 66 | self.pad = pad 67 | self.eos = eos 68 | self.unk = unk 69 | self.reset() 70 | 71 | def reset(self, one_init=False): 72 | if one_init: 73 | C.bleu_one_init(ctypes.byref(self.stat)) 74 | else: 75 | C.bleu_zero_init(ctypes.byref(self.stat)) 76 | 77 | def add(self, ref, pred): 78 | if not isinstance(ref, torch.IntTensor): 79 | raise TypeError('ref must be a torch.IntTensor (got {})' 80 | .format(type(ref))) 81 | if not isinstance(pred, torch.IntTensor): 82 | raise TypeError('pred must be a torch.IntTensor(got {})' 83 | .format(type(pred))) 84 | 85 | # don't match unknown words 86 | rref = ref.clone() 87 | assert not rref.lt(0).any() 88 | rref[rref.eq(self.unk)] = -999 89 | 90 | rref = rref.contiguous().view(-1) 91 | pred = pred.contiguous().view(-1) 92 | 93 | C.bleu_add( 94 | ctypes.byref(self.stat), 95 | ctypes.c_size_t(rref.size(0)), 96 | ctypes.c_void_p(rref.data_ptr()), 97 | ctypes.c_size_t(pred.size(0)), 98 | ctypes.c_void_p(pred.data_ptr()), 99 | ctypes.c_int(self.pad), 100 | ctypes.c_int(self.eos)) 101 | 102 | def score(self, order=4): 103 | psum = sum(math.log(p) if p > 0 else float('-Inf') 104 | for p in self.precision()[:order]) 105 | return self.brevity() * math.exp(psum / order) * 100 106 | 107 | def precision(self): 108 | def ratio(a, b): 109 | return a / b if b > 0 else 0 110 | 111 | return [ 112 | ratio(self.stat.match1, self.stat.count1), 113 | ratio(self.stat.match2, self.stat.count2), 114 | ratio(self.stat.match3, self.stat.count3), 115 | ratio(self.stat.match4, self.stat.count4), 116 | ] 117 | 118 | def brevity(self): 119 | r = self.stat.reflen / self.stat.predlen 120 | return min(1, math.exp(1 - r)) 121 | 122 | def result_string(self, order=4): 123 | assert order <= 4, "BLEU scores for order > 4 aren't supported" 124 | fmt = 'BLEU{} = {:2.2f}, {:2.1f}' 125 | for _ in range(1, order): 126 | fmt += '/{:2.1f}' 127 | fmt += ' (BP={:.3f}, ratio={:.3f}, syslen={}, reflen={})' 128 | bleup = [p * 100 for p in self.precision()[:order]] 129 | return fmt.format(order, self.score(order=order), *bleup, 130 | self.brevity(), self.stat.predlen/self.stat.reflen, 131 | 
self.stat.predlen, self.stat.reflen) 132 | -------------------------------------------------------------------------------- /2-4/386/fairseq/clib/libbleu/libbleu.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | typedef struct 15 | { 16 | size_t reflen; 17 | size_t predlen; 18 | size_t match1; 19 | size_t count1; 20 | size_t match2; 21 | size_t count2; 22 | size_t match3; 23 | size_t count3; 24 | size_t match4; 25 | size_t count4; 26 | } bleu_stat; 27 | 28 | // left trim (remove pad) 29 | void bleu_ltrim(size_t* len, int** sent, int pad) { 30 | size_t start = 0; 31 | while(start < *len) { 32 | if (*(*sent + start) != pad) { break; } 33 | start++; 34 | } 35 | *sent += start; 36 | *len -= start; 37 | } 38 | 39 | // right trim remove (eos) 40 | void bleu_rtrim(size_t* len, int** sent, int pad, int eos) { 41 | size_t end = *len - 1; 42 | while (end > 0) { 43 | if (*(*sent + end) != eos && *(*sent + end) != pad) { break; } 44 | end--; 45 | } 46 | *len = end + 1; 47 | } 48 | 49 | // left and right trim 50 | void bleu_trim(size_t* len, int** sent, int pad, int eos) { 51 | bleu_ltrim(len, sent, pad); 52 | bleu_rtrim(len, sent, pad, eos); 53 | } 54 | 55 | size_t bleu_hash(int len, int* data) { 56 | size_t h = 14695981039346656037ul; 57 | size_t prime = 0x100000001b3; 58 | char* b = (char*) data; 59 | size_t blen = sizeof(int) * len; 60 | 61 | while (blen-- > 0) { 62 | h ^= *b++; 63 | h *= prime; 64 | } 65 | 66 | return h; 67 | } 68 | 69 | void bleu_addngram( 70 | size_t *ntotal, size_t *nmatch, size_t n, 71 | size_t reflen, int* ref, size_t predlen, int* pred) { 72 | 73 | if (predlen < n) { return; } 74 | 75 | predlen = predlen - n + 1; 76 | (*ntotal) += predlen; 77 | 78 | if (reflen < n) { return; } 79 | 80 | reflen = reflen - n + 1; 81 | 82 | std::map count; 83 | while (predlen > 0) { 84 | size_t w = bleu_hash(n, pred++); 85 | count[w]++; 86 | predlen--; 87 | } 88 | 89 | while (reflen > 0) { 90 | size_t w = bleu_hash(n, ref++); 91 | if (count[w] > 0) { 92 | (*nmatch)++; 93 | count[w] -=1; 94 | } 95 | reflen--; 96 | } 97 | } 98 | 99 | extern "C" { 100 | 101 | void bleu_zero_init(bleu_stat* stat) { 102 | std::memset(stat, 0, sizeof(bleu_stat)); 103 | } 104 | 105 | void bleu_one_init(bleu_stat* stat) { 106 | bleu_zero_init(stat); 107 | stat->count1 = 0; 108 | stat->count2 = 1; 109 | stat->count3 = 1; 110 | stat->count4 = 1; 111 | stat->match1 = 0; 112 | stat->match2 = 1; 113 | stat->match3 = 1; 114 | stat->match4 = 1; 115 | } 116 | 117 | void bleu_add( 118 | bleu_stat* stat, 119 | size_t reflen, int* ref, size_t predlen, int* pred, int pad, int eos) { 120 | 121 | bleu_trim(&reflen, &ref, pad, eos); 122 | bleu_trim(&predlen, &pred, pad, eos); 123 | stat->reflen += reflen; 124 | stat->predlen += predlen; 125 | 126 | bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred); 127 | bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred); 128 | bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred); 129 | bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred); 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /2-4/386/fairseq/clib/libbleu/module.cpp: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2017-present, Facebook, Inc. 3 | * All rights reserved. 4 | * 5 | * This source code is licensed under the license found in the 6 | * LICENSE file in the root directory of this source tree. 7 | */ 8 | 9 | #include 10 | 11 | 12 | static PyMethodDef method_def[] = { 13 | {NULL, NULL, 0, NULL} 14 | }; 15 | 16 | static struct PyModuleDef module_def = { 17 | PyModuleDef_HEAD_INIT, 18 | "libbleu", /* name of module */ 19 | NULL, /* module documentation, may be NULL */ 20 | -1, /* size of per-interpreter state of the module, 21 | or -1 if the module keeps state in global variables. */ 22 | method_def 23 | }; 24 | 25 | 26 | #if PY_MAJOR_VERSION == 2 27 | PyMODINIT_FUNC init_libbleu() 28 | #else 29 | PyMODINIT_FUNC PyInit_libbleu() 30 | #endif 31 | { 32 | PyObject *m = PyModule_Create(&module_def); 33 | if (!m) { 34 | return NULL; 35 | } 36 | return m; 37 | } 38 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import importlib 9 | import os 10 | 11 | from fairseq import registry 12 | from fairseq.criterions.fairseq_criterion import FairseqCriterion 13 | 14 | 15 | build_criterion, register_criterion, CRITERION_REGISTRY = registry.setup_registry( 16 | '--criterion', 17 | base_class=FairseqCriterion, 18 | default='cross_entropy', 19 | ) 20 | 21 | 22 | # automatically import any Python files in the criterions/ directory 23 | for file in os.listdir(os.path.dirname(__file__)): 24 | if file.endswith('.py') and not file.startswith('_'): 25 | module = file[:file.find('.py')] 26 | importlib.import_module('fairseq.criterions.' + module) 27 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/adaptive_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | 9 | import math 10 | import torch.nn.functional as F 11 | 12 | from fairseq import utils 13 | from . import FairseqCriterion, register_criterion 14 | 15 | 16 | @register_criterion('adaptive_loss') 17 | class AdaptiveLoss(FairseqCriterion): 18 | """This is an implementation of the loss function accompanying the adaptive softmax approximation for 19 | graphical processing units (GPU), described in the paper "Efficient softmax approximation for GPUs" 20 | (http://arxiv.org/abs/1609.04309).""" 21 | 22 | def __init__(self, args, task): 23 | super().__init__(args, task) 24 | 25 | if args.ddp_backend == 'c10d': 26 | raise Exception( 27 | 'AdaptiveLoss is not compatible with the c10d ' 28 | 'version of DistributedDataParallel. Please use ' 29 | '`--ddp-backend=no_c10d` instead.' 30 | ) 31 | 32 | def forward(self, model, sample, reduce=True): 33 | """Compute the loss for the given sample. 
34 | 35 | Returns a tuple with three elements: 36 | 1) the loss 37 | 2) the sample size, which is used as the denominator for the gradient 38 | 3) logging outputs to display while training 39 | """ 40 | 41 | assert hasattr(model.decoder, 'adaptive_softmax') and model.decoder.adaptive_softmax is not None 42 | adaptive_softmax = model.decoder.adaptive_softmax 43 | 44 | net_output = model(**sample['net_input']) 45 | orig_target = model.get_targets(sample, net_output) 46 | 47 | nsentences = orig_target.size(0) 48 | orig_target = orig_target.view(-1) 49 | 50 | bsz = orig_target.size(0) 51 | 52 | logits, target = adaptive_softmax(net_output[0], orig_target) 53 | assert len(target) == len(logits) 54 | 55 | loss = net_output[0].new(1 if reduce else bsz).zero_() 56 | 57 | for i in range(len(target)): 58 | if target[i] is not None: 59 | assert (target[i].min() >= 0 and target[i].max() <= logits[i].size(1)) 60 | loss += F.cross_entropy( 61 | logits[i], 62 | target[i], 63 | ignore_index=self.padding_idx, 64 | reduction='sum' if reduce else 'none', 65 | ) 66 | 67 | orig = utils.strip_pad(orig_target, self.padding_idx) 68 | ntokens = orig.numel() 69 | sample_size = sample['target'].size(0) if self.args.sentence_avg else ntokens 70 | logging_output = { 71 | 'loss': utils.item(loss.data) if reduce else loss.data, 72 | 'ntokens': ntokens, 73 | 'nsentences': nsentences, 74 | 'sample_size': sample_size, 75 | } 76 | return loss, sample_size, logging_output 77 | 78 | @staticmethod 79 | def aggregate_logging_outputs(logging_outputs): 80 | """Aggregate logging outputs from data parallel training.""" 81 | loss_sum = sum(log.get('loss', 0) for log in logging_outputs) 82 | ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) 83 | nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) 84 | sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) 85 | agg_output = { 86 | 'loss': loss_sum / sample_size / math.log(2), 87 | 'nll_loss': loss_sum / sample_size / math.log(2), 88 | 'ntokens': ntokens, 89 | 'nsentences': nsentences, 90 | 'sample_size': sample_size, 91 | } 92 | if sample_size != ntokens: 93 | agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) 94 | return agg_output 95 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/composite_loss.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from torch import nn 9 | 10 | from fairseq import utils 11 | from . 
import FairseqCriterion, register_criterion 12 | 13 | 14 | @register_criterion('composite_loss') 15 | class CompositeLoss(FairseqCriterion): 16 | """This is a composite loss that, given a list of model outputs and a list of targets, 17 | computes an average of losses for each output-target pair""" 18 | 19 | @staticmethod 20 | def add_args(parser): 21 | """Add criterion-specific arguments to the parser.""" 22 | # fmt: off 23 | parser.add_argument('--underlying-criterion', type=str, metavar='VAL', required=True, 24 | help='underlying criterion to use for the composite loss') 25 | # fmt: on 26 | 27 | @staticmethod 28 | def build_underlying_criterion(args, task): 29 | saved_criterion = args.criterion 30 | args.criterion = args.underlying_criterion 31 | assert saved_criterion != args.underlying_criterion 32 | underlying_criterion = task.build_criterion(args) 33 | args.criterion = saved_criterion 34 | return underlying_criterion 35 | 36 | @classmethod 37 | def build_criterion(cls, args, task): 38 | underlying_criterion = CompositeLoss.build_underlying_criterion(args, task) 39 | 40 | class FakeModel(nn.Module): 41 | 42 | def __init__(self, model, net_out, target): 43 | super().__init__() 44 | self.model = model 45 | self.net_out = net_out 46 | self.target = target 47 | 48 | def forward(self, **unused): 49 | return self.net_out 50 | 51 | def get_normalized_probs(self, net_output, log_probs, sample=None): 52 | return self.model.get_normalized_probs(net_output, log_probs, sample=sample) 53 | 54 | def get_targets(self, *unused): 55 | return self.target 56 | 57 | @property 58 | def decoder(self): 59 | return self.model.decoder 60 | 61 | class _CompositeLoss(FairseqCriterion): 62 | 63 | def __init__(self, args, task, underlying_criterion): 64 | super().__init__(args, task) 65 | self.underlying_criterion = underlying_criterion 66 | 67 | def forward(self, model, sample, reduce=True): 68 | net_outputs = model(**sample['net_input']) 69 | targets = sample['target'] 70 | 71 | bsz = targets[0].size(0) 72 | loss = net_outputs[0][0].new(1 if reduce else bsz).float().zero_() 73 | 74 | sample_size = 0 75 | logging_output = {} 76 | for o, t in zip(net_outputs[0], targets): 77 | m = FakeModel(model, (o, net_outputs[1]), t) 78 | sample['target'] = t 79 | l, ss, logging_output = self.underlying_criterion(m, sample, reduce) 80 | loss += l 81 | sample_size += ss 82 | 83 | loss.div_(len(targets)) 84 | sample_size /= len(targets) 85 | 86 | logging_output['loss'] = utils.item(loss.data) if reduce else loss.data 87 | return loss, sample_size, logging_output 88 | 89 | @staticmethod 90 | def aggregate_logging_outputs(logging_outputs): 91 | return underlying_criterion.__class__.aggregate_logging_outputs(logging_outputs) 92 | 93 | return _CompositeLoss(args, task, underlying_criterion) 94 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import math 9 | import torch.nn.functional as F 10 | 11 | from fairseq import utils 12 | 13 | from . 
import FairseqCriterion, register_criterion 14 | 15 | 16 | @register_criterion('cross_entropy') 17 | class CrossEntropyCriterion(FairseqCriterion): 18 | 19 | def __init__(self, args, task): 20 | super().__init__(args, task) 21 | 22 | def forward(self, model, sample, reduce=True): 23 | """Compute the loss for the given sample. 24 | 25 | Returns a tuple with three elements: 26 | 1) the loss 27 | 2) the sample size, which is used as the denominator for the gradient 28 | 3) logging outputs to display while training 29 | """ 30 | net_output = model(**sample['net_input']) 31 | loss, _ = self.compute_loss(model, net_output, sample, reduce=reduce) 32 | sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] 33 | logging_output = { 34 | 'loss': utils.item(loss.data) if reduce else loss.data, 35 | 'ntokens': sample['ntokens'], 36 | 'nsentences': sample['target'].size(0), 37 | 'sample_size': sample_size, 38 | } 39 | return loss, sample_size, logging_output 40 | 41 | def compute_loss(self, model, net_output, sample, reduce=True): 42 | lprobs = model.get_normalized_probs(net_output, log_probs=True) 43 | lprobs = lprobs.view(-1, lprobs.size(-1)) 44 | target = model.get_targets(sample, net_output).view(-1) 45 | loss = F.nll_loss( 46 | lprobs, 47 | target, 48 | ignore_index=self.padding_idx, 49 | reduction='sum' if reduce else 'none', 50 | ) 51 | return loss, loss 52 | 53 | @staticmethod 54 | def aggregate_logging_outputs(logging_outputs): 55 | """Aggregate logging outputs from data parallel training.""" 56 | loss_sum = sum(log.get('loss', 0) for log in logging_outputs) 57 | ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) 58 | nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) 59 | sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) 60 | agg_output = { 61 | 'loss': loss_sum / sample_size / math.log(2), 62 | 'ntokens': ntokens, 63 | 'nsentences': nsentences, 64 | 'sample_size': sample_size, 65 | } 66 | if sample_size != ntokens: 67 | agg_output['nll_loss'] = loss_sum / ntokens / math.log(2) 68 | return agg_output 69 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/fairseq_criterion.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from torch.nn.modules.loss import _Loss 9 | 10 | 11 | class FairseqCriterion(_Loss): 12 | 13 | def __init__(self, args, task): 14 | super().__init__() 15 | self.args = args 16 | self.padding_idx = task.target_dictionary.pad() 17 | 18 | @staticmethod 19 | def add_args(parser): 20 | """Add criterion-specific arguments to the parser.""" 21 | pass 22 | 23 | @classmethod 24 | def build_criterion(cls, args, task): 25 | return cls(args, task) 26 | 27 | def forward(self, model, sample, reduce=True): 28 | """Compute the loss for the given sample. 
29 | 30 | Returns a tuple with three elements: 31 | 1) the loss 32 | 2) the sample size, which is used as the denominator for the gradient 33 | 3) logging outputs to display while training 34 | """ 35 | raise NotImplementedError 36 | 37 | @staticmethod 38 | def aggregate_logging_outputs(logging_outputs): 39 | """Aggregate logging outputs from data parallel training.""" 40 | raise NotImplementedError 41 | 42 | @staticmethod 43 | def grad_denom(sample_sizes): 44 | """Compute the gradient denominator for a set of sample sizes.""" 45 | return sum(sample_sizes) 46 | -------------------------------------------------------------------------------- /2-4/386/fairseq/criterions/label_smoothed_cross_entropy.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import math 9 | 10 | from fairseq import utils 11 | 12 | from . import FairseqCriterion, register_criterion 13 | 14 | 15 | @register_criterion('label_smoothed_cross_entropy') 16 | class LabelSmoothedCrossEntropyCriterion(FairseqCriterion): 17 | 18 | def __init__(self, args, task): 19 | super().__init__(args, task) 20 | self.eps = args.label_smoothing 21 | 22 | @staticmethod 23 | def add_args(parser): 24 | """Add criterion-specific arguments to the parser.""" 25 | # fmt: off 26 | parser.add_argument('--label-smoothing', default=0., type=float, metavar='D', 27 | help='epsilon for label smoothing, 0 means no label smoothing') 28 | # fmt: on 29 | 30 | def forward(self, model, sample, reduce=True): 31 | """Compute the loss for the given sample. 32 | 33 | Returns a tuple with three elements: 34 | 1) the loss 35 | 2) the sample size, which is used as the denominator for the gradient 36 | 3) logging outputs to display while training 37 | """ 38 | net_output = model(**sample['net_input']) 39 | loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) 40 | sample_size = sample['target'].size(0) if self.args.sentence_avg else sample['ntokens'] 41 | logging_output = { 42 | 'loss': utils.item(loss.data) if reduce else loss.data, 43 | 'nll_loss': utils.item(nll_loss.data) if reduce else nll_loss.data, 44 | 'ntokens': sample['ntokens'], 45 | 'nsentences': sample['target'].size(0), 46 | 'sample_size': sample_size, 47 | } 48 | return loss, sample_size, logging_output 49 | 50 | def compute_loss(self, model, net_output, sample, reduce=True): 51 | lprobs = model.get_normalized_probs(net_output, log_probs=True) 52 | lprobs = lprobs.view(-1, lprobs.size(-1)) 53 | target = model.get_targets(sample, net_output).view(-1, 1) 54 | non_pad_mask = target.ne(self.padding_idx) 55 | nll_loss = -lprobs.gather(dim=-1, index=target)[non_pad_mask] 56 | smooth_loss = -lprobs.sum(dim=-1, keepdim=True)[non_pad_mask] 57 | if reduce: 58 | nll_loss = nll_loss.sum() 59 | smooth_loss = smooth_loss.sum() 60 | eps_i = self.eps / lprobs.size(-1) 61 | loss = (1. 
- self.eps) * nll_loss + eps_i * smooth_loss 62 | return loss, nll_loss 63 | 64 | @staticmethod 65 | def aggregate_logging_outputs(logging_outputs): 66 | """Aggregate logging outputs from data parallel training.""" 67 | ntokens = sum(log.get('ntokens', 0) for log in logging_outputs) 68 | nsentences = sum(log.get('nsentences', 0) for log in logging_outputs) 69 | sample_size = sum(log.get('sample_size', 0) for log in logging_outputs) 70 | return { 71 | 'loss': sum(log.get('loss', 0) for log in logging_outputs) / sample_size / math.log(2), 72 | 'nll_loss': sum(log.get('nll_loss', 0) for log in logging_outputs) / ntokens / math.log(2), 73 | 'ntokens': ntokens, 74 | 'nsentences': nsentences, 75 | 'sample_size': sample_size, 76 | } 77 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from .dictionary import Dictionary, TruncatedDictionary 9 | from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary 10 | 11 | from .fairseq_dataset import FairseqDataset 12 | 13 | from .backtranslation_dataset import BacktranslationDataset 14 | from .block_pair_dataset import BlockPairDataset 15 | from .concat_dataset import ConcatDataset 16 | from .indexed_dataset import IndexedCachedDataset, IndexedDataset, IndexedRawTextDataset, MMapIndexedDataset 17 | from .language_pair_dataset import LanguagePairDataset 18 | from .lm_context_window_dataset import LMContextWindowDataset 19 | from .masked_lm_dataset import MaskedLMDataset 20 | from .monolingual_dataset import MonolingualDataset 21 | from .noising import NoisingDataset 22 | from .round_robin_zip_datasets import RoundRobinZipDatasets 23 | from .token_block_dataset import TokenBlockDataset 24 | from .transform_eos_dataset import TransformEosDataset 25 | from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset 26 | 27 | from .iterators import ( 28 | CountingIterator, 29 | EpochBatchIterator, 30 | GroupedIterator, 31 | ShardedIterator, 32 | ) 33 | 34 | __all__ = [ 35 | 'BacktranslationDataset', 36 | 'BertDictionary', 37 | 'BlockPairDataset', 38 | 'ConcatDataset', 39 | 'CountingIterator', 40 | 'Dictionary', 41 | 'EpochBatchIterator', 42 | 'FairseqDataset', 43 | 'GroupedIterator', 44 | 'IndexedCachedDataset', 45 | 'IndexedDataset', 46 | 'IndexedRawTextDataset', 47 | 'LanguagePairDataset', 48 | 'LMContextWindowDataset', 49 | 'MaskedLMDataset', 50 | 'MaskedLMDictionary', 51 | 'MMapIndexedDataset', 52 | 'MonolingualDataset', 53 | 'NoisingDataset', 54 | 'RoundRobinZipDatasets', 55 | 'ShardedIterator', 56 | 'TokenBlockDataset', 57 | 'TransformEosDataset', 58 | 'TransformEosLangPairDataset', 59 | 'TruncatedDictionary', 60 | ] 61 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import bisect 9 | 10 | import numpy as np 11 | 12 | from . import FairseqDataset 13 | 14 | 15 | class ConcatDataset(FairseqDataset): 16 | @staticmethod 17 | def cumsum(sequence, sample_ratios): 18 | r, s = [], 0 19 | for e, ratio in zip(sequence, sample_ratios): 20 | curr_len = int(ratio * len(e)) 21 | r.append(curr_len + s) 22 | s += curr_len 23 | return r 24 | 25 | def __init__(self, datasets, sample_ratios=1): 26 | super(ConcatDataset, self).__init__() 27 | assert len(datasets) > 0, "datasets should not be an empty iterable" 28 | self.datasets = list(datasets) 29 | if isinstance(sample_ratios, int): 30 | sample_ratios = [sample_ratios] * len(self.datasets) 31 | self.sample_ratios = sample_ratios 32 | self.cumulative_sizes = self.cumsum(self.datasets, sample_ratios) 33 | self.real_sizes = [len(d) for d in self.datasets] 34 | 35 | def __len__(self): 36 | return self.cumulative_sizes[-1] 37 | 38 | def __getitem__(self, idx): 39 | dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx) 40 | return self.datasets[dataset_idx][sample_idx] 41 | 42 | def _get_dataset_and_sample_index(self, idx: int): 43 | dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx) 44 | if dataset_idx == 0: 45 | sample_idx = idx 46 | else: 47 | sample_idx = idx - self.cumulative_sizes[dataset_idx - 1] 48 | sample_idx = sample_idx % self.real_sizes[dataset_idx] 49 | return dataset_idx, sample_idx 50 | 51 | def collater(self, samples): 52 | # For now only supports datasets with same underlying collater implementations 53 | return self.datasets[0].collater(samples) 54 | 55 | def size(self, idx: int): 56 | """ 57 | Return an example's size as a float or tuple. 58 | """ 59 | dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx) 60 | return self.datasets[dataset_idx].size(sample_idx) 61 | 62 | def num_tokens(self, index: int): 63 | return np.max(self.size(index)) 64 | 65 | @property 66 | def sizes(self): 67 | return np.concatenate( 68 | [np.tile(ds.sizes, sr) for ds, sr in zip(self.datasets, self.sample_ratios)] 69 | ) 70 | 71 | @property 72 | def supports_prefetch(self): 73 | return all(d.supports_prefetch for d in self.datasets) 74 | 75 | def ordered_indices(self): 76 | """ 77 | Returns indices sorted by length. So less padding is needed. 78 | """ 79 | return np.argsort(self.sizes) 80 | 81 | def prefetch(self, indices): 82 | frm = 0 83 | for to, ds in zip(self.cumulative_sizes, self.datasets): 84 | real_size = len(ds) 85 | if getattr(ds, 'supports_prefetch', False): 86 | ds.prefetch([(i - frm) % real_size for i in indices if frm <= i < to]) 87 | frm = to 88 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/fairseq_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | import torch.utils.data 9 | 10 | 11 | class FairseqDataset(torch.utils.data.Dataset): 12 | """A dataset that provides helpers for batching.""" 13 | 14 | def __getitem__(self, index): 15 | raise NotImplementedError 16 | 17 | def __len__(self): 18 | raise NotImplementedError 19 | 20 | def collater(self, samples): 21 | """Merge a list of samples to form a mini-batch. 22 | 23 | Args: 24 | samples (List[dict]): samples to collate 25 | 26 | Returns: 27 | dict: a mini-batch suitable for forwarding with a Model 28 | """ 29 | raise NotImplementedError 30 | 31 | def num_tokens(self, index): 32 | """Return the number of tokens in a sample. This value is used to 33 | enforce ``--max-tokens`` during batching.""" 34 | raise NotImplementedError 35 | 36 | def size(self, index): 37 | """Return an example's size as a float or tuple. This value is used when 38 | filtering a dataset with ``--max-positions``.""" 39 | raise NotImplementedError 40 | 41 | def ordered_indices(self): 42 | """Return an ordered list of indices. Batches will be constructed based 43 | on this order.""" 44 | raise NotImplementedError 45 | 46 | @property 47 | def supports_prefetch(self): 48 | """Whether this dataset supports prefetching.""" 49 | return False 50 | 51 | def prefetch(self, indices): 52 | """Prefetch the data required for this epoch.""" 53 | raise NotImplementedError 54 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/lm_context_window_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import numpy as np 9 | import torch 10 | 11 | from fairseq.data.monolingual_dataset import MonolingualDataset 12 | 13 | from . 
import FairseqDataset 14 | 15 | 16 | class LMContextWindowDataset(FairseqDataset): 17 | """Wraps a MonolingualDataset and provides more context for evaluation.""" 18 | 19 | def __init__(self, dataset, tokens_per_sample, context_window, pad_idx): 20 | assert isinstance(dataset, MonolingualDataset) 21 | assert context_window > 0 22 | self.dataset = dataset 23 | self.tokens_per_sample = tokens_per_sample 24 | self.context_window = context_window 25 | self.pad_idx = pad_idx 26 | self.prev_tokens = np.empty([0]) 27 | 28 | def __getitem__(self, index): 29 | return self.dataset[index] 30 | 31 | def __len__(self): 32 | return len(self.dataset) 33 | 34 | def collater(self, samples): 35 | sample = self.dataset.collater(samples) 36 | 37 | pad = self.pad_idx 38 | max_sample_len = self.tokens_per_sample + self.context_window 39 | 40 | bsz, tsz = sample['net_input']['src_tokens'].shape 41 | start_idxs = [0] * bsz 42 | toks = sample['net_input']['src_tokens'] 43 | lengths = sample['net_input']['src_lengths'] 44 | tgt = sample['target'] 45 | new_toks = np.empty([bsz, tsz + self.context_window], dtype=np.int64) 46 | new_tgt = np.full([bsz, tsz + self.context_window], pad, dtype=np.int64) 47 | sample_lens = toks.ne(pad).long().sum(dim=1).cpu() 48 | for i in range(bsz): 49 | sample_len = sample_lens[i] 50 | extra = len(self.prev_tokens) + sample_len - max_sample_len 51 | if extra > 0: 52 | self.prev_tokens = self.prev_tokens[extra:] 53 | pads = np.full(self.context_window - len(self.prev_tokens), pad) 54 | new_toks[i] = np.concatenate([self.prev_tokens, toks[i].numpy(), pads]) 55 | new_tgt[i, len(self.prev_tokens):len(self.prev_tokens) + len(tgt[i])] = tgt[i] 56 | start_idxs[i] = len(self.prev_tokens) 57 | lengths[i] += len(self.prev_tokens) 58 | self.prev_tokens = new_toks[i][new_toks[i] != pad][-self.context_window:] 59 | sample['net_input']['src_tokens'] = torch.from_numpy(new_toks) 60 | sample['target'] = torch.from_numpy(new_tgt) 61 | sample['start_indices'] = start_idxs 62 | 63 | return sample 64 | 65 | def num_tokens(self, index): 66 | return self.dataset.num_tokens(index) 67 | 68 | def size(self, index): 69 | return self.dataset.size(index) 70 | 71 | def ordered_indices(self): 72 | # NOTE we don't shuffle the data to retain access to the previous dataset elements 73 | return np.arange(len(self.dataset)) 74 | 75 | @property 76 | def supports_prefetch(self): 77 | return getattr(self.dataset, 'supports_prefetch', False) 78 | 79 | def prefetch(self, indices): 80 | return self.dataset.prefetch(indices) 81 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/masked_lm_dictionary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from fairseq.data import Dictionary 9 | 10 | 11 | class MaskedLMDictionary(Dictionary): 12 | """ 13 | Dictionary for Masked Language Modelling tasks. This extends Dictionary by 14 | adding the mask symbol. 
15 | """ 16 | def __init__( 17 | self, 18 | pad='', 19 | eos='', 20 | unk='', 21 | mask='', 22 | ): 23 | super().__init__(pad, eos, unk) 24 | self.mask_word = mask 25 | self.mask_index = self.add_symbol(mask) 26 | self.nspecial = len(self.symbols) 27 | 28 | def mask(self): 29 | """Helper to get index of mask symbol""" 30 | return self.mask_index 31 | 32 | 33 | class BertDictionary(MaskedLMDictionary): 34 | """ 35 | Dictionary for BERT task. This extends MaskedLMDictionary by adding support 36 | for cls and sep symbols. 37 | """ 38 | def __init__( 39 | self, 40 | pad='', 41 | eos='', 42 | unk='', 43 | mask='', 44 | cls='', 45 | sep='' 46 | ): 47 | super().__init__(pad, eos, unk, mask) 48 | self.cls_word = cls 49 | self.sep_word = sep 50 | self.cls_index = self.add_symbol(cls) 51 | self.sep_index = self.add_symbol(sep) 52 | self.nspecial = len(self.symbols) 53 | 54 | def cls(self): 55 | """Helper to get index of cls symbol""" 56 | return self.cls_index 57 | 58 | def sep(self): 59 | """Helper to get index of sep symbol""" 60 | return self.sep_index 61 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/round_robin_zip_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from collections import OrderedDict 9 | 10 | import numpy as np 11 | 12 | from . import FairseqDataset 13 | 14 | 15 | class RoundRobinZipDatasets(FairseqDataset): 16 | """Zip multiple :class:`~fairseq.data.FairseqDataset` instances together. 17 | 18 | Shorter datasets are repeated in a round-robin fashion to match the length 19 | of the longest one. 20 | 21 | Args: 22 | datasets (Dict[~fairseq.data.FairseqDataset]): a dictionary of 23 | :class:`~fairseq.data.FairseqDataset` instances. 24 | eval_key (str, optional): a key used at evaluation time that causes 25 | this instance to pass-through batches from *datasets[eval_key]*. 
26 | """ 27 | 28 | def __init__(self, datasets, eval_key=None): 29 | super().__init__() 30 | assert isinstance(datasets, OrderedDict) 31 | self.datasets = datasets 32 | self.eval_key = eval_key 33 | 34 | self.longest_dataset = None 35 | self.longest_dataset_key = None 36 | for key, dataset in datasets.items(): 37 | assert isinstance(dataset, FairseqDataset) 38 | if self.longest_dataset is None or len(dataset) > len(self.longest_dataset): 39 | self.longest_dataset = dataset 40 | self.longest_dataset_key = key 41 | 42 | self._ordered_indices = None 43 | 44 | def _map_index(self, key, index): 45 | assert self._ordered_indices is not None, \ 46 | 'Must call RoundRobinZipDatasets.ordered_indices() first' 47 | return self._ordered_indices[key][index % len(self.datasets[key])] 48 | 49 | def __getitem__(self, index): 50 | if self.eval_key is None: 51 | return OrderedDict([ 52 | (key, dataset[self._map_index(key, index)]) 53 | for key, dataset in self.datasets.items() 54 | ]) 55 | else: 56 | # at evaluation time it's useful to pass-through batches from a single key 57 | return self.datasets[self.eval_key][self._map_index(self.eval_key, index)] 58 | 59 | def __len__(self): 60 | return len(self.longest_dataset) 61 | 62 | def collater(self, samples): 63 | """Merge a list of samples to form a mini-batch.""" 64 | if len(samples) == 0: 65 | return None 66 | if self.eval_key is None: 67 | return OrderedDict([ 68 | (key, dataset.collater([sample[key] for sample in samples])) 69 | for key, dataset in self.datasets.items() 70 | ]) 71 | else: 72 | # at evaluation time it's useful to pass-through batches from a single key 73 | return self.datasets[self.eval_key].collater(samples) 74 | 75 | def num_tokens(self, index): 76 | """Return an example's length (number of tokens), used for batching.""" 77 | # TODO make it configurable whether to use max() or sum() here 78 | return max( 79 | dataset.num_tokens(self._map_index(key, index)) 80 | for key, dataset in self.datasets.items() 81 | ) 82 | 83 | def size(self, index): 84 | """Return an example's size as a float or tuple. This value is used when 85 | filtering a dataset with ``--max-positions``.""" 86 | return { 87 | key: dataset.size(self._map_index(key, index)) 88 | for key, dataset in self.datasets.items() 89 | } 90 | 91 | def ordered_indices(self): 92 | """Ordered indices for batching.""" 93 | if self._ordered_indices is None: 94 | # Call the underlying dataset's ordered_indices() here, so that we 95 | # get the same random ordering as we would have from using the 96 | # underlying dataset directly. 97 | self._ordered_indices = OrderedDict([ 98 | (key, dataset.ordered_indices()) 99 | for key, dataset in self.datasets.items() 100 | ]) 101 | return np.arange(len(self)) 102 | 103 | @property 104 | def supports_prefetch(self): 105 | return all( 106 | getattr(dataset, 'supports_prefetch', False) 107 | for dataset in self.datasets.values() 108 | ) 109 | 110 | def prefetch(self, indices): 111 | for key, dataset in self.datasets.items(): 112 | dataset.prefetch([self._map_index(key, index) for index in indices]) 113 | -------------------------------------------------------------------------------- /2-4/386/fairseq/data/transform_eos_lang_pair_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | 9 | from . import FairseqDataset 10 | from typing import Optional 11 | 12 | 13 | class TransformEosLangPairDataset(FairseqDataset): 14 | """A :class:`~fairseq.data.FairseqDataset` wrapper that transforms the eos/bos symbols 15 | on collated samples of a language pair dataset. 16 | 17 | Note that the transformation is applied in :func:`collater`. 18 | 19 | Args: 20 | dataset (~fairseq.data.FairseqDataset): dataset that collates sample into 21 | LanguagePairDataset schema 22 | src_eos (int): original source end-of-sentence symbol index to be replaced 23 | new_src_eos (int, optional): new end-of-sentence symbol index to replace source eos symbol 24 | tgt_bos (int, optional): original target beginning-of-sentence symbol index to be replaced 25 | new_tgt_bos (int, optional): new beginning-of-sentence symbol index to replace at the 26 | beginning of 'prev_output_tokens' 27 | """ 28 | 29 | def __init__( 30 | self, 31 | dataset: FairseqDataset, 32 | src_eos: int, 33 | new_src_eos: Optional[int] = None, 34 | tgt_bos: Optional[int] = None, 35 | new_tgt_bos: Optional[int] = None, 36 | ): 37 | self.dataset = dataset 38 | self.src_eos = src_eos 39 | self.new_src_eos = new_src_eos 40 | self.tgt_bos = tgt_bos 41 | self.new_tgt_bos = new_tgt_bos 42 | 43 | def __getitem__(self, index): 44 | return self.dataset[index] 45 | 46 | def __len__(self): 47 | return len(self.dataset) 48 | 49 | def collater(self, samples): 50 | samples = self.dataset.collater(samples) 51 | 52 | # TODO: support different padding direction 53 | if self.new_src_eos is not None: 54 | assert(samples['net_input']['src_tokens'][:, -1] != self.src_eos).sum() == 0 55 | samples['net_input']['src_tokens'][:, -1] = self.new_src_eos 56 | 57 | if self.new_tgt_bos is not None: 58 | assert (samples['net_input']['prev_output_tokens'][:, 0] != self.tgt_bos).sum() == 0 59 | samples['net_input']['prev_output_tokens'][:, 0] = self.new_tgt_bos 60 | 61 | return samples 62 | 63 | def num_tokens(self, index): 64 | return self.dataset.num_tokens(index) 65 | 66 | def size(self, index): 67 | return self.dataset.size(index) 68 | 69 | def ordered_indices(self): 70 | return self.dataset.ordered_indices() 71 | 72 | @property 73 | def supports_prefetch(self): 74 | return getattr(self.dataset, 'supports_prefetch', False) 75 | 76 | def prefetch(self, indices): 77 | return self.dataset.prefetch(indices) 78 | -------------------------------------------------------------------------------- /2-4/386/fairseq/meters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
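# A minimal usage sketch for the meters defined just below. The loop, the sleep
# call, and the loss values are stand-ins invented for illustration; they are not
# taken from this repository's training code.
import time

from fairseq.meters import AverageMeter, StopwatchMeter

loss_meter = AverageMeter()
step_timer = StopwatchMeter()
for batch_loss in [0.9, 0.7, 0.6]:          # pretend per-batch losses
    step_timer.start()
    time.sleep(0.01)                        # pretend forward/backward work
    step_timer.stop()
    loss_meter.update(batch_loss, n=1)
print('avg loss %.3f, avg step time %.4fs' % (loss_meter.avg, step_timer.avg))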
7 | 8 | import time 9 | 10 | 11 | class AverageMeter(object): 12 | """Computes and stores the average and current value""" 13 | def __init__(self): 14 | self.reset() 15 | 16 | def reset(self): 17 | self.val = 0 18 | self.avg = 0 19 | self.sum = 0 20 | self.count = 0 21 | 22 | def update(self, val, n=1): 23 | self.val = val 24 | self.sum += val * n 25 | self.count += n 26 | self.avg = self.sum / self.count 27 | 28 | 29 | class TimeMeter(object): 30 | """Computes the average occurrence of some event per second""" 31 | def __init__(self, init=0): 32 | self.reset(init) 33 | 34 | def reset(self, init=0): 35 | self.init = init 36 | self.start = time.time() 37 | self.n = 0 38 | 39 | def update(self, val=1): 40 | self.n += val 41 | 42 | @property 43 | def avg(self): 44 | return self.n / self.elapsed_time 45 | 46 | @property 47 | def elapsed_time(self): 48 | return self.init + (time.time() - self.start) 49 | 50 | 51 | class StopwatchMeter(object): 52 | """Computes the sum/avg duration of some event in seconds""" 53 | def __init__(self): 54 | self.reset() 55 | 56 | def start(self): 57 | self.start_time = time.time() 58 | 59 | def stop(self, n=1): 60 | if self.start_time is not None: 61 | delta = time.time() - self.start_time 62 | self.sum += delta 63 | self.n += n 64 | self.start_time = None 65 | 66 | def reset(self): 67 | self.sum = 0 68 | self.n = 0 69 | self.start_time = None 70 | 71 | @property 72 | def avg(self): 73 | return self.sum / self.n 74 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/composite_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from fairseq.models import FairseqEncoder 9 | 10 | 11 | class CompositeEncoder(FairseqEncoder): 12 | """ 13 | A wrapper around a dictionary of :class:`FairseqEncoder` objects. 14 | 15 | We run forward on each encoder and return a dictionary of outputs. The first 16 | encoder's dictionary is used for initialization. 17 | 18 | Args: 19 | encoders (dict): a dictionary of :class:`FairseqEncoder` objects. 
20 | """ 21 | 22 | def __init__(self, encoders): 23 | super().__init__(next(iter(encoders.values())).dictionary) 24 | self.encoders = encoders 25 | for key in self.encoders: 26 | self.add_module(key, self.encoders[key]) 27 | 28 | def forward(self, src_tokens, src_lengths): 29 | """ 30 | Args: 31 | src_tokens (LongTensor): tokens in the source language of shape 32 | `(batch, src_len)` 33 | src_lengths (LongTensor): lengths of each source sentence of shape 34 | `(batch)` 35 | 36 | Returns: 37 | dict: 38 | the outputs from each Encoder 39 | """ 40 | encoder_out = {} 41 | for key in self.encoders: 42 | encoder_out[key] = self.encoders[key](src_tokens, src_lengths) 43 | return encoder_out 44 | 45 | def reorder_encoder_out(self, encoder_out, new_order): 46 | """Reorder encoder output according to new_order.""" 47 | for key in self.encoders: 48 | encoder_out[key] = self.encoders[key].reorder_encoder_out(encoder_out[key], new_order) 49 | return encoder_out 50 | 51 | def max_positions(self): 52 | return min([self.encoders[key].max_positions() for key in self.encoders]) 53 | 54 | def upgrade_state_dict(self, state_dict): 55 | for key in self.encoders: 56 | self.encoders[key].upgrade_state_dict(state_dict) 57 | return state_dict 58 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/distributed_fairseq_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import inspect 9 | 10 | from torch.nn import parallel 11 | 12 | from fairseq.legacy_distributed_data_parallel import LegacyDistributedDataParallel 13 | from fairseq.models import BaseFairseqModel 14 | 15 | 16 | def DistributedFairseqModel(args, model): 17 | """ 18 | Wrap a *model* to support distributed data parallel training. 19 | 20 | This is similar to the built-in DistributedDataParallel, but allows 21 | additional configuration of the DistributedDataParallel class to 22 | use, and also provides easier access to the wrapped model by 23 | forwarding requests for missing attributes to the wrapped model. 
24 | 25 | Args: 26 | args (argparse.Namespace): fairseq args 27 | model (BaseFairseqModel): model to wrap 28 | """ 29 | # determine which DDP class to extend 30 | assert isinstance(model, BaseFairseqModel) 31 | if args.ddp_backend == 'c10d': 32 | ddp_class = parallel.DistributedDataParallel 33 | init_kwargs = dict( 34 | module=model, 35 | device_ids=[args.device_id], 36 | output_device=args.device_id, 37 | broadcast_buffers=False, 38 | bucket_cap_mb=args.bucket_cap_mb, 39 | ) 40 | # Maintain backward compatibility 41 | if 'check_reduction' in inspect.getargspec(ddp_class)[0]: 42 | init_kwargs['check_reduction'] = True 43 | if 'find_unused_parameters' in inspect.getargspec(ddp_class)[0]: 44 | init_kwargs['find_unused_parameters'] = args.find_unused_parameters 45 | elif args.ddp_backend == 'no_c10d': 46 | ddp_class = LegacyDistributedDataParallel 47 | init_kwargs = dict( 48 | module=model, 49 | world_size=args.distributed_world_size, 50 | buffer_size=2**28, 51 | ) 52 | else: 53 | raise ValueError('Unknown --ddp-backend: ' + args.ddp_backend) 54 | 55 | class _DistributedFairseqModel(ddp_class): 56 | """Extend DistributedDataParallel to check for missing 57 | attributes in the wrapped module.""" 58 | 59 | def __init__(self, *args, **kwargs): 60 | super().__init__(*args, **kwargs) 61 | 62 | def __getattr__(self, name): 63 | wrapped_module = super().__getattr__('module') 64 | if hasattr(wrapped_module, name): 65 | return getattr(wrapped_module, name) 66 | return super().__getattr__(name) 67 | 68 | return _DistributedFairseqModel(**init_kwargs) 69 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/fairseq_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.nn as nn 9 | 10 | from fairseq import utils 11 | 12 | 13 | class FairseqDecoder(nn.Module): 14 | """Base class for decoders.""" 15 | 16 | def __init__(self, dictionary): 17 | super().__init__() 18 | self.dictionary = dictionary 19 | self.onnx_trace = False 20 | 21 | def forward(self, prev_output_tokens, encoder_out=None, **kwargs): 22 | """ 23 | Args: 24 | prev_output_tokens (LongTensor): shifted output tokens of shape 25 | `(batch, tgt_len)`, for input feeding/teacher forcing 26 | encoder_out (dict, optional): output from the encoder, used for 27 | encoder-side attention 28 | 29 | Returns: 30 | tuple: 31 | - the decoder's output of shape `(batch, tgt_len, vocab)` 32 | - a dictionary with any model-specific outputs 33 | """ 34 | x, extra = self.extract_features(prev_output_tokens, encoder_out=encoder_out, **kwargs) 35 | x = self.output_layer(x) 36 | return x, extra 37 | 38 | def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs): 39 | """ 40 | Returns: 41 | tuple: 42 | - the decoder's features of shape `(batch, tgt_len, embed_dim)` 43 | - a dictionary with any model-specific outputs 44 | """ 45 | raise NotImplementedError 46 | 47 | def output_layer(self, features, **kwargs): 48 | """ 49 | Project features to the default output size, e.g., vocabulary size. 50 | 51 | Args: 52 | features (Tensor): features returned by *extract_features*. 
53 | """ 54 | raise NotImplementedError 55 | 56 | def get_normalized_probs(self, net_output, log_probs, sample): 57 | """Get normalized probabilities (or log probs) from a net's output.""" 58 | 59 | if hasattr(self, 'adaptive_softmax') and self.adaptive_softmax is not None: 60 | if sample is not None: 61 | assert 'target' in sample 62 | target = sample['target'] 63 | else: 64 | target = None 65 | out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) 66 | return out.exp_() if not log_probs else out 67 | 68 | logits = net_output[0] 69 | if log_probs: 70 | return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) 71 | else: 72 | return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) 73 | 74 | def max_positions(self): 75 | """Maximum input length supported by the decoder.""" 76 | return 1e6 # an arbitrary large number 77 | 78 | def upgrade_state_dict(self, state_dict): 79 | """Upgrade a (possibly old) state dict for new versions of fairseq.""" 80 | return state_dict 81 | 82 | def prepare_for_onnx_export_(self): 83 | self.onnx_trace = True 84 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/fairseq_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.nn as nn 9 | 10 | 11 | class FairseqEncoder(nn.Module): 12 | """Base class for encoders.""" 13 | 14 | def __init__(self, dictionary): 15 | super().__init__() 16 | self.dictionary = dictionary 17 | 18 | def forward(self, src_tokens, src_lengths=None, **kwargs): 19 | """ 20 | Args: 21 | src_tokens (LongTensor): tokens in the source language of shape 22 | `(batch, src_len)` 23 | src_lengths (LongTensor): lengths of each source sentence of shape 24 | `(batch)` 25 | """ 26 | raise NotImplementedError 27 | 28 | def reorder_encoder_out(self, encoder_out, new_order): 29 | """ 30 | Reorder encoder output according to `new_order`. 31 | 32 | Args: 33 | encoder_out: output from the ``forward()`` method 34 | new_order (LongTensor): desired order 35 | 36 | Returns: 37 | `encoder_out` rearranged according to `new_order` 38 | """ 39 | raise NotImplementedError 40 | 41 | def max_positions(self): 42 | """Maximum input length supported by the encoder.""" 43 | return 1e6 # an arbitrary large number 44 | 45 | def upgrade_state_dict(self, state_dict): 46 | """Upgrade a (possibly old) state dict for new versions of fairseq.""" 47 | return state_dict 48 | -------------------------------------------------------------------------------- /2-4/386/fairseq/models/fairseq_incremental_decoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from fairseq.models import FairseqDecoder 9 | 10 | 11 | class FairseqIncrementalDecoder(FairseqDecoder): 12 | """Base class for incremental decoders. 
13 | 14 | Incremental decoding is a special mode at inference time where the Model 15 | only receives a single timestep of input corresponding to the previous 16 | output token (for input feeding) and must produce the next output 17 | *incrementally*. Thus the model must cache any long-term state that is 18 | needed about the sequence, e.g., hidden states, convolutional states, etc. 19 | 20 | Compared to the standard :class:`FairseqDecoder` interface, the incremental 21 | decoder interface allows :func:`forward` functions to take an extra keyword 22 | argument (*incremental_state*) that can be used to cache state across 23 | time-steps. 24 | 25 | The :class:`FairseqIncrementalDecoder` interface also defines the 26 | :func:`reorder_incremental_state` method, which is used during beam search 27 | to select and reorder the incremental state based on the selection of beams. 28 | 29 | To learn more about how incremental decoding works, refer to `this blog 30 | `_. 31 | """ 32 | 33 | def __init__(self, dictionary): 34 | super().__init__(dictionary) 35 | 36 | def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs): 37 | """ 38 | Args: 39 | prev_output_tokens (LongTensor): shifted output tokens of shape 40 | `(batch, tgt_len)`, for input feeding/teacher forcing 41 | encoder_out (dict, optional): output from the encoder, used for 42 | encoder-side attention 43 | incremental_state (dict, optional): dictionary used for storing 44 | state during :ref:`Incremental decoding` 45 | 46 | Returns: 47 | tuple: 48 | - the decoder's output of shape `(batch, tgt_len, vocab)` 49 | - a dictionary with any model-specific outputs 50 | """ 51 | raise NotImplementedError 52 | 53 | def extract_features(self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs): 54 | """ 55 | Returns: 56 | tuple: 57 | - the decoder's features of shape `(batch, tgt_len, embed_dim)` 58 | - a dictionary with any model-specific outputs 59 | """ 60 | raise NotImplementedError 61 | 62 | def reorder_incremental_state(self, incremental_state, new_order): 63 | """Reorder incremental state. 64 | 65 | This should be called when the order of the input has changed from the 66 | previous time step. A typical use case is beam search, where the input 67 | order changes between time steps based on the selection of beams. 68 | """ 69 | seen = set() 70 | 71 | def apply_reorder_incremental_state(module): 72 | if module != self and hasattr(module, 'reorder_incremental_state') \ 73 | and module not in seen: 74 | seen.add(module) 75 | module.reorder_incremental_state(incremental_state, new_order) 76 | 77 | self.apply(apply_reorder_incremental_state) 78 | 79 | def set_beam_size(self, beam_size): 80 | """Sets the beam size in the decoder and all children.""" 81 | if getattr(self, '_beam_size', -1) != beam_size: 82 | seen = set() 83 | 84 | def apply_set_beam_size(module): 85 | if module != self and hasattr(module, 'set_beam_size') \ 86 | and module not in seen: 87 | seen.add(module) 88 | module.set_beam_size(beam_size) 89 | 90 | self.apply(apply_set_beam_size) 91 | self._beam_size = beam_size 92 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. 
An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from .adaptive_input import AdaptiveInput 9 | from .adaptive_softmax import AdaptiveSoftmax 10 | from .beamable_mm import BeamableMM 11 | from .character_token_embedder import CharacterTokenEmbedder 12 | from .conv_tbc import ConvTBC 13 | from .downsampled_multihead_attention import DownsampledMultiHeadAttention 14 | from .dynamic_convolution import DynamicConv1dTBC 15 | from .gelu import gelu, gelu_accurate 16 | from .grad_multiply import GradMultiply 17 | from .highway import Highway 18 | from .layer_norm import LayerNorm 19 | from .learned_positional_embedding import LearnedPositionalEmbedding 20 | from .lightweight_convolution import LightweightConv1dTBC 21 | from .linearized_convolution import LinearizedConvolution 22 | from .logsumexp_moe import LogSumExpMoE 23 | from .mean_pool_gating_network import MeanPoolGatingNetwork 24 | from .multihead_attention import MultiheadAttention 25 | from .positional_embedding import PositionalEmbedding 26 | from .scalar_bias import ScalarBias 27 | from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding 28 | from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer 29 | from .transformer_sentence_encoder import TransformerSentenceEncoder 30 | from .unfold import unfold1d 31 | 32 | __all__ = [ 33 | 'AdaptiveInput', 34 | 'AdaptiveSoftmax', 35 | 'BeamableMM', 36 | 'CharacterTokenEmbedder', 37 | 'ConvTBC', 38 | 'DownsampledMultiHeadAttention', 39 | 'DynamicConv1dTBC', 40 | 'gelu', 41 | 'gelu_accurate', 42 | 'GradMultiply', 43 | 'Highway', 44 | 'LayerNorm', 45 | 'LearnedPositionalEmbedding', 46 | 'LightweightConv1dTBC', 47 | 'LinearizedConvolution', 48 | 'LogSumExpMoE', 49 | 'MeanPoolGatingNetwork', 50 | 'MultiheadAttention', 51 | 'PositionalEmbedding', 52 | 'ScalarBias', 53 | 'SinusoidalPositionalEmbedding', 54 | 'TransformerSentenceEncoderLayer', 55 | 'TransformerSentenceEncoder', 56 | 'unfold1d', 57 | ] 58 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/adaptive_input.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
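# A small sketch of how the AdaptiveInput module defined just below can be
# instantiated and applied; the vocabulary size, cutoffs, and dimensions are
# arbitrary example values, not settings used anywhere in this project.
import torch

from fairseq.modules import AdaptiveInput

adaptive_emb = AdaptiveInput(
    vocab_size=1000, padding_idx=0, initial_dim=64,
    factor=2.0, output_dim=64, cutoff=[100, 500],
)
tokens = torch.randint(0, 1000, (2, 7))     # (batch, seq_len) of token indices
embedded = adaptive_emb(tokens)             # frequent ids use the 64-dim band, rarer
print(embedded.shape)                       # ids 32/16 dims, all projected back to 64
                                            # -> torch.Size([2, 7, 64])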
7 | 8 | 9 | import torch 10 | from torch import nn 11 | 12 | from typing import List 13 | 14 | 15 | class AdaptiveInput(nn.Module): 16 | 17 | def __init__( 18 | self, 19 | vocab_size: int, 20 | padding_idx: int, 21 | initial_dim: int, 22 | factor: float, 23 | output_dim: int, 24 | cutoff: List[int], 25 | ): 26 | super().__init__() 27 | 28 | if vocab_size > cutoff[-1]: 29 | cutoff = cutoff + [vocab_size] 30 | else: 31 | assert vocab_size == cutoff[ 32 | -1], 'cannot specify cutoff larger than vocab size' 33 | 34 | self.cutoff = cutoff 35 | self.embedding_dim = output_dim 36 | self.padding_idx = padding_idx 37 | 38 | self.embeddings = nn.ModuleList() 39 | for i in range(len(self.cutoff)): 40 | prev = self.cutoff[i - 1] if i > 0 else 0 41 | size = self.cutoff[i] - prev 42 | dim = int(initial_dim // (factor ** i)) 43 | seq = nn.Sequential( 44 | nn.Embedding(size, dim, padding_idx), 45 | nn.Linear(dim, output_dim, bias=False) 46 | ) 47 | self.embeddings.append(seq) 48 | 49 | def init_weights(m): 50 | if isinstance(m, nn.Embedding): 51 | nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5) 52 | nn.init.constant_(m.weight[padding_idx], 0) 53 | elif hasattr(m, 'weight'): 54 | nn.init.xavier_uniform_(m.weight) 55 | 56 | self.apply(init_weights) 57 | 58 | self.register_buffer('_float_tensor', torch.FloatTensor(1)) 59 | 60 | def weights_for_band(self, band: int): 61 | return self.embeddings[band][0].weight, self.embeddings[band][1].weight 62 | 63 | def forward(self, input: torch.Tensor): 64 | result = self._float_tensor.new(input.shape + (self.embedding_dim,)) 65 | for i in range(len(self.cutoff)): 66 | mask = input.lt(self.cutoff[i]) 67 | if i > 0: 68 | mask.mul_(input.ge(self.cutoff[i - 1])) 69 | chunk_input = input[mask] - self.cutoff[i - 1] 70 | else: 71 | chunk_input = input[mask] 72 | if mask.any(): 73 | result[mask] = self.embeddings[i](chunk_input) 74 | return result 75 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/beamable_mm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | import torch.nn as nn 10 | 11 | 12 | class BeamableMM(nn.Module): 13 | """This module provides an optimized MM for beam decoding with attention. 14 | 15 | It leverages the fact that the source-side of the input is replicated beam 16 | times and the target-side of the input is of width one. This layer speeds up 17 | inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)} 18 | with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}. 
19 | """ 20 | def __init__(self, beam_size=None): 21 | super(BeamableMM, self).__init__() 22 | self.beam_size = beam_size 23 | 24 | def forward(self, input1, input2): 25 | if ( 26 | not self.training and # test mode 27 | self.beam_size is not None and # beam size is set 28 | input1.dim() == 3 and # only support batched input 29 | input1.size(1) == 1 # single time step update 30 | ): 31 | bsz, beam = input1.size(0), self.beam_size 32 | 33 | # bsz x 1 x nhu --> bsz/beam x beam x nhu 34 | input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1) 35 | 36 | # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu 37 | input2 = input2.unfold(0, beam, beam)[:, :, :, 0] 38 | 39 | # use non batched operation if bsz = beam 40 | if input1.size(0) == 1: 41 | output = torch.mm(input1[0, :, :], input2[0, :, :]) 42 | else: 43 | output = input1.bmm(input2) 44 | return output.view(bsz, 1, -1) 45 | else: 46 | return input1.bmm(input2) 47 | 48 | def set_beam_size(self, beam_size): 49 | self.beam_size = beam_size 50 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/conv_tbc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | from torch.nn.modules.utils import _single 10 | 11 | 12 | class ConvTBC(torch.nn.Module): 13 | """1D convolution over an input of shape (time x batch x channel) 14 | 15 | The implementation uses gemm to perform the convolution. This implementation 16 | is faster than cuDNN for small kernel sizes. 17 | """ 18 | def __init__(self, in_channels, out_channels, kernel_size, padding=0): 19 | super(ConvTBC, self).__init__() 20 | self.in_channels = in_channels 21 | self.out_channels = out_channels 22 | self.kernel_size = _single(kernel_size) 23 | self.padding = _single(padding) 24 | 25 | self.weight = torch.nn.Parameter(torch.Tensor( 26 | self.kernel_size[0], in_channels, out_channels)) 27 | self.bias = torch.nn.Parameter(torch.Tensor(out_channels)) 28 | 29 | def forward(self, input): 30 | return torch.conv_tbc(input.contiguous(), self.weight, self.bias, self.padding[0]) 31 | 32 | def __repr__(self): 33 | s = ('{name}({in_channels}, {out_channels}, kernel_size={kernel_size}' 34 | ', padding={padding}') 35 | if self.bias is None: 36 | s += ', bias=False' 37 | s += ')' 38 | return s.format(name=self.__class__.__name__, **self.__dict__) 39 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/gelu.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | """ 8 | See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with 9 | the corresponding GitHub repo: https://github.com/hendrycks/GELUs 10 | """ 11 | 12 | import math 13 | 14 | import torch 15 | 16 | 17 | def gelu_accurate(x): 18 | if not hasattr(gelu_accurate, "_a"): 19 | gelu_accurate._a = math.sqrt(2 / math.pi) 20 | return 0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))) 21 | 22 | 23 | def gelu(x: torch.Tensor) -> torch.Tensor: 24 | return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) 25 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/grad_multiply.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | 10 | 11 | class GradMultiply(torch.autograd.Function): 12 | @staticmethod 13 | def forward(ctx, x, scale): 14 | ctx.scale = scale 15 | res = x.new(x) 16 | return res 17 | 18 | @staticmethod 19 | def backward(ctx, grad): 20 | return grad * ctx.scale, None 21 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/highway.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | 10 | from torch import nn 11 | 12 | 13 | class Highway(torch.nn.Module): 14 | """ 15 | A `Highway layer `_. 16 | Adopted from the AllenNLP implementation. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | input_dim: int, 22 | num_layers: int = 1 23 | ): 24 | super(Highway, self).__init__() 25 | self.input_dim = input_dim 26 | self.layers = nn.ModuleList([nn.Linear(input_dim, input_dim * 2) 27 | for _ in range(num_layers)]) 28 | self.activation = nn.ReLU() 29 | 30 | self.reset_parameters() 31 | 32 | def reset_parameters(self): 33 | for layer in self.layers: 34 | # As per comment in AllenNLP: 35 | # We should bias the highway layer to just carry its input forward. We do that by 36 | # setting the bias on `B(x)` to be positive, because that means `g` will be biased to 37 | # be high, so we will carry the input forward. The bias on `B(x)` is the second half 38 | # of the bias vector in each Linear layer. 39 | nn.init.constant_(layer.bias[self.input_dim:], 1) 40 | 41 | nn.init.constant_(layer.bias[:self.input_dim], 0) 42 | nn.init.xavier_normal_(layer.weight) 43 | 44 | def forward( 45 | self, 46 | x: torch.Tensor 47 | ): 48 | for layer in self.layers: 49 | projection = layer(x) 50 | proj_x, gate = projection.chunk(2, dim=-1) 51 | proj_x = self.activation(proj_x) 52 | gate = torch.sigmoid(gate) 53 | x = gate * x + (gate.new_tensor([1]) - gate) * proj_x 54 | return x 55 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/layer_norm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | 10 | 11 | def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False): 12 | if not export and torch.cuda.is_available(): 13 | try: 14 | from apex.normalization import FusedLayerNorm 15 | return FusedLayerNorm(normalized_shape, eps, elementwise_affine) 16 | except ImportError: 17 | pass 18 | return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine) 19 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/learned_positional_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.nn as nn 9 | 10 | from fairseq import utils 11 | 12 | 13 | class LearnedPositionalEmbedding(nn.Embedding): 14 | """ 15 | This module learns positional embeddings up to a fixed maximum size. 16 | Padding ids are ignored by either offsetting based on padding_idx 17 | or by setting padding_idx to None and ensuring that the appropriate 18 | position ids are passed to the forward function. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | num_embeddings: int, 24 | embedding_dim: int, 25 | padding_idx: int, 26 | ): 27 | super().__init__(num_embeddings, embedding_dim, padding_idx) 28 | self.onnx_trace = False 29 | 30 | def forward(self, input, incremental_state=None, positions=None): 31 | """Input is expected to be of size [bsz x seqlen].""" 32 | assert ( 33 | (positions is None) or (self.padding_idx is None) 34 | ), "If positions is pre-computed then padding_idx should not be set." 35 | 36 | if positions is None: 37 | if incremental_state is not None: 38 | # positions is the same for every token when decoding a single step 39 | positions = input.data.new(1, 1).fill_(self.padding_idx + input.size(1)) 40 | else: 41 | positions = utils.make_positions( 42 | input.data, self.padding_idx, onnx_trace=self.onnx_trace, 43 | ) 44 | return super().forward(positions) 45 | 46 | def max_positions(self): 47 | """Maximum number of supported positions.""" 48 | if self.padding_idx is not None: 49 | return self.num_embeddings - self.padding_idx - 1 50 | else: 51 | return self.num_embeddings 52 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/linearized_convolution.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | from fairseq import utils 12 | 13 | from .conv_tbc import ConvTBC 14 | 15 | 16 | class LinearizedConvolution(ConvTBC): 17 | """An optimized version of nn.Conv1d. 18 | 19 | At training time, this module uses ConvTBC, which is an optimized version 20 | of Conv1d. 
At inference time, it optimizes incremental generation (i.e., 21 | one time step at a time) by replacing the convolutions with linear layers. 22 | Note that the input order changes from training to inference. 23 | """ 24 | 25 | def __init__(self, in_channels, out_channels, kernel_size, **kwargs): 26 | super().__init__(in_channels, out_channels, kernel_size, **kwargs) 27 | self._linearized_weight = None 28 | self.register_backward_hook(self._clear_linearized_weight) 29 | 30 | def forward(self, input, incremental_state=None): 31 | """ 32 | Args: 33 | incremental_state: Used to buffer signal; if not None, then input is 34 | expected to contain a single frame. If the input order changes 35 | between time steps, call reorder_incremental_state. 36 | Input: 37 | Time x Batch x Channel during training 38 | Batch x Time x Channel during inference 39 | """ 40 | if incremental_state is None: 41 | output = super().forward(input) 42 | if self.kernel_size[0] > 1 and self.padding[0] > 0: 43 | # remove future timesteps added by padding 44 | output = output[:-self.padding[0], :, :] 45 | return output 46 | 47 | # reshape weight 48 | weight = self._get_linearized_weight() 49 | kw = self.kernel_size[0] 50 | 51 | bsz = input.size(0) # input: bsz x len x dim 52 | if kw > 1: 53 | input = input.data 54 | input_buffer = self._get_input_buffer(incremental_state) 55 | if input_buffer is None: 56 | input_buffer = input.new(bsz, kw, input.size(2)).zero_() 57 | self._set_input_buffer(incremental_state, input_buffer) 58 | else: 59 | # shift buffer 60 | input_buffer[:, :-1, :] = input_buffer[:, 1:, :].clone() 61 | # append next input 62 | input_buffer[:, -1, :] = input[:, -1, :] 63 | input = input_buffer 64 | with torch.no_grad(): 65 | output = F.linear(input.view(bsz, -1), weight, self.bias) 66 | return output.view(bsz, 1, -1) 67 | 68 | def reorder_incremental_state(self, incremental_state, new_order): 69 | input_buffer = self._get_input_buffer(incremental_state) 70 | if input_buffer is not None: 71 | input_buffer = input_buffer.index_select(0, new_order) 72 | self._set_input_buffer(incremental_state, input_buffer) 73 | 74 | def _get_input_buffer(self, incremental_state): 75 | return utils.get_incremental_state(self, incremental_state, 'input_buffer') 76 | 77 | def _set_input_buffer(self, incremental_state, new_buffer): 78 | return utils.set_incremental_state(self, incremental_state, 'input_buffer', new_buffer) 79 | 80 | def _get_linearized_weight(self): 81 | if self._linearized_weight is None: 82 | kw = self.kernel_size[0] 83 | weight = self.weight.transpose(2, 1).transpose(1, 0).contiguous() 84 | assert weight.size() == (self.out_channels, kw, self.in_channels) 85 | self._linearized_weight = weight.view(self.out_channels, -1) 86 | return self._linearized_weight 87 | 88 | def _clear_linearized_weight(self, *args): 89 | self._linearized_weight = None 90 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/logsumexp_moe.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | 10 | 11 | class LogSumExpMoE(torch.autograd.Function): 12 | """Standard LogSumExp forward pass, but use *posterior* for the backward. 
13 | 14 | See `"Mixture Models for Diverse Machine Translation: Tricks of the Trade" 15 | (Shen et al., 2019) `_. 16 | """ 17 | 18 | @staticmethod 19 | def forward(ctx, logp, posterior, dim=-1): 20 | ctx.save_for_backward(posterior) 21 | ctx.dim = dim 22 | return torch.logsumexp(logp, dim=dim) 23 | 24 | @staticmethod 25 | def backward(ctx, grad_output): 26 | posterior, = ctx.saved_tensors 27 | grad_logp = grad_output.unsqueeze(ctx.dim) * posterior 28 | return grad_logp, None, None 29 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/mean_pool_gating_network.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | import torch.nn.functional as F 10 | 11 | 12 | class MeanPoolGatingNetwork(torch.nn.Module): 13 | """A simple mean-pooling gating network for selecting experts. 14 | 15 | This module applies mean pooling over an encoder's output and returns 16 | responsibilities for each expert. The encoder format is expected to match 17 | :class:`fairseq.models.transformer.TransformerEncoder`. 18 | """ 19 | 20 | def __init__(self, embed_dim, num_experts, dropout=None): 21 | super().__init__() 22 | self.embed_dim = embed_dim 23 | self.num_experts = num_experts 24 | 25 | self.fc1 = torch.nn.Linear(embed_dim, embed_dim) 26 | self.dropout = torch.nn.Dropout(dropout) if dropout is not None else None 27 | self.fc2 = torch.nn.Linear(embed_dim, num_experts) 28 | 29 | def forward(self, encoder_out): 30 | if not ( 31 | isinstance(encoder_out, dict) 32 | and 'encoder_out' in encoder_out 33 | and 'encoder_padding_mask' in encoder_out 34 | and encoder_out['encoder_out'].size(2) == self.embed_dim 35 | ): 36 | raise ValueError('Unexpected format for encoder_out') 37 | 38 | # mean pooling over time 39 | encoder_padding_mask = encoder_out['encoder_padding_mask'] # B x T 40 | encoder_out = encoder_out['encoder_out'].transpose(0, 1) # B x T x C 41 | if encoder_padding_mask is not None: 42 | encoder_out = encoder_out.clone() # required because of transpose above 43 | encoder_out[encoder_padding_mask] = 0 44 | ntokens = torch.sum(1 - encoder_padding_mask, dim=1, keepdim=True) 45 | x = torch.sum(encoder_out, dim=1) / ntokens.type_as(encoder_out) 46 | else: 47 | x = torch.mean(encoder_out, dim=1) 48 | 49 | x = torch.tanh(self.fc1(x)) 50 | if self.dropout is not None: 51 | x = self.dropout(x) 52 | x = self.fc2(x) 53 | return F.log_softmax(x, dim=-1, dtype=torch.float32).type_as(x) 54 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/positional_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
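# A usage sketch for the PositionalEmbedding factory defined below; the sizes,
# padding index, and toy token tensor are made up for illustration.
import torch

from fairseq.modules import PositionalEmbedding

pad_idx, max_positions, dim = 1, 128, 32
pos_emb = PositionalEmbedding(max_positions, dim, pad_idx, learned=False)

tokens = torch.tensor([[5, 6, 7, pad_idx],
                       [8, 9, pad_idx, pad_idx]])
positions = pos_emb(tokens)        # (2, 4, 32); padded steps map to the zero vector
print(positions.shape)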
7 | 8 | import torch.nn as nn 9 | 10 | from .learned_positional_embedding import LearnedPositionalEmbedding 11 | from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding 12 | 13 | 14 | def PositionalEmbedding( 15 | num_embeddings: int, 16 | embedding_dim: int, 17 | padding_idx: int, 18 | learned: bool = False, 19 | ): 20 | if learned: 21 | # if padding_idx is specified then offset the embedding ids by 22 | # this index and adjust num_embeddings appropriately 23 | # TODO: The right place for this offset would be inside 24 | # LearnedPositionalEmbedding. Move this there for a cleaner implementation. 25 | if padding_idx is not None: 26 | num_embeddings = num_embeddings + padding_idx + 1 27 | m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) 28 | nn.init.normal_(m.weight, mean=0, std=embedding_dim ** -0.5) 29 | if padding_idx is not None: 30 | nn.init.constant_(m.weight[padding_idx], 0) 31 | else: 32 | m = SinusoidalPositionalEmbedding( 33 | embedding_dim, padding_idx, init_size=num_embeddings + padding_idx + 1, 34 | ) 35 | return m 36 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/scalar_bias.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | # 8 | 9 | import torch 10 | 11 | 12 | class ScalarBias(torch.autograd.Function): 13 | """ 14 | Adds a vector of scalars, used in self-attention mechanism to allow 15 | the model to optionally attend to this vector instead of the past 16 | """ 17 | 18 | @staticmethod 19 | def forward(ctx, input, dim, bias_init): 20 | size = list(input.size()) 21 | size[dim] += 1 22 | output = input.new(*size).fill_(bias_init) 23 | output.narrow(dim, 1, size[dim] - 1).copy_(input) 24 | ctx.dim = dim 25 | return output 26 | 27 | @staticmethod 28 | def backward(ctx, grad): 29 | return grad.narrow(ctx.dim, 1, grad.size(ctx.dim) - 1), None, None 30 | 31 | 32 | def scalar_bias(input, dim, bias_init=0): 33 | return ScalarBias.apply(input, dim, bias_init) 34 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/sinusoidal_positional_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import math 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.onnx.operators 13 | 14 | from fairseq import utils 15 | 16 | 17 | class SinusoidalPositionalEmbedding(nn.Module): 18 | """This module produces sinusoidal positional embeddings of any length. 19 | 20 | Padding symbols are ignored. 
21 | """ 22 | 23 | def __init__(self, embedding_dim, padding_idx, init_size=1024): 24 | super().__init__() 25 | self.embedding_dim = embedding_dim 26 | self.padding_idx = padding_idx 27 | self.weights = SinusoidalPositionalEmbedding.get_embedding( 28 | init_size, 29 | embedding_dim, 30 | padding_idx, 31 | ) 32 | self.onnx_trace = False 33 | self.register_buffer('_float_tensor', torch.FloatTensor(1)) 34 | 35 | def prepare_for_onnx_export_(self): 36 | self.onnx_trace = True 37 | 38 | @staticmethod 39 | def get_embedding(num_embeddings, embedding_dim, padding_idx=None): 40 | """Build sinusoidal embeddings. 41 | 42 | This matches the implementation in tensor2tensor, but differs slightly 43 | from the description in Section 3.5 of "Attention Is All You Need". 44 | """ 45 | half_dim = embedding_dim // 2 46 | emb = math.log(10000) / (half_dim - 1) 47 | emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb) 48 | emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0) 49 | emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1) 50 | if embedding_dim % 2 == 1: 51 | # zero pad 52 | emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1) 53 | if padding_idx is not None: 54 | emb[padding_idx, :] = 0 55 | return emb 56 | 57 | def forward(self, input, incremental_state=None, timestep=None, **kwargs): 58 | """Input is expected to be of size [bsz x seqlen].""" 59 | bsz, seq_len = torch.onnx.operators.shape_as_tensor(input) 60 | max_pos = self.padding_idx + 1 + seq_len 61 | if self.weights is None or max_pos > self.weights.size(0): 62 | # recompute/expand embeddings if needed 63 | self.weights = SinusoidalPositionalEmbedding.get_embedding( 64 | max_pos, 65 | self.embedding_dim, 66 | self.padding_idx, 67 | ) 68 | self.weights = self.weights.to(self._float_tensor) 69 | 70 | if incremental_state is not None: 71 | # positions is the same for every token when decoding a single step 72 | pos = (timestep.int() + 1).long() if timestep is not None else seq_len 73 | if self.onnx_trace: 74 | return self.weights[self.padding_idx + pos, :].unsqueeze(1).repeat(bsz, 1, 1) 75 | return self.weights[self.padding_idx + pos, :].expand(bsz, 1, -1) 76 | 77 | positions = utils.make_positions(input, self.padding_idx, onnx_trace=self.onnx_trace) 78 | if self.onnx_trace: 79 | flat_embeddings = self.weights.detach().index_select(0, positions.view(-1)) 80 | embedding_shape = torch.cat((bsz.view(1), seq_len.view(1), torch.LongTensor([-1]))) 81 | embeddings = torch.onnx.operators.reshape_from_tensor_shape(flat_embeddings, embedding_shape) 82 | return embeddings 83 | return self.weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach() 84 | 85 | def max_positions(self): 86 | """Maximum number of supported positions.""" 87 | return int(1e5) # an arbitrary large number 88 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/transformer_sentence_encoder_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | from fairseq import utils 13 | from fairseq.modules import ( 14 | LayerNorm, 15 | MultiheadAttention, 16 | ) 17 | 18 | 19 | class TransformerSentenceEncoderLayer(nn.Module): 20 | """ 21 | Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained 22 | models. 23 | """ 24 | 25 | def __init__( 26 | self, 27 | embedding_dim: float = 768, 28 | ffn_embedding_dim: float = 3072, 29 | num_attention_heads: float = 8, 30 | dropout: float = 0.1, 31 | attention_dropout: float = 0.1, 32 | activation_dropout: float = 0.1, 33 | activation_fn: str = 'relu', 34 | add_bias_kv: bool = False, 35 | add_zero_attn: bool = False, 36 | export: bool = False, 37 | ) -> None: 38 | 39 | super().__init__() 40 | # Initialize parameters 41 | self.embedding_dim = embedding_dim 42 | self.dropout = dropout 43 | self.activation_dropout = activation_dropout 44 | 45 | # Initialize blocks 46 | self.activation_fn = utils.get_activation_fn(activation_fn) 47 | self.self_attn = MultiheadAttention( 48 | self.embedding_dim, 49 | num_attention_heads, 50 | dropout=attention_dropout, 51 | add_bias_kv=add_bias_kv, 52 | add_zero_attn=add_zero_attn, 53 | self_attention=True 54 | ) 55 | 56 | # layer norm associated with the self attention layer 57 | self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export) 58 | self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) 59 | self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) 60 | 61 | # layer norm associated with the position wise feed-forward NN 62 | self.final_layer_norm = LayerNorm(self.embedding_dim, export=export) 63 | 64 | def forward( 65 | self, 66 | x: torch.Tensor, 67 | self_attn_mask: torch.Tensor = None, 68 | self_attn_padding_mask: torch.Tensor = None, 69 | ): 70 | """ 71 | LayerNorm is applied either before or after the self-attention/ffn 72 | modules similar to the original Transformer implementation. 73 | """ 74 | residual = x 75 | x, attn = self.self_attn( 76 | query=x, 77 | key=x, 78 | value=x, 79 | key_padding_mask=self_attn_padding_mask, 80 | need_weights=False, 81 | attn_mask=self_attn_mask, 82 | ) 83 | x = F.dropout(x, p=self.dropout, training=self.training) 84 | x = residual + x 85 | x = self.self_attn_layer_norm(x) 86 | 87 | residual = x 88 | x = self.activation_fn(self.fc1(x)) 89 | x = F.dropout(x, p=self.activation_dropout, training=self.training) 90 | x = self.fc2(x) 91 | x = F.dropout(x, p=self.dropout, training=self.training) 92 | x = residual + x 93 | x = self.final_layer_norm(x) 94 | return x, attn 95 | -------------------------------------------------------------------------------- /2-4/386/fairseq/modules/unfold.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
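# A quick sketch of what unfold1d (defined below) produces: a sliding window of
# size K over the time dimension of a (T, B, C) tensor, here with left-only
# (causal) padding as used by the lightweight/dynamic convolutions. The sizes
# are arbitrary example values.
import torch

from fairseq.modules import unfold1d

T, B, C, K = 6, 2, 4, 3
x = torch.randn(T, B, C)
windows = unfold1d(x, kernel_size=K, padding_l=K - 1)   # pad on the left only
print(windows.shape)                                    # torch.Size([6, 2, 4, 3])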
7 | 8 | import torch.nn.functional as F 9 | 10 | 11 | def unfold1d(x, kernel_size, padding_l, pad_value=0): 12 | '''unfold T x B x C to T x B x C x K''' 13 | if kernel_size > 1: 14 | T, B, C = x.size() 15 | x = F.pad(x, (0, 0, 0, 0, padding_l, kernel_size - 1 - padding_l), value=pad_value) 16 | x = x.as_strided((T, B, C, kernel_size), (B*C, C, 1, B*C)) 17 | else: 18 | x = x.unsqueeze(3) 19 | return x 20 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import importlib 9 | import os 10 | 11 | from fairseq import registry 12 | from fairseq.optim.fairseq_optimizer import FairseqOptimizer 13 | from fairseq.optim.fp16_optimizer import FP16Optimizer, MemoryEfficientFP16Optimizer 14 | 15 | 16 | __all__ = [ 17 | 'FairseqOptimizer', 18 | 'FP16Optimizer', 19 | 'MemoryEfficientFP16Optimizer', 20 | ] 21 | 22 | 23 | _build_optimizer, register_optimizer, OPTIMIZER_REGISTRY = registry.setup_registry( 24 | '--optimizer', 25 | base_class=FairseqOptimizer, 26 | default='nag', 27 | ) 28 | 29 | 30 | def build_optimizer(args, params, *extra_args, **extra_kwargs): 31 | params = list(filter(lambda p: p.requires_grad, params)) 32 | return _build_optimizer(args, params, *extra_args, **extra_kwargs) 33 | 34 | 35 | # automatically import any Python files in the optim/ directory 36 | for file in os.listdir(os.path.dirname(__file__)): 37 | if file.endswith('.py') and not file.startswith('_'): 38 | module = file[:file.find('.py')] 39 | importlib.import_module('fairseq.optim.' + module) 40 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/adadelta.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.optim 9 | 10 | from . 
import FairseqOptimizer, register_optimizer 11 | 12 | 13 | @register_optimizer('adadelta') 14 | class Adadelta(FairseqOptimizer): 15 | def __init__(self, args, params): 16 | super().__init__(args, params) 17 | self._optimizer = torch.optim.Adadelta(params, **self.optimizer_config) 18 | 19 | @staticmethod 20 | def add_args(parser): 21 | """Add optimizer-specific arguments to the parser.""" 22 | # fmt: off 23 | parser.add_argument('--adadelta-rho', type=float, default=0.9, metavar='RHO', 24 | help='coefficient used for computing a running average of squared gradients') 25 | parser.add_argument('--adadelta-eps', type=float, default=1e-6, metavar='EPS', 26 | help='term added to the denominator to improve numerical stability') 27 | parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 28 | help='weight decay') 29 | parser.add_argument('--anneal-eps', action='store_true', help='flag to anneal eps') 30 | # fmt: on 31 | 32 | @property 33 | def optimizer_config(self): 34 | """ 35 | Return a kwarg dictionary that will be used to override optimizer 36 | args stored in checkpoints. This allows us to load a checkpoint and 37 | resume training using a different set of optimizer args, e.g., with a 38 | different learning rate. 39 | """ 40 | return { 41 | 'lr': self.args.lr[0], 42 | 'rho': self.args.adadelta_rho, 43 | 'eps': self.args.adadelta_eps, 44 | 'weight_decay': self.args.weight_decay, 45 | } 46 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/adagrad.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.optim 9 | 10 | from . import FairseqOptimizer, register_optimizer 11 | 12 | 13 | @register_optimizer('adagrad') 14 | class Adagrad(FairseqOptimizer): 15 | def __init__(self, args, params): 16 | super().__init__(args, params) 17 | self._optimizer = torch.optim.Adagrad(params, **self.optimizer_config) 18 | 19 | @staticmethod 20 | def add_args(parser): 21 | """Add optimizer-specific arguments to the parser.""" 22 | # fmt: off 23 | parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 24 | help='weight decay') 25 | # fmt: on 26 | 27 | @property 28 | def optimizer_config(self): 29 | """ 30 | Return a kwarg dictionary that will be used to override optimizer 31 | args stored in checkpoints. This allows us to load a checkpoint and 32 | resume training using a different set of optimizer args, e.g., with a 33 | different learning rate. 34 | """ 35 | return { 36 | 'lr': self.args.lr[0], 37 | 'weight_decay': self.args.weight_decay, 38 | } 39 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/fairseq_optimizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
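The adadelta.py and adagrad.py wrappers above both follow the same recipe: subclass FairseqOptimizer, wrap a torch.optim optimizer, and register the class so it becomes selectable via `--optimizer`. The sketch below shows a hypothetical third optimizer written in that style; the name `rmsprop` and its flags are illustrative and not part of this repo, and such a file would only be picked up automatically if placed in fairseq/optim/, since the package `__init__` imports every module in that directory.

```python
# Hypothetical fairseq/optim/rmsprop.py written in the same style as the
# adadelta/adagrad wrappers above; the name and flags are illustrative only.
import torch.optim

from . import FairseqOptimizer, register_optimizer


@register_optimizer('rmsprop')  # would be selected with --optimizer rmsprop
class RMSprop(FairseqOptimizer):
    def __init__(self, args, params):
        super().__init__(args, params)
        self._optimizer = torch.optim.RMSprop(params, **self.optimizer_config)

    @staticmethod
    def add_args(parser):
        parser.add_argument('--rmsprop-alpha', default=0.99, type=float, metavar='A',
                            help='smoothing constant')
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')

    @property
    def optimizer_config(self):
        # Same contract as the other optimizers: these kwargs override whatever
        # was stored in a checkpoint, so training can resume with new settings.
        return {
            'lr': self.args.lr[0],
            'alpha': self.args.rmsprop_alpha,
            'weight_decay': self.args.weight_decay,
        }
```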
7 | 8 | import math 9 | 10 | import torch 11 | 12 | 13 | class FairseqOptimizer(object): 14 | 15 | def __init__(self, args, params): 16 | super().__init__() 17 | self.args = args 18 | self.params = list(params) 19 | 20 | @staticmethod 21 | def add_args(parser): 22 | """Add optimizer-specific arguments to the parser.""" 23 | pass 24 | 25 | @property 26 | def optimizer(self): 27 | """Return a torch.optim.optimizer.Optimizer instance.""" 28 | if not hasattr(self, '_optimizer'): 29 | raise NotImplementedError 30 | if not isinstance(self._optimizer, torch.optim.Optimizer): 31 | raise ValueError('_optimizer must be an instance of torch.optim.Optimizer') 32 | return self._optimizer 33 | 34 | @property 35 | def optimizer_config(self): 36 | """ 37 | Return a kwarg dictionary that will be used to override optimizer 38 | args stored in checkpoints. This allows us to load a checkpoint and 39 | resume training using a different set of optimizer args, e.g., with a 40 | different learning rate. 41 | """ 42 | raise NotImplementedError 43 | 44 | def get_lr(self): 45 | """Return the current learning rate.""" 46 | return self.optimizer.param_groups[0]['lr'] 47 | 48 | def set_lr(self, lr): 49 | """Set the learning rate.""" 50 | for param_group in self.optimizer.param_groups: 51 | param_group['lr'] = lr 52 | 53 | def state_dict(self): 54 | """Return the optimizer's state dict.""" 55 | return self.optimizer.state_dict() 56 | 57 | def load_state_dict(self, state_dict, optimizer_overrides=None): 58 | """Load an optimizer state dict. 59 | 60 | In general we should prefer the configuration of the existing optimizer 61 | instance (e.g., learning rate) over that found in the state_dict. This 62 | allows us to resume training from a checkpoint using a new set of 63 | optimizer args. 64 | """ 65 | self.optimizer.load_state_dict(state_dict) 66 | 67 | if optimizer_overrides is not None and len(optimizer_overrides) > 0: 68 | # override learning rate, momentum, etc. with latest values 69 | for group in self.optimizer.param_groups: 70 | group.update(optimizer_overrides) 71 | 72 | def backward(self, loss): 73 | """Computes the sum of gradients of the given tensor w.r.t. graph leaves.""" 74 | loss.backward() 75 | 76 | def multiply_grads(self, c): 77 | """Multiplies grads by a constant *c*.""" 78 | for p in self.params: 79 | if p.grad is not None: 80 | p.grad.data.mul_(c) 81 | 82 | def clip_grad_norm(self, max_norm): 83 | """Clips gradient norm.""" 84 | if max_norm > 0: 85 | return torch.nn.utils.clip_grad_norm_(self.params, max_norm) 86 | else: 87 | return math.sqrt(sum(p.grad.data.norm()**2 for p in self.params if p.grad is not None)) 88 | 89 | def step(self, closure=None): 90 | """Performs a single optimization step.""" 91 | self.optimizer.step(closure) 92 | 93 | def zero_grad(self): 94 | """Clears the gradients of all optimized parameters.""" 95 | for group in self.optimizer.param_groups: 96 | for p in group['params']: 97 | p.grad = None 98 | self.optimizer.zero_grad() 99 | 100 | @property 101 | def supports_memory_efficient_fp16(self): 102 | if hasattr(self.optimizer, 'supports_memory_efficient_fp16'): 103 | return self.optimizer.supports_memory_efficient_fp16 104 | return False 105 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 
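The FairseqOptimizer wrapper above mainly standardizes how a trainer drives gradients. A rough sketch of one training step through that interface follows; `model`, `criterion`, and `batch` are placeholders rather than objects from this repo, and the normalization constant is illustrative.

```python
# Illustrative training step driven through the FairseqOptimizer API above.
# `model`, `criterion`, and `batch` are placeholders, not objects from this repo.
def train_step(model, criterion, batch, optimizer, max_norm=1.0, num_sents=1):
    model.train()
    optimizer.zero_grad()

    loss = criterion(model(batch['input']), batch['target'])

    optimizer.backward(loss)                         # plain loss.backward() underneath
    optimizer.multiply_grads(1.0 / num_sents)        # e.g. normalize by sentence count
    grad_norm = optimizer.clip_grad_norm(max_norm)   # max_norm=0 only measures, no clipping
    optimizer.step()
    return loss.item(), grad_norm
```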
3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import importlib 9 | import os 10 | 11 | from fairseq import registry 12 | from fairseq.optim.lr_scheduler.fairseq_lr_scheduler import FairseqLRScheduler 13 | 14 | 15 | build_lr_scheduler, register_lr_scheduler, LR_SCHEDULER_REGISTRY = registry.setup_registry( 16 | '--lr-scheduler', 17 | base_class=FairseqLRScheduler, 18 | default='fixed', 19 | ) 20 | 21 | # automatically import any Python files in the optim/lr_scheduler/ directory 22 | for file in os.listdir(os.path.dirname(__file__)): 23 | if file.endswith('.py') and not file.startswith('_'): 24 | module = file[:file.find('.py')] 25 | importlib.import_module('fairseq.optim.lr_scheduler.' + module) 26 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/fairseq_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from .. import FairseqOptimizer 9 | 10 | 11 | class FairseqLRScheduler(object): 12 | 13 | def __init__(self, args, optimizer): 14 | super().__init__() 15 | if not isinstance(optimizer, FairseqOptimizer): 16 | raise ValueError('optimizer must be an instance of FairseqOptimizer') 17 | self.args = args 18 | self.optimizer = optimizer 19 | self.best = None 20 | 21 | @staticmethod 22 | def add_args(parser): 23 | """Add arguments to the parser for this LR scheduler.""" 24 | pass 25 | 26 | def state_dict(self): 27 | """Return the LR scheduler state dict.""" 28 | return {'best': self.best} 29 | 30 | def load_state_dict(self, state_dict): 31 | """Load an LR scheduler state dict.""" 32 | self.best = state_dict['best'] 33 | 34 | def step(self, epoch, val_loss=None): 35 | """Update the learning rate at the end of the given epoch.""" 36 | if val_loss is not None: 37 | if self.best is None: 38 | self.best = val_loss 39 | else: 40 | self.best = min(self.best, val_loss) 41 | 42 | def step_update(self, num_updates): 43 | """Update the learning rate after each update.""" 44 | return self.optimizer.get_lr() 45 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/fixed_schedule.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from . import FairseqLRScheduler, register_lr_scheduler 9 | 10 | 11 | @register_lr_scheduler('fixed') 12 | class FixedSchedule(FairseqLRScheduler): 13 | """Decay the LR on a fixed schedule.""" 14 | 15 | def __init__(self, args, optimizer): 16 | super().__init__(args, optimizer) 17 | 18 | # set defaults 19 | args.warmup_updates = getattr(args, 'warmup_updates', 0) or 0 20 | 21 | self.lr = args.lr[0] 22 | if args.warmup_updates > 0: 23 | self.warmup_factor = 1. 
/ args.warmup_updates 24 | else: 25 | self.warmup_factor = 1 26 | 27 | @staticmethod 28 | def add_args(parser): 29 | """Add arguments to the parser for this LR scheduler.""" 30 | # fmt: off 31 | parser.add_argument('--force-anneal', '--fa', type=int, metavar='N', 32 | help='force annealing at specified epoch') 33 | parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', 34 | help='shrink factor for annealing, lr_new = (lr * lr_shrink)') 35 | parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', 36 | help='warmup the learning rate linearly for the first N updates') 37 | # fmt: on 38 | 39 | def get_next_lr(self, epoch): 40 | lrs = self.args.lr 41 | if self.args.force_anneal is None or epoch < self.args.force_anneal: 42 | # use fixed LR schedule 43 | next_lr = lrs[min(epoch, len(lrs) - 1)] 44 | else: 45 | # annneal based on lr_shrink 46 | next_lr = lrs[-1] * self.args.lr_shrink ** (epoch + 1 - self.args.force_anneal) 47 | return next_lr 48 | 49 | def step(self, epoch, val_loss=None): 50 | """Update the learning rate at the end of the given epoch.""" 51 | super().step(epoch, val_loss) 52 | self.lr = self.get_next_lr(epoch) 53 | self.optimizer.set_lr(self.warmup_factor * self.lr) 54 | return self.optimizer.get_lr() 55 | 56 | def step_update(self, num_updates): 57 | """Update the learning rate after each update.""" 58 | if self.args.warmup_updates > 0 and num_updates <= self.args.warmup_updates: 59 | self.warmup_factor = num_updates / float(self.args.warmup_updates) 60 | self.optimizer.set_lr(self.warmup_factor * self.lr) 61 | return self.optimizer.get_lr() 62 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/inverse_square_root_schedule.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from . import FairseqLRScheduler, register_lr_scheduler 9 | 10 | 11 | @register_lr_scheduler('inverse_sqrt') 12 | class InverseSquareRootSchedule(FairseqLRScheduler): 13 | """Decay the LR based on the inverse square root of the update number. 14 | 15 | We also support a warmup phase where we linearly increase the learning rate 16 | from some initial learning rate (``--warmup-init-lr``) until the configured 17 | learning rate (``--lr``). Thereafter we decay proportional to the number of 18 | updates, with a decay factor set to align with the configured learning rate. 19 | 20 | During warmup:: 21 | 22 | lrs = torch.linspace(args.warmup_init_lr, args.lr, args.warmup_updates) 23 | lr = lrs[update_num] 24 | 25 | After warmup:: 26 | 27 | decay_factor = args.lr * sqrt(args.warmup_updates) 28 | lr = decay_factor / sqrt(update_num) 29 | """ 30 | 31 | def __init__(self, args, optimizer): 32 | super().__init__(args, optimizer) 33 | if len(args.lr) > 1: 34 | raise ValueError( 35 | 'Cannot use a fixed learning rate schedule with inverse_sqrt.' 36 | ' Consider --lr-scheduler=fixed instead.' 37 | ) 38 | warmup_end_lr = args.lr[0] 39 | if args.warmup_init_lr < 0: 40 | args.warmup_init_lr = warmup_end_lr 41 | 42 | # linearly warmup for the first args.warmup_updates 43 | self.lr_step = (warmup_end_lr - args.warmup_init_lr) / args.warmup_updates 44 | 45 | # then, decay prop. 
to the inverse square root of the update number 46 | self.decay_factor = warmup_end_lr * args.warmup_updates**0.5 47 | 48 | # initial learning rate 49 | self.lr = args.warmup_init_lr 50 | self.optimizer.set_lr(self.lr) 51 | 52 | @staticmethod 53 | def add_args(parser): 54 | """Add arguments to the parser for this LR scheduler.""" 55 | # fmt: off 56 | parser.add_argument('--warmup-updates', default=4000, type=int, metavar='N', 57 | help='warmup the learning rate linearly for the first N updates') 58 | parser.add_argument('--warmup-init-lr', default=-1, type=float, metavar='LR', 59 | help='initial learning rate during warmup phase; default is args.lr') 60 | # fmt: on 61 | 62 | def step(self, epoch, val_loss=None): 63 | """Update the learning rate at the end of the given epoch.""" 64 | super().step(epoch, val_loss) 65 | # we don't change the learning rate at epoch boundaries 66 | return self.optimizer.get_lr() 67 | 68 | def step_update(self, num_updates): 69 | """Update the learning rate after each update.""" 70 | if num_updates < self.args.warmup_updates: 71 | self.lr = self.args.warmup_init_lr + num_updates*self.lr_step 72 | else: 73 | self.lr = self.decay_factor * num_updates**-0.5 74 | self.optimizer.set_lr(self.lr) 75 | return self.lr 76 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/polynomial_decay_schedule.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from . import FairseqLRScheduler, register_lr_scheduler 9 | 10 | 11 | @register_lr_scheduler('polynomial_decay') 12 | class PolynomialDecaySchedule(FairseqLRScheduler): 13 | """Decay the LR on a fixed schedule.""" 14 | 15 | def __init__(self, args, optimizer): 16 | super().__init__(args, optimizer) 17 | 18 | # set defaults 19 | args.warmup_updates = getattr(args, 'warmup_updates', 0) or 0 20 | 21 | self.lr = args.lr[0] 22 | if args.warmup_updates > 0: 23 | self.warmup_factor = 1. 
/ args.warmup_updates 24 | else: 25 | self.warmup_factor = 1 26 | self.end_learning_rate = args.end_learning_rate 27 | self.total_num_update = args.total_num_update 28 | self.power = args.power 29 | self.optimizer.set_lr(self.warmup_factor * self.lr) 30 | 31 | @staticmethod 32 | def add_args(parser): 33 | """Add arguments to the parser for this LR scheduler.""" 34 | parser.add_argument('--force-anneal', '--fa', type=int, metavar='N', 35 | help='force annealing at specified epoch') 36 | parser.add_argument('--warmup-updates', default=0, type=int, metavar='N', 37 | help='warmup the learning rate linearly for the first N updates') 38 | parser.add_argument('--end-learning-rate', default=0.0, type=float) 39 | parser.add_argument('--power', default=1.0, type=float) 40 | parser.add_argument('--total-num-update', default=1000000, type=int) 41 | 42 | def get_next_lr(self, epoch): 43 | lrs = self.args.lr 44 | if self.args.force_anneal is None or epoch < self.args.force_anneal: 45 | # use fixed LR schedule 46 | next_lr = lrs[min(epoch, len(lrs) - 1)] 47 | else: 48 | # annneal based on lr_shrink 49 | next_lr = self.optimizer.get_lr() 50 | return next_lr 51 | 52 | def step(self, epoch, val_loss=None): 53 | """Update the learning rate at the end of the given epoch.""" 54 | super().step(epoch, val_loss) 55 | self.lr = self.get_next_lr(epoch) 56 | self.optimizer.set_lr(self.warmup_factor * self.lr) 57 | return self.optimizer.get_lr() 58 | 59 | def step_update(self, num_updates): 60 | """Update the learning rate after each update.""" 61 | if self.args.warmup_updates > 0 and num_updates <= self.args.warmup_updates: 62 | self.warmup_factor = num_updates / float(self.args.warmup_updates) 63 | self.optimizer.set_lr(self.warmup_factor * self.lr) 64 | else: 65 | warmup = self.args.warmup_updates 66 | lr_range = self.lr - self.end_learning_rate 67 | pct_remaining = 1 - (num_updates - warmup) / (self.total_num_update - warmup) 68 | lr = lr_range * pct_remaining ** (self.power) + self.end_learning_rate 69 | self.optimizer.set_lr(lr) 70 | return self.optimizer.get_lr() 71 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/reduce_lr_on_plateau.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.optim.lr_scheduler 9 | 10 | from . import FairseqLRScheduler, register_lr_scheduler 11 | 12 | 13 | @register_lr_scheduler('reduce_lr_on_plateau') 14 | class ReduceLROnPlateau(FairseqLRScheduler): 15 | """Decay the LR by a factor every time the validation loss plateaus.""" 16 | 17 | def __init__(self, args, optimizer): 18 | super().__init__(args, optimizer) 19 | if len(args.lr) > 1: 20 | raise ValueError( 21 | 'Cannot use a fixed learning rate schedule with reduce_lr_on_plateau.' 22 | ' Consider --lr-scheduler=fixed instead.' 
23 | ) 24 | self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( 25 | self.optimizer.optimizer, patience=0, factor=args.lr_shrink, 26 | threshold=args.lr_threshold) 27 | 28 | @staticmethod 29 | def add_args(parser): 30 | """Add arguments to the parser for this LR scheduler.""" 31 | # fmt: off 32 | parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', 33 | help='shrink factor for annealing, lr_new = (lr * lr_shrink)') 34 | parser.add_argument('--lr-threshold', default=1e-4, type=float, metavar='LT', 35 | help='Threshold for measuring the new optimum, \ 36 | to only focus on significant changes') 37 | # fmt: on 38 | 39 | def state_dict(self): 40 | """Return the LR scheduler state dict.""" 41 | return { 42 | 'best': self.lr_scheduler.best, 43 | 'last_epoch': self.lr_scheduler.last_epoch, 44 | } 45 | 46 | def load_state_dict(self, state_dict): 47 | """Load an LR scheduler state dict.""" 48 | self.lr_scheduler.best = state_dict['best'] 49 | if 'last_epoch' in state_dict: 50 | self.lr_scheduler.last_epoch = state_dict['last_epoch'] 51 | 52 | def step(self, epoch, val_loss=None): 53 | """Update the learning rate at the end of the given epoch.""" 54 | if val_loss is not None: 55 | self.lr_scheduler.step(val_loss, epoch) 56 | else: 57 | self.lr_scheduler.last_epoch = epoch 58 | return self.optimizer.get_lr() 59 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/lr_scheduler/triangular_lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import math 9 | 10 | from . import FairseqLRScheduler, register_lr_scheduler 11 | 12 | 13 | @register_lr_scheduler('triangular') 14 | class TriangularSchedule(FairseqLRScheduler): 15 | """Assign LR based on a triangular cyclical schedule. 16 | 17 | See https://arxiv.org/pdf/1506.01186.pdf for details. 18 | """ 19 | 20 | def __init__(self, args, optimizer): 21 | super().__init__(args, optimizer) 22 | if len(args.lr) > 1: 23 | raise ValueError( 24 | 'Cannot use a fixed learning rate schedule with triangular.' 25 | ' Consider --lr-scheduler=fixed instead.' 
26 | ) 27 | 28 | lr = args.lr[0] 29 | 30 | assert args.max_lr > lr, 'max_lr must be more than lr' 31 | self.min_lr = lr 32 | self.max_lr = args.max_lr 33 | self.stepsize = args.lr_period_updates // 2 34 | self.lr_shrink = args.lr_shrink 35 | self.shrink_min = args.shrink_min 36 | 37 | # initial learning rate 38 | self.lr = self.min_lr 39 | self.optimizer.set_lr(self.lr) 40 | 41 | @staticmethod 42 | def add_args(parser): 43 | """Add arguments to the parser for this LR scheduler.""" 44 | # fmt: off 45 | parser.add_argument('--max-lr', required=True, type=float, metavar='LR', 46 | help='max learning rate, must be more than args.lr') 47 | parser.add_argument('--lr-period-updates', default=5000, type=float, metavar='LR', 48 | help='initial number of updates per period (cycle length)') 49 | parser.add_argument('--lr-shrink', default=0.1, type=float, metavar='LS', 50 | help='shrink factor for annealing') 51 | parser.add_argument('--shrink-min', action='store_true', 52 | help='if set, also shrinks min lr') 53 | # fmt: on 54 | 55 | def step(self, epoch, val_loss=None): 56 | """Update the learning rate at the end of the given epoch.""" 57 | super().step(epoch, val_loss) 58 | # we don't change the learning rate at epoch boundaries 59 | return self.optimizer.get_lr() 60 | 61 | def step_update(self, num_updates): 62 | """Update the learning rate after each update.""" 63 | cycle = math.floor(num_updates / (2 * self.stepsize)) 64 | 65 | lr_shrink = self.lr_shrink ** cycle 66 | max_lr = self.max_lr * lr_shrink 67 | if self.shrink_min: 68 | min_lr = self.min_lr * lr_shrink 69 | else: 70 | min_lr = self.min_lr 71 | 72 | x = abs(num_updates / self.stepsize - 2 * (cycle + 1) + 1) 73 | self.lr = min_lr + (max_lr - min_lr) * max(0, (1 - x)) 74 | 75 | self.optimizer.set_lr(self.lr) 76 | return self.lr 77 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/nag.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch 9 | from torch.optim.optimizer import Optimizer, required 10 | 11 | from . import FairseqOptimizer, register_optimizer 12 | 13 | 14 | @register_optimizer('nag') 15 | class FairseqNAG(FairseqOptimizer): 16 | def __init__(self, args, params): 17 | super().__init__(args, params) 18 | self._optimizer = NAG(params, **self.optimizer_config) 19 | 20 | @staticmethod 21 | def add_args(parser): 22 | """Add optimizer-specific arguments to the parser.""" 23 | # fmt: off 24 | parser.add_argument('--momentum', default=0.99, type=float, metavar='M', 25 | help='momentum factor') 26 | parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 27 | help='weight decay') 28 | # fmt: on 29 | 30 | @property 31 | def optimizer_config(self): 32 | """ 33 | Return a kwarg dictionary that will be used to override optimizer 34 | args stored in checkpoints. This allows us to load a checkpoint and 35 | resume training using a different set of optimizer args, e.g., with a 36 | different learning rate. 
37 | """ 38 | return { 39 | 'lr': self.args.lr[0], 40 | 'momentum': self.args.momentum, 41 | 'weight_decay': self.args.weight_decay, 42 | } 43 | 44 | 45 | class NAG(Optimizer): 46 | def __init__(self, params, lr=required, momentum=0, weight_decay=0): 47 | defaults = dict(lr=lr, lr_old=lr, momentum=momentum, weight_decay=weight_decay) 48 | super(NAG, self).__init__(params, defaults) 49 | 50 | @property 51 | def supports_memory_efficient_fp16(self): 52 | return True 53 | 54 | def step(self, closure=None): 55 | """Performs a single optimization step. 56 | 57 | Arguments: 58 | closure (callable, optional): A closure that reevaluates the model 59 | and returns the loss. 60 | """ 61 | loss = None 62 | if closure is not None: 63 | loss = closure() 64 | 65 | for group in self.param_groups: 66 | weight_decay = group['weight_decay'] 67 | momentum = group['momentum'] 68 | lr = group['lr'] 69 | lr_old = group.get('lr_old', lr) 70 | lr_correct = lr / lr_old 71 | 72 | for p in group['params']: 73 | if p.grad is None: 74 | continue 75 | 76 | p_data_fp32 = p.data.float() 77 | 78 | d_p = p.grad.data.float() 79 | param_state = self.state[p] 80 | if 'momentum_buffer' not in param_state: 81 | param_state['momentum_buffer'] = torch.zeros_like(d_p) 82 | else: 83 | param_state['momentum_buffer'] = param_state['momentum_buffer'].type_as(d_p) 84 | 85 | buf = param_state['momentum_buffer'] 86 | 87 | if weight_decay != 0: 88 | p_data_fp32.mul_(1 - lr * weight_decay) 89 | p_data_fp32.add_(momentum * momentum * lr_correct, buf) 90 | p_data_fp32.add_(-(1 + momentum) * lr, d_p) 91 | 92 | buf.mul_(momentum * lr_correct).add_(-lr, d_p) 93 | 94 | p.data.copy_(p_data_fp32) 95 | 96 | group['lr_old'] = lr 97 | 98 | return loss 99 | -------------------------------------------------------------------------------- /2-4/386/fairseq/optim/sgd.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import torch.optim 9 | 10 | from . import FairseqOptimizer, register_optimizer 11 | 12 | 13 | @register_optimizer('sgd') 14 | class SGD(FairseqOptimizer): 15 | def __init__(self, args, params): 16 | super().__init__(args, params) 17 | self._optimizer = torch.optim.SGD(params, **self.optimizer_config) 18 | 19 | @staticmethod 20 | def add_args(parser): 21 | """Add optimizer-specific arguments to the parser.""" 22 | # fmt: off 23 | parser.add_argument('--momentum', default=0.0, type=float, metavar='M', 24 | help='momentum factor') 25 | parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', 26 | help='weight decay') 27 | # fmt: on 28 | 29 | @property 30 | def optimizer_config(self): 31 | """ 32 | Return a kwarg dictionary that will be used to override optimizer 33 | args stored in checkpoints. This allows us to load a checkpoint and 34 | resume training using a different set of optimizer args, e.g., with a 35 | different learning rate. 
36 | """ 37 | return { 38 | 'lr': self.args.lr[0], 39 | 'momentum': self.args.momentum, 40 | 'weight_decay': self.args.weight_decay, 41 | } 42 | -------------------------------------------------------------------------------- /2-4/386/fairseq/pdb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import multiprocessing 9 | import os 10 | import pdb 11 | import sys 12 | 13 | 14 | __all__ = ['set_trace'] 15 | 16 | 17 | _stdin = [None] 18 | _stdin_lock = multiprocessing.Lock() 19 | try: 20 | _stdin_fd = sys.stdin.fileno() 21 | except Exception: 22 | _stdin_fd = None 23 | 24 | 25 | class MultiprocessingPdb(pdb.Pdb): 26 | """A Pdb wrapper that works in a multiprocessing environment. 27 | 28 | Usage: `from fairseq import pdb; pdb.set_trace()` 29 | """ 30 | 31 | def __init__(self): 32 | pdb.Pdb.__init__(self, nosigint=True) 33 | 34 | def _cmdloop(self): 35 | stdin_bak = sys.stdin 36 | with _stdin_lock: 37 | try: 38 | if _stdin_fd is not None: 39 | if not _stdin[0]: 40 | _stdin[0] = os.fdopen(_stdin_fd) 41 | sys.stdin = _stdin[0] 42 | self.cmdloop() 43 | finally: 44 | sys.stdin = stdin_bak 45 | 46 | 47 | def set_trace(): 48 | pdb = MultiprocessingPdb() 49 | pdb.set_trace(sys._getframe().f_back) 50 | -------------------------------------------------------------------------------- /2-4/386/fairseq/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
7 | 8 | 9 | REGISTRIES = {} 10 | 11 | 12 | def setup_registry( 13 | registry_name: str, 14 | base_class=None, 15 | default=None, 16 | ): 17 | assert registry_name.startswith('--') 18 | registry_name = registry_name[2:].replace('-', '_') 19 | 20 | REGISTRY = {} 21 | REGISTRY_CLASS_NAMES = set() 22 | 23 | # maintain a registry of all registries 24 | if registry_name in REGISTRIES: 25 | raise ValueError('Canot setup duplicate registry: {}'.format(registry_name)) 26 | REGISTRIES[registry_name] = { 27 | 'registry': REGISTRY, 28 | 'default': default, 29 | } 30 | 31 | def build_x(args, *extra_args, **extra_kwargs): 32 | choice = getattr(args, registry_name, None) 33 | if choice is None: 34 | return None 35 | cls = REGISTRY[choice] 36 | if hasattr(cls, 'build_' + registry_name): 37 | builder = getattr(cls, 'build_' + registry_name) 38 | else: 39 | builder = cls 40 | return builder(args, *extra_args, **extra_kwargs) 41 | 42 | def register_x(name): 43 | 44 | def register_x_cls(cls): 45 | if name in REGISTRY: 46 | raise ValueError('Cannot register duplicate {} ({})'.format(registry_name, name)) 47 | if cls.__name__ in REGISTRY_CLASS_NAMES: 48 | raise ValueError( 49 | 'Cannot register {} with duplicate class name ({})'.format( 50 | registry_name, cls.__name__, 51 | ) 52 | ) 53 | if base_class is not None and not issubclass(cls, base_class): 54 | raise ValueError('{} must extend {}'.format(cls.__name__, base_class.__name__)) 55 | REGISTRY[name] = cls 56 | REGISTRY_CLASS_NAMES.add(cls.__name__) 57 | return cls 58 | 59 | return register_x_cls 60 | 61 | return build_x, register_x, REGISTRY 62 | -------------------------------------------------------------------------------- /2-4/386/fairseq/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | import argparse 9 | import importlib 10 | import os 11 | 12 | from .fairseq_task import FairseqTask 13 | 14 | TASK_REGISTRY = {} 15 | TASK_CLASS_NAMES = set() 16 | 17 | 18 | def setup_task(args, **kwargs): 19 | return TASK_REGISTRY[args.task].setup_task(args, **kwargs) 20 | 21 | 22 | def register_task(name): 23 | """ 24 | New tasks can be added to fairseq with the 25 | :func:`~fairseq.tasks.register_task` function decorator. 26 | 27 | For example:: 28 | 29 | @register_task('classification') 30 | class ClassificationTask(FairseqTask): 31 | (...) 32 | 33 | .. note:: 34 | 35 | All Tasks must implement the :class:`~fairseq.tasks.FairseqTask` 36 | interface. 
37 | 38 | Please see the 39 | 40 | Args: 41 | name (str): the name of the task 42 | """ 43 | 44 | def register_task_cls(cls): 45 | if name in TASK_REGISTRY: 46 | raise ValueError('Cannot register duplicate task ({})'.format(name)) 47 | if not issubclass(cls, FairseqTask): 48 | raise ValueError('Task ({}: {}) must extend FairseqTask'.format(name, cls.__name__)) 49 | if cls.__name__ in TASK_CLASS_NAMES: 50 | raise ValueError('Cannot register task with duplicate class name ({})'.format(cls.__name__)) 51 | TASK_REGISTRY[name] = cls 52 | TASK_CLASS_NAMES.add(cls.__name__) 53 | return cls 54 | 55 | return register_task_cls 56 | 57 | 58 | # automatically import any Python files in the tasks/ directory 59 | for file in os.listdir(os.path.dirname(__file__)): 60 | if file.endswith('.py') and not file.startswith('_'): 61 | task_name = file[:file.find('.py')] 62 | importlib.import_module('fairseq.tasks.' + task_name) 63 | 64 | # expose `task_parser` for sphinx 65 | if task_name in TASK_REGISTRY: 66 | parser = argparse.ArgumentParser(add_help=False) 67 | group_task = parser.add_argument_group('Task name') 68 | # fmt: off 69 | group_task.add_argument('--task', metavar=task_name, 70 | help='Enable this task with: ``--task=' + task_name + '``') 71 | # fmt: on 72 | group_args = parser.add_argument_group('Additional command-line arguments') 73 | TASK_REGISTRY[task_name].add_args(group_args) 74 | globals()[task_name + '_parser'] = parser 75 | 76 | 77 | def get_task(name): 78 | return TASK_REGISTRY[name] 79 | -------------------------------------------------------------------------------- /2-4/386/fairseq/tasks/translation_from_pretrained_xlm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 7 | 8 | from fairseq.data.masked_lm_dictionary import MaskedLMDictionary 9 | from fairseq.tasks.translation import TranslationTask 10 | 11 | from . import register_task 12 | 13 | 14 | @register_task("translation_from_pretrained_xlm") 15 | class TranslationFromPretrainedXLMTask(TranslationTask): 16 | """ 17 | Same as TranslationTask except use the MaskedLMDictionary class so that 18 | we can load data that was binarized with the MaskedLMDictionary class. 19 | 20 | This task should be used for the entire training pipeline when we want to 21 | train an NMT model from a pretrained XLM checkpoint: binarizing NMT data, 22 | training NMT with the pretrained XLM checkpoint, and subsequent evaluation 23 | of that trained model. 24 | """ 25 | 26 | @classmethod 27 | def load_dictionary(cls, filename): 28 | """Load the masked LM dictionary from the filename 29 | 30 | Args: 31 | filename (str): the filename 32 | """ 33 | return MaskedLMDictionary.load(filename) 34 | -------------------------------------------------------------------------------- /2-4/386/fairseq/tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the LICENSE file in 5 | # the root directory of this source tree. An additional grant of patent rights 6 | # can be found in the PATENTS file in the same directory. 
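register_task above works the same way as the optimizer and LR-scheduler registries, and TranslationFromPretrainedXLMTask shows the minimal subclassing pattern. Below is a hypothetical task registration in that style; the task name, extra flag, and module are illustrative and not part of this repo, and the file would need to live in fairseq/tasks/ (or be imported explicitly) for the auto-import loop to pick it up.

```python
# Hypothetical fairseq/tasks/my_gec_task.py following the registration
# pattern above; the task name and the extra flag are illustrative only.
from fairseq.tasks import register_task
from fairseq.tasks.translation import TranslationTask


@register_task('my_gec_task')  # would be enabled with --task my_gec_task
class MyGECTask(TranslationTask):
    """A TranslationTask variant with one extra command-line option."""

    @staticmethod
    def add_args(parser):
        TranslationTask.add_args(parser)
        parser.add_argument('--my-extra-flag', action='store_true',
                            help='illustrative task-specific option')
```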
7 | 8 | import re 9 | 10 | SPACE_NORMALIZER = re.compile(r"\s+") 11 | 12 | 13 | def tokenize_line(line): 14 | line = SPACE_NORMALIZER.sub(" ", line) 15 | line = line.strip() 16 | return line.split() 17 | -------------------------------------------------------------------------------- /2-4/386/meter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | 5 | class Meter(object): 6 | def __init__(self): 7 | self.init() 8 | 9 | def init(self): 10 | self.start = time.time() 11 | self.cnt_add = 0 12 | self.tot_loss = 0. 13 | self.cnt_sent = 0 14 | self.cnt_token = 0 15 | 16 | def add(self, loss, n_sent, n_token): 17 | self.cnt_add += 1 18 | self.tot_loss += loss * n_sent 19 | self.cnt_sent += n_sent 20 | self.cnt_token += n_token 21 | 22 | def average(self): 23 | loss_sent = self.tot_loss / self.cnt_sent if self.cnt_sent != 0 else 0. 24 | loss_token = self.tot_loss / self.cnt_token if self.cnt_token != 0 else 0. 25 | return loss_sent, loss_token 26 | 27 | def elapsed_time(self): 28 | return time.time() - self.start 29 | 30 | def print_str(self, time_avg=False): 31 | loss_sent, loss_token = self.average() 32 | et = self.elapsed_time() 33 | time_str = f"{et * 1000. / self.cnt_add:6.2f} ms/batch" if time_avg else f"{et:6.2f} s" 34 | return f"{time_str} | loss_sent {loss_sent:6.2f} | token_ppl {math.exp(loss_token):6.2f}" 35 | 36 | 37 | -------------------------------------------------------------------------------- /2-4/386/nsml_model/best/model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/386/nsml_model/best/model/model.pt -------------------------------------------------------------------------------- /2-4/386/requirements.txt: -------------------------------------------------------------------------------- 1 | #nsml: reg.navercorp.com/chatbot/larva:latest 2 | #nsml: registry.navercorp.com/gyuwankim/airush-gec:latest 3 | 4 | 5 | nltk 6 | scikit-learn 7 | tokenizers 8 | transformers==4.6.1 -------------------------------------------------------------------------------- /2-4/386/wordpiece.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import os 4 | import json # import json module 5 | 6 | from tokenizers import BertWordPieceTokenizer 7 | from transformers import BertTokenizer 8 | 9 | from data_loader import read_strings 10 | 11 | import nsml 12 | from nsml import DATASET_PATH 13 | 14 | def get_args(): 15 | parser = argparse.ArgumentParser() 16 | 17 | parser.add_argument("--data_dir", type=str, default=os.path.join(DATASET_PATH, 'train')) 18 | parser.add_argument("--vocab_size", type=int, default=6000) # 만들 Vocab의 숫자 19 | parser.add_argument("--limit_alphabet", type=int, default=6000) 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | def postprocess_state(sentence : str) -> str: 25 | """TRADE, SUMBT postprocessing 26 | Args: 27 | state (List[str]): state prediction 28 | Returns: 29 | List[str]: postprocessing state 30 | """ 31 | sentence = sentence.replace(" : ", ":").replace(" , ", ", ").replace('( ', '(').replace(' )', ')').replace(' & ', '&').replace(' = ', '=') 32 | sentence = sentence.replace(" % ", "%").replace(' ~ ', '~').replace(' ^ ', '^') 33 | if sentence.endswith(' ~'): 34 | sentence = sentence.replace(' ~', '~') 35 | if sentence.endswith(' ^^'): 36 | sentence = sentence.replace(' 
^^', '^^') 37 | if sentence.endswith(' ^'): 38 | sentence = sentence.replace(' ^', '^') 39 | if sentence.endswith('......'): 40 | sentence = sentence.replace('......', ' ......') 41 | sentence = sentence.replace(') 에', ')에').replace('곳 (', '곳(').replace('부터~트', '부터~ 트').replace('# 정왕동', '#정왕동') 42 | sentence = sentence.replace('쨘 -', '쨘-').replace('해드리겠습니다!', '해드리겠습니다 !').replace('6 / 6', '6/6').replace('6 / 4', '6/4') 43 | sentence = sentence.replace('> ㅋ', '>ㅋ').replace('이상~헤', '이상~ 헤').replace('6 / 6', '6/6').replace('6 / 4', '6/4') 44 | 45 | return sentence 46 | 47 | def main(): 48 | args = get_args() 49 | 50 | tokenizer = BertWordPieceTokenizer( 51 | clean_text=True, 52 | handle_chinese_chars=True, 53 | strip_accents=False, # Must be False if cased model 54 | lowercase=False, 55 | wordpieces_prefix="##" 56 | ) 57 | 58 | noisy_sents = read_strings(os.path.join(args.data_dir, "train_data", "train_data")) 59 | annotations = read_strings(os.path.join(args.data_dir, "train_data", "train_annotation")) 60 | corpuses = read_strings(os.path.join(args.data_dir, "train_data", "train_corpus")) 61 | clean_sents = read_strings(os.path.join(args.data_dir, "train_label")) 62 | 63 | corpus = noisy_sents + clean_sents + corpuses 64 | print(len(corpus)) 65 | print(len(list(set(corpus)))) 66 | 67 | tokenizer.train_from_iterator( 68 | corpus, 69 | limit_alphabet=args.limit_alphabet, 70 | vocab_size=args.vocab_size 71 | ) 72 | 73 | vocab_path = f"custom_{args.limit_alphabet}_{args.vocab_size}_tokenizer" 74 | tokenizer.save(vocab_path, True) 75 | 76 | vocab_file = f"custom_{args.limit_alphabet}_{args.vocab_size}_tokenizer.txt" 77 | f = open(vocab_file,'w',encoding='utf-8') 78 | with open(vocab_path) as json_file: 79 | json_data = json.load(json_file) 80 | for item in json_data["model"]["vocab"].keys(): 81 | f.write(item+'\n') 82 | 83 | f.close() 84 | 85 | tokenizer = BertTokenizer(vocab_file=vocab_file, do_lower_case=False) 86 | 87 | # print(f"vocab size is {tokenizer.vocab_size}") 88 | # print('-' * 50) 89 | 90 | # for i, string in enumerate(corpus): 91 | # postprocess_string = postprocess_state(tokenizer.decode([tok for tok in tokenizer.encode(string) if tok >= 4])) 92 | # if string != postprocess_string: 93 | # if tokenizer.encode(postprocess_string) != tokenizer.encode(string): 94 | # print(f"[바꾼] {postprocess_string}") 95 | # print(f"[이전] {string}") 96 | # print(f"[인코딩바꾼] {tokenizer.encode(postprocess_string)}") 97 | # print(f"[인코딩이전] {tokenizer.encode(string)}") 98 | # print() 99 | 100 | # if not i % 1000: 101 | # print(i) 102 | 103 | if __name__ == "__main__": 104 | main() -------------------------------------------------------------------------------- /2-4/487/README.md: -------------------------------------------------------------------------------- 1 | # 2-4 스마트에디터의 그래머리 (문장 교정/교열) 기능 고도화 2 | 3 | - 네이버 사용자가 작성한 문장을 문법적으로 맞는 문장으로 교정/교열 하는 모델을 만듭니다. 4 | 5 | 6 | ## 데이터 7 | - 학습데이터 8 | * `train/train_data/train_data`: 문법 오류가 섞인 문장 9 | * `train/train_data/train_annotation`: 문법 오류에 대한 annotation 10 | * `train/train_data/train_corpus`: 교정되지 않은 문장 11 | * `train/train_label`: 교정/교열된 문장 12 | - 평가 데이터 13 | * `test/test_data`: 문법 오류가 섞인 문장 14 | * `test/test_label`: 교정/교열된 문장 15 | - 평가 더미 데이 16 | * `test_submit/test_data`: 문법 오류가 섞인 문장 17 | * `test_submit/test_label`: 교정/교열된 문장 18 | - 문법 오류가 섞인 문장들(`*_data`)과 교정/교열된 문장들(`*_label`)은 line-by-line으로 매핑됩니다. 
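The bullet above states that the `*_data` and `*_label` files are mapped line by line. A minimal sketch of loading them as (noisy, corrected) pairs is shown below; it assumes an NSML session (for `DATASET_PATH`) and uses the `read_strings` helper that both entries in this archive define (data_loader.py in 386, utils/utils.py in 487).

```python
# Minimal sketch of pairing noisy inputs with their corrected labels, following
# the line-by-line mapping described above. Paths assume an NSML session; the
# read_strings helper comes from this repo (data_loader.py / utils/utils.py).
import os

from nsml import DATASET_PATH
from data_loader import read_strings  # in the 487 entry: from utils.utils import read_strings

noisy = read_strings(os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data'))
clean = read_strings(os.path.join(DATASET_PATH, 'train', 'train_label'))

assert len(noisy) == len(clean)   # i-th noisy line maps to i-th corrected line
pairs = list(zip(noisy, clean))
print(len(pairs), pairs[0])
```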
19 | 20 | 21 | ## 평가 22 | - Corpus-level [GLEU](https://www.aclweb.org/anthology/P07-1044/) score로 평가 23 | - [`nltk.translate.gleu_score.corpus_gleu`](https://www.nltk.org/_modules/nltk/translate/gleu_score.html) 스크립트를 사용 24 | 25 | 26 | ## 베이스라인 27 | - [Transformer](https://arxiv.org/abs/1706.03762) 기반의 sequence-to-sequence 모델 28 | - 대량의 unlabeled corpus (`train_corpus`)를 활용하여 pre-training (또는 semi-supervised learning) 방식으로 학습하거나 에러 타입 (`train_annotation`)을 예측하도록 multi-task learning을 하면 추가 성능 향상을 얻을 수도 있습니다. 29 | 30 | 31 | ## 모델 학습 32 | ``` 33 | nsml run -d airush2021-2-4 -e train.py 34 | ``` 35 | - 필요에 따라 `-a`로 argument 입력 가능 36 | 37 | 38 | ## 모델 제출 39 | ``` 40 | nsml submit {SESSION} {CHECKPOINT} 41 | ``` 42 | 43 | ## 추가 정보 44 | 45 | ### Annotation 설명 46 | 47 | - "perfect" : 교정/교열이 필요없는 완벽한 문장 48 | - "spacing" : 띄어쓰기 교정 49 | - "pasting" : 붙여쓰기 교정 50 | - "tense" : 시제 교정 51 | - "honorific" : 경어체 교정 52 | - "punctuation" : 구두점 교정 53 | - "typo" : 오탈자 교정 (위 분류에 없는 경우 모두 수렴) 54 | - "advanced" : 윤문 처리 (더 매끄러운 문장) 55 | -------------------------------------------------------------------------------- /2-4/487/data_loader.py: -------------------------------------------------------------------------------- 1 | """This file is not really used""" 2 | 3 | import os 4 | 5 | from nsml import DATASET_PATH 6 | from utils.utils import read_strings, write_strings 7 | 8 | 9 | def test_data_loader(root_path): 10 | return read_strings(os.path.join(root_path, 'test', 'test_data')) 11 | 12 | 13 | def feed_infer(output_file, infer_func): 14 | prediciton = infer_func(test_data_loader(DATASET_PATH)) 15 | print('write output') 16 | write_strings(output_file, prediciton) 17 | if os.stat(output_file).st_size == 0: 18 | raise AssertionError('output result of inference is nothing') 19 | -------------------------------------------------------------------------------- /2-4/487/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from nltk.translate.gleu_score import corpus_gleu 4 | 5 | from utils.utils import read_strings 6 | 7 | 8 | def em(prediction, ground_truth): 9 | return sum([x == y for x, y in zip(prediction, ground_truth) 10 | ]) / len(ground_truth) * 100. 11 | 12 | 13 | def gleu(prediction, ground_truth): 14 | return corpus_gleu([[x] for x in ground_truth], prediction) * 100. 
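A small sanity check for the em()/gleu() helpers above may be useful. Since plain, untokenized strings are passed to corpus_gleu, each sentence is effectively scored over its characters; the snippet below is a toy illustration only, not part of the evaluation pipeline, and assumes 2-4/487/evaluation.py is importable from the working directory.

```python
# Toy sanity check for the em()/gleu() helpers above; because untokenized
# strings are passed to corpus_gleu, scoring is effectively character-level.
from evaluation import em, gleu  # assumes 2-4/487/evaluation.py is importable

ground_truth = ['오늘 날씨가 좋다.', '내일 만나요.']
prediction = ['오늘 날씨가 좋다.', '내일 만나용.']

print(em(prediction, ground_truth))    # 50.0 -> exact sentence matches, in percent
print(gleu(prediction, ground_truth))  # high but below 100 due to the one-character error
```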
15 | 16 | 17 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 18 | try: 19 | prediction = read_strings(prediction_file) 20 | ground_truth = read_strings(ground_truth_file) 21 | score = gleu(prediction, ground_truth) 22 | except: 23 | score = 0.0 24 | return score 25 | 26 | 27 | if __name__ == '__main__': 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--prediction', type=str, default='pred.txt') 30 | parser.add_argument('--test_label_path', type=str) 31 | args = parser.parse_args() 32 | 33 | print(evaluation_metrics(args.prediction, args.test_label_path)) 34 | -------------------------------------------------------------------------------- /2-4/487/noising.py: -------------------------------------------------------------------------------- 1 | from g2pk import G2p 2 | import mecab 3 | import random 4 | 5 | 6 | class Noiser: 7 | 8 | def __init__(self): 9 | 10 | self.g2p = G2p() 11 | self.mecab_tokenizer = mecab.MeCab().morphs 12 | 13 | def noise(self, sent, corpus=None, p=0.1): 14 | sent = self.grapheme(sent) 15 | sent = self.add_spacing_noise(sent, p=p) 16 | if corpus is not None: 17 | sent = self.delete_token(sent, p=p) 18 | sent = self.add_token(sent, corpus, p=p) 19 | sent = self.replace_token(sent, corpus, p=p) 20 | return sent 21 | 22 | def grapheme(self, sent): 23 | return self.g2p(sent, group_vowels=True) 24 | 25 | def add_spacing_noise(self, sent, p=0.1): 26 | tokenized = ' '.join(self.mecab_tokenizer(sent)) 27 | noised = [] 28 | for char in tokenized: 29 | if char == ' ' and random.randint(0, 1) < p: 30 | continue 31 | noised.append(char) 32 | return ''.join(noised) 33 | 34 | def identity(self, sent): 35 | return sent 36 | 37 | def delete_token(self, sent, p=0.1): 38 | noised = [] 39 | for char in sent: 40 | if random.randint(0, 1) < p: 41 | continue 42 | noised.append(char) 43 | return ''.join(noised) 44 | 45 | def add_token(self, sent, corpus, p=0.1): 46 | noised = [] 47 | for char in sent: 48 | if random.randint(0, 1) < p: 49 | random_sent = random.choice(corpus) 50 | while len(random_sent) == 0: 51 | random_sent = random.choice(corpus) 52 | random_tok = random.choice(random_sent) 53 | noised.append(random_tok) 54 | noised.append(char) 55 | return ''.join(noised) 56 | 57 | def replace_token(self, sent, corpus, p=0.1): 58 | noised = [] 59 | for char in sent: 60 | if random.randint(0, 1) < p: 61 | random_sent = random.choice(corpus) 62 | while len(random_sent) == 0: 63 | random_sent = random.choice(corpus) 64 | random_tok = random.choice(random_sent) 65 | noised.append(random_tok) 66 | else: 67 | noised.append(char) 68 | return ''.join(noised) 69 | 70 | def heterograph_noise(self, sent): 71 | pass 72 | -------------------------------------------------------------------------------- /2-4/487/nsml_model/best/model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/487/nsml_model/best/model/model.pt -------------------------------------------------------------------------------- /2-4/487/pretrain_dataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset, DataLoader 2 | from sklearn.model_selection import train_test_split 3 | 4 | from utils.utils import read_strings 5 | from utils.preprocess import preprocess_noisy 6 | from noising import Noiser 7 | 8 | 9 | def get_pretrain_data(data_path=None, 10 | corpus=None, 11 | 
val_ratio=0.05, 12 | add_spacing=False, 13 | logger=None, 14 | use_corpus=True): 15 | 16 | if corpus is None: 17 | corpus = read_strings(data_path) 18 | noiser = Noiser() 19 | if add_spacing: 20 | from pykospacing import Spacing 21 | spacing = Spacing() 22 | corpus = [preprocess_noisy(sent) for sent in corpus] 23 | pairs = [] 24 | for idx, sent in enumerate(corpus): 25 | if len(sent) == 0: 26 | continue 27 | pair = {} 28 | # noisy 29 | pair['noisy'] = noiser.noise(sent, corpus=corpus if use_corpus else None) 30 | # clean 31 | clean_sent = sent 32 | if add_spacing: 33 | clean_sent = spacing(clean_sent) 34 | pair['clean'] = clean_sent 35 | pairs.append(pair) 36 | 37 | if idx % 10000 == 0: 38 | log_fn = logger.info if logger is not None else print 39 | log_fn(f'preparing data: {idx} / {len(corpus)}') 40 | 41 | if val_ratio == 0: 42 | return pairs 43 | 44 | train_data, valid_data = train_test_split(pairs, test_size=val_ratio) 45 | 46 | return train_data, valid_data 47 | 48 | 49 | def get_pretrain_dataloader(args, data, tokenizer, mode, do_multitask=False, drop=0.0): 50 | del do_multitask, drop # unused 51 | dataset = PretrainDataset(data, mode, tokenizer) 52 | batch_size = args.train_batch_size if mode == 'train' else args.eval_batch_size 53 | dataloader = DataLoader(dataset, 54 | shuffle=mode == 'train', 55 | batch_size=batch_size, 56 | num_workers=args.num_workers, 57 | collate_fn=dataset.collate_fn) 58 | 59 | return dataloader 60 | 61 | 62 | class PretrainDataset(Dataset): 63 | 64 | def __init__(self, data, mode, tokenizer): 65 | self.data = data 66 | self.mode = mode 67 | 68 | self.tokenizer = tokenizer 69 | self.unk_idx = tokenizer.unk_idx 70 | self.pad_idx = tokenizer.pad_idx 71 | self.sos_idx = tokenizer.sos_idx 72 | self.eos_idx = tokenizer.eos_idx 73 | self.cls_idx = tokenizer.cls_idx 74 | 75 | def __len__(self): 76 | return len(self.data) 77 | 78 | def __getitem__(self, idx): 79 | return self.data[idx] 80 | 81 | def collate_fn(self, data): 82 | source_text = [x['noisy'] for x in data] 83 | target_text = [x['clean'] for x in data] 84 | src_padded, src_padding_mask = self.tokenizer(source_text) 85 | tgt_padded, tgt_padding_mask = self.tokenizer(target_text, is_target=True) 86 | 87 | return src_padded, tgt_padded, src_padding_mask, tgt_padding_mask 88 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/__init__.py: -------------------------------------------------------------------------------- 1 | from pykospacing.kospacing import * 2 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/embedding_maker.py: -------------------------------------------------------------------------------- 1 | from tensorflow.keras.preprocessing import sequence 2 | import json 3 | import numpy as np 4 | 5 | __all__ = ['load_embedding', 'load_vocab', 'encoding_and_padding'] 6 | 7 | 8 | def load_embedding(embeddings_file): 9 | return (np.load(embeddings_file)) 10 | 11 | 12 | def load_vocab(vocab_path): 13 | with open(vocab_path, 'r') as f: 14 | data = json.loads(f.read()) 15 | word2idx = data 16 | idx2word = dict([(v, k) for k, v in data.items()]) 17 | return word2idx, idx2word 18 | 19 | 20 | def encoding_and_padding(word2idx_dic, sequences, **params): 21 | """ 22 | 1. making item to idx 23 | 2. padding 24 | 25 | :word2idx_dic 26 | :sequences: list of lists where each element is a sequence 27 | :maxlen: int, maximum length 28 | :dtype: type to cast the resulting sequence. 
29 | :padding: 'pre' or 'post', pad either before or after each sequence. 30 | :truncating: 'pre' or 'post', remove values from sequences larger than 31 | maxlen either in the beginning or in the end of the sequence 32 | :value: float, value to pad the sequences to the desired value. 33 | """ 34 | seq_idx = [ 35 | [word2idx_dic.get(a, word2idx_dic['__ETC__']) for a in i] for i in sequences 36 | ] 37 | params['value'] = word2idx_dic['__PAD__'] 38 | return (sequence.pad_sequences(seq_idx, **params)) 39 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/kospacing.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import re 4 | 5 | import numpy as np 6 | import pkg_resources 7 | from tensorflow.keras.models import load_model 8 | from pykospacing.embedding_maker import encoding_and_padding, load_vocab 9 | 10 | __all__ = ['Spacing', ] 11 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 12 | 13 | model_path = pkg_resources.resource_filename( 14 | 'pykospacing', os.path.join('resources', 'models', 'kospacing')) 15 | dic_path = pkg_resources.resource_filename( 16 | 'pykospacing', os.path.join('resources', 'dicts', 'c2v.dic')) 17 | MODEL = load_model(model_path) 18 | MODEL.make_predict_function() 19 | W2IDX, _ = load_vocab(dic_path) 20 | MAX_LEN = 198 21 | 22 | 23 | class Spacing: 24 | """predict spacing for input string 25 | """ 26 | def __init__(self, rules=[]): 27 | self._model = MODEL 28 | self._w2idx = W2IDX 29 | self.max_len = MAX_LEN 30 | self.pattern = re.compile(r'\s+') 31 | self.rules = [(re.compile('\s*'.join(r)), r) for r in rules] 32 | 33 | def get_spaced_sent(self, raw_sent): 34 | raw_sent_ = "«" + raw_sent + "»" 35 | raw_sent_ = raw_sent_.replace(' ', '^') 36 | sents_in = [raw_sent_, ] 37 | mat_in = encoding_and_padding( 38 | word2idx_dic=self._w2idx, sequences=sents_in, maxlen=200, 39 | padding='post', truncating='post') 40 | results = self._model.predict(mat_in) 41 | mat_set = results[0, ] 42 | preds = np.array( 43 | ['1' if i > 0.5 else '0' for i in mat_set[:len(raw_sent_)]]) 44 | return self.make_pred_sents(raw_sent_, preds) 45 | 46 | def make_pred_sents(self, x_sents, y_pred): 47 | res_sent = [] 48 | for i, j in zip(x_sents, y_pred): 49 | if j == '1': 50 | res_sent.append(i) 51 | res_sent.append(' ') 52 | else: 53 | res_sent.append(i) 54 | subs = re.sub(self.pattern, ' ', ''.join(res_sent).replace('^', ' ')) 55 | subs = subs.replace('«', '') 56 | subs = subs.replace('»', '') 57 | return subs 58 | 59 | def apply_rules(self, spaced_sent): 60 | for rgx, word in self.rules: 61 | spaced_sent = rgx.sub(word, spaced_sent) 62 | return spaced_sent 63 | 64 | def __call__(self, sent): 65 | if len(sent) > self.max_len: 66 | splitted_sent = [sent[y-self.max_len:y] for y in range(self.max_len, len(sent)+self.max_len, self.max_len)] 67 | spaced_sent = ''.join([self.get_spaced_sent(ss) 68 | for ss in splitted_sent]) 69 | else: 70 | spaced_sent = self.get_spaced_sent(sent) 71 | if len(self.rules) > 0: 72 | spaced_sent = self.apply_rules(spaced_sent) 73 | return spaced_sent.strip() 74 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/pykos.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import sys 3 | import argparse 4 | from pykospacing import Spacing 5 | 6 | 7 | def get_parser(): 8 | parser = argparse.ArgumentParser(description='Python script for 
automatic Korean word spacing') 9 | 10 | parser.add_argument('infile', type=argparse.FileType('r'), 11 | default=sys.stdin) 12 | parser.add_argument('outfile', type=argparse.FileType('w'), nargs='?', 13 | default=sys.stdout) 14 | parser.add_argument('-o', dest='overwrite', action='store_true', default=False, 15 | help='Overwrite the result itself') 16 | 17 | return parser 18 | 19 | 20 | def main(args=sys.argv[1:]): 21 | args = get_parser().parse_args(args) 22 | 23 | source = args.infile.read() 24 | 25 | result = '\n' 26 | spacing = Spacing() 27 | for line in source.splitlines(): 28 | result += spacing(line) 29 | result += '\n' 30 | 31 | if args.overwrite: 32 | args.infile.close() 33 | with open(args.infile.name, 'w', encoding=args.infile.encoding) as f: 34 | f.write(result) 35 | else: 36 | args.outfile.write(result) 37 | 38 | return 0 if (source == result) else 1 39 | 40 | 41 | if __name__ == '__main__': 42 | sys.exit(main()) 43 | -------------------------------------------------------------------------------- /2-4/487/pykospacing/resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/487/pykospacing/resources/__init__.py -------------------------------------------------------------------------------- /2-4/487/pykospacing/resources/models/kospacing: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-4/487/pykospacing/resources/models/kospacing -------------------------------------------------------------------------------- /2-4/487/requirements.txt: -------------------------------------------------------------------------------- 1 | #//nsml: ufoym/deepo:all-py36-cu101 2 | #nsml: pytorchlightning/pytorch_lightning:base-cuda-py3.6-torch1.7 3 | #//nsml: pytorch/pytorch:1.8.1-cuda11.1-cudnn8-devel 4 | #//nsml: registry.navercorp.com/gyuwankim/airush-gec:latest 5 | scikit-learn 6 | nltk 7 | transformers>=4.0.0 8 | python-mecab-ko 9 | # tensorflow-gpu==2.3.0 10 | jamo 11 | g2pk 12 | konlpy -------------------------------------------------------------------------------- /2-4/487/utils/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | 4 | 5 | class LabelSmoothingNLLLoss(torch.nn.Module): 6 | """Pytorch implementation of label smoothed NLL loss retrieved from 7 | https://www.kaggle.com/c/siim-isic-melanoma-classification/discussion/166833#930136 8 | """ 9 | 10 | def __init__(self, smoothing: float = 0.1, reduction="mean", weight=None): 11 | super().__init__() 12 | self.smoothing = smoothing 13 | self.reduction = reduction 14 | self.weight = weight 15 | 16 | def reduce_loss(self, loss): 17 | return loss.mean() if self.reduction == 'mean' else loss.sum() \ 18 | if self.reduction == 'sum' else loss 19 | 20 | def linear_combination(self, x, y): 21 | return self.smoothing * x + (1 - self.smoothing) * y 22 | 23 | def forward(self, log_preds, target, ignore_index=-100): 24 | """ 25 | log_preds: [bs, V, T] 26 | target: [bs, T] 27 | """ 28 | assert 0 <= self.smoothing < 1 29 | 30 | if self.weight is not None: 31 | self.weight = self.weight.to(log_preds.device) 32 | 33 | mask = (target != ignore_index).float().unsqueeze(1) # [bs, 1, T] 34 | masked_log_preds = log_preds * mask 35 | 36 | loss = 
self.reduce_loss(-masked_log_preds.mean(dim=1)) 37 | nll = F.nll_loss(log_preds, 38 | target, 39 | reduction=self.reduction, 40 | weight=self.weight, 41 | ignore_index=ignore_index) 42 | return self.linear_combination(loss, nll) 43 | -------------------------------------------------------------------------------- /2-4/487/utils/meter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | 5 | class Meter(object): 6 | 7 | def __init__(self): 8 | self.init() 9 | 10 | def init(self): 11 | self.start = time.time() 12 | self.cnt_add = 0 13 | self.tot_loss = 0. 14 | self.cnt_sent = 0 15 | self.cnt_token = 0 16 | 17 | def add(self, loss, n_sent, n_token): 18 | self.cnt_add += 1 19 | self.tot_loss += loss * n_sent 20 | self.cnt_sent += n_sent 21 | self.cnt_token += n_token 22 | 23 | def average(self): 24 | loss_sent = self.tot_loss / self.cnt_sent if self.cnt_sent != 0 else 0. 25 | loss_token = self.tot_loss / self.cnt_token if self.cnt_token != 0 else 0. 26 | return loss_sent, loss_token 27 | 28 | def elapsed_time(self): 29 | return time.time() - self.start 30 | 31 | def print_str(self, time_avg=False): 32 | loss_sent, loss_token = self.average() 33 | et = self.elapsed_time() 34 | time_str = f'{et * 1000. / self.cnt_add:6.2f} ms/batch' if time_avg else f'{et:6.2f} s' 35 | return f'{time_str} | loss_sent {loss_sent:6.2f} | token_ppl {math.exp(loss_token):6.2f}' 36 | -------------------------------------------------------------------------------- /2-4/487/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | import re 2 | import jamo 3 | 4 | DEL_RULES = [ 5 | re.compile(r'[^ .,?!/$%a-zA-Z0-9가-힣<>()\[\]]+'), 6 | # re.compile(r'[\.,!?]+'), 7 | # re.compile(r'^\('), 8 | # re.compile(r'\)$'), 9 | # re.compile(r'\[.*\]'), 10 | ] 11 | 12 | SUB_RULES = [ 13 | # (re.compile('구요'), '고요'), 14 | # (re.compile('려요'), '립니다'), 15 | # (re.compile('아요'), '습니다'), 16 | # (re.compile('세요'), '십시오'), 17 | # (re.compile('해요'), '합니다'), 18 | # (re.compile('(이에요|예요)'), '입니다'), 19 | # (re.compile('!'), '.'), 20 | # (re.compile('₩'), '원'), 21 | (re.compile(r'[\s]+'), ' '), 22 | ] 23 | 24 | SPECIAL_RULE = (re.compile('어요'), '습니다') # ㄹ어요 제외 25 | 26 | 27 | def preprocess_noisy(sent): 28 | for rule in DEL_RULES: 29 | sent = rule.sub('', sent) 30 | 31 | for rule, subst in SUB_RULES: 32 | sent = rule.sub(subst, sent) 33 | 34 | # sent = special_case1(sent) 35 | 36 | # if re.search(r'[.?!]$', sent) is None: 37 | # sent += '.' 
38 | 39 | return sent 40 | 41 | 42 | def special_case1(sent): 43 | """ 44 | 어요 -> 습니다 45 | `ㄹ어요`는 불규칙, 무시 46 | """ 47 | rule, subst = SPECIAL_RULE 48 | match = rule.search(sent) 49 | if match is None: 50 | return sent 51 | start = match.span()[0] 52 | if start == 0: 53 | return sent 54 | prev = sent[start - 1] 55 | if jamo.j2hcj(jamo.h2j(prev))[-1] == 'ㄹ': 56 | return sent 57 | 58 | return rule.sub(subst, sent) 59 | -------------------------------------------------------------------------------- /2-4/487/utils/utils.py: -------------------------------------------------------------------------------- 1 | from jamo import j2h, j2hcj, get_jamo_class, is_jamo 2 | 3 | 4 | def read_strings(input_file): 5 | return open(input_file, 'r', encoding='utf-8').read().splitlines() 6 | 7 | 8 | def write_strings(output_file, data): 9 | with open(output_file, 'w', encoding='utf-8') as f: 10 | for x in data: 11 | f.write(str(x) + '\n') 12 | 13 | 14 | def reconstruct_jamo(decomposed, debug=False, remove_incomplete=True): 15 | reconstructed = [] 16 | current_char = [] 17 | current_state = 'init' # init, lead, vowel 18 | for c in decomposed: 19 | if is_jamo(c): 20 | try: 21 | jamo_class = get_jamo_class(c) 22 | except: # isolated 23 | reconstructed.append(j2hcj(c)) 24 | continue 25 | if jamo_class == 'lead': 26 | if current_state == 'init': 27 | assert len(current_char) == 0 28 | current_char.append(c) 29 | current_state = 'lead' 30 | elif current_state == 'lead': 31 | assert len(current_char) == 1 32 | if not remove_incomplete: 33 | reconstructed.append(j2hcj(current_char[0])) 34 | current_char = [c] 35 | current_state = 'lead' 36 | elif current_state == 'vowel': 37 | assert len(current_char) == 2 38 | reconstructed.append(j2h(*current_char)) 39 | current_char = [c] 40 | current_state = 'lead' 41 | 42 | elif jamo_class == 'vowel': 43 | if current_state == 'init': 44 | assert len(current_char) == 0 45 | if not remove_incomplete: 46 | reconstructed.append(j2hcj(c)) 47 | elif current_state == 'lead': 48 | assert len(current_char) == 1 49 | current_char.append(c) 50 | current_state = 'vowel' 51 | elif current_state == 'vowel': 52 | assert len(current_char) == 2 53 | reconstructed.append(j2h(*current_char)) 54 | if not remove_incomplete: 55 | reconstructed.append(j2hcj(c)) 56 | current_char = [] 57 | current_state = 'init' 58 | else: # jongsung 59 | if current_state == 'init': 60 | assert len(current_char) == 0 61 | if not remove_incomplete: 62 | reconstructed.append(j2hcj(c)) 63 | elif current_state == 'lead': 64 | assert len(current_char) == 1 65 | if not remove_incomplete: 66 | reconstructed.append(j2hcj(current_char[0])) 67 | reconstructed.append(j2hcj(c)) 68 | elif current_state == 'vowel': 69 | assert len(current_char) == 2 70 | current_char.append(c) 71 | reconstructed.append(j2h(*current_char)) 72 | 73 | current_char = [] 74 | current_state = 'init' 75 | 76 | else: 77 | if current_state == 'init': 78 | assert len(current_char) == 0 79 | if current_state == 'lead': 80 | assert len(current_char) == 1 81 | if not remove_incomplete: 82 | reconstructed.append(j2hcj(current_char[0])) 83 | elif current_state == 'vowel': 84 | assert len(current_char) == 2 85 | reconstructed.append(j2h(*current_char)) 86 | 87 | reconstructed.append(c) 88 | current_char = [] 89 | current_state = 'init' 90 | 91 | if debug: 92 | print(current_state, c, current_char, reconstructed) 93 | 94 | # if there is leftover 95 | if len(current_char) > 0: 96 | if current_state == 'lead': 97 | assert len(current_char) == 1 98 | if not 
remove_incomplete: 99 | reconstructed.append(j2hcj(current_char[0])) 100 | elif current_state == 'vowel': 101 | assert len(current_char) == 2 102 | reconstructed.append(j2h(*current_char)) 103 | 104 | if debug: 105 | print(current_state, c, current_char, reconstructed) 106 | 107 | return ''.join(reconstructed) 108 | -------------------------------------------------------------------------------- /2-4/command.txt: -------------------------------------------------------------------------------- 1 | nsml run -e ensemble.py -d airush2021-2-4 -g 1 --cpus 2 --memory 24000000000 --shm-size 1000000000 -a "--vocab_size 1982 --max_steps 1 --model bertfuse --optimizer AdamW --scheduler linear --train_all True --num_decoder_layers 12" 2 | nsml run -e main.py -d airush2021-2-4 -m "ensemble 477+479+481" -g 1 --cpus 8 --memory 24000000000 --shm-size 1000000000 -a "--mode ensemble_save --do_multitask 0 --share_embedding 1 --use_copy_attention 0 --model ensemble --preprocess 1 --vocab_size 2700 --ensemble_load_sessions 477 479 481" 3 | nsml run -e train.py -d airush2021-2-4 -g 1 --cpus 2 --memory 24000000000 --shm-size 1000000000 4 | -------------------------------------------------------------------------------- /2-4/rank.txt: -------------------------------------------------------------------------------- 1 | 386 2 | 487 3 | 1062 4 | -------------------------------------------------------------------------------- /2-5/582/README.md: -------------------------------------------------------------------------------- 1 | # 2-5 쇼핑 카탈로그 클러스터링 2 | 3 | ## 문제 4 | 5 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/d4a88c80-c865-11eb-9e50-27ed62a631c7) 6 | 7 | - 네이버 쇼핑은 여러 판매처에서 판매 중인 동일 상품들의 가격을 비교해 주는 서비스를 제공하고 있다. 8 | - 가격 비교를 위해서는 먼저 여러 판매자가 등록한 같은 상품들을 하나로 묶어야 한다. 9 | - 이렇게 묶인 상품의 집합을 `카탈로그`라고 부른다. 10 | - 같은 상품이라도, 여러 판매자들이 각각 서로 다른 `상품명`으로 판매를 하고 있다. 11 | - 같은 `카탈로그`에 속하는 `상품명` 예시 12 | - 아래 7개의 상품은 모두 `농심 백산수 2L`라는 동일 카탈로그에 속하는 상품이다. 13 | ``` 14 | * 농심 백산수 2L 1병 생수 15 | * 농심 백산수 2L 16 | * 백산수 생수 2L, 낱개 17 | * (24개이상 구입시 개당 20원씩 할인) 농심 백산수 2L 18 | * 농심 백산수 2L x 1펫 / 생수 샘물 물 박스포장 19 | * [농심]백산수 2L x 1개 20 | * NS473 백두산 백산수 2L 21 | ``` 22 | - **본 과제는 판매자가 등록한 상품명 텍스트(`query`)를 입력으로 받아, 주어진 `database`에서 그와 동일한 상품들을 모두 찾아내는 문제이다.** 23 | 24 | 25 | ## 데이터셋 26 | 27 | - airush2021-2-5 28 | - `train` 29 | - 네이버 쇼핑의 `식품` 카테고리에서 뽑은 상품 160,008개 30 | - 데이터 위치 : `train/train_data` 디렉토리 31 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` / `카탈로그ID(match_nv_mid)` 32 | - 각 상품은 unique한 `상품ID(nv_mid)`를 가진다. 33 | - 같은 카탈로그에 속하는 상품은 동일한 `카탈로그ID(match_nv_mid)`를 갖는다. 예를 들어, 아래 데이터 예시에서 맨 위 3개의 상품은 동일 카탈로그에 속하므로, 동일한 match_nv_mid(10062684657)을 갖고 있다. 34 | - 데이터 예시 35 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/e9852000-c865-11eb-9c94-0963c31fde06) 36 | 37 | - `test` 38 | - `test` 데이터셋은 `database`와 `query`로 구성되어 있다. 39 | - `database` 40 | - 네이버 쇼핑의 `식품` 카테고리에서 뽑은 상품 90,516개 41 | - 데이터 위치 : `test/test_data/database` 디렉토리 42 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` 43 | - `train` 데이터와는 달리, `카탈로그ID(match_nv_mid)` 필드가 없다. 44 | - `query` 45 | - `query`는 위 `database`의 subset이며, 개수는 8,640개이다. 
46 | - 데이터 위치 : `test/test_data/query` 디렉토리 47 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` 48 | - 데이터 예시 49 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/f30e8800-c865-11eb-8fa0-82ced97afac5) 50 | 51 | - 필드 정보 52 | - nv_mid (string) : 상품ID 53 | - prod_nm (string) : 상품명 54 | - match_nv_mid (string) : 카탈로그ID (train 데이터셋에만 있고, test 데이터셋에는 없음) 55 | 56 | 57 | ## 결과 제출 포맷 58 | 59 | - 결과 제출은 main.py의 infer() 함수에서 이루어진다. 60 | - 아래 예시와 같이 `database`와 `query`가 주어졌다면, 61 | - `query`의 `nv_mid_002`는 `database`의 `nv_mid_001`, `nv_mid_002`, `nv_mid_005`와 동일한 상품이며, 62 | - `query`의 `nv_mid_003`은 `database`의 `nv_mid_003`, `nv_mid_004`와 동일한 상품이다. 63 | - database 64 | 65 | | nv_mid | prod_nm | 66 | | --- | --- | 67 | | nv_mid_001 | (무료배송) 삼다수 2L | 68 | | nv_mid_002 | 삼다수 2L 12병 | 69 | | nv_mid_003 | 저분자 피쉬 콜라겐 펩타이드 150g | 70 | | nv_mid_004 | 지웨이 슈가 먹는 저분자 피쉬 콜라겐 펩타이드 150g | 71 | | nv_mid_005 | 삼다수 2L | 72 | 73 | - query 74 | 75 | | nv_mid | prod_nm | 76 | | --- | --- | 77 | | nv_mid_002 | 삼다수 2L 12병 | 78 | | nv_mid_003 | 저분자 피쉬 콜라겐 펩타이드 150g | 79 | 80 | - `query`에 속하는 각각의 상품에 대해, 그와 동일한 상품을 `database`에서 모두 찾아서 제출하면 된다. 81 | 즉, main.py의 infer() 함수에서 아래와 같은 list를 결과로 return하면 결과가 제출된다. 82 | ``` 83 | return [ 84 | ('nv_mid_002', ['nv_mid_001', 'nv_mid_002', 'nv_mid_005]), 85 | ('nv_mid_003', ['nv_mid_003', 'nv_mid_004']) 86 | ] 87 | 88 | ``` 89 | 90 | ## Getting started 91 | - 접근 92 | - 먼저 주어진 상품명 텍스트를 embedding하는 model을 학습시키고, 93 | - 학습된 모델을 사용해서, test_data/database의 상품명들을 embedding한 후, 94 | - test_data/query의 각 상품명에 대해서 database 상품명들 중, embedding이 유사한 것을 search하는 것이 일반적인 접근법이다. 95 | - 유사 embedding search는 main.py의 infer() 함수 부분를 수정하여, 구현하면 된다. 96 | 97 | - 학습 98 | ``` 99 | nsml run -d airush2021-2-5 -e main.py 100 | ``` 101 | 102 | - 리더보드 제출 103 | ``` 104 | nsml submit {session} {checkpoint} 105 | ``` 106 | 107 | ## evaluation metric 108 | * mean f1-score 109 | * test_data/query 의 각 상품에 대해 match된 결과의 precision과 recall의 f1 score를 모든 상품에 대해 평균한 값 110 | * evaluation.py 코드 참조. 111 | 112 | 113 | ## 기타 114 | 115 | - Team blog: https://medium.com/naver-shopping-dev 116 | - Contact: 오광진 kj.oh@navercorp.com 117 | 118 | 119 | ## FAQ 120 | 121 | Q : Pretrained model 사용이 가능한가요? 122 | 123 | A : 사용 가능합니다. 
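A minimal, self-contained sketch of the recipe described in `## Getting started` above (embed every product name, then search the database for the nearest embeddings of each query). This is an illustration only, not the submitted solution: the function names, the character n-gram TF-IDF stand-in for a trained encoder, the 0.6 cosine threshold, and the toy data are all assumptions of this sketch. The actual 582 entry trains a neural encoder with an ArcFace margin head instead; see `arcface.py` and `main.py` further down.

```python
# Hypothetical illustration of the "Getting started" recipe above;
# the names, the threshold, and the toy data are assumptions, not the contest code.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def match_queries(db_ids, db_names, query_ids, query_names, threshold=0.6):
    # Character n-gram TF-IDF stands in for a trained sentence encoder.
    vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 4))
    db_vecs = vectorizer.fit_transform(db_names)
    query_vecs = vectorizer.transform(query_names)

    sims = cosine_similarity(query_vecs, db_vecs)  # shape: [n_query, n_db]
    results = []
    for qid, row in zip(query_ids, sims):
        matched = [db_ids[j] for j, s in enumerate(row) if s >= threshold]
        # The query itself is part of the database, so never return an empty list.
        results.append((qid, matched or [qid]))
    return results  # same format as the list returned by infer()


if __name__ == "__main__":
    db_ids = ["nv_mid_001", "nv_mid_002", "nv_mid_003", "nv_mid_004", "nv_mid_005"]
    db_names = ["(무료배송) 삼다수 2L", "삼다수 2L 12병", "저분자 피쉬 콜라겐 펩타이드 150g",
                "지웨이 슈가 먹는 저분자 피쉬 콜라겐 펩타이드 150g", "삼다수 2L"]
    query_ids = ["nv_mid_002", "nv_mid_003"]
    query_names = ["삼다수 2L 12병", "저분자 피쉬 콜라겐 펩타이드 150g"]
    print(match_queries(db_ids, db_names, query_ids, query_names))
```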
124 | -------------------------------------------------------------------------------- /2-5/582/arcface.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.nn import Parameter 4 | from torch.nn import functional as F 5 | import math 6 | 7 | 8 | class ArcMarginProduct(nn.Module): 9 | r"""Implement of large margin arc distance: : 10 | Args: 11 | in_features: size of each input sample 12 | out_features: size of each output sample 13 | s: norm of input feature 14 | m: margin 15 | cos(theta + m) 16 | """ 17 | def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False, ls_eps=0.0): 18 | super(ArcMarginProduct, self).__init__() 19 | self.in_features = in_features 20 | self.out_features = out_features 21 | self.s = s 22 | self.m = m 23 | self.ls_eps = ls_eps # label smoothing 24 | self.weight = Parameter(torch.FloatTensor(out_features, in_features)) 25 | nn.init.xavier_uniform_(self.weight) 26 | 27 | self.easy_margin = easy_margin 28 | self.cos_m = math.cos(m) 29 | self.sin_m = math.sin(m) 30 | self.th = math.cos(math.pi - m) 31 | self.mm = math.sin(math.pi - m) * m 32 | 33 | def forward(self, input, label): 34 | # --------------------------- cos(theta) & phi(theta) --------------------------- 35 | cosine = F.linear(F.normalize(input), F.normalize(self.weight)) 36 | sine = torch.sqrt(1.0 - torch.pow(cosine, 2)) 37 | phi = cosine * self.cos_m - sine * self.sin_m 38 | if self.easy_margin: 39 | phi = torch.where(cosine > 0, phi, cosine) 40 | else: 41 | phi = torch.where(cosine > self.th, phi, cosine - self.mm) 42 | # --------------------------- convert label to one-hot --------------------------- 43 | # one_hot = torch.zeros(cosine.size(), requires_grad=True, device='cuda') 44 | one_hot = torch.zeros(cosine.size(), device='cuda') 45 | one_hot.scatter_(1, label.view(-1, 1).long(), 1) 46 | if self.ls_eps > 0: 47 | one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features 48 | # -------------torch.where(out_i = {x_i if condition_i else y_i) ------------- 49 | output = (one_hot * phi) + ((1.0 - one_hot) * cosine) 50 | output *= self.s 51 | 52 | return output -------------------------------------------------------------------------------- /2-5/582/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nsml.constants import DATASET_PATH 4 | 5 | 6 | def test_data_loader(root_path): 7 | return root_path 8 | 9 | 10 | def write_output(output_file, data): 11 | with open(output_file, 'w') as f: 12 | for x in data: 13 | f.write(x[0] + ' ' + ','.join(x[1]) + '\n') 14 | 15 | 16 | def feed_infer(output_file, infer_func): 17 | print('DATASET_PATH=', DATASET_PATH) 18 | #os.system('/bin/ls -lR ' + DATASET_PATH) 19 | prediction = infer_func(test_data_loader(DATASET_PATH)) 20 | write_output(output_file, prediction) 21 | if os.stat(output_file).st_size == 0: 22 | raise AssertionError('output result of inference is nothing') 23 | 24 | -------------------------------------------------------------------------------- /2-5/582/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def read_prediction(prediction_file): 5 | return read_strings(prediction_file) 6 | 7 | 8 | def read_ground_truth(ground_truth_file): 9 | return read_strings(ground_truth_file) 10 | 11 | 12 | def f1_score(gt, pred): 13 | if len(pred) == 0: 14 | return 0.0 15 | intsct_len = 
len(set(gt).intersection(set(pred))) 16 | if intsct_len == 0: 17 | return 0.0 18 | precision = intsct_len / len(pred) 19 | recall = intsct_len / len(gt) 20 | return 2. / (1. / precision + 1. / recall) 21 | 22 | 23 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 24 | while True: 25 | # prediction, ground_truth 26 | # example : [('A', ['a', 'b', 'c']), ('B', ['d', 'e']), ('C', ['f', 'g', 'h'])] 27 | prediction = read_prediction(prediction_file) 28 | ground_truth = read_ground_truth(ground_truth_file) 29 | 30 | pred_dict = dict(prediction) 31 | f1_sum = 0.0 32 | 33 | for query, match in ground_truth: 34 | if query in pred_dict: 35 | pred_match = pred_dict[query] 36 | f1_sum += f1_score(match, pred_match) 37 | 38 | mean_f1 = f1_sum / len(ground_truth) 39 | break 40 | 41 | 42 | return mean_f1 43 | 44 | 45 | def read_strings(input_file): 46 | lines = open(input_file, "r").read().splitlines() 47 | query_matches = [line.split(' ') for line in lines] 48 | return [(query, matches.split(',')) for query, matches in query_matches] 49 | 50 | 51 | if __name__ == '__main__': 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument('--prediction', type=str, default='pred.txt') 54 | parser.add_argument('--test_label_path', type=str) 55 | args = parser.parse_args() 56 | 57 | print(evaluation_metrics(args.prediction, args.test_label_path)) -------------------------------------------------------------------------------- /2-5/582/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | os.environ["HF_HOME"] = "/home/nsml/.cache/huggingface" 5 | # os.system('pip install faiss-cpu --no-cache') 6 | from model import LarvaFeat 7 | from dataset import CatalogDataset 8 | # from larva import LarvaTokenizer, LarvaModel 9 | import transformers 10 | import nsml 11 | from nsml import DATASET_PATH, IS_ON_NSML 12 | from predict import Comparing 13 | import pickle 14 | import numpy as np 15 | from trainer import Trainer 16 | import random 17 | import argparse 18 | 19 | random_seed = 42 20 | torch.manual_seed(random_seed) 21 | np.random.seed(random_seed) 22 | torch.cuda.manual_seed(random_seed) 23 | torch.cuda.manual_seed_all(random_seed) 24 | random.seed(random_seed) 25 | torch.backends.cudnn.deterministic = True 26 | torch.backends.cudnn.benchmark = False 27 | 28 | # def bind_nsml(solver, args): 29 | def bind_nsml(model, args): 30 | def save(path): 31 | print('save: path=', path) 32 | os.makedirs(path, exist_ok=True) 33 | torch.save(model.state_dict(), open(os.path.join(path, 'model.pt'), 'wb')) 34 | pickle.dump(model.model2, open(os.path.join(path, 'model2.pt'), 'wb')) 35 | print('model saved') 36 | 37 | def load(path): 38 | print('load: path=', path) 39 | model.load_state_dict(torch.load(open(os.path.join(path, 'model.pt'), 'rb'))) 40 | # torch.load(open(os.path.join(path, 'model.pt'), 'rb')) 41 | model.model2 = pickle.load(open(os.path.join(path, 'model2.pt'), 'rb')) 42 | print('model loaded') 43 | 44 | def infer(dataset_path): 45 | print('infer: dataset_path=', dataset_path) 46 | database_path = os.path.join(dataset_path, 'test', 'test_data', 'database') 47 | query_path = os.path.join(dataset_path, 'test', 'test_data', 'query') 48 | database_dataset = CatalogDataset(database_path, has_label=False) 49 | query_dataset = CatalogDataset(query_path, has_label=False) 50 | # comparing = Comparing(database_dataset, query_dataset, solver.model) 51 | comparing = Comparing(database_dataset, query_dataset, model) 
52 | result = comparing.predict() 53 | # implement inference code here with the trained model 54 | 55 | # returns list of (query_nv_mid, [database_nv_mid]) 56 | # return [('1906368762', ['1906368762','1810466025','5159532445']), 57 | # ('636762', ['636762','1146025','155245'])] # dummy result 58 | return result 59 | 60 | nsml.bind(save, load, infer) 61 | 62 | 63 | def main(): 64 | parser = argparse.ArgumentParser() 65 | parser.add_argument("--mode", type=str, default="train") 66 | parser.add_argument('--pause', type=int, default=0) 67 | args = parser.parse_args() 68 | 69 | model = LarvaFeat() 70 | # solver = Cluster() 71 | 72 | if IS_ON_NSML: 73 | # bind_nsml(solver, args) 74 | bind_nsml(model, args) 75 | 76 | # DONOTCHANGE: They are reserved for nsml 77 | # Warning: Do not load data before the following code! 78 | if args.pause: 79 | nsml.paused(scope=locals()) 80 | 81 | dataset = CatalogDataset(os.path.join(DATASET_PATH, 'train', 'train_data')) 82 | train, valid = dataset.train_valid() 83 | print('trainset:', train['prod_nm'][:5]) 84 | print('validset:', valid['prod_nm'][:5]) 85 | # print(train['measurement'][:5]) 86 | # print(valid['measurement'][:5]) 87 | if args.mode == 'train': 88 | # implement training code here 89 | # solver.train(dataset) 90 | # model.tfidf(dataset) 91 | model = model.to('cuda') 92 | Trainer(model, train, valid) 93 | nsml.save('final') 94 | 95 | if __name__ == "__main__": 96 | main() -------------------------------------------------------------------------------- /2-5/582/nsml_model/final/model/model.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-5/582/nsml_model/final/model/model.pt -------------------------------------------------------------------------------- /2-5/582/nsml_model/final/model/model2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-5/582/nsml_model/final/model/model2.pt -------------------------------------------------------------------------------- /2-5/582/requirements.txt: -------------------------------------------------------------------------------- 1 | #nsml: reg.navercorp.com/chatbot/larva:latest 2 | torch>=1.4.0 3 | transformers>=4.2.0 4 | requests>=2.25.1 5 | datasets>=1.5.0 6 | seqeval==1.2.2 7 | pytorch-crf==0.7.2 8 | # faiss-cpu==1.7.1.post2 9 | faiss-gpu==1.7.1.post2 -------------------------------------------------------------------------------- /2-5/582/sam.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class SAM(torch.optim.Optimizer): 4 | def __init__(self, params, base_optimizer, rho=0.05, **kwargs): 5 | assert rho >= 0.0, f"Invalid rho, should be non-negative: {rho}" 6 | 7 | defaults = dict(rho=rho, **kwargs) 8 | super(SAM, self).__init__(params, defaults) 9 | 10 | self.base_optimizer = base_optimizer(self.param_groups, **kwargs) 11 | self.param_groups = self.base_optimizer.param_groups 12 | 13 | @torch.no_grad() 14 | def first_step(self, zero_grad=False): 15 | grad_norm = self._grad_norm() 16 | for group in self.param_groups: 17 | scale = group["rho"] / (grad_norm + 1e-12) 18 | 19 | for p in group["params"]: 20 | if p.grad is None: continue 21 | e_w = p.grad * scale.to(p) 22 | p.add_(e_w) 23 | self.state[p]["e_w"] = e_w 24 | 25 | if zero_grad: self.zero_grad() 26 | 27 | 
@torch.no_grad() 28 | def second_step(self, zero_grad=False): 29 | for group in self.param_groups: 30 | for p in group["params"]: 31 | if p.grad is None: continue 32 | p.sub_(self.state[p]["e_w"]) 33 | 34 | self.base_optimizer.step() 35 | 36 | if zero_grad: self.zero_grad() 37 | 38 | @torch.no_grad() 39 | def step(self, closure=None): 40 | assert closure is not None, "Sharpness Aware Minimization requires closure, but it was not provided" 41 | closure = torch.enable_grad()(closure) 42 | 43 | self.first_step(zero_grad=True) 44 | closure() 45 | self.second_step() 46 | 47 | def _grad_norm(self): 48 | shared_device = self.param_groups[0]["params"][0].device 49 | norm = torch.norm( 50 | torch.stack([ 51 | p.grad.norm(p=2).to(shared_device) 52 | for group in self.param_groups for p in group["params"] 53 | if p.grad is not None 54 | ]), 55 | p=2 56 | ) 57 | return norm -------------------------------------------------------------------------------- /2-5/582/setup.py: -------------------------------------------------------------------------------- 1 | #nsml: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel 2 | from distutils.core import setup 3 | 4 | setup( 5 | name='Catalog matching model', 6 | version='0.2', 7 | description='Catalog matching model', 8 | install_requires=[ 9 | 'numpy', 10 | 'python-snappy==0.6.0', 11 | 'pyarrow==2.0.0', 12 | 'fastparquet==0.4.2', 13 | 'pandas==1.1.5', 14 | 'sklearn', 15 | 'transformers>=4.2.0', 16 | # 'faiss', 17 | #'albumentations', 18 | #'opencv-python', 19 | ], 20 | ) -------------------------------------------------------------------------------- /2-5/756/README.md: -------------------------------------------------------------------------------- 1 | # 2-5 쇼핑 카탈로그 클러스터링 2 | 3 | ## 문제 4 | 5 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/d4a88c80-c865-11eb-9e50-27ed62a631c7) 6 | 7 | - 네이버 쇼핑은 여러 판매처에서 판매 중인 동일 상품들의 가격을 비교해 주는 서비스를 제공하고 있다. 8 | - 가격 비교를 위해서는 먼저 여러 판매자가 등록한 같은 상품들을 하나로 묶어야 한다. 9 | - 이렇게 묶인 상품의 집합을 `카탈로그`라고 부른다. 10 | - 같은 상품이라도, 여러 판매자들이 각각 서로 다른 `상품명`으로 판매를 하고 있다. 11 | - 같은 `카탈로그`에 속하는 `상품명` 예시 12 | - 아래 7개의 상품은 모두 `농심 백산수 2L`라는 동일 카탈로그에 속하는 상품이다. 13 | ``` 14 | * 농심 백산수 2L 1병 생수 15 | * 농심 백산수 2L 16 | * 백산수 생수 2L, 낱개 17 | * (24개이상 구입시 개당 20원씩 할인) 농심 백산수 2L 18 | * 농심 백산수 2L x 1펫 / 생수 샘물 물 박스포장 19 | * [농심]백산수 2L x 1개 20 | * NS473 백두산 백산수 2L 21 | ``` 22 | - **본 과제는 판매자가 등록한 상품명 텍스트(`query`)를 입력으로 받아, 주어진 `database`에서 그와 동일한 상품들을 모두 찾아내는 문제이다.** 23 | 24 | 25 | ## 데이터셋 26 | 27 | - airush2021-2-5 28 | - `train` 29 | - 네이버 쇼핑의 `식품` 카테고리에서 뽑은 상품 160,008개 30 | - 데이터 위치 : `train/train_data` 디렉토리 31 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` / `카탈로그ID(match_nv_mid)` 32 | - 각 상품은 unique한 `상품ID(nv_mid)`를 가진다. 33 | - 같은 카탈로그에 속하는 상품은 동일한 `카탈로그ID(match_nv_mid)`를 갖는다. 예를 들어, 아래 데이터 예시에서 맨 위 3개의 상품은 동일 카탈로그에 속하므로, 동일한 match_nv_mid(10062684657)을 갖고 있다. 34 | - 데이터 예시 35 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/e9852000-c865-11eb-9c94-0963c31fde06) 36 | 37 | - `test` 38 | - `test` 데이터셋은 `database`와 `query`로 구성되어 있다. 39 | - `database` 40 | - 네이버 쇼핑의 `식품` 카테고리에서 뽑은 상품 90,516개 41 | - 데이터 위치 : `test/test_data/database` 디렉토리 42 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` 43 | - `train` 데이터와는 달리, `카탈로그ID(match_nv_mid)` 필드가 없다. 44 | - `query` 45 | - `query`는 위 `database`의 subset이며, 개수는 8,640개이다. 
46 | - 데이터 위치 : `test/test_data/query` 디렉토리 47 | - 데이터 형식 : `상품ID(nv_mid)` / `상품명(prod_nm)` 48 | - 데이터 예시 49 | ![Figure](https://open.oss.navercorp.com/storage/user/3/files/f30e8800-c865-11eb-8fa0-82ced97afac5) 50 | 51 | - 필드 정보 52 | - nv_mid (string) : 상품ID 53 | - prod_nm (string) : 상품명 54 | - match_nv_mid (string) : 카탈로그ID (train 데이터셋에만 있고, test 데이터셋에는 없음) 55 | 56 | 57 | ## 결과 제출 포맷 58 | 59 | - 결과 제출은 main.py의 infer() 함수에서 이루어진다. 60 | - 아래 예시와 같이 `database`와 `query`가 주어졌다면, 61 | - `query`의 `nv_mid_002`는 `database`의 `nv_mid_001`, `nv_mid_002`, `nv_mid_005`와 동일한 상품이며, 62 | - `query`의 `nv_mid_003`은 `database`의 `nv_mid_003`, `nv_mid_004`와 동일한 상품이다. 63 | - database 64 | 65 | | nv_mid | prod_nm | 66 | | --- | --- | 67 | | nv_mid_001 | (무료배송) 삼다수 2L | 68 | | nv_mid_002 | 삼다수 2L 12병 | 69 | | nv_mid_003 | 저분자 피쉬 콜라겐 펩타이드 150g | 70 | | nv_mid_004 | 지웨이 슈가 먹는 저분자 피쉬 콜라겐 펩타이드 150g | 71 | | nv_mid_005 | 삼다수 2L | 72 | 73 | - query 74 | 75 | | nv_mid | prod_nm | 76 | | --- | --- | 77 | | nv_mid_002 | 삼다수 2L 12병 | 78 | | nv_mid_003 | 저분자 피쉬 콜라겐 펩타이드 150g | 79 | 80 | - `query`에 속하는 각각의 상품에 대해, 그와 동일한 상품을 `database`에서 모두 찾아서 제출하면 된다. 81 | 즉, main.py의 infer() 함수에서 아래와 같은 list를 결과로 return하면 결과가 제출된다. 82 | ``` 83 | return [ 84 | ('nv_mid_002', ['nv_mid_001', 'nv_mid_002', 'nv_mid_005]), 85 | ('nv_mid_003', ['nv_mid_003', 'nv_mid_004']) 86 | ] 87 | 88 | ``` 89 | 90 | ## Getting started 91 | - 접근 92 | - 먼저 주어진 상품명 텍스트를 embedding하는 model을 학습시키고, 93 | - 학습된 모델을 사용해서, test_data/database의 상품명들을 embedding한 후, 94 | - test_data/query의 각 상품명에 대해서 database 상품명들 중, embedding이 유사한 것을 search하는 것이 일반적인 접근법이다. 95 | - 유사 embedding search는 main.py의 infer() 함수 부분를 수정하여, 구현하면 된다. 96 | 97 | - 학습 98 | ``` 99 | nsml run -d airush2021-2-5 -e main.py 100 | ``` 101 | 102 | - 리더보드 제출 103 | ``` 104 | nsml submit {session} {checkpoint} 105 | ``` 106 | 107 | ## evaluation metric 108 | * mean f1-score 109 | * test_data/query 의 각 상품에 대해 match된 결과의 precision과 recall의 f1 score를 모든 상품에 대해 평균한 값 110 | * evaluation.py 코드 참조. 111 | 112 | 113 | ## 기타 114 | 115 | - Team blog: https://medium.com/naver-shopping-dev 116 | - Contact: 오광진 kj.oh@navercorp.com 117 | 118 | 119 | ## FAQ 120 | 121 | Q : Pretrained model 사용이 가능한가요? 122 | 123 | A : 사용 가능합니다. 
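The `## evaluation metric` section above defines mean F1 only in words, so here is a tiny worked example added for clarity. It reuses the per-query `f1_score` definition from the `evaluation.py` included below; the prediction values are hypothetical.

```python
# Illustrative only: per-query F1 (as in evaluation.py), averaged over all queries.
def f1_score(gt, pred):
    if len(pred) == 0:
        return 0.0
    intsct_len = len(set(gt).intersection(set(pred)))
    if intsct_len == 0:
        return 0.0
    precision = intsct_len / len(pred)
    recall = intsct_len / len(gt)
    return 2. / (1. / precision + 1. / recall)


# Ground truth from the README example above, plus a hypothetical prediction.
ground_truth = [
    ("nv_mid_002", ["nv_mid_001", "nv_mid_002", "nv_mid_005"]),
    ("nv_mid_003", ["nv_mid_003", "nv_mid_004"]),
]
prediction = {
    "nv_mid_002": ["nv_mid_001", "nv_mid_002"],  # 2 of 3 found -> F1 = 0.8
    "nv_mid_003": ["nv_mid_003", "nv_mid_004"],  # exact match  -> F1 = 1.0
}
mean_f1 = sum(f1_score(gt, prediction.get(q, [])) for q, gt in ground_truth) / len(ground_truth)
print(mean_f1)  # 0.9
```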
124 | -------------------------------------------------------------------------------- /2-5/756/data_loader.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from nsml.constants import DATASET_PATH 4 | 5 | 6 | def test_data_loader(root_path): 7 | return root_path 8 | 9 | 10 | def write_output(output_file, data): 11 | with open(output_file, 'w') as f: 12 | for x in data: 13 | f.write(x[0] + ' ' + ','.join(x[1]) + '\n') 14 | 15 | 16 | def feed_infer(output_file, infer_func): 17 | print('DATASET_PATH=', DATASET_PATH) 18 | #os.system('/bin/ls -lR ' + DATASET_PATH) 19 | prediction = infer_func(test_data_loader(DATASET_PATH)) 20 | write_output(output_file, prediction) 21 | if os.stat(output_file).st_size == 0: 22 | raise AssertionError('output result of inference is nothing') 23 | 24 | -------------------------------------------------------------------------------- /2-5/756/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | #import fastparquet 4 | import numpy as np 5 | import random 6 | import time 7 | from collections import defaultdict 8 | 9 | from glob import glob 10 | import torch 11 | from torch.utils.data import DataLoader 12 | from typing import Dict 13 | from string import punctuation 14 | 15 | 16 | def get_data_columns(has_label): 17 | if has_label: 18 | return ['nv_mid', 'prod_nm', 'match_nv_mid'] 19 | else: 20 | return ['nv_mid', 'prod_nm'] 21 | 22 | 23 | def read_product_data_from_parquet(parquet_path, has_label): 24 | df = pd.read_parquet(parquet_path, columns=get_data_columns(has_label)) 25 | return df 26 | 27 | 28 | def get_catalogs(df): 29 | return list(set(df['match_nv_mid'])) 30 | 31 | def query_finder(labels,sentences,nv_mid): 32 | dict = {} 33 | count_dict = {} 34 | for index, label in enumerate(labels): 35 | try: 36 | dummy = dict[label] 37 | count_dict[label] += 1 38 | except: 39 | dict[label] = [sentences[index],nv_mid[index]] 40 | count_dict[label] = 1 41 | 42 | query_labels = [] 43 | query_sentences = [] 44 | query_nv_mid = [] 45 | 46 | for label, item_list in dict.items(): 47 | query_labels.append(label) 48 | query_sentences.append(item_list[0]) 49 | query_nv_mid.append(item_list[1]) 50 | 51 | 52 | return query_labels, query_sentences, query_nv_mid, count_dict 53 | 54 | 55 | 56 | class CatalogDataset(torch.utils.data.Dataset): 57 | def __init__(self, args, data_path, has_label=True): 58 | super(CatalogDataset, self).__init__() 59 | print('CatalogDataset: data_path=', data_path) 60 | self.batch_size = args.batch_size 61 | self.data_path = data_path 62 | self.df = read_product_data_from_parquet(data_path, has_label) 63 | self.delete_list = ['무료배송','배송','무료','핫딜','당일배송','빠른배송','퀵배송','당일','익일발송','발송'] 64 | 65 | 66 | def train_sentences(self): 67 | # print("preprocessing 전:",self.df['prod_nm'].tolist()[0:10]) 68 | x = self.pre_processing_list(self.df['prod_nm'].tolist()) 69 | # print("preprocessing 후:",x[0:10]) 70 | return x 71 | 72 | def train_labels(self): 73 | return self.df['match_nv_mid'].tolist() 74 | 75 | def get_nv_mid(self): 76 | return self.df['nv_mid'].tolist() 77 | 78 | def __len__(self): 79 | print(self.df.shape) 80 | return self.df.shape[0] 81 | 82 | def __getitem__(self, idx): 83 | items = self.df.iloc[idx] 84 | if self.df.shape[1] == 3: 85 | # nv_mid = [items['nv_mid']] 86 | # prod_nm = [items['prod_nm']] 87 | # match_nv_mid = [items['match_nv_mid']] 88 | nv_mid = items['nv_mid'] 89 | prod_nm = items['prod_nm'] 90 | 
match_nv_mid = items['match_nv_mid'] 91 | prod_nm = self.pre_processing(prod_nm) 92 | return nv_mid ,prod_nm, match_nv_mid 93 | 94 | else : 95 | nv_mid = items['nv_mid'] 96 | prod_nm = items['prod_nm'] 97 | prod_nm = self.pre_processing(prod_nm) 98 | return nv_mid ,prod_nm 99 | 100 | def get_torch_loader(self, dataset): 101 | 102 | loader = DataLoader(dataset, self.batch_size) 103 | 104 | return loader 105 | 106 | def pre_processing(self,prod_nm): 107 | 108 | for ele in prod_nm: 109 | if ele in punctuation: 110 | prod_nm = prod_nm.replace(ele, " ") 111 | 112 | seperator = " " 113 | words = prod_nm.split(seperator) 114 | 115 | for ele in words: 116 | if ele in self.delete_list: 117 | words.remove(ele) 118 | prod_nm = ' '.join(words) ## 띄어쓰기 없애고도 해보기 119 | return prod_nm 120 | 121 | def pre_processing_list(self,prod_nm_list): 122 | 123 | for i,prod_nm in enumerate(prod_nm_list): 124 | for ele in prod_nm: 125 | if ele in punctuation: 126 | prod_nm = prod_nm.replace(ele, " ") 127 | 128 | seperator = " " 129 | words = prod_nm.split(seperator) 130 | 131 | for ele in words: 132 | if ele in self.delete_list: 133 | words.remove(ele) 134 | prod_nm = ' '.join(words) ## 띄어쓰기 없애고도 해보기 135 | 136 | prod_nm_list[i] = prod_nm 137 | 138 | return prod_nm_list 139 | 140 | -------------------------------------------------------------------------------- /2-5/756/evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | 4 | def read_prediction(prediction_file): 5 | return read_strings(prediction_file) 6 | 7 | 8 | def read_ground_truth(ground_truth_file): 9 | return read_strings(ground_truth_file) 10 | 11 | 12 | def f1_score(gt, pred): 13 | if len(pred) == 0: 14 | return 0.0 15 | intsct_len = len(set(gt).intersection(set(pred))) 16 | if intsct_len == 0: 17 | return 0.0 18 | precision = intsct_len / len(pred) 19 | recall = intsct_len / len(gt) 20 | return 2. / (1. / precision + 1. 
/ recall) 21 | 22 | def evaluate(result,gt_list): 23 | pred_dict = dict(result) 24 | f1_sum = 0.0 25 | for query, match in gt_list: 26 | if query in pred_dict: 27 | pred_match = pred_dict[query] 28 | f1_sum += f1_score(match, pred_match) 29 | mean_f1 = f1_sum / len(gt_list) 30 | return mean_f1 31 | 32 | 33 | def evaluation_metrics(prediction_file: str, ground_truth_file: str): 34 | while True: 35 | # prediction, ground_truth 36 | # example : [('A', ['a', 'b', 'c']), ('B', ['d', 'e']), ('C', ['f', 'g', 'h'])] 37 | prediction = read_prediction(prediction_file) 38 | ground_truth = read_ground_truth(ground_truth_file) 39 | 40 | pred_dict = dict(prediction) 41 | f1_sum = 0.0 42 | 43 | for query, match in ground_truth: 44 | if query in pred_dict: 45 | pred_match = pred_dict[query] 46 | f1_sum += f1_score(match, pred_match) 47 | 48 | mean_f1 = f1_sum / len(ground_truth) 49 | break 50 | 51 | 52 | return mean_f1 53 | 54 | 55 | def read_strings(input_file): 56 | lines = open(input_file, "r").read().splitlines() 57 | query_matches = [line.split(' ') for line in lines] 58 | return [(query, matches.split(',')) for query, matches in query_matches] 59 | 60 | 61 | if __name__ == '__main__': 62 | parser = argparse.ArgumentParser() 63 | parser.add_argument('--prediction', type=str, default='pred.txt') 64 | parser.add_argument('--test_label_path', type=str) 65 | args = parser.parse_args() 66 | 67 | print(evaluation_metrics(args.prediction, args.test_label_path)) -------------------------------------------------------------------------------- /2-5/756/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import sys 5 | 6 | 7 | 8 | def main(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--mode", type=str, default="train") 11 | parser.add_argument('--pause', type=int, default=0) 12 | parser.add_argument('--batch_size', type=int, default=32) 13 | parser.add_argument('--epoch', type=int, default=5) 14 | 15 | args = parser.parse_args() 16 | 17 | os.system('git clone https://github.com/SKTBrain/KoBERT.git') 18 | os.chdir('./KoBERT') 19 | os.system('pip install -r requirements.txt') 20 | os.system('pip install .') 21 | os.chdir('..') 22 | os.system('git clone https://github.com/billyirrish/KoSentenceBERT_SKTBERT.git') 23 | os.chdir('./KoSentenceBERT_SKTBERT') 24 | os.system('pip install -r requirements.txt') 25 | os.system('wget https://github.com/kdh4672/Study_GAN/releases/download/1/result.pt') 26 | os.system('mkdir ./output/training_sts/0_Transformer') 27 | os.system('mv result.pt ./output/training_sts/0_Transformer/result.pt') 28 | # print(os.listdir('./KoSentenceBERT_SKTBERT/output/training_sts/0_Transformer/')) 29 | os.system('mv ../main2.py ./main2.py') 30 | os.system('python ./main2.py --pause {} --mode {}'.format(args.pause, args.mode)) 31 | 32 | if __name__ == "__main__": 33 | main() 34 | 35 | -------------------------------------------------------------------------------- /2-5/756/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class MyModel(nn.Module): 6 | def __init__(self): 7 | super(MyModel, self).__init__() 8 | self.fc = nn.Linear(2, 3) 9 | 10 | def forward(self, x): 11 | return self.fc(x) -------------------------------------------------------------------------------- /2-5/756/nsml_model/king_of_ai/model/model.pt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/naver-airush/airush2021_source-code/bcbf1d3c638e1280f04df3bac36c29ce059fcaa2/2-5/756/nsml_model/king_of_ai/model/model.pt -------------------------------------------------------------------------------- /2-5/756/nsml_package.txt: -------------------------------------------------------------------------------- 1 | git 2 | wget 3 | -------------------------------------------------------------------------------- /2-5/756/setup.py: -------------------------------------------------------------------------------- 1 | #nsml: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel 2 | from distutils.core import setup 3 | 4 | setup( 5 | name='Catalog matching model', 6 | version='0.2', 7 | description='Catalog matching model', 8 | install_requires=[ 9 | 'numpy', 10 | 'python-snappy==0.6.0', 11 | 'pyarrow==2.0.0', 12 | 'fastparquet==0.4.2', 13 | 'pandas==1.1.5', 14 | # 'sentence_transformers', 15 | # 'faiss-gpu', 16 | 'gensim' 17 | 18 | # 'kmeans-pytorch', 19 | #'gluonnlp', 20 | #'mxnet' 21 | 22 | #'transformers==3.5', 23 | #'albumentations', 24 | #'faiss', 25 | #'sklearn', 26 | #'opencv-python', 27 | ], 28 | ) -------------------------------------------------------------------------------- /2-5/756/shell.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | os.system('git clone https://github.com/SKTBrain/KoBERT.git') -------------------------------------------------------------------------------- /2-5/command.txt: -------------------------------------------------------------------------------- 1 | nsml run -e main.py -d airush2021-2-5 -g 1 --cpus 2 --memory 24000000000 --shm-size 1000000000 2 | nsml run -e main.py -d airush2021-2-5 -g 1 --cpus 2 --memory 24000000000 --shm-size 1000000000 3 | -------------------------------------------------------------------------------- /2-5/rank.txt: -------------------------------------------------------------------------------- 1 | 756 2 | 582 3 | 1233 4 | -------------------------------------------------------------------------------- /Copyright owner: -------------------------------------------------------------------------------- 1 | 본 코드에 대한 저작권은 네이버 주식회사 및 AI RUSH 2021 Members가 가지며 2 | 이에 해당하는 AI RUSH 2021 Members는 이하와 같습니다. 3 | 4 | 2-4 5 | 1등 : 신찬호 6 | 2등: 최세현 7 | 3등: 최나영 8 | 9 | 10 | 2-5 11 | 1등 : 공대현 < 12 | 2등: 장진호 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | AI RUSH 2021_source-code 2 | Copyright (c) 2022-present AI RUSH 2021 Members and NAVER Corp. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in 12 | all copies or substantial portions of the Software. 
13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AI RUSH 2021_source-code 2 | 3 | Some of the tasks and top-performing models produced through AI RUSH are released here
4 | so that external developers and prospective participants can freely review and learn from them.
5 |
6 | 7 | ## CLOVA AI RUSH 2021 tasks with released code ## 8 | 9 | | Task no. | Task name | 10 | | --- | --- | 11 | | 2-4 | Enhancing the SmartEditor grammar correction assistant | 12 | | 2-5 | Shopping catalog clustering | 13 | 14 |
35 | 36 | ## CLOVA AI RUSH 2022 now recruiting 37 | Details and how to apply: https://campaign.naver.com/clova_airush/ 38 | 39 | | Item | Schedule | 40 | | --- | --- | 41 | | Application | Until 2022.5.30 (Mon) 23:59 | 42 | | Round 1 | 2022.7.5 (Tue) - 7.26 (Wed) | 43 | | Round 2 | 2022.8.1 (Mon) - 8.26 (Fri) | 44 | | Conference | To be announced | 45 | 46 |
73 | 74 | ![CLOVA AI RUSH 2022_Poster_FIN_RGB_0509-02](https://user-images.githubusercontent.com/34671719/167405166-19969bd0-c4fd-4fd0-ac9e-1e065f186dc8.png) 75 | 76 | 77 | ## License 78 | AI RUSH 2021_source-code
79 | Copyright (c) 2022-present AI RUSH 2021 Members and NAVER Corp. 80 | 81 | Permission is hereby granted, free of charge, to any person obtaining a copy 82 | of this software and associated documentation files (the "Software"), to deal 83 | in the Software without restriction, including without limitation the rights 84 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 85 | copies of the Software, and to permit persons to whom the Software is 86 | furnished to do so, subject to the following conditions: 87 | 88 | The above copyright notice and this permission notice shall be included in 89 | all copies or substantial portions of the Software. 90 | 91 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 92 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 93 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 94 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 95 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 96 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 97 | THE SOFTWARE. 98 | --------------------------------------------------------------------------------