├── README.md ├── png ├── model1.png ├── model2.png ├── model3.png └── result.png ├── prepare_fold_data.py ├── prepare_lm_data_mask.py ├── prepare_lm_data_ngram.py ├── pydatagrand ├── __init__.py ├── callback │ ├── __init__.py │ ├── earlystopping.py │ ├── lrscheduler.py │ ├── modelcheckpoint.py │ ├── optimizater.py │ ├── progressbar.py │ ├── trainingmonitor.py │ └── utils.py ├── common │ ├── __init__.py │ └── tools.py ├── configs │ ├── __init__.py │ ├── base.py │ └── bert_config.py ├── dataset │ ├── __init__.py │ └── corpus │ │ ├── __init__.py │ │ └── train │ │ └── __init__.py ├── doc │ └── __init__.py ├── embedding │ ├── __init__.py │ └── w2v_embedding.py ├── io │ ├── __init__.py │ ├── bert_seq_processor.py │ ├── bert_span_processor.py │ └── vocabulary.py ├── model │ ├── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── cnn.py │ │ ├── crf.py │ │ ├── dropouts.py │ │ ├── embedding.py │ │ ├── linears.py │ │ └── normalization.py │ ├── nn │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ └── __init__.cpython-37.pyc │ │ ├── bert_lstm_crf.py │ │ ├── bert_lstm_crf_mdp.py │ │ └── bert_lstm_span.py │ └── pytorch_transformers │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── configuration_auto.py │ │ ├── configuration_bert.py │ │ ├── configuration_distilbert.py │ │ ├── configuration_gpt2.py │ │ ├── configuration_openai.py │ │ ├── configuration_roberta.py │ │ ├── configuration_transfo_xl.py │ │ ├── configuration_utils.py │ │ ├── configuration_xlm.py │ │ ├── configuration_xlnet.py │ │ ├── convert_gpt2_checkpoint_to_pytorch.py │ │ ├── convert_openai_checkpoint_to_pytorch.py │ │ ├── convert_pytorch_checkpoint_to_tf.py │ │ ├── convert_roberta_checkpoint_to_pytorch.py │ │ ├── convert_tf_checkpoint_to_pytorch.py │ │ ├── convert_transfo_xl_checkpoint_to_pytorch.py │ │ ├── convert_xlm_checkpoint_to_pytorch.py │ │ ├── convert_xlnet_checkpoint_to_pytorch.py │ │ ├── file_utils.py │ │ ├── modeling_auto.py │ │ ├── modeling_bert.py │ │ ├── modeling_distilbert.py │ │ ├── modeling_gpt2.py │ │ ├── modeling_openai.py │ │ ├── modeling_roberta.py │ │ ├── modeling_transfo_xl.py │ │ ├── modeling_transfo_xl_utilities.py │ │ ├── modeling_utils.py │ │ ├── modeling_xlm.py │ │ ├── modeling_xlnet.py │ │ ├── optimization.py │ │ ├── tests │ │ ├── __init__.py │ │ ├── configuration_common_test.py │ │ ├── conftest.py │ │ ├── modeling_auto_test.py │ │ ├── modeling_bert_test.py │ │ ├── modeling_common_test.py │ │ ├── modeling_distilbert_test.py │ │ ├── modeling_gpt2_test.py │ │ ├── modeling_openai_test.py │ │ ├── modeling_roberta_test.py │ │ ├── modeling_transfo_xl_test.py │ │ ├── modeling_xlm_test.py │ │ ├── modeling_xlnet_test.py │ │ ├── optimization_test.py │ │ ├── tokenization_auto_test.py │ │ ├── tokenization_bert_test.py │ │ ├── tokenization_dilbert_test.py │ │ ├── tokenization_gpt2_test.py │ │ ├── tokenization_openai_test.py │ │ ├── tokenization_roberta_test.py │ │ ├── tokenization_tests_commons.py │ │ ├── tokenization_transfo_xl_test.py │ │ ├── tokenization_utils_test.py │ │ ├── tokenization_xlm_test.py │ │ └── tokenization_xlnet_test.py │ │ ├── tokenization_auto.py │ │ ├── tokenization_bert.py │ │ ├── tokenization_distilbert.py │ │ ├── tokenization_gpt2.py │ │ ├── tokenization_openai.py │ │ ├── tokenization_roberta.py │ │ ├── tokenization_transfo_xl.py │ │ ├── tokenization_utils.py │ │ ├── tokenization_xlm.py │ │ └── tokenization_xlnet.py ├── output │ ├── __init__.py │ ├── checkpoints │ │ └── __init__.py │ ├── embedding │ │ └── __init__.py │ ├── feature │ │ └── __init__.py │ ├── figure │ │ └── __init__.py │ 
├── log │ │ └── __init__.py │ └── result │ │ └── __init__.py ├── preprocessing │ ├── __init__.py │ ├── augmentation.py │ ├── chinese_preprocessor.py │ └── english_preprocessor.py ├── pretrain │ ├── __init__.py │ ├── bert │ │ └── base-uncased │ │ │ └── __init__.py │ └── xlnet │ │ └── base-cased │ │ └── __init__.py ├── test │ ├── __init__.py │ └── predicter.py └── train │ ├── __init__.py │ ├── ema.py │ ├── losses.py │ ├── metrics.py │ ├── ner_seq_trainer.py │ ├── ner_span_trainer.py │ └── ner_utils.py ├── run_bert_crf.py ├── run_bert_span.py ├── run_submit.py └── train_bert_model.py /README.md: -------------------------------------------------------------------------------- 1 | # datagrand_2019_rank9 2 | 3 | 2019年达观信息提取比赛第九名代码和答辩ppt 4 | 5 | 比赛地址:[官网](https://www.biendata.com/competition/datagrand/) 6 | 7 | ## 代码目录结构 8 | ```text 9 | ├── pydatagrand 10 | | └── callback 11 | | | └── lrscheduler.py   12 | | | └── trainingmonitor.py  13 | | | └── ... 14 | | └── config 15 | | | └── basic_config.py #a configuration file for storing model parameters 16 | | └── dataset    17 | | └── io     18 | | | └── dataset.py   19 | | | └── data_transformer.py   20 | | └── model 21 | | | └── nn  22 | | | └── pretrain  23 | | └── output #save the ouput of model 24 | | └── preprocessing #text preprocessing 25 | | └── train #used for training a model 26 | | | └── trainer.py 27 | | | └── ... 28 | | └── common # a set of utility functions 29 | ├── prepare_fold_data.py # 数据切分 30 | ├── prepare_lm_data_mask.py # 随机mask 31 | ├── prepare_lm_data_ngram.py #ngram mask 32 | ├── run_bert_crf.py # crf结构 33 | ├── run_bert_span.py   # span结构 34 | ├── train_bert_model.py  #训练bert模型 35 | 36 | ``` 37 | ## 预训练模型 38 | 39 | 主要训练了8层跟12层BERT模型,采用随机mask + ngram mask两种混合动态masking模式 40 | 41 | ## 方案1 42 | 43 | 方案1主要采用BERT+LSTM+CRF结构 44 | 45 | ![](./png/model1.png) 46 | 47 | ## 方案2 48 | 49 | 方案2在方案1的基础上增加了MDP结构 50 | 51 | ![](./png/model2.png) 52 | 53 | ## 方案3 54 | 55 | 方案3主要采用BERT+LSTM+SPAN结构 56 | 57 | ![](./png/model3.png) 58 | 59 | ## 结果 60 | 最终结果如下所示: 61 | 62 | ![](./png/result.png) 63 | 64 | ## 文档 65 | 66 | 十强答辩ppt下载地址: https://pan.baidu.com/s/1yvXFf5GzyvDksdBKNp9FKQ 提取码: svr2 67 | 68 | -------------------------------------------------------------------------------- /png/model1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/png/model1.png -------------------------------------------------------------------------------- /png/model2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/png/model2.png -------------------------------------------------------------------------------- /png/model3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/png/model3.png -------------------------------------------------------------------------------- /png/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/png/result.png -------------------------------------------------------------------------------- /pydatagrand/__init__.py: 
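The README above notes that the 8- and 12-layer BERT models were pretrained with a mix of random token masking and n-gram masking (prepare_lm_data_mask.py / prepare_lm_data_ngram.py). As a rough illustration of the n-gram variant only, here is a minimal sketch; the function name and parameters are invented for the example and are not taken from those scripts.

```python
# Minimal sketch of n-gram masking (illustrative only; not the code in
# prepare_lm_data_ngram.py). Whole n-grams are replaced by [MASK] until
# roughly mask_prob of the tokens are covered.
import random

def ngram_mask(tokens, mask_token="[MASK]", mask_prob=0.15, max_ngram=3):
    tokens = list(tokens)
    if not tokens:
        return tokens
    budget = max(1, int(len(tokens) * mask_prob))
    masked = set()
    while len(masked) < budget:
        n = random.randint(1, max_ngram)
        start = random.randint(0, max(0, len(tokens) - n))
        for i in range(start, min(start + n, len(tokens))):
            if len(masked) >= budget:
                break
            tokens[i] = mask_token
            masked.add(i)
    return tokens

# Anonymized word ids similar to the competition corpus (values are made up).
print(ngram_mask("17281 342 9023 110 57 88 19 4 1020 33".split()))
```

Because the masking is applied on the fly, each pretraining epoch sees a different set of masked n-grams, which is what the README calls dynamic masking.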
-------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/callback/__init__.py: -------------------------------------------------------------------------------- 1 | from .progressbar import ProgressBar 2 | from .earlystopping import EarlyStopping 3 | from .trainingmonitor import TrainingMonitor 4 | from .modelcheckpoint import ModelCheckpoint 5 | 6 | from .lrscheduler import CustomDecayLR 7 | from .lrscheduler import BertLR 8 | from .lrscheduler import CyclicLR 9 | from .lrscheduler import ReduceLROnPlateau 10 | from .lrscheduler import ReduceLRWDOnPlateau 11 | from .lrscheduler import CosineLRWithRestarts 12 | from .lrscheduler import NoamLR 13 | from .lrscheduler import OneCycleScheduler 14 | from .lrscheduler import BERTReduceLROnPlateau 15 | 16 | from .optimizater import Lamb 17 | from .optimizater import Lars 18 | from .optimizater import RAdam 19 | from .optimizater import Ralamb 20 | from .optimizater import Lookahead 21 | from .optimizater import RaLars 22 | from .optimizater import Ranger 23 | from .optimizater import SGDW 24 | from .optimizater import AdamW 25 | from .optimizater import AdaBound 26 | from .optimizater import Nadam 27 | from .optimizater import AdaFactor 28 | from .optimizater import WeightDecayOptimizerWrapper 29 | from .optimizater import NovoGrad 30 | from .optimizater import BertAdam 31 | 32 | -------------------------------------------------------------------------------- /pydatagrand/callback/earlystopping.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..common.tools import logger 3 | 4 | class EarlyStopping(object): 5 | ''' 6 | early stopping 功能 7 | # Arguments 8 | min_delta: 最小变化 9 | patience: 多少个epoch未提高,就停止训练 10 | verbose: 信息大于,默认打印信息 11 | mode: 计算模式 12 | monitor: 计算指标 13 | baseline: 基线 14 | ''' 15 | def __init__(self, 16 | min_delta = 0, 17 | patience = 10, 18 | verbose = 1, 19 | mode = 'min', 20 | monitor = 'loss', 21 | baseline = None): 22 | 23 | self.baseline = baseline 24 | self.patience = patience 25 | self.verbose = verbose 26 | self.min_delta = min_delta 27 | self.monitor = monitor 28 | 29 | assert mode in ['min','max'] 30 | 31 | if mode == 'min': 32 | self.monitor_op = np.less 33 | elif mode == 'max': 34 | self.monitor_op = np.greater 35 | if self.monitor_op == np.greater: 36 | self.min_delta *= 1 37 | else: 38 | self.min_delta *= -1 39 | self.reset() 40 | 41 | def reset(self): 42 | # Allow instances to be re-used 43 | self.wait = 0 44 | self.stop_training = False 45 | if self.baseline is not None: 46 | self.best = self.baseline 47 | else: 48 | self.best = np.Inf if self.monitor_op == np.less else -np.Inf 49 | 50 | def epoch_step(self,current): 51 | if self.monitor_op(current - self.min_delta, self.best): 52 | self.best = current 53 | self.wait = 0 54 | else: 55 | self.wait += 1 56 | if self.wait >= self.patience: 57 | if self.verbose >0: 58 | logger.info(f"{self.patience} epochs with no improvement after which training will be stopped") 59 | self.stop_training = True 60 | -------------------------------------------------------------------------------- /pydatagrand/callback/modelcheckpoint.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import numpy as np 3 | import torch 4 | from ..common.tools import logger 5 | 6 | class ModelCheckpoint(object): 7 | ''' 8 | 模型保存,两种模式: 9 | 
1. 直接保存最好模型 10 | 2. 按照epoch频率保存模型 11 | ''' 12 | def __init__(self, checkpoint_dir, 13 | monitor, 14 | arch,mode='min', 15 | epoch_freq=1, 16 | best = None, 17 | save_best_only = True): 18 | if isinstance(checkpoint_dir,Path): 19 | checkpoint_dir = checkpoint_dir 20 | else: 21 | checkpoint_dir = Path(checkpoint_dir) 22 | assert checkpoint_dir.is_dir() 23 | checkpoint_dir.mkdir(exist_ok=True) 24 | self.base_path = checkpoint_dir 25 | self.arch = arch 26 | self.monitor = monitor 27 | self.epoch_freq = epoch_freq 28 | self.save_best_only = save_best_only 29 | 30 | # 计算模式 31 | if mode == 'min': 32 | self.monitor_op = np.less 33 | self.best = np.Inf 34 | 35 | elif mode == 'max': 36 | self.monitor_op = np.greater 37 | self.best = -np.Inf 38 | # 这里主要重新加载模型时候 39 | #对best重新赋值 40 | if best: 41 | self.best = best 42 | 43 | if save_best_only: 44 | self.model_name = f"BEST_{arch}_MODEL.pth" 45 | 46 | def epoch_step(self, state,current): 47 | ''' 48 | 正常模型 49 | :param state: 需要保存的信息 50 | :param current: 当前判断指标 51 | :return: 52 | ''' 53 | # 是否保存最好模型 54 | if self.save_best_only: 55 | if self.monitor_op(current, self.best): 56 | logger.info(f"\nEpoch {state['epoch']}: {self.monitor} improved from {self.best:.5f} to {current:.5f}") 57 | self.best = current 58 | state['best'] = self.best 59 | best_path = self.base_path/ self.model_name 60 | torch.save(state, str(best_path)) 61 | # 每隔几个epoch保存下模型 62 | else: 63 | filename = self.base_path / f"EPOCH_{state['epoch']}_{state[self.monitor]}_{self.arch}_MODEL.pth" 64 | if state['epoch'] % self.epoch_freq == 0: 65 | logger.info(f"\nEpoch {state['epoch']}: save model to disk.") 66 | torch.save(state, str(filename)) 67 | 68 | def bert_epoch_step(self, state,current): 69 | ''' 70 | 适合bert类型模型,适合pytorch_transformer模块 71 | :param state: 72 | :param current: 73 | :return: 74 | ''' 75 | model_to_save = state['model'] 76 | if self.save_best_only: 77 | if self.monitor_op(current, self.best): 78 | logger.info(f"\nEpoch {state['epoch']}: {self.monitor} improved from {self.best:.5f} to {current:.5f}") 79 | self.best = current 80 | state['best'] = self.best 81 | model_to_save.save_pretrained(str(self.base_path)) 82 | output_config_file = self.base_path / 'configs.json' 83 | with open(str(output_config_file), 'w') as f: 84 | f.write(model_to_save.config.to_json_string()) 85 | state.pop("model") 86 | torch.save(state,self.base_path / 'checkpoint_info.bin') 87 | else: 88 | if state['epoch'] % self.epoch_freq == 0: 89 | save_path = self.base_path / f"checkpoint-epoch-{state['epoch']}" 90 | save_path.mkdir(exist_ok=True) 91 | logger.info(f"\nEpoch {state['epoch']}: save model to disk.") 92 | model_to_save.save_pretrained(save_path) 93 | output_config_file = save_path / 'configs.json' 94 | with open(str(output_config_file), 'w') as f: 95 | f.write(model_to_save.config.to_json_string()) 96 | state.pop("model") 97 | torch.save(state, save_path / 'checkpoint_info.bin') 98 | -------------------------------------------------------------------------------- /pydatagrand/callback/progressbar.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class ProgressBar(object): 4 | ''' 5 | custom progress bar 6 | Example: 7 | >>> from progressbar import ProgressBar 8 | >>> pbar = ProgressBar(n_total=30,desc='training') 9 | >>> step = 2 10 | >>> pbar(step=step) 11 | ''' 12 | def __init__(self, n_total,width=30,desc = 'Training'): 13 | self.width = width 14 | self.n_total = n_total 15 | self.start_time = time.time() 16 | self.desc = desc 17 | 18 | 
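The callbacks shown above (EarlyStopping, ModelCheckpoint, ProgressBar) have no framework hooks; they are driven explicitly from the training loop through their `epoch_step` / `__call__` methods. Below is a minimal wiring sketch with a fake validation loss, assuming the repository root is on PYTHONPATH (so `pydatagrand` imports) and its dependencies are installed; the real loop lives under pydatagrand/train/.

```python
# Illustrative only: drives ProgressBar per step and EarlyStopping per epoch.
from pydatagrand.callback import EarlyStopping, ProgressBar

pbar = ProgressBar(n_total=50, desc="Training")
early_stopping = EarlyStopping(patience=2, mode="min", monitor="valid_loss")

fake_valid_losses = [0.90, 0.72, 0.70, 0.71, 0.73, 0.74]
for epoch, valid_loss in enumerate(fake_valid_losses, start=1):
    for step in range(50):
        pbar(step=step, info={"loss": valid_loss})   # prints an in-place bar
    early_stopping.epoch_step(current=valid_loss)    # tracks the monitored metric
    if early_stopping.stop_training:                 # patience exhausted -> stop
        break
```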
def __call__(self, step, info={}): 19 | now = time.time() 20 | current = step + 1 21 | recv_per = current / self.n_total 22 | bar = f'[{self.desc}] {current}/{self.n_total} [' 23 | if recv_per >= 1: 24 | recv_per = 1 25 | prog_width = int(self.width * recv_per) 26 | if prog_width > 0: 27 | bar += '=' * (prog_width - 1) 28 | if current< self.n_total: 29 | bar += ">" 30 | else: 31 | bar += '=' 32 | bar += '.' * (self.width - prog_width) 33 | bar += ']' 34 | show_bar = f"\r{bar}" 35 | time_per_unit = (now - self.start_time) / current 36 | if current < self.n_total: 37 | eta = time_per_unit * (self.n_total - current) 38 | if eta > 3600: 39 | eta_format = ('%d:%02d:%02d' % 40 | (eta // 3600, (eta % 3600) // 60, eta % 60)) 41 | elif eta > 60: 42 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 43 | else: 44 | eta_format = '%ds' % eta 45 | time_info = f' - ETA: {eta_format}' 46 | else: 47 | if time_per_unit >= 1: 48 | time_info = f' {time_per_unit:.1f}s/step' 49 | elif time_per_unit >= 1e-3: 50 | time_info = f' {time_per_unit * 1e3:.1f}ms/step' 51 | else: 52 | time_info = f' {time_per_unit * 1e6:.1f}us/step' 53 | 54 | show_bar += time_info 55 | if len(info) != 0: 56 | show_info = f'{show_bar} ' + \ 57 | "-".join([f' {key}: {value:.4f} ' for key, value in info.items()]) 58 | print(show_info, end='') 59 | else: 60 | print(show_bar, end='') 61 | -------------------------------------------------------------------------------- /pydatagrand/callback/trainingmonitor.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | import numpy as np 3 | from pathlib import Path 4 | import matplotlib.pyplot as plt 5 | from ..common.tools import load_json 6 | from ..common.tools import save_json 7 | plt.switch_backend('agg') # 防止ssh上绘图问题 8 | 9 | class TrainingMonitor(): 10 | def __init__(self, file_dir, arch, add_test=False): 11 | ''' 12 | :param startAt: 重新开始训练的epoch点 13 | ''' 14 | if isinstance(file_dir, Path): 15 | pass 16 | else: 17 | file_dir = Path(file_dir) 18 | file_dir.mkdir(parents=True, exist_ok=True) 19 | 20 | self.arch = arch 21 | self.file_dir = file_dir 22 | self.H = {} 23 | self.add_test = add_test 24 | self.json_path = file_dir / (arch + "_training_monitor.json") 25 | 26 | def reset(self,start_at): 27 | if start_at > 0: 28 | if self.json_path is not None: 29 | if self.json_path.exists(): 30 | self.H = load_json(self.json_path) 31 | for k in self.H.keys(): 32 | self.H[k] = self.H[k][:start_at] 33 | 34 | def epoch_step(self, logs={}): 35 | for (k, v) in logs.items(): 36 | l = self.H.get(k, []) 37 | # np.float32会报错 38 | if not isinstance(v, np.float): 39 | v = round(float(v), 4) 40 | l.append(v) 41 | self.H[k] = l 42 | 43 | # 写入文件 44 | if self.json_path is not None: 45 | save_json(data = self.H,file_path=self.json_path) 46 | 47 | # 保存train图像 48 | if len(self.H["loss"]) == 1: 49 | self.paths = {key: self.file_dir / (self.arch + f'_{key.upper()}') for key in self.H.keys()} 50 | 51 | if len(self.H["loss"]) > 1: 52 | # 指标变化 53 | # 曲线 54 | # 需要成对出现 55 | keys = [key for key, _ in self.H.items() if '_' not in key] 56 | for key in keys: 57 | N = np.arange(0, len(self.H[key])) 58 | plt.style.use("ggplot") 59 | plt.figure() 60 | plt.plot(N, self.H[key], label=f"train_{key}") 61 | # plt.plot(N, self.H[f"valid_{key}"], label=f"valid_{key}") 62 | if self.add_test: 63 | plt.plot(N, self.H[f"test_{key}"], label=f"test_{key}") 64 | plt.legend() 65 | plt.xlabel("Epoch #") 66 | plt.ylabel(key) 67 | plt.title(f"Training {key} [Epoch {len(self.H[key])}]") 68 | 
plt.savefig(str(self.paths[key])) 69 | plt.close() 70 | -------------------------------------------------------------------------------- /pydatagrand/callback/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | def warmup_cosine(x, warmup=0.002): 5 | if x < warmup: 6 | return x/warmup 7 | return 0.5 * (1.0 + torch.cos(math.pi * x)) 8 | 9 | def warmup_constant(x, warmup=0.002): 10 | if x < warmup: 11 | return x/warmup 12 | return 1.0 13 | 14 | def warmup_linear(x, warmup=0.002): 15 | if x < warmup: 16 | return x/warmup 17 | return 1.0 - x 18 | 19 | SCHEDULES = { 20 | 'warmup_cosine':warmup_cosine, 21 | 'warmup_constant':warmup_constant, 22 | 'warmup_linear':warmup_linear, 23 | } 24 | -------------------------------------------------------------------------------- /pydatagrand/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/pydatagrand/common/__init__.py -------------------------------------------------------------------------------- /pydatagrand/configs/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/configs/base.py: -------------------------------------------------------------------------------- 1 | 2 | from pathlib import Path 3 | BASE_DIR = Path('pydatagrand') 4 | config = { 5 | 'data_dir': BASE_DIR / 'dataset', 6 | 'log_dir': BASE_DIR / 'output/log', 7 | 'writer_dir': BASE_DIR / "output/TSboard", 8 | 'figure_dir': BASE_DIR / "output/figure", 9 | 'checkpoint_dir': BASE_DIR / "output/checkpoints", 10 | 'cache_dir': BASE_DIR / 'model/', 11 | 'result_dir': BASE_DIR / "output/result", 12 | } 13 | 14 | -------------------------------------------------------------------------------- /pydatagrand/configs/bert_config.py: -------------------------------------------------------------------------------- 1 | bert_base_config = { 2 | "attention_probs_dropout_prob": 0.2, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.2, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "pooler_fc_size": 768, 13 | "pooler_num_attention_heads": 12, 14 | "pooler_num_fc_layers": 3, 15 | "pooler_size_per_head": 128, 16 | "pooler_type": "first_token_transform", 17 | "type_vocab_size": 2, 18 | "vocab_size": 21228 19 | } 20 | -------------------------------------------------------------------------------- /pydatagrand/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/dataset/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/dataset/corpus/train/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/doc/__init__.py: 
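callback/utils.py above keys three warmup functions by name in SCHEDULES; each maps training progress in [0, 1] to a multiplier on the base learning rate. A small check of `warmup_linear` (the pure-Python one), assuming `pydatagrand` is importable:

```python
# warmup_linear ramps the multiplier up over the warmup fraction of training,
# then decays it linearly towards zero.
from pydatagrand.callback.utils import SCHEDULES

base_lr = 2e-5
warmup_linear = SCHEDULES["warmup_linear"]
total_steps = 1000

for step in (0, 50, 100, 500, 999):
    progress = step / total_steps
    print(step, base_lr * warmup_linear(progress, warmup=0.1))
```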
-------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/embedding/w2v_embedding.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | class LoadEmbedding(object): 6 | ''' 7 | word_index:{word:id} 8 | ''' 9 | 10 | def __init__(self, max_features, word_index): 11 | self.max_features = max_features 12 | self.word_index = word_index 13 | 14 | def load_glove(self, embedding_path): 15 | ''' 16 | embedding_path = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt' 17 | ''' 18 | 19 | def get_coefs(word, *arr): 20 | return word, np.asarray(arr, dtype='float32')[:300] 21 | 22 | embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(embedding_path)) 23 | all_embs = np.stack(embeddings_index.values()) 24 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 25 | embed_size = all_embs.shape[1] 26 | nb_words = min(self.max_features, len(self.word_index)) 27 | embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) 28 | for word, i in self.word_index.items(): 29 | if i >= self.max_features: continue 30 | embedding_vector = embeddings_index.get(word) 31 | # ALLmight 32 | if embedding_vector is not None: 33 | embedding_matrix[i] = embedding_vector 34 | else: 35 | embedding_vector = embeddings_index.get(word.capitalize()) 36 | if embedding_vector is not None: 37 | embedding_matrix[i] = embedding_vector 38 | return embedding_matrix 39 | 40 | def load_fasttext(self, embedding_path): 41 | ''' 42 | embedding_path = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec' 43 | ''' 44 | 45 | def get_coefs(word, *arr): 46 | return word, np.asarray(arr, dtype='float32') 47 | 48 | embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(embedding_path) if len(o) > 100) 49 | all_embs = np.stack(embeddings_index.values()) 50 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 51 | embed_size = all_embs.shape[1] 52 | nb_words = min(self.max_features, len(self.word_index)) 53 | embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) 54 | for word, i in self.word_index.items(): 55 | if i >= self.max_features: continue 56 | embedding_vector = embeddings_index.get(word) 57 | if embedding_vector is not None: embedding_matrix[i] = embedding_vector 58 | return embedding_matrix 59 | 60 | def load_para(self, embedding_path): 61 | ''' 62 | embedding_path = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt' 63 | ''' 64 | 65 | def get_coefs(word, *arr): 66 | return word, np.asarray(arr, dtype='float32') 67 | 68 | embeddings_index = dict( 69 | get_coefs(*o.split(" ")) for o in open(embedding_path, encoding="utf8", errors='ignore') if len(o) > 100) 70 | all_embs = np.stack(embeddings_index.values()) 71 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 72 | embed_size = all_embs.shape[1] 73 | # word_index = tokenizer.word_index 74 | nb_words = min(self.max_features, len(self.word_index)) 75 | embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) 76 | for word, i in self.word_index.items(): 77 | if i >= self.max_features: continue 78 | embedding_vector = embeddings_index.get(word) 79 | if 
embedding_vector is not None: embedding_matrix[i] = embedding_vector 80 | 81 | return embedding_matrix 82 | 83 | def load_custom_embedding(self, embedding_path): 84 | ''' 85 | embedding_path = '../input/embeddings/word2vec.bin' 86 | ''' 87 | 88 | def get_coefs(word, *arr): 89 | return word, np.asarray(arr, dtype='float32') 90 | 91 | embeddings_index = dict( 92 | get_coefs(*o.strip("\n").split(" ")) for o in open(embedding_path, 'r') if o.strip("\n").split(" ")[0]!='') 93 | all_embs = np.stack(embeddings_index.values()) 94 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 95 | embed_size = all_embs.shape[1] 96 | # word_index = tokenizer.word_index 97 | nb_words = min(self.max_features, len(self.word_index)) 98 | embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) 99 | for word, i in self.word_index.items(): 100 | if i >= self.max_features: continue 101 | embedding_vector = embeddings_index.get(word) 102 | if embedding_vector is not None: embedding_matrix[i] = embedding_vector 103 | return embedding_matrix 104 | -------------------------------------------------------------------------------- /pydatagrand/io/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/io/vocabulary.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from collections import OrderedDict 3 | from ..common.tools import save_pickle 4 | from ..common.tools import load_pickle 5 | 6 | 7 | class Vocabulary(object): 8 | def __init__(self, max_size=None, 9 | min_freq=None, 10 | pad_token="[PAD]", 11 | unk_token="[UNK]", 12 | cls_token="[CLS]", 13 | sep_token="[SEP]", 14 | mask_token="[MASK]", 15 | add_unused=False): 16 | self.max_size = max_size 17 | self.min_freq = min_freq 18 | self.cls_token = cls_token 19 | self.sep_token = sep_token 20 | self.pad_token = pad_token 21 | self.mask_token = mask_token 22 | self.unk_token = unk_token 23 | self.word2idx = {} 24 | self.idx2word = None 25 | self.rebuild = True 26 | self.add_unused = add_unused 27 | self.word_counter = Counter() 28 | self.reset() 29 | 30 | def reset(self): 31 | ctrl_symbols = [self.pad_token, self.unk_token, self.cls_token, self.sep_token, self.mask_token] 32 | for index, syb in enumerate(ctrl_symbols): 33 | self.word2idx[syb] = index 34 | 35 | if self.add_unused: 36 | for i in range(20): 37 | self.word2idx[f'[UNUSED{i}]'] = len(self.word2idx) 38 | 39 | def update(self, word_list): 40 | ''' 41 | 依次增加序列中词在词典中的出现频率 42 | :param word_list: 43 | :return: 44 | ''' 45 | self.word_counter.update(word_list) 46 | 47 | def add(self, word): 48 | ''' 49 | 增加一个新词在词典中的出现频率 50 | :param word: 51 | :return: 52 | ''' 53 | self.word_counter[word] += 1 54 | 55 | def has_word(self, word): 56 | ''' 57 | 检查词是否被记录 58 | :param word: 59 | :return: 60 | ''' 61 | return word in self.word2idx 62 | 63 | def to_index(self, word): 64 | ''' 65 | 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出 66 | :param word: 67 | :return: 68 | ''' 69 | if word in self.word2idx: 70 | return self.word2idx[word] 71 | if self.unk_token is not None: 72 | return self.word2idx[self.unk_token] 73 | else: 74 | raise ValueError("word {} not in vocabulary".format(word)) 75 | 76 | def unknown_idx(self): 77 | """ 78 | unknown 对应的数字. 
79 | """ 80 | if self.unk_token is None: 81 | return None 82 | return self.word2idx[self.unk_token] 83 | 84 | def padding_idx(self): 85 | """ 86 | padding 对应的数字 87 | """ 88 | if self.pad_token is None: 89 | return None 90 | return self.word2idx[self.pad_token] 91 | 92 | def to_word(self, idx): 93 | """ 94 | 给定一个数字, 将其转为对应的词. 95 | 96 | :param int idx: the index 97 | :return str word: the word 98 | """ 99 | return self.idx2word[idx] 100 | 101 | def build_vocab(self): 102 | max_size = min(self.max_size, len(self.word_counter)) if self.max_size else None 103 | words = self.word_counter.most_common(max_size) 104 | if self.min_freq is not None: 105 | words = filter(lambda kv: kv[1] >= self.min_freq, words) 106 | if self.word2idx: 107 | words = filter(lambda kv: kv[0] not in self.word2idx, words) 108 | start_idx = len(self.word2idx) 109 | self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) 110 | self.build_reverse_vocab() 111 | self.rebuild = False 112 | 113 | def save(self, file_path): 114 | ''' 115 | 保存vocab 116 | :param file_name: 117 | :param pickle_path: 118 | :return: 119 | ''' 120 | mappings = { 121 | "word2idx": self.word2idx, 122 | 'idx2word': self.idx2word 123 | } 124 | save_pickle(data=mappings, file_path=file_path) 125 | 126 | def save_bert_vocab(self, file_path): 127 | ''' 128 | 保存成bert模式 129 | :param file_path: 130 | :return: 131 | ''' 132 | bert_vocab = [x for x, y in self.word2idx.items()] 133 | with open(str(file_path), 'w') as fo: 134 | for token in bert_vocab: 135 | fo.write(token + "\n") 136 | 137 | def load_bert_vocab(self, vocab_file): 138 | """Loads a vocabulary file into a dictionary.""" 139 | vocab = OrderedDict() 140 | index = 0 141 | with open(vocab_file, "r", encoding="utf-8") as reader: 142 | while True: 143 | token = reader.readline() 144 | if not token: 145 | break 146 | token = token.strip() 147 | vocab[token] = index 148 | index += 1 149 | return list(vocab.keys()) 150 | 151 | def load_from_file(self, file_path): 152 | ''' 153 | 从文件组红加载vocab 154 | :param file_name: 155 | :param pickle_path: 156 | :return: 157 | ''' 158 | mappings = load_pickle(input_file=file_path) 159 | self.idx2word = mappings['idx2word'] 160 | self.word2idx = mappings['word2idx'] 161 | 162 | def build_reverse_vocab(self): 163 | self.idx2word = {i: w for w, i in self.word2idx.items()} 164 | 165 | def read_data(self, data_path): 166 | if data_path.is_dir(): 167 | files = sorted([f for f in data_path.iterdir() if f.exists()]) 168 | else: 169 | files = [data_path] 170 | for file in files: 171 | f = open(file, 'r') 172 | lines = f.readlines() # 读取数据集 173 | for line in lines: 174 | line = line.strip("\n") 175 | words = line.split(" ") 176 | self.update(words) 177 | 178 | def clear(self): 179 | """ 180 | 删除Vocabulary中的词表数据。相当于重新初始化一下。 181 | :return: 182 | """ 183 | self.word_counter.clear() 184 | self.word2idx = None 185 | self.idx2word = None 186 | self.rebuild = True 187 | self.reset() 188 | 189 | def __len__(self): 190 | return len(self.idx2word) 191 | -------------------------------------------------------------------------------- /pydatagrand/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/pydatagrand/model/__init__.py -------------------------------------------------------------------------------- /pydatagrand/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | 
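io/vocabulary.py above builds a frequency-based vocabulary over whitespace-tokenized lines and can export it in BERT's one-token-per-line vocab.txt format. A minimal usage sketch, assuming `pydatagrand` is importable; the token strings are made-up stand-ins for the competition's anonymized word ids:

```python
# Build a small vocabulary and write it out in BERT vocab.txt format.
from pydatagrand.io.vocabulary import Vocabulary

vocab = Vocabulary(min_freq=1, add_unused=True)   # [UNUSED0..19] reserved for later
corpus_lines = ["17281 342 9023 110", "342 110 17281"]
for line in corpus_lines:
    vocab.update(line.split(" "))
vocab.build_vocab()

print(len(vocab))               # control symbols + unused slots + corpus tokens
print(vocab.to_index("342"))    # id of a seen token
print(vocab.to_index("99999"))  # unseen token falls back to [UNK]
vocab.save_bert_vocab("vocab.txt")
```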
from .attention import MultiHeadAttention 2 | from .cnn import Capsule 3 | from .crf import CRF 4 | from .dgcnn import DGCNN 5 | from .dropouts import SpatialDropout 6 | from .normalization import LayerNorm # fp16 error 7 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/attention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | class ScaledDotProductAttention(nn.Module): 6 | ''' Scaled Dot-Product Attention ''' 7 | 8 | def __init__(self, temperature, attn_dropout=0.1): 9 | super().__init__() 10 | self.temperature = temperature 11 | self.dropout = nn.Dropout(attn_dropout) 12 | self.softmax = nn.Softmax(dim=2) 13 | 14 | def forward(self, q, k, v, mask=None): 15 | attn = torch.bmm(q, k.transpose(1, 2)) 16 | attn = attn / self.temperature 17 | if mask is not None: 18 | attn = attn.masked_fill((1 - mask).byte(), -np.inf) 19 | attn = self.softmax(attn) 20 | attn = self.dropout(attn) 21 | output = torch.bmm(attn, v) 22 | return output, attn 23 | 24 | class MultiHeadAttention(nn.Module): 25 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 26 | super().__init__() 27 | self.n_head = n_head 28 | self.d_k = d_k 29 | self.d_v = d_v 30 | self.w_qs = nn.Linear(d_model, n_head * d_k) 31 | self.w_ks = nn.Linear(d_model, n_head * d_k) 32 | self.w_vs = nn.Linear(d_model, n_head * d_v) 33 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 34 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 35 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) 36 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 37 | self.layer_norm = nn.LayerNorm(d_model) 38 | self.fc = nn.Linear(n_head * d_v, d_model) 39 | nn.init.xavier_normal_(self.fc.weight) 40 | self.dropout = nn.Dropout(dropout) 41 | 42 | def forward(self, q, k, v, mask=None): 43 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 44 | sz_b, len_q, _ = q.size() 45 | sz_b, len_k, _ = k.size() 46 | sz_b, len_v, _ = v.size() 47 | residual = q 48 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 49 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 50 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 51 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 52 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 53 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 54 | mask = mask.unsqueeze(1).expand(-1, len_q, -1) 55 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 
56 | output, attn = self.attention(q, k, v, mask=mask.byte()) 57 | output = output.view(n_head, sz_b, len_q, d_v) 58 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv) 59 | output = self.dropout(self.fc(output)) 60 | output = self.layer_norm(output + residual) 61 | return output, attn -------------------------------------------------------------------------------- /pydatagrand/model/layers/cnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class Capsule(nn.Module): 6 | def __init__(self, input_dim_capsule=1024, num_capsule=5, dim_capsule=5, routings=4): 7 | super(Capsule, self).__init__() 8 | self.num_capsule = num_capsule 9 | self.dim_capsule = dim_capsule 10 | self.routings = routings 11 | self.activation = self.squash 12 | self.W = nn.Parameter( 13 | nn.init.xavier_normal_(torch.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule))) 14 | 15 | def forward(self, x): 16 | u_hat_vecs = torch.matmul(x, self.W) 17 | batch_size = x.size(0) 18 | input_num_capsule = x.size(1) 19 | u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule, 20 | self.num_capsule, self.dim_capsule)) 21 | u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 22 | 3).contiguous() # (batch_size,num_capsule,input_num_capsule,dim_capsule) 23 | with torch.no_grad(): 24 | b = torch.zeros_like(u_hat_vecs[:, :, :, 0]) 25 | for i in range(self.routings): 26 | c = F.softmax(b, dim=1) # (batch_size,num_capsule,input_num_capsule) 27 | outputs = self.activation(torch.sum(c.unsqueeze(-1) * u_hat_vecs, dim=2)) # bij,bijk->bik 28 | if i < self.routings - 1: 29 | b = (torch.sum(outputs.unsqueeze(2) * u_hat_vecs, dim=-1)) # bik,bijk->bij 30 | return outputs # (batch_size, num_capsule, dim_capsule) 31 | 32 | def squash(self, x, axis=-1): 33 | s_squared_norm = (x ** 2).sum(axis, keepdim=True) 34 | scale = torch.sqrt(s_squared_norm + 1e-7) 35 | return x / scale -------------------------------------------------------------------------------- /pydatagrand/model/layers/crf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | def to_scalar(var): 6 | return var.view(-1).detach().tolist()[0] 7 | 8 | def argmax(vec): 9 | _, idx = torch.max(vec, 1) 10 | return to_scalar(idx) 11 | 12 | def log_sum_exp(vec): 13 | max_score = vec[0, argmax(vec)] 14 | max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1]) 15 | return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast))) 16 | 17 | def argmax_batch(vecs): 18 | _, idx = torch.max(vecs, 1) 19 | return idx 20 | 21 | def log_sum_exp_batch(vecs): 22 | maxi = torch.max(vecs, 1)[0] 23 | maxi_bc = maxi[:, None].repeat(1, vecs.shape[1]) 24 | recti_ = torch.log(torch.sum(torch.exp(vecs - maxi_bc), 1)) 25 | return maxi + recti_ 26 | 27 | class CRF(nn.Module): 28 | def __init__(self,tagset_size,tag_dictionary,device,is_bert=None): 29 | super(CRF,self).__init__() 30 | 31 | self.START_TAG = "" 32 | self.STOP_TAG = "" 33 | if is_bert: 34 | self.START_TAG = "[CLS]" 35 | self.STOP_TAG = "[SEP]" 36 | self.tag_dictionary = tag_dictionary 37 | self.tagset_size = tagset_size 38 | self.device = device 39 | self.transitions = torch.randn(tagset_size, tagset_size) 40 | # self.transitions = torch.zeros(tagset_size, tagset_size) 41 | self.transitions.detach()[self.tag_dictionary[self.START_TAG], :] = -10000 42 | 
self.transitions.detach()[:, self.tag_dictionary[self.STOP_TAG]] = -10000 43 | self.transitions = self.transitions.to(device) 44 | self.transitions = nn.Parameter(self.transitions) 45 | 46 | def _viterbi_decode(self, feats): 47 | backpointers = [] 48 | backscores = [] 49 | scores = [] 50 | init_vvars = (torch.FloatTensor(1, self.tagset_size).to(self.device).fill_(-10000.0)) 51 | init_vvars[0][self.tag_dictionary[self.START_TAG]] = 0 52 | forward_var = init_vvars 53 | 54 | for feat in feats: 55 | next_tag_var = ( 56 | forward_var.view(1, -1).expand(self.tagset_size, self.tagset_size) 57 | + self.transitions 58 | ) 59 | _, bptrs_t = torch.max(next_tag_var, dim=1) 60 | viterbivars_t = next_tag_var[range(len(bptrs_t)), bptrs_t] 61 | forward_var = viterbivars_t + feat 62 | backscores.append(forward_var) 63 | backpointers.append(bptrs_t) 64 | 65 | terminal_var = ( 66 | forward_var 67 | + self.transitions[self.tag_dictionary[self.STOP_TAG]] 68 | ) 69 | terminal_var.detach()[self.tag_dictionary[self.STOP_TAG]] = -10000.0 70 | terminal_var.detach()[self.tag_dictionary[self.START_TAG]] = -10000.0 71 | best_tag_id = argmax(terminal_var.unsqueeze(0)) 72 | best_path = [best_tag_id] 73 | for bptrs_t in reversed(backpointers): 74 | best_tag_id = bptrs_t[best_tag_id] 75 | best_path.append(best_tag_id.item()) 76 | best_scores = [] 77 | for backscore in backscores: 78 | softmax = F.softmax(backscore, dim=0) 79 | _, idx = torch.max(backscore, 0) 80 | prediction = idx.item() 81 | best_scores.append(softmax[prediction].item()) 82 | scores.append([elem.item() for elem in softmax.flatten()]) 83 | swap_best_path, swap_max_score = ( 84 | best_path[0], 85 | scores[-1].index(max(scores[-1])), 86 | ) 87 | scores[-1][swap_best_path], scores[-1][swap_max_score] = ( 88 | scores[-1][swap_max_score], 89 | scores[-1][swap_best_path], 90 | ) 91 | start = best_path.pop() 92 | assert start == self.tag_dictionary[self.START_TAG] 93 | best_path.reverse() 94 | return best_scores, best_path, scores 95 | 96 | def _forward_alg(self, feats, lens_): 97 | init_alphas = torch.FloatTensor(self.tagset_size).fill_(-10000.0) 98 | init_alphas[self.tag_dictionary[self.START_TAG]] = 0.0 99 | 100 | forward_var = torch.zeros( 101 | feats.shape[0], 102 | feats.shape[1] + 1, 103 | feats.shape[2], 104 | dtype=torch.float, 105 | device=self.device, 106 | ) 107 | forward_var[:, 0, :] = init_alphas[None, :].repeat(feats.shape[0], 1) 108 | transitions = self.transitions.view( 109 | 1, self.transitions.shape[0], self.transitions.shape[1] 110 | ).repeat(feats.shape[0], 1, 1) 111 | for i in range(feats.shape[1]): 112 | emit_score = feats[:, i, :] 113 | tag_var = ( 114 | emit_score[:, :, None].repeat(1, 1, transitions.shape[2]) 115 | + transitions 116 | + forward_var[:, i, :][:, :, None] 117 | .repeat(1, 1, transitions.shape[2]) 118 | .transpose(2, 1) 119 | ) 120 | max_tag_var, _ = torch.max(tag_var, dim=2) 121 | tag_var = tag_var - max_tag_var[:, :, None].repeat( 122 | 1, 1, transitions.shape[2] 123 | ) 124 | agg_ = torch.log(torch.sum(torch.exp(tag_var), dim=2)) 125 | cloned = forward_var.clone() 126 | cloned[:, i + 1, :] = max_tag_var + agg_ 127 | forward_var = cloned 128 | forward_var = forward_var[range(forward_var.shape[0]), lens_, :] 129 | terminal_var = forward_var + self.transitions[self.tag_dictionary[self.STOP_TAG]][None, :].repeat(forward_var.shape[0], 1) 130 | alpha = log_sum_exp_batch(terminal_var) 131 | return alpha 132 | 133 | def _score_sentence(self, feats, tags, lens_): 134 | start = 
torch.LongTensor([self.tag_dictionary[self.START_TAG]]).to(self.device) 135 | start = start[None, :].repeat(tags.shape[0], 1) 136 | stop = torch.LongTensor([self.tag_dictionary[self.STOP_TAG]]).to(self.device) 137 | stop = stop[None, :].repeat(tags.shape[0], 1) 138 | pad_start_tags = torch.cat([start, tags], 1) 139 | pad_stop_tags = torch.cat([tags, stop], 1) 140 | for i in range(len(lens_)): 141 | pad_stop_tags[i, lens_[i] :] = self.tag_dictionary[self.STOP_TAG] 142 | score = torch.FloatTensor(feats.shape[0]).to(self.device) 143 | for i in range(feats.shape[0]): 144 | r = torch.LongTensor(range(lens_[i])).to(self.device) 145 | score[i] = torch.sum( 146 | self.transitions[ 147 | pad_stop_tags[i, : lens_[i] + 1], pad_start_tags[i, : lens_[i] + 1] 148 | ] 149 | ) + torch.sum(feats[i, r, tags[i, : lens_[i]]]) 150 | return score 151 | 152 | def _obtain_labels(self, feature, id2label,input_lens): 153 | tags = [] 154 | all_tags = [] 155 | for feats, length in zip(feature, input_lens): 156 | confidences, tag_seq, scores = self._viterbi_decode(feats[:length]) 157 | tags.append([id2label[tag] for tag in tag_seq]) 158 | all_tags.append([[id2label[score_id] for score_id, score in enumerate(score_dist)] for score_dist in scores]) 159 | return tags, all_tags 160 | 161 | def calculate_loss(self, scores, tag_list,lengths): 162 | return self._calculate_loss_old(scores, lengths, tag_list) 163 | 164 | def _calculate_loss_old(self, features, lengths, tags): 165 | forward_score = self._forward_alg(features, lengths) 166 | gold_score = self._score_sentence(features, tags, lengths) 167 | score = forward_score - gold_score 168 | return score.mean() 169 | 170 | 171 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/dropouts.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | 4 | class SpatialDropout(nn.Dropout2d): 5 | def __init__(self, p=0.6): 6 | super(SpatialDropout, self).__init__(p=p) 7 | 8 | def forward(self, x): 9 | x = x.unsqueeze(2) # (N, T, 1, K) 10 | x = x.permute(0, 3, 2, 1) # (N, K, 1, T) 11 | x = self.forward(x) # (N, K, 1, T), some features are masked 12 | x = x.permute(0, 3, 2, 1) # (N, T, 1, K) 13 | x = x.squeeze(2) # (N, T, K) 14 | return x 15 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def nn_init(nn_module, method='xavier'): 7 | for param_name, _ in nn_module.named_parameters(): 8 | if isinstance(nn_module, nn.Sequential): 9 | i, name = param_name.split('.', 1) 10 | param = getattr(nn_module[int(i)], name) 11 | else: 12 | param = getattr(nn_module, param_name) 13 | if param_name.find('weight') > -1: 14 | init_weight(param, method) 15 | elif param_name.find('bias') > -1: 16 | nn.init.uniform_(param, -1e-4, 1e-4) 17 | 18 | 19 | def init_weight(weight, method): 20 | if method == 'orthogonal': 21 | nn.init.orthogonal_(weight) 22 | elif method == 'xavier': 23 | nn.init.xavier_uniform_(weight) 24 | elif method == 'kaiming': 25 | nn.init.kaiming_uniform_(weight) 26 | elif method == 'none': 27 | pass 28 | else: 29 | raise Exception('Unknown init method') 30 | 31 | 32 | class PretrainedEmbedding(nn.Module): 33 | def __init__(self, embedding_matrix, requires_grad=False): 34 | super(PretrainedEmbedding, self).__init__() 35 | embed_size 
= embedding_matrix.shape[1] 36 | max_features = embedding_matrix.shape[0] 37 | self.embedding = nn.Embedding(max_features, embed_size) 38 | self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32)) 39 | self.embedding.weight.requires_grad = requires_grad 40 | 41 | def forward(self, indices): 42 | return self.embedding(indices) 43 | 44 | 45 | class ProjSumEmbedding(nn.Module): 46 | 47 | def __init__(self, embedding_matrices, output_size): 48 | super(ProjSumEmbedding, self).__init__() 49 | assert len(embedding_matrices) > 0 50 | 51 | self.embedding_count = len(embedding_matrices) 52 | self.output_size = output_size 53 | self.embedding_projectors = nn.ModuleList() 54 | for embedding_matrix in embedding_matrices: 55 | embedding_dim = embedding_matrix.shape[1] 56 | projection = nn.Linear(embedding_dim, self.output_size) 57 | nn_init(projection) 58 | 59 | self.embedding_projectors.append(nn.Sequential( 60 | PretrainedEmbedding(embedding_matrix), 61 | projection 62 | )) 63 | 64 | def forward(self, x): 65 | projected = [embedding_projector(x) for embedding_projector in self.embedding_projectors] 66 | return F.relu(sum(projected)) 67 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/linears.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class FeedForwardNetwork(nn.Module): 6 | def __init__(self, input_size, hidden_size, output_size, dropout_rate=0): 7 | super(FeedForwardNetwork, self).__init__() 8 | self.dropout_rate = dropout_rate 9 | self.linear1 = nn.Linear(input_size, hidden_size) 10 | self.linear2 = nn.Linear(hidden_size, output_size) 11 | 12 | def forward(self, x): 13 | x_proj = F.dropout(F.relu(self.linear1(x)), p=self.dropout_rate, training=self.training) 14 | x_proj = self.linear2(x_proj) 15 | return x_proj 16 | 17 | 18 | class PoolerStartLogits(nn.Module): 19 | def __init__(self, hidden_size, num_classes): 20 | super(PoolerStartLogits, self).__init__() 21 | self.dense = nn.Linear(hidden_size, num_classes) 22 | 23 | def forward(self, hidden_states, p_mask=None): 24 | x = self.dense(hidden_states) 25 | return x 26 | 27 | class PoolerEndLogits(nn.Module): 28 | def __init__(self, hidden_size, num_classes): 29 | super(PoolerEndLogits, self).__init__() 30 | self.dense_0 = nn.Linear(hidden_size, hidden_size) 31 | self.activation = nn.Tanh() 32 | self.LayerNorm = nn.LayerNorm(hidden_size) 33 | self.dense_1 = nn.Linear(hidden_size, num_classes) 34 | 35 | def forward(self, hidden_states, start_positions=None, p_mask=None): 36 | x = self.dense_0(torch.cat([hidden_states, start_positions], dim=-1)) 37 | x = self.activation(x) 38 | x = self.LayerNorm(x) 39 | x = self.dense_1(x) 40 | return x 41 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/normalization.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LayerNorm(nn.Module): 6 | def __init__(self, hidden_size, eps=1e-12): 7 | """Construct a layernorm module in the TF style (epsilon inside the square root). 
8 | """ 9 | super(LayerNorm, self).__init__() 10 | self.weight = nn.Parameter(torch.ones(hidden_size)) 11 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 12 | self.variance_epsilon = eps 13 | self.bias.data.zero_() 14 | self.weight.data.fill_(1.0) 15 | 16 | def forward(self, x): 17 | u = x.mean(-1, keepdim=True) 18 | s = (x - u).pow(2).mean(-1, keepdim=True) 19 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 20 | return self.weight * x + self.bias 21 | -------------------------------------------------------------------------------- /pydatagrand/model/nn/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/model/nn/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/pydatagrand/model/nn/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pydatagrand/model/nn/bert_lstm_crf.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from ..layers import CRF 3 | from ..layers import LayerNorm 4 | from ..pytorch_transformers.modeling_bert import BertPreTrainedModel 5 | from ..pytorch_transformers.modeling_bert import BertModel 6 | 7 | 8 | class BERTLSTMCRF(BertPreTrainedModel): 9 | def __init__(self, config, label2id, device, num_layers=2, lstm_dropout=0.35): 10 | super(BERTLSTMCRF, self).__init__(config) 11 | self.bert = BertModel(config) 12 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 13 | self.classifier = nn.Linear(config.hidden_size, len(label2id)) 14 | self.init_weights() 15 | 16 | self.bilstm = nn.LSTM(input_size=config.hidden_size, 17 | hidden_size=config.hidden_size // 2, 18 | batch_first=True, 19 | num_layers=num_layers, 20 | dropout=lstm_dropout, 21 | bidirectional=True) 22 | self.layer_norm = LayerNorm(config.hidden_size) 23 | self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True) 24 | 25 | def forward(self, input_ids, token_type_ids=None, attention_mask=None): 26 | outputs = self.bert(input_ids, token_type_ids, attention_mask) 27 | sequence_output = outputs[0] 28 | sequence_output = self.dropout(sequence_output) 29 | sequence_output, _ = self.bilstm(sequence_output) 30 | sequence_output = self.layer_norm(sequence_output) 31 | logits = self.classifier(sequence_output) 32 | return logits 33 | 34 | def forward_loss(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, input_lens=None): 35 | features = self.forward(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) 36 | if labels is not None: 37 | return features, self.crf.calculate_loss(features, tag_list=labels, lengths=input_lens) 38 | else: 39 | return features, None 40 | 41 | def unfreeze(self, start_layer=6, end_layer=12): 42 | def children(m): 43 | return m if isinstance(m, (list, tuple)) else list(m.children()) 44 | 45 | def set_trainable_attr(m, b): 46 | m.trainable = b 47 | for p in m.parameters(): 48 | p.requires_grad = b 49 | 50 | def apply_leaf(m, f): 51 | c = children(m) 52 | if isinstance(m, nn.Module): 53 | f(m) 54 | if len(c) > 0: 55 | for l in c: 56 | apply_leaf(l, f) 57 | 58 | def set_trainable(l, b): 59 | apply_leaf(l, lambda m: set_trainable_attr(m, b)) 60 | 61 | # You can 
unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True) 62 | set_trainable(self.bert, False) 63 | for i in range(start_layer, end_layer): 64 | set_trainable(self.bert.encoder.layer[i], True) 65 | -------------------------------------------------------------------------------- /pydatagrand/model/nn/bert_lstm_crf_mdp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from ..layers import CRF 3 | from ..layers import LayerNorm 4 | from ..pytorch_transformers.modeling_bert import BertPreTrainedModel 5 | from ..pytorch_transformers.modeling_bert import BertModel 6 | 7 | 8 | class BERTLSTMCRFMDP(BertPreTrainedModel): 9 | def __init__(self, config, label2id, device, num_layers=2, lstm_dropout=0.35, mdp_n=5, mdp_p=0.5): 10 | super(BERTLSTMCRFMDP, self).__init__(config) 11 | self.bert = BertModel(config) 12 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 13 | self.classifier = nn.Linear(config.hidden_size, len(label2id)) 14 | self.init_weights() 15 | 16 | self.bilstm = nn.LSTM(input_size=config.hidden_size, 17 | hidden_size=config.hidden_size // 2, 18 | batch_first=True, 19 | num_layers=num_layers, 20 | dropout=lstm_dropout, 21 | bidirectional=True) 22 | self.layer_norm = LayerNorm(config.hidden_size) 23 | self.dropouts = nn.ModuleList([ 24 | nn.Dropout(mdp_p) for _ in range(mdp_n) 25 | ]) 26 | self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True) 27 | 28 | def forward(self, input_ids, token_type_ids=None, attention_mask=None): 29 | outputs = self.bert(input_ids, token_type_ids, attention_mask) 30 | sequence_output = outputs[0] 31 | sequence_output = self.dropout(sequence_output) 32 | sequence_output, _ = self.bilstm(sequence_output) 33 | sequence_output = self.layer_norm(sequence_output) 34 | for i, dropout in enumerate(self.dropouts): 35 | if i == 0: 36 | logits = self.classifier(dropout(sequence_output)) 37 | else: 38 | logits += self.classifier(dropout(sequence_output)) 39 | return logits / len(self.dropouts) 40 | 41 | def forward_loss(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, input_lens=None): 42 | features = self.forward(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) 43 | if labels is not None: 44 | return features, self.crf.calculate_loss(features, tag_list=labels, lengths=input_lens) 45 | else: 46 | return features, None 47 | 48 | def unfreeze(self, start_layer=6, end_layer=12): 49 | def children(m): 50 | return m if isinstance(m, (list, tuple)) else list(m.children()) 51 | 52 | def set_trainable_attr(m, b): 53 | m.trainable = b 54 | for p in m.parameters(): 55 | p.requires_grad = b 56 | 57 | def apply_leaf(m, f): 58 | c = children(m) 59 | if isinstance(m, nn.Module): 60 | f(m) 61 | if len(c) > 0: 62 | for l in c: 63 | apply_leaf(l, f) 64 | 65 | def set_trainable(l, b): 66 | apply_leaf(l, lambda m: set_trainable_attr(m, b)) 67 | 68 | # You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True) 69 | set_trainable(self.bert, False) 70 | for i in range(start_layer, end_layer): 71 | set_trainable(self.bert.encoder.layer[i], True) 72 | -------------------------------------------------------------------------------- /pydatagrand/model/nn/bert_lstm_span.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ..layers import LayerNorm 5 | 
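BERTLSTMCRFMDP above differs from the plain BERT+LSTM+CRF model only in its head: the same linear classifier is applied under several independent dropout masks and the logits are averaged (the "MDP" of scheme 2 in the README). A standalone sketch of that multi-sample dropout head, with illustrative sizes:

```python
# Multi-sample dropout head, mirroring the forward pass of BERTLSTMCRFMDP.
import torch
import torch.nn as nn

class MultiSampleDropoutHead(nn.Module):
    def __init__(self, hidden_size=768, num_labels=13, mdp_n=5, mdp_p=0.5):
        super().__init__()
        self.dropouts = nn.ModuleList([nn.Dropout(mdp_p) for _ in range(mdp_n)])
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, sequence_output):
        logits = None
        for dropout in self.dropouts:
            out = self.classifier(dropout(sequence_output))
            logits = out if logits is None else logits + out
        return logits / len(self.dropouts)

head = MultiSampleDropoutHead()
fake_lstm_output = torch.randn(2, 128, 768)   # (batch, seq_len, hidden_size)
print(head(fake_lstm_output).shape)           # torch.Size([2, 128, 13])
```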
from ..pytorch_transformers.modeling_bert import BertPreTrainedModel 6 | from ..pytorch_transformers.modeling_bert import BertModel 7 | from ..layers.linears import PoolerEndLogits, PoolerStartLogits 8 | 9 | 10 | class BERTLSTMSpan(BertPreTrainedModel): 11 | def __init__(self, config, label2id, num_layers=2, lstm_dropout=0.35, soft_label=False): 12 | super(BERTLSTMSpan, self).__init__(config) 13 | self.soft_label = soft_label 14 | self.num_labels = len(label2id) 15 | self.bert = BertModel(config) 16 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 17 | self.init_weights() 18 | 19 | self.bilstm = nn.LSTM(input_size=config.hidden_size, 20 | hidden_size=config.hidden_size // 2, 21 | batch_first=True, 22 | num_layers=num_layers, 23 | dropout=lstm_dropout, 24 | bidirectional=True) 25 | self.layer_norm = LayerNorm(config.hidden_size) 26 | self.start_fc = PoolerStartLogits(config.hidden_size, self.num_labels) 27 | if soft_label: 28 | self.end_fc = PoolerEndLogits(config.hidden_size + self.num_labels, self.num_labels) 29 | else: 30 | self.end_fc = PoolerEndLogits(config.hidden_size + 1, self.num_labels) 31 | 32 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_point=None): 33 | outputs = self.bert(input_ids, token_type_ids, attention_mask) 34 | sequence_output = outputs[0] 35 | sequence_output = self.dropout(sequence_output) 36 | sequence_output, _ = self.bilstm(sequence_output) 37 | sequence_output = self.layer_norm(sequence_output) 38 | ps1 = self.start_fc(sequence_output) 39 | if start_point is not None: 40 | if self.soft_label: 41 | batch_size = input_ids.size(0) 42 | seq_len = input_ids.size(1) 43 | start_logits = torch.FloatTensor(batch_size, seq_len, self.num_labels) 44 | start_logits.zero_() 45 | start_logits = start_logits.to(self.device) 46 | start_logits.scatter_(2, start_point.unsqueeze(2), 1) 47 | else: 48 | start_logits = start_point.unsqueeze(2).float() 49 | 50 | else: 51 | start_logits = F.softmax(ps1, -1) 52 | if not self.soft_label: 53 | start_logits = torch.argmax(start_logits, -1).unsqueeze(2).float() 54 | ps2 = self.end_fc(sequence_output, start_logits) 55 | return ps1, ps2 56 | 57 | def unfreeze(self, start_layer=6, end_layer=12): 58 | def children(m): 59 | return m if isinstance(m, (list, tuple)) else list(m.children()) 60 | 61 | def set_trainable_attr(m, b): 62 | m.trainable = b 63 | for p in m.parameters(): 64 | p.requires_grad = b 65 | 66 | def apply_leaf(m, f): 67 | c = children(m) 68 | if isinstance(m, nn.Module): 69 | f(m) 70 | if len(c) > 0: 71 | for l in c: 72 | apply_leaf(l, f) 73 | 74 | def set_trainable(l, b): 75 | apply_leaf(l, lambda m: set_trainable_attr(m, b)) 76 | 77 | # You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True) 78 | set_trainable(self.bert, False) 79 | for i in range(start_layer, end_layer): 80 | set_trainable(self.bert.encoder.layer[i], True) 81 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.2.0" 2 | # Work around to update TensorFlow's absl.logging threshold which alters the 3 | # default Python logging output behavior when present. 
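BERTLSTMSpan above only produces per-token start and end label logits; turning them into entities requires a pairing step that is not part of the model file. The sketch below shows one common heuristic (pair each predicted start with the nearest following end of the same label); it is an illustration, not necessarily the exact decode logic used in run_bert_span.py or train/ner_span_trainer.py:

```python
# Pair start/end label sequences into (label, start, end) spans. Label 0 is "O".
def extract_spans(start_ids, end_ids):
    spans = []
    for i, label in enumerate(start_ids):
        if label == 0:
            continue
        for j in range(i, len(end_ids)):
            if end_ids[j] == label:
                spans.append((label, i, j))
                break
    return spans

# A start of label 2 at position 1 and an end of label 2 at position 3 -> one span.
print(extract_spans([0, 2, 0, 0, 0], [0, 0, 0, 2, 0]))   # [(2, 1, 3)]
```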
4 | # see: https://github.com/abseil/abseil-py/issues/99 5 | # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 6 | try: 7 | import absl.logging 8 | absl.logging.set_verbosity('info') 9 | absl.logging.set_stderrthreshold('info') 10 | absl.logging._warn_preinit_stderr = False 11 | except: 12 | pass 13 | 14 | # Tokenizer 15 | from .tokenization_utils import (PreTrainedTokenizer) 16 | from .tokenization_auto import AutoTokenizer 17 | from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer 18 | from .tokenization_openai import OpenAIGPTTokenizer 19 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 20 | from .tokenization_gpt2 import GPT2Tokenizer 21 | from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE 22 | from .tokenization_xlm import XLMTokenizer 23 | from .tokenization_roberta import RobertaTokenizer 24 | from .tokenization_distilbert import DistilBertTokenizer 25 | 26 | # Configurations 27 | from .configuration_utils import PretrainedConfig 28 | from .configuration_auto import AutoConfig 29 | from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 30 | from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 31 | from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 32 | from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 33 | from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 34 | from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP 35 | from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 37 | 38 | # Modeling 39 | from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) 40 | from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, 41 | AutoModelWithLMHead) 42 | 43 | from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, 44 | BertForMaskedLM, BertForNextSentencePrediction, 45 | BertForSequenceClassification, BertForMultipleChoice, 46 | BertForTokenClassification, BertForQuestionAnswering, 47 | load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) 48 | from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, 49 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 50 | load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) 51 | from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, 52 | load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) 53 | from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, 54 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 55 | load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) 56 | from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, 57 | XLNetForSequenceClassification, XLNetForQuestionAnswering, 58 | load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) 59 | from .modeling_xlm import (XLMPreTrainedModel , XLMModel, 60 | XLMWithLMHeadModel, XLMForSequenceClassification, 61 | XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) 62 | from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, 63 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) 64 | from .modeling_distilbert import 
(DistilBertForMaskedLM, DistilBertModel, 65 | DistilBertForSequenceClassification, DistilBertForQuestionAnswering, 66 | DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) 67 | 68 | # Optimization 69 | from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, 70 | WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 71 | 72 | # Files and general utilities 73 | from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, 74 | cached_path, add_start_docstrings, add_end_docstrings, 75 | WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME) 76 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: 5 | print( 6 | "Should be used as one of: \n" 7 | ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" 8 | ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" 9 | ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" 10 | ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 11 | ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" 12 | ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") 13 | else: 14 | if sys.argv[1] == "bert": 15 | try: 16 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 17 | except ImportError: 18 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 19 | "In that case, it requires TensorFlow to be installed. Please see " 20 | "https://www.tensorflow.org/install/ for installation instructions.") 21 | raise 22 | 23 | if len(sys.argv) != 5: 24 | # pylint: disable=line-too-long 25 | print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 26 | else: 27 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 28 | TF_CONFIG = sys.argv.pop() 29 | TF_CHECKPOINT = sys.argv.pop() 30 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 31 | elif sys.argv[1] == "gpt": 32 | from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch 33 | if len(sys.argv) < 4 or len(sys.argv) > 5: 34 | # pylint: disable=line-too-long 35 | print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") 36 | else: 37 | OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] 38 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 39 | if len(sys.argv) == 5: 40 | OPENAI_GPT_CONFIG = sys.argv[4] 41 | else: 42 | OPENAI_GPT_CONFIG = "" 43 | convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, 44 | OPENAI_GPT_CONFIG, 45 | PYTORCH_DUMP_OUTPUT) 46 | elif sys.argv[1] == "transfo_xl": 47 | try: 48 | from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch 49 | except ImportError: 50 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 51 | "In that case, it requires TensorFlow to be installed. 
Please see " 52 | "https://www.tensorflow.org/install/ for installation instructions.") 53 | raise 54 | if len(sys.argv) < 4 or len(sys.argv) > 5: 55 | # pylint: disable=line-too-long 56 | print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 57 | else: 58 | if 'ckpt' in sys.argv[2].lower(): 59 | TF_CHECKPOINT = sys.argv[2] 60 | TF_DATASET_FILE = "" 61 | else: 62 | TF_DATASET_FILE = sys.argv[2] 63 | TF_CHECKPOINT = "" 64 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 65 | if len(sys.argv) == 5: 66 | TF_CONFIG = sys.argv[4] 67 | else: 68 | TF_CONFIG = "" 69 | convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) 70 | elif sys.argv[1] == "gpt2": 71 | try: 72 | from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch 73 | except ImportError: 74 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 75 | "In that case, it requires TensorFlow to be installed. Please see " 76 | "https://www.tensorflow.org/install/ for installation instructions.") 77 | raise 78 | 79 | if len(sys.argv) < 4 or len(sys.argv) > 5: 80 | # pylint: disable=line-too-long 81 | print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 82 | else: 83 | TF_CHECKPOINT = sys.argv[2] 84 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 85 | if len(sys.argv) == 5: 86 | TF_CONFIG = sys.argv[4] 87 | else: 88 | TF_CONFIG = "" 89 | convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 90 | elif sys.argv[1] == "xlnet": 91 | try: 92 | from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch 93 | except ImportError: 94 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 95 | "In that case, it requires TensorFlow to be installed. Please see " 96 | "https://www.tensorflow.org/install/ for installation instructions.") 97 | raise 98 | 99 | if len(sys.argv) < 5 or len(sys.argv) > 6: 100 | # pylint: disable=line-too-long 101 | print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") 102 | else: 103 | TF_CHECKPOINT = sys.argv[2] 104 | TF_CONFIG = sys.argv[3] 105 | PYTORCH_DUMP_OUTPUT = sys.argv[4] 106 | if len(sys.argv) == 6: 107 | FINETUNING_TASK = sys.argv[5] 108 | else: 109 | FINETUNING_TASK = None 110 | 111 | convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, 112 | TF_CONFIG, 113 | PYTORCH_DUMP_OUTPUT, 114 | FINETUNING_TASK) 115 | elif sys.argv[1] == "xlm": 116 | from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch 117 | 118 | if len(sys.argv) != 4: 119 | # pylint: disable=line-too-long 120 | print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") 121 | else: 122 | XLM_CHECKPOINT_PATH = sys.argv[2] 123 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 124 | 125 | convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) 126 | 127 | if __name__ == '__main__': 128 | main() 129 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BERT model configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", 37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", 40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 43 | } 44 | 45 | 46 | class BertConfig(PretrainedConfig): 47 | r""" 48 | :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a 49 | `BertModel`. 50 | 51 | 52 | Arguments: 53 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 54 | hidden_size: Size of the encoder layers and the pooler layer. 55 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 56 | num_attention_heads: Number of attention heads for each attention layer in 57 | the Transformer encoder. 58 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 59 | layer in the Transformer encoder. 60 | hidden_act: The non-linear activation function (function or string) in the 61 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 
62 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 63 | layers in the embeddings, encoder, and pooler. 64 | attention_probs_dropout_prob: The dropout ratio for the attention 65 | probabilities. 66 | max_position_embeddings: The maximum sequence length that this model might 67 | ever be used with. Typically set this to something large just in case 68 | (e.g., 512 or 1024 or 2048). 69 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 70 | `BertModel`. 71 | initializer_range: The sttdev of the truncated_normal_initializer for 72 | initializing all weight matrices. 73 | layer_norm_eps: The epsilon used by LayerNorm. 74 | """ 75 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 76 | 77 | def __init__(self, 78 | vocab_size_or_config_json_file=30522, 79 | hidden_size=768, 80 | num_hidden_layers=12, 81 | num_attention_heads=12, 82 | intermediate_size=3072, 83 | hidden_act="gelu", 84 | hidden_dropout_prob=0.1, 85 | attention_probs_dropout_prob=0.1, 86 | max_position_embeddings=512, 87 | type_vocab_size=2, 88 | initializer_range=0.02, 89 | layer_norm_eps=1e-12, 90 | **kwargs): 91 | super(BertConfig, self).__init__(**kwargs) 92 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 93 | and isinstance(vocab_size_or_config_json_file, unicode)): 94 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 95 | json_config = json.loads(reader.read()) 96 | for key, value in json_config.items(): 97 | self.__dict__[key] = value 98 | elif isinstance(vocab_size_or_config_json_file, int): 99 | self.vocab_size = vocab_size_or_config_json_file 100 | self.hidden_size = hidden_size 101 | self.num_hidden_layers = num_hidden_layers 102 | self.num_attention_heads = num_attention_heads 103 | self.hidden_act = hidden_act 104 | self.intermediate_size = intermediate_size 105 | self.hidden_dropout_prob = hidden_dropout_prob 106 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 107 | self.max_position_embeddings = max_position_embeddings 108 | self.type_vocab_size = type_vocab_size 109 | self.initializer_range = initializer_range 110 | self.layer_norm_eps = layer_norm_eps 111 | else: 112 | raise ValueError("First argument must be either a vocabulary size (int)" 113 | " or the path to a pretrained model config file (str)") 114 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
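# Illustrative sketch, not taken from the project's scripts: the vendored
# package's __init__ (above) re-exports the pytorch-transformers 1.2.0 API, so
# the rest of this repository can import tokenizers, configurations and models
# from the local copy; paths below are placeholders. BertConfig accepts either
# an integer vocabulary size or a JSON config path as its first argument.
#
#   from pydatagrand.model.pytorch_transformers import (
#       BertConfig, BertModel, BertTokenizer, AdamW, WarmupLinearSchedule)
#
#   cfg = BertConfig(vocab_size_or_config_json_file=30522,
#                    hidden_dropout_prob=0.2)           # override one default
#   cfg_json = BertConfig("path/to/bert_config.json")   # placeholder path
#   assert cfg.hidden_size == 768 and cfg.num_hidden_layers == 12
#
# The package can also be run as a converter CLI (see __main__.py above), e.g.
#   python -m pydatagrand.model.pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT
# which additionally requires TensorFlow and the pip-installed
# pytorch_transformers package, because the convert_* scripts import from it.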
15 | """ DistilBERT model configuration """ 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 30 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" 31 | } 32 | 33 | 34 | class DistilBertConfig(PretrainedConfig): 35 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | 37 | def __init__(self, 38 | vocab_size_or_config_json_file=30522, 39 | max_position_embeddings=512, 40 | sinusoidal_pos_embds=True, 41 | n_layers=6, 42 | n_heads=12, 43 | dim=768, 44 | hidden_dim=4*768, 45 | dropout=0.1, 46 | attention_dropout=0.1, 47 | activation='gelu', 48 | initializer_range=0.02, 49 | tie_weights_=True, 50 | qa_dropout=0.1, 51 | seq_classif_dropout=0.2, 52 | **kwargs): 53 | super(DistilBertConfig, self).__init__(**kwargs) 54 | 55 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 56 | and isinstance(vocab_size_or_config_json_file, unicode)): 57 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 58 | json_config = json.loads(reader.read()) 59 | for key, value in json_config.items(): 60 | self.__dict__[key] = value 61 | elif isinstance(vocab_size_or_config_json_file, int): 62 | self.vocab_size = vocab_size_or_config_json_file 63 | self.max_position_embeddings = max_position_embeddings 64 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 65 | self.n_layers = n_layers 66 | self.n_heads = n_heads 67 | self.dim = dim 68 | self.hidden_dim = hidden_dim 69 | self.dropout = dropout 70 | self.attention_dropout = attention_dropout 71 | self.activation = activation 72 | self.initializer_range = initializer_range 73 | self.tie_weights_ = tie_weights_ 74 | self.qa_dropout = qa_dropout 75 | self.seq_classif_dropout = seq_classif_dropout 76 | else: 77 | raise ValueError("First argument must be either a vocabulary size (int)" 78 | " or the path to a pretrained model config file (str)") 79 | @property 80 | def hidden_size(self): 81 | return self.dim 82 | 83 | @property 84 | def num_attention_heads(self): 85 | return self.n_heads 86 | 87 | @property 88 | def num_hidden_layers(self): 89 | return self.n_layers 90 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT-2 configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", 30 | "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", 31 | "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} 32 | 33 | class GPT2Config(PretrainedConfig): 34 | """Configuration class to store the configuration of a `GPT2Model`. 35 | 36 | Args: 37 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 38 | n_positions: Number of positional embeddings. 39 | n_ctx: Size of the causal mask (usually same as n_positions). 40 | n_embd: Dimensionality of the embeddings and hidden states. 41 | n_layer: Number of hidden layers in the Transformer encoder. 42 | n_head: Number of attention heads for each attention layer in 43 | the Transformer encoder. 44 | layer_norm_epsilon: epsilon to use in the layer norm layers 45 | resid_pdrop: The dropout probabilitiy for all fully connected 46 | layers in the embeddings, encoder, and pooler. 47 | attn_pdrop: The dropout ratio for the attention 48 | probabilities. 49 | embd_pdrop: The dropout ratio for the embeddings. 50 | initializer_range: The sttdev of the truncated_normal_initializer for 51 | initializing all weight matrices. 52 | """ 53 | pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 54 | 55 | def __init__( 56 | self, 57 | vocab_size_or_config_json_file=50257, 58 | n_positions=1024, 59 | n_ctx=1024, 60 | n_embd=768, 61 | n_layer=12, 62 | n_head=12, 63 | resid_pdrop=0.1, 64 | embd_pdrop=0.1, 65 | attn_pdrop=0.1, 66 | layer_norm_epsilon=1e-5, 67 | initializer_range=0.02, 68 | 69 | num_labels=1, 70 | summary_type='cls_index', 71 | summary_use_proj=True, 72 | summary_activation=None, 73 | summary_proj_to_labels=True, 74 | summary_first_dropout=0.1, 75 | **kwargs 76 | ): 77 | """Constructs GPT2Config. 78 | 79 | Args: 80 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 81 | n_positions: Number of positional embeddings. 82 | n_ctx: Size of the causal mask (usually same as n_positions). 83 | n_embd: Dimensionality of the embeddings and hidden states. 84 | n_layer: Number of hidden layers in the Transformer encoder. 85 | n_head: Number of attention heads for each attention layer in 86 | the Transformer encoder. 87 | layer_norm_epsilon: epsilon to use in the layer norm layers 88 | resid_pdrop: The dropout probabilitiy for all fully connected 89 | layers in the embeddings, encoder, and pooler. 90 | attn_pdrop: The dropout ratio for the attention 91 | probabilities. 92 | embd_pdrop: The dropout ratio for the embeddings. 93 | initializer_range: The sttdev of the truncated_normal_initializer for 94 | initializing all weight matrices. 
95 | """ 96 | super(GPT2Config, self).__init__(**kwargs) 97 | 98 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 99 | and isinstance(vocab_size_or_config_json_file, unicode)): 100 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 101 | json_config = json.loads(reader.read()) 102 | for key, value in json_config.items(): 103 | self.__dict__[key] = value 104 | elif isinstance(vocab_size_or_config_json_file, int): 105 | self.vocab_size = vocab_size_or_config_json_file 106 | self.n_ctx = n_ctx 107 | self.n_positions = n_positions 108 | self.n_embd = n_embd 109 | self.n_layer = n_layer 110 | self.n_head = n_head 111 | self.resid_pdrop = resid_pdrop 112 | self.embd_pdrop = embd_pdrop 113 | self.attn_pdrop = attn_pdrop 114 | self.layer_norm_epsilon = layer_norm_epsilon 115 | self.initializer_range = initializer_range 116 | 117 | self.num_labels = num_labels 118 | self.summary_type = summary_type 119 | self.summary_use_proj = summary_use_proj 120 | self.summary_activation = summary_activation 121 | self.summary_first_dropout = summary_first_dropout 122 | self.summary_proj_to_labels = summary_proj_to_labels 123 | else: 124 | raise ValueError( 125 | "First argument must be either a vocabulary size (int)" 126 | "or the path to a pretrained model config file (str)" 127 | ) 128 | 129 | @property 130 | def max_position_embeddings(self): 131 | return self.n_positions 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.n_embd 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_head 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layer 144 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" 31 | } 32 | 33 | class OpenAIGPTConfig(PretrainedConfig): 34 | """ 35 | Configuration class to store the configuration of a `OpenAIGPTModel`. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. 39 | n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) 
40 | n_positions: Number of positional embeddings. 41 | n_ctx: Size of the causal mask (usually same as n_positions). 42 | n_embd: Dimensionality of the embeddings and hidden states. 43 | n_layer: Number of hidden layers in the Transformer encoder. 44 | n_head: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | afn: The non-linear activation function (function or string) in the 47 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 48 | resid_pdrop: The dropout probabilitiy for all fully connected 49 | layers in the embeddings, encoder, and pooler. 50 | attn_pdrop: The dropout ratio for the attention 51 | probabilities. 52 | embd_pdrop: The dropout ratio for the embeddings. 53 | layer_norm_epsilon: epsilon to use in the layer norm layers 54 | initializer_range: The sttdev of the truncated_normal_initializer for 55 | initializing all weight matrices. 56 | predict_special_tokens: should we predict special tokens (when the model has a LM head) 57 | """ 58 | pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 59 | 60 | def __init__( 61 | self, 62 | vocab_size_or_config_json_file=40478, 63 | n_positions=512, 64 | n_ctx=512, 65 | n_embd=768, 66 | n_layer=12, 67 | n_head=12, 68 | afn="gelu", 69 | resid_pdrop=0.1, 70 | embd_pdrop=0.1, 71 | attn_pdrop=0.1, 72 | layer_norm_epsilon=1e-5, 73 | initializer_range=0.02, 74 | predict_special_tokens=True, 75 | 76 | num_labels=1, 77 | summary_type='cls_index', 78 | summary_use_proj=True, 79 | summary_activation=None, 80 | summary_proj_to_labels=True, 81 | summary_first_dropout=0.1, 82 | **kwargs 83 | ): 84 | """Constructs OpenAIGPTConfig. 85 | """ 86 | super(OpenAIGPTConfig, self).__init__(**kwargs) 87 | 88 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 89 | and isinstance(vocab_size_or_config_json_file, unicode)): 90 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 91 | json_config = json.loads(reader.read()) 92 | for key, value in json_config.items(): 93 | self.__dict__[key] = value 94 | elif isinstance(vocab_size_or_config_json_file, int): 95 | self.vocab_size = vocab_size_or_config_json_file 96 | self.n_ctx = n_ctx 97 | self.n_positions = n_positions 98 | self.n_embd = n_embd 99 | self.n_layer = n_layer 100 | self.n_head = n_head 101 | self.afn = afn 102 | self.resid_pdrop = resid_pdrop 103 | self.embd_pdrop = embd_pdrop 104 | self.attn_pdrop = attn_pdrop 105 | self.layer_norm_epsilon = layer_norm_epsilon 106 | self.initializer_range = initializer_range 107 | self.predict_special_tokens = predict_special_tokens 108 | 109 | self.num_labels = num_labels 110 | self.summary_type = summary_type 111 | self.summary_use_proj = summary_use_proj 112 | self.summary_activation = summary_activation 113 | self.summary_first_dropout = summary_first_dropout 114 | self.summary_proj_to_labels = summary_proj_to_labels 115 | else: 116 | raise ValueError( 117 | "First argument must be either a vocabulary size (int)" 118 | "or the path to a pretrained model config file (str)" 119 | ) 120 | 121 | @property 122 | def max_position_embeddings(self): 123 | return self.n_positions 124 | 125 | @property 126 | def hidden_size(self): 127 | return self.n_embd 128 | 129 | @property 130 | def num_attention_heads(self): 131 | return self.n_head 132 | 133 | @property 134 | def num_hidden_layers(self): 135 | return self.n_layer 136 | -------------------------------------------------------------------------------- 
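# Illustrative sketch: GPT2Config and OpenAIGPTConfig (above) keep the original
# field names (n_embd, n_head, n_layer, n_positions) but expose the BERT-style
# names through read-only properties, so generic code can treat every config
# class alike.
#
#   from pydatagrand.model.pytorch_transformers import GPT2Config
#
#   cfg = GPT2Config()            # gpt2 defaults
#   assert cfg.hidden_size == cfg.n_embd == 768
#   assert cfg.num_hidden_layers == cfg.n_layer == 12
#   assert cfg.max_position_embeddings == cfg.n_positions == 1024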
/pydatagrand/model/pytorch_transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_bert import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_transfo_xl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Transformer XL configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", 31 | } 32 | 33 | class TransfoXLConfig(PretrainedConfig): 34 | """Configuration class to store the configuration of a `TransfoXLModel`. 35 | 36 | Args: 37 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. 
38 | cutoffs: cutoffs for the adaptive softmax 39 | d_model: Dimensionality of the model's hidden states. 40 | d_embed: Dimensionality of the embeddings 41 | d_head: Dimensionality of the model's heads. 42 | div_val: divident value for adapative input and softmax 43 | pre_lnorm: apply LayerNorm to the input instead of the output 44 | d_inner: Inner dimension in FF 45 | n_layer: Number of hidden layers in the Transformer encoder. 46 | n_head: Number of attention heads for each attention layer in 47 | the Transformer encoder. 48 | tgt_len: number of tokens to predict 49 | ext_len: length of the extended context 50 | mem_len: length of the retained previous heads 51 | same_length: use the same attn length for all tokens 52 | proj_share_all_but_first: True to share all but first projs, False not to share. 53 | attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. 54 | clamp_len: use the same pos embeddings after clamp_len 55 | sample_softmax: number of samples in sampled softmax 56 | adaptive: use adaptive softmax 57 | tie_weight: tie the word embedding and softmax weights 58 | dropout: The dropout probabilitiy for all fully connected 59 | layers in the embeddings, encoder, and pooler. 60 | dropatt: The dropout ratio for the attention probabilities. 61 | untie_r: untie relative position biases 62 | embd_pdrop: The dropout ratio for the embeddings. 63 | init: parameter initializer to use 64 | init_range: parameters initialized by U(-init_range, init_range). 65 | proj_init_std: parameters initialized by N(0, init_std) 66 | init_std: parameters initialized by N(0, init_std) 67 | """ 68 | pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 69 | 70 | def __init__(self, 71 | vocab_size_or_config_json_file=267735, 72 | cutoffs=[20000, 40000, 200000], 73 | d_model=1024, 74 | d_embed=1024, 75 | n_head=16, 76 | d_head=64, 77 | d_inner=4096, 78 | div_val=4, 79 | pre_lnorm=False, 80 | n_layer=18, 81 | tgt_len=128, 82 | ext_len=0, 83 | mem_len=1600, 84 | clamp_len=1000, 85 | same_length=True, 86 | proj_share_all_but_first=True, 87 | attn_type=0, 88 | sample_softmax=-1, 89 | adaptive=True, 90 | tie_weight=True, 91 | dropout=0.1, 92 | dropatt=0.0, 93 | untie_r=True, 94 | init="normal", 95 | init_range=0.01, 96 | proj_init_std=0.01, 97 | init_std=0.02, 98 | **kwargs): 99 | """Constructs TransfoXLConfig. 
100 | """ 101 | super(TransfoXLConfig, self).__init__(**kwargs) 102 | 103 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 104 | and isinstance(vocab_size_or_config_json_file, unicode)): 105 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 106 | json_config = json.loads(reader.read()) 107 | for key, value in json_config.items(): 108 | self.__dict__[key] = value 109 | elif isinstance(vocab_size_or_config_json_file, int): 110 | self.n_token = vocab_size_or_config_json_file 111 | self.cutoffs = [] 112 | self.cutoffs.extend(cutoffs) 113 | self.tie_weight = tie_weight 114 | if proj_share_all_but_first: 115 | self.tie_projs = [False] + [True] * len(self.cutoffs) 116 | else: 117 | self.tie_projs = [False] + [False] * len(self.cutoffs) 118 | self.d_model = d_model 119 | self.d_embed = d_embed 120 | self.d_head = d_head 121 | self.d_inner = d_inner 122 | self.div_val = div_val 123 | self.pre_lnorm = pre_lnorm 124 | self.n_layer = n_layer 125 | self.n_head = n_head 126 | self.tgt_len = tgt_len 127 | self.ext_len = ext_len 128 | self.mem_len = mem_len 129 | self.same_length = same_length 130 | self.attn_type = attn_type 131 | self.clamp_len = clamp_len 132 | self.sample_softmax = sample_softmax 133 | self.adaptive = adaptive 134 | self.dropout = dropout 135 | self.dropatt = dropatt 136 | self.untie_r = untie_r 137 | self.init = init 138 | self.init_range = init_range 139 | self.proj_init_std = proj_init_std 140 | self.init_std = init_std 141 | else: 142 | raise ValueError("First argument must be either a vocabulary size (int)" 143 | " or the path to a pretrained model config file (str)") 144 | 145 | @property 146 | def max_position_embeddings(self): 147 | return self.tgt_len + self.ext_len + self.mem_len 148 | 149 | @property 150 | def vocab_size(self): 151 | return self.n_token 152 | 153 | @vocab_size.setter 154 | def vocab_size(self, value): 155 | self.n_token = value 156 | 157 | @property 158 | def hidden_size(self): 159 | return self.d_model 160 | 161 | @property 162 | def num_attention_heads(self): 163 | return self.n_head 164 | 165 | @property 166 | def num_hidden_layers(self): 167 | return self.n_layer 168 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_xlnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
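# Illustrative sketch: TransfoXLConfig (above) stores the vocabulary size as
# n_token behind a vocab_size property and derives max_position_embeddings
# from the segment and memory lengths.
#
#   from pydatagrand.model.pytorch_transformers import TransfoXLConfig
#
#   cfg = TransfoXLConfig()       # transfo-xl-wt103 defaults
#   assert cfg.vocab_size == cfg.n_token == 267735
#   assert cfg.max_position_embeddings == cfg.tgt_len + cfg.ext_len + cfg.mem_len  # 128 + 0 + 1600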
16 | """ XLNet configuration """ 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import json 20 | import logging 21 | import sys 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", 30 | 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", 31 | } 32 | 33 | 34 | class XLNetConfig(PretrainedConfig): 35 | """Configuration class to store the configuration of a ``XLNetModel``. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. 39 | d_model: Size of the encoder layers and the pooler layer. 40 | n_layer: Number of hidden layers in the Transformer encoder. 41 | n_head: Number of attention heads for each attention layer in 42 | the Transformer encoder. 43 | d_inner: The size of the "intermediate" (i.e., feed-forward) 44 | layer in the Transformer encoder. 45 | ff_activation: The non-linear activation function (function or string) in the 46 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 47 | untie_r: untie relative position biases 48 | attn_type: 'bi' for XLNet, 'uni' for Transformer-XL 49 | 50 | dropout: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | dropatt: The dropout ratio for the attention 53 | probabilities. 54 | initializer_range: The sttdev of the truncated_normal_initializer for 55 | initializing all weight matrices. 56 | layer_norm_eps: The epsilon used by LayerNorm. 57 | 58 | dropout: float, dropout rate. 59 | dropatt: float, dropout rate on attention probabilities. 60 | init: str, the initialization scheme, either "normal" or "uniform". 61 | init_range: float, initialize the parameters with a uniform distribution 62 | in [-init_range, init_range]. Only effective when init="uniform". 63 | init_std: float, initialize the parameters with a normal distribution 64 | with mean 0 and stddev init_std. Only effective when init="normal". 65 | mem_len: int, the number of tokens to cache. 66 | reuse_len: int, the number of tokens in the currect batch to be cached 67 | and reused in the future. 68 | bi_data: bool, whether to use bidirectional input pipeline. 69 | Usually set to True during pretraining and False during finetuning. 70 | clamp_len: int, clamp all relative distances larger than clamp_len. 71 | -1 means no clamping. 72 | same_length: bool, whether to use the same attention length for each token. 73 | finetuning_task: name of the glue task on which the model was fine-tuned if any 74 | """ 75 | pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 76 | 77 | def __init__(self, 78 | vocab_size_or_config_json_file=32000, 79 | d_model=1024, 80 | n_layer=24, 81 | n_head=16, 82 | d_inner=4096, 83 | ff_activation="gelu", 84 | untie_r=True, 85 | attn_type="bi", 86 | 87 | initializer_range=0.02, 88 | layer_norm_eps=1e-12, 89 | 90 | dropout=0.1, 91 | mem_len=None, 92 | reuse_len=None, 93 | bi_data=False, 94 | clamp_len=-1, 95 | same_length=False, 96 | 97 | finetuning_task=None, 98 | num_labels=2, 99 | summary_type='last', 100 | summary_use_proj=True, 101 | summary_activation='tanh', 102 | summary_last_dropout=0.1, 103 | start_n_top=5, 104 | end_n_top=5, 105 | **kwargs): 106 | """Constructs XLNetConfig. 
107 | """ 108 | super(XLNetConfig, self).__init__(**kwargs) 109 | 110 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 111 | and isinstance(vocab_size_or_config_json_file, unicode)): 112 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 113 | json_config = json.loads(reader.read()) 114 | for key, value in json_config.items(): 115 | self.__dict__[key] = value 116 | elif isinstance(vocab_size_or_config_json_file, int): 117 | self.n_token = vocab_size_or_config_json_file 118 | self.d_model = d_model 119 | self.n_layer = n_layer 120 | self.n_head = n_head 121 | assert d_model % n_head == 0 122 | self.d_head = d_model // n_head 123 | self.ff_activation = ff_activation 124 | self.d_inner = d_inner 125 | self.untie_r = untie_r 126 | self.attn_type = attn_type 127 | 128 | self.initializer_range = initializer_range 129 | self.layer_norm_eps = layer_norm_eps 130 | 131 | self.dropout = dropout 132 | self.mem_len = mem_len 133 | self.reuse_len = reuse_len 134 | self.bi_data = bi_data 135 | self.clamp_len = clamp_len 136 | self.same_length = same_length 137 | 138 | self.finetuning_task = finetuning_task 139 | self.num_labels = num_labels 140 | self.summary_type = summary_type 141 | self.summary_use_proj = summary_use_proj 142 | self.summary_activation = summary_activation 143 | self.summary_last_dropout = summary_last_dropout 144 | self.start_n_top = start_n_top 145 | self.end_n_top = end_n_top 146 | else: 147 | raise ValueError("First argument must be either a vocabulary size (int)" 148 | " or the path to a pretrained model config file (str)") 149 | 150 | @property 151 | def max_position_embeddings(self): 152 | return -1 153 | 154 | @property 155 | def vocab_size(self): 156 | return self.n_token 157 | 158 | @vocab_size.setter 159 | def vocab_size(self, value): 160 | self.n_token = value 161 | 162 | @property 163 | def hidden_size(self): 164 | return self.d_model 165 | 166 | @property 167 | def num_attention_heads(self): 168 | return self.n_head 169 | 170 | @property 171 | def num_hidden_layers(self): 172 | return self.n_layer 173 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
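# Illustrative sketch: XLNetConfig (above) derives the per-head size from
# d_model and n_head and reports no fixed position limit, since XLNet uses
# relative attention.
#
#   from pydatagrand.model.pytorch_transformers import XLNetConfig
#
#   cfg = XLNetConfig()           # xlnet-large-cased defaults
#   assert cfg.d_head == cfg.d_model // cfg.n_head == 64
#   assert cfg.max_position_embeddings == -1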
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if gpt2_config_file == "": 36 | config = GPT2Config() 37 | else: 38 | config = GPT2Config.from_json_file(gpt2_config_file) 39 | model = GPT2Model(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--gpt2_checkpoint_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--gpt2_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 74 | args.gpt2_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
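# Illustrative sketch: the conversion helper above can also be run as a
# standalone script; the paths are placeholders, and TensorFlow plus the
# pip-installed pytorch_transformers package must be available because the
# script imports from them.
#
#   python convert_gpt2_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path /path/to/tf_ckpt \
#       --pytorch_dump_folder_path /path/to/output_dir \
#       --gpt2_config_file /path/to/gpt2_config.json    # optional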
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if openai_config_file == "": 36 | config = OpenAIGPTConfig() 37 | else: 38 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 39 | model = OpenAIGPTModel(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--openai_checkpoint_folder_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--openai_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 74 | args.openai_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import os 19 | import argparse 20 | import torch 21 | import numpy as np 22 | import tensorflow as tf 23 | from pytorch_transformers import BertModel 24 | 25 | 26 | def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): 27 | 28 | """ 29 | :param model:BertModel Pytorch model instance to be converted 30 | :param ckpt_dir: Tensorflow model directory 31 | :param model_name: model name 32 | :return: 33 | 34 | Currently supported HF models: 35 | Y BertModel 36 | N BertForMaskedLM 37 | N BertForPreTraining 38 | N BertForMultipleChoice 39 | N BertForNextSentencePrediction 40 | N BertForSequenceClassification 41 | N BertForQuestionAnswering 42 | """ 43 | 44 | tensors_to_transpose = ( 45 | "dense.weight", 46 | "attention.self.query", 47 | "attention.self.key", 48 | "attention.self.value" 49 | ) 50 | 51 | var_map = ( 52 | ('layer.', 'layer_'), 53 | ('word_embeddings.weight', 'word_embeddings'), 54 | ('position_embeddings.weight', 'position_embeddings'), 55 | ('token_type_embeddings.weight', 'token_type_embeddings'), 56 | ('.', '/'), 57 | ('LayerNorm/weight', 'LayerNorm/gamma'), 58 | ('LayerNorm/bias', 'LayerNorm/beta'), 59 | ('weight', 'kernel') 60 | ) 61 | 62 | if not os.path.isdir(ckpt_dir): 63 | os.makedirs(ckpt_dir) 64 | 65 | state_dict = model.state_dict() 66 | 67 | def to_tf_var_name(name:str): 68 | for patt, repl in iter(var_map): 69 | name = name.replace(patt, repl) 70 | return 'bert/{}'.format(name) 71 | 72 | def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): 73 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 74 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 75 | session.run(tf.variables_initializer([tf_var])) 76 | session.run(tf_var) 77 | return tf_var 78 | 79 | tf.reset_default_graph() 80 | with tf.Session() as session: 81 | for var_name in state_dict: 82 | tf_name = to_tf_var_name(var_name) 83 | torch_tensor = state_dict[var_name].numpy() 84 | if any([x in var_name for x in tensors_to_transpose]): 85 | torch_tensor = torch_tensor.T 86 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 87 | tf.keras.backend.set_value(tf_var, torch_tensor) 88 | tf_weight = session.run(tf_var) 89 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 90 | 91 | saver = tf.train.Saver(tf.trainable_variables()) 92 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 93 | 94 | 95 | def main(raw_args=None): 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("--model_name", 98 | type=str, 99 | required=True, 100 | help="model name e.g. 
bert-base-uncased") 101 | parser.add_argument("--cache_dir", 102 | type=str, 103 | default=None, 104 | required=False, 105 | help="Directory containing pytorch model") 106 | parser.add_argument("--pytorch_model_path", 107 | type=str, 108 | required=True, 109 | help="/path/to/.bin") 110 | parser.add_argument("--tf_cache_dir", 111 | type=str, 112 | required=True, 113 | help="Directory in which to save tensorflow model") 114 | args = parser.parse_args(raw_args) 115 | 116 | model = BertModel.from_pretrained( 117 | pretrained_model_name_or_path=args.model_name, 118 | state_dict=torch.load(args.pytorch_model_path), 119 | cache_dir=args.cache_dir 120 | ) 121 | 122 | convert_pytorch_checkpoint_to_tf( 123 | model=model, 124 | ckpt_dir=args.tf_cache_dir, 125 | model_name=args.model_name 126 | ) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 25 | 26 | import logging 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | ## Required parameters 46 | parser.add_argument("--tf_checkpoint_path", 47 | default = None, 48 | type = str, 49 | required = True, 50 | help = "Path to the TensorFlow checkpoint path.") 51 | parser.add_argument("--bert_config_file", 52 | default = None, 53 | type = str, 54 | required = True, 55 | help = "The config json file corresponding to the pre-trained BERT model. 
\n" 56 | "This specifies the model architecture.") 57 | parser.add_argument("--pytorch_dump_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the output PyTorch model.") 62 | args = parser.parse_args() 63 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 64 | args.bert_config_file, 65 | args.pytorch_dump_path) 66 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import pytorch_transformers.tokenization_transfo_xl as data_utils 27 | 28 | from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME 29 | from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel, 30 | load_tf_weights_in_transfo_xl) 31 | from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) 32 | 33 | if sys.version_info[0] == 2: 34 | import cPickle as pickle 35 | else: 36 | import pickle 37 | 38 | import logging 39 | logging.basicConfig(level=logging.INFO) 40 | 41 | # We do this to be able to load python 2 datasets pickles 42 | # See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 43 | data_utils.Vocab = data_utils.TransfoXLTokenizer 44 | data_utils.Corpus = data_utils.TransfoXLCorpus 45 | sys.modules['data_utils'] = data_utils 46 | sys.modules['vocabulary'] = data_utils 47 | 48 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, 49 | transfo_xl_config_file, 50 | pytorch_dump_folder_path, 51 | transfo_xl_dataset_file): 52 | if transfo_xl_dataset_file: 53 | # Convert a pre-processed corpus (see original TensorFlow repo) 54 | with open(transfo_xl_dataset_file, "rb") as fp: 55 | corpus = pickle.load(fp, encoding="latin1") 56 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 57 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file'] 58 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 59 | corpus_vocab_dict = corpus.vocab.__dict__ 60 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 61 | 62 | corpus_dict_no_vocab = corpus.__dict__ 63 | corpus_dict_no_vocab.pop('vocab', None) 64 | pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME 65 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 66 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 67 | 68 | if tf_checkpoint_path: 69 | # Convert a pre-trained TensorFlow model 70 | config_path = os.path.abspath(transfo_xl_config_file) 71 | tf_path = os.path.abspath(tf_checkpoint_path) 72 | 73 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 74 | # Initialise PyTorch model 75 | if transfo_xl_config_file == "": 76 | config = TransfoXLConfig() 77 | else: 78 | config = TransfoXLConfig.from_json_file(transfo_xl_config_file) 79 | print("Building PyTorch model from configuration: {}".format(str(config))) 80 | model = TransfoXLLMHeadModel(config) 81 | 82 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 83 | # Save pytorch-model 84 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 85 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 86 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 87 | torch.save(model.state_dict(), pytorch_weights_dump_path) 88 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 89 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 90 | f.write(config.to_json_string()) 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument("--pytorch_dump_folder_path", 96 | default = None, 97 | type = str, 98 | required = True, 99 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 100 | parser.add_argument("--tf_checkpoint_path", 101 | default = "", 102 | type = str, 103 | help = "An optional path to a TensorFlow checkpoint path to be converted.") 104 | parser.add_argument("--transfo_xl_config_file", 105 | default = "", 106 | type = str, 107 | help = "An optional config json file corresponding to the pre-trained BERT model. 
\n" 108 | "This specifies the model architecture.") 109 | parser.add_argument("--transfo_xl_dataset_file", 110 | default = "", 111 | type = str, 112 | help = "An optional dataset file to be converted in a vocabulary.") 113 | args = parser.parse_args() 114 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 115 | args.transfo_xl_config_file, 116 | args.pytorch_dump_folder_path, 117 | args.transfo_xl_dataset_file) 118 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import json 21 | from io import open 22 | 23 | import torch 24 | import numpy 25 | 26 | from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME 27 | from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') 35 | 36 | model = chkpt['model'] 37 | 38 | config = chkpt['params'] 39 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 40 | 41 | vocab = chkpt['dico_word2id'] 42 | vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] 48 | 49 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 50 | torch.save(model, pytorch_weights_dump_path) 51 | 52 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 53 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 54 | f.write(json.dumps(config, indent=2) + "\n") 55 | 56 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 57 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 58 | f.write(json.dumps(vocab, indent=2) + "\n") 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser() 63 | ## Required parameters 64 | parser.add_argument("--xlm_checkpoint_path", 65 | default = None, 66 | type = str, 67 | required = True, 68 | help = "Path the official PyTorch dump.") 69 | parser.add_argument("--pytorch_dump_folder_path", 70 | default = None, 71 | type = str, 72 | required = True, 73 | help = "Path to the output PyTorch model.") 74 | args = 
parser.parse_args() 75 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import argparse 23 | import torch 24 | 25 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 26 | XLNetConfig, 27 | XLNetLMHeadModel, XLNetForQuestionAnswering, 28 | XLNetForSequenceClassification, 29 | load_tf_weights_in_xlnet) 30 | 31 | GLUE_TASKS_NUM_LABELS = { 32 | "cola": 2, 33 | "mnli": 3, 34 | "mrpc": 2, 35 | "sst-2": 2, 36 | "sts-b": 1, 37 | "qqp": 2, 38 | "qnli": 2, 39 | "rte": 2, 40 | "wnli": 2, 41 | } 42 | 43 | import logging 44 | logging.basicConfig(level=logging.INFO) 45 | 46 | def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): 47 | # Initialise PyTorch model 48 | config = XLNetConfig.from_json_file(bert_config_file) 49 | 50 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 51 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 52 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 53 | config.finetuning_task = finetuning_task 54 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 55 | model = XLNetForSequenceClassification(config) 56 | elif 'squad' in finetuning_task: 57 | config.finetuning_task = finetuning_task 58 | model = XLNetForQuestionAnswering(config) 59 | else: 60 | model = XLNetLMHeadModel(config) 61 | 62 | # Load weights from tf checkpoint 63 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 64 | 65 | # Save pytorch-model 66 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 67 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 68 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 69 | torch.save(model.state_dict(), pytorch_weights_dump_path) 70 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 71 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 72 | f.write(config.to_json_string()) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser() 77 | ## Required parameters 78 | parser.add_argument("--tf_checkpoint_path", 79 | default = None, 80 | type = str, 81 | required = True, 82 | help = "Path to the TensorFlow checkpoint path.") 83 | parser.add_argument("--xlnet_config_file", 84 | default = None, 85 | type = 
str, 86 | required = True, 87 | help = "The config json file corresponding to the pre-trained XLNet model. \n" 88 | "This specifies the model architecture.") 89 | parser.add_argument("--pytorch_dump_folder_path", 90 | default = None, 91 | type = str, 92 | required = True, 93 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 94 | parser.add_argument("--finetuning_task", 95 | default = None, 96 | type = str, 97 | help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") 98 | args = parser.parse_args() 99 | print(args) 100 | 101 | convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, 102 | args.xlnet_config_file, 103 | args.pytorch_dump_folder_path, 104 | args.finetuning_task) 105 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/pydatagrand/model/pytorch_transformers/tests/__init__.py -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/configuration_common_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
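#
# ConfigTester below is a shared helper rather than a TestCase of its own: a model test passes
# itself in as `parent`, names a config class, and supplies the constructor kwargs it wants
# exercised. A minimal usage sketch (BertConfig and hidden_size=37 are illustrative choices,
# not taken from this file):
#
#     class BertConfigTest(unittest.TestCase):
#         def test_config(self):
#             ConfigTester(self, config_class=BertConfig, hidden_size=37).run_common_tests()
#
# run_common_tests() checks that the standard size attributes exist on the config and that the
# supplied kwargs survive a round trip through to_json_string() and to_json_file().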
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import copy 20 | import os 21 | import shutil 22 | import json 23 | import random 24 | import uuid 25 | 26 | import unittest 27 | import logging 28 | 29 | 30 | class ConfigTester(object): 31 | def __init__(self, parent, config_class=None, **kwargs): 32 | self.parent = parent 33 | self.config_class = config_class 34 | self.inputs_dict = kwargs 35 | 36 | def create_and_test_config_common_properties(self): 37 | config = self.config_class(**self.inputs_dict) 38 | self.parent.assertTrue(hasattr(config, 'vocab_size')) 39 | self.parent.assertTrue(hasattr(config, 'hidden_size')) 40 | self.parent.assertTrue(hasattr(config, 'num_attention_heads')) 41 | self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) 42 | 43 | def create_and_test_config_to_json_string(self): 44 | config = self.config_class(**self.inputs_dict) 45 | obj = json.loads(config.to_json_string()) 46 | for key, value in self.inputs_dict.items(): 47 | self.parent.assertEqual(obj[key], value) 48 | 49 | def create_and_test_config_to_json_file(self): 50 | config_first = self.config_class(**self.inputs_dict) 51 | json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") 52 | config_first.to_json_file(json_file_path) 53 | config_second = self.config_class.from_json_file(json_file_path) 54 | os.remove(json_file_path) 55 | self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) 56 | 57 | def run_common_tests(self): 58 | self.create_and_test_config_common_properties() 59 | self.create_and_test_config_to_json_string() 60 | self.create_and_test_config_to_json_file() 61 | 62 | if __name__ == "__main__": 63 | unittest.main() -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | import pytest 4 | 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption( 8 | "--runslow", action="store_true", default=False, help="run slow tests" 9 | ) 10 | 11 | 12 | def pytest_collection_modifyitems(config, items): 13 | if config.getoption("--runslow"): 14 | # --runslow given in cli: do not skip slow tests 15 | return 16 | skip_slow = pytest.mark.skip(reason="need --runslow option to run") 17 | for item in items: 18 | if "slow" in item.keywords: 19 | item.add_marker(skip_slow) 20 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/modeling_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
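#
# The AutoModel tests below take the first shortcut name from BERT_PRETRAINED_MODEL_ARCHIVE_MAP
# and call from_pretrained() on it, so running them downloads pretrained weights and needs
# network access (or a warm cache). They are meant to be run with pytest; per conftest.py above,
# any test marked @pytest.mark.slow is skipped unless --runslow is passed. A hypothetical
# invocation from the repository root:
#
#     pytest pydatagrand/model/pytorch_transformers/tests/modeling_auto_test.py -v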
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from pytorch_transformers import (AutoConfig, BertConfig, 25 | AutoModel, BertModel, 26 | AutoModelWithLMHead, BertForMaskedLM, 27 | AutoModelForSequenceClassification, BertForSequenceClassification, 28 | AutoModelForQuestionAnswering, BertForQuestionAnswering) 29 | from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 30 | 31 | from .modeling_common_test import (CommonTestCases, ids_tensor) 32 | from .configuration_common_test import ConfigTester 33 | 34 | 35 | class AutoModelTest(unittest.TestCase): 36 | def test_model_from_pretrained(self): 37 | logging.basicConfig(level=logging.INFO) 38 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 39 | config = AutoConfig.from_pretrained(model_name) 40 | self.assertIsNotNone(config) 41 | self.assertIsInstance(config, BertConfig) 42 | 43 | model = AutoModel.from_pretrained(model_name) 44 | model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) 45 | self.assertIsNotNone(model) 46 | self.assertIsInstance(model, BertModel) 47 | for value in loading_info.values(): 48 | self.assertEqual(len(value), 0) 49 | 50 | def test_lmhead_model_from_pretrained(self): 51 | logging.basicConfig(level=logging.INFO) 52 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 53 | config = AutoConfig.from_pretrained(model_name) 54 | self.assertIsNotNone(config) 55 | self.assertIsInstance(config, BertConfig) 56 | 57 | model = AutoModelWithLMHead.from_pretrained(model_name) 58 | model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) 59 | self.assertIsNotNone(model) 60 | self.assertIsInstance(model, BertForMaskedLM) 61 | 62 | def test_sequence_classification_model_from_pretrained(self): 63 | logging.basicConfig(level=logging.INFO) 64 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 65 | config = AutoConfig.from_pretrained(model_name) 66 | self.assertIsNotNone(config) 67 | self.assertIsInstance(config, BertConfig) 68 | 69 | model = AutoModelForSequenceClassification.from_pretrained(model_name) 70 | model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) 71 | self.assertIsNotNone(model) 72 | self.assertIsInstance(model, BertForSequenceClassification) 73 | 74 | def test_question_answering_model_from_pretrained(self): 75 | logging.basicConfig(level=logging.INFO) 76 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 77 | config = AutoConfig.from_pretrained(model_name) 78 | self.assertIsNotNone(config) 79 | self.assertIsInstance(config, BertConfig) 80 | 81 | model = AutoModelForQuestionAnswering.from_pretrained(model_name) 82 | model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) 83 | self.assertIsNotNone(model) 84 | self.assertIsInstance(model, BertForQuestionAnswering) 85 | 86 | 87 | if __name__ == "__main__": 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import os 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, 25 | WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 26 | 27 | from .tokenization_tests_commons import TemporaryDirectory 28 | 29 | 30 | def unwrap_schedule(scheduler, num_steps=10): 31 | lrs = [] 32 | for _ in range(num_steps): 33 | scheduler.step() 34 | lrs.append(scheduler.get_lr()) 35 | return lrs 36 | 37 | def unwrap_and_save_reload_schedule(scheduler, num_steps=10): 38 | lrs = [] 39 | for step in range(num_steps): 40 | scheduler.step() 41 | lrs.append(scheduler.get_lr()) 42 | if step == num_steps // 2: 43 | with TemporaryDirectory() as tmpdirname: 44 | file_name = os.path.join(tmpdirname, 'schedule.bin') 45 | torch.save(scheduler.state_dict(), file_name) 46 | 47 | state_dict = torch.load(file_name) 48 | scheduler.load_state_dict(state_dict) 49 | return lrs 50 | 51 | class OptimizationTest(unittest.TestCase): 52 | 53 | def assertListAlmostEqual(self, list1, list2, tol): 54 | self.assertEqual(len(list1), len(list2)) 55 | for a, b in zip(list1, list2): 56 | self.assertAlmostEqual(a, b, delta=tol) 57 | 58 | def test_adam_w(self): 59 | w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) 60 | target = torch.tensor([0.4, 0.2, -0.5]) 61 | criterion = torch.nn.MSELoss() 62 | # No warmup, constant schedule, no gradient clipping 63 | optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0) 64 | for _ in range(100): 65 | loss = criterion(w, target) 66 | loss.backward() 67 | optimizer.step() 68 | w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. 69 | w.grad.zero_() 70 | self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) 71 | 72 | 73 | class ScheduleInitTest(unittest.TestCase): 74 | m = torch.nn.Linear(50, 50) 75 | optimizer = AdamW(m.parameters(), lr=10.) 76 | num_steps = 10 77 | 78 | def assertListAlmostEqual(self, list1, list2, tol): 79 | self.assertEqual(len(list1), len(list2)) 80 | for a, b in zip(list1, list2): 81 | self.assertAlmostEqual(a, b, delta=tol) 82 | 83 | def test_constant_scheduler(self): 84 | scheduler = ConstantLRSchedule(self.optimizer) 85 | lrs = unwrap_schedule(scheduler, self.num_steps) 86 | expected_learning_rates = [10.] 
* self.num_steps 87 | self.assertEqual(len(lrs[0]), 1) 88 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 89 | 90 | scheduler = ConstantLRSchedule(self.optimizer) 91 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 92 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 93 | 94 | def test_warmup_constant_scheduler(self): 95 | scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) 96 | lrs = unwrap_schedule(scheduler, self.num_steps) 97 | expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0] 98 | self.assertEqual(len(lrs[0]), 1) 99 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 100 | 101 | scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) 102 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 103 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 104 | 105 | def test_warmup_linear_scheduler(self): 106 | scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) 107 | lrs = unwrap_schedule(scheduler, self.num_steps) 108 | expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0] 109 | self.assertEqual(len(lrs[0]), 1) 110 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 111 | 112 | scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) 113 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 114 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 115 | 116 | def test_warmup_cosine_scheduler(self): 117 | scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) 118 | lrs = unwrap_schedule(scheduler, self.num_steps) 119 | expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0] 120 | self.assertEqual(len(lrs[0]), 1) 121 | self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) 122 | 123 | scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) 124 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 125 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 126 | 127 | def test_warmup_cosine_hard_restart_scheduler(self): 128 | scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) 129 | lrs = unwrap_schedule(scheduler, self.num_steps) 130 | expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] 131 | self.assertEqual(len(lrs[0]), 1) 132 | self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) 133 | 134 | scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) 135 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 136 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 137 | 138 | if __name__ == "__main__": 139 | unittest.main() 140 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from pytorch_transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer 25 | from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 26 | from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP 27 | 28 | 29 | class AutoTokenizerTest(unittest.TestCase): 30 | def test_tokenizer_from_pretrained(self): 31 | logging.basicConfig(level=logging.INFO) 32 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 33 | tokenizer = AutoTokenizer.from_pretrained(model_name) 34 | self.assertIsNotNone(tokenizer) 35 | self.assertIsInstance(tokenizer, BertTokenizer) 36 | self.assertGreater(len(tokenizer), 0) 37 | 38 | for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 39 | tokenizer = AutoTokenizer.from_pretrained(model_name) 40 | self.assertIsNotNone(tokenizer) 41 | self.assertIsInstance(tokenizer, GPT2Tokenizer) 42 | self.assertGreater(len(tokenizer), 0) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_bert_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
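#
# Like the other tokenizer tests in this directory, this one plugs into
# CommonTestCases.CommonTokenizerTester from tokenization_tests_commons.py: the subclass sets
# tokenizer_class, writes a small fixture vocabulary into self.tmpdirname in setUp(), and
# overrides get_tokenizer() / get_input_output_texts() so the common tests defined there can
# run against it. Only test_sequence_builders() below loads the real "bert-base-uncased"
# vocabulary and therefore needs a download or a populated cache.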
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_bert import (BasicTokenizer, 22 | BertTokenizer, 23 | WordpieceTokenizer, 24 | _is_control, _is_punctuation, 25 | _is_whitespace, VOCAB_FILES_NAMES) 26 | 27 | from .tokenization_tests_commons import CommonTestCases 28 | 29 | class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): 30 | 31 | tokenizer_class = BertTokenizer 32 | 33 | def setUp(self): 34 | super(BertTokenizationTest, self).setUp() 35 | 36 | vocab_tokens = [ 37 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 38 | "##ing", ",", "low", "lowest", 39 | ] 40 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 41 | with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: 42 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 43 | 44 | def get_tokenizer(self, **kwargs): 45 | return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 46 | 47 | def get_input_output_texts(self): 48 | input_text = u"UNwant\u00E9d,running" 49 | output_text = u"unwanted, running" 50 | return input_text, output_text 51 | 52 | def test_full_tokenizer(self): 53 | tokenizer = self.tokenizer_class(self.vocab_file) 54 | 55 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 56 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 57 | self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 58 | 59 | def test_chinese(self): 60 | tokenizer = BasicTokenizer() 61 | 62 | self.assertListEqual( 63 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 64 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 65 | 66 | def test_basic_tokenizer_lower(self): 67 | tokenizer = BasicTokenizer(do_lower_case=True) 68 | 69 | self.assertListEqual( 70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 71 | ["hello", "!", "how", "are", "you", "?"]) 72 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 73 | 74 | def test_basic_tokenizer_no_lower(self): 75 | tokenizer = BasicTokenizer(do_lower_case=False) 76 | 77 | self.assertListEqual( 78 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), 79 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 80 | 81 | def test_wordpiece_tokenizer(self): 82 | vocab_tokens = [ 83 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 84 | "##ing" 85 | ] 86 | 87 | vocab = {} 88 | for (i, token) in enumerate(vocab_tokens): 89 | vocab[token] = i 90 | tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") 91 | 92 | self.assertListEqual(tokenizer.tokenize(""), []) 93 | 94 | self.assertListEqual( 95 | tokenizer.tokenize("unwanted running"), 96 | ["un", "##want", "##ed", "runn", "##ing"]) 97 | 98 | self.assertListEqual( 99 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 100 | 101 | def test_is_whitespace(self): 102 | self.assertTrue(_is_whitespace(u" ")) 103 | self.assertTrue(_is_whitespace(u"\t")) 104 | self.assertTrue(_is_whitespace(u"\r")) 105 | self.assertTrue(_is_whitespace(u"\n")) 106 | self.assertTrue(_is_whitespace(u"\u00A0")) 107 | 108 | self.assertFalse(_is_whitespace(u"A")) 109 | self.assertFalse(_is_whitespace(u"-")) 110 | 111 | def test_is_control(self): 112 | self.assertTrue(_is_control(u"\u0005")) 113 | 114 | self.assertFalse(_is_control(u"A")) 115 | self.assertFalse(_is_control(u" ")) 116 | self.assertFalse(_is_control(u"\t")) 117 | self.assertFalse(_is_control(u"\r")) 118 | 119 | def test_is_punctuation(self): 120 | self.assertTrue(_is_punctuation(u"-")) 121 | self.assertTrue(_is_punctuation(u"$")) 122 | self.assertTrue(_is_punctuation(u"`")) 123 | self.assertTrue(_is_punctuation(u".")) 124 | 125 | self.assertFalse(_is_punctuation(u"A")) 126 | self.assertFalse(_is_punctuation(u" ")) 127 | 128 | def test_sequence_builders(self): 129 | tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") 130 | 131 | text = tokenizer.encode("sequence builders") 132 | text_2 = tokenizer.encode("multi-sequence build") 133 | 134 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 135 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 136 | 137 | assert encoded_sentence == [101] + text + [102] 138 | assert encoded_pair == [101] + text + [102] + text_2 + [102] 139 | 140 | if __name__ == '__main__': 141 | unittest.main() 142 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_dilbert_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer) 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | from .tokenization_bert_test import BertTokenizationTest 25 | 26 | class DistilBertTokenizationTest(BertTokenizationTest): 27 | 28 | tokenizer_class = DistilBertTokenizer 29 | 30 | def get_tokenizer(self, **kwargs): 31 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 32 | 33 | def test_sequence_builders(self): 34 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 35 | 36 | text = tokenizer.encode("sequence builders") 37 | text_2 = tokenizer.encode("multi-sequence build") 38 | 39 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 40 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 41 | 42 | assert encoded_sentence == [101] + text + [102] 43 | assert encoded_pair == [101] + text + [102] + text_2 + [102] 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_gpt2_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | from io import open 21 | 22 | from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES 23 | 24 | from .tokenization_tests_commons import CommonTestCases 25 | 26 | class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | 28 | tokenizer_class = GPT2Tokenizer 29 | 30 | def setUp(self): 31 | super(GPT2TokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 34 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 35 | "\u0120", "\u0120l", "\u0120n", 36 | "\u0120lo", "\u0120low", "er", 37 | "\u0120lowest", "\u0120newer", "\u0120wider", ""] 38 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 39 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 40 | self.special_tokens_map = {"unk_token": ""} 41 | 42 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 43 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 44 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 45 | fp.write(json.dumps(vocab_tokens) + "\n") 46 | with open(self.merges_file, "w", encoding="utf-8") as fp: 47 | fp.write("\n".join(merges)) 48 | 49 | def get_tokenizer(self, **kwargs): 50 | kwargs.update(self.special_tokens_map) 51 | return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) 52 | 53 | def get_input_output_texts(self): 54 | input_text = u"lower newer" 55 | output_text = u" lower newer" 56 | return input_text, output_text 57 | 58 | def test_full_tokenizer(self): 59 | tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 60 | text = "lower newer" 61 | bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] 62 | tokens = tokenizer.tokenize(text) 63 | self.assertListEqual(tokens, bpe_tokens) 64 | 65 | input_tokens = tokens + [tokenizer.unk_token] 66 | input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] 67 | self.assertListEqual( 68 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | 28 | tokenizer_class = OpenAIGPTTokenizer 29 | 30 | def setUp(self): 31 | super(OpenAIGPTTokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 34 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 35 | "w", "r", "t", 36 | "lo", "low", "er", 37 | "low", "lowest", "newer", "wider", ""] 38 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 39 | merges = ["#version: 0.2", "l o", "lo w", "e r", ""] 40 | 41 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 42 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 43 | with open(self.vocab_file, "w") as fp: 44 | fp.write(json.dumps(vocab_tokens)) 45 | with open(self.merges_file, "w") as fp: 46 | fp.write("\n".join(merges)) 47 | 48 | def get_tokenizer(self, **kwargs): 49 | return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs) 50 | 51 | def get_input_output_texts(self): 52 | input_text = u"lower newer" 53 | output_text = u"lower newer" 54 | return input_text, output_text 55 | 56 | 57 | def test_full_tokenizer(self): 58 | tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file) 59 | 60 | text = "lower" 61 | bpe_tokens = ["low", "er"] 62 | tokens = tokenizer.tokenize(text) 63 | self.assertListEqual(tokens, bpe_tokens) 64 | 65 | input_tokens = tokens + [""] 66 | input_bpe_tokens = [14, 15, 20] 67 | self.assertListEqual( 68 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_roberta_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import json 19 | import unittest 20 | from io import open 21 | 22 | from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | tokenizer_class = RobertaTokenizer 28 | 29 | def setUp(self): 30 | super(RobertaTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "\u0120", "\u0120l", "\u0120n", 35 | "\u0120lo", "\u0120low", "er", 36 | "\u0120lowest", "\u0120newer", "\u0120wider", ""] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 39 | self.special_tokens_map = {"unk_token": ""} 40 | 41 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 42 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 43 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 44 | fp.write(json.dumps(vocab_tokens) + "\n") 45 | with open(self.merges_file, "w", encoding="utf-8") as fp: 46 | fp.write("\n".join(merges)) 47 | 48 | def get_tokenizer(self, **kwargs): 49 | kwargs.update(self.special_tokens_map) 50 | return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) 51 | 52 | def get_input_output_texts(self): 53 | input_text = u"lower newer" 54 | output_text = u" lower newer" 55 | return input_text, output_text 56 | 57 | def test_full_tokenizer(self): 58 | tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 59 | text = "lower newer" 60 | bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] 61 | tokens = tokenizer.tokenize(text) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + [tokenizer.unk_token] 65 | input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def roberta_dict_integration_testing(self): 70 | tokenizer = self.get_tokenizer() 71 | 72 | self.assertListEqual( 73 | tokenizer.encode('Hello world!'), 74 | [0, 31414, 232, 328, 2] 75 | ) 76 | self.assertListEqual( 77 | tokenizer.encode('Hello world! cécé herlolip 418'), 78 | [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] 79 | ) 80 | 81 | def test_sequence_builders(self): 82 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 83 | 84 | text = tokenizer.encode("sequence builders") 85 | text_2 = tokenizer.encode("multi-sequence build") 86 | 87 | encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) 88 | encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) 89 | 90 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 91 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 92 | 93 | assert encoded_sentence == encoded_text_from_decode 94 | assert encoded_pair == encoded_pair_from_decode 95 | 96 | 97 | if __name__ == '__main__': 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES 22 | 23 | from.tokenization_tests_commons import CommonTestCases 24 | 25 | class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester): 26 | 27 | tokenizer_class = TransfoXLTokenizer 28 | 29 | def setUp(self): 30 | super(TransfoXLTokenizationTest, self).setUp() 31 | 32 | vocab_tokens = [ 33 | "", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", 34 | "running", ",", "low", "l", 35 | ] 36 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 37 | with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: 38 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 39 | 40 | def get_tokenizer(self, **kwargs): 41 | kwargs['lower_case'] = True 42 | return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs) 43 | 44 | def get_input_output_texts(self): 45 | input_text = u" UNwanted , running" 46 | output_text = u" unwanted, running" 47 | return input_text, output_text 48 | 49 | def test_full_tokenizer(self): 50 | tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True) 51 | 52 | tokens = tokenizer.tokenize(u" UNwanted , running") 53 | self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) 54 | 55 | self.assertListEqual( 56 | tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) 57 | 58 | def test_full_tokenizer_lower(self): 59 | tokenizer = TransfoXLTokenizer(lower_case=True) 60 | 61 | self.assertListEqual( 62 | tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), 63 | ["hello", "!", "how", "are", "you", "?"]) 64 | 65 | def test_full_tokenizer_no_lower(self): 66 | tokenizer = TransfoXLTokenizer(lower_case=False) 67 | 68 | self.assertListEqual( 69 | tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), 70 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc.. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
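#
# This test exercises the shared PreTrainedTokenizer base class through a concrete subclass: it
# loads the first shortcut name from GPT2Tokenizer.max_model_input_sizes via from_pretrained()
# (another download) and checks that every registered special token is a string that maps to an
# integer id.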
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import six 21 | 22 | from pytorch_transformers import PreTrainedTokenizer 23 | from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | if six.PY2: 36 | self.assertIsInstance(special_tok, unicode) 37 | else: 38 | self.assertIsInstance(special_tok, str) 39 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 40 | self.assertIsInstance(special_tok_id, int) 41 | 42 | def test_pretrained_tokenizers(self): 43 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_xlm_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): 26 | 27 | tokenizer_class = XLMTokenizer 28 | 29 | def setUp(self): 30 | super(XLMTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "w", "r", "t", 35 | "lo", "low", "er", 36 | "low", "lowest", "newer", "wider", ""] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["l o 123", "lo w 1456", "e r 1789", ""] 39 | 40 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 41 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 42 | with open(self.vocab_file, "w") as fp: 43 | fp.write(json.dumps(vocab_tokens)) 44 | with open(self.merges_file, "w") as fp: 45 | fp.write("\n".join(merges)) 46 | 47 | def get_tokenizer(self, **kwargs): 48 | return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) 49 | 50 | def get_input_output_texts(self): 51 | input_text = u"lower newer" 52 | output_text = u"lower newer" 53 | return input_text, output_text 54 | 55 | def test_full_tokenizer(self): 56 | """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """ 57 | tokenizer = XLMTokenizer(self.vocab_file, self.merges_file) 58 | 59 | text = "lower" 60 | bpe_tokens = ["low", "er"] 61 | tokens = tokenizer.tokenize(text) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + [""] 65 | input_bpe_tokens = [14, 15, 20] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def test_sequence_builders(self): 70 | tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048") 71 | 72 | text = tokenizer.encode("sequence builders") 73 | text_2 = tokenizer.encode("multi-sequence build") 74 | 75 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 76 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 77 | 78 | assert encoded_sentence == [1] + text + [1] 79 | assert encoded_pair == [1] + text + [1] + text_2 + [1] 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_xlnet_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
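#
# Unlike the BPE and WordPiece tests above, this one runs against a real SentencePiece model:
# SAMPLE_VOCAB points at tests/fixtures/test_sentencepiece.model, the fixture shipped with
# upstream pytorch-transformers, and loading it requires the sentencepiece package to be
# installed.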
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | 20 | from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) 21 | 22 | from .tokenization_tests_commons import CommonTestCases 23 | 24 | SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), 25 | 'fixtures/test_sentencepiece.model') 26 | 27 | class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): 28 | 29 | tokenizer_class = XLNetTokenizer 30 | 31 | def setUp(self): 32 | super(XLNetTokenizationTest, self).setUp() 33 | 34 | # We have a SentencePiece fixture for testing 35 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) 36 | tokenizer.save_pretrained(self.tmpdirname) 37 | 38 | def get_tokenizer(self, **kwargs): 39 | return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs) 40 | 41 | def get_input_output_texts(self): 42 | input_text = u"This is a test" 43 | output_text = u"This is a test" 44 | return input_text, output_text 45 | 46 | 47 | def test_full_tokenizer(self): 48 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) 49 | 50 | tokens = tokenizer.tokenize(u'This is a test') 51 | self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) 52 | 53 | self.assertListEqual( 54 | tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) 55 | 56 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 57 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 58 | u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 59 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 60 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) 61 | ids = tokenizer.convert_tokens_to_ids(tokens) 62 | self.assertListEqual( 63 | ids, [8, 21, 84, 55, 24, 19, 7, 0, 64 | 602, 347, 347, 347, 3, 12, 66, 65 | 46, 72, 80, 6, 0, 4]) 66 | 67 | back_tokens = tokenizer.convert_ids_to_tokens(ids) 68 | self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 69 | u'or', u'n', SPIECE_UNDERLINE + u'in', 70 | SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', 71 | SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 72 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', 73 | u'', u'.']) 74 | 75 | def test_tokenizer_lower(self): 76 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) 77 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 78 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 79 | u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 80 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 81 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) 82 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"]) 83 | 84 | def test_tokenizer_no_lower(self): 85 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) 86 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 87 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', 88 | u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 89 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 90 | 
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) 91 | 92 | def test_sequence_builders(self): 93 | tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") 94 | 95 | text = tokenizer.encode("sequence builders") 96 | text_2 = tokenizer.encode("multi-sequence build") 97 | 98 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 99 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 100 | 101 | assert encoded_sentence == text + [4, 3] 102 | assert encoded_pair == text + [4] + text_2 + [4, 3] 103 | 104 | 105 | if __name__ == '__main__': 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tokenization_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Auto Model class. """ 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import logging 20 | 21 | from .tokenization_bert import BertTokenizer 22 | from .tokenization_openai import OpenAIGPTTokenizer 23 | from .tokenization_gpt2 import GPT2Tokenizer 24 | from .tokenization_transfo_xl import TransfoXLTokenizer 25 | from .tokenization_xlnet import XLNetTokenizer 26 | from .tokenization_xlm import XLMTokenizer 27 | from .tokenization_roberta import RobertaTokenizer 28 | from .tokenization_distilbert import DistilBertTokenizer 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | class AutoTokenizer(object): 33 | r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class 34 | that will be instantiated as one of the tokenizer classes of the library 35 | when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 36 | class method. 37 | 38 | The `from_pretrained()` method take care of returning the correct tokenizer class instance 39 | using pattern matching on the `pretrained_model_name_or_path` string. 40 | 41 | The tokenizer class to instantiate is selected as the first pattern matching 42 | in the `pretrained_model_name_or_path` string (in the following order): 43 | - contains `distilbert`: DistilBertTokenizer (DistilBert model) 44 | - contains `roberta`: RobertaTokenizer (RoBERTa model) 45 | - contains `bert`: BertTokenizer (Bert model) 46 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 47 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 48 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 49 | - contains `xlnet`: XLNetTokenizer (XLNet model) 50 | - contains `xlm`: XLMTokenizer (XLM model) 51 | 52 | This class cannot be instantiated using `__init__()` (throw an error). 
53 | """ 54 | def __init__(self): 55 | raise EnvironmentError("AutoTokenizer is designed to be instantiated " 56 | "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.") 57 | 58 | @classmethod 59 | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): 60 | r""" Instantiate a one of the tokenizer classes of the library 61 | from a pre-trained model vocabulary. 62 | 63 | The tokenizer class to instantiate is selected as the first pattern matching 64 | in the `pretrained_model_name_or_path` string (in the following order): 65 | - contains `distilbert`: DistilBertTokenizer (DistilBert model) 66 | - contains `roberta`: RobertaTokenizer (XLM model) 67 | - contains `bert`: BertTokenizer (Bert model) 68 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 69 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 70 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 71 | - contains `xlnet`: XLNetTokenizer (XLNet model) 72 | - contains `xlm`: XLMTokenizer (XLM model) 73 | 74 | Params: 75 | pretrained_model_name_or_path: either: 76 | 77 | - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. 78 | - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. 79 | - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. 80 | 81 | cache_dir: (`optional`) string: 82 | Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. 83 | 84 | force_download: (`optional`) boolean, default False: 85 | Force to (re-)download the vocabulary files and override the cached versions if they exists. 86 | 87 | proxies: (`optional`) dict, default None: 88 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 89 | The proxies are used on each request. 90 | 91 | inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. 92 | 93 | kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. 94 | 95 | Examples:: 96 | 97 | tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. 98 | tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')` 99 | 100 | """ 101 | if 'distilbert' in pretrained_model_name_or_path: 102 | return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 103 | elif 'roberta' in pretrained_model_name_or_path: 104 | return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 105 | elif 'bert' in pretrained_model_name_or_path: 106 | return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 107 | elif 'openai-gpt' in pretrained_model_name_or_path: 108 | return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 109 | elif 'gpt2' in pretrained_model_name_or_path: 110 | return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 111 | elif 'transfo-xl' in pretrained_model_name_or_path: 112 | return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 113 | elif 'xlnet' in pretrained_model_name_or_path: 114 | return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 115 | elif 'xlm' in pretrained_model_name_or_path: 116 | return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 117 | 118 | raise ValueError("Unrecognized model identifier in {}. Should contains one of " 119 | "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " 120 | "'xlm', 'roberta'".format(pretrained_model_name_or_path)) 121 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_bert import BertTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | } 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | 'distilbert-base-uncased': 512, 41 | 'distilbert-base-uncased-distilled-squad': 512, 42 | } 43 | 44 | 45 | class DistilBertTokenizer(BertTokenizer): 46 | r""" 47 | Constructs a DistilBertTokenizer. 
48 | :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 49 | 50 | Args: 51 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 52 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 53 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 54 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 55 | minimum of this value (if specified) and the underlying BERT model's sequence length. 56 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 57 | do_wordpiece_only=False 58 | """ 59 | 60 | vocab_files_names = VOCAB_FILES_NAMES 61 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 62 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 63 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tokenization_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RoBERTa.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | from .tokenization_gpt2 import GPT2Tokenizer 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
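# (Note added for clarity: the fallback defined just below is a no-op decorator -- it returns the wrapped function unchanged, so nothing is memoized when the real functools.lru_cache is unavailable.)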
33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", 49 | }, 50 | 'merges_file': 51 | { 52 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 53 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 54 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", 55 | }, 56 | } 57 | 58 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 59 | 'roberta-base': 512, 60 | 'roberta-large': 512, 61 | 'roberta-large-mnli': 512, 62 | } 63 | 64 | 65 | class RobertaTokenizer(GPT2Tokenizer): 66 | """ 67 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: 68 | - Byte-level Byte-Pair-Encoding 69 | - Requires a space to start the input string => will add a space if there isn't one. 70 | As a consequence, this tokenizer's `encode` and `decode` methods will not preserve 71 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` 72 | """ 73 | vocab_files_names = VOCAB_FILES_NAMES 74 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 75 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 76 | 77 | def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>", 78 | cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs): 79 | super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, 80 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, 81 | sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, 82 | mask_token=mask_token, **kwargs) 83 | 84 | def add_special_tokens_single_sentence(self, token_ids): 85 | """ 86 | Adds special tokens to a sequence for sequence classification tasks. 87 | A RoBERTa sequence has the following format: <s> X </s> 88 | """ 89 | return [self.cls_token_id] + token_ids + [self.sep_token_id] 90 | 91 | def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): 92 | """ 93 | Adds special tokens to a sequence pair for sequence classification tasks.
94 | A RoBERTa sequence pair has the following format: <s> A </s></s> B </s> 95 | """ 96 | sep = [self.sep_token_id] 97 | cls = [self.cls_token_id] 98 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 99 | -------------------------------------------------------------------------------- /pydatagrand/output/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/checkpoints/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/feature/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/figure/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/log/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/result/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/preprocessing/augmentation.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | import numpy as np 3 | import random 4 | 5 | class Augmentator(object): 6 | def __init__(self,is_train_mode = True, proba = 0.5): 7 | self.mode = is_train_mode 8 | self.proba = proba 9 | self.augs = [] 10 | self._reset() 11 | 12 | # Full list of augmentation functions 13 | def _reset(self): 14 | self.augs.append(lambda text: self._shuffle(text)) 15 | self.augs.append(lambda text: self._dropout(text,p = 0.5)) 16 | 17 | # Shuffle the tokens 18 | def _shuffle(self, text): 19 | text = np.random.permutation(text.strip().split()) 20 | return ' '.join(text) 21 | 22 | # Randomly delete some tokens 23 | def _dropout(self, text, p=0.5): 24 | # randomly delete some of the text 25 | text = text.strip().split() 26 | len_ = len(text) 27 | indexs = np.random.choice(len_, int(len_ * p)) 28 | for i in indexs: 29 | text[i] = '' 30 | return ' '.join(text) 31 | 32 | def __call__(self,text,aug_type): 33 | ''' 34 | Use aug_type to distinguish between data modes 35 | ''' 36 | # TTA mode 37 | if 0 <= aug_type <= 2: 38 | pass 39 | # Training mode 40 | if self.mode and random.random() < self.proba: 41 | aug = random.choice(self.augs) 42 | text = aug(text) 43 | return text 44 | -------------------------------------------------------------------------------- /pydatagrand/preprocessing/chinese_preprocessor.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | import re 3 | import jieba 4 | 5 | class
ChinesePreProcessor(object): 6 | def __init__(self,min_len = 2,stopwords_path = None): 7 | self.min_len = min_len 8 | self.stopwords_path = stopwords_path 9 | self.reset() 10 | 11 | def jieba_cut(self,sentence): 12 | ''' 13 | Segment the sentence with jieba 14 | :param sentence: 15 | :return: 16 | ''' 17 | seg_list = jieba.cut(sentence,cut_all=False) 18 | return ' '.join(seg_list) 19 | 20 | def reset(self): 21 | ''' 22 | Load the stopword list 23 | :return: 24 | ''' 25 | if self.stopwords_path: 26 | with open(self.stopwords_path,'r') as fr: 27 | self.stopwords = {} 28 | for line in fr: 29 | word = line.strip(' ').strip('\n') 30 | self.stopwords[word] = 1 31 | 32 | def clean_length(self,sentence): 33 | ''' 34 | Drop text shorter than min_len 35 | :param sentence: 36 | :return: 37 | ''' 38 | if len(sentence) >= self.min_len: 39 | return sentence 40 | 41 | 42 | def full2half(self,sentence): 43 | ''' 44 | Convert full-width characters to half-width 45 | :param sentence: 46 | :return: 47 | ''' 48 | ret_str = '' 49 | for i in sentence: 50 | if ord(i) >= 33 + 65248 and ord(i) <= 126 + 65248: 51 | ret_str += chr(ord(i) - 65248) 52 | else: 53 | ret_str += i 54 | return ret_str 55 | 56 | def remove_stopword(self,sentence): 57 | ''' 58 | Remove stopwords 59 | :param sentence: 60 | :return: 61 | ''' 62 | words = sentence.split() 63 | x = [word for word in words if word not in self.stopwords] 64 | return " ".join(x) 65 | 66 | def get_china(self,sentence): 67 | ''' 68 | Keep only the Chinese characters 69 | :param sentence: 70 | :return: 71 | ''' 72 | zhmodel = re.compile("[\u4e00-\u9fa5]") 73 | words = [x for x in sentence if zhmodel.search(x)] 74 | return ''.join(words) 75 | 76 | def remove_numbers(self,sentence): 77 | ''' 78 | Remove digits 79 | :param sentence: 80 | :return: 81 | ''' 82 | words = sentence.split() 83 | x = [re.sub('\d+','',word) for word in words] 84 | return ' '.join([w for w in x if w !='']) 85 | 86 | def remove_whitespace(self,sentence): 87 | ''' 88 | Remove whitespace 89 | :param sentence: 90 | :return: 91 | ''' 92 | x = ''.join([c for c in sentence if not c.isspace()]) 93 | return x 94 | 95 | def __call__(self, sentence): 96 | x = sentence.strip('\n') 97 | x = self.full2half(x) 98 | # x = self.jieba_cut(x) 99 | # if self.stopwords_path: 100 | # x = self.remove_stopword(x) 101 | x = self.remove_whitespace(x) 102 | x = self.get_china(x) 103 | x = self.clean_length(x) 104 | 105 | return x 106 | -------------------------------------------------------------------------------- /pydatagrand/pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/pretrain/bert/base-uncased/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/pretrain/xlnet/base-cased/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/test/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/test/predicter.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | --------------------------------------------------------------------------------
/pydatagrand/train/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/train/ema.py: -------------------------------------------------------------------------------- 1 | class EMA: 2 | def __init__(self, model, mu, level='batch', n=1): 3 | # self.ema_model = copy.deepcopy(model) 4 | self.mu = mu 5 | self.level = level 6 | self.n = n 7 | self.cnt = self.n 8 | self.shadow = {} 9 | for name, param in model.named_parameters(): 10 | if param.requires_grad: 11 | self.shadow[name] = param.data 12 | 13 | def _update(self, model): 14 | for name, param in model.named_parameters(): 15 | if param.requires_grad: 16 | new_average = (1 - self.mu) * param.data + self.mu * self.shadow[name] 17 | self.shadow[name] = new_average.clone() 18 | 19 | def set_weights(self, ema_model): 20 | for name, param in ema_model.named_parameters(): 21 | if param.requires_grad: 22 | param.data = self.shadow[name] 23 | 24 | def on_batch_end(self, model): 25 | if self.level is 'batch': 26 | self.cnt -= 1 27 | if self.cnt == 0: 28 | self._update(model) 29 | self.cnt = self.n 30 | 31 | def on_epoch_end(self, model): 32 | if self.level is 'epoch': 33 | self._update(model) -------------------------------------------------------------------------------- /pydatagrand/train/losses.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | from torch.nn import CrossEntropyLoss 3 | from torch.nn import BCEWithLogitsLoss 4 | 5 | __call__ = ['CrossEntropy', 'BCEWithLogLoss'] 6 | 7 | 8 | class CrossEntropy(object): 9 | def __init__(self, ignore_index=-1): 10 | self.loss_f = CrossEntropyLoss(ignore_index=ignore_index) 11 | 12 | def __call__(self, output, target): 13 | loss = self.loss_f(input=output, target=target) 14 | return loss 15 | 16 | 17 | class BCEWithLogLoss(object): 18 | def __init__(self): 19 | self.loss_fn = BCEWithLogitsLoss() 20 | 21 | def __call__(self, output, target): 22 | loss = self.loss_fn(input=output, target=target) 23 | return loss 24 | 25 | 26 | class SpanLoss(object): 27 | def __init__(self, ignore_index=-100): 28 | self.loss_fn = CrossEntropyLoss(ignore_index=ignore_index) 29 | 30 | def __call__(self, output, target, mask): 31 | active_loss = mask.view(-1) == 1 32 | active_logits = output[active_loss] 33 | active_labels = target[active_loss] 34 | return self.loss_fn(active_logits, active_labels) 35 | -------------------------------------------------------------------------------- /pydatagrand/train/ner_seq_trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ..callback.progressbar import ProgressBar 3 | from ..common.tools import model_device,prepare_device 4 | from ..common.tools import seed_everything 5 | from ..common.tools import AverageMeter 6 | from .ner_utils import SeqEntityScore 7 | from torch.nn.utils import clip_grad_norm_ 8 | 9 | class Trainer(object): 10 | def __init__(self, model, n_gpu, logger, optimizer, lr_scheduler, 11 | label2id, gradient_accumulation_steps, grad_clip=0.0,early_stopping=None, 12 | fp16=None, resume_path=None, training_monitor=None, model_checkpoint=None): 13 | 14 | self.n_gpu = n_gpu 15 | self.model = model 16 | self.logger = logger 17 | self.fp16 = fp16 18 | self.optimizer = optimizer 19 | self.label2id = label2id 20 | self.grad_clip = grad_clip 21 | self.lr_scheduler = lr_scheduler 22 | self.early_stopping = 
early_stopping 23 | self.model_checkpoint = model_checkpoint 24 | self.training_monitor = training_monitor 25 | self.gradient_accumulation_steps = gradient_accumulation_steps 26 | 27 | # self.model, self.device = model_device(n_gpu=self.n_gpu, model=self.model) 28 | self.device ,_ = prepare_device(n_gpu) 29 | self.id2label = {y: x for x, y in label2id.items()} 30 | self.entity_score = SeqEntityScore(self.id2label) 31 | self.start_epoch = 1 32 | self.global_step = 0 33 | if self.fp16: 34 | try: 35 | from apex import amp 36 | except ImportError: 37 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 38 | if resume_path: 39 | self.logger.info(f"\nLoading checkpoint: {resume_path}") 40 | resume_dict = torch.load(resume_path / 'checkpoint_info.bin') 41 | best = resume_dict['epoch'] 42 | self.start_epoch = resume_dict['epoch'] 43 | if self.model_checkpoint: 44 | self.model_checkpoint.best = best 45 | self.logger.info(f"\nCheckpoint '{resume_path}' and epoch {self.start_epoch} loaded") 46 | 47 | def save_info(self, epoch, best): 48 | model_save = self.model.module if hasattr(self.model, 'module') else self.model 49 | state = {"model": model_save, 50 | 'epoch': epoch, 51 | 'best': best} 52 | return state 53 | 54 | def valid_epoch(self, data_loader): 55 | pbar = ProgressBar(n_total=len(data_loader), desc='Evaluating') 56 | self.entity_score.reset() 57 | valid_loss = AverageMeter() 58 | for step, batch in enumerate(data_loader): 59 | batch = tuple(t.to(self.device) for t in batch) 60 | input_ids, input_mask, segment_ids, label_ids, input_lens = batch 61 | input_lens = input_lens.cpu().detach().numpy().tolist() 62 | self.model.eval() 63 | with torch.no_grad(): 64 | features, loss = self.model.forward_loss(input_ids, segment_ids, input_mask, label_ids, input_lens) 65 | tags, _ = self.model.crf._obtain_labels(features, self.id2label, input_lens) 66 | valid_loss.update(val=loss.item(), n=input_ids.size(0)) 67 | pbar(step=step, info={"loss": loss.item()}) 68 | label_ids = label_ids.to('cpu').numpy().tolist() 69 | for i, label in enumerate(label_ids): 70 | temp_1 = [] 71 | temp_2 = [] 72 | for j, m in enumerate(label): 73 | if j == 0: 74 | continue 75 | elif label_ids[i][j] == self.label2id['[SEP]']: 76 | self.entity_score.update(pred_paths=[temp_2], label_paths=[temp_1]) 77 | break 78 | else: 79 | temp_1.append(self.id2label[label_ids[i][j]]) 80 | temp_2.append(tags[i][j]) 81 | valid_info, class_info = self.entity_score.result() 82 | info = {f'valid_{key}': value for key, value in valid_info.items()} 83 | info['valid_loss'] = valid_loss.avg 84 | if 'cuda' in str(self.device): 85 | torch.cuda.empty_cache() 86 | return info, class_info 87 | 88 | def train_epoch(self, data_loader): 89 | pbar = ProgressBar(n_total=len(data_loader), desc='Training') 90 | tr_loss = AverageMeter() 91 | for step, batch in enumerate(data_loader): 92 | self.model.train() 93 | batch = tuple(t.to(self.device) for t in batch) 94 | input_ids, input_mask, segment_ids, label_ids, input_lens = batch 95 | input_lens = input_lens.cpu().detach().numpy().tolist() 96 | _, loss = self.model.forward_loss(input_ids, segment_ids, input_mask, label_ids, input_lens) 97 | if len(self.n_gpu.split(",")) >= 2: 98 | loss = loss.mean() 99 | if self.gradient_accumulation_steps > 1: 100 | loss = loss / self.gradient_accumulation_steps 101 | if self.fp16: 102 | with amp.scale_loss(loss, self.optimizer) as scaled_loss: 103 | scaled_loss.backward() 104 | clip_grad_norm_(amp.master_params(self.optimizer), 
self.grad_clip) 105 | else: 106 | loss.backward() 107 | clip_grad_norm_(self.model.parameters(), self.grad_clip) 108 | if (step + 1) % self.gradient_accumulation_steps == 0: 109 | self.optimizer.step() 110 | self.optimizer.zero_grad() 111 | self.global_step += 1 112 | tr_loss.update(loss.item(), n=1) 113 | pbar(step=step, info={'loss': loss.item()}) 114 | info = {'loss': tr_loss.avg} 115 | if "cuda" in str(self.device): 116 | torch.cuda.empty_cache() 117 | return info 118 | 119 | def train(self, train_data, valid_data, epochs, seed): 120 | seed_everything(seed) 121 | for epoch in range(self.start_epoch, self.start_epoch + int(epochs)): 122 | self.logger.info(f"Epoch {epoch}/{int(epochs)}") 123 | train_log = self.train_epoch(train_data) 124 | valid_log, class_info = self.valid_epoch(valid_data) 125 | 126 | logs = dict(train_log, **valid_log) 127 | show_info = f'Epoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()]) 128 | self.logger.info(show_info) 129 | self.logger.info("The entity scores of valid data : ") 130 | for key, value in class_info.items(): 131 | info = f'Entity: {key} - ' + "-".join([f' {key_}: {value_:.4f} ' for key_, value_ in value.items()]) 132 | self.logger.info(info) 133 | 134 | if hasattr(self.lr_scheduler,'epoch_step'): 135 | self.lr_scheduler.epoch_step(metrics=logs[self.model_checkpoint.monitor], epoch=epoch) 136 | # save 137 | if self.training_monitor: 138 | self.training_monitor.epoch_step(logs) 139 | 140 | # save model 141 | if self.model_checkpoint: 142 | state = self.save_info(epoch, best=logs[self.model_checkpoint.monitor]) 143 | self.model_checkpoint.bert_epoch_step(current=logs[self.model_checkpoint.monitor], state=state) 144 | 145 | # early_stopping 146 | if self.early_stopping: 147 | self.early_stopping.epoch_step(epoch=epoch, current=logs[self.early_stopping.monitor]) 148 | if self.early_stopping.stop_training: 149 | break 150 | -------------------------------------------------------------------------------- /run_submit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from pydatagrand.configs.base import config 4 | from pydatagrand.common.tools import load_pickle 5 | from pydatagrand.train.ner_utils import get_entities 6 | from pydatagrand.common.tools import seed_everything 7 | from collections import Counter 8 | from glob import glob 9 | from datetime import datetime 10 | 11 | class DataProcessor(object): 12 | """Base class for data converters for sequence classification data sets.""" 13 | def get_train_examples(self, data_dir): 14 | """Gets a collection of `InputExample`s for the train set.""" 15 | raise NotImplementedError() 16 | def get_dev_examples(self, data_dir): 17 | """Gets a collection of `InputExample`s for the dev set.""" 18 | raise NotImplementedError() 19 | 20 | def get_labels(self): 21 | """Gets the list of labels for this data set.""" 22 | raise NotImplementedError() 23 | class NerProcessor(DataProcessor): 24 | """Processor for the CoNLL-2003 data set.""" 25 | def get_labels(self): 26 | return ["X", "O", "B-a", "I-a", "B-b", "I-b", "B-c", "I-c", "S-a", "S-b", "S-c", "[CLS]", "[SEP]"] 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--task_name",default='ner',type=str) 31 | parser.add_argument("--do_test",action='store_true') 32 | parser.add_argument("--do_eval",action='store_true') 33 | parser.add_argument('--seed',default=42,type=str) 34 | args = parser.parse_args() 35 | 36 | 
seed_everything(seed=args.seed) 37 | dt = str(datetime.today()).split(" ")[0] 38 | test_path = config['data_dir'] / 'test.txt' 39 | test_result_path = config['result'] / f'{dt}_submit_test.txt' 40 | processors = {"ner": NerProcessor} 41 | task_name = args.task_name.lower() 42 | processor = processors[task_name]() 43 | label_list = processor.get_labels() 44 | id2label = {i: label for i, label in enumerate(label_list, 0)} 45 | test_data = [] 46 | with open(str(test_path), 'r') as fr: 47 | for line in fr: 48 | line = line.strip("\n") 49 | test_data.append(line) 50 | fw = open(str(test_result_path), 'w') 51 | cv_test_pred = [] 52 | for file in glob(f"{str(config['result']/ '*.pkl')}"): 53 | data = load_pickle(file) 54 | cv_test_pred.append(data) 55 | vote_pred = [] 56 | for i in range(len(test_data)): 57 | t = [np.array([x[i]]).T for x in cv_test_pred] 58 | t2 = np.concatenate(t, axis=1) 59 | t3 = [] 60 | for line in t2: 61 | c = Counter() 62 | c.update(line) 63 | t3.append(c.most_common(1)[0][0]) 64 | vote_pred.append(t3) 65 | for tag,line in zip(vote_pred,test_data): 66 | token_a = line.split("_") 67 | label_entities = get_entities(tag, id2label) 68 | if len(label_entities) == 0: 69 | record = "_".join(token_a) + "/o" 70 | else: 71 | labels = [] 72 | label_entities = sorted(label_entities, key=lambda x: x[1]) 73 | o_s = 0 74 | for i, entity in enumerate(label_entities): 75 | begin = entity[1] 76 | end = entity[2] 77 | tag = entity[0] 78 | if begin != o_s: 79 | labels.append("_".join(token_a[o_s:begin]) + "/o") 80 | labels.append("_".join(token_a[begin:end + 1]) + f"/{tag}") 81 | o_s = end + 1 82 | if i == len(label_entities) - 1: 83 | if o_s <= len(token_a) - 1: 84 | labels.append("_".join(token_a[o_s:]) + "/o") 85 | record = " ".join(labels) 86 | fw.write(record + "\n") 87 | fw.close() 88 | 89 | if __name__ == "__main__": 90 | main() --------------------------------------------------------------------------------
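A note on the submission step above: the fold-ensembling logic in run_submit.py reduces to a per-token majority vote over the tag sequences predicted by each cross-validation fold; the voted tags are then converted into entity spans by get_entities and written out in the "token_token/tag" submission format. The snippet below is a minimal, self-contained sketch of that voting idea only. The function name vote_tags and the fold_predictions argument are illustrative and do not exist in the repository, which instead loads the per-fold predictions from the pickled files matched by glob under config['result'].

from collections import Counter

def vote_tags(fold_predictions):
    # fold_predictions: K fold outputs, each a list of tag sequences
    # (one list of tags per test sentence), all aligned on the same sentences.
    voted = []
    for sentence_tags in zip(*fold_predictions):      # K tag sequences for one sentence
        merged = []
        for position_tags in zip(*sentence_tags):     # K tags predicted at one token position
            merged.append(Counter(position_tags).most_common(1)[0][0])
        voted.append(merged)
    return voted

# Three folds, one three-token sentence: the middle token gets 'I-a' by a 2-to-1 vote.
print(vote_tags([[['B-a', 'I-a', 'O']], [['B-a', 'O', 'O']], [['B-a', 'I-a', 'O']]]))
# -> [['B-a', 'I-a', 'O']]

As in run_submit.py, ties are broken by Counter.most_common, which falls back to first-seen order among equally frequent tags.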