├── README.md ├── png ├── model1.png ├── model2.png ├── model3.png └── result.png ├── prepare_fold_data.py ├── prepare_lm_data_mask.py ├── prepare_lm_data_ngram.py ├── pydatagrand ├── __init__.py ├── callback │ ├── __init__.py │ ├── earlystopping.py │ ├── lrscheduler.py │ ├── modelcheckpoint.py │ ├── optimizater.py │ ├── progressbar.py │ ├── trainingmonitor.py │ └── utils.py ├── common │ ├── __init__.py │ └── tools.py ├── configs │ ├── __init__.py │ ├── base.py │ └── bert_config.py ├── dataset │ ├── __init__.py │ └── corpus │ │ ├── __init__.py │ │ └── train │ │ └── __init__.py ├── doc │ └── __init__.py ├── embedding │ ├── __init__.py │ └── w2v_embedding.py ├── io │ ├── __init__.py │ ├── bert_seq_processor.py │ ├── bert_span_processor.py │ └── vocabulary.py ├── model │ ├── __init__.py │ ├── layers │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── cnn.py │ │ ├── crf.py │ │ ├── dropouts.py │ │ ├── embedding.py │ │ ├── linears.py │ │ └── normalization.py │ ├── nn │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ └── __init__.cpython-37.pyc │ │ ├── bert_lstm_crf.py │ │ ├── bert_lstm_crf_mdp.py │ │ └── bert_lstm_span.py │ └── pytorch_transformers │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── configuration_auto.py │ │ ├── configuration_bert.py │ │ ├── configuration_distilbert.py │ │ ├── configuration_gpt2.py │ │ ├── configuration_openai.py │ │ ├── configuration_roberta.py │ │ ├── configuration_transfo_xl.py │ │ ├── configuration_utils.py │ │ ├── configuration_xlm.py │ │ ├── configuration_xlnet.py │ │ ├── convert_gpt2_checkpoint_to_pytorch.py │ │ ├── convert_openai_checkpoint_to_pytorch.py │ │ ├── convert_pytorch_checkpoint_to_tf.py │ │ ├── convert_roberta_checkpoint_to_pytorch.py │ │ ├── convert_tf_checkpoint_to_pytorch.py │ │ ├── convert_transfo_xl_checkpoint_to_pytorch.py │ │ ├── convert_xlm_checkpoint_to_pytorch.py │ │ ├── convert_xlnet_checkpoint_to_pytorch.py │ │ ├── file_utils.py │ │ ├── modeling_auto.py │ │ ├── modeling_bert.py │ │ ├── modeling_distilbert.py │ │ ├── modeling_gpt2.py │ │ ├── modeling_openai.py │ │ ├── modeling_roberta.py │ │ ├── modeling_transfo_xl.py │ │ ├── modeling_transfo_xl_utilities.py │ │ ├── modeling_utils.py │ │ ├── modeling_xlm.py │ │ ├── modeling_xlnet.py │ │ ├── optimization.py │ │ ├── tests │ │ ├── __init__.py │ │ ├── configuration_common_test.py │ │ ├── conftest.py │ │ ├── modeling_auto_test.py │ │ ├── modeling_bert_test.py │ │ ├── modeling_common_test.py │ │ ├── modeling_distilbert_test.py │ │ ├── modeling_gpt2_test.py │ │ ├── modeling_openai_test.py │ │ ├── modeling_roberta_test.py │ │ ├── modeling_transfo_xl_test.py │ │ ├── modeling_xlm_test.py │ │ ├── modeling_xlnet_test.py │ │ ├── optimization_test.py │ │ ├── tokenization_auto_test.py │ │ ├── tokenization_bert_test.py │ │ ├── tokenization_dilbert_test.py │ │ ├── tokenization_gpt2_test.py │ │ ├── tokenization_openai_test.py │ │ ├── tokenization_roberta_test.py │ │ ├── tokenization_tests_commons.py │ │ ├── tokenization_transfo_xl_test.py │ │ ├── tokenization_utils_test.py │ │ ├── tokenization_xlm_test.py │ │ └── tokenization_xlnet_test.py │ │ ├── tokenization_auto.py │ │ ├── tokenization_bert.py │ │ ├── tokenization_distilbert.py │ │ ├── tokenization_gpt2.py │ │ ├── tokenization_openai.py │ │ ├── tokenization_roberta.py │ │ ├── tokenization_transfo_xl.py │ │ ├── tokenization_utils.py │ │ ├── tokenization_xlm.py │ │ └── tokenization_xlnet.py ├── output │ ├── __init__.py │ ├── checkpoints │ │ └── __init__.py │ ├── embedding │ │ └── __init__.py │ ├── feature │ │ └── __init__.py │ ├── figure │ │ └── __init__.py │ 
├── log │ │ └── __init__.py │ └── result │ │ └── __init__.py ├── preprocessing │ ├── __init__.py │ ├── augmentation.py │ ├── chinese_preprocessor.py │ └── english_preprocessor.py ├── pretrain │ ├── __init__.py │ ├── bert │ │ └── base-uncased │ │ │ └── __init__.py │ └── xlnet │ │ └── base-cased │ │ └── __init__.py ├── test │ ├── __init__.py │ └── predicter.py └── train │ ├── __init__.py │ ├── ema.py │ ├── losses.py │ ├── metrics.py │ ├── ner_seq_trainer.py │ ├── ner_span_trainer.py │ └── ner_utils.py ├── run_bert_crf.py ├── run_bert_span.py ├── run_submit.py └── train_bert_model.py /README.md: -------------------------------------------------------------------------------- 1 | # datagrand_2019_rank9 2 | 3 | 2019年达观信息提取比赛第九名代码和答辩ppt 4 | 5 | 比赛地址:[官网](https://www.biendata.com/competition/datagrand/) 6 | 7 | ## 代码目录结构 8 | ```text 9 | ├── pydatagrand 10 | | └── callback 11 | | | └── lrscheduler.py   12 | | | └── trainingmonitor.py  13 | | | └── ... 14 | | └── config 15 | | | └── basic_config.py #a configuration file for storing model parameters 16 | | └── dataset    17 | | └── io     18 | | | └── dataset.py   19 | | | └── data_transformer.py   20 | | └── model 21 | | | └── nn  22 | | | └── pretrain  23 | | └── output #save the ouput of model 24 | | └── preprocessing #text preprocessing 25 | | └── train #used for training a model 26 | | | └── trainer.py 27 | | | └── ... 28 | | └── common # a set of utility functions 29 | ├── prepare_fold_data.py # 数据切分 30 | ├── prepare_lm_data_mask.py # 随机mask 31 | ├── prepare_lm_data_ngram.py #ngram mask 32 | ├── run_bert_crf.py # crf结构 33 | ├── run_bert_span.py   # span结构 34 | ├── train_bert_model.py  #训练bert模型 35 | 36 | ``` 37 | ## 预训练模型 38 | 39 | 主要训练了8层跟12层BERT模型,采用随机mask + ngram mask两种混合动态masking模式 40 | 41 | ## 方案1 42 | 43 | 方案1主要采用BERT+LSTM+CRF结构 44 | 45 | ![](./png/model1.png) 46 | 47 | ## 方案2 48 | 49 | 方案2在方案1的基础上增加了MDP结构 50 | 51 | ![](./png/model2.png) 52 | 53 | ## 方案3 54 | 55 | 方案3主要采用BERT+LSTM+SPAN结构 56 | 57 | ![](./png/model3.png) 58 | 59 | ## 结果 60 | 最终结果如下所示: 61 | 62 | ![](./png/result.png) 63 | 64 | ## 文档 65 | 66 | 十强答辩ppt下载地址: https://pan.baidu.com/s/1yvXFf5GzyvDksdBKNp9FKQ 提取码: svr2 67 | 68 | -------------------------------------------------------------------------------- /png/model1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/png/model1.png -------------------------------------------------------------------------------- /png/model2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/png/model2.png -------------------------------------------------------------------------------- /png/model3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/png/model3.png -------------------------------------------------------------------------------- /png/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/png/result.png -------------------------------------------------------------------------------- /pydatagrand/__init__.py: 
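The README above notes that the 8- and 12-layer BERT models were pretrained with a mix of random token masking and n-gram masking (prepare_lm_data_mask.py / prepare_lm_data_ngram.py). As a rough illustration of the n-gram variant only, here is a minimal sketch; the function name and parameters are invented for the example and are not taken from those scripts.

```python
# Minimal sketch of n-gram masking (illustrative only; not the code in
# prepare_lm_data_ngram.py). Whole n-grams are replaced by [MASK] until
# roughly mask_prob of the tokens are covered.
import random

def ngram_mask(tokens, mask_token="[MASK]", mask_prob=0.15, max_ngram=3):
    tokens = list(tokens)
    if not tokens:
        return tokens
    budget = max(1, int(len(tokens) * mask_prob))
    masked = set()
    while len(masked) < budget:
        n = random.randint(1, max_ngram)
        start = random.randint(0, max(0, len(tokens) - n))
        for i in range(start, min(start + n, len(tokens))):
            if len(masked) >= budget:
                break
            tokens[i] = mask_token
            masked.add(i)
    return tokens

# Anonymized word ids similar to the competition corpus (values are made up).
print(ngram_mask("17281 342 9023 110 57 88 19 4 1020 33".split()))
```

Because the masking is applied on the fly, each pretraining epoch sees a different set of masked n-grams, which is what the README calls dynamic masking.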
-------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/callback/__init__.py: -------------------------------------------------------------------------------- 1 | from .progressbar import ProgressBar 2 | from .earlystopping import EarlyStopping 3 | from .trainingmonitor import TrainingMonitor 4 | from .modelcheckpoint import ModelCheckpoint 5 | 6 | from .lrscheduler import CustomDecayLR 7 | from .lrscheduler import BertLR 8 | from .lrscheduler import CyclicLR 9 | from .lrscheduler import ReduceLROnPlateau 10 | from .lrscheduler import ReduceLRWDOnPlateau 11 | from .lrscheduler import CosineLRWithRestarts 12 | from .lrscheduler import NoamLR 13 | from .lrscheduler import OneCycleScheduler 14 | from .lrscheduler import BERTReduceLROnPlateau 15 | 16 | from .optimizater import Lamb 17 | from .optimizater import Lars 18 | from .optimizater import RAdam 19 | from .optimizater import Ralamb 20 | from .optimizater import Lookahead 21 | from .optimizater import RaLars 22 | from .optimizater import Ranger 23 | from .optimizater import SGDW 24 | from .optimizater import AdamW 25 | from .optimizater import AdaBound 26 | from .optimizater import Nadam 27 | from .optimizater import AdaFactor 28 | from .optimizater import WeightDecayOptimizerWrapper 29 | from .optimizater import NovoGrad 30 | from .optimizater import BertAdam 31 | 32 | -------------------------------------------------------------------------------- /pydatagrand/callback/earlystopping.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..common.tools import logger 3 | 4 | class EarlyStopping(object): 5 | ''' 6 | early stopping 功能 7 | # Arguments 8 | min_delta: 最小变化 9 | patience: 多少个epoch未提高,就停止训练 10 | verbose: 信息大于,默认打印信息 11 | mode: 计算模式 12 | monitor: 计算指标 13 | baseline: 基线 14 | ''' 15 | def __init__(self, 16 | min_delta = 0, 17 | patience = 10, 18 | verbose = 1, 19 | mode = 'min', 20 | monitor = 'loss', 21 | baseline = None): 22 | 23 | self.baseline = baseline 24 | self.patience = patience 25 | self.verbose = verbose 26 | self.min_delta = min_delta 27 | self.monitor = monitor 28 | 29 | assert mode in ['min','max'] 30 | 31 | if mode == 'min': 32 | self.monitor_op = np.less 33 | elif mode == 'max': 34 | self.monitor_op = np.greater 35 | if self.monitor_op == np.greater: 36 | self.min_delta *= 1 37 | else: 38 | self.min_delta *= -1 39 | self.reset() 40 | 41 | def reset(self): 42 | # Allow instances to be re-used 43 | self.wait = 0 44 | self.stop_training = False 45 | if self.baseline is not None: 46 | self.best = self.baseline 47 | else: 48 | self.best = np.Inf if self.monitor_op == np.less else -np.Inf 49 | 50 | def epoch_step(self,current): 51 | if self.monitor_op(current - self.min_delta, self.best): 52 | self.best = current 53 | self.wait = 0 54 | else: 55 | self.wait += 1 56 | if self.wait >= self.patience: 57 | if self.verbose >0: 58 | logger.info(f"{self.patience} epochs with no improvement after which training will be stopped") 59 | self.stop_training = True 60 | -------------------------------------------------------------------------------- /pydatagrand/callback/modelcheckpoint.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import numpy as np 3 | import torch 4 | from ..common.tools import logger 5 | 6 | class ModelCheckpoint(object): 7 | ''' 8 | 模型保存,两种模式: 9 | 
1. 直接保存最好模型 10 | 2. 按照epoch频率保存模型 11 | ''' 12 | def __init__(self, checkpoint_dir, 13 | monitor, 14 | arch,mode='min', 15 | epoch_freq=1, 16 | best = None, 17 | save_best_only = True): 18 | if isinstance(checkpoint_dir,Path): 19 | checkpoint_dir = checkpoint_dir 20 | else: 21 | checkpoint_dir = Path(checkpoint_dir) 22 | assert checkpoint_dir.is_dir() 23 | checkpoint_dir.mkdir(exist_ok=True) 24 | self.base_path = checkpoint_dir 25 | self.arch = arch 26 | self.monitor = monitor 27 | self.epoch_freq = epoch_freq 28 | self.save_best_only = save_best_only 29 | 30 | # 计算模式 31 | if mode == 'min': 32 | self.monitor_op = np.less 33 | self.best = np.Inf 34 | 35 | elif mode == 'max': 36 | self.monitor_op = np.greater 37 | self.best = -np.Inf 38 | # 这里主要重新加载模型时候 39 | #对best重新赋值 40 | if best: 41 | self.best = best 42 | 43 | if save_best_only: 44 | self.model_name = f"BEST_{arch}_MODEL.pth" 45 | 46 | def epoch_step(self, state,current): 47 | ''' 48 | 正常模型 49 | :param state: 需要保存的信息 50 | :param current: 当前判断指标 51 | :return: 52 | ''' 53 | # 是否保存最好模型 54 | if self.save_best_only: 55 | if self.monitor_op(current, self.best): 56 | logger.info(f"\nEpoch {state['epoch']}: {self.monitor} improved from {self.best:.5f} to {current:.5f}") 57 | self.best = current 58 | state['best'] = self.best 59 | best_path = self.base_path/ self.model_name 60 | torch.save(state, str(best_path)) 61 | # 每隔几个epoch保存下模型 62 | else: 63 | filename = self.base_path / f"EPOCH_{state['epoch']}_{state[self.monitor]}_{self.arch}_MODEL.pth" 64 | if state['epoch'] % self.epoch_freq == 0: 65 | logger.info(f"\nEpoch {state['epoch']}: save model to disk.") 66 | torch.save(state, str(filename)) 67 | 68 | def bert_epoch_step(self, state,current): 69 | ''' 70 | 适合bert类型模型,适合pytorch_transformer模块 71 | :param state: 72 | :param current: 73 | :return: 74 | ''' 75 | model_to_save = state['model'] 76 | if self.save_best_only: 77 | if self.monitor_op(current, self.best): 78 | logger.info(f"\nEpoch {state['epoch']}: {self.monitor} improved from {self.best:.5f} to {current:.5f}") 79 | self.best = current 80 | state['best'] = self.best 81 | model_to_save.save_pretrained(str(self.base_path)) 82 | output_config_file = self.base_path / 'configs.json' 83 | with open(str(output_config_file), 'w') as f: 84 | f.write(model_to_save.config.to_json_string()) 85 | state.pop("model") 86 | torch.save(state,self.base_path / 'checkpoint_info.bin') 87 | else: 88 | if state['epoch'] % self.epoch_freq == 0: 89 | save_path = self.base_path / f"checkpoint-epoch-{state['epoch']}" 90 | save_path.mkdir(exist_ok=True) 91 | logger.info(f"\nEpoch {state['epoch']}: save model to disk.") 92 | model_to_save.save_pretrained(save_path) 93 | output_config_file = save_path / 'configs.json' 94 | with open(str(output_config_file), 'w') as f: 95 | f.write(model_to_save.config.to_json_string()) 96 | state.pop("model") 97 | torch.save(state, save_path / 'checkpoint_info.bin') 98 | -------------------------------------------------------------------------------- /pydatagrand/callback/progressbar.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class ProgressBar(object): 4 | ''' 5 | custom progress bar 6 | Example: 7 | >>> from progressbar import ProgressBar 8 | >>> pbar = ProgressBar(n_total=30,desc='training') 9 | >>> step = 2 10 | >>> pbar(step=step) 11 | ''' 12 | def __init__(self, n_total,width=30,desc = 'Training'): 13 | self.width = width 14 | self.n_total = n_total 15 | self.start_time = time.time() 16 | self.desc = desc 17 | 18 | 
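The callbacks shown above (EarlyStopping, ModelCheckpoint, ProgressBar) have no framework hooks; they are driven explicitly from the training loop through their `epoch_step` / `__call__` methods. Below is a minimal wiring sketch with a fake validation loss, assuming the repository root is on PYTHONPATH (so `pydatagrand` imports) and its dependencies are installed; the real loop lives under pydatagrand/train/.

```python
# Illustrative only: drives ProgressBar per step and EarlyStopping per epoch.
from pydatagrand.callback import EarlyStopping, ProgressBar

pbar = ProgressBar(n_total=50, desc="Training")
early_stopping = EarlyStopping(patience=2, mode="min", monitor="valid_loss")

fake_valid_losses = [0.90, 0.72, 0.70, 0.71, 0.73, 0.74]
for epoch, valid_loss in enumerate(fake_valid_losses, start=1):
    for step in range(50):
        pbar(step=step, info={"loss": valid_loss})   # prints an in-place bar
    early_stopping.epoch_step(current=valid_loss)    # tracks the monitored metric
    if early_stopping.stop_training:                 # patience exhausted -> stop
        break
```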
def __call__(self, step, info={}): 19 | now = time.time() 20 | current = step + 1 21 | recv_per = current / self.n_total 22 | bar = f'[{self.desc}] {current}/{self.n_total} [' 23 | if recv_per >= 1: 24 | recv_per = 1 25 | prog_width = int(self.width * recv_per) 26 | if prog_width > 0: 27 | bar += '=' * (prog_width - 1) 28 | if current< self.n_total: 29 | bar += ">" 30 | else: 31 | bar += '=' 32 | bar += '.' * (self.width - prog_width) 33 | bar += ']' 34 | show_bar = f"\r{bar}" 35 | time_per_unit = (now - self.start_time) / current 36 | if current < self.n_total: 37 | eta = time_per_unit * (self.n_total - current) 38 | if eta > 3600: 39 | eta_format = ('%d:%02d:%02d' % 40 | (eta // 3600, (eta % 3600) // 60, eta % 60)) 41 | elif eta > 60: 42 | eta_format = '%d:%02d' % (eta // 60, eta % 60) 43 | else: 44 | eta_format = '%ds' % eta 45 | time_info = f' - ETA: {eta_format}' 46 | else: 47 | if time_per_unit >= 1: 48 | time_info = f' {time_per_unit:.1f}s/step' 49 | elif time_per_unit >= 1e-3: 50 | time_info = f' {time_per_unit * 1e3:.1f}ms/step' 51 | else: 52 | time_info = f' {time_per_unit * 1e6:.1f}us/step' 53 | 54 | show_bar += time_info 55 | if len(info) != 0: 56 | show_info = f'{show_bar} ' + \ 57 | "-".join([f' {key}: {value:.4f} ' for key, value in info.items()]) 58 | print(show_info, end='') 59 | else: 60 | print(show_bar, end='') 61 | -------------------------------------------------------------------------------- /pydatagrand/callback/trainingmonitor.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | import numpy as np 3 | from pathlib import Path 4 | import matplotlib.pyplot as plt 5 | from ..common.tools import load_json 6 | from ..common.tools import save_json 7 | plt.switch_backend('agg') # 防止ssh上绘图问题 8 | 9 | class TrainingMonitor(): 10 | def __init__(self, file_dir, arch, add_test=False): 11 | ''' 12 | :param startAt: 重新开始训练的epoch点 13 | ''' 14 | if isinstance(file_dir, Path): 15 | pass 16 | else: 17 | file_dir = Path(file_dir) 18 | file_dir.mkdir(parents=True, exist_ok=True) 19 | 20 | self.arch = arch 21 | self.file_dir = file_dir 22 | self.H = {} 23 | self.add_test = add_test 24 | self.json_path = file_dir / (arch + "_training_monitor.json") 25 | 26 | def reset(self,start_at): 27 | if start_at > 0: 28 | if self.json_path is not None: 29 | if self.json_path.exists(): 30 | self.H = load_json(self.json_path) 31 | for k in self.H.keys(): 32 | self.H[k] = self.H[k][:start_at] 33 | 34 | def epoch_step(self, logs={}): 35 | for (k, v) in logs.items(): 36 | l = self.H.get(k, []) 37 | # np.float32会报错 38 | if not isinstance(v, np.float): 39 | v = round(float(v), 4) 40 | l.append(v) 41 | self.H[k] = l 42 | 43 | # 写入文件 44 | if self.json_path is not None: 45 | save_json(data = self.H,file_path=self.json_path) 46 | 47 | # 保存train图像 48 | if len(self.H["loss"]) == 1: 49 | self.paths = {key: self.file_dir / (self.arch + f'_{key.upper()}') for key in self.H.keys()} 50 | 51 | if len(self.H["loss"]) > 1: 52 | # 指标变化 53 | # 曲线 54 | # 需要成对出现 55 | keys = [key for key, _ in self.H.items() if '_' not in key] 56 | for key in keys: 57 | N = np.arange(0, len(self.H[key])) 58 | plt.style.use("ggplot") 59 | plt.figure() 60 | plt.plot(N, self.H[key], label=f"train_{key}") 61 | # plt.plot(N, self.H[f"valid_{key}"], label=f"valid_{key}") 62 | if self.add_test: 63 | plt.plot(N, self.H[f"test_{key}"], label=f"test_{key}") 64 | plt.legend() 65 | plt.xlabel("Epoch #") 66 | plt.ylabel(key) 67 | plt.title(f"Training {key} [Epoch {len(self.H[key])}]") 68 | 
plt.savefig(str(self.paths[key])) 69 | plt.close() 70 | -------------------------------------------------------------------------------- /pydatagrand/callback/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | def warmup_cosine(x, warmup=0.002): 5 | if x < warmup: 6 | return x/warmup 7 | return 0.5 * (1.0 + torch.cos(math.pi * x)) 8 | 9 | def warmup_constant(x, warmup=0.002): 10 | if x < warmup: 11 | return x/warmup 12 | return 1.0 13 | 14 | def warmup_linear(x, warmup=0.002): 15 | if x < warmup: 16 | return x/warmup 17 | return 1.0 - x 18 | 19 | SCHEDULES = { 20 | 'warmup_cosine':warmup_cosine, 21 | 'warmup_constant':warmup_constant, 22 | 'warmup_linear':warmup_linear, 23 | } 24 | -------------------------------------------------------------------------------- /pydatagrand/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/pydatagrand/common/__init__.py -------------------------------------------------------------------------------- /pydatagrand/configs/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/configs/base.py: -------------------------------------------------------------------------------- 1 | 2 | from pathlib import Path 3 | BASE_DIR = Path('pydatagrand') 4 | config = { 5 | 'data_dir': BASE_DIR / 'dataset', 6 | 'log_dir': BASE_DIR / 'output/log', 7 | 'writer_dir': BASE_DIR / "output/TSboard", 8 | 'figure_dir': BASE_DIR / "output/figure", 9 | 'checkpoint_dir': BASE_DIR / "output/checkpoints", 10 | 'cache_dir': BASE_DIR / 'model/', 11 | 'result_dir': BASE_DIR / "output/result", 12 | } 13 | 14 | -------------------------------------------------------------------------------- /pydatagrand/configs/bert_config.py: -------------------------------------------------------------------------------- 1 | bert_base_config = { 2 | "attention_probs_dropout_prob": 0.2, 3 | "directionality": "bidi", 4 | "hidden_act": "gelu", 5 | "hidden_dropout_prob": 0.2, 6 | "hidden_size": 768, 7 | "initializer_range": 0.02, 8 | "intermediate_size": 3072, 9 | "max_position_embeddings": 512, 10 | "num_attention_heads": 12, 11 | "num_hidden_layers": 12, 12 | "pooler_fc_size": 768, 13 | "pooler_num_attention_heads": 12, 14 | "pooler_num_fc_layers": 3, 15 | "pooler_size_per_head": 128, 16 | "pooler_type": "first_token_transform", 17 | "type_vocab_size": 2, 18 | "vocab_size": 21228 19 | } 20 | -------------------------------------------------------------------------------- /pydatagrand/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/dataset/corpus/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/dataset/corpus/train/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/doc/__init__.py: 
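callback/utils.py above keys three warmup functions by name in SCHEDULES; each maps training progress in [0, 1] to a multiplier on the base learning rate. A small check of `warmup_linear` (the pure-Python one), assuming `pydatagrand` is importable:

```python
# warmup_linear ramps the multiplier up over the warmup fraction of training,
# then decays it linearly towards zero.
from pydatagrand.callback.utils import SCHEDULES

base_lr = 2e-5
warmup_linear = SCHEDULES["warmup_linear"]
total_steps = 1000

for step in (0, 50, 100, 500, 999):
    progress = step / total_steps
    print(step, base_lr * warmup_linear(progress, warmup=0.1))
```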
-------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/embedding/w2v_embedding.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | class LoadEmbedding(object): 6 | ''' 7 | word_index:{word:id} 8 | ''' 9 | 10 | def __init__(self, max_features, word_index): 11 | self.max_features = max_features 12 | self.word_index = word_index 13 | 14 | def load_glove(self, embedding_path): 15 | ''' 16 | embedding_path = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt' 17 | ''' 18 | 19 | def get_coefs(word, *arr): 20 | return word, np.asarray(arr, dtype='float32')[:300] 21 | 22 | embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(embedding_path)) 23 | all_embs = np.stack(embeddings_index.values()) 24 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 25 | embed_size = all_embs.shape[1] 26 | nb_words = min(self.max_features, len(self.word_index)) 27 | embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) 28 | for word, i in self.word_index.items(): 29 | if i >= self.max_features: continue 30 | embedding_vector = embeddings_index.get(word) 31 | # ALLmight 32 | if embedding_vector is not None: 33 | embedding_matrix[i] = embedding_vector 34 | else: 35 | embedding_vector = embeddings_index.get(word.capitalize()) 36 | if embedding_vector is not None: 37 | embedding_matrix[i] = embedding_vector 38 | return embedding_matrix 39 | 40 | def load_fasttext(self, embedding_path): 41 | ''' 42 | embedding_path = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec' 43 | ''' 44 | 45 | def get_coefs(word, *arr): 46 | return word, np.asarray(arr, dtype='float32') 47 | 48 | embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(embedding_path) if len(o) > 100) 49 | all_embs = np.stack(embeddings_index.values()) 50 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 51 | embed_size = all_embs.shape[1] 52 | nb_words = min(self.max_features, len(self.word_index)) 53 | embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) 54 | for word, i in self.word_index.items(): 55 | if i >= self.max_features: continue 56 | embedding_vector = embeddings_index.get(word) 57 | if embedding_vector is not None: embedding_matrix[i] = embedding_vector 58 | return embedding_matrix 59 | 60 | def load_para(self, embedding_path): 61 | ''' 62 | embedding_path = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt' 63 | ''' 64 | 65 | def get_coefs(word, *arr): 66 | return word, np.asarray(arr, dtype='float32') 67 | 68 | embeddings_index = dict( 69 | get_coefs(*o.split(" ")) for o in open(embedding_path, encoding="utf8", errors='ignore') if len(o) > 100) 70 | all_embs = np.stack(embeddings_index.values()) 71 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 72 | embed_size = all_embs.shape[1] 73 | # word_index = tokenizer.word_index 74 | nb_words = min(self.max_features, len(self.word_index)) 75 | embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) 76 | for word, i in self.word_index.items(): 77 | if i >= self.max_features: continue 78 | embedding_vector = embeddings_index.get(word) 79 | if 
embedding_vector is not None: embedding_matrix[i] = embedding_vector 80 | 81 | return embedding_matrix 82 | 83 | def load_custom_embedding(self, embedding_path): 84 | ''' 85 | embedding_path = '../input/embeddings/word2vec.bin' 86 | ''' 87 | 88 | def get_coefs(word, *arr): 89 | return word, np.asarray(arr, dtype='float32') 90 | 91 | embeddings_index = dict( 92 | get_coefs(*o.strip("\n").split(" ")) for o in open(embedding_path, 'r') if o.strip("\n").split(" ")[0]!='') 93 | all_embs = np.stack(embeddings_index.values()) 94 | emb_mean, emb_std = all_embs.mean(), all_embs.std() 95 | embed_size = all_embs.shape[1] 96 | # word_index = tokenizer.word_index 97 | nb_words = min(self.max_features, len(self.word_index)) 98 | embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size)) 99 | for word, i in self.word_index.items(): 100 | if i >= self.max_features: continue 101 | embedding_vector = embeddings_index.get(word) 102 | if embedding_vector is not None: embedding_matrix[i] = embedding_vector 103 | return embedding_matrix 104 | -------------------------------------------------------------------------------- /pydatagrand/io/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/io/vocabulary.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | from collections import OrderedDict 3 | from ..common.tools import save_pickle 4 | from ..common.tools import load_pickle 5 | 6 | 7 | class Vocabulary(object): 8 | def __init__(self, max_size=None, 9 | min_freq=None, 10 | pad_token="[PAD]", 11 | unk_token="[UNK]", 12 | cls_token="[CLS]", 13 | sep_token="[SEP]", 14 | mask_token="[MASK]", 15 | add_unused=False): 16 | self.max_size = max_size 17 | self.min_freq = min_freq 18 | self.cls_token = cls_token 19 | self.sep_token = sep_token 20 | self.pad_token = pad_token 21 | self.mask_token = mask_token 22 | self.unk_token = unk_token 23 | self.word2idx = {} 24 | self.idx2word = None 25 | self.rebuild = True 26 | self.add_unused = add_unused 27 | self.word_counter = Counter() 28 | self.reset() 29 | 30 | def reset(self): 31 | ctrl_symbols = [self.pad_token, self.unk_token, self.cls_token, self.sep_token, self.mask_token] 32 | for index, syb in enumerate(ctrl_symbols): 33 | self.word2idx[syb] = index 34 | 35 | if self.add_unused: 36 | for i in range(20): 37 | self.word2idx[f'[UNUSED{i}]'] = len(self.word2idx) 38 | 39 | def update(self, word_list): 40 | ''' 41 | 依次增加序列中词在词典中的出现频率 42 | :param word_list: 43 | :return: 44 | ''' 45 | self.word_counter.update(word_list) 46 | 47 | def add(self, word): 48 | ''' 49 | 增加一个新词在词典中的出现频率 50 | :param word: 51 | :return: 52 | ''' 53 | self.word_counter[word] += 1 54 | 55 | def has_word(self, word): 56 | ''' 57 | 检查词是否被记录 58 | :param word: 59 | :return: 60 | ''' 61 | return word in self.word2idx 62 | 63 | def to_index(self, word): 64 | ''' 65 | 将词转为数字. 若词不再词典中被记录, 将视为 unknown, 若 ``unknown=None`` , 将抛出 66 | :param word: 67 | :return: 68 | ''' 69 | if word in self.word2idx: 70 | return self.word2idx[word] 71 | if self.unk_token is not None: 72 | return self.word2idx[self.unk_token] 73 | else: 74 | raise ValueError("word {} not in vocabulary".format(word)) 75 | 76 | def unknown_idx(self): 77 | """ 78 | unknown 对应的数字. 
79 | """ 80 | if self.unk_token is None: 81 | return None 82 | return self.word2idx[self.unk_token] 83 | 84 | def padding_idx(self): 85 | """ 86 | padding 对应的数字 87 | """ 88 | if self.pad_token is None: 89 | return None 90 | return self.word2idx[self.pad_token] 91 | 92 | def to_word(self, idx): 93 | """ 94 | 给定一个数字, 将其转为对应的词. 95 | 96 | :param int idx: the index 97 | :return str word: the word 98 | """ 99 | return self.idx2word[idx] 100 | 101 | def build_vocab(self): 102 | max_size = min(self.max_size, len(self.word_counter)) if self.max_size else None 103 | words = self.word_counter.most_common(max_size) 104 | if self.min_freq is not None: 105 | words = filter(lambda kv: kv[1] >= self.min_freq, words) 106 | if self.word2idx: 107 | words = filter(lambda kv: kv[0] not in self.word2idx, words) 108 | start_idx = len(self.word2idx) 109 | self.word2idx.update({w: i + start_idx for i, (w, _) in enumerate(words)}) 110 | self.build_reverse_vocab() 111 | self.rebuild = False 112 | 113 | def save(self, file_path): 114 | ''' 115 | 保存vocab 116 | :param file_name: 117 | :param pickle_path: 118 | :return: 119 | ''' 120 | mappings = { 121 | "word2idx": self.word2idx, 122 | 'idx2word': self.idx2word 123 | } 124 | save_pickle(data=mappings, file_path=file_path) 125 | 126 | def save_bert_vocab(self, file_path): 127 | ''' 128 | 保存成bert模式 129 | :param file_path: 130 | :return: 131 | ''' 132 | bert_vocab = [x for x, y in self.word2idx.items()] 133 | with open(str(file_path), 'w') as fo: 134 | for token in bert_vocab: 135 | fo.write(token + "\n") 136 | 137 | def load_bert_vocab(self, vocab_file): 138 | """Loads a vocabulary file into a dictionary.""" 139 | vocab = OrderedDict() 140 | index = 0 141 | with open(vocab_file, "r", encoding="utf-8") as reader: 142 | while True: 143 | token = reader.readline() 144 | if not token: 145 | break 146 | token = token.strip() 147 | vocab[token] = index 148 | index += 1 149 | return list(vocab.keys()) 150 | 151 | def load_from_file(self, file_path): 152 | ''' 153 | 从文件组红加载vocab 154 | :param file_name: 155 | :param pickle_path: 156 | :return: 157 | ''' 158 | mappings = load_pickle(input_file=file_path) 159 | self.idx2word = mappings['idx2word'] 160 | self.word2idx = mappings['word2idx'] 161 | 162 | def build_reverse_vocab(self): 163 | self.idx2word = {i: w for w, i in self.word2idx.items()} 164 | 165 | def read_data(self, data_path): 166 | if data_path.is_dir(): 167 | files = sorted([f for f in data_path.iterdir() if f.exists()]) 168 | else: 169 | files = [data_path] 170 | for file in files: 171 | f = open(file, 'r') 172 | lines = f.readlines() # 读取数据集 173 | for line in lines: 174 | line = line.strip("\n") 175 | words = line.split(" ") 176 | self.update(words) 177 | 178 | def clear(self): 179 | """ 180 | 删除Vocabulary中的词表数据。相当于重新初始化一下。 181 | :return: 182 | """ 183 | self.word_counter.clear() 184 | self.word2idx = None 185 | self.idx2word = None 186 | self.rebuild = True 187 | self.reset() 188 | 189 | def __len__(self): 190 | return len(self.idx2word) 191 | -------------------------------------------------------------------------------- /pydatagrand/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/pydatagrand/model/__init__.py -------------------------------------------------------------------------------- /pydatagrand/model/layers/__init__.py: -------------------------------------------------------------------------------- 1 | 
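io/vocabulary.py above builds a frequency-based vocabulary over whitespace-tokenized lines and can export it in BERT's one-token-per-line vocab.txt format. A minimal usage sketch, assuming `pydatagrand` is importable; the token strings are made-up stand-ins for the competition's anonymized word ids:

```python
# Build a small vocabulary and write it out in BERT vocab.txt format.
from pydatagrand.io.vocabulary import Vocabulary

vocab = Vocabulary(min_freq=1, add_unused=True)   # [UNUSED0..19] reserved for later
corpus_lines = ["17281 342 9023 110", "342 110 17281"]
for line in corpus_lines:
    vocab.update(line.split(" "))
vocab.build_vocab()

print(len(vocab))               # control symbols + unused slots + corpus tokens
print(vocab.to_index("342"))    # id of a seen token
print(vocab.to_index("99999"))  # unseen token falls back to [UNK]
vocab.save_bert_vocab("vocab.txt")
```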
from .attention import MultiHeadAttention 2 | from .cnn import Capsule 3 | from .crf import CRF 4 | from .dgcnn import DGCNN 5 | from .dropouts import SpatialDropout 6 | from .normalization import LayerNorm # fp16 error 7 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/attention.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | class ScaledDotProductAttention(nn.Module): 6 | ''' Scaled Dot-Product Attention ''' 7 | 8 | def __init__(self, temperature, attn_dropout=0.1): 9 | super().__init__() 10 | self.temperature = temperature 11 | self.dropout = nn.Dropout(attn_dropout) 12 | self.softmax = nn.Softmax(dim=2) 13 | 14 | def forward(self, q, k, v, mask=None): 15 | attn = torch.bmm(q, k.transpose(1, 2)) 16 | attn = attn / self.temperature 17 | if mask is not None: 18 | attn = attn.masked_fill((1 - mask).byte(), -np.inf) 19 | attn = self.softmax(attn) 20 | attn = self.dropout(attn) 21 | output = torch.bmm(attn, v) 22 | return output, attn 23 | 24 | class MultiHeadAttention(nn.Module): 25 | def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1): 26 | super().__init__() 27 | self.n_head = n_head 28 | self.d_k = d_k 29 | self.d_v = d_v 30 | self.w_qs = nn.Linear(d_model, n_head * d_k) 31 | self.w_ks = nn.Linear(d_model, n_head * d_k) 32 | self.w_vs = nn.Linear(d_model, n_head * d_v) 33 | nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 34 | nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k))) 35 | nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v))) 36 | self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5)) 37 | self.layer_norm = nn.LayerNorm(d_model) 38 | self.fc = nn.Linear(n_head * d_v, d_model) 39 | nn.init.xavier_normal_(self.fc.weight) 40 | self.dropout = nn.Dropout(dropout) 41 | 42 | def forward(self, q, k, v, mask=None): 43 | d_k, d_v, n_head = self.d_k, self.d_v, self.n_head 44 | sz_b, len_q, _ = q.size() 45 | sz_b, len_k, _ = k.size() 46 | sz_b, len_v, _ = v.size() 47 | residual = q 48 | q = self.w_qs(q).view(sz_b, len_q, n_head, d_k) 49 | k = self.w_ks(k).view(sz_b, len_k, n_head, d_k) 50 | v = self.w_vs(v).view(sz_b, len_v, n_head, d_v) 51 | q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk 52 | k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk 53 | v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv 54 | mask = mask.unsqueeze(1).expand(-1, len_q, -1) 55 | mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x .. 
56 | output, attn = self.attention(q, k, v, mask=mask.byte()) 57 | output = output.view(n_head, sz_b, len_q, d_v) 58 | output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv) 59 | output = self.dropout(self.fc(output)) 60 | output = self.layer_norm(output + residual) 61 | return output, attn -------------------------------------------------------------------------------- /pydatagrand/model/layers/cnn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class Capsule(nn.Module): 6 | def __init__(self, input_dim_capsule=1024, num_capsule=5, dim_capsule=5, routings=4): 7 | super(Capsule, self).__init__() 8 | self.num_capsule = num_capsule 9 | self.dim_capsule = dim_capsule 10 | self.routings = routings 11 | self.activation = self.squash 12 | self.W = nn.Parameter( 13 | nn.init.xavier_normal_(torch.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule))) 14 | 15 | def forward(self, x): 16 | u_hat_vecs = torch.matmul(x, self.W) 17 | batch_size = x.size(0) 18 | input_num_capsule = x.size(1) 19 | u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule, 20 | self.num_capsule, self.dim_capsule)) 21 | u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 22 | 3).contiguous() # (batch_size,num_capsule,input_num_capsule,dim_capsule) 23 | with torch.no_grad(): 24 | b = torch.zeros_like(u_hat_vecs[:, :, :, 0]) 25 | for i in range(self.routings): 26 | c = F.softmax(b, dim=1) # (batch_size,num_capsule,input_num_capsule) 27 | outputs = self.activation(torch.sum(c.unsqueeze(-1) * u_hat_vecs, dim=2)) # bij,bijk->bik 28 | if i < self.routings - 1: 29 | b = (torch.sum(outputs.unsqueeze(2) * u_hat_vecs, dim=-1)) # bik,bijk->bij 30 | return outputs # (batch_size, num_capsule, dim_capsule) 31 | 32 | def squash(self, x, axis=-1): 33 | s_squared_norm = (x ** 2).sum(axis, keepdim=True) 34 | scale = torch.sqrt(s_squared_norm + 1e-7) 35 | return x / scale -------------------------------------------------------------------------------- /pydatagrand/model/layers/crf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | def to_scalar(var): 6 | return var.view(-1).detach().tolist()[0] 7 | 8 | def argmax(vec): 9 | _, idx = torch.max(vec, 1) 10 | return to_scalar(idx) 11 | 12 | def log_sum_exp(vec): 13 | max_score = vec[0, argmax(vec)] 14 | max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1]) 15 | return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast))) 16 | 17 | def argmax_batch(vecs): 18 | _, idx = torch.max(vecs, 1) 19 | return idx 20 | 21 | def log_sum_exp_batch(vecs): 22 | maxi = torch.max(vecs, 1)[0] 23 | maxi_bc = maxi[:, None].repeat(1, vecs.shape[1]) 24 | recti_ = torch.log(torch.sum(torch.exp(vecs - maxi_bc), 1)) 25 | return maxi + recti_ 26 | 27 | class CRF(nn.Module): 28 | def __init__(self,tagset_size,tag_dictionary,device,is_bert=None): 29 | super(CRF,self).__init__() 30 | 31 | self.START_TAG = "" 32 | self.STOP_TAG = "" 33 | if is_bert: 34 | self.START_TAG = "[CLS]" 35 | self.STOP_TAG = "[SEP]" 36 | self.tag_dictionary = tag_dictionary 37 | self.tagset_size = tagset_size 38 | self.device = device 39 | self.transitions = torch.randn(tagset_size, tagset_size) 40 | # self.transitions = torch.zeros(tagset_size, tagset_size) 41 | self.transitions.detach()[self.tag_dictionary[self.START_TAG], :] = -10000 42 | 
self.transitions.detach()[:, self.tag_dictionary[self.STOP_TAG]] = -10000 43 | self.transitions = self.transitions.to(device) 44 | self.transitions = nn.Parameter(self.transitions) 45 | 46 | def _viterbi_decode(self, feats): 47 | backpointers = [] 48 | backscores = [] 49 | scores = [] 50 | init_vvars = (torch.FloatTensor(1, self.tagset_size).to(self.device).fill_(-10000.0)) 51 | init_vvars[0][self.tag_dictionary[self.START_TAG]] = 0 52 | forward_var = init_vvars 53 | 54 | for feat in feats: 55 | next_tag_var = ( 56 | forward_var.view(1, -1).expand(self.tagset_size, self.tagset_size) 57 | + self.transitions 58 | ) 59 | _, bptrs_t = torch.max(next_tag_var, dim=1) 60 | viterbivars_t = next_tag_var[range(len(bptrs_t)), bptrs_t] 61 | forward_var = viterbivars_t + feat 62 | backscores.append(forward_var) 63 | backpointers.append(bptrs_t) 64 | 65 | terminal_var = ( 66 | forward_var 67 | + self.transitions[self.tag_dictionary[self.STOP_TAG]] 68 | ) 69 | terminal_var.detach()[self.tag_dictionary[self.STOP_TAG]] = -10000.0 70 | terminal_var.detach()[self.tag_dictionary[self.START_TAG]] = -10000.0 71 | best_tag_id = argmax(terminal_var.unsqueeze(0)) 72 | best_path = [best_tag_id] 73 | for bptrs_t in reversed(backpointers): 74 | best_tag_id = bptrs_t[best_tag_id] 75 | best_path.append(best_tag_id.item()) 76 | best_scores = [] 77 | for backscore in backscores: 78 | softmax = F.softmax(backscore, dim=0) 79 | _, idx = torch.max(backscore, 0) 80 | prediction = idx.item() 81 | best_scores.append(softmax[prediction].item()) 82 | scores.append([elem.item() for elem in softmax.flatten()]) 83 | swap_best_path, swap_max_score = ( 84 | best_path[0], 85 | scores[-1].index(max(scores[-1])), 86 | ) 87 | scores[-1][swap_best_path], scores[-1][swap_max_score] = ( 88 | scores[-1][swap_max_score], 89 | scores[-1][swap_best_path], 90 | ) 91 | start = best_path.pop() 92 | assert start == self.tag_dictionary[self.START_TAG] 93 | best_path.reverse() 94 | return best_scores, best_path, scores 95 | 96 | def _forward_alg(self, feats, lens_): 97 | init_alphas = torch.FloatTensor(self.tagset_size).fill_(-10000.0) 98 | init_alphas[self.tag_dictionary[self.START_TAG]] = 0.0 99 | 100 | forward_var = torch.zeros( 101 | feats.shape[0], 102 | feats.shape[1] + 1, 103 | feats.shape[2], 104 | dtype=torch.float, 105 | device=self.device, 106 | ) 107 | forward_var[:, 0, :] = init_alphas[None, :].repeat(feats.shape[0], 1) 108 | transitions = self.transitions.view( 109 | 1, self.transitions.shape[0], self.transitions.shape[1] 110 | ).repeat(feats.shape[0], 1, 1) 111 | for i in range(feats.shape[1]): 112 | emit_score = feats[:, i, :] 113 | tag_var = ( 114 | emit_score[:, :, None].repeat(1, 1, transitions.shape[2]) 115 | + transitions 116 | + forward_var[:, i, :][:, :, None] 117 | .repeat(1, 1, transitions.shape[2]) 118 | .transpose(2, 1) 119 | ) 120 | max_tag_var, _ = torch.max(tag_var, dim=2) 121 | tag_var = tag_var - max_tag_var[:, :, None].repeat( 122 | 1, 1, transitions.shape[2] 123 | ) 124 | agg_ = torch.log(torch.sum(torch.exp(tag_var), dim=2)) 125 | cloned = forward_var.clone() 126 | cloned[:, i + 1, :] = max_tag_var + agg_ 127 | forward_var = cloned 128 | forward_var = forward_var[range(forward_var.shape[0]), lens_, :] 129 | terminal_var = forward_var + self.transitions[self.tag_dictionary[self.STOP_TAG]][None, :].repeat(forward_var.shape[0], 1) 130 | alpha = log_sum_exp_batch(terminal_var) 131 | return alpha 132 | 133 | def _score_sentence(self, feats, tags, lens_): 134 | start = 
torch.LongTensor([self.tag_dictionary[self.START_TAG]]).to(self.device) 135 | start = start[None, :].repeat(tags.shape[0], 1) 136 | stop = torch.LongTensor([self.tag_dictionary[self.STOP_TAG]]).to(self.device) 137 | stop = stop[None, :].repeat(tags.shape[0], 1) 138 | pad_start_tags = torch.cat([start, tags], 1) 139 | pad_stop_tags = torch.cat([tags, stop], 1) 140 | for i in range(len(lens_)): 141 | pad_stop_tags[i, lens_[i] :] = self.tag_dictionary[self.STOP_TAG] 142 | score = torch.FloatTensor(feats.shape[0]).to(self.device) 143 | for i in range(feats.shape[0]): 144 | r = torch.LongTensor(range(lens_[i])).to(self.device) 145 | score[i] = torch.sum( 146 | self.transitions[ 147 | pad_stop_tags[i, : lens_[i] + 1], pad_start_tags[i, : lens_[i] + 1] 148 | ] 149 | ) + torch.sum(feats[i, r, tags[i, : lens_[i]]]) 150 | return score 151 | 152 | def _obtain_labels(self, feature, id2label,input_lens): 153 | tags = [] 154 | all_tags = [] 155 | for feats, length in zip(feature, input_lens): 156 | confidences, tag_seq, scores = self._viterbi_decode(feats[:length]) 157 | tags.append([id2label[tag] for tag in tag_seq]) 158 | all_tags.append([[id2label[score_id] for score_id, score in enumerate(score_dist)] for score_dist in scores]) 159 | return tags, all_tags 160 | 161 | def calculate_loss(self, scores, tag_list,lengths): 162 | return self._calculate_loss_old(scores, lengths, tag_list) 163 | 164 | def _calculate_loss_old(self, features, lengths, tags): 165 | forward_score = self._forward_alg(features, lengths) 166 | gold_score = self._score_sentence(features, tags, lengths) 167 | score = forward_score - gold_score 168 | return score.mean() 169 | 170 | 171 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/dropouts.py: -------------------------------------------------------------------------------- 1 | 2 | import torch.nn as nn 3 | 4 | class SpatialDropout(nn.Dropout2d): 5 | def __init__(self, p=0.6): 6 | super(SpatialDropout, self).__init__(p=p) 7 | 8 | def forward(self, x): 9 | x = x.unsqueeze(2) # (N, T, 1, K) 10 | x = x.permute(0, 3, 2, 1) # (N, K, 1, T) 11 | x = self.forward(x) # (N, K, 1, T), some features are masked 12 | x = x.permute(0, 3, 2, 1) # (N, T, 1, K) 13 | x = x.squeeze(2) # (N, T, K) 14 | return x 15 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def nn_init(nn_module, method='xavier'): 7 | for param_name, _ in nn_module.named_parameters(): 8 | if isinstance(nn_module, nn.Sequential): 9 | i, name = param_name.split('.', 1) 10 | param = getattr(nn_module[int(i)], name) 11 | else: 12 | param = getattr(nn_module, param_name) 13 | if param_name.find('weight') > -1: 14 | init_weight(param, method) 15 | elif param_name.find('bias') > -1: 16 | nn.init.uniform_(param, -1e-4, 1e-4) 17 | 18 | 19 | def init_weight(weight, method): 20 | if method == 'orthogonal': 21 | nn.init.orthogonal_(weight) 22 | elif method == 'xavier': 23 | nn.init.xavier_uniform_(weight) 24 | elif method == 'kaiming': 25 | nn.init.kaiming_uniform_(weight) 26 | elif method == 'none': 27 | pass 28 | else: 29 | raise Exception('Unknown init method') 30 | 31 | 32 | class PretrainedEmbedding(nn.Module): 33 | def __init__(self, embedding_matrix, requires_grad=False): 34 | super(PretrainedEmbedding, self).__init__() 35 | embed_size 
= embedding_matrix.shape[1] 36 | max_features = embedding_matrix.shape[0] 37 | self.embedding = nn.Embedding(max_features, embed_size) 38 | self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32)) 39 | self.embedding.weight.requires_grad = requires_grad 40 | 41 | def forward(self, indices): 42 | return self.embedding(indices) 43 | 44 | 45 | class ProjSumEmbedding(nn.Module): 46 | 47 | def __init__(self, embedding_matrices, output_size): 48 | super(ProjSumEmbedding, self).__init__() 49 | assert len(embedding_matrices) > 0 50 | 51 | self.embedding_count = len(embedding_matrices) 52 | self.output_size = output_size 53 | self.embedding_projectors = nn.ModuleList() 54 | for embedding_matrix in embedding_matrices: 55 | embedding_dim = embedding_matrix.shape[1] 56 | projection = nn.Linear(embedding_dim, self.output_size) 57 | nn_init(projection) 58 | 59 | self.embedding_projectors.append(nn.Sequential( 60 | PretrainedEmbedding(embedding_matrix), 61 | projection 62 | )) 63 | 64 | def forward(self, x): 65 | projected = [embedding_projector(x) for embedding_projector in self.embedding_projectors] 66 | return F.relu(sum(projected)) 67 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/linears.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | class FeedForwardNetwork(nn.Module): 6 | def __init__(self, input_size, hidden_size, output_size, dropout_rate=0): 7 | super(FeedForwardNetwork, self).__init__() 8 | self.dropout_rate = dropout_rate 9 | self.linear1 = nn.Linear(input_size, hidden_size) 10 | self.linear2 = nn.Linear(hidden_size, output_size) 11 | 12 | def forward(self, x): 13 | x_proj = F.dropout(F.relu(self.linear1(x)), p=self.dropout_rate, training=self.training) 14 | x_proj = self.linear2(x_proj) 15 | return x_proj 16 | 17 | 18 | class PoolerStartLogits(nn.Module): 19 | def __init__(self, hidden_size, num_classes): 20 | super(PoolerStartLogits, self).__init__() 21 | self.dense = nn.Linear(hidden_size, num_classes) 22 | 23 | def forward(self, hidden_states, p_mask=None): 24 | x = self.dense(hidden_states) 25 | return x 26 | 27 | class PoolerEndLogits(nn.Module): 28 | def __init__(self, hidden_size, num_classes): 29 | super(PoolerEndLogits, self).__init__() 30 | self.dense_0 = nn.Linear(hidden_size, hidden_size) 31 | self.activation = nn.Tanh() 32 | self.LayerNorm = nn.LayerNorm(hidden_size) 33 | self.dense_1 = nn.Linear(hidden_size, num_classes) 34 | 35 | def forward(self, hidden_states, start_positions=None, p_mask=None): 36 | x = self.dense_0(torch.cat([hidden_states, start_positions], dim=-1)) 37 | x = self.activation(x) 38 | x = self.LayerNorm(x) 39 | x = self.dense_1(x) 40 | return x 41 | -------------------------------------------------------------------------------- /pydatagrand/model/layers/normalization.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class LayerNorm(nn.Module): 6 | def __init__(self, hidden_size, eps=1e-12): 7 | """Construct a layernorm module in the TF style (epsilon inside the square root). 
8 | """ 9 | super(LayerNorm, self).__init__() 10 | self.weight = nn.Parameter(torch.ones(hidden_size)) 11 | self.bias = nn.Parameter(torch.zeros(hidden_size)) 12 | self.variance_epsilon = eps 13 | self.bias.data.zero_() 14 | self.weight.data.fill_(1.0) 15 | 16 | def forward(self, x): 17 | u = x.mean(-1, keepdim=True) 18 | s = (x - u).pow(2).mean(-1, keepdim=True) 19 | x = (x - u) / torch.sqrt(s + self.variance_epsilon) 20 | return self.weight * x + self.bias 21 | -------------------------------------------------------------------------------- /pydatagrand/model/nn/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/model/nn/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/pydatagrand/model/nn/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /pydatagrand/model/nn/bert_lstm_crf.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from ..layers import CRF 3 | from ..layers import LayerNorm 4 | from ..pytorch_transformers.modeling_bert import BertPreTrainedModel 5 | from ..pytorch_transformers.modeling_bert import BertModel 6 | 7 | 8 | class BERTLSTMCRF(BertPreTrainedModel): 9 | def __init__(self, config, label2id, device, num_layers=2, lstm_dropout=0.35): 10 | super(BERTLSTMCRF, self).__init__(config) 11 | self.bert = BertModel(config) 12 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 13 | self.classifier = nn.Linear(config.hidden_size, len(label2id)) 14 | self.init_weights() 15 | 16 | self.bilstm = nn.LSTM(input_size=config.hidden_size, 17 | hidden_size=config.hidden_size // 2, 18 | batch_first=True, 19 | num_layers=num_layers, 20 | dropout=lstm_dropout, 21 | bidirectional=True) 22 | self.layer_norm = LayerNorm(config.hidden_size) 23 | self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True) 24 | 25 | def forward(self, input_ids, token_type_ids=None, attention_mask=None): 26 | outputs = self.bert(input_ids, token_type_ids, attention_mask) 27 | sequence_output = outputs[0] 28 | sequence_output = self.dropout(sequence_output) 29 | sequence_output, _ = self.bilstm(sequence_output) 30 | sequence_output = self.layer_norm(sequence_output) 31 | logits = self.classifier(sequence_output) 32 | return logits 33 | 34 | def forward_loss(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, input_lens=None): 35 | features = self.forward(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) 36 | if labels is not None: 37 | return features, self.crf.calculate_loss(features, tag_list=labels, lengths=input_lens) 38 | else: 39 | return features, None 40 | 41 | def unfreeze(self, start_layer=6, end_layer=12): 42 | def children(m): 43 | return m if isinstance(m, (list, tuple)) else list(m.children()) 44 | 45 | def set_trainable_attr(m, b): 46 | m.trainable = b 47 | for p in m.parameters(): 48 | p.requires_grad = b 49 | 50 | def apply_leaf(m, f): 51 | c = children(m) 52 | if isinstance(m, nn.Module): 53 | f(m) 54 | if len(c) > 0: 55 | for l in c: 56 | apply_leaf(l, f) 57 | 58 | def set_trainable(l, b): 59 | apply_leaf(l, lambda m: set_trainable_attr(m, b)) 60 | 61 | # You can 
unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True) 62 | set_trainable(self.bert, False) 63 | for i in range(start_layer, end_layer): 64 | set_trainable(self.bert.encoder.layer[i], True) 65 | -------------------------------------------------------------------------------- /pydatagrand/model/nn/bert_lstm_crf_mdp.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from ..layers import CRF 3 | from ..layers import LayerNorm 4 | from ..pytorch_transformers.modeling_bert import BertPreTrainedModel 5 | from ..pytorch_transformers.modeling_bert import BertModel 6 | 7 | 8 | class BERTLSTMCRFMDP(BertPreTrainedModel): 9 | def __init__(self, config, label2id, device, num_layers=2, lstm_dropout=0.35, mdp_n=5, mdp_p=0.5): 10 | super(BERTLSTMCRFMDP, self).__init__(config) 11 | self.bert = BertModel(config) 12 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 13 | self.classifier = nn.Linear(config.hidden_size, len(label2id)) 14 | self.init_weights() 15 | 16 | self.bilstm = nn.LSTM(input_size=config.hidden_size, 17 | hidden_size=config.hidden_size // 2, 18 | batch_first=True, 19 | num_layers=num_layers, 20 | dropout=lstm_dropout, 21 | bidirectional=True) 22 | self.layer_norm = LayerNorm(config.hidden_size) 23 | self.dropouts = nn.ModuleList([ 24 | nn.Dropout(mdp_p) for _ in range(mdp_n) 25 | ]) 26 | self.crf = CRF(tagset_size=len(label2id), tag_dictionary=label2id, device=device, is_bert=True) 27 | 28 | def forward(self, input_ids, token_type_ids=None, attention_mask=None): 29 | outputs = self.bert(input_ids, token_type_ids, attention_mask) 30 | sequence_output = outputs[0] 31 | sequence_output = self.dropout(sequence_output) 32 | sequence_output, _ = self.bilstm(sequence_output) 33 | sequence_output = self.layer_norm(sequence_output) 34 | for i, dropout in enumerate(self.dropouts): 35 | if i == 0: 36 | logits = self.classifier(dropout(sequence_output)) 37 | else: 38 | logits += self.classifier(dropout(sequence_output)) 39 | return logits / len(self.dropouts) 40 | 41 | def forward_loss(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, input_lens=None): 42 | features = self.forward(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask) 43 | if labels is not None: 44 | return features, self.crf.calculate_loss(features, tag_list=labels, lengths=input_lens) 45 | else: 46 | return features, None 47 | 48 | def unfreeze(self, start_layer=6, end_layer=12): 49 | def children(m): 50 | return m if isinstance(m, (list, tuple)) else list(m.children()) 51 | 52 | def set_trainable_attr(m, b): 53 | m.trainable = b 54 | for p in m.parameters(): 55 | p.requires_grad = b 56 | 57 | def apply_leaf(m, f): 58 | c = children(m) 59 | if isinstance(m, nn.Module): 60 | f(m) 61 | if len(c) > 0: 62 | for l in c: 63 | apply_leaf(l, f) 64 | 65 | def set_trainable(l, b): 66 | apply_leaf(l, lambda m: set_trainable_attr(m, b)) 67 | 68 | # You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True) 69 | set_trainable(self.bert, False) 70 | for i in range(start_layer, end_layer): 71 | set_trainable(self.bert.encoder.layer[i], True) 72 | -------------------------------------------------------------------------------- /pydatagrand/model/nn/bert_lstm_span.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from ..layers import LayerNorm 5 | 
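BERTLSTMCRFMDP above differs from the plain BERT+LSTM+CRF model only in its head: the same linear classifier is applied under several independent dropout masks and the logits are averaged (the "MDP" of scheme 2 in the README). A standalone sketch of that multi-sample dropout head, with illustrative sizes:

```python
# Multi-sample dropout head, mirroring the forward pass of BERTLSTMCRFMDP.
import torch
import torch.nn as nn

class MultiSampleDropoutHead(nn.Module):
    def __init__(self, hidden_size=768, num_labels=13, mdp_n=5, mdp_p=0.5):
        super().__init__()
        self.dropouts = nn.ModuleList([nn.Dropout(mdp_p) for _ in range(mdp_n)])
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, sequence_output):
        logits = None
        for dropout in self.dropouts:
            out = self.classifier(dropout(sequence_output))
            logits = out if logits is None else logits + out
        return logits / len(self.dropouts)

head = MultiSampleDropoutHead()
fake_lstm_output = torch.randn(2, 128, 768)   # (batch, seq_len, hidden_size)
print(head(fake_lstm_output).shape)           # torch.Size([2, 128, 13])
```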
from ..pytorch_transformers.modeling_bert import BertPreTrainedModel 6 | from ..pytorch_transformers.modeling_bert import BertModel 7 | from ..layers.linears import PoolerEndLogits, PoolerStartLogits 8 | 9 | 10 | class BERTLSTMSpan(BertPreTrainedModel): 11 | def __init__(self, config, label2id, num_layers=2, lstm_dropout=0.35, soft_label=False): 12 | super(BERTLSTMSpan, self).__init__(config) 13 | self.soft_label = soft_label 14 | self.num_labels = len(label2id) 15 | self.bert = BertModel(config) 16 | self.dropout = nn.Dropout(config.hidden_dropout_prob) 17 | self.init_weights() 18 | 19 | self.bilstm = nn.LSTM(input_size=config.hidden_size, 20 | hidden_size=config.hidden_size // 2, 21 | batch_first=True, 22 | num_layers=num_layers, 23 | dropout=lstm_dropout, 24 | bidirectional=True) 25 | self.layer_norm = LayerNorm(config.hidden_size) 26 | self.start_fc = PoolerStartLogits(config.hidden_size, self.num_labels) 27 | if soft_label: 28 | self.end_fc = PoolerEndLogits(config.hidden_size + self.num_labels, self.num_labels) 29 | else: 30 | self.end_fc = PoolerEndLogits(config.hidden_size + 1, self.num_labels) 31 | 32 | def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_point=None): 33 | outputs = self.bert(input_ids, token_type_ids, attention_mask) 34 | sequence_output = outputs[0] 35 | sequence_output = self.dropout(sequence_output) 36 | sequence_output, _ = self.bilstm(sequence_output) 37 | sequence_output = self.layer_norm(sequence_output) 38 | ps1 = self.start_fc(sequence_output) 39 | if start_point is not None: 40 | if self.soft_label: 41 | batch_size = input_ids.size(0) 42 | seq_len = input_ids.size(1) 43 | start_logits = torch.FloatTensor(batch_size, seq_len, self.num_labels) 44 | start_logits.zero_() 45 | start_logits = start_logits.to(self.device) 46 | start_logits.scatter_(2, start_point.unsqueeze(2), 1) 47 | else: 48 | start_logits = start_point.unsqueeze(2).float() 49 | 50 | else: 51 | start_logits = F.softmax(ps1, -1) 52 | if not self.soft_label: 53 | start_logits = torch.argmax(start_logits, -1).unsqueeze(2).float() 54 | ps2 = self.end_fc(sequence_output, start_logits) 55 | return ps1, ps2 56 | 57 | def unfreeze(self, start_layer=6, end_layer=12): 58 | def children(m): 59 | return m if isinstance(m, (list, tuple)) else list(m.children()) 60 | 61 | def set_trainable_attr(m, b): 62 | m.trainable = b 63 | for p in m.parameters(): 64 | p.requires_grad = b 65 | 66 | def apply_leaf(m, f): 67 | c = children(m) 68 | if isinstance(m, nn.Module): 69 | f(m) 70 | if len(c) > 0: 71 | for l in c: 72 | apply_leaf(l, f) 73 | 74 | def set_trainable(l, b): 75 | apply_leaf(l, lambda m: set_trainable_attr(m, b)) 76 | 77 | # You can unfreeze the last layer of bert by calling set_trainable(model.bert.encoder.layer[23], True) 78 | set_trainable(self.bert, False) 79 | for i in range(start_layer, end_layer): 80 | set_trainable(self.bert.encoder.layer[i], True) 81 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.2.0" 2 | # Work around to update TensorFlow's absl.logging threshold which alters the 3 | # default Python logging output behavior when present. 
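BERTLSTMSpan above only produces per-token start and end label logits; turning them into entities requires a pairing step that is not part of the model file. The sketch below shows one common heuristic (pair each predicted start with the nearest following end of the same label); it is an illustration, not necessarily the exact decode logic used in run_bert_span.py or train/ner_span_trainer.py:

```python
# Pair start/end label sequences into (label, start, end) spans. Label 0 is "O".
def extract_spans(start_ids, end_ids):
    spans = []
    for i, label in enumerate(start_ids):
        if label == 0:
            continue
        for j in range(i, len(end_ids)):
            if end_ids[j] == label:
                spans.append((label, i, j))
                break
    return spans

# A start of label 2 at position 1 and an end of label 2 at position 3 -> one span.
print(extract_spans([0, 2, 0, 0, 0], [0, 0, 0, 2, 0]))   # [(2, 1, 3)]
```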
4 | # see: https://github.com/abseil/abseil-py/issues/99 5 | # and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493 6 | try: 7 | import absl.logging 8 | absl.logging.set_verbosity('info') 9 | absl.logging.set_stderrthreshold('info') 10 | absl.logging._warn_preinit_stderr = False 11 | except: 12 | pass 13 | 14 | # Tokenizer 15 | from .tokenization_utils import (PreTrainedTokenizer) 16 | from .tokenization_auto import AutoTokenizer 17 | from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer 18 | from .tokenization_openai import OpenAIGPTTokenizer 19 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 20 | from .tokenization_gpt2 import GPT2Tokenizer 21 | from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE 22 | from .tokenization_xlm import XLMTokenizer 23 | from .tokenization_roberta import RobertaTokenizer 24 | from .tokenization_distilbert import DistilBertTokenizer 25 | 26 | # Configurations 27 | from .configuration_utils import PretrainedConfig 28 | from .configuration_auto import AutoConfig 29 | from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 30 | from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 31 | from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 32 | from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 33 | from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 34 | from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP 35 | from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 37 | 38 | # Modeling 39 | from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) 40 | from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, 41 | AutoModelWithLMHead) 42 | 43 | from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, 44 | BertForMaskedLM, BertForNextSentencePrediction, 45 | BertForSequenceClassification, BertForMultipleChoice, 46 | BertForTokenClassification, BertForQuestionAnswering, 47 | load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) 48 | from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, 49 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 50 | load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) 51 | from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, 52 | load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) 53 | from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, 54 | GPT2LMHeadModel, GPT2DoubleHeadsModel, 55 | load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) 56 | from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, 57 | XLNetForSequenceClassification, XLNetForQuestionAnswering, 58 | load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) 59 | from .modeling_xlm import (XLMPreTrainedModel , XLMModel, 60 | XLMWithLMHeadModel, XLMForSequenceClassification, 61 | XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP) 62 | from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, 63 | ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) 64 | from .modeling_distilbert import 
(DistilBertForMaskedLM, DistilBertModel, 65 | DistilBertForSequenceClassification, DistilBertForQuestionAnswering, 66 | DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) 67 | 68 | # Optimization 69 | from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, 70 | WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 71 | 72 | # Files and general utilities 73 | from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, 74 | cached_path, add_start_docstrings, add_end_docstrings, 75 | WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME) 76 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: 5 | print( 6 | "Should be used as one of: \n" 7 | ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" 8 | ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" 9 | ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" 10 | ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 11 | ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" 12 | ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") 13 | else: 14 | if sys.argv[1] == "bert": 15 | try: 16 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 17 | except ImportError: 18 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 19 | "In that case, it requires TensorFlow to be installed. Please see " 20 | "https://www.tensorflow.org/install/ for installation instructions.") 21 | raise 22 | 23 | if len(sys.argv) != 5: 24 | # pylint: disable=line-too-long 25 | print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 26 | else: 27 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 28 | TF_CONFIG = sys.argv.pop() 29 | TF_CHECKPOINT = sys.argv.pop() 30 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 31 | elif sys.argv[1] == "gpt": 32 | from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch 33 | if len(sys.argv) < 4 or len(sys.argv) > 5: 34 | # pylint: disable=line-too-long 35 | print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") 36 | else: 37 | OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] 38 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 39 | if len(sys.argv) == 5: 40 | OPENAI_GPT_CONFIG = sys.argv[4] 41 | else: 42 | OPENAI_GPT_CONFIG = "" 43 | convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, 44 | OPENAI_GPT_CONFIG, 45 | PYTORCH_DUMP_OUTPUT) 46 | elif sys.argv[1] == "transfo_xl": 47 | try: 48 | from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch 49 | except ImportError: 50 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 51 | "In that case, it requires TensorFlow to be installed. 
Please see " 52 | "https://www.tensorflow.org/install/ for installation instructions.") 53 | raise 54 | if len(sys.argv) < 4 or len(sys.argv) > 5: 55 | # pylint: disable=line-too-long 56 | print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 57 | else: 58 | if 'ckpt' in sys.argv[2].lower(): 59 | TF_CHECKPOINT = sys.argv[2] 60 | TF_DATASET_FILE = "" 61 | else: 62 | TF_DATASET_FILE = sys.argv[2] 63 | TF_CHECKPOINT = "" 64 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 65 | if len(sys.argv) == 5: 66 | TF_CONFIG = sys.argv[4] 67 | else: 68 | TF_CONFIG = "" 69 | convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) 70 | elif sys.argv[1] == "gpt2": 71 | try: 72 | from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch 73 | except ImportError: 74 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 75 | "In that case, it requires TensorFlow to be installed. Please see " 76 | "https://www.tensorflow.org/install/ for installation instructions.") 77 | raise 78 | 79 | if len(sys.argv) < 4 or len(sys.argv) > 5: 80 | # pylint: disable=line-too-long 81 | print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") 82 | else: 83 | TF_CHECKPOINT = sys.argv[2] 84 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 85 | if len(sys.argv) == 5: 86 | TF_CONFIG = sys.argv[4] 87 | else: 88 | TF_CONFIG = "" 89 | convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 90 | elif sys.argv[1] == "xlnet": 91 | try: 92 | from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch 93 | except ImportError: 94 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 95 | "In that case, it requires TensorFlow to be installed. Please see " 96 | "https://www.tensorflow.org/install/ for installation instructions.") 97 | raise 98 | 99 | if len(sys.argv) < 5 or len(sys.argv) > 6: 100 | # pylint: disable=line-too-long 101 | print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") 102 | else: 103 | TF_CHECKPOINT = sys.argv[2] 104 | TF_CONFIG = sys.argv[3] 105 | PYTORCH_DUMP_OUTPUT = sys.argv[4] 106 | if len(sys.argv) == 6: 107 | FINETUNING_TASK = sys.argv[5] 108 | else: 109 | FINETUNING_TASK = None 110 | 111 | convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, 112 | TF_CONFIG, 113 | PYTORCH_DUMP_OUTPUT, 114 | FINETUNING_TASK) 115 | elif sys.argv[1] == "xlm": 116 | from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch 117 | 118 | if len(sys.argv) != 4: 119 | # pylint: disable=line-too-long 120 | print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") 121 | else: 122 | XLM_CHECKPOINT_PATH = sys.argv[2] 123 | PYTORCH_DUMP_OUTPUT = sys.argv[3] 124 | 125 | convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) 126 | 127 | if __name__ == '__main__': 128 | main() 129 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ BERT model configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json", 31 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json", 32 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json", 33 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json", 34 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json", 35 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json", 36 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json", 37 | 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json", 38 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json", 39 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json", 40 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json", 41 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json", 42 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 43 | } 44 | 45 | 46 | class BertConfig(PretrainedConfig): 47 | r""" 48 | :class:`~pytorch_transformers.BertConfig` is the configuration class to store the configuration of a 49 | `BertModel`. 50 | 51 | 52 | Arguments: 53 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. 54 | hidden_size: Size of the encoder layers and the pooler layer. 55 | num_hidden_layers: Number of hidden layers in the Transformer encoder. 56 | num_attention_heads: Number of attention heads for each attention layer in 57 | the Transformer encoder. 58 | intermediate_size: The size of the "intermediate" (i.e., feed-forward) 59 | layer in the Transformer encoder. 60 | hidden_act: The non-linear activation function (function or string) in the 61 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 
62 | hidden_dropout_prob: The dropout probabilitiy for all fully connected 63 | layers in the embeddings, encoder, and pooler. 64 | attention_probs_dropout_prob: The dropout ratio for the attention 65 | probabilities. 66 | max_position_embeddings: The maximum sequence length that this model might 67 | ever be used with. Typically set this to something large just in case 68 | (e.g., 512 or 1024 or 2048). 69 | type_vocab_size: The vocabulary size of the `token_type_ids` passed into 70 | `BertModel`. 71 | initializer_range: The sttdev of the truncated_normal_initializer for 72 | initializing all weight matrices. 73 | layer_norm_eps: The epsilon used by LayerNorm. 74 | """ 75 | pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP 76 | 77 | def __init__(self, 78 | vocab_size_or_config_json_file=30522, 79 | hidden_size=768, 80 | num_hidden_layers=12, 81 | num_attention_heads=12, 82 | intermediate_size=3072, 83 | hidden_act="gelu", 84 | hidden_dropout_prob=0.1, 85 | attention_probs_dropout_prob=0.1, 86 | max_position_embeddings=512, 87 | type_vocab_size=2, 88 | initializer_range=0.02, 89 | layer_norm_eps=1e-12, 90 | **kwargs): 91 | super(BertConfig, self).__init__(**kwargs) 92 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 93 | and isinstance(vocab_size_or_config_json_file, unicode)): 94 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 95 | json_config = json.loads(reader.read()) 96 | for key, value in json_config.items(): 97 | self.__dict__[key] = value 98 | elif isinstance(vocab_size_or_config_json_file, int): 99 | self.vocab_size = vocab_size_or_config_json_file 100 | self.hidden_size = hidden_size 101 | self.num_hidden_layers = num_hidden_layers 102 | self.num_attention_heads = num_attention_heads 103 | self.hidden_act = hidden_act 104 | self.intermediate_size = intermediate_size 105 | self.hidden_dropout_prob = hidden_dropout_prob 106 | self.attention_probs_dropout_prob = attention_probs_dropout_prob 107 | self.max_position_embeddings = max_position_embeddings 108 | self.type_vocab_size = type_vocab_size 109 | self.initializer_range = initializer_range 110 | self.layer_norm_eps = layer_norm_eps 111 | else: 112 | raise ValueError("First argument must be either a vocabulary size (int)" 113 | " or the path to a pretrained model config file (str)") 114 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
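# Illustrative sketch, not taken from the project's scripts: the vendored
# package's __init__ (above) re-exports the pytorch-transformers 1.2.0 API, so
# the rest of this repository can import tokenizers, configurations and models
# from the local copy; paths below are placeholders. BertConfig accepts either
# an integer vocabulary size or a JSON config path as its first argument.
#
#   from pydatagrand.model.pytorch_transformers import (
#       BertConfig, BertModel, BertTokenizer, AdamW, WarmupLinearSchedule)
#
#   cfg = BertConfig(vocab_size_or_config_json_file=30522,
#                    hidden_dropout_prob=0.2)           # override one default
#   cfg_json = BertConfig("path/to/bert_config.json")   # placeholder path
#   assert cfg.hidden_size == 768 and cfg.num_hidden_layers == 12
#
# The package can also be run as a converter CLI (see __main__.py above), e.g.
#   python -m pydatagrand.model.pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT
# which additionally requires TensorFlow and the pip-installed
# pytorch_transformers package, because the convert_* scripts import from it.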
15 | """ DistilBERT model configuration """ 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", 30 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" 31 | } 32 | 33 | 34 | class DistilBertConfig(PretrainedConfig): 35 | pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | 37 | def __init__(self, 38 | vocab_size_or_config_json_file=30522, 39 | max_position_embeddings=512, 40 | sinusoidal_pos_embds=True, 41 | n_layers=6, 42 | n_heads=12, 43 | dim=768, 44 | hidden_dim=4*768, 45 | dropout=0.1, 46 | attention_dropout=0.1, 47 | activation='gelu', 48 | initializer_range=0.02, 49 | tie_weights_=True, 50 | qa_dropout=0.1, 51 | seq_classif_dropout=0.2, 52 | **kwargs): 53 | super(DistilBertConfig, self).__init__(**kwargs) 54 | 55 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 56 | and isinstance(vocab_size_or_config_json_file, unicode)): 57 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 58 | json_config = json.loads(reader.read()) 59 | for key, value in json_config.items(): 60 | self.__dict__[key] = value 61 | elif isinstance(vocab_size_or_config_json_file, int): 62 | self.vocab_size = vocab_size_or_config_json_file 63 | self.max_position_embeddings = max_position_embeddings 64 | self.sinusoidal_pos_embds = sinusoidal_pos_embds 65 | self.n_layers = n_layers 66 | self.n_heads = n_heads 67 | self.dim = dim 68 | self.hidden_dim = hidden_dim 69 | self.dropout = dropout 70 | self.attention_dropout = attention_dropout 71 | self.activation = activation 72 | self.initializer_range = initializer_range 73 | self.tie_weights_ = tie_weights_ 74 | self.qa_dropout = qa_dropout 75 | self.seq_classif_dropout = seq_classif_dropout 76 | else: 77 | raise ValueError("First argument must be either a vocabulary size (int)" 78 | " or the path to a pretrained model config file (str)") 79 | @property 80 | def hidden_size(self): 81 | return self.dim 82 | 83 | @property 84 | def num_attention_heads(self): 85 | return self.n_heads 86 | 87 | @property 88 | def num_hidden_layers(self): 89 | return self.n_layers 90 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_gpt2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT-2 configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", 30 | "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", 31 | "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"} 32 | 33 | class GPT2Config(PretrainedConfig): 34 | """Configuration class to store the configuration of a `GPT2Model`. 35 | 36 | Args: 37 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 38 | n_positions: Number of positional embeddings. 39 | n_ctx: Size of the causal mask (usually same as n_positions). 40 | n_embd: Dimensionality of the embeddings and hidden states. 41 | n_layer: Number of hidden layers in the Transformer encoder. 42 | n_head: Number of attention heads for each attention layer in 43 | the Transformer encoder. 44 | layer_norm_epsilon: epsilon to use in the layer norm layers 45 | resid_pdrop: The dropout probabilitiy for all fully connected 46 | layers in the embeddings, encoder, and pooler. 47 | attn_pdrop: The dropout ratio for the attention 48 | probabilities. 49 | embd_pdrop: The dropout ratio for the embeddings. 50 | initializer_range: The sttdev of the truncated_normal_initializer for 51 | initializing all weight matrices. 52 | """ 53 | pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP 54 | 55 | def __init__( 56 | self, 57 | vocab_size_or_config_json_file=50257, 58 | n_positions=1024, 59 | n_ctx=1024, 60 | n_embd=768, 61 | n_layer=12, 62 | n_head=12, 63 | resid_pdrop=0.1, 64 | embd_pdrop=0.1, 65 | attn_pdrop=0.1, 66 | layer_norm_epsilon=1e-5, 67 | initializer_range=0.02, 68 | 69 | num_labels=1, 70 | summary_type='cls_index', 71 | summary_use_proj=True, 72 | summary_activation=None, 73 | summary_proj_to_labels=True, 74 | summary_first_dropout=0.1, 75 | **kwargs 76 | ): 77 | """Constructs GPT2Config. 78 | 79 | Args: 80 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. 81 | n_positions: Number of positional embeddings. 82 | n_ctx: Size of the causal mask (usually same as n_positions). 83 | n_embd: Dimensionality of the embeddings and hidden states. 84 | n_layer: Number of hidden layers in the Transformer encoder. 85 | n_head: Number of attention heads for each attention layer in 86 | the Transformer encoder. 87 | layer_norm_epsilon: epsilon to use in the layer norm layers 88 | resid_pdrop: The dropout probabilitiy for all fully connected 89 | layers in the embeddings, encoder, and pooler. 90 | attn_pdrop: The dropout ratio for the attention 91 | probabilities. 92 | embd_pdrop: The dropout ratio for the embeddings. 93 | initializer_range: The sttdev of the truncated_normal_initializer for 94 | initializing all weight matrices. 
95 | """ 96 | super(GPT2Config, self).__init__(**kwargs) 97 | 98 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 99 | and isinstance(vocab_size_or_config_json_file, unicode)): 100 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 101 | json_config = json.loads(reader.read()) 102 | for key, value in json_config.items(): 103 | self.__dict__[key] = value 104 | elif isinstance(vocab_size_or_config_json_file, int): 105 | self.vocab_size = vocab_size_or_config_json_file 106 | self.n_ctx = n_ctx 107 | self.n_positions = n_positions 108 | self.n_embd = n_embd 109 | self.n_layer = n_layer 110 | self.n_head = n_head 111 | self.resid_pdrop = resid_pdrop 112 | self.embd_pdrop = embd_pdrop 113 | self.attn_pdrop = attn_pdrop 114 | self.layer_norm_epsilon = layer_norm_epsilon 115 | self.initializer_range = initializer_range 116 | 117 | self.num_labels = num_labels 118 | self.summary_type = summary_type 119 | self.summary_use_proj = summary_use_proj 120 | self.summary_activation = summary_activation 121 | self.summary_first_dropout = summary_first_dropout 122 | self.summary_proj_to_labels = summary_proj_to_labels 123 | else: 124 | raise ValueError( 125 | "First argument must be either a vocabulary size (int)" 126 | "or the path to a pretrained model config file (str)" 127 | ) 128 | 129 | @property 130 | def max_position_embeddings(self): 131 | return self.n_positions 132 | 133 | @property 134 | def hidden_size(self): 135 | return self.n_embd 136 | 137 | @property 138 | def num_attention_heads(self): 139 | return self.n_head 140 | 141 | @property 142 | def num_hidden_layers(self): 143 | return self.n_layer 144 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_openai.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ OpenAI GPT configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" 31 | } 32 | 33 | class OpenAIGPTConfig(PretrainedConfig): 34 | """ 35 | Configuration class to store the configuration of a `OpenAIGPTModel`. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. 39 | n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) 
40 | n_positions: Number of positional embeddings. 41 | n_ctx: Size of the causal mask (usually same as n_positions). 42 | n_embd: Dimensionality of the embeddings and hidden states. 43 | n_layer: Number of hidden layers in the Transformer encoder. 44 | n_head: Number of attention heads for each attention layer in 45 | the Transformer encoder. 46 | afn: The non-linear activation function (function or string) in the 47 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 48 | resid_pdrop: The dropout probabilitiy for all fully connected 49 | layers in the embeddings, encoder, and pooler. 50 | attn_pdrop: The dropout ratio for the attention 51 | probabilities. 52 | embd_pdrop: The dropout ratio for the embeddings. 53 | layer_norm_epsilon: epsilon to use in the layer norm layers 54 | initializer_range: The sttdev of the truncated_normal_initializer for 55 | initializing all weight matrices. 56 | predict_special_tokens: should we predict special tokens (when the model has a LM head) 57 | """ 58 | pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP 59 | 60 | def __init__( 61 | self, 62 | vocab_size_or_config_json_file=40478, 63 | n_positions=512, 64 | n_ctx=512, 65 | n_embd=768, 66 | n_layer=12, 67 | n_head=12, 68 | afn="gelu", 69 | resid_pdrop=0.1, 70 | embd_pdrop=0.1, 71 | attn_pdrop=0.1, 72 | layer_norm_epsilon=1e-5, 73 | initializer_range=0.02, 74 | predict_special_tokens=True, 75 | 76 | num_labels=1, 77 | summary_type='cls_index', 78 | summary_use_proj=True, 79 | summary_activation=None, 80 | summary_proj_to_labels=True, 81 | summary_first_dropout=0.1, 82 | **kwargs 83 | ): 84 | """Constructs OpenAIGPTConfig. 85 | """ 86 | super(OpenAIGPTConfig, self).__init__(**kwargs) 87 | 88 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 89 | and isinstance(vocab_size_or_config_json_file, unicode)): 90 | with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: 91 | json_config = json.loads(reader.read()) 92 | for key, value in json_config.items(): 93 | self.__dict__[key] = value 94 | elif isinstance(vocab_size_or_config_json_file, int): 95 | self.vocab_size = vocab_size_or_config_json_file 96 | self.n_ctx = n_ctx 97 | self.n_positions = n_positions 98 | self.n_embd = n_embd 99 | self.n_layer = n_layer 100 | self.n_head = n_head 101 | self.afn = afn 102 | self.resid_pdrop = resid_pdrop 103 | self.embd_pdrop = embd_pdrop 104 | self.attn_pdrop = attn_pdrop 105 | self.layer_norm_epsilon = layer_norm_epsilon 106 | self.initializer_range = initializer_range 107 | self.predict_special_tokens = predict_special_tokens 108 | 109 | self.num_labels = num_labels 110 | self.summary_type = summary_type 111 | self.summary_use_proj = summary_use_proj 112 | self.summary_activation = summary_activation 113 | self.summary_first_dropout = summary_first_dropout 114 | self.summary_proj_to_labels = summary_proj_to_labels 115 | else: 116 | raise ValueError( 117 | "First argument must be either a vocabulary size (int)" 118 | "or the path to a pretrained model config file (str)" 119 | ) 120 | 121 | @property 122 | def max_position_embeddings(self): 123 | return self.n_positions 124 | 125 | @property 126 | def hidden_size(self): 127 | return self.n_embd 128 | 129 | @property 130 | def num_attention_heads(self): 131 | return self.n_head 132 | 133 | @property 134 | def num_hidden_layers(self): 135 | return self.n_layer 136 | -------------------------------------------------------------------------------- 
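# Illustrative sketch: GPT2Config and OpenAIGPTConfig (above) keep the original
# field names (n_embd, n_head, n_layer, n_positions) but expose the BERT-style
# names through read-only properties, so generic code can treat every config
# class alike.
#
#   from pydatagrand.model.pytorch_transformers import GPT2Config
#
#   cfg = GPT2Config()            # gpt2 defaults
#   assert cfg.hidden_size == cfg.n_embd == 768
#   assert cfg.num_hidden_layers == cfg.n_layer == 12
#   assert cfg.max_position_embeddings == cfg.n_positions == 1024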
/pydatagrand/model/pytorch_transformers/configuration_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ RoBERTa configuration """ 17 | 18 | from __future__ import (absolute_import, division, print_function, 19 | unicode_literals) 20 | 21 | import logging 22 | 23 | from .configuration_bert import BertConfig 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 28 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", 29 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", 30 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", 31 | } 32 | 33 | 34 | class RobertaConfig(BertConfig): 35 | pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP 36 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_transfo_xl.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ Transformer XL configuration """ 17 | 18 | from __future__ import absolute_import, division, print_function, unicode_literals 19 | 20 | import json 21 | import logging 22 | import sys 23 | from io import open 24 | 25 | from .configuration_utils import PretrainedConfig 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { 30 | 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", 31 | } 32 | 33 | class TransfoXLConfig(PretrainedConfig): 34 | """Configuration class to store the configuration of a `TransfoXLModel`. 35 | 36 | Args: 37 | vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. 
38 | cutoffs: cutoffs for the adaptive softmax 39 | d_model: Dimensionality of the model's hidden states. 40 | d_embed: Dimensionality of the embeddings 41 | d_head: Dimensionality of the model's heads. 42 | div_val: divident value for adapative input and softmax 43 | pre_lnorm: apply LayerNorm to the input instead of the output 44 | d_inner: Inner dimension in FF 45 | n_layer: Number of hidden layers in the Transformer encoder. 46 | n_head: Number of attention heads for each attention layer in 47 | the Transformer encoder. 48 | tgt_len: number of tokens to predict 49 | ext_len: length of the extended context 50 | mem_len: length of the retained previous heads 51 | same_length: use the same attn length for all tokens 52 | proj_share_all_but_first: True to share all but first projs, False not to share. 53 | attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. 54 | clamp_len: use the same pos embeddings after clamp_len 55 | sample_softmax: number of samples in sampled softmax 56 | adaptive: use adaptive softmax 57 | tie_weight: tie the word embedding and softmax weights 58 | dropout: The dropout probabilitiy for all fully connected 59 | layers in the embeddings, encoder, and pooler. 60 | dropatt: The dropout ratio for the attention probabilities. 61 | untie_r: untie relative position biases 62 | embd_pdrop: The dropout ratio for the embeddings. 63 | init: parameter initializer to use 64 | init_range: parameters initialized by U(-init_range, init_range). 65 | proj_init_std: parameters initialized by N(0, init_std) 66 | init_std: parameters initialized by N(0, init_std) 67 | """ 68 | pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP 69 | 70 | def __init__(self, 71 | vocab_size_or_config_json_file=267735, 72 | cutoffs=[20000, 40000, 200000], 73 | d_model=1024, 74 | d_embed=1024, 75 | n_head=16, 76 | d_head=64, 77 | d_inner=4096, 78 | div_val=4, 79 | pre_lnorm=False, 80 | n_layer=18, 81 | tgt_len=128, 82 | ext_len=0, 83 | mem_len=1600, 84 | clamp_len=1000, 85 | same_length=True, 86 | proj_share_all_but_first=True, 87 | attn_type=0, 88 | sample_softmax=-1, 89 | adaptive=True, 90 | tie_weight=True, 91 | dropout=0.1, 92 | dropatt=0.0, 93 | untie_r=True, 94 | init="normal", 95 | init_range=0.01, 96 | proj_init_std=0.01, 97 | init_std=0.02, 98 | **kwargs): 99 | """Constructs TransfoXLConfig. 
100 | """ 101 | super(TransfoXLConfig, self).__init__(**kwargs) 102 | 103 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 104 | and isinstance(vocab_size_or_config_json_file, unicode)): 105 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 106 | json_config = json.loads(reader.read()) 107 | for key, value in json_config.items(): 108 | self.__dict__[key] = value 109 | elif isinstance(vocab_size_or_config_json_file, int): 110 | self.n_token = vocab_size_or_config_json_file 111 | self.cutoffs = [] 112 | self.cutoffs.extend(cutoffs) 113 | self.tie_weight = tie_weight 114 | if proj_share_all_but_first: 115 | self.tie_projs = [False] + [True] * len(self.cutoffs) 116 | else: 117 | self.tie_projs = [False] + [False] * len(self.cutoffs) 118 | self.d_model = d_model 119 | self.d_embed = d_embed 120 | self.d_head = d_head 121 | self.d_inner = d_inner 122 | self.div_val = div_val 123 | self.pre_lnorm = pre_lnorm 124 | self.n_layer = n_layer 125 | self.n_head = n_head 126 | self.tgt_len = tgt_len 127 | self.ext_len = ext_len 128 | self.mem_len = mem_len 129 | self.same_length = same_length 130 | self.attn_type = attn_type 131 | self.clamp_len = clamp_len 132 | self.sample_softmax = sample_softmax 133 | self.adaptive = adaptive 134 | self.dropout = dropout 135 | self.dropatt = dropatt 136 | self.untie_r = untie_r 137 | self.init = init 138 | self.init_range = init_range 139 | self.proj_init_std = proj_init_std 140 | self.init_std = init_std 141 | else: 142 | raise ValueError("First argument must be either a vocabulary size (int)" 143 | " or the path to a pretrained model config file (str)") 144 | 145 | @property 146 | def max_position_embeddings(self): 147 | return self.tgt_len + self.ext_len + self.mem_len 148 | 149 | @property 150 | def vocab_size(self): 151 | return self.n_token 152 | 153 | @vocab_size.setter 154 | def vocab_size(self, value): 155 | self.n_token = value 156 | 157 | @property 158 | def hidden_size(self): 159 | return self.d_model 160 | 161 | @property 162 | def num_attention_heads(self): 163 | return self.n_head 164 | 165 | @property 166 | def num_hidden_layers(self): 167 | return self.n_layer 168 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/configuration_xlnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
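# Illustrative sketch: TransfoXLConfig (above) stores the vocabulary size as
# n_token behind a vocab_size property and derives max_position_embeddings
# from the segment and memory lengths.
#
#   from pydatagrand.model.pytorch_transformers import TransfoXLConfig
#
#   cfg = TransfoXLConfig()       # transfo-xl-wt103 defaults
#   assert cfg.vocab_size == cfg.n_token == 267735
#   assert cfg.max_position_embeddings == cfg.tgt_len + cfg.ext_len + cfg.mem_len  # 128 + 0 + 1600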
16 | """ XLNet configuration """ 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import json 20 | import logging 21 | import sys 22 | from io import open 23 | 24 | from .configuration_utils import PretrainedConfig 25 | 26 | logger = logging.getLogger(__name__) 27 | 28 | XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = { 29 | 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json", 30 | 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json", 31 | } 32 | 33 | 34 | class XLNetConfig(PretrainedConfig): 35 | """Configuration class to store the configuration of a ``XLNetModel``. 36 | 37 | Args: 38 | vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. 39 | d_model: Size of the encoder layers and the pooler layer. 40 | n_layer: Number of hidden layers in the Transformer encoder. 41 | n_head: Number of attention heads for each attention layer in 42 | the Transformer encoder. 43 | d_inner: The size of the "intermediate" (i.e., feed-forward) 44 | layer in the Transformer encoder. 45 | ff_activation: The non-linear activation function (function or string) in the 46 | encoder and pooler. If string, "gelu", "relu" and "swish" are supported. 47 | untie_r: untie relative position biases 48 | attn_type: 'bi' for XLNet, 'uni' for Transformer-XL 49 | 50 | dropout: The dropout probabilitiy for all fully connected 51 | layers in the embeddings, encoder, and pooler. 52 | dropatt: The dropout ratio for the attention 53 | probabilities. 54 | initializer_range: The sttdev of the truncated_normal_initializer for 55 | initializing all weight matrices. 56 | layer_norm_eps: The epsilon used by LayerNorm. 57 | 58 | dropout: float, dropout rate. 59 | dropatt: float, dropout rate on attention probabilities. 60 | init: str, the initialization scheme, either "normal" or "uniform". 61 | init_range: float, initialize the parameters with a uniform distribution 62 | in [-init_range, init_range]. Only effective when init="uniform". 63 | init_std: float, initialize the parameters with a normal distribution 64 | with mean 0 and stddev init_std. Only effective when init="normal". 65 | mem_len: int, the number of tokens to cache. 66 | reuse_len: int, the number of tokens in the currect batch to be cached 67 | and reused in the future. 68 | bi_data: bool, whether to use bidirectional input pipeline. 69 | Usually set to True during pretraining and False during finetuning. 70 | clamp_len: int, clamp all relative distances larger than clamp_len. 71 | -1 means no clamping. 72 | same_length: bool, whether to use the same attention length for each token. 73 | finetuning_task: name of the glue task on which the model was fine-tuned if any 74 | """ 75 | pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP 76 | 77 | def __init__(self, 78 | vocab_size_or_config_json_file=32000, 79 | d_model=1024, 80 | n_layer=24, 81 | n_head=16, 82 | d_inner=4096, 83 | ff_activation="gelu", 84 | untie_r=True, 85 | attn_type="bi", 86 | 87 | initializer_range=0.02, 88 | layer_norm_eps=1e-12, 89 | 90 | dropout=0.1, 91 | mem_len=None, 92 | reuse_len=None, 93 | bi_data=False, 94 | clamp_len=-1, 95 | same_length=False, 96 | 97 | finetuning_task=None, 98 | num_labels=2, 99 | summary_type='last', 100 | summary_use_proj=True, 101 | summary_activation='tanh', 102 | summary_last_dropout=0.1, 103 | start_n_top=5, 104 | end_n_top=5, 105 | **kwargs): 106 | """Constructs XLNetConfig. 
107 | """ 108 | super(XLNetConfig, self).__init__(**kwargs) 109 | 110 | if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 111 | and isinstance(vocab_size_or_config_json_file, unicode)): 112 | with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: 113 | json_config = json.loads(reader.read()) 114 | for key, value in json_config.items(): 115 | self.__dict__[key] = value 116 | elif isinstance(vocab_size_or_config_json_file, int): 117 | self.n_token = vocab_size_or_config_json_file 118 | self.d_model = d_model 119 | self.n_layer = n_layer 120 | self.n_head = n_head 121 | assert d_model % n_head == 0 122 | self.d_head = d_model // n_head 123 | self.ff_activation = ff_activation 124 | self.d_inner = d_inner 125 | self.untie_r = untie_r 126 | self.attn_type = attn_type 127 | 128 | self.initializer_range = initializer_range 129 | self.layer_norm_eps = layer_norm_eps 130 | 131 | self.dropout = dropout 132 | self.mem_len = mem_len 133 | self.reuse_len = reuse_len 134 | self.bi_data = bi_data 135 | self.clamp_len = clamp_len 136 | self.same_length = same_length 137 | 138 | self.finetuning_task = finetuning_task 139 | self.num_labels = num_labels 140 | self.summary_type = summary_type 141 | self.summary_use_proj = summary_use_proj 142 | self.summary_activation = summary_activation 143 | self.summary_last_dropout = summary_last_dropout 144 | self.start_n_top = start_n_top 145 | self.end_n_top = end_n_top 146 | else: 147 | raise ValueError("First argument must be either a vocabulary size (int)" 148 | " or the path to a pretrained model config file (str)") 149 | 150 | @property 151 | def max_position_embeddings(self): 152 | return -1 153 | 154 | @property 155 | def vocab_size(self): 156 | return self.n_token 157 | 158 | @vocab_size.setter 159 | def vocab_size(self, value): 160 | self.n_token = value 161 | 162 | @property 163 | def hidden_size(self): 164 | return self.d_model 165 | 166 | @property 167 | def num_attention_heads(self): 168 | return self.n_head 169 | 170 | @property 171 | def num_hidden_layers(self): 172 | return self.n_layer 173 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
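# Illustrative sketch: XLNetConfig (above) derives the per-head size from
# d_model and n_head and reports no fixed position limit, since XLNet uses
# relative attention.
#
#   from pydatagrand.model.pytorch_transformers import XLNetConfig
#
#   cfg = XLNetConfig()           # xlnet-large-cased defaults
#   assert cfg.d_head == cfg.d_model // cfg.n_head == 64
#   assert cfg.max_position_embeddings == -1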
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if gpt2_config_file == "": 36 | config = GPT2Config() 37 | else: 38 | config = GPT2Config.from_json_file(gpt2_config_file) 39 | model = GPT2Model(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--gpt2_checkpoint_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--gpt2_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 74 | args.gpt2_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
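# Illustrative sketch: the conversion helper above can also be run as a
# standalone script; the paths are placeholders, and TensorFlow plus the
# pip-installed pytorch_transformers package must be available because the
# script imports from them.
#
#   python convert_gpt2_checkpoint_to_pytorch.py \
#       --gpt2_checkpoint_path /path/to/tf_ckpt \
#       --pytorch_dump_folder_path /path/to/output_dir \
#       --gpt2_config_file /path/to/gpt2_config.json    # optional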
15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | 33 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 34 | # Construct model 35 | if openai_config_file == "": 36 | config = OpenAIGPTConfig() 37 | else: 38 | config = OpenAIGPTConfig.from_json_file(openai_config_file) 39 | model = OpenAIGPTModel(config) 40 | 41 | # Load weights from numpy 42 | load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 48 | torch.save(model.state_dict(), pytorch_weights_dump_path) 49 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 50 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 51 | f.write(config.to_json_string()) 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = argparse.ArgumentParser() 56 | ## Required parameters 57 | parser.add_argument("--openai_checkpoint_folder_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the TensorFlow checkpoint path.") 62 | parser.add_argument("--pytorch_dump_folder_path", 63 | default = None, 64 | type = str, 65 | required = True, 66 | help = "Path to the output PyTorch model.") 67 | parser.add_argument("--openai_config_file", 68 | default = "", 69 | type = str, 70 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 71 | "This specifies the model architecture.") 72 | args = parser.parse_args() 73 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 74 | args.openai_config_file, 75 | args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | """Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" 17 | 18 | import os 19 | import argparse 20 | import torch 21 | import numpy as np 22 | import tensorflow as tf 23 | from pytorch_transformers import BertModel 24 | 25 | 26 | def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str): 27 | 28 | """ 29 | :param model:BertModel Pytorch model instance to be converted 30 | :param ckpt_dir: Tensorflow model directory 31 | :param model_name: model name 32 | :return: 33 | 34 | Currently supported HF models: 35 | Y BertModel 36 | N BertForMaskedLM 37 | N BertForPreTraining 38 | N BertForMultipleChoice 39 | N BertForNextSentencePrediction 40 | N BertForSequenceClassification 41 | N BertForQuestionAnswering 42 | """ 43 | 44 | tensors_to_transpose = ( 45 | "dense.weight", 46 | "attention.self.query", 47 | "attention.self.key", 48 | "attention.self.value" 49 | ) 50 | 51 | var_map = ( 52 | ('layer.', 'layer_'), 53 | ('word_embeddings.weight', 'word_embeddings'), 54 | ('position_embeddings.weight', 'position_embeddings'), 55 | ('token_type_embeddings.weight', 'token_type_embeddings'), 56 | ('.', '/'), 57 | ('LayerNorm/weight', 'LayerNorm/gamma'), 58 | ('LayerNorm/bias', 'LayerNorm/beta'), 59 | ('weight', 'kernel') 60 | ) 61 | 62 | if not os.path.isdir(ckpt_dir): 63 | os.makedirs(ckpt_dir) 64 | 65 | state_dict = model.state_dict() 66 | 67 | def to_tf_var_name(name:str): 68 | for patt, repl in iter(var_map): 69 | name = name.replace(patt, repl) 70 | return 'bert/{}'.format(name) 71 | 72 | def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session): 73 | tf_dtype = tf.dtypes.as_dtype(tensor.dtype) 74 | tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) 75 | session.run(tf.variables_initializer([tf_var])) 76 | session.run(tf_var) 77 | return tf_var 78 | 79 | tf.reset_default_graph() 80 | with tf.Session() as session: 81 | for var_name in state_dict: 82 | tf_name = to_tf_var_name(var_name) 83 | torch_tensor = state_dict[var_name].numpy() 84 | if any([x in var_name for x in tensors_to_transpose]): 85 | torch_tensor = torch_tensor.T 86 | tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) 87 | tf.keras.backend.set_value(tf_var, torch_tensor) 88 | tf_weight = session.run(tf_var) 89 | print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor))) 90 | 91 | saver = tf.train.Saver(tf.trainable_variables()) 92 | saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) 93 | 94 | 95 | def main(raw_args=None): 96 | parser = argparse.ArgumentParser() 97 | parser.add_argument("--model_name", 98 | type=str, 99 | required=True, 100 | help="model name e.g. 
bert-base-uncased") 101 | parser.add_argument("--cache_dir", 102 | type=str, 103 | default=None, 104 | required=False, 105 | help="Directory containing pytorch model") 106 | parser.add_argument("--pytorch_model_path", 107 | type=str, 108 | required=True, 109 | help="/path/to/.bin") 110 | parser.add_argument("--tf_cache_dir", 111 | type=str, 112 | required=True, 113 | help="Directory in which to save tensorflow model") 114 | args = parser.parse_args(raw_args) 115 | 116 | model = BertModel.from_pretrained( 117 | pretrained_model_name_or_path=args.model_name, 118 | state_dict=torch.load(args.pytorch_model_path), 119 | cache_dir=args.cache_dir 120 | ) 121 | 122 | convert_pytorch_checkpoint_to_tf( 123 | model=model, 124 | ckpt_dir=args.tf_cache_dir, 125 | model_name=args.model_name 126 | ) 127 | 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert 25 | 26 | import logging 27 | logging.basicConfig(level=logging.INFO) 28 | 29 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 30 | # Initialise PyTorch model 31 | config = BertConfig.from_json_file(bert_config_file) 32 | print("Building PyTorch model from configuration: {}".format(str(config))) 33 | model = BertForPreTraining(config) 34 | 35 | # Load weights from tf checkpoint 36 | load_tf_weights_in_bert(model, config, tf_checkpoint_path) 37 | 38 | # Save pytorch-model 39 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 40 | torch.save(model.state_dict(), pytorch_dump_path) 41 | 42 | 43 | if __name__ == "__main__": 44 | parser = argparse.ArgumentParser() 45 | ## Required parameters 46 | parser.add_argument("--tf_checkpoint_path", 47 | default = None, 48 | type = str, 49 | required = True, 50 | help = "Path to the TensorFlow checkpoint path.") 51 | parser.add_argument("--bert_config_file", 52 | default = None, 53 | type = str, 54 | required = True, 55 | help = "The config json file corresponding to the pre-trained BERT model. 
\n" 56 | "This specifies the model architecture.") 57 | parser.add_argument("--pytorch_dump_path", 58 | default = None, 59 | type = str, 60 | required = True, 61 | help = "Path to the output PyTorch model.") 62 | args = parser.parse_args() 63 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 64 | args.bert_config_file, 65 | args.pytorch_dump_path) 66 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert Transformer XL checkpoint and datasets.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import os 21 | import sys 22 | from io import open 23 | 24 | import torch 25 | 26 | import pytorch_transformers.tokenization_transfo_xl as data_utils 27 | 28 | from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME 29 | from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel, 30 | load_tf_weights_in_transfo_xl) 31 | from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES) 32 | 33 | if sys.version_info[0] == 2: 34 | import cPickle as pickle 35 | else: 36 | import pickle 37 | 38 | import logging 39 | logging.basicConfig(level=logging.INFO) 40 | 41 | # We do this to be able to load python 2 datasets pickles 42 | # See e.g. 
https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918 43 | data_utils.Vocab = data_utils.TransfoXLTokenizer 44 | data_utils.Corpus = data_utils.TransfoXLCorpus 45 | sys.modules['data_utils'] = data_utils 46 | sys.modules['vocabulary'] = data_utils 47 | 48 | def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path, 49 | transfo_xl_config_file, 50 | pytorch_dump_folder_path, 51 | transfo_xl_dataset_file): 52 | if transfo_xl_dataset_file: 53 | # Convert a pre-processed corpus (see original TensorFlow repo) 54 | with open(transfo_xl_dataset_file, "rb") as fp: 55 | corpus = pickle.load(fp, encoding="latin1") 56 | # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term) 57 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file'] 58 | print("Save vocabulary to {}".format(pytorch_vocab_dump_path)) 59 | corpus_vocab_dict = corpus.vocab.__dict__ 60 | torch.save(corpus_vocab_dict, pytorch_vocab_dump_path) 61 | 62 | corpus_dict_no_vocab = corpus.__dict__ 63 | corpus_dict_no_vocab.pop('vocab', None) 64 | pytorch_dataset_dump_path = pytorch_dump_folder_path + '/' + CORPUS_NAME 65 | print("Save dataset to {}".format(pytorch_dataset_dump_path)) 66 | torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path) 67 | 68 | if tf_checkpoint_path: 69 | # Convert a pre-trained TensorFlow model 70 | config_path = os.path.abspath(transfo_xl_config_file) 71 | tf_path = os.path.abspath(tf_checkpoint_path) 72 | 73 | print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path)) 74 | # Initialise PyTorch model 75 | if transfo_xl_config_file == "": 76 | config = TransfoXLConfig() 77 | else: 78 | config = TransfoXLConfig.from_json_file(transfo_xl_config_file) 79 | print("Building PyTorch model from configuration: {}".format(str(config))) 80 | model = TransfoXLLMHeadModel(config) 81 | 82 | model = load_tf_weights_in_transfo_xl(model, config, tf_path) 83 | # Save pytorch-model 84 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 85 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 86 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 87 | torch.save(model.state_dict(), pytorch_weights_dump_path) 88 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 89 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 90 | f.write(config.to_json_string()) 91 | 92 | 93 | if __name__ == "__main__": 94 | parser = argparse.ArgumentParser() 95 | parser.add_argument("--pytorch_dump_folder_path", 96 | default = None, 97 | type = str, 98 | required = True, 99 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 100 | parser.add_argument("--tf_checkpoint_path", 101 | default = "", 102 | type = str, 103 | help = "An optional path to a TensorFlow checkpoint path to be converted.") 104 | parser.add_argument("--transfo_xl_config_file", 105 | default = "", 106 | type = str, 107 | help = "An optional config json file corresponding to the pre-trained BERT model. 
\n" 108 | "This specifies the model architecture.") 109 | parser.add_argument("--transfo_xl_dataset_file", 110 | default = "", 111 | type = str, 112 | help = "An optional dataset file to be converted in a vocabulary.") 113 | args = parser.parse_args() 114 | convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, 115 | args.transfo_xl_config_file, 116 | args.pytorch_dump_folder_path, 117 | args.transfo_xl_dataset_file) 118 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | import json 21 | from io import open 22 | 23 | import torch 24 | import numpy 25 | 26 | from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME 27 | from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES 28 | 29 | import logging 30 | logging.basicConfig(level=logging.INFO) 31 | 32 | def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path): 33 | # Load checkpoint 34 | chkpt = torch.load(xlm_checkpoint_path, map_location='cpu') 35 | 36 | model = chkpt['model'] 37 | 38 | config = chkpt['params'] 39 | config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray))) 40 | 41 | vocab = chkpt['dico_word2id'] 42 | vocab = dict((s + '' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items()) 43 | 44 | # Save pytorch-model 45 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 46 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 47 | pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['vocab_file'] 48 | 49 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 50 | torch.save(model, pytorch_weights_dump_path) 51 | 52 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 53 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 54 | f.write(json.dumps(config, indent=2) + "\n") 55 | 56 | print("Save vocab file to {}".format(pytorch_config_dump_path)) 57 | with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f: 58 | f.write(json.dumps(vocab, indent=2) + "\n") 59 | 60 | 61 | if __name__ == "__main__": 62 | parser = argparse.ArgumentParser() 63 | ## Required parameters 64 | parser.add_argument("--xlm_checkpoint_path", 65 | default = None, 66 | type = str, 67 | required = True, 68 | help = "Path the official PyTorch dump.") 69 | parser.add_argument("--pytorch_dump_folder_path", 70 | default = None, 71 | type = str, 72 | required = True, 73 | help = "Path to the output PyTorch model.") 74 | args = 
parser.parse_args() 75 | convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path) 76 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import os 22 | import argparse 23 | import torch 24 | 25 | from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME, 26 | XLNetConfig, 27 | XLNetLMHeadModel, XLNetForQuestionAnswering, 28 | XLNetForSequenceClassification, 29 | load_tf_weights_in_xlnet) 30 | 31 | GLUE_TASKS_NUM_LABELS = { 32 | "cola": 2, 33 | "mnli": 3, 34 | "mrpc": 2, 35 | "sst-2": 2, 36 | "sts-b": 1, 37 | "qqp": 2, 38 | "qnli": 2, 39 | "rte": 2, 40 | "wnli": 2, 41 | } 42 | 43 | import logging 44 | logging.basicConfig(level=logging.INFO) 45 | 46 | def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): 47 | # Initialise PyTorch model 48 | config = XLNetConfig.from_json_file(bert_config_file) 49 | 50 | finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" 51 | if finetuning_task in GLUE_TASKS_NUM_LABELS: 52 | print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) 53 | config.finetuning_task = finetuning_task 54 | config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] 55 | model = XLNetForSequenceClassification(config) 56 | elif 'squad' in finetuning_task: 57 | config.finetuning_task = finetuning_task 58 | model = XLNetForQuestionAnswering(config) 59 | else: 60 | model = XLNetLMHeadModel(config) 61 | 62 | # Load weights from tf checkpoint 63 | load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) 64 | 65 | # Save pytorch-model 66 | pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) 67 | pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) 68 | print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) 69 | torch.save(model.state_dict(), pytorch_weights_dump_path) 70 | print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) 71 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 72 | f.write(config.to_json_string()) 73 | 74 | 75 | if __name__ == "__main__": 76 | parser = argparse.ArgumentParser() 77 | ## Required parameters 78 | parser.add_argument("--tf_checkpoint_path", 79 | default = None, 80 | type = str, 81 | required = True, 82 | help = "Path to the TensorFlow checkpoint path.") 83 | parser.add_argument("--xlnet_config_file", 84 | default = None, 85 | type = 
str, 86 | required = True, 87 | help = "The config json file corresponding to the pre-trained XLNet model. \n" 88 | "This specifies the model architecture.") 89 | parser.add_argument("--pytorch_dump_folder_path", 90 | default = None, 91 | type = str, 92 | required = True, 93 | help = "Path to the folder to store the PyTorch model or dataset/vocab.") 94 | parser.add_argument("--finetuning_task", 95 | default = None, 96 | type = str, 97 | help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") 98 | args = parser.parse_args() 99 | print(args) 100 | 101 | convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, 102 | args.xlnet_config_file, 103 | args.pytorch_dump_folder_path, 104 | args.finetuning_task) 105 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lonePatient/daguan_2019_rank9/25875b78f4c22c32d130e47121c38fa7d11ffba5/pydatagrand/model/pytorch_transformers/tests/__init__.py -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/configuration_common_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 HuggingFace Inc. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
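#
# ConfigTester below is a shared helper rather than a TestCase of its own: a model test passes
# itself in as `parent`, names a config class, and supplies the constructor kwargs it wants
# exercised. A minimal usage sketch (BertConfig and hidden_size=37 are illustrative choices,
# not taken from this file):
#
#     class BertConfigTest(unittest.TestCase):
#         def test_config(self):
#             ConfigTester(self, config_class=BertConfig, hidden_size=37).run_common_tests()
#
# run_common_tests() checks that the standard size attributes exist on the config and that the
# supplied kwargs survive a round trip through to_json_string() and to_json_file().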
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import copy 20 | import os 21 | import shutil 22 | import json 23 | import random 24 | import uuid 25 | 26 | import unittest 27 | import logging 28 | 29 | 30 | class ConfigTester(object): 31 | def __init__(self, parent, config_class=None, **kwargs): 32 | self.parent = parent 33 | self.config_class = config_class 34 | self.inputs_dict = kwargs 35 | 36 | def create_and_test_config_common_properties(self): 37 | config = self.config_class(**self.inputs_dict) 38 | self.parent.assertTrue(hasattr(config, 'vocab_size')) 39 | self.parent.assertTrue(hasattr(config, 'hidden_size')) 40 | self.parent.assertTrue(hasattr(config, 'num_attention_heads')) 41 | self.parent.assertTrue(hasattr(config, 'num_hidden_layers')) 42 | 43 | def create_and_test_config_to_json_string(self): 44 | config = self.config_class(**self.inputs_dict) 45 | obj = json.loads(config.to_json_string()) 46 | for key, value in self.inputs_dict.items(): 47 | self.parent.assertEqual(obj[key], value) 48 | 49 | def create_and_test_config_to_json_file(self): 50 | config_first = self.config_class(**self.inputs_dict) 51 | json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") 52 | config_first.to_json_file(json_file_path) 53 | config_second = self.config_class.from_json_file(json_file_path) 54 | os.remove(json_file_path) 55 | self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) 56 | 57 | def run_common_tests(self): 58 | self.create_and_test_config_common_properties() 59 | self.create_and_test_config_to_json_string() 60 | self.create_and_test_config_to_json_file() 61 | 62 | if __name__ == "__main__": 63 | unittest.main() -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # content of conftest.py 2 | 3 | import pytest 4 | 5 | 6 | def pytest_addoption(parser): 7 | parser.addoption( 8 | "--runslow", action="store_true", default=False, help="run slow tests" 9 | ) 10 | 11 | 12 | def pytest_collection_modifyitems(config, items): 13 | if config.getoption("--runslow"): 14 | # --runslow given in cli: do not skip slow tests 15 | return 16 | skip_slow = pytest.mark.skip(reason="need --runslow option to run") 17 | for item in items: 18 | if "slow" in item.keywords: 19 | item.add_marker(skip_slow) 20 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/modeling_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
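#
# The AutoModel tests below take the first shortcut name from BERT_PRETRAINED_MODEL_ARCHIVE_MAP
# and call from_pretrained() on it, so running them downloads pretrained weights and needs
# network access (or a warm cache). They are meant to be run with pytest; per conftest.py above,
# any test marked @pytest.mark.slow is skipped unless --runslow is passed. A hypothetical
# invocation from the repository root:
#
#     pytest pydatagrand/model/pytorch_transformers/tests/modeling_auto_test.py -v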
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from pytorch_transformers import (AutoConfig, BertConfig, 25 | AutoModel, BertModel, 26 | AutoModelWithLMHead, BertForMaskedLM, 27 | AutoModelForSequenceClassification, BertForSequenceClassification, 28 | AutoModelForQuestionAnswering, BertForQuestionAnswering) 29 | from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 30 | 31 | from .modeling_common_test import (CommonTestCases, ids_tensor) 32 | from .configuration_common_test import ConfigTester 33 | 34 | 35 | class AutoModelTest(unittest.TestCase): 36 | def test_model_from_pretrained(self): 37 | logging.basicConfig(level=logging.INFO) 38 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 39 | config = AutoConfig.from_pretrained(model_name) 40 | self.assertIsNotNone(config) 41 | self.assertIsInstance(config, BertConfig) 42 | 43 | model = AutoModel.from_pretrained(model_name) 44 | model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True) 45 | self.assertIsNotNone(model) 46 | self.assertIsInstance(model, BertModel) 47 | for value in loading_info.values(): 48 | self.assertEqual(len(value), 0) 49 | 50 | def test_lmhead_model_from_pretrained(self): 51 | logging.basicConfig(level=logging.INFO) 52 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 53 | config = AutoConfig.from_pretrained(model_name) 54 | self.assertIsNotNone(config) 55 | self.assertIsInstance(config, BertConfig) 56 | 57 | model = AutoModelWithLMHead.from_pretrained(model_name) 58 | model, loading_info = AutoModelWithLMHead.from_pretrained(model_name, output_loading_info=True) 59 | self.assertIsNotNone(model) 60 | self.assertIsInstance(model, BertForMaskedLM) 61 | 62 | def test_sequence_classification_model_from_pretrained(self): 63 | logging.basicConfig(level=logging.INFO) 64 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 65 | config = AutoConfig.from_pretrained(model_name) 66 | self.assertIsNotNone(config) 67 | self.assertIsInstance(config, BertConfig) 68 | 69 | model = AutoModelForSequenceClassification.from_pretrained(model_name) 70 | model, loading_info = AutoModelForSequenceClassification.from_pretrained(model_name, output_loading_info=True) 71 | self.assertIsNotNone(model) 72 | self.assertIsInstance(model, BertForSequenceClassification) 73 | 74 | def test_question_answering_model_from_pretrained(self): 75 | logging.basicConfig(level=logging.INFO) 76 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 77 | config = AutoConfig.from_pretrained(model_name) 78 | self.assertIsNotNone(config) 79 | self.assertIsInstance(config, BertConfig) 80 | 81 | model = AutoModelForQuestionAnswering.from_pretrained(model_name) 82 | model, loading_info = AutoModelForQuestionAnswering.from_pretrained(model_name, output_loading_info=True) 83 | self.assertIsNotNone(model) 84 | self.assertIsInstance(model, BertForQuestionAnswering) 85 | 86 | 87 | if __name__ == "__main__": 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/optimization_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import os 21 | 22 | import torch 23 | 24 | from pytorch_transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, 25 | WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 26 | 27 | from .tokenization_tests_commons import TemporaryDirectory 28 | 29 | 30 | def unwrap_schedule(scheduler, num_steps=10): 31 | lrs = [] 32 | for _ in range(num_steps): 33 | scheduler.step() 34 | lrs.append(scheduler.get_lr()) 35 | return lrs 36 | 37 | def unwrap_and_save_reload_schedule(scheduler, num_steps=10): 38 | lrs = [] 39 | for step in range(num_steps): 40 | scheduler.step() 41 | lrs.append(scheduler.get_lr()) 42 | if step == num_steps // 2: 43 | with TemporaryDirectory() as tmpdirname: 44 | file_name = os.path.join(tmpdirname, 'schedule.bin') 45 | torch.save(scheduler.state_dict(), file_name) 46 | 47 | state_dict = torch.load(file_name) 48 | scheduler.load_state_dict(state_dict) 49 | return lrs 50 | 51 | class OptimizationTest(unittest.TestCase): 52 | 53 | def assertListAlmostEqual(self, list1, list2, tol): 54 | self.assertEqual(len(list1), len(list2)) 55 | for a, b in zip(list1, list2): 56 | self.assertAlmostEqual(a, b, delta=tol) 57 | 58 | def test_adam_w(self): 59 | w = torch.tensor([0.1, -0.2, -0.1], requires_grad=True) 60 | target = torch.tensor([0.4, 0.2, -0.5]) 61 | criterion = torch.nn.MSELoss() 62 | # No warmup, constant schedule, no gradient clipping 63 | optimizer = AdamW(params=[w], lr=2e-1, weight_decay=0.0) 64 | for _ in range(100): 65 | loss = criterion(w, target) 66 | loss.backward() 67 | optimizer.step() 68 | w.grad.detach_() # No zero_grad() function on simple tensors. we do it ourselves. 69 | w.grad.zero_() 70 | self.assertListAlmostEqual(w.tolist(), [0.4, 0.2, -0.5], tol=1e-2) 71 | 72 | 73 | class ScheduleInitTest(unittest.TestCase): 74 | m = torch.nn.Linear(50, 50) 75 | optimizer = AdamW(m.parameters(), lr=10.) 76 | num_steps = 10 77 | 78 | def assertListAlmostEqual(self, list1, list2, tol): 79 | self.assertEqual(len(list1), len(list2)) 80 | for a, b in zip(list1, list2): 81 | self.assertAlmostEqual(a, b, delta=tol) 82 | 83 | def test_constant_scheduler(self): 84 | scheduler = ConstantLRSchedule(self.optimizer) 85 | lrs = unwrap_schedule(scheduler, self.num_steps) 86 | expected_learning_rates = [10.] 
* self.num_steps 87 | self.assertEqual(len(lrs[0]), 1) 88 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 89 | 90 | scheduler = ConstantLRSchedule(self.optimizer) 91 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 92 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 93 | 94 | def test_warmup_constant_scheduler(self): 95 | scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) 96 | lrs = unwrap_schedule(scheduler, self.num_steps) 97 | expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0] 98 | self.assertEqual(len(lrs[0]), 1) 99 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 100 | 101 | scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) 102 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 103 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 104 | 105 | def test_warmup_linear_scheduler(self): 106 | scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) 107 | lrs = unwrap_schedule(scheduler, self.num_steps) 108 | expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0] 109 | self.assertEqual(len(lrs[0]), 1) 110 | self.assertListEqual([l[0] for l in lrs], expected_learning_rates) 111 | 112 | scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) 113 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 114 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 115 | 116 | def test_warmup_cosine_scheduler(self): 117 | scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) 118 | lrs = unwrap_schedule(scheduler, self.num_steps) 119 | expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0] 120 | self.assertEqual(len(lrs[0]), 1) 121 | self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) 122 | 123 | scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) 124 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 125 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 126 | 127 | def test_warmup_cosine_hard_restart_scheduler(self): 128 | scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) 129 | lrs = unwrap_schedule(scheduler, self.num_steps) 130 | expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] 131 | self.assertEqual(len(lrs[0]), 1) 132 | self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) 133 | 134 | scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) 135 | lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) 136 | self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) 137 | 138 | if __name__ == "__main__": 139 | unittest.main() 140 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_auto_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import shutil 21 | import pytest 22 | import logging 23 | 24 | from pytorch_transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer 25 | from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP 26 | from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP 27 | 28 | 29 | class AutoTokenizerTest(unittest.TestCase): 30 | def test_tokenizer_from_pretrained(self): 31 | logging.basicConfig(level=logging.INFO) 32 | for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 33 | tokenizer = AutoTokenizer.from_pretrained(model_name) 34 | self.assertIsNotNone(tokenizer) 35 | self.assertIsInstance(tokenizer, BertTokenizer) 36 | self.assertGreater(len(tokenizer), 0) 37 | 38 | for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: 39 | tokenizer = AutoTokenizer.from_pretrained(model_name) 40 | self.assertIsNotNone(tokenizer) 41 | self.assertIsInstance(tokenizer, GPT2Tokenizer) 42 | self.assertGreater(len(tokenizer), 0) 43 | 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_bert_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
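#
# Like the other tokenizer tests in this directory, this one plugs into
# CommonTestCases.CommonTokenizerTester from tokenization_tests_commons.py: the subclass sets
# tokenizer_class, writes a small fixture vocabulary into self.tmpdirname in setUp(), and
# overrides get_tokenizer() / get_input_output_texts() so the common tests defined there can
# run against it. Only test_sequence_builders() below loads the real "bert-base-uncased"
# vocabulary and therefore needs a download or a populated cache.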
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_bert import (BasicTokenizer, 22 | BertTokenizer, 23 | WordpieceTokenizer, 24 | _is_control, _is_punctuation, 25 | _is_whitespace, VOCAB_FILES_NAMES) 26 | 27 | from .tokenization_tests_commons import CommonTestCases 28 | 29 | class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): 30 | 31 | tokenizer_class = BertTokenizer 32 | 33 | def setUp(self): 34 | super(BertTokenizationTest, self).setUp() 35 | 36 | vocab_tokens = [ 37 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 38 | "##ing", ",", "low", "lowest", 39 | ] 40 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 41 | with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: 42 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 43 | 44 | def get_tokenizer(self, **kwargs): 45 | return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 46 | 47 | def get_input_output_texts(self): 48 | input_text = u"UNwant\u00E9d,running" 49 | output_text = u"unwanted, running" 50 | return input_text, output_text 51 | 52 | def test_full_tokenizer(self): 53 | tokenizer = self.tokenizer_class(self.vocab_file) 54 | 55 | tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") 56 | self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) 57 | self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) 58 | 59 | def test_chinese(self): 60 | tokenizer = BasicTokenizer() 61 | 62 | self.assertListEqual( 63 | tokenizer.tokenize(u"ah\u535A\u63A8zz"), 64 | [u"ah", u"\u535A", u"\u63A8", u"zz"]) 65 | 66 | def test_basic_tokenizer_lower(self): 67 | tokenizer = BasicTokenizer(do_lower_case=True) 68 | 69 | self.assertListEqual( 70 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), 71 | ["hello", "!", "how", "are", "you", "?"]) 72 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) 73 | 74 | def test_basic_tokenizer_no_lower(self): 75 | tokenizer = BasicTokenizer(do_lower_case=False) 76 | 77 | self.assertListEqual( 78 | tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? 
"), 79 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 80 | 81 | def test_wordpiece_tokenizer(self): 82 | vocab_tokens = [ 83 | "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", 84 | "##ing" 85 | ] 86 | 87 | vocab = {} 88 | for (i, token) in enumerate(vocab_tokens): 89 | vocab[token] = i 90 | tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]") 91 | 92 | self.assertListEqual(tokenizer.tokenize(""), []) 93 | 94 | self.assertListEqual( 95 | tokenizer.tokenize("unwanted running"), 96 | ["un", "##want", "##ed", "runn", "##ing"]) 97 | 98 | self.assertListEqual( 99 | tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) 100 | 101 | def test_is_whitespace(self): 102 | self.assertTrue(_is_whitespace(u" ")) 103 | self.assertTrue(_is_whitespace(u"\t")) 104 | self.assertTrue(_is_whitespace(u"\r")) 105 | self.assertTrue(_is_whitespace(u"\n")) 106 | self.assertTrue(_is_whitespace(u"\u00A0")) 107 | 108 | self.assertFalse(_is_whitespace(u"A")) 109 | self.assertFalse(_is_whitespace(u"-")) 110 | 111 | def test_is_control(self): 112 | self.assertTrue(_is_control(u"\u0005")) 113 | 114 | self.assertFalse(_is_control(u"A")) 115 | self.assertFalse(_is_control(u" ")) 116 | self.assertFalse(_is_control(u"\t")) 117 | self.assertFalse(_is_control(u"\r")) 118 | 119 | def test_is_punctuation(self): 120 | self.assertTrue(_is_punctuation(u"-")) 121 | self.assertTrue(_is_punctuation(u"$")) 122 | self.assertTrue(_is_punctuation(u"`")) 123 | self.assertTrue(_is_punctuation(u".")) 124 | 125 | self.assertFalse(_is_punctuation(u"A")) 126 | self.assertFalse(_is_punctuation(u" ")) 127 | 128 | def test_sequence_builders(self): 129 | tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased") 130 | 131 | text = tokenizer.encode("sequence builders") 132 | text_2 = tokenizer.encode("multi-sequence build") 133 | 134 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 135 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 136 | 137 | assert encoded_sentence == [101] + text + [102] 138 | assert encoded_pair == [101] + text + [102] + text_2 + [102] 139 | 140 | if __name__ == '__main__': 141 | unittest.main() 142 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_dilbert_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_distilbert import (DistilBertTokenizer) 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | from .tokenization_bert_test import BertTokenizationTest 25 | 26 | class DistilBertTokenizationTest(BertTokenizationTest): 27 | 28 | tokenizer_class = DistilBertTokenizer 29 | 30 | def get_tokenizer(self, **kwargs): 31 | return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs) 32 | 33 | def test_sequence_builders(self): 34 | tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased") 35 | 36 | text = tokenizer.encode("sequence builders") 37 | text_2 = tokenizer.encode("multi-sequence build") 38 | 39 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 40 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 41 | 42 | assert encoded_sentence == [101] + text + [102] 43 | assert encoded_pair == [101] + text + [102] + text_2 + [102] 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_gpt2_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | from io import open 21 | 22 | from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES 23 | 24 | from .tokenization_tests_commons import CommonTestCases 25 | 26 | class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | 28 | tokenizer_class = GPT2Tokenizer 29 | 30 | def setUp(self): 31 | super(GPT2TokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 34 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 35 | "\u0120", "\u0120l", "\u0120n", 36 | "\u0120lo", "\u0120low", "er", 37 | "\u0120lowest", "\u0120newer", "\u0120wider", ""] 38 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 39 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 40 | self.special_tokens_map = {"unk_token": ""} 41 | 42 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 43 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 44 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 45 | fp.write(json.dumps(vocab_tokens) + "\n") 46 | with open(self.merges_file, "w", encoding="utf-8") as fp: 47 | fp.write("\n".join(merges)) 48 | 49 | def get_tokenizer(self, **kwargs): 50 | kwargs.update(self.special_tokens_map) 51 | return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs) 52 | 53 | def get_input_output_texts(self): 54 | input_text = u"lower newer" 55 | output_text = u" lower newer" 56 | return input_text, output_text 57 | 58 | def test_full_tokenizer(self): 59 | tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 60 | text = "lower newer" 61 | bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] 62 | tokens = tokenizer.tokenize(text) 63 | self.assertListEqual(tokens, bpe_tokens) 64 | 65 | input_tokens = tokens + [tokenizer.unk_token] 66 | input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] 67 | self.assertListEqual( 68 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_openai_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | 28 | tokenizer_class = OpenAIGPTTokenizer 29 | 30 | def setUp(self): 31 | super(OpenAIGPTTokenizationTest, self).setUp() 32 | 33 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 34 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 35 | "w", "r", "t", 36 | "lo", "low", "er", 37 | "low", "lowest", "newer", "wider", ""] 38 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 39 | merges = ["#version: 0.2", "l o", "lo w", "e r", ""] 40 | 41 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 42 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 43 | with open(self.vocab_file, "w") as fp: 44 | fp.write(json.dumps(vocab_tokens)) 45 | with open(self.merges_file, "w") as fp: 46 | fp.write("\n".join(merges)) 47 | 48 | def get_tokenizer(self, **kwargs): 49 | return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs) 50 | 51 | def get_input_output_texts(self): 52 | input_text = u"lower newer" 53 | output_text = u"lower newer" 54 | return input_text, output_text 55 | 56 | 57 | def test_full_tokenizer(self): 58 | tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file) 59 | 60 | text = "lower" 61 | bpe_tokens = ["low", "er"] 62 | tokens = tokenizer.tokenize(text) 63 | self.assertListEqual(tokens, bpe_tokens) 64 | 65 | input_tokens = tokens + [""] 66 | input_bpe_tokens = [14, 15, 20] 67 | self.assertListEqual( 68 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_roberta_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import json 19 | import unittest 20 | from io import open 21 | 22 | from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | 26 | class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester): 27 | tokenizer_class = RobertaTokenizer 28 | 29 | def setUp(self): 30 | super(RobertaTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "\u0120", "\u0120l", "\u0120n", 35 | "\u0120lo", "\u0120low", "er", 36 | "\u0120lowest", "\u0120newer", "\u0120wider", ""] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""] 39 | self.special_tokens_map = {"unk_token": ""} 40 | 41 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 42 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 43 | with open(self.vocab_file, "w", encoding="utf-8") as fp: 44 | fp.write(json.dumps(vocab_tokens) + "\n") 45 | with open(self.merges_file, "w", encoding="utf-8") as fp: 46 | fp.write("\n".join(merges)) 47 | 48 | def get_tokenizer(self, **kwargs): 49 | kwargs.update(self.special_tokens_map) 50 | return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs) 51 | 52 | def get_input_output_texts(self): 53 | input_text = u"lower newer" 54 | output_text = u" lower newer" 55 | return input_text, output_text 56 | 57 | def test_full_tokenizer(self): 58 | tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map) 59 | text = "lower newer" 60 | bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"] 61 | tokens = tokenizer.tokenize(text) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + [tokenizer.unk_token] 65 | input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def roberta_dict_integration_testing(self): 70 | tokenizer = self.get_tokenizer() 71 | 72 | self.assertListEqual( 73 | tokenizer.encode('Hello world!'), 74 | [0, 31414, 232, 328, 2] 75 | ) 76 | self.assertListEqual( 77 | tokenizer.encode('Hello world! cécé herlolip 418'), 78 | [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2] 79 | ) 80 | 81 | def test_sequence_builders(self): 82 | tokenizer = RobertaTokenizer.from_pretrained("roberta-base") 83 | 84 | text = tokenizer.encode("sequence builders") 85 | text_2 = tokenizer.encode("multi-sequence build") 86 | 87 | encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True) 88 | encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True) 89 | 90 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 91 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 92 | 93 | assert encoded_sentence == encoded_text_from_decode 94 | assert encoded_pair == encoded_pair_from_decode 95 | 96 | 97 | if __name__ == '__main__': 98 | unittest.main() 99 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_transfo_xl_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | from io import open 20 | 21 | from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES 22 | 23 | from.tokenization_tests_commons import CommonTestCases 24 | 25 | class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester): 26 | 27 | tokenizer_class = TransfoXLTokenizer 28 | 29 | def setUp(self): 30 | super(TransfoXLTokenizationTest, self).setUp() 31 | 32 | vocab_tokens = [ 33 | "", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un", 34 | "running", ",", "low", "l", 35 | ] 36 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 37 | with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer: 38 | vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) 39 | 40 | def get_tokenizer(self, **kwargs): 41 | kwargs['lower_case'] = True 42 | return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs) 43 | 44 | def get_input_output_texts(self): 45 | input_text = u" UNwanted , running" 46 | output_text = u" unwanted, running" 47 | return input_text, output_text 48 | 49 | def test_full_tokenizer(self): 50 | tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True) 51 | 52 | tokens = tokenizer.tokenize(u" UNwanted , running") 53 | self.assertListEqual(tokens, ["", "unwanted", ",", "running"]) 54 | 55 | self.assertListEqual( 56 | tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7]) 57 | 58 | def test_full_tokenizer_lower(self): 59 | tokenizer = TransfoXLTokenizer(lower_case=True) 60 | 61 | self.assertListEqual( 62 | tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), 63 | ["hello", "!", "how", "are", "you", "?"]) 64 | 65 | def test_full_tokenizer_no_lower(self): 66 | tokenizer = TransfoXLTokenizer(lower_case=False) 67 | 68 | self.assertListEqual( 69 | tokenizer.tokenize(u" \tHeLLo ! how \n Are yoU ? "), 70 | ["HeLLo", "!", "how", "Are", "yoU", "?"]) 71 | 72 | 73 | if __name__ == '__main__': 74 | unittest.main() 75 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_utils_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 HuggingFace Inc.. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
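#
# This test exercises the shared PreTrainedTokenizer base class through a concrete subclass: it
# loads the first shortcut name from GPT2Tokenizer.max_model_input_sizes via from_pretrained()
# (another download) and checks that every registered special token is a string that maps to an
# integer id.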
15 | from __future__ import absolute_import 16 | from __future__ import division 17 | from __future__ import print_function 18 | 19 | import unittest 20 | import six 21 | 22 | from pytorch_transformers import PreTrainedTokenizer 23 | from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer 24 | 25 | class TokenizerUtilsTest(unittest.TestCase): 26 | def check_tokenizer_from_pretrained(self, tokenizer_class): 27 | s3_models = list(tokenizer_class.max_model_input_sizes.keys()) 28 | for model_name in s3_models[:1]: 29 | tokenizer = tokenizer_class.from_pretrained(model_name) 30 | self.assertIsNotNone(tokenizer) 31 | self.assertIsInstance(tokenizer, tokenizer_class) 32 | self.assertIsInstance(tokenizer, PreTrainedTokenizer) 33 | 34 | for special_tok in tokenizer.all_special_tokens: 35 | if six.PY2: 36 | self.assertIsInstance(special_tok, unicode) 37 | else: 38 | self.assertIsInstance(special_tok, str) 39 | special_tok_id = tokenizer.convert_tokens_to_ids(special_tok) 40 | self.assertIsInstance(special_tok_id, int) 41 | 42 | def test_pretrained_tokenizers(self): 43 | self.check_tokenizer_from_pretrained(GPT2Tokenizer) 44 | 45 | if __name__ == "__main__": 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_xlm_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | import json 20 | 21 | from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES 22 | 23 | from .tokenization_tests_commons import CommonTestCases 24 | 25 | class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester): 26 | 27 | tokenizer_class = XLMTokenizer 28 | 29 | def setUp(self): 30 | super(XLMTokenizationTest, self).setUp() 31 | 32 | # Adapted from Sennrich et al. 
2015 and https://github.com/rsennrich/subword-nmt 33 | vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n", 34 | "w", "r", "t", 35 | "lo", "low", "er", 36 | "low", "lowest", "newer", "wider", ""] 37 | vocab_tokens = dict(zip(vocab, range(len(vocab)))) 38 | merges = ["l o 123", "lo w 1456", "e r 1789", ""] 39 | 40 | self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file']) 41 | self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file']) 42 | with open(self.vocab_file, "w") as fp: 43 | fp.write(json.dumps(vocab_tokens)) 44 | with open(self.merges_file, "w") as fp: 45 | fp.write("\n".join(merges)) 46 | 47 | def get_tokenizer(self, **kwargs): 48 | return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs) 49 | 50 | def get_input_output_texts(self): 51 | input_text = u"lower newer" 52 | output_text = u"lower newer" 53 | return input_text, output_text 54 | 55 | def test_full_tokenizer(self): 56 | """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """ 57 | tokenizer = XLMTokenizer(self.vocab_file, self.merges_file) 58 | 59 | text = "lower" 60 | bpe_tokens = ["low", "er"] 61 | tokens = tokenizer.tokenize(text) 62 | self.assertListEqual(tokens, bpe_tokens) 63 | 64 | input_tokens = tokens + [""] 65 | input_bpe_tokens = [14, 15, 20] 66 | self.assertListEqual( 67 | tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) 68 | 69 | def test_sequence_builders(self): 70 | tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048") 71 | 72 | text = tokenizer.encode("sequence builders") 73 | text_2 = tokenizer.encode("multi-sequence build") 74 | 75 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 76 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 77 | 78 | assert encoded_sentence == [1] + text + [1] 79 | assert encoded_pair == [1] + text + [1] + text_2 + [1] 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tests/tokenization_xlnet_test.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
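#
# Unlike the BPE and WordPiece tests above, this one runs against a real SentencePiece model:
# SAMPLE_VOCAB points at tests/fixtures/test_sentencepiece.model, the fixture shipped with
# upstream pytorch-transformers, and loading it requires the sentencepiece package to be
# installed.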
15 | from __future__ import absolute_import, division, print_function, unicode_literals 16 | 17 | import os 18 | import unittest 19 | 20 | from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE) 21 | 22 | from .tokenization_tests_commons import CommonTestCases 23 | 24 | SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), 25 | 'fixtures/test_sentencepiece.model') 26 | 27 | class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester): 28 | 29 | tokenizer_class = XLNetTokenizer 30 | 31 | def setUp(self): 32 | super(XLNetTokenizationTest, self).setUp() 33 | 34 | # We have a SentencePiece fixture for testing 35 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) 36 | tokenizer.save_pretrained(self.tmpdirname) 37 | 38 | def get_tokenizer(self, **kwargs): 39 | return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs) 40 | 41 | def get_input_output_texts(self): 42 | input_text = u"This is a test" 43 | output_text = u"This is a test" 44 | return input_text, output_text 45 | 46 | 47 | def test_full_tokenizer(self): 48 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True) 49 | 50 | tokens = tokenizer.tokenize(u'This is a test') 51 | self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) 52 | 53 | self.assertListEqual( 54 | tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) 55 | 56 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 57 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 58 | u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 59 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 60 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) 61 | ids = tokenizer.convert_tokens_to_ids(tokens) 62 | self.assertListEqual( 63 | ids, [8, 21, 84, 55, 24, 19, 7, 0, 64 | 602, 347, 347, 347, 3, 12, 66, 65 | 46, 72, 80, 6, 0, 4]) 66 | 67 | back_tokens = tokenizer.convert_ids_to_tokens(ids) 68 | self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 69 | u'or', u'n', SPIECE_UNDERLINE + u'in', 70 | SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', 71 | SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 72 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', 73 | u'', u'.']) 74 | 75 | def test_tokenizer_lower(self): 76 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True) 77 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 78 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'', u'i', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', 79 | u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 80 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 81 | SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) 82 | self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), [u"▁he", u"ll", u"o"]) 83 | 84 | def test_tokenizer_no_lower(self): 85 | tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False) 86 | tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") 87 | self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', u'or', 88 | u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', 89 | u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', 90 | 
SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.']) 91 | 92 | def test_sequence_builders(self): 93 | tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased") 94 | 95 | text = tokenizer.encode("sequence builders") 96 | text_2 = tokenizer.encode("multi-sequence build") 97 | 98 | encoded_sentence = tokenizer.add_special_tokens_single_sentence(text) 99 | encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2) 100 | 101 | assert encoded_sentence == text + [4, 3] 102 | assert encoded_pair == text + [4] + text_2 + [4, 3] 103 | 104 | 105 | if __name__ == '__main__': 106 | unittest.main() 107 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tokenization_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Auto Model class. """ 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import logging 20 | 21 | from .tokenization_bert import BertTokenizer 22 | from .tokenization_openai import OpenAIGPTTokenizer 23 | from .tokenization_gpt2 import GPT2Tokenizer 24 | from .tokenization_transfo_xl import TransfoXLTokenizer 25 | from .tokenization_xlnet import XLNetTokenizer 26 | from .tokenization_xlm import XLMTokenizer 27 | from .tokenization_roberta import RobertaTokenizer 28 | from .tokenization_distilbert import DistilBertTokenizer 29 | 30 | logger = logging.getLogger(__name__) 31 | 32 | class AutoTokenizer(object): 33 | r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class 34 | that will be instantiated as one of the tokenizer classes of the library 35 | when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 36 | class method. 37 | 38 | The `from_pretrained()` method take care of returning the correct tokenizer class instance 39 | using pattern matching on the `pretrained_model_name_or_path` string. 40 | 41 | The tokenizer class to instantiate is selected as the first pattern matching 42 | in the `pretrained_model_name_or_path` string (in the following order): 43 | - contains `distilbert`: DistilBertTokenizer (DistilBert model) 44 | - contains `roberta`: RobertaTokenizer (RoBERTa model) 45 | - contains `bert`: BertTokenizer (Bert model) 46 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 47 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 48 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 49 | - contains `xlnet`: XLNetTokenizer (XLNet model) 50 | - contains `xlm`: XLMTokenizer (XLM model) 51 | 52 | This class cannot be instantiated using `__init__()` (throw an error). 
53 | """ 54 | def __init__(self): 55 | raise EnvironmentError("AutoTokenizer is designed to be instantiated " 56 | "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.") 57 | 58 | @classmethod 59 | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): 60 | r""" Instantiate a one of the tokenizer classes of the library 61 | from a pre-trained model vocabulary. 62 | 63 | The tokenizer class to instantiate is selected as the first pattern matching 64 | in the `pretrained_model_name_or_path` string (in the following order): 65 | - contains `distilbert`: DistilBertTokenizer (DistilBert model) 66 | - contains `roberta`: RobertaTokenizer (XLM model) 67 | - contains `bert`: BertTokenizer (Bert model) 68 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 69 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 70 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 71 | - contains `xlnet`: XLNetTokenizer (XLNet model) 72 | - contains `xlm`: XLMTokenizer (XLM model) 73 | 74 | Params: 75 | pretrained_model_name_or_path: either: 76 | 77 | - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. 78 | - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. 79 | - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. 80 | 81 | cache_dir: (`optional`) string: 82 | Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. 83 | 84 | force_download: (`optional`) boolean, default False: 85 | Force to (re-)download the vocabulary files and override the cached versions if they exists. 86 | 87 | proxies: (`optional`) dict, default None: 88 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 89 | The proxies are used on each request. 90 | 91 | inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method. 92 | 93 | kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details. 94 | 95 | Examples:: 96 | 97 | tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. 98 | tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')` 99 | 100 | """ 101 | if 'distilbert' in pretrained_model_name_or_path: 102 | return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 103 | elif 'roberta' in pretrained_model_name_or_path: 104 | return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 105 | elif 'bert' in pretrained_model_name_or_path: 106 | return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 107 | elif 'openai-gpt' in pretrained_model_name_or_path: 108 | return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 109 | elif 'gpt2' in pretrained_model_name_or_path: 110 | return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 111 | elif 'transfo-xl' in pretrained_model_name_or_path: 112 | return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 113 | elif 'xlnet' in pretrained_model_name_or_path: 114 | return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 115 | elif 'xlm' in pretrained_model_name_or_path: 116 | return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 117 | 118 | raise ValueError("Unrecognized model identifier in {}. Should contains one of " 119 | "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " 120 | "'xlm', 'roberta'".format(pretrained_model_name_or_path)) 121 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tokenization_distilbert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for DistilBERT.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_bert import BertTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | } 37 | } 38 | 39 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 40 | 'distilbert-base-uncased': 512, 41 | 'distilbert-base-uncased-distilled-squad': 512, 42 | } 43 | 44 | 45 | class DistilBertTokenizer(BertTokenizer): 46 | r""" 47 | Constructs a DistilBertTokenizer. 
48 | :class:`~pytorch_transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece 49 | 50 | Args: 51 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 52 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 53 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 54 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 55 | minimum of this value (if specified) and the underlying BERT model's sequence length. 56 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 57 | do_wordpiece_only=False 58 | """ 59 | 60 | vocab_files_names = VOCAB_FILES_NAMES 61 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 62 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 63 | -------------------------------------------------------------------------------- /pydatagrand/model/pytorch_transformers/tokenization_roberta.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for RoBERTa.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import sys 20 | import json 21 | import logging 22 | import os 23 | import regex as re 24 | from io import open 25 | 26 | from .tokenization_gpt2 import GPT2Tokenizer 27 | 28 | try: 29 | from functools import lru_cache 30 | except ImportError: 31 | # Just a dummy decorator to get the checks to run on python2 32 | # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
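# (Note added for clarity: the fallback defined just below is a no-op decorator -- it returns the wrapped function unchanged, so nothing is memoized when the real functools.lru_cache is unavailable.)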
33 | def lru_cache(): 34 | return lambda func: func 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | VOCAB_FILES_NAMES = { 39 | 'vocab_file': 'vocab.json', 40 | 'merges_file': 'merges.txt', 41 | } 42 | 43 | PRETRAINED_VOCAB_FILES_MAP = { 44 | 'vocab_file': 45 | { 46 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json", 47 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json", 48 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json", 49 | }, 50 | 'merges_file': 51 | { 52 | 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt", 53 | 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt", 54 | 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt", 55 | }, 56 | } 57 | 58 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 59 | 'roberta-base': 512, 60 | 'roberta-large': 512, 61 | 'roberta-large-mnli': 512, 62 | } 63 | 64 | 65 | class RobertaTokenizer(GPT2Tokenizer): 66 | """ 67 | RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: 68 | - Byte-level Byte-Pair-Encoding 69 | - Requires a space to start the input string => will add a space if there isn't one. 70 | As a consequence, this tokenizer's `encode` and `decode` methods will not preserve 71 | the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"` 72 | """ 73 | vocab_files_names = VOCAB_FILES_NAMES 74 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 75 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 76 | 77 | def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>", 78 | cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs): 79 | super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors, 80 | bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, 81 | sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, 82 | mask_token=mask_token, **kwargs) 83 | 84 | def add_special_tokens_single_sentence(self, token_ids): 85 | """ 86 | Adds special tokens to a sequence for sequence classification tasks. 87 | A RoBERTa sequence has the following format: <s> X </s> 88 | """ 89 | return [self.cls_token_id] + token_ids + [self.sep_token_id] 90 | 91 | def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): 92 | """ 93 | Adds special tokens to a sequence pair for sequence classification tasks.
94 | A RoBERTa sequence pair has the following format: <s> A </s></s> B </s> 95 | """ 96 | sep = [self.sep_token_id] 97 | cls = [self.cls_token_id] 98 | return cls + token_ids_0 + sep + sep + token_ids_1 + sep 99 | -------------------------------------------------------------------------------- /pydatagrand/output/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/checkpoints/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/embedding/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/feature/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/figure/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/log/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/output/result/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/preprocessing/augmentation.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | import numpy as np 3 | import random 4 | 5 | class Augmentator(object): 6 | def __init__(self,is_train_mode = True, proba = 0.5): 7 | self.mode = is_train_mode 8 | self.proba = proba 9 | self.augs = [] 10 | self._reset() 11 | 12 | # Full list of augmentation functions 13 | def _reset(self): 14 | self.augs.append(lambda text: self._shuffle(text)) 15 | self.augs.append(lambda text: self._dropout(text,p = 0.5)) 16 | 17 | # Shuffle the tokens 18 | def _shuffle(self, text): 19 | text = np.random.permutation(text.strip().split()) 20 | return ' '.join(text) 21 | 22 | # Randomly delete some tokens 23 | def _dropout(self, text, p=0.5): 24 | # randomly delete some of the text 25 | text = text.strip().split() 26 | len_ = len(text) 27 | indexs = np.random.choice(len_, int(len_ * p)) 28 | for i in indexs: 29 | text[i] = '' 30 | return ' '.join(text) 31 | 32 | def __call__(self,text,aug_type): 33 | ''' 34 | Use aug_type to distinguish between data modes 35 | ''' 36 | # TTA mode 37 | if 0 <= aug_type <= 2: 38 | pass 39 | # Training mode 40 | if self.mode and random.random() < self.proba: 41 | aug = random.choice(self.augs) 42 | text = aug(text) 43 | return text 44 | -------------------------------------------------------------------------------- /pydatagrand/preprocessing/chinese_preprocessor.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 2 | import re 3 | import jieba 4 | 5 | class
ChinesePreProcessor(object): 6 | def __init__(self,min_len = 2,stopwords_path = None): 7 | self.min_len = min_len 8 | self.stopwords_path = stopwords_path 9 | self.reset() 10 | 11 | def jieba_cut(self,sentence): 12 | ''' 13 | Segment the sentence with jieba 14 | :param sentence: 15 | :return: 16 | ''' 17 | seg_list = jieba.cut(sentence,cut_all=False) 18 | return ' '.join(seg_list) 19 | 20 | def reset(self): 21 | ''' 22 | Load the stopword list 23 | :return: 24 | ''' 25 | if self.stopwords_path: 26 | with open(self.stopwords_path,'r') as fr: 27 | self.stopwords = {} 28 | for line in fr: 29 | word = line.strip(' ').strip('\n') 30 | self.stopwords[word] = 1 31 | 32 | def clean_length(self,sentence): 33 | ''' 34 | Drop text shorter than min_len 35 | :param sentence: 36 | :return: 37 | ''' 38 | if len(sentence) >= self.min_len: 39 | return sentence 40 | 41 | 42 | def full2half(self,sentence): 43 | ''' 44 | Convert full-width characters to half-width 45 | :param sentence: 46 | :return: 47 | ''' 48 | ret_str = '' 49 | for i in sentence: 50 | if ord(i) >= 33 + 65248 and ord(i) <= 126 + 65248: 51 | ret_str += chr(ord(i) - 65248) 52 | else: 53 | ret_str += i 54 | return ret_str 55 | 56 | def remove_stopword(self,sentence): 57 | ''' 58 | Remove stopwords 59 | :param sentence: 60 | :return: 61 | ''' 62 | words = sentence.split() 63 | x = [word for word in words if word not in self.stopwords] 64 | return " ".join(x) 65 | 66 | def get_china(self,sentence): 67 | ''' 68 | Keep only the Chinese characters 69 | :param sentence: 70 | :return: 71 | ''' 72 | zhmodel = re.compile("[\u4e00-\u9fa5]") 73 | words = [x for x in sentence if zhmodel.search(x)] 74 | return ''.join(words) 75 | 76 | def remove_numbers(self,sentence): 77 | ''' 78 | Remove digits 79 | :param sentence: 80 | :return: 81 | ''' 82 | words = sentence.split() 83 | x = [re.sub('\d+','',word) for word in words] 84 | return ' '.join([w for w in x if w !='']) 85 | 86 | def remove_whitespace(self,sentence): 87 | ''' 88 | Remove whitespace 89 | :param sentence: 90 | :return: 91 | ''' 92 | x = ''.join([c for c in sentence if not c.isspace()]) 93 | return x 94 | 95 | def __call__(self, sentence): 96 | x = sentence.strip('\n') 97 | x = self.full2half(x) 98 | # x = self.jieba_cut(x) 99 | # if self.stopwords_path: 100 | # x = self.remove_stopword(x) 101 | x = self.remove_whitespace(x) 102 | x = self.get_china(x) 103 | x = self.clean_length(x) 104 | 105 | return x 106 | -------------------------------------------------------------------------------- /pydatagrand/pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/pretrain/bert/base-uncased/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/pretrain/xlnet/base-cased/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/test/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/test/predicter.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | --------------------------------------------------------------------------------
/pydatagrand/train/__init__.py: -------------------------------------------------------------------------------- 1 | #encoding:utf-8 -------------------------------------------------------------------------------- /pydatagrand/train/ema.py: -------------------------------------------------------------------------------- 1 | class EMA: 2 | def __init__(self, model, mu, level='batch', n=1): 3 | # self.ema_model = copy.deepcopy(model) 4 | self.mu = mu 5 | self.level = level 6 | self.n = n 7 | self.cnt = self.n 8 | self.shadow = {} 9 | for name, param in model.named_parameters(): 10 | if param.requires_grad: 11 | self.shadow[name] = param.data 12 | 13 | def _update(self, model): 14 | for name, param in model.named_parameters(): 15 | if param.requires_grad: 16 | new_average = (1 - self.mu) * param.data + self.mu * self.shadow[name] 17 | self.shadow[name] = new_average.clone() 18 | 19 | def set_weights(self, ema_model): 20 | for name, param in ema_model.named_parameters(): 21 | if param.requires_grad: 22 | param.data = self.shadow[name] 23 | 24 | def on_batch_end(self, model): 25 | if self.level is 'batch': 26 | self.cnt -= 1 27 | if self.cnt == 0: 28 | self._update(model) 29 | self.cnt = self.n 30 | 31 | def on_epoch_end(self, model): 32 | if self.level is 'epoch': 33 | self._update(model) -------------------------------------------------------------------------------- /pydatagrand/train/losses.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | from torch.nn import CrossEntropyLoss 3 | from torch.nn import BCEWithLogitsLoss 4 | 5 | __call__ = ['CrossEntropy', 'BCEWithLogLoss'] 6 | 7 | 8 | class CrossEntropy(object): 9 | def __init__(self, ignore_index=-1): 10 | self.loss_f = CrossEntropyLoss(ignore_index=ignore_index) 11 | 12 | def __call__(self, output, target): 13 | loss = self.loss_f(input=output, target=target) 14 | return loss 15 | 16 | 17 | class BCEWithLogLoss(object): 18 | def __init__(self): 19 | self.loss_fn = BCEWithLogitsLoss() 20 | 21 | def __call__(self, output, target): 22 | loss = self.loss_fn(input=output, target=target) 23 | return loss 24 | 25 | 26 | class SpanLoss(object): 27 | def __init__(self, ignore_index=-100): 28 | self.loss_fn = CrossEntropyLoss(ignore_index=ignore_index) 29 | 30 | def __call__(self, output, target, mask): 31 | active_loss = mask.view(-1) == 1 32 | active_logits = output[active_loss] 33 | active_labels = target[active_loss] 34 | return self.loss_fn(active_logits, active_labels) 35 | -------------------------------------------------------------------------------- /pydatagrand/train/ner_seq_trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ..callback.progressbar import ProgressBar 3 | from ..common.tools import model_device,prepare_device 4 | from ..common.tools import seed_everything 5 | from ..common.tools import AverageMeter 6 | from .ner_utils import SeqEntityScore 7 | from torch.nn.utils import clip_grad_norm_ 8 | 9 | class Trainer(object): 10 | def __init__(self, model, n_gpu, logger, optimizer, lr_scheduler, 11 | label2id, gradient_accumulation_steps, grad_clip=0.0,early_stopping=None, 12 | fp16=None, resume_path=None, training_monitor=None, model_checkpoint=None): 13 | 14 | self.n_gpu = n_gpu 15 | self.model = model 16 | self.logger = logger 17 | self.fp16 = fp16 18 | self.optimizer = optimizer 19 | self.label2id = label2id 20 | self.grad_clip = grad_clip 21 | self.lr_scheduler = lr_scheduler 22 | self.early_stopping = 
early_stopping 23 | self.model_checkpoint = model_checkpoint 24 | self.training_monitor = training_monitor 25 | self.gradient_accumulation_steps = gradient_accumulation_steps 26 | 27 | # self.model, self.device = model_device(n_gpu=self.n_gpu, model=self.model) 28 | self.device ,_ = prepare_device(n_gpu) 29 | self.id2label = {y: x for x, y in label2id.items()} 30 | self.entity_score = SeqEntityScore(self.id2label) 31 | self.start_epoch = 1 32 | self.global_step = 0 33 | if self.fp16: 34 | try: 35 | from apex import amp 36 | except ImportError: 37 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 38 | if resume_path: 39 | self.logger.info(f"\nLoading checkpoint: {resume_path}") 40 | resume_dict = torch.load(resume_path / 'checkpoint_info.bin') 41 | best = resume_dict['epoch'] 42 | self.start_epoch = resume_dict['epoch'] 43 | if self.model_checkpoint: 44 | self.model_checkpoint.best = best 45 | self.logger.info(f"\nCheckpoint '{resume_path}' and epoch {self.start_epoch} loaded") 46 | 47 | def save_info(self, epoch, best): 48 | model_save = self.model.module if hasattr(self.model, 'module') else self.model 49 | state = {"model": model_save, 50 | 'epoch': epoch, 51 | 'best': best} 52 | return state 53 | 54 | def valid_epoch(self, data_loader): 55 | pbar = ProgressBar(n_total=len(data_loader), desc='Evaluating') 56 | self.entity_score.reset() 57 | valid_loss = AverageMeter() 58 | for step, batch in enumerate(data_loader): 59 | batch = tuple(t.to(self.device) for t in batch) 60 | input_ids, input_mask, segment_ids, label_ids, input_lens = batch 61 | input_lens = input_lens.cpu().detach().numpy().tolist() 62 | self.model.eval() 63 | with torch.no_grad(): 64 | features, loss = self.model.forward_loss(input_ids, segment_ids, input_mask, label_ids, input_lens) 65 | tags, _ = self.model.crf._obtain_labels(features, self.id2label, input_lens) 66 | valid_loss.update(val=loss.item(), n=input_ids.size(0)) 67 | pbar(step=step, info={"loss": loss.item()}) 68 | label_ids = label_ids.to('cpu').numpy().tolist() 69 | for i, label in enumerate(label_ids): 70 | temp_1 = [] 71 | temp_2 = [] 72 | for j, m in enumerate(label): 73 | if j == 0: 74 | continue 75 | elif label_ids[i][j] == self.label2id['[SEP]']: 76 | self.entity_score.update(pred_paths=[temp_2], label_paths=[temp_1]) 77 | break 78 | else: 79 | temp_1.append(self.id2label[label_ids[i][j]]) 80 | temp_2.append(tags[i][j]) 81 | valid_info, class_info = self.entity_score.result() 82 | info = {f'valid_{key}': value for key, value in valid_info.items()} 83 | info['valid_loss'] = valid_loss.avg 84 | if 'cuda' in str(self.device): 85 | torch.cuda.empty_cache() 86 | return info, class_info 87 | 88 | def train_epoch(self, data_loader): 89 | pbar = ProgressBar(n_total=len(data_loader), desc='Training') 90 | tr_loss = AverageMeter() 91 | for step, batch in enumerate(data_loader): 92 | self.model.train() 93 | batch = tuple(t.to(self.device) for t in batch) 94 | input_ids, input_mask, segment_ids, label_ids, input_lens = batch 95 | input_lens = input_lens.cpu().detach().numpy().tolist() 96 | _, loss = self.model.forward_loss(input_ids, segment_ids, input_mask, label_ids, input_lens) 97 | if len(self.n_gpu.split(",")) >= 2: 98 | loss = loss.mean() 99 | if self.gradient_accumulation_steps > 1: 100 | loss = loss / self.gradient_accumulation_steps 101 | if self.fp16: 102 | with amp.scale_loss(loss, self.optimizer) as scaled_loss: 103 | scaled_loss.backward() 104 | clip_grad_norm_(amp.master_params(self.optimizer), 
self.grad_clip) 105 | else: 106 | loss.backward() 107 | clip_grad_norm_(self.model.parameters(), self.grad_clip) 108 | if (step + 1) % self.gradient_accumulation_steps == 0: 109 | self.optimizer.step() 110 | self.optimizer.zero_grad() 111 | self.global_step += 1 112 | tr_loss.update(loss.item(), n=1) 113 | pbar(step=step, info={'loss': loss.item()}) 114 | info = {'loss': tr_loss.avg} 115 | if "cuda" in str(self.device): 116 | torch.cuda.empty_cache() 117 | return info 118 | 119 | def train(self, train_data, valid_data, epochs, seed): 120 | seed_everything(seed) 121 | for epoch in range(self.start_epoch, self.start_epoch + int(epochs)): 122 | self.logger.info(f"Epoch {epoch}/{int(epochs)}") 123 | train_log = self.train_epoch(train_data) 124 | valid_log, class_info = self.valid_epoch(valid_data) 125 | 126 | logs = dict(train_log, **valid_log) 127 | show_info = f'Epoch: {epoch} - ' + "-".join([f' {key}: {value:.4f} ' for key, value in logs.items()]) 128 | self.logger.info(show_info) 129 | self.logger.info("The entity scores of valid data : ") 130 | for key, value in class_info.items(): 131 | info = f'Entity: {key} - ' + "-".join([f' {key_}: {value_:.4f} ' for key_, value_ in value.items()]) 132 | self.logger.info(info) 133 | 134 | if hasattr(self.lr_scheduler,'epoch_step'): 135 | self.lr_scheduler.epoch_step(metrics=logs[self.model_checkpoint.monitor], epoch=epoch) 136 | # save 137 | if self.training_monitor: 138 | self.training_monitor.epoch_step(logs) 139 | 140 | # save model 141 | if self.model_checkpoint: 142 | state = self.save_info(epoch, best=logs[self.model_checkpoint.monitor]) 143 | self.model_checkpoint.bert_epoch_step(current=logs[self.model_checkpoint.monitor], state=state) 144 | 145 | # early_stopping 146 | if self.early_stopping: 147 | self.early_stopping.epoch_step(epoch=epoch, current=logs[self.early_stopping.monitor]) 148 | if self.early_stopping.stop_training: 149 | break 150 | -------------------------------------------------------------------------------- /run_submit.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | from pydatagrand.configs.base import config 4 | from pydatagrand.common.tools import load_pickle 5 | from pydatagrand.train.ner_utils import get_entities 6 | from pydatagrand.common.tools import seed_everything 7 | from collections import Counter 8 | from glob import glob 9 | from datetime import datetime 10 | 11 | class DataProcessor(object): 12 | """Base class for data converters for sequence classification data sets.""" 13 | def get_train_examples(self, data_dir): 14 | """Gets a collection of `InputExample`s for the train set.""" 15 | raise NotImplementedError() 16 | def get_dev_examples(self, data_dir): 17 | """Gets a collection of `InputExample`s for the dev set.""" 18 | raise NotImplementedError() 19 | 20 | def get_labels(self): 21 | """Gets the list of labels for this data set.""" 22 | raise NotImplementedError() 23 | class NerProcessor(DataProcessor): 24 | """Processor for the CoNLL-2003 data set.""" 25 | def get_labels(self): 26 | return ["X", "O", "B-a", "I-a", "B-b", "I-b", "B-c", "I-c", "S-a", "S-b", "S-c", "[CLS]", "[SEP]"] 27 | 28 | def main(): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument("--task_name",default='ner',type=str) 31 | parser.add_argument("--do_test",action='store_true') 32 | parser.add_argument("--do_eval",action='store_true') 33 | parser.add_argument('--seed',default=42,type=str) 34 | args = parser.parse_args() 35 | 36 | 
seed_everything(seed=args.seed) 37 | dt = str(datetime.today()).split(" ")[0] 38 | test_path = config['data_dir'] / 'test.txt' 39 | test_result_path = config['result'] / f'{dt}_submit_test.txt' 40 | processors = {"ner": NerProcessor} 41 | task_name = args.task_name.lower() 42 | processor = processors[task_name]() 43 | label_list = processor.get_labels() 44 | id2label = {i: label for i, label in enumerate(label_list, 0)} 45 | test_data = [] 46 | with open(str(test_path), 'r') as fr: 47 | for line in fr: 48 | line = line.strip("\n") 49 | test_data.append(line) 50 | fw = open(str(test_result_path), 'w') 51 | cv_test_pred = [] 52 | for file in glob(f"{str(config['result']/ '*.pkl')}"): 53 | data = load_pickle(file) 54 | cv_test_pred.append(data) 55 | vote_pred = [] 56 | for i in range(len(test_data)): 57 | t = [np.array([x[i]]).T for x in cv_test_pred] 58 | t2 = np.concatenate(t, axis=1) 59 | t3 = [] 60 | for line in t2: 61 | c = Counter() 62 | c.update(line) 63 | t3.append(c.most_common(1)[0][0]) 64 | vote_pred.append(t3) 65 | for tag,line in zip(vote_pred,test_data): 66 | token_a = line.split("_") 67 | label_entities = get_entities(tag, id2label) 68 | if len(label_entities) == 0: 69 | record = "_".join(token_a) + "/o" 70 | else: 71 | labels = [] 72 | label_entities = sorted(label_entities, key=lambda x: x[1]) 73 | o_s = 0 74 | for i, entity in enumerate(label_entities): 75 | begin = entity[1] 76 | end = entity[2] 77 | tag = entity[0] 78 | if begin != o_s: 79 | labels.append("_".join(token_a[o_s:begin]) + "/o") 80 | labels.append("_".join(token_a[begin:end + 1]) + f"/{tag}") 81 | o_s = end + 1 82 | if i == len(label_entities) - 1: 83 | if o_s <= len(token_a) - 1: 84 | labels.append("_".join(token_a[o_s:]) + "/o") 85 | record = " ".join(labels) 86 | fw.write(record + "\n") 87 | fw.close() 88 | 89 | if __name__ == "__main__": 90 | main() --------------------------------------------------------------------------------
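A note on the submission step above: the fold-ensembling logic in run_submit.py reduces to a per-token majority vote over the tag sequences predicted by each cross-validation fold; the voted tags are then converted into entity spans by get_entities and written out in the "token_token/tag" submission format. The snippet below is a minimal, self-contained sketch of that voting idea only. The function name vote_tags and the fold_predictions argument are illustrative and do not exist in the repository, which instead loads the per-fold predictions from the pickled files matched by glob under config['result'].

from collections import Counter

def vote_tags(fold_predictions):
    # fold_predictions: K fold outputs, each a list of tag sequences
    # (one list of tags per test sentence), all aligned on the same sentences.
    voted = []
    for sentence_tags in zip(*fold_predictions):      # K tag sequences for one sentence
        merged = []
        for position_tags in zip(*sentence_tags):     # K tags predicted at one token position
            merged.append(Counter(position_tags).most_common(1)[0][0])
        voted.append(merged)
    return voted

# Three folds, one three-token sentence: the middle token gets 'I-a' by a 2-to-1 vote.
print(vote_tags([[['B-a', 'I-a', 'O']], [['B-a', 'O', 'O']], [['B-a', 'I-a', 'O']]]))
# -> [['B-a', 'I-a', 'O']]

As in run_submit.py, ties are broken by Counter.most_common, which falls back to first-seen order among equally frequent tags.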