├── README.md ├── scheme1 ├── code1 │ ├── data │ │ ├── Round2_train.csv │ │ ├── Test_Data.csv │ │ ├── Train_Data.csv │ │ ├── emotion_voting_three_models_39215985.csv │ │ ├── preprocess.py │ │ ├── round2_test.csv │ │ └── submit_example.csv │ ├── pystart.py │ ├── pytorch_transformers │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── file_utils.py │ │ ├── modeling_bert.py │ │ ├── modeling_utils.py │ │ ├── optimization.py │ │ ├── tokenization_auto.py │ │ ├── tokenization_bert.py │ │ └── tokenization_utils.py │ ├── requirements.txt │ └── run_bert.py └── code2 │ └── code.py ├── scheme2 └── code2.py └── scheme3 └── code3.py /README.md: -------------------------------------------------------------------------------- 1 | # ccf_financial_negative 2 | CCF BDCI 金融信息负面及主体判定 冠军代码 3 | 其中scheme1,2,3分别为方案一、二、三的代码 4 | 并且scheme1中的code1修改自guoday分享的代码,在这里很感谢guoday的开源代码 5 | -------------------------------------------------------------------------------- /scheme1/code1/data/Round2_train.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiong666/ccf_financial_negative/66292f4724f2a7b40e83d1c74d0ee05822d69f7e/scheme1/code1/data/Round2_train.csv -------------------------------------------------------------------------------- /scheme1/code1/data/preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import random 4 | import numpy as np 5 | import re 6 | import textdistance 7 | 8 | train_df = pd.read_csv('./data/Round2_train.csv') 9 | test_data = pd.read_csv('./data/round2_test.csv') 10 | 11 | train_df['text'] = train_df.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 12 | test_data['text'] = test_data.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 13 | 14 | def entity_clear(df): 15 | for index, row in df.iterrows(): 16 | if type(row.entity) == float or type(row.text) == float: 17 | continue 18 | entities = row.entity.split(';') 19 | entities.sort(key = lambda x : len(x)) 20 | n = len(entities) 21 | tmp = entities.copy() 22 | for i in range(n): 23 | entity_tmp = entities[i] 24 | if i + 1 >= n: 25 | break 26 | for entity_tmp2 in entities[i+1:]: 27 | if entity_tmp2.find(entity_tmp) != -1 and row.text.replace(entity_tmp2, '').find(entity_tmp) == -1: 28 | tmp.remove(entity_tmp) 29 | break 30 | df.loc[index, 'entity'] = ';'.join(tmp) 31 | return df 32 | train_data = entity_clear(train_df) 33 | test_df = entity_clear(test_data) 34 | 35 | train_data.dropna(subset = ['entity'], inplace=True) 36 | train_data.reset_index(drop=True, inplace=True) 37 | test_df.dropna(subset = ['entity'], inplace=True) 38 | test_df.reset_index(drop=True, inplace=True) 39 | test_df['negative'] = 0 40 | train_data['title'] = train_data['title'].fillna('无') 41 | train_data['text'] = train_data['text'].fillna('无') 42 | test_df['title'] = test_df['title'].fillna('无') 43 | test_df['text'] = test_df['text'].fillna('无') 44 | 45 | train_data['text'] = train_data['text'].map(lambda index: re.sub(r'http.*$', "", index)) 46 | test_df['text'] = test_df['text'].map(lambda index: re.sub(r'http.*$', "", index)) 47 | 48 | train_data['title'] = train_data['title'].map(lambda index: index.replace(' ', '')) 49 | train_data['text'] = train_data['text'].map(lambda index: index.replace(' ', '')) 50 | train_data['title_len'] = train_data['title'].map(lambda index:len(index)) 51 | 52 | test_df['title'] = test_df['title'].map(lambda index: index.replace(' ', 
'')) 53 | test_df['text'] = test_df['text'].map(lambda index: index.replace(' ', '')) 54 | test_df['title_len'] = test_df['title'].map(lambda index:len(index)) 55 | 56 | distance = textdistance.Levenshtein(external = False) 57 | train_data['distance'] = train_data.apply(lambda index: distance(index.title, index.text), axis=1) 58 | test_df['distance'] = test_df.apply(lambda index: distance(index.title, index.text), axis=1) 59 | 60 | train_data['title_in_text'] = train_data.apply(lambda index: 1 if index.text.find(index.title) != -1 else 0, axis=1) 61 | test_df['title_in_text'] = test_df.apply(lambda index: 1 if index.text.find(index.title) != -1 else 0, axis=1) 62 | 63 | train_data['content'] = train_data.apply(lambda index: index.title + index.text if (index.title_len != 0) & (index.title_in_text != 1) & (index.distance > 100) else index.text, axis=1) 64 | test_df['content'] = test_df.apply(lambda index: index.title + index.text if (index.title_len != 0) & (index.title_in_text != 1) & (index.distance > 100) else index.text, axis=1) 65 | 66 | def get_content(x ,y): 67 | try: 68 | if str(y) == 'nan': 69 | return x 70 | y = y.split(';') 71 | y = sorted(y, key=lambda i: len(i), reverse=True) 72 | for i in y: 73 | x = '实体词'.join(x.split(i)) 74 | return x 75 | except: 76 | return x 77 | train_data['content'] = list(map(lambda x,y: get_content(x,y), train_data['content'], train_data['entity'])) 78 | test_df['content'] = list(map(lambda x,y: get_content(x,y), test_df['content'], test_df['entity'])) 79 | 80 | train_data.rename(columns={'negative':'label'}, inplace=True) 81 | test_df.rename(columns={'negative':'label'}, inplace=True) 82 | 83 | features = ['id', 'content' ,'entity', 'label'] 84 | index = set(range(train_data.shape[0])) 85 | 86 | K_fold = [] 87 | for i in range(10): 88 | if i == 9: 89 | tmp = index 90 | else: 91 | tmp = random.sample(index, int(1.0 /10 * train_data.shape[0])) 92 | index = index - set(tmp) 93 | print('number:', len(tmp)) 94 | K_fold.append(tmp) 95 | 96 | for i in range(10): 97 | print('Fold', i) 98 | os.system('mkdir ./data/data_{}'.format(i)) 99 | dev_index = list(K_fold[i]) 100 | train_index = [] 101 | for j in range(10): 102 | if j != i: 103 | train_index += K_fold[j] 104 | train_data[features].iloc[train_index].to_csv('./data/data_{}/train.csv'.format(i)) 105 | train_data[features].iloc[dev_index].to_csv('./data/data_{}/dev.csv'.format(i)) 106 | test_df[features].to_csv('./data/data_{}/test.csv'.format(i)) 107 | -------------------------------------------------------------------------------- /scheme1/code1/pystart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 22 10:39:24 2019 4 | 5 | @author: xiong 6 | """ 7 | 8 | import os 9 | import pandas as pd 10 | import numpy as np 11 | import argparse 12 | 13 | for i in range(10): 14 | params = '--model_type bert \ 15 | --model_name_or_path chinese_roberta_wwm_large_ext \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_test \ 19 | --data_dir %s \ 20 | --output_dir %s \ 21 | --max_seq_length 512 \ 22 | --split_num 1 \ 23 | --lstm_hidden_size 512 \ 24 | --lstm_layers 1 \ 25 | --lstm_dropout 0.1 \ 26 | --eval_steps 1000 \ 27 | --per_gpu_train_batch_size 16 \ 28 | --gradient_accumulation_steps 8 \ 29 | --warmup_steps 0 \ 30 | --per_gpu_eval_batch_size 32 \ 31 | --learning_rate 8e-6 \ 32 | --adam_epsilon 1e-6 \ 33 | --weight_decay 0 \ 34 | --train_steps 40000 \ 35 | --device_id %d' % ('./data/data_'+str(i), 
'./model_roberta_wwm_large_ext'+str(i), 0) 36 | ex = os.system("python run_bert.py %s" %params) 37 | print('The fold:', i) 38 | 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--model_prefix", default='./model_roberta_wwm_large_ext', type=str) 41 | args = parser.parse_args() 42 | 43 | k = 10 44 | df = pd.read_csv('data/data_0/test.csv') 45 | df['0'] = 0 46 | df['1'] = 1 47 | for i in range(k): 48 | temp = pd.read_csv('{}{}/test_pb.csv'.format(args.model_prefix, i)) 49 | df['0'] += temp['label_0'] / k 50 | df['1'] += temp['label_1'] / k 51 | print('The end for combining.') 52 | 53 | df['pre_label'] = np.argmax(df[['0','1']].values, -1) 54 | df['key_entity'] = np.nan 55 | df.rename(columns={'pre_label':'negative'}, inplace=True) 56 | df[['id','negative','key_entity']].to_csv('./result/submit_emotion.csv', encoding='utf-8', index=None) #######right####### -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.0" 2 | from .tokenization_auto import AutoTokenizer 3 | from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer 4 | 5 | from .tokenization_utils import (PreTrainedTokenizer) 6 | 7 | from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, 8 | BertForMaskedLM, BertForNextSentencePrediction, 9 | BertForSequenceClassification, BertForMultipleChoice, 10 | BertForTokenClassification, BertForQuestionAnswering, 11 | load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, 12 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP) 13 | 14 | from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME, 15 | PretrainedConfig, PreTrainedModel, prune_layer, Conv1D) 16 | 17 | from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, 18 | WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 19 | 20 | from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path) 21 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: 5 | print( 6 | "Should be used as one of: \n" 7 | ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" 8 | ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" 9 | ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" 10 | ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 11 | ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" 12 | ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") 13 | else: 14 | if sys.argv[1] == "bert": 15 | try: 16 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 17 | except ImportError: 18 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 19 | "In that case, it requires TensorFlow to be installed. 
Please see " 20 | "https://www.tensorflow.org/install/ for installation instructions.") 21 | raise 22 | 23 | if len(sys.argv) != 5: 24 | # pylint: disable=line-too-long 25 | print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 26 | else: 27 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 28 | TF_CONFIG = sys.argv.pop() 29 | TF_CHECKPOINT = sys.argv.pop() 30 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import sys 9 | import json 10 | import logging 11 | import os 12 | import shutil 13 | import tempfile 14 | import fnmatch 15 | from functools import wraps 16 | from hashlib import sha256 17 | from io import open 18 | 19 | import boto3 20 | from botocore.config import Config 21 | from botocore.exceptions import ClientError 22 | import requests 23 | from tqdm import tqdm 24 | 25 | try: 26 | from torch.hub import _get_torch_home 27 | torch_cache_home = _get_torch_home() 28 | except ImportError: 29 | torch_cache_home = os.path.expanduser( 30 | os.getenv('TORCH_HOME', os.path.join( 31 | os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) 32 | default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers') 33 | 34 | try: 35 | from urllib.parse import urlparse 36 | except ImportError: 37 | from urlparse import urlparse 38 | 39 | try: 40 | from pathlib import Path 41 | PYTORCH_PRETRAINED_BERT_CACHE = Path( 42 | os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))) 43 | except (AttributeError, ImportError): 44 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE', 45 | os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 46 | default_cache_path)) 47 | 48 | PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility 49 | 50 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 51 | 52 | 53 | def url_to_filename(url, etag=None): 54 | """ 55 | Convert `url` into a hashed filename in a repeatable way. 56 | If `etag` is specified, append its hash to the url's, delimited 57 | by a period. 58 | """ 59 | url_bytes = url.encode('utf-8') 60 | url_hash = sha256(url_bytes) 61 | filename = url_hash.hexdigest() 62 | 63 | if etag: 64 | etag_bytes = etag.encode('utf-8') 65 | etag_hash = sha256(etag_bytes) 66 | filename += '.' + etag_hash.hexdigest() 67 | 68 | return filename 69 | 70 | 71 | def filename_to_url(filename, cache_dir=None): 72 | """ 73 | Return the url and etag (which may be ``None``) stored for `filename`. 74 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
75 | """ 76 | if cache_dir is None: 77 | cache_dir = PYTORCH_TRANSFORMERS_CACHE 78 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 79 | cache_dir = str(cache_dir) 80 | 81 | cache_path = os.path.join(cache_dir, filename) 82 | if not os.path.exists(cache_path): 83 | raise EnvironmentError("file {} not found".format(cache_path)) 84 | 85 | meta_path = cache_path + '.json' 86 | if not os.path.exists(meta_path): 87 | raise EnvironmentError("file {} not found".format(meta_path)) 88 | 89 | with open(meta_path, encoding="utf-8") as meta_file: 90 | metadata = json.load(meta_file) 91 | url = metadata['url'] 92 | etag = metadata['etag'] 93 | 94 | return url, etag 95 | 96 | 97 | def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None): 98 | """ 99 | Given something that might be a URL (or might be a local path), 100 | determine which. If it's a URL, download the file and cache it, and 101 | return the path to the cached file. If it's already a local path, 102 | make sure the file exists and then return the path. 103 | Args: 104 | cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). 105 | force_download: if True, re-dowload the file even if it's already cached in the cache dir. 106 | """ 107 | if cache_dir is None: 108 | cache_dir = PYTORCH_TRANSFORMERS_CACHE 109 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 110 | url_or_filename = str(url_or_filename) 111 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 112 | cache_dir = str(cache_dir) 113 | 114 | parsed = urlparse(url_or_filename) 115 | 116 | if parsed.scheme in ('http', 'https', 's3'): 117 | # URL, so get it from the cache (downloading if necessary) 118 | return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 119 | elif os.path.exists(url_or_filename): 120 | # File, and it exists. 121 | return url_or_filename 122 | elif parsed.scheme == '': 123 | # File, but it doesn't exist. 124 | raise EnvironmentError("file {} not found".format(url_or_filename)) 125 | else: 126 | # Something unknown 127 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 128 | 129 | 130 | def split_s3_path(url): 131 | """Split a full s3 path into the bucket name and path.""" 132 | parsed = urlparse(url) 133 | if not parsed.netloc or not parsed.path: 134 | raise ValueError("bad s3 path {}".format(url)) 135 | bucket_name = parsed.netloc 136 | s3_path = parsed.path 137 | # Remove '/' at beginning of path. 138 | if s3_path.startswith("/"): 139 | s3_path = s3_path[1:] 140 | return bucket_name, s3_path 141 | 142 | 143 | def s3_request(func): 144 | """ 145 | Wrapper function for s3 requests in order to create more helpful error 146 | messages. 
147 | """ 148 | 149 | @wraps(func) 150 | def wrapper(url, *args, **kwargs): 151 | try: 152 | return func(url, *args, **kwargs) 153 | except ClientError as exc: 154 | if int(exc.response["Error"]["Code"]) == 404: 155 | raise EnvironmentError("file {} not found".format(url)) 156 | else: 157 | raise 158 | 159 | return wrapper 160 | 161 | 162 | @s3_request 163 | def s3_etag(url, proxies=None): 164 | """Check ETag on S3 object.""" 165 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) 166 | bucket_name, s3_path = split_s3_path(url) 167 | s3_object = s3_resource.Object(bucket_name, s3_path) 168 | return s3_object.e_tag 169 | 170 | 171 | @s3_request 172 | def s3_get(url, temp_file, proxies=None): 173 | """Pull a file directly from S3.""" 174 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) 175 | bucket_name, s3_path = split_s3_path(url) 176 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 177 | 178 | 179 | def http_get(url, temp_file, proxies=None): 180 | req = requests.get(url, stream=True, proxies=proxies) 181 | content_length = req.headers.get('Content-Length') 182 | total = int(content_length) if content_length is not None else None 183 | progress = tqdm(unit="B", total=total) 184 | for chunk in req.iter_content(chunk_size=1024): 185 | if chunk: # filter out keep-alive new chunks 186 | progress.update(len(chunk)) 187 | temp_file.write(chunk) 188 | progress.close() 189 | 190 | 191 | def get_from_cache(url, cache_dir=None, force_download=False, proxies=None): 192 | """ 193 | Given a URL, look for the corresponding dataset in the local cache. 194 | If it's not there, download it. Then return the path to the cached file. 195 | """ 196 | if cache_dir is None: 197 | cache_dir = PYTORCH_TRANSFORMERS_CACHE 198 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 199 | cache_dir = str(cache_dir) 200 | if sys.version_info[0] == 2 and not isinstance(cache_dir, str): 201 | cache_dir = str(cache_dir) 202 | 203 | if not os.path.exists(cache_dir): 204 | os.makedirs(cache_dir) 205 | 206 | # Get eTag to add to filename, if it exists. 207 | if url.startswith("s3://"): 208 | etag = s3_etag(url, proxies=proxies) 209 | else: 210 | try: 211 | response = requests.head(url, allow_redirects=True, proxies=proxies) 212 | if response.status_code != 200: 213 | etag = None 214 | else: 215 | etag = response.headers.get("ETag") 216 | except EnvironmentError: 217 | etag = None 218 | 219 | if sys.version_info[0] == 2 and etag is not None: 220 | etag = etag.decode('utf-8') 221 | filename = url_to_filename(url, etag) 222 | 223 | # get cache path to put the file 224 | cache_path = os.path.join(cache_dir, filename) 225 | 226 | # If we don't have a connection (etag is None) and can't identify the file 227 | # try to get the last downloaded one 228 | if not os.path.exists(cache_path) and etag is None: 229 | matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') 230 | matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) 231 | if matching_files: 232 | cache_path = os.path.join(cache_dir, matching_files[-1]) 233 | 234 | if not os.path.exists(cache_path) or force_download: 235 | # Download to temporary file, then copy to cache dir once finished. 236 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
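# --- Editor's sketch (not part of the original file): how the cache-naming
# scheme in file_utils.py fits together. `url_to_filename()` hashes the URL
# with sha256 and, when an ETag is available, appends a second hash;
# `get_from_cache()` then writes the payload to <cache_dir>/<filename> and a
# small JSON sidecar to <cache_dir>/<filename>.json, which `filename_to_url()`
# reads back. The URL and ETag below are made-up values for illustration only.
#
#   from hashlib import sha256
#   url, etag = "https://example.com/vocab.txt", "686897696a7c876b7e"   # hypothetical
#   filename = sha256(url.encode("utf-8")).hexdigest()
#   filename += "." + sha256(etag.encode("utf-8")).hexdigest()
#   meta = {"url": url, "etag": etag}        # stored as <filename>.json next to the file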
237 | with tempfile.NamedTemporaryFile() as temp_file: 238 | logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) 239 | 240 | # GET file object 241 | if url.startswith("s3://"): 242 | s3_get(url, temp_file, proxies=proxies) 243 | else: 244 | http_get(url, temp_file, proxies=proxies) 245 | 246 | # we are copying the file before closing it, so flush to avoid truncation 247 | temp_file.flush() 248 | # shutil.copyfileobj() starts at the current position, so go to the start 249 | temp_file.seek(0) 250 | 251 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 252 | with open(cache_path, 'wb') as cache_file: 253 | shutil.copyfileobj(temp_file, cache_file) 254 | 255 | logger.info("creating metadata file for %s", cache_path) 256 | meta = {'url': url, 'etag': etag} 257 | meta_path = cache_path + '.json' 258 | with open(meta_path, 'w') as meta_file: 259 | output_string = json.dumps(meta) 260 | if sys.version_info[0] == 2 and isinstance(output_string, str): 261 | output_string = unicode(output_string, 'utf-8') # The beauty of python 2 262 | meta_file.write(output_string) 263 | 264 | logger.info("removing temp file %s", temp_file.name) 265 | 266 | return cache_path 267 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import logging 18 | import math 19 | 20 | import torch 21 | from torch.optim import Optimizer 22 | from torch.optim.lr_scheduler import LambdaLR 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | class ConstantLRSchedule(LambdaLR): 27 | """ Constant learning rate schedule. 28 | """ 29 | def __init__(self, optimizer, last_epoch=-1): 30 | super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch) 31 | 32 | 33 | class WarmupConstantSchedule(LambdaLR): 34 | """ Linear warmup and then constant. 35 | Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps. 36 | Keeps learning rate schedule equal to 1. after warmup_steps. 37 | """ 38 | def __init__(self, optimizer, warmup_steps, last_epoch=-1): 39 | self.warmup_steps = warmup_steps 40 | super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 41 | 42 | def lr_lambda(self, step): 43 | if step < self.warmup_steps: 44 | return float(step) / float(max(1.0, self.warmup_steps)) 45 | return 1. 46 | 47 | 48 | class WarmupLinearSchedule(LambdaLR): 49 | """ Linear warmup and then linear decay. 50 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 51 | Linearly decreases learning rate from 1. to 0. 
over remaining `t_total - warmup_steps` steps. 52 | """ 53 | def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): 54 | self.warmup_steps = warmup_steps 55 | self.t_total = t_total 56 | super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 57 | 58 | def lr_lambda(self, step): 59 | if step < self.warmup_steps: 60 | return float(step) / float(max(1, self.warmup_steps)) 61 | return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps))) 62 | 63 | 64 | class WarmupCosineSchedule(LambdaLR): 65 | """ Linear warmup and then cosine decay. 66 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 67 | Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve. 68 | If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. 69 | """ 70 | def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1): 71 | self.warmup_steps = warmup_steps 72 | self.t_total = t_total 73 | self.cycles = cycles 74 | super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 75 | 76 | def lr_lambda(self, step): 77 | if step < self.warmup_steps: 78 | return float(step) / float(max(1.0, self.warmup_steps)) 79 | # progress after warmup 80 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) 81 | return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) 82 | 83 | 84 | class WarmupCosineWithHardRestartsSchedule(LambdaLR): 85 | """ Linear warmup and then cosine cycles with hard restarts. 86 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 87 | If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying 88 | learning rate (with hard restarts). 89 | """ 90 | def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1): 91 | self.warmup_steps = warmup_steps 92 | self.t_total = t_total 93 | self.cycles = cycles 94 | super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 95 | 96 | def lr_lambda(self, step): 97 | if step < self.warmup_steps: 98 | return float(step) / float(max(1, self.warmup_steps)) 99 | # progress after warmup 100 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) 101 | if progress >= 1.0: 102 | return 0.0 103 | return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0)))) 104 | 105 | 106 | 107 | class AdamW(Optimizer): 108 | """ Implements Adam algorithm with weight decay fix. 109 | 110 | Parameters: 111 | lr (float): learning rate. Default 1e-3. 112 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) 113 | eps (float): Adams epsilon. Default: 1e-6 114 | weight_decay (float): Weight decay. Default: 0.0 115 | correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. 
116 | """ 117 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): 118 | if lr < 0.0: 119 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 120 | if not 0.0 <= betas[0] < 1.0: 121 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) 122 | if not 0.0 <= betas[1] < 1.0: 123 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) 124 | if not 0.0 <= eps: 125 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) 126 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 127 | correct_bias=correct_bias) 128 | super(AdamW, self).__init__(params, defaults) 129 | 130 | def step(self, closure=None): 131 | """Performs a single optimization step. 132 | 133 | Arguments: 134 | closure (callable, optional): A closure that reevaluates the model 135 | and returns the loss. 136 | """ 137 | loss = None 138 | if closure is not None: 139 | loss = closure() 140 | 141 | for group in self.param_groups: 142 | for p in group['params']: 143 | if p.grad is None: 144 | continue 145 | grad = p.grad.data 146 | if grad.is_sparse: 147 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 148 | 149 | state = self.state[p] 150 | 151 | # State initialization 152 | if len(state) == 0: 153 | state['step'] = 0 154 | # Exponential moving average of gradient values 155 | state['exp_avg'] = torch.zeros_like(p.data) 156 | # Exponential moving average of squared gradient values 157 | state['exp_avg_sq'] = torch.zeros_like(p.data) 158 | 159 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 160 | beta1, beta2 = group['betas'] 161 | 162 | state['step'] += 1 163 | 164 | # Decay the first and second moment running average coefficient 165 | # In-place operations to update the averages at the same time 166 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 167 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 168 | denom = exp_avg_sq.sqrt().add_(group['eps']) 169 | 170 | step_size = group['lr'] 171 | if group['correct_bias']: # No bias correction for Bert 172 | bias_correction1 = 1.0 - beta1 ** state['step'] 173 | bias_correction2 = 1.0 - beta2 ** state['step'] 174 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 175 | 176 | p.data.addcdiv_(-step_size, exp_avg, denom) 177 | 178 | # Just adding the square of the weights to the loss function is *not* 179 | # the correct way of using L2 regularization/weight decay with Adam, 180 | # since that will interact with the m and v parameters in strange ways. 181 | # 182 | # Instead we want to decay the weights in a manner that doesn't interact 183 | # with the m/v parameters. This is equivalent to adding the square 184 | # of the weights to the loss with plain (non-momentum) SGD. 185 | # Add weight decay at the end (fixed version) 186 | if group['weight_decay'] > 0.0: 187 | p.data.add_(-group['lr'] * group['weight_decay'], p.data) 188 | 189 | return loss 190 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/tokenization_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
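# --- Editor's sketch (not part of the original file): typical wiring of the
# AdamW optimizer and WarmupLinearSchedule defined in optimization.py above.
# `model` and `dataloader` are placeholders; the hyper-parameters mirror the
# values passed by pystart.py (lr=8e-6, eps=1e-6, weight_decay=0, warmup 0,
# 40000 total steps).
#
#   from pytorch_transformers import AdamW, WarmupLinearSchedule
#   optimizer = AdamW(model.parameters(), lr=8e-6, eps=1e-6, weight_decay=0.0)
#   scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=40000)
#   for batch in dataloader:                 # hypothetical training loop
#       loss = model(**batch)[0]
#       loss.backward()
#       optimizer.step()
#       scheduler.step()                     # per-step LambdaLR: step after the optimizer
#       optimizer.zero_grad()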
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Auto Model class. """ 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import logging 20 | 21 | from .tokenization_bert import BertTokenizer 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | class AutoTokenizer(object): 26 | r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class 27 | that will be instantiated as one of the tokenizer classes of the library 28 | when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 29 | class method. 30 | 31 | The `from_pretrained()` method take care of returning the correct tokenizer class instance 32 | using pattern matching on the `pretrained_model_name_or_path` string. 33 | 34 | The tokenizer class to instantiate is selected as the first pattern matching 35 | in the `pretrained_model_name_or_path` string (in the following order): 36 | - contains `bert`: BertTokenizer (Bert model) 37 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 38 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 39 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 40 | - contains `xlnet`: XLNetTokenizer (XLNet model) 41 | - contains `xlm`: XLMTokenizer (XLM model) 42 | - contains `roberta`: RobertaTokenizer (RoBERTa model) 43 | 44 | This class cannot be instantiated using `__init__()` (throw an error). 45 | """ 46 | def __init__(self): 47 | raise EnvironmentError("AutoTokenizer is designed to be instantiated " 48 | "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.") 49 | 50 | @classmethod 51 | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): 52 | r""" Instantiate a one of the tokenizer classes of the library 53 | from a pre-trained model vocabulary. 54 | 55 | The tokenizer class to instantiate is selected as the first pattern matching 56 | in the `pretrained_model_name_or_path` string (in the following order): 57 | - contains `bert`: BertTokenizer (Bert model) 58 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 59 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 60 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 61 | - contains `xlnet`: XLNetTokenizer (XLNet model) 62 | - contains `xlm`: XLMTokenizer (XLM model) 63 | - contains `roberta`: RobertaTokenizer (XLM model) 64 | 65 | Params: 66 | **pretrained_model_name_or_path**: either: 67 | - a string with the `shortcut name` of a pre-trained model configuration to load from cache 68 | or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). 69 | - a path to a `directory` containing a configuration file saved 70 | using the `save_pretrained(save_directory)` method. 71 | - a path or url to a saved configuration `file`. 72 | **cache_dir**: (`optional`) string: 73 | Path to a directory in which a downloaded pre-trained model 74 | configuration should be cached if the standard cache should not be used. 
75 | 76 | Examples:: 77 | 78 | config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. 79 | config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` 80 | 81 | """ 82 | if 'roberta' in pretrained_model_name_or_path: 83 | return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 84 | elif 'bert' in pretrained_model_name_or_path: 85 | return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 86 | elif 'openai-gpt' in pretrained_model_name_or_path: 87 | return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 88 | elif 'gpt2' in pretrained_model_name_or_path: 89 | return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 90 | elif 'transfo-xl' in pretrained_model_name_or_path: 91 | return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 92 | elif 'xlnet' in pretrained_model_name_or_path: 93 | return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 94 | elif 'xlm' in pretrained_model_name_or_path: 95 | return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 96 | 97 | raise ValueError("Unrecognized model identifier in {}. Should contains one of " 98 | "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " 99 | "'xlm', 'roberta'".format(pretrained_model_name_or_path)) 100 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/tokenization_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
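# --- Editor's note (not part of the original file): AutoTokenizer in
# tokenization_auto.py above dispatches purely on substring matching of the
# model name or path. In this trimmed copy only BertTokenizer is imported, so
# only names hitting the 'bert' branch resolve to an importable class; a
# minimal usage sketch:
#
#   from pytorch_transformers import AutoTokenizer, BertTokenizer
#   tok = AutoTokenizer.from_pretrained("bert-base-chinese")   # fetches vocab.txt from the map above
#   assert isinstance(tok, BertTokenizer)                      # "bert" substring -> BertTokenizer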
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_utils import PreTrainedTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 37 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 38 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 39 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 40 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 41 | 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", 42 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", 43 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", 44 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", 45 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", 46 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", 47 | } 48 | } 49 | 50 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 51 | 'bert-base-uncased': 512, 52 | 'bert-large-uncased': 512, 53 | 'bert-base-cased': 512, 54 | 'bert-large-cased': 512, 55 | 'bert-base-multilingual-uncased': 512, 56 | 'bert-base-multilingual-cased': 512, 57 | 'bert-base-chinese': 512, 58 | 'bert-base-german-cased': 512, 59 | 'bert-large-uncased-whole-word-masking': 512, 60 | 'bert-large-cased-whole-word-masking': 512, 61 | 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, 62 | 'bert-large-cased-whole-word-masking-finetuned-squad': 512, 63 | 'bert-base-cased-finetuned-mrpc': 512, 64 | } 65 | 66 | def load_vocab(vocab_file): 67 | """Loads a vocabulary file into a dictionary.""" 68 | vocab = collections.OrderedDict() 69 | with open(vocab_file, "r", encoding="utf-8") as reader: 70 | tokens = reader.readlines() 71 | for index, token in enumerate(tokens): 72 | token = token.rstrip('\n') 73 | vocab[token] = index 74 | return vocab 75 | 76 | 77 | def whitespace_tokenize(text): 78 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 79 | text = text.strip() 80 | if not text: 81 | return [] 82 | tokens = text.split() 83 | return tokens 84 | 85 | 86 | class BertTokenizer(PreTrainedTokenizer): 87 | r""" 88 | Constructs a BertTokenizer. 
89 | :class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece 90 | 91 | Args: 92 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 93 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 94 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 95 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 96 | minimum of this value (if specified) and the underlying BERT model's sequence length. 97 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 98 | do_wordpiece_only=False 99 | """ 100 | 101 | vocab_files_names = VOCAB_FILES_NAMES 102 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 103 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 104 | 105 | def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, 106 | unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", 107 | mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs): 108 | """Constructs a BertTokenizer. 109 | 110 | Args: 111 | **vocab_file**: Path to a one-wordpiece-per-line vocabulary file 112 | **do_lower_case**: (`optional`) boolean (default True) 113 | Whether to lower case the input 114 | Only has an effect when do_basic_tokenize=True 115 | **do_basic_tokenize**: (`optional`) boolean (default True) 116 | Whether to do basic tokenization before wordpiece. 117 | **never_split**: (`optional`) list of string 118 | List of tokens which will never be split during tokenization. 119 | Only has an effect when do_basic_tokenize=True 120 | **tokenize_chinese_chars**: (`optional`) boolean (default True) 121 | Whether to tokenize Chinese characters. 122 | This should likely be deactivated for Japanese: 123 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 124 | """ 125 | super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, 126 | pad_token=pad_token, cls_token=cls_token, 127 | mask_token=mask_token, **kwargs) 128 | if not os.path.isfile(vocab_file): 129 | raise ValueError( 130 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " 131 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 132 | self.vocab = load_vocab(vocab_file) 133 | self.ids_to_tokens = collections.OrderedDict( 134 | [(ids, tok) for tok, ids in self.vocab.items()]) 135 | self.do_basic_tokenize = do_basic_tokenize 136 | if do_basic_tokenize: 137 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 138 | never_split=never_split, 139 | tokenize_chinese_chars=tokenize_chinese_chars) 140 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) 141 | 142 | @property 143 | def vocab_size(self): 144 | return len(self.vocab) 145 | 146 | def _tokenize(self, text): 147 | split_tokens = [] 148 | if self.do_basic_tokenize: 149 | for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): 150 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 151 | split_tokens.append(sub_token) 152 | else: 153 | split_tokens = self.wordpiece_tokenizer.tokenize(text) 154 | return split_tokens 155 | 156 | def _convert_token_to_id(self, token): 157 | """ Converts a token (str/unicode) in an id using the vocab. 
""" 158 | return self.vocab.get(token, self.vocab.get(self.unk_token)) 159 | 160 | def _convert_id_to_token(self, index): 161 | """Converts an index (integer) in a token (string/unicode) using the vocab.""" 162 | return self.ids_to_tokens.get(index, self.unk_token) 163 | 164 | def convert_tokens_to_string(self, tokens): 165 | """ Converts a sequence of tokens (string) in a single string. """ 166 | out_string = ' '.join(tokens).replace(' ##', '').strip() 167 | return out_string 168 | 169 | def add_special_tokens_single_sentence(self, token_ids): 170 | """ 171 | Adds special tokens to the a sequence for sequence classification tasks. 172 | A BERT sequence has the following format: [CLS] X [SEP] 173 | """ 174 | return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] 175 | 176 | def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): 177 | """ 178 | Adds special tokens to a sequence pair for sequence classification tasks. 179 | A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] 180 | """ 181 | sep = [self._convert_token_to_id(self.sep_token)] 182 | cls = [self._convert_token_to_id(self.cls_token)] 183 | return cls + token_ids_0 + sep + token_ids_1 + sep 184 | 185 | def save_vocabulary(self, vocab_path): 186 | """Save the tokenizer vocabulary to a directory or file.""" 187 | index = 0 188 | if os.path.isdir(vocab_path): 189 | vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) 190 | else: 191 | vocab_file = vocab_path 192 | with open(vocab_file, "w", encoding="utf-8") as writer: 193 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 194 | if index != token_index: 195 | logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." 196 | " Please check that the vocabulary is not corrupted!".format(vocab_file)) 197 | index = token_index 198 | writer.write(token + u'\n') 199 | index += 1 200 | return (vocab_file,) 201 | 202 | @classmethod 203 | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): 204 | """ Instantiate a BertTokenizer from pre-trained vocabulary files. 205 | """ 206 | if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES: 207 | if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): 208 | logger.warning("The pre-trained model you are loading is a cased model but you have not set " 209 | "`do_lower_case` to False. We are setting `do_lower_case=False` for you but " 210 | "you may want to check this behavior.") 211 | kwargs['do_lower_case'] = False 212 | elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): 213 | logger.warning("The pre-trained model you are loading is an uncased model but you have set " 214 | "`do_lower_case` to False. We are setting `do_lower_case=True` for you " 215 | "but you may want to check this behavior.") 216 | kwargs['do_lower_case'] = True 217 | 218 | return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 219 | 220 | 221 | class BasicTokenizer(object): 222 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 223 | 224 | def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): 225 | """ Constructs a BasicTokenizer. 226 | 227 | Args: 228 | **do_lower_case**: Whether to lower case the input. 229 | **never_split**: (`optional`) list of str 230 | Kept for backward compatibility purposes. 
231 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) 232 | List of token not to split. 233 | **tokenize_chinese_chars**: (`optional`) boolean (default True) 234 | Whether to tokenize Chinese characters. 235 | This should likely be deactivated for Japanese: 236 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 237 | """ 238 | if never_split is None: 239 | never_split = [] 240 | self.do_lower_case = do_lower_case 241 | self.never_split = never_split 242 | self.tokenize_chinese_chars = tokenize_chinese_chars 243 | 244 | def tokenize(self, text, never_split=None): 245 | """ Basic Tokenization of a piece of text. 246 | Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. 247 | 248 | Args: 249 | **never_split**: (`optional`) list of str 250 | Kept for backward compatibility purposes. 251 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) 252 | List of token not to split. 253 | """ 254 | never_split = self.never_split + (never_split if never_split is not None else []) 255 | text = self._clean_text(text) 256 | # This was added on November 1st, 2018 for the multilingual and Chinese 257 | # models. This is also applied to the English models now, but it doesn't 258 | # matter since the English models were not trained on any Chinese data 259 | # and generally don't have any Chinese data in them (there are Chinese 260 | # characters in the vocabulary because Wikipedia does have some Chinese 261 | # words in the English Wikipedia.). 262 | if self.tokenize_chinese_chars: 263 | text = self._tokenize_chinese_chars(text) 264 | orig_tokens = whitespace_tokenize(text) 265 | split_tokens = [] 266 | for token in orig_tokens: 267 | if self.do_lower_case and token not in never_split: 268 | token = token.lower() 269 | token = self._run_strip_accents(token) 270 | split_tokens.extend(self._run_split_on_punc(token)) 271 | 272 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 273 | return output_tokens 274 | 275 | def _run_strip_accents(self, text): 276 | """Strips accents from a piece of text.""" 277 | text = unicodedata.normalize("NFD", text) 278 | output = [] 279 | for char in text: 280 | cat = unicodedata.category(char) 281 | if cat == "Mn": 282 | continue 283 | output.append(char) 284 | return "".join(output) 285 | 286 | def _run_split_on_punc(self, text, never_split=None): 287 | """Splits punctuation on a piece of text.""" 288 | if never_split is not None and text in never_split: 289 | return [text] 290 | chars = list(text) 291 | i = 0 292 | start_new_word = True 293 | output = [] 294 | while i < len(chars): 295 | char = chars[i] 296 | if _is_punctuation(char): 297 | output.append([char]) 298 | start_new_word = True 299 | else: 300 | if start_new_word: 301 | output.append([]) 302 | start_new_word = False 303 | output[-1].append(char) 304 | i += 1 305 | 306 | return ["".join(x) for x in output] 307 | 308 | def _tokenize_chinese_chars(self, text): 309 | """Adds whitespace around any CJK character.""" 310 | output = [] 311 | for char in text: 312 | cp = ord(char) 313 | if self._is_chinese_char(cp): 314 | output.append(" ") 315 | output.append(char) 316 | output.append(" ") 317 | else: 318 | output.append(char) 319 | return "".join(output) 320 | 321 | def _is_chinese_char(self, cp): 322 | """Checks whether CP is the codepoint of a CJK character.""" 323 | # This defines a "chinese character" as anything in the CJK Unicode block: 324 | # 
https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 325 | # 326 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 327 | # despite its name. The modern Korean Hangul alphabet is a different block, 328 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 329 | # space-separated words, so they are not treated specially and handled 330 | # like the all of the other languages. 331 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 332 | (cp >= 0x3400 and cp <= 0x4DBF) or # 333 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 334 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 335 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 336 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 337 | (cp >= 0xF900 and cp <= 0xFAFF) or # 338 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 339 | return True 340 | 341 | return False 342 | 343 | def _clean_text(self, text): 344 | """Performs invalid character removal and whitespace cleanup on text.""" 345 | output = [] 346 | for char in text: 347 | cp = ord(char) 348 | if cp == 0 or cp == 0xfffd or _is_control(char): 349 | continue 350 | if _is_whitespace(char): 351 | output.append(" ") 352 | else: 353 | output.append(char) 354 | return "".join(output) 355 | 356 | 357 | class WordpieceTokenizer(object): 358 | """Runs WordPiece tokenization.""" 359 | 360 | def __init__(self, vocab, unk_token, max_input_chars_per_word=100): 361 | self.vocab = vocab 362 | self.unk_token = unk_token 363 | self.max_input_chars_per_word = max_input_chars_per_word 364 | 365 | def tokenize(self, text): 366 | """Tokenizes a piece of text into its word pieces. 367 | 368 | This uses a greedy longest-match-first algorithm to perform tokenization 369 | using the given vocabulary. 370 | 371 | For example: 372 | input = "unaffable" 373 | output = ["un", "##aff", "##able"] 374 | 375 | Args: 376 | text: A single token or whitespace separated tokens. This should have 377 | already been passed through `BasicTokenizer`. 378 | 379 | Returns: 380 | A list of wordpiece tokens. 381 | """ 382 | 383 | output_tokens = [] 384 | for token in whitespace_tokenize(text): 385 | chars = list(token) 386 | if len(chars) > self.max_input_chars_per_word: 387 | output_tokens.append(self.unk_token) 388 | continue 389 | 390 | is_bad = False 391 | start = 0 392 | sub_tokens = [] 393 | while start < len(chars): 394 | end = len(chars) 395 | cur_substr = None 396 | while start < end: 397 | substr = "".join(chars[start:end]) 398 | if start > 0: 399 | substr = "##" + substr 400 | if substr in self.vocab: 401 | cur_substr = substr 402 | break 403 | end -= 1 404 | if cur_substr is None: 405 | is_bad = True 406 | break 407 | sub_tokens.append(cur_substr) 408 | start = end 409 | 410 | if is_bad: 411 | output_tokens.append(self.unk_token) 412 | else: 413 | output_tokens.extend(sub_tokens) 414 | return output_tokens 415 | 416 | 417 | def _is_whitespace(char): 418 | """Checks whether `chars` is a whitespace character.""" 419 | # \t, \n, and \r are technically contorl characters but we treat them 420 | # as whitespace since they are generally considered as such. 421 | if char == " " or char == "\t" or char == "\n" or char == "\r": 422 | return True 423 | cat = unicodedata.category(char) 424 | if cat == "Zs": 425 | return True 426 | return False 427 | 428 | 429 | def _is_control(char): 430 | """Checks whether `chars` is a control character.""" 431 | # These are technically control characters but we count them as whitespace 432 | # characters. 
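# --- Editor's sketch (not part of the original file): the greedy
# longest-match-first loop of WordpieceTokenizer.tokenize() above, traced on
# the docstring example. The toy vocabulary is made up for illustration.
#
#   vocab = {"un", "##aff", "##able", "[UNK]"}
#   token = "unaffable"
#   # start=0: longest prefix found in vocab is "un"            -> emit "un",    start=2
#   # start=2: "##affable" ... shrink window until "##aff" hits -> emit "##aff", start=5
#   # start=5: "##able" is in vocab                             -> emit "##able"
#   # result: ["un", "##aff", "##able"]; if any position finds no match, the
#   # whole token is replaced by the single unk_token "[UNK]".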
433 | if char == "\t" or char == "\n" or char == "\r": 434 | return False 435 | cat = unicodedata.category(char) 436 | if cat.startswith("C"): 437 | return True 438 | return False 439 | 440 | 441 | def _is_punctuation(char): 442 | """Checks whether `chars` is a punctuation character.""" 443 | cp = ord(char) 444 | # We treat all non-letter/number ASCII as punctuation. 445 | # Characters such as "^", "$", and "`" are not in the Unicode 446 | # Punctuation class but we treat them as punctuation anyways, for 447 | # consistency. 448 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 449 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 450 | return True 451 | cat = unicodedata.category(char) 452 | if cat.startswith("P"): 453 | return True 454 | return False 455 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/tokenization_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import logging 20 | import os 21 | import json 22 | import six 23 | from io import open 24 | 25 | from .file_utils import cached_path 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json' 30 | ADDED_TOKENS_FILE = 'added_tokens.json' 31 | 32 | class PreTrainedTokenizer(object): 33 | """ Base class for all tokenizers. 34 | Handle all the shared methods for tokenization and special tokens as well as methods dowloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. 35 | 36 | This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). 37 | 38 | Class attributes (overridden by derived classes): 39 | 40 | - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). 41 | - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. 42 | - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. 
43 | 44 | Parameters: 45 | 46 | - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` 47 | 48 | - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` 49 | 50 | - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` 51 | 52 | - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` 53 | 54 | - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` 55 | 56 | - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` 57 | 58 | - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` 59 | 60 | - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` 61 | """ 62 | vocab_files_names = {} 63 | pretrained_vocab_files_map = {} 64 | max_model_input_sizes = {} 65 | 66 | SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token", 67 | "pad_token", "cls_token", "mask_token", 68 | "additional_special_tokens"] 69 | 70 | @property 71 | def bos_token(self): 72 | """ Beginning of sentence token (string). Log an error if used while not having been set. """ 73 | if self._bos_token is None: 74 | logger.error("Using bos_token, but it is not set yet.") 75 | return self._bos_token 76 | 77 | @property 78 | def eos_token(self): 79 | """ End of sentence token (string). Log an error if used while not having been set. """ 80 | if self._eos_token is None: 81 | logger.error("Using eos_token, but it is not set yet.") 82 | return self._eos_token 83 | 84 | @property 85 | def unk_token(self): 86 | """ Unknown token (string). Log an error if used while not having been set. """ 87 | if self._unk_token is None: 88 | logger.error("Using unk_token, but it is not set yet.") 89 | return self._unk_token 90 | 91 | @property 92 | def sep_token(self): 93 | """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ 94 | if self._sep_token is None: 95 | logger.error("Using sep_token, but it is not set yet.") 96 | return self._sep_token 97 | 98 | @property 99 | def pad_token(self): 100 | """ Padding token (string). Log an error if used while not having been set. """ 101 | if self._pad_token is None: 102 | logger.error("Using pad_token, but it is not set yet.") 103 | return self._pad_token 104 | 105 | @property 106 | def cls_token(self): 107 | """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ 108 | if self._cls_token is None: 109 | logger.error("Using cls_token, but it is not set yet.") 110 | return self._cls_token 111 | 112 | @property 113 | def mask_token(self): 114 | """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. 
""" 115 | if self._mask_token is None: 116 | logger.error("Using mask_token, but it is not set yet.") 117 | return self._mask_token 118 | 119 | @property 120 | def additional_special_tokens(self): 121 | """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ 122 | if self._additional_special_tokens is None: 123 | logger.error("Using additional_special_tokens, but it is not set yet.") 124 | return self._additional_special_tokens 125 | 126 | @bos_token.setter 127 | def bos_token(self, value): 128 | self._bos_token = value 129 | 130 | @eos_token.setter 131 | def eos_token(self, value): 132 | self._eos_token = value 133 | 134 | @unk_token.setter 135 | def unk_token(self, value): 136 | self._unk_token = value 137 | 138 | @sep_token.setter 139 | def sep_token(self, value): 140 | self._sep_token = value 141 | 142 | @pad_token.setter 143 | def pad_token(self, value): 144 | self._pad_token = value 145 | 146 | @cls_token.setter 147 | def cls_token(self, value): 148 | self._cls_token = value 149 | 150 | @mask_token.setter 151 | def mask_token(self, value): 152 | self._mask_token = value 153 | 154 | @additional_special_tokens.setter 155 | def additional_special_tokens(self, value): 156 | self._additional_special_tokens = value 157 | 158 | def __init__(self, max_len=None, **kwargs): 159 | self._bos_token = None 160 | self._eos_token = None 161 | self._unk_token = None 162 | self._sep_token = None 163 | self._pad_token = None 164 | self._cls_token = None 165 | self._mask_token = None 166 | self._additional_special_tokens = [] 167 | 168 | self.max_len = max_len if max_len is not None else int(1e12) 169 | self.added_tokens_encoder = {} 170 | self.added_tokens_decoder = {} 171 | 172 | for key, value in kwargs.items(): 173 | if key in self.SPECIAL_TOKENS_ATTRIBUTES: 174 | if key == 'additional_special_tokens': 175 | assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) 176 | else: 177 | assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) 178 | setattr(self, key, value) 179 | 180 | 181 | @classmethod 182 | def from_pretrained(cls, *inputs, **kwargs): 183 | r""" 184 | Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. 185 | 186 | Args: 187 | pretrained_model_name_or_path: either: 188 | 189 | - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. 190 | - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. 191 | - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. 192 | 193 | cache_dir: (`optional`) string: 194 | Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. 195 | 196 | force_download: (`optional`) boolean, default False: 197 | Force to (re-)download the vocabulary files and override the cached versions if they exists. 198 | 199 | proxies: (`optional`) dict, default None: 200 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 
201 | The proxies are used on each request.
202 | 
203 | inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
204 | 
205 | kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
206 | 
207 | Examples::
208 | 
209 | # We can't directly instantiate the base class `PreTrainedTokenizer`, so let's show our examples on a derived class: BertTokenizer
210 | 
211 | # Download vocabulary from S3 and cache.
212 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
213 | 
214 | # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
215 | tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
216 | 
217 | # If the tokenizer uses a single vocabulary file, you can point directly to this file
218 | tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
219 | 
220 | # You can link tokens to special vocabulary when instantiating
221 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
222 | # You should be sure '<unk>' is in the vocabulary when doing that.
223 | # Otherwise, use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
224 | assert tokenizer.unk_token == '<unk>'
225 | 
226 | """
227 | return cls._from_pretrained(*inputs, **kwargs)
228 | 
229 | 
230 | @classmethod
231 | def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
232 | cache_dir = kwargs.pop('cache_dir', None)
233 | force_download = kwargs.pop('force_download', False)
234 | proxies = kwargs.pop('proxies', None)
235 | 
236 | s3_models = list(cls.max_model_input_sizes.keys())
237 | vocab_files = {}
238 | if pretrained_model_name_or_path in s3_models:
239 | # Get the vocabulary from AWS S3 bucket
240 | for file_id, map_list in cls.pretrained_vocab_files_map.items():
241 | vocab_files[file_id] = map_list[pretrained_model_name_or_path]
242 | else:
243 | # Get the vocabulary from local files
244 | logger.info(
245 | "Model name '{}' not found in model shortcut name list ({}). "
246 | "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
247 | pretrained_model_name_or_path, ', '.join(s3_models),
248 | pretrained_model_name_or_path))
249 | 
250 | # Look for the tokenizer main vocabulary files
251 | for file_id, file_name in cls.vocab_files_names.items():
252 | if os.path.isdir(pretrained_model_name_or_path):
253 | # If a directory is provided we look for the standard filenames
254 | full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
255 | else:
256 | # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
257 | full_file_name = pretrained_model_name_or_path
258 | if not os.path.exists(full_file_name):
259 | logger.info("Didn't find file {}. 
We won't load it.".format(full_file_name)) 260 | full_file_name = None 261 | vocab_files[file_id] = full_file_name 262 | 263 | # Look for the additional tokens files 264 | all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, 265 | 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE} 266 | 267 | # If a path to a file was provided, get the parent directory 268 | saved_directory = pretrained_model_name_or_path 269 | if os.path.exists(saved_directory) and not os.path.isdir(saved_directory): 270 | saved_directory = os.path.dirname(saved_directory) 271 | 272 | for file_id, file_name in all_vocab_files_names.items(): 273 | full_file_name = os.path.join(saved_directory, file_name) 274 | if not os.path.exists(full_file_name): 275 | logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) 276 | full_file_name = None 277 | vocab_files[file_id] = full_file_name 278 | 279 | if all(full_file_name is None for full_file_name in vocab_files.values()): 280 | logger.error( 281 | "Model name '{}' was not found in model name list ({}). " 282 | "We assumed '{}' was a path or url but couldn't find tokenizer files" 283 | "at this path or url.".format( 284 | pretrained_model_name_or_path, ', '.join(s3_models), 285 | pretrained_model_name_or_path, )) 286 | return None 287 | 288 | # Get files from url, cache, or disk depending on the case 289 | try: 290 | resolved_vocab_files = {} 291 | for file_id, file_path in vocab_files.items(): 292 | if file_path is None: 293 | resolved_vocab_files[file_id] = None 294 | else: 295 | resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 296 | except EnvironmentError as e: 297 | if pretrained_model_name_or_path in s3_models: 298 | logger.error("Couldn't reach server to download vocabulary.") 299 | else: 300 | logger.error( 301 | "Model name '{}' was not found in model name list ({}). " 302 | "We assumed '{}' was a path or url but couldn't find files {} " 303 | "at this path or url.".format( 304 | pretrained_model_name_or_path, ', '.join(s3_models), 305 | pretrained_model_name_or_path, str(vocab_files.keys()))) 306 | raise e 307 | 308 | for file_id, file_path in vocab_files.items(): 309 | if file_path == resolved_vocab_files[file_id]: 310 | logger.info("loading file {}".format(file_path)) 311 | else: 312 | logger.info("loading file {} from cache at {}".format( 313 | file_path, resolved_vocab_files[file_id])) 314 | 315 | # Set max length if needed 316 | if pretrained_model_name_or_path in cls.max_model_input_sizes: 317 | # if we're using a pretrained model, ensure the tokenizer 318 | # wont index sequences longer than the number of positional embeddings 319 | max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] 320 | if max_len is not None and isinstance(max_len, (int, float)): 321 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 322 | 323 | # Merge resolved_vocab_files arguments in kwargs. 
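# Illustrative sketch (values are examples, not guaranteed): for a BERT-style
# tokenizer loaded from a local directory, resolved_vocab_files might look like
#     {'vocab_file': './my_model_directory/vocab.txt',
#      'added_tokens_file': None, 'special_tokens_map_file': None}
# The two auxiliary entries are popped off below; whatever remains (here
# 'vocab_file') is merged into kwargs and forwarded to the tokenizer's __init__.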
324 | added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None) 325 | special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None) 326 | for args_name, file_path in resolved_vocab_files.items(): 327 | if args_name not in kwargs: 328 | kwargs[args_name] = file_path 329 | if special_tokens_map_file is not None: 330 | special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8")) 331 | for key, value in special_tokens_map.items(): 332 | if key not in kwargs: 333 | kwargs[key] = value 334 | 335 | # Instantiate tokenizer. 336 | tokenizer = cls(*inputs, **kwargs) 337 | 338 | # Add supplementary tokens. 339 | if added_tokens_file is not None: 340 | added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8")) 341 | added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} 342 | tokenizer.added_tokens_encoder.update(added_tok_encoder) 343 | tokenizer.added_tokens_decoder.update(added_tok_decoder) 344 | 345 | return tokenizer 346 | 347 | 348 | def save_pretrained(self, save_directory): 349 | """ Save the tokenizer vocabulary files (with added tokens) and the 350 | special-tokens-to-class-attributes-mapping to a directory. 351 | 352 | This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method. 353 | """ 354 | if not os.path.isdir(save_directory): 355 | logger.error("Saving directory ({}) should be a directory".format(save_directory)) 356 | return 357 | 358 | special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) 359 | added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) 360 | 361 | with open(special_tokens_map_file, 'w', encoding='utf-8') as f: 362 | f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) 363 | 364 | with open(added_tokens_file, 'w', encoding='utf-8') as f: 365 | if self.added_tokens_encoder: 366 | out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) 367 | else: 368 | out_str = u"{}" 369 | f.write(out_str) 370 | 371 | vocab_files = self.save_vocabulary(save_directory) 372 | 373 | return vocab_files + (special_tokens_map_file, added_tokens_file) 374 | 375 | 376 | def save_vocabulary(self, save_directory): 377 | """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens 378 | and special token mappings. 379 | 380 | Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method. 381 | """ 382 | raise NotImplementedError 383 | 384 | 385 | def vocab_size(self): 386 | """ Size of the base vocabulary (without the added tokens) """ 387 | raise NotImplementedError 388 | 389 | 390 | def __len__(self): 391 | """ Size of the full vocabulary with the added tokens """ 392 | return self.vocab_size + len(self.added_tokens_encoder) 393 | 394 | 395 | def add_tokens(self, new_tokens): 396 | """ 397 | Add a list of new tokens to the tokenizer class. If the new tokens are not in the 398 | vocabulary, they are added to it with indices starting from length of the current vocabulary. 399 | 400 | Args: 401 | new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). 402 | 403 | Returns: 404 | Number of tokens added to the vocabulary. 
405 | 406 | Examples:: 407 | 408 | # Let's see how to increase the vocabulary of Bert model and tokenizer 409 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 410 | model = BertModel.from_pretrained('bert-base-uncased') 411 | 412 | num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) 413 | print('We have added', num_added_toks, 'tokens') 414 | model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 415 | """ 416 | if not new_tokens: 417 | return 0 418 | 419 | to_add_tokens = [] 420 | for token in new_tokens: 421 | assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) 422 | if token != self.unk_token and \ 423 | self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token): 424 | to_add_tokens.append(token) 425 | logger.info("Adding %s to the vocabulary", token) 426 | 427 | added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) 428 | added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} 429 | self.added_tokens_encoder.update(added_tok_encoder) 430 | self.added_tokens_decoder.update(added_tok_decoder) 431 | 432 | return len(to_add_tokens) 433 | 434 | 435 | def add_special_tokens(self, special_tokens_dict): 436 | """ 437 | Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them 438 | to class attributes. If special tokens are NOT in the vocabulary, they are added 439 | to it (indexed starting from the last index of the current vocabulary). 440 | 441 | Args: 442 | special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: 443 | [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, 444 | ``additional_special_tokens``]. 445 | 446 | Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). 447 | 448 | Returns: 449 | Number of tokens added to the vocabulary. 450 | 451 | Examples:: 452 | 453 | # Let's see how to add a new classification token to GPT-2 454 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 455 | model = GPT2Model.from_pretrained('gpt2') 456 | 457 | special_tokens_dict = {'cls_token': ''} 458 | 459 | num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) 460 | print('We have added', num_added_toks, 'tokens') 461 | model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 462 | 463 | assert tokenizer.cls_token == '' 464 | """ 465 | if not special_tokens_dict: 466 | return 0 467 | 468 | added_tokens = 0 469 | for key, value in special_tokens_dict.items(): 470 | assert key in self.SPECIAL_TOKENS_ATTRIBUTES 471 | if key == 'additional_special_tokens': 472 | assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) 473 | added_tokens += self.add_tokens(value) 474 | else: 475 | assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) 476 | added_tokens += self.add_tokens([value]) 477 | logger.info("Assigning %s to the %s key of the tokenizer", value, key) 478 | setattr(self, key, value) 479 | 480 | return added_tokens 481 | 482 | def tokenize(self, text, **kwargs): 483 | """ Converts a string in a sequence of tokens (string), using the tokenizer. 
484 | Split in words for word-based vocabulary or sub-words for sub-word-based 485 | vocabularies (BPE/SentencePieces/WordPieces). 486 | 487 | Take care of added tokens. 488 | """ 489 | def split_on_token(tok, text): 490 | result = [] 491 | split_text = text.split(tok) 492 | for i, sub_text in enumerate(split_text): 493 | sub_text = sub_text.strip() 494 | if i == 0 and not sub_text: 495 | result += [tok] 496 | elif i == len(split_text) - 1: 497 | if sub_text: 498 | result += [sub_text] 499 | else: 500 | pass 501 | else: 502 | if sub_text: 503 | result += [sub_text] 504 | result += [tok] 505 | return result 506 | 507 | def split_on_tokens(tok_list, text): 508 | if not text: 509 | return [] 510 | if not tok_list: 511 | return self._tokenize(text, **kwargs) 512 | 513 | tokenized_text = [] 514 | text_list = [text] 515 | for tok in tok_list: 516 | tokenized_text = [] 517 | for sub_text in text_list: 518 | if sub_text not in self.added_tokens_encoder \ 519 | and sub_text not in self.all_special_tokens: 520 | tokenized_text += split_on_token(tok, sub_text) 521 | else: 522 | tokenized_text += [sub_text] 523 | text_list = tokenized_text 524 | 525 | return sum((self._tokenize(token, **kwargs) if token not \ 526 | in self.added_tokens_encoder and token not in self.all_special_tokens \ 527 | else [token] for token in tokenized_text), []) 528 | 529 | added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens 530 | tokenized_text = split_on_tokens(added_tokens, text) 531 | return tokenized_text 532 | 533 | def _tokenize(self, text, **kwargs): 534 | """ Converts a string in a sequence of tokens (string), using the tokenizer. 535 | Split in words for word-based vocabulary or sub-words for sub-word-based 536 | vocabularies (BPE/SentencePieces/WordPieces). 537 | 538 | Do NOT take care of added tokens. 539 | """ 540 | raise NotImplementedError 541 | 542 | def convert_tokens_to_ids(self, tokens): 543 | """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id 544 | (resp. a sequence of ids), using the vocabulary. 545 | """ 546 | if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): 547 | return self._convert_token_to_id_with_added_voc(tokens) 548 | 549 | ids = [] 550 | for token in tokens: 551 | ids.append(self._convert_token_to_id_with_added_voc(token)) 552 | if len(ids) > self.max_len: 553 | logger.warning("Token indices sequence length is longer than the specified maximum sequence length " 554 | "for this model ({} > {}). Running this sequence through the model will result in " 555 | "indexing errors".format(len(ids), self.max_len)) 556 | return ids 557 | 558 | def _convert_token_to_id_with_added_voc(self, token): 559 | if token in self.added_tokens_encoder: 560 | return self.added_tokens_encoder[token] 561 | return self._convert_token_to_id(token) 562 | 563 | def _convert_token_to_id(self, token): 564 | raise NotImplementedError 565 | 566 | def encode(self, text, text_pair=None, add_special_tokens=False): 567 | """ 568 | Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. 569 | 570 | Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. 571 | 572 | Args: 573 | text: The first sequence to be encoded. 574 | text_pair: Optional second sequence to be encoded. 575 | add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative 576 | to their model. 
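            Example (illustrative, not from the original docstring; assumes a BERT-style
            subclass whose ``add_special_tokens_single_sentence`` wraps the ids in
            [CLS] ... [SEP])::

                ids = tokenizer.encode("某公司 负面 消息")                           # plain token ids
                ids = tokenizer.encode("某公司 负面 消息", add_special_tokens=True)  # ids including [CLS]/[SEP]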
577 | """ 578 | if text_pair is None: 579 | if add_special_tokens: 580 | return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text))) 581 | else: 582 | return self.convert_tokens_to_ids(self.tokenize(text)) 583 | 584 | first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)] 585 | second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)] 586 | 587 | if add_special_tokens: 588 | return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens) 589 | else: 590 | return first_sentence_tokens, second_sentence_tokens 591 | 592 | def add_special_tokens_single_sentence(self, token_ids): 593 | raise NotImplementedError 594 | 595 | def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): 596 | raise NotImplementedError 597 | 598 | def convert_ids_to_tokens(self, ids, skip_special_tokens=False): 599 | """ Converts a single index or a sequence of indices (integers) in a token " 600 | (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens. 601 | 602 | Args: 603 | skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False 604 | """ 605 | if isinstance(ids, int): 606 | if ids in self.added_tokens_decoder: 607 | return self.added_tokens_decoder[ids] 608 | else: 609 | return self._convert_id_to_token(ids) 610 | tokens = [] 611 | for index in ids: 612 | if index in self.all_special_ids and skip_special_tokens: 613 | continue 614 | if index in self.added_tokens_decoder: 615 | tokens.append(self.added_tokens_decoder[index]) 616 | else: 617 | tokens.append(self._convert_id_to_token(index)) 618 | return tokens 619 | 620 | def _convert_id_to_token(self, index): 621 | raise NotImplementedError 622 | 623 | def convert_tokens_to_string(self, tokens): 624 | """ Converts a sequence of tokens (string) in a single string. 625 | The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) 626 | but we often want to remove sub-word tokenization artifacts at the same time. 627 | """ 628 | return ' '.join(self.convert_ids_to_tokens(tokens)) 629 | 630 | def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): 631 | """ 632 | Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary 633 | with options to remove special tokens and clean up tokenization spaces. 634 | Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. 635 | """ 636 | filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) 637 | text = self.convert_tokens_to_string(filtered_tokens) 638 | 639 | if self.sep_token is not None and self.sep_token in text: 640 | text = text.replace(self.cls_token, self.sep_token) 641 | split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token))) 642 | if clean_up_tokenization_spaces: 643 | clean_text = [self.clean_up_tokenization(text) for text in split_text] 644 | return clean_text 645 | else: 646 | return split_text 647 | else: 648 | if clean_up_tokenization_spaces: 649 | clean_text = self.clean_up_tokenization(text) 650 | return clean_text 651 | else: 652 | return text 653 | 654 | @property 655 | def special_tokens_map(self): 656 | """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their 657 | values ('', ''...) 
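            Example of the returned mapping (illustrative; the exact strings depend on the
            derived class -- a BERT-style tokenizer typically yields)::

                {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
                 'cls_token': '[CLS]', 'mask_token': '[MASK]'}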
658 | """ 659 | set_attr = {} 660 | for attr in self.SPECIAL_TOKENS_ATTRIBUTES: 661 | attr_value = getattr(self, "_" + attr) 662 | if attr_value: 663 | set_attr[attr] = attr_value 664 | return set_attr 665 | 666 | @property 667 | def all_special_tokens(self): 668 | """ List all the special tokens ('', ''...) mapped to class attributes 669 | (cls_token, unk_token...). 670 | """ 671 | all_toks = [] 672 | set_attr = self.special_tokens_map 673 | for attr_value in set_attr.values(): 674 | all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value]) 675 | all_toks = list(set(all_toks)) 676 | return all_toks 677 | 678 | @property 679 | def all_special_ids(self): 680 | """ List the vocabulary indices of the special tokens ('', ''...) mapped to 681 | class attributes (cls_token, unk_token...). 682 | """ 683 | all_toks = self.all_special_tokens 684 | all_ids = list(self._convert_token_to_id(t) for t in all_toks) 685 | return all_ids 686 | 687 | @staticmethod 688 | def clean_up_tokenization(out_string): 689 | """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. 690 | """ 691 | out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' 692 | ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" 693 | ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") 694 | return out_string 695 | -------------------------------------------------------------------------------- /scheme1/code1/requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | torch>=1.0.0 3 | # progress bars in model download and training scripts 4 | tqdm 5 | # Accessing files from S3 directly. 6 | boto3 7 | # Used for downloading models over HTTP 8 | requests 9 | # For OpenAI GPT 10 | regex 11 | # For XLNet 12 | sentencepiece -------------------------------------------------------------------------------- /scheme1/code1/run_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """BERT finetuning runner.""" 17 | 18 | from __future__ import absolute_import 19 | 20 | import argparse 21 | import csv 22 | import logging 23 | import os 24 | import random 25 | import sys 26 | from io import open 27 | import pandas as pd 28 | import numpy as np 29 | import torch 30 | import gc 31 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 32 | TensorDataset) 33 | from torch.utils.data.distributed import DistributedSampler 34 | from tqdm import tqdm, trange 35 | from sklearn.metrics import f1_score 36 | from sklearn.metrics import accuracy_score 37 | import json 38 | from pytorch_transformers.modeling_bert import BertForSequenceClassification, BertConfig 39 | from pytorch_transformers import AdamW, WarmupLinearSchedule 40 | from pytorch_transformers.tokenization_bert import BertTokenizer 41 | from itertools import cycle 42 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 43 | datefmt = '%m/%d/%Y %H:%M:%S', 44 | level = logging.INFO) 45 | MODEL_CLASSES = { 46 | 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 47 | } 48 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in ( BertConfig,)), ()) 49 | 50 | logger = logging.getLogger(__name__) 51 | 52 | 53 | class InputExample(object): 54 | """A single training/test example for simple sequence classification.""" 55 | 56 | def __init__(self, guid, text_a, text_b=None, label=None): 57 | """Constructs a InputExample. 58 | 59 | Args: 60 | guid: Unique id for the example. 61 | text_a: string. The untokenized text of the first sequence. For single 62 | sequence tasks, only this sequence must be specified. 63 | text_b: (Optional) string. The untokenized text of the second sequence. 64 | Only must be specified for sequence pair tasks. 65 | label: (Optional) string. The label of the example. This should be 66 | specified for train and dev examples, but not for test examples. 67 | """ 68 | self.guid = guid 69 | self.text_a = text_a 70 | self.text_b = text_b 71 | self.label = label 72 | 73 | 74 | class InputFeatures(object): 75 | def __init__(self, 76 | example_id, 77 | choices_features, 78 | label 79 | 80 | ): 81 | self.example_id = example_id 82 | self.choices_features = [ 83 | { 84 | 'input_ids': input_ids, 85 | 'input_mask': input_mask, 86 | 'segment_ids': segment_ids 87 | } 88 | for _, input_ids, input_mask, segment_ids in choices_features 89 | ] 90 | self.label = label 91 | 92 | def read_examples(input_file, is_training): 93 | df=pd.read_csv(input_file) 94 | df['content'] = df['content'].fillna('无') 95 | df['entity'] = df['entity'].fillna('无') 96 | examples=[] 97 | for val in df[['id','content','entity','label']].values: 98 | examples.append(InputExample(guid=val[0],text_a=val[1],text_b=val[2],label=val[3])) 99 | return examples 100 | 101 | def convert_examples_to_features(examples, tokenizer, max_seq_length,split_num, 102 | is_training): 103 | """Loads a data file into a list of `InputBatch`s.""" 104 | 105 | # Swag is a multiple choice task. To perform this task using Bert, 106 | # we will use the formatting proposed in "Improving Language 107 | # Understanding by Generative Pre-Training" and suggested by 108 | # @jacobdevlin-google in this issue 109 | # https://github.com/google-research/bert/issues/38. 110 | # 111 | # Each choice will correspond to a sample on which we run the 112 | # inference. 
For a given Swag example, we will create the 4 113 | # following inputs: 114 | # - [CLS] context [SEP] choice_1 [SEP] 115 | # - [CLS] context [SEP] choice_2 [SEP] 116 | # - [CLS] context [SEP] choice_3 [SEP] 117 | # - [CLS] context [SEP] choice_4 [SEP] 118 | # The model will output a single value for each input. To get the 119 | # final decision of the model, we will run a softmax over these 4 120 | # outputs. 121 | features = [] 122 | for example_index, example in enumerate(examples): 123 | 124 | context_tokens=tokenizer.tokenize(example.text_a) 125 | ending_tokens=tokenizer.tokenize(example.text_b) 126 | 127 | 128 | skip_len=len(context_tokens)/split_num 129 | choices_features = [] 130 | for i in range(split_num): 131 | context_tokens_choice=context_tokens[int(i*skip_len):int((i+1)*skip_len)] 132 | _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3) 133 | tokens = ["[CLS]"]+ ending_tokens + ["[SEP]"] +context_tokens_choice + ["[SEP]"] 134 | segment_ids = [0] * (len(ending_tokens) + 2) + [1] * (len(context_tokens_choice) + 1) 135 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 136 | input_mask = [1] * len(input_ids) 137 | 138 | 139 | padding_length = max_seq_length - len(input_ids) 140 | input_ids += ([0] * padding_length) 141 | input_mask += ([0] * padding_length) 142 | segment_ids += ([0] * padding_length) 143 | choices_features.append((tokens, input_ids, input_mask, segment_ids)) 144 | 145 | 146 | label = example.label 147 | if example_index < 1 and is_training: 148 | logger.info("*** Example ***") 149 | logger.info("idx: {}".format(example_index)) 150 | logger.info("guid: {}".format(example.guid)) 151 | logger.info("tokens: {}".format(' '.join(tokens).replace('\u2581','_'))) 152 | logger.info("input_ids: {}".format(' '.join(map(str, input_ids)))) 153 | logger.info("input_mask: {}".format(' '.join(map(str, input_mask)))) 154 | logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids)))) 155 | logger.info("label: {}".format(label)) 156 | 157 | 158 | features.append( 159 | InputFeatures( 160 | example_id=example.guid, 161 | choices_features=choices_features, 162 | label=label 163 | ) 164 | ) 165 | return features 166 | 167 | 168 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 169 | """Truncates a sequence pair in place to the maximum length.""" 170 | 171 | # This is a simple heuristic which will always truncate the longer sequence 172 | # one token at a time. This makes more sense than truncating an equal percent 173 | # of tokens from each, since if one sequence is very short then each token 174 | # that's truncated likely contains more information than a longer sequence. 
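    # Illustrative example (hypothetical lengths, not from the data): with
    # max_length=10, len(tokens_a)=8 and len(tokens_b)=5, the loop below pops from
    # tokens_a three times (8 -> 5) and leaves tokens_b untouched, stopping once
    # 5 + 5 <= 10.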
175 | 176 | while True: 177 | total_length = len(tokens_a) + len(tokens_b) 178 | if total_length <= max_length: 179 | break 180 | if len(tokens_a) > len(tokens_b): 181 | tokens_a.pop() 182 | else: 183 | tokens_b.pop() 184 | 185 | def accuracy(out, labels): 186 | outputs = np.argmax(out, axis=1) 187 | #return f1_score(labels,outputs,labels=[0,1],average='macro') 188 | return accuracy_score(labels, outputs, normalize=True) 189 | 190 | def select_field(features, field): 191 | return [ 192 | [ 193 | choice[field] 194 | for choice in feature.choices_features 195 | ] 196 | for feature in features 197 | ] 198 | 199 | def set_seed(args): 200 | random.seed(args.seed) 201 | np.random.seed(args.seed) 202 | torch.manual_seed(args.seed) 203 | if args.n_gpu > 0: 204 | torch.cuda.manual_seed_all(args.seed) 205 | 206 | 207 | def main(): 208 | parser = argparse.ArgumentParser() 209 | 210 | ## Required parameters 211 | parser.add_argument("--data_dir", default=None, type=str, required=True, 212 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 213 | parser.add_argument("--model_type", default=None, type=str, required=True, 214 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 215 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 216 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) 217 | parser.add_argument("--meta_path", default=None, type=str, required=False, 218 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) 219 | parser.add_argument("--output_dir", default=None, type=str, required=True, 220 | help="The output directory where the model predictions and checkpoints will be written.") 221 | 222 | ## Other parameters 223 | parser.add_argument("--config_name", default="", type=str, 224 | help="Pretrained config name or path if not the same as model_name") 225 | parser.add_argument("--tokenizer_name", default="", type=str, 226 | help="Pretrained tokenizer name or path if not the same as model_name") 227 | parser.add_argument("--cache_dir", default="", type=str, 228 | help="Where do you want to store the pre-trained models downloaded from s3") 229 | parser.add_argument("--max_seq_length", default=128, type=int, 230 | help="The maximum total input sequence length after tokenization. 
Sequences longer "
231 | "than this will be truncated, sequences shorter will be padded.")
232 | parser.add_argument("--do_train", action='store_true',
233 | help="Whether to run training.")
234 | parser.add_argument("--do_test", action='store_true',
235 | help="Whether to run prediction on the test set.")
236 | parser.add_argument("--do_eval", action='store_true',
237 | help="Whether to run eval on the dev set.")
238 | parser.add_argument("--evaluate_during_training", action='store_true',
239 | help="Run evaluation during training at each logging step.")
240 | parser.add_argument("--do_lower_case", action='store_true',
241 | help="Set this flag if you are using an uncased model.")
242 | 
243 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
244 | help="Batch size per GPU/CPU for training.")
245 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
246 | help="Batch size per GPU/CPU for evaluation.")
247 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
248 | help="Number of update steps to accumulate before performing a backward/update pass.")
249 | parser.add_argument("--learning_rate", default=5e-5, type=float,
250 | help="The initial learning rate for Adam.")
251 | parser.add_argument("--weight_decay", default=0.0, type=float,
252 | help="Weight decay if we apply some.")
253 | parser.add_argument("--adam_epsilon", default=1e-8, type=float,
254 | help="Epsilon for Adam optimizer.")
255 | parser.add_argument("--max_grad_norm", default=1.0, type=float,
256 | help="Max gradient norm.")
257 | parser.add_argument("--num_train_epochs", default=3.0, type=float,
258 | help="Total number of training epochs to perform.")
259 | parser.add_argument("--max_steps", default=-1, type=int,
260 | help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") 261 | parser.add_argument("--eval_steps", default=-1, type=int, 262 | help="") 263 | parser.add_argument("--lstm_hidden_size", default=300, type=int, 264 | help="") 265 | parser.add_argument("--lstm_layers", default=2, type=int, 266 | help="") 267 | parser.add_argument("--lstm_dropout", default=0.5, type=float, 268 | help="") 269 | 270 | parser.add_argument("--train_steps", default=-1, type=int, 271 | help="") 272 | parser.add_argument("--report_steps", default=-1, type=int, 273 | help="") 274 | parser.add_argument("--warmup_steps", default=0, type=int, 275 | help="Linear warmup over warmup_steps.") 276 | parser.add_argument("--split_num", default=3, type=int, 277 | help="text split") 278 | parser.add_argument('--logging_steps', type=int, default=50, 279 | help="Log every X updates steps.") 280 | parser.add_argument('--save_steps', type=int, default=50, 281 | help="Save checkpoint every X updates steps.") 282 | parser.add_argument("--eval_all_checkpoints", action='store_true', 283 | help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") 284 | parser.add_argument("--no_cuda", action='store_true', 285 | help="Avoid using CUDA when available") 286 | parser.add_argument('--overwrite_output_dir', action='store_true', 287 | help="Overwrite the content of the output directory") 288 | parser.add_argument('--overwrite_cache', action='store_true', 289 | help="Overwrite the cached training and evaluation sets") 290 | parser.add_argument('--seed', type=int, default=42, 291 | help="random seed for initialization") 292 | 293 | parser.add_argument('--fp16', action='store_true', 294 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 295 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 296 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
297 | "See details at https://nvidia.github.io/apex/amp.html") 298 | parser.add_argument("--local_rank", type=int, default=-1, 299 | help="For distributed training: local_rank") 300 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 301 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 302 | parser.add_argument('--device_id', type=str, default='0', help="The CUDA device is for training.") 303 | args = parser.parse_args() 304 | 305 | os.environ['CUDA_VISIBLE_DEVICES'] = args.device_id 306 | 307 | # Setup CUDA, GPU & distributed training 308 | if args.local_rank == -1 or args.no_cuda: 309 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 310 | args.n_gpu = torch.cuda.device_count() 311 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 312 | torch.cuda.set_device(args.local_rank) 313 | device = torch.device("cuda", args.local_rank) 314 | torch.distributed.init_process_group(backend='nccl') 315 | args.n_gpu = 1 316 | args.device = device 317 | 318 | 319 | # Setup logging 320 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 321 | datefmt = '%m/%d/%Y %H:%M:%S', 322 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 323 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 324 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 325 | 326 | # Set seed 327 | set_seed(args) 328 | 329 | 330 | try: 331 | os.makedirs(args.output_dir) 332 | except: 333 | pass 334 | 335 | tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) 336 | 337 | 338 | 339 | config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=2) 340 | 341 | # Prepare model 342 | model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,args,config=config) 343 | 344 | 345 | 346 | if args.fp16: 347 | model.half() 348 | model.to(device) 349 | if args.local_rank != -1: 350 | try: 351 | from apex.parallel import DistributedDataParallel as DDP 352 | except ImportError: 353 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 354 | 355 | model = DDP(model) 356 | elif args.n_gpu > 1: 357 | model = torch.nn.DataParallel(model) 358 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 359 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 360 | if args.do_train: 361 | 362 | # Prepare data loader 363 | 364 | train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) 365 | train_features = convert_examples_to_features( 366 | train_examples, tokenizer, args.max_seq_length,args.split_num, True) 367 | all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) 368 | all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) 369 | all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) 370 | all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) 371 | train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) 372 | if args.local_rank == -1: 373 | train_sampler = RandomSampler(train_data) 374 | else: 375 | train_sampler = DistributedSampler(train_data) 376 | train_dataloader = 
DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size//args.gradient_accumulation_steps) 377 | 378 | num_train_optimization_steps = args.train_steps 379 | 380 | 381 | # Prepare optimizer 382 | 383 | param_optimizer = list(model.named_parameters()) 384 | 385 | # hack to remove pooler, which is not used 386 | # thus it produce None grad that break apex 387 | param_optimizer = [n for n in param_optimizer] 388 | 389 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 390 | optimizer_grouped_parameters = [ 391 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 392 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 393 | ] 394 | 395 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 396 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) 397 | 398 | global_step = 0 399 | 400 | logger.info("***** Running training *****") 401 | logger.info(" Num examples = %d", len(train_examples)) 402 | logger.info(" Batch size = %d", args.train_batch_size) 403 | logger.info(" Num steps = %d", num_train_optimization_steps) 404 | 405 | best_acc=0 406 | model.train() 407 | tr_loss = 0 408 | nb_tr_examples, nb_tr_steps = 0, 0 409 | bar = tqdm(range(num_train_optimization_steps),total=num_train_optimization_steps) 410 | train_dataloader=cycle(train_dataloader) 411 | 412 | 413 | for step in bar: 414 | batch = next(train_dataloader) 415 | batch = tuple(t.to(device) for t in batch) 416 | input_ids, input_mask, segment_ids, label_ids = batch 417 | loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) 418 | if args.n_gpu > 1: 419 | loss = loss.mean() # mean() to average on multi-gpu. 
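                # Sizing note (illustrative numbers, not the competition settings): with
                # per_gpu_train_batch_size=8, n_gpu=2 and gradient_accumulation_steps=4,
                # train_batch_size is 16, the DataLoader above yields batches of 16 // 4 = 4,
                # and optimizer.step() runs once every 4 batches, i.e. 16 examples per update.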
420 | if args.fp16 and args.loss_scale != 1.0: 421 | loss = loss * args.loss_scale 422 | if args.gradient_accumulation_steps > 1: 423 | loss = loss / args.gradient_accumulation_steps 424 | tr_loss += loss.item() 425 | train_loss=round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4) 426 | bar.set_description("loss {}".format(train_loss)) 427 | nb_tr_examples += input_ids.size(0) 428 | nb_tr_steps += 1 429 | 430 | if args.fp16: 431 | optimizer.backward(loss) 432 | else: 433 | 434 | loss.backward() 435 | 436 | if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: 437 | if args.fp16: 438 | # modify learning rate with special warm up BERT uses 439 | # if args.fp16 is False, BertAdam is used that handles this automatically 440 | lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) 441 | for param_group in optimizer.param_groups: 442 | param_group['lr'] = lr_this_step 443 | scheduler.step() 444 | optimizer.step() 445 | optimizer.zero_grad() 446 | global_step += 1 447 | 448 | 449 | if (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0: 450 | tr_loss = 0 451 | nb_tr_examples, nb_tr_steps = 0, 0 452 | logger.info("***** Report result *****") 453 | logger.info(" %s = %s", 'global_step', str(global_step)) 454 | logger.info(" %s = %s", 'train loss', str(train_loss)) 455 | 456 | 457 | if args.do_eval and (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0: 458 | for file in ['dev.csv']: 459 | inference_labels=[] 460 | gold_labels=[] 461 | inference_logits=[] 462 | eval_examples = read_examples(os.path.join(args.data_dir, file), is_training = True) 463 | eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,args.split_num,False) 464 | all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) 465 | all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) 466 | all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) 467 | all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) 468 | 469 | 470 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) 471 | 472 | logger.info("***** Running evaluation *****") 473 | logger.info(" Num examples = %d", len(eval_examples)) 474 | logger.info(" Batch size = %d", args.eval_batch_size) 475 | 476 | # Run prediction for full data 477 | eval_sampler = SequentialSampler(eval_data) 478 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 479 | 480 | model.eval() 481 | eval_loss, eval_accuracy = 0, 0 482 | nb_eval_steps, nb_eval_examples = 0, 0 483 | for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: 484 | input_ids = input_ids.to(device) 485 | input_mask = input_mask.to(device) 486 | segment_ids = segment_ids.to(device) 487 | label_ids = label_ids.to(device) 488 | 489 | 490 | with torch.no_grad(): 491 | tmp_eval_loss= model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) 492 | logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) 493 | 494 | logits = logits.detach().cpu().numpy() 495 | label_ids = label_ids.to('cpu').numpy() 496 | inference_labels.append(np.argmax(logits, axis=1)) 497 | gold_labels.append(label_ids) 498 | inference_logits.append(logits) 499 | eval_loss += tmp_eval_loss.mean().item() 500 | nb_eval_examples += input_ids.size(0) 501 | 
nb_eval_steps += 1 502 | 503 | gold_labels=np.concatenate(gold_labels,0) 504 | inference_logits=np.concatenate(inference_logits,0) 505 | model.train() 506 | eval_loss = eval_loss / nb_eval_steps 507 | eval_accuracy = accuracy(inference_logits, gold_labels) 508 | 509 | result = {'eval_loss': eval_loss, 510 | 'eval_F1': eval_accuracy, 511 | 'global_step': global_step, 512 | 'loss': train_loss} 513 | 514 | output_eval_file = os.path.join(args.output_dir, "eval_results.txt") 515 | with open(output_eval_file, "a") as writer: 516 | for key in sorted(result.keys()): 517 | logger.info(" %s = %s", key, str(result[key])) 518 | writer.write("%s = %s\n" % (key, str(result[key]))) 519 | writer.write('*'*80) 520 | writer.write('\n') 521 | if eval_accuracy>best_acc and 'dev' in file: 522 | print("="*80) 523 | print("Best F1",eval_accuracy) 524 | print("Saving Model......") 525 | best_acc=eval_accuracy 526 | # Save a trained model 527 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 528 | output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") 529 | torch.save(model_to_save.state_dict(), output_model_file) 530 | print("="*80) 531 | else: 532 | print("="*80) 533 | if args.do_test: 534 | del model 535 | gc.collect() 536 | args.do_train=False 537 | model = BertForSequenceClassification.from_pretrained(os.path.join(args.output_dir, "pytorch_model.bin"),args,config=config) 538 | if args.fp16: 539 | model.half() 540 | model.to(device) 541 | if args.local_rank != -1: 542 | try: 543 | from apex.parallel import DistributedDataParallel as DDP 544 | except ImportError: 545 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 546 | 547 | model = DDP(model) 548 | elif args.n_gpu > 1: 549 | model = torch.nn.DataParallel(model) 550 | 551 | 552 | for file,flag in [('dev.csv','dev'),('test.csv','test')]: 553 | inference_labels=[] 554 | gold_labels=[] 555 | eval_examples = read_examples(os.path.join(args.data_dir, file), is_training = False) 556 | eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,args.split_num,False) 557 | all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) 558 | all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) 559 | all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) 560 | all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) 561 | 562 | 563 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,all_label) 564 | # Run prediction for full data 565 | eval_sampler = SequentialSampler(eval_data) 566 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 567 | 568 | model.eval() 569 | eval_loss, eval_accuracy = 0, 0 570 | nb_eval_steps, nb_eval_examples = 0, 0 571 | for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: 572 | input_ids = input_ids.to(device) 573 | input_mask = input_mask.to(device) 574 | segment_ids = segment_ids.to(device) 575 | label_ids = label_ids.to(device) 576 | 577 | with torch.no_grad(): 578 | logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() 579 | label_ids = label_ids.to('cpu').numpy() 580 | inference_labels.append(logits) 581 | gold_labels.append(label_ids) 582 | gold_labels=np.concatenate(gold_labels,0) 583 | 
logits=np.concatenate(inference_labels,0) 584 | print(flag, accuracy(logits, gold_labels)) 585 | if flag=='test': 586 | df=pd.read_csv(os.path.join(args.data_dir, file)) 587 | df['label_0']=logits[:,0] 588 | df['label_1']=logits[:,1] 589 | #df['label_2']=logits[:,2] 590 | df[['id','entity','label_0','label_1']].to_csv(os.path.join(args.output_dir, "test_pb.csv"),index=False) 591 | 592 | 593 | if __name__ == "__main__": 594 | main() 595 | -------------------------------------------------------------------------------- /scheme1/code2/code.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import gc 4 | import sys 5 | import json 6 | import codecs 7 | import datetime 8 | import warnings 9 | import numpy as np 10 | import pandas as pd 11 | import tensorflow as tf 12 | from tqdm import tqdm 13 | from random import choice 14 | import matplotlib.pyplot as plt 15 | from collections import Counter 16 | from sklearn.metrics import f1_score 17 | from sklearn.model_selection import KFold 18 | from sklearn.model_selection import KFold 19 | from sklearn.preprocessing import LabelEncoder 20 | from sklearn.model_selection import StratifiedKFold 21 | from sklearn.model_selection import train_test_split 22 | 23 | import keras.backend as K 24 | from keras.layers import * 25 | from keras.callbacks import * 26 | from keras.models import Model 27 | from keras.optimizers import Adam 28 | from keras.utils import to_categorical 29 | from keras.metrics import top_k_categorical_accuracy, categorical_accuracy 30 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer 31 | 32 | tqdm.pandas() 33 | np.random.seed(214683) 34 | warnings.filterwarnings('ignore') 35 | 36 | 37 | data_path = 'datasets/' 38 | train = pd.read_csv(data_path + 'Round2_train.csv', encoding='utf-8') 39 | train2= pd.read_csv(data_path + 'Train_data.csv', encoding='utf-8') 40 | train=pd.concat([train,train2],axis=0,sort=True) 41 | test = pd.read_csv(data_path + 'round2_test.csv', encoding='utf-8') 42 | 43 | train = train[train['entity'].notnull()] 44 | test = test[test['entity'].notnull()] 45 | 46 | train=train.drop_duplicates(['title','text','entity','negative','key_entity']) #去掉重复的data 47 | 48 | 49 | def get_or_content(y,z): 50 | s='' 51 | if str(y)!='nan': 52 | s+=y 53 | if str(z)!='nan': 54 | s+=z 55 | return s 56 | 57 | #获取title+text 58 | train['content']=list(map(lambda y,z: get_or_content(y,z),train['title'],train['text'])) 59 | test['content']=list(map(lambda y,z: get_or_content(y,z),test['title'],test['text'])) 60 | 61 | def entity_clear_row(entity,content): 62 | entities = entity.split(';') 63 | entities.sort(key=lambda x: len(x)) 64 | n = len(entities) 65 | tmp = entities.copy() 66 | for i in range(n): 67 | entity_tmp = entities[i] 68 | #长度小于等于1 69 | if len(entity_tmp)<=1: 70 | tmp.remove(entity_tmp) 71 | continue 72 | if i + 1 >= n: 73 | break 74 | for entity_tmp2 in entities[i + 1:]: 75 | if entity_tmp2.find(entity_tmp) != -1 and ( 76 | entity_tmp2.find('?') != -1 or content.replace(entity_tmp2, '').find(entity_tmp) == -1): 77 | tmp.remove(entity_tmp) 78 | break 79 | return ';'.join(tmp) 80 | 81 | train['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),train['entity'],train['content'])) 82 | test['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),test['entity'],test['content'])) 83 | 84 | # 去掉空实体 85 | def duplicate_entity(entity): 86 | def is_empty(x): 87 | return (x != '') & (x != ' ') 88 | 89 | if entity is 
np.nan: 90 | return entity 91 | else: 92 | entity = filter(is_empty, entity.split(';')) 93 | return ';'.join(list(set(entity))) 94 | 95 | train['entity'] = train['entity'].apply(lambda index: duplicate_entity(index)) 96 | test['entity'] = test['entity'].apply(lambda index: duplicate_entity(index)) 97 | 98 | # 正则表达式清洗文本 99 | def delete_tag(s): 100 | 101 | s = re.sub('\{IMG:.?.?.?\}', '', s) #图片 102 | s = re.sub(re.compile(r'[a-zA-Z]+://[^\s]+'), '', s) #网址 103 | s = re.sub(re.compile('<.*?>'), '', s) #网页标签 104 | # s = re.sub(re.compile('&[a-zA-Z]+;?'), ' ', s) #网页标签 105 | # s = re.sub(re.compile('[a-zA-Z0-9]*[./]+[a-zA-Z0-9./]+[a-zA-Z0-9./]*'), ' ', s) 106 | # r4=re.compile('\d{4}[-/]\d{2}[-/]\d{2}') #日期 107 | # s=re.sub(r4,'某时',s) 108 | return s 109 | 110 | train['title'] = train['title'].apply(lambda x: delete_tag(x) if str(x)!='nan' else x) 111 | train['text'] = train['text'].apply(lambda x: delete_tag(x) if str(x)!='nan' else x) 112 | test['title'] = test['title'].apply(lambda x: delete_tag(x) if str(x)!='nan' else x) 113 | test['text'] = test['text'].apply(lambda x: delete_tag(x) if str(x)!='nan' else x) 114 | 115 | # 使用title来填充测试集中text的缺失值,text null:1 116 | train['title'] = train.apply(lambda index: index.text if index.title is np.nan else index.title, axis=1) 117 | test['title'] = test.apply(lambda index: index.text if index.title is np.nan else index.title, axis=1) 118 | train['text'] = train.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 119 | test['text'] = test.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 120 | 121 | # 选取非空样本 122 | train = train[train['entity'].notnull()] # train entity null:18 123 | test = test[test['entity'].notnull()] # test entity null:16 124 | 125 | # train 126 | train_id_entity = train[['id', 'entity']] 127 | train_id_entity['entity'] = train_id_entity['entity'].apply(lambda index: index.split(';')) 128 | ids, entity = [], [] 129 | for index in range(len(train_id_entity['entity'])): 130 | entity.extend(list(train_id_entity['entity'])[index]) 131 | ids.extend([list(train_id_entity['id'])[index]] * len(list(train_id_entity['entity'])[index])) 132 | train_id_entity = pd.DataFrame({'id': ids, 'entity_label': entity}) # train len:11448 133 | 134 | # test 135 | test_id_entity = test[['id', 'entity']] 136 | test_id_entity['entity'] = test_id_entity['entity'].apply(lambda index: index.split(';')) 137 | ids, entity = [], [] 138 | for index in range(len(test_id_entity['entity'])): 139 | entity.extend(list(test_id_entity['entity'])[index]) 140 | ids.extend([list(test_id_entity['id'])[index]] * len(list(test_id_entity['entity'])[index])) 141 | test_id_entity = pd.DataFrame({'id': ids, 'entity_label': entity}) # test len:11580 142 | 143 | # 144 | train.pop('negative') # 去掉negative列 145 | train = train.merge(train_id_entity, on='id', how='left') 146 | train['label'] = train.apply(lambda index: 0 if index.key_entity is np.nan else 1, axis=1) 147 | train['key_entity'] = train['key_entity'].fillna('') 148 | # train['label'] = train.apply(lambda index: 1 if index.key_entity.find(index.entity_label) != -1 else 0, axis=1) 149 | train['label'] = train.apply(lambda index: 1 if index.key_entity.split(';').count(index.entity_label) >= 1 else 0, axis=1) 150 | 151 | test = test.merge(test_id_entity, on='id', how='left') 152 | 153 | # 去除长度小于的1的entity 154 | train['entity_label_len'] = train['entity_label'].apply(lambda x: len(x)) 155 | test['entity_label_len'] = test['entity_label'].apply(lambda x: len(x)) 156 | 
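# Added descriptive note (not part of the original code.py): single-character candidate
# entities are dropped next, since they are too short to match reliably after the
# ';'-splitting above. The following block then locates each candidate entity in the
# title and text and, when it does not appear in the title and its first hit in the
# body lies beyond position 480, keeps a ~500-character window around that hit
# (text[idx-200:idx+300]) so the mention still fits inside the MAXLEN=510 corpus
# built further down.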
train = train[train['entity_label_len']>1] 157 | test = test[test['entity_label_len']>1] 158 | 159 | def get_first_index(row, flag): 160 | if flag=='title': 161 | return row['title'].find(row['entity_label']) 162 | else: 163 | return row['text'].find(row['entity_label']) 164 | 165 | train['title_first_index'] = train.apply(lambda row: get_first_index(row, 'title'), axis=1) 166 | train['text_first_index'] = train.apply(lambda row: get_first_index(row, 'text'), axis=1) 167 | 168 | test['title_first_index'] = test.apply(lambda row: get_first_index(row, 'title'), axis=1) 169 | test['text_first_index'] = test.apply(lambda row: get_first_index(row, 'text'), axis=1) 170 | 171 | def text_truncate(row): 172 | title_first_index = row['title_first_index'] 173 | text_first_index = row['text_first_index'] 174 | if title_first_index==-1 and text_first_index>480: 175 | return row['text'][text_first_index-200:text_first_index+300] 176 | else: 177 | return row['text'] 178 | 179 | train['text'] = train.apply(lambda row: text_truncate(row), axis=1) 180 | test['text'] = test.apply(lambda row: text_truncate(row), axis=1) 181 | 182 | def get_content(x, y, z): 183 | s='[E]' # E:Entity 184 | if str(x)!='nan': 185 | s+=x 186 | if str(y)!='nan' and str(z)!='nan' and y==z: 187 | s+='[S]' # S:Same 188 | s+=y 189 | else: 190 | s+='[T]' # T:Title 191 | if str(y)!='nan': 192 | s+=y 193 | s+='[C]' # C:Content 194 | if str(z)!='nan': 195 | s+=z 196 | #添加 197 | # if str(x)!='nan': 198 | # x_len=len(x) 199 | # end=len(s)-x_len 200 | # i=0 201 | # out='' 202 | # while i<=end: 203 | # if s[i:i+x_len]==x: 204 | # out+='$' 205 | # out+=x 206 | # out+="$" 207 | # i+=x_len 208 | # else: 209 | # out+=s[i] 210 | # i+=1 211 | # if i!=len(s): 212 | # out+=s[i:] 213 | # s=out 214 | return s 215 | 216 | # def get_content(x, y, z): 217 | # s='' # E:Entity 218 | # # if str(x)!='nan': 219 | # # s+=x 220 | # if str(y)!='nan' and str(z)!='nan' and y==z: 221 | # s+='[S]' # S:Same 222 | # s+=y 223 | # else: 224 | # s+='[T]' # T:Title 225 | # if str(y)!='nan': 226 | # s+=y 227 | # s+='[C]' # C:Content 228 | # if str(z)!='nan': 229 | # s+=z 230 | # #添加 231 | # # if str(x)!='nan': 232 | # # x_len=len(x) 233 | # # end=len(s)-x_len 234 | # # i=0 235 | # # out='' 236 | # # while i<=end: 237 | # # if s[i:i+x_len]==x: 238 | # # out+='$' 239 | # # out+=x 240 | # # out+="$" 241 | # # i+=x_len 242 | # # else: 243 | # # out+=s[i] 244 | # # i+=1 245 | # # if i!=len(s): 246 | # # out+=s[i:] 247 | # # s=out 248 | # return s 249 | 250 | train['corpus']=list(map(lambda x,y,z: get_content(x,y,z),tqdm(train['entity_label'].values),train['title'],train['text'])) 251 | test['corpus']=list(map(lambda x,y,z: get_content(x,y,z),tqdm(test['entity_label'].values),test['title'],test['text'])) 252 | 253 | def get_other_content(x,y): 254 | entitys=x.split(';') 255 | if len(entitys)<=1: 256 | return np.nan 257 | l=[] 258 | for e in entitys: 259 | if e!=y and e!='': 260 | l.append(e) 261 | return ';'.join(l) 262 | train['other_entity'] = list(map(lambda x, y: get_other_content(x, y), train['entity'], train['entity_label'])) 263 | test['other_entity'] = list(map(lambda x, y: get_other_content(x, y), test['entity'], test['entity_label'])) 264 | 265 | def get_content(x, y): 266 | if str(y) == 'nan': 267 | return x 268 | y = y.split(';') 269 | for i in y: 270 | # x=x.replace(i,'其他实体') 271 | x = 'O_E'.join(x.split(i)) # O_E:Other_Entity 272 | return x 273 | train['corpus'] = list(map(lambda x, y: get_content(x, y), train['corpus'], train['other_entity'])) 274 | test['corpus'] = 
list(map(lambda x, y: get_content(x, y), test['corpus'], test['other_entity'])) 275 | 276 | MAXLEN = 510 # 510 277 | 278 | bert_path = 'E:/NLP_corpus/BERT/hgd/chinese_roberta_wwm_ext_L-12_H-768_A-12/' 279 | config_path = bert_path + 'bert_config.json' 280 | checkpoint_path = bert_path + 'bert_model.ckpt' 281 | dict_path = bert_path + 'vocab.txt' 282 | 283 | # 给每个token按序编号,构建词表 284 | token_dict = {} 285 | with codecs.open(dict_path, 'r', 'utf8') as reader: 286 | for line in reader: 287 | token = line.strip() 288 | token_dict[token] = len(token_dict) 289 | 290 | # 分词器 291 | class OurTokenizer(Tokenizer): 292 | def _tokenize(self, text): 293 | R = [] 294 | for c in text: 295 | if c in self._token_dict: 296 | R.append(c) 297 | elif self._is_space(c): 298 | R.append('[unused1]') # space类用未经训练的[unused1]表示 299 | else: 300 | R.append('[UNK]') # 剩余的字符是[UNK] 301 | return R 302 | tokenizer = OurTokenizer(token_dict) 303 | 304 | # Padding,默认添 0 305 | def seq_padding(X, padding=0): 306 | L = [len(x) for x in X] 307 | ML = max(L) 308 | return np.array([np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X]) 309 | 310 | # 数据生成 311 | class data_generator: 312 | def __init__(self, data, batch_size=8, shuffle=True): # 8 313 | self.data = data 314 | self.batch_size = batch_size 315 | self.shuffle = shuffle 316 | self.steps = len(self.data) // self.batch_size # 迭代完一个epoch需要的步数 317 | if len(self.data) % self.batch_size != 0: # 保证步数为整数 318 | self.steps += 1 319 | 320 | def __len__(self): 321 | return self.steps 322 | 323 | def __iter__(self): 324 | while True: 325 | idxs = list(range(len(self.data))) 326 | if self.shuffle: 327 | np.random.shuffle(idxs) 328 | 329 | X1, X2, Y = [], [], [] 330 | for i in idxs: 331 | d = self.data[i] 332 | text = d[0][:MAXLEN] 333 | x1, x2 = tokenizer.encode(first=text) 334 | y = d[1] 335 | X1.append(x1) 336 | X2.append(x2) 337 | # Y.append([y]) 338 | Y.append(y) 339 | if len(X1) == self.batch_size or i == idxs[-1]: 340 | X1 = seq_padding(X1) 341 | X2 = seq_padding(X2) 342 | Y = seq_padding(Y) 343 | yield [X1, X2], Y#[:, 0, :] 344 | X1, X2, Y = [], [], [] 345 | 346 | # 计算:最高的k分类准确率 347 | def acc_top2(y_true, y_pred): 348 | return top_k_categorical_accuracy(y_true, y_pred, k=2) 349 | 350 | # 计算:F1值 351 | def f1_metric(y_true, y_pred): 352 | ''' 353 | metric from here 354 | https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras 355 | ''' 356 | def recall(y_true, y_pred): 357 | """Recall metric. 358 | Only computes a batch-wise average of recall. 359 | Computes the recall, a metric for multi-label classification of 360 | how many relevant items are selected. 361 | """ 362 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 363 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 364 | recall = true_positives / (possible_positives + K.epsilon()) 365 | return recall 366 | 367 | def precision(y_true, y_pred): 368 | """Precision metric. 369 | 370 | Only computes a batch-wise average of precision. 371 | 372 | Computes the precision, a metric for multi-label classification of 373 | how many selected items are relevant. 
374 | """ 375 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 376 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 377 | precision = true_positives / (predicted_positives + K.epsilon()) 378 | return precision 379 | precision = precision(y_true, y_pred) 380 | recall = recall(y_true, y_pred) 381 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 382 | 383 | # BERT模型建立 384 | def build_bert(nclass): 385 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None) 386 | 387 | for layer in bert_model.layers: 388 | # print(l) 389 | layer.trainable = True 390 | 391 | # inputs 392 | x1_in = Input(shape=(None,)) 393 | x2_in = Input(shape=(None,)) 394 | 395 | x = bert_model([x1_in, x2_in]) 396 | # print('Bert output shape', x.shape) 397 | x = Lambda(lambda x: x[:, 0])(x) 398 | 399 | # outputs 400 | p = Dense(nclass, activation='softmax')(x) 401 | 402 | # 模型建立与编译 403 | model = Model([x1_in, x2_in], p) 404 | model.compile(loss='categorical_crossentropy', 405 | optimizer=Adam(1e-5), 406 | metrics=['accuracy', f1_metric, categorical_accuracy]) 407 | print(model.summary()) 408 | return model 409 | 410 | from keras.callbacks import Callback 411 | from sklearn.metrics import f1_score,accuracy_score 412 | 413 | learning_rate = 5e-5 414 | min_learning_rate = 1e-5 415 | 416 | class Evaluate(Callback): 417 | def __init__(self): 418 | self.best = 0. 419 | self.passed = 0 420 | 421 | def on_batch_begin(self, batch, logs=None): 422 | """第一个epoch用来warmup,第二个epoch把学习率降到最低 423 | """ 424 | if self.passed < self.params['steps']: 425 | lr = (self.passed + 1.) / self.params['steps'] * learning_rate 426 | K.set_value(self.model.optimizer.lr, lr) 427 | self.passed += 1 428 | elif self.params['steps'] <= self.passed < self.params['steps'] * 2: 429 | lr = (2 - (self.passed + 1.) 
/ self.params['steps']) * (learning_rate - min_learning_rate) 430 | lr += min_learning_rate 431 | K.set_value(self.model.optimizer.lr, lr) 432 | self.passed += 1 433 | 434 | # 训练集去重 435 | train.drop_duplicates('corpus', inplace=True) 436 | 437 | DATA_LIST = [] 438 | for data_row in train.iloc[:].itertuples(): 439 | DATA_LIST.append((data_row.corpus, to_categorical(data_row.label, 2))) 440 | DATA_LIST = np.array(DATA_LIST) 441 | 442 | DATA_LIST_TEST = [] 443 | for data_row in test.iloc[:].itertuples(): 444 | DATA_LIST_TEST.append((data_row.corpus, to_categorical(0, 2))) 445 | DATA_LIST_TEST = np.array(DATA_LIST_TEST) 446 | 447 | f1 = [] 448 | def run_cv(nfolds, data, data_label, data_test, epochs=10, date_str='1107'): 449 | skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=214683).split(data, train['label']) 450 | train_model_pred = np.zeros((len(data), 2)) 451 | test_model_pred = np.zeros((len(data_test), 2)) 452 | 453 | for i, (train_fold, test_fold) in enumerate(skf): 454 | print('Fold: ', i+1) 455 | 456 | '''数据部分''' 457 | # 数据划分 458 | X_train, X_valid, = data[train_fold, :], data[test_fold, :] 459 | train_D = data_generator(X_train, shuffle=True) 460 | valid_D = data_generator(X_valid, shuffle=False) 461 | test_D = data_generator(data_test, shuffle=False) 462 | 463 | time_now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') 464 | '''模型部分''' 465 | # 生成模型 466 | model = build_bert(2) 467 | # callbacks 468 | early_stopping = EarlyStopping(monitor='val_f1_metric', patience=3) # val_acc 469 | plateau = ReduceLROnPlateau(monitor="val_f1_metric", verbose=1, mode='max', factor=0.5, patience=1) # max:未上升则降速 470 | checkpoint = ModelCheckpoint('./models/keras_model/fusai' + date_str + str(i) + '.hdf5', monitor='val_f1_metric', 471 | verbose=2, save_best_only=True, mode='max',save_weights_only=True) # period=1: 每1轮保存 472 | 473 | evaluator = Evaluate() 474 | 475 | # 模型训练,使用生成器方式训练 476 | model.fit_generator( 477 | train_D.__iter__(), 478 | steps_per_epoch=len(train_D), ## ?? 
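# Added descriptive note (not part of the original code.py), answering the '## ??'
# marker on this argument: len(train_D) calls data_generator.__len__, which returns
# ceil(len(data) / batch_size) (the self.steps computed in __init__), so
# steps_per_epoch corresponds to exactly one pass over the training fold per epoch.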
## 479 | epochs=epochs, 480 | validation_data=valid_D.__iter__(), 481 | validation_steps=len(valid_D), 482 | callbacks=[early_stopping, plateau, checkpoint, evaluator], # evaluator, 483 | verbose=2 484 | ) 485 | 486 | model.load_weights('./models/keras_model/fusai' + date_str + str(i) + '.hdf5') 487 | 488 | # return model 489 | val = model.predict_generator(valid_D.__iter__(), steps=len(valid_D), verbose=0) 490 | print(val) 491 | 492 | score = f1_score(train['label'].values[test_fold], np.argmax(val, axis=1)) 493 | global f1 494 | f1.append(score) 495 | print('validate {} f1_score:{}'.format(i+1, score)) 496 | 497 | train_model_pred[test_fold, :] = val 498 | test_model_pred += model.predict_generator(test_D.__iter__(), steps=len(test_D),verbose=0) 499 | 500 | del model 501 | gc.collect() 502 | 503 | K.clear_session() 504 | # break 505 | 506 | return train_model_pred, test_model_pred 507 | 508 | 509 | start_time = time.time() 510 | 511 | n_folds = 10 512 | folds_num = str(n_folds) + 'folds_' 513 | date_str = '1114' 514 | strategy = '_withprocess_chusai&fusaidata_' 515 | model = 'robeta_large' 516 | 517 | train_model_pred, test_model_pred = run_cv(n_folds, DATA_LIST, None, DATA_LIST_TEST, date_str=date_str) 518 | print('Validate 5folds average f1 score:', np.average(f1)) 519 | np.save('weights/keras_weight/fusai/train' + model + strategy + folds_num + date_str + '.npy', train_model_pred) 520 | np.save('weights/keras_weight/fusai/test' + model + strategy + folds_num + date_str + '.npy', test_model_pred) 521 | 522 | end_time = time.time() 523 | print('Time cost(min): ', (end_time-start_time)/60) 524 | 525 | 526 | def return_list(group): 527 | return ';'.join(list(group)) 528 | 529 | sub = test.copy() 530 | sub['label'] = [np.argmax(index) for index in test_model_pred] 531 | sub_label = sub[sub['label'] == 1].groupby(['id'], as_index=False)['entity_label'].agg({'key_entity': return_list}) 532 | 533 | test_2 = pd.read_csv('datasets/round2_test.csv', encoding='utf-8') # 导入测试集 534 | submit = test_2[['id']] 535 | submit = submit.merge(sub_label, on='id', how='left') 536 | submit['negative'] = submit['key_entity'].apply(lambda index: 0 if index is np.nan else 1) 537 | submit = submit[['id', 'negative', 'key_entity']] 538 | 539 | time_now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') 540 | print(time_now) 541 | submit.to_csv('submission/' + model + strategy + folds_num + '{}.csv'.format(time_now), encoding='utf-8', index=None) 542 | 543 | -------------------------------------------------------------------------------- /scheme2/code2.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding:utf-8 -*- 2 | import os 3 | import re 4 | import gc 5 | import sys 6 | import json 7 | import codecs 8 | import random 9 | import warnings 10 | import numpy as np 11 | import pandas as pd 12 | from tqdm import tqdm 13 | from random import choice 14 | import tensorflow as tf 15 | import matplotlib.pyplot as plt 16 | from collections import Counter 17 | from sklearn.model_selection import KFold 18 | from sklearn.preprocessing import LabelEncoder 19 | from sklearn.metrics import f1_score, accuracy_score 20 | from sklearn.model_selection import StratifiedKFold 21 | from sklearn.model_selection import train_test_split 22 | 23 | import keras.backend as K 24 | from keras.layers import * 25 | from keras.callbacks import * 26 | from keras.models import Model 27 | from keras.optimizers import Adam 28 | from keras.initializers import glorot_uniform 29 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer 30 | 31 | 32 | tqdm.pandas() 33 | seed = 2019 34 | random.seed(seed) 35 | tf.set_random_seed(seed) 36 | np.random.seed(seed) 37 | warnings.filterwarnings('ignore') 38 | 39 | 40 | 41 | ################################################################ 42 | data_path = './data/' 43 | 44 | train = pd.read_csv(data_path + 'Round2_train.csv', encoding='utf-8') 45 | train2= pd.read_csv(data_path + './Train_Data.csv', encoding='utf-8') 46 | train=pd.concat([train, train2], axis=0, sort=True) 47 | test = pd.read_csv(data_path + 'round2_test.csv', encoding='utf-8') 48 | 49 | train = train[train['entity'].notnull()] 50 | test = test[test['entity'].notnull()] 51 | 52 | train=train.drop_duplicates(['title','text','entity','negative','key_entity']) # 去掉重复的data 53 | 54 | print(train.shape) ###(10526, 6) 55 | print(test.shape) ####((9997, 4) 56 | 57 | 58 | def get_or_content(y,z): 59 | s='' 60 | if str(y)!='nan': 61 | s+=y 62 | if str(z)!='nan': 63 | s+=z 64 | return s 65 | 66 | #获取title+text 67 | train['content']=list(map(lambda y,z: get_or_content(y,z),train['title'],train['text'])) 68 | test['content']=list(map(lambda y,z: get_or_content(y,z),test['title'],test['text'])) 69 | 70 | 71 | def entity_clear_row(entity,content): 72 | entities = entity.split(';') 73 | entities.sort(key=lambda x: len(x)) 74 | n = len(entities) 75 | tmp = entities.copy() 76 | for i in range(n): 77 | entity_tmp = entities[i] 78 | #长度小于等于1 79 | if len(entity_tmp)<=1: 80 | tmp.remove(entity_tmp) 81 | continue 82 | if i + 1 >= n: 83 | break 84 | for entity_tmp2 in entities[i + 1:]: 85 | if entity_tmp2.find(entity_tmp) != -1 and ( 86 | entity_tmp2.find('?') != -1 or content.replace(entity_tmp2, '').find(entity_tmp) == -1): 87 | tmp.remove(entity_tmp) 88 | break 89 | return ';'.join(tmp) 90 | 91 | train['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),train['entity'],train['content'])) 92 | test['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),test['entity'],test['content'])) 93 | 94 | test['text'] = test.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 95 | train = train[(train['entity'].notnull()) & (train['negative'] == 1)] ### 96 | 97 | emotion = pd.read_csv('./submit/sub_qinggan_vote20191109_score0392098.csv', encoding='utf-8') 98 | emotion = emotion[emotion['negative'] == 1] 99 | test = emotion.merge(test, on='id', how='left') 100 | 101 | 102 | ################################################################ 103 | train_id_entity = train[['id', 'entity']] 104 | train_id_entity['entity'] = train_id_entity['entity'].apply(lambda 
index: index.split(';')) 105 | id, entity = [], [] 106 | for index in range(len(train_id_entity['entity'])): 107 | entity.extend(list(train_id_entity['entity'])[index]) 108 | id.extend([list(train_id_entity['id'])[index]] * len(list(train_id_entity['entity'])[index])) 109 | 110 | train_id_entity = pd.DataFrame({'id': id, 'entity_label': entity}) 111 | 112 | test_id_entity = test[['id', 'entity']] 113 | test_id_entity['entity'] = test_id_entity['entity'].apply(lambda index: index.split(';')) 114 | id, entity = [], [] 115 | for index in range(len(test_id_entity['entity'])): 116 | entity.extend(list(test_id_entity['entity'])[index]) 117 | id.extend([list(test_id_entity['id'])[index]] * len(list(test_id_entity['entity'])[index])) 118 | 119 | test_id_entity = pd.DataFrame({'id': id, 'entity_label': entity}) 120 | 121 | train = train.merge(train_id_entity, on='id', how='left') 122 | train['flag'] = train.apply(lambda index: 1 if index.key_entity.split(';').count(index.entity_label) >= 1 else 0, axis=1) 123 | test = test.merge(test_id_entity, on='id', how='left') 124 | 125 | ################################################################ 126 | print(train.shape) 127 | print(test.shape) 128 | 129 | def extract_feature(data): 130 | data['sub_word_num'] = data.apply(lambda index: index.entity.count(index.entity_label) - 1, axis=1) 131 | data['question_mark_num'] = data['entity_label'].apply(lambda index: index.count('?')) 132 | data['occur_in_title_num'] = data.apply(lambda index: 0 if index.title is np.nan else index.title.count(index.entity_label), axis=1) 133 | data['occur_in_text_num'] = data.apply(lambda index: 0 if index.text is np.nan else index.text.count(index.entity_label), axis=1) 134 | data['occur_in_partial_text_num'] = data.apply(lambda index: 0 if index.text is np.nan else index.text[:507].count(index.entity_label), axis=1) 135 | data['occur_in_entity'] = data.apply(lambda index: 0 if index.text is np.nan else index.entity.count(index.entity_label) - 1, axis=1) 136 | data['is_occur_in_article'] = data.apply(lambda index: 1 if (index.occur_in_title_num >= 1) | (index.occur_in_text_num >= 1) else 0, axis=1) 137 | return data 138 | 139 | train = extract_feature(train) 140 | test = extract_feature(test) 141 | print(train.columns) 142 | 143 | 144 | train['entity_len'] = train['entity_label'].progress_apply(lambda index: len(index)) 145 | test['entity_len'] = test['entity_label'].progress_apply(lambda index: len(index)) 146 | 147 | train[train['entity_len'] == 1].shape 148 | train = train[train['entity_len'] > 1] 149 | 150 | test[test['entity_len'] == 1].shape 151 | test = test[test['entity_len'] > 1] 152 | 153 | train_feature = train[['sub_word_num', 'question_mark_num', 'occur_in_title_num', 'occur_in_text_num', 'is_occur_in_article', 'occur_in_entity', 'occur_in_partial_text_num']] 154 | test_feature = test[['sub_word_num', 'question_mark_num', 'occur_in_title_num', 'occur_in_text_num', 'is_occur_in_article', 'occur_in_entity', 'occur_in_partial_text_num']] 155 | 156 | # Normalization 157 | from sklearn.preprocessing import MinMaxScaler 158 | scaler = MinMaxScaler() 159 | train_feature = scaler.fit_transform(train_feature) 160 | test_feature = scaler.fit_transform(test_feature) 161 | 162 | def get_other_content(x,y): 163 | entitys=x.split(";") 164 | if len(entitys)<=1: 165 | return np.nan 166 | l=[] 167 | for e in entitys: 168 | if e!=y: 169 | l.append(e) 170 | return ';'.join(l) 171 | 172 | train['other_entity']=list(map(lambda x,y 
:get_other_content(x,y),train['entity'],train['entity_label'])) 173 | test['other_entity']=list(map(lambda x,y :get_other_content(x,y),test['entity'],test['entity_label'])) 174 | 175 | def get_content(x,y): 176 | if str(y)=='nan': 177 | return x 178 | y=y.split(";") 179 | y = sorted(y, key=lambda i:len(i),reverse=True) 180 | for i in y: 181 | x = '其他实体'.join(x.split(i)) 182 | return x 183 | 184 | train['text']=list(map(lambda x,y: get_content(x,y), train['text'], train['other_entity'])) 185 | test['text']=list(map(lambda x,y: get_content(x,y), test['text'], test['other_entity'])) 186 | 187 | maxlen = 509 188 | bert_path = 'E:/chinese_wwm_ext_L-12_H-768_A-12/' # chinese_L-12_H-768_A-12 chinese_wwm_ext_L-12_H-768_A-12 189 | config_path = bert_path + 'bert_config.json' 190 | checkpoint_path = bert_path + 'bert_model.ckpt' 191 | dict_path = bert_path + 'vocab.txt' 192 | 193 | token_dict = {} 194 | with codecs.open(dict_path, 'r', 'utf8') as reader: 195 | for line in reader: 196 | token = line.strip() 197 | token_dict[token] = len(token_dict) # 给每个token 按序编号 198 | 199 | class OurTokenizer(Tokenizer): 200 | def _tokenize(self, text): 201 | R = [] 202 | for c in text: 203 | if c in self._token_dict: 204 | R.append(c) 205 | elif self._is_space(c): 206 | R.append('[unused1]') # space类用未经训练的[unused1]表示 207 | else: 208 | R.append('[UNK]') # 剩余的字符是[UNK] 209 | return R 210 | 211 | tokenizer = OurTokenizer(token_dict) 212 | 213 | def seq_padding(X, padding=0): 214 | L = [len(x) for x in X] 215 | ML = max(L) 216 | return np.array([np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X]) 217 | 218 | class data_generator: 219 | def __init__(self, data, feature, batch_size=8, shuffle=True): # 8 220 | self.data = data 221 | self.batch_size = batch_size 222 | self.shuffle = shuffle 223 | self.feature = feature 224 | self.steps = len(self.data) // self.batch_size 225 | if len(self.data) % self.batch_size != 0: 226 | self.steps += 1 227 | def __len__(self): 228 | return self.steps 229 | def __iter__(self): 230 | while True: 231 | idxs = list(range(len(self.data))) 232 | 233 | if self.shuffle: 234 | np.random.shuffle(idxs) 235 | 236 | X1, X2, Y, Fea = [], [], [], [] 237 | for i in idxs: 238 | d = self.data[i] 239 | fea = self.feature[i] # add feature 240 | first_text = d[0] 241 | second_text = d[2][:maxlen - d[1]] 242 | x1, x2 = tokenizer.encode(first=first_text, second=second_text) # , max_len=512 243 | y = d[3] 244 | Fea.append(fea) 245 | X1.append(x1) 246 | X2.append(x2) 247 | Y.append([y]) 248 | if len(X1) == self.batch_size or i == idxs[-1]: 249 | X1 = seq_padding(X1) 250 | X2 = seq_padding(X2, padding=1) 251 | Fea = seq_padding(Fea) 252 | Y = seq_padding(Y) 253 | yield [X1, X2, Fea], Y[:, 0, :] 254 | [X1, X2, Y, Fea] = [], [], [], [] 255 | 256 | from keras.metrics import top_k_categorical_accuracy 257 | from keras.metrics import categorical_accuracy 258 | 259 | def acc_top2(y_true, y_pred): 260 | return top_k_categorical_accuracy(y_true, y_pred, k=1) 261 | 262 | 263 | def f1_metric(y_true, y_pred): 264 | def recall(y_true, y_pred): 265 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 266 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 267 | recall = true_positives / (possible_positives + K.epsilon()) 268 | return recall 269 | 270 | def precision(y_true, y_pred): 271 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 272 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 273 | precision = true_positives / (predicted_positives + 
K.epsilon()) 274 | return precision 275 | precision = precision(y_true, y_pred) 276 | recall = recall(y_true, y_pred) 277 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 278 | 279 | 280 | def build_bert(nclass): 281 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None) 282 | 283 | for l in bert_model.layers: 284 | # print(l) 285 | l.trainable = True 286 | 287 | x1_in = Input(shape=(None,)) 288 | x2_in = Input(shape=(None,)) 289 | x3_in = Input(shape=(train_feature.shape[1],)) 290 | 291 | feature = Dense(64, activation='relu')(x3_in) 292 | 293 | x = bert_model([x1_in, x2_in]) 294 | x = Lambda(lambda x: x[:, 0])(x) 295 | x = concatenate([x, feature]) 296 | p = Dense(nclass, activation='softmax')(x) 297 | 298 | model = Model([x1_in, x2_in, x3_in], p) 299 | model.compile(loss='categorical_crossentropy', 300 | optimizer=Adam(1e-5), # lr: 5e-5 3e-5 2e-5 epoch: 3, 4 batch_size: 16, 32 301 | metrics=['accuracy', f1_metric]) # categorical_accuracy 302 | print(model.summary()) 303 | return model 304 | 305 | 306 | ################################################################ 307 | from keras.utils import to_categorical 308 | 309 | DATA_LIST = [] 310 | for data_row in train.iloc[:].itertuples(): 311 | DATA_LIST.append((data_row.entity_label, data_row.entity_len, data_row.text, to_categorical(data_row.flag, 2))) 312 | DATA_LIST = np.array(DATA_LIST) 313 | 314 | DATA_LIST_TEST = [] 315 | for data_row in test.iloc[:].itertuples(): 316 | DATA_LIST_TEST.append((data_row.entity_label, data_row.entity_len, data_row.text, to_categorical(0, 2))) 317 | DATA_LIST_TEST = np.array(DATA_LIST_TEST) 318 | ################################################################ 319 | 320 | f1, acc = [], [] 321 | def run_cv(nfold, data, feature_train, data_label, data_test, feature_test): 322 | kf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed).split(data, train['flag']) 323 | train_model_pred = np.zeros((len(data), 2)) # 2 324 | test_model_pred = np.zeros((len(data_test), 2)) # 2 325 | 326 | for i, (train_fold, test_fold) in enumerate(kf): 327 | X_train, X_valid, = data[train_fold, :], data[test_fold, :] 328 | X_train_fea, X_valid_fea = feature_train[train_fold, :], feature_train[test_fold, :] 329 | 330 | model = build_bert(2) # 2 331 | early_stopping = EarlyStopping(monitor='val_acc', patience=2) # val_acc 332 | plateau = ReduceLROnPlateau(monitor="val_acc", verbose=1, mode='max', factor=0.5, patience=1) 333 | checkpoint = ModelCheckpoint('./model/' + str(i) + '.hdf5', monitor='val_acc', 334 | verbose=2, save_best_only=True, mode='max',save_weights_only=True) 335 | 336 | train_D = data_generator(X_train, X_train_fea, shuffle=True) 337 | valid_D = data_generator(X_valid, X_valid_fea, shuffle=False) 338 | test_D = data_generator(data_test, feature_test, shuffle=False) 339 | 340 | model.fit_generator( 341 | train_D.__iter__(), 342 | steps_per_epoch=len(train_D), ## ?? 
## 343 | epochs=10, 344 | validation_data=valid_D.__iter__(), 345 | validation_steps=len(valid_D), 346 | callbacks=[early_stopping, plateau, checkpoint], 347 | verbose=2 348 | ) 349 | 350 | model.load_weights('./model/' + str(i) + '.hdf5') 351 | 352 | # return model 353 | val = model.predict_generator(valid_D.__iter__(), steps=len(valid_D),verbose=0) 354 | 355 | print(val) 356 | score = f1_score(train['flag'].values[test_fold], np.argmax(val, axis=1)) 357 | acc_score = accuracy_score(train['flag'].values[test_fold], np.argmax(val, axis=1)) 358 | global f1, acc 359 | f1.append(score) 360 | acc.append(acc_score) 361 | print('validate f1 score:', score) 362 | print('validate accuracy score:', acc_score) 363 | 364 | train_model_pred[test_fold, :] = val 365 | test_model_pred += model.predict_generator(test_D.__iter__(), steps=len(test_D),verbose=0) 366 | 367 | del model; gc.collect() 368 | K.clear_session() 369 | return train_model_pred, test_model_pred 370 | 371 | 372 | ################################################################ 373 | train_model_pred, test_model_pred = run_cv(10, DATA_LIST, train_feature, None, DATA_LIST_TEST, test_feature) 374 | print('validate aver f1 score:', np.average(f1)) 375 | print('validate aver accuracy score:', np.average(acc)) 376 | np.save('weights/bert_prob_train_binary_label_add_feature_extend_trainSet-PreProcess-roberta-large.npy', train_model_pred) 377 | np.save('weights/bert_prob_test_binary_label_add_feature_extend_trainSet-PreProcess-roberta-large.npy', test_model_pred) 378 | ################################################################ 379 | 380 | # 结果一 # 381 | def return_list(group): 382 | return ';'.join(list(group)) 383 | 384 | sub = test.copy() 385 | sub['label'] = [np.argmax(index) for index in test_model_pred] 386 | 387 | test_2 = pd.read_csv(data_path + 'round2_test.csv', encoding='utf-8') 388 | submit = test_2[['id']] 389 | 390 | sub = sub[sub['label'] == 1] 391 | key_entity = sub.groupby(['id'], as_index=False)['entity_label'].agg({'key_entity': return_list}) 392 | 393 | submit = submit.merge(key_entity, on='id', how='left') 394 | submit['negative'] = submit['key_entity'].apply(lambda index: 0 if index is np.nan else 1) 395 | submit = submit[['id', 'negative', 'key_entity']] 396 | submit.to_csv('submit/sub_binary_label_roberta-large.csv', encoding='utf-8', index=None) 397 | print(submit[submit['key_entity'].notnull()].shape) 398 | 399 | 400 | ################################################################ 401 | # 结果二 # 402 | def return_list(group): 403 | return ';'.join(list(group)) 404 | 405 | sub = test.copy() 406 | sub['label'] = [np.argmax(index) for index in test_model_pred] 407 | test['prob'] = [index[1] for index in test_model_pred] 408 | 409 | sub = sub[sub['label'] == 1] 410 | key_entity = sub.groupby(['id'], as_index=False)['entity_label'].agg({'key_entity': return_list}) 411 | 412 | sub_id = set(test['id']) - set(key_entity['id']) 413 | sub_test = test[test['id'].isin(sub_id)] 414 | sub_test = sub_test.sort_values(by=['id', 'prob'], ascending=False).drop_duplicates(['id'], keep='first') 415 | sub_test['key_entity'] = sub_test['entity_label'] 416 | key_entity = pd.concat([key_entity, sub_test[['id', 'key_entity']]], axis=0, ignore_index=True) 417 | 418 | test_2 = pd.read_csv(data_path + 'round2_test.csv', encoding='utf-8') 419 | submit = test_2[['id']] 420 | 421 | submit = submit.merge(key_entity, on='id', how='left') 422 | submit['negative'] = submit['key_entity'].apply(lambda index: 0 if index is np.nan else 1) 423 | submit = 
submit[['id', 'negative', 'key_entity']] 424 | submit.to_csv('submit/sub_binary_label_roberta-large_all_neg_samples.csv', encoding='utf-8', index=None) 425 | print(submit[submit['key_entity'].notnull()].shape) 426 | 427 | -------------------------------------------------------------------------------- /scheme3/code3.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | import os 3 | import re 4 | import gc 5 | import sys 6 | import json 7 | import codecs 8 | import warnings 9 | import numpy as np 10 | import pandas as pd 11 | from keras import initializers 12 | from tqdm import tqdm 13 | from random import choice 14 | import matplotlib.pyplot as plt 15 | from collections import Counter 16 | from sklearn.metrics import f1_score 17 | from sklearn.model_selection import KFold 18 | from sklearn.model_selection import KFold 19 | from sklearn.preprocessing import LabelEncoder 20 | from sklearn.model_selection import StratifiedKFold 21 | from sklearn.model_selection import train_test_split 22 | 23 | import keras.backend as K 24 | from keras.layers import * 25 | from keras.callbacks import * 26 | from keras.models import Model 27 | from keras.optimizers import Adam 28 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer 29 | from keras_tqdm import TQDMNotebookCallback 30 | tqdm.pandas() 31 | np.random.seed(123) 32 | warnings.filterwarnings('ignore') 33 | 34 | import os 35 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" #指定gpu 36 | 37 | 38 | data_path = '../data/' #数据地址 39 | 40 | train_path=data_path + 'Round2_train.csv' 41 | train2_path=data_path + 'Train_data.csv' 42 | test_path=data_path + 'round2_test.csv' 43 | 44 | maxlen = 1 # 510 45 | learning_rate = 5e-5 46 | min_learning_rate = 1e-5 47 | batch_size=20 48 | 49 | 50 | 51 | save_model_path='zy' 52 | save_mdoel_name_pre='large' 53 | 54 | bert_path ='E:/code/pre_model/bert/chinese_L-12_H-768_A-12/' #模型地址 55 | 56 | config_path = bert_path + 'bert_config.json' 57 | checkpoint_path = bert_path + 'bert_model.ckpt' 58 | dict_path = bert_path + 'vocab.txt' 59 | 60 | 61 | #################################################load data######################################### 62 | train = pd.read_csv(train_path, encoding='utf-8') 63 | train2= pd.read_csv(train2_path, encoding='utf-8') 64 | 65 | 66 | train=pd.concat([train,train2],axis=0,sort=True) 67 | test = pd.read_csv(test_path, encoding='utf-8') 68 | test['text'] = test.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 69 | 70 | train = train[train['entity'].notnull()] 71 | test = test[test['entity'].notnull()] 72 | 73 | train=train.drop_duplicates(['title','text','entity','negative','key_entity']) #去掉重复的data 74 | 75 | print(train.shape) 76 | print(test.shape) 77 | 78 | #########################################删选实体#################################################################### 79 | '大家一样的函数处理' 80 | def get_or_content(y,z): 81 | s='' 82 | if str(y)!='nan': 83 | s+=y 84 | if str(z)!='nan': 85 | s+=z 86 | return s 87 | 88 | #获取title+text 89 | train['content']=list(map(lambda y,z: get_or_content(y,z),train['title'],train['text'])) 90 | test['content']=list(map(lambda y,z: get_or_content(y,z),test['title'],test['text'])) 91 | 92 | 93 | def entity_clear_row(entity,content): 94 | entities = entity.split(';') 95 | entities.sort(key=lambda x: len(x)) 96 | n = len(entities) 97 | tmp = entities.copy() 98 | for i in range(n): 99 | entity_tmp = entities[i] 100 | #长度小于等于1 101 | if len(entity_tmp)<=1: 102 | 
tmp.remove(entity_tmp) 103 | continue 104 | if i + 1 >= n: 105 | break 106 | for entity_tmp2 in entities[i + 1:]: 107 | if entity_tmp2.find(entity_tmp) != -1 and ( 108 | entity_tmp2.find('?') != -1 or content.replace(entity_tmp2, '').find(entity_tmp) == -1): 109 | tmp.remove(entity_tmp) 110 | break 111 | return ';'.join(tmp) 112 | 113 | train['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),train['entity'],train['content'])) 114 | test['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),test['entity'],test['content'])) 115 | 116 | 117 | #####################################################句子预处理#################################################### 118 | def delete_tag(s): 119 | r1 = re.compile(r'\{IMG:.?.?.?\}') # 图片 120 | s = re.sub(r1, '', s) 121 | r2 = re.compile(r'[a-zA-Z]+://[^\u4e00-\u9fa5|\?]+') # 网址 122 | s = re.sub(r2, '', s) 123 | r3 = re.compile(r'<.*?>') # 网页标签 124 | s = re.sub(r3, '', s) 125 | r4 = re.compile(r'&[a-zA-Z0-9]{1,4}') #   > &type &rdqu .... 126 | s = re.sub(r4, '', s) 127 | r5 = re.compile(r'[0-9a-zA-Z]+@[0-9a-zA-Z]+') # 邮箱 128 | s = re.sub(r5, '', s) 129 | r6 = re.compile(r'[#]') # #号 130 | s = re.sub(r6, '', s) 131 | return s 132 | 133 | 134 | train['title'] = train['title'].apply(lambda x: delete_tag(x) if str(x) != 'nan' else x) 135 | train['text'] = train['text'].apply(lambda x: delete_tag(x) if str(x) != 'nan' else x) 136 | test['title'] = test['title'].apply(lambda x: delete_tag(x) if str(x) != 'nan' else x) 137 | test['text'] = test['text'].apply(lambda x: delete_tag(x) if str(x) != 'nan' else x) 138 | 139 | 140 | ###############################################获取content################################################# 141 | def get_or_content(y, z): 142 | s = '' 143 | if str(y) != 'nan' and str(z) != 'nan' and y == z: 144 | s += '标题和内容相同,是' 145 | s += y 146 | else: 147 | s += '标题是' 148 | if str(y) != 'nan': 149 | if len(y) > 172: 150 | s += y[:172] 151 | else: 152 | s += y 153 | else: 154 | s += '无' 155 | s += '内容是' 156 | if str(z) != 'nan': 157 | s += z 158 | return s 159 | 160 | 161 | train['content'] = list(map(lambda y, z: get_or_content(y, z), train['title'], train['text'])) 162 | test['content'] = list(map(lambda y, z: get_or_content(y, z), test['title'], test['text'])) 163 | 164 | 165 | ####################################################mark label################################################## 166 | def get_id_entity(data): 167 | right_id, right_entity = [], [] 168 | for row in data.itertuples(): 169 | entities=row.entity.strip(';').split(';') 170 | entities.sort(key=lambda x:len(x)) #排序 171 | entities_num=len(entities) 172 | for index,entity in enumerate(entities): 173 | right_entity.append(entity) 174 | right_id.append(row.id) 175 | return pd.DataFrame({'id': right_id, 'entity_label': right_entity}) 176 | 177 | train_id_entity =get_id_entity(train[['id', 'entity','content']]) 178 | test_id_entity = get_id_entity(test[['id', 'entity','content']]) 179 | 180 | print('success') 181 | # train.pop('negative') 182 | train = train.merge(train_id_entity, on='id', how='left') 183 | train['label'] = train.apply(lambda index: 0 if index.key_entity is np.nan else 1, axis=1) 184 | train['key_entity'] = train['key_entity'].fillna('') 185 | train['label'] = train.apply(lambda index: 1 if index.key_entity.split(';').count(index.entity_label) >= 1 else 0, axis=1) 186 | 187 | test = test.merge(test_id_entity, on='id', how='left') 188 | 189 | print(test.shape) 190 | print(train.shape) 191 | 192 | 
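# Added illustrative sketch (not part of the original code3.py): the section below
# builds a natural-language style corpus for each (id, candidate entity) pair, roughly
# "实体是<entity> ... 标题是<title> ... 内容是<text>", capping the title at 172
# characters, handling the title==text case, truncating the whole string to 700
# characters, and later replacing the other candidate entities with the placeholder
# '其他实体'. A toy version of the same template, with made-up values:

def _toy_corpus(entity, title, text, max_len=700):
    # simplified mirror of the template used below: entity marker + title + body, hard-capped
    s = '实体是' + entity
    s += '标题是' + (title[:172] if title else '无')
    s += '内容是' + (text if text else '无')
    return s[:max_len]

# Example (hypothetical row):
# print(_toy_corpus('某公司', '某公司被曝逾期', '据报道,某公司旗下平台出现兑付困难。'))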
####################################获取预料############################################################ 193 | 194 | 'tttttt 获取实体出现的位置 前510' 195 | 196 | 197 | def get_new_content(entity, content): 198 | len_append = len('实体是' + entity) 199 | if len_append + len(content) > 510: 200 | return content[:510 - len_append] 201 | return content 202 | 203 | 204 | def get_position(entity, corpus): 205 | tag = int(corpus.count(entity) > 0) 206 | # 为了消除空格的影响 207 | if tag > 0: 208 | return 1 209 | return int(corpus.count(entity.strip()) > 0) 210 | 211 | 212 | train['content2'] = list(map(lambda x, y: get_new_content(x, y), train['entity'], train['content'])) 213 | test['content2'] = list(map(lambda x, y: get_new_content(x, y), test['entity'], test['content'])) 214 | train['entity_label_position2'] = list(map(lambda x, y: get_position(x, y), train['entity_label'], train['content2'])) 215 | test['entity_label_position2'] = list(map(lambda x, y: get_position(x, y), test['entity_label'], test['content2'])) 216 | 217 | train['entity_label_position'] = list(map(lambda x, y: get_position(x, y), train['entity_label'], train['content'])) 218 | test['entity_label_position'] = list(map(lambda x, y: get_position(x, y), test['entity_label'], test['content'])) 219 | 220 | train.pop('content') 221 | train.pop('content2') 222 | 223 | test.pop('content') 224 | test.pop('content2') 225 | print('success') 226 | 227 | print(train['entity_label_position2'].value_counts()) 228 | print(train['entity_label_position'].value_counts()) 229 | print(test['entity_label_position2'].value_counts()) 230 | print(test['entity_label_position'].value_counts()) 231 | 232 | import re 233 | 234 | 235 | # 前510没有内容的样本 corpus=i实体+标题+部分内容(无) 236 | def get_context_content(x, y, z, p2): 237 | s = '实体是' 238 | if str(x) != 'nan': 239 | s += x 240 | s += '。标题和部分内容没有实体' 241 | if str(y) != 'nan' and str(z) != 'nan' and y == z: 242 | s += ',标题和内容相同,是' 243 | s += y 244 | else: 245 | s += '标题是' 246 | if str(y) != 'nan': 247 | if len(y) > 172: 248 | s += y[:172] 249 | else: 250 | s += y 251 | else: 252 | s += '无' 253 | # 有部分内容 254 | if p2 == 1: 255 | s += '。部分内容是' 256 | if str(z) != 'nan': 257 | z_list = re.split(r',|。|?|;', z) 258 | for i in z_list: 259 | if x in i: 260 | s += i 261 | if len(s) > 700: 262 | break 263 | else: 264 | s += '无' 265 | else: 266 | s += '全文没有匹配的内容。内容是' 267 | if str(z) != 'nan': 268 | s += z 269 | else: 270 | s += '无' 271 | if len(s) > 700: 272 | return s[:700] 273 | return s 274 | 275 | 276 | def get_content(x, y, z, position, p2): 277 | # 前510有内容 278 | if position == 1: 279 | s = '实体是' 280 | if str(x) != 'nan': 281 | s += x 282 | if str(y) != 'nan' and str(z) != 'nan' and y == z: 283 | s += ',标题和内容相同,是' 284 | s += y 285 | else: 286 | s += '标题是' 287 | if str(y) != 'nan': 288 | if len(y) > 172: 289 | s += y[:172] 290 | else: 291 | s += y 292 | else: 293 | s += '无' 294 | s += '内容是' 295 | if str(z) != 'nan': 296 | s += z 297 | else: 298 | s += '无' 299 | else: 300 | # 前510没有内容 301 | s = get_context_content(x, y, z, p2) 302 | if len(s) > 700: 303 | return s[:700] 304 | return s 305 | 306 | 307 | train['corpus'] = list( 308 | map(lambda x, y, z, position, p2: get_content(x, y, z, position, p2), tqdm(train['entity_label'].values), 309 | train['title'], train[ 310 | 'text'], train['entity_label_position2'], train['entity_label_position'])) 311 | test['corpus'] = list( 312 | map(lambda x, y, z, position, p2: get_content(x, y, z, position, p2), tqdm(test['entity_label'].values), 313 | test['title'], test[ 314 | 'text'], 
test['entity_label_position2'], test['entity_label_position'])) 315 | 316 | def get_position(entity,corpus): 317 | tag=int(corpus.count(entity)>1) 318 | #为了消除空格的影响 319 | if tag>1: 320 | return 1 321 | return int(corpus.count(entity.strip())>1) 322 | 323 | train['entity_label_position3']=list(map(lambda x,y:get_position(x,y),train['entity_label'],train['corpus'])) 324 | test['entity_label_position3']=list(map(lambda x,y:get_position(x,y),test['entity_label'],test['corpus'])) 325 | 326 | 327 | print(train['entity_label_position3'].value_counts()) 328 | print(test['entity_label_position3'].value_counts()) 329 | 330 | 331 | 332 | def get_other_content(x,y): 333 | entitys=x.strip(';').split(";") 334 | if len(entitys)<=1: 335 | return np.nan 336 | l=[] 337 | for e in entitys: 338 | if e!=y: 339 | l.append(e) 340 | return ';'.join(l) 341 | train['other_entity']=list(map(lambda x,y :get_other_content(x,y),train['entity'],train['entity_label'])) 342 | test['other_entity']=list(map(lambda x,y :get_other_content(x,y),test['entity'],test['entity_label'])) 343 | def get_content(x,y,z): 344 | if str(y)=='nan': 345 | return x 346 | y=y.split(";") 347 | y=sorted(y,key=lambda x: len(x),reverse=True) 348 | #如果该实体没有出现直接返回原始语句 349 | if x.count(z)<=1: 350 | return x 351 | #出现了就直接替换其他的实体 352 | for i in y: 353 | if i not in z: #不是该候选实体的子串 354 | x='其他实体'.join(x.split(i)) 355 | return x 356 | train['corpus']=list(map(lambda x,y,z :get_content(x,y,z),train['corpus'],train['other_entity'],train['entity_label'])) 357 | test['corpus']=list(map(lambda x,y ,z:get_content(x,y,z),test['corpus'],test['other_entity'],test['entity_label'])) 358 | 359 | 360 | 361 | from keras.utils import to_categorical 362 | 363 | DATA_LIST = [] 364 | for data_row in train.iloc[:].itertuples(): 365 | DATA_LIST.append((data_row.corpus, data_row.negative,data_row.label)) 366 | DATA_LIST = np.array(DATA_LIST) 367 | print(DATA_LIST.shape) 368 | 369 | DATA_LIST_TEST = [] 370 | for data_row in test.iloc[:].itertuples(): 371 | DATA_LIST_TEST.append((data_row.corpus, 0,0)) 372 | DATA_LIST_TEST = np.array(DATA_LIST_TEST) 373 | print(DATA_LIST_TEST.shape) 374 | 375 | 376 | ########################################模型部分############################################# 377 | 378 | token_dict = {} 379 | with codecs.open(dict_path, 'r', 'utf8') as reader: 380 | for line in reader: 381 | token = line.strip() 382 | token_dict[token] = len(token_dict) # 给每个token 按序编号 383 | 384 | 385 | class OurTokenizer(Tokenizer): 386 | def _tokenize(self, text): 387 | R = [] 388 | for c in text: 389 | if c in self._token_dict: 390 | R.append(c) 391 | elif self._is_space(c): 392 | R.append('[unused1]') # space类用未经训练的[unused1]表示 393 | else: 394 | R.append('[UNK]') # 剩余的字符是[UNK] 395 | return R 396 | 397 | 398 | tokenizer = OurTokenizer(token_dict) 399 | 400 | '填充序列长度' 401 | 402 | 403 | def seq_padding(X, padding=0): 404 | L = [len(x) for x in X] 405 | ML = max(L) 406 | return np.array([np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X]) 407 | 408 | 409 | class data_generator: 410 | def __init__(self, data, batch_size=batch_size, shuffle=True): # 8 411 | self.data = data 412 | self.batch_size = batch_size 413 | self.shuffle = shuffle 414 | self.steps = len(self.data) // self.batch_size 415 | if len(self.data) % self.batch_size != 0: 416 | self.steps += 1 417 | 418 | def __len__(self): 419 | return self.steps 420 | 421 | def __iter__(self): 422 | while True: 423 | idxs = list(range(len(self.data))) 424 | 425 | if self.shuffle: 426 | 
np.random.shuffle(idxs) 427 | 428 | X1, X2, Y1, Y2 = [], [], [], [] 429 | for i in idxs: 430 | d = self.data[i] 431 | text = d[0][:maxlen] 432 | x1, x2 = tokenizer.encode(first=text) 433 | y1 = d[1] 434 | y2 = d[2] 435 | X1.append(x1) 436 | X2.append(x2) 437 | Y1.append([y1]) 438 | Y2.append([y2]) 439 | if len(X1) == self.batch_size or i == idxs[-1]: 440 | X1 = seq_padding(X1) 441 | X2 = seq_padding(X2) 442 | Y1 = seq_padding(Y1) 443 | Y2 = seq_padding(Y2) 444 | yield [X1, X2, Y1, Y2], None 445 | X1, X2, Y1, Y2 = [], [], [], [] 446 | 447 | 448 | from keras.metrics import top_k_categorical_accuracy 449 | from keras.metrics import categorical_accuracy 450 | 451 | 452 | def acc_top2(y_true, y_pred): 453 | return top_k_categorical_accuracy(y_true, y_pred, k=2) 454 | 455 | 456 | def f1_metric(y_true, y_pred): 457 | ''' 458 | metric from here 459 | https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras 460 | ''' 461 | 462 | def recall(y_true, y_pred): 463 | """Recall metric. 464 | Only computes a batch-wise average of recall. 465 | Computes the recall, a metric for multi-label classification of 466 | how many relevant items are selected. 467 | """ 468 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 469 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 470 | recall = true_positives / (possible_positives + K.epsilon()) 471 | return recall 472 | 473 | def precision(y_true, y_pred): 474 | """Precision metric. 475 | 476 | Only computes a batch-wise average of precision. 477 | 478 | Computes the precision, a metric for multi-label classification of 479 | how many selected items are relevant. 480 | """ 481 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 482 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 483 | precision = true_positives / (predicted_positives + K.epsilon()) 484 | return precision 485 | 486 | precision = precision(y_true, y_pred) 487 | recall = recall(y_true, y_pred) 488 | return 2 * ((precision * recall) / (precision + recall + K.epsilon())) 489 | 490 | 491 | # from keras.utils import multi_gpu_model 492 | 493 | 494 | class MyLayer(Layer): 495 | def __init__(self, **kwargs): 496 | super(MyLayer, self).__init__(**kwargs) 497 | 498 | def build(self, input_shape): 499 | self.w = self.add_weight(shape=(input_shape[1],), 500 | initializer=initializers.glorot_uniform(seed=2019), trainable=True, 501 | name='kernel') 502 | super(MyLayer, self).build(input_shape) 503 | 504 | def call(self, x): 505 | soft_w = K.softmax(self.w) 506 | 507 | return x * soft_w 508 | 509 | def compute_output_shape(self, input_shape): 510 | return (input_shape[0], input_shape[1]) 511 | 512 | 513 | # def get_means_bert_n_layers(x): 514 | # shape = list(x.shape) 515 | # layer_size = 768 516 | # number = int(int(shape[1]) / layer_size) 517 | # res = x[:, :layer_size] 518 | # for i in range(2, number + 1): 519 | # res += x[:, layer_size * (i - 1):layer_size * i] 520 | # print('return ') 521 | # return tf.divide(res, number) 522 | 523 | # 524 | # def get_means_bert_n_layers_shape(input_shape): 525 | # shape = list(input_shape) 526 | # layer_size = 768 527 | # shape[-1] = layer_size 528 | # return tuple(shape) 529 | 530 | 531 | def build_bert(nclass): 532 | seed = 2019 533 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, output_layer_num=1, seq_len=None) 534 | 535 | layer_ = 1 536 | for l in bert_model.layers: 537 | if layer_ >= 105 - 105: 538 | l.trainable = True 539 | # print(l) 540 | layer_ += 1 541 | print(' 
layer:', layer_) 542 | x1_in = Input(shape=(None,)) 543 | x2_in = Input(shape=(None,)) 544 | y1_in = Input(shape=(None,)) 545 | y2_in = Input(shape=(None,)) 546 | 547 | x = bert_model([x1_in, x2_in]) 548 | 549 | x0 = Lambda(lambda x: x[:, 0])(x) 550 | 551 | # x0=Lambda(get_means_bert_n_layers,get_means_bert_n_layers_shape)(x0) 552 | 553 | x1 = Dense(300, activation='relu', kernel_initializer=initializers.glorot_uniform(seed=seed))(x0) 554 | p1 = Dense(nclass, activation='sigmoid')(x1) 555 | 556 | x1_st = Lambda(lambda x: K.stop_gradient(x))(x1) 557 | # x1_st=MyLayer()(x1_st) 558 | p1_st = Lambda(lambda x: K.stop_gradient(x))(p1) 559 | 560 | x2 = concatenate([x0, x1_st, p1_st]) 561 | 562 | x2 = Dense(300, activation='relu', kernel_initializer=initializers.glorot_uniform(seed=seed))(x2) 563 | p2 = Dense(nclass, activation='sigmoid', kernel_initializer=initializers.glorot_uniform(seed=seed))(x2) 564 | 565 | model = Model([x1_in, x2_in], [p1, p2]) 566 | 567 | train_model = Model([x1_in, x2_in, y1_in, y2_in], [p1, p2]) 568 | 569 | loss1 = K.mean(K.binary_crossentropy(y1_in, p1, from_logits=False)) 570 | loss2 = K.mean(K.binary_crossentropy(y2_in, p2, from_logits=False)) 571 | 572 | # 带权重的loss 573 | # w_loss=K.softmax(K.variable([0.5,0.5])) 574 | 575 | # loss=w_loss[0]*loss1+w_loss[1]*loss2 576 | loss = loss1 + loss2 577 | 578 | train_model.add_loss(loss) 579 | 580 | 581 | train_model.compile(optimizer=Adam(learning_rate)) 582 | # print(train_model.summary()) 583 | return train_model 584 | 585 | 586 | from sklearn import metrics 587 | 588 | '寻找最好的阀值' 589 | def get_best_F1_score(pred, ture): 590 | f1_scores = [] 591 | cut_offs = [] 592 | for threshold in np.arange(0.01, 1, 0.01): 593 | pred_binary = (pred >= threshold) * 1 594 | f1_tmp = metrics.f1_score(y_true=ture, y_pred=pred_binary) 595 | f1_scores.append(f1_tmp) 596 | cut_offs.append(threshold) 597 | max_index = f1_scores.index(max(f1_scores)) 598 | max_x_axis = cut_offs[max_index] 599 | max_y_axis_F1 = f1_scores[max_index] 600 | return max_x_axis, max_y_axis_F1 601 | 602 | 603 | from keras.callbacks import Callback 604 | 605 | 606 | class Evaluate(Callback): 607 | def __init__(self, X_train, X_valid, tag): 608 | self.X_valid = X_valid 609 | self.X_train = X_train 610 | self.Y1_train, self.Y2_train = [int(i) for i in X_train[:, 1]], [int(i) for i in X_train[:, 2]] 611 | self.Y1_valid, self.Y2_valid = [int(i) for i in X_valid[:, 1]], [int(i) for i in X_valid[:, 2]] 612 | self.tag = tag 613 | self.best = 0. 614 | self.passed = 0 615 | 616 | def on_batch_begin(self, batch, logs=None): 617 | """第一个epoch用来warmup,第二个epoch把学习率降到最低 618 | """ 619 | if self.passed < self.params['steps']: 620 | lr = (self.passed + 1.) / self.params['steps'] * learning_rate 621 | K.set_value(self.model.optimizer.lr, lr) 622 | self.passed += 1 623 | elif self.params['steps'] <= self.passed < self.params['steps'] * 2: 624 | lr = (2 - (self.passed + 1.) 
/ self.params['steps']) * (learning_rate - min_learning_rate) 625 | lr += min_learning_rate 626 | K.set_value(self.model.optimizer.lr, lr) 627 | self.passed += 1 628 | 629 | def on_epoch_end(self, epoch, logs=None): 630 | # train_D = data_generator(self.X_train, shuffle=False) 631 | valid_D = data_generator(self.X_valid, shuffle=False) 632 | # tra_1,tra_2 = self.model.predict_generator(train_D.__iter__(), steps=len(train_D),verbose=0) 633 | val_1, val_2 = self.model.predict_generator(valid_D.__iter__(), steps=len(valid_D), verbose=0) 634 | # tra_1=[int(i>0.5) for i in tra_1] 635 | # tra_2=[int(i>0.5) for i in tra_2] 636 | val_1 = [int(i > 0.5) for i in val_1] 637 | val_2 = [int(i > 0.5) for i in val_2] 638 | # train 639 | # t_f1_1=f1_score(self.Y1_train,tra_1) 640 | # t_f1_2=f1_score(self.Y2_train,tra_2) 641 | # t_acc_1=accuracy_score(self.Y1_train,tra_1) 642 | # t_acc_2=accuracy_score(self.Y2_train,tra_2) 643 | # val 644 | f1_1 = f1_score(self.Y1_valid, val_1) 645 | f1_2 = f1_score(self.Y2_valid, val_2) 646 | # acc_1 = accuracy_score(self.Y1_valid, val_1) 647 | # acc_2 = accuracy_score(self.Y2_valid, val_2) 648 | 649 | print(' ----val -f1_1:{:.5f} -f1_2:{:.5f}'.format(f1_1, f1_2)) 650 | # print('train -acc_1:{} -acc_2:{} ----val -acc_1:{} -acc_2:{}'.format(t_acc_1,t_acc_2,acc_1,acc_2)) 651 | 652 | f_mean = f1_1 * 0.4 + f1_2 * 0.6 653 | if f_mean >= self.best: 654 | self.best = f_mean 655 | self.model.save_weights(save_model_path + save_mdoel_name_pre + '{}.hdf5'.format(self.tag)) 656 | print('f_mean : {:.5f} best f_mean :{:.5f}'.format(f_mean, self.best)) 657 | 658 | 659 | 660 | #########################################训练部分######################################## 661 | 662 | def run_cv(nfold, data, data_label, data_test): 663 | f1_1_list = [] 664 | f1_2_list = [] 665 | 666 | val_best_threshold1 = [] # 预测1 667 | val_best_threshold2 = [] # 预测2 668 | kf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=2019).split(data, train['entity_label_position3']) 669 | train_model_pred1 = np.zeros((len(data), 1)) 670 | train_model_pred2 = np.zeros((len(data), 1)) 671 | test_model_preds1 = np.zeros((len(data_test), nfold)) 672 | test_model_preds2 = np.zeros((len(data_test), nfold)) 673 | 674 | for i, (train_fold, test_fold) in enumerate(kf): 675 | # if i<2: 676 | # continue 677 | X_train, X_valid, = data[train_fold, :], data[test_fold, :] 678 | print(X_train.shape) 679 | print(X_valid.shape) 680 | 681 | print(train_fold[:20]) 682 | print(test_fold[:20]) 683 | 684 | print('*' * 50, i + 1, '*' * 50) 685 | model = build_bert(1) 686 | # early_stopping = EarlyStopping(monitor='val_loss', patience=3) # val_acc 687 | # plateau = ReduceLROnPlateau(monitor="val_loss", verbose=1, mode='min', factor=0.5, patience=1) 688 | # checkpoint = ModelCheckpoint('./model/' + str(i) + '.hdf5', monitor='val_loss', 689 | # verbose=2, save_best_only=True, mode='min',save_weights_only=True) 690 | 691 | train_D = data_generator(X_train, shuffle=True) 692 | valid_D = data_generator(X_valid, shuffle=False) 693 | test_D = data_generator(data_test, shuffle=False) 694 | 695 | evaluator = Evaluate(X_train, X_valid, i + 1) 696 | 697 | model.fit_generator( 698 | train_D.__iter__(), 699 | steps_per_epoch=len(train_D), 700 | validation_data=valid_D.__iter__(), 701 | validation_steps=len(valid_D), 702 | epochs=1, 703 | callbacks=[evaluator], 704 | verbose=2 705 | ) 706 | 707 | model.load_weights(save_model_path + save_mdoel_name_pre + '{}.hdf5'.format(i + 1)) 708 | 709 | # return model 710 | val_1, val_2 = 
711 | 
712 |         # save out-of-fold predictions
713 |         train_model_pred1[test_fold, :] = val_1
714 |         train_model_pred2[test_fold, :] = val_2
715 | 
716 |         val_y1 = [int(i) for i in X_valid[:, 1]]
717 |         val_y2 = [int(i) for i in X_valid[:, 2]]
718 |         best_threshold1, f1_1 = get_best_F1_score(val_1, val_y1)
719 |         best_threshold2, f1_2 = get_best_F1_score(val_2, val_y2)
720 | 
721 |         val_best_threshold1.append(best_threshold1)
722 |         val_best_threshold2.append(best_threshold2)
723 | 
724 |         f1_1_list.append(f1_1)
725 |         f1_2_list.append(f1_2)
726 | 
727 |         print('validate score -f1_1:{:.5f} -val_best_threshold1:{:.5f} -f1_2:{:.5f} -val_best_threshold2:{:.5f}'.format(
728 |             f1_1_list[-1], val_best_threshold1[-1], f1_2_list[-1], val_best_threshold2[-1]))
729 | 
730 |         # predict on the test set
731 |         t1, t2 = model.predict_generator(test_D.__iter__(), steps=len(test_D), verbose=0)
732 |         for j in range(len(t1)):
733 |             test_model_preds1[j, i] = t1[j]
734 |             test_model_preds2[j, i] = t2[j]
735 | 
736 |         del model
737 |         gc.collect()
738 |         K.clear_session()
739 |         print('*' * 50, i + 1, '-done-', '*' * 45)
740 | 
741 |     return train_model_pred1, train_model_pred2, test_model_preds1, test_model_preds2, val_best_threshold1, val_best_threshold2
742 | 
743 | train_model_pred1, train_model_pred2, test_pred_list1, test_pred_list2, val_best_threshold1, val_best_threshold2 = run_cv(5, DATA_LIST, None, DATA_LIST_TEST)
744 | 
745 | ######################################## Prediction ########################################
746 | 'best train F1 and threshold for the negative head'
747 | train_best_threshold1, train_f1 = get_best_F1_score(train_model_pred1, train['negative'].values)
748 | print('train best_f1_1:', train_f1)
749 | print('train best_threshold_1:', train_best_threshold1)
750 | 
751 | 'best train F1 and threshold for the key-entity head'
752 | train_best_threshold2, train_f2 = get_best_F1_score(train_model_pred2, train['label'].values)
753 | print('train best_f1_2:', train_f2)
754 | print('train best_threshold_2:', train_best_threshold2)
755 | 
756 | from sklearn.metrics import f1_score, accuracy_score
757 | 
758 | 'out-of-fold scores over the 5 folds'
759 | pred1 = [int(index[0] > 0.5) for index in train_model_pred1]
760 | pred2 = [int(index[0] > 0.5) for index in train_model_pred2]
761 | true1 = [int(index) for index in DATA_LIST[:, 1]]
762 | true2 = [int(index) for index in DATA_LIST[:, 2]]
763 | 
764 | print('f1_1:{}'.format(f1_score(true1, pred1)))
765 | print('f1_2:{}'.format(f1_score(true2, pred2)))
766 | 
767 | print('acc_1:{}'.format(accuracy_score(true1, pred1)))
768 | print('acc_2:{}'.format(accuracy_score(true2, pred2)))
769 | 
770 | # save predicted probabilities
771 | train['pred1'] = train_model_pred1[:, 0]
772 | train['pred2'] = train_model_pred2[:, 0]
773 | train.to_csv('{}_train_preds.csv'.format(save_mdoel_name_pre), index=False)
774 | 
775 | # simulate the online score offline
776 | train['preb1'] = train_model_pred1[:, 0]
777 | train['preb2'] = train_model_pred2[:, 0]
778 | df = train[['id', 'negative', 'label', 'preb1', 'preb2', 'entity_label']]
779 | df['n_pred'] = df['preb1'].apply(lambda x: int(x > 0.5))
780 | df['l_pred'] = df['preb2'].apply(lambda x: int(x > 0.5))
781 | 
782 | train_or = pd.read_csv(train_path, encoding='utf-8')
783 | train2 = pd.read_csv(train2_path, encoding='utf-8')
784 | train_or = pd.concat([train_or, train2], axis=0, sort=True)
785 | train_or = train_or[train_or['entity'].notnull()]
786 | 
787 | # pred1
788 | temp = df[['id', 'n_pred']].groupby('id')['n_pred'].agg(lambda x: list(x))
789 | train_or['n_pred_list'] = train_or['id'].map(temp)
790 | train_or = train_or[train_or['n_pred_list'].notnull()]
791 | 
792 | # entity label list
793 | temp = df[['id', 'entity_label']].groupby('id')['entity_label'].agg(lambda x: list(x))
794 | train_or['entity_label_list'] = train_or['id'].map(temp)
795 | 
796 | # pred2
797 | temp = df[['id', 'l_pred']].groupby('id')['l_pred'].agg(lambda x: list(x))
798 | train_or['l_pred_list'] = train_or['id'].map(temp)
799 | 
800 | train_or['n_pred_type'] = train_or['n_pred_list'].apply(lambda x: len(set(x)))
801 | 
802 | 
803 | def get_negative_and_entity(n_pred_list, l_pred_list, entitys):
804 |     assert len(entitys) == len(n_pred_list)
805 |     sub = []
806 |     neg = 0
807 |     for i in range(len(n_pred_list)):
808 |         if n_pred_list[i] == 1 and l_pred_list[i] == 1:
809 |             sub.append(entitys[i])
810 |             neg = 1
811 |     if len(sub) == 0:
812 |         return np.nan
813 |     return ';'.join(sub)
814 | 
815 | 
816 | train_or['preb_entity'] = list(map(lambda x, y, z: get_negative_and_entity(x, y, z),
817 |                                    train_or['n_pred_list'], train_or['l_pred_list'], train_or['entity_label_list']))
818 | 
819 | train_or['pred_label'] = train_or['preb_entity'].apply(lambda x: 1 if str(x) != 'nan' else 0)
820 | 
821 | 
822 | def get_f1(y1, y2, y1_pred, y2_pred):
823 |     F1_1 = f1_score(y1, y1_pred)
824 |     print('f1_1:{}'.format(F1_1))
825 | 
826 |     'compute the entity-level F1'
827 |     TP = 0
828 |     FN = 0
829 |     FP = 0
830 |     for i in range(len(y2)):
831 |         if str(y2[i]) == 'nan':
832 |             y2_i = set()
833 |         else:
834 |             y2_i = set(y2[i].split(';'))
835 |         if str(y2_pred[i]) == 'nan':
836 |             y2_pred_i = set()
837 |         else:
838 |             y2_pred_i = set(y2_pred[i].split(';'))
839 |         TPi = len(y2_i & y2_pred_i)
840 |         FNi = len(y2_i.difference(y2_pred_i))
841 |         FPi = len(y2_pred_i.difference(y2_i))
842 |         TP += TPi
843 |         FN += FNi
844 |         FP += FPi
845 |     P = TP / (TP + FP)
846 |     R = TP / (TP + FN)
847 |     F1_2 = 2 * P * R / (P + R)
848 |     print('f1_2:{}'.format(F1_2))
849 |     print('score:', 0.4 * F1_1 + 0.6 * F1_2)
850 | 
851 | 
852 | get_f1(train_or['negative'].values, train_or['key_entity'].values, train_or['pred_label'].values,
853 |        train_or['preb_entity'].values)
854 | 
855 | 
856 | pred1_df = np.mean(test_pred_list1, axis=1)
857 | pred2_df = np.mean(test_pred_list2, axis=1)
858 | 
859 | sub = test.copy()
860 | sub['preb1'] = list(pred1_df)
861 | sub['preb2'] = list(pred2_df)
862 | 
863 | 
864 | df = sub[['id', 'preb1', 'preb2', 'entity_label']]
865 | df['n_pred'] = df['preb1'].apply(lambda x: int(x > 0.5))
866 | df['l_pred'] = df['preb2'].apply(lambda x: int(x > 0.5))
867 | 
868 | 
869 | sub.to_csv('{}_test_preds.csv'.format(save_mdoel_name_pre), index=False)
870 | 
871 | 
872 | test_or = pd.read_csv(test_path, encoding='utf-8')
873 | test_or = test_or[test_or['entity'].notnull()]
874 | 
875 | # pred1
876 | temp = df[['id', 'n_pred']].groupby('id')['n_pred'].agg(lambda x: list(x))
877 | test_or['n_pred_list'] = test_or['id'].map(temp)
878 | 
879 | # pred2
880 | temp = df[['id', 'l_pred']].groupby('id')['l_pred'].agg(lambda x: list(x))
881 | test_or['l_pred_list'] = test_or['id'].map(temp)
882 | 
883 | # entitys
884 | temp = df[['id', 'entity_label']].groupby('id')['entity_label'].agg(lambda x: list(x))
885 | test_or['entity_label_list'] = test_or['id'].map(temp)
886 | 
887 | # key_entity
888 | test_or['preb_entity'] = list(map(lambda x, y, z: get_negative_and_entity(x, y, z),
889 |                                   test_or['n_pred_list'], test_or['l_pred_list'], test_or['entity_label_list']))
890 | 
891 | 
892 | test_or['pred_label'] = test_or['preb_entity'].apply(lambda x: 1 if str(x) != 'nan' else 0)
893 | 
894 | test_or = test_or[['id', 'pred_label', 'preb_entity']]
895 | 
896 | test_2 = pd.read_csv(test_path, encoding='utf-8')
897 | submit = test_2[['id']]
898 | submit = submit.merge(test_or, on='id', how='left')
899 | submit.columns = ['id', 'negative', 'key_entity']
900 | submit['negative'] = submit['negative'].apply(lambda x: int(x) if str(x) != 'nan' else 0)
901 | submit['negative'] = submit['negative'].astype('int')
902 | # submit['key_entity'] = np.nan
903 | submit.to_csv('{}_result_mean.csv'.format(save_mdoel_name_pre), index=False)
904 | print(submit.isnull().sum())
905 | 
906 | 
--------------------------------------------------------------------------------
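Note: the cross-validation loop above tunes per-head thresholds with `get_best_F1_score`, but the submission is still binarised at a fixed 0.5. If you want to experiment with the tuned cut-offs instead, a minimal sketch along the following lines should work. It only assumes the `*_train_preds.csv` and `*_test_preds.csv` files written by the script; `prefix` is a stand-in for `save_mdoel_name_pre` and is not part of the original code.

```python
# Sketch only: re-threshold the saved predictions with cut-offs tuned on the
# out-of-fold probabilities, instead of the fixed 0.5 used above.
import pandas as pd
from sklearn.metrics import f1_score

prefix = 'bert'  # placeholder for save_mdoel_name_pre

oof = pd.read_csv('{}_train_preds.csv'.format(prefix))        # out-of-fold train predictions
test_preds = pd.read_csv('{}_test_preds.csv'.format(prefix))  # averaged test probabilities


def best_threshold(pred, true):
    # same 0.01..0.99 grid search as get_best_F1_score above
    grid = [(t, f1_score(true, (pred >= t).astype(int)))
            for t in [i / 100 for i in range(1, 100)]]
    return max(grid, key=lambda x: x[1])[0]


t1 = best_threshold(oof['pred1'].values, oof['negative'].values)  # negative head
t2 = best_threshold(oof['pred2'].values, oof['label'].values)     # key-entity head

# apply the tuned thresholds to the test probabilities
test_preds['n_pred'] = (test_preds['preb1'].values >= t1).astype(int)
test_preds['l_pred'] = (test_preds['preb2'].values >= t2).astype(int)
```

From here the rows can be fed through `get_negative_and_entity` exactly as in the submission-building code above.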