├── README.md ├── scheme1 ├── code1 │ ├── data │ │ ├── Round2_train.csv │ │ ├── Test_Data.csv │ │ ├── Train_Data.csv │ │ ├── emotion_voting_three_models_39215985.csv │ │ ├── preprocess.py │ │ ├── round2_test.csv │ │ └── submit_example.csv │ ├── pystart.py │ ├── pytorch_transformers │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── file_utils.py │ │ ├── modeling_bert.py │ │ ├── modeling_utils.py │ │ ├── optimization.py │ │ ├── tokenization_auto.py │ │ ├── tokenization_bert.py │ │ └── tokenization_utils.py │ ├── requirements.txt │ └── run_bert.py └── code2 │ └── code.py ├── scheme2 └── code2.py └── scheme3 └── code3.py /README.md: -------------------------------------------------------------------------------- 1 | # ccf_financial_negative 2 | CCF BDCI 金融信息负面及主体判定 冠军代码 3 | 其中scheme1,2,3分别为方案一、二、三的代码 4 | 并且scheme1中的code1修改自guoday分享的代码,在这里很感谢guoday的开源代码 5 | -------------------------------------------------------------------------------- /scheme1/code1/data/Round2_train.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiong666/ccf_financial_negative/66292f4724f2a7b40e83d1c74d0ee05822d69f7e/scheme1/code1/data/Round2_train.csv -------------------------------------------------------------------------------- /scheme1/code1/data/preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import os 3 | import random 4 | import numpy as np 5 | import re 6 | import textdistance 7 | 8 | train_df = pd.read_csv('./data/Round2_train.csv') 9 | test_data = pd.read_csv('./data/round2_test.csv') 10 | 11 | train_df['text'] = train_df.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 12 | test_data['text'] = test_data.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 13 | 14 | def entity_clear(df): 15 | for index, row in df.iterrows(): 16 | if type(row.entity) == float or type(row.text) == float: 17 | continue 18 | entities = row.entity.split(';') 19 | entities.sort(key = lambda x : len(x)) 20 | n = len(entities) 21 | tmp = entities.copy() 22 | for i in range(n): 23 | entity_tmp = entities[i] 24 | if i + 1 >= n: 25 | break 26 | for entity_tmp2 in entities[i+1:]: 27 | if entity_tmp2.find(entity_tmp) != -1 and row.text.replace(entity_tmp2, '').find(entity_tmp) == -1: 28 | tmp.remove(entity_tmp) 29 | break 30 | df.loc[index, 'entity'] = ';'.join(tmp) 31 | return df 32 | train_data = entity_clear(train_df) 33 | test_df = entity_clear(test_data) 34 | 35 | train_data.dropna(subset = ['entity'], inplace=True) 36 | train_data.reset_index(drop=True, inplace=True) 37 | test_df.dropna(subset = ['entity'], inplace=True) 38 | test_df.reset_index(drop=True, inplace=True) 39 | test_df['negative'] = 0 40 | train_data['title'] = train_data['title'].fillna('无') 41 | train_data['text'] = train_data['text'].fillna('无') 42 | test_df['title'] = test_df['title'].fillna('无') 43 | test_df['text'] = test_df['text'].fillna('无') 44 | 45 | train_data['text'] = train_data['text'].map(lambda index: re.sub(r'http.*$', "", index)) 46 | test_df['text'] = test_df['text'].map(lambda index: re.sub(r'http.*$', "", index)) 47 | 48 | train_data['title'] = train_data['title'].map(lambda index: index.replace(' ', '')) 49 | train_data['text'] = train_data['text'].map(lambda index: index.replace(' ', '')) 50 | train_data['title_len'] = train_data['title'].map(lambda index:len(index)) 51 | 52 | test_df['title'] = test_df['title'].map(lambda index: index.replace(' ', 
'')) 53 | test_df['text'] = test_df['text'].map(lambda index: index.replace(' ', '')) 54 | test_df['title_len'] = test_df['title'].map(lambda index:len(index)) 55 | 56 | distance = textdistance.Levenshtein(external = False) 57 | train_data['distance'] = train_data.apply(lambda index: distance(index.title, index.text), axis=1) 58 | test_df['distance'] = test_df.apply(lambda index: distance(index.title, index.text), axis=1) 59 | 60 | train_data['title_in_text'] = train_data.apply(lambda index: 1 if index.text.find(index.title) != -1 else 0, axis=1) 61 | test_df['title_in_text'] = test_df.apply(lambda index: 1 if index.text.find(index.title) != -1 else 0, axis=1) 62 | 63 | train_data['content'] = train_data.apply(lambda index: index.title + index.text if (index.title_len != 0) & (index.title_in_text != 1) & (index.distance > 100) else index.text, axis=1) 64 | test_df['content'] = test_df.apply(lambda index: index.title + index.text if (index.title_len != 0) & (index.title_in_text != 1) & (index.distance > 100) else index.text, axis=1) 65 | 66 | def get_content(x ,y): 67 | try: 68 | if str(y) == 'nan': 69 | return x 70 | y = y.split(';') 71 | y = sorted(y, key=lambda i: len(i), reverse=True) 72 | for i in y: 73 | x = '实体词'.join(x.split(i)) 74 | return x 75 | except: 76 | return x 77 | train_data['content'] = list(map(lambda x,y: get_content(x,y), train_data['content'], train_data['entity'])) 78 | test_df['content'] = list(map(lambda x,y: get_content(x,y), test_df['content'], test_df['entity'])) 79 | 80 | train_data.rename(columns={'negative':'label'}, inplace=True) 81 | test_df.rename(columns={'negative':'label'}, inplace=True) 82 | 83 | features = ['id', 'content' ,'entity', 'label'] 84 | index = set(range(train_data.shape[0])) 85 | 86 | K_fold = [] 87 | for i in range(10): 88 | if i == 9: 89 | tmp = index 90 | else: 91 | tmp = random.sample(index, int(1.0 /10 * train_data.shape[0])) 92 | index = index - set(tmp) 93 | print('number:', len(tmp)) 94 | K_fold.append(tmp) 95 | 96 | for i in range(10): 97 | print('Fold', i) 98 | os.system('mkdir ./data/data_{}'.format(i)) 99 | dev_index = list(K_fold[i]) 100 | train_index = [] 101 | for j in range(10): 102 | if j != i: 103 | train_index += K_fold[j] 104 | train_data[features].iloc[train_index].to_csv('./data/data_{}/train.csv'.format(i)) 105 | train_data[features].iloc[dev_index].to_csv('./data/data_{}/dev.csv'.format(i)) 106 | test_df[features].to_csv('./data/data_{}/test.csv'.format(i)) 107 | -------------------------------------------------------------------------------- /scheme1/code1/pystart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 22 10:39:24 2019 4 | 5 | @author: xiong 6 | """ 7 | 8 | import os 9 | import pandas as pd 10 | import numpy as np 11 | import argparse 12 | 13 | for i in range(10): 14 | params = '--model_type bert \ 15 | --model_name_or_path chinese_roberta_wwm_large_ext \ 16 | --do_train \ 17 | --do_eval \ 18 | --do_test \ 19 | --data_dir %s \ 20 | --output_dir %s \ 21 | --max_seq_length 512 \ 22 | --split_num 1 \ 23 | --lstm_hidden_size 512 \ 24 | --lstm_layers 1 \ 25 | --lstm_dropout 0.1 \ 26 | --eval_steps 1000 \ 27 | --per_gpu_train_batch_size 16 \ 28 | --gradient_accumulation_steps 8 \ 29 | --warmup_steps 0 \ 30 | --per_gpu_eval_batch_size 32 \ 31 | --learning_rate 8e-6 \ 32 | --adam_epsilon 1e-6 \ 33 | --weight_decay 0 \ 34 | --train_steps 40000 \ 35 | --device_id %d' % ('./data/data_'+str(i), 
'./model_roberta_wwm_large_ext'+str(i), 0) 36 | ex = os.system("python run_bert.py %s" %params) 37 | print('The fold:', i) 38 | 39 | parser = argparse.ArgumentParser() 40 | parser.add_argument("--model_prefix", default='./model_roberta_wwm_large_ext', type=str) 41 | args = parser.parse_args() 42 | 43 | k = 10 44 | df = pd.read_csv('data/data_0/test.csv') 45 | df['0'] = 0 46 | df['1'] = 1 47 | for i in range(k): 48 | temp = pd.read_csv('{}{}/test_pb.csv'.format(args.model_prefix, i)) 49 | df['0'] += temp['label_0'] / k 50 | df['1'] += temp['label_1'] / k 51 | print('The end for combining.') 52 | 53 | df['pre_label'] = np.argmax(df[['0','1']].values, -1) 54 | df['key_entity'] = np.nan 55 | df.rename(columns={'pre_label':'negative'}, inplace=True) 56 | df[['id','negative','key_entity']].to_csv('./result/submit_emotion.csv', encoding='utf-8', index=None) #######right####### -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.1.0" 2 | from .tokenization_auto import AutoTokenizer 3 | from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer 4 | 5 | from .tokenization_utils import (PreTrainedTokenizer) 6 | 7 | from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining, 8 | BertForMaskedLM, BertForNextSentencePrediction, 9 | BertForSequenceClassification, BertForMultipleChoice, 10 | BertForTokenClassification, BertForQuestionAnswering, 11 | load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, 12 | BERT_PRETRAINED_CONFIG_ARCHIVE_MAP) 13 | 14 | from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME, 15 | PretrainedConfig, PreTrainedModel, prune_layer, Conv1D) 16 | 17 | from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, 18 | WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) 19 | 20 | from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path) 21 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/__main__.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | def main(): 3 | import sys 4 | if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: 5 | print( 6 | "Should be used as one of: \n" 7 | ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" 8 | ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" 9 | ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" 10 | ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 11 | ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" 12 | ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") 13 | else: 14 | if sys.argv[1] == "bert": 15 | try: 16 | from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch 17 | except ImportError: 18 | print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " 19 | "In that case, it requires TensorFlow to be installed. 
Please see " 20 | "https://www.tensorflow.org/install/ for installation instructions.") 21 | raise 22 | 23 | if len(sys.argv) != 5: 24 | # pylint: disable=line-too-long 25 | print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") 26 | else: 27 | PYTORCH_DUMP_OUTPUT = sys.argv.pop() 28 | TF_CONFIG = sys.argv.pop() 29 | TF_CHECKPOINT = sys.argv.pop() 30 | convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | from __future__ import (absolute_import, division, print_function, unicode_literals) 7 | 8 | import sys 9 | import json 10 | import logging 11 | import os 12 | import shutil 13 | import tempfile 14 | import fnmatch 15 | from functools import wraps 16 | from hashlib import sha256 17 | from io import open 18 | 19 | import boto3 20 | from botocore.config import Config 21 | from botocore.exceptions import ClientError 22 | import requests 23 | from tqdm import tqdm 24 | 25 | try: 26 | from torch.hub import _get_torch_home 27 | torch_cache_home = _get_torch_home() 28 | except ImportError: 29 | torch_cache_home = os.path.expanduser( 30 | os.getenv('TORCH_HOME', os.path.join( 31 | os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) 32 | default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers') 33 | 34 | try: 35 | from urllib.parse import urlparse 36 | except ImportError: 37 | from urlparse import urlparse 38 | 39 | try: 40 | from pathlib import Path 41 | PYTORCH_PRETRAINED_BERT_CACHE = Path( 42 | os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))) 43 | except (AttributeError, ImportError): 44 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE', 45 | os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 46 | default_cache_path)) 47 | 48 | PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility 49 | 50 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 51 | 52 | 53 | def url_to_filename(url, etag=None): 54 | """ 55 | Convert `url` into a hashed filename in a repeatable way. 56 | If `etag` is specified, append its hash to the url's, delimited 57 | by a period. 58 | """ 59 | url_bytes = url.encode('utf-8') 60 | url_hash = sha256(url_bytes) 61 | filename = url_hash.hexdigest() 62 | 63 | if etag: 64 | etag_bytes = etag.encode('utf-8') 65 | etag_hash = sha256(etag_bytes) 66 | filename += '.' + etag_hash.hexdigest() 67 | 68 | return filename 69 | 70 | 71 | def filename_to_url(filename, cache_dir=None): 72 | """ 73 | Return the url and etag (which may be ``None``) stored for `filename`. 74 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 
75 | """ 76 | if cache_dir is None: 77 | cache_dir = PYTORCH_TRANSFORMERS_CACHE 78 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 79 | cache_dir = str(cache_dir) 80 | 81 | cache_path = os.path.join(cache_dir, filename) 82 | if not os.path.exists(cache_path): 83 | raise EnvironmentError("file {} not found".format(cache_path)) 84 | 85 | meta_path = cache_path + '.json' 86 | if not os.path.exists(meta_path): 87 | raise EnvironmentError("file {} not found".format(meta_path)) 88 | 89 | with open(meta_path, encoding="utf-8") as meta_file: 90 | metadata = json.load(meta_file) 91 | url = metadata['url'] 92 | etag = metadata['etag'] 93 | 94 | return url, etag 95 | 96 | 97 | def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None): 98 | """ 99 | Given something that might be a URL (or might be a local path), 100 | determine which. If it's a URL, download the file and cache it, and 101 | return the path to the cached file. If it's already a local path, 102 | make sure the file exists and then return the path. 103 | Args: 104 | cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). 105 | force_download: if True, re-dowload the file even if it's already cached in the cache dir. 106 | """ 107 | if cache_dir is None: 108 | cache_dir = PYTORCH_TRANSFORMERS_CACHE 109 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 110 | url_or_filename = str(url_or_filename) 111 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 112 | cache_dir = str(cache_dir) 113 | 114 | parsed = urlparse(url_or_filename) 115 | 116 | if parsed.scheme in ('http', 'https', 's3'): 117 | # URL, so get it from the cache (downloading if necessary) 118 | return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 119 | elif os.path.exists(url_or_filename): 120 | # File, and it exists. 121 | return url_or_filename 122 | elif parsed.scheme == '': 123 | # File, but it doesn't exist. 124 | raise EnvironmentError("file {} not found".format(url_or_filename)) 125 | else: 126 | # Something unknown 127 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 128 | 129 | 130 | def split_s3_path(url): 131 | """Split a full s3 path into the bucket name and path.""" 132 | parsed = urlparse(url) 133 | if not parsed.netloc or not parsed.path: 134 | raise ValueError("bad s3 path {}".format(url)) 135 | bucket_name = parsed.netloc 136 | s3_path = parsed.path 137 | # Remove '/' at beginning of path. 138 | if s3_path.startswith("/"): 139 | s3_path = s3_path[1:] 140 | return bucket_name, s3_path 141 | 142 | 143 | def s3_request(func): 144 | """ 145 | Wrapper function for s3 requests in order to create more helpful error 146 | messages. 
147 | """ 148 | 149 | @wraps(func) 150 | def wrapper(url, *args, **kwargs): 151 | try: 152 | return func(url, *args, **kwargs) 153 | except ClientError as exc: 154 | if int(exc.response["Error"]["Code"]) == 404: 155 | raise EnvironmentError("file {} not found".format(url)) 156 | else: 157 | raise 158 | 159 | return wrapper 160 | 161 | 162 | @s3_request 163 | def s3_etag(url, proxies=None): 164 | """Check ETag on S3 object.""" 165 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) 166 | bucket_name, s3_path = split_s3_path(url) 167 | s3_object = s3_resource.Object(bucket_name, s3_path) 168 | return s3_object.e_tag 169 | 170 | 171 | @s3_request 172 | def s3_get(url, temp_file, proxies=None): 173 | """Pull a file directly from S3.""" 174 | s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) 175 | bucket_name, s3_path = split_s3_path(url) 176 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 177 | 178 | 179 | def http_get(url, temp_file, proxies=None): 180 | req = requests.get(url, stream=True, proxies=proxies) 181 | content_length = req.headers.get('Content-Length') 182 | total = int(content_length) if content_length is not None else None 183 | progress = tqdm(unit="B", total=total) 184 | for chunk in req.iter_content(chunk_size=1024): 185 | if chunk: # filter out keep-alive new chunks 186 | progress.update(len(chunk)) 187 | temp_file.write(chunk) 188 | progress.close() 189 | 190 | 191 | def get_from_cache(url, cache_dir=None, force_download=False, proxies=None): 192 | """ 193 | Given a URL, look for the corresponding dataset in the local cache. 194 | If it's not there, download it. Then return the path to the cached file. 195 | """ 196 | if cache_dir is None: 197 | cache_dir = PYTORCH_TRANSFORMERS_CACHE 198 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 199 | cache_dir = str(cache_dir) 200 | if sys.version_info[0] == 2 and not isinstance(cache_dir, str): 201 | cache_dir = str(cache_dir) 202 | 203 | if not os.path.exists(cache_dir): 204 | os.makedirs(cache_dir) 205 | 206 | # Get eTag to add to filename, if it exists. 207 | if url.startswith("s3://"): 208 | etag = s3_etag(url, proxies=proxies) 209 | else: 210 | try: 211 | response = requests.head(url, allow_redirects=True, proxies=proxies) 212 | if response.status_code != 200: 213 | etag = None 214 | else: 215 | etag = response.headers.get("ETag") 216 | except EnvironmentError: 217 | etag = None 218 | 219 | if sys.version_info[0] == 2 and etag is not None: 220 | etag = etag.decode('utf-8') 221 | filename = url_to_filename(url, etag) 222 | 223 | # get cache path to put the file 224 | cache_path = os.path.join(cache_dir, filename) 225 | 226 | # If we don't have a connection (etag is None) and can't identify the file 227 | # try to get the last downloaded one 228 | if not os.path.exists(cache_path) and etag is None: 229 | matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') 230 | matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) 231 | if matching_files: 232 | cache_path = os.path.join(cache_dir, matching_files[-1]) 233 | 234 | if not os.path.exists(cache_path) or force_download: 235 | # Download to temporary file, then copy to cache dir once finished. 236 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
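# --- Editor's sketch (not part of the original file): how the cache-naming
# scheme in file_utils.py fits together. `url_to_filename()` hashes the URL
# with sha256 and, when an ETag is available, appends a second hash;
# `get_from_cache()` then writes the payload to <cache_dir>/<filename> and a
# small JSON sidecar to <cache_dir>/<filename>.json, which `filename_to_url()`
# reads back. The URL and ETag below are made-up values for illustration only.
#
#   from hashlib import sha256
#   url, etag = "https://example.com/vocab.txt", "686897696a7c876b7e"   # hypothetical
#   filename = sha256(url.encode("utf-8")).hexdigest()
#   filename += "." + sha256(etag.encode("utf-8")).hexdigest()
#   meta = {"url": url, "etag": etag}        # stored as <filename>.json next to the file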
237 | with tempfile.NamedTemporaryFile() as temp_file: 238 | logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) 239 | 240 | # GET file object 241 | if url.startswith("s3://"): 242 | s3_get(url, temp_file, proxies=proxies) 243 | else: 244 | http_get(url, temp_file, proxies=proxies) 245 | 246 | # we are copying the file before closing it, so flush to avoid truncation 247 | temp_file.flush() 248 | # shutil.copyfileobj() starts at the current position, so go to the start 249 | temp_file.seek(0) 250 | 251 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 252 | with open(cache_path, 'wb') as cache_file: 253 | shutil.copyfileobj(temp_file, cache_file) 254 | 255 | logger.info("creating metadata file for %s", cache_path) 256 | meta = {'url': url, 'etag': etag} 257 | meta_path = cache_path + '.json' 258 | with open(meta_path, 'w') as meta_file: 259 | output_string = json.dumps(meta) 260 | if sys.version_info[0] == 2 and isinstance(output_string, str): 261 | output_string = unicode(output_string, 'utf-8') # The beauty of python 2 262 | meta_file.write(output_string) 263 | 264 | logger.info("removing temp file %s", temp_file.name) 265 | 266 | return cache_path 267 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """PyTorch optimization for BERT model.""" 16 | 17 | import logging 18 | import math 19 | 20 | import torch 21 | from torch.optim import Optimizer 22 | from torch.optim.lr_scheduler import LambdaLR 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | class ConstantLRSchedule(LambdaLR): 27 | """ Constant learning rate schedule. 28 | """ 29 | def __init__(self, optimizer, last_epoch=-1): 30 | super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch) 31 | 32 | 33 | class WarmupConstantSchedule(LambdaLR): 34 | """ Linear warmup and then constant. 35 | Linearly increases learning rate schedule from 0 to 1 over `warmup_steps` training steps. 36 | Keeps learning rate schedule equal to 1. after warmup_steps. 37 | """ 38 | def __init__(self, optimizer, warmup_steps, last_epoch=-1): 39 | self.warmup_steps = warmup_steps 40 | super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 41 | 42 | def lr_lambda(self, step): 43 | if step < self.warmup_steps: 44 | return float(step) / float(max(1.0, self.warmup_steps)) 45 | return 1. 46 | 47 | 48 | class WarmupLinearSchedule(LambdaLR): 49 | """ Linear warmup and then linear decay. 50 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 51 | Linearly decreases learning rate from 1. to 0. 
over remaining `t_total - warmup_steps` steps. 52 | """ 53 | def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): 54 | self.warmup_steps = warmup_steps 55 | self.t_total = t_total 56 | super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 57 | 58 | def lr_lambda(self, step): 59 | if step < self.warmup_steps: 60 | return float(step) / float(max(1, self.warmup_steps)) 61 | return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps))) 62 | 63 | 64 | class WarmupCosineSchedule(LambdaLR): 65 | """ Linear warmup and then cosine decay. 66 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 67 | Decreases learning rate from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve. 68 | If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. 69 | """ 70 | def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1): 71 | self.warmup_steps = warmup_steps 72 | self.t_total = t_total 73 | self.cycles = cycles 74 | super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 75 | 76 | def lr_lambda(self, step): 77 | if step < self.warmup_steps: 78 | return float(step) / float(max(1.0, self.warmup_steps)) 79 | # progress after warmup 80 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) 81 | return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) 82 | 83 | 84 | class WarmupCosineWithHardRestartsSchedule(LambdaLR): 85 | """ Linear warmup and then cosine cycles with hard restarts. 86 | Linearly increases learning rate from 0 to 1 over `warmup_steps` training steps. 87 | If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying 88 | learning rate (with hard restarts). 89 | """ 90 | def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1): 91 | self.warmup_steps = warmup_steps 92 | self.t_total = t_total 93 | self.cycles = cycles 94 | super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) 95 | 96 | def lr_lambda(self, step): 97 | if step < self.warmup_steps: 98 | return float(step) / float(max(1, self.warmup_steps)) 99 | # progress after warmup 100 | progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) 101 | if progress >= 1.0: 102 | return 0.0 103 | return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0)))) 104 | 105 | 106 | 107 | class AdamW(Optimizer): 108 | """ Implements Adam algorithm with weight decay fix. 109 | 110 | Parameters: 111 | lr (float): learning rate. Default 1e-3. 112 | betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999) 113 | eps (float): Adams epsilon. Default: 1e-6 114 | weight_decay (float): Weight decay. Default: 0.0 115 | correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. 
116 | """ 117 | def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): 118 | if lr < 0.0: 119 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 120 | if not 0.0 <= betas[0] < 1.0: 121 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) 122 | if not 0.0 <= betas[1] < 1.0: 123 | raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) 124 | if not 0.0 <= eps: 125 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) 126 | defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, 127 | correct_bias=correct_bias) 128 | super(AdamW, self).__init__(params, defaults) 129 | 130 | def step(self, closure=None): 131 | """Performs a single optimization step. 132 | 133 | Arguments: 134 | closure (callable, optional): A closure that reevaluates the model 135 | and returns the loss. 136 | """ 137 | loss = None 138 | if closure is not None: 139 | loss = closure() 140 | 141 | for group in self.param_groups: 142 | for p in group['params']: 143 | if p.grad is None: 144 | continue 145 | grad = p.grad.data 146 | if grad.is_sparse: 147 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 148 | 149 | state = self.state[p] 150 | 151 | # State initialization 152 | if len(state) == 0: 153 | state['step'] = 0 154 | # Exponential moving average of gradient values 155 | state['exp_avg'] = torch.zeros_like(p.data) 156 | # Exponential moving average of squared gradient values 157 | state['exp_avg_sq'] = torch.zeros_like(p.data) 158 | 159 | exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] 160 | beta1, beta2 = group['betas'] 161 | 162 | state['step'] += 1 163 | 164 | # Decay the first and second moment running average coefficient 165 | # In-place operations to update the averages at the same time 166 | exp_avg.mul_(beta1).add_(1.0 - beta1, grad) 167 | exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) 168 | denom = exp_avg_sq.sqrt().add_(group['eps']) 169 | 170 | step_size = group['lr'] 171 | if group['correct_bias']: # No bias correction for Bert 172 | bias_correction1 = 1.0 - beta1 ** state['step'] 173 | bias_correction2 = 1.0 - beta2 ** state['step'] 174 | step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 175 | 176 | p.data.addcdiv_(-step_size, exp_avg, denom) 177 | 178 | # Just adding the square of the weights to the loss function is *not* 179 | # the correct way of using L2 regularization/weight decay with Adam, 180 | # since that will interact with the m and v parameters in strange ways. 181 | # 182 | # Instead we want to decay the weights in a manner that doesn't interact 183 | # with the m/v parameters. This is equivalent to adding the square 184 | # of the weights to the loss with plain (non-momentum) SGD. 185 | # Add weight decay at the end (fixed version) 186 | if group['weight_decay'] > 0.0: 187 | p.data.add_(-group['lr'] * group['weight_decay'], p.data) 188 | 189 | return loss 190 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/tokenization_auto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
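# --- Editor's sketch (not part of the original file): typical wiring of the
# AdamW optimizer and WarmupLinearSchedule defined in optimization.py above.
# `model` and `dataloader` are placeholders; the hyper-parameters mirror the
# values passed by pystart.py (lr=8e-6, eps=1e-6, weight_decay=0, warmup 0,
# 40000 total steps).
#
#   from pytorch_transformers import AdamW, WarmupLinearSchedule
#   optimizer = AdamW(model.parameters(), lr=8e-6, eps=1e-6, weight_decay=0.0)
#   scheduler = WarmupLinearSchedule(optimizer, warmup_steps=0, t_total=40000)
#   for batch in dataloader:                 # hypothetical training loop
#       loss = model(**batch)[0]
#       loss.backward()
#       optimizer.step()
#       scheduler.step()                     # per-step LambdaLR: step after the optimizer
#       optimizer.zero_grad()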
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Auto Model class. """ 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import logging 20 | 21 | from .tokenization_bert import BertTokenizer 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | class AutoTokenizer(object): 26 | r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class 27 | that will be instantiated as one of the tokenizer classes of the library 28 | when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` 29 | class method. 30 | 31 | The `from_pretrained()` method take care of returning the correct tokenizer class instance 32 | using pattern matching on the `pretrained_model_name_or_path` string. 33 | 34 | The tokenizer class to instantiate is selected as the first pattern matching 35 | in the `pretrained_model_name_or_path` string (in the following order): 36 | - contains `bert`: BertTokenizer (Bert model) 37 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 38 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 39 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 40 | - contains `xlnet`: XLNetTokenizer (XLNet model) 41 | - contains `xlm`: XLMTokenizer (XLM model) 42 | - contains `roberta`: RobertaTokenizer (RoBERTa model) 43 | 44 | This class cannot be instantiated using `__init__()` (throw an error). 45 | """ 46 | def __init__(self): 47 | raise EnvironmentError("AutoTokenizer is designed to be instantiated " 48 | "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.") 49 | 50 | @classmethod 51 | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): 52 | r""" Instantiate a one of the tokenizer classes of the library 53 | from a pre-trained model vocabulary. 54 | 55 | The tokenizer class to instantiate is selected as the first pattern matching 56 | in the `pretrained_model_name_or_path` string (in the following order): 57 | - contains `bert`: BertTokenizer (Bert model) 58 | - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) 59 | - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) 60 | - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) 61 | - contains `xlnet`: XLNetTokenizer (XLNet model) 62 | - contains `xlm`: XLMTokenizer (XLM model) 63 | - contains `roberta`: RobertaTokenizer (XLM model) 64 | 65 | Params: 66 | **pretrained_model_name_or_path**: either: 67 | - a string with the `shortcut name` of a pre-trained model configuration to load from cache 68 | or download and cache if not already stored in cache (e.g. 'bert-base-uncased'). 69 | - a path to a `directory` containing a configuration file saved 70 | using the `save_pretrained(save_directory)` method. 71 | - a path or url to a saved configuration `file`. 72 | **cache_dir**: (`optional`) string: 73 | Path to a directory in which a downloaded pre-trained model 74 | configuration should be cached if the standard cache should not be used. 
75 | 76 | Examples:: 77 | 78 | config = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. 79 | config = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` 80 | 81 | """ 82 | if 'roberta' in pretrained_model_name_or_path: 83 | return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 84 | elif 'bert' in pretrained_model_name_or_path: 85 | return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 86 | elif 'openai-gpt' in pretrained_model_name_or_path: 87 | return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 88 | elif 'gpt2' in pretrained_model_name_or_path: 89 | return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 90 | elif 'transfo-xl' in pretrained_model_name_or_path: 91 | return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 92 | elif 'xlnet' in pretrained_model_name_or_path: 93 | return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 94 | elif 'xlm' in pretrained_model_name_or_path: 95 | return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 96 | 97 | raise ValueError("Unrecognized model identifier in {}. Should contains one of " 98 | "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " 99 | "'xlm', 'roberta'".format(pretrained_model_name_or_path)) 100 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/tokenization_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
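# --- Editor's note (not part of the original file): AutoTokenizer in
# tokenization_auto.py above dispatches purely on substring matching of the
# model name or path. In this trimmed copy only BertTokenizer is imported, so
# only names hitting the 'bert' branch resolve to an importable class; a
# minimal usage sketch:
#
#   from pytorch_transformers import AutoTokenizer, BertTokenizer
#   tok = AutoTokenizer.from_pretrained("bert-base-chinese")   # fetches vocab.txt from the map above
#   assert isinstance(tok, BertTokenizer)                      # "bert" substring -> BertTokenizer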
15 | """Tokenization classes.""" 16 | 17 | from __future__ import absolute_import, division, print_function, unicode_literals 18 | 19 | import collections 20 | import logging 21 | import os 22 | import unicodedata 23 | from io import open 24 | 25 | from .tokenization_utils import PreTrainedTokenizer 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} 30 | 31 | PRETRAINED_VOCAB_FILES_MAP = { 32 | 'vocab_file': 33 | { 34 | 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", 35 | 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", 36 | 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt", 37 | 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt", 38 | 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt", 39 | 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt", 40 | 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt", 41 | 'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt", 42 | 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt", 43 | 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt", 44 | 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt", 45 | 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt", 46 | 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", 47 | } 48 | } 49 | 50 | PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 51 | 'bert-base-uncased': 512, 52 | 'bert-large-uncased': 512, 53 | 'bert-base-cased': 512, 54 | 'bert-large-cased': 512, 55 | 'bert-base-multilingual-uncased': 512, 56 | 'bert-base-multilingual-cased': 512, 57 | 'bert-base-chinese': 512, 58 | 'bert-base-german-cased': 512, 59 | 'bert-large-uncased-whole-word-masking': 512, 60 | 'bert-large-cased-whole-word-masking': 512, 61 | 'bert-large-uncased-whole-word-masking-finetuned-squad': 512, 62 | 'bert-large-cased-whole-word-masking-finetuned-squad': 512, 63 | 'bert-base-cased-finetuned-mrpc': 512, 64 | } 65 | 66 | def load_vocab(vocab_file): 67 | """Loads a vocabulary file into a dictionary.""" 68 | vocab = collections.OrderedDict() 69 | with open(vocab_file, "r", encoding="utf-8") as reader: 70 | tokens = reader.readlines() 71 | for index, token in enumerate(tokens): 72 | token = token.rstrip('\n') 73 | vocab[token] = index 74 | return vocab 75 | 76 | 77 | def whitespace_tokenize(text): 78 | """Runs basic whitespace cleaning and splitting on a piece of text.""" 79 | text = text.strip() 80 | if not text: 81 | return [] 82 | tokens = text.split() 83 | return tokens 84 | 85 | 86 | class BertTokenizer(PreTrainedTokenizer): 87 | r""" 88 | Constructs a BertTokenizer. 
89 | :class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece 90 | 91 | Args: 92 | vocab_file: Path to a one-wordpiece-per-line vocabulary file 93 | do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False 94 | do_basic_tokenize: Whether to do basic tokenization before wordpiece. 95 | max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the 96 | minimum of this value (if specified) and the underlying BERT model's sequence length. 97 | never_split: List of tokens which will never be split during tokenization. Only has an effect when 98 | do_wordpiece_only=False 99 | """ 100 | 101 | vocab_files_names = VOCAB_FILES_NAMES 102 | pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 103 | max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 104 | 105 | def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None, 106 | unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", 107 | mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs): 108 | """Constructs a BertTokenizer. 109 | 110 | Args: 111 | **vocab_file**: Path to a one-wordpiece-per-line vocabulary file 112 | **do_lower_case**: (`optional`) boolean (default True) 113 | Whether to lower case the input 114 | Only has an effect when do_basic_tokenize=True 115 | **do_basic_tokenize**: (`optional`) boolean (default True) 116 | Whether to do basic tokenization before wordpiece. 117 | **never_split**: (`optional`) list of string 118 | List of tokens which will never be split during tokenization. 119 | Only has an effect when do_basic_tokenize=True 120 | **tokenize_chinese_chars**: (`optional`) boolean (default True) 121 | Whether to tokenize Chinese characters. 122 | This should likely be deactivated for Japanese: 123 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 124 | """ 125 | super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, 126 | pad_token=pad_token, cls_token=cls_token, 127 | mask_token=mask_token, **kwargs) 128 | if not os.path.isfile(vocab_file): 129 | raise ValueError( 130 | "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " 131 | "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) 132 | self.vocab = load_vocab(vocab_file) 133 | self.ids_to_tokens = collections.OrderedDict( 134 | [(ids, tok) for tok, ids in self.vocab.items()]) 135 | self.do_basic_tokenize = do_basic_tokenize 136 | if do_basic_tokenize: 137 | self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, 138 | never_split=never_split, 139 | tokenize_chinese_chars=tokenize_chinese_chars) 140 | self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) 141 | 142 | @property 143 | def vocab_size(self): 144 | return len(self.vocab) 145 | 146 | def _tokenize(self, text): 147 | split_tokens = [] 148 | if self.do_basic_tokenize: 149 | for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): 150 | for sub_token in self.wordpiece_tokenizer.tokenize(token): 151 | split_tokens.append(sub_token) 152 | else: 153 | split_tokens = self.wordpiece_tokenizer.tokenize(text) 154 | return split_tokens 155 | 156 | def _convert_token_to_id(self, token): 157 | """ Converts a token (str/unicode) in an id using the vocab. 
""" 158 | return self.vocab.get(token, self.vocab.get(self.unk_token)) 159 | 160 | def _convert_id_to_token(self, index): 161 | """Converts an index (integer) in a token (string/unicode) using the vocab.""" 162 | return self.ids_to_tokens.get(index, self.unk_token) 163 | 164 | def convert_tokens_to_string(self, tokens): 165 | """ Converts a sequence of tokens (string) in a single string. """ 166 | out_string = ' '.join(tokens).replace(' ##', '').strip() 167 | return out_string 168 | 169 | def add_special_tokens_single_sentence(self, token_ids): 170 | """ 171 | Adds special tokens to the a sequence for sequence classification tasks. 172 | A BERT sequence has the following format: [CLS] X [SEP] 173 | """ 174 | return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)] 175 | 176 | def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): 177 | """ 178 | Adds special tokens to a sequence pair for sequence classification tasks. 179 | A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP] 180 | """ 181 | sep = [self._convert_token_to_id(self.sep_token)] 182 | cls = [self._convert_token_to_id(self.cls_token)] 183 | return cls + token_ids_0 + sep + token_ids_1 + sep 184 | 185 | def save_vocabulary(self, vocab_path): 186 | """Save the tokenizer vocabulary to a directory or file.""" 187 | index = 0 188 | if os.path.isdir(vocab_path): 189 | vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) 190 | else: 191 | vocab_file = vocab_path 192 | with open(vocab_file, "w", encoding="utf-8") as writer: 193 | for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): 194 | if index != token_index: 195 | logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." 196 | " Please check that the vocabulary is not corrupted!".format(vocab_file)) 197 | index = token_index 198 | writer.write(token + u'\n') 199 | index += 1 200 | return (vocab_file,) 201 | 202 | @classmethod 203 | def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): 204 | """ Instantiate a BertTokenizer from pre-trained vocabulary files. 205 | """ 206 | if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES: 207 | if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): 208 | logger.warning("The pre-trained model you are loading is a cased model but you have not set " 209 | "`do_lower_case` to False. We are setting `do_lower_case=False` for you but " 210 | "you may want to check this behavior.") 211 | kwargs['do_lower_case'] = False 212 | elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): 213 | logger.warning("The pre-trained model you are loading is an uncased model but you have set " 214 | "`do_lower_case` to False. We are setting `do_lower_case=True` for you " 215 | "but you may want to check this behavior.") 216 | kwargs['do_lower_case'] = True 217 | 218 | return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) 219 | 220 | 221 | class BasicTokenizer(object): 222 | """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" 223 | 224 | def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True): 225 | """ Constructs a BasicTokenizer. 226 | 227 | Args: 228 | **do_lower_case**: Whether to lower case the input. 229 | **never_split**: (`optional`) list of str 230 | Kept for backward compatibility purposes. 
231 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) 232 | List of token not to split. 233 | **tokenize_chinese_chars**: (`optional`) boolean (default True) 234 | Whether to tokenize Chinese characters. 235 | This should likely be deactivated for Japanese: 236 | see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328 237 | """ 238 | if never_split is None: 239 | never_split = [] 240 | self.do_lower_case = do_lower_case 241 | self.never_split = never_split 242 | self.tokenize_chinese_chars = tokenize_chinese_chars 243 | 244 | def tokenize(self, text, never_split=None): 245 | """ Basic Tokenization of a piece of text. 246 | Split on "white spaces" only, for sub-word tokenization, see WordPieceTokenizer. 247 | 248 | Args: 249 | **never_split**: (`optional`) list of str 250 | Kept for backward compatibility purposes. 251 | Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) 252 | List of token not to split. 253 | """ 254 | never_split = self.never_split + (never_split if never_split is not None else []) 255 | text = self._clean_text(text) 256 | # This was added on November 1st, 2018 for the multilingual and Chinese 257 | # models. This is also applied to the English models now, but it doesn't 258 | # matter since the English models were not trained on any Chinese data 259 | # and generally don't have any Chinese data in them (there are Chinese 260 | # characters in the vocabulary because Wikipedia does have some Chinese 261 | # words in the English Wikipedia.). 262 | if self.tokenize_chinese_chars: 263 | text = self._tokenize_chinese_chars(text) 264 | orig_tokens = whitespace_tokenize(text) 265 | split_tokens = [] 266 | for token in orig_tokens: 267 | if self.do_lower_case and token not in never_split: 268 | token = token.lower() 269 | token = self._run_strip_accents(token) 270 | split_tokens.extend(self._run_split_on_punc(token)) 271 | 272 | output_tokens = whitespace_tokenize(" ".join(split_tokens)) 273 | return output_tokens 274 | 275 | def _run_strip_accents(self, text): 276 | """Strips accents from a piece of text.""" 277 | text = unicodedata.normalize("NFD", text) 278 | output = [] 279 | for char in text: 280 | cat = unicodedata.category(char) 281 | if cat == "Mn": 282 | continue 283 | output.append(char) 284 | return "".join(output) 285 | 286 | def _run_split_on_punc(self, text, never_split=None): 287 | """Splits punctuation on a piece of text.""" 288 | if never_split is not None and text in never_split: 289 | return [text] 290 | chars = list(text) 291 | i = 0 292 | start_new_word = True 293 | output = [] 294 | while i < len(chars): 295 | char = chars[i] 296 | if _is_punctuation(char): 297 | output.append([char]) 298 | start_new_word = True 299 | else: 300 | if start_new_word: 301 | output.append([]) 302 | start_new_word = False 303 | output[-1].append(char) 304 | i += 1 305 | 306 | return ["".join(x) for x in output] 307 | 308 | def _tokenize_chinese_chars(self, text): 309 | """Adds whitespace around any CJK character.""" 310 | output = [] 311 | for char in text: 312 | cp = ord(char) 313 | if self._is_chinese_char(cp): 314 | output.append(" ") 315 | output.append(char) 316 | output.append(" ") 317 | else: 318 | output.append(char) 319 | return "".join(output) 320 | 321 | def _is_chinese_char(self, cp): 322 | """Checks whether CP is the codepoint of a CJK character.""" 323 | # This defines a "chinese character" as anything in the CJK Unicode block: 324 | # 
https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) 325 | # 326 | # Note that the CJK Unicode block is NOT all Japanese and Korean characters, 327 | # despite its name. The modern Korean Hangul alphabet is a different block, 328 | # as is Japanese Hiragana and Katakana. Those alphabets are used to write 329 | # space-separated words, so they are not treated specially and handled 330 | # like the all of the other languages. 331 | if ((cp >= 0x4E00 and cp <= 0x9FFF) or # 332 | (cp >= 0x3400 and cp <= 0x4DBF) or # 333 | (cp >= 0x20000 and cp <= 0x2A6DF) or # 334 | (cp >= 0x2A700 and cp <= 0x2B73F) or # 335 | (cp >= 0x2B740 and cp <= 0x2B81F) or # 336 | (cp >= 0x2B820 and cp <= 0x2CEAF) or 337 | (cp >= 0xF900 and cp <= 0xFAFF) or # 338 | (cp >= 0x2F800 and cp <= 0x2FA1F)): # 339 | return True 340 | 341 | return False 342 | 343 | def _clean_text(self, text): 344 | """Performs invalid character removal and whitespace cleanup on text.""" 345 | output = [] 346 | for char in text: 347 | cp = ord(char) 348 | if cp == 0 or cp == 0xfffd or _is_control(char): 349 | continue 350 | if _is_whitespace(char): 351 | output.append(" ") 352 | else: 353 | output.append(char) 354 | return "".join(output) 355 | 356 | 357 | class WordpieceTokenizer(object): 358 | """Runs WordPiece tokenization.""" 359 | 360 | def __init__(self, vocab, unk_token, max_input_chars_per_word=100): 361 | self.vocab = vocab 362 | self.unk_token = unk_token 363 | self.max_input_chars_per_word = max_input_chars_per_word 364 | 365 | def tokenize(self, text): 366 | """Tokenizes a piece of text into its word pieces. 367 | 368 | This uses a greedy longest-match-first algorithm to perform tokenization 369 | using the given vocabulary. 370 | 371 | For example: 372 | input = "unaffable" 373 | output = ["un", "##aff", "##able"] 374 | 375 | Args: 376 | text: A single token or whitespace separated tokens. This should have 377 | already been passed through `BasicTokenizer`. 378 | 379 | Returns: 380 | A list of wordpiece tokens. 381 | """ 382 | 383 | output_tokens = [] 384 | for token in whitespace_tokenize(text): 385 | chars = list(token) 386 | if len(chars) > self.max_input_chars_per_word: 387 | output_tokens.append(self.unk_token) 388 | continue 389 | 390 | is_bad = False 391 | start = 0 392 | sub_tokens = [] 393 | while start < len(chars): 394 | end = len(chars) 395 | cur_substr = None 396 | while start < end: 397 | substr = "".join(chars[start:end]) 398 | if start > 0: 399 | substr = "##" + substr 400 | if substr in self.vocab: 401 | cur_substr = substr 402 | break 403 | end -= 1 404 | if cur_substr is None: 405 | is_bad = True 406 | break 407 | sub_tokens.append(cur_substr) 408 | start = end 409 | 410 | if is_bad: 411 | output_tokens.append(self.unk_token) 412 | else: 413 | output_tokens.extend(sub_tokens) 414 | return output_tokens 415 | 416 | 417 | def _is_whitespace(char): 418 | """Checks whether `chars` is a whitespace character.""" 419 | # \t, \n, and \r are technically contorl characters but we treat them 420 | # as whitespace since they are generally considered as such. 421 | if char == " " or char == "\t" or char == "\n" or char == "\r": 422 | return True 423 | cat = unicodedata.category(char) 424 | if cat == "Zs": 425 | return True 426 | return False 427 | 428 | 429 | def _is_control(char): 430 | """Checks whether `chars` is a control character.""" 431 | # These are technically control characters but we count them as whitespace 432 | # characters. 
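# --- Editor's sketch (not part of the original file): the greedy
# longest-match-first loop of WordpieceTokenizer.tokenize() above, traced on
# the docstring example. The toy vocabulary is made up for illustration.
#
#   vocab = {"un", "##aff", "##able", "[UNK]"}
#   token = "unaffable"
#   # start=0: longest prefix found in vocab is "un"            -> emit "un",    start=2
#   # start=2: "##affable" ... shrink window until "##aff" hits -> emit "##aff", start=5
#   # start=5: "##able" is in vocab                             -> emit "##able"
#   # result: ["un", "##aff", "##able"]; if any position finds no match, the
#   # whole token is replaced by the single unk_token "[UNK]".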
433 | if char == "\t" or char == "\n" or char == "\r": 434 | return False 435 | cat = unicodedata.category(char) 436 | if cat.startswith("C"): 437 | return True 438 | return False 439 | 440 | 441 | def _is_punctuation(char): 442 | """Checks whether `chars` is a punctuation character.""" 443 | cp = ord(char) 444 | # We treat all non-letter/number ASCII as punctuation. 445 | # Characters such as "^", "$", and "`" are not in the Unicode 446 | # Punctuation class but we treat them as punctuation anyways, for 447 | # consistency. 448 | if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or 449 | (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): 450 | return True 451 | cat = unicodedata.category(char) 452 | if cat.startswith("P"): 453 | return True 454 | return False 455 | -------------------------------------------------------------------------------- /scheme1/code1/pytorch_transformers/tokenization_utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Tokenization classes for OpenAI GPT.""" 16 | from __future__ import (absolute_import, division, print_function, 17 | unicode_literals) 18 | 19 | import logging 20 | import os 21 | import json 22 | import six 23 | from io import open 24 | 25 | from .file_utils import cached_path 26 | 27 | logger = logging.getLogger(__name__) 28 | 29 | SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json' 30 | ADDED_TOKENS_FILE = 'added_tokens.json' 31 | 32 | class PreTrainedTokenizer(object): 33 | """ Base class for all tokenizers. 34 | Handle all the shared methods for tokenization and special tokens as well as methods dowloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary. 35 | 36 | This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). 37 | 38 | Class attributes (overridden by derived classes): 39 | 40 | - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string). 41 | - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file. 42 | - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size. 
43 | 44 | Parameters: 45 | 46 | - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` 47 | 48 | - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` 49 | 50 | - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` 51 | 52 | - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` 53 | 54 | - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` 55 | 56 | - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` 57 | 58 | - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` 59 | 60 | - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` 61 | """ 62 | vocab_files_names = {} 63 | pretrained_vocab_files_map = {} 64 | max_model_input_sizes = {} 65 | 66 | SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token", 67 | "pad_token", "cls_token", "mask_token", 68 | "additional_special_tokens"] 69 | 70 | @property 71 | def bos_token(self): 72 | """ Beginning of sentence token (string). Log an error if used while not having been set. """ 73 | if self._bos_token is None: 74 | logger.error("Using bos_token, but it is not set yet.") 75 | return self._bos_token 76 | 77 | @property 78 | def eos_token(self): 79 | """ End of sentence token (string). Log an error if used while not having been set. """ 80 | if self._eos_token is None: 81 | logger.error("Using eos_token, but it is not set yet.") 82 | return self._eos_token 83 | 84 | @property 85 | def unk_token(self): 86 | """ Unknown token (string). Log an error if used while not having been set. """ 87 | if self._unk_token is None: 88 | logger.error("Using unk_token, but it is not set yet.") 89 | return self._unk_token 90 | 91 | @property 92 | def sep_token(self): 93 | """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ 94 | if self._sep_token is None: 95 | logger.error("Using sep_token, but it is not set yet.") 96 | return self._sep_token 97 | 98 | @property 99 | def pad_token(self): 100 | """ Padding token (string). Log an error if used while not having been set. """ 101 | if self._pad_token is None: 102 | logger.error("Using pad_token, but it is not set yet.") 103 | return self._pad_token 104 | 105 | @property 106 | def cls_token(self): 107 | """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ 108 | if self._cls_token is None: 109 | logger.error("Using cls_token, but it is not set yet.") 110 | return self._cls_token 111 | 112 | @property 113 | def mask_token(self): 114 | """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. 
""" 115 | if self._mask_token is None: 116 | logger.error("Using mask_token, but it is not set yet.") 117 | return self._mask_token 118 | 119 | @property 120 | def additional_special_tokens(self): 121 | """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ 122 | if self._additional_special_tokens is None: 123 | logger.error("Using additional_special_tokens, but it is not set yet.") 124 | return self._additional_special_tokens 125 | 126 | @bos_token.setter 127 | def bos_token(self, value): 128 | self._bos_token = value 129 | 130 | @eos_token.setter 131 | def eos_token(self, value): 132 | self._eos_token = value 133 | 134 | @unk_token.setter 135 | def unk_token(self, value): 136 | self._unk_token = value 137 | 138 | @sep_token.setter 139 | def sep_token(self, value): 140 | self._sep_token = value 141 | 142 | @pad_token.setter 143 | def pad_token(self, value): 144 | self._pad_token = value 145 | 146 | @cls_token.setter 147 | def cls_token(self, value): 148 | self._cls_token = value 149 | 150 | @mask_token.setter 151 | def mask_token(self, value): 152 | self._mask_token = value 153 | 154 | @additional_special_tokens.setter 155 | def additional_special_tokens(self, value): 156 | self._additional_special_tokens = value 157 | 158 | def __init__(self, max_len=None, **kwargs): 159 | self._bos_token = None 160 | self._eos_token = None 161 | self._unk_token = None 162 | self._sep_token = None 163 | self._pad_token = None 164 | self._cls_token = None 165 | self._mask_token = None 166 | self._additional_special_tokens = [] 167 | 168 | self.max_len = max_len if max_len is not None else int(1e12) 169 | self.added_tokens_encoder = {} 170 | self.added_tokens_decoder = {} 171 | 172 | for key, value in kwargs.items(): 173 | if key in self.SPECIAL_TOKENS_ATTRIBUTES: 174 | if key == 'additional_special_tokens': 175 | assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) 176 | else: 177 | assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) 178 | setattr(self, key, value) 179 | 180 | 181 | @classmethod 182 | def from_pretrained(cls, *inputs, **kwargs): 183 | r""" 184 | Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer. 185 | 186 | Args: 187 | pretrained_model_name_or_path: either: 188 | 189 | - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. 190 | - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. 191 | - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. 192 | 193 | cache_dir: (`optional`) string: 194 | Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used. 195 | 196 | force_download: (`optional`) boolean, default False: 197 | Force to (re-)download the vocabulary files and override the cached versions if they exists. 198 | 199 | proxies: (`optional`) dict, default None: 200 | A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. 
201 | The proxies are used on each request.
202 | 
203 | inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
204 | 
205 | kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
206 | 
207 | Examples::
208 | 
209 | # We can't directly instantiate the base class `PreTrainedTokenizer`, so let's show our examples on a derived class: BertTokenizer
210 | 
211 | # Download vocabulary from S3 and cache.
212 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
213 | 
214 | # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
215 | tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
216 | 
217 | # If the tokenizer uses a single vocabulary file, you can point directly to this file
218 | tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
219 | 
220 | # You can link tokens to special vocabulary when instantiating
221 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
222 | # You should be sure '<unk>' is in the vocabulary when doing that.
223 | # Otherwise, use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead.
224 | assert tokenizer.unk_token == '<unk>'
225 | 
226 | """
227 | return cls._from_pretrained(*inputs, **kwargs)
228 | 
229 | 
230 | @classmethod
231 | def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
232 | cache_dir = kwargs.pop('cache_dir', None)
233 | force_download = kwargs.pop('force_download', False)
234 | proxies = kwargs.pop('proxies', None)
235 | 
236 | s3_models = list(cls.max_model_input_sizes.keys())
237 | vocab_files = {}
238 | if pretrained_model_name_or_path in s3_models:
239 | # Get the vocabulary from AWS S3 bucket
240 | for file_id, map_list in cls.pretrained_vocab_files_map.items():
241 | vocab_files[file_id] = map_list[pretrained_model_name_or_path]
242 | else:
243 | # Get the vocabulary from local files
244 | logger.info(
245 | "Model name '{}' not found in model shortcut name list ({}). "
246 | "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
247 | pretrained_model_name_or_path, ', '.join(s3_models),
248 | pretrained_model_name_or_path))
249 | 
250 | # Look for the tokenizer main vocabulary files
251 | for file_id, file_name in cls.vocab_files_names.items():
252 | if os.path.isdir(pretrained_model_name_or_path):
253 | # If a directory is provided we look for the standard filenames
254 | full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
255 | else:
256 | # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
257 | full_file_name = pretrained_model_name_or_path
258 | if not os.path.exists(full_file_name):
259 | logger.info("Didn't find file {}. 
We won't load it.".format(full_file_name)) 260 | full_file_name = None 261 | vocab_files[file_id] = full_file_name 262 | 263 | # Look for the additional tokens files 264 | all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, 265 | 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE} 266 | 267 | # If a path to a file was provided, get the parent directory 268 | saved_directory = pretrained_model_name_or_path 269 | if os.path.exists(saved_directory) and not os.path.isdir(saved_directory): 270 | saved_directory = os.path.dirname(saved_directory) 271 | 272 | for file_id, file_name in all_vocab_files_names.items(): 273 | full_file_name = os.path.join(saved_directory, file_name) 274 | if not os.path.exists(full_file_name): 275 | logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) 276 | full_file_name = None 277 | vocab_files[file_id] = full_file_name 278 | 279 | if all(full_file_name is None for full_file_name in vocab_files.values()): 280 | logger.error( 281 | "Model name '{}' was not found in model name list ({}). " 282 | "We assumed '{}' was a path or url but couldn't find tokenizer files" 283 | "at this path or url.".format( 284 | pretrained_model_name_or_path, ', '.join(s3_models), 285 | pretrained_model_name_or_path, )) 286 | return None 287 | 288 | # Get files from url, cache, or disk depending on the case 289 | try: 290 | resolved_vocab_files = {} 291 | for file_id, file_path in vocab_files.items(): 292 | if file_path is None: 293 | resolved_vocab_files[file_id] = None 294 | else: 295 | resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies) 296 | except EnvironmentError as e: 297 | if pretrained_model_name_or_path in s3_models: 298 | logger.error("Couldn't reach server to download vocabulary.") 299 | else: 300 | logger.error( 301 | "Model name '{}' was not found in model name list ({}). " 302 | "We assumed '{}' was a path or url but couldn't find files {} " 303 | "at this path or url.".format( 304 | pretrained_model_name_or_path, ', '.join(s3_models), 305 | pretrained_model_name_or_path, str(vocab_files.keys()))) 306 | raise e 307 | 308 | for file_id, file_path in vocab_files.items(): 309 | if file_path == resolved_vocab_files[file_id]: 310 | logger.info("loading file {}".format(file_path)) 311 | else: 312 | logger.info("loading file {} from cache at {}".format( 313 | file_path, resolved_vocab_files[file_id])) 314 | 315 | # Set max length if needed 316 | if pretrained_model_name_or_path in cls.max_model_input_sizes: 317 | # if we're using a pretrained model, ensure the tokenizer 318 | # wont index sequences longer than the number of positional embeddings 319 | max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] 320 | if max_len is not None and isinstance(max_len, (int, float)): 321 | kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len) 322 | 323 | # Merge resolved_vocab_files arguments in kwargs. 
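# Illustrative sketch (values are examples, not guaranteed): for a BERT-style
# tokenizer loaded from a local directory, resolved_vocab_files might look like
#     {'vocab_file': './my_model_directory/vocab.txt',
#      'added_tokens_file': None, 'special_tokens_map_file': None}
# The two auxiliary entries are popped off below; whatever remains (here
# 'vocab_file') is merged into kwargs and forwarded to the tokenizer's __init__.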
324 | added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None) 325 | special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None) 326 | for args_name, file_path in resolved_vocab_files.items(): 327 | if args_name not in kwargs: 328 | kwargs[args_name] = file_path 329 | if special_tokens_map_file is not None: 330 | special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8")) 331 | for key, value in special_tokens_map.items(): 332 | if key not in kwargs: 333 | kwargs[key] = value 334 | 335 | # Instantiate tokenizer. 336 | tokenizer = cls(*inputs, **kwargs) 337 | 338 | # Add supplementary tokens. 339 | if added_tokens_file is not None: 340 | added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8")) 341 | added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} 342 | tokenizer.added_tokens_encoder.update(added_tok_encoder) 343 | tokenizer.added_tokens_decoder.update(added_tok_decoder) 344 | 345 | return tokenizer 346 | 347 | 348 | def save_pretrained(self, save_directory): 349 | """ Save the tokenizer vocabulary files (with added tokens) and the 350 | special-tokens-to-class-attributes-mapping to a directory. 351 | 352 | This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method. 353 | """ 354 | if not os.path.isdir(save_directory): 355 | logger.error("Saving directory ({}) should be a directory".format(save_directory)) 356 | return 357 | 358 | special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) 359 | added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) 360 | 361 | with open(special_tokens_map_file, 'w', encoding='utf-8') as f: 362 | f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) 363 | 364 | with open(added_tokens_file, 'w', encoding='utf-8') as f: 365 | if self.added_tokens_encoder: 366 | out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) 367 | else: 368 | out_str = u"{}" 369 | f.write(out_str) 370 | 371 | vocab_files = self.save_vocabulary(save_directory) 372 | 373 | return vocab_files + (special_tokens_map_file, added_tokens_file) 374 | 375 | 376 | def save_vocabulary(self, save_directory): 377 | """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens 378 | and special token mappings. 379 | 380 | Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method. 381 | """ 382 | raise NotImplementedError 383 | 384 | 385 | def vocab_size(self): 386 | """ Size of the base vocabulary (without the added tokens) """ 387 | raise NotImplementedError 388 | 389 | 390 | def __len__(self): 391 | """ Size of the full vocabulary with the added tokens """ 392 | return self.vocab_size + len(self.added_tokens_encoder) 393 | 394 | 395 | def add_tokens(self, new_tokens): 396 | """ 397 | Add a list of new tokens to the tokenizer class. If the new tokens are not in the 398 | vocabulary, they are added to it with indices starting from length of the current vocabulary. 399 | 400 | Args: 401 | new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). 402 | 403 | Returns: 404 | Number of tokens added to the vocabulary. 
405 | 406 | Examples:: 407 | 408 | # Let's see how to increase the vocabulary of Bert model and tokenizer 409 | tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') 410 | model = BertModel.from_pretrained('bert-base-uncased') 411 | 412 | num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2']) 413 | print('We have added', num_added_toks, 'tokens') 414 | model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 415 | """ 416 | if not new_tokens: 417 | return 0 418 | 419 | to_add_tokens = [] 420 | for token in new_tokens: 421 | assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode)) 422 | if token != self.unk_token and \ 423 | self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token): 424 | to_add_tokens.append(token) 425 | logger.info("Adding %s to the vocabulary", token) 426 | 427 | added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) 428 | added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} 429 | self.added_tokens_encoder.update(added_tok_encoder) 430 | self.added_tokens_decoder.update(added_tok_decoder) 431 | 432 | return len(to_add_tokens) 433 | 434 | 435 | def add_special_tokens(self, special_tokens_dict): 436 | """ 437 | Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them 438 | to class attributes. If special tokens are NOT in the vocabulary, they are added 439 | to it (indexed starting from the last index of the current vocabulary). 440 | 441 | Args: 442 | special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: 443 | [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, 444 | ``additional_special_tokens``]. 445 | 446 | Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). 447 | 448 | Returns: 449 | Number of tokens added to the vocabulary. 450 | 451 | Examples:: 452 | 453 | # Let's see how to add a new classification token to GPT-2 454 | tokenizer = GPT2Tokenizer.from_pretrained('gpt2') 455 | model = GPT2Model.from_pretrained('gpt2') 456 | 457 | special_tokens_dict = {'cls_token': ''} 458 | 459 | num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) 460 | print('We have added', num_added_toks, 'tokens') 461 | model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer. 462 | 463 | assert tokenizer.cls_token == '' 464 | """ 465 | if not special_tokens_dict: 466 | return 0 467 | 468 | added_tokens = 0 469 | for key, value in special_tokens_dict.items(): 470 | assert key in self.SPECIAL_TOKENS_ATTRIBUTES 471 | if key == 'additional_special_tokens': 472 | assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value) 473 | added_tokens += self.add_tokens(value) 474 | else: 475 | assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode)) 476 | added_tokens += self.add_tokens([value]) 477 | logger.info("Assigning %s to the %s key of the tokenizer", value, key) 478 | setattr(self, key, value) 479 | 480 | return added_tokens 481 | 482 | def tokenize(self, text, **kwargs): 483 | """ Converts a string in a sequence of tokens (string), using the tokenizer. 
484 | Split in words for word-based vocabulary or sub-words for sub-word-based 485 | vocabularies (BPE/SentencePieces/WordPieces). 486 | 487 | Take care of added tokens. 488 | """ 489 | def split_on_token(tok, text): 490 | result = [] 491 | split_text = text.split(tok) 492 | for i, sub_text in enumerate(split_text): 493 | sub_text = sub_text.strip() 494 | if i == 0 and not sub_text: 495 | result += [tok] 496 | elif i == len(split_text) - 1: 497 | if sub_text: 498 | result += [sub_text] 499 | else: 500 | pass 501 | else: 502 | if sub_text: 503 | result += [sub_text] 504 | result += [tok] 505 | return result 506 | 507 | def split_on_tokens(tok_list, text): 508 | if not text: 509 | return [] 510 | if not tok_list: 511 | return self._tokenize(text, **kwargs) 512 | 513 | tokenized_text = [] 514 | text_list = [text] 515 | for tok in tok_list: 516 | tokenized_text = [] 517 | for sub_text in text_list: 518 | if sub_text not in self.added_tokens_encoder \ 519 | and sub_text not in self.all_special_tokens: 520 | tokenized_text += split_on_token(tok, sub_text) 521 | else: 522 | tokenized_text += [sub_text] 523 | text_list = tokenized_text 524 | 525 | return sum((self._tokenize(token, **kwargs) if token not \ 526 | in self.added_tokens_encoder and token not in self.all_special_tokens \ 527 | else [token] for token in tokenized_text), []) 528 | 529 | added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens 530 | tokenized_text = split_on_tokens(added_tokens, text) 531 | return tokenized_text 532 | 533 | def _tokenize(self, text, **kwargs): 534 | """ Converts a string in a sequence of tokens (string), using the tokenizer. 535 | Split in words for word-based vocabulary or sub-words for sub-word-based 536 | vocabularies (BPE/SentencePieces/WordPieces). 537 | 538 | Do NOT take care of added tokens. 539 | """ 540 | raise NotImplementedError 541 | 542 | def convert_tokens_to_ids(self, tokens): 543 | """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id 544 | (resp. a sequence of ids), using the vocabulary. 545 | """ 546 | if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): 547 | return self._convert_token_to_id_with_added_voc(tokens) 548 | 549 | ids = [] 550 | for token in tokens: 551 | ids.append(self._convert_token_to_id_with_added_voc(token)) 552 | if len(ids) > self.max_len: 553 | logger.warning("Token indices sequence length is longer than the specified maximum sequence length " 554 | "for this model ({} > {}). Running this sequence through the model will result in " 555 | "indexing errors".format(len(ids), self.max_len)) 556 | return ids 557 | 558 | def _convert_token_to_id_with_added_voc(self, token): 559 | if token in self.added_tokens_encoder: 560 | return self.added_tokens_encoder[token] 561 | return self._convert_token_to_id(token) 562 | 563 | def _convert_token_to_id(self, token): 564 | raise NotImplementedError 565 | 566 | def encode(self, text, text_pair=None, add_special_tokens=False): 567 | """ 568 | Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. 569 | 570 | Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. 571 | 572 | Args: 573 | text: The first sequence to be encoded. 574 | text_pair: Optional second sequence to be encoded. 575 | add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative 576 | to their model. 
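            Example (illustrative, not from the original docstring; assumes a BERT-style
            subclass whose ``add_special_tokens_single_sentence`` wraps the ids in
            [CLS] ... [SEP])::

                ids = tokenizer.encode("某公司 负面 消息")                           # plain token ids
                ids = tokenizer.encode("某公司 负面 消息", add_special_tokens=True)  # ids including [CLS]/[SEP]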
577 | """ 578 | if text_pair is None: 579 | if add_special_tokens: 580 | return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text))) 581 | else: 582 | return self.convert_tokens_to_ids(self.tokenize(text)) 583 | 584 | first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)] 585 | second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)] 586 | 587 | if add_special_tokens: 588 | return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens) 589 | else: 590 | return first_sentence_tokens, second_sentence_tokens 591 | 592 | def add_special_tokens_single_sentence(self, token_ids): 593 | raise NotImplementedError 594 | 595 | def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1): 596 | raise NotImplementedError 597 | 598 | def convert_ids_to_tokens(self, ids, skip_special_tokens=False): 599 | """ Converts a single index or a sequence of indices (integers) in a token " 600 | (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens. 601 | 602 | Args: 603 | skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False 604 | """ 605 | if isinstance(ids, int): 606 | if ids in self.added_tokens_decoder: 607 | return self.added_tokens_decoder[ids] 608 | else: 609 | return self._convert_id_to_token(ids) 610 | tokens = [] 611 | for index in ids: 612 | if index in self.all_special_ids and skip_special_tokens: 613 | continue 614 | if index in self.added_tokens_decoder: 615 | tokens.append(self.added_tokens_decoder[index]) 616 | else: 617 | tokens.append(self._convert_id_to_token(index)) 618 | return tokens 619 | 620 | def _convert_id_to_token(self, index): 621 | raise NotImplementedError 622 | 623 | def convert_tokens_to_string(self, tokens): 624 | """ Converts a sequence of tokens (string) in a single string. 625 | The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) 626 | but we often want to remove sub-word tokenization artifacts at the same time. 627 | """ 628 | return ' '.join(self.convert_ids_to_tokens(tokens)) 629 | 630 | def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): 631 | """ 632 | Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary 633 | with options to remove special tokens and clean up tokenization spaces. 634 | Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. 635 | """ 636 | filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) 637 | text = self.convert_tokens_to_string(filtered_tokens) 638 | 639 | if self.sep_token is not None and self.sep_token in text: 640 | text = text.replace(self.cls_token, self.sep_token) 641 | split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token))) 642 | if clean_up_tokenization_spaces: 643 | clean_text = [self.clean_up_tokenization(text) for text in split_text] 644 | return clean_text 645 | else: 646 | return split_text 647 | else: 648 | if clean_up_tokenization_spaces: 649 | clean_text = self.clean_up_tokenization(text) 650 | return clean_text 651 | else: 652 | return text 653 | 654 | @property 655 | def special_tokens_map(self): 656 | """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their 657 | values ('', ''...) 
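            Example of the returned mapping (illustrative; the exact strings depend on the
            derived class -- a BERT-style tokenizer typically yields)::

                {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]',
                 'cls_token': '[CLS]', 'mask_token': '[MASK]'}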
658 | """ 659 | set_attr = {} 660 | for attr in self.SPECIAL_TOKENS_ATTRIBUTES: 661 | attr_value = getattr(self, "_" + attr) 662 | if attr_value: 663 | set_attr[attr] = attr_value 664 | return set_attr 665 | 666 | @property 667 | def all_special_tokens(self): 668 | """ List all the special tokens ('', ''...) mapped to class attributes 669 | (cls_token, unk_token...). 670 | """ 671 | all_toks = [] 672 | set_attr = self.special_tokens_map 673 | for attr_value in set_attr.values(): 674 | all_toks = all_toks + (attr_value if isinstance(attr_value, (list, tuple)) else [attr_value]) 675 | all_toks = list(set(all_toks)) 676 | return all_toks 677 | 678 | @property 679 | def all_special_ids(self): 680 | """ List the vocabulary indices of the special tokens ('', ''...) mapped to 681 | class attributes (cls_token, unk_token...). 682 | """ 683 | all_toks = self.all_special_tokens 684 | all_ids = list(self._convert_token_to_id(t) for t in all_toks) 685 | return all_ids 686 | 687 | @staticmethod 688 | def clean_up_tokenization(out_string): 689 | """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. 690 | """ 691 | out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' 692 | ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" 693 | ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") 694 | return out_string 695 | -------------------------------------------------------------------------------- /scheme1/code1/requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | torch>=1.0.0 3 | # progress bars in model download and training scripts 4 | tqdm 5 | # Accessing files from S3 directly. 6 | boto3 7 | # Used for downloading models over HTTP 8 | requests 9 | # For OpenAI GPT 10 | regex 11 | # For XLNet 12 | sentencepiece -------------------------------------------------------------------------------- /scheme1/code1/run_bert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | """BERT finetuning runner.""" 17 | 18 | from __future__ import absolute_import 19 | 20 | import argparse 21 | import csv 22 | import logging 23 | import os 24 | import random 25 | import sys 26 | from io import open 27 | import pandas as pd 28 | import numpy as np 29 | import torch 30 | import gc 31 | from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, 32 | TensorDataset) 33 | from torch.utils.data.distributed import DistributedSampler 34 | from tqdm import tqdm, trange 35 | from sklearn.metrics import f1_score 36 | from sklearn.metrics import accuracy_score 37 | import json 38 | from pytorch_transformers.modeling_bert import BertForSequenceClassification, BertConfig 39 | from pytorch_transformers import AdamW, WarmupLinearSchedule 40 | from pytorch_transformers.tokenization_bert import BertTokenizer 41 | from itertools import cycle 42 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 43 | datefmt = '%m/%d/%Y %H:%M:%S', 44 | level = logging.INFO) 45 | MODEL_CLASSES = { 46 | 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), 47 | } 48 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in ( BertConfig,)), ()) 49 | 50 | logger = logging.getLogger(__name__) 51 | 52 | 53 | class InputExample(object): 54 | """A single training/test example for simple sequence classification.""" 55 | 56 | def __init__(self, guid, text_a, text_b=None, label=None): 57 | """Constructs a InputExample. 58 | 59 | Args: 60 | guid: Unique id for the example. 61 | text_a: string. The untokenized text of the first sequence. For single 62 | sequence tasks, only this sequence must be specified. 63 | text_b: (Optional) string. The untokenized text of the second sequence. 64 | Only must be specified for sequence pair tasks. 65 | label: (Optional) string. The label of the example. This should be 66 | specified for train and dev examples, but not for test examples. 67 | """ 68 | self.guid = guid 69 | self.text_a = text_a 70 | self.text_b = text_b 71 | self.label = label 72 | 73 | 74 | class InputFeatures(object): 75 | def __init__(self, 76 | example_id, 77 | choices_features, 78 | label 79 | 80 | ): 81 | self.example_id = example_id 82 | self.choices_features = [ 83 | { 84 | 'input_ids': input_ids, 85 | 'input_mask': input_mask, 86 | 'segment_ids': segment_ids 87 | } 88 | for _, input_ids, input_mask, segment_ids in choices_features 89 | ] 90 | self.label = label 91 | 92 | def read_examples(input_file, is_training): 93 | df=pd.read_csv(input_file) 94 | df['content'] = df['content'].fillna('无') 95 | df['entity'] = df['entity'].fillna('无') 96 | examples=[] 97 | for val in df[['id','content','entity','label']].values: 98 | examples.append(InputExample(guid=val[0],text_a=val[1],text_b=val[2],label=val[3])) 99 | return examples 100 | 101 | def convert_examples_to_features(examples, tokenizer, max_seq_length,split_num, 102 | is_training): 103 | """Loads a data file into a list of `InputBatch`s.""" 104 | 105 | # Swag is a multiple choice task. To perform this task using Bert, 106 | # we will use the formatting proposed in "Improving Language 107 | # Understanding by Generative Pre-Training" and suggested by 108 | # @jacobdevlin-google in this issue 109 | # https://github.com/google-research/bert/issues/38. 110 | # 111 | # Each choice will correspond to a sample on which we run the 112 | # inference. 
For a given Swag example, we will create the 4 113 | # following inputs: 114 | # - [CLS] context [SEP] choice_1 [SEP] 115 | # - [CLS] context [SEP] choice_2 [SEP] 116 | # - [CLS] context [SEP] choice_3 [SEP] 117 | # - [CLS] context [SEP] choice_4 [SEP] 118 | # The model will output a single value for each input. To get the 119 | # final decision of the model, we will run a softmax over these 4 120 | # outputs. 121 | features = [] 122 | for example_index, example in enumerate(examples): 123 | 124 | context_tokens=tokenizer.tokenize(example.text_a) 125 | ending_tokens=tokenizer.tokenize(example.text_b) 126 | 127 | 128 | skip_len=len(context_tokens)/split_num 129 | choices_features = [] 130 | for i in range(split_num): 131 | context_tokens_choice=context_tokens[int(i*skip_len):int((i+1)*skip_len)] 132 | _truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3) 133 | tokens = ["[CLS]"]+ ending_tokens + ["[SEP]"] +context_tokens_choice + ["[SEP]"] 134 | segment_ids = [0] * (len(ending_tokens) + 2) + [1] * (len(context_tokens_choice) + 1) 135 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 136 | input_mask = [1] * len(input_ids) 137 | 138 | 139 | padding_length = max_seq_length - len(input_ids) 140 | input_ids += ([0] * padding_length) 141 | input_mask += ([0] * padding_length) 142 | segment_ids += ([0] * padding_length) 143 | choices_features.append((tokens, input_ids, input_mask, segment_ids)) 144 | 145 | 146 | label = example.label 147 | if example_index < 1 and is_training: 148 | logger.info("*** Example ***") 149 | logger.info("idx: {}".format(example_index)) 150 | logger.info("guid: {}".format(example.guid)) 151 | logger.info("tokens: {}".format(' '.join(tokens).replace('\u2581','_'))) 152 | logger.info("input_ids: {}".format(' '.join(map(str, input_ids)))) 153 | logger.info("input_mask: {}".format(' '.join(map(str, input_mask)))) 154 | logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids)))) 155 | logger.info("label: {}".format(label)) 156 | 157 | 158 | features.append( 159 | InputFeatures( 160 | example_id=example.guid, 161 | choices_features=choices_features, 162 | label=label 163 | ) 164 | ) 165 | return features 166 | 167 | 168 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 169 | """Truncates a sequence pair in place to the maximum length.""" 170 | 171 | # This is a simple heuristic which will always truncate the longer sequence 172 | # one token at a time. This makes more sense than truncating an equal percent 173 | # of tokens from each, since if one sequence is very short then each token 174 | # that's truncated likely contains more information than a longer sequence. 
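    # Illustrative example (hypothetical lengths, not from the data): with
    # max_length=10, len(tokens_a)=8 and len(tokens_b)=5, the loop below pops from
    # tokens_a three times (8 -> 5) and leaves tokens_b untouched, stopping once
    # 5 + 5 <= 10.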
175 | 176 | while True: 177 | total_length = len(tokens_a) + len(tokens_b) 178 | if total_length <= max_length: 179 | break 180 | if len(tokens_a) > len(tokens_b): 181 | tokens_a.pop() 182 | else: 183 | tokens_b.pop() 184 | 185 | def accuracy(out, labels): 186 | outputs = np.argmax(out, axis=1) 187 | #return f1_score(labels,outputs,labels=[0,1],average='macro') 188 | return accuracy_score(labels, outputs, normalize=True) 189 | 190 | def select_field(features, field): 191 | return [ 192 | [ 193 | choice[field] 194 | for choice in feature.choices_features 195 | ] 196 | for feature in features 197 | ] 198 | 199 | def set_seed(args): 200 | random.seed(args.seed) 201 | np.random.seed(args.seed) 202 | torch.manual_seed(args.seed) 203 | if args.n_gpu > 0: 204 | torch.cuda.manual_seed_all(args.seed) 205 | 206 | 207 | def main(): 208 | parser = argparse.ArgumentParser() 209 | 210 | ## Required parameters 211 | parser.add_argument("--data_dir", default=None, type=str, required=True, 212 | help="The input data dir. Should contain the .tsv files (or other data files) for the task.") 213 | parser.add_argument("--model_type", default=None, type=str, required=True, 214 | help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) 215 | parser.add_argument("--model_name_or_path", default=None, type=str, required=True, 216 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) 217 | parser.add_argument("--meta_path", default=None, type=str, required=False, 218 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) 219 | parser.add_argument("--output_dir", default=None, type=str, required=True, 220 | help="The output directory where the model predictions and checkpoints will be written.") 221 | 222 | ## Other parameters 223 | parser.add_argument("--config_name", default="", type=str, 224 | help="Pretrained config name or path if not the same as model_name") 225 | parser.add_argument("--tokenizer_name", default="", type=str, 226 | help="Pretrained tokenizer name or path if not the same as model_name") 227 | parser.add_argument("--cache_dir", default="", type=str, 228 | help="Where do you want to store the pre-trained models downloaded from s3") 229 | parser.add_argument("--max_seq_length", default=128, type=int, 230 | help="The maximum total input sequence length after tokenization. 
Sequences longer "
231 | "than this will be truncated, sequences shorter will be padded.")
232 | parser.add_argument("--do_train", action='store_true',
233 | help="Whether to run training.")
234 | parser.add_argument("--do_test", action='store_true',
235 | help="Whether to run prediction on the test set.")
236 | parser.add_argument("--do_eval", action='store_true',
237 | help="Whether to run eval on the dev set.")
238 | parser.add_argument("--evaluate_during_training", action='store_true',
239 | help="Run evaluation during training at each logging step.")
240 | parser.add_argument("--do_lower_case", action='store_true',
241 | help="Set this flag if you are using an uncased model.")
242 | 
243 | parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
244 | help="Batch size per GPU/CPU for training.")
245 | parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
246 | help="Batch size per GPU/CPU for evaluation.")
247 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
248 | help="Number of update steps to accumulate before performing a backward/update pass.")
249 | parser.add_argument("--learning_rate", default=5e-5, type=float,
250 | help="The initial learning rate for Adam.")
251 | parser.add_argument("--weight_decay", default=0.0, type=float,
252 | help="Weight decay if we apply some.")
253 | parser.add_argument("--adam_epsilon", default=1e-8, type=float,
254 | help="Epsilon for Adam optimizer.")
255 | parser.add_argument("--max_grad_norm", default=1.0, type=float,
256 | help="Max gradient norm.")
257 | parser.add_argument("--num_train_epochs", default=3.0, type=float,
258 | help="Total number of training epochs to perform.")
259 | parser.add_argument("--max_steps", default=-1, type=int,
260 | help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") 261 | parser.add_argument("--eval_steps", default=-1, type=int, 262 | help="") 263 | parser.add_argument("--lstm_hidden_size", default=300, type=int, 264 | help="") 265 | parser.add_argument("--lstm_layers", default=2, type=int, 266 | help="") 267 | parser.add_argument("--lstm_dropout", default=0.5, type=float, 268 | help="") 269 | 270 | parser.add_argument("--train_steps", default=-1, type=int, 271 | help="") 272 | parser.add_argument("--report_steps", default=-1, type=int, 273 | help="") 274 | parser.add_argument("--warmup_steps", default=0, type=int, 275 | help="Linear warmup over warmup_steps.") 276 | parser.add_argument("--split_num", default=3, type=int, 277 | help="text split") 278 | parser.add_argument('--logging_steps', type=int, default=50, 279 | help="Log every X updates steps.") 280 | parser.add_argument('--save_steps', type=int, default=50, 281 | help="Save checkpoint every X updates steps.") 282 | parser.add_argument("--eval_all_checkpoints", action='store_true', 283 | help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") 284 | parser.add_argument("--no_cuda", action='store_true', 285 | help="Avoid using CUDA when available") 286 | parser.add_argument('--overwrite_output_dir', action='store_true', 287 | help="Overwrite the content of the output directory") 288 | parser.add_argument('--overwrite_cache', action='store_true', 289 | help="Overwrite the cached training and evaluation sets") 290 | parser.add_argument('--seed', type=int, default=42, 291 | help="random seed for initialization") 292 | 293 | parser.add_argument('--fp16', action='store_true', 294 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 295 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 296 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
297 | "See details at https://nvidia.github.io/apex/amp.html") 298 | parser.add_argument("--local_rank", type=int, default=-1, 299 | help="For distributed training: local_rank") 300 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 301 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 302 | parser.add_argument('--device_id', type=str, default='0', help="The CUDA device is for training.") 303 | args = parser.parse_args() 304 | 305 | os.environ['CUDA_VISIBLE_DEVICES'] = args.device_id 306 | 307 | # Setup CUDA, GPU & distributed training 308 | if args.local_rank == -1 or args.no_cuda: 309 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 310 | args.n_gpu = torch.cuda.device_count() 311 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 312 | torch.cuda.set_device(args.local_rank) 313 | device = torch.device("cuda", args.local_rank) 314 | torch.distributed.init_process_group(backend='nccl') 315 | args.n_gpu = 1 316 | args.device = device 317 | 318 | 319 | # Setup logging 320 | logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', 321 | datefmt = '%m/%d/%Y %H:%M:%S', 322 | level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 323 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 324 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 325 | 326 | # Set seed 327 | set_seed(args) 328 | 329 | 330 | try: 331 | os.makedirs(args.output_dir) 332 | except: 333 | pass 334 | 335 | tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case) 336 | 337 | 338 | 339 | config = BertConfig.from_pretrained(args.model_name_or_path, num_labels=2) 340 | 341 | # Prepare model 342 | model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,args,config=config) 343 | 344 | 345 | 346 | if args.fp16: 347 | model.half() 348 | model.to(device) 349 | if args.local_rank != -1: 350 | try: 351 | from apex.parallel import DistributedDataParallel as DDP 352 | except ImportError: 353 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 354 | 355 | model = DDP(model) 356 | elif args.n_gpu > 1: 357 | model = torch.nn.DataParallel(model) 358 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 359 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 360 | if args.do_train: 361 | 362 | # Prepare data loader 363 | 364 | train_examples = read_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) 365 | train_features = convert_examples_to_features( 366 | train_examples, tokenizer, args.max_seq_length,args.split_num, True) 367 | all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) 368 | all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) 369 | all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) 370 | all_label = torch.tensor([f.label for f in train_features], dtype=torch.long) 371 | train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) 372 | if args.local_rank == -1: 373 | train_sampler = RandomSampler(train_data) 374 | else: 375 | train_sampler = DistributedSampler(train_data) 376 | train_dataloader = 
DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size//args.gradient_accumulation_steps) 377 | 378 | num_train_optimization_steps = args.train_steps 379 | 380 | 381 | # Prepare optimizer 382 | 383 | param_optimizer = list(model.named_parameters()) 384 | 385 | # hack to remove pooler, which is not used 386 | # thus it produce None grad that break apex 387 | param_optimizer = [n for n in param_optimizer] 388 | 389 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 390 | optimizer_grouped_parameters = [ 391 | {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 392 | {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 393 | ] 394 | 395 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 396 | scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=args.train_steps) 397 | 398 | global_step = 0 399 | 400 | logger.info("***** Running training *****") 401 | logger.info(" Num examples = %d", len(train_examples)) 402 | logger.info(" Batch size = %d", args.train_batch_size) 403 | logger.info(" Num steps = %d", num_train_optimization_steps) 404 | 405 | best_acc=0 406 | model.train() 407 | tr_loss = 0 408 | nb_tr_examples, nb_tr_steps = 0, 0 409 | bar = tqdm(range(num_train_optimization_steps),total=num_train_optimization_steps) 410 | train_dataloader=cycle(train_dataloader) 411 | 412 | 413 | for step in bar: 414 | batch = next(train_dataloader) 415 | batch = tuple(t.to(device) for t in batch) 416 | input_ids, input_mask, segment_ids, label_ids = batch 417 | loss = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) 418 | if args.n_gpu > 1: 419 | loss = loss.mean() # mean() to average on multi-gpu. 
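                # Sizing note (illustrative numbers, not the competition settings): with
                # per_gpu_train_batch_size=8, n_gpu=2 and gradient_accumulation_steps=4,
                # train_batch_size is 16, the DataLoader above yields batches of 16 // 4 = 4,
                # and optimizer.step() runs once every 4 batches, i.e. 16 examples per update.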
420 | if args.fp16 and args.loss_scale != 1.0: 421 | loss = loss * args.loss_scale 422 | if args.gradient_accumulation_steps > 1: 423 | loss = loss / args.gradient_accumulation_steps 424 | tr_loss += loss.item() 425 | train_loss=round(tr_loss*args.gradient_accumulation_steps/(nb_tr_steps+1),4) 426 | bar.set_description("loss {}".format(train_loss)) 427 | nb_tr_examples += input_ids.size(0) 428 | nb_tr_steps += 1 429 | 430 | if args.fp16: 431 | optimizer.backward(loss) 432 | else: 433 | 434 | loss.backward() 435 | 436 | if (nb_tr_steps + 1) % args.gradient_accumulation_steps == 0: 437 | if args.fp16: 438 | # modify learning rate with special warm up BERT uses 439 | # if args.fp16 is False, BertAdam is used that handles this automatically 440 | lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) 441 | for param_group in optimizer.param_groups: 442 | param_group['lr'] = lr_this_step 443 | scheduler.step() 444 | optimizer.step() 445 | optimizer.zero_grad() 446 | global_step += 1 447 | 448 | 449 | if (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0: 450 | tr_loss = 0 451 | nb_tr_examples, nb_tr_steps = 0, 0 452 | logger.info("***** Report result *****") 453 | logger.info(" %s = %s", 'global_step', str(global_step)) 454 | logger.info(" %s = %s", 'train loss', str(train_loss)) 455 | 456 | 457 | if args.do_eval and (step + 1) %(args.eval_steps*args.gradient_accumulation_steps)==0: 458 | for file in ['dev.csv']: 459 | inference_labels=[] 460 | gold_labels=[] 461 | inference_logits=[] 462 | eval_examples = read_examples(os.path.join(args.data_dir, file), is_training = True) 463 | eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,args.split_num,False) 464 | all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) 465 | all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) 466 | all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) 467 | all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) 468 | 469 | 470 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label) 471 | 472 | logger.info("***** Running evaluation *****") 473 | logger.info(" Num examples = %d", len(eval_examples)) 474 | logger.info(" Batch size = %d", args.eval_batch_size) 475 | 476 | # Run prediction for full data 477 | eval_sampler = SequentialSampler(eval_data) 478 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 479 | 480 | model.eval() 481 | eval_loss, eval_accuracy = 0, 0 482 | nb_eval_steps, nb_eval_examples = 0, 0 483 | for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: 484 | input_ids = input_ids.to(device) 485 | input_mask = input_mask.to(device) 486 | segment_ids = segment_ids.to(device) 487 | label_ids = label_ids.to(device) 488 | 489 | 490 | with torch.no_grad(): 491 | tmp_eval_loss= model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) 492 | logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask) 493 | 494 | logits = logits.detach().cpu().numpy() 495 | label_ids = label_ids.to('cpu').numpy() 496 | inference_labels.append(np.argmax(logits, axis=1)) 497 | gold_labels.append(label_ids) 498 | inference_logits.append(logits) 499 | eval_loss += tmp_eval_loss.mean().item() 500 | nb_eval_examples += input_ids.size(0) 501 | 
nb_eval_steps += 1 502 | 503 | gold_labels=np.concatenate(gold_labels,0) 504 | inference_logits=np.concatenate(inference_logits,0) 505 | model.train() 506 | eval_loss = eval_loss / nb_eval_steps 507 | eval_accuracy = accuracy(inference_logits, gold_labels) 508 | 509 | result = {'eval_loss': eval_loss, 510 | 'eval_F1': eval_accuracy, 511 | 'global_step': global_step, 512 | 'loss': train_loss} 513 | 514 | output_eval_file = os.path.join(args.output_dir, "eval_results.txt") 515 | with open(output_eval_file, "a") as writer: 516 | for key in sorted(result.keys()): 517 | logger.info(" %s = %s", key, str(result[key])) 518 | writer.write("%s = %s\n" % (key, str(result[key]))) 519 | writer.write('*'*80) 520 | writer.write('\n') 521 | if eval_accuracy>best_acc and 'dev' in file: 522 | print("="*80) 523 | print("Best F1",eval_accuracy) 524 | print("Saving Model......") 525 | best_acc=eval_accuracy 526 | # Save a trained model 527 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 528 | output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") 529 | torch.save(model_to_save.state_dict(), output_model_file) 530 | print("="*80) 531 | else: 532 | print("="*80) 533 | if args.do_test: 534 | del model 535 | gc.collect() 536 | args.do_train=False 537 | model = BertForSequenceClassification.from_pretrained(os.path.join(args.output_dir, "pytorch_model.bin"),args,config=config) 538 | if args.fp16: 539 | model.half() 540 | model.to(device) 541 | if args.local_rank != -1: 542 | try: 543 | from apex.parallel import DistributedDataParallel as DDP 544 | except ImportError: 545 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") 546 | 547 | model = DDP(model) 548 | elif args.n_gpu > 1: 549 | model = torch.nn.DataParallel(model) 550 | 551 | 552 | for file,flag in [('dev.csv','dev'),('test.csv','test')]: 553 | inference_labels=[] 554 | gold_labels=[] 555 | eval_examples = read_examples(os.path.join(args.data_dir, file), is_training = False) 556 | eval_features = convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,args.split_num,False) 557 | all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) 558 | all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) 559 | all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) 560 | all_label = torch.tensor([f.label for f in eval_features], dtype=torch.long) 561 | 562 | 563 | eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,all_label) 564 | # Run prediction for full data 565 | eval_sampler = SequentialSampler(eval_data) 566 | eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) 567 | 568 | model.eval() 569 | eval_loss, eval_accuracy = 0, 0 570 | nb_eval_steps, nb_eval_examples = 0, 0 571 | for input_ids, input_mask, segment_ids, label_ids in eval_dataloader: 572 | input_ids = input_ids.to(device) 573 | input_mask = input_mask.to(device) 574 | segment_ids = segment_ids.to(device) 575 | label_ids = label_ids.to(device) 576 | 577 | with torch.no_grad(): 578 | logits = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask).detach().cpu().numpy() 579 | label_ids = label_ids.to('cpu').numpy() 580 | inference_labels.append(logits) 581 | gold_labels.append(label_ids) 582 | gold_labels=np.concatenate(gold_labels,0) 583 | 
logits=np.concatenate(inference_labels,0) 584 | print(flag, accuracy(logits, gold_labels)) 585 | if flag=='test': 586 | df=pd.read_csv(os.path.join(args.data_dir, file)) 587 | df['label_0']=logits[:,0] 588 | df['label_1']=logits[:,1] 589 | #df['label_2']=logits[:,2] 590 | df[['id','entity','label_0','label_1']].to_csv(os.path.join(args.output_dir, "test_pb.csv"),index=False) 591 | 592 | 593 | if __name__ == "__main__": 594 | main() 595 | -------------------------------------------------------------------------------- /scheme1/code2/code.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import gc 4 | import sys 5 | import json 6 | import codecs 7 | import datetime 8 | import warnings 9 | import numpy as np 10 | import pandas as pd 11 | import tensorflow as tf 12 | from tqdm import tqdm 13 | from random import choice 14 | import matplotlib.pyplot as plt 15 | from collections import Counter 16 | from sklearn.metrics import f1_score 17 | from sklearn.model_selection import KFold 18 | from sklearn.model_selection import KFold 19 | from sklearn.preprocessing import LabelEncoder 20 | from sklearn.model_selection import StratifiedKFold 21 | from sklearn.model_selection import train_test_split 22 | 23 | import keras.backend as K 24 | from keras.layers import * 25 | from keras.callbacks import * 26 | from keras.models import Model 27 | from keras.optimizers import Adam 28 | from keras.utils import to_categorical 29 | from keras.metrics import top_k_categorical_accuracy, categorical_accuracy 30 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer 31 | 32 | tqdm.pandas() 33 | np.random.seed(214683) 34 | warnings.filterwarnings('ignore') 35 | 36 | 37 | data_path = 'datasets/' 38 | train = pd.read_csv(data_path + 'Round2_train.csv', encoding='utf-8') 39 | train2= pd.read_csv(data_path + 'Train_data.csv', encoding='utf-8') 40 | train=pd.concat([train,train2],axis=0,sort=True) 41 | test = pd.read_csv(data_path + 'round2_test.csv', encoding='utf-8') 42 | 43 | train = train[train['entity'].notnull()] 44 | test = test[test['entity'].notnull()] 45 | 46 | train=train.drop_duplicates(['title','text','entity','negative','key_entity']) #去掉重复的data 47 | 48 | 49 | def get_or_content(y,z): 50 | s='' 51 | if str(y)!='nan': 52 | s+=y 53 | if str(z)!='nan': 54 | s+=z 55 | return s 56 | 57 | #获取title+text 58 | train['content']=list(map(lambda y,z: get_or_content(y,z),train['title'],train['text'])) 59 | test['content']=list(map(lambda y,z: get_or_content(y,z),test['title'],test['text'])) 60 | 61 | def entity_clear_row(entity,content): 62 | entities = entity.split(';') 63 | entities.sort(key=lambda x: len(x)) 64 | n = len(entities) 65 | tmp = entities.copy() 66 | for i in range(n): 67 | entity_tmp = entities[i] 68 | #长度小于等于1 69 | if len(entity_tmp)<=1: 70 | tmp.remove(entity_tmp) 71 | continue 72 | if i + 1 >= n: 73 | break 74 | for entity_tmp2 in entities[i + 1:]: 75 | if entity_tmp2.find(entity_tmp) != -1 and ( 76 | entity_tmp2.find('?') != -1 or content.replace(entity_tmp2, '').find(entity_tmp) == -1): 77 | tmp.remove(entity_tmp) 78 | break 79 | return ';'.join(tmp) 80 | 81 | train['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),train['entity'],train['content'])) 82 | test['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),test['entity'],test['content'])) 83 | 84 | # 去掉空实体 85 | def duplicate_entity(entity): 86 | def is_empty(x): 87 | return (x != '') & (x != ' ') 88 | 89 | if entity is 
np.nan: 90 | return entity 91 | else: 92 | entity = filter(is_empty, entity.split(';')) 93 | return ';'.join(list(set(entity))) 94 | 95 | train['entity'] = train['entity'].apply(lambda index: duplicate_entity(index)) 96 | test['entity'] = test['entity'].apply(lambda index: duplicate_entity(index)) 97 | 98 | # 正则表达式清洗文本 99 | def delete_tag(s): 100 | 101 | s = re.sub('\{IMG:.?.?.?\}', '', s) #图片 102 | s = re.sub(re.compile(r'[a-zA-Z]+://[^\s]+'), '', s) #网址 103 | s = re.sub(re.compile('<.*?>'), '', s) #网页标签 104 | # s = re.sub(re.compile('&[a-zA-Z]+;?'), ' ', s) #网页标签 105 | # s = re.sub(re.compile('[a-zA-Z0-9]*[./]+[a-zA-Z0-9./]+[a-zA-Z0-9./]*'), ' ', s) 106 | # r4=re.compile('\d{4}[-/]\d{2}[-/]\d{2}') #日期 107 | # s=re.sub(r4,'某时',s) 108 | return s 109 | 110 | train['title'] = train['title'].apply(lambda x: delete_tag(x) if str(x)!='nan' else x) 111 | train['text'] = train['text'].apply(lambda x: delete_tag(x) if str(x)!='nan' else x) 112 | test['title'] = test['title'].apply(lambda x: delete_tag(x) if str(x)!='nan' else x) 113 | test['text'] = test['text'].apply(lambda x: delete_tag(x) if str(x)!='nan' else x) 114 | 115 | # 使用title来填充测试集中text的缺失值,text null:1 116 | train['title'] = train.apply(lambda index: index.text if index.title is np.nan else index.title, axis=1) 117 | test['title'] = test.apply(lambda index: index.text if index.title is np.nan else index.title, axis=1) 118 | train['text'] = train.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 119 | test['text'] = test.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 120 | 121 | # 选取非空样本 122 | train = train[train['entity'].notnull()] # train entity null:18 123 | test = test[test['entity'].notnull()] # test entity null:16 124 | 125 | # train 126 | train_id_entity = train[['id', 'entity']] 127 | train_id_entity['entity'] = train_id_entity['entity'].apply(lambda index: index.split(';')) 128 | ids, entity = [], [] 129 | for index in range(len(train_id_entity['entity'])): 130 | entity.extend(list(train_id_entity['entity'])[index]) 131 | ids.extend([list(train_id_entity['id'])[index]] * len(list(train_id_entity['entity'])[index])) 132 | train_id_entity = pd.DataFrame({'id': ids, 'entity_label': entity}) # train len:11448 133 | 134 | # test 135 | test_id_entity = test[['id', 'entity']] 136 | test_id_entity['entity'] = test_id_entity['entity'].apply(lambda index: index.split(';')) 137 | ids, entity = [], [] 138 | for index in range(len(test_id_entity['entity'])): 139 | entity.extend(list(test_id_entity['entity'])[index]) 140 | ids.extend([list(test_id_entity['id'])[index]] * len(list(test_id_entity['entity'])[index])) 141 | test_id_entity = pd.DataFrame({'id': ids, 'entity_label': entity}) # test len:11580 142 | 143 | # 144 | train.pop('negative') # 去掉negative列 145 | train = train.merge(train_id_entity, on='id', how='left') 146 | train['label'] = train.apply(lambda index: 0 if index.key_entity is np.nan else 1, axis=1) 147 | train['key_entity'] = train['key_entity'].fillna('') 148 | # train['label'] = train.apply(lambda index: 1 if index.key_entity.find(index.entity_label) != -1 else 0, axis=1) 149 | train['label'] = train.apply(lambda index: 1 if index.key_entity.split(';').count(index.entity_label) >= 1 else 0, axis=1) 150 | 151 | test = test.merge(test_id_entity, on='id', how='left') 152 | 153 | # 去除长度小于的1的entity 154 | train['entity_label_len'] = train['entity_label'].apply(lambda x: len(x)) 155 | test['entity_label_len'] = test['entity_label'].apply(lambda x: len(x)) 156 | 
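# Added descriptive note (not part of the original code.py): single-character candidate
# entities are dropped next, since they are too short to match reliably after the
# ';'-splitting above. The following block then locates each candidate entity in the
# title and text and, when it does not appear in the title and its first hit in the
# body lies beyond position 480, keeps a ~500-character window around that hit
# (text[idx-200:idx+300]) so the mention still fits inside the MAXLEN=510 corpus
# built further down.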
train = train[train['entity_label_len']>1] 157 | test = test[test['entity_label_len']>1] 158 | 159 | def get_first_index(row, flag): 160 | if flag=='title': 161 | return row['title'].find(row['entity_label']) 162 | else: 163 | return row['text'].find(row['entity_label']) 164 | 165 | train['title_first_index'] = train.apply(lambda row: get_first_index(row, 'title'), axis=1) 166 | train['text_first_index'] = train.apply(lambda row: get_first_index(row, 'text'), axis=1) 167 | 168 | test['title_first_index'] = test.apply(lambda row: get_first_index(row, 'title'), axis=1) 169 | test['text_first_index'] = test.apply(lambda row: get_first_index(row, 'text'), axis=1) 170 | 171 | def text_truncate(row): 172 | title_first_index = row['title_first_index'] 173 | text_first_index = row['text_first_index'] 174 | if title_first_index==-1 and text_first_index>480: 175 | return row['text'][text_first_index-200:text_first_index+300] 176 | else: 177 | return row['text'] 178 | 179 | train['text'] = train.apply(lambda row: text_truncate(row), axis=1) 180 | test['text'] = test.apply(lambda row: text_truncate(row), axis=1) 181 | 182 | def get_content(x, y, z): 183 | s='[E]' # E:Entity 184 | if str(x)!='nan': 185 | s+=x 186 | if str(y)!='nan' and str(z)!='nan' and y==z: 187 | s+='[S]' # S:Same 188 | s+=y 189 | else: 190 | s+='[T]' # T:Title 191 | if str(y)!='nan': 192 | s+=y 193 | s+='[C]' # C:Content 194 | if str(z)!='nan': 195 | s+=z 196 | #添加 197 | # if str(x)!='nan': 198 | # x_len=len(x) 199 | # end=len(s)-x_len 200 | # i=0 201 | # out='' 202 | # while i<=end: 203 | # if s[i:i+x_len]==x: 204 | # out+='$' 205 | # out+=x 206 | # out+="$" 207 | # i+=x_len 208 | # else: 209 | # out+=s[i] 210 | # i+=1 211 | # if i!=len(s): 212 | # out+=s[i:] 213 | # s=out 214 | return s 215 | 216 | # def get_content(x, y, z): 217 | # s='' # E:Entity 218 | # # if str(x)!='nan': 219 | # # s+=x 220 | # if str(y)!='nan' and str(z)!='nan' and y==z: 221 | # s+='[S]' # S:Same 222 | # s+=y 223 | # else: 224 | # s+='[T]' # T:Title 225 | # if str(y)!='nan': 226 | # s+=y 227 | # s+='[C]' # C:Content 228 | # if str(z)!='nan': 229 | # s+=z 230 | # #添加 231 | # # if str(x)!='nan': 232 | # # x_len=len(x) 233 | # # end=len(s)-x_len 234 | # # i=0 235 | # # out='' 236 | # # while i<=end: 237 | # # if s[i:i+x_len]==x: 238 | # # out+='$' 239 | # # out+=x 240 | # # out+="$" 241 | # # i+=x_len 242 | # # else: 243 | # # out+=s[i] 244 | # # i+=1 245 | # # if i!=len(s): 246 | # # out+=s[i:] 247 | # # s=out 248 | # return s 249 | 250 | train['corpus']=list(map(lambda x,y,z: get_content(x,y,z),tqdm(train['entity_label'].values),train['title'],train['text'])) 251 | test['corpus']=list(map(lambda x,y,z: get_content(x,y,z),tqdm(test['entity_label'].values),test['title'],test['text'])) 252 | 253 | def get_other_content(x,y): 254 | entitys=x.split(';') 255 | if len(entitys)<=1: 256 | return np.nan 257 | l=[] 258 | for e in entitys: 259 | if e!=y and e!='': 260 | l.append(e) 261 | return ';'.join(l) 262 | train['other_entity'] = list(map(lambda x, y: get_other_content(x, y), train['entity'], train['entity_label'])) 263 | test['other_entity'] = list(map(lambda x, y: get_other_content(x, y), test['entity'], test['entity_label'])) 264 | 265 | def get_content(x, y): 266 | if str(y) == 'nan': 267 | return x 268 | y = y.split(';') 269 | for i in y: 270 | # x=x.replace(i,'其他实体') 271 | x = 'O_E'.join(x.split(i)) # O_E:Other_Entity 272 | return x 273 | train['corpus'] = list(map(lambda x, y: get_content(x, y), train['corpus'], train['other_entity'])) 274 | test['corpus'] = 
list(map(lambda x, y: get_content(x, y), test['corpus'], test['other_entity'])) 275 | 276 | MAXLEN = 510 # 510 277 | 278 | bert_path = 'E:/NLP_corpus/BERT/hgd/chinese_roberta_wwm_ext_L-12_H-768_A-12/' 279 | config_path = bert_path + 'bert_config.json' 280 | checkpoint_path = bert_path + 'bert_model.ckpt' 281 | dict_path = bert_path + 'vocab.txt' 282 | 283 | # 给每个token按序编号,构建词表 284 | token_dict = {} 285 | with codecs.open(dict_path, 'r', 'utf8') as reader: 286 | for line in reader: 287 | token = line.strip() 288 | token_dict[token] = len(token_dict) 289 | 290 | # 分词器 291 | class OurTokenizer(Tokenizer): 292 | def _tokenize(self, text): 293 | R = [] 294 | for c in text: 295 | if c in self._token_dict: 296 | R.append(c) 297 | elif self._is_space(c): 298 | R.append('[unused1]') # space类用未经训练的[unused1]表示 299 | else: 300 | R.append('[UNK]') # 剩余的字符是[UNK] 301 | return R 302 | tokenizer = OurTokenizer(token_dict) 303 | 304 | # Padding,默认添 0 305 | def seq_padding(X, padding=0): 306 | L = [len(x) for x in X] 307 | ML = max(L) 308 | return np.array([np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X]) 309 | 310 | # 数据生成 311 | class data_generator: 312 | def __init__(self, data, batch_size=8, shuffle=True): # 8 313 | self.data = data 314 | self.batch_size = batch_size 315 | self.shuffle = shuffle 316 | self.steps = len(self.data) // self.batch_size # 迭代完一个epoch需要的步数 317 | if len(self.data) % self.batch_size != 0: # 保证步数为整数 318 | self.steps += 1 319 | 320 | def __len__(self): 321 | return self.steps 322 | 323 | def __iter__(self): 324 | while True: 325 | idxs = list(range(len(self.data))) 326 | if self.shuffle: 327 | np.random.shuffle(idxs) 328 | 329 | X1, X2, Y = [], [], [] 330 | for i in idxs: 331 | d = self.data[i] 332 | text = d[0][:MAXLEN] 333 | x1, x2 = tokenizer.encode(first=text) 334 | y = d[1] 335 | X1.append(x1) 336 | X2.append(x2) 337 | # Y.append([y]) 338 | Y.append(y) 339 | if len(X1) == self.batch_size or i == idxs[-1]: 340 | X1 = seq_padding(X1) 341 | X2 = seq_padding(X2) 342 | Y = seq_padding(Y) 343 | yield [X1, X2], Y#[:, 0, :] 344 | X1, X2, Y = [], [], [] 345 | 346 | # 计算:最高的k分类准确率 347 | def acc_top2(y_true, y_pred): 348 | return top_k_categorical_accuracy(y_true, y_pred, k=2) 349 | 350 | # 计算:F1值 351 | def f1_metric(y_true, y_pred): 352 | ''' 353 | metric from here 354 | https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras 355 | ''' 356 | def recall(y_true, y_pred): 357 | """Recall metric. 358 | Only computes a batch-wise average of recall. 359 | Computes the recall, a metric for multi-label classification of 360 | how many relevant items are selected. 361 | """ 362 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 363 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 364 | recall = true_positives / (possible_positives + K.epsilon()) 365 | return recall 366 | 367 | def precision(y_true, y_pred): 368 | """Precision metric. 369 | 370 | Only computes a batch-wise average of precision. 371 | 372 | Computes the precision, a metric for multi-label classification of 373 | how many selected items are relevant. 
374 | """ 375 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 376 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 377 | precision = true_positives / (predicted_positives + K.epsilon()) 378 | return precision 379 | precision = precision(y_true, y_pred) 380 | recall = recall(y_true, y_pred) 381 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 382 | 383 | # BERT模型建立 384 | def build_bert(nclass): 385 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None) 386 | 387 | for layer in bert_model.layers: 388 | # print(l) 389 | layer.trainable = True 390 | 391 | # inputs 392 | x1_in = Input(shape=(None,)) 393 | x2_in = Input(shape=(None,)) 394 | 395 | x = bert_model([x1_in, x2_in]) 396 | # print('Bert output shape', x.shape) 397 | x = Lambda(lambda x: x[:, 0])(x) 398 | 399 | # outputs 400 | p = Dense(nclass, activation='softmax')(x) 401 | 402 | # 模型建立与编译 403 | model = Model([x1_in, x2_in], p) 404 | model.compile(loss='categorical_crossentropy', 405 | optimizer=Adam(1e-5), 406 | metrics=['accuracy', f1_metric, categorical_accuracy]) 407 | print(model.summary()) 408 | return model 409 | 410 | from keras.callbacks import Callback 411 | from sklearn.metrics import f1_score,accuracy_score 412 | 413 | learning_rate = 5e-5 414 | min_learning_rate = 1e-5 415 | 416 | class Evaluate(Callback): 417 | def __init__(self): 418 | self.best = 0. 419 | self.passed = 0 420 | 421 | def on_batch_begin(self, batch, logs=None): 422 | """第一个epoch用来warmup,第二个epoch把学习率降到最低 423 | """ 424 | if self.passed < self.params['steps']: 425 | lr = (self.passed + 1.) / self.params['steps'] * learning_rate 426 | K.set_value(self.model.optimizer.lr, lr) 427 | self.passed += 1 428 | elif self.params['steps'] <= self.passed < self.params['steps'] * 2: 429 | lr = (2 - (self.passed + 1.) 
/ self.params['steps']) * (learning_rate - min_learning_rate) 430 | lr += min_learning_rate 431 | K.set_value(self.model.optimizer.lr, lr) 432 | self.passed += 1 433 | 434 | # 训练集去重 435 | train.drop_duplicates('corpus', inplace=True) 436 | 437 | DATA_LIST = [] 438 | for data_row in train.iloc[:].itertuples(): 439 | DATA_LIST.append((data_row.corpus, to_categorical(data_row.label, 2))) 440 | DATA_LIST = np.array(DATA_LIST) 441 | 442 | DATA_LIST_TEST = [] 443 | for data_row in test.iloc[:].itertuples(): 444 | DATA_LIST_TEST.append((data_row.corpus, to_categorical(0, 2))) 445 | DATA_LIST_TEST = np.array(DATA_LIST_TEST) 446 | 447 | f1 = [] 448 | def run_cv(nfolds, data, data_label, data_test, epochs=10, date_str='1107'): 449 | skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=214683).split(data, train['label']) 450 | train_model_pred = np.zeros((len(data), 2)) 451 | test_model_pred = np.zeros((len(data_test), 2)) 452 | 453 | for i, (train_fold, test_fold) in enumerate(skf): 454 | print('Fold: ', i+1) 455 | 456 | '''数据部分''' 457 | # 数据划分 458 | X_train, X_valid, = data[train_fold, :], data[test_fold, :] 459 | train_D = data_generator(X_train, shuffle=True) 460 | valid_D = data_generator(X_valid, shuffle=False) 461 | test_D = data_generator(data_test, shuffle=False) 462 | 463 | time_now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') 464 | '''模型部分''' 465 | # 生成模型 466 | model = build_bert(2) 467 | # callbacks 468 | early_stopping = EarlyStopping(monitor='val_f1_metric', patience=3) # val_acc 469 | plateau = ReduceLROnPlateau(monitor="val_f1_metric", verbose=1, mode='max', factor=0.5, patience=1) # max:未上升则降速 470 | checkpoint = ModelCheckpoint('./models/keras_model/fusai' + date_str + str(i) + '.hdf5', monitor='val_f1_metric', 471 | verbose=2, save_best_only=True, mode='max',save_weights_only=True) # period=1: 每1轮保存 472 | 473 | evaluator = Evaluate() 474 | 475 | # 模型训练,使用生成器方式训练 476 | model.fit_generator( 477 | train_D.__iter__(), 478 | steps_per_epoch=len(train_D), ## ?? 
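# Added descriptive note (not part of the original code.py), answering the '## ??'
# marker on this argument: len(train_D) calls data_generator.__len__, which returns
# ceil(len(data) / batch_size) (the self.steps computed in __init__), so
# steps_per_epoch corresponds to exactly one pass over the training fold per epoch.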
## 479 | epochs=epochs, 480 | validation_data=valid_D.__iter__(), 481 | validation_steps=len(valid_D), 482 | callbacks=[early_stopping, plateau, checkpoint, evaluator], # evaluator, 483 | verbose=2 484 | ) 485 | 486 | model.load_weights('./models/keras_model/fusai' + date_str + str(i) + '.hdf5') 487 | 488 | # return model 489 | val = model.predict_generator(valid_D.__iter__(), steps=len(valid_D), verbose=0) 490 | print(val) 491 | 492 | score = f1_score(train['label'].values[test_fold], np.argmax(val, axis=1)) 493 | global f1 494 | f1.append(score) 495 | print('validate {} f1_score:{}'.format(i+1, score)) 496 | 497 | train_model_pred[test_fold, :] = val 498 | test_model_pred += model.predict_generator(test_D.__iter__(), steps=len(test_D),verbose=0) 499 | 500 | del model 501 | gc.collect() 502 | 503 | K.clear_session() 504 | # break 505 | 506 | return train_model_pred, test_model_pred 507 | 508 | 509 | start_time = time.time() 510 | 511 | n_folds = 10 512 | folds_num = str(n_folds) + 'folds_' 513 | date_str = '1114' 514 | strategy = '_withprocess_chusai&fusaidata_' 515 | model = 'robeta_large' 516 | 517 | train_model_pred, test_model_pred = run_cv(n_folds, DATA_LIST, None, DATA_LIST_TEST, date_str=date_str) 518 | print('Validate 5folds average f1 score:', np.average(f1)) 519 | np.save('weights/keras_weight/fusai/train' + model + strategy + folds_num + date_str + '.npy', train_model_pred) 520 | np.save('weights/keras_weight/fusai/test' + model + strategy + folds_num + date_str + '.npy', test_model_pred) 521 | 522 | end_time = time.time() 523 | print('Time cost(min): ', (end_time-start_time)/60) 524 | 525 | 526 | def return_list(group): 527 | return ';'.join(list(group)) 528 | 529 | sub = test.copy() 530 | sub['label'] = [np.argmax(index) for index in test_model_pred] 531 | sub_label = sub[sub['label'] == 1].groupby(['id'], as_index=False)['entity_label'].agg({'key_entity': return_list}) 532 | 533 | test_2 = pd.read_csv('datasets/round2_test.csv', encoding='utf-8') # 导入测试集 534 | submit = test_2[['id']] 535 | submit = submit.merge(sub_label, on='id', how='left') 536 | submit['negative'] = submit['key_entity'].apply(lambda index: 0 if index is np.nan else 1) 537 | submit = submit[['id', 'negative', 'key_entity']] 538 | 539 | time_now = datetime.datetime.now().strftime('%Y%m%d_%H%M%S') 540 | print(time_now) 541 | submit.to_csv('submission/' + model + strategy + folds_num + '{}.csv'.format(time_now), encoding='utf-8', index=None) 542 | 543 | -------------------------------------------------------------------------------- /scheme2/code2.py: -------------------------------------------------------------------------------- 1 | #! 
-*- coding:utf-8 -*- 2 | import os 3 | import re 4 | import gc 5 | import sys 6 | import json 7 | import codecs 8 | import random 9 | import warnings 10 | import numpy as np 11 | import pandas as pd 12 | from tqdm import tqdm 13 | from random import choice 14 | import tensorflow as tf 15 | import matplotlib.pyplot as plt 16 | from collections import Counter 17 | from sklearn.model_selection import KFold 18 | from sklearn.preprocessing import LabelEncoder 19 | from sklearn.metrics import f1_score, accuracy_score 20 | from sklearn.model_selection import StratifiedKFold 21 | from sklearn.model_selection import train_test_split 22 | 23 | import keras.backend as K 24 | from keras.layers import * 25 | from keras.callbacks import * 26 | from keras.models import Model 27 | from keras.optimizers import Adam 28 | from keras.initializers import glorot_uniform 29 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer 30 | 31 | 32 | tqdm.pandas() 33 | seed = 2019 34 | random.seed(seed) 35 | tf.set_random_seed(seed) 36 | np.random.seed(seed) 37 | warnings.filterwarnings('ignore') 38 | 39 | 40 | 41 | ################################################################ 42 | data_path = './data/' 43 | 44 | train = pd.read_csv(data_path + 'Round2_train.csv', encoding='utf-8') 45 | train2= pd.read_csv(data_path + './Train_Data.csv', encoding='utf-8') 46 | train=pd.concat([train, train2], axis=0, sort=True) 47 | test = pd.read_csv(data_path + 'round2_test.csv', encoding='utf-8') 48 | 49 | train = train[train['entity'].notnull()] 50 | test = test[test['entity'].notnull()] 51 | 52 | train=train.drop_duplicates(['title','text','entity','negative','key_entity']) # 去掉重复的data 53 | 54 | print(train.shape) ###(10526, 6) 55 | print(test.shape) ####((9997, 4) 56 | 57 | 58 | def get_or_content(y,z): 59 | s='' 60 | if str(y)!='nan': 61 | s+=y 62 | if str(z)!='nan': 63 | s+=z 64 | return s 65 | 66 | #获取title+text 67 | train['content']=list(map(lambda y,z: get_or_content(y,z),train['title'],train['text'])) 68 | test['content']=list(map(lambda y,z: get_or_content(y,z),test['title'],test['text'])) 69 | 70 | 71 | def entity_clear_row(entity,content): 72 | entities = entity.split(';') 73 | entities.sort(key=lambda x: len(x)) 74 | n = len(entities) 75 | tmp = entities.copy() 76 | for i in range(n): 77 | entity_tmp = entities[i] 78 | #长度小于等于1 79 | if len(entity_tmp)<=1: 80 | tmp.remove(entity_tmp) 81 | continue 82 | if i + 1 >= n: 83 | break 84 | for entity_tmp2 in entities[i + 1:]: 85 | if entity_tmp2.find(entity_tmp) != -1 and ( 86 | entity_tmp2.find('?') != -1 or content.replace(entity_tmp2, '').find(entity_tmp) == -1): 87 | tmp.remove(entity_tmp) 88 | break 89 | return ';'.join(tmp) 90 | 91 | train['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),train['entity'],train['content'])) 92 | test['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),test['entity'],test['content'])) 93 | 94 | test['text'] = test.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 95 | train = train[(train['entity'].notnull()) & (train['negative'] == 1)] ### 96 | 97 | emotion = pd.read_csv('./submit/sub_qinggan_vote20191109_score0392098.csv', encoding='utf-8') 98 | emotion = emotion[emotion['negative'] == 1] 99 | test = emotion.merge(test, on='id', how='left') 100 | 101 | 102 | ################################################################ 103 | train_id_entity = train[['id', 'entity']] 104 | train_id_entity['entity'] = train_id_entity['entity'].apply(lambda 
index: index.split(';')) 105 | id, entity = [], [] 106 | for index in range(len(train_id_entity['entity'])): 107 | entity.extend(list(train_id_entity['entity'])[index]) 108 | id.extend([list(train_id_entity['id'])[index]] * len(list(train_id_entity['entity'])[index])) 109 | 110 | train_id_entity = pd.DataFrame({'id': id, 'entity_label': entity}) 111 | 112 | test_id_entity = test[['id', 'entity']] 113 | test_id_entity['entity'] = test_id_entity['entity'].apply(lambda index: index.split(';')) 114 | id, entity = [], [] 115 | for index in range(len(test_id_entity['entity'])): 116 | entity.extend(list(test_id_entity['entity'])[index]) 117 | id.extend([list(test_id_entity['id'])[index]] * len(list(test_id_entity['entity'])[index])) 118 | 119 | test_id_entity = pd.DataFrame({'id': id, 'entity_label': entity}) 120 | 121 | train = train.merge(train_id_entity, on='id', how='left') 122 | train['flag'] = train.apply(lambda index: 1 if index.key_entity.split(';').count(index.entity_label) >= 1 else 0, axis=1) 123 | test = test.merge(test_id_entity, on='id', how='left') 124 | 125 | ################################################################ 126 | print(train.shape) 127 | print(test.shape) 128 | 129 | def extract_feature(data): 130 | data['sub_word_num'] = data.apply(lambda index: index.entity.count(index.entity_label) - 1, axis=1) 131 | data['question_mark_num'] = data['entity_label'].apply(lambda index: index.count('?')) 132 | data['occur_in_title_num'] = data.apply(lambda index: 0 if index.title is np.nan else index.title.count(index.entity_label), axis=1) 133 | data['occur_in_text_num'] = data.apply(lambda index: 0 if index.text is np.nan else index.text.count(index.entity_label), axis=1) 134 | data['occur_in_partial_text_num'] = data.apply(lambda index: 0 if index.text is np.nan else index.text[:507].count(index.entity_label), axis=1) 135 | data['occur_in_entity'] = data.apply(lambda index: 0 if index.text is np.nan else index.entity.count(index.entity_label) - 1, axis=1) 136 | data['is_occur_in_article'] = data.apply(lambda index: 1 if (index.occur_in_title_num >= 1) | (index.occur_in_text_num >= 1) else 0, axis=1) 137 | return data 138 | 139 | train = extract_feature(train) 140 | test = extract_feature(test) 141 | print(train.columns) 142 | 143 | 144 | train['entity_len'] = train['entity_label'].progress_apply(lambda index: len(index)) 145 | test['entity_len'] = test['entity_label'].progress_apply(lambda index: len(index)) 146 | 147 | train[train['entity_len'] == 1].shape 148 | train = train[train['entity_len'] > 1] 149 | 150 | test[test['entity_len'] == 1].shape 151 | test = test[test['entity_len'] > 1] 152 | 153 | train_feature = train[['sub_word_num', 'question_mark_num', 'occur_in_title_num', 'occur_in_text_num', 'is_occur_in_article', 'occur_in_entity', 'occur_in_partial_text_num']] 154 | test_feature = test[['sub_word_num', 'question_mark_num', 'occur_in_title_num', 'occur_in_text_num', 'is_occur_in_article', 'occur_in_entity', 'occur_in_partial_text_num']] 155 | 156 | # Normalization 157 | from sklearn.preprocessing import MinMaxScaler 158 | scaler = MinMaxScaler() 159 | train_feature = scaler.fit_transform(train_feature) 160 | test_feature = scaler.fit_transform(test_feature) 161 | 162 | def get_other_content(x,y): 163 | entitys=x.split(";") 164 | if len(entitys)<=1: 165 | return np.nan 166 | l=[] 167 | for e in entitys: 168 | if e!=y: 169 | l.append(e) 170 | return ';'.join(l) 171 | 172 | train['other_entity']=list(map(lambda x,y 
:get_other_content(x,y),train['entity'],train['entity_label'])) 173 | test['other_entity']=list(map(lambda x,y :get_other_content(x,y),test['entity'],test['entity_label'])) 174 | 175 | def get_content(x,y): 176 | if str(y)=='nan': 177 | return x 178 | y=y.split(";") 179 | y = sorted(y, key=lambda i:len(i),reverse=True) 180 | for i in y: 181 | x = '其他实体'.join(x.split(i)) 182 | return x 183 | 184 | train['text']=list(map(lambda x,y: get_content(x,y), train['text'], train['other_entity'])) 185 | test['text']=list(map(lambda x,y: get_content(x,y), test['text'], test['other_entity'])) 186 | 187 | maxlen = 509 188 | bert_path = 'E:/chinese_wwm_ext_L-12_H-768_A-12/' # chinese_L-12_H-768_A-12 chinese_wwm_ext_L-12_H-768_A-12 189 | config_path = bert_path + 'bert_config.json' 190 | checkpoint_path = bert_path + 'bert_model.ckpt' 191 | dict_path = bert_path + 'vocab.txt' 192 | 193 | token_dict = {} 194 | with codecs.open(dict_path, 'r', 'utf8') as reader: 195 | for line in reader: 196 | token = line.strip() 197 | token_dict[token] = len(token_dict) # 给每个token 按序编号 198 | 199 | class OurTokenizer(Tokenizer): 200 | def _tokenize(self, text): 201 | R = [] 202 | for c in text: 203 | if c in self._token_dict: 204 | R.append(c) 205 | elif self._is_space(c): 206 | R.append('[unused1]') # space类用未经训练的[unused1]表示 207 | else: 208 | R.append('[UNK]') # 剩余的字符是[UNK] 209 | return R 210 | 211 | tokenizer = OurTokenizer(token_dict) 212 | 213 | def seq_padding(X, padding=0): 214 | L = [len(x) for x in X] 215 | ML = max(L) 216 | return np.array([np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X]) 217 | 218 | class data_generator: 219 | def __init__(self, data, feature, batch_size=8, shuffle=True): # 8 220 | self.data = data 221 | self.batch_size = batch_size 222 | self.shuffle = shuffle 223 | self.feature = feature 224 | self.steps = len(self.data) // self.batch_size 225 | if len(self.data) % self.batch_size != 0: 226 | self.steps += 1 227 | def __len__(self): 228 | return self.steps 229 | def __iter__(self): 230 | while True: 231 | idxs = list(range(len(self.data))) 232 | 233 | if self.shuffle: 234 | np.random.shuffle(idxs) 235 | 236 | X1, X2, Y, Fea = [], [], [], [] 237 | for i in idxs: 238 | d = self.data[i] 239 | fea = self.feature[i] # add feature 240 | first_text = d[0] 241 | second_text = d[2][:maxlen - d[1]] 242 | x1, x2 = tokenizer.encode(first=first_text, second=second_text) # , max_len=512 243 | y = d[3] 244 | Fea.append(fea) 245 | X1.append(x1) 246 | X2.append(x2) 247 | Y.append([y]) 248 | if len(X1) == self.batch_size or i == idxs[-1]: 249 | X1 = seq_padding(X1) 250 | X2 = seq_padding(X2, padding=1) 251 | Fea = seq_padding(Fea) 252 | Y = seq_padding(Y) 253 | yield [X1, X2, Fea], Y[:, 0, :] 254 | [X1, X2, Y, Fea] = [], [], [], [] 255 | 256 | from keras.metrics import top_k_categorical_accuracy 257 | from keras.metrics import categorical_accuracy 258 | 259 | def acc_top2(y_true, y_pred): 260 | return top_k_categorical_accuracy(y_true, y_pred, k=1) 261 | 262 | 263 | def f1_metric(y_true, y_pred): 264 | def recall(y_true, y_pred): 265 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 266 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 267 | recall = true_positives / (possible_positives + K.epsilon()) 268 | return recall 269 | 270 | def precision(y_true, y_pred): 271 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 272 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 273 | precision = true_positives / (predicted_positives + 
K.epsilon()) 274 | return precision 275 | precision = precision(y_true, y_pred) 276 | recall = recall(y_true, y_pred) 277 | return 2*((precision*recall)/(precision+recall+K.epsilon())) 278 | 279 | 280 | def build_bert(nclass): 281 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None) 282 | 283 | for l in bert_model.layers: 284 | # print(l) 285 | l.trainable = True 286 | 287 | x1_in = Input(shape=(None,)) 288 | x2_in = Input(shape=(None,)) 289 | x3_in = Input(shape=(train_feature.shape[1],)) 290 | 291 | feature = Dense(64, activation='relu')(x3_in) 292 | 293 | x = bert_model([x1_in, x2_in]) 294 | x = Lambda(lambda x: x[:, 0])(x) 295 | x = concatenate([x, feature]) 296 | p = Dense(nclass, activation='softmax')(x) 297 | 298 | model = Model([x1_in, x2_in, x3_in], p) 299 | model.compile(loss='categorical_crossentropy', 300 | optimizer=Adam(1e-5), # lr: 5e-5 3e-5 2e-5 epoch: 3, 4 batch_size: 16, 32 301 | metrics=['accuracy', f1_metric]) # categorical_accuracy 302 | print(model.summary()) 303 | return model 304 | 305 | 306 | ################################################################ 307 | from keras.utils import to_categorical 308 | 309 | DATA_LIST = [] 310 | for data_row in train.iloc[:].itertuples(): 311 | DATA_LIST.append((data_row.entity_label, data_row.entity_len, data_row.text, to_categorical(data_row.flag, 2))) 312 | DATA_LIST = np.array(DATA_LIST) 313 | 314 | DATA_LIST_TEST = [] 315 | for data_row in test.iloc[:].itertuples(): 316 | DATA_LIST_TEST.append((data_row.entity_label, data_row.entity_len, data_row.text, to_categorical(0, 2))) 317 | DATA_LIST_TEST = np.array(DATA_LIST_TEST) 318 | ################################################################ 319 | 320 | f1, acc = [], [] 321 | def run_cv(nfold, data, feature_train, data_label, data_test, feature_test): 322 | kf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed).split(data, train['flag']) 323 | train_model_pred = np.zeros((len(data), 2)) # 2 324 | test_model_pred = np.zeros((len(data_test), 2)) # 2 325 | 326 | for i, (train_fold, test_fold) in enumerate(kf): 327 | X_train, X_valid, = data[train_fold, :], data[test_fold, :] 328 | X_train_fea, X_valid_fea = feature_train[train_fold, :], feature_train[test_fold, :] 329 | 330 | model = build_bert(2) # 2 331 | early_stopping = EarlyStopping(monitor='val_acc', patience=2) # val_acc 332 | plateau = ReduceLROnPlateau(monitor="val_acc", verbose=1, mode='max', factor=0.5, patience=1) 333 | checkpoint = ModelCheckpoint('./model/' + str(i) + '.hdf5', monitor='val_acc', 334 | verbose=2, save_best_only=True, mode='max',save_weights_only=True) 335 | 336 | train_D = data_generator(X_train, X_train_fea, shuffle=True) 337 | valid_D = data_generator(X_valid, X_valid_fea, shuffle=False) 338 | test_D = data_generator(data_test, feature_test, shuffle=False) 339 | 340 | model.fit_generator( 341 | train_D.__iter__(), 342 | steps_per_epoch=len(train_D), ## ?? 
## 343 | epochs=10, 344 | validation_data=valid_D.__iter__(), 345 | validation_steps=len(valid_D), 346 | callbacks=[early_stopping, plateau, checkpoint], 347 | verbose=2 348 | ) 349 | 350 | model.load_weights('./model/' + str(i) + '.hdf5') 351 | 352 | # return model 353 | val = model.predict_generator(valid_D.__iter__(), steps=len(valid_D),verbose=0) 354 | 355 | print(val) 356 | score = f1_score(train['flag'].values[test_fold], np.argmax(val, axis=1)) 357 | acc_score = accuracy_score(train['flag'].values[test_fold], np.argmax(val, axis=1)) 358 | global f1, acc 359 | f1.append(score) 360 | acc.append(acc_score) 361 | print('validate f1 score:', score) 362 | print('validate accuracy score:', acc_score) 363 | 364 | train_model_pred[test_fold, :] = val 365 | test_model_pred += model.predict_generator(test_D.__iter__(), steps=len(test_D),verbose=0) 366 | 367 | del model; gc.collect() 368 | K.clear_session() 369 | return train_model_pred, test_model_pred 370 | 371 | 372 | ################################################################ 373 | train_model_pred, test_model_pred = run_cv(10, DATA_LIST, train_feature, None, DATA_LIST_TEST, test_feature) 374 | print('validate aver f1 score:', np.average(f1)) 375 | print('validate aver accuracy score:', np.average(acc)) 376 | np.save('weights/bert_prob_train_binary_label_add_feature_extend_trainSet-PreProcess-roberta-large.npy', train_model_pred) 377 | np.save('weights/bert_prob_test_binary_label_add_feature_extend_trainSet-PreProcess-roberta-large.npy', test_model_pred) 378 | ################################################################ 379 | 380 | # 结果一 # 381 | def return_list(group): 382 | return ';'.join(list(group)) 383 | 384 | sub = test.copy() 385 | sub['label'] = [np.argmax(index) for index in test_model_pred] 386 | 387 | test_2 = pd.read_csv(data_path + 'round2_test.csv', encoding='utf-8') 388 | submit = test_2[['id']] 389 | 390 | sub = sub[sub['label'] == 1] 391 | key_entity = sub.groupby(['id'], as_index=False)['entity_label'].agg({'key_entity': return_list}) 392 | 393 | submit = submit.merge(key_entity, on='id', how='left') 394 | submit['negative'] = submit['key_entity'].apply(lambda index: 0 if index is np.nan else 1) 395 | submit = submit[['id', 'negative', 'key_entity']] 396 | submit.to_csv('submit/sub_binary_label_roberta-large.csv', encoding='utf-8', index=None) 397 | print(submit[submit['key_entity'].notnull()].shape) 398 | 399 | 400 | ################################################################ 401 | # 结果二 # 402 | def return_list(group): 403 | return ';'.join(list(group)) 404 | 405 | sub = test.copy() 406 | sub['label'] = [np.argmax(index) for index in test_model_pred] 407 | test['prob'] = [index[1] for index in test_model_pred] 408 | 409 | sub = sub[sub['label'] == 1] 410 | key_entity = sub.groupby(['id'], as_index=False)['entity_label'].agg({'key_entity': return_list}) 411 | 412 | sub_id = set(test['id']) - set(key_entity['id']) 413 | sub_test = test[test['id'].isin(sub_id)] 414 | sub_test = sub_test.sort_values(by=['id', 'prob'], ascending=False).drop_duplicates(['id'], keep='first') 415 | sub_test['key_entity'] = sub_test['entity_label'] 416 | key_entity = pd.concat([key_entity, sub_test[['id', 'key_entity']]], axis=0, ignore_index=True) 417 | 418 | test_2 = pd.read_csv(data_path + 'round2_test.csv', encoding='utf-8') 419 | submit = test_2[['id']] 420 | 421 | submit = submit.merge(key_entity, on='id', how='left') 422 | submit['negative'] = submit['key_entity'].apply(lambda index: 0 if index is np.nan else 1) 423 | submit = 
submit[['id', 'negative', 'key_entity']] 424 | submit.to_csv('submit/sub_binary_label_roberta-large_all_neg_samples.csv', encoding='utf-8', index=None) 425 | print(submit[submit['key_entity'].notnull()].shape) 426 | 427 | -------------------------------------------------------------------------------- /scheme3/code3.py: -------------------------------------------------------------------------------- 1 | #! -*- coding:utf-8 -*- 2 | import os 3 | import re 4 | import gc 5 | import sys 6 | import json 7 | import codecs 8 | import warnings 9 | import numpy as np 10 | import pandas as pd 11 | from keras import initializers 12 | from tqdm import tqdm 13 | from random import choice 14 | import matplotlib.pyplot as plt 15 | from collections import Counter 16 | from sklearn.metrics import f1_score 17 | from sklearn.model_selection import KFold 18 | from sklearn.model_selection import KFold 19 | from sklearn.preprocessing import LabelEncoder 20 | from sklearn.model_selection import StratifiedKFold 21 | from sklearn.model_selection import train_test_split 22 | 23 | import keras.backend as K 24 | from keras.layers import * 25 | from keras.callbacks import * 26 | from keras.models import Model 27 | from keras.optimizers import Adam 28 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer 29 | from keras_tqdm import TQDMNotebookCallback 30 | tqdm.pandas() 31 | np.random.seed(123) 32 | warnings.filterwarnings('ignore') 33 | 34 | import os 35 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" #指定gpu 36 | 37 | 38 | data_path = '../data/' #数据地址 39 | 40 | train_path=data_path + 'Round2_train.csv' 41 | train2_path=data_path + 'Train_data.csv' 42 | test_path=data_path + 'round2_test.csv' 43 | 44 | maxlen = 1 # 510 45 | learning_rate = 5e-5 46 | min_learning_rate = 1e-5 47 | batch_size=20 48 | 49 | 50 | 51 | save_model_path='zy' 52 | save_mdoel_name_pre='large' 53 | 54 | bert_path ='E:/code/pre_model/bert/chinese_L-12_H-768_A-12/' #模型地址 55 | 56 | config_path = bert_path + 'bert_config.json' 57 | checkpoint_path = bert_path + 'bert_model.ckpt' 58 | dict_path = bert_path + 'vocab.txt' 59 | 60 | 61 | #################################################load data######################################### 62 | train = pd.read_csv(train_path, encoding='utf-8') 63 | train2= pd.read_csv(train2_path, encoding='utf-8') 64 | 65 | 66 | train=pd.concat([train,train2],axis=0,sort=True) 67 | test = pd.read_csv(test_path, encoding='utf-8') 68 | test['text'] = test.apply(lambda index: index.title if index.text is np.nan else index.text, axis=1) 69 | 70 | train = train[train['entity'].notnull()] 71 | test = test[test['entity'].notnull()] 72 | 73 | train=train.drop_duplicates(['title','text','entity','negative','key_entity']) #去掉重复的data 74 | 75 | print(train.shape) 76 | print(test.shape) 77 | 78 | #########################################删选实体#################################################################### 79 | '大家一样的函数处理' 80 | def get_or_content(y,z): 81 | s='' 82 | if str(y)!='nan': 83 | s+=y 84 | if str(z)!='nan': 85 | s+=z 86 | return s 87 | 88 | #获取title+text 89 | train['content']=list(map(lambda y,z: get_or_content(y,z),train['title'],train['text'])) 90 | test['content']=list(map(lambda y,z: get_or_content(y,z),test['title'],test['text'])) 91 | 92 | 93 | def entity_clear_row(entity,content): 94 | entities = entity.split(';') 95 | entities.sort(key=lambda x: len(x)) 96 | n = len(entities) 97 | tmp = entities.copy() 98 | for i in range(n): 99 | entity_tmp = entities[i] 100 | #长度小于等于1 101 | if len(entity_tmp)<=1: 102 | 
tmp.remove(entity_tmp) 103 | continue 104 | if i + 1 >= n: 105 | break 106 | for entity_tmp2 in entities[i + 1:]: 107 | if entity_tmp2.find(entity_tmp) != -1 and ( 108 | entity_tmp2.find('?') != -1 or content.replace(entity_tmp2, '').find(entity_tmp) == -1): 109 | tmp.remove(entity_tmp) 110 | break 111 | return ';'.join(tmp) 112 | 113 | train['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),train['entity'],train['content'])) 114 | test['entity']=list(map(lambda entity,content:entity_clear_row(entity,content),test['entity'],test['content'])) 115 | 116 | 117 | #####################################################句子预处理#################################################### 118 | def delete_tag(s): 119 | r1 = re.compile(r'\{IMG:.?.?.?\}') # 图片 120 | s = re.sub(r1, '', s) 121 | r2 = re.compile(r'[a-zA-Z]+://[^\u4e00-\u9fa5|\?]+') # 网址 122 | s = re.sub(r2, '', s) 123 | r3 = re.compile(r'<.*?>') # 网页标签 124 | s = re.sub(r3, '', s) 125 | r4 = re.compile(r'&[a-zA-Z0-9]{1,4}') #   > &type &rdqu .... 126 | s = re.sub(r4, '', s) 127 | r5 = re.compile(r'[0-9a-zA-Z]+@[0-9a-zA-Z]+') # 邮箱 128 | s = re.sub(r5, '', s) 129 | r6 = re.compile(r'[#]') # #号 130 | s = re.sub(r6, '', s) 131 | return s 132 | 133 | 134 | train['title'] = train['title'].apply(lambda x: delete_tag(x) if str(x) != 'nan' else x) 135 | train['text'] = train['text'].apply(lambda x: delete_tag(x) if str(x) != 'nan' else x) 136 | test['title'] = test['title'].apply(lambda x: delete_tag(x) if str(x) != 'nan' else x) 137 | test['text'] = test['text'].apply(lambda x: delete_tag(x) if str(x) != 'nan' else x) 138 | 139 | 140 | ###############################################获取content################################################# 141 | def get_or_content(y, z): 142 | s = '' 143 | if str(y) != 'nan' and str(z) != 'nan' and y == z: 144 | s += '标题和内容相同,是' 145 | s += y 146 | else: 147 | s += '标题是' 148 | if str(y) != 'nan': 149 | if len(y) > 172: 150 | s += y[:172] 151 | else: 152 | s += y 153 | else: 154 | s += '无' 155 | s += '内容是' 156 | if str(z) != 'nan': 157 | s += z 158 | return s 159 | 160 | 161 | train['content'] = list(map(lambda y, z: get_or_content(y, z), train['title'], train['text'])) 162 | test['content'] = list(map(lambda y, z: get_or_content(y, z), test['title'], test['text'])) 163 | 164 | 165 | ####################################################mark label################################################## 166 | def get_id_entity(data): 167 | right_id, right_entity = [], [] 168 | for row in data.itertuples(): 169 | entities=row.entity.strip(';').split(';') 170 | entities.sort(key=lambda x:len(x)) #排序 171 | entities_num=len(entities) 172 | for index,entity in enumerate(entities): 173 | right_entity.append(entity) 174 | right_id.append(row.id) 175 | return pd.DataFrame({'id': right_id, 'entity_label': right_entity}) 176 | 177 | train_id_entity =get_id_entity(train[['id', 'entity','content']]) 178 | test_id_entity = get_id_entity(test[['id', 'entity','content']]) 179 | 180 | print('success') 181 | # train.pop('negative') 182 | train = train.merge(train_id_entity, on='id', how='left') 183 | train['label'] = train.apply(lambda index: 0 if index.key_entity is np.nan else 1, axis=1) 184 | train['key_entity'] = train['key_entity'].fillna('') 185 | train['label'] = train.apply(lambda index: 1 if index.key_entity.split(';').count(index.entity_label) >= 1 else 0, axis=1) 186 | 187 | test = test.merge(test_id_entity, on='id', how='left') 188 | 189 | print(test.shape) 190 | print(train.shape) 191 | 192 | 
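# Added illustrative sketch (not part of the original code3.py): the section below
# builds a natural-language style corpus for each (id, candidate entity) pair, roughly
# "实体是<entity> ... 标题是<title> ... 内容是<text>", capping the title at 172
# characters, handling the title==text case, truncating the whole string to 700
# characters, and later replacing the other candidate entities with the placeholder
# '其他实体'. A toy version of the same template, with made-up values:

def _toy_corpus(entity, title, text, max_len=700):
    # simplified mirror of the template used below: entity marker + title + body, hard-capped
    s = '实体是' + entity
    s += '标题是' + (title[:172] if title else '无')
    s += '内容是' + (text if text else '无')
    return s[:max_len]

# Example (hypothetical row):
# print(_toy_corpus('某公司', '某公司被曝逾期', '据报道,某公司旗下平台出现兑付困难。'))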
####################################获取预料############################################################ 193 | 194 | 'tttttt 获取实体出现的位置 前510' 195 | 196 | 197 | def get_new_content(entity, content): 198 | len_append = len('实体是' + entity) 199 | if len_append + len(content) > 510: 200 | return content[:510 - len_append] 201 | return content 202 | 203 | 204 | def get_position(entity, corpus): 205 | tag = int(corpus.count(entity) > 0) 206 | # 为了消除空格的影响 207 | if tag > 0: 208 | return 1 209 | return int(corpus.count(entity.strip()) > 0) 210 | 211 | 212 | train['content2'] = list(map(lambda x, y: get_new_content(x, y), train['entity'], train['content'])) 213 | test['content2'] = list(map(lambda x, y: get_new_content(x, y), test['entity'], test['content'])) 214 | train['entity_label_position2'] = list(map(lambda x, y: get_position(x, y), train['entity_label'], train['content2'])) 215 | test['entity_label_position2'] = list(map(lambda x, y: get_position(x, y), test['entity_label'], test['content2'])) 216 | 217 | train['entity_label_position'] = list(map(lambda x, y: get_position(x, y), train['entity_label'], train['content'])) 218 | test['entity_label_position'] = list(map(lambda x, y: get_position(x, y), test['entity_label'], test['content'])) 219 | 220 | train.pop('content') 221 | train.pop('content2') 222 | 223 | test.pop('content') 224 | test.pop('content2') 225 | print('success') 226 | 227 | print(train['entity_label_position2'].value_counts()) 228 | print(train['entity_label_position'].value_counts()) 229 | print(test['entity_label_position2'].value_counts()) 230 | print(test['entity_label_position'].value_counts()) 231 | 232 | import re 233 | 234 | 235 | # 前510没有内容的样本 corpus=i实体+标题+部分内容(无) 236 | def get_context_content(x, y, z, p2): 237 | s = '实体是' 238 | if str(x) != 'nan': 239 | s += x 240 | s += '。标题和部分内容没有实体' 241 | if str(y) != 'nan' and str(z) != 'nan' and y == z: 242 | s += ',标题和内容相同,是' 243 | s += y 244 | else: 245 | s += '标题是' 246 | if str(y) != 'nan': 247 | if len(y) > 172: 248 | s += y[:172] 249 | else: 250 | s += y 251 | else: 252 | s += '无' 253 | # 有部分内容 254 | if p2 == 1: 255 | s += '。部分内容是' 256 | if str(z) != 'nan': 257 | z_list = re.split(r',|。|?|;', z) 258 | for i in z_list: 259 | if x in i: 260 | s += i 261 | if len(s) > 700: 262 | break 263 | else: 264 | s += '无' 265 | else: 266 | s += '全文没有匹配的内容。内容是' 267 | if str(z) != 'nan': 268 | s += z 269 | else: 270 | s += '无' 271 | if len(s) > 700: 272 | return s[:700] 273 | return s 274 | 275 | 276 | def get_content(x, y, z, position, p2): 277 | # 前510有内容 278 | if position == 1: 279 | s = '实体是' 280 | if str(x) != 'nan': 281 | s += x 282 | if str(y) != 'nan' and str(z) != 'nan' and y == z: 283 | s += ',标题和内容相同,是' 284 | s += y 285 | else: 286 | s += '标题是' 287 | if str(y) != 'nan': 288 | if len(y) > 172: 289 | s += y[:172] 290 | else: 291 | s += y 292 | else: 293 | s += '无' 294 | s += '内容是' 295 | if str(z) != 'nan': 296 | s += z 297 | else: 298 | s += '无' 299 | else: 300 | # 前510没有内容 301 | s = get_context_content(x, y, z, p2) 302 | if len(s) > 700: 303 | return s[:700] 304 | return s 305 | 306 | 307 | train['corpus'] = list( 308 | map(lambda x, y, z, position, p2: get_content(x, y, z, position, p2), tqdm(train['entity_label'].values), 309 | train['title'], train[ 310 | 'text'], train['entity_label_position2'], train['entity_label_position'])) 311 | test['corpus'] = list( 312 | map(lambda x, y, z, position, p2: get_content(x, y, z, position, p2), tqdm(test['entity_label'].values), 313 | test['title'], test[ 314 | 'text'], 
test['entity_label_position2'], test['entity_label_position'])) 315 | 316 | def get_position(entity,corpus): 317 | tag=int(corpus.count(entity)>1) 318 | #为了消除空格的影响 319 | if tag>1: 320 | return 1 321 | return int(corpus.count(entity.strip())>1) 322 | 323 | train['entity_label_position3']=list(map(lambda x,y:get_position(x,y),train['entity_label'],train['corpus'])) 324 | test['entity_label_position3']=list(map(lambda x,y:get_position(x,y),test['entity_label'],test['corpus'])) 325 | 326 | 327 | print(train['entity_label_position3'].value_counts()) 328 | print(test['entity_label_position3'].value_counts()) 329 | 330 | 331 | 332 | def get_other_content(x,y): 333 | entitys=x.strip(';').split(";") 334 | if len(entitys)<=1: 335 | return np.nan 336 | l=[] 337 | for e in entitys: 338 | if e!=y: 339 | l.append(e) 340 | return ';'.join(l) 341 | train['other_entity']=list(map(lambda x,y :get_other_content(x,y),train['entity'],train['entity_label'])) 342 | test['other_entity']=list(map(lambda x,y :get_other_content(x,y),test['entity'],test['entity_label'])) 343 | def get_content(x,y,z): 344 | if str(y)=='nan': 345 | return x 346 | y=y.split(";") 347 | y=sorted(y,key=lambda x: len(x),reverse=True) 348 | #如果该实体没有出现直接返回原始语句 349 | if x.count(z)<=1: 350 | return x 351 | #出现了就直接替换其他的实体 352 | for i in y: 353 | if i not in z: #不是该候选实体的子串 354 | x='其他实体'.join(x.split(i)) 355 | return x 356 | train['corpus']=list(map(lambda x,y,z :get_content(x,y,z),train['corpus'],train['other_entity'],train['entity_label'])) 357 | test['corpus']=list(map(lambda x,y ,z:get_content(x,y,z),test['corpus'],test['other_entity'],test['entity_label'])) 358 | 359 | 360 | 361 | from keras.utils import to_categorical 362 | 363 | DATA_LIST = [] 364 | for data_row in train.iloc[:].itertuples(): 365 | DATA_LIST.append((data_row.corpus, data_row.negative,data_row.label)) 366 | DATA_LIST = np.array(DATA_LIST) 367 | print(DATA_LIST.shape) 368 | 369 | DATA_LIST_TEST = [] 370 | for data_row in test.iloc[:].itertuples(): 371 | DATA_LIST_TEST.append((data_row.corpus, 0,0)) 372 | DATA_LIST_TEST = np.array(DATA_LIST_TEST) 373 | print(DATA_LIST_TEST.shape) 374 | 375 | 376 | ########################################模型部分############################################# 377 | 378 | token_dict = {} 379 | with codecs.open(dict_path, 'r', 'utf8') as reader: 380 | for line in reader: 381 | token = line.strip() 382 | token_dict[token] = len(token_dict) # 给每个token 按序编号 383 | 384 | 385 | class OurTokenizer(Tokenizer): 386 | def _tokenize(self, text): 387 | R = [] 388 | for c in text: 389 | if c in self._token_dict: 390 | R.append(c) 391 | elif self._is_space(c): 392 | R.append('[unused1]') # space类用未经训练的[unused1]表示 393 | else: 394 | R.append('[UNK]') # 剩余的字符是[UNK] 395 | return R 396 | 397 | 398 | tokenizer = OurTokenizer(token_dict) 399 | 400 | '填充序列长度' 401 | 402 | 403 | def seq_padding(X, padding=0): 404 | L = [len(x) for x in X] 405 | ML = max(L) 406 | return np.array([np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X]) 407 | 408 | 409 | class data_generator: 410 | def __init__(self, data, batch_size=batch_size, shuffle=True): # 8 411 | self.data = data 412 | self.batch_size = batch_size 413 | self.shuffle = shuffle 414 | self.steps = len(self.data) // self.batch_size 415 | if len(self.data) % self.batch_size != 0: 416 | self.steps += 1 417 | 418 | def __len__(self): 419 | return self.steps 420 | 421 | def __iter__(self): 422 | while True: 423 | idxs = list(range(len(self.data))) 424 | 425 | if self.shuffle: 426 | 
np.random.shuffle(idxs) 427 | 428 | X1, X2, Y1, Y2 = [], [], [], [] 429 | for i in idxs: 430 | d = self.data[i] 431 | text = d[0][:maxlen] 432 | x1, x2 = tokenizer.encode(first=text) 433 | y1 = d[1] 434 | y2 = d[2] 435 | X1.append(x1) 436 | X2.append(x2) 437 | Y1.append([y1]) 438 | Y2.append([y2]) 439 | if len(X1) == self.batch_size or i == idxs[-1]: 440 | X1 = seq_padding(X1) 441 | X2 = seq_padding(X2) 442 | Y1 = seq_padding(Y1) 443 | Y2 = seq_padding(Y2) 444 | yield [X1, X2, Y1, Y2], None 445 | X1, X2, Y1, Y2 = [], [], [], [] 446 | 447 | 448 | from keras.metrics import top_k_categorical_accuracy 449 | from keras.metrics import categorical_accuracy 450 | 451 | 452 | def acc_top2(y_true, y_pred): 453 | return top_k_categorical_accuracy(y_true, y_pred, k=2) 454 | 455 | 456 | def f1_metric(y_true, y_pred): 457 | ''' 458 | metric from here 459 | https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras 460 | ''' 461 | 462 | def recall(y_true, y_pred): 463 | """Recall metric. 464 | Only computes a batch-wise average of recall. 465 | Computes the recall, a metric for multi-label classification of 466 | how many relevant items are selected. 467 | """ 468 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 469 | possible_positives = K.sum(K.round(K.clip(y_true, 0, 1))) 470 | recall = true_positives / (possible_positives + K.epsilon()) 471 | return recall 472 | 473 | def precision(y_true, y_pred): 474 | """Precision metric. 475 | 476 | Only computes a batch-wise average of precision. 477 | 478 | Computes the precision, a metric for multi-label classification of 479 | how many selected items are relevant. 480 | """ 481 | true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) 482 | predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1))) 483 | precision = true_positives / (predicted_positives + K.epsilon()) 484 | return precision 485 | 486 | precision = precision(y_true, y_pred) 487 | recall = recall(y_true, y_pred) 488 | return 2 * ((precision * recall) / (precision + recall + K.epsilon())) 489 | 490 | 491 | # from keras.utils import multi_gpu_model 492 | 493 | 494 | class MyLayer(Layer): 495 | def __init__(self, **kwargs): 496 | super(MyLayer, self).__init__(**kwargs) 497 | 498 | def build(self, input_shape): 499 | self.w = self.add_weight(shape=(input_shape[1],), 500 | initializer=initializers.glorot_uniform(seed=2019), trainable=True, 501 | name='kernel') 502 | super(MyLayer, self).build(input_shape) 503 | 504 | def call(self, x): 505 | soft_w = K.softmax(self.w) 506 | 507 | return x * soft_w 508 | 509 | def compute_output_shape(self, input_shape): 510 | return (input_shape[0], input_shape[1]) 511 | 512 | 513 | # def get_means_bert_n_layers(x): 514 | # shape = list(x.shape) 515 | # layer_size = 768 516 | # number = int(int(shape[1]) / layer_size) 517 | # res = x[:, :layer_size] 518 | # for i in range(2, number + 1): 519 | # res += x[:, layer_size * (i - 1):layer_size * i] 520 | # print('return ') 521 | # return tf.divide(res, number) 522 | 523 | # 524 | # def get_means_bert_n_layers_shape(input_shape): 525 | # shape = list(input_shape) 526 | # layer_size = 768 527 | # shape[-1] = layer_size 528 | # return tuple(shape) 529 | 530 | 531 | def build_bert(nclass): 532 | seed = 2019 533 | bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, output_layer_num=1, seq_len=None) 534 | 535 | layer_ = 1 536 | for l in bert_model.layers: 537 | if layer_ >= 105 - 105: 538 | l.trainable = True 539 | # print(l) 540 | layer_ += 1 541 | print(' 
layer:', layer_) 542 | x1_in = Input(shape=(None,)) 543 | x2_in = Input(shape=(None,)) 544 | y1_in = Input(shape=(None,)) 545 | y2_in = Input(shape=(None,)) 546 | 547 | x = bert_model([x1_in, x2_in]) 548 | 549 | x0 = Lambda(lambda x: x[:, 0])(x) 550 | 551 | # x0=Lambda(get_means_bert_n_layers,get_means_bert_n_layers_shape)(x0) 552 | 553 | x1 = Dense(300, activation='relu', kernel_initializer=initializers.glorot_uniform(seed=seed))(x0) 554 | p1 = Dense(nclass, activation='sigmoid')(x1) 555 | 556 | x1_st = Lambda(lambda x: K.stop_gradient(x))(x1) 557 | # x1_st=MyLayer()(x1_st) 558 | p1_st = Lambda(lambda x: K.stop_gradient(x))(p1) 559 | 560 | x2 = concatenate([x0, x1_st, p1_st]) 561 | 562 | x2 = Dense(300, activation='relu', kernel_initializer=initializers.glorot_uniform(seed=seed))(x2) 563 | p2 = Dense(nclass, activation='sigmoid', kernel_initializer=initializers.glorot_uniform(seed=seed))(x2) 564 | 565 | model = Model([x1_in, x2_in], [p1, p2]) 566 | 567 | train_model = Model([x1_in, x2_in, y1_in, y2_in], [p1, p2]) 568 | 569 | loss1 = K.mean(K.binary_crossentropy(y1_in, p1, from_logits=False)) 570 | loss2 = K.mean(K.binary_crossentropy(y2_in, p2, from_logits=False)) 571 | 572 | # 带权重的loss 573 | # w_loss=K.softmax(K.variable([0.5,0.5])) 574 | 575 | # loss=w_loss[0]*loss1+w_loss[1]*loss2 576 | loss = loss1 + loss2 577 | 578 | train_model.add_loss(loss) 579 | 580 | 581 | train_model.compile(optimizer=Adam(learning_rate)) 582 | # print(train_model.summary()) 583 | return train_model 584 | 585 | 586 | from sklearn import metrics 587 | 588 | '寻找最好的阀值' 589 | def get_best_F1_score(pred, ture): 590 | f1_scores = [] 591 | cut_offs = [] 592 | for threshold in np.arange(0.01, 1, 0.01): 593 | pred_binary = (pred >= threshold) * 1 594 | f1_tmp = metrics.f1_score(y_true=ture, y_pred=pred_binary) 595 | f1_scores.append(f1_tmp) 596 | cut_offs.append(threshold) 597 | max_index = f1_scores.index(max(f1_scores)) 598 | max_x_axis = cut_offs[max_index] 599 | max_y_axis_F1 = f1_scores[max_index] 600 | return max_x_axis, max_y_axis_F1 601 | 602 | 603 | from keras.callbacks import Callback 604 | 605 | 606 | class Evaluate(Callback): 607 | def __init__(self, X_train, X_valid, tag): 608 | self.X_valid = X_valid 609 | self.X_train = X_train 610 | self.Y1_train, self.Y2_train = [int(i) for i in X_train[:, 1]], [int(i) for i in X_train[:, 2]] 611 | self.Y1_valid, self.Y2_valid = [int(i) for i in X_valid[:, 1]], [int(i) for i in X_valid[:, 2]] 612 | self.tag = tag 613 | self.best = 0. 614 | self.passed = 0 615 | 616 | def on_batch_begin(self, batch, logs=None): 617 | """第一个epoch用来warmup,第二个epoch把学习率降到最低 618 | """ 619 | if self.passed < self.params['steps']: 620 | lr = (self.passed + 1.) / self.params['steps'] * learning_rate 621 | K.set_value(self.model.optimizer.lr, lr) 622 | self.passed += 1 623 | elif self.params['steps'] <= self.passed < self.params['steps'] * 2: 624 | lr = (2 - (self.passed + 1.) 
/ self.params['steps']) * (learning_rate - min_learning_rate) 625 | lr += min_learning_rate 626 | K.set_value(self.model.optimizer.lr, lr) 627 | self.passed += 1 628 | 629 | def on_epoch_end(self, epoch, logs=None): 630 | # train_D = data_generator(self.X_train, shuffle=False) 631 | valid_D = data_generator(self.X_valid, shuffle=False) 632 | # tra_1,tra_2 = self.model.predict_generator(train_D.__iter__(), steps=len(train_D),verbose=0) 633 | val_1, val_2 = self.model.predict_generator(valid_D.__iter__(), steps=len(valid_D), verbose=0) 634 | # tra_1=[int(i>0.5) for i in tra_1] 635 | # tra_2=[int(i>0.5) for i in tra_2] 636 | val_1 = [int(i > 0.5) for i in val_1] 637 | val_2 = [int(i > 0.5) for i in val_2] 638 | # train 639 | # t_f1_1=f1_score(self.Y1_train,tra_1) 640 | # t_f1_2=f1_score(self.Y2_train,tra_2) 641 | # t_acc_1=accuracy_score(self.Y1_train,tra_1) 642 | # t_acc_2=accuracy_score(self.Y2_train,tra_2) 643 | # val 644 | f1_1 = f1_score(self.Y1_valid, val_1) 645 | f1_2 = f1_score(self.Y2_valid, val_2) 646 | # acc_1 = accuracy_score(self.Y1_valid, val_1) 647 | # acc_2 = accuracy_score(self.Y2_valid, val_2) 648 | 649 | print(' ----val -f1_1:{:.5f} -f1_2:{:.5f}'.format(f1_1, f1_2)) 650 | # print('train -acc_1:{} -acc_2:{} ----val -acc_1:{} -acc_2:{}'.format(t_acc_1,t_acc_2,acc_1,acc_2)) 651 | 652 | f_mean = f1_1 * 0.4 + f1_2 * 0.6 653 | if f_mean >= self.best: 654 | self.best = f_mean 655 | self.model.save_weights(save_model_path + save_mdoel_name_pre + '{}.hdf5'.format(self.tag)) 656 | print('f_mean : {:.5f} best f_mean :{:.5f}'.format(f_mean, self.best)) 657 | 658 | 659 | 660 | #########################################训练部分######################################## 661 | 662 | def run_cv(nfold, data, data_label, data_test): 663 | f1_1_list = [] 664 | f1_2_list = [] 665 | 666 | val_best_threshold1 = [] # 预测1 667 | val_best_threshold2 = [] # 预测2 668 | kf = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=2019).split(data, train['entity_label_position3']) 669 | train_model_pred1 = np.zeros((len(data), 1)) 670 | train_model_pred2 = np.zeros((len(data), 1)) 671 | test_model_preds1 = np.zeros((len(data_test), nfold)) 672 | test_model_preds2 = np.zeros((len(data_test), nfold)) 673 | 674 | for i, (train_fold, test_fold) in enumerate(kf): 675 | # if i<2: 676 | # continue 677 | X_train, X_valid, = data[train_fold, :], data[test_fold, :] 678 | print(X_train.shape) 679 | print(X_valid.shape) 680 | 681 | print(train_fold[:20]) 682 | print(test_fold[:20]) 683 | 684 | print('*' * 50, i + 1, '*' * 50) 685 | model = build_bert(1) 686 | # early_stopping = EarlyStopping(monitor='val_loss', patience=3) # val_acc 687 | # plateau = ReduceLROnPlateau(monitor="val_loss", verbose=1, mode='min', factor=0.5, patience=1) 688 | # checkpoint = ModelCheckpoint('./model/' + str(i) + '.hdf5', monitor='val_loss', 689 | # verbose=2, save_best_only=True, mode='min',save_weights_only=True) 690 | 691 | train_D = data_generator(X_train, shuffle=True) 692 | valid_D = data_generator(X_valid, shuffle=False) 693 | test_D = data_generator(data_test, shuffle=False) 694 | 695 | evaluator = Evaluate(X_train, X_valid, i + 1) 696 | 697 | model.fit_generator( 698 | train_D.__iter__(), 699 | steps_per_epoch=len(train_D), 700 | validation_data=valid_D.__iter__(), 701 | validation_steps=len(valid_D), 702 | epochs=1, 703 | callbacks=[evaluator], 704 | verbose=2 705 | ) 706 | 707 | model.load_weights(save_model_path + save_mdoel_name_pre + '{}.hdf5'.format(i + 1)) 708 | 709 | # return model 710 | val_1, val_2 = 
711 | 
712 |         # save out-of-fold predictions
713 |         train_model_pred1[test_fold, :] = val_1
714 |         train_model_pred2[test_fold, :] = val_2
715 | 
716 |         val_y1 = [int(i) for i in X_valid[:, 1]]
717 |         val_y2 = [int(i) for i in X_valid[:, 2]]
718 |         best_threshold1, f1_1 = get_best_F1_score(val_1, val_y1)
719 |         best_threshold2, f1_2 = get_best_F1_score(val_2, val_y2)
720 | 
721 |         val_best_threshold1.append(best_threshold1)
722 |         val_best_threshold2.append(best_threshold2)
723 | 
724 |         f1_1_list.append(f1_1)
725 |         f1_2_list.append(f1_2)
726 | 
727 |         print('validate score -f1_1:{:.5f} -val_best_threshold1:{:.5f} -f1_2:{:.5f} -val_best_threshold2:{:.5f}'.format(
728 |             f1_1_list[-1], val_best_threshold1[-1], f1_2_list[-1], val_best_threshold2[-1]))
729 | 
730 |         # predict on the test set
731 |         t1, t2 = model.predict_generator(test_D.__iter__(), steps=len(test_D), verbose=0)
732 |         for j in range(len(t1)):
733 |             test_model_preds1[j, i] = t1[j]
734 |             test_model_preds2[j, i] = t2[j]
735 | 
736 |         del model
737 |         gc.collect()
738 |         K.clear_session()
739 |         print('*' * 50, i + 1, '-done-', '*' * 45)
740 | 
741 |     return train_model_pred1, train_model_pred2, test_model_preds1, test_model_preds2, val_best_threshold1, val_best_threshold2
742 | 
743 | train_model_pred1, train_model_pred2, test_pred_list1, test_pred_list2, val_best_threshold1, val_best_threshold2 = run_cv(5, DATA_LIST, None, DATA_LIST_TEST)
744 | 
745 | ######################################## Prediction ########################################
746 | 'best train F1 and threshold for the negative head'
747 | train_best_threshold1, train_f1 = get_best_F1_score(train_model_pred1, train['negative'].values)
748 | print('train best_f1_1:', train_f1)
749 | print('train best_threshold_1:', train_best_threshold1)
750 | 
751 | 'best train F1 and threshold for the key-entity head'
752 | train_best_threshold2, train_f2 = get_best_F1_score(train_model_pred2, train['label'].values)
753 | print('train best_f1_2:', train_f2)
754 | print('train best_threshold_2:', train_best_threshold2)
755 | 
756 | from sklearn.metrics import f1_score, accuracy_score
757 | 
758 | 'out-of-fold scores over the 5 folds'
759 | pred1 = [int(index[0] > 0.5) for index in train_model_pred1]
760 | pred2 = [int(index[0] > 0.5) for index in train_model_pred2]
761 | true1 = [int(index) for index in DATA_LIST[:, 1]]
762 | true2 = [int(index) for index in DATA_LIST[:, 2]]
763 | 
764 | print('f1_1:{}'.format(f1_score(true1, pred1)))
765 | print('f1_2:{}'.format(f1_score(true2, pred2)))
766 | 
767 | print('acc_1:{}'.format(accuracy_score(true1, pred1)))
768 | print('acc_2:{}'.format(accuracy_score(true2, pred2)))
769 | 
770 | # save predicted probabilities
771 | train['pred1'] = train_model_pred1[:, 0]
772 | train['pred2'] = train_model_pred2[:, 0]
773 | train.to_csv('{}_train_preds.csv'.format(save_mdoel_name_pre), index=False)
774 | 
775 | # simulate the online score offline
776 | train['preb1'] = train_model_pred1[:, 0]
777 | train['preb2'] = train_model_pred2[:, 0]
778 | df = train[['id', 'negative', 'label', 'preb1', 'preb2', 'entity_label']]
779 | df['n_pred'] = df['preb1'].apply(lambda x: int(x > 0.5))
780 | df['l_pred'] = df['preb2'].apply(lambda x: int(x > 0.5))
781 | 
782 | train_or = pd.read_csv(train_path, encoding='utf-8')
783 | train2 = pd.read_csv(train2_path, encoding='utf-8')
784 | train_or = pd.concat([train_or, train2], axis=0, sort=True)
785 | train_or = train_or[train_or['entity'].notnull()]
786 | 
787 | # pred1
788 | temp = df[['id', 'n_pred']].groupby('id')['n_pred'].agg(lambda x: list(x))
789 | train_or['n_pred_list'] = train_or['id'].map(temp)
790 | train_or = train_or[train_or['n_pred_list'].notnull()]
791 | 
792 | # entity label list
793 | temp = df[['id', 'entity_label']].groupby('id')['entity_label'].agg(lambda x: list(x))
794 | train_or['entity_label_list'] = train_or['id'].map(temp)
795 | 
796 | # pred2
797 | temp = df[['id', 'l_pred']].groupby('id')['l_pred'].agg(lambda x: list(x))
798 | train_or['l_pred_list'] = train_or['id'].map(temp)
799 | 
800 | train_or['n_pred_type'] = train_or['n_pred_list'].apply(lambda x: len(set(x)))
801 | 
802 | 
803 | def get_negative_and_entity(n_pred_list, l_pred_list, entitys):
804 |     assert len(entitys) == len(n_pred_list)
805 |     sub = []
806 |     neg = 0
807 |     for i in range(len(n_pred_list)):
808 |         if n_pred_list[i] == 1 and l_pred_list[i] == 1:
809 |             sub.append(entitys[i])
810 |             neg = 1
811 |     if len(sub) == 0:
812 |         return np.nan
813 |     return ';'.join(sub)
814 | 
815 | 
816 | train_or['preb_entity'] = list(map(lambda x, y, z: get_negative_and_entity(x, y, z),
817 |                                    train_or['n_pred_list'], train_or['l_pred_list'], train_or['entity_label_list']))
818 | 
819 | train_or['pred_label'] = train_or['preb_entity'].apply(lambda x: 1 if str(x) != 'nan' else 0)
820 | 
821 | 
822 | def get_f1(y1, y2, y1_pred, y2_pred):
823 |     F1_1 = f1_score(y1, y1_pred)
824 |     print('f1_1:{}'.format(F1_1))
825 | 
826 |     'compute the entity-level F1'
827 |     TP = 0
828 |     FN = 0
829 |     FP = 0
830 |     for i in range(len(y2)):
831 |         if str(y2[i]) == 'nan':
832 |             y2_i = set()
833 |         else:
834 |             y2_i = set(y2[i].split(';'))
835 |         if str(y2_pred[i]) == 'nan':
836 |             y2_pred_i = set()
837 |         else:
838 |             y2_pred_i = set(y2_pred[i].split(';'))
839 |         TPi = len(y2_i & y2_pred_i)
840 |         FNi = len(y2_i.difference(y2_pred_i))
841 |         FPi = len(y2_pred_i.difference(y2_i))
842 |         TP += TPi
843 |         FN += FNi
844 |         FP += FPi
845 |     P = TP / (TP + FP)
846 |     R = TP / (TP + FN)
847 |     F1_2 = 2 * P * R / (P + R)
848 |     print('f1_2:{}'.format(F1_2))
849 |     print('score:', 0.4 * F1_1 + 0.6 * F1_2)
850 | 
851 | 
852 | get_f1(train_or['negative'].values, train_or['key_entity'].values, train_or['pred_label'].values,
853 |        train_or['preb_entity'].values)
854 | 
855 | 
856 | pred1_df = np.mean(test_pred_list1, axis=1)
857 | pred2_df = np.mean(test_pred_list2, axis=1)
858 | 
859 | sub = test.copy()
860 | sub['preb1'] = list(pred1_df)
861 | sub['preb2'] = list(pred2_df)
862 | 
863 | 
864 | df = sub[['id', 'preb1', 'preb2', 'entity_label']]
865 | df['n_pred'] = df['preb1'].apply(lambda x: int(x > 0.5))
866 | df['l_pred'] = df['preb2'].apply(lambda x: int(x > 0.5))
867 | 
868 | 
869 | sub.to_csv('{}_test_preds.csv'.format(save_mdoel_name_pre), index=False)
870 | 
871 | 
872 | test_or = pd.read_csv(test_path, encoding='utf-8')
873 | test_or = test_or[test_or['entity'].notnull()]
874 | 
875 | # pred1
876 | temp = df[['id', 'n_pred']].groupby('id')['n_pred'].agg(lambda x: list(x))
877 | test_or['n_pred_list'] = test_or['id'].map(temp)
878 | 
879 | # pred2
880 | temp = df[['id', 'l_pred']].groupby('id')['l_pred'].agg(lambda x: list(x))
881 | test_or['l_pred_list'] = test_or['id'].map(temp)
882 | 
883 | # entitys
884 | temp = df[['id', 'entity_label']].groupby('id')['entity_label'].agg(lambda x: list(x))
885 | test_or['entity_label_list'] = test_or['id'].map(temp)
886 | 
887 | # key_entity
888 | test_or['preb_entity'] = list(map(lambda x, y, z: get_negative_and_entity(x, y, z),
889 |                                   test_or['n_pred_list'], test_or['l_pred_list'], test_or['entity_label_list']))
890 | 
891 | 
892 | test_or['pred_label'] = test_or['preb_entity'].apply(lambda x: 1 if str(x) != 'nan' else 0)
893 | 
894 | test_or = test_or[['id', 'pred_label', 'preb_entity']]
895 | 
896 | test_2 = pd.read_csv(test_path, encoding='utf-8')
897 | submit = test_2[['id']]
898 | submit = submit.merge(test_or, on='id', how='left')
899 | submit.columns = ['id', 'negative', 'key_entity']
900 | submit['negative'] = submit['negative'].apply(lambda x: int(x) if str(x) != 'nan' else 0)
901 | submit['negative'] = submit['negative'].astype('int')
902 | # submit['key_entity'] = np.nan
903 | submit.to_csv('{}_result_mean.csv'.format(save_mdoel_name_pre), index=False)
904 | print(submit.isnull().sum())
905 | 
906 | 
--------------------------------------------------------------------------------
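Note: the cross-validation loop above tunes per-head thresholds with `get_best_F1_score`, but the submission is still binarised at a fixed 0.5. If you want to experiment with the tuned cut-offs instead, a minimal sketch along the following lines should work. It only assumes the `*_train_preds.csv` and `*_test_preds.csv` files written by the script; `prefix` is a stand-in for `save_mdoel_name_pre` and is not part of the original code.

```python
# Sketch only: re-threshold the saved predictions with cut-offs tuned on the
# out-of-fold probabilities, instead of the fixed 0.5 used above.
import pandas as pd
from sklearn.metrics import f1_score

prefix = 'bert'  # placeholder for save_mdoel_name_pre

oof = pd.read_csv('{}_train_preds.csv'.format(prefix))        # out-of-fold train predictions
test_preds = pd.read_csv('{}_test_preds.csv'.format(prefix))  # averaged test probabilities


def best_threshold(pred, true):
    # same 0.01..0.99 grid search as get_best_F1_score above
    grid = [(t, f1_score(true, (pred >= t).astype(int)))
            for t in [i / 100 for i in range(1, 100)]]
    return max(grid, key=lambda x: x[1])[0]


t1 = best_threshold(oof['pred1'].values, oof['negative'].values)  # negative head
t2 = best_threshold(oof['pred2'].values, oof['label'].values)     # key-entity head

# apply the tuned thresholds to the test probabilities
test_preds['n_pred'] = (test_preds['preb1'].values >= t1).astype(int)
test_preds['l_pred'] = (test_preds['preb2'].values >= t2).astype(int)
```

From here the rows can be fed through `get_negative_and_entity` exactly as in the submission-building code above.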