├── BERT ├── model.py └── run.py ├── README.md ├── models ├── ctrNet.py └── model.py ├── picture ├── fusion-layer.png ├── mlm.png ├── model.png └── output.png ├── run.py ├── run.sh └── src ├── data_loader.py ├── extract_features.py ├── merge_submission.py ├── preprocess.py └── w2v.py /BERT/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch 4 | from torch.autograd import Variable 5 | import copy 6 | from transformers.modeling_bert import BertLayerNorm 7 | import torch.nn.functional as F 8 | from torch.nn import CrossEntropyLoss, MSELoss 9 | 10 | 11 | class Model(nn.Module): 12 | def __init__(self, encoder,config,args): 13 | super(Model, self).__init__() 14 | self.encoder = encoder 15 | self.lm_head=[] 16 | self.text_embeddings=nn.Embedding(args.vocab_size_v1,args.vocab_dim_v1) 17 | self.text_embeddings.apply(self._init_weights) 18 | self.text_linear=nn.Linear(args.text_dim+args.vocab_dim_v1*len(args.text_features), config.hidden_size) 19 | self.text_linear.apply(self._init_weights) 20 | for x in args.vocab_size: 21 | self.lm_head.append(nn.Linear(config.hidden_size, x, bias=False)) 22 | self.lm_head=nn.ModuleList(self.lm_head) 23 | self.config=config 24 | self.args=args 25 | 26 | def _init_weights(self, module): 27 | """ Initialize the weights """ 28 | if isinstance(module, (nn.Linear, nn.Embedding)): 29 | # Slightly different from the TF version which uses truncated_normal for initialization 30 | # cf https://github.com/pytorch/pytorch/pull/5617 31 | module.weight.data.normal_(mean=0.0, std=0.02) 32 | 33 | def forward(self, inputs,inputs_ids,masks,labels): 34 | inputs_embedding=self.text_embeddings(inputs_ids).view(inputs.size(0),inputs.size(1),-1) 35 | inputs=torch.cat((inputs.float(),inputs_embedding),-1) 36 | inputs=torch.relu(self.text_linear(inputs)) 37 | outputs = self.encoder(inputs_embeds=inputs,attention_mask=masks.float())[0] 38 | loss=0 39 | for idx,(x,y) in enumerate(zip(self.lm_head,self.args.text_features)): 40 | if y[3] is True: 41 | outputs_tmp=outputs[labels[:,:,idx].ne(-100)] 42 | labels_tmp=labels[:,:,idx] 43 | labels_tmp=labels_tmp[labels_tmp.ne(-100)].long() 44 | prediction_scores = x(outputs_tmp) 45 | loss_fct = CrossEntropyLoss() 46 | masked_lm_loss = loss_fct(prediction_scores, labels_tmp) 47 | loss=loss+masked_lm_loss 48 | return loss 49 | 50 | -------------------------------------------------------------------------------- /BERT/run.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa). 
18 | GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned 19 | using a masked language modeling (MLM) loss. 20 | """ 21 | 22 | from __future__ import absolute_import, division, print_function 23 | from collections import Counter 24 | import argparse 25 | import glob 26 | import logging 27 | import os 28 | import pickle 29 | import random 30 | import re 31 | import shutil 32 | import json 33 | import numpy as np 34 | import torch 35 | import pandas as pd 36 | from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset 37 | from torch.utils.data.distributed import DistributedSampler 38 | 39 | try: 40 | from torch.utils.tensorboard import SummaryWriter 41 | except: 42 | from tensorboardX import SummaryWriter 43 | 44 | from tqdm import tqdm, trange 45 | import multiprocessing 46 | from model import Model 47 | cpu_cont = 4 48 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, 49 | RobertaConfig, RobertaModel, RobertaTokenizer) 50 | 51 | logger = logging.getLogger(__name__) 52 | 53 | MODEL_CLASSES = { 54 | 'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer), 55 | } 56 | 57 | class TextDataset(Dataset): 58 | def __init__(self, args,df,embedding_table): 59 | self.text_features=[df[x[1]].values for x in args.text_features] 60 | self.embedding_table=embedding_table 61 | self.args=args 62 | self.vocab=[list(x) for x in args.vocab] 63 | 64 | def __len__(self): 65 | return len(self.text_features[0]) 66 | 67 | def __getitem__(self, i): 68 | text_features=np.zeros((self.args.block_size,self.args.text_dim)) 69 | text_ids=np.zeros((self.args.block_size,len(self.args.text_features)),dtype=np.int64) 70 | text_masks=np.zeros(self.args.block_size) 71 | text_label=np.zeros((self.args.block_size,len(self.args.text_features)),dtype=np.int64)-100 72 | begin_dim=0 73 | #选择20%的token进行掩码,其中80%设为[mask], 10%设为[UNK],10%随机选择 74 | for idx,x in enumerate(self.args.text_features): 75 | end_dim=begin_dim+x[2] 76 | for word_idx,word in enumerate(self.text_features[idx][i].split()[:self.args.block_size]): 77 | text_masks[word_idx]=1 78 | if random.random() 0: 115 | torch.cuda.manual_seed_all(args.seed) 116 | 117 | def train(args, train_dataset,dev_dataset, model): 118 | #设置dataloader 119 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 120 | train_sampler = RandomSampler(train_dataset) 121 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,num_workers=4) 122 | t_total = args.max_steps 123 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 124 | 125 | #设置优化器 126 | model.to(args.device) 127 | if args.local_rank not in [-1, 0]: 128 | torch.distributed.barrier() 129 | no_decay = ['bias', 'LayerNorm.weight'] 130 | optimizer_grouped_parameters = [ 131 | {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 132 | 'weight_decay': args.weight_decay}, 133 | {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 134 | ] 135 | optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 136 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, 137 | num_training_steps=t_total) 138 | 139 | checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last') 140 | scheduler_last = os.path.join(checkpoint_last, 
'scheduler.pt') 141 | optimizer_last = os.path.join(checkpoint_last, 'optimizer.pt') 142 | if os.path.exists(scheduler_last): 143 | scheduler.load_state_dict(torch.load(scheduler_last, map_location="cpu")) 144 | if os.path.exists(optimizer_last): 145 | optimizer.load_state_dict(torch.load(optimizer_last, map_location="cpu")) 146 | if args.local_rank == 0: 147 | torch.distributed.barrier() 148 | if args.fp16: 149 | try: 150 | from apex import amp 151 | except ImportError: 152 | raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") 153 | model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) 154 | 155 | # 多GPU设置 156 | if args.n_gpu > 1: 157 | model = torch.nn.DataParallel(model) 158 | 159 | 160 | # 训练 161 | logger.info("***** Running training *****") 162 | logger.info(" Num examples = %d", len(train_dataset)* ( 163 | torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 164 | logger.info(" Num Epochs = %d", args.num_train_epochs) 165 | logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) 166 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 167 | args.train_batch_size * args.gradient_accumulation_steps * ( 168 | torch.distributed.get_world_size() if args.local_rank != -1 else 1)) 169 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 170 | logger.info(" Total optimization steps = %d", t_total) 171 | 172 | global_step = args.start_step 173 | tr_loss, logging_loss,avg_loss,tr_nb = 0.0, 0.0,0.0,0 174 | model.zero_grad() 175 | set_seed(args) 176 | 177 | for idx in range(args.start_epoch, int(args.num_train_epochs)): 178 | for step, batch in enumerate(train_dataloader): 179 | inputs,inputs_ids,masks,labels = [x.to(args.device) for x in batch] 180 | model.train() 181 | loss = model(inputs,inputs_ids,masks,labels) 182 | 183 | if args.n_gpu > 1: 184 | loss = loss.mean() 185 | if args.gradient_accumulation_steps > 1: 186 | loss = loss / args.gradient_accumulation_steps 187 | 188 | if args.fp16: 189 | with amp.scale_loss(loss, optimizer) as scaled_loss: 190 | scaled_loss.backward() 191 | torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) 192 | else: 193 | loss.backward() 194 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 195 | 196 | tr_loss += loss.item() 197 | 198 | 199 | 200 | if (step + 1) % args.gradient_accumulation_steps == 0: 201 | optimizer.step() 202 | optimizer.zero_grad() 203 | scheduler.step() 204 | global_step += 1 205 | output_flag=True 206 | avg_loss=round(np.exp((tr_loss - logging_loss) /(global_step- tr_nb)),4) 207 | if global_step %100 == 0: 208 | logger.info(" steps: %s ppl: %s", global_step, round(avg_loss,5)) 209 | if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: 210 | # Log metrics 211 | logging_loss = tr_loss 212 | tr_nb=global_step 213 | 214 | #验证 215 | if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: 216 | checkpoint_prefix = 'checkpoint' 217 | if args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well 218 | results = evaluate(args, model, dev_dataset) 219 | for key, value in results.items(): 220 | logger.info(" %s = %s", key, round(value,4)) 221 | # Save model checkpoint 222 | output_dir = os.path.join(args.output_dir, '{}-{}-{}'.format(checkpoint_prefix, 
global_step,round(results['perplexity'],4))) 223 | 224 | if not os.path.exists(output_dir): 225 | os.makedirs(output_dir) 226 | 227 | #保存模型 228 | model_to_save = model.module.encoder if hasattr(model,'module') else model.encoder # Take care of distributed/parallel training 229 | model_to_save.save_pretrained(output_dir) 230 | logger.info("Saving model checkpoint to %s", output_dir) 231 | 232 | logger.info("Saving linear to %s",os.path.join(args.output_dir, "linear.bin")) 233 | model_to_save_linear = model.module.text_linear if hasattr(model, 'module') else model.text_linear 234 | torch.save(model_to_save_linear.state_dict(), os.path.join(output_dir, "linear.bin")) 235 | logger.info("Saving embeddings to %s",os.path.join(args.output_dir, "embeddings.bin")) 236 | model_to_save_embeddings = model.module.text_embeddings if hasattr(model, 'module') else model.text_embeddings 237 | torch.save(model_to_save_embeddings.state_dict(), os.path.join(output_dir, "embeddings.bin")) 238 | 239 | 240 | last_output_dir = os.path.join(args.output_dir, 'checkpoint-last') 241 | if not os.path.exists(last_output_dir): 242 | os.makedirs(last_output_dir) 243 | model_to_save.save_pretrained(last_output_dir) 244 | logger.info("Saving linear to %s",os.path.join(last_output_dir, "linear.bin")) 245 | model_to_save_linear = model.module.text_linear if hasattr(model, 'module') else model.text_linear 246 | torch.save(model_to_save_linear.state_dict(), os.path.join(last_output_dir, "linear.bin")) 247 | logger.info("Saving embeddings to %s",os.path.join(last_output_dir, "embeddings.bin")) 248 | model_to_save_embeddings = model.module.text_embeddings if hasattr(model, 'module') else model.text_embeddings 249 | torch.save(model_to_save_embeddings.state_dict(), os.path.join(last_output_dir, "embeddings.bin")) 250 | logger.info("Saving model to %s",os.path.join(last_output_dir, "model.bin")) 251 | model_to_save = model.module if hasattr(model, 'module') else model 252 | torch.save(model_to_save.state_dict(), os.path.join(last_output_dir, "model.bin")) 253 | 254 | 255 | idx_file = os.path.join(last_output_dir, 'idx_file.txt') 256 | with open(idx_file, 'w', encoding='utf-8') as idxf: 257 | idxf.write(str( idx) + '\n') 258 | torch.save(optimizer.state_dict(), os.path.join(last_output_dir, "optimizer.pt")) 259 | torch.save(scheduler.state_dict(), os.path.join(last_output_dir, "scheduler.pt")) 260 | logger.info("Saving optimizer and scheduler states to %s", last_output_dir) 261 | 262 | step_file = os.path.join(last_output_dir, 'step_file.txt') 263 | with open(step_file, 'w', encoding='utf-8') as stepf: 264 | stepf.write(str(global_step) + '\n') 265 | if args.max_steps > 0 and global_step > args.max_steps: 266 | break 267 | if args.max_steps > 0 and global_step > args.max_steps: 268 | break 269 | 270 | 271 | 272 | def evaluate(args, model, eval_dataset): 273 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 274 | eval_sampler = SequentialSampler(eval_dataset) 275 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,num_workers=4) 276 | 277 | eval_loss = 0.0 278 | nb_eval_steps = 0 279 | model.eval() 280 | for batch in eval_dataloader: 281 | inputs,inputs_ids,masks,labels = [x.to(args.device) for x in batch] 282 | with torch.no_grad(): 283 | lm_loss = model(inputs,inputs_ids,masks,labels) 284 | eval_loss += lm_loss.mean().item() 285 | nb_eval_steps += 1 286 | 287 | eval_loss = eval_loss / nb_eval_steps 288 | perplexity = torch.exp(torch.tensor(eval_loss)) 289 | 
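# Note: eval_loss is the per-batch average of the summed masked-LM cross-entropy over all
# masked feature fields (see Model.forward in BERT/model.py), so this "perplexity" is exp()
# of that combined loss rather than a per-field perplexity.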
290 | result = { 291 | "perplexity": float(perplexity) 292 | } 293 | 294 | return result 295 | 296 | 297 | 298 | 299 | 300 | def main(): 301 | parser = argparse.ArgumentParser() 302 | 303 | ## Required parameters 304 | parser.add_argument("--output_dir", default=None, type=str, required=True, 305 | help="The output directory where the model predictions and checkpoints will be written.") 306 | 307 | ## Other parameters 308 | parser.add_argument("--eval_data_file", default=None, type=str, 309 | help="An optional input evaluation data file to evaluate the perplexity on (a text file).") 310 | 311 | parser.add_argument("--model_type", default="bert", type=str, 312 | help="The model architecture to be fine-tuned.") 313 | parser.add_argument("--model_name_or_path", default=None, type=str, 314 | help="The model checkpoint for weights initialization.") 315 | 316 | parser.add_argument("--mlm", action='store_true', 317 | help="Train with masked-language modeling loss instead of language modeling.") 318 | parser.add_argument("--mlm_probability", type=float, default=0.15, 319 | help="Ratio of tokens to mask for masked language modeling loss") 320 | 321 | parser.add_argument("--config_name", default="", type=str, 322 | help="Optional pretrained config name or path if not the same as model_name_or_path") 323 | parser.add_argument("--tokenizer_name", default="", type=str, 324 | help="Optional pretrained tokenizer name or path if not the same as model_name_or_path") 325 | parser.add_argument("--cache_dir", default="", type=str, 326 | help="Optional directory to store the pre-trained models downloaded from s3 (instread of the default one)") 327 | parser.add_argument("--block_size", default=-1, type=int, 328 | help="Optional input sequence length after tokenization." 329 | "The training dataset will be truncated in block of this size for training." 330 | "Default to the model max input length for single sentence inputs (take into account special tokens).") 331 | parser.add_argument("--dfg_size", default=64, type=int, 332 | help="Optional input sequence length after tokenization." 333 | "The training dataset will be truncated in block of this size for training." 
334 | "Default to the model max input length for single sentence inputs (take into account special tokens).") 335 | parser.add_argument("--do_train", action='store_true', 336 | help="Whether to run training.") 337 | parser.add_argument("--do_eval", action='store_true', 338 | help="Whether to run eval on the dev set.") 339 | parser.add_argument("--evaluate_during_training", action='store_true', 340 | help="Run evaluation during training at each logging step.") 341 | parser.add_argument("--do_lower_case", action='store_true', 342 | help="Set this flag if you are using an uncased model.") 343 | 344 | parser.add_argument("--per_gpu_train_batch_size", default=4, type=int, 345 | help="Batch size per GPU/CPU for training.") 346 | parser.add_argument("--per_gpu_eval_batch_size", default=4, type=int, 347 | help="Batch size per GPU/CPU for evaluation.") 348 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 349 | help="Number of updates steps to accumulate before performing a backward/update pass.") 350 | parser.add_argument("--learning_rate", default=5e-5, type=float, 351 | help="The initial learning rate for Adam.") 352 | parser.add_argument("--weight_decay", default=0.0, type=float, 353 | help="Weight deay if we apply some.") 354 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 355 | help="Epsilon for Adam optimizer.") 356 | parser.add_argument("--max_grad_norm", default=1.0, type=float, 357 | help="Max gradient norm.") 358 | parser.add_argument("--num_train_epochs", default=1.0, type=float, 359 | help="Total number of training epochs to perform.") 360 | parser.add_argument("--max_steps", default=-1, type=int, 361 | help="If > 0: set total number of training steps to perform. Override num_train_epochs.") 362 | parser.add_argument("--warmup_steps", default=0, type=int, 363 | help="Linear warmup over warmup_steps.") 364 | 365 | parser.add_argument('--logging_steps', type=int, default=50, 366 | help="Log every X updates steps.") 367 | parser.add_argument('--save_steps', type=int, default=50, 368 | help="Save checkpoint every X updates steps.") 369 | parser.add_argument('--save_total_limit', type=int, default=None, 370 | help='Limit the total amount of checkpoints, delete the older checkpoints in the output_dir, does not delete by default') 371 | parser.add_argument("--eval_all_checkpoints", action='store_true', 372 | help="Evaluate all checkpoints starting with the same prefix as model_name_or_path ending and ending with step number") 373 | parser.add_argument("--no_cuda", action='store_true', 374 | help="Avoid using CUDA when available") 375 | parser.add_argument('--overwrite_output_dir', action='store_true', 376 | help="Overwrite the content of the output directory") 377 | parser.add_argument('--overwrite_cache', action='store_true', 378 | help="Overwrite the cached training and evaluation sets") 379 | parser.add_argument('--seed', type=int, default=42, 380 | help="random seed for initialization") 381 | 382 | parser.add_argument('--fp16', action='store_true', 383 | help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") 384 | parser.add_argument('--fp16_opt_level', type=str, default='O1', 385 | help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." 
386 | "See details at https://nvidia.github.io/apex/amp.html") 387 | parser.add_argument("--local_rank", type=int, default=-1, 388 | help="For distributed training: local_rank") 389 | parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") 390 | parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") 391 | 392 | parser.add_argument('--log_file', type=str, default='') 393 | parser.add_argument('--tensorboard_dir', type=str) 394 | parser.add_argument('--lang', type=str) 395 | parser.add_argument('--pretrain', type=str, default='') 396 | args = parser.parse_args() 397 | pool = None 398 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 399 | args.n_gpu = torch.cuda.device_count() 400 | args.device = device 401 | 402 | # 设置log信息 403 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 404 | datefmt='%m/%d/%Y %H:%M:%S', 405 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 406 | logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", 407 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) 408 | 409 | 410 | # 设置随机种子 411 | set_seed(args) 412 | 413 | # 判断是否有checkpoint,从而继续预训练 414 | args.start_epoch = 0 415 | args.start_step = 0 416 | checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last') 417 | if os.path.exists(checkpoint_last) and os.listdir(checkpoint_last): 418 | args.model_name_or_path = os.path.join(checkpoint_last, 'pytorch_model.bin') 419 | args.config_name = os.path.join(checkpoint_last, 'config.json') 420 | step_file = os.path.join(checkpoint_last, 'step_file.txt') 421 | if os.path.exists(step_file): 422 | with open(step_file, encoding='utf-8') as stepf: 423 | args.start_step = int(stepf.readlines()[0].strip()) 424 | 425 | logger.info("reload model from {}, resume from {} epoch".format(checkpoint_last, args.start_epoch)) 426 | 427 | 428 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] 429 | 430 | 431 | base_path="../data" 432 | text_features=[ 433 | [base_path+"/sequence_text_user_id_product_id.128d",'sequence_text_user_id_product_id',128,True], 434 | [base_path+"/sequence_text_user_id_ad_id.128d",'sequence_text_user_id_ad_id',128,True], 435 | [base_path+"/sequence_text_user_id_creative_id.128d",'sequence_text_user_id_creative_id',128,True], 436 | [base_path+"/sequence_text_user_id_advertiser_id.128d",'sequence_text_user_id_advertiser_id',128,True], 437 | [base_path+"/sequence_text_user_id_industry.128d",'sequence_text_user_id_industry',128,True], 438 | [base_path+"/sequence_text_user_id_product_category.128d",'sequence_text_user_id_product_category',128,True], 439 | [base_path+"/sequence_text_user_id_time.128d",'sequence_text_user_id_time',128,True], 440 | [base_path+"/sequence_text_user_id_click_times.128d",'sequence_text_user_id_click_times',128,True], 441 | ] 442 | 443 | #读取训练数据 444 | train_df=pd.read_pickle(os.path.join(base_path,'train_user.pkl')) 445 | test_df=pd.read_pickle(os.path.join(base_path,'test_user.pkl')) 446 | dev_data=train_df.iloc[-10000:] 447 | train_data=train_df.iloc[:-10000].append(test_df) 448 | 449 | #创建输入端的词表,每个域最多保留10w个id 450 | try: 451 | dic=pickle.load(open(os.path.join(args.output_dir, 'vocab.pkl'),'rb')) 452 | except: 453 | dic={} 454 | dic['pad']=0 455 | dic['mask']=1 456 | dic['unk']=2 457 | for feature in text_features: 458 | conter=Counter() 459 | for item in 
train_df[feature[1]].values: 460 | for word in item.split(): 461 | try: 462 | conter[(feature[1],word)]+=1 463 | except: 464 | conter[(feature[1],word)]=1 465 | most_common=conter.most_common(100000) 466 | cont=0 467 | for x in most_common: 468 | if x[1]>5: 469 | dic[x[0]]=len(dic) 470 | cont+=1 471 | if cont<10: 472 | print(x[0],dic[x[0]]) 473 | print(cont) 474 | 475 | #读取或重新创建BERT 476 | if args.model_name_or_path is not None: 477 | config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, 478 | cache_dir=args.cache_dir if args.cache_dir else None) 479 | model = model_class.from_pretrained(args.model_name_or_path, 480 | from_tf=bool('.ckpt' in args.model_name_or_path), 481 | config=config, 482 | cache_dir=args.cache_dir if args.cache_dir else None) 483 | args.text_dim=config.hidden_size 484 | else: 485 | config = RobertaConfig() 486 | config.num_hidden_layers=12 487 | config.hidden_size=512 488 | config.intermediate_size=config.hidden_size*4 489 | config.num_attention_heads=16 490 | config.vocab_size=5 491 | model = model_class(config) 492 | config.vocab_size_v1=len(dic) 493 | config.vocab_dim_v1=64 494 | logger.info("%s",config) 495 | logger.info("Training/evaluation parameters %s", args) 496 | 497 | #保存输入端词表 498 | args.vocab_dic=dic 499 | pickle.dump(dic,open(os.path.join(args.output_dir, 'vocab.pkl'),'wb')) 500 | 501 | #读取word embedding 502 | import gensim 503 | embedding_table=[] 504 | for x in text_features: 505 | print(x) 506 | embedding_table.append(pickle.load(open(x[0],'rb'))) 507 | 508 | #创建输出端词表,每个域最多保留10w个id 509 | vocab=[] 510 | for feature in text_features: 511 | conter=Counter() 512 | for item in train_data[feature[1]].values: 513 | for word in item.split(): 514 | try: 515 | conter[word]+=1 516 | except: 517 | conter[word]=1 518 | most_common=conter.most_common(100000) 519 | dic={} 520 | for idx,x in enumerate(most_common): 521 | dic[x[0]]=idx+1 522 | vocab.append(dic) 523 | 524 | #设置参数 525 | args.vocab_size_v1=config.vocab_size_v1 526 | args.vocab_dim_v1=config.vocab_dim_v1 527 | args.vocab=vocab 528 | args.text_dim=sum([x[2] for x in text_features]) 529 | args.text_features=text_features 530 | train_dataset=TextDataset(args,train_data,embedding_table) 531 | dev_dataset=TextDataset(args,dev_data,embedding_table) 532 | args.vocab_size=[len(x)+1 for x in vocab] 533 | #创建模型 534 | model=Model(model,config,args) 535 | #如果有checkpoint,读取checkpoint 536 | if os.path.exists(checkpoint_last) and os.listdir(checkpoint_last): 537 | logger.info("Load model from %s",os.path.join(checkpoint_last, "model.bin")) 538 | model.load_state_dict(torch.load(os.path.join(checkpoint_last, "model.bin"))) 539 | 540 | #训练 541 | train(args, train_dataset,dev_dataset, model) 542 | 543 | 544 | if __name__ == "__main__": 545 | main() 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 赛题介绍-广告受众基础属性预估 2 | 3 | 比赛将为参赛者提供一组用户在长度为 91 天(3 个月)的时间窗口内的广告点击历史记录作为训练数据集。每条记录中包含了日期 (从 1 到 91)、用户信息 (年龄,性别),被点击的广告的信息(素材 id、广告 id、产品 id、产品类目 id、广告主 id、广告主行业 id 等),以及该用户当天点击该广告的次数。测试数据集将会是另一组用户 的广告点击历史记录。提供给参赛者的测试数据集中不会包含这些用户的年龄和性别信息。 本赛题要求参赛者预测测试数据集中出现的用户的年龄和性别。 4 | 5 | ### 1. 环境配置 6 | 7 | - Pytorch 8 | - Linux Ubuntu 16.04, 256G内存,4*p100 9 | - pip install transformers==2.8.0 pandas gensim scikit-learn filelock gdown 10 | 11 | ### 2. 
Model Overview
12 | 
13 | ![avatar](picture/model.png)
14 | ![avatar](picture/mlm.png)
15 | ![avatar](picture/fusion-layer.png)
16 | ![avatar](picture/output.png)
17 | 
18 | ### 3. Suggestions for Limited Resources
19 | 
20 | 
21 | 1) If RAM is insufficient, or you only want a quick end-to-end run of the code, use only the preliminary-round data:
22 | 
23 | Remove lines 8, 15, and 22 of src/preprocess.py
24 | 
25 | 2) If GPU memory is insufficient, download the bert-small model from Section 10 and adjust the batch size.
26 | 
27 | ### 4. Running the Full Pipeline
28 | 
29 | Run the following script to execute the whole pipeline and generate the results, or follow the later sections step by step.
30 | 
31 | ```shell
32 | bash run.sh
33 | ```
34 | 
35 | ### 5. Data Download
36 | 
37 | Download the dataset into the data directory from this [site](https://drive.google.com/file/d/15onAobxlim_uRUNWSMQuK6VxDsmGTtp4/view?usp=sharing), or run the following commands:
38 | 
39 | ```shell
40 | gdown https://drive.google.com/uc?id=15onAobxlim_uRUNWSMQuK6VxDsmGTtp4
41 | unzip data.zip
42 | rm data.zip
43 | ```
44 | 
45 | ### 6. Data Preprocessing
46 | 
47 | Merge all raw files and split them into a click-log file (click.pkl) and user files (train_user.pkl/test_user.pkl):
48 | 
49 | ```shell
50 | python src/preprocess.py
51 | ```
52 | 
53 | ### 7. Feature Extraction
54 | 
55 | ```shell
56 | python src/extract_features.py
57 | ```
58 | 
59 | ### 8. Pre-training Word2Vector and BERT
60 | 
61 | There are two ways to obtain the pre-trained weights: pre-train them from scratch, or download the ready-made weights.
62 | 
63 | Note: the Word2Vector and BERT weights must match, i.e. either pre-train both from scratch or download both.
64 | 
65 | #### 1) Pre-training Word2Vector
66 | 
67 | Pre-train the word2vector embeddings:
68 | 
69 | ```shell
70 | python src/w2v.py
71 | ```
72 | 
73 | Or download the pre-trained [W2V](https://drive.google.com/file/d/1SUpukAeXR5Ymyf3wH3SRNdQ3Hl2HazQa/view?usp=sharing) weights:
74 | 
75 | ```shell
76 | gdown https://drive.google.com/uc?id=1SUpukAeXR5Ymyf3wH3SRNdQ3Hl2HazQa
77 | unzip w2v.zip
78 | cp w2v/* data/
79 | rm -r w2v*
80 | ```
81 | 
82 | #### 2) Pre-training BERT
83 | 
84 | Pre-train BERT (on V100 GPUs you can install apex and add --fp16 to the arguments to speed up training):
85 | 
86 | ```shell
87 | cd BERT
88 | mkdir saved_models
89 | python run.py \
90 | --output_dir saved_models \
91 | --model_type roberta \
92 | --config_name roberta-base \
93 | --mlm \
94 | --block_size 128 \
95 | --per_gpu_train_batch_size 64 \
96 | --per_gpu_eval_batch_size 64 \
97 | --gradient_accumulation_steps 1 \
98 | --learning_rate 5e-5 \
99 | --weight_decay 0.01 \
100 | --adam_epsilon 1e-6 \
101 | --max_grad_norm 1.0 \
102 | --max_steps 100000 \
103 | --mlm_probability 0.2 \
104 | --warmup_steps 10000 \
105 | --logging_steps 50 \
106 | --save_steps 10000 \
107 | --evaluate_during_training \
108 | --save_total_limit 500 \
109 | --seed 123456 \
110 | --tensorboard_dir saved_models/tensorboard_logs
111 | rm -r saved_models/bert-base
112 | cp -r saved_models/checkpoint-last saved_models/bert-base
113 | rm saved_models/bert-base/optimizer.pt
114 | cp saved_models/vocab.pkl saved_models/bert-base/vocab.pkl
115 | cd ..
116 | ```
117 | 
118 | Or download the pre-trained [BERT-base](https://drive.google.com/file/d/1ToAJwl_oRAeRNyYF_FK0B2APVXlPFTlq/view?usp=sharing) weights:
119 | 
120 | ```shell
121 | gdown https://drive.google.com/uc?id=1ToAJwl_oRAeRNyYF_FK0B2APVXlPFTlq
122 | unzip bert-base.zip
123 | mv bert-base BERT/
124 | rm bert-base.zip
125 | ```
126 | 
127 | ### 9. Training the Model
128 | 
129 | ```shell
130 | mkdir saved_models
131 | mkdir saved_models/log
132 | for((i=0;i<5;i++));
133 | do
134 | python run.py \
135 | --kfold=5 \
136 | --index=$i \
137 | --train_batch_size=256 \
138 | --eval_steps=5000 \
139 | --max_len_text=128 \
140 | --epoch=5 \
141 | --lr=1e-4 \
142 | --output_path=saved_models \
143 | --pretrained_model_path=BERT/bert-base \
144 | --eval_batch_size=512 2>&1 | tee saved_models/log/$i.txt
145 | done
146 | ```
147 | 
148 | Merge the per-fold results into submission.csv:
149 | 
150 | ```shell
151 | python src/merge_submission.py
152 | ```
153 | 
154 | ### 10. 
不同规模的预训练模型 155 | 156 | 由于此次比赛融合了不同规模大小的预训练模型,在此也提供不同规模的预训练模型: 157 | 158 | [BERT-small](https://drive.google.com/file/d/1bDneO-YhBs5dx-9qC-WrBf3jUc_QCIYn/view?usp=sharing), [BERT-base](https://drive.google.com/file/d/1ToAJwl_oRAeRNyYF_FK0B2APVXlPFTlq/view?usp=sharing), [BERT-large](https://drive.google.com/file/d/1yQeh3O6E_98srPqTVwAnVbr1v-X0A7R-/view?usp=sharing), [BERT-xl](https://drive.google.com/file/d/1jViHtyljOJxxeOBmxn9tOZg_hmWOj0L2/view?usp=sharing) 159 | 160 | 其中bert-base效果最好 161 | 162 | ```shell 163 | #bert-small 164 | gdown https://drive.google.com/uc?id=1bDneO-YhBs5dx-9qC-WrBf3jUc_QCIYn 165 | #bert-base 166 | gdown https://drive.google.com/uc?id=1ToAJwl_oRAeRNyYF_FK0B2APVXlPFTlq 167 | #bert-large 168 | gdown https://drive.google.com/uc?id=1yQeh3O6E_98srPqTVwAnVbr1v-X0A7R- 169 | #bert-xl 170 | gdown https://drive.google.com/uc?id=1jViHtyljOJxxeOBmxn9tOZg_hmWOj0L2 171 | ``` 172 | -------------------------------------------------------------------------------- /models/ctrNet.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import torch 3 | import os 4 | import logging 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import torch.nn as nn 9 | from models.model import Model 10 | from torch.nn import CrossEntropyLoss, MSELoss 11 | from torch.utils.data import DataLoader,SequentialSampler, RandomSampler 12 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, 13 | RobertaConfig, RobertaModel) 14 | import random 15 | logger = logging.getLogger(__name__) 16 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 17 | datefmt='%m/%d/%Y %H:%M:%S', 18 | level=logging.INFO) 19 | #设置随机种子 20 | def set_seed(args): 21 | random.seed(args.seed) 22 | np.random.seed(args.seed) 23 | torch.manual_seed(args.seed) 24 | if args.n_gpu > 0: 25 | torch.cuda.manual_seed_all(args.seed) 26 | 27 | 28 | class ctrNet(nn.Module): 29 | def __init__(self,args): 30 | super(ctrNet, self).__init__() 31 | #设置GPU和创建模型 32 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 33 | args.n_gpu = torch.cuda.device_count() 34 | args.device = device 35 | logger.info(" device: %s, n_gpu: %s",device, args.n_gpu) 36 | model=Model(args) 37 | model.to(args.device) 38 | self.model=model 39 | self.args=args 40 | set_seed(args) 41 | 42 | def train(self,train_dataset,dev_dataset=None): 43 | args=self.args 44 | #设置dataloader 45 | train_sampler = RandomSampler(train_dataset) 46 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,num_workers=4) 47 | args.max_steps=args.epoch*len( train_dataloader) 48 | args.save_steps=len( train_dataloader)//10 49 | args.warmup_steps=len( train_dataloader) 50 | args.logging_steps=len( train_dataloader) 51 | args.num_train_epochs=args.epoch 52 | #设置优化器 53 | optimizer = AdamW(self.model.parameters(), lr=args.lr, eps=1e-8,weight_decay=0.08) 54 | scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(len(train_dataloader)*args.num_train_epochs*0.2), num_training_steps=int(len(train_dataloader)*args.num_train_epochs)) 55 | #多GPU设置 56 | if args.n_gpu > 1: 57 | self.model = torch.nn.DataParallel(self.model) 58 | model=self.model 59 | #开始训练 60 | logger.info("***** Running training *****") 61 | logger.info(" Num examples = %d", len(train_dataset)) 62 | logger.info(" Num Epochs = %d", args.num_train_epochs) 63 | if args.n_gpu!=0: 64 | logger.info(" Instantaneous batch size per GPU = 
%d", args.train_batch_size//args.n_gpu) 65 | logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", 66 | args.train_batch_size) 67 | logger.info(" Total optimization steps = %d", args.max_steps) 68 | 69 | global_step = 0 70 | tr_loss, best_age_acc,best_gender_acc,avg_loss,tr_nb = 0.0,0.0, 0.0,0.0,0.0 71 | model.zero_grad() 72 | patience=0 73 | for idx in range(args.num_train_epochs): 74 | tr_num=0 75 | train_loss=0 76 | for step, batch in enumerate(train_dataloader): 77 | #forward和backward 78 | labels,dense_features,text_features,text_ids,text_masks,text_features_1,text_masks_1=(x.to(args.device) for x in batch) 79 | del batch 80 | model.train() 81 | loss = model(dense_features,text_features,text_ids,text_masks,text_features_1,text_masks_1,labels) 82 | if args.n_gpu > 1: 83 | loss = loss.mean() # mean() to average on multi-gpu parallel training 84 | loss.backward() 85 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 86 | tr_loss += loss.item() 87 | tr_num+=1 88 | train_loss+=loss.item() 89 | #输出log 90 | if avg_loss==0: 91 | avg_loss=tr_loss 92 | avg_loss=round(train_loss/tr_num,5) 93 | if (step+1) % args.display_steps == 0: 94 | logger.info(" epoch {} step {} loss {}".format(idx,step+1,avg_loss)) 95 | #update梯度 96 | optimizer.step() 97 | optimizer.zero_grad() 98 | scheduler.step() 99 | global_step += 1 100 | 101 | #测试验证结果 102 | if (step+1) % args.eval_steps == 0 and dev_dataset is not None: 103 | #输出验证集性别和年龄的概率 104 | age_probs,gender_probs = self.infer(dev_dataset) 105 | #输出性别和年龄的loss和acc 106 | age_results= self.eval(dev_dataset.df['age'].values,age_probs) 107 | gender_results= self.eval(dev_dataset.df['gender'].values,gender_probs) 108 | results={} 109 | results['eval_age_loss']=age_results['eval_loss'] 110 | results['eval_gender_loss']=gender_results['eval_loss'] 111 | results['eval_age_acc']=age_results['eval_acc'] 112 | results['eval_gender_acc']=gender_results['eval_acc'] 113 | #打印结果 114 | for key, value in results.items(): 115 | logger.info(" %s = %s", key, round(value,4)) 116 | #保存最好的年龄结果和模型 117 | if results['eval_age_acc']>best_age_acc: 118 | best_age_acc=results['eval_age_acc'] 119 | logger.info(" "+"*"*20) 120 | logger.info(" Best age acc:%s",round(best_age_acc,4)) 121 | logger.info(" "+"*"*20) 122 | try: 123 | os.system("mkdir -p {}".format(args.output_dir)) 124 | except: 125 | pass 126 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 127 | output_model_file = os.path.join(args.output_dir, "pytorch_model_{}.bin".format('age')) 128 | torch.save(model_to_save.state_dict(), output_model_file) 129 | #保存最好的性别结果和模型 130 | if results['eval_gender_acc']>best_gender_acc: 131 | best_gender_acc=results['eval_gender_acc'] 132 | logger.info(" "+"*"*20) 133 | logger.info(" Best gender acc:%s",round(best_gender_acc,4)) 134 | logger.info(" "+"*"*20) 135 | try: 136 | os.system("mkdir -p {}".format(args.output_dir)) 137 | except: 138 | pass 139 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 140 | output_model_file = os.path.join(args.output_dir, "pytorch_model_{}.bin".format('gender')) 141 | torch.save(model_to_save.state_dict(), output_model_file) 142 | logger.info(" best_acc = %s",round(best_age_acc+best_gender_acc,4)) 143 | 144 | #一个epoch结束后,测试验证集结果 145 | if dev_dataset is not None: 146 | #输出验证集性别和年龄的概率 147 | age_probs,gender_probs = self.infer(dev_dataset) 148 | #输出性别和年龄的loss和acc 149 | age_results= 
self.eval(dev_dataset.df['age'].values,age_probs) 150 | gender_results= self.eval(dev_dataset.df['gender'].values,gender_probs) 151 | results={} 152 | results['eval_age_loss']=age_results['eval_loss'] 153 | results['eval_gender_loss']=gender_results['eval_loss'] 154 | results['eval_age_acc']=age_results['eval_acc'] 155 | results['eval_gender_acc']=gender_results['eval_acc'] 156 | #打印结果 157 | for key, value in results.items(): 158 | logger.info(" %s = %s", key, round(value,4)) 159 | #保存最好的年龄结果和模型 160 | if results['eval_age_acc']>best_age_acc: 161 | best_age_acc=results['eval_age_acc'] 162 | logger.info(" "+"*"*20) 163 | logger.info(" Best age acc:%s",round(best_age_acc,4)) 164 | logger.info(" "+"*"*20) 165 | try: 166 | os.system("mkdir -p {}".format(args.output_dir)) 167 | except: 168 | pass 169 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 170 | output_model_file = os.path.join(args.output_dir, "pytorch_model_{}.bin".format('age')) 171 | torch.save(model_to_save.state_dict(), output_model_file) 172 | #保存最好的性别结果和模型 173 | if results['eval_gender_acc']>best_gender_acc: 174 | best_gender_acc=results['eval_gender_acc'] 175 | logger.info(" "+"*"*20) 176 | logger.info(" Best gender acc:%s",round(best_gender_acc,4)) 177 | logger.info(" "+"*"*20) 178 | try: 179 | os.system("mkdir -p {}".format(args.output_dir)) 180 | except: 181 | pass 182 | model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self 183 | output_model_file = os.path.join(args.output_dir, "pytorch_model_{}.bin".format('gender')) 184 | torch.save(model_to_save.state_dict(), output_model_file) 185 | logger.info(" best_acc = %s",round(best_age_acc+best_gender_acc,4)) 186 | 187 | 188 | def infer(self,eval_dataset): 189 | #预测年龄和性别的概率分布 190 | args=self.args 191 | model=self.model 192 | eval_sampler = SequentialSampler(eval_dataset) 193 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,num_workers=4) 194 | eval_loss = 0.0 195 | nb_eval_steps = 0 196 | age_probs=[] 197 | gender_probs=[] 198 | model.eval() 199 | for batch in eval_dataloader: 200 | _,dense_features,text_features,text_ids,text_masks,text_features_1,text_masks_1=(x.to(args.device) for x in batch) 201 | with torch.no_grad(): 202 | probs_1,probs_2 = model(dense_features,text_features,text_ids,text_masks,text_features_1,text_masks_1) 203 | age_probs.append(probs_1.cpu().numpy()) 204 | gender_probs.append(probs_2.cpu().numpy()) 205 | 206 | age_probs=np.concatenate(age_probs,0) 207 | gender_probs=np.concatenate(gender_probs,0) 208 | return age_probs,gender_probs 209 | 210 | def eval(self,labels,preds): 211 | #求出loss和acc 212 | results={} 213 | results['eval_acc']=np.mean(labels==np.argmax(preds,-1)) 214 | from sklearn.metrics import log_loss 215 | results['eval_loss']=log_loss(labels,preds) 216 | return results 217 | 218 | def reload(self,label): 219 | #读取在验证集结果最好的模型 220 | model=self.model 221 | args=self.args 222 | args.load_model_path=os.path.join(args.output_dir, "pytorch_model_{}.bin".format(label)) 223 | logger.info("Load model from %s",args.load_model_path) 224 | model_to_load = model.module if hasattr(model, 'module') else model # Only save the model it-self 225 | model_to_load.load_state_dict(torch.load(args.load_model_path)) 226 | 227 | -------------------------------------------------------------------------------- /models/model.py: -------------------------------------------------------------------------------- 1 | import argparse 2 
| import torch 3 | import os 4 | import logging 5 | import numpy as np 6 | import pandas as pd 7 | from tqdm import tqdm 8 | import torch.nn as nn 9 | from torch.nn import CrossEntropyLoss, MSELoss 10 | from torch.utils.data import DataLoader,SequentialSampler, RandomSampler 11 | from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup, 12 | RobertaConfig, RobertaModel) 13 | logger = logging.getLogger(__name__) 14 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 15 | datefmt='%m/%d/%Y %H:%M:%S', 16 | level=logging.INFO) 17 | 18 | 19 | class Model(nn.Module): 20 | def __init__(self,args): 21 | super(Model, self).__init__() 22 | args.out_size=len(args.dense_features) 23 | self.dropout = nn.Dropout(args.hidden_dropout_prob) 24 | self.args=args 25 | 26 | #创建BERT模型,并且导入预训练模型 27 | config = RobertaConfig.from_pretrained(args.pretrained_model_path) 28 | config.output_hidden_states=True 29 | args.hidden_size=config.hidden_size 30 | args.num_hidden_layers=config.num_hidden_layers 31 | self.text_layer=RobertaModel.from_pretrained(args.pretrained_model_path,config=config) 32 | self.text_linear=nn.Linear(args.text_dim+args.vocab_dim_v1*len(args.text_features), args.hidden_size) 33 | logger.info("Load linear from %s",os.path.join(args.pretrained_model_path, "linear.bin")) 34 | self.text_linear.load_state_dict(torch.load(os.path.join(args.pretrained_model_path, "linear.bin"))) 35 | logger.info("Load embeddings from %s",os.path.join(args.pretrained_model_path, "embeddings.bin")) 36 | self.text_embeddings=nn.Embedding.from_pretrained(torch.load(os.path.join(args.pretrained_model_path, "embeddings.bin"))['weight'],freeze=True) 37 | args.out_size+=args.hidden_size*2 38 | 39 | #创建fusion-layer模型,随机初始化 40 | config = RobertaConfig() 41 | config.num_hidden_layers=4 42 | config.intermediate_size=2048 43 | config.hidden_size=512 44 | config.num_attention_heads=16 45 | config.vocab_size=5 46 | self.text_layer_1=RobertaModel(config=config) 47 | self.text_layer_1.apply(self._init_weights) 48 | self.text_linear_1=nn.Linear(args.text_dim_1+args.hidden_size, 512) 49 | self.text_linear_1.apply(self._init_weights) 50 | self.norm= nn.BatchNorm1d(args.text_dim_1+args.hidden_size) 51 | args.out_size+=1024 52 | 53 | #创建分类器,随机初始化 54 | self.classifier=ClassificationHead(args) 55 | self.classifier.apply(self._init_weights) 56 | 57 | 58 | def _init_weights(self, module): 59 | """ Initialize the weights """ 60 | if isinstance(module, (nn.Linear, nn.Embedding)): 61 | # Slightly different from the TF version which uses truncated_normal for initialization 62 | # cf https://github.com/pytorch/pytorch/pull/5617 63 | module.weight.data.normal_(mean=0.0, std=0.02) 64 | 65 | def forward(self,dense_features,text_features,text_ids,text_masks,text_features_1,text_masks_1,labels=None): 66 | outputs=[] 67 | #获取浮点数,作为分类器的输入 68 | outputs.append(dense_features.float()) 69 | #获取BERT模型的hidden state,并且做max pooling和mean pooling作为分类器的输入 70 | text_masks=text_masks.float() 71 | text_embedding=self.text_embeddings(text_ids).view(text_ids.size(0),text_ids.size(1),-1) 72 | text_features=torch.cat((text_features.float(),text_embedding),-1) 73 | text_features=torch.relu(self.text_linear(self.dropout(text_features))) 74 | hidden_states=self.text_layer(inputs_embeds=text_features,attention_mask=text_masks)[0] 75 | embed_mean=(hidden_states*text_masks.unsqueeze(-1)).sum(1)/text_masks.sum(1).unsqueeze(-1) 76 | embed_mean=embed_mean.float() 77 | embed_max=hidden_states+(1-text_masks).unsqueeze(-1)*(-1e10) 
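# padded positions were just pushed down to -1e10, so the max over the sequence
# dimension on the next line only sees real tokens (masked max pooling)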
78 | embed_max=embed_max.max(1)[0].float() 79 | outputs.append(embed_mean) 80 | outputs.append(embed_max) 81 | #获取fusion-layer的hidden state,并且做max pooling和mean pooling作为分类器的输入 82 | text_masks_1=text_masks_1.float() 83 | text_features_1=torch.cat((text_features_1.float(),hidden_states),-1) 84 | bs,le,dim=text_features_1.size() 85 | text_features_1=self.norm(text_features_1.view(-1,dim)).view(bs,le,dim) 86 | text_features_1=torch.relu(self.text_linear_1(text_features_1)) 87 | hidden_states=self.text_layer_1(inputs_embeds=text_features_1,attention_mask=text_masks_1)[0] 88 | embed_mean=(hidden_states*text_masks_1.unsqueeze(-1)).sum(1)/text_masks_1.sum(1).unsqueeze(-1) 89 | embed_mean=embed_mean.float() 90 | embed_max=hidden_states+(1-text_masks_1).unsqueeze(-1)*(-1e10) 91 | embed_max=embed_max.max(1)[0].float() 92 | outputs.append(embed_mean) 93 | outputs.append(embed_max) 94 | 95 | #将特征输入分类器,得到20分类的logits 96 | final_hidden_state=torch.cat(outputs,-1) 97 | logits=self.classifier(final_hidden_state) 98 | 99 | #返回loss或概率结果 100 | if labels is not None: 101 | loss_fct = CrossEntropyLoss() 102 | loss = loss_fct(logits, labels) 103 | return loss 104 | else: 105 | prob=torch.softmax(logits,-1) 106 | age_probs=prob.view(-1,10,2).sum(2) 107 | gender_probs=prob.view(-1,10,2).sum(1) 108 | return age_probs,gender_probs 109 | 110 | 111 | 112 | class ClassificationHead(nn.Module): 113 | """Head for sentence-level classification tasks.""" 114 | def __init__(self, args): 115 | super().__init__() 116 | self.norm= nn.BatchNorm1d(args.out_size) 117 | self.dense = nn.Linear(args.out_size, args.linear_layer_size[0]) 118 | self.norm_1= nn.BatchNorm1d(args.linear_layer_size[0]) 119 | self.dropout = nn.Dropout(args.hidden_dropout_prob) 120 | self.dense_1 = nn.Linear(args.linear_layer_size[0], args.linear_layer_size[1]) 121 | self.norm_2= nn.BatchNorm1d(args.linear_layer_size[1]) 122 | self.out_proj = nn.Linear(args.linear_layer_size[1], args.num_label) 123 | 124 | def forward(self, features, **kwargs): 125 | x = self.norm(features) 126 | x = self.dropout(x) 127 | x = self.dense(x) 128 | x = torch.relu(self.norm_1(x)) 129 | x = self.dropout(x) 130 | x = self.dense_1(x) 131 | x = torch.relu(self.norm_2(x)) 132 | x = self.dropout(x) 133 | x = self.out_proj(x) 134 | return x 135 | 136 | -------------------------------------------------------------------------------- /picture/fusion-layer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/Tencent2020_Rank1st/de07bb117f66713a54e89a7b0c2efec0581a4b77/picture/fusion-layer.png -------------------------------------------------------------------------------- /picture/mlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/Tencent2020_Rank1st/de07bb117f66713a54e89a7b0c2efec0581a4b77/picture/mlm.png -------------------------------------------------------------------------------- /picture/model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/Tencent2020_Rank1st/de07bb117f66713a54e89a7b0c2efec0581a4b77/picture/model.png -------------------------------------------------------------------------------- /picture/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guoday/Tencent2020_Rank1st/de07bb117f66713a54e89a7b0c2efec0581a4b77/picture/output.png 
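The classifier in models/model.py outputs a joint 20-way distribution over (age, gender): run.py builds the training label as label = age*2 + gender, and model.py recovers the two marginals by reshaping the softmax output (prob.view(-1,10,2).sum(2) and .sum(1)). Below is a minimal, self-contained sketch of that marginalization with toy logits; it is an illustration, not code taken from the repository.

```python
import torch

# Toy batch of classifier logits over the 20 joint (age, gender) classes,
# standing in for the output of ClassificationHead in models/model.py.
logits = torch.randn(4, 20)
prob = torch.softmax(logits, -1)          # joint P(age, gender) per user

# Label encoding used in run.py: label = age * 2 + gender,
# so view(-1, 10, 2) puts age on dim 1 and gender on dim 2.
joint = prob.view(-1, 10, 2)
age_probs = joint.sum(2)                  # marginal P(age),    shape (4, 10)
gender_probs = joint.sum(1)               # marginal P(gender), shape (4, 2)

# Each marginal still sums to 1 for every user.
assert torch.allclose(age_probs.sum(-1), torch.ones(4))
assert torch.allclose(gender_probs.sum(-1), torch.ones(4))
```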
-------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gc 3 | import torch 4 | import logging 5 | import argparse 6 | import models.ctrNet as ctrNet 7 | import pickle 8 | import gensim 9 | import random 10 | import pandas as pd 11 | import numpy as np 12 | from tqdm import tqdm 13 | from src.data_loader import TextDataset 14 | from torch.utils.data import DataLoader, SequentialSampler, RandomSampler,TensorDataset 15 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler 16 | from sklearn.model_selection import StratifiedKFold 17 | base_path="data" 18 | #定义浮点数特征 19 | dense_features=['user_id__size', 'user_id_ad_id_unique', 'user_id_creative_id_unique', 'user_id_advertiser_id_unique', 'user_id_industry_unique', 'user_id_product_id_unique', 'user_id_time_unique', 'user_id_click_times_sum', 'user_id_click_times_mean', 'user_id_click_times_std'] 20 | for l in ['age_{}'.format(i) for i in range(10)]+['gender_{}'.format(i) for i in range(2)]: 21 | for f in ['creative_id','ad_id','product_id','advertiser_id','industry']: 22 | dense_features.append(l+'_'+f+'_mean') 23 | 24 | #定义用户点击的序列特征 25 | text_features=[ 26 | [base_path+"/sequence_text_user_id_product_id.128d",'sequence_text_user_id_product_id',128], 27 | [base_path+"/sequence_text_user_id_ad_id.128d",'sequence_text_user_id_ad_id',128], 28 | [base_path+"/sequence_text_user_id_creative_id.128d",'sequence_text_user_id_creative_id',128], 29 | [base_path+"/sequence_text_user_id_advertiser_id.128d",'sequence_text_user_id_advertiser_id',128], 30 | [base_path+"/sequence_text_user_id_industry.128d",'sequence_text_user_id_industry',128], 31 | [base_path+"/sequence_text_user_id_product_category.128d",'sequence_text_user_id_product_category',128], 32 | [base_path+"/sequence_text_user_id_time.128d",'sequence_text_user_id_time',128], 33 | [base_path+"/sequence_text_user_id_click_times.128d",'sequence_text_user_id_click_times',128], 34 | ] 35 | #定义用户点击的人工构造序列特征 36 | text_features_1=[ 37 | [base_path+"/sequence_text_user_id_creative_id_fold.12d",'sequence_text_user_id_creative_id_fold',12], 38 | [base_path+"/sequence_text_user_id_ad_id_fold.12d",'sequence_text_user_id_ad_id_fold',12], 39 | [base_path+"/sequence_text_user_id_product_id_fold.12d",'sequence_text_user_id_product_id_fold',12], 40 | [base_path+"/sequence_text_user_id_advertiser_id_fold.12d",'sequence_text_user_id_advertiser_id_fold',12], 41 | [base_path+"/sequence_text_user_id_industry_fold.12d",'sequence_text_user_id_industry_fold',12], 42 | ] 43 | 44 | if __name__ == "__main__": 45 | logger = logging.getLogger(__name__) 46 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 47 | datefmt='%m/%d/%Y %H:%M:%S', 48 | level=logging.INFO) 49 | parser = argparse.ArgumentParser() 50 | parser.add_argument('--kfold', type=int, default=5) 51 | parser.add_argument('--index', type=int, default=0) 52 | parser.add_argument('--train_batch_size', type=int, default=512) 53 | parser.add_argument('--max_len_text', type=int, default=128) 54 | parser.add_argument('--num_hidden_layers', type=int, default=6) 55 | parser.add_argument('--hidden_dropout_prob', type=float, default=0.2) 56 | parser.add_argument('--output_path', type=str, default=None) 57 | parser.add_argument('--pretrained_model_path', type=str, default=None) 58 | parser.add_argument('--hidden_size', type=int, default=1024) 59 | 
parser.add_argument('--vocab_size_v1', type=int, default=500000) 60 | parser.add_argument('--vocab_dim_v1', type=int, default=64) 61 | parser.add_argument('--epoch', type=int, default=5) 62 | parser.add_argument('--lr', type=float, default=8e-5) 63 | parser.add_argument('--eval_steps', type=int, default=500) 64 | parser.add_argument('--display_steps', type=int, default=100) 65 | parser.add_argument('--max_grad_norm', type=float, default=1.0) 66 | parser.add_argument('--eval_batch_size', type=int, default=4096) 67 | parser.add_argument('--seed', type=int, default=2020) 68 | parser.add_argument('--num_label', type=int, default=20) 69 | 70 | args = parser.parse_args() 71 | 72 | #设置参数 73 | args.hidden_size=sum([x[-1] for x in text_features]) 74 | logger.info("Argument %s", args) 75 | args.vocab=pickle.load(open(os.path.join(args.pretrained_model_path, "vocab.pkl"),'rb')) 76 | args.vocab_size_v1=len(args.vocab) 77 | args.text_features=text_features 78 | args.text_features_1=text_features_1 79 | args.dense_features=dense_features 80 | args.linear_layer_size=[1024,512] 81 | args.text_dim=sum([x[-1] for x in text_features]) 82 | args.text_dim_1=sum([x[-1] for x in text_features_1]) 83 | args.output_dir="saved_models/index_{}".format(args.index) 84 | 85 | #读取word2vector模型 86 | args.embeddings_tables={} 87 | for x in args.text_features: 88 | if x[0] not in args.embeddings_tables: 89 | try: 90 | args.embeddings_tables[x[0]]=gensim.models.KeyedVectors.load_word2vec_format(x[0],binary=False) 91 | except: 92 | args.embeddings_tables[x[0]]=pickle.load(open(x[0],'rb')) 93 | 94 | args.embeddings_tables_1={} 95 | for x in args.text_features_1: 96 | if x[0] not in args.embeddings_tables_1: 97 | try: 98 | args.embeddings_tables_1[x[0]]=gensim.models.KeyedVectors.load_word2vec_format(x[0],binary=False) 99 | except: 100 | args.embeddings_tables_1[x[0]]=pickle.load(open(x[0],'rb')) 101 | 102 | #读取数据 103 | train_df=pd.read_pickle('data/train_user.pkl') 104 | train_df['label']=train_df['age']*2+train_df['gender'] 105 | test_df=pd.read_pickle('data/test_user.pkl') 106 | test_df['label']=test_df['age']*2+test_df['gender'] 107 | df=train_df[args.dense_features].append(test_df[args.dense_features]) 108 | ss=StandardScaler() 109 | ss.fit(df[args.dense_features]) 110 | train_df[args.dense_features]=ss.transform(train_df[args.dense_features]) 111 | test_df[args.dense_features]=ss.transform(test_df[args.dense_features]) 112 | test_dataset = TextDataset(args,test_df) 113 | 114 | #建立模型 115 | skf=StratifiedKFold(n_splits=5,random_state=2020,shuffle=True) 116 | model=ctrNet.ctrNet(args) 117 | 118 | #训练模型 119 | for i,(train_index,test_index) in enumerate(skf.split(train_df,train_df['label'])): 120 | if i!=args.index: 121 | continue 122 | logger.info("Index: %s",args.index) 123 | train_dataset = TextDataset(args,train_df.iloc[train_index]) 124 | dev_dataset=TextDataset(args,train_df.iloc[test_index]) 125 | model.train(train_dataset,dev_dataset) 126 | dev_df=train_df.iloc[test_index] 127 | 128 | #输出结果 129 | accs=[] 130 | for f,num in [('age',10),('gender',2)]: 131 | model.reload(f) 132 | if f=="age": 133 | dev_preds=model.infer(dev_dataset)[0] 134 | else: 135 | dev_preds=model.infer(dev_dataset)[1] 136 | for j in range(num): 137 | dev_df['{}_{}'.format(f,j)]=np.round(dev_preds[:,j],4) 138 | acc=model.eval(dev_df[f].values,dev_preds)['eval_acc'] 139 | accs.append(acc) 140 | if f=="age": 141 | test_preds=model.infer(test_dataset)[0] 142 | else: 143 | test_preds=model.infer(test_dataset)[1] 144 | 145 | logger.info("Test %s 
%s",f,np.mean(test_preds,0)) 146 | logger.info("ACC %s %s",f,round(acc,5)) 147 | 148 | out_fs=['user_id','age','gender','predict_{}'.format(f)] 149 | out_fs+=['{}_{}'.format(f,i) for i in range(num)] 150 | for i in range(num): 151 | test_df['{}_{}'.format(f,i)]=np.round(test_preds[:,i],4) 152 | test_df['predict_{}'.format(f)]=np.argmax(test_preds,-1)+1 153 | try: 154 | os.system("mkdir submission") 155 | except: 156 | pass 157 | 158 | test_df[out_fs].to_csv('submission/submission_test_{}_{}_{}.csv'.format(f,args.index,round(acc,5)),index=False) 159 | 160 | logger.info(" best_acc = %s",round(sum(accs),4)) -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | sudo pip install transformers==2.8.0 pandas gensim scikit-learn filelock gdown numpy 2 | pip install transformers==2.8.0 pandas gensim scikit-learn filelock gdown numpy 3 | 4 | #数据下载 5 | gdown https://drive.google.com/uc?id=15onAobxlim_uRUNWSMQuK6VxDsmGTtp4 6 | unzip data.zip 7 | rm data.zip 8 | 9 | #数据预处理 10 | python src/preprocess.py 11 | 12 | #特征提取 13 | python src/extract_features.py 14 | 15 | #下载Word2Vector权重 16 | gdown https://drive.google.com/uc?id=1SUpukAeXR5Ymyf3wH3SRNdQ3Hl2HazQa 17 | unzip w2v.zip 18 | cp w2v/* data/ 19 | rm -r w2v* 20 | 21 | #下载BERT-base权重 22 | gdown https://drive.google.com/uc?id=1ToAJwl_oRAeRNyYF_FK0B2APVXlPFTlq 23 | unzip bert-base.zip 24 | mv bert-base BERT/ 25 | rm bert-base.zip 26 | 27 | #训练模型 28 | mkdir saved_models 29 | mkdir saved_models/log 30 | for((i=0;i<5;i++)); 31 | do 32 | python run.py \ 33 | --kfold=5 \ 34 | --index=$i \ 35 | --train_batch_size=256 \ 36 | --eval_steps=5000 \ 37 | --max_len_text=128 \ 38 | --epoch=5 \ 39 | --lr=1e-4 \ 40 | --output_path=saved_models \ 41 | --pretrained_model_path=BERT/bert-base \ 42 | --eval_batch_size=512 2>&1 | tee saved_models/log/$i.txt 43 | done 44 | 45 | #合并结果 46 | python src/merge_submission.py 47 | -------------------------------------------------------------------------------- /src/data_loader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | import pandas as pd 4 | import numpy as np 5 | from torch.utils.data import Dataset 6 | import gensim 7 | logger = logging.getLogger(__name__) 8 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 9 | datefmt='%m/%d/%Y %H:%M:%S', 10 | level=logging.INFO) 11 | 12 | 13 | class TextDataset(Dataset): 14 | def __init__(self, args,df): 15 | self.label=df['label'].values 16 | self.text_features=df[[x[1] for x in args.text_features]].values 17 | self.text_features_1=df[[x[1] for x in args.text_features_1]].values 18 | self.dense_features=df[args.dense_features].values 19 | self.embeddings_tables=[] 20 | for x in args.text_features: 21 | self.embeddings_tables.append(args.embeddings_tables[x[0]] if x[0] is not None else None) 22 | self.embeddings_tables_1=[] 23 | for x in args.text_features_1: 24 | self.embeddings_tables_1.append(args.embeddings_tables_1[x[0]] if x[0] is not None else None) 25 | self.args=args 26 | self.df=df 27 | 28 | def __len__(self): 29 | return len(self.label) 30 | 31 | def __getitem__(self, i): 32 | #标签信息 33 | label=self.label[i] 34 | #BERT的输入特征 35 | if len(self.args.text_features)==0: 36 | text_features=0 37 | text_masks=0 38 | text_ids=0 39 | else: 40 | text_features=np.zeros((self.args.max_len_text,self.args.text_dim)) 41 | text_masks=np.zeros(self.args.max_len_text) 42 | 
43 | begin_dim=0
44 | for idx,(embed_table,x) in enumerate(zip(self.embeddings_tables,self.text_features[i])):
45 | end_dim=begin_dim+self.args.text_features[idx][-1]
46 | for w_idx,word in enumerate(x.split()[:self.args.max_len_text]):
47 | text_features[w_idx,begin_dim:end_dim]=embed_table[word]
48 | text_masks[w_idx]=1
49 | try:
50 | text_ids[w_idx,idx]=self.args.vocab[self.args.text_features[idx][1],word]
51 | except:
52 | text_ids[w_idx,idx]=self.args.vocab['unk']
53 | begin_dim=end_dim
54 | # Input features for the decoder
55 | if len(self.args.text_features_1)==0:
56 | text_features_1=0
57 | text_masks_1=0
58 | else:
59 | text_features_1=np.zeros((self.args.max_len_text,self.args.text_dim_1))
60 | text_masks_1=np.zeros(self.args.max_len_text)
61 | begin_dim=0
62 | for idx,(embed_table,x) in enumerate(zip(self.embeddings_tables_1,self.text_features_1[i])):
63 | end_dim=begin_dim+self.args.text_features_1[idx][-1]
64 | if embed_table is not None:
65 | for w_idx,word in enumerate(x.split()[:self.args.max_len_text]):
66 | text_features_1[w_idx,begin_dim:end_dim]=embed_table[word]
67 | text_masks_1[w_idx]=1
68 | else:
69 | for w_idx,v in enumerate(x[:self.args.max_len_text]):
70 | text_features_1[w_idx,begin_dim:end_dim]=v
71 | text_masks_1[w_idx]=1
72 | begin_dim=end_dim
73 | # Dense (float) features
74 | if len(self.args.dense_features)==0:
75 | dense_features=0
76 | else:
77 | dense_features=self.dense_features[i]
78 |
79 | return (
80 | torch.tensor(label),
81 | torch.tensor(dense_features),
82 | torch.tensor(text_features),
83 | torch.tensor(text_ids),
84 | torch.tensor(text_masks),
85 | torch.tensor(text_features_1),
86 | torch.tensor(text_masks_1),
87 | )
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/src/extract_features.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | import json
4 | import gc
5 | import pickle
6 | import gensim
7 | import pandas as pd
8 | import numpy as np
9 | from gensim.models import Word2Vec
10 | from sklearn import preprocessing
11 | from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
12 |
13 | def get_agg_features(dfs,f1,f2,agg,log):
14 | # Handle special cases
15 | if type(f1)==str:
16 | f1=[f1]
17 | if agg!='size':
18 | data=log[f1+[f2]]
19 | else:
20 | data=log[f1]
21 | f_name='_'.join(f1)+"_"+f2+"_"+agg
22 | # Aggregation
23 | if agg=="size":
24 | tmp = pd.DataFrame(data.groupby(f1).size()).reset_index()
25 | elif agg=="count":
26 | tmp = pd.DataFrame(data.groupby(f1)[f2].count()).reset_index()
27 | elif agg=="mean":
28 | tmp = pd.DataFrame(data.groupby(f1)[f2].mean()).reset_index()
29 | elif agg=="unique":
30 | tmp = pd.DataFrame(data.groupby(f1)[f2].nunique()).reset_index()
31 | elif agg=="max":
32 | tmp = pd.DataFrame(data.groupby(f1)[f2].max()).reset_index()
33 | elif agg=="min":
34 | tmp = pd.DataFrame(data.groupby(f1)[f2].min()).reset_index()
35 | elif agg=="sum":
36 | tmp = pd.DataFrame(data.groupby(f1)[f2].sum()).reset_index()
37 | elif agg=="std":
38 | tmp = pd.DataFrame(data.groupby(f1)[f2].std()).reset_index()
39 | elif agg=="median":
40 | tmp = pd.DataFrame(data.groupby(f1)[f2].median()).reset_index()
41 | else:
42 | raise ValueError("agg error")
43 | # Assign the aggregate feature to each dataframe
44 | for df in dfs:
45 | try:
46 | del df[f_name]
47 | except:
48 | pass
49 | tmp.columns = f1+[f_name]
50 | df[f_name]=df.merge(tmp, on=f1, how='left')[f_name]
51 | del tmp
52 | del data
53 | gc.collect()
54 | return [f_name]
55 |
56 |
57 | def sequence_text(dfs,f1,f2,log):
58 | f_name='sequence_text_'+f1+'_'+f2
59 | print(f_name)
60 | # Traverse the log to build each user's click sequence
61 | dic,items={},[]
62 | for item in log[[f1,f2]].values:
63 | try:
64 | dic[item[0]].append(str(item[1]))
65 | except:
66 | dic[item[0]]=[str(item[1])]
67 | for key in dic:
68 | items.append([key,' '.join(dic[key])])
69 | # Assign the sequence feature to each dataframe
70 | temp=pd.DataFrame(items)
71 | temp.columns=[f1,f_name]
72 | temp = temp.drop_duplicates(f1)
73 | for df in dfs:
74 | try:
75 | del df[f_name]
76 | except:
77 | pass
78 | temp.columns = [f1]+[f_name]
79 | df[f_name]=df.merge(temp, on=f1, how='left')[f_name]
80 | gc.collect()
81 | del temp
82 | del items
83 | del dic
84 | return [f_name]
85 |
86 | def kfold(train_df,test_df,log_data,pivot):
87 | # K-fold statistics over the log: the age/gender distribution of the pivot feature for each record
88 | kfold_features=['age_{}'.format(i) for i in range(10)]+['gender_{}'.format(i) for i in range(2)]
89 | log=log_data[kfold_features+['user_id',pivot,'fold']]
90 | tmps=[]
91 | for fold in range(6):
92 | tmp = pd.DataFrame(log[(log['fold'] != fold) & (log['fold'] != 5)].groupby(pivot)[kfold_features].mean()).reset_index()
93 | tmp.columns=[pivot]+kfold_features
94 | tmp['fold']=fold
95 | tmps.append(tmp)
96 | tmp=pd.concat(tmps,axis=0).reset_index()
97 | tmp=log[['user_id',pivot,'fold']].merge(tmp,on=[pivot,'fold'],how='left')
98 | del log
99 | del tmps
100 | gc.collect()
101 | # Average age/gender distribution over all records the user clicked
102 | tmp_mean = pd.DataFrame(tmp.groupby('user_id')[kfold_features].mean()).reset_index()
103 | tmp_mean.columns=['user_id']+[f+'_'+pivot+'_mean' for f in kfold_features]
104 | for df in [train_df,test_df]:
105 | temp=df.merge(tmp_mean,on='user_id',how='left')
106 | temp=temp.fillna(-1)
107 | for f1 in [f+'_'+pivot+'_mean' for f in kfold_features]:
108 | df[f1]=temp[f1]
109 | del temp
110 | gc.collect()
111 | del tmp
112 | del tmp_mean
113 | gc.collect()
114 |
115 |
116 |
117 | def kfold_sequence(train_df,test_df,log_data,pivot):
118 | # K-fold statistics over the log: the age/gender distribution of the pivot feature for each record
119 | kfold_features=['age_{}'.format(i) for i in range(10)]+['gender_{}'.format(i) for i in range(2)]
120 | log=log_data[kfold_features+[pivot,'fold','user_id']]
121 | tmps=[]
122 | for fold in range(6):
123 | tmp = pd.DataFrame(log[(log['fold'] != fold) & (log['fold'] != 5)].groupby(pivot)[kfold_features].mean()).reset_index()
124 | tmp.columns=[pivot]+kfold_features
125 | tmp['fold']=fold
126 | tmps.append(tmp)
127 | tmp=pd.concat(tmps,axis=0).reset_index()
128 | tmp=log[[pivot,'fold','user_id']].merge(tmp,on=[pivot,'fold'],how='left')
129 | tmp=tmp.fillna(-1)
130 | tmp[pivot+'_fold']=tmp[pivot]*10+tmp['fold']
131 | del log
132 | del tmps
133 | gc.collect()
134 | # Sequence of age/gender distributions over the user's click records
135 | tmp[pivot+'_fold']=tmp[pivot+'_fold'].astype(int)
136 | kfold_sequence_features=sequence_text([train_df,test_df],'user_id',pivot+'_fold',tmp)
137 | tmp=tmp.drop_duplicates([pivot+'_fold']).reset_index(drop=True)
138 | # Standardize the age/gender distribution of each record
139 | kfold_features=['age_{}'.format(i) for i in range(10)]+['gender_{}'.format(i) for i in range(2)]
140 | ss=StandardScaler()
141 | ss.fit(tmp[kfold_features])
142 | tmp[kfold_features]=ss.transform(tmp[kfold_features])
143 | for f in kfold_features:
144 | tmp[f]=tmp[f].apply(lambda x:round(x,4))
145 | # Write each record's age/gender distribution as a word2vec-format file
146 | with open('data/sequence_text_user_id_'+pivot+'_fold'+".{}d".format(12),'w') as f:
147 | f.write(str(len(tmp))+' '+'12'+'\n')
148 | for item in tmp[[pivot+'_fold']+kfold_features].values:
149 | f.write(' '.join([str(int(item[0]))]+[str(x) for x in item[1:]])+'\n')
150 | tmp=gensim.models.KeyedVectors.load_word2vec_format('data/sequence_text_user_id_'+pivot+'_fold'+".{}d".format(12),binary=False)
151 | pickle.dump(tmp,open('data/sequence_text_user_id_'+pivot+'_fold'+".{}d".format(12),'wb'))
152 | del tmp
153 | gc.collect()
154 | return kfold_sequence_features
155 |
156 | if __name__ == "__main__":
157 | # Load data
158 | click_log=pd.read_pickle('data/click.pkl')
159 | train_df=pd.read_pickle('data/train_user.pkl')
160 | test_df=pd.read_pickle('data/test_user.pkl')
161 | print(click_log.shape,train_df.shape,test_df.shape)
162 | ################################################################################
163 | # Aggregate features
164 | print("Extracting aggregate feature...")
165 | agg_features=[]
166 | agg_features+=get_agg_features([train_df,test_df],'user_id','','size',click_log)
167 | agg_features+=get_agg_features([train_df,test_df],'user_id','ad_id','unique',click_log)
168 | agg_features+=get_agg_features([train_df,test_df],'user_id','creative_id','unique',click_log)
169 | agg_features+=get_agg_features([train_df,test_df],'user_id','advertiser_id','unique',click_log)
170 | agg_features+=get_agg_features([train_df,test_df],'user_id','industry','unique',click_log)
171 | agg_features+=get_agg_features([train_df,test_df],'user_id','product_id','unique',click_log)
172 | agg_features+=get_agg_features([train_df,test_df],'user_id','time','unique',click_log)
173 | agg_features+=get_agg_features([train_df,test_df],'user_id','click_times','sum',click_log)
174 | agg_features+=get_agg_features([train_df,test_df],'user_id','click_times','mean',click_log)
175 | agg_features+=get_agg_features([train_df,test_df],'user_id','click_times','std',click_log)
176 | train_df[agg_features]=train_df[agg_features].fillna(-1)
177 | test_df[agg_features]=test_df[agg_features].fillna(-1)
178 | print("Extracting aggregate feature done!")
179 | print("List aggregate feature names:")
180 | print(agg_features)
181 | ################################################################################
182 | # Sequence features: the id sequences each user clicked
183 | print("Extracting sequence feature...")
184 | text_features=[]
185 | text_features+=sequence_text([train_df,test_df],'user_id','ad_id',click_log)
186 | text_features+=sequence_text([train_df,test_df],'user_id','creative_id',click_log)
187 | text_features+=sequence_text([train_df,test_df],'user_id','advertiser_id',click_log)
188 | text_features+=sequence_text([train_df,test_df],'user_id','product_id',click_log)
189 | text_features+=sequence_text([train_df,test_df],'user_id','industry',click_log)
190 | text_features+=sequence_text([train_df,test_df],'user_id','product_category',click_log)
191 | text_features+=sequence_text([train_df,test_df],'user_id','time',click_log)
192 | text_features+=sequence_text([train_df,test_df],'user_id','click_times',click_log)
193 | print("Extracting sequence feature done!")
194 | print("List sequence feature names:")
195 | print(text_features)
196 | ################################################################################
197 | # K-fold statistical features: average age/gender distribution over all records a user clicked
198 | # Assign fold index: 0-4 for the training set, 5 for the test set
199 | print("Extracting Kfold feature...")
200 | log=click_log.drop_duplicates(['user_id','creative_id']).reset_index(drop=True)
201 | del click_log
202 | gc.collect()
203 | log['cont']=1
204 | train_df['fold']=train_df.index%5
205 | test_df['fold']=5
206 | df=train_df.append(test_df)[['user_id','fold']].reset_index(drop=True)
207 | log=log.merge(df,on='user_id',how='left')
208 | del df
209 | gc.collect()
210 | # Average age/gender distribution of the pivot features each user clicked
211 | for pivot in ['creative_id','ad_id','product_id','advertiser_id','industry']:
212 | print("Kfold",pivot)
213 | kfold(train_df,test_df,log,pivot)
214 | del log
215 | gc.collect()
216 | print("Extracting Kfold feature done!")
217 | ################################################################################
218 | # K-fold sequence features: age/gender distribution of each record a user clicked
219 | # Assign fold index: 0-4 for the training set, 5 for the test set
220 | print("Extracting Kfold sequence feature...")
221 | click_log=pd.read_pickle('data/click.pkl')
222 | log=click_log.reset_index(drop=True)
223 | del click_log
224 | gc.collect()
225 | log['cont']=1
226 | train_df['fold']=train_df.index%5
227 | train_df['fold']=train_df['fold'].astype(int)
228 | test_df['fold']=5
229 | df=train_df.append(test_df)[['user_id','fold']].reset_index(drop=True)
230 | log=log.merge(df,on='user_id',how='left')
231 | # Sequence of age/gender distributions of the pivot features each user clicked
232 | kfold_sequence_features=[]
233 | for pivot in ['creative_id','ad_id','product_id','advertiser_id','industry']:
234 | print("Kfold sequence",pivot)
235 | kfold_sequence_features+=kfold_sequence(train_df,test_df,log,pivot)
236 | del log
237 | gc.collect()
238 | print("Extracting Kfold sequence feature done!")
239 | print("List Kfold sequence feature names:")
240 | print(kfold_sequence_features)
241 | ################################################################################
242 | print("Extract features done! saving data...")
243 | train_df.to_pickle('data/train_user.pkl')
244 | test_df.to_pickle('data/test_user.pkl')
--------------------------------------------------------------------------------
/src/merge_submission.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | def submit_files(path):
5 | age_files=[]
6 | gender_files=[]
7 | files= os.listdir(path) # list all file names under the folder
8 | s = []
9 | for file in files: # iterate over the folder
10 | if not os.path.isdir(file): # skip directories, only open files
11 | if 'submission_test_gender' in file:
12 | gender_files.append(os.path.join(path,file))
13 | elif 'submission_test_age' in file:
14 | age_files.append(os.path.join(path,file))
15 | return age_files,gender_files
16 |
17 | age_files,gender_files=submit_files("submission")
18 |
19 | print("Age Files:")
20 | for f in age_files:
21 | print(f)
22 | print("Gender Files:")
23 | for f in gender_files:
24 | print(f)
25 | age_score=np.mean([float(x.split('_')[-1][:-4]) for x in age_files])
26 | gender_score=np.mean([float(x.split('_')[-1][:-4]) for x in gender_files])
27 | print(len(age_files),len(gender_files))
28 | print(round(age_score,4),round(gender_score,4),round(age_score+gender_score,4))
29 |
30 | age_dfs=[pd.read_csv(f)[['user_id']+['age_'+str(i) for i in range(10)]] for f in age_files]
31 | age_df=pd.concat(age_dfs,axis=0)
32 | age_df=pd.DataFrame(age_df.groupby('user_id').mean()).sort_values('user_id').reset_index()
33 | age_df['predicted_age']=np.argmax(age_df[['age_'+str(i) for i in range(10)]].values,-1)+1
34 | print(age_df)
35 |
36 | gender_dfs=[pd.read_csv(f)[['user_id']+['gender_'+str(i) for i in range(2)]] for f in gender_files]
37 | gender_df=pd.concat(gender_dfs,axis=0)
38 | gender_df=pd.DataFrame(gender_df.groupby('user_id').mean()).sort_values('user_id').reset_index()
39 | gender_df['predicted_gender']=np.argmax(gender_df[['gender_'+str(i) for i in range(2)]].values,-1)+1
40 | print(gender_df)
41 |
42 | df=age_df
43 | df['predicted_gender']=gender_df['predicted_gender']
44 | print(df)
45 |
46 | df[['user_id','predicted_age','predicted_gender']].to_csv("submission.csv",index=False)
47 | print(df[['predicted_age','predicted_gender']].mean())
48 |
49 |
--------------------------------------------------------------------------------
/src/preprocess.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 |
4 | def merge_files():
5 | # Merge click logs
6 | print("merge click files...")
7 | click_df=pd.read_csv("data/train_preliminary/click_log.csv")
8 | click_df=click_df.append(pd.read_csv("data/train_semi_final/click_log.csv"))
9 | click_df=click_df.append(pd.read_csv("data/test/click_log.csv"))
10 | click_df=click_df.sort_values(by=["time"]).drop_duplicates()
11 |
12 | # Merge ad info
13 | print("merge ad files...")
14 | ad_df=pd.read_csv("data/train_preliminary/ad.csv")
15 | ad_df=ad_df.append(pd.read_csv("data/train_semi_final/ad.csv"))
16 | ad_df=ad_df.append(pd.read_csv("data/test/ad.csv"))
17 | ad_df=ad_df.drop_duplicates()
18 |
19 | # Merge user info
20 | print("merge user files...")
21 | train_user=pd.read_csv("data/train_preliminary/user.csv")
22 | train_user=train_user.append(pd.read_csv("data/train_semi_final/user.csv"))
23 | train_user=train_user.reset_index(drop=True)
24 | train_user['age']=train_user['age']-1
25 | train_user['gender']=train_user['gender']-1
26 | test_user=pd.read_csv("data/test/click_log.csv").drop_duplicates('user_id')[['user_id']].reset_index(drop=True)
27 | test_user=test_user.sort_values(by='user_id').reset_index(drop=True)
28 | test_user['age']=-1
29 | test_user['gender']=-1
30 |
31 | # Merge click, ad, and user info
32 | print("merge all files...")
33 | click_df=click_df.merge(ad_df,on="creative_id",how='left')
34 | click_df=click_df.merge(train_user,on="user_id",how='left')
35 | click_df=click_df.fillna(-1)
36 | click_df=click_df.replace("\\N",-1)
37 | for f in click_df:
38 | click_df[f]=click_df[f].astype(int)
39 | for i in range(10):
40 | click_df['age_{}'.format(i)]=(click_df['age']==i).astype(np.int16)
41 | for i in range(2):
42 | click_df['gender_{}'.format(i)]=(click_df['gender']==i).astype(np.int16)
43 |
44 |
45 | return click_df,train_user,test_user
46 |
47 |
48 | if __name__ == "__main__":
49 | click_df,train_user,test_user=merge_files()
50 | # Save preprocessed files
51 | print("preprocess done! saving data...")
52 | click_df.to_pickle("data/click.pkl")
53 | train_user.to_pickle("data/train_user.pkl")
54 | test_user.to_pickle("data/test_user.pkl")
55 |
--------------------------------------------------------------------------------
/src/w2v.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 | from gensim.models import Word2Vec
5 | from tqdm import tqdm
6 | import pickle
7 | import logging
8 | logger = logging.getLogger(__name__)
9 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
10 | datefmt='%m/%d/%Y %H:%M:%S',
11 | level=logging.INFO)
12 |
13 | def w2v(dfs,f,L=128):
14 | print("w2v",f)
15 | sentences=[]
16 | for df in dfs:
17 | for line in df[f].values:
18 | sentences.append(line.split())
19 | print("Sentence Num {}".format(len(sentences)))
20 | w2v=Word2Vec(sentences,vector_size=L, window=8,min_count=1,sg=1,workers=32,epochs=10)
21 | print("save w2v to {}".format(os.path.join('data',f+".{}d".format(L))))
22 | pickle.dump(w2v,open(os.path.join('data',f+".{}d".format(L)),'wb'))
23 |
24 | if __name__ == "__main__":
25 | train_df=pd.read_pickle('data/train_user.pkl')
26 | test_df=pd.read_pickle('data/test_user.pkl')
27 | # Train Word2Vec, embedding dimension 128
28 | w2v([train_df,test_df],'sequence_text_user_id_ad_id',L=128)
29 | w2v([train_df,test_df],'sequence_text_user_id_creative_id',L=128)
30 | w2v([train_df,test_df],'sequence_text_user_id_advertiser_id',L=128)
31 | w2v([train_df,test_df],'sequence_text_user_id_product_id',L=128)
32 | w2v([train_df,test_df],'sequence_text_user_id_industry',L=128)
33 | w2v([train_df,test_df],'sequence_text_user_id_product_category',L=128)
34 | w2v([train_df,test_df],'sequence_text_user_id_time',L=128)
35 | w2v([train_df,test_df],'sequence_text_user_id_click_times',L=128)
36 |
37 |
38 |
--------------------------------------------------------------------------------
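
Note on wiring the pieces together: src/w2v.py and kfold_sequence() both leave pickled gensim objects in data/, and data_loader.py's TextDataset looks words up in per-feature embedding tables via embed_table[word]. The sketch below shows one way such a lookup table could be built from those pickles; it is a minimal, hedged example, not the repository's actual loading code (which lives in run.py / models/ctrNet.py and is not reproduced above), and the helper name load_embedding_table is hypothetical.

import pickle
import numpy as np

def load_embedding_table(path, dim):
    # Hypothetical helper: wrap a pickled gensim Word2Vec model (as written by
    # src/w2v.py) or KeyedVectors (as written by kfold_sequence) in a dict that
    # returns a zero vector for out-of-vocabulary words.
    model = pickle.load(open(path, 'rb'))
    kv = model.wv if hasattr(model, 'wv') else model   # Word2Vec vs. KeyedVectors
    table = {w: np.asarray(kv[w], dtype=np.float32) for w in kv.index_to_key}  # gensim 4.x attribute
    unk = np.zeros(dim, dtype=np.float32)

    class Lookup(dict):
        def __missing__(self, key):
            return unk

    return Lookup(table)

# e.g. the 128-d ad_id sequence embeddings trained by src/w2v.py:
# ad_id_table = load_embedding_table('data/sequence_text_user_id_ad_id.128d', 128)

A table built this way can be passed in wherever args.embeddings_tables is populated, so that embed_table[word] in TextDataset.__getitem__ never raises on unseen tokens.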