├── README.md
├── all_code.py
├── config.py
└── requirements.txt

/README.md:
--------------------------------------------------------------------------------
# Product Recommendation Based on User Profiles (基于用户画像的商品推荐)

## Usage

### Requirements
numpy==1.19.5
tensorflow_gpu==2.4.1
gensim==4.0.1
pandas==0.25.3
scikit_learn==0.24.2

Run `python all_code.py`; the data paths and hyperparameters are command-line arguments defined in `config.py`.

### Code description

#### Data processing
- Split train and test (second-round / "fusai" data) into two parts each, according to whether `tagid` is missing.
- Use the tagid sequences of users with non-missing tagid (from both the second-round train and test sets) to train Word2Vec.

#### Model
- Two stacked GRU layers
- 5-fold cross-validation

#### Output
- Users in test whose tagid is missing are directly predicted as label 1.
- Offline train_tagidNotnull_F1Score: 0.6773461

## Notes
- Hardware: a single RTX 3060; at batch_size 512 the model uses only about 3.6 GB of GPU memory.
- Even a modest card like this is roughly 100x faster than my MacBook Pro and about 30x faster than the free tier of Google Colab.
- A single run takes anywhere from 30 min to 2 h, depending on the embedding size, batch_size, and the hidden-layer settings.
--------------------------------------------------------------------------------
/all_code.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : 基于用户画像的商品推荐挑战赛
# @FileName: all_code.py
# @Author  : Rocket,Qian
# @Time    : 2021/9/19 18:28
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.layers import GRU
import tensorflow as tf
from gensim.models import Word2Vec
from config import parser
import os
import warnings

warnings.filterwarnings('ignore')

args = parser.parse_args()
# Read the data and do light preprocessing of the list-valued tagid column
train_all = pd.read_csv(args.train_file, header=None)
test_all = pd.read_csv(args.test_file, header=None)
# train_first = pd.read_csv(r'E:\Competition\基于用户画像的商品推荐挑战赛\dataset\train.txt', header=None)
train_all.columns = ['pid', 'label', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']
test_all.columns = ['pid', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']
print('Data loaded successfully')
train = train_all[train_all['tagid'].notnull()]
test = test_all[test_all['tagid'].notnull()]

# Set flag = 1 to also include the first-round training data
# (requires uncommenting the train_first read above first)
flag = 0
if flag == 1:
    train = pd.concat([train_first, train])
    # train.to_csv(r'E:\Competition\基于用户画像的商品推荐挑战赛\train_sum.csv', index=False)

train['label'] = train['label'].astype(int)

data = pd.concat([train, test])
data['label'] = data['label'].fillna(-1)
# 'tagid' is stored as a string such as "[123, 456]"; parse it into a list of tag strings
data['tagid'] = data['tagid'].apply(lambda x: eval(x))
data['tagid'] = data['tagid'].apply(lambda x: [str(i) for i in x])

embed_size = args.embed_size
MAX_WORDS_NUM = args.MAX_WORDS_NUM
MAX_SEQUENCE_LENGTH = args.MAX_SEQUENCE_LENGTH
# Train Word2Vec on the tagid sequences of all users with non-null tagid
w2v_model = Word2Vec(sentences=data['tagid'].tolist(), vector_size=embed_size, window=args.window, min_count=1,
                     epochs=args.epochs, hs=1)

X_train = data[:train.shape[0]]['tagid']
X_test = data[train.shape[0]:]['tagid']

tokenizer = text.Tokenizer(num_words=MAX_WORDS_NUM)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, padding='pre', truncating='pre')
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH, padding='pre', truncating='pre')
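# Note: the Keras Tokenizer assigns indices starting at 1 in order of descending frequency,
# and pad_sequences pads with 0, so index 0 is reserved for padding; that is why
# nb_words below is len(word_index) + 1. With padding/truncating set to 'pre', only the
# last MAX_SEQUENCE_LENGTH tags of an over-long tag list are kept.
# Optional sanity check (illustrative):
#   assert X_train.shape == (len(train), MAX_SEQUENCE_LENGTH)
#   assert X_test.shape == (len(test), MAX_SEQUENCE_LENGTH)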
word_index = tokenizer.word_index
nb_words = len(word_index) + 1

# Build the embedding matrix from the pre-trained Word2Vec vectors,
# row-aligned with the Tokenizer indices
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    try:
        embedding_vector = w2v_model.wv.get_vector(word)
    except KeyError:
        continue
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
y_cat = train['label'].values

# GPU setup
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)


# Model definition
def my_model():
    embedding_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    # Word embedding layer, initialized with the pre-trained Word2Vec vectors and frozen
    embedder = Embedding(nb_words,
                         embed_size,
                         input_length=MAX_SEQUENCE_LENGTH,
                         weights=[embedding_matrix],
                         trainable=False
                         )
    embed = embedder(embedding_input)
    l1 = GRU(args.GRU1_hidden_size, return_sequences=True)(embed)
    bn = BatchNormalization()(l1)
    drop = Dropout(args.dropout)(bn)
    l2 = GRU(args.GRU2_hidden_size)(drop)
    output = Dense(1, activation='sigmoid')(l2)
    model = Model(inputs=embedding_input, outputs=output)
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer='adam', metrics=['accuracy'])
    return model


# 5-fold cross-validation
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
train_pre = np.zeros([len(train), 1])
test_predictions = np.zeros([len(test), 1])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
    print("fold {}".format(fold_ + 1))
    model = my_model()
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=5)
    bst_model_path = "./{}.h5".format(fold_)
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

    X_tra, X_val = X_train[trn_idx], X_train[val_idx]
    y_tra, y_val = y_cat[trn_idx], y_cat[val_idx]

    model.fit(X_tra, y_tra,
              validation_data=(X_val, y_val),
              epochs=args.nn_epochs, batch_size=args.batch_size, shuffle=True,
              callbacks=[early_stopping, model_checkpoint])

    model.load_weights(bst_model_path)
    train_pre[val_idx] = model.predict(X_val)
    test_predictions += model.predict(X_test) / folds.n_splits
    del model


# Build the submission: rank the predicted probabilities and label the top ~14.1% as 1
submit = test[['pid']].copy()
submit['tmp'] = test_predictions
submit.columns = ['user_id', 'tmp']

submit['rank'] = submit['tmp'].rank()
submit['category_id'] = 1
submit.loc[submit['rank'] <= int(submit.shape[0] * 0.859), 'category_id'] = 0

# Users whose tagid is missing are predicted as 1 directly
submit_null = test_all[test_all['tagid'].isna()][['pid']].copy()
submit_null['category_id'] = 1

submit_notnull = submit[['user_id', 'category_id']]
submit_notnull.columns = ['pid', 'category_id']

sub = pd.concat([submit_null, submit_notnull])
sub.sort_values(by='pid', ascending=True, inplace=True)
sub.to_csv(
    r'E:\Competition\基于用户画像的商品推荐挑战赛\result\0920GRUemd64b400win1_0859.csv',
    index=False)
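# Offline evaluation sketch (illustrative): an F1 score on the train split computed from the
# out-of-fold predictions, in the spirit of the train_tagidNotnull_F1Score reported in the
# README. The 0.5 cutoff is an assumption; the thresholding used for the reported score is
# not recorded here.
oof_label = (train_pre[:, 0] >= 0.5).astype(int)
print('offline F1 on train (tagid not null):', f1_score(train['label'].values, oof_label))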
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Project : 基于用户画像的商品推荐挑战赛
# @FileName: config.py
# @Author  : Rocket,Qian
# @Time    : 2021/9/10 20:25
import argparse

path = r'E:\Competition\基于用户画像的商品推荐挑战赛\dataset'
parser = argparse.ArgumentParser(description="基于用户画像的商品推荐挑战赛")

# ========================= Dataset Configs ==========================
parser.add_argument('--train_file', type=str, default=path + r'\data2\train.txt')
parser.add_argument('--test_file', type=str, default=path + r'\data2\test.txt')

# ========================= Word2Vec Configs ==========================
parser.add_argument('--embed_size', type=int, default=64, help='embedding size of each tagid')
parser.add_argument('--MAX_WORDS_NUM', type=int, default=224253, help='total number of distinct tagids in the second-round (fusai) data')
parser.add_argument('--MAX_SEQUENCE_LENGTH', type=int, default=256)
parser.add_argument('--window', type=int, default=1)
parser.add_argument('--epochs', type=int, default=10, help='number of Word2Vec training epochs')

# ========================= Model Configs ==========================
parser.add_argument('--GRU1_hidden_size', type=int, default=128, help='hidden size of the first GRU layer')
parser.add_argument('--GRU2_hidden_size', type=int, default=256, help='hidden size of the second GRU layer')
parser.add_argument('--dropout', type=float, default=0.2, help='dropout ratio')
parser.add_argument('--nn_epochs', type=int, default=128, help='maximum number of epochs to train the network')
parser.add_argument('--batch_size', type=int, default=400, help='training batch size')


if __name__ == '__main__':
    # Print a few defaults when running config.py directly (all_code.py only imports `parser`)
    args = parser.parse_args()
    print(args.dropout)
    print(args.train_file)
    print(args.batch_size)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy==1.19.5
tensorflow_gpu==2.4.1
gensim==4.0.1
pandas==0.25.3
scikit_learn==0.24.2
--------------------------------------------------------------------------------