// ============================================================================
// 2020 Huawei CodeCraft warm-up — naive Bayes binary classifier.
// (Header translated/condensed from the original Chinese README.)
//
// Problem: a training set of N samples, each with 1000 float features and a
// 0/1 label; predict the labels of the test samples.  The score depends on
// both accuracy and run time, so the implementation trades accuracy for
// speed: only the first `feature` (400) features and `trainNum` (1580)
// training samples are used, just clearing the ~70% accuracy threshold.
//
// Approach: naive Bayes.  Taking logs turns the product
//     P(c|x) ~ P(x_1|c) * ... * P(x_k|c) * P(c)
// into a dot product, so after training the decision reduces to
//     P(1|x) - P(0|x) = Dot(x, mu[1] - mu[0]) + log P(1) - log P(0)
// and we predict 1 when that difference is positive.  mu[1] - mu[0] is
// folded into mu[0][0] once in train(), saving one dot product per sample.
//
// I/O uses mmap; training is parallelised with std::thread, prediction with
// fork()ed worker processes writing into a shared mapping of the result file.
// ============================================================================
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <utility>
#include <vector>

#define NUM_THREADS 8

using namespace std;

// ---------------------------------------------------------------- matrices --
// Thin aliases: a "matrix" is a vector of rows of floats.
struct Matrix {
    typedef vector<float> Mat1;
    typedef vector<vector<float>> Mat2;
};

// Debug helper: print a vector on one line.
void out(Matrix::Mat1 mat) {
    for (auto &x : mat) {
        cout << x << " ";
    }
    cout << "\n";
}
// Debug helper: print a matrix row by row.
void out(Matrix::Mat2 mat) {
    for (auto &x : mat) {
        out(x);
    }
}
// Dot product, manually unrolled in groups of 16 floats (one 64-byte cache
// line).  The tail loop makes it correct for any length — the original read
// out of bounds when the size was not a multiple of 16.
static float Dot(const Matrix::Mat1 &mat1, const Matrix::Mat1 &mat2) {
    int n = mat1.size();
    float ans = 0;
    int i = 0;
    for (; i + 16 <= n; i += 16) {
        for (int j = 0; j < 16; j++) {
            ans += mat1[i + j] * mat2[i + j];
        }
    }
    for (; i < n; i++) {
        ans += mat1[i] * mat2[i];
    }
    return ans;
}
// matrix * vector
static Matrix::Mat1 operator*(const Matrix::Mat2 &mat1,
                              const Matrix::Mat1 &mat2) {
    Matrix::Mat1 mat;
    mat.reserve(mat1.size());
    for (const auto &x : mat1) {
        mat.emplace_back(Dot(x, mat2));
    }
    return mat;
}
// outer product: column vector * row vector
static Matrix::Mat2 operator*(const Matrix::Mat1 &mat1,
                              const Matrix::Mat1 &mat2) {
    int n = mat1.size(), id = 0;
    Matrix::Mat2 mat(n);
    for (const auto &x : mat1) {
        mat[id].reserve(mat2.size());
        for (const auto &y : mat2) {
            mat[id].emplace_back(x * y);
        }
        id++;
    }
    return mat;
}
// transpose
static Matrix::Mat2 T(const Matrix::Mat2 &mat1) {
    int n = mat1.size(), m = mat1[0].size();
    Matrix::Mat2 mat(m, Matrix::Mat1(n));
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
            mat[j][i] = mat1[i][j];
        }
    }
    return mat;
}
// element-wise difference
static Matrix::Mat1 operator-(const Matrix::Mat1 &mat1,
                              const Matrix::Mat1 &mat2) {
    int n = mat1.size();
    Matrix::Mat1 mat(n);
    for (int i = 0; i < n; i++) {
        mat[i] = mat1[i] - mat2[i];
    }
    return mat;
}
// matrix * matrix — the rhs is transposed first so both operands are scanned
// row-wise, keeping Dot() cache friendly.
static Matrix::Mat2 operator*(const Matrix::Mat2 &mat1,
                              const Matrix::Mat2 &mat2) {
    Matrix::Mat2 mat2T = T(mat2);
    int n = mat1.size(), m = mat2[0].size();
    Matrix::Mat2 mat(n, Matrix::Mat1(m));
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < m; j++) {
            mat[i][j] = Dot(mat1[i], mat2T[j]);
        }
    }
    return mat;
}

// ----------------------------------------------------------------- globals --
char answer[40000];        // scratch buffer (result file is 2 bytes per line)
string trainFile;
string testFile;
string answerFile;
string predictOutFile;
int testLineSize = 6000;   // bytes per test line (1000 features * 6 chars)
int feature = 400;         // number of (leading) features actually used
int trainNum = 1580;       // number of training samples actually used
int predictNum = 100000;   // upper bound on test samples (kept for reference)
int featureId;             // feature * 6: bytes of a line that get parsed
float pLabel0;             // log prior of class 0 (folded away after train())
float pLabel1;             // after train(): log P(1) - log P(0)
int items;                 // floats per 64-byte cache line (blocking step)
vector<vector<float>> mu[2];      // per-thread feature sums -> log weights
vector<vector<int>> trainLabel;   // per-thread class counts
vector<vector<float>> delta;      // per-thread total feature mass per class
vector<vector<pair<int, int>>> threadStart;  // per-thread (start, end) offsets

// Lookup table pw[x][y] = x * 0.1^y so a digit character can be turned into
// its decimal contribution without any floating-point parsing.
float pw[10][5] = {0, 0, 0, 0, 0, 1, 0.1, 0.01, 0.001, 0.0001,
                   2, 0.2, 0.02, 0.002, 0.0002, 3, 0.3, 0.03, 0.003, 0.0003,
                   4, 0.4, 0.04, 0.004, 0.0004, 5, 0.5, 0.05, 0.005, 0.0005,
                   6, 0.6, 0.06, 0.006, 0.0006, 7, 0.7, 0.07, 0.007, 0.0007,
                   8, 0.8, 0.08, 0.008, 0.0008, 9, 0.9, 0.09, 0.009, 0.0009};
// Same table indexed directly by the ASCII character ('0'..'9'); every other
// character maps to 0 (zero-initialised global).
float pwChar[255][5];

// Split the mmapped training buffer into NUM_THREADS chunks of whole lines,
// recording each line's (first byte, label byte) offsets in threadStart.
void getReadId(char *buffer) {
    int now = 0, pre, threadId = 0, j = 0,
        p = (trainNum + NUM_THREADS - 1) / NUM_THREADS, circle = 0;

    for (int i = 0; i < NUM_THREADS; i++) {
        if (now + p <= trainNum) {
            threadStart.emplace_back(vector<pair<int, int>>(p));
            now += p;
        } else {
            threadStart.emplace_back(vector<pair<int, int>>(trainNum - now));
        }
    }
    now = 0;
    for (int i = 0; i < trainNum; ++i) {
        pre = now;
        now += testLineSize;  // every train line is at least this long
        while (buffer[now] != '\n') ++now;
        // now - 1 is the last char of the line, i.e. the label digit
        threadStart[threadId][j++] = make_pair(pre, now - 1);
        circle++;
        if (circle == p) {
            threadId++;
            j = 0;
            circle = 0;
        }
        now++;  // skip '\n'
    }
}

// Parse one training line: accumulate each feature value into the per-thread
// sum mu[label][pid] and the per-class mass delta[pid][label].  Values are
// fixed-width text "d.dddd," (6-byte stride, 7 with a '-' sign); `end`
// points at the label digit.
void LoadChar(char *buffer, int &pid, int &start, int &end) {
    int now = start, id = 0;
    float num = 0, sum = 0;

    int type = buffer[end] - '0';
    trainLabel[pid][type]++;
    while (id < feature) {
        if (buffer[now] == '-') {
            now++;  // skip the sign, then subtract the magnitude
            num = pwChar[buffer[now]][0] + pwChar[buffer[now + 2]][1] +
                  pwChar[buffer[now + 3]][2] + pwChar[buffer[now + 4]][3];
            mu[type][pid][id++] -= num;
            sum -= num;
        } else {
            num = pwChar[buffer[now]][0] + pwChar[buffer[now + 2]][1] +
                  pwChar[buffer[now + 3]][2] + pwChar[buffer[now + 4]][3];
            mu[type][pid][id++] += num;
            sum += num;
        }
        now += 6;  // fixed stride to the next value
    }
    delta[pid][type] += sum;
}

// Thread worker: parse every training line assigned to thread `pid`.
void threadLoadData(char *buffer, int pid) {
    for (auto &x : threadStart[pid]) {
        LoadChar(buffer, pid, x.first, x.second);
    }
}

// mmap the training file and parse it with NUM_THREADS threads.
void loadTrainData() {
    struct stat sb;
    int fd = open(trainFile.c_str(), O_RDONLY);
    fstat(fd, &sb);
    char *buffer =
        (char *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    getReadId(buffer);
    vector<thread> td(NUM_THREADS);
    for (int i = 0; i < NUM_THREADS; i++) {
        td[i] = thread(&threadLoadData, buffer, i);
    }
    for (auto &t : td) {
        t.join();
    }
    close(fd);
}

// Predict lines [start, end) of the mmapped test buffer, writing '0'/'1' and
// '\n' (2 bytes per line) straight into a shared mapping of the result file.
// Runs inside fork()ed workers, hence MAP_SHARED.
// NOTE(review): '-' signs are not handled here — pwChar of non-digit bytes is
// 0, so negative test values contribute only their digit magnitudes; this
// mirrors the original speed/accuracy trade-off and is kept as-is.
void threadPredict(char *buffer, int pid, int start, int end, int lineSize) {
    int id, initId = start * lineSize, nowId, up, j, k, r;
    float sum;
    int fd = open(predictOutFile.c_str(), O_RDWR | O_CREAT, 0666);
    // `res` (was `answer`) no longer shadows the global answer[] buffer.
    char *res =
        (char *)mmap(NULL, 40000, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd);

    for (int i = start; i < end; i += items) {
        up = items;
        if (i + items > end) up = end - i;
        for (k = 0; k < up; k++) {
            id = 0;
            sum = pLabel1;  // log P(1) - log P(0), set in train()
            for (j = 0; j < featureId; j += 60) {
                nowId = initId + j;
                for (r = 0; r < 60; r += 6) {
                    // Only 3 decimals are read (one fewer than training, for
                    // speed); mu[0][0] already holds mu[1] - mu[0].
                    sum += (pwChar[buffer[nowId + r]][0] +
                            pwChar[buffer[nowId + r + 2]][1] +
                            pwChar[buffer[nowId + r + 3]][2]) *
                           mu[0][0][id];
                    ++id;
                }
            }
            res[(i + k) << 1 | 1] = '\n';
            res[(i + k) << 1] = sum > 0 ? '1' : '0';
            initId += lineSize;
        }
    }
    munmap(res, 40000);
}

// Worker entry: mmap the test file and predict this process's share of lines.
void loadTestData(const string &file, int &lineSize, int pid) {
    struct stat sb;
    int fd = open(file.c_str(), O_RDONLY);
    fstat(fd, &sb);
    char *buffer =
        (char *)mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    close(fd);
    int linenum = sb.st_size / lineSize;
    int pre = 0, line = (linenum + NUM_THREADS - 1) / NUM_THREADS;
    pre = line * pid;

    threadPredict(buffer, pid, pre, min(pre + line, linenum), lineSize);
}

// Reduce the per-thread sums, apply +1 smoothing, take logs, and collapse
// everything into mu[0][0] = log-weights(1) - log-weights(0) and
// pLabel1 = log P(1) - log P(0), so prediction is a single dot product.
void train() {
    for (int i = 1; i < NUM_THREADS; i++) {
        trainLabel[0][0] += trainLabel[i][0];
        delta[0][0] += delta[i][0];
        delta[0][1] += delta[i][1];
    }
    delta[0][0] += 1;  // smoothing: avoid log(0)/division by zero
    delta[0][1] += 1;

    pLabel0 = 1.0 * trainLabel[0][0] / trainNum;
    pLabel1 = 1.0 - pLabel0;
    pLabel0 = log(pLabel0);
    pLabel1 = log(pLabel1);

    float al0 = log(1.0 / delta[0][0]), al1 = log(1.0 / delta[0][1]);
    int nowId;
    // Blocked by `items` (one cache line of floats); feature is a multiple
    // of items here (400 / 16 = 25 blocks).
    for (int i = 0; i < feature; i += items) {
        for (int k = 0; k < items; k++) {
            nowId = i + k;
            for (int j = 1; j < NUM_THREADS; j++) {
                mu[0][0][nowId] += mu[0][j][nowId];
                mu[1][0][nowId] += mu[1][j][nowId];
            }
            // log((sum + 1) / delta) — the same +1 smoothing as above
            mu[0][0][nowId] = log(mu[0][0][nowId] + 1) + al0;
            mu[1][0][nowId] = log(mu[1][0][nowId] + 1) + al1;
        }
    }
    mu[0][0] = mu[1][0] - mu[0][0];  // element-wise operator- defined above
    pLabel1 -= pLabel0;
}

// Offline scoring helper: compare result.txt against answer.txt and print
// the accuracy.  (Fixed: the original `while (fin)` loop duplicated the
// last value after EOF; reading in the loop condition avoids that.)
void judge() {
    vector<int> answer, result;
    int x, cor = 0;
    ifstream fin(answerFile);
    while (fin >> x) {
        answer.emplace_back(x);
    }
    fin.close();
    ifstream fin2(predictOutFile);
    while (fin2 >> x) {
        result.emplace_back(x);
    }
    fin2.close();
    size_t n = min(answer.size(), result.size());
    for (size_t i = 0; i < n; i++) {
        if (answer[i] == result[i]) cor++;
    }
    if (!answer.empty()) {
        cout << "准确率: " << 1.0 * cor / answer.size() << "\n";
    }
}

// Allocate per-thread accumulators, pre-size the result file, build the
// char-indexed digit table, then load the training data and train.
void init() {
    items = 64 / sizeof(float);  // 16 floats per 64-byte cache line
    for (int i = 0; i < NUM_THREADS; i++) {
        mu[0].emplace_back(vector<float>(feature, 0));
        mu[1].emplace_back(vector<float>(feature, 0));
        delta.emplace_back(vector<float>(2, 0));
        trainLabel.emplace_back(vector<int>(2, 0));
    }
    // Pre-fill the result file with spaces so the fork()ed workers can mmap
    // a fixed-size file and write their slices in place.
    FILE *fd = fopen(predictOutFile.c_str(), "w");
    for (int i = 0; i < 40000; i += 4) {
        char ch[4] = {' ', ' ', ' ', ' '};
        fwrite(ch, 4, 1, fd);
    }
    fclose(fd);
    for (int i = '0'; i <= '9'; i++) {
        for (int j = 0; j < 5; j++) {
            pwChar[i][j] = pw[i - '0'][j];
        }
    }
    featureId = feature * 6;

    loadTrainData();
    train();
}

// Record the file paths and run the load + train pipeline.
void Bayes(string trainF, string testF, string predictOutF, string answerF) {
    trainFile = trainF;
    testFile = testF;
    predictOutFile = predictOutF;
    answerFile = answerF;
    init();
}

int main(int argc, char *argv[]) {
    string trainFile = "../data/train_data.txt";
    string testFile = "../data/test_data.txt";
    string predictFile = "../data/result.txt";
    string answerFile = "../data/answer.txt";

    // string trainFile = "/data/train_data.txt";
    // string testFile = "/data/test_data.txt";
    // string predictFile = "/projects/student/result.txt";
    // string answerFile = "/projects/student/answer.txt";

    Bayes(trainFile, testFile, predictFile, answerFile);

    // Fork 7 children after training; each of the 8 processes predicts an
    // equal slice of the test set.  Child i sees fk[i] == 0; the parent
    // keeps every fk[i] nonzero and therefore ends up with pid 0.
    pid_t fk[8] = {0, 0, 0, 0, 0, 0, 0, 0};

    fk[1] = fork();
    if (fk[1]) fk[2] = fork();
    if (fk[2]) fk[3] = fork();
    if (fk[3]) fk[4] = fork();
    if (fk[4]) fk[5] = fork();
    if (fk[5]) fk[6] = fork();
    if (fk[6]) fk[7] = fork();

    int pid = 0;
    for (int i = 1; i <= 7; i++) {
        if (!fk[i]) {
            pid = i;
            break;
        }
    }

    if (pid <= 7) {  // always true: every process predicts, then exits
        loadTestData(testFile, testLineSize, pid);
        exit(0);
    }
    // judge();
    return 0;
}