├── .gitignore ├── CMakeLists.txt ├── README.md └── main.cu /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.12) 2 | project(tree_lattice C) 3 | 4 | set(CMAKE_C_STANDARD 11) 5 | 6 | include(CheckLanguage) 7 | check_language(CUDA) 8 | if(CMAKE_CUDA_COMPILER) 9 | enable_language(CUDA) 10 | message(STATUS "CUDA support") 11 | else() 12 | message(STATUS "No CUDA support") 13 | endif() 14 | 15 | add_executable(tree_lattice main.cu) 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # seed-reversal-gpu 2 | GPU code, the first stage in the pipeline to reverse the seed of pack.png 3 | 4 | This repository is part of the effort to reverse the seed of the pack.png image. 
/*
 * (README.md, final line, preserved from the repository dump:)
 * For more information on how this code works, please refer to the relevant
 * section in https://bit.ly/packpng-details
 *
 * ---------------------------------------------------------------------------
 * main.cu
 * ---------------------------------------------------------------------------
 * Two-stage GPU search:
 *   1. doPreWork enumerates lattice points and keeps the RNG states whose next
 *      three draws produce the reference tree (TREE_X, TREE_Z, TREE_HEIGHT).
 *   2. doWork rewinds each kept state by every allowed amount and checks the
 *      remaining trees and the waterfall; survivors are written to
 *      chunk_seeds.txt by the host.
 */

// IDE indexing
#ifdef __JETBRAINS_IDE__
#define __host__
#define __device__
#define __shared__
#define __constant__
#define __global__
#define __CUDACC__
// NOTE(review): the header name on the next line was lost in the copy of the
// file this was reconstructed from; <device_functions.h> is the usual entry in
// this IDE-indexing snippet - confirm against the repository.
#include <device_functions.h>
#include <__clang_cuda_builtin_vars.h>
#include <__clang_cuda_intrinsics.h>
#include <__clang_cuda_math_forward_declares.h>
#include <__clang_cuda_complex_builtins.h>
#include <__clang_cuda_cmath.h>
#endif

// NOTE(review): the original include targets were lost as well. The list below
// is exactly what the code in this file calls into:
//   ctype.h (isdigit), stdint.h (fixed-width ints), stdio.h (printf/fopen),
//   stdlib.h (malloc/free/exit/atoi), string.h (memset), time.h (clock).
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>


#define signed_seed_t int64_t
#define uint uint32_t
#define ulong uint64_t
// let's be EVIL (and make sure all includes come before this)
#define int int32_t

#undef JRAND_DOUBLE

#define RANDOM_MULTIPLIER_LONG 0x5DEECE66DULL

#ifdef JRAND_DOUBLE
// NOTE(review): this branch is disabled by the #undef above and does not define
// RANDOM_MASK, which get_random(), advance() and doWork() all use - it cannot
// compile as-is. Kept verbatim for reference.
#define Random double
#define RANDOM_MULTIPLIER 0x5DEECE66Dp-48
#define RANDOM_ADDEND 0xBp-48
#define RANDOM_SCALE 0x1p-48
// should be signed with int32_t (to verify)
inline uint __host__ __device__ random_next(Random *random, int bits) {
    *random = trunc((*random * RANDOM_MULTIPLIER + RANDOM_ADDEND) * RANDOM_SCALE);
    return (uint)((ulong)(*random / RANDOM_SCALE) >> (48 - bits));
}

#else

#define Random ulong
#define RANDOM_MULTIPLIER RANDOM_MULTIPLIER_LONG
#define RANDOM_ADDEND 0xBULL
// Fully parenthesised so the macro is safe next to any operator (the original
// expansion `(1ULL << 48) - 1` only worked because `-` binds tighter than `&`).
#define RANDOM_MASK ((1ULL << 48) - 1)
#define RANDOM_SCALE 1

#define FAST_NEXT_INT

// Random::next(bits)
// Steps the 48-bit LCG state in place and returns its top `bits` bits.
__host__ __device__ inline uint random_next(Random *random, int bits) {
    *random = (*random * RANDOM_MULTIPLIER + RANDOM_ADDEND) & RANDOM_MASK;
    return (uint)(*random >> (48 - bits));
}
#endif // ~JRAND_DOUBLE

// new Random(seed)
#define get_random(seed) ((Random)(((seed) ^ RANDOM_MULTIPLIER_LONG) & RANDOM_MASK))
#define get_random_unseeded(state) ((Random) ((state) * RANDOM_SCALE))

// Random::nextInt(bound)
// Draws 31 bits and reduces them to [0, bound).
//   - power-of-two bound: multiply-and-shift, as in the reference algorithm
//   - otherwise, with FAST_NEXT_INT: a plain modulo (no rejection loop)
//   - otherwise: the rejection loop, retrying while the draw falls in the
//     biased tail of the 31-bit range
__host__ __device__ inline uint random_next_int(Random *random, uint bound) {
    int r = random_next(random, 31);
    int m = bound - 1;
    if ((bound & m) == 0) {
        // Could probably use __mul64hi here
        r = (uint)((bound * (ulong)r) >> 31);
    } else {
#ifdef FAST_NEXT_INT
        r %= bound;
#else
        // The rejection test `u - r + m < 0` depends on 32-bit wrap-around.
        // Signed overflow is undefined in C/C++, so do the arithmetic in
        // unsigned and reinterpret the result as signed for the sign test.
        for (int u = r;
             (r = u % bound, (int) ((uint) u - (uint) r + (uint) m) < 0);
             u = random_next(random, 31));
#endif
    }
    return r;
}

// Random::nextLong()
// Combines two 32-bit draws: the first is the high word, the second is added
// as a signed 32-bit value. The two draws are sequenced explicitly because the
// operands of `+` may be evaluated in either order, and both calls mutate
// *random.
__host__ __device__ inline int64_t random_next_long (Random *random) {
    const uint64_t high = random_next(random, 32);
    const int32_t low = (int32_t) random_next(random, 32);
    return (int64_t) ((high << 32) + (uint64_t) (int64_t) low);
}

#define CHECK_GPU_ERR(code) gpuAssert((code), __FILE__, __LINE__)
// Prints the CUDA error together with its source location and terminates the
// process (using the error code as exit status) when `code` is not cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line) {
    if (code == cudaSuccess)
        return;
    fprintf(stderr, "GPUassert: %s (code %d) %s %d\n", cudaGetErrorString(code), code, file, line);
    exit(code);
}

// advance
// rand = (rand * multiplier + addend) mod 2^48, in place.
// NOTE(review): the numeric suffix is presumably the number of LCG steps each
// constant pair combines ("m" = backwards); the constants were not re-derived
// in this review.
#define advance(rand, multiplier, addend) ((rand) = ((rand) * (multiplier) + (addend)) & RANDOM_MASK)
#define advance_830(rand) advance(rand, 0x859D39E832D9LL, 0xE3E2DF5E9196LL)
#define advance_774(rand) advance(rand, 0xF8D900133F9LL, 0x5738CAC2F85ELL)
#define advance_387(rand) advance(rand, 0x5FE2BCEF32B5LL, 0xB072B3BF0CBDLL)
#define advance_16(rand) advance(rand, 0x6DC260740241LL, 0xD0352014D90LL)
#define advance_m1(rand) advance(rand, 0xDFE05BCB1365LL, 0x615C0E462AA9LL)
#define advance_m3759(rand) advance(rand, 0x63A9985BE4ADLL, 0xA9AA8DA9BC9BLL)



#define TREE_X 4
#define TREE_Z 3
#define TREE_HEIGHT 6

#define OTHER_TREE_COUNT 3
// Returns the height required for a tree at chunk-local (x, z), or 0 when no
// tree is expected at that position. The first entry is the reference tree,
// the other OTHER_TREE_COUNT entries are the additional trees.
__device__ inline int getTreeHeight(int x, int z) {
    if (x == TREE_X && z == TREE_Z) return TREE_HEIGHT;
    if (x == 1 && z == 13) return 5;
    if (x == 6 && z == 12) return 6;
    if (x == 14 && z == 7) return 5;
    return 0;
}

#define WATERFALL_X 9
#define WATERFALL_Y 76
#define WATERFALL_Z 1



#define MODULUS (1LL << 48)
#define SQUARE_SIDE (MODULUS / 16)
#define X_TRANSLATE 0
#define Z_TRANSLATE 11
#define L00 7847617LL
#define L01 (-18218081LL)
#define L10 4824621LL
#define L11 24667315LL
#define LI00 (24667315.0 / 16)
#define LI01 (18218081.0 / 16)
#define LI10 (-4824621.0 / 16)
#define LI11 (7847617.0 / 16)

#define CONST_MIN(a, b) ((a) < (b) ? (a) : (b))
#define CONST_MIN4(a, b, c, d) CONST_MIN(CONST_MIN(a, b), CONST_MIN(c, d))
#define CONST_MAX(a, b) ((a) > (b) ? (a) : (b))
#define CONST_MAX4(a, b, c, d) CONST_MAX(CONST_MAX(a, b), CONST_MAX(c, d))
#define CONST_FLOOR(x) ((x) < (signed_seed_t) (x) ? (signed_seed_t) (x) - 1 : (signed_seed_t) (x))
#define CONST_CEIL(x) ((x) == (signed_seed_t) (x) ? (signed_seed_t) (x) : CONST_FLOOR((x) + 1))
#define CONST_LOWER(x, m, c) ((m) < 0 ? ((x) + 1 - (double) (c) / MODULUS) * (m) : ((x) - (double) (c) / MODULUS) * (m))
#define CONST_UPPER(x, m, c) ((m) < 0 ? ((x) - (double) (c) / MODULUS) * (m) : ((x) + 1 - (double) (c) / MODULUS) * (m))

// for a parallelogram ABCD https://media.discordapp.net/attachments/668607204009574411/671018577561649163/unknown.png
#define B_X LI00
#define B_Z LI10
#define C_X (LI00 + LI01)
#define C_Z (LI10 + LI11)
#define D_X LI01
#define D_Z LI11
#define LOWER_X CONST_MIN4(0, B_X, C_X, D_X)
#define LOWER_Z CONST_MIN4(0, B_Z, C_Z, D_Z)
#define UPPER_X CONST_MAX4(0, B_X, C_X, D_X)
#define UPPER_Z CONST_MAX4(0, B_Z, C_Z, D_Z)
#define ORIG_SIZE_X (UPPER_X - LOWER_X + 1)
#define SIZE_X CONST_CEIL(ORIG_SIZE_X - D_X)
#define SIZE_Z CONST_CEIL(UPPER_Z - LOWER_Z + 1)
#define TOTAL_WORK_SIZE (SIZE_X * SIZE_Z)

#define MAX_TREE_ATTEMPTS 12
#define MAX_TREE_SEARCH_BACK (3 * MAX_TREE_ATTEMPTS - 3 + 16 * OTHER_TREE_COUNT)

// Per-device rewind table, filled by calculate_search_backs(). Only the first
// `search_back_count` entries are uploaded.
__constant__ ulong search_back_multipliers[MAX_TREE_SEARCH_BACK + 1];
__constant__ ulong search_back_addends[MAX_TREE_SEARCH_BACK + 1];
int search_back_count;

#define WORK_UNIT_SIZE (1LL << 23)
#define BLOCK_SIZE 256
// One thread per lattice point of a work unit.
#define GRID_SIZE ((uint) (WORK_UNIT_SIZE / BLOCK_SIZE))

// Size of each GPU's result buffer (approx 1MB) and the number of seeds it holds.
#define SEEDS_BUFFER_BYTES (1LL << 20)
#define MAX_SEEDS_PER_UNIT ((int) (SEEDS_BUFFER_BYTES / sizeof(ulong)))

// Stage 1: one thread per lattice point.
//
// Launch layout: 1-D grid, 1-D blocks, at most WORK_UNIT_SIZE threads in total
// (each thread appends at most one entry to `starts`, which holds
// WORK_UNIT_SIZE elements).
//
//   offset     - index of the first lattice point of this work unit
//   starts     - out: RNG states (rewound by one extra step) whose next three
//                draws give TREE_X, TREE_Z and TREE_HEIGHT
//   num_starts - in/out: number of entries in `starts`; must be 0 at launch
//
// NOTE(review): there is no guard for `offset + global_id >= TOTAL_WORK_SIZE`,
// so the final work unit (and surplus GPUs) also evaluate points past the end
// of the enumerated rectangle. Left as in the original - confirm whether those
// points should be skipped.
__global__ void doPreWork(ulong offset, Random* starts, int* num_starts) {
    // lattice tree position
    const ulong global_id = (ulong) blockIdx.x * blockDim.x + threadIdx.x;
    const ulong point = offset + global_id;

    signed_seed_t lattice_x = (signed_seed_t) (point % SIZE_X) + LOWER_X;
    signed_seed_t lattice_z = (signed_seed_t) (point / SIZE_X) + LOWER_Z;
    // Fold the enumerated rectangle back onto the parallelogram spanned by B and D.
    lattice_z += (B_X * lattice_z < B_Z * lattice_x) * SIZE_Z;
    if (D_X * lattice_z > D_Z * lattice_x) {
        lattice_x += B_X;
        lattice_z += B_Z;
    }
    lattice_x += (signed_seed_t) (TREE_X * LI00 + TREE_Z * LI01);
    lattice_z += (signed_seed_t) (TREE_X * LI10 + TREE_Z * LI11);

    // The remainder may be negative; every later step masks to 48 bits, which
    // maps it to the equivalent non-negative state.
    Random rand = (Random) ((lattice_x * L00 + lattice_z * L01 + X_TRANSLATE) % MODULUS);
    advance_m1(rand);

    Random tree_start = rand;
    advance_m1(tree_start);

    // `&=` (not `&&`) so that all three draws always happen.
    bool res = random_next(&rand, 4) == TREE_X;
    res &= random_next(&rand, 4) == TREE_Z;
    res &= random_next_int(&rand, 3) == (ulong) (TREE_HEIGHT - 4);

    if (res) {
        int index = atomicAdd(num_starts, 1);
        starts[index] = tree_start;
    }
}

// Stage 2: for every state kept by doPreWork, try each rewind amount and test
// the remaining trees and the waterfall.
//
// Launch layout: 1-D grid, 1-D blocks; any size works because every thread
// strides over `tree_starts` by the total thread count.
//
//   num_starts            - number of valid entries in `tree_starts`
//   tree_starts           - states produced by doPreWork
//   num_seeds             - in/out: number of matches found; must be 0 at
//                           launch. May end up larger than MAX_SEEDS_PER_UNIT,
//                           in which case only the first MAX_SEEDS_PER_UNIT
//                           were stored.
//   seeds                 - out: matching states, rewound by advance_m3759
//   gpu_search_back_count - number of valid entries in the rewind table
__global__ void doWork(int* num_starts, Random* tree_starts, int* num_seeds, ulong* seeds, int gpu_search_back_count) {
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < *num_starts; i += blockDim.x * gridDim.x) {
        const Random tree_start = tree_starts[i];

        // Fix: the table holds exactly gpu_search_back_count entries; the
        // original `<=` also processed the never-uploaded entry after them.
        for (int treeBackCalls = 0; treeBackCalls < gpu_search_back_count; treeBackCalls++) {
            const Random start = (tree_start * search_back_multipliers[treeBackCalls] + search_back_addends[treeBackCalls]) & RANDOM_MASK;
            Random rand = start;

            if (random_next_int(&rand, 10) == 0)
                continue;

            // One bit per chunk-local (x, z): set once a wanted tree was matched there.
            unsigned char generated_tree[16][2] = {};

            int treesMatched = 0;
            bool any_population_matches = false;
            for (int treeAttempt = 0; treeAttempt <= MAX_TREE_ATTEMPTS; treeAttempt++) {
                int treeX = random_next(&rand, 4);
                int treeZ = random_next(&rand, 4);
                int wantedTreeHeight = getTreeHeight(treeX, treeZ);
                int treeHeight = random_next_int(&rand, 3) + 4;

                unsigned char& boolpack = generated_tree[treeX][treeZ / 8];
                const unsigned char mask = (unsigned char) (1 << (treeZ % 8));

                if (treeHeight == wantedTreeHeight && !(boolpack & mask)) {
                    treesMatched++;
                    boolpack |= mask;
                    advance_16(rand);
                }

                if (treesMatched == OTHER_TREE_COUNT + 1) {
                    // All trees are in place: run the rest of the population
                    // on a scratch copy, then restore the state so later tree
                    // attempts continue from here.
                    const Random before_rest = rand;
                    // yellow flowers
                    advance_774(rand);
                    // red flowers
                    if (random_next(&rand, 1) == 0) {
                        advance_387(rand);
                    }
                    // brown mushroom
                    if (random_next(&rand, 2) == 0) {
                        advance_387(rand);
                    }
                    // red mushroom
                    if (random_next(&rand, 3) == 0) {
                        advance_387(rand);
                    }
                    // reeds
                    advance_830(rand);
                    // pumpkins
                    if (random_next(&rand, 5) == 0) {
                        advance_387(rand);
                    }

                    for (int attempt = 0; attempt < 50; attempt++) {
                        // `&=` (not `&&`): every attempt must consume the same draws.
                        bool waterfall_matches = random_next(&rand, 4) == WATERFALL_X;
                        waterfall_matches &= random_next_int(&rand, random_next_int(&rand, 120) + 8) == WATERFALL_Y;
                        waterfall_matches &= random_next(&rand, 4) == WATERFALL_Z;
                        any_population_matches |= waterfall_matches;
                    }
                    rand = before_rest;
                }
            }

            if (any_population_matches) {
                Random start_chunk_rand = start;
                advance_m3759(start_chunk_rand);

                // Keep counting past the end so the host can detect overflow,
                // but never write outside the buffer.
                int index = atomicAdd(num_seeds, 1);
                if (index < MAX_SEEDS_PER_UNIT)
                    seeds[index] = start_chunk_rand;
            }
        }
    }
}

// Per-GPU working set. All pointers are managed allocations made by
// setup_gpu_node().
struct GPU_Node {
    int GPU;                // CUDA device ordinal
    int* num_seeds;         // number of matches reported by doWork
    ulong* seeds;           // SEEDS_BUFFER_BYTES bytes of results
    int* num_tree_starts;   // number of states reported by doPreWork
    Random* tree_starts;    // WORK_UNIT_SIZE states
};

// Selects device `gpu` and allocates the buffers of `node` on it. Terminates
// the process (via CHECK_GPU_ERR) on any CUDA error.
void setup_gpu_node(GPU_Node* node, int gpu) {
    CHECK_GPU_ERR(cudaSetDevice(gpu));
    node->GPU = gpu;
    CHECK_GPU_ERR(cudaMallocManaged(&node->num_seeds, sizeof(*node->num_seeds)));
    CHECK_GPU_ERR(cudaMallocManaged(&node->seeds, SEEDS_BUFFER_BYTES)); // approx 1MB
    CHECK_GPU_ERR(cudaMallocManaged(&node->num_tree_starts, sizeof(*node->num_tree_starts)));
    CHECK_GPU_ERR(cudaMallocManaged(&node->tree_starts, (sizeof(Random) * WORK_UNIT_SIZE)));
}


// Builds the table of rewind amounts doWork has to try and uploads it to the
// constant memory of devices 0 .. GPU_COUNT-1. Also sets the host-side
// `search_back_count`.
//
// A rewind of k steps is allowed when k = 3*a + 19*b with
// 0 <= a <= MAX_TREE_ATTEMPTS - OTHER_TREE_COUNT - 1 and
// 0 <= b <= OTHER_TREE_COUNT. For each allowed k the table stores the
// (multiplier, addend) pair obtained by composing k advance_m1 steps.
// NOTE(review): 3 and 19 presumably are the draws consumed by a failed and a
// successful tree attempt (3 draws + advance_16) - confirm.
void calculate_search_backs(int GPU_COUNT) {
    bool allow_search_back[MAX_TREE_SEARCH_BACK + 1];
    memset(allow_search_back, false, sizeof(allow_search_back));

    for (int i = 0; i <= MAX_TREE_ATTEMPTS - OTHER_TREE_COUNT - 1; i++) {
        allow_search_back[i * 3] = true;
    }

    // Each pass allows one more 19-step rewind on top of what is already allowed.
    for (int tree = 0; tree < OTHER_TREE_COUNT; tree++) {
        for (int i = 0; i <= MAX_TREE_SEARCH_BACK - 19; i++) {
            if (allow_search_back[i])
                allow_search_back[i + 19] = true;
        }
    }

    search_back_count = 0;
    ulong multiplier = 1;
    ulong addend = 0;
    ulong multipliers[MAX_TREE_SEARCH_BACK + 1];
    ulong addends[MAX_TREE_SEARCH_BACK + 1];
    for (int i = 0; i <= MAX_TREE_SEARCH_BACK; i++) {
        if (allow_search_back[i]) {
            int index = search_back_count++;
            multipliers[index] = multiplier;
            addends[index] = addend;
        }
        // Compose one more advance_m1 step into (multiplier, addend).
        multiplier = (multiplier * 0xDFE05BCB1365LL) & RANDOM_MASK;
        addend = (0xDFE05BCB1365LL * addend + 0x615C0E462AA9LL) & RANDOM_MASK;
    }

    for (int gpu = 0; gpu < GPU_COUNT; gpu++) {
        CHECK_GPU_ERR(cudaSetDevice(gpu));
        CHECK_GPU_ERR(cudaMemcpyToSymbol(search_back_multipliers, multipliers, search_back_count * sizeof(*multipliers)));
        CHECK_GPU_ERR(cudaMemcpyToSymbol(search_back_addends, addends, search_back_count * sizeof(*addends)));
    }
}


// Entry point.
//
// Usage: tree_lattice [-gN]     N = number of GPUs to use (default 1)
//
// Splits the TOTAL_WORK_SIZE lattice points into work units of WORK_UNIT_SIZE,
// hands one unit per GPU per iteration to doPreWork/doWork, appends every
// match to chunk_seeds.txt and prints progress after each iteration.
// Returns 0 on success and -1 on a usage or setup error.
#undef int
int main(int argc, char *argv[]) {
#define int int32_t
    int GPU_COUNT = 1;
    for (int i = 1; i < argc; i++) {
        if (argv[i][0] == '-') {
            switch (argv[i][1]) {
                case 'g':
                    // isdigit() requires a value representable as unsigned char.
                    if (isdigit((unsigned char) argv[i][2])) GPU_COUNT = atoi(argv[i] + 2);
                    break;
                default:
                    printf("Error: Flag not recognized.");
                    return -1;
            }
        } else {
            printf("Error: Please specify flag before argument.");
            return -1;
        }
    }

    // GPU_COUNT == 0 would never advance `offset` (infinite loop) and a count
    // above the number of devices would only fail later in cudaSetDevice.
    int device_count = 0;
    CHECK_GPU_ERR(cudaGetDeviceCount(&device_count));
    if (GPU_COUNT < 1 || GPU_COUNT > device_count) {
        fprintf(stderr, "Error: Requested %d GPU(s) but %d CUDA device(s) are available.\n", GPU_COUNT, device_count);
        return -1;
    }

    GPU_Node *nodes = (GPU_Node*)malloc(sizeof(GPU_Node) * GPU_COUNT);
    if (nodes == NULL) {
        fprintf(stderr, "Error: Out of memory.\n");
        return -1;
    }

    const ulong total_work = (ulong) TOTAL_WORK_SIZE;
    printf("Searching %lld total seeds...\n", (long long) total_work);

    calculate_search_backs(GPU_COUNT);

    FILE* out_file = fopen("chunk_seeds.txt", "w");
    if (out_file == NULL) {
        perror("chunk_seeds.txt");
        free(nodes);
        return -1;
    }

    for (int i = 0; i < GPU_COUNT; i++) {
        setup_gpu_node(&nodes[i], i);
    }


    ulong count = 0;
    clock_t lastIteration = clock();
    clock_t startTime = clock();
    for (ulong offset = 0; offset < total_work;) {

        // Stage 1 on every GPU; each one gets the next work unit.
        // NOTE(review): the launch configurations were lost in the copy of the
        // file this was reconstructed from. GRID_SIZE x BLOCK_SIZE gives
        // exactly WORK_UNIT_SIZE threads, which is what the size of
        // `tree_starts` and the `offset += WORK_UNIT_SIZE` step require.
        for (int gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) {
            CHECK_GPU_ERR(cudaSetDevice(gpu_index));

            *nodes[gpu_index].num_tree_starts = 0;
            doPreWork <<<GRID_SIZE, BLOCK_SIZE>>> (offset, nodes[gpu_index].tree_starts, nodes[gpu_index].num_tree_starts);
            CHECK_GPU_ERR(cudaGetLastError()); // launch errors are not reported by the launch itself
            offset += WORK_UNIT_SIZE;
        }

        for (int gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) {
            CHECK_GPU_ERR(cudaSetDevice(gpu_index));
            CHECK_GPU_ERR(cudaDeviceSynchronize());
        }

        // Stage 2 on every GPU.
        for (int gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) {
            CHECK_GPU_ERR(cudaSetDevice(gpu_index));

            *nodes[gpu_index].num_seeds = 0;
            doWork <<<GRID_SIZE, BLOCK_SIZE>>> (nodes[gpu_index].num_tree_starts, nodes[gpu_index].tree_starts, nodes[gpu_index].num_seeds, nodes[gpu_index].seeds, search_back_count);
            CHECK_GPU_ERR(cudaGetLastError());
        }

        for (int gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) {
            CHECK_GPU_ERR(cudaSetDevice(gpu_index));
            CHECK_GPU_ERR(cudaDeviceSynchronize());

            int found = *nodes[gpu_index].num_seeds;
            if (found > MAX_SEEDS_PER_UNIT) {
                fprintf(stderr, "Warning: GPU %d found %d seeds but only %d fit in the result buffer; the rest were dropped.\n", gpu_index, found, MAX_SEEDS_PER_UNIT);
                found = MAX_SEEDS_PER_UNIT;
            }
            for (int i = 0; i < found; i++) {
                fprintf(out_file, "%lld\n", (long long) nodes[gpu_index].seeds[i]);
            }
            fflush(out_file);
            count += found;
        }

        double iterationTime = (double)(clock() - lastIteration) / CLOCKS_PER_SEC;
        double timeElapsed = (double)(clock() - startTime) / CLOCKS_PER_SEC;
        lastIteration = clock();
        // `offset` already includes this iteration's work units (the original
        // added them a second time); clamp because the last unit overshoots.
        ulong numSearched = offset < total_work ? offset : total_work;
        double speed = (double)WORK_UNIT_SIZE * GPU_COUNT / (double)iterationTime / 1000000.0;
        double progress = (double)numSearched / (double)total_work * 100.0;
        double estimatedTime = (double)(total_work - numSearched) / (double) (WORK_UNIT_SIZE * GPU_COUNT) * iterationTime;
        char suffix = 's';
        if (estimatedTime >= 3600) {
            suffix = 'h';
            estimatedTime /= 3600.0;
        } else if (estimatedTime >= 60) {
            suffix = 'm';
            estimatedTime /= 60.0;
        }
        if (progress >= 100.0) {
            estimatedTime = 0.0;
            suffix = 's';
        }
        printf("Searched: %lld seeds. Found: %lld matches. Uptime: %.1fs. Speed: %.2fm seeds/s. Completion: %.3f%%. ETA: %.1f%c.\n", (long long) numSearched, (long long) count, timeElapsed, speed, progress, estimatedTime, suffix);

    }

    fclose(out_file);

    for (int i = 0; i < GPU_COUNT; i++) {
        CHECK_GPU_ERR(cudaSetDevice(i));
        CHECK_GPU_ERR(cudaFree(nodes[i].num_seeds));
        CHECK_GPU_ERR(cudaFree(nodes[i].seeds));
        CHECK_GPU_ERR(cudaFree(nodes[i].num_tree_starts));
        CHECK_GPU_ERR(cudaFree(nodes[i].tree_starts));
    }
    free(nodes);

    return 0;
}