├── README.md └── pwned-passwords-sampler.c /README.md: -------------------------------------------------------------------------------- 1 | Generate representative samples from Pwned Passwords (HIBP) 2 | =========================================================== 3 | 4 | This program generates representative samples from Pwned Passwords (HIBP), 5 | taking the count fields into account. 6 | 7 | To use it, you need a file such as pwned-passwords-ntlm-ordered-by-hash-v8.txt 8 | from https://haveibeenpwned.com/Passwords 9 | 10 | Compile and invoke the program on Linux as follows: 11 | 12 | ```shell 13 | $ gcc pwned-passwords-sampler.c -o pwned-passwords-sampler -O2 -s -Wall 14 | $ ./pwned-passwords-sampler < pwned-passwords-ntlm-ordered-by-hash-v8.txt > pp-sample 15 | Total 5579399834 16 | $ wc -l pp-sample 17 | 1000000 pp-sample 18 | ``` 19 | 20 | With everything already optimally cached in RAM, this takes under 1 minute. 21 | 22 | The input file is expected to use CRLF linefeeds exactly as provided by HIBP, 23 | whereas the output has LF-only linefeeds. 24 | 25 | You need to be on a 64-bit system with at least 48 GB RAM, preferably 72+ GB. 26 | Usage on non-Linux might require minor changes to the code. 27 | -------------------------------------------------------------------------------- /pwned-passwords-sampler.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021,2022 by Solar Designer 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted. 6 | * 7 | * There's ABSOLUTELY NO WARRANTY, express or implied. 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #define HEX 32 /* 32 hex digits is right for NTLM */ 20 | #define MAXI 5579399834ULL /* exact figure for HIBP v8, any larger will do */ 21 | #define SAMPLE 1000000 22 | 23 | int main(void) { 24 | struct stat st; 25 | if (fstat(0, &st)) { 26 | perror("fstat"); 27 | return 1; 28 | } 29 | 30 | char *p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, 0, 0); 31 | if (p == MAP_FAILED) { 32 | perror("mmap"); 33 | return 1; 34 | } 35 | 36 | if (madvise(p, st.st_size, MADV_SEQUENTIAL | MADV_WILLNEED)) { 37 | perror("madvise"); 38 | return 1; 39 | } 40 | 41 | const uint64_t as = MAXI * sizeof(uint64_t); 42 | assert(as == (size_t)as); /* fail on 32-bit */ 43 | uint64_t * const i2o = malloc(as); 44 | 45 | const char *pp = p; 46 | uint64_t *ip = i2o; 47 | unsigned long long total = 0; 48 | while (pp + (HEX + 4) <= p + st.st_size) { 49 | if (pp[HEX] != ':') { 50 | bad_file: 51 | fprintf(stderr, "\rInput file format error\n"); 52 | return 1; 53 | } 54 | char *e; 55 | unsigned long c = strtoul(pp + (HEX + 1), &e, 10); 56 | if (e[0] != '\r' || e[1] != '\n' || c < 1) 57 | goto bad_file; 58 | if ((total ^ (total + c)) & 0x1000000) 59 | fprintf(stderr, "\r%.1f%%", 100. * (pp - p) / st.st_size); 60 | total += c; 61 | if (total > MAXI) { 62 | fprintf(stderr, "\rTotal exceeds allocation\n"); 63 | return 1; 64 | } 65 | while (c--) 66 | *ip++ = pp - p; 67 | pp = e + 2; 68 | } 69 | assert(total == ip - i2o); 70 | if (pp != p + st.st_size) 71 | goto bad_file; 72 | fprintf(stderr, "\rTotal %llu\n", total); 73 | 74 | if (madvise(p, st.st_size, MADV_RANDOM | MADV_WILLNEED)) { 75 | perror("madvise"); 76 | return 1; 77 | } 78 | 79 | uint64_t mask; 80 | { 81 | uint64_t x = total, y; 82 | while ((y = x & (x - 1))) 83 | x = y; 84 | if (x == total) 85 | mask = x - 1; 86 | else 87 | mask = (x << 1) - 1; 88 | } 89 | 90 | int rfd = open("/dev/urandom", O_RDONLY); 91 | unsigned int n = SAMPLE; 92 | do { 93 | uint64_t rnd; 94 | if (read(rfd, &rnd, sizeof(rnd)) != sizeof(rnd)) 95 | continue; 96 | rnd &= mask; 97 | if (rnd >= total || i2o[rnd] == 1) 98 | continue; 99 | pp = p + i2o[rnd]; 100 | i2o[rnd] = 1; /* taken */ 101 | while (*pp != '\r') 102 | putchar(*pp++); 103 | putchar('\n'); 104 | n--; 105 | } while (n); 106 | close(rfd); 107 | 108 | return 0; 109 | } 110 | --------------------------------------------------------------------------------