├── .gitignore ├── .gitmodules ├── .travis.yml ├── CMakeLists.txt ├── README.md ├── index.js ├── normalize.py ├── package.json ├── src ├── CMakeLists.txt ├── core │ ├── CMakeLists.txt │ ├── bloom_filter.cc │ ├── bloom_filter.h │ ├── debug.h │ ├── util.cc │ └── util.h ├── mkbfidx.cc └── openidx.cc └── test ├── CMakeLists.txt ├── foo.idx ├── foo.txt └── hasher.cc /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | node_modules/ 3 | *.log 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "libbf"] 2 | path = src/libbf 3 | url = https://github.com/kirisetsz/libbf 4 | [submodule "src/docopt"] 5 | path = src/docopt 6 | url = https://github.com/docopt/docopt.cpp 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | sudo: required 3 | dist: trusty 4 | addons: 5 | apt: 6 | sources: 7 | - ubuntu-toolchain-r-test 8 | packages: 9 | - gcc-5 10 | - g++-5 11 | compiler: 12 | - gcc 13 | script: 14 | - mkdir build 15 | - cd build 16 | - cmake -DCMAKE_CXX_COMPILER=g++-5 -DCMAKE_BUILD_TYPE=Release .. 
17 | - make 18 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(52g CXX) 2 | 3 | cmake_minimum_required(VERSION 3.2) 4 | 5 | add_subdirectory(src) 6 | add_subdirectory(test) 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 52g 2 | --- 3 | 4 | [![Build Status](https://travis-ci.org/kamikat/52g.svg?branch=master)](https://travis-ci.org/kamikat/52g) 5 | 6 | Process & query leaked password data using a bloom filter. 7 | 8 | Installation 9 | ------------ 10 | 11 | Prerequisites: 12 | 13 | - CMake >= 3.2 14 | - GCC >= 4.9 / clang with C++11 support 15 | - Node.js + npm 16 | 17 | On OS X 18 | 19 | brew install cmake nodejs 20 | 21 | On Ubuntu 22 | 23 | apt install build-essential cmake nodejs-legacy npm 24 | 25 | Execute the following commands in the project root: 26 | 27 | git submodule update --init --recursive 28 | npm install 29 | 30 | This should build the following artifacts in `build/`: 31 | 32 | - mkbfidx: create a bloom filter index 33 | - openidx: query a bloom filter index created by mkbfidx 34 | 35 | See `--help` for option details.
#!/usr/bin/env node

/**
 * HTTP front-end for a bloom filter index.
 *
 * Spawns `build/openidx --hex <file>` as a long-lived child process and talks
 * to it over stdin/stdout with a strict one-line-in / one-line-out protocol:
 * the child first prints a single header line (`HEADER k=.., m=.., seed=..`),
 * then answers every query line with exactly one result line.
 *
 * Routes:
 *   GET /algorithm  -> the { k, m, seed } parameters read from the header
 *   GET /query?d=.. -> { found: <boolean> } for the digests given in `d`
 *
 * Usage: index.js <index-file>
 * Environment: NODE_ENV, PROXY_MODE, BIND_PORT (default 8080),
 *              BIND_HOST (default 'localhost').
 */

var path = require('path');
var spawn = require('child_process').spawn;
var death = require('death');
var fifo = require('fifo');
var express = require('express');
var compression = require('compression');
var morgan = require('morgan');
var parser = require('body-parser');
// NOTE(review): serve-static is required but never used below, and it is not
// listed in package.json's dependencies.
var serveStatic = require('serve-static');
var assign = require('object-assign');

var argv = require('yargs').demand(1).argv;

/**
 * Read the decimal integer field `name=<digits>` out of the openidx header.
 *
 * @param {string} line - header line printed by openidx
 * @param {string} name - field name (`k`, `m` or `seed`)
 * @returns {number} the parsed value
 * @throws {Error} when the field is absent, so a malformed header is reported
 *   by name instead of surfacing as a TypeError on `null[1]`
 */
function parseHeaderField(line, name) {
  var match = new RegExp(name + '=([0-9]+)').exec(line);
  if (!match) {
    throw new Error('openidx header is missing "' + name + '": ' + line);
  }
  return parseInt(match[1], 10);
}

/**
 * Handle on the openidx child process.
 *
 *   params            - object filled with { k, m, seed } once the header
 *                       line has been read (empty until then)
 *   lookup(data, cb)  - send one query line; `cb(resultLine)` is invoked with
 *                       the child's reply for that query
 */
var openidx = (function (file) {
  // Replies come back in the order queries were written, so the pending
  // callbacks are kept in a FIFO and matched to output lines one-to-one.
  var _callbacks = fifo();
  var _params = {};
  // Output received so far that does not yet end in a newline.
  var _buffered = '';

  // The first line the child prints is the header, not a query reply.
  _callbacks.push(function (data) {
    console.log(data);
    assign(_params, {
      k: parseHeaderField(data, 'k'),
      m: parseHeaderField(data, 'm'),
      seed: parseHeaderField(data, 'seed')
    });
  });

  var _child = spawn(
    path.resolve(__dirname, 'build/openidx'), [ '--hex', file ]);

  // Without this handler a failed spawn (e.g. the binary was never built)
  // surfaces as an anonymous uncaught 'error' event.
  _child.on('error', function (err) {
    console.error('Failed to run build/openidx: ' + err.message);
    process.exit(1);
  });

  _child.stdout.setEncoding('utf8');
  _child.stdout.on('data', function (chunk) {
    // A chunk is an arbitrary slice of the stream: it may end in the middle
    // of a line. Keep the unfinished tail for the next chunk instead of
    // dropping it.
    _buffered += chunk;
    var lines = _buffered.split('\n');
    _buffered = lines.pop();
    lines.forEach(function (line) {
      var cb = _callbacks.shift();
      if (typeof cb === 'function') {
        cb(line);
      } else {
        console.error('Unexpected output from openidx: ' + line);
      }
    });
  });

  _child.stderr.on('data', function (data) { process.stderr.write(data); });

  death(function () {
    _child.kill('SIGTERM');
  });

  return {
    params: _params,
    lookup: function (data, cb) {
      // Validate BEFORE queueing the callback: a callback queued for a query
      // that is never written would shift every later reply onto the wrong
      // request.
      if (typeof data !== 'string') {
        throw new TypeError('lookup expects the query as a string');
      }
      // One query per line: strip line breaks so a single request cannot
      // produce several query lines (and therefore several replies).
      var line = data.replace(/\r?\n|\r/g, '');
      _callbacks.push(cb);
      _child.stdin.write(line + '\n');
    }
  };
})(argv._[0]);

var app = express();

if ('production' === process.env.NODE_ENV) {
  app.use(compression());
  app.use(morgan('combined'));
} else {
  app.use(morgan('dev'));
}

app.use(parser.json());

app.get('/algorithm', function (req, res) {
  return res.status(200).send(openidx.params);
});

app.get('/query', function (req, res) {
  var digests = req.query.d;
  // `d` is undefined when omitted and an array when repeated (?d=a&d=b).
  // A blank value carries no digests at all; src/core/bloom_filter.cc returns
  // 1 for an empty digest list, which would be reported as "found".
  if (typeof digests !== 'string' || !/\S/.test(digests)) {
    return res.status(400).send({
      error: 'query parameter "d" is required'
    });
  }
  openidx.lookup(digests, function (result) {
    // The child answers "1" or "0"; convert to a boolean.
    return res.status(200).send({
      found: !!+result
    });
  });
});

if (process.env.PROXY_MODE) {
  // parse client from X-Forwarded-* headers
  app.enable('trust proxy');
}

var server = app.listen(
  process.env.BIND_PORT || 8080,
  process.env.BIND_HOST || 'localhost',
  function () {
    console.log('Server listening at http://%s:%s', server.address().address, server.address().port);
  }
);

death(function () {
  server.close();
});
37 | if len(x) >=3: 38 | if re_mail.match(x[0]): return (x[0], x[1]) 39 | if re_mail.match(x[1]): return (x[1], x[0]) 40 | if re_mail.match(x[2]): return (x[2], x[1]) 41 | 42 | while 1: 43 | line = sys.stdin.readline() 44 | if not line: 45 | break 46 | norm = normalize(line.rstrip()); 47 | if norm: 48 | mail, pw = norm 49 | print "%s\t%s" % (mail.lower(), pw) 50 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "52g", 3 | "version": "1.0.0", 4 | "description": "Process & query leaked password data using bloom filter.", 5 | "main": "index.js", 6 | "directories": { 7 | "test": "test" 8 | }, 9 | "scripts": { 10 | "dev": "nodemon ./index.js test/foo.idx", 11 | "install": "npm run compile", 12 | "compile": "mkdir build; cd build && cmake -DCMAKE_BUILD_TYPE=Release .. && make mkbfidx openidx", 13 | "clean": "rm -rf build", 14 | "test": "echo \"Error: no test specified\" && exit 1" 15 | }, 16 | "bin": { 17 | "mkbfidx": "./build/mkbfidx", 18 | "openidx": "./build/openidx", 19 | "servidx": "./index.js" 20 | }, 21 | "repository": { 22 | "type": "git", 23 | "url": "github.com:kirisetsz/52g.git" 24 | }, 25 | "keywords": [ 26 | "bloom", 27 | "filter", 28 | "password", 29 | "163", 30 | "netease" 31 | ], 32 | "author": "kirisetsz ", 33 | "license": "MIT", 34 | "dependencies": { 35 | "body-parser": "^1.15.0", 36 | "compression": "^1.6.1", 37 | "death": "^1.0.0", 38 | "express": "^4.13.4", 39 | "fifo": "^2.3.0", 40 | "morgan": "^1.7.0", 41 | "object-assign": "^4.0.1", 42 | "yargs": "^4.4.0" 43 | }, 44 | "devDependencies": { 45 | "nodemon": "^1.9.1" 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(libbf/src) 2 | target_compile_options(libbf PUBLIC -std=c++11) 3 | 
target_include_directories(libbf PUBLIC libbf/src) 4 | 5 | add_subdirectory(docopt) 6 | add_subdirectory(core) 7 | 8 | # Export executable targets to ${CMAKE_BINARY_DIR} 9 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 10 | 11 | add_executable(mkbfidx mkbfidx.cc) 12 | target_link_libraries(mkbfidx core docopt) 13 | 14 | add_executable(openidx openidx.cc) 15 | target_link_libraries(openidx core docopt) 16 | -------------------------------------------------------------------------------- /src/core/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(core bloom_filter.cc util.cc) 2 | target_include_directories(core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 3 | target_compile_options(core PUBLIC -std=c++11) # FIXME transive usage requirement 4 | target_link_libraries(core PUBLIC libbf) 5 | -------------------------------------------------------------------------------- /src/core/bloom_filter.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | namespace app { 5 | 6 | bloom_filter::bloom_filter(size_t k, size_t m, size_t seed) 7 | : bf::basic_bloom_filter(bf::make_hasher(k), m), k_(k), m_(m), seed_(seed) 8 | { 9 | } 10 | 11 | bloom_filter::bloom_filter(bloom_filter&& o) 12 | : basic_bloom_filter(std::move(o)), k_(o.k_), m_(o.m_), seed_(o.seed_) 13 | { 14 | DEBUG("MOVE bloom filter"); 15 | } 16 | 17 | size_t bloom_filter::lookupDigest(const std::vector& digests) 18 | { 19 | auto bits = storage(); 20 | for (auto d : digests) { 21 | if (!bits[d % bits.size()]) return 0; 22 | } 23 | return 1; 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/core/bloom_filter.h: -------------------------------------------------------------------------------- 1 | #ifndef BLOOM_FILTER_H 2 | #define BLOOM_FILTER_H 3 | 4 | #define private public 5 | #include 6 | #undef private 7 | 8 | #include 9 | 10 | 
namespace app { 11 | 12 | class bloom_filter: public bf::basic_bloom_filter { 13 | 14 | public: 15 | 16 | // http://hur.st/bloomfilter?n=370069037&p=0.0001 17 | static const size_t BF_K = 13; 18 | static const size_t BF_M = 7094266647; 19 | 20 | bloom_filter(size_t k, size_t m, size_t seed = 1); 21 | 22 | bloom_filter(bloom_filter&&); 23 | 24 | size_t lookupDigest(const std::vector& digests); 25 | 26 | const size_t k_; 27 | const size_t m_; 28 | const size_t seed_; 29 | }; 30 | 31 | typedef struct { 32 | uint64_t k; 33 | uint64_t m; 34 | uint64_t seed; 35 | } Header; 36 | 37 | } 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /src/core/debug.h: -------------------------------------------------------------------------------- 1 | #ifndef DEBUG_H 2 | #define DEBUG_H 3 | 4 | #ifndef NDEBUG 5 | #include 6 | #include 7 | #define DEBUG(expr) std::cerr << "DEBUG " << expr << std::endl 8 | #define HEX(x) std::setw(2) << std::setfill('0') << std::hex << (uint64_t)(x) << std::setw(0) << std::dec 9 | #else 10 | #define DEBUG(expr) 11 | #endif 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/core/util.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void app::fold_str(std::string& line, size_t size) { 5 | if (line.length() <= size) return; 6 | for (int i = size; i < line.length(); i += size) { 7 | for (int j = 0; j < size && i + j < line.length(); j++) { 8 | DEBUG("FOLD line[" << j << "]=" << HEX(line[j]) << ", line[" << i + j << "]=" << HEX(line[i + j])); 9 | line[j] ^= line[i + j]; 10 | DEBUG("FOLD line[" << j << "] XOR line[" << i + j << "]=" << HEX(line[j])); 11 | } 12 | } 13 | if (line.length() > size) line.resize(size); 14 | } 15 | -------------------------------------------------------------------------------- /src/core/util.h: 
-------------------------------------------------------------------------------- 1 | #ifndef UTIL_H 2 | #define UTIL_H 3 | 4 | #include 5 | 6 | namespace app { 7 | 8 | // Fold looooong strings 9 | void fold_str(std::string&, size_t); 10 | 11 | } 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/mkbfidx.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | const char USAGE[] = 8 | R"( 9 | Make bloom filter index. 10 | 11 | Usage: 12 | mkbfidx [--seed ] 13 | mkbfidx [--seed ] > stdin.idx 14 | mkbfidx [--seed ] < data.txt > data.idx 15 | mkbfidx [--seed ] < data.txt | gzip > data.idx.gz 16 | 17 | Options: 18 | k number of hash functions 19 | m number of bits 20 | -s, --seed hash function seed [default: 1] 21 | -h, --help print help message 22 | )"; 23 | 24 | using namespace app; 25 | using namespace std; 26 | 27 | void dump(bloom_filter& bf, ostream& out) { 28 | Header header = { bf.k_, bf.m_, bf.seed_ }; 29 | out.write((char*) &header, sizeof(header)); 30 | auto& storage = bf.storage(); 31 | char* data = (char*) storage.bits_.data(); 32 | size_t size = sizeof(bf::bitvector::block_type) * storage.blocks(); 33 | out.write(data, size).flush(); 34 | } 35 | 36 | int main(int argc, const char* argv[]) { 37 | auto args = docopt::docopt(USAGE, { argv + 1, argv + argc }); 38 | bloom_filter bf { size_t(args[""].asLong()), size_t(args[""].asLong()), size_t(args["--seed"].asLong()) }; 39 | string line; 40 | while (getline(cin, line)) { 41 | fold_str(line, bf::default_hash_function::max_obj_size); 42 | bf.add(line); 43 | DEBUG("ADD '" << line << "'"); 44 | } 45 | dump(bf, cout); 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /src/openidx.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 
| #include 5 | #include 6 | #include 7 | #include 8 | 9 | const char USAGE[] = 10 | R"( 11 | Command-line tool to query bloom filter index. 12 | 13 | Usage: 14 | openidx [-x] 15 | 16 | Options: 17 | file bloom filter index file 18 | -x, --hex enable hexadecimal query mode [default: false] 19 | -h, --help print help message 20 | )"; 21 | 22 | using namespace app; 23 | using namespace std; 24 | 25 | bloom_filter load(const string& file) { 26 | DEBUG("LOAD " << file); 27 | ifstream ifs { file, ios::binary }; 28 | Header header; 29 | ifs.read((char*) &header, sizeof(header)); 30 | cout << "HEADER k=" << header.k << ", m=" << header.m << ", seed=" << header.seed << endl; 31 | bloom_filter bf(header.k, header.m); 32 | auto& storage = bf.storage(); 33 | char* data = (char*) storage.bits_.data(); 34 | size_t size = sizeof(bf::bitvector::block_type) * storage.blocks(); 35 | ifs.read(data, size); 36 | return move(bf); 37 | } 38 | 39 | int main(int argc, const char* argv[]) { 40 | auto args = docopt::docopt(USAGE, { argv + 1, argv + argc }); 41 | bloom_filter bf = load(args[""].asString()); 42 | string line; 43 | if (args["--hex"].asBool()) { 44 | // hexadecimal input 45 | while (getline(cin, line)) { 46 | istringstream iss { line }; 47 | vector digests { istream_iterator { iss >> hex }, istream_iterator {} }; 48 | DEBUG("FIND digest {"); 49 | for (auto d : digests) { 50 | DEBUG("FIND " << HEX(d)); 51 | } 52 | DEBUG("FIND }"); 53 | cout << bf.lookupDigest(digests) << endl; 54 | } 55 | } else { 56 | // plain input 57 | while (getline(cin, line)) { 58 | DEBUG("FIND plain \"" << line << "\""); 59 | fold_str(line, bf::default_hash_function::max_obj_size); 60 | cout << bf.lookup(line) << endl; 61 | } 62 | } 63 | return 0; 64 | } 65 | -------------------------------------------------------------------------------- /test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(hasher_test hasher.cc) 2 | 
target_link_libraries(hasher_test libbf) 3 | target_compile_options(hasher_test PRIVATE -std=c++11) 4 | -------------------------------------------------------------------------------- /test/foo.idx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kamikat/52g/64f7b7a7fd7c34e20a1318791a36a8acc0a4a931/test/foo.idx -------------------------------------------------------------------------------- /test/foo.txt: -------------------------------------------------------------------------------- 1 | foo@bar.com 123 2 | abc@bar.com 233 3 | averyveryveryverylongfoo@bar.com 123123123 4 | -------------------------------------------------------------------------------- /test/hasher.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | using namespace bf; 6 | 7 | void test(const hasher& hash, const object& obj) { 8 | for (auto digest : hash(obj)) { 9 | std::cout << std::setw(16) << std::setfill('0') << std::hex << digest << ' '; 10 | } 11 | std::cout << std::endl; 12 | } 13 | 14 | int main () { 15 | hasher hash = make_hasher(2, 1); 16 | test(hash, { "123", 3 }); 17 | test(hash, { "foo", 3 }); 18 | test(hash, { "bar", 3 }); 19 | return 0; 20 | } 21 | --------------------------------------------------------------------------------