├── __init__.py ├── core ├── __init__.py ├── primernumber.py ├── filebitarray.py ├── persistfilter.py └── murmurhash.py ├── test ├── __init__.py ├── pytest.ini ├── test_http_server.py ├── test_murmurhash.py ├── test_filebitarray.py ├── test_rpcserver.py ├── test_tcpserver.py └── test_persistfilter.py ├── readme.md ├── pyproject.toml ├── aaa.py ├── tools.py └── .gitignore /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath = .. 
3 | log_cli_level = INFO -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ### 基于虚拟内存映射的布隆过滤器 2 | 3 | - 基于硬盘的位数组 4 | - 使用murmurhash作为hash函数,高效&碰撞率低 5 | - 使用硬盘进行去重,相对于内存型过滤器可扩容千倍 6 | - 自定义容纳元素个数与误差率 7 | - 百亿去重约使用33g硬盘空间(误差率百万分之一情况) 8 | - 支持多进程,多个程序可持有同一文件句柄 9 | - 性能及空间高于数据库去重,单条数据最多占用hash函数个数*1bit空间(误差率百万分之一情况下为19bit) 10 | 11 | ###测试 12 | | 京东京造 SATA3 | 西部数据SN550 NVME| 13 | | ------------ | ------------ | 14 | | 8k op/s |5w op/s | 15 | 16 | -------------------------------------------------------------------------------- /test/test_http_server.py: -------------------------------------------------------------------------------- 1 | # import aiohttp 2 | # import zerorpc 3 | from tools import caculate_time 4 | import asyncio 5 | 6 | def test_run_http_client(): 7 | 8 | async def fetch(session): 9 | html = await session.get('http://127.0.0.1:8000/') 10 | 11 | async def main(): 12 | async with aiohttp.ClientSession() as session: 13 | await asyncio.wait([asyncio.create_task(fetch(session)) for i in range(10000)]) 14 | asyncio.run(main()) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "persistfilter" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["dream2333"] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8" 11 | #mmr3 = "^1.3.1" 12 | pybloomfiltermmap3 = "^0.5.7" 13 | 14 | [tool.poetry.group.dev.dependencies] 15 | black = "^23.1.0" 16 | pytest = "^7.2.2" 17 | 18 | 19 | 20 | 21 | [build-system] 22 | requires = ["poetry-core"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /aaa.py: 
-------------------------------------------------------------------------------- 1 | from multiprocessing import Process 2 | 3 | from core.persistfilter import PersistFilter 4 | 5 | 6 | def multi_test(): 7 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 8 | for i in range(1000): 9 | persist_filter.add(f"{i}") 10 | persist_filter.close() 11 | 12 | if __name__ == "__main__": 13 | process_list = [] 14 | for i in range(5): #开启5个子进程执行fun1函数 15 | p = Process(multi_test) #实例化进程对象 16 | p.start() 17 | process_list.append(p) 18 | 19 | for i in process_list: 20 | p.join() 21 | -------------------------------------------------------------------------------- /test/test_murmurhash.py: -------------------------------------------------------------------------------- 1 | from tools import detect_python_interpreter 2 | 3 | if detect_python_interpreter() == "PyPy": 4 | from core.murmurhash import hash128 as hash128_x64 5 | else: 6 | from mmr3 import hash128_x64 7 | 8 | 9 | 10 | def test_murmurhash_bench(): 11 | # 一百万次hash取值 12 | for i in range(1000000): 13 | hash = hash128_x64(f"{i}", seed=31) 14 | 15 | def test_get_offset_range(): 16 | for i in range(5000000): 17 | for seed in range(19): 18 | # 计算哈希值 19 | hash128 = hash128_x64("1231242341235345", seed) 20 | 21 | -------------------------------------------------------------------------------- /test/test_filebitarray.py: -------------------------------------------------------------------------------- 1 | from core.filebitarray import FileBitArray 2 | from tools import caculate_time 3 | 4 | 5 | def test_write_bench(): 6 | fba = FileBitArray("/media/dream/软件/Programming/vsProject/PersistFilter/testfilter.bin", 10000000000) 7 | for i in range(10000000): 8 | fba[i] = 1 9 | fba.close() 10 | assert 1 11 | 12 | # 测试读取所有的数组元素 13 | def test_read_array_elements(): 14 | fba = FileBitArray("/media/dream/软件/Programming/vsProject/PersistFilter/testfilter.bin", 10000000000) 15 | for i in range(10000000): 16 | 
bit = fba[i] 17 | fba.close() 18 | print(bit) 19 | assert 1 -------------------------------------------------------------------------------- /test/test_rpcserver.py: -------------------------------------------------------------------------------- 1 | # import zerorpc 2 | # from tools import caculate_time 3 | # from concurrent.futures.thread import ThreadPoolExecutor 4 | # from concurrent.futures.process import ProcessPoolExecutor 5 | # @caculate_time 6 | # def run_rpc_client(): 7 | # c = zerorpc.Client() 8 | # c.connect("tcp://127.0.0.1:4242") 9 | # def rpc(): 10 | # r = c.hello("hello") 11 | # print(r) 12 | 13 | # with ThreadPoolExecutor(max_workers=5) as pool: 14 | # for i in range(10): 15 | # pool.submit(rpc) 16 | 17 | 18 | # # with ProcessPoolExecutor(max_workers=4) as pool: 19 | # # for i in range(5): 20 | # # pool.submit(run) 21 | 22 | # run_rpc_client() -------------------------------------------------------------------------------- /test/test_tcpserver.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import sys 3 | from tools import caculate_time 4 | 5 | 6 | @caculate_time 7 | def test_tcp(): 8 | HOST, PORT = "localhost", 9999 9 | for i in range(100): 10 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: 11 | sock.connect((HOST, PORT)) 12 | sock.sendall("123456789".encode("utf-8")) 13 | received = bool(sock.recv(1)) 14 | 15 | 16 | def test_tcp_client(): 17 | HOST, PORT = "localhost", 9999 18 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: 19 | sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) 20 | sock.connect((HOST, PORT)) 21 | sock.sendall("123456789".encode("utf-8")) 22 | received = bool(sock.recv(1)) 23 | print(received) 24 | -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | from 
platform import python_implementation 5 | 6 | # 计算函数运行时间的 7 | def caculate_time(func): 8 | def wrapper(*args, **kwargs): 9 | start_time = time.time() 10 | func(*args, **kwargs) 11 | end_time = time.time() 12 | print("函数运行时间为:%s" % (end_time - start_time)) 13 | return wrapper 14 | 15 | # 检测python解释器 16 | def detect_python_interpreter(): 17 | try: 18 | from platform import python_implementation 19 | except ImportError: # pragma: no cover 20 | def python_implementation(): 21 | """Return a string identifying the Python implementation.""" 22 | if 'PyPy' in sys.version: 23 | return 'PyPy' 24 | if os.name == 'java': 25 | return 'Jython' 26 | if sys.version.startswith('IronPython'): 27 | return 'IronPython' 28 | return 'CPython' 29 | return python_implementation() 30 | 31 | -------------------------------------------------------------------------------- /core/primernumber.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | 3 | primer_numbers = (3, 4 | 5, 5 | 7, 6 | 11, 7 | 13, 8 | 17, 9 | 19, 10 | 23, 11 | 29, 12 | 31, 13 | 37, 14 | 41, 15 | 43, 16 | 47, 17 | 53, 18 | 59, 19 | 61, 20 | 67, 21 | 71, 22 | 73, 23 | 79, 24 | 83, 25 | 89, 26 | 97, 27 | 101, 28 | 103, 29 | 107, 30 | 109, 31 | 113, 32 | 127, 33 | 131, 34 | 137, 35 | 139, 36 | 149, 37 | 151, 38 | 157, 39 | 163, 40 | 167, 41 | 173, 42 | 179, 43 | 181, 44 | 191, 45 | 193, 46 | 197, 47 | 199, 48 | 211, 49 | 223, 50 | 227, 51 | 229, 52 | 233, 53 | 239, 54 | 241, 55 | 251 56 | ) 57 | -------------------------------------------------------------------------------- /core/filebitarray.py: -------------------------------------------------------------------------------- 1 | import math 2 | import mmap 3 | import sys 4 | 5 | 6 | class FileBitArray: 7 | __slots__ = "__f", "__m" 8 | 9 | def __init__(self, filename, bit_array_size): 10 | """ 11 | 将大文件映射到虚拟内存并提供bitarray接口 12 | :param filename: 文件名 13 | :param bit_array_size: 位数组大小 14 | """ 15 | size = 
math.ceil(bit_array_size / 8) 16 | # 创建指定大小空文件 17 | try: 18 | self.__createfile(filename,size) 19 | except: 20 | ... 21 | self.__f = open(filename, "r+b",buffering=0) 22 | self.__m = mmap.mmap(self.__f.fileno(), size, access=mmap.ACCESS_DEFAULT) 23 | # 如果系统为linux,则通知内核随机读写优化 24 | if sys.version_info >= (3, 8) and sys.platform != "win32": 25 | self.__m.madvise(mmap.MADV_RANDOM) 26 | 27 | 28 | def __createfile(self,filename,size): 29 | with open(filename,'xb') as f: 30 | f.seek(size-1) 31 | f.write(b'\x00') 32 | 33 | # 对文件对象内存映射进行读bit操作 34 | def __getitem__(self, index): 35 | byte_offset, bit_offset = divmod(index, 8) 36 | byte = self.__m[byte_offset] 37 | bit = (byte >> (7 - bit_offset)) & 1 38 | return bit 39 | 40 | # 对文件对象内存映射进行写bit操作 41 | def __setitem__(self, index, value): 42 | byte_offset = index // 8 43 | bit_offset = index % 8 44 | byte = self.__m[byte_offset] 45 | if value: 46 | byte |= 1 << 7 - bit_offset 47 | else: 48 | byte &= ~(1 << 7 - bit_offset) 49 | self.__m[byte_offset] = byte 50 | # 刷盘 51 | # flush_offset = byte_offset // mmap.ALLOCATIONGRANULARITY * mmap.ALLOCATIONGRANULARITY 52 | # self.__m.flush(flush_offset,byte_offset-flush_offset+1) 53 | 54 | # 同步刷全盘 55 | def flush(self): 56 | self.__m.flush() 57 | 58 | def __len__(self): 59 | return self.__m.size() 60 | 61 | def close(self): 62 | self.__m.close() 63 | self.__f.close() 64 | -------------------------------------------------------------------------------- /core/persistfilter.py: -------------------------------------------------------------------------------- 1 | import io 2 | import math 3 | 4 | from core.filebitarray import FileBitArray 5 | from tools import detect_python_interpreter 6 | from core.primernumber import primer_numbers 7 | 8 | # 如果是pypy,将使用自己的hash函数进行jit加速 9 | if detect_python_interpreter() == "PyPy": 10 | from core.murmurhash import hash128 as hash128_x64 11 | else: 12 | from mmr3 import hash128_x64 13 | 14 | 15 | # 素数的英文是 a: 16 | class PersistFilter(object): 17 | __slots__ = 
"bit_array_size", "hash_func_count", "bit_array", "primer_numbers" 18 | 19 | def __init__(self, filename: str, num: int, error_rate: float): 20 | """ 21 | 基于虚拟内存的持久化过滤器 22 | :param filename: 文件名 23 | :param num: 预计插入的元素个数 24 | :param error_rate: 允许的误判率 25 | :return: 字节数组大小和哈希函数个数 26 | """ 27 | # 计算位数组大小 28 | self.bit_array_size = int(-(num * math.log(error_rate)) / (math.log(2) ** 2)) 29 | # 计算哈希函数个数 30 | self.hash_func_count = int((self.bit_array_size / num) * math.log(2)) 31 | self.primer_numbers = primer_numbers[0:self.hash_func_count] 32 | # 计算过滤器大小 33 | cost_bytes = math.ceil(self.bit_array_size / 8) 34 | cost_gigabytes = cost_bytes / 1024 / 1024 / 1024 35 | # 生成文件字节数组 36 | self.bit_array = FileBitArray(filename, self.bit_array_size) 37 | print( 38 | f"过滤器大小:{cost_gigabytes:.3f}GB " 39 | f"哈希函数个数:{self.hash_func_count} " 40 | f"误判率:{error_rate:1.8f} " 41 | f"预计插入元素个数:{num} " 42 | ) 43 | 44 | def add(self, value: str): 45 | """ 46 | 将元素插入到过滤器中 47 | :param value: 待插入的元素 48 | """ 49 | for index in self.get_offset_index(value): 50 | # 先读后写,将对应的bit位置为1 51 | if self.bit_array[index] == 0: 52 | self.bit_array[index] = 1 53 | # # 只写入 54 | # self.bit_array[index] = 1 55 | 56 | def __contains__(self, value: str): 57 | """ 58 | 判断元素是否存在于过滤器中 59 | :param value: 待查找的元素 60 | :return: True or False 61 | """ 62 | # 对每个哈希值进行判断 63 | for index in self.get_offset_index(value): 64 | # 如果有一个bit位为0,说明不存在 65 | if self.bit_array[index] == 0: 66 | return False 67 | return True 68 | 69 | def get_offset_index(self, value: str): 70 | """ 71 | 根据hash函数的个数,计算不同hash在文件映射的bit位 72 | :param value: 待计算哈希值的元素 73 | :param hash_count: 哈希函数个数 74 | :return: 哈希值列表 75 | """ 76 | for seed in primer_numbers: 77 | # 计算哈希值 78 | hash128 = hash128_x64(value, seed) 79 | # 取模,映射到虚拟内存中的地址 80 | yield hash128 % self.bit_array_size 81 | # 顺序读写提高hdd性能 82 | # offset_list = [(hash128_x64(value, seed) % self.bit_array_size )for seed in primer_numbers] 83 | # offset_list.sort() 84 | # return offset_list 85 | 86 | def 
close(self): 87 | """ 88 | 关闭文件 89 | """ 90 | self.bit_array.close() 91 | 92 | 93 | if __name__ == "__main__": 94 | var = io.DEFAULT_BUFFER_SIZE 95 | print(var) 96 | PersistFilter("G:/testfilter.bin", 10000000000, 0.000001) 97 | -------------------------------------------------------------------------------- /test/test_persistfilter.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process 2 | from threading import Thread 3 | import time 4 | from core.persistfilter import PersistFilter 5 | from tools import caculate_time 6 | 7 | # 本地过滤器容量10亿 中添加100万元素 单次19hash 最差情况写入760k 8 | 9 | # 单进程 10 | # 随机 读+写 67秒 二次运行 34秒 11 | # 顺序 读+写 72秒 二次运行 35秒 12 | 13 | # 6进程 14 | # 随机 读+写 21秒 二次运行 6秒 15 | # 顺序 读+写 23秒 二次运行 6.85秒 16 | 17 | # 32进程 18 | # 随机 读+写 20-26秒 二次运行 6.24秒 19 | # 顺序 读+写 16-20秒 二次运行 6.05秒 20 | 21 | # 服务器四进程 1.1w/s 22 | 23 | 24 | # 测试在百亿容量中连续添加一万个元素所需的时间 25 | def test_add(): 26 | 27 | # 设置容量十亿,误判率百万分之一 28 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 29 | for i in range(100000): 30 | persist_filter.add(f"{i}") 31 | persist_filter.close() 32 | assert 1 33 | 34 | # 测试在百亿容量中查找一万个是否存在所需的时间 35 | def test_exist(): 36 | # 设置容量十亿,误判率百万分之一 37 | dup_count = 0 38 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 39 | for i in range(1000000): 40 | if f"{i}" in persist_filter: 41 | dup_count+=1 42 | print(dup_count) 43 | assert 1 44 | 45 | # 多线程添加 46 | def test_multi_t_add(): 47 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 48 | def multi_test(start,end): 49 | print(start,end) 50 | time.sleep(2) 51 | for i in range(start,end): 52 | persist_filter.add(f"{i}") 53 | 54 | def run(times,t_count): 55 | thread_list = [] 56 | for i in range(t_count): 57 | step = int(times/t_count) 58 | p = Thread(target= multi_test,args=(i*step,i*step+step)) 59 | p.start() 60 | thread_list.append(p) 61 | 62 | for i in 
thread_list: 63 | i.join() # BUGFIX: join each thread (was p.join(), which only joined the last thread) 64 | persist_filter.close() 65 | run(100000,8) 66 | # capacity one billion, error rate one in a million 67 | 68 | # multi-process add 69 | def test_multi_p_add(): 70 | def multi_test(start,end): 71 | print(start,end) 72 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 73 | time.sleep(2) 74 | for i in range(start,end): 75 | persist_filter.add(f"{i}") 76 | persist_filter.close() 77 | 78 | def run(times,t_count): 79 | process_list = [] 80 | for i in range(t_count): 81 | step = int(times/t_count) 82 | p = Process(target= multi_test,args=(i*step,i*step+step)) 83 | p.start() 84 | process_list.append(p) 85 | 86 | for i in process_list: 87 | i.join() # BUGFIX: join each process (was p.join()) 88 | run(1000000,8) 89 | # capacity one billion, error rate one in a million 90 | 91 | # multi-process lookup 92 | def test_multi_p_exist(): 93 | def multi_test(start,end): 94 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 95 | time.sleep(2) 96 | for i in range(start,end): 97 | f"{i}" in persist_filter 98 | persist_filter.close() 99 | 100 | def run(times,t_count): 101 | process_list = [] 102 | step = int(times/t_count) 103 | for i in range(t_count): 104 | p = Process(target= multi_test,args=(i*step,i*step+step)) 105 | p.start() 106 | process_list.append(p) 107 | 108 | for i in process_list: 109 | i.join() # BUGFIX: join each process (was p.join()) 110 | run(1000000,6) 111 | # capacity one billion, error rate one in a million -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | poetry.lock 162 | testfilter.bin 163 | .vscode/launch.json 164 | .gitignore 165 | -------------------------------------------------------------------------------- /core/murmurhash.py: -------------------------------------------------------------------------------- 1 | def hash32(key: str, seed=0x0): 2 | ''' 32位 murmur3 实现 ''' 3 | 4 | key = bytearray(key.encode()) 5 | 6 | def fmix(h): 7 | h ^= h >> 16 8 | h = (h * 0x85ebca6b) & 0xFFFFFFFF 9 | h ^= h >> 13 10 | h = (h * 0xc2b2ae35) & 0xFFFFFFFF 11 | h ^= h >> 16 12 | return h 13 | 14 | length = len(key) 15 | nblocks = int(length / 4) 16 | 17 | h1 = seed 18 | 19 | c1 = 0xcc9e2d51 20 | c2 = 0x1b873593 21 | 22 | 23 | for block_start in range(0, nblocks * 4, 4): 24 | 25 | k1 = key[block_start + 3] << 24 | \ 26 | key[block_start + 2] << 16 | \ 27 | key[block_start + 1] << 8 | \ 28 | key[block_start + 0] 29 | 30 | k1 = (c1 * k1) & 0xFFFFFFFF 31 | k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF 32 | k1 = (c2 * k1) & 0xFFFFFFFF 33 | 34 | h1 ^= k1 35 | h1 = (h1 << 13 | h1 >> 19) & 0xFFFFFFFF 36 | h1 = (h1 * 5 + 0xe6546b64) & 0xFFFFFFFF 37 | 38 | tail_index = nblocks * 4 39 | k1 = 0 40 | tail_size = length & 3 41 | 42 | if tail_size >= 3: 43 | k1 ^= key[tail_index + 2] << 16 44 | if tail_size >= 2: 45 | k1 ^= key[tail_index + 1] << 8 46 | if tail_size >= 1: 47 | k1 ^= key[tail_index + 0] 48 | 49 | if tail_size > 0: 50 | k1 = (k1 * c1) & 0xFFFFFFFF 51 | k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF 52 | k1 = (k1 * c2) & 0xFFFFFFFF 53 | h1 ^= k1 54 | 55 | 56 | unsigned_val = fmix(h1 ^ length) 57 | if unsigned_val & 0x80000000 == 0: 58 | return unsigned_val 59 | else: 60 | return -((unsigned_val ^ 0xFFFFFFFF) + 1) 61 | 62 | 63 | def hash64(key, seed=0x0, x64arch=True): 64 | """ 64位 murmur3 实现 ,返回两个64位int""" 65 | 66 | hash_128 = hash128(key, seed, x64arch) 67 | 68 | unsigned_val1 = hash_128 & 0xFFFFFFFFFFFFFFFF 69 | if unsigned_val1 & 0x8000000000000000 == 0: 70 | signed_val1 = unsigned_val1 71 | else: 72 | signed_val1 = 
-((unsigned_val1 ^ 0xFFFFFFFFFFFFFFFF) + 1) 73 | 74 | unsigned_val2 = (hash_128 >> 64) & 0xFFFFFFFFFFFFFFFF 75 | if unsigned_val2 & 0x8000000000000000 == 0: 76 | signed_val2 = unsigned_val2 77 | else: 78 | signed_val2 = -((unsigned_val2 ^ 0xFFFFFFFFFFFFFFFF) + 1) 79 | 80 | return (int(signed_val1), int(signed_val2)) 81 | 82 | 83 | def hash128(key, seed=0x0, x64arch=True): 84 | """ 128位 murmur3 实现 """ 85 | 86 | def hash128_x64(key, seed): 87 | def fmix(k): 88 | k ^= k >> 33 89 | k = (k * 0xff51afd7ed558ccd) & 0xFFFFFFFFFFFFFFFF 90 | k ^= k >> 33 91 | k = (k * 0xc4ceb9fe1a85ec53) & 0xFFFFFFFFFFFFFFFF 92 | k ^= k >> 33 93 | return k 94 | 95 | length = len(key) 96 | nblocks = int(length / 16) 97 | 98 | h1 = seed 99 | h2 = seed 100 | 101 | c1 = 0x87c37b91114253d5 102 | c2 = 0x4cf5ad432745937f 103 | 104 | 105 | for block_start in range(0, nblocks * 8, 8): 106 | 107 | k1 = key[2 * block_start + 7] << 56 | \ 108 | key[2 * block_start + 6] << 48 | \ 109 | key[2 * block_start + 5] << 40 | \ 110 | key[2 * block_start + 4] << 32 | \ 111 | key[2 * block_start + 3] << 24 | \ 112 | key[2 * block_start + 2] << 16 | \ 113 | key[2 * block_start + 1] << 8 | \ 114 | key[2 * block_start + 0] 115 | 116 | k2 = key[2 * block_start + 15] << 56 | \ 117 | key[2 * block_start + 14] << 48 | \ 118 | key[2 * block_start + 13] << 40 | \ 119 | key[2 * block_start + 12] << 32 | \ 120 | key[2 * block_start + 11] << 24 | \ 121 | key[2 * block_start + 10] << 16 | \ 122 | key[2 * block_start + 9] << 8 | \ 123 | key[2 * block_start + 8] 124 | 125 | k1 = (c1 * k1) & 0xFFFFFFFFFFFFFFFF 126 | k1 = (k1 << 31 | k1 >> 33) & 0xFFFFFFFFFFFFFFFF 127 | k1 = (c2 * k1) & 0xFFFFFFFFFFFFFFFF 128 | h1 ^= k1 129 | 130 | h1 = (h1 << 27 | h1 >> 37) & 0xFFFFFFFFFFFFFFFF 131 | h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 132 | h1 = (h1 * 5 + 0x52dce729) & 0xFFFFFFFFFFFFFFFF 133 | 134 | k2 = (c2 * k2) & 0xFFFFFFFFFFFFFFFF 135 | k2 = (k2 << 33 | k2 >> 31) & 0xFFFFFFFFFFFFFFFF 136 | k2 = (c1 * k2) & 0xFFFFFFFFFFFFFFFF 137 | h2 ^= k2 
138 | 139 | h2 = (h2 << 31 | h2 >> 33) & 0xFFFFFFFFFFFFFFFF 140 | h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 141 | h2 = (h2 * 5 + 0x38495ab5) & 0xFFFFFFFFFFFFFFFF 142 | 143 | 144 | tail_index = nblocks * 16 145 | k1 = 0 146 | k2 = 0 147 | tail_size = length & 15 148 | 149 | if tail_size >= 15: 150 | k2 ^= key[tail_index + 14] << 48 151 | if tail_size >= 14: 152 | k2 ^= key[tail_index + 13] << 40 153 | if tail_size >= 13: 154 | k2 ^= key[tail_index + 12] << 32 155 | if tail_size >= 12: 156 | k2 ^= key[tail_index + 11] << 24 157 | if tail_size >= 11: 158 | k2 ^= key[tail_index + 10] << 16 159 | if tail_size >= 10: 160 | k2 ^= key[tail_index + 9] << 8 161 | if tail_size >= 9: 162 | k2 ^= key[tail_index + 8] 163 | 164 | if tail_size > 8: 165 | k2 = (k2 * c2) & 0xFFFFFFFFFFFFFFFF 166 | k2 = (k2 << 33 | k2 >> 31) & 0xFFFFFFFFFFFFFFFF 167 | k2 = (k2 * c1) & 0xFFFFFFFFFFFFFFFF 168 | h2 ^= k2 169 | 170 | if tail_size >= 8: 171 | k1 ^= key[tail_index + 7] << 56 172 | if tail_size >= 7: 173 | k1 ^= key[tail_index + 6] << 48 174 | if tail_size >= 6: 175 | k1 ^= key[tail_index + 5] << 40 176 | if tail_size >= 5: 177 | k1 ^= key[tail_index + 4] << 32 178 | if tail_size >= 4: 179 | k1 ^= key[tail_index + 3] << 24 180 | if tail_size >= 3: 181 | k1 ^= key[tail_index + 2] << 16 182 | if tail_size >= 2: 183 | k1 ^= key[tail_index + 1] << 8 184 | if tail_size >= 1: 185 | k1 ^= key[tail_index + 0] 186 | 187 | if tail_size > 0: 188 | k1 = (k1 * c1) & 0xFFFFFFFFFFFFFFFF 189 | k1 = (k1 << 31 | k1 >> 33) & 0xFFFFFFFFFFFFFFFF 190 | k1 = (k1 * c2) & 0xFFFFFFFFFFFFFFFF 191 | h1 ^= k1 192 | 193 | 194 | h1 ^= length 195 | h2 ^= length 196 | 197 | h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 198 | h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 199 | 200 | h1 = fmix(h1) 201 | h2 = fmix(h2) 202 | 203 | h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 204 | h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 205 | 206 | return (h2 << 64 | h1) 207 | 208 | def hash128_x86(key, seed): 209 | def fmix(h): 210 | h ^= h >> 16 211 | h = (h * 0x85ebca6b) & 
0xFFFFFFFF 212 | h ^= h >> 13 213 | h = (h * 0xc2b2ae35) & 0xFFFFFFFF 214 | h ^= h >> 16 215 | return h 216 | 217 | length = len(key) 218 | nblocks = int(length / 16) 219 | 220 | h1 = seed 221 | h2 = seed 222 | h3 = seed 223 | h4 = seed 224 | 225 | c1 = 0x239b961b 226 | c2 = 0xab0e9789 227 | c3 = 0x38b34ae5 228 | c4 = 0xa1e38b93 229 | 230 | 231 | for block_start in range(0, nblocks * 16, 16): 232 | k1 = key[block_start + 3] << 24 | \ 233 | key[block_start + 2] << 16 | \ 234 | key[block_start + 1] << 8 | \ 235 | key[block_start + 0] 236 | 237 | k2 = key[block_start + 7] << 24 | \ 238 | key[block_start + 6] << 16 | \ 239 | key[block_start + 5] << 8 | \ 240 | key[block_start + 4] 241 | 242 | k3 = key[block_start + 11] << 24 | \ 243 | key[block_start + 10] << 16 | \ 244 | key[block_start + 9] << 8 | \ 245 | key[block_start + 8] 246 | 247 | k4 = key[block_start + 15] << 24 | \ 248 | key[block_start + 14] << 16 | \ 249 | key[block_start + 13] << 8 | \ 250 | key[block_start + 12] 251 | 252 | k1 = (c1 * k1) & 0xFFFFFFFF 253 | k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF 254 | k1 = (c2 * k1) & 0xFFFFFFFF 255 | h1 ^= k1 256 | 257 | h1 = (h1 << 19 | h1 >> 13) & 0xFFFFFFFF 258 | h1 = (h1 + h2) & 0xFFFFFFFF 259 | h1 = (h1 * 5 + 0x561ccd1b) & 0xFFFFFFFF 260 | 261 | k2 = (c2 * k2) & 0xFFFFFFFF 262 | k2 = (k2 << 16 | k2 >> 16) & 0xFFFFFFFF 263 | k2 = (c3 * k2) & 0xFFFFFFFF 264 | h2 ^= k2 265 | 266 | h2 = (h2 << 17 | h2 >> 15) & 0xFFFFFFFF 267 | h2 = (h2 + h3) & 0xFFFFFFFF 268 | h2 = (h2 * 5 + 0x0bcaa747) & 0xFFFFFFFF 269 | 270 | k3 = (c3 * k3) & 0xFFFFFFFF 271 | k3 = (k3 << 17 | k3 >> 15) & 0xFFFFFFFF 272 | k3 = (c4 * k3) & 0xFFFFFFFF 273 | h3 ^= k3 274 | 275 | h3 = (h3 << 15 | h3 >> 17) & 0xFFFFFFFF 276 | h3 = (h3 + h4) & 0xFFFFFFFF 277 | h3 = (h3 * 5 + 0x96cd1c35) & 0xFFFFFFFF 278 | 279 | k4 = (c4 * k4) & 0xFFFFFFFF 280 | k4 = (k4 << 18 | k4 >> 14) & 0xFFFFFFFF 281 | k4 = (c1 * k4) & 0xFFFFFFFF 282 | h4 ^= k4 283 | 284 | h4 = (h4 << 13 | h4 >> 19) & 0xFFFFFFFF 285 | h4 = (h1 + h4) & 
0xFFFFFFFF 286 | h4 = (h4 * 5 + 0x32ac3b17) & 0xFFFFFFFF 287 | 288 | 289 | tail_index = nblocks * 16 290 | k1 = 0 291 | k2 = 0 292 | k3 = 0 293 | k4 = 0 294 | tail_size = length & 15 295 | 296 | if tail_size >= 15: 297 | k4 ^= key[tail_index + 14] << 16 298 | if tail_size >= 14: 299 | k4 ^= key[tail_index + 13] << 8 300 | if tail_size >= 13: 301 | k4 ^= key[tail_index + 12] 302 | 303 | if tail_size > 12: 304 | k4 = (k4 * c4) & 0xFFFFFFFF 305 | k4 = (k4 << 18 | k4 >> 14) & 0xFFFFFFFF 306 | k4 = (k4 * c1) & 0xFFFFFFFF 307 | h4 ^= k4 308 | 309 | if tail_size >= 12: 310 | k3 ^= key[tail_index + 11] << 24 311 | if tail_size >= 11: 312 | k3 ^= key[tail_index + 10] << 16 313 | if tail_size >= 10: 314 | k3 ^= key[tail_index + 9] << 8 315 | if tail_size >= 9: 316 | k3 ^= key[tail_index + 8] 317 | 318 | if tail_size > 8: 319 | k3 = (k3 * c3) & 0xFFFFFFFF 320 | k3 = (k3 << 17 | k3 >> 15) & 0xFFFFFFFF 321 | k3 = (k3 * c4) & 0xFFFFFFFF 322 | h3 ^= k3 323 | 324 | if tail_size >= 8: 325 | k2 ^= key[tail_index + 7] << 24 326 | if tail_size >= 7: 327 | k2 ^= key[tail_index + 6] << 16 328 | if tail_size >= 6: 329 | k2 ^= key[tail_index + 5] << 8 330 | if tail_size >= 5: 331 | k2 ^= key[tail_index + 4] 332 | 333 | if tail_size > 4: 334 | k2 = (k2 * c2) & 0xFFFFFFFF 335 | k2 = (k2 << 16 | k2 >> 16) & 0xFFFFFFFF 336 | k2 = (k2 * c3) & 0xFFFFFFFF 337 | h2 ^= k2 338 | 339 | if tail_size >= 4: 340 | k1 ^= key[tail_index + 3] << 24 341 | if tail_size >= 3: 342 | k1 ^= key[tail_index + 2] << 16 343 | if tail_size >= 2: 344 | k1 ^= key[tail_index + 1] << 8 345 | if tail_size >= 1: 346 | k1 ^= key[tail_index + 0] 347 | 348 | if tail_size > 0: 349 | k1 = (k1 * c1) & 0xFFFFFFFF 350 | k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF 351 | k1 = (k1 * c2) & 0xFFFFFFFF 352 | h1 ^= k1 353 | 354 | 355 | h1 ^= length 356 | h2 ^= length 357 | h3 ^= length 358 | h4 ^= length 359 | 360 | h1 = (h1 + h2) & 0xFFFFFFFF 361 | h1 = (h1 + h3) & 0xFFFFFFFF 362 | h1 = (h1 + h4) & 0xFFFFFFFF 363 | h2 = (h1 + h2) & 
0xFFFFFFFF 364 | h3 = (h1 + h3) & 0xFFFFFFFF 365 | h4 = (h1 + h4) & 0xFFFFFFFF 366 | 367 | h1 = fmix(h1) 368 | h2 = fmix(h2) 369 | h3 = fmix(h3) 370 | h4 = fmix(h4) 371 | 372 | h1 = (h1 + h2) & 0xFFFFFFFF 373 | h1 = (h1 + h3) & 0xFFFFFFFF 374 | h1 = (h1 + h4) & 0xFFFFFFFF 375 | h2 = (h1 + h2) & 0xFFFFFFFF 376 | h3 = (h1 + h3) & 0xFFFFFFFF 377 | h4 = (h1 + h4) & 0xFFFFFFFF 378 | 379 | return (h4 << 96 | h3 << 64 | h2 << 32 | h1) 380 | 381 | temp = key.encode() 382 | key = bytearray(temp) 383 | 384 | if x64arch: 385 | return hash128_x64(key, seed) 386 | else: 387 | return hash128_x86(key, seed) 388 | 389 | 390 | def hash_bytes(key, seed=0x0, x64arch=True): 391 | """ 128位实现bytes版""" 392 | 393 | hash_128 = hash128(key, seed, x64arch) 394 | 395 | bytestring = '' 396 | 397 | for i in range(0, 16, 1): 398 | lsbyte = hash_128 & 0xFF 399 | bytestring = bytestring + str(chr(lsbyte)) 400 | hash_128 = hash_128 >> 8 401 | 402 | return bytestring 403 | 404 | 405 | if __name__ == '__main__': 406 | print(hash128('hello world')) 407 | --------------------------------------------------------------------------------