├── __init__.py ├── core ├── __init__.py ├── primernumber.py ├── filebitarray.py ├── persistfilter.py └── murmurhash.py ├── test ├── __init__.py ├── pytest.ini ├── test_http_server.py ├── test_murmurhash.py ├── test_filebitarray.py ├── test_rpcserver.py ├── test_tcpserver.py └── test_persistfilter.py ├── readme.md ├── pyproject.toml ├── aaa.py ├── tools.py └── .gitignore /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /test/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | pythonpath = .. 
3 | log_cli_level = INFO -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ### 基于虚拟内存映射的布隆过滤器 2 | 3 | - 基于硬盘的位数组 4 | - 使用murmurhash作为hash函数,高效&碰撞率低 5 | - 使用硬盘进行去重,相对于内存型过滤器可扩容千倍 6 | - 自定义容纳元素个数与误差率 7 | - 百亿去重约使用33g硬盘空间(误差率百万分之一情况) 8 | - 支持多进程,多个程序可持有同一文件句柄 9 | - 性能及空间高于数据库去重,单条数据最多占用hash函数个数*1bit空间(误差率百万分之一情况下为19bit) 10 | 11 | ###测试 12 | | 京东京造 SATA3 | 西部数据SN550 NVME| 13 | | ------------ | ------------ | 14 | | 8k op/s |5w op/s | 15 | 16 | -------------------------------------------------------------------------------- /test/test_http_server.py: -------------------------------------------------------------------------------- 1 | # import aiohttp 2 | # import zerorpc 3 | from tools import caculate_time 4 | import asyncio 5 | 6 | def test_run_http_client(): 7 | 8 | async def fetch(session): 9 | html = await session.get('http://127.0.0.1:8000/') 10 | 11 | async def main(): 12 | async with aiohttp.ClientSession() as session: 13 | await asyncio.wait([asyncio.create_task(fetch(session)) for i in range(10000)]) 14 | asyncio.run(main()) -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "persistfilter" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["dream2333"] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.8" 11 | #mmr3 = "^1.3.1" 12 | pybloomfiltermmap3 = "^0.5.7" 13 | 14 | [tool.poetry.group.dev.dependencies] 15 | black = "^23.1.0" 16 | pytest = "^7.2.2" 17 | 18 | 19 | 20 | 21 | [build-system] 22 | requires = ["poetry-core"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /aaa.py: 
-------------------------------------------------------------------------------- 1 | from multiprocessing import Process 2 | 3 | from core.persistfilter import PersistFilter 4 | 5 | 6 | def multi_test(): 7 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 8 | for i in range(1000): 9 | persist_filter.add(f"{i}") 10 | persist_filter.close() 11 | 12 | if __name__ == "__main__": 13 | process_list = [] 14 | for i in range(5): #开启5个子进程执行fun1函数 15 | p = Process(multi_test) #实例化进程对象 16 | p.start() 17 | process_list.append(p) 18 | 19 | for i in process_list: 20 | p.join() 21 | -------------------------------------------------------------------------------- /test/test_murmurhash.py: -------------------------------------------------------------------------------- 1 | from tools import detect_python_interpreter 2 | 3 | if detect_python_interpreter() == "PyPy": 4 | from core.murmurhash import hash128 as hash128_x64 5 | else: 6 | from mmr3 import hash128_x64 7 | 8 | 9 | 10 | def test_murmurhash_bench(): 11 | # 一百万次hash取值 12 | for i in range(1000000): 13 | hash = hash128_x64(f"{i}", seed=31) 14 | 15 | def test_get_offset_range(): 16 | for i in range(5000000): 17 | for seed in range(19): 18 | # 计算哈希值 19 | hash128 = hash128_x64("1231242341235345", seed) 20 | 21 | -------------------------------------------------------------------------------- /test/test_filebitarray.py: -------------------------------------------------------------------------------- 1 | from core.filebitarray import FileBitArray 2 | from tools import caculate_time 3 | 4 | 5 | def test_write_bench(): 6 | fba = FileBitArray("/media/dream/软件/Programming/vsProject/PersistFilter/testfilter.bin", 10000000000) 7 | for i in range(10000000): 8 | fba[i] = 1 9 | fba.close() 10 | assert 1 11 | 12 | # 测试读取所有的数组元素 13 | def test_read_array_elements(): 14 | fba = FileBitArray("/media/dream/软件/Programming/vsProject/PersistFilter/testfilter.bin", 10000000000) 15 | for i in range(10000000): 16 | 
bit = fba[i] 17 | fba.close() 18 | print(bit) 19 | assert 1 -------------------------------------------------------------------------------- /test/test_rpcserver.py: -------------------------------------------------------------------------------- 1 | # import zerorpc 2 | # from tools import caculate_time 3 | # from concurrent.futures.thread import ThreadPoolExecutor 4 | # from concurrent.futures.process import ProcessPoolExecutor 5 | # @caculate_time 6 | # def run_rpc_client(): 7 | # c = zerorpc.Client() 8 | # c.connect("tcp://127.0.0.1:4242") 9 | # def rpc(): 10 | # r = c.hello("hello") 11 | # print(r) 12 | 13 | # with ThreadPoolExecutor(max_workers=5) as pool: 14 | # for i in range(10): 15 | # pool.submit(rpc) 16 | 17 | 18 | # # with ProcessPoolExecutor(max_workers=4) as pool: 19 | # # for i in range(5): 20 | # # pool.submit(run) 21 | 22 | # run_rpc_client() -------------------------------------------------------------------------------- /test/test_tcpserver.py: -------------------------------------------------------------------------------- 1 | import socket 2 | import sys 3 | from tools import caculate_time 4 | 5 | 6 | @caculate_time 7 | def test_tcp(): 8 | HOST, PORT = "localhost", 9999 9 | for i in range(100): 10 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: 11 | sock.connect((HOST, PORT)) 12 | sock.sendall("123456789".encode("utf-8")) 13 | received = bool(sock.recv(1)) 14 | 15 | 16 | def test_tcp_client(): 17 | HOST, PORT = "localhost", 9999 18 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: 19 | sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1) 20 | sock.connect((HOST, PORT)) 21 | sock.sendall("123456789".encode("utf-8")) 22 | received = bool(sock.recv(1)) 23 | print(received) 24 | -------------------------------------------------------------------------------- /tools.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import time 4 | from 
platform import python_implementation 5 | 6 | # 计算函数运行时间的 7 | def caculate_time(func): 8 | def wrapper(*args, **kwargs): 9 | start_time = time.time() 10 | func(*args, **kwargs) 11 | end_time = time.time() 12 | print("函数运行时间为:%s" % (end_time - start_time)) 13 | return wrapper 14 | 15 | # 检测python解释器 16 | def detect_python_interpreter(): 17 | try: 18 | from platform import python_implementation 19 | except ImportError: # pragma: no cover 20 | def python_implementation(): 21 | """Return a string identifying the Python implementation.""" 22 | if 'PyPy' in sys.version: 23 | return 'PyPy' 24 | if os.name == 'java': 25 | return 'Jython' 26 | if sys.version.startswith('IronPython'): 27 | return 'IronPython' 28 | return 'CPython' 29 | return python_implementation() 30 | 31 | -------------------------------------------------------------------------------- /core/primernumber.py: -------------------------------------------------------------------------------- 1 | from array import array 2 | 3 | primer_numbers = (3, 4 | 5, 5 | 7, 6 | 11, 7 | 13, 8 | 17, 9 | 19, 10 | 23, 11 | 29, 12 | 31, 13 | 37, 14 | 41, 15 | 43, 16 | 47, 17 | 53, 18 | 59, 19 | 61, 20 | 67, 21 | 71, 22 | 73, 23 | 79, 24 | 83, 25 | 89, 26 | 97, 27 | 101, 28 | 103, 29 | 107, 30 | 109, 31 | 113, 32 | 127, 33 | 131, 34 | 137, 35 | 139, 36 | 149, 37 | 151, 38 | 157, 39 | 163, 40 | 167, 41 | 173, 42 | 179, 43 | 181, 44 | 191, 45 | 193, 46 | 197, 47 | 199, 48 | 211, 49 | 223, 50 | 227, 51 | 229, 52 | 233, 53 | 239, 54 | 241, 55 | 251 56 | ) 57 | -------------------------------------------------------------------------------- /core/filebitarray.py: -------------------------------------------------------------------------------- 1 | import math 2 | import mmap 3 | import sys 4 | 5 | 6 | class FileBitArray: 7 | __slots__ = "__f", "__m" 8 | 9 | def __init__(self, filename, bit_array_size): 10 | """ 11 | 将大文件映射到虚拟内存并提供bitarray接口 12 | :param filename: 文件名 13 | :param bit_array_size: 位数组大小 14 | """ 15 | size = 
math.ceil(bit_array_size / 8) 16 | # 创建指定大小空文件 17 | try: 18 | self.__createfile(filename,size) 19 | except: 20 | ... 21 | self.__f = open(filename, "r+b",buffering=0) 22 | self.__m = mmap.mmap(self.__f.fileno(), size, access=mmap.ACCESS_DEFAULT) 23 | # 如果系统为linux,则通知内核随机读写优化 24 | if sys.version_info >= (3, 8) and sys.platform != "win32": 25 | self.__m.madvise(mmap.MADV_RANDOM) 26 | 27 | 28 | def __createfile(self,filename,size): 29 | with open(filename,'xb') as f: 30 | f.seek(size-1) 31 | f.write(b'\x00') 32 | 33 | # 对文件对象内存映射进行读bit操作 34 | def __getitem__(self, index): 35 | byte_offset, bit_offset = divmod(index, 8) 36 | byte = self.__m[byte_offset] 37 | bit = (byte >> (7 - bit_offset)) & 1 38 | return bit 39 | 40 | # 对文件对象内存映射进行写bit操作 41 | def __setitem__(self, index, value): 42 | byte_offset = index // 8 43 | bit_offset = index % 8 44 | byte = self.__m[byte_offset] 45 | if value: 46 | byte |= 1 << 7 - bit_offset 47 | else: 48 | byte &= ~(1 << 7 - bit_offset) 49 | self.__m[byte_offset] = byte 50 | # 刷盘 51 | # flush_offset = byte_offset // mmap.ALLOCATIONGRANULARITY * mmap.ALLOCATIONGRANULARITY 52 | # self.__m.flush(flush_offset,byte_offset-flush_offset+1) 53 | 54 | # 同步刷全盘 55 | def flush(self): 56 | self.__m.flush() 57 | 58 | def __len__(self): 59 | return self.__m.size() 60 | 61 | def close(self): 62 | self.__m.close() 63 | self.__f.close() 64 | -------------------------------------------------------------------------------- /core/persistfilter.py: -------------------------------------------------------------------------------- 1 | import io 2 | import math 3 | 4 | from core.filebitarray import FileBitArray 5 | from tools import detect_python_interpreter 6 | from core.primernumber import primer_numbers 7 | 8 | # 如果是pypy,将使用自己的hash函数进行jit加速 9 | if detect_python_interpreter() == "PyPy": 10 | from core.murmurhash import hash128 as hash128_x64 11 | else: 12 | from mmr3 import hash128_x64 13 | 14 | 15 | # 素数的英文是 a: 16 | class PersistFilter(object): 17 | __slots__ = 
"bit_array_size", "hash_func_count", "bit_array", "primer_numbers" 18 | 19 | def __init__(self, filename: str, num: int, error_rate: float): 20 | """ 21 | 基于虚拟内存的持久化过滤器 22 | :param filename: 文件名 23 | :param num: 预计插入的元素个数 24 | :param error_rate: 允许的误判率 25 | :return: 字节数组大小和哈希函数个数 26 | """ 27 | # 计算位数组大小 28 | self.bit_array_size = int(-(num * math.log(error_rate)) / (math.log(2) ** 2)) 29 | # 计算哈希函数个数 30 | self.hash_func_count = int((self.bit_array_size / num) * math.log(2)) 31 | self.primer_numbers = primer_numbers[0:self.hash_func_count] 32 | # 计算过滤器大小 33 | cost_bytes = math.ceil(self.bit_array_size / 8) 34 | cost_gigabytes = cost_bytes / 1024 / 1024 / 1024 35 | # 生成文件字节数组 36 | self.bit_array = FileBitArray(filename, self.bit_array_size) 37 | print( 38 | f"过滤器大小:{cost_gigabytes:.3f}GB " 39 | f"哈希函数个数:{self.hash_func_count} " 40 | f"误判率:{error_rate:1.8f} " 41 | f"预计插入元素个数:{num} " 42 | ) 43 | 44 | def add(self, value: str): 45 | """ 46 | 将元素插入到过滤器中 47 | :param value: 待插入的元素 48 | """ 49 | for index in self.get_offset_index(value): 50 | # 先读后写,将对应的bit位置为1 51 | if self.bit_array[index] == 0: 52 | self.bit_array[index] = 1 53 | # # 只写入 54 | # self.bit_array[index] = 1 55 | 56 | def __contains__(self, value: str): 57 | """ 58 | 判断元素是否存在于过滤器中 59 | :param value: 待查找的元素 60 | :return: True or False 61 | """ 62 | # 对每个哈希值进行判断 63 | for index in self.get_offset_index(value): 64 | # 如果有一个bit位为0,说明不存在 65 | if self.bit_array[index] == 0: 66 | return False 67 | return True 68 | 69 | def get_offset_index(self, value: str): 70 | """ 71 | 根据hash函数的个数,计算不同hash在文件映射的bit位 72 | :param value: 待计算哈希值的元素 73 | :param hash_count: 哈希函数个数 74 | :return: 哈希值列表 75 | """ 76 | for seed in primer_numbers: 77 | # 计算哈希值 78 | hash128 = hash128_x64(value, seed) 79 | # 取模,映射到虚拟内存中的地址 80 | yield hash128 % self.bit_array_size 81 | # 顺序读写提高hdd性能 82 | # offset_list = [(hash128_x64(value, seed) % self.bit_array_size )for seed in primer_numbers] 83 | # offset_list.sort() 84 | # return offset_list 85 | 86 | def 
close(self): 87 | """ 88 | 关闭文件 89 | """ 90 | self.bit_array.close() 91 | 92 | 93 | if __name__ == "__main__": 94 | var = io.DEFAULT_BUFFER_SIZE 95 | print(var) 96 | PersistFilter("G:/testfilter.bin", 10000000000, 0.000001) 97 | -------------------------------------------------------------------------------- /test/test_persistfilter.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process 2 | from threading import Thread 3 | import time 4 | from core.persistfilter import PersistFilter 5 | from tools import caculate_time 6 | 7 | # 本地过滤器容量10亿 中添加100万元素 单次19hash 最差情况写入760k 8 | 9 | # 单进程 10 | # 随机 读+写 67秒 二次运行 34秒 11 | # 顺序 读+写 72秒 二次运行 35秒 12 | 13 | # 6进程 14 | # 随机 读+写 21秒 二次运行 6秒 15 | # 顺序 读+写 23秒 二次运行 6.85秒 16 | 17 | # 32进程 18 | # 随机 读+写 20-26秒 二次运行 6.24秒 19 | # 顺序 读+写 16-20秒 二次运行 6.05秒 20 | 21 | # 服务器四进程 1.1w/s 22 | 23 | 24 | # 测试在百亿容量中连续添加一万个元素所需的时间 25 | def test_add(): 26 | 27 | # 设置容量十亿,误判率百万分之一 28 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 29 | for i in range(100000): 30 | persist_filter.add(f"{i}") 31 | persist_filter.close() 32 | assert 1 33 | 34 | # 测试在百亿容量中查找一万个是否存在所需的时间 35 | def test_exist(): 36 | # 设置容量十亿,误判率百万分之一 37 | dup_count = 0 38 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 39 | for i in range(1000000): 40 | if f"{i}" in persist_filter: 41 | dup_count+=1 42 | print(dup_count) 43 | assert 1 44 | 45 | # 多线程添加 46 | def test_multi_t_add(): 47 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 48 | def multi_test(start,end): 49 | print(start,end) 50 | time.sleep(2) 51 | for i in range(start,end): 52 | persist_filter.add(f"{i}") 53 | 54 | def run(times,t_count): 55 | thread_list = [] 56 | for i in range(t_count): 57 | step = int(times/t_count) 58 | p = Thread(target= multi_test,args=(i*step,i*step+step)) 59 | p.start() 60 | thread_list.append(p) 61 | 62 | for i in 
thread_list: 63 | i.join() # BUGFIX: join each thread (was p.join(), which only joined the last thread) 64 | persist_filter.close() 65 | run(100000,8) 66 | # capacity one billion, error rate one in a million 67 | 68 | # multi-process add 69 | def test_multi_p_add(): 70 | def multi_test(start,end): 71 | print(start,end) 72 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 73 | time.sleep(2) 74 | for i in range(start,end): 75 | persist_filter.add(f"{i}") 76 | persist_filter.close() 77 | 78 | def run(times,t_count): 79 | process_list = [] 80 | for i in range(t_count): 81 | step = int(times/t_count) 82 | p = Process(target= multi_test,args=(i*step,i*step+step)) 83 | p.start() 84 | process_list.append(p) 85 | 86 | for i in process_list: 87 | i.join() # BUGFIX: join each process (was p.join()) 88 | run(1000000,8) 89 | # capacity one billion, error rate one in a million 90 | 91 | # multi-process lookup 92 | def test_multi_p_exist(): 93 | def multi_test(start,end): 94 | persist_filter = PersistFilter("/home/dream/桌面/testfilter.bin", 1000000000,0.000001) 95 | time.sleep(2) 96 | for i in range(start,end): 97 | f"{i}" in persist_filter 98 | persist_filter.close() 99 | 100 | def run(times,t_count): 101 | process_list = [] 102 | step = int(times/t_count) 103 | for i in range(t_count): 104 | p = Process(target= multi_test,args=(i*step,i*step+step)) 105 | p.start() 106 | process_list.append(p) 107 | 108 | for i in process_list: 109 | i.join() # BUGFIX: join each process (was p.join()) 110 | run(1000000,6) 111 | # capacity one billion, error rate one in a million -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | poetry.lock 162 | testfilter.bin 163 | .vscode/launch.json 164 | .gitignore 165 | -------------------------------------------------------------------------------- /core/murmurhash.py: -------------------------------------------------------------------------------- 1 | def hash32(key: str, seed=0x0): 2 | ''' 32位 murmur3 实现 ''' 3 | 4 | key = bytearray(key.encode()) 5 | 6 | def fmix(h): 7 | h ^= h >> 16 8 | h = (h * 0x85ebca6b) & 0xFFFFFFFF 9 | h ^= h >> 13 10 | h = (h * 0xc2b2ae35) & 0xFFFFFFFF 11 | h ^= h >> 16 12 | return h 13 | 14 | length = len(key) 15 | nblocks = int(length / 4) 16 | 17 | h1 = seed 18 | 19 | c1 = 0xcc9e2d51 20 | c2 = 0x1b873593 21 | 22 | 23 | for block_start in range(0, nblocks * 4, 4): 24 | 25 | k1 = key[block_start + 3] << 24 | \ 26 | key[block_start + 2] << 16 | \ 27 | key[block_start + 1] << 8 | \ 28 | key[block_start + 0] 29 | 30 | k1 = (c1 * k1) & 0xFFFFFFFF 31 | k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF 32 | k1 = (c2 * k1) & 0xFFFFFFFF 33 | 34 | h1 ^= k1 35 | h1 = (h1 << 13 | h1 >> 19) & 0xFFFFFFFF 36 | h1 = (h1 * 5 + 0xe6546b64) & 0xFFFFFFFF 37 | 38 | tail_index = nblocks * 4 39 | k1 = 0 40 | tail_size = length & 3 41 | 42 | if tail_size >= 3: 43 | k1 ^= key[tail_index + 2] << 16 44 | if tail_size >= 2: 45 | k1 ^= key[tail_index + 1] << 8 46 | if tail_size >= 1: 47 | k1 ^= key[tail_index + 0] 48 | 49 | if tail_size > 0: 50 | k1 = (k1 * c1) & 0xFFFFFFFF 51 | k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF 52 | k1 = (k1 * c2) & 0xFFFFFFFF 53 | h1 ^= k1 54 | 55 | 56 | unsigned_val = fmix(h1 ^ length) 57 | if unsigned_val & 0x80000000 == 0: 58 | return unsigned_val 59 | else: 60 | return -((unsigned_val ^ 0xFFFFFFFF) + 1) 61 | 62 | 63 | def hash64(key, seed=0x0, x64arch=True): 64 | """ 64位 murmur3 实现 ,返回两个64位int""" 65 | 66 | hash_128 = hash128(key, seed, x64arch) 67 | 68 | unsigned_val1 = hash_128 & 0xFFFFFFFFFFFFFFFF 69 | if unsigned_val1 & 0x8000000000000000 == 0: 70 | signed_val1 = unsigned_val1 71 | else: 72 | signed_val1 = 
-((unsigned_val1 ^ 0xFFFFFFFFFFFFFFFF) + 1) 73 | 74 | unsigned_val2 = (hash_128 >> 64) & 0xFFFFFFFFFFFFFFFF 75 | if unsigned_val2 & 0x8000000000000000 == 0: 76 | signed_val2 = unsigned_val2 77 | else: 78 | signed_val2 = -((unsigned_val2 ^ 0xFFFFFFFFFFFFFFFF) + 1) 79 | 80 | return (int(signed_val1), int(signed_val2)) 81 | 82 | 83 | def hash128(key, seed=0x0, x64arch=True): 84 | """ 128位 murmur3 实现 """ 85 | 86 | def hash128_x64(key, seed): 87 | def fmix(k): 88 | k ^= k >> 33 89 | k = (k * 0xff51afd7ed558ccd) & 0xFFFFFFFFFFFFFFFF 90 | k ^= k >> 33 91 | k = (k * 0xc4ceb9fe1a85ec53) & 0xFFFFFFFFFFFFFFFF 92 | k ^= k >> 33 93 | return k 94 | 95 | length = len(key) 96 | nblocks = int(length / 16) 97 | 98 | h1 = seed 99 | h2 = seed 100 | 101 | c1 = 0x87c37b91114253d5 102 | c2 = 0x4cf5ad432745937f 103 | 104 | 105 | for block_start in range(0, nblocks * 8, 8): 106 | 107 | k1 = key[2 * block_start + 7] << 56 | \ 108 | key[2 * block_start + 6] << 48 | \ 109 | key[2 * block_start + 5] << 40 | \ 110 | key[2 * block_start + 4] << 32 | \ 111 | key[2 * block_start + 3] << 24 | \ 112 | key[2 * block_start + 2] << 16 | \ 113 | key[2 * block_start + 1] << 8 | \ 114 | key[2 * block_start + 0] 115 | 116 | k2 = key[2 * block_start + 15] << 56 | \ 117 | key[2 * block_start + 14] << 48 | \ 118 | key[2 * block_start + 13] << 40 | \ 119 | key[2 * block_start + 12] << 32 | \ 120 | key[2 * block_start + 11] << 24 | \ 121 | key[2 * block_start + 10] << 16 | \ 122 | key[2 * block_start + 9] << 8 | \ 123 | key[2 * block_start + 8] 124 | 125 | k1 = (c1 * k1) & 0xFFFFFFFFFFFFFFFF 126 | k1 = (k1 << 31 | k1 >> 33) & 0xFFFFFFFFFFFFFFFF 127 | k1 = (c2 * k1) & 0xFFFFFFFFFFFFFFFF 128 | h1 ^= k1 129 | 130 | h1 = (h1 << 27 | h1 >> 37) & 0xFFFFFFFFFFFFFFFF 131 | h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 132 | h1 = (h1 * 5 + 0x52dce729) & 0xFFFFFFFFFFFFFFFF 133 | 134 | k2 = (c2 * k2) & 0xFFFFFFFFFFFFFFFF 135 | k2 = (k2 << 33 | k2 >> 31) & 0xFFFFFFFFFFFFFFFF 136 | k2 = (c1 * k2) & 0xFFFFFFFFFFFFFFFF 137 | h2 ^= k2 
138 | 139 | h2 = (h2 << 31 | h2 >> 33) & 0xFFFFFFFFFFFFFFFF 140 | h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 141 | h2 = (h2 * 5 + 0x38495ab5) & 0xFFFFFFFFFFFFFFFF 142 | 143 | 144 | tail_index = nblocks * 16 145 | k1 = 0 146 | k2 = 0 147 | tail_size = length & 15 148 | 149 | if tail_size >= 15: 150 | k2 ^= key[tail_index + 14] << 48 151 | if tail_size >= 14: 152 | k2 ^= key[tail_index + 13] << 40 153 | if tail_size >= 13: 154 | k2 ^= key[tail_index + 12] << 32 155 | if tail_size >= 12: 156 | k2 ^= key[tail_index + 11] << 24 157 | if tail_size >= 11: 158 | k2 ^= key[tail_index + 10] << 16 159 | if tail_size >= 10: 160 | k2 ^= key[tail_index + 9] << 8 161 | if tail_size >= 9: 162 | k2 ^= key[tail_index + 8] 163 | 164 | if tail_size > 8: 165 | k2 = (k2 * c2) & 0xFFFFFFFFFFFFFFFF 166 | k2 = (k2 << 33 | k2 >> 31) & 0xFFFFFFFFFFFFFFFF 167 | k2 = (k2 * c1) & 0xFFFFFFFFFFFFFFFF 168 | h2 ^= k2 169 | 170 | if tail_size >= 8: 171 | k1 ^= key[tail_index + 7] << 56 172 | if tail_size >= 7: 173 | k1 ^= key[tail_index + 6] << 48 174 | if tail_size >= 6: 175 | k1 ^= key[tail_index + 5] << 40 176 | if tail_size >= 5: 177 | k1 ^= key[tail_index + 4] << 32 178 | if tail_size >= 4: 179 | k1 ^= key[tail_index + 3] << 24 180 | if tail_size >= 3: 181 | k1 ^= key[tail_index + 2] << 16 182 | if tail_size >= 2: 183 | k1 ^= key[tail_index + 1] << 8 184 | if tail_size >= 1: 185 | k1 ^= key[tail_index + 0] 186 | 187 | if tail_size > 0: 188 | k1 = (k1 * c1) & 0xFFFFFFFFFFFFFFFF 189 | k1 = (k1 << 31 | k1 >> 33) & 0xFFFFFFFFFFFFFFFF 190 | k1 = (k1 * c2) & 0xFFFFFFFFFFFFFFFF 191 | h1 ^= k1 192 | 193 | 194 | h1 ^= length 195 | h2 ^= length 196 | 197 | h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 198 | h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 199 | 200 | h1 = fmix(h1) 201 | h2 = fmix(h2) 202 | 203 | h1 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 204 | h2 = (h1 + h2) & 0xFFFFFFFFFFFFFFFF 205 | 206 | return (h2 << 64 | h1) 207 | 208 | def hash128_x86(key, seed): 209 | def fmix(h): 210 | h ^= h >> 16 211 | h = (h * 0x85ebca6b) & 
0xFFFFFFFF 212 | h ^= h >> 13 213 | h = (h * 0xc2b2ae35) & 0xFFFFFFFF 214 | h ^= h >> 16 215 | return h 216 | 217 | length = len(key) 218 | nblocks = int(length / 16) 219 | 220 | h1 = seed 221 | h2 = seed 222 | h3 = seed 223 | h4 = seed 224 | 225 | c1 = 0x239b961b 226 | c2 = 0xab0e9789 227 | c3 = 0x38b34ae5 228 | c4 = 0xa1e38b93 229 | 230 | 231 | for block_start in range(0, nblocks * 16, 16): 232 | k1 = key[block_start + 3] << 24 | \ 233 | key[block_start + 2] << 16 | \ 234 | key[block_start + 1] << 8 | \ 235 | key[block_start + 0] 236 | 237 | k2 = key[block_start + 7] << 24 | \ 238 | key[block_start + 6] << 16 | \ 239 | key[block_start + 5] << 8 | \ 240 | key[block_start + 4] 241 | 242 | k3 = key[block_start + 11] << 24 | \ 243 | key[block_start + 10] << 16 | \ 244 | key[block_start + 9] << 8 | \ 245 | key[block_start + 8] 246 | 247 | k4 = key[block_start + 15] << 24 | \ 248 | key[block_start + 14] << 16 | \ 249 | key[block_start + 13] << 8 | \ 250 | key[block_start + 12] 251 | 252 | k1 = (c1 * k1) & 0xFFFFFFFF 253 | k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF 254 | k1 = (c2 * k1) & 0xFFFFFFFF 255 | h1 ^= k1 256 | 257 | h1 = (h1 << 19 | h1 >> 13) & 0xFFFFFFFF 258 | h1 = (h1 + h2) & 0xFFFFFFFF 259 | h1 = (h1 * 5 + 0x561ccd1b) & 0xFFFFFFFF 260 | 261 | k2 = (c2 * k2) & 0xFFFFFFFF 262 | k2 = (k2 << 16 | k2 >> 16) & 0xFFFFFFFF 263 | k2 = (c3 * k2) & 0xFFFFFFFF 264 | h2 ^= k2 265 | 266 | h2 = (h2 << 17 | h2 >> 15) & 0xFFFFFFFF 267 | h2 = (h2 + h3) & 0xFFFFFFFF 268 | h2 = (h2 * 5 + 0x0bcaa747) & 0xFFFFFFFF 269 | 270 | k3 = (c3 * k3) & 0xFFFFFFFF 271 | k3 = (k3 << 17 | k3 >> 15) & 0xFFFFFFFF 272 | k3 = (c4 * k3) & 0xFFFFFFFF 273 | h3 ^= k3 274 | 275 | h3 = (h3 << 15 | h3 >> 17) & 0xFFFFFFFF 276 | h3 = (h3 + h4) & 0xFFFFFFFF 277 | h3 = (h3 * 5 + 0x96cd1c35) & 0xFFFFFFFF 278 | 279 | k4 = (c4 * k4) & 0xFFFFFFFF 280 | k4 = (k4 << 18 | k4 >> 14) & 0xFFFFFFFF 281 | k4 = (c1 * k4) & 0xFFFFFFFF 282 | h4 ^= k4 283 | 284 | h4 = (h4 << 13 | h4 >> 19) & 0xFFFFFFFF 285 | h4 = (h1 + h4) & 
0xFFFFFFFF 286 | h4 = (h4 * 5 + 0x32ac3b17) & 0xFFFFFFFF 287 | 288 | 289 | tail_index = nblocks * 16 290 | k1 = 0 291 | k2 = 0 292 | k3 = 0 293 | k4 = 0 294 | tail_size = length & 15 295 | 296 | if tail_size >= 15: 297 | k4 ^= key[tail_index + 14] << 16 298 | if tail_size >= 14: 299 | k4 ^= key[tail_index + 13] << 8 300 | if tail_size >= 13: 301 | k4 ^= key[tail_index + 12] 302 | 303 | if tail_size > 12: 304 | k4 = (k4 * c4) & 0xFFFFFFFF 305 | k4 = (k4 << 18 | k4 >> 14) & 0xFFFFFFFF 306 | k4 = (k4 * c1) & 0xFFFFFFFF 307 | h4 ^= k4 308 | 309 | if tail_size >= 12: 310 | k3 ^= key[tail_index + 11] << 24 311 | if tail_size >= 11: 312 | k3 ^= key[tail_index + 10] << 16 313 | if tail_size >= 10: 314 | k3 ^= key[tail_index + 9] << 8 315 | if tail_size >= 9: 316 | k3 ^= key[tail_index + 8] 317 | 318 | if tail_size > 8: 319 | k3 = (k3 * c3) & 0xFFFFFFFF 320 | k3 = (k3 << 17 | k3 >> 15) & 0xFFFFFFFF 321 | k3 = (k3 * c4) & 0xFFFFFFFF 322 | h3 ^= k3 323 | 324 | if tail_size >= 8: 325 | k2 ^= key[tail_index + 7] << 24 326 | if tail_size >= 7: 327 | k2 ^= key[tail_index + 6] << 16 328 | if tail_size >= 6: 329 | k2 ^= key[tail_index + 5] << 8 330 | if tail_size >= 5: 331 | k2 ^= key[tail_index + 4] 332 | 333 | if tail_size > 4: 334 | k2 = (k2 * c2) & 0xFFFFFFFF 335 | k2 = (k2 << 16 | k2 >> 16) & 0xFFFFFFFF 336 | k2 = (k2 * c3) & 0xFFFFFFFF 337 | h2 ^= k2 338 | 339 | if tail_size >= 4: 340 | k1 ^= key[tail_index + 3] << 24 341 | if tail_size >= 3: 342 | k1 ^= key[tail_index + 2] << 16 343 | if tail_size >= 2: 344 | k1 ^= key[tail_index + 1] << 8 345 | if tail_size >= 1: 346 | k1 ^= key[tail_index + 0] 347 | 348 | if tail_size > 0: 349 | k1 = (k1 * c1) & 0xFFFFFFFF 350 | k1 = (k1 << 15 | k1 >> 17) & 0xFFFFFFFF 351 | k1 = (k1 * c2) & 0xFFFFFFFF 352 | h1 ^= k1 353 | 354 | 355 | h1 ^= length 356 | h2 ^= length 357 | h3 ^= length 358 | h4 ^= length 359 | 360 | h1 = (h1 + h2) & 0xFFFFFFFF 361 | h1 = (h1 + h3) & 0xFFFFFFFF 362 | h1 = (h1 + h4) & 0xFFFFFFFF 363 | h2 = (h1 + h2) & 
0xFFFFFFFF 364 | h3 = (h1 + h3) & 0xFFFFFFFF 365 | h4 = (h1 + h4) & 0xFFFFFFFF 366 | 367 | h1 = fmix(h1) 368 | h2 = fmix(h2) 369 | h3 = fmix(h3) 370 | h4 = fmix(h4) 371 | 372 | h1 = (h1 + h2) & 0xFFFFFFFF 373 | h1 = (h1 + h3) & 0xFFFFFFFF 374 | h1 = (h1 + h4) & 0xFFFFFFFF 375 | h2 = (h1 + h2) & 0xFFFFFFFF 376 | h3 = (h1 + h3) & 0xFFFFFFFF 377 | h4 = (h1 + h4) & 0xFFFFFFFF 378 | 379 | return (h4 << 96 | h3 << 64 | h2 << 32 | h1) 380 | 381 | temp = key.encode() 382 | key = bytearray(temp) 383 | 384 | if x64arch: 385 | return hash128_x64(key, seed) 386 | else: 387 | return hash128_x86(key, seed) 388 | 389 | 390 | def hash_bytes(key, seed=0x0, x64arch=True): 391 | """ 128位实现bytes版""" 392 | 393 | hash_128 = hash128(key, seed, x64arch) 394 | 395 | bytestring = '' 396 | 397 | for i in range(0, 16, 1): 398 | lsbyte = hash_128 & 0xFF 399 | bytestring = bytestring + str(chr(lsbyte)) 400 | hash_128 = hash_128 >> 8 401 | 402 | return bytestring 403 | 404 | 405 | if __name__ == '__main__': 406 | print(hash128('hello world')) 407 | --------------------------------------------------------------------------------