├── BloomFilterRedis ├── BloomFilterRedis.py ├── GeneralHashFunctions.py └── __init__.py ├── License ├── README.md └── orange ├── BloomFilterRedis ├── BloomFilterRedis.py ├── GeneralHashFunctions.py └── __init__.py ├── count.py ├── orange ├── BloomFilterRedis │ ├── BloomFilterRedis.py │ ├── GeneralHashFunctions.py │ └── __init__.py ├── BloomRedisDupeFilter.py ├── __init__.py ├── items.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── orange_spider.py └── scrapy.cfg /BloomFilterRedis/BloomFilterRedis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import redis 4 | import GeneralHashFunctions 5 | 6 | 7 | class BloomFilterRedis: 8 | 9 | hash_list = ["rs_hash", "js_hash", "pjw_hash", "elf_hash", "bkdr_hash", 10 | "sdbm_hash", "djb_hash", "dek_hash"] 11 | 12 | def __init__(self, key, host='127.0.0.1', port=6379, hash_list=hash_list): 13 | # redis-bitmap的key 14 | self.key = key 15 | # redis连接信息 16 | self.pool = redis.ConnectionPool(host=host, port=port) 17 | self.handle = redis.StrictRedis(connection_pool=self.pool, charset='utf-8') 18 | # 哈希函数列表 19 | self.hash_list = hash_list 20 | 21 | @classmethod 22 | def random_generator(cls, hash_value): 23 | ''' 24 | 将hash函数得出的函数值映射到[0, 2^32-1]区间内 25 | ''' 26 | return hash_value % (1 << 32) 27 | 28 | def do_filter(self, item): 29 | ''' 30 | 检查是否是新的条目,是新条目则更新bitmap并返回True,是重复条目则返回False 31 | ''' 32 | flag = False 33 | for hash_func_str in self.hash_list: 34 | # 获得到hash函数对象 35 | hash_func = getattr(GeneralHashFunctions, hash_func_str) 36 | # 计算hash值 37 | hash_value = hash_func(item) 38 | # 将hash值映射到[0, 2^32]区间 39 | real_value = BloomFilterRedis.random_generator(hash_value) 40 | # bitmap中对应位是0,则置为1,并说明此条目为新的条目 41 | if self.handle.getbit(self.key, real_value) == 0: 42 | self.handle.setbit(self.key, real_value, 1) 43 | flag = True 44 | # 当所有hash值在bitmap中对应位都是1,说明此条目重复,返回False 45 | return flag 46 | 47 | 48 | if __name__ == "__main__": 49 | bloomFilterRedis = BloomFilterRedis("bloom") 50 | bloomFilterRedis.do_filter("one item to check") 51 | -------------------------------------------------------------------------------- /BloomFilterRedis/GeneralHashFunctions.py: -------------------------------------------------------------------------------- 1 | # 2 | #************************************************************************** 3 | #* * 4 | #* General Purpose Hash Function Algorithms Library * 5 | #* * 6 | #* Author: Arash Partow - 2002 * 7 | #* URL: http://www.partow.net * 8 | #* URL: http://www.partow.net/programming/hashfunctions/index.html * 9 | #* * 10 | #* Copyright notice: * 11 | #* Free use of the General Purpose Hash Function Algorithms Library is * 12 | #* permitted under the guidelines and in accordance with the MIT License. * 13 | #* http://www.opensource.org/licenses/MIT * 14 | #* * 15 | #************************************************************************** 16 | # 17 | 18 | 19 | def rs_hash(key): 20 | a = 378551 21 | b = 63689 22 | hash_value = 0 23 | for i in range(len(key)): 24 | hash_value = hash_value * a + ord(key[i]) 25 | a = a * b 26 | return hash_value 27 | 28 | 29 | def js_hash(key): 30 | hash_value = 1315423911 31 | for i in range(len(key)): 32 | hash_value ^= ((hash_value << 5) + ord(key[i]) + (hash_value >> 2)) 33 | return hash_value 34 | 35 | 36 | def pjw_hash(key): 37 | bits_in_unsigned_int = 4 * 8 38 | three_quarters = (bits_in_unsigned_int * 3) / 4 39 | one_eighth = bits_in_unsigned_int / 8 40 | high_bits = 0xFFFFFFFF << int(bits_in_unsigned_int - one_eighth) 41 | hash_value = 0 42 | test = 0 43 | 44 | for i in range(len(key)): 45 | hash_value = (hash_value << int(one_eighth)) + ord(key[i]) 46 | test = hash_value & high_bits 47 | if test != 0: 48 | hash_value = ((hash_value ^ (test >> int(three_quarters))) & (~high_bits)) 49 | return hash_value & 0x7FFFFFFF 50 | 51 | 52 | def elf_hash(key): 53 | hash_value = 0 54 | for i in range(len(key)): 55 | hash_value = (hash_value << 4) + ord(key[i]) 56 | x = hash_value & 0xF0000000 57 | if x != 0: 58 | hash_value ^= (x >> 24) 59 | hash_value &= ~x 60 | return hash_value 61 | 62 | 63 | def bkdr_hash(key): 64 | seed = 131 # 31 131 1313 13131 131313 etc.. 65 | hash_value = 0 66 | for i in range(len(key)): 67 | hash_value = (hash_value * seed) + ord(key[i]) 68 | return hash_value 69 | 70 | 71 | def sdbm_hash(key): 72 | hash_value = 0 73 | for i in range(len(key)): 74 | hash_value = ord(key[i]) + (hash_value << 6) + (hash_value << 16) - hash_value; 75 | return hash_value 76 | 77 | 78 | def djb_hash(key): 79 | hash_value = 5381 80 | for i in range(len(key)): 81 | hash_value = ((hash_value << 5) + hash_value) + ord(key[i]) 82 | return hash_value 83 | 84 | 85 | def dek_hash(key): 86 | hash_value = len(key); 87 | for i in range(len(key)): 88 | hash_value = ((hash_value << 5) ^ (hash_value >> 27)) ^ ord(key[i]) 89 | return hash_value 90 | 91 | 92 | def bp_hash(key): 93 | hash_value = 0 94 | for i in range(len(key)): 95 | hash_value = hash_value << 7 ^ ord(key[i]) 96 | return hash_value 97 | 98 | 99 | def fnv_hash(key): 100 | fnv_prime = 0x811C9DC5 101 | hash_value = 0 102 | for i in range(len(key)): 103 | hash_value *= fnv_prime 104 | hash_value ^= ord(key[i]) 105 | return hash_value 106 | 107 | 108 | def ap_hash(key): 109 | hash_value = 0xAAAAAAAA 110 | for i in range(len(key)): 111 | if (i & 1) == 0: 112 | hash_value ^= ((hash_value << 7) ^ ord(key[i]) * (hash_value >> 3)) 113 | else: 114 | hash_value ^= (~((hash_value << 11) + ord(key[i]) ^ (hash_value >> 5))) 115 | return hash_value 116 | -------------------------------------------------------------------------------- /BloomFilterRedis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kongtianyi/BloomFilterRedis/86a9098b5ee9db1215216c11607f039db2aa14fb/BloomFilterRedis/__init__.py -------------------------------------------------------------------------------- /License: -------------------------------------------------------------------------------- 1 | --------------------------------------------------------------------------------- 2 | 3 | The Star And Thank Author License (SATA) 4 | 5 | Copyright © 2017 kongtianyi(kongtianyi@foxmail.com) 6 | 7 | Project Url: https://github.com/kongtianyi/BloomFilterRedis 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in 17 | all copies or substantial portions of the Software. 18 | 19 | And wait, the most important, you shall star/+1/like the project(s) in project url 20 | section above first, and then thank the author(s) in Copyright section. 21 | 22 | Here are some suggested ways: 23 | 24 | - Email the authors a thank-you letter, and make friends with him/her/them. 25 | - Report bugs or issues. 26 | - Tell friends what a wonderful project this is. 27 | - And, sure, you can just express thanks in your mind without telling the world. 28 | 29 | Contributors of this project by forking have the option to add his/her name and 30 | forked project url at copyright and project url sections, but shall not delete 31 | or modify anything else in these two sections. 32 | 33 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 34 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 35 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 36 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 37 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 38 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 39 | THE SOFTWARE. 40 | 41 | --------------------------------------------------------------------------------- 42 | 43 | # 44 | #************************************************************************** 45 | #* * 46 | #* General Purpose Hash Function Algorithms Library * 47 | #* * 48 | #* Author: Arash Partow - 2002 * 49 | #* URL: http://www.partow.net * 50 | #* URL: http://www.partow.net/programming/hashfunctions/index.html * 51 | #* * 52 | #* Copyright notice: * 53 | #* Free use of the General Purpose Hash Function Algorithms Library is * 54 | #* permitted under the guidelines and in accordance with the MIT License. * 55 | #* http://www.opensource.org/licenses/MIT * 56 | #* * 57 | #************************************************************************** 58 | # 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 基于Redis的布隆过滤器 2 | 3 | ## 简介 4 | 5 | * BloomFilterRedis:使用Redis的Bitmap作为位数组构建起来的可扩展的布隆过滤器,位数组的默认长度为2^23,哈希函数默认为八个。 6 | * orange:Scrapy工程,以“橘子水”为出发点的爬取百度百科的爬虫,配置了基于BloomFilterRedis的过滤器。 7 | 8 | 关于Bitmap以及其它介绍详见我的博文[基于Redis的布隆过滤器的实现](http://blog.csdn.net/qq_30242609/article/details/71024458) 9 | 10 | ## 开发环境 11 | 12 | * python 2.7.12 13 | * Redis 3.2.8 14 | * python-redis 15 | * scrapy 1.3.3 16 | 17 | ## 使用方法 18 | 19 | ``` 20 | from BloomFilterRedis import BloomFilterRedis 21 | 22 | bloomFilterRedis = BloomFilterRedis("bloom") 23 | bloomFilterRedis.do_filter("one item to check") 24 | ``` 25 | 26 | ## Scrapy中的使用方法 27 | 28 | 1. 将`BloomFilterRedis`和复制到工程文件夹下,将`BloomRedisDupeFilter.py`复制到与`settings.py`同一目录下。 29 | 2. 在settings.py中配置以下字段: 30 | ``` 31 | # 配置过滤器为基于redis的布隆过滤器 32 | DUPEFILTER_CLASS = 'orange.BloomRedisDupeFilter.BloomRedisDupeFilter' 33 | # reids中bitmap的key,默认为‘bloom’ 34 | # BLOOM_REDIS_KEY = 'bloom' 35 | # redis的连接配置,默认为本机 36 | # BLOOM_REDIS_HOST = '127.0.0.1' 37 | # BLOOM_REDIS_PORT = 6379 38 | # 布隆过滤器的哈希列表,默认为8个,定义在GeneralHashFunctions中 39 | # BLOOM_HASH_LIST = ["rs_hash", "js_hash", "pjw_hash", "elf_hash", "bkdr_hash", "sdbm_hash", "djb_hash", "dek_hash"] 40 | ``` 41 | 42 | -------------------------------------------------------------------------------- /orange/BloomFilterRedis/BloomFilterRedis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import redis 4 | import GeneralHashFunctions # 这里pycharm爆红,属于误报 5 | 6 | 7 | class BloomFilterRedis: 8 | 9 | hash_list = ["rs_hash", "js_hash", "pjw_hash", "elf_hash", "bkdr_hash", 10 | "sdbm_hash", "djb_hash", "dek_hash"] 11 | 12 | def __init__(self, key, host='172.0.0.1', port=6379, hash_list=hash_list): 13 | # redis-bitmap的key 14 | self.key = key 15 | # redis连接信息 16 | self.pool = redis.ConnectionPool(host=host, port=port) 17 | self.handle = redis.StrictRedis(connection_pool=self.pool, charset='utf-8') 18 | # 哈希函数列表 19 | self.hash_list = hash_list 20 | 21 | @classmethod 22 | def random_generator(cls, hash_value): 23 | ''' 24 | 将hash函数得出的函数值映射到[0, 2^32-1]区间内 25 | ''' 26 | return hash_value % (1 << 32) 27 | 28 | def do_filter(self, item): 29 | ''' 30 | 检查是否是新的条目,是新条目则更新bitmap并返回True,是重复条目则返回False 31 | ''' 32 | flag = False 33 | for hash_func_str in self.hash_list: 34 | # 获得到hash函数对象 35 | hash_func = getattr(GeneralHashFunctions, hash_func_str) 36 | # 计算hash值 37 | hash_value = hash_func(item) 38 | # 将hash值映射到[0, 2^32]区间 39 | real_value = BloomFilterRedis.random_generator(hash_value) 40 | # bitmap中对应位是0,则置为1,并说明此条目为新的条目 41 | if self.handle.getbit(self.key, real_value) == 0: 42 | self.handle.setbit(self.key, real_value, 1) 43 | flag = True 44 | # 当所有hash值在bitmap中对应位都是1,说明此条目重复,返回False 45 | return flag 46 | -------------------------------------------------------------------------------- /orange/BloomFilterRedis/GeneralHashFunctions.py: -------------------------------------------------------------------------------- 1 | # 2 | #************************************************************************** 3 | #* * 4 | #* General Purpose Hash Function Algorithms Library * 5 | #* * 6 | #* Author: Arash Partow - 2002 * 7 | #* URL: http://www.partow.net * 8 | #* URL: http://www.partow.net/programming/hashfunctions/index.html * 9 | #* * 10 | #* Copyright notice: * 11 | #* Free use of the General Purpose Hash Function Algorithms Library is * 12 | #* permitted under the guidelines and in accordance with the MIT License. * 13 | #* http://www.opensource.org/licenses/MIT * 14 | #* * 15 | #************************************************************************** 16 | # 17 | 18 | 19 | def rs_hash(key): 20 | a = 378551 21 | b = 63689 22 | hash_value = 0 23 | for i in range(len(key)): 24 | hash_value = hash_value * a + ord(key[i]) 25 | a = a * b 26 | return hash_value 27 | 28 | 29 | def js_hash(key): 30 | hash_value = 1315423911 31 | for i in range(len(key)): 32 | hash_value ^= ((hash_value << 5) + ord(key[i]) + (hash_value >> 2)) 33 | return hash_value 34 | 35 | 36 | def pjw_hash(key): 37 | bits_in_unsigned_int = 4 * 8 38 | three_quarters = (bits_in_unsigned_int * 3) / 4 39 | one_eighth = bits_in_unsigned_int / 8 40 | high_bits = 0xFFFFFFFF << int(bits_in_unsigned_int - one_eighth) 41 | hash_value = 0 42 | test = 0 43 | 44 | for i in range(len(key)): 45 | hash_value = (hash_value << int(one_eighth)) + ord(key[i]) 46 | test = hash_value & high_bits 47 | if test != 0: 48 | hash_value = ((hash_value ^ (test >> int(three_quarters))) & (~high_bits)) 49 | return hash_value & 0x7FFFFFFF 50 | 51 | 52 | def elf_hash(key): 53 | hash_value = 0 54 | for i in range(len(key)): 55 | hash_value = (hash_value << 4) + ord(key[i]) 56 | x = hash_value & 0xF0000000 57 | if x != 0: 58 | hash_value ^= (x >> 24) 59 | hash_value &= ~x 60 | return hash_value 61 | 62 | 63 | def bkdr_hash(key): 64 | seed = 131 # 31 131 1313 13131 131313 etc.. 65 | hash_value = 0 66 | for i in range(len(key)): 67 | hash_value = (hash_value * seed) + ord(key[i]) 68 | return hash_value 69 | 70 | 71 | def sdbm_hash(key): 72 | hash_value = 0 73 | for i in range(len(key)): 74 | hash_value = ord(key[i]) + (hash_value << 6) + (hash_value << 16) - hash_value; 75 | return hash_value 76 | 77 | 78 | def djb_hash(key): 79 | hash_value = 5381 80 | for i in range(len(key)): 81 | hash_value = ((hash_value << 5) + hash_value) + ord(key[i]) 82 | return hash_value 83 | 84 | 85 | def dek_hash(key): 86 | hash_value = len(key); 87 | for i in range(len(key)): 88 | hash_value = ((hash_value << 5) ^ (hash_value >> 27)) ^ ord(key[i]) 89 | return hash_value 90 | 91 | 92 | def bp_hash(key): 93 | hash_value = 0 94 | for i in range(len(key)): 95 | hash_value = hash_value << 7 ^ ord(key[i]) 96 | return hash_value 97 | 98 | 99 | def fnv_hash(key): 100 | fnv_prime = 0x811C9DC5 101 | hash_value = 0 102 | for i in range(len(key)): 103 | hash_value *= fnv_prime 104 | hash_value ^= ord(key[i]) 105 | return hash_value 106 | 107 | 108 | def ap_hash(key): 109 | hash_value = 0xAAAAAAAA 110 | for i in range(len(key)): 111 | if (i & 1) == 0: 112 | hash_value ^= ((hash_value << 7) ^ ord(key[i]) * (hash_value >> 3)) 113 | else: 114 | hash_value ^= (~((hash_value << 11) + ord(key[i]) ^ (hash_value >> 5))) 115 | return hash_value 116 | -------------------------------------------------------------------------------- /orange/BloomFilterRedis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kongtianyi/BloomFilterRedis/86a9098b5ee9db1215216c11607f039db2aa14fb/orange/BloomFilterRedis/__init__.py -------------------------------------------------------------------------------- /orange/count.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | 4 | ''' 5 | 用于检查误报的脚本 6 | ''' 7 | 8 | def get_file_lines(path): 9 | ''' 10 | 文件内url数量 11 | ''' 12 | lines = 0 13 | with open(path, 'r') as f: 14 | for line in f: 15 | lines += 1 16 | return lines 17 | 18 | def get_error_count(all_path, filted_path): 19 | ''' 20 | 计算误报量 21 | ''' 22 | allurl = set() 23 | count = 0 24 | with open(all_path, "r") as f: 25 | for line in f: 26 | allurl.add(line) 27 | with open(filted_path, "r") as f: 28 | for line in f: 29 | if line not in allurl: 30 | count += 1 31 | return count 32 | 33 | if __name__ == "__main__": 34 | allurl = get_file_lines("allurl.txt") 35 | filted = get_file_lines("filted.txt") 36 | print u"url总量:", allurl 37 | print u"过滤总量:", filted 38 | print u"误判总量:", get_error_count("allurl.txt", "filted.txt") -------------------------------------------------------------------------------- /orange/orange/BloomFilterRedis/BloomFilterRedis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import redis 4 | import GeneralHashFunctions 5 | 6 | 7 | class BloomFilterRedis: 8 | 9 | hash_list = ["rs_hash", "js_hash", "pjw_hash", "elf_hash", "bkdr_hash", 10 | "sdbm_hash", "djb_hash", "dek_hash"] 11 | 12 | def __init__(self, key, host='172.0.0.1', port=6379, hash_list=hash_list): 13 | # redis-bitmap的key 14 | self.key = key 15 | # redis连接信息 16 | self.pool = redis.ConnectionPool(host=host, port=port) 17 | self.handle = redis.StrictRedis(connection_pool=self.pool, charset='utf-8') 18 | # 哈希函数列表 19 | self.hash_list = hash_list 20 | 21 | @classmethod 22 | def random_generator(cls, hash_value): 23 | ''' 24 | 将hash函数得出的函数值映射到[0, 2^32-1]区间内 25 | ''' 26 | return hash_value % (1 << 32) 27 | 28 | def do_filter(self, item): 29 | ''' 30 | 检查是否是新的条目,是新条目则更新bitmap并返回True,是重复条目则返回False 31 | ''' 32 | flag = False 33 | for hash_func_str in self.hash_list: 34 | # 获得到hash函数对象 35 | hash_func = getattr(GeneralHashFunctions, hash_func_str) 36 | # 计算hash值 37 | hash_value = hash_func(item) 38 | # 将hash值映射到[0, 2^32]区间 39 | real_value = BloomFilterRedis.random_generator(hash_value) 40 | # bitmap中对应位是0,则置为1,并说明此条目为新的条目 41 | if self.handle.getbit(self.key, real_value) == 0: 42 | self.handle.setbit(self.key, real_value, 1) 43 | flag = True 44 | # 当所有hash值在bitmap中对应位都是1,说明此条目重复,返回False 45 | return flag 46 | -------------------------------------------------------------------------------- /orange/orange/BloomFilterRedis/GeneralHashFunctions.py: -------------------------------------------------------------------------------- 1 | # 2 | #************************************************************************** 3 | #* * 4 | #* General Purpose Hash Function Algorithms Library * 5 | #* * 6 | #* Author: Arash Partow - 2002 * 7 | #* URL: http://www.partow.net * 8 | #* URL: http://www.partow.net/programming/hashfunctions/index.html * 9 | #* * 10 | #* Copyright notice: * 11 | #* Free use of the General Purpose Hash Function Algorithms Library is * 12 | #* permitted under the guidelines and in accordance with the MIT License. * 13 | #* http://www.opensource.org/licenses/MIT * 14 | #* * 15 | #************************************************************************** 16 | # 17 | 18 | 19 | def rs_hash(key): 20 | a = 378551 21 | b = 63689 22 | hash_value = 0 23 | for i in range(len(key)): 24 | hash_value = hash_value * a + ord(key[i]) 25 | a = a * b 26 | return hash_value 27 | 28 | 29 | def js_hash(key): 30 | hash_value = 1315423911 31 | for i in range(len(key)): 32 | hash_value ^= ((hash_value << 5) + ord(key[i]) + (hash_value >> 2)) 33 | return hash_value 34 | 35 | 36 | def pjw_hash(key): 37 | bits_in_unsigned_int = 4 * 8 38 | three_quarters = (bits_in_unsigned_int * 3) / 4 39 | one_eighth = bits_in_unsigned_int / 8 40 | high_bits = 0xFFFFFFFF << int(bits_in_unsigned_int - one_eighth) 41 | hash_value = 0 42 | test = 0 43 | 44 | for i in range(len(key)): 45 | hash_value = (hash_value << int(one_eighth)) + ord(key[i]) 46 | test = hash_value & high_bits 47 | if test != 0: 48 | hash_value = ((hash_value ^ (test >> int(three_quarters))) & (~high_bits)) 49 | return hash_value & 0x7FFFFFFF 50 | 51 | 52 | def elf_hash(key): 53 | hash_value = 0 54 | for i in range(len(key)): 55 | hash_value = (hash_value << 4) + ord(key[i]) 56 | x = hash_value & 0xF0000000 57 | if x != 0: 58 | hash_value ^= (x >> 24) 59 | hash_value &= ~x 60 | return hash_value 61 | 62 | 63 | def bkdr_hash(key): 64 | seed = 131 # 31 131 1313 13131 131313 etc.. 65 | hash_value = 0 66 | for i in range(len(key)): 67 | hash_value = (hash_value * seed) + ord(key[i]) 68 | return hash_value 69 | 70 | 71 | def sdbm_hash(key): 72 | hash_value = 0 73 | for i in range(len(key)): 74 | hash_value = ord(key[i]) + (hash_value << 6) + (hash_value << 16) - hash_value; 75 | return hash_value 76 | 77 | 78 | def djb_hash(key): 79 | hash_value = 5381 80 | for i in range(len(key)): 81 | hash_value = ((hash_value << 5) + hash_value) + ord(key[i]) 82 | return hash_value 83 | 84 | 85 | def dek_hash(key): 86 | hash_value = len(key); 87 | for i in range(len(key)): 88 | hash_value = ((hash_value << 5) ^ (hash_value >> 27)) ^ ord(key[i]) 89 | return hash_value 90 | 91 | 92 | def bp_hash(key): 93 | hash_value = 0 94 | for i in range(len(key)): 95 | hash_value = hash_value << 7 ^ ord(key[i]) 96 | return hash_value 97 | 98 | 99 | def fnv_hash(key): 100 | fnv_prime = 0x811C9DC5 101 | hash_value = 0 102 | for i in range(len(key)): 103 | hash_value *= fnv_prime 104 | hash_value ^= ord(key[i]) 105 | return hash_value 106 | 107 | 108 | def ap_hash(key): 109 | hash_value = 0xAAAAAAAA 110 | for i in range(len(key)): 111 | if (i & 1) == 0: 112 | hash_value ^= ((hash_value << 7) ^ ord(key[i]) * (hash_value >> 3)) 113 | else: 114 | hash_value ^= (~((hash_value << 11) + ord(key[i]) ^ (hash_value >> 5))) 115 | return hash_value 116 | -------------------------------------------------------------------------------- /orange/orange/BloomFilterRedis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kongtianyi/BloomFilterRedis/86a9098b5ee9db1215216c11607f039db2aa14fb/orange/orange/BloomFilterRedis/__init__.py -------------------------------------------------------------------------------- /orange/orange/BloomRedisDupeFilter.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | 3 | import logging 4 | from orange.BloomFilterRedis.BloomFilterRedis import BloomFilterRedis 5 | from scrapy.dupefilters import BaseDupeFilter 6 | 7 | 8 | class BloomRedisDupeFilter(BaseDupeFilter): 9 | 10 | hash_list = ["rs_hash", "js_hash", "pjw_hash", "elf_hash", "bkdr_hash", 11 | "sdbm_hash", "djb_hash", "dek_hash"] 12 | 13 | def __init__(self, key="bloom", host="127.0.0.1", port=7379, 14 | hash_list=hash_list, debug=False): 15 | self.bloomFilterRedis = BloomFilterRedis( 16 | key=key, host=host, port=port, hash_list=hash_list) 17 | self.logdupes = True 18 | self.debug = debug 19 | self.logger = logging.getLogger(__name__) 20 | 21 | @classmethod 22 | def from_settings(cls, settings): 23 | key = settings['BLOOM_REDIS_KEY'] 24 | if key is None: 25 | key = "bloom" 26 | host = settings['BLOOM_REDIS_HOST'] 27 | if host is None: 28 | host = "127.0.0.1" 29 | port = settings['BLOOM_REDIS_PORT'] 30 | if port is None: 31 | port = 6379 32 | hash_list = settings['BLOOM_REDIS_HASH_LIST'] 33 | if hash_list is None: 34 | hash_list = BloomRedisDupeFilter.hash_list 35 | debug = settings.getbool('DUPEFILTER_DEBUG') 36 | return cls(key, host, port, hash_list, debug) 37 | 38 | def request_seen(self, request): 39 | re = self.bloomFilterRedis.do_filter(request.url) 40 | # with open("allurl.txt", "a") as f: 41 | # f.write(request.url+'\n') 42 | if re is False: 43 | # with open("filted.txt", "a") as f: 44 | # f.write(request.url+'\n') 45 | return True 46 | 47 | def close(self, reason): 48 | self.bloomFilterRedis.pool.disconnect() 49 | 50 | def log(self, request, spider): 51 | if self.debug: 52 | msg = "Filtered duplicate request: %(request)s" 53 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 54 | elif self.logdupes: 55 | msg = ("Filtered duplicate request: %(request)s" 56 | " - no more duplicates will be shown" 57 | " (see DUPEFILTER_DEBUG to show all duplicates)") 58 | self.logger.debug(msg, {'request': request}, extra={'spider': spider}) 59 | self.logdupes = False 60 | 61 | spider.crawler.stats.inc_value('dupefilter/filtered', spider=spider) 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /orange/orange/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kongtianyi/BloomFilterRedis/86a9098b5ee9db1215216c11607f039db2aa14fb/orange/orange/__init__.py -------------------------------------------------------------------------------- /orange/orange/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item 9 | from scrapy import Field 10 | 11 | 12 | class OrangeItem(Item): 13 | url = Field() 14 | title = Field() 15 | -------------------------------------------------------------------------------- /orange/orange/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | class OrangePipeline(object): 5 | 6 | def process_item(self, item, spider): 7 | with open("items.txt", "a") as f: 8 | f.write(str(item).decode("unicode_escape").encode('utf-8') + '\n') 9 | return item 10 | -------------------------------------------------------------------------------- /orange/orange/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | BOT_NAME = 'orange' 4 | 5 | SPIDER_MODULES = ['orange.spiders'] 6 | NEWSPIDER_MODULE = 'orange.spiders' 7 | 8 | # Obey robots.txt rules 9 | ROBOTSTXT_OBEY = False 10 | 11 | # Override the default request headers: 12 | DEFAULT_REQUEST_HEADERS = { 13 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 14 | 'Accept-Language': 'en', 15 | } 16 | 17 | ITEM_PIPELINES = { 18 | 'orange.pipelines.OrangePipeline': 301, 19 | } 20 | 21 | # 配置过滤器为基于redis的布隆过滤器 22 | DUPEFILTER_CLASS = 'orange.BloomRedisDupeFilter.BloomRedisDupeFilter' 23 | # reids中bitmap的key,默认为‘bloom’ 24 | # BLOOM_REDIS_KEY = 'bloom' 25 | # redis的连接配置,默认为本机 26 | # BLOOM_REDIS_HOST = '127.0.0.1' 27 | # BLOOM_REDIS_PORT = 6379 28 | # 布隆过滤器的哈希列表,默认为8个,定义在GeneralHashFunctions中 29 | # BLOOM_HASH_LIST = ["rs_hash", "js_hash", "pjw_hash", "elf_hash", "bkdr_hash", "sdbm_hash", "djb_hash", "dek_hash"] 30 | 31 | # 设置为爬取策略广度优先 32 | DEPTH_PRIORITY = 1 33 | 34 | # 设置下载延迟 35 | DOWNLOAD_DELAY = 5 -------------------------------------------------------------------------------- /orange/orange/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /orange/orange/spiders/orange_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from orange.items import OrangeItem 4 | 5 | 6 | class OrangeSpiderSpider(scrapy.Spider): 7 | name = "orange_spider" 8 | allowed_domains = ["baidu.com"] 9 | start_urls = ['http://baike.baidu.com/item/橘子水'] 10 | # handle_httpstatus_list = [301, 302] 11 | 12 | def start_requests(self): 13 | yield scrapy.Request(url=self.start_urls[0], callback=self.parse, 14 | meta={"url": self.start_urls[0]}) 15 | 16 | def parse(self, response): 17 | item = OrangeItem() 18 | item["url"] = response.meta["url"] 19 | title = response.xpath('//title/text()').extract() 20 | if title: 21 | item["title"] = title[0] 22 | else: 23 | return 24 | hrefs = response.xpath('//div[@class="main-content"]//a/@href') 25 | yield item 26 | for href in hrefs: 27 | new_url = href.extract() 28 | # print new_url 29 | if "view" in new_url or "item" in new_url: 30 | yield scrapy.Request(url="http://baike.baidu.com" + new_url, callback=self.parse, 31 | meta={"url": "http://baike.baidu.com" + new_url}) 32 | -------------------------------------------------------------------------------- /orange/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = orange.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = orange 12 | --------------------------------------------------------------------------------