├── README └── filter.py /README: -------------------------------------------------------------------------------- 1 | 很短但是觉得挺有用的东东 2 | 所以单独立了个项目备份一下 3 | 4 | USAGE: 5 | 6 | >>> f = DFAFilter() 7 | >>> f.add("sexy") 8 | >>> f.filter("hello sexy baby") 9 | hello **** baby 10 | -------------------------------------------------------------------------------- /filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | from collections import defaultdict 4 | import re 5 | 6 | __all__ = ['NaiveFilter', 'BSFilter', 'DFAFilter'] 7 | __author__ = 'observer' 8 | __date__ = '2012.01.05' 9 | 10 | 11 | class NaiveFilter(): 12 | 13 | '''Filter Messages from keywords 14 | 15 | very simple filter implementation 16 | 17 | >>> f = NaiveFilter() 18 | >>> f.add("sexy") 19 | >>> f.filter("hello sexy baby") 20 | hello **** baby 21 | ''' 22 | 23 | def __init__(self): 24 | self.keywords = set([]) 25 | 26 | def parse(self, path): 27 | for keyword in open(path): 28 | self.keywords.add(keyword.strip().decode('utf-8').lower()) 29 | 30 | def filter(self, message, repl="*"): 31 | message = unicode(message).lower() 32 | for kw in self.keywords: 33 | message = message.replace(kw, repl) 34 | return message 35 | 36 | 37 | class BSFilter: 38 | 39 | '''Filter Messages from keywords 40 | 41 | Use Back Sorted Mapping to reduce replacement times 42 | 43 | >>> f = BSFilter() 44 | >>> f.add("sexy") 45 | >>> f.filter("hello sexy baby") 46 | hello **** baby 47 | ''' 48 | 49 | def __init__(self): 50 | self.keywords = [] 51 | self.kwsets = set([]) 52 | self.bsdict = defaultdict(set) 53 | self.pat_en = re.compile(r'^[0-9a-zA-Z]+$') # english phrase or not 54 | 55 | def add(self, keyword): 56 | if not isinstance(keyword, unicode): 57 | keyword = keyword.decode('utf-8') 58 | keyword = keyword.lower() 59 | if keyword not in self.kwsets: 60 | self.keywords.append(keyword) 61 | self.kwsets.add(keyword) 62 | index = len(self.keywords) - 1 63 | for word in keyword.split(): 64 | if self.pat_en.search(word): 65 | self.bsdict[word].add(index) 66 | else: 67 | for char in word: 68 | self.bsdict[char].add(index) 69 | 70 | def parse(self, path): 71 | with open(path, "r") as f: 72 | for keyword in f: 73 | self.add(keyword.strip()) 74 | 75 | def filter(self, message, repl="*"): 76 | if not isinstance(message, unicode): 77 | message = message.decode('utf-8') 78 | message = message.lower() 79 | for word in message.split(): 80 | if self.pat_en.search(word): 81 | for index in self.bsdict[word]: 82 | message = message.replace(self.keywords[index], repl) 83 | else: 84 | for char in word: 85 | for index in self.bsdict[char]: 86 | message = message.replace(self.keywords[index], repl) 87 | return message 88 | 89 | 90 | class DFAFilter(): 91 | 92 | '''Filter Messages from keywords 93 | 94 | Use DFA to keep algorithm perform constantly 95 | 96 | >>> f = DFAFilter() 97 | >>> f.add("sexy") 98 | >>> f.filter("hello sexy baby") 99 | hello **** baby 100 | ''' 101 | 102 | def __init__(self): 103 | self.keyword_chains = {} 104 | self.delimit = '\x00' 105 | 106 | def add(self, keyword): 107 | if not isinstance(keyword, unicode): 108 | keyword = keyword.decode('utf-8') 109 | keyword = keyword.lower() 110 | chars = keyword.strip() 111 | if not chars: 112 | return 113 | level = self.keyword_chains 114 | for i in range(len(chars)): 115 | if chars[i] in level: 116 | level = level[chars[i]] 117 | else: 118 | if not isinstance(level, dict): 119 | break 120 | for j in range(i, len(chars)): 121 | level[chars[j]] = {} 122 | last_level, last_char = level, chars[j] 123 | level = level[chars[j]] 124 | last_level[last_char] = {self.delimit: 0} 125 | break 126 | if i == len(chars) - 1: 127 | level[self.delimit] = 0 128 | 129 | def parse(self, path): 130 | with open(path) as f: 131 | for keyword in f: 132 | self.add(keyword.strip()) 133 | 134 | def filter(self, message, repl="*"): 135 | if not isinstance(message, unicode): 136 | message = message.decode('utf-8') 137 | message = message.lower() 138 | ret = [] 139 | start = 0 140 | while start < len(message): 141 | level = self.keyword_chains 142 | step_ins = 0 143 | for char in message[start:]: 144 | if char in level: 145 | step_ins += 1 146 | if self.delimit not in level[char]: 147 | level = level[char] 148 | else: 149 | ret.append(repl * step_ins) 150 | start += step_ins - 1 151 | break 152 | else: 153 | ret.append(message[start]) 154 | break 155 | else: 156 | ret.append(message[start]) 157 | start += 1 158 | 159 | return ''.join(ret) 160 | 161 | 162 | def test_first_character(): 163 | gfw = DFAFilter() 164 | gfw.add("1989年") 165 | assert gfw.filter("1989", "*") == "1989" 166 | 167 | 168 | if __name__ == "__main__": 169 | # gfw = NaiveFilter() 170 | # gfw = BSFilter() 171 | gfw = DFAFilter() 172 | gfw.parse("keywords") 173 | import time 174 | t = time.time() 175 | print gfw.filter("法轮功 我操操操", "*") 176 | print gfw.filter("针孔摄像机 我操操操", "*") 177 | print gfw.filter("售假人民币 我操操操", "*") 178 | print gfw.filter("传世私服 我操操操", "*") 179 | print time.time() - t 180 | 181 | test_first_character() 182 | --------------------------------------------------------------------------------