#!/usr/bin/env python3
# coding: utf-8
# File: demo.py
# Author: lhy
# Date: 18-4-24
"""QueryCorrection demo: pinyin- and edit-distance-based query correction.

Repository layout::

    ├── img
    │   └── query_error.png
    ├── README.md
    └── demo.py

README (QueryCorrection)
------------------------
A self-implemented spell corrector based on pinyin similarity and edit
distance: query correction driven by a user vocabulary, pinyin similarity
and edit distance.

About query correction
    Illustration:
    https://github.com/liuhuanyong/QueryCorrection/blob/master/img/query_error.png
    (raw: https://raw.githubusercontent.com/liuhuanyong/QueryCorrection/HEAD/img/query_error.png)

Project overview
    For query correction in search, the process has three main stages:

    1. Deciding whether to correct. For common errors such as frequent
       misspellings, a pre-mined dictionary of erroneous queries is used,
       and a query found in that dictionary is corrected. If the user's
       query returns no results, or fewer results than a threshold,
       correction is attempted; the minimum-result threshold can be
       configured according to each domain's strategy and tolerance.
    2. Independent correction by separate strategies. There are several
       strategies, including pinyin correction, edit-distance correction,
       and others such as a second pass for fuzzy sounds and visually
       similar characters. The homophone strategy requires the erroneous
       query and the candidate correction to share the same pinyin. The
       edit-distance strategy requires the edit distance between the
       erroneous query and the candidate to be below a threshold, combined
       with other filtering conditions.
    3. Candidate selection. Because the strategies are fairly independent,
       they yield different candidates, so candidate selection differs per
       strategy. Different evaluation methods are needed across strategies
       and within each strategy to choose the best result.
"""
import ast

# Public API; the demo entry point ``test`` is intentionally not exported.
__all__ = ['WordCorrect', 'build_model']


class WordCorrect:
    """Generate correction candidates for a word.

    Attributes set by ``__init__``:
        char_path:   path of the character list (one entry per line).
        model_path:  path of the pinyin -> {word: count} model file.
        charlist:    non-empty, stripped lines read from ``char_path``.
        pinyin_dict: mapping loaded from ``model_path``.
    """

    def __init__(self, char_path='char.txt', model_path='pinyin2word.model'):
        """Load the character list and the pinyin model.

        Args:
            char_path: character list file; defaults to ``'char.txt'``.
            model_path: model file written by ``build_model``; defaults to
                ``'pinyin2word.model'``.

        Raises:
            OSError: if either file cannot be opened.
        """
        self.char_path = char_path
        self.model_path = model_path
        # ``with`` guarantees the handle is closed; UTF-8 is assumed for the
        # Chinese text instead of the platform-dependent default encoding.
        with open(self.char_path, encoding='utf-8') as char_file:
            stripped = (line.strip() for line in char_file)
            self.charlist = [char for char in stripped if char]
        self.pinyin_dict = self.load_model(self.model_path)

    def load_model(self, model_path):
        """Read the model file and return the dictionary it contains.

        The file is expected to hold a Python dict literal, as written by
        ``build_model``.

        Raises:
            OSError: if the file cannot be opened.
            ValueError / SyntaxError: if the content is not a plain literal.
        """
        with open(model_path, 'r', encoding='utf-8') as model_file:
            content = model_file.read()
        # SECURITY: the original used eval(), which would execute arbitrary
        # code from a tampered model file. literal_eval only accepts Python
        # literals, which is all that build_model ever writes.
        return ast.literal_eval(content)

    def edit1(self, word):
        """Return the set of strings one edit away from ``word``.

        The edits are: deleting one character, swapping two adjacent
        characters, replacing one character with an entry of
        ``self.charlist``, and inserting an entry of ``self.charlist`` at
        any position.
        """
        n = len(word)
        chars = self.charlist
        candidates = set()
        # deletion
        candidates.update(word[:i] + word[i + 1:] for i in range(n))
        # transposition
        candidates.update(
            word[:i] + word[i + 1] + word[i] + word[i + 2:]
            for i in range(n - 1)
        )
        # alteration
        candidates.update(
            word[:i] + c + word[i + 1:] for i in range(n) for c in chars
        )
        # insertion
        candidates.update(
            word[:i] + c + word[i:] for i in range(n + 1) for c in chars
        )
        return candidates


def build_model(dict_path='dict.txt', model_path='pinyin2word.model'):
    """Build the pinyin -> {word: count} model and write it to disk.

    Each usable line of ``dict_path`` starts with a word followed by its
    count, separated by whitespace; any further fields are ignored. Words
    are grouped by their comma-joined ``lazy_pinyin`` transcription, and
    the resulting dict is written to ``model_path`` as ``str(dict)``.

    Args:
        dict_path: input vocabulary file; defaults to ``'dict.txt'``.
        model_path: output model file; defaults to ``'pinyin2word.model'``.

    Raises:
        OSError: if a file cannot be opened.
        ValueError: if a count field is not an integer.
    """
    # Imported here (explicitly, not via ``import *``) so that WordCorrect
    # can be imported and used without pypinyin being installed.
    from pypinyin import lazy_pinyin

    data = {}
    with open(dict_path, encoding='utf-8') as dict_file:
        for count, line in enumerate(dict_file, 1):
            print(count)  # progress output: number of lines read so far
            fields = line.split()
            if len(fields) < 2:
                # Blank or malformed line: nothing to record.
                continue
            word, word_count = fields[0], fields[1]
            word_pinyin = ','.join(lazy_pinyin(word))
            # Store the count directly; the original joined entries into a
            # 'word_count;word_count' string and split it again, which
            # broke for words containing '_' or ';'.
            data.setdefault(word_pinyin, {})[word] = int(word_count)

    with open(model_path, 'w', encoding='utf-8') as model_file:
        model_file.write(str(data))


def test():
    """Demo: print edit candidates, pinyin and homophones for a sample word."""
    from pypinyin import lazy_pinyin

    corrector = WordCorrect()
    word = '我门'
    word_pinyin = ','.join(lazy_pinyin(word))
    candiwords = corrector.edit1(word)
    print(candiwords)
    print(word_pinyin)
    print(corrector.pinyin_dict.get(word_pinyin, 'na'))


# Run the demo only when executed as a script, not on import.
if __name__ == '__main__':
    test()