├── .gitignore ├── src ├── soundshapecode │ ├── ssc_similarity │ │ ├── __init__.py │ │ └── compute_ssc_similarity.py │ ├── four_corner │ │ ├── data │ │ │ └── data.pkl │ │ └── __init__.py │ ├── __init__.py │ ├── variant_kmp │ │ └── __init__.py │ └── ssc.py └── 图解.docx ├── README.md └── test └── polyphonic_sound_code.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ -------------------------------------------------------------------------------- /src/soundshapecode/ssc_similarity/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/图解.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qingyujean/ssc/HEAD/src/图解.docx -------------------------------------------------------------------------------- /src/soundshapecode/four_corner/data/data.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qingyujean/ssc/HEAD/src/soundshapecode/four_corner/data/data.pkl -------------------------------------------------------------------------------- /src/soundshapecode/four_corner/__init__.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import pkg_resources 3 | 4 | class FourCornerMethod(object): 5 | def __init__(self): 6 | data_file = pkg_resources.resource_filename(__name__, "data/data.pkl") 7 | with open(data_file, 'rb') as f: 8 | self.data = pickle.load(f) 9 | 10 | def query(self, input_char, default=None): 11 | return self.data.get(input_char, default) 12 | 13 | if __name__ == "__main__": 14 | """ 15 | i参考: 16 | 1. 四角号码为什么是5位数字编码:https://zhidao.baidu.com/question/1667714057688997667.html 17 | """ 18 | fcm = FourCornerMethod() 19 | #result = fcm.query('日')#量、日;门、闫、闩 20 | 21 | #print(result) 22 | print(fcm.query('量')) 23 | print(fcm.query('日')) 24 | 25 | print(fcm.query('门')) 26 | print(fcm.query('闫')) 27 | print(fcm.query('闩')) 28 | 29 | print(fcm.query('王')) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ssc, Sound Shape Code, 音形码 2 | 基于“音形码”的中文字符串相似度计算方法 3 | 4 | 1. 音形码的思路来源于我看的这篇博客:https://blog.csdn.net/chndata/article/details/41114771 ,我对其中的思路进行了实现和优化。我自己写了一点大致的编码过程的图解示例放在文档src/图解.docx中可供大家查看,其中包含了最后运行代码时的截图效果。 5 | 6 | 2. 代码的入口是目录src/soundshapescode/__init__.py,里面演示了如何使用这部分代码,测试用例是在字符串“国我爱你女生于无娃哇紫狼路爽晕约紫薇路又刘页列而紫粮路掩连哟罗”中寻找“紫琅路”的相近词(依据音形码)最后的结果是“紫狼路”和“紫粮路”。 7 | 8 | 3. 相似度的阈值我目前给的是SIMILARITY_THRESHOLD = 0.8,可在上面提到的__init__.py中看到,大家使用时可进行调整;另外SSC_ENCODE_WAY指的是编码方式,他有3种取值:'ALL','SOUND','SHAPE',分别表示计算相似度时既考虑音又考虑形(代码中权重各占0.5,可调整)、或者只考虑音、或者只考虑形。 9 | 10 | 4. four_corner/用来处理汉字的四角编码,ssc_similarity/主要处理计算相似度,包含各编码位的权重等等(可自行调整权重大小),variant_kmp/是使用模式匹配算法KMP来处理字符串的比较,方法会返回模式串与主串的比较结果在满足要求的情况下的主串的下标,ssc.py里主要是如何进行编码的一些实现。 11 | 12 | 5. src/zh_data/文件夹下有几个文件: 13 | 14 | (1)hanzi_ssc_res.txt是我收集的一个中文字库,然后运用本方法对所有汉字预先做的一个音形码的编码,大家如果对权重没有改变的话,可以直接使用这个文件,每一行是汉字以及汉字对应的音形码;如果权重想要自己调整的,就直接使用代码,而忽略掉这个文件 15 | 16 | (2)unihan_structure.txt是汉字和该汉字的结构形状(例如上下结构、左右结构等)的一个映射文件 17 | 18 | (3)utf8_strokes.txt是汉字和该汉字的笔画数的映射文件 19 | 20 | 6. 我的运行环境是:Python3.7 21 | -------------------------------------------------------------------------------- /src/soundshapecode/__init__.py: -------------------------------------------------------------------------------- 1 | from soundshapecode import ssc 2 | 3 | from soundshapecode.variant_kmp import VatiantKMP 4 | 5 | SIMILARITY_THRESHOLD = 0.8 6 | SSC_ENCODE_WAY = 'ALL'#'ALL','SOUND','SHAPE' 7 | 8 | if __name__=="__main__": 9 | """注意: 10 | 1.声母最多2位,韵母最多3位 11 | 2.我 和 国 楼和有 也和列 可认为只是声母不一样,而韵母分别看成uo和iou和ie,多出来的部分可看成韵母辅音 12 | 3.留和有 留:liu->l iou 有:you->yiou-> y i ou 13 | """ 14 | chi_word1 = '紫琅路' 15 | chi_word2 = '国我爱你女生于无娃哇紫狼路爽晕约紫薇路又刘页列而紫粮路掩连哟罗' 16 | ssc.getHanziStrokesDict() 17 | ssc.getHanziStructureDict() 18 | #ssc.generateHanziSSCFile()#生成汉子-ssc映射文件 19 | ssc.getHanziSSCDict() 20 | 21 | chi_word1_ssc = ssc.getSSC(chi_word1, SSC_ENCODE_WAY) 22 | print(chi_word1_ssc) 23 | 24 | chi_word2_ssc = ssc.getSSC(chi_word2, SSC_ENCODE_WAY) 25 | print(chi_word2_ssc) 26 | 27 | #应用串的模式匹配KMP算法,找变异词。效率比BF算法高 28 | kmp = VatiantKMP(SIMILARITY_THRESHOLD) 29 | kmp.indexKMP(chi_word2_ssc, chi_word1_ssc, SSC_ENCODE_WAY)#主串S、模式串T 30 | print(kmp.startIdxRes) 31 | 32 | variabt_word = set() 33 | for i in kmp.startIdxRes: 34 | variabt_word.add(chi_word2[i:i+len(chi_word1)]) 35 | print('变异词:', variabt_word) -------------------------------------------------------------------------------- /test/polyphonic_sound_code.py: -------------------------------------------------------------------------------- 1 | from pypinyin import pinyin, load_phrases_dict 2 | import pypinyin 3 | from pyhanlp import HanLP 4 | import os, sys 5 | 6 | src_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../src")) 7 | sys.path.append(f'{src_path}') 8 | 9 | from soundshapecode import ssc 10 | from soundshapecode.variant_kmp import VatiantKMP 11 | 12 | 13 | SIMILARITY_THRESHOLD = 0.8 14 | SSC_ENCODE_WAY = 'SOUND'#'ALL','SOUND','SHAPE' 15 | 16 | load_phrases_dict({'沌口': [['zhuàn'], ['kǒu']]}) 17 | 18 | 19 | if __name__=="__main__": 20 | analyzer = HanLP.newSegment('perceptron') 21 | 22 | chi_word1 = '沌口' 23 | chi_word2 = '我住在钻口' 24 | ssc.getHanziStrokesDict() 25 | ssc.getHanziStructureDict() 26 | 27 | chi_word1_ssc = ssc.getSSC_sentence(chi_word1, SSC_ENCODE_WAY, analyzer) 28 | print(chi_word1_ssc) 29 | 30 | chi_word2_ssc = ssc.getSSC_sentence(chi_word2, SSC_ENCODE_WAY, analyzer) 31 | print(chi_word2_ssc) 32 | 33 | #应用串的模式匹配KMP算法,找变异词。效率比BF算法高 34 | kmp = VatiantKMP(SIMILARITY_THRESHOLD) 35 | kmp.indexKMP(chi_word2_ssc, chi_word1_ssc, SSC_ENCODE_WAY)#主串S、模式串T 36 | print(kmp.startIdxRes) 37 | 38 | if kmp.startIdxRes: 39 | variabt_word = set() 40 | for i in kmp.startIdxRes: 41 | variabt_word.add(chi_word2[i:i+len(chi_word1)]) 42 | print('变异词:', variabt_word) 43 | else: 44 | print('变异词没有找到') 45 | -------------------------------------------------------------------------------- /src/soundshapecode/ssc_similarity/compute_ssc_similarity.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | Created on 2019-4-8 4 | 5 | @author: Yoga 6 | ''' 7 | strokesDictReverse = {'1':1, '2':2, '3':3, '4':4, '5':5, '6':6, '7':7, '8':8, '9':9, 'A':10, 8 | 'B':11, 'C':12, 'D':13, 'E':14, 'F':15, 'G':16, 'H':17, 'I':18, 'J':19, 'K':20, 9 | 'L':21, 'M':22, 'N':23, 'O':24, 'P':25, 'Q':26, 'R':27, 'S':28, 'T':29, 'U':30, 10 | 'V':31, 'W':32, 'X':33, 'Y':34, 'Z':35, '0':0} 11 | 12 | soundWeight=0.5 13 | shapeWeight=0.5 14 | def computeSoundCodeSimilarity(soundCode1, soundCode2):#soundCode=['2', '8', '5', '2'] 15 | featureSize=len(soundCode1) 16 | wights=[0.4,0.4,0.1,0.1] 17 | multiplier=[] 18 | for i in range(featureSize): 19 | if soundCode1[i]==soundCode2[i]: 20 | multiplier.append(1) 21 | else: 22 | multiplier.append(0) 23 | soundSimilarity=0 24 | for i in range(featureSize): 25 | soundSimilarity += wights[i]*multiplier[i] 26 | return soundSimilarity 27 | 28 | def computeShapeCodeSimilarity(shapeCode1, shapeCode2):#shapeCode=['5', '6', '0', '1', '0', '3', '8'] 29 | featureSize=len(shapeCode1) 30 | wights=[0.25,0.1,0.1,0.1,0.1,0.1,0.25] 31 | multiplier=[] 32 | for i in range(featureSize-1): 33 | if shapeCode1[i]==shapeCode2[i]: 34 | multiplier.append(1) 35 | else: 36 | multiplier.append(0) 37 | multiplier.append(1- abs(strokesDictReverse[shapeCode1[-1]]-strokesDictReverse[shapeCode2[-1]])*1.0 / max(strokesDictReverse[shapeCode1[-1]],strokesDictReverse[shapeCode2[-1]]) ) 38 | shapeSimilarity=0 39 | for i in range(featureSize): 40 | shapeSimilarity += wights[i]*multiplier[i] 41 | return shapeSimilarity 42 | 43 | def computeSSCSimilaruty(ssc1, ssc2, ssc_encode_way): 44 | #return 0.5*computeSoundCodeSimilarity(ssc1[:4], ssc2[:4])+0.5*computeShapeCodeSimilarity(ssc1[4:], ssc2[4:]) 45 | if ssc_encode_way=="SOUND": 46 | return computeSoundCodeSimilarity(ssc1, ssc2) 47 | elif ssc_encode_way=="SHAPE": 48 | return computeShapeCodeSimilarity(ssc1, ssc2) 49 | else: 50 | soundSimi=computeSoundCodeSimilarity(ssc1[:4], ssc2[:4]) 51 | shapeSimi=computeShapeCodeSimilarity(ssc1[4:], ssc2[4:]) 52 | return soundWeight*soundSimi+shapeWeight*shapeSimi 53 | -------------------------------------------------------------------------------- /src/soundshapecode/variant_kmp/__init__.py: -------------------------------------------------------------------------------- 1 | from soundshapecode.ssc_similarity.compute_ssc_similarity import computeSSCSimilaruty 2 | class VatiantKMP(object): 3 | #求模式串T的next函数(修正方法)值并存入next数组 4 | #nextVal = [-1] 5 | #startIdxRes = []#写在这里,多次使用kmp时startIdxRes不会被清空而是存放了上一次的数据,影响结果 6 | def __init__(self, threshold): 7 | self.threshold = threshold 8 | self.nextVal = [-1] 9 | self.startIdxRes = [] 10 | 11 | def reset(self): 12 | self.nextVal = [-1] 13 | self.startIdxRes = [] 14 | 15 | def indexKMP(self, haystack, needle, ssc_encode_way): 16 | """ 17 | :type haystack: str 18 | :type needle: str 19 | :rtype: int 20 | """ 21 | """ 22 | try: 23 | return haystack.index(needle) 24 | except: 25 | return -1 26 | """ 27 | #子串定位,即模式匹配,可采用BF算法 也可采用KMP算法,我采用KMP算法 28 | # 0<=pos<= len(strS) - len(strT)) + 1 29 | self.getNextVal(needle, ssc_encode_way) 30 | i = 0 31 | while i< len(haystack): 32 | j = 0 33 | while i< len(haystack) and j < len(needle): 34 | #if j == -1 or haystack[i] == needle[j]: 35 | if j == -1 or computeSSCSimilaruty(haystack[i], needle[j], ssc_encode_way)>self.threshold: 36 | i += 1 37 | j += 1 38 | else: 39 | j = self.nextVal[j] 40 | if j == len(needle): 41 | self.startIdxRes.append(i - len(needle)) 42 | 43 | 44 | 45 | def getNextVal(self, strT, ssc_encode_way): 46 | i = 0 47 | j = -1 48 | while i < len(strT) - 1: 49 | #if j == -1 or strT[i] == strT[j]: 50 | if j == -1 or computeSSCSimilaruty(strT[i], strT[j], ssc_encode_way)>self.threshold: 51 | i += 1 52 | j += 1 53 | #if i < len(strT) and strT[i] == strT[j]: 54 | if i < len(strT) and computeSSCSimilaruty(strT[i], strT[j], ssc_encode_way)>self.threshold: 55 | self.nextVal.append(self.nextVal[j]) 56 | else: 57 | self.nextVal.append(j) 58 | else: 59 | j = self.nextVal[j] 60 | 61 | if __name__=="__main__": 62 | """ 63 | strS = "mississippissipssissips" 64 | strT = "issip" 65 | tmp = VatiantKMP() 66 | tmp.indexKMP(strS, strT) 67 | print(tmp.startIdxRes) 68 | #print strStr(strS, strT) 69 | """ 70 | chi_word1 = '紫琅路' 71 | chi_word2 = '国我爱你女生于无娃哇紫狼路爽晕约紫薇路又刘页列而紫粮路掩连哟罗' 72 | s = ['28525601038', '2J530235507', '7004220407A', '47031272927', '67030404003', 'GG010251005', '6I020104003', '5J022104124', '1J521444149', '1J501640149', '4E03222903C', 'F702143232A', '5704167164D', 'FG53C40804B', 'JI01260504A', 'DI011271206', '4E03222903C', '7J51244248G', '5704167164D', 'BI040774002', 'B7021024006', 'CI040108026', 'C7041122006', 'E0022102276', '4E03222903C', 'F742193932D', '5704167164D', 'FI43154016B', 'F742B343057', '2I411670209', '27522602078'] 73 | t = ['4E03222903C', 'F702113132B', '5704167164D'] 74 | tmp = VatiantKMP(0.8) 75 | tmp.indexKMP(s, t, "ALL") 76 | print(tmp.startIdxRes) -------------------------------------------------------------------------------- /src/soundshapecode/ssc.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | ''' 3 | Created on 2019-4-7 4 | 5 | @author: Yoga 6 | ''' 7 | from pypinyin import pinyin, lazy_pinyin, Style 8 | import pypinyin 9 | import pkg_resources 10 | 11 | from soundshapecode.four_corner import FourCornerMethod 12 | fcm = FourCornerMethod() 13 | 14 | from soundshapecode.variant_kmp import VatiantKMP 15 | 16 | SIMILARITY_THRESHOLD = 0.8 17 | SSC_ENCODE_WAY = 'ALL'#'ALL','SOUND','SHAPE' 18 | 19 | yunmuDict = {'a':'1', 'o':'2', 'e':'3', 'i':'4', 20 | 'u':'5', 'v':'6', 'ai':'7', 'ei':'7', 21 | 'ui':'8', 'ao':'9', 'ou':'A', 'iou':'B',#有:you->yiou->iou->iu 22 | 'ie':'C', 've':'D', 'er':'E', 'an':'F', 23 | 'en':'G', 'in':'H', 'un':'I', 'vn':'J',#晕:yun->yvn->vn->ven 24 | 'ang':'F', 'eng':'G', 'ing':'H', 'ong':'K'} 25 | 26 | shengmuDict = {'b':'1', 'p':'2', 'm':'3', 'f':'4', 27 | 'd':'5', 't':'6', 'n':'7', 'l':'7', 28 | 'g':'8', 'k':'9', 'h':'A', 'j':'B', 29 | 'q':'C', 'x':'D', 'zh':'E', 'ch':'F', 30 | 'sh':'G', 'r':'H', 'z':'E', 'c':'F', 31 | 's':'G', 'y':'I', 'w':'J', '0':'0'} 32 | 33 | shapeDict = {'⿰':'1','⿱':'2','⿲':'3','⿳':'4','⿴':'5',#左右结构、上下、左中右、上中下、全包围 34 | '⿵':'6','⿶':'7','⿷':'8','⿸':'9','⿹':'A',#上三包、下三包、左三包、左上包、右上包 35 | '⿺':'B','⿻':'C', '0':'0'}#左下包、镶嵌、独体字:0 36 | 37 | strokesDict = {1:'1', 2:'2', 3:'3', 4:'4', 5:'5', 6:'6', 7:'7', 8:'8', 9:'9', 10:'A', 38 | 11:'B', 12:'C', 13:'D', 14:'E', 15:'F', 16:'G', 17:'H', 18:'I', 19:'J', 20:'K', 39 | 21:'L', 22:'M', 23:'N', 24:'O', 25:'P', 26:'Q', 27:'R', 28:'S', 29:'T', 30:'U', 40 | 31:'V', 32:'W', 33:'X', 34:'Y', 35:'Z', 0:'0'} 41 | 42 | hanziStrokesDict = {}#汉子:笔画数 43 | hanziStructureDict = {}#汉子:形体结构 44 | 45 | def getSoundCode(one_chi_word): 46 | res = [] 47 | shengmuStr = pinyin(one_chi_word, style=pypinyin.INITIALS, heteronym=False, strict=False)[0][0] 48 | if shengmuStr not in shengmuDict: 49 | shengmuStr = '0' 50 | 51 | yunmuStrFullStrict = pinyin(one_chi_word, style=pypinyin.FINALS_TONE3, heteronym=False, strict=True)[0][0] 52 | 53 | yindiao = '0' 54 | if yunmuStrFullStrict[-1] in ['1','2','3','4']: 55 | yindiao = yunmuStrFullStrict[-1] 56 | yunmuStrFullStrict = yunmuStrFullStrict[:-1] 57 | 58 | if yunmuStrFullStrict in yunmuDict: 59 | #声母,韵母辅音补码,韵母,音调 60 | res.append(yunmuDict[yunmuStrFullStrict]) 61 | res.append(shengmuDict[shengmuStr]) 62 | res.append('0') 63 | elif len(yunmuStrFullStrict)>1: 64 | res.append(yunmuDict[yunmuStrFullStrict[1:]]) 65 | res.append(shengmuDict[shengmuStr]) 66 | res.append(yunmuDict[yunmuStrFullStrict[0]]) 67 | else: 68 | res.append('0') 69 | res.append(shengmuDict[shengmuStr]) 70 | res.append('0') 71 | 72 | res.append(yindiao) 73 | return res 74 | 75 | 76 | def getSoundCodes(words): 77 | 78 | shengmuStrs = pinyin(words, style=pypinyin.INITIALS, heteronym=False, strict=False) 79 | yunmuStrFullStricts = pinyin(words, style=pypinyin.FINALS_TONE3, heteronym=False, strict=True) 80 | soundCodes = [] 81 | for shengmuStr0, yunmuStrFullStrict0 in zip(shengmuStrs, yunmuStrFullStricts): 82 | res = [] 83 | shengmuStr = shengmuStr0[0] 84 | yunmuStrFullStrict = yunmuStrFullStrict0[0] 85 | 86 | if shengmuStr not in shengmuDict: 87 | shengmuStr = '0' 88 | 89 | yindiao = '0' 90 | if yunmuStrFullStrict[-1] in ['1','2','3','4']: 91 | yindiao = yunmuStrFullStrict[-1] 92 | yunmuStrFullStrict = yunmuStrFullStrict[:-1] 93 | 94 | if yunmuStrFullStrict in yunmuDict: 95 | #声母,韵母辅音补码,韵母,音调 96 | res.append(yunmuDict[yunmuStrFullStrict]) 97 | res.append(shengmuDict[shengmuStr]) 98 | res.append('0') 99 | elif len(yunmuStrFullStrict)>1: 100 | res.append(yunmuDict[yunmuStrFullStrict[1:]]) 101 | res.append(shengmuDict[shengmuStr]) 102 | res.append(yunmuDict[yunmuStrFullStrict[0]]) 103 | else: 104 | res.append('0') 105 | res.append(shengmuDict[shengmuStr]) 106 | res.append('0') 107 | 108 | res.append(yindiao) 109 | soundCodes.append(res) 110 | 111 | return soundCodes 112 | 113 | 114 | def getShapeCode(one_chi_word): 115 | res = [] 116 | structureShape = hanziStructureDict.get(one_chi_word, '0')#形体结构 117 | res.append(shapeDict[structureShape]) 118 | 119 | fourCornerCode = fcm.query(one_chi_word)#四角号码(5位数字) 120 | if fourCornerCode is None: 121 | res.extend(['0', '0', '0', '0', '0']) 122 | else: 123 | res.extend(fourCornerCode[:]) 124 | 125 | strokes = hanziStrokesDict.get(one_chi_word, '0')#笔画数 126 | if int(strokes) >35: 127 | res.append('Z') 128 | else: 129 | res.append(strokesDict[int(strokes)]) 130 | return res 131 | 132 | 133 | def getHanziStrokesDict(): 134 | strokes_filepath = pkg_resources.resource_filename(__name__, "../zh_data/utf8_strokes.txt") 135 | with open(strokes_filepath, 'r', encoding='UTF-8') as f:#文件特征: 136 | for line in f: 137 | line = line.split() 138 | hanziStrokesDict[line[1]]=line[2] 139 | 140 | 141 | def getHanziStructureDict(): 142 | structure_filepath = pkg_resources.resource_filename(__name__, "../zh_data/unihan_structure.txt") 143 | with open(structure_filepath, 'r', encoding='UTF-8') as f:#文件特征:U+4EFF\t仿\t⿰亻方\n 144 | for line in f: 145 | line = line.split() 146 | if line[2][0] in shapeDict: 147 | hanziStructureDict[line[1]]=line[2][0] 148 | 149 | 150 | def generateHanziSSCFile(): 151 | readFilePath = pkg_resources.resource_filename(__name__, "../zh_data/unihan_structure.txt") 152 | writeFilePath = pkg_resources.resource_filename(__name__, "../zh_data/hanzi_ssc_res.txt") 153 | writeFile = open(writeFilePath, "w", encoding='UTF-8') 154 | with open(readFilePath, 'r', encoding='UTF-8') as f:#文件特征:U+4EFF\t仿\t⿰亻方\n 155 | for line in f: 156 | line = line.split() 157 | soundCode = getSoundCode(line[1]) 158 | shapeCode = getShapeCode(line[1]) 159 | ssc = "".join(soundCode+shapeCode) 160 | if ssc != '00000000000': 161 | writeFile.write(line[0]+"\t"+line[1]+"\t"+ssc + "\n") 162 | writeFile.close() 163 | print('结束!') 164 | 165 | hanziSSCDict = {}#汉子:SSC码 166 | def getHanziSSCDict(): 167 | hanzi_ssc_filepath = pkg_resources.resource_filename(__name__, "../zh_data/hanzi_ssc_res.txt") 168 | with open(hanzi_ssc_filepath, 'r', encoding='UTF-8') as f:#文件特征:U+4EFF\t仿\t音形码\n 169 | for line in f: 170 | line = line.split() 171 | hanziSSCDict[line[1]]=line[2] 172 | 173 | 174 | 175 | def getSSC(hanzi_sentence, encode_way): 176 | hanzi_sentence_ssc_list = [] 177 | for one_chi_word in hanzi_sentence: 178 | ssc = hanziSSCDict.get(one_chi_word, None) 179 | if ssc is None: 180 | soundCode = getSoundCode(one_chi_word) 181 | shapeCode = getShapeCode(one_chi_word) 182 | ssc = "".join(soundCode+shapeCode) 183 | if encode_way=="SOUND": 184 | ssc=ssc[:4] 185 | elif encode_way=="SHAPE": 186 | ssc=ssc[4:] 187 | else: 188 | pass 189 | hanzi_sentence_ssc_list.append(ssc) 190 | return hanzi_sentence_ssc_list 191 | 192 | 193 | def getSSC_sentence(hanzi_sentence, encode_way, analyzer): 194 | 195 | hanzi_sentence_ssc_list = [] 196 | 197 | result_seg = analyzer.seg(hanzi_sentence) 198 | words = [] 199 | for term in result_seg: 200 | words.append(term.word) 201 | 202 | soundCodes = getSoundCodes(words) 203 | 204 | for one_chi_word, soundCode in zip(hanzi_sentence, soundCodes): 205 | if encode_way == "SOUND": 206 | ssc = "".join(soundCode) 207 | elif encode_way == "SHAPE": 208 | shapeCode = getShapeCode(one_chi_word) 209 | ssc = "".join(shapeCode) 210 | elif encode_way == "ALL": 211 | shapeCode = getShapeCode(one_chi_word) 212 | ssc = "".join(soundCode + shapeCode) 213 | 214 | hanzi_sentence_ssc_list.append(ssc) 215 | 216 | return hanzi_sentence_ssc_list 217 | 218 | 219 | if __name__=="__main__": 220 | """注意: 221 | 1.声母最多2位,韵母最多3位 222 | 2.我 和 国 楼和有 也和列 可认为只是声母不一样,而韵母分别看成uo和iou和ie,多出来的部分可看成韵母辅音 223 | 3.留和有 留:liu->l iou 有:you->yiou-> y i ou 224 | """ 225 | chi_word1 = '紫琅路' 226 | chi_word2 = '国我爱你女生于无娃哇紫狼路爽晕约紫薇路又刘页列而紫粮路掩连哟罗' 227 | getHanziStrokesDict() 228 | getHanziStructureDict() 229 | #generateHanziSSCFile()#生成汉子-ssc映射文件 230 | getHanziSSCDict() 231 | 232 | chi_word1_ssc = getSSC(chi_word1, SSC_ENCODE_WAY) 233 | print(chi_word1_ssc) 234 | 235 | chi_word2_ssc = getSSC(chi_word2, SSC_ENCODE_WAY) 236 | print(chi_word2_ssc) 237 | 238 | #应用串的模式匹配KMP算法,找变异词。效率比BF算法高 239 | kmp = VatiantKMP(SIMILARITY_THRESHOLD) 240 | kmp.indexKMP(chi_word2_ssc, chi_word1_ssc, SSC_ENCODE_WAY)#主串S、模式串T 241 | print(kmp.startIdxRes) 242 | 243 | variabt_word = set() 244 | for i in kmp.startIdxRes: 245 | variabt_word.add(chi_word2[i:i+len(chi_word1)]) 246 | print('变异词:', variabt_word) 247 | 248 | 249 | """ 250 | Style.TONE3音调显示在末尾 251 | Style.TONE2音调显示在韵母 252 | Style.TONE音调为手写格式(默认) 253 | pypinyin.NORMAL不显示音调 254 | pypinyin.INITIALS显示声母 255 | pypinyin.FINALS显示韵母 256 | pinyin风格参考:http://pypinyin.mozillazg.com/zh_CN/master/api.html#style 257 | strict字段参考:http://pypinyin.mozillazg.com/zh_CN/master/usage.html#strict,默认为True 258 | i处理不包含拼音的字符:http://pypinyin.mozillazg.com/zh_CN/master/usage.html#handle-no-pinyin 259 | i汉语拼音中没有四拼音节:http://www.hanyupinyin.cn/yinjie/yj274.html 260 | i每个基本音节由声母、韵母和声调三个部分组成,有的可以没有声母或声调,但一定有韵母:https://baike.baidu.com/item/%E6%B1%89%E8%AF%AD%E6%8B%BC%E9%9F%B3%E9%9F%B3%E8%8A%82/9167981 261 | i百度汉语笔画:https://hanyu.baidu.com/ 262 | i爬虫获取中文笔画数:https://www.cnblogs.com/zhongxinWang/p/8404510.html 263 | i utf8(爬虫)获取中文笔画数:https://blog.csdn.net/zz958712568/article/details/35787139 264 | i unihan获取中文笔画数:https://www.cnblogs.com/Comero/p/8997585.html 265 | https://github.com/helmz/Corpus 266 | 267 | pinyinStr3 = pinyin(chi_word, style=Style.TONE2, heteronym=False)#heteronym:设置多音字模式 268 | print(pinyinStr3) 269 | pinyinStr4 = pinyin(chi_word, style=Style.TONE3, heteronym=False)#heteronym:设置多音字模式 270 | print(pinyinStr4) 271 | pinyinStr5 = pinyin(chi_word, style=pypinyin.INITIALS, heteronym=False, strict=False)#heteronym:设置多音字模式 272 | print(pinyinStr5) 273 | pinyinStr6 = pinyin(chi_word, style=pypinyin.FINALS_TONE2, heteronym=False, strict=False)#heteronym:设置多音字模式 274 | print(pinyinStr6) 275 | pinyinStr2 = lazy_pinyin(chi_word) 276 | print(pinyinStr2) 277 | """ 278 | --------------------------------------------------------------------------------