# Repository snapshot:
#   ├── README.md
#   ├── sougou_spider.py
#   └── sougou_transfer.py
#
# README.md:
#   # SougouWordSpdyer
#   搜狗词库抓取与txt转换,目前已经抓取780W词库。
#   (Sogou lexicon crawling and txt conversion; ~7.8M words crawled so far.)
#
# ---- sougou_spider.py ----
# -*- coding: utf-8 -*-
"""Crawl Sogou Pinyin dictionary detail pages and download the .scel lexicons.

Ported from Python 2 (``urllib.urlopen``, ``print`` statements) to Python 3.
Fixes applied: ``main()`` is now guarded by ``__main__`` instead of running on
import; download errors are logged instead of silently swallowed by a bare
``except``; the ``dict/`` output directory is created on demand.
"""
import os
import urllib.request
from urllib.parse import quote

from lxml import etree


class ProductSyper:
    """Scraper for https://pinyin.sogou.com dictionary detail pages.

    NOTE(review): the class name keeps the original (misspelled) identifier
    so any existing callers continue to work.
    """

    def get_html(self, url):
        """Fetch *url* and return the raw response body as bytes."""
        with urllib.request.urlopen(url) as resp:
            return resp.read()

    def html_parser(self, html):
        """Parse an HTML document (bytes or str) into an lxml element tree."""
        return etree.HTML(html)

    def get_links(self, selector, page_index):
        """Extract dictionary metadata from a parsed detail page.

        Returns a dict with ``title``, ``download_link`` and ``dict_name``,
        or ``{}`` when the page title is missing or not of the expected
        ``<name>_<site>`` form (e.g. a removed dictionary page).
        """
        titles = selector.xpath('//title/text()')
        if not titles:
            return {}
        page_title = titles[0]
        parts = page_title.split('_')
        if len(parts) != 2:
            return {}
        dict_name = parts[0]
        download_link = (
            'https://pinyin.sogou.com/d/dict/download_cell.php?id='
            + str(page_index) + '&name=' + quote(dict_name) + '&f=detail'
        )
        return {
            'title': page_title,
            'download_link': download_link,
            'dict_name': dict_name,
        }

    def download_dict(self, data):
        """Download the .scel file described by *data* into ./dict/."""
        # The original assumed ./dict already existed; create it on demand.
        os.makedirs('dict', exist_ok=True)
        urllib.request.urlretrieve(
            data['download_link'], 'dict/%s.scel' % data['dict_name'])

    def spider(self, url):
        """Process one detail-page URL; return the page metadata or ``{}``."""
        page_index = url.split('/')[-1]
        html = self.get_html(url)
        data = self.get_links(self.html_parser(html), page_index)
        if not data:
            return {}
        try:
            self.download_dict(data)
        except OSError as exc:
            # Network/file errors: log and keep crawling (the original
            # swallowed these with a bare ``except: pass``).
            print('download failed for %s: %s' % (url, exc))
        return data


def main():
    """Crawl detail pages 1..40000 and download every valid lexicon."""
    product_spider = ProductSyper()
    for page_index in range(1, 40001):
        url = 'https://pinyin.sogou.com/dict/detail/index/%s' % page_index
        data = product_spider.spider(url)
        print(url)
        if data:
            print(page_index, data['dict_name'])


if __name__ == '__main__':  # original ran main() unconditionally on import
    main()
# ---- sougou_transfer.py ----
# -*- coding: utf-8 -*-
"""Convert Sogou Pinyin .scel lexicons to plain-text word lists.

A .scel file stores all text as UTF-16LE (two bytes per character).
Layout (offsets found empirically):

1. Global pinyin table at 0x1540: magic b"\\x9d\\x01\\x00\\x00", then a list of
   (index:u16, len:u16, pinyin bytes) records — index identifies the pinyin,
   len is its byte length.
2. Word groups from 0x2628 to EOF: (same:u16, py_table_len:u16, py_table)
   followed by `same` repetitions of {word_len:u16, word, ext_len:u16, ext}.
   py_table is a list of u16 pinyin indices; ext is normally 10 bytes and
   its first u16 looks like a word frequency (unconfirmed), the rest zeros.

Ported from Python 2 (``unichr``/``xrange``/str-indexed bytes) to Python 3.
Fixes applied: pymongo is now optional instead of a hard import-time
dependency; the dead, copy-pasted ``getWord`` helper (it wrongly decoded a
word through the pinyin table and was never called) was removed; a bad magic
header now raises ``ValueError`` instead of ``sys.exit(0)`` so the batch
loop can skip the file; unused ``binascii``/``pdb`` imports and the
``reload(sys)`` encoding hack were dropped.
"""
import os
import struct

try:
    import pymongo
    conn = pymongo.MongoClient()
except ImportError:
    # MongoDB persistence is best-effort: conversion still works without it.
    conn = None


def tansfer(file_in, file_index, file_out):
    """Convert one .scel file; write ./dict_txt2/<type>/<file_out>.txt.

    NOTE(review): the function name keeps the original (misspelled)
    identifier so existing callers continue to work.
    """
    startPy = 0x1540       # offset of the global pinyin table
    startChinese = 0x2628  # offset of the word-group table

    GPy_Table = {}  # pinyin index (u16) -> pinyin string
    GTable = []     # parsed (frequency, pinyin, word) triples

    def byte2str(data):
        """Decode UTF-16LE bytes; map '\\r' to '\\n' and drop spaces."""
        out = []
        for i in range(0, len(data) - 1, 2):
            ch = chr(struct.unpack_from('<H', data, i)[0])
            if ch == '\r':
                out.append('\n')
            elif ch != ' ':
                out.append(ch)
        return ''.join(out)

    def getPyTable(data):
        """Populate GPy_Table from the pinyin-table region."""
        if data[0:4] != b"\x9d\x01\x00\x00":
            return None
        data = data[4:]
        pos = 0
        length = len(data)
        while pos < length:
            index = struct.unpack_from('<H', data, pos)[0]
            ln = struct.unpack_from('<H', data, pos + 2)[0]
            GPy_Table[index] = byte2str(data[pos + 4:pos + 4 + ln])
            pos += 4 + ln

    def getWordPy(data):
        """Join the pinyin strings referenced by a u16 index list."""
        return ''.join(
            GPy_Table[struct.unpack_from('<H', data, pos)[0]]
            for pos in range(0, len(data), 2))

    def getChinese(data):
        """Parse the word-group region, appending triples to GTable."""
        pos = 0
        length = len(data)
        while pos < length:
            same = struct.unpack_from('<H', data, pos)[0]          # homophones
            py_table_len = struct.unpack_from('<H', data, pos + 2)[0]
            pos += 4
            py = getWordPy(data[pos:pos + py_table_len])
            pos += py_table_len
            # {word_len, word, ext_len, ext} repeats `same` times.
            for _ in range(same):
                c_len = struct.unpack_from('<H', data, pos)[0]
                word = byte2str(data[pos + 2:pos + 2 + c_len])
                pos += 2 + c_len
                ext_len = struct.unpack_from('<H', data, pos)[0]
                # First u16 of ext — presumably the word frequency.
                count = struct.unpack_from('<H', data, pos + 2)[0]
                GTable.append((count, py, word))
                pos += 2 + ext_len

    def deal(file_name, idx):
        """Read one .scel file, store its metadata, parse its tables.

        Raises ValueError when the file lacks the .scel magic header.
        Returns the dictionary's type/category string.
        """
        print('-' * 60)
        with open(file_name, 'rb') as f:
            data = f.read()

        if data[0:12] != b"\x40\x15\x00\x00\x44\x43\x53\x01\x01\x00\x00\x00":
            print("确认你选择的是搜狗(.scel)词库?")
            raise ValueError('not a Sogou .scel file: %s' % file_name)

        info = {
            'dict_name': byte2str(data[0x130:0x338]).replace('\x00', ''),
            'dict_type': byte2str(data[0x338:0x540]).replace('\x00', ''),
            'dict_desc': byte2str(data[0x540:0xd40]).replace('\x00', ''),
            'dict_eg': byte2str(data[0xd40:startPy]).replace('\x00', ''),
            'dict_index': idx,
        }
        if conn is not None:
            # insert_one replaces the long-deprecated Collection.insert.
            conn['sogou_word']['data'].insert_one(info)

        print(idx, info['dict_name'], info['dict_type'])
        getPyTable(data[startPy:startChinese])
        getChinese(data[startChinese:])
        return info['dict_type']

    dict_type = deal(file_in, file_index)

    # Save the result, one word per line, grouped by dictionary type.
    os.makedirs('./dict_txt2/%s' % dict_type, exist_ok=True)
    if len(GTable) > 99:  # skip tiny/placeholder lexicons
        print(len(GTable))
        path = './dict_txt2/%s/%s.txt' % (dict_type, file_out)
        with open(path, 'w', encoding='utf-8') as f:
            f.writelines(word + '\n' for (_, _, word) in GTable)


if __name__ == '__main__':
    file_index = 1
    for root, dirs, files in os.walk('./dict'):
        for fname in files:
            try:
                tansfer(os.path.join(root, fname),
                        file_index, fname.split('.')[0])
                file_index += 1
            except Exception as exc:
                # Best-effort batch: report the bad file and keep going
                # (the original hid every failure with a bare except).
                print('failed on %s: %s' % (fname, exc))