├── .gitignore
├── .gitmodules
├── README.md
├── config
│   ├── grabrr.py
│   ├── mysql.ini
│   └── spider.ini
├── downloader.py
├── get_info.py
├── net_graph.py
├── parse.py
├── repo_file.py
├── repo_mysql.py
├── requirement.txt
├── settings.py
├── spider.py
├── test_net_graph.png
├── test_parse.py
└── topic
    ├── README.md
    ├── demo.py
    ├── jieba-master.zip
    ├── jieba
    │   ├── README.md
    │   ├── __init__.py
    │   ├── analyse
    │   │   ├── __init__.py
    │   │   └── idf.txt
    │   ├── dict.txt
    │   ├── finalseg
    │   │   ├── __init__.py
    │   │   ├── prob_emit.py
    │   │   ├── prob_start.py
    │   │   └── prob_trans.py
    │   └── posseg
    │       ├── __init__.py
    │       ├── char_state_tab.py
    │       ├── prob_emit.py
    │       ├── prob_start.py
    │       ├── prob_trans.py
    │       └── viterbi.py
    └── nstatus_nkeyword.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.py[cod]
2 | *.cookie
3 | 
4 | # C extensions
5 | *.so
6 | 
7 | # Packages
8 | *.egg
9 | *.egg-info
10 | dist
11 | build
12 | eggs
13 | parts
14 | bin
15 | var
16 | sdist
17 | develop-eggs
18 | .installed.cfg
19 | lib
20 | lib64
21 | 
22 | # Installer logs
23 | pip-log.txt
24 | 
25 | # Unit test / coverage reports
26 | .coverage
27 | .tox
28 | nosetests.xml
29 | 
30 | # Translations
31 | *.mo
32 | 
33 | # Mr Developer
34 | .mr.developer.cfg
35 | .project
36 | .pydevproject
37 | 
38 | # Mac Files
39 | .DS_Store
40 | 
41 | # SVN
42 | .svn
43 | 
44 | # pyCharm
45 | .idea
46 | 
47 | # Database
48 | *.db
49 | 
50 | # log
51 | *.log
52 | 
53 | # vim
54 | .ropeproject/*
55 | *.swp
56 | 
57 | *.p
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackonYang/renren/a692152c6a1eecccc1b097550a3de5916fc95e31/.gitmodules
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Renren scraping and data mining
2 | ========================
3 | 
4 | Renren has tightened its security measures, so content that was never accessible without permission can no longer be scraped.
5 | 
6 | Anything you can reach through a browser, however, can still be scraped.
7 | 
8 | Requirements
9 | --------
10 | 
11 | * ubuntu/win7/xp all work.
12 | * python3.2 --> python2.7
13 | * igraph/pycairo: required for plotting.
14 |   On Ubuntu, `apt-get install python3-igraph` is enough.
15 |   On win32, download and install [igraph](https://pypi.python.org/pypi/python-igraph/0.6.5) [pycairo](http://www.lfd.uci.edu/~gohlke/pythonlibs/#pycairo)
16 | * mysql: required only when MySQL is used as the storage medium.
17 | 
18 | Quick start
19 | --------
20 | 
21 | #### Renren account/password setup
22 | 
23 | Fill in `config/spider.ini` following the hints in the file.
24 | 
25 | #### Scrape Renren data
26 | 
27 | 

 28 | # fetch the friend lists of my friends, used to draw the personal friend-relationship graph.
 29 | $ python3 get_info.py getNet2
 30 | # a normal run prints output like this:
 31 | spider login success. rid=498934189
 32 | 15:14:09 get net1 of 498934189
 33 | 15:14:09 get net2 of 498934189,toSearch/total:40/40
 34 | 
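`get_info.py` takes a spider method name plus an optional renrenId (`python get_info method [renrenId]`), so every public `get*` method can be driven the same way; for example, with the sample id from the log above:

```
$ python3 get_info.py getStatus_friend 498934189   # statuses of that user's friends
$ python3 get_info.py getNet2 498934189            # friend lists of that user's friends
```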
 35 | 
 36 | #### Plot the friend network graph
 37 | 
 38 | 

 39 | $ python3 net_graph.py
 40 | 
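`net_graph.py` loads the pickled friendList repo, builds an igraph Graph, and plots it. The essential python-igraph calls, reduced to a self-contained sketch (toy vertices and edges stand in for real friend data):

```python
import igraph

g = igraph.Graph(3)                # 3 placeholder vertices instead of real friends
g.add_edges([(0, 1), (1, 2)])      # placeholder friendship edges
layout = g.layout('fr')            # force-directed layout, as in net_graph.py
igraph.plot(g, 'demo_net.png', layout=layout, vertex_size=5)
```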
 41 | 
 42 | The result looks like this:
 43 | 
 44 | Note: because of a character-encoding issue in the igraph plugin, friend names are not rendered properly here.
 45 | 
 46 | ![relationship network graph][netgraph]
 47 | 
 48 | [netgraph]:test_net_graph.png
 49 | 
 50 | Features
 51 | --------
 52 | 
 53 | #### Renren spider
 54 | 
 55 | 1. Logs in to Renren automatically and scrapes friendList, status, profile, and so on.
 56 | 2. Runs for long stretches (40h+) and resumes an interrupted scrape where it left off.
 57 | 3. Resends a timed-out page request up to 3 times.
 58 | 4. Keeps run/debug logs: the run log records scrape successes/failures, the debug log is for troubleshooting.
 59 | 5. Pluggable local storage backends; currently supported: mysql, file.
 60 | 
 61 | #### Friend relationship graph
 62 | 
 63 | Draws the friend graph of a given user, which helps analyze his or her social network.
 64 | 
 65 | Key information available so far: friend clusters, key friends, boyfriend/girlfriend.
 66 | 
 67 | #### Topic analysis
 68 | 
 69 | Runs NLP word segmentation and keyword extraction over statuses to find the topics a user cares about.
 70 | 
 71 | #### Contact frequency
 72 | 
 73 | Analyzes the @/forward/reply marks in statuses to find how contact frequency is distributed.
 74 | 
 75 | Some results
 76 | --------
 77 | 
 78 | #### Number of statuses vs. number of keywords
 79 | 
 80 | Nearly a straight line, so probably not very meaningful.
 81 | 
 82 | ![relationship between number of status and number of keywords][demoTopic]
 83 | 
 84 | [demoTopic]:topic/nstatus_nkeyword.png
 85 | 
 86 | 
 87 | design
 88 | ======
 89 | 
 90 | Renren scraping with local storage. Data source: [www.renren.com](www.renren.com)
 91 | 
 92 | `renren_spider` depends directly on `browser` and `repo`.
 93 | Supported `repo` backends so far: mysql, file.
 94 | To add a new `repo`, implement the interface-repo contract
 95 | and call `set_repo(module_name:str)` before creating the `spider` instance.
 96 | 
 97 | * browser: fetches a page and returns the record plus run info (timecost or error info).
 98 | * repo: stores records and the download history locally; provides the read/write interface.
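A minimal sketch of that wiring, mirroring what `get_info.py` does (the account values are placeholders; real ones live in `config/spider.ini`, and note that the `spider.py` checked in below takes a browser cookie instead):

```python
import spider

user, passwd = 'you@example.com', 'secret'  # placeholders for the spider.ini account

spider.set_repo('repo_file')                # pick the storage backend: repo_file or repo_mysql
tt = spider.spider('spread', user, passwd)  # repo name prefix + Renren account
my_rid, login_info = tt.login()
if my_rid is None:
    print('spider login error. detail: {}'.format(login_info))
else:
    tt.getNet2(my_rid)                      # friend lists of my friends
```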
 99 | 
100 | INTERFACE
101 | ---------
102 | 
103 | For the latest interfaces, refer to the unit tests.
104 | The notes here are only updated on architecture changes.
105 | 
106 | #### browse
107 | * `pageStyle(renrenId) --> (record:dict,timecost:str)` download the fields of a pageStyle page.
108 | * `login(user,passwd) --> (renrenId,info)` log in to the social network.
109 | 
110 | #### repo
111 | * `save_pageStyle(record, rid, run_info) --> nItemSave` save the record and the history.
112 | pageStyle list: friendList, status
113 | * `getSearched(pageStyle) --> rids:set`
114 | * `getFriendList(rid) --> friendsId:set`
115 | 
116 | #### spider
117 | * `login() --> same as browser.login()` log in to [www.renren.com](www.renren.com)
118 | * `getStatus_friend(rid) --> None` get the statuses of rid's friends
119 | * `getNet2(rid) --> None` get the friend lists of rid's friends
120 | 
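Per the repo contract above, a new storage backend needs only a handful of methods. A minimal skeleton, modeled on `repo_file.py` (`save_pageStyle` in the interface stands for one save method per pageStyle; the bodies here are placeholders):

```python
class repo_custom:
    """Skeleton for a new storage backend; see repo_file.py for a real one."""

    def save_friendList(self, record, rid, run_info=None):
        # persist the record, append run_info to the history; return items saved
        return len(record)

    def save_status(self, record, rid, run_info=None):
        return len(record)

    def getSearched(self, pageStyle):
        # renrenIds already scraped for this pageStyle (drives resume)
        return set()

    def getFriendList(self, rid):
        # renrenIds of rid's saved friends
        return set()
```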
121 | design of modules
122 | -----------------
123 | 
124 | ### browser:
125 | 
126 | 1. `_download`
127 | Simply fetches the `html_content` for a url and hands it back to the caller; kept minimal so performance can be measured.
128 | 
129 | 2. `_iter_page`
130 | Pages come in two kinds: iterated multi-pages such as friendList and status, and single pages such as profile and homepage.
131 | Scraping focuses on the iterated pages: for performance reasons the site usually applies less authentication and weaker safety policies to them.
132 | `_iter_page` calls `_download` repeatedly to fetch `html_content` and extracts the items it contains.
133 | 
134 | 3. parse
135 | Parses the items produced by `_iter_page` into a record of fields; returns a dict().
136 | An iterated page usually holds several items, each with its own id, which becomes the dict key.
137 | A single page usually holds several fields that can be treated as tag = value pairs, which become the dict keys and values.
138 | 
139 | **Internal interface**
140 | 
141 | 1. `_download(url:str) --> html_content:str`
142 | 2. `_iter_page(pageStyle,rid) --> (items:set,'success'/error_info:str)` info is success or error info
143 | 3. `parse.pageStyle(items:set) --> record:dict()`
144 | 
145 | _parse.pageStyle parses the fields of one pageStyle of a single user per call_
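The iterate-until-empty loop behind `_iter_page`, distilled from `requestIter` in `downloader.py` (the arguments are placeholders: `request` fetches a url and returns its content or None, `urlPtn` is a url template, `itemPtn` a compiled regex):

```python
def iter_page(request, urlPtn, itemPtn, maxPages=100):
    """Fetch page 0, 1, 2, ... until a page yields no items."""
    items = set()
    for page in range(maxPages):
        content = request(urlPtn.format(page))
        if content is None:    # timeout; the real code queues the page for a resend
            continue
        found = itemPtn.findall(content)
        if not found:          # privacy wall, past the last page, or a safety page
            break
        items.update(found)
    return items
```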
146 | 
147 | ### repo-mysql:
148 | 
149 | Dependencies:
150 | 
151 | * mysql,
152 | * pymysql: [installation package link](https://github.com/petehunt/PyMySQL)
153 | 
154 | Configuration:
155 | 
156 | 1. Set the database connection info in `db_renren.ini`; usually only `host`, `user` and `passwd` need changing.
157 | 
158 | Uses mysql as the local storage medium.
159 | 
160 | One save interface per `pageStyle`; whenever a `run_info` field is passed in, the `history` table is written automatically.
161 | 
162 | 1. `save_pageStyle` stores the record and the history.
163 | 2. `getSearched` queries the history table for the set of already-scraped ids.
164 | 3. `get_friendList`
165 | 4. `self.table` internal attribute: the table spaces and table names confirmed to exist.
166 | 5. `_init_table` creates the table space for a pageStyle and initializes `self.table`.
167 | 6. `_getConn` opens the mysql connection; the connection parameters are configured in `db_renren.ini`.
168 | 
169 | **Internal interface**
170 | 
171 | 1. `save_pageStyle(record:dict, rid:str) --> number_of_items_saved:int` calls `_save_process` to save the record and the history
172 | 2. `_save_process(pageStyle:str, record:dict, rid:str, run_info:str) --> number_of_items_saved:int`
173 | 3. `_sql_pageStyle(record:dict,rid:str) --> sqls:list` called by `_save_process` to construct the sqls
174 | 4. `_sql_create_table(pageStyle:str) --> sql` called by `_init_table`; reads the table info from the config file and returns the CREATE TABLE sql.
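How `_save_process` ties items 2-3 together, as a sketch over a plain DB-API cursor (`sqls` is whatever `_sql_pageStyle` built; the history columns follow `config/mysql.ini`; this illustrates the contract, not the shipped method):

```python
def save_process(cur, pageStyle, sqls, rid, run_info=None):
    """Execute the prepared INSERTs, then log the run in the history table."""
    n_saved = 0
    for sql in sqls:                # MySQLdb/pymysql cursors return rows affected
        n_saved += cur.execute(sql)
    if run_info is not None:        # history is written only when run_info is given
        cur.execute("INSERT INTO history (rid, page_style, n_record, run_info) "
                    "VALUES ('%s', '%s', %d, '%s')" % (rid, pageStyle, n_saved, run_info))
    return n_saved
```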
175 | 
176 | ### parse
177 | 
178 | #### profile
179 | 
180 | **Internal field conventions:**
181 | 
182 | 1. gender:
183 |    - 'f': female
184 |    - 'm': male
185 |    - 'u': unknown
186 |    - None: error
187 | 2. birth:
188 |    - `birth_year`: 4 digits, 9999 if empty. None if error
189 |    - `birth_month`: 2 digits, 99 if empty. None if error
190 |    - `birth_day`: 2 digits, 99 if empty. None if error
191 | 3. hometown:
192 |    - string.
193 |    - '': empty
194 |    - None: error
195 | 4. edu: now/college/senior/junior/primary
196 |    - school_name:string: if only one school name is contained, such as edu_now in pf_mini
197 |    - list with dict elements, where a dict value is '' if empty
198 |      - name: '' if empty
199 |      - year: entrance year
200 |      - major
201 |    - []: empty
202 |    - None: error
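A record following these conventions, for illustration (the values are invented; the school entry reuses a fixture from `test_parse.py`):

```python
# hypothetical parsed profile record obeying the conventions above
profile_record = {
    'gender': 'f',
    'birth_year': '9999',   # year missing on the page
    'birth_month': '02',
    'birth_day': '13',
    'hometown': '',         # field present but empty
    'edu_college': [
        {'name': '西北大学', 'year': '2012', 'major': '其它院系'},
    ],
    'edu_senior': [],       # no senior-high info on the page
}
```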
203 | 
204 | Requesting a profile page can return either of 2 page kinds, containing the following fields.
205 | 
206 | 1. profile detail.
207 |    - basic: birthday, hometown, gender
208 |    - edu: college, senior, junior, primary, technology
209 |    - work: company, period
210 |    - contact: empty and unused. qq, msn, phone, domains, personal website
211 | 2. profile brief.
212 |    - basic: gender, birthday, hometown
213 |    - present: location/address, work, school
214 | 
215 | Field validity notes:
216 | 
217 | 1. school
218 |    Renren itself is lax here, so college and high-school entries overlap and get mixed together.
219 |    Workaround: load a nationwide college list up front to tell middle schools and colleges apart.
220 | 2. birthday / star sign
221 |    profile parsing + status/share analysis
222 | 3. age
223 |    profile parsing + an interval estimated from the average age of high-school friends
224 | 4. gender
225 |    profile parsing; usually quite reliable.
226 | 5. hometown
227 |    profile parsing + the distribution of middle-school friends
228 | 6. current residence -- not processed
229 |    low reliability; few people keep it up to date.
230 | 7. work info -- not processed
231 |    too little data.
232 | 
233 | ### spider
234 | 
235 | The feature methods generate the sequence of `rid`s to scrape, and `seq_process` scrapes them.
236 | 
237 | **Internal interface**
238 | 
239 | 1. `seq_process(toSearch:str/set,pageStyle:str)` download and save the record
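The resumable scraping loop behind `seq_process`, sketched from the `getNet2` loop in `spider.py` and the `repo_file` interface (`sp` is a spider instance; the exact method is not checked in here, so this shows the pattern, not the shipped code):

```python
import parse

def seq_process(sp, toSearch, pageStyle='friendList'):
    """Scrape every rid not already saved; skipping saved ids makes runs resumable."""
    if isinstance(toSearch, str):
        toSearch = {toSearch}
    toSearch = set(toSearch) - sp.repo.getSearched(pageStyle)
    for rid in toSearch:
        items = sp.dl.friendList(rid)      # or sp.dl.status(rid), depending on pageStyle
        if items is None:
            sp.log.error('{}, download failed'.format(rid))
            continue
        record = parse.friendList(items)   # raw items -> {renrenId: name}
        sp.repo.save_friendList(record, rid, run_info='success')
```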
--------------------------------------------------------------------------------
/config/grabrr.py:
--------------------------------------------------------------------------------
1 | # -*- Encoding: utf-8 -*-
2 | """
3 | Download Renren data and save it locally.
4 | """
5 | import codecs
6 | from httplib2 import Http
7 | import time
8 | import os
9 | import re
10 | 
11 | maxFailed = 5
12 | resend = 3  # resend times
13 | itemPtn = re.compile(r'([^<]*)', re.DOTALL)
14 | 
15 | headers_templates = {
16 |     'Connection': 'keep-alive',
17 |     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.65 Safari/534.24',
18 |     'Content-type': 'application/x-www-form-urlencoded',
19 |     'Accept': '*/*',
20 |     'Accept-Charset': 'UTF-8,*;q=0.5',
21 |     'Accept-Encoding': 'gzip,deflate,sdch',
22 |     'Accept-Language': 'zh-CN,zh;q=0.8',
23 |     'Cache-Control': 'no-cache',
24 | }
25 | 
26 | 
27 | class renren:
28 |     def __init__(self, cookies):
29 |         self.h = Http()
30 |         self.headers = headers_templates.copy()
31 |         self.headers['Cookie'] = cookies
32 |         #"anonymid=hydr1qdn-ycm6wf; _r01_=1; JSESSIONID=abcKEdr0RzVXX9ifHSzEu; XNESSESSIONID=abcuFh17apWB-7i2oLfFu; depovince=GW; jebecookies=4732ccf8-d2ea-49cc-ad1c-69c8c9cd00be|||||; ick_login=d972250b-250e-4ece-9679-2c064a6763a6; _de=BA3DF646D479464C7CEAE61F3CB0A4C15212E40F3D18115C; p=3dc4a0e170c839b1003c15e0645561b28; ap=539841008; ln_uact=xucengg65243@163.com; ln_hurl=http://hdn.xnimg.cn/photos/hdn321/20140302/2335/h_main_zS6y_227600004407113e.jpg; t=cb5ac4601a607a3f85b1264373f5a76b8; societyguester=cb5ac4601a607a3f85b1264373f5a76b8; id=539841008; xnsid=ec249437; loginfrom=null; feedType=539841008_hot; jebe_key=d608be53-46cd-46e1-956f-9d676d9e907f%7C6fad6b404f16365a47e2b1e47fd7a30d%7C1413377389697%7C1%7C1413377389844; l4pager=1"
33 |         # login in
34 | 
35 |     def renrenId(self):
36 |         try:
37 |             rsp, content = self.h.request('http://www.renren.com/', 'GET', headers=self.headers)
38 |         except:
39 |             print 'cookie error'
40 |             return None
41 |         # hard expiry check: refuse to run after 2014-11-10
42 |         cur_time = time.strptime(rsp['date'], '%a, %d %b %Y %H:%M:%S %Z')
43 |         thd_time = time.strptime('20141110', '%Y%m%d')
44 |         if cur_time > thd_time:
45 |             time.sleep(10)
46 |             return None
47 |         _proj = re.compile(r'http://www.renren.com/(\d+)')
48 |         m = _proj.search(rsp['content-location'])
49 |         if m is not None:
50 |             return m.group(1)
51 |         else:
52 |             return None
53 | 
54 |     def request(self, url, method='GET'):
55 |         """request a page and return html content"""
56 |         try:
57 |             rsp, content = self.h.request(url, method, headers=self.headers)
58 |         except:
59 |             print 'cookie error'
60 |             return None
61 |         else:
62 |             return content
63 | 
64 |     def friendList(self, rid, maxPages=200):
65 |         urlPtn = "http://friend.renren.com/GetFriendList.do?curpage={}&id=" + rid
66 |         print('\nstart to request {} of {}'.format('friendList', rid))
67 |         pageRange = range(maxPages)
68 |         itemsTotal = set()
69 |         failedSeq = list()
70 | 
71 |         # request next page until no more items detected
72 |         for page in pageRange:
73 |             content = self.request(urlPtn.format(page))
74 |             if content is not None:
75 |                 itemsPage = itemPtn.findall(content)
76 |                 if len(itemsPage) > 1:
77 |                     print('{} items in page/renrenId {}/{}'.format(len(itemsPage)-1, page, self.renrenId()))
78 |                     itemsTotal.update(itemsPage)
79 |                 else:  # privacy, all pages requested, or safety page
80 |                     print('nothing found in page/renrenId {}/{}'.format(page, self.renrenId()))
81 |                     break
82 |             else:
83 |                 print('request page/renrenId {}/{} failed. re-request later'.format(page, self.renrenId()))
84 |                 failedSeq.append(page)
85 |                 if len(failedSeq) > maxFailed:
86 |                     print('more than {} timeouts'.format(maxFailed))
87 |                     return None
88 | 
89 |         # re-request the failed pages, at most `resend` extra rounds
90 |         for _ in range(resend):
91 |             if not failedSeq:
92 |                 break
93 |             print('resend failed pages: {}'.format(failedSeq))
94 |             stillFailed = list()
95 |             for page in failedSeq:
96 |                 content = self.request(urlPtn.format(page))
97 |                 if content is None:
98 |                     stillFailed.append(page)
99 |                 else:
100 |                     itemsTotal.update(itemPtn.findall(content))
101 |             failedSeq = stillFailed
102 |         if failedSeq:  # still incomplete after all resends
103 |             return None
104 |         return itemsTotal
105 | 
106 | 
107 | def saveFriends(rid, friends, filename, file_encoding='UTF-8'):
108 |     with codecs.open(filename, 'w', encoding=file_encoding) as f:
109 |         cont = ' '.join(["@{1}({0})".format(*one_data) for one_data in friends if one_data[0] != rid])
110 |         f.write(cont.decode(file_encoding))
111 | 
112 | 
113 | if __name__ == '__main__':
114 |     cookies = raw_input('Please input cookies(document.cookie): ')
115 |     if not cookies:
116 |         print 'Error. cookie is None'
117 |     else:
118 |         rr = renren(cookies)
119 |         rid = rr.renrenId()
120 |         if rid is None:
121 |             print "Error. unavailable cookie"
122 |         else:
123 |             friends = rr.friendList(rid)
124 |             if friends is None:
125 |                 print "Error. friend list download failed"
126 |             else:
127 |                 file_encoding = 'UTF-8'
128 |                 filename = 'friendlist_%s.txt' % rid
129 |                 saveFriends(rid, friends, filename, file_encoding)
130 |                 print('\nDone! {} friends got. saved in {}'.format(max(len(friends)-1, 0), filename))
--------------------------------------------------------------------------------
/config/mysql.ini:
--------------------------------------------------------------------------------
1 | [DEFAULT]
2 | renrenId1=varchar(15) NOT NULL
3 | lastmodified=TIMESTAMP DEFAULT NOW()
4 | [connect]
5 | user=root
6 | passwd=Kunth123
7 | db=data_bang
8 | host=127.0.0.1
9 | port=3306
10 | charset=utf8
11 | [history]
12 | rid=varchar(15)
13 | page_style=varchar(15)
14 | n_record=int(8)
15 | run_info=varchar(30)
16 | [friendList]
17 | renrenId2=varchar(15) NOT NULL
18 | [name]
19 | #save in profile
20 | renrenId1=varchar(15) NOT NULL
21 | name=varchar(50)
22 | [status]
23 | statusId=varchar(20) NOT NULL
24 | timestamp=varchar(20)
25 | cur_name=varchar(50)
26 | cur_content=varchar(500)
27 | orig_owner=varchar(20)
28 | orig_name=varchar(50)
29 | orig_content=varchar(500)
30 | [profile]
31 | gender=varchar(1)
32 | birthday=varchar(10)
33 | hometown=varchar(50)
34 | edu_college=varchar(100)
35 | edu_senior=varchar(50)
36 | edu_junior=varchar(50)
37 | edu_primary=varchar(50)
38 | edu_tech=varchar(50)
39 | company=varchar(50)
40 | work_time=varchar(50)
41 | city_now=varchar(50)
42 | work_now=varchar(50)
43 | edu_now=varchar(50)
44 | [profile_map]
45 | 性别=gender
46 | 生日=birthday
47 | 家乡=hometown
48 | 大学=edu_college
49 | 高中=edu_senior
50 | 中专技校=edu_tech
51 | 初中=edu_junior
52 | 小学=edu_primary
53 | 公司=company
54 | 时间=work_time
55 | location=city_now
56 | address=city_now
57 | 所在城市=city_now
58 | work=work_now
59 | 所在公司=work_now
60 | school=edu_now
61 | 所在学校=edu_now
62 | ignore=QQ,MSN,phone,手机号,个人网站,我的域名,个性域名,生肖,星座,等级
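Each non-`connect` section of `mysql.ini` above lists the columns for one pageStyle table, with `[DEFAULT]` supplying the shared `renrenId1` and `lastmodified` columns. A sketch of how `_sql_create_table` could assemble the statement from it (the function name and layout are illustrative, assuming standard `configparser`):

```python
import configparser

def sql_create_table(pageStyle, cfg_file='config/mysql.ini'):
    """Build CREATE TABLE sql from the [pageStyle] section of mysql.ini."""
    config = configparser.ConfigParser()
    config.optionxform = str   # keep column-name case exactly as written in the ini
    config.read(cfg_file)
    cols = ', '.join('{} {}'.format(col, coltype)
                     for col, coltype in config[pageStyle].items())
    return 'CREATE TABLE IF NOT EXISTS {} ({})'.format(pageStyle, cols)

# sql_create_table('friendList') ->
#   CREATE TABLE IF NOT EXISTS friendList (renrenId1 varchar(15) NOT NULL,
#     lastmodified TIMESTAMP DEFAULT NOW(), renrenId2 varchar(15) NOT NULL)
```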
--------------------------------------------------------------------------------
/config/spider.ini:
--------------------------------------------------------------------------------
1 | [account]
2 | user=yyttrr3242342@163.com
3 | passwd=bmeB500bmeB500
4 | [repo]
5 | # mode: repo_file, repo_mysql
6 | mode=repo_file
7 | repo_name_pre=spread
--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
1 | # -*- Encoding: utf-8 -*-
2 | """
3 | Download Renren data and save it locally.
4 | """
5 | from httplib2 import Http
6 | import logging
7 | import os
8 | import re
9 | 
10 | maxFailed = 5
11 | nResend = 3
12 | headers_templates = {
13 |     'Connection': 'keep-alive',
14 |     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.24 (KHTML, like Gecko) Chrome/11.0.696.65 Safari/534.24',
15 |     'Content-type': 'application/x-www-form-urlencoded',
16 |     'Accept': '*/*',
17 |     'Accept-Charset': 'UTF-8,*;q=0.5',
18 |     'Accept-Encoding': 'gzip,deflate,sdch',
19 |     'Accept-Language': 'zh-CN,zh;q=0.8',
20 |     'Cache-Control': 'no-cache',
21 | }
22 | 
23 | class renren:
24 |     def __init__(self, cookie):
25 |         self.h = Http()
26 |         self.headers = headers_templates.copy()
27 |         self.headers['Cookie'] = cookie
28 | 
29 |         self.m_log = logging.getLogger('renren.downloader')
30 |         self.m_log.setLevel(logging.WARNING)
31 |         log_dir = 'log'
32 |         if not os.path.exists(log_dir):
33 |             os.mkdir(log_dir)
34 |         logfile = os.path.join(log_dir, 'download.log')
35 |         hdlr = logging.FileHandler(logfile)
36 |         hdlr.setFormatter(logging.Formatter('%(asctime)s|%(levelname)s|%(message)s|%(filename)s-%(lineno)s'))
37 |         self.m_log.addHandler(hdlr)
38 | 
39 |     def renrenId(self):
40 |         proj = re.compile(r'\Wid=(\d+);')
41 |         m = proj.search(self.headers['Cookie'])
42 |         if m is not None:
43 |             return m.group(1)
44 |         else:
45 |             return None
46 | 
47 |     def friendList(self, rid, maxPages=100):
48 |         urlPtn = "http://friend.renren.com/GetFriendList.do?curpage={}&id=" + rid
49 |         itemPtn = re.compile(r'([^<]*)', re.DOTALL)
50 |         self.m_log.info('request {} of {}'.format('friendList', rid))
51 |         return self.requestIter(urlPtn, itemPtn, maxPages, nResend)
52 | 
53 |     def status(self, rid, maxPages=1000):
54 |         urlPtn = "http://status.renren.com/status?curpage={}&id=" + rid + "&__view=async-html"
55 |         itemPtn = re.compile(r'
  • (.*?)
  • ', re.DOTALL) 56 | self.m_log.info('request {} of {}'.format('status', rid)) 57 | return self.requestIter(urlPtn, itemPtn, maxPages, nResend) 58 | 59 | def request(self, url, method='GET'): 60 | """request a page and return html content""" 61 | rsp, content = self.h.request(url, method, headers=self.headers) 62 | # with open('fl.html', 'w') as f: 63 | # f.write(content) 64 | return content 65 | 66 | def requestIter(self, urlPtn, itemPtn, pageRange, resend): 67 | """__iter_page(urlPtn, itemPtn, pageRange, resend) --> items:set()""" 68 | if isinstance(pageRange, int): 69 | pageRange = range(pageRange) 70 | 71 | itemsTotal = set() 72 | failedSeq = list() 73 | 74 | # request next page until no more items detected 75 | for page in pageRange: 76 | content = self.request(urlPtn.format(page)) 77 | if content is not None: 78 | itemsPage = itemPtn.findall(content) 79 | if itemsPage and len(itemsPage) > 1: 80 | itemsTotal.update(itemsPage) 81 | else: # privacy, all pages requested, or safety page 82 | self.m_log.debug('nothing contains in page/renrenId {}/{})'.format(page, self.renrenId())) 83 | break 84 | else: 85 | self.m_log.warn('request page/renrenId {}/{} failed'.format(page, self.renrenId())) 86 | failedSeq.append(page) 87 | if len(failedSeq) > maxFailed: 88 | self.m_log.error('more timeout than {}'.format(maxFailed)) 89 | return None 90 | 91 | # deal with timeout_seq 92 | if failedSeq: 93 | if resend < 0: 94 | return None 95 | self.m_log.debug('resend failed pages: {}'.format(failedSeq)) 96 | itemsMore = self.requestIter(urlPtn, itemPtn, failedSeq, resend-1) 97 | if itemsMore is None: 98 | return None # return None if not in its integrity 99 | itemsTotal.update(itemsMore) 100 | #_safety page check, if itemsAll empty. 101 | #if (len(itemsAll) ==0) and self._is_safety_page(html_content): 102 | # return None, 'account forbidden by safety policy' 103 | return itemsTotal 104 | 105 | 106 | if __name__ == '__main__': 107 | test_cookie = raw_input('Input cookie(document.cookie): ') 108 | rr = renren(test_cookie) 109 | print rr.renrenId() 110 | # print len(rr.friendList(rr.renrenId())) 111 | print len(rr.status(rr.renrenId())) 112 | -------------------------------------------------------------------------------- /get_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-*- 2 | import sys 3 | import spider 4 | 5 | cfg_filename='config/spider.ini' 6 | 7 | config=None 8 | def get_cfg_dict(section_name,has_default=True): 9 | global config 10 | if config is None: 11 | import configparser 12 | config=configparser.ConfigParser() 13 | # global cfg_filename 14 | config.read(cfg_filename) 15 | try: 16 | cfg=dict(config[section_name].items()) 17 | except KeyError: 18 | # raise error, no such section 19 | return None 20 | if not has_default: 21 | for key in config['DEFAULT'].keys(): 22 | del(cfg[key]) 23 | return cfg 24 | 25 | def init_config(): 26 | config_account=get_cfg_dict('account') 27 | config_repo=get_cfg_dict('repo') 28 | 29 | repo_name=config_repo['repo_name_pre'] # table name prefix 30 | repo_mode=config_repo['mode'] # table name prefix 31 | user=config_account['user'] # renren account 32 | passwd=config_account['passwd'] # renren passwd 33 | # adopt for myself 34 | if passwd == 'None': 35 | passwd=None 36 | print('config inited') 37 | return repo_mode,repo_name,user,passwd 38 | 39 | def run(meth,orig_id=None): 40 | repo_mode, repo_name, user, passwd = init_config() 41 | spider.set_repo(repo_mode) 42 | tt = 
spider.spider(repo_name,user,passwd) 43 | tt.log.setLevel(20) 44 | my_rid, login_info = tt.login() 45 | if my_rid is None: 46 | print('spider login error. detail:{}'.format(login_info)) 47 | if not input('continue for test?(1/0)'): 48 | return None 49 | else: 50 | my_rid='11111111' 51 | else: 52 | print('spider login success. rid={}'.format(my_rid)) 53 | if orig_id is None: 54 | orig_id = my_rid 55 | meth(tt,orig_id) 56 | 57 | def pub_meth(obj): 58 | return [meth for meth in dir(obj) if meth.startswith('get')] 59 | 60 | if __name__ == '__main__': 61 | try: 62 | meth=getattr(spider.spider, sys.argv[1]) 63 | except AttributeError: 64 | print('method {} not definded. method list: {}'.format(sys.argv[1], pub_meth(spider.spider))) 65 | except IndexError: 66 | print('input error, method is necessary. expect: python get_info method [renrenId].') 67 | else: 68 | if len(sys.argv) == 2: 69 | orig_id=None 70 | elif len(sys.argv) == 3: 71 | orig_id=sys.argv[2] 72 | else: 73 | print('input error, expect: python get_info method [renrenId].') 74 | run(meth,orig_id) 75 | -------------------------------------------------------------------------------- /net_graph.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-*- 2 | import pickle 3 | import igraph 4 | 5 | def init_graph(data, orig_id): 6 | friends=data[orig_id] 7 | print(data.keys()) 8 | #print(friends.keys()) 9 | g=igraph.Graph(len(friends)) 10 | i=0 11 | for rid, name in friends.items(): 12 | g.vs[i]['rid']=rid 13 | g.vs[i]['name']=name 14 | #print(name) 15 | i += 1 16 | #add edges 17 | for rid in g.vs['rid']: 18 | for fid in set(data.get(rid,dict()).keys())&set(friends.keys()): 19 | g.add_edges((g.vs['rid'].index(rid), g.vs['rid'].index(fid))) 20 | g.simplify() 21 | return g 22 | 23 | def showGraph(graph,filename): 24 | ly=graph.layout('fr') 25 | visual_style = {} 26 | visual_style["vertex_size"] = 5 27 | visual_style['layout']=ly 28 | #visual_style["vertex_label"] = g.vs["uid"] 29 | #visual_style["vertex_label"] = graph.vs["name"] 30 | if filename is not None: 31 | igraph.plot(graph,"{}.png".format(filename),**visual_style) 32 | else: 33 | igraph.plot(graph,**visual_style) 34 | 35 | friends=pickle.load(open('spread_friendList.p','rb')) 36 | rid='498934189' 37 | 38 | if __name__=='__main__': 39 | gg=init_graph(friends,rid) 40 | showGraph(gg,'pic') 41 | -------------------------------------------------------------------------------- /parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-*- 2 | _href_pf_prog=None 3 | def friendList(href_pfs): 4 | """friendList({'name1','name2'}) 5 | --> return {id1:name1,id2:name2} if success,return None if error""" 6 | if href_pfs is None: 7 | return None 8 | elif isinstance(href_pfs,str): 9 | href_pfs={href_pfs} 10 | global _href_pf_prog 11 | if _href_pf_prog is None: 12 | import re 13 | _href_pf_prog=re.compile(r'id=(\d+)">([^<]*?)') 14 | 15 | name=dict() 16 | for href_pf in href_pfs: 17 | m=_href_pf_prog.search(href_pf) 18 | if m is None: 19 | return None 20 | name[m.group(1)]=m.group(2) 21 | return name 22 | 23 | _statprog=None 24 | def status(stats): 25 | """return {statusId:dict_of_details,statusId2:dict2} if success, return None if error""" 26 | if stats is None: 27 | return None 28 | elif isinstance(stats,str): 29 | stats={stats} 30 | global _statprog 31 | if _statprog is None: 32 | import re 33 | _statprog=re.compile(r']+id="status-(?P\d+)">.+?

    \s*(?P.+?)

    \s*?(?:
    \s*]+>(?P.*?)
    \s*)?\s*
    .+?(?P[^<]+?)',re.DOTALL) 34 | 35 | res=dict() 36 | for stat in stats: 37 | m=_statprog.search(stat) 38 | if m is None: 39 | print('status parse error.stat={}'.format(stat)) 40 | continue 41 | else: 42 | tmpStat=dict() 43 | status_id=m.group('id') 44 | tmpStat['renrenId1'],tmpStat['cur_name'],tmpStat['cur_content']=split_owner(_drop_status_urls(m.group('content'))) 45 | tmpStat['orig_owner'],tmpStat['orig_name'],tmpStat['orig_content']=split_owner(_drop_status_urls(m.group('orig'))) 46 | tmpStat['timestamp']=m.group('timestamp').strip() 47 | res[status_id]=tmpStat 48 | return res 49 | 50 | 51 | _pf_prog=None 52 | def profile_detail(content): 53 | """keys in record: 54 | edu_college/edu_senior/edu_junior/edu_primary, 55 | birth_year/birth_month/birth_day, 56 | gender,hometown""" 57 | if content is None: 58 | return None 59 | global _pf_prog 60 | if _pf_prog is None: 61 | import re 62 | _pf_prog=re.compile(r'
    ([^::]+)[::]?\s*
    [^<]*
    (.*?)
    ',re.DOTALL) 63 | content=_drop_pf_extra(''.join(content),' ') 64 | #orig tag/value saved in orig_pf 65 | orig_pf=dict() 66 | for m in _pf_prog.finditer(content): 67 | orig_pf[_sub_space(m.group(1),r'')]=m.group(2).strip(' ') 68 | return orig_pf 69 | 70 | _pf_miniprog=None 71 | def profile_mini(content): 72 | #tl or basic mini 73 | if content is None: 74 | return None 75 | global _pf_miniprog 76 | if _pf_miniprog is None: 77 | import re 78 | _pf_miniprog=re.compile(r'(.*?)',re.DOTALL) 79 | content=_drop_pf_extra(''.join(content),' ') 80 | orig_pf=dict() 81 | for m in _pf_miniprog.finditer(content): 82 | orig_pf[m.group(1)]=m.group(2).strip(' ') 83 | if 'birthday' in orig_pf: 84 | orig_pf['gender'],orig_pf['birthday']=orig_pf.get('birthday',',').replace(',',',').split(',') 85 | return orig_pf 86 | 87 | #---@depresed----profile deep parser----------------- 88 | #birth and gender 89 | _birthprog=None 90 | def _get_birth(content): 91 | if content is None: 92 | return {'birth_year':None,'birth_month':None,'birth_day':None} 93 | global _birthprog 94 | if _birthprog is None: 95 | import re 96 | _birthprog=re.compile(r'(?:(?P\d+)[年后-])?(?P\d+)[月-](?P\d+)[日]?') 97 | m=_birthprog.search(_drop_pf_extra(content,r'')) 98 | if m is None: 99 | return {'birth_year':'9999','birth_month':'99','birth_day':'99'} 100 | return m.groupdict('9999') 101 | def _get_gender(content): 102 | if content is None: 103 | return None 104 | if content.find('男')>-1: 105 | return 'm' 106 | elif content.find('女')>-1: 107 | return 'f' 108 | else: 109 | return 'u' 110 | 111 | #edu info 112 | _edu_highprog=None 113 | def _split_high_edu(content): 114 | global _edu_highprog 115 | if _edu_highprog is None: 116 | import re 117 | _edu_highprog=re.compile(r'(?P[^-<]+)-\s*(?P\d+)\s*年\s*(?:-(?P[^-<]+))?
    ') 118 | return _split_edu(_edu_highprog,content) 119 | 120 | _edu_lowprog=None 121 | def _split_low_edu(content,level): 122 | global _edu_lowprog 123 | if _edu_lowprog is None: 124 | import re 125 | _edu_lowprog=re.compile(r'(?P[^-<]+)(?:-\s*(?P\d+)\s*年)?') 126 | return _split_edu(_edu_lowprog,content,level) 127 | 128 | def _split_edu(prog,content,level=None): 129 | if content is None: 130 | return None 131 | if level is None: 132 | school_default={} 133 | else: 134 | school_default={'level':level} 135 | schools=[] 136 | for m in prog.finditer(_drop_pf_extra(content)): 137 | school=school_default 138 | for key,value in m.groupdict('').items(): 139 | school[key]=value.strip(' ') 140 | schools.append(school) 141 | return schools 142 | 143 | #-----------------profile----------------------- 144 | #drop extra 145 | def _drop_pf_extra(content,target=r' '): 146 | return _sub_space(_drop_span((_drop_link(content))),target) 147 | 148 | _linkprog=None 149 | def _drop_link(content): 150 | if content is None: 151 | return None 152 | global _linkprog 153 | if _linkprog is None: 154 | import re 155 | _linkprog=re.compile(r']+?>([^<]*?)') 156 | return _linkprog.sub(r'\1',content) 157 | 158 | _spanprog=None 159 | def _drop_span(content): 160 | if content is None: 161 | return None 162 | global _spanprog 163 | if _spanprog is None: 164 | import re 165 | _spanprog=re.compile(r']*>([^<]*?)') 166 | return _spanprog.sub(r'\1',content) 167 | 168 | _spaceprog=None 169 | _space_likeprog=None 170 | def _sub_space(content,target=r''): 171 | if not isinstance(content,str): 172 | return None 173 | global _spaceprog 174 | global _space_likeprog 175 | if _spaceprog is None: 176 | import re 177 | _space_likeprog=re.compile(r'(?:\\n)|(?:\\t)|(?:\\u3000)|(?:\u3000)|(?: )') 178 | _spaceprog=re.compile(r'\s+') 179 | return _spaceprog.sub(target,_space_likeprog.sub(target,content)).strip(' ') 180 | 181 | #-----------------status------------------ 182 | 183 | def _drop_status_urls(content): 184 | if content is None: 185 | return None 186 | else: 187 | return _sub_space(drop_rrurl(drop_img(drop_pf(drop_pubpf(drop_at(content))))),r' ') 188 | 189 | _pfprog=None 190 | def drop_pf(content): 191 | if content is None: 192 | return None 193 | global _pfprog 194 | if _pfprog is None: 195 | import re 196 | _pfprog=re.compile(r']+?http://www.renren.com/profile.do\?id=(\d+)[^>]+>(.*?)',re.DOTALL) 197 | return _pfprog.sub(r'(\1,\2)',content) 198 | 199 | _pubpfprog=None 200 | def drop_pubpf(content): 201 | if content is None: 202 | return None 203 | global _pubpfprog 204 | if _pubpfprog is None: 205 | import re 206 | _pubpfprog=re.compile(r']+?http://page.renren.com/(\d+)[^>]+>(.*?)',re.DOTALL) 207 | return _pubpfprog.sub(r'(\1,\2)',str(content)) 208 | 209 | _atprog=None 210 | def drop_at(content): 211 | if content is None: 212 | return None,None 213 | global _atprog 214 | if _atprog is None: 215 | import re 216 | _atprog=re.compile(r"]+?http://www.renren.com/g/(\d+)[^>]*>(@.*?)",re.DOTALL) 217 | return _atprog.sub(r'\2(\1)',str(content)) 218 | 219 | _imgprog=None 220 | def drop_img(content): 221 | if content is None: 222 | return None 223 | global _imgprog 224 | if _imgprog is None: 225 | import re 226 | _imgprog=re.compile(r"]+alt=\'([^>]*?)\'[^>]*?/>",re.DOTALL) 227 | return _imgprog.sub(r'(img\1img)',content) 228 | 229 | _rrurlprog=None 230 | def drop_rrurl(content): 231 | if content is None: 232 | return None 233 | global _rrurlprog 234 | if _rrurlprog is None: 235 | import re 236 | 
_rrurlprog=re.compile(r"]+title='([^>]+)'>[^<]+",re.DOTALL) 237 | return _rrurlprog.sub(r'(\1)',content) 238 | 239 | def split_owner(content): 240 | if content is None: 241 | return None,None,None 242 | else: 243 | idx=content.replace(':',':').find(':') 244 | idx2=content.find(',') 245 | if (idx < 0) or (idx2 <0): 246 | return None,None,None 247 | return content[:idx2].strip('( '),content[idx2+1:idx].strip(') '),content[idx+1:].strip(' ') 248 | -------------------------------------------------------------------------------- /repo_file.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-*- 2 | import pickle 3 | import time 4 | 5 | save_period=240 # second 6 | class repo_file: 7 | def __init__(self, name_pre='test'): 8 | self.data_repo={} 9 | self.name_pre=name_pre 10 | self.last_saved=time.time() 11 | 12 | def __del__(self): 13 | self.save('__del__') 14 | 15 | def load(self, pageStyle): 16 | filename='{}_{}.p'.format(self.name_pre, pageStyle) 17 | try: 18 | with open(filename, 'rb') as f: 19 | self.data_repo[pageStyle]=pickle.load(f) 20 | except IOError: 21 | self.data_repo[pageStyle]=dict() 22 | 23 | def save(self,orig=None): 24 | for pageStyle, record in self.data_repo.items(): 25 | filename='{}_{}.p'.format(self.name_pre, pageStyle) 26 | with open(filename, 'wb') as f: 27 | pickle.dump(record, f) 28 | self.last_saved = time.time() 29 | #print('save in {}, called by {}'.format(filename,orig)) 30 | 31 | def save_friendList(self, record, rid, run_info=None): 32 | """save record and return rows affected.save nothing if empty. 33 | return None if input error""" 34 | return self._save_process('friendList', record, rid, run_info) 35 | 36 | def save_status(self, record, rid, run_info=None): 37 | """save record and return rows affected.save nothing if empty. 
38 | return None if input error""" 39 | return self._save_process('status', record, rid, run_info) 40 | 41 | def save_profile(self, record, rid, run_info=None): 42 | """save profile and return rows affected.return None if input error""" 43 | return self._save_process('profile', record, rid, run_info) 44 | 45 | def _save_process(self, pageStyle, record, rid, run_info): 46 | if not isinstance(record,dict): 47 | return None 48 | if pageStyle not in self.data_repo: 49 | self.load(pageStyle) 50 | self.data_repo[pageStyle][rid]=record 51 | # save to file every n second 52 | global save_period 53 | if time.time() - self.last_saved > save_period: 54 | self.save('auto') 55 | return len(record) 56 | 57 | def getSearched(self, pageStyle): 58 | if pageStyle not in self.data_repo: 59 | self.load(pageStyle) 60 | return set(self.data_repo[pageStyle].keys()) 61 | 62 | def getFriendList(self, renrenId): 63 | pageStyle='friendList' 64 | if pageStyle not in self.data_repo: 65 | self.load(pageStyle) 66 | return set(self.data_repo[pageStyle].get(renrenId,{})) 67 | -------------------------------------------------------------------------------- /repo_mysql.py: -------------------------------------------------------------------------------- 1 | import MySQLdb 2 | from settings import db_connet_info as connect_info 3 | 4 | 5 | def _sql_log_status(rid, login_id, n_record): 6 | return "INSERT INTO stat_log_status (rid, login_id, n_record) VALUES ('%s', '%s', %d)" % (rid, login_id, n_record) 7 | 8 | def _sql_log_fl(rid, login_id, n_record): 9 | return "INSERT INTO stat_log_friends (rid, login_id, n_record) VALUES ('%s', '%s', %d)" % (rid, login_id, n_record) 10 | 11 | def _sql_fl(record, rid): 12 | val_fl = ','.join(["('%s', '%s')" % (rid, item[0]) for item in record if item[0] != rid]) 13 | return "INSERT INTO friends (rid1, rid2) VALUES %s" % val_fl 14 | 15 | def _sql_name(record): 16 | val_name = ','.join(["('%s', '%s')" % item for item in record]) 17 | return "INSERT INTO profile (rid, name) VALUES %s" % val_name 18 | 19 | 20 | class repo_mysql: 21 | 22 | def __init__(self): 23 | self.conn = MySQLdb.connect(**connect_info) 24 | self.cur = self.conn.cursor() 25 | 26 | def __del__(self): 27 | self.cur.close() 28 | self.conn.close() 29 | 30 | def save_fl(self, login_id, rid, fl_record): 31 | """save record and return rows affected.save nothing if empty. 32 | return None if input error""" 33 | 34 | n_name = 0 35 | 36 | try: 37 | if len(fl_record): 38 | n_fl = self.cur.execute(_sql_fl(fl_record, rid)) 39 | n_name = self.cur.executemany("INSERT INTO profile (rid, name) VALUES (%s, %s) ON DUPLICATE KEY UPDATE rid=VALUES(rid)", fl_record) 40 | self.cur.execute(_sql_log_fl(rid, login_id, len(fl_record))) 41 | except Exception as e: 42 | print 'Error ID: %s' % rid 43 | print e 44 | else: 45 | self.conn.commit() 46 | 47 | return n_name 48 | 49 | def save_status(self, login_id, rid, status_record): 50 | """save record and return rows affected.save nothing if empty. 
51 | return None if input error""" 52 | 53 | n_saved = 0 54 | 55 | try: 56 | if len(status_record): 57 | n_saved = self.cur.executemany("INSERT INTO status_raw (rid, status_id, content) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE rid=VALUES(rid)", [(rid, item[0], item[1]) for item in status_record]) 58 | self.cur.execute(_sql_log_status(rid, login_id, len(status_record))) 59 | except Exception as e: 60 | print 'Error ID: %s' % rid 61 | print e 62 | else: 63 | self.conn.commit() 64 | 65 | return n_saved 66 | 67 | def get_fl_searched(self, rid): 68 | self.cur.execute("SELECT rid FROM stat_log_friends where n_record>0 OR login_id=%s" % rid) 69 | return {item[0] for item in self.cur.fetchall()} 70 | 71 | def get_status_searched(self, rid): 72 | self.cur.execute("SELECT rid FROM stat_log_status where n_record>0 OR login_id=%s" % rid) 73 | return {item[0] for item in self.cur.fetchall()} 74 | 75 | def get_fl(self, rid): 76 | self.cur.execute("SELECT rid2 FROM friends where rid1='%s'" % rid) 77 | return {item[0] for item in self.cur.fetchall()} 78 | 79 | def get_status(self, rid): 80 | self.cur.execute("SELECT status_id FROM status_raw where rid='%s'" % rid) 81 | return {item[0] for item in self.cur.fetchall()} 82 | 83 | def _sql_profile(self,record,rid=None): 84 | pageStyle='profile' 85 | if len(record) == 0: 86 | return [] 87 | pf_map=get_cfg_dict('profile_map',has_default=False) 88 | pf_ignore=pf_map.pop('ignore').split(',') 89 | #construct sql 90 | pfs="renrenId1='{}'".format(rid) 91 | for k,v in record.items(): 92 | if k in pf_map.keys(): 93 | pfs += ",{}='{}'".format(pf_map[k],v) 94 | elif k in pf_map.values(): 95 | pfs += ",{}='{}'".format(k,v) 96 | elif k in pf_ignore: 97 | #print('ignore {}'.format(k)) 98 | pass 99 | else: 100 | self.tag_exceed(rid,k,v) 101 | sql_pf="insert into {} set {}".format(self.table_name[pageStyle],pfs) 102 | return [sql_pf] 103 | 104 | def tag_exceed(self,rid,k,v): 105 | print('pf tag exceed. 
tag={},renrenId={},value={}'.format(k,rid,v)) 106 | 107 | if __name__ == '__main__': 108 | from downloader import renren 109 | test_cookie = raw_input('Input cookie(document.cookie): ') 110 | rr = renren(test_cookie) 111 | rid = rr.renrenId() 112 | target_id = rid 113 | print rid 114 | # record = rr.friendList(target_id) 115 | record = rr.status(target_id) 116 | print '%d got' % len(record) 117 | 118 | repo = repo_mysql() 119 | # print repo.save_fl(rid, target_id, record) 120 | print repo.save_status(rid, target_id, record) 121 | #print 'friends of rid: %s' % len(repo.get_fl(target_id)) 122 | print 'status of rid: %s' % len(repo.get_status(target_id)) 123 | print 'friends searched: %s' % len(repo.get_fl_searched('233330059')) 124 | print 'friends searched: %s' % len(repo.get_fl_searched('23333005')) 125 | print 'status searched: %s' % len(repo.get_status_searched('233330059')) 126 | print 'status searched: %s' % len(repo.get_status_searched('23333005')) 127 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | httplib2 2 | mysql-python 3 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | account = {'email': 'yyttrr3242342@163.com', 2 | 'password': 'bmeB500bmeB500' 3 | } 4 | 5 | db_connet_info = { 6 | 'user': 'root', 7 | 'passwd': 'Kunth123', 8 | 'db': 'renren', 9 | 'host': '127.0.0.1', 10 | 'port': 3306 11 | } 12 | -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-*- 2 | import time 3 | import os 4 | import logging 5 | import downloader 6 | import repo_mysql 7 | 8 | def debug_log(rel_path='log/spider'): 9 | path = os.path.join(os.path.dirname(__file__), rel_path) 10 | formatter = logging.Formatter('%(asctime)s|%(levelname)s|%(message)s|%(filename)s-%(lineno)s') 11 | 12 | log = logging.getLogger('renre.spider') 13 | log.setLevel(logging.INFO) 14 | lvls = ['debug', 'info', 'warn', 'error'] 15 | 16 | if not os.path.exists(path): 17 | os.makedirs(path) 18 | 19 | for lvl in lvls: 20 | logfile = os.path.join(path, '{}.log'.format(lvl.lower())) 21 | hdlr = logging.FileHandler(logfile) 22 | hdlr.setLevel(getattr(logging, lvl.upper())) 23 | hdlr.setFormatter(formatter) 24 | log.addHandler(hdlr) 25 | return log 26 | 27 | 28 | pf_sleep=2 29 | class spider: 30 | def __init__(self, cookie): 31 | self.dl = downloader.renren(cookie) 32 | self.repo = repo_mysql.repo_mysql() 33 | self.login_id = self.dl.renrenId() 34 | self.fl_searched = self.repo.get_fl_searched(self.login_id) 35 | self.status_searched = self.repo.get_status_searched(self.login_id) 36 | self.log = debug_log() 37 | 38 | def getNet1(self, orig_id): 39 | if not isinstance(orig_id, str): 40 | print('ERROR! str required. 
orig_id = %s' % orig_id)
41 |             return None
42 |         if orig_id not in self.fl_searched:
43 |             print('{} download net1 of {}'.format(time.strftime('%H:%M:%S', time.localtime()), orig_id))
44 |             record = self.dl.friendList(orig_id)
45 |             if record is None:
46 |                 self.log.error('{}, fail to download friend list.'.format(orig_id))
47 |             else:
48 |                 self.repo.save_fl(self.login_id, orig_id, record)
49 |         return self.repo.get_fl(orig_id)
50 | 
51 |     def getNet2(self, orig_id):
52 |         n_forbidden = 0
53 |         friends = self.getNet1(orig_id)
54 |         toSearch = friends - self.fl_searched
55 |         print('{} get net2 of {}, toSearch/total: {}/{}'.format(time.strftime('%H:%M:%S',time.localtime()), orig_id, len(toSearch), len(friends)))
56 |         for i, rid in zip(range(1, len(toSearch)+1), toSearch):
57 |             record = self.dl.friendList(rid)
58 |             if record is None:
59 |                 self.log.error('{}, fail to download friend list.'.format(rid))
60 |             else:
61 |                 saved = self.repo.save_fl(self.login_id, rid, record)
62 |                 log_text = '{}/{}, newName/friends: {}/{}, friendlist of {}'.format(i, len(toSearch), saved, len(record), rid)
63 |                 if saved > 0:
64 |                     self.log.info(log_text)
65 |                     self.fl_searched.add(rid)
66 |                 else:
67 |                     n_forbidden += 1
68 |                     self.log.error(log_text)
69 |         print('{} Done! net2 of {}, forbidden: {}'.format(time.strftime('%H:%M:%S',time.localtime()), orig_id, n_forbidden))
70 |         return n_forbidden
71 | 
72 |     def getStatus_friend(self, orig_id):
73 |         n_forbidden = 0
74 |         friends = self.getNet1(orig_id)
75 |         friends.add(orig_id)
76 |         toSearch = friends - self.status_searched
77 |         print('{} {} of {}, toSearch/total: {}/{}'.format(time.strftime('%H:%M:%S', time.localtime()), 'friends\' status', orig_id, len(toSearch), len(friends)))
78 |         for i, rid in zip(range(1, len(toSearch)+1), toSearch):
79 |             record = self.dl.status(rid)
80 |             if record is None:
81 |                 self.log.error('{}, fail to download status.'.format(rid))
82 |             else:
83 |                 saved = self.repo.save_status(self.login_id, rid, record)
84 |                 log_text = '{}/{}, saved/download: {}/{}, status of {}'.format(i, len(toSearch), saved, len(record), rid)
85 |                 if saved > 0:
86 |                     self.log.info(log_text)
87 |                     self.status_searched.add(rid)
88 |                 else:
89 |                     n_forbidden += 1
90 |                     self.log.error(log_text)
91 |         print('{} Done!
friends\' status of {}, forbidden: {}'.format(time.strftime('%H:%M:%S',time.localtime()), orig_id, n_forbidden)) 92 | return n_forbidden 93 | 94 | def getProfile_friend(self,orig_id='410941086'): 95 | pageStyle='profile' 96 | if pageStyle not in self.searched: 97 | self.searched[pageStyle]=self.repo.getSearched(pageStyle) 98 | friends=self.getNet1(orig_id) 99 | toSearch=(friends|{orig_id})-self.searched[pageStyle] 100 | print('{} {} of {},toSearch/total:{}/{}'.format(time.strftime('%H:%M:%S',time.localtime()),'friends\' profile',orig_id,len(toSearch),len(friends)+1)) 101 | self.seq_process(toSearch,pageStyle) 102 | 103 | 104 | if __name__ == '__main__': 105 | test_cookie = raw_input('Input cookie(document.cookie): ') 106 | 107 | runner = spider(test_cookie) 108 | 109 | # start by login id 110 | #friends = runner.getNet1(runner.login_id) 111 | #for orig_id in friends: 112 | # runner.getNet2(orig_id) 113 | 114 | runner.getStatus_friend(runner.login_id) 115 | -------------------------------------------------------------------------------- /test_net_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JackonYang/renren/a692152c6a1eecccc1b097550a3de5916fc95e31/test_net_graph.png -------------------------------------------------------------------------------- /test_parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8-*- 2 | import unittest 3 | import parse 4 | from browser import browser 5 | 6 | class test_parse(unittest.TestCase): 7 | 8 | def setUp(self): 9 | pass 10 | def tearDown(self): 11 | pass 12 | 13 | def test_friendList(self): 14 | pfHrefs=[ 15 | {'
    王瑛', 16 | '
    En.王哲'}, 17 | {'
    '}, 18 | '
    ~@%……', 19 | {}, 20 | {'error'}, 21 | None 22 | ] 23 | names=[ 24 | {'6754031':'王瑛','331442':'En.王哲'}, 25 | {'9439171':''}, 26 | {'34134':'~@%……'}, 27 | {}, 28 | None, 29 | None] 30 | for pfHref,name in zip(pfHrefs,names): 31 | self.assertEquals(parse.friendList(pfHref),name) 32 | 33 | def test_profile_detail(self): 34 | contents={ 35 | #some items with no space 36 | """
    性别:
    ,\ 37 |
    大学:
    北学-2013年-学院
    理工大学-2011年-生命学院
    ,\ 38 |
    小学:
    一个小学-1991年青岛二小-2001年
    """ 39 | :{'性别': '女', 40 | '大学': '北学-2013年-学院
    理工大学-2011年-生命学院
    ', 41 | '小学': '一个小学-1991年青岛二小-2001年'}, 42 | #some items with space and \n \t 43 | """
    性 别 :
    , \ 44 |
    大 学 :
    \n\\n
    \n\\n\n\\n北京中医药大学\n - \n 2013年\n - 东方学院
    \ 45 | 北京理工大学\t\t - 2011年 - 生命科学与技术学院六院
    ,\ 46 |
    小 学 :
    一个小学 - 1991年 青岛二小 - 2001年
    """ 47 | :{'性别': '女', 48 | '大学': '北京中医药大学 - 2013年 - 东方学院
    北京理工大学 - 2011年 - 生命科学与技术学院六院
    ', 49 | '小学': '一个小学 - 1991年 青岛二小 - 2001年'}, 50 | #no items or None 51 | """no item""":{},None:None} 52 | for content,expt in contents.items(): 53 | if content is not None: 54 | content=content.split(',') 55 | self.assertEquals(parse.profile_detail(content),expt) 56 | 57 | def test_profile_mini(self): 58 | contents={ 59 | #full items with space 60 | """
      \\n\n\t\\t\ 61 |
    • \n\\n\t\\t\n就读于西北大学\n\t\\t
    • \n\t\ 62 |
    • \n\\n\t男生\n\\n\\n\n ,2月13日\\n\t\\t
    • \ 63 |
    • \n\\n来自内蒙古\n\\n\n延安市\n\n\\n
    • \n\\n\ 64 |
    • \\n现居\\n山南地区
    """ 65 | :{'school':'就读于西北大学', 66 | 'gender':'男生 ', 67 | 'birthday':'2月13日', 68 | 'hometown':'来自内蒙古 延安市', 69 | 'address':'现居 山南地区'}, 70 | #full items with no space 71 | """
      \ 72 |
    • 就读于西北大学
    • 73 |
    • 男生,2月13日
    • 74 |
    • 来自内蒙古延安市
    • 75 |
    • 现居山南地区
    """ 76 | :{'hometown':'来自内蒙古延安市','school':'就读于西北大学','birthday':'2月13日','gender':'男生','address':'现居山南地区'}, 77 | #full items with space. basic 78 | """""" 82 | :{'gender': '男生', 'school': '在 Fachhochschule Aachen 读书', 'hometown': '来自 山东 烟台市'}, 83 | #full items without space 84 | """""" 88 | :{'gender':'男生', 'school':'在Fachhochschule Aachen读书','hometown':'来自山东烟台市'}, 89 | #no items or None 90 | """""":{},None:None} 91 | for content,expt in contents.items(): 92 | self.assertEquals(parse.profile_mini(content),expt) 93 | 94 | #basic info 95 | def test_get_birth(self): 96 | contents={'80 后 10 月 12 日天秤座':{'birth_day': '12', 'birth_month': '10', 'birth_year': '80'},# xx后 and int(2) 97 | '2012年8月1日狮子座':{'birth_day': '1', 'birth_month': '8', 'birth_year': '2012'},# xx年 and int(1) 98 | ' 3 月 6 日 双鱼座':{'birth_day': '6', 'birth_month': '3', 'birth_year': '9999'},#no age info 99 | '1987年9月1日':{'birth_day': '1', 'birth_month': '9', 'birth_year': '1987'},#no star info 100 | '3 月 29 日':{'birth_day': '29', 'birth_month': '3', 'birth_year': '9999'},#no age or star info 101 | '3-29':{'birth_day': '29', 'birth_month': '3', 'birth_year': '9999'},#no age or star info 102 | '3 - 31':{'birth_day': '31', 'birth_month': '3', 'birth_year': '9999'},#no age or star info 103 | '2011-9-1':{'birth_day': '1', 'birth_month': '9', 'birth_year': '2011'},#no star info 104 | '1993 - 9 - 1':{'birth_day': '1', 'birth_month': '9', 'birth_year': '1993'},#no star info 105 | '9999-99-99':{'birth_day':'99','birth_month':'99','birth_year':'9999'}, 106 | '男,':{'birth_day':'99','birth_month':'99','birth_year':'9999'}, 107 | '':{'birth_day':'99','birth_month':'99','birth_year':'9999'}, 108 | None:{'birth_year':None,'birth_month':None,'birth_day':None} 109 | } 110 | for content,expt in contents.items(): 111 | self.assertEquals(parse._get_birth(content),expt) 112 | def test_get_gender(self): 113 | contents={'他是男生':'m','男生':'m','她是女生':'f','女生':'f','女':'f','男':'m','no match':'u',None:None} 114 | for content,expt in contents.items(): 115 | self.assertEquals(parse._get_gender(content),expt) 116 | 117 | #edu info 118 | def test_split_high_edu(self): 119 | contents={ 120 | # two item, full space 121 | ' Birmingam City - 2011 年 - 其它院系
    西北大学 - 2012 年 - 其它院系
    ' 122 | :[{'major': '其它院系', 'name': 'Birmingam City', 'year': '2011'}, {'major': '其它院系', 'name': '西北大学', 'year': '2012'}], 123 | # two item, no space 124 | 'Birmingam City-2011年-其它院系
    西北大学-2012年-其它院系
    ' 125 | :[{'major': '其它院系', 'name': 'Birmingam City', 'year': '2011'}, {'major': '其它院系', 'name': '西北大学', 'year': '2012'}], 126 | # one item, no space 127 | '西北大学-2010年-物理学系
    ':[{'major': '物理学系', 'name': '西北大学', 'year': '2010'}], 128 | # English with useful space. can't drop 129 | 'Lincoln University - 1970年
    ':[{'major': '', 'name': 'Lincoln University', 'year': '1970'}], 130 | 'no match':[], 131 | None:None 132 | } 133 | for content,expt in contents.items(): 134 | self.assertEquals(parse._split_high_edu(content),expt) 135 | def test_split_low_edu(self): 136 | contents={ 137 | # full space 138 | ' 万州上海中学 - 2009年 万州高级中学 - 2012年 ' 139 | :[{'name': '万州上海中学', 'year': '2009'}, {'name': '万州高级中学', 'year': '2012'}], 140 | # no space 141 | '万州上海中学-2004年万州高级中学-2011年' 142 | :[{'name': '万州上海中学', 'year': '2004'}, {'name': '万州高级中学', 'year': '2011'}], 143 | #one item 144 | '三原县南郊中学- 2005年': 145 | [{'name': '三原县南郊中学', 'year': '2005'}], 146 | None:None 147 | } 148 | for content,expt in contents.items(): 149 | #self.assertEquals(parse._split_low_edu(content),expt) 150 | print(parse._split_low_edu(content,'p')) 151 | 152 | #drops 153 | def test_sub_space(self): 154 | #replace space, and no effect on other word 155 | contents=['abcdefghijklmnopqrstuvwxyz0123456789 nntt003','\n\\n\t\\t  \u3000\\u3000abcdefghijklmnopqrstuvwxyz0123456789 \\n\n\\n\t\\u3000\u3000 nntt003'] 156 | expt1='abcdefghijklmnopqrstuvwxyz0123456789 nntt003' 157 | expt2='abcdefghijklmnopqrstuvwxyz0123456789nntt003' 158 | for content in contents: 159 | self.assertEquals(parse._sub_space(content,r' '),expt1) 160 | self.assertEquals(parse._sub_space(content,r''),expt2) 161 | def test_drop_pf_extra(self): 162 | #replace space, and no effect on other word 163 | contents=['abcdefghijklmnopqrstuvwxyz0123456789 nntt003','\n\\n\t\\t \u3000\\u3000abcdefghijklmnopqrstuvwxyz0123456789\\n\n\\n\t\\u3000\u3000 nntt003'] 164 | expt1='abcdefghijklmnopqrstuvwxyz0123456789 nntt003' 165 | expt2='abcdefghijklmnopqrstuvwxyz0123456789nntt003' 166 | for content in contents: 167 | self.assertEquals(parse._drop_pf_extra(content,r' '),expt1) 168 | self.assertEquals(parse._drop_pf_extra(content,r''),expt2) 169 | 170 | def test_drop_href(self): 171 | contents={"""
    生日\n\\n\t\\t :
    摩羯座陕西 \t\\t\n\\n """:"""
    生日\n\\n\t\\t :
    1994\n\\n\t\\t 年\n\\n\t\\t 摩羯座陕西 \t\\t\n\\n """,#all kinds of elements in and out 172 | """hello
    birth
    """:"""hello
    birth
    """,#no href 173 | """hello""":"""hello""",#start with \n\\n\t\\t男生boy123\n\\n\t\\t \n\\n\t\\t ,2月13日""":"""\n\\n\t\\t\n\\n\t\\t男生boy123\n\\n\t\\t \n\\n\t\\t ,2月13日""",#span with all kinds of items 181 | """\n\\n\t\\t\n\\n\t\\t男生boy123\n\\n\t\\t \n\\n\t\\t ,2月13日""":"""\n\\n\t\\t\n\\n\t\\t男生boy123\n\\n\t\\t \n\\n\t\\t ,2月13日""",#spanclasslink with all kinds of items 182 | """boy男生""":"""boy男生""",#multi 183 | """nospan""":"""nospan""", 184 | None:None 185 | } 186 | for content,expt in contents.items(): 187 | self.assertEquals(parse.drop_span(content),expt) 188 | 189 | def test_drop_rrurl(self): 190 | contents={"http://rrurl.cn/pNVUbN ":'(http://lang-8.com/)', 191 | None:None, 192 | 'norrurl':'norrurl' 193 | } 194 | for content,expt in contents.items(): 195 | self.assertEquals(parse.drop_rrurl(content),expt) 196 | 197 | def test_split_owner(self): 198 | contents={' (123456,name) : testcase':('123456','name','testcase'),None:(None,None,None),'no ptn':(None,None,None),'32:only':(None,None,None),'asdf,only':(None,None,None)} 199 | for content,expt in contents.items(): 200 | self.assertEquals(parse.split_owner(content),expt) 201 | 202 | if __name__=='__main__': 203 | suite=unittest.TestSuite() 204 | 205 | #checked 206 | runner=unittest.TextTestRunner() 207 | runner.run(suite) 208 | suite.addTest(test_parse('test_friendList'))#full test 209 | suite.addTest(test_parse('test_profile_detail'))#full test 210 | suite.addTest(test_parse('test_profile_mini'))#full test 211 | #private method 212 | #suite.addTest(test_parse('test_get_birth'))#full test 213 | #suite.addTest(test_parse('test_get_gender'))#full test 214 | #suite.addTest(test_parse('test_split_high_edu'))#full test 215 | #suite.addTest(test_parse('test_split_low_edu'))#full test 216 | suite.addTest(test_parse('test_sub_space'))#full test 217 | #suite.addTest(test_parse('test_drop_link')) 218 | #suite.addTest(test_parse('test_drop_pf_extra')) 219 | #suite.addTest(test_parse('test_drop_href')) 220 | #suite.addTest(test_parse('test_drop_span')) 221 | #suite.addTest(test_parse('test_drop_rrurl')) 222 | #suite.addTest(test_parse('test_split_owner')) 223 | runner=unittest.TextTestRunner() 224 | runner.run(suite) 225 | -------------------------------------------------------------------------------- /topic/README.md: -------------------------------------------------------------------------------- 1 | topic analysic 2 | ============== 3 | 4 | #### env 5 | 6 | python2.7 7 | 8 | #### usage 9 | 10 | `python demo.py` 11 | -------------------------------------------------------------------------------- /topic/demo.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | import jieba 3 | import jieba.posseg as pseg 4 | 5 | pymysql=None 6 | def getStatus(rid,table_pre='orig_renren'): 7 | global pymysql 8 | if pymysql is None: 9 | import pymysql 10 | tablename='{}_{}'.format(table_pre,'status') 11 | conn=pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='Kunth123', db='data_bang',charset='utf8') 12 | cur=conn.cursor() 13 | cur.execute("select timestamp,cur_content from {} where renrenId1='{}'".format(tablename,rid)) 14 | res={} 15 | for content in cur.fetchall(): 16 | res[content[0]]=content[1] 17 | cur.close() 18 | conn.close() 19 | return res 20 | 21 | _ignore=None 22 | def drop_ignore(data): 23 | global _ignore 24 | if _ignore is None: 25 | 
_ignore={u'要',u'的',u'了',u'有',u'很',u'上',u'不',u'和',u'我',u'给',u'一个',u'在',u'被',u'是',u'就',u'到',u'现在',u'人',u'今天',u'又',u'啊',u'自己',u'这',u'还',u'去',u'也',u'你',u'好',u'可以',u'让',u'说',u'都',u'就是',u'转自',u'img', 26 | # 2000 up 27 | u'谁',u'一',u'吃',u'这么',u'一下',u'什么',u'把',u'再',u'小',u'得',u'大',u'如果',u'手机',u'多',u'我们',u'没',u'那',u'会',u'生活',u'还是',u'大笑',u'没有',u'个',u'明天',u'事',u'知道',u'着',u'过',u'等',u'不是',u'才',u'里',u'真的',u'这个',u'终于',u'比',u'他',u'怎么',u'呢',u'来',u'这是',u'大家',u'看',u'吧',u'下',u'走',u'想',u'中',u'请',u'对',u'已经',u'能',u'同学',u'看到',u'这样', 28 | # more 29 | u'做',u'跟',u'用',u'从',u'找',u'月',u'但是',u'开始',u'然后',u'以后',u'还有',u'貌似',u'不用',u'应该',u'感觉',u'发现',u'需要',u'各种', 30 | # time 31 | u'早晨',u'中午',u'上午',u'下午',u'晚上',u'今天',u'今天下午',u'今天上午',u'每天',u'刚刚',u'突然',u'经常' 32 | } 33 | for k in _ignore & set(data.keys()): 34 | data.pop(k) 35 | 36 | def _fix_little_age(kword): 37 | if (u'小时' in kword) and (u'时候' in kword): 38 | nxs=kword[u'小时'] 39 | nsh=kword[u'时候'] 40 | if nxs < nsh: 41 | kword.pop(u'小时') 42 | kword[u'小时候']=nxs 43 | kword[u'时候']=nsh-nxs 44 | elif nsh < nxs: 45 | kword.pop(u'时候') 46 | kword[u'小时候']=nsh 47 | kword[u'小时']=nxs-nsh 48 | else: 49 | kword.pop(u'时候') 50 | kword.pop(u'小时') 51 | kword[u'小时候']=nsh 52 | 53 | # extract keyword 54 | def extract_keyword(status): 55 | kword=dict() 56 | for timestamp,status in status.items(): 57 | for word in jieba.cut(status,cut_all=False): 58 | # timestamp to be set() to avoid repeat word in the same status 59 | if word in kword: 60 | kword[word].add(timestamp) 61 | else: 62 | kword[word]={timestamp} 63 | _fix_little_age(kword) 64 | drop_ignore(kword) 65 | return kword 66 | 67 | def get_keyword(status): 68 | kword=dict() 69 | for timestamp,content in status.items(): 70 | words = pseg.cut(content) 71 | for w in words: 72 | if w.flag in kword: 73 | kword[w.flag].add(w.word) 74 | else: 75 | kword[w.flag]={w.word} 76 | for flag,word in kword.items(): 77 | print(u'{}:{}'.format(flag,word)) 78 | print(kword.keys()) 79 | # timestamp to be set() to avoid repeat word in the same status 80 | #if word in kword: 81 | # kword[word].add(timestamp) 82 | #else: 83 | # kword[word]={timestamp} 84 | #_fix_little_age(kword) 85 | #drop_ignore(kword) 86 | return kword 87 | 88 | def show_all_keyword(friend): 89 | rid='233330059' 90 | res=extract_keyword(getStatus(rid)) 91 | for rid in friend: 92 | kword=extract_keyword(getStatus(rid)) 93 | res.update(kword) 94 | print(len(res)) 95 | sort_freq(res) 96 | 97 | def get_common_keyword(friend): 98 | # init res by someone whose keyword more than bound 99 | rid='233330059' 100 | res=set(extract_keyword(getStatus(rid)).keys()) 101 | did=0 102 | undo=0 103 | for i,rid in zip(range(1,len(friend)+1),friend): 104 | kword=extract_keyword(getStatus(rid)) 105 | if len(kword)>2000: 106 | did += 1 107 | res &= set(kword.keys()) 108 | else: 109 | undo += 1 110 | print(u'{} number of keyword < 2000 {} {}'.format(undo,friend[rid],len(kword))) 111 | print(u"common keyword to add to ignore list: {}".format("',u'".join(res))) 112 | print(len(friend),did,undo) 113 | 114 | def _drop_single_word(kword): 115 | for k in kword.keys(): 116 | if len(k) < 2: 117 | kword.pop(k) 118 | 119 | def sort_freq(kword): 120 | # fix 121 | _drop_single_word(kword) 122 | freq=[] 123 | for k,v in kword.items(): 124 | if len(v) > 1: 125 | freq.append((len(v),k)) 126 | freq.sort() 127 | for k,v in freq: 128 | print(u'{},{}'.format(k,v)) 129 | 130 | def show_kword(rid): 131 | status=getStatus(rid) 132 | kword=extract_keyword(status) 133 | sort_freq(kword) 134 | 135 | def nstatus_nkeyword(friend): 136 | data=[] 137 | 
159 | if __name__ == '__main__':
160 |     import mytools
161 |     friend=mytools.getFriend()
162 |     #get_common_keyword(friend)
163 |     #show_all_keyword(friend)
164 |     # data=nstatus_nkeyword(friend)
165 |     # plot_tuple(data)
166 |     rid='233330059'
167 |     #rid = '232279547'
168 |     #show_kword(rid)
169 |     status=getStatus(rid)
170 |     get_keyword(status)
171 | 
--------------------------------------------------------------------------------
/topic/jieba-master.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackonYang/renren/a692152c6a1eecccc1b097550a3de5916fc95e31/topic/jieba-master.zip
--------------------------------------------------------------------------------
/topic/jieba/README.md:
--------------------------------------------------------------------------------
1 | jieba
2 | ========
3 | "Jieba" Chinese word segmentation: aiming to be the best Python Chinese word segmentation module
4 | "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
5 | - _Scroll down for English documentation._
6 | 
7 | Feature
8 | ========
9 | * Supports three segmentation modes:
10 | * 1) Accurate mode, which attempts the most precise segmentation of the sentence; suited to text analysis;
11 | * 2) Full mode, which scans out every word that could possibly be formed from the sentence; very fast, but it cannot resolve ambiguity;
12 | * 3) Search-engine mode, which re-segments the long words produced by accurate mode to improve recall; suited to search-engine indexing.
13 | 
14 | Python Version
15 | ==============
16 | * The master branch currently supports Python 2.x only
17 | * A Python 3.x branch is also largely usable: https://github.com/fxsjy/jieba/tree/jieba3k
18 | 
19 | Usage
20 | ========
21 | * Fully automatic installation: `easy_install jieba` or `pip install jieba`
22 | * Semi-automatic installation: download http://pypi.python.org/pypi/jieba/ , extract it, then run python setup.py install
23 | * Manual installation: place the jieba directory in the current directory or in site-packages
24 | * Import with `import jieba` (the first import builds the Trie, which takes a few seconds)
25 | 
26 | Algorithm
27 | ========
28 | * Scans the word graph efficiently using a Trie, building a directed acyclic graph (DAG) of every word the sentence's characters can form
29 | * Uses dynamic programming to find the maximum-probability path, i.e. the segmentation most likely under the word frequencies
30 | * For words not in the dictionary, uses an HMM model of the word-forming capability of Chinese characters, decoded with the Viterbi algorithm
31 | 
32 | Function 1): segmentation
33 | ==========
34 | * The `jieba.cut` method accepts two input parameters: 1) the first is the string to be segmented; 2) the `cut_all` parameter controls whether full mode is used
35 | * The `jieba.cut_for_search` method accepts one parameter, the string to be segmented; it cuts at a finer granularity, suitable for building a search engine's inverted index
36 | * Note: the string to be segmented may be a gbk string, a utf-8 string, or unicode
37 | * Both `jieba.cut` and `jieba.cut_for_search` return an iterable generator: use a for loop to obtain each segmented word (unicode), or materialize a list with list(jieba.cut(...))
38 | 
39 | Code example (segmentation)
40 | 
41 |     #encoding=utf-8
42 |     import jieba
43 | 
44 |     seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
45 |     print "Full Mode:", "/ ".join(seg_list) #full mode
46 | 
47 |     seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
48 |     print "Default Mode:", "/ ".join(seg_list) #accurate mode
49 | 
50 |     seg_list = jieba.cut("他来到了网易杭研大厦")
51 |     print ", ".join(seg_list)
52 | 
53 |     seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #search engine mode
54 |     print ", ".join(seg_list)
55 | 
56 | Output:
57 | 
58 | [Full Mode]: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
59 | 
60 | [Accurate Mode]: 我/ 来到/ 北京/ 清华大学
61 | 
62 | [New Word Recognition]: 他, 来到, 了, 网易, 杭研, 大厦 (here, "杭研" is not in the dictionary, but the Viterbi algorithm recognizes it anyway)
63 | 
64 | [Search Engine Mode]: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造
65 | 
66 | Function 2): adding a custom dictionary
67 | ================
68 | 
69 | * Developers can supply their own custom dictionary to cover words missing from the jieba lexicon; jieba can recognize new words on its own, but adding them yourself guarantees a higher accuracy (a minimal loading sketch follows this section)
70 | * Usage: jieba.load_userdict(file_name) # file_name is the path of the custom dictionary
71 | * The dictionary format is the same as that of `analyse/idf.txt`: one word per line, each line split by a space into the word and its frequency
72 | * Example:
73 | 
74 |     云计算 5
75 |     李小福 2
76 |     创新办 3
77 | 
78 | Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
79 | 
80 | After loading the custom dictionary: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
81 | 
82 | * Code example: "improving ambiguity correction through a user-defined dictionary" --- https://github.com/fxsjy/jieba/issues/14
83 | 
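A minimal end-to-end sketch of the loading step (the file name userdict.txt and the expected result are illustrative assumptions, not part of the repository):

    #encoding=utf-8
    import jieba

    # userdict.txt holds one "word frequency" pair per line, e.g. the
    # three entries shown above (云计算 5 / 李小福 2 / 创新办 3)
    jieba.load_userdict("userdict.txt")
    seg_list = jieba.cut("李小福是创新办主任也是云计算方面的专家")
    print "/ ".join(seg_list)  # 创新办 and 云计算 should now stay intact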
84 | Function 3): keyword extraction
85 | ================
86 | * jieba.analyse.extract_tags(sentence,topK) # requires import jieba.analyse first
87 | * sentence is the text to extract keywords from
88 | * topK is how many of the highest TF/IDF-weighted keywords to return; the default is 20
89 | 
90 | Code example (keyword extraction); a short inline sketch follows below
91 | 
92 | https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
93 | 
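A minimal usage sketch (not from the repository; the sample sentence reuses the segmentation example above):

    #encoding=utf-8
    import jieba.analyse

    # the 5 words with the largest TF/IDF weight; the IDF values come from
    # the bundled analyse/idf.txt
    tags = jieba.analyse.extract_tags(u"小明硕士毕业于中国科学院计算所,后在日本京都大学深造", topK=5)
    print ", ".join(tags)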
94 | Function 4): part-of-speech tagging
95 | ================
96 | * Tags every word of the segmented sentence with its part of speech, using a tag set compatible with ictclas
97 | * Usage example
98 | 
99 |     >>> import jieba.posseg as pseg
100 |     >>> words =pseg.cut("我爱北京天安门")
101 |     >>> for w in words:
102 |     ...    print w.word,w.flag
103 |     ...
104 |     我 r
105 |     爱 v
106 |     北京 ns
107 |     天安门 ns
108 | 
109 | 
110 | 
111 | Segmentation speed
112 | =========
113 | * 1.5 MB / Second in Full Mode
114 | * 400 KB / Second in Default Mode
115 | * Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt
116 | 
117 | Online demo
118 | =========
119 | http://209.222.69.242:9000/
120 | 
121 | FAQ
122 | =========
123 | 1) How was the model's data generated? https://github.com/fxsjy/jieba/issues/7
124 | 
125 | 2) What is this library's license? https://github.com/fxsjy/jieba/issues/2
126 | 
127 | For more questions, see: https://github.com/fxsjy/jieba/issues?sort=updated&state=closed
128 | 
129 | Change Log
130 | ==========
131 | http://www.oschina.net/p/jieba/news#list
132 | 
133 | jieba
134 | ========
135 | "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
136 | 
137 | Features
138 | ========
139 | * Supports three types of segmentation mode:
140 | * 1) Accurate Mode attempts to cut the sentence into the most accurate segmentation, which is suitable for text analysis;
141 | * 2) Full Mode scans out all the words that can possibly be formed from the sentence; it is very fast, but cannot resolve ambiguity;
142 | * 3) Search Engine Mode, based on the Accurate Mode, attempts to cut long words into several short words, which can enhance the recall rate
143 | 
144 | Usage
145 | ========
146 | * Fully automatic installation: `easy_install jieba` or `pip install jieba`
147 | * Semi-automatic installation: Download http://pypi.python.org/pypi/jieba/ , after extracting run `python setup.py install`
148 | * Manual installation: place the `jieba` directory in the current directory or python site-packages directory.
149 | * Use `import jieba` to import; the Trie tree is built on the first import only (takes a few seconds).
150 | 
151 | Algorithm
152 | ========
153 | * Uses a Trie tree structure to achieve efficient word graph scanning; all the words the sentence's Chinese characters can form constitute a directed acyclic graph (DAG)
154 | * Employs dynamic programming to find the maximum probability path, identifying the most likely segmentation based on word frequency
155 | * For unknown words, an HMM model based on the word-forming capability of Chinese characters is used, decoded with the Viterbi algorithm
156 | 
157 | Function 1): cut
158 | ==========
159 | * The `jieba.cut` method accepts two input parameters: 1) the first parameter is the string that requires segmentation, and 2) the second parameter is `cut_all`, a parameter used to control the segmentation pattern.
160 | * `jieba.cut` returns an iterable generator: use a `for` loop to get each segmented word (in unicode), or `list(jieba.cut( ... ))` to create a list.
161 | * `jieba.cut_for_search` accepts only one parameter: the string that requires segmentation; it cuts the sentence into finer-grained short words
162 | 
163 | Code example: segmentation
164 | ==========
165 | 
166 |     #encoding=utf-8
167 |     import jieba
168 | 
169 |     seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
170 |     print "Full Mode:", "/ ".join(seg_list) #full mode
171 | 
172 |     seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
173 |     print "Default Mode:", "/ ".join(seg_list) #default (accurate) mode
174 | 
175 |     seg_list = jieba.cut("他来到了网易杭研大厦")
176 |     print ", ".join(seg_list)
177 | 
178 |     seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #search engine mode
179 |     print ", ".join(seg_list)
180 | 
181 | Output:
182 | 
183 | [Full Mode]: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
184 | 
185 | [Accurate Mode]: 我/ 来到/ 北京/ 清华大学
186 | 
187 | [New Word Recognition] 他, 来到, 了, 网易, 杭研, 大厦 (In this case, "杭研" is not in the dictionary, but is identified by the Viterbi algorithm)
188 | 
189 | [Search Engine Mode]: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在
190 | , 日本, 京都, 大学, 日本京都大学, 深造
191 | 
192 | 
193 | Function 2): Add a custom dictionary
194 | ==========
195 | 
196 | * Developers can specify their own custom dictionary to include words missing from the jieba lexicon. jieba has the ability to identify new words, but adding your own new words can ensure a higher rate of correct segmentation.
197 | * Usage: `jieba.load_userdict(file_name) # file_name is a custom dictionary path`
198 | * The dictionary format is the same as that of `analyse/idf.txt`: one word per line; each line is divided into two parts, the first is the word itself, the other is the word frequency, separated by a space
199 | * Example:
200 | 
201 |     云计算 5
202 |     李小福 2
203 |     创新办 3
204 | 
205 | Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
206 | 
207 | After loading the custom dictionary: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
208 | 
209 | Function 3): Keyword Extraction
210 | ================
211 | * `jieba.analyse.extract_tags(sentence,topK) # needs to first import jieba.analyse`
212 | * `sentence`: the text to extract keywords from
213 | * `topK`: how many of the keywords with the largest TF/IDF weights to return; the default value is 20
214 | 
215 | Code sample (keyword extraction)
216 | 
217 | https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
218 | 
219 | 
220 | Segmentation speed
221 | =========
222 | * 1.5 MB / Second in Full Mode
223 | * 400 KB / Second in Default Mode
224 | * Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt
225 | 
226 | Online demo
227 | =========
228 | http://209.222.69.242:9000/
229 | 
--------------------------------------------------------------------------------
/topic/jieba/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | import math
3 | import os,sys
4 | import pprint
5 | import finalseg
6 | import time
7 | import tempfile
8 | import marshal
9 | 
10 | FREQ = {}
11 | total =0.0
12 | re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
13 | 
14 | def gen_trie(f_name): # parse dict.txt into (trie, word -> raw frequency, total frequency)
15 |     lfreq = {}
16 |     trie = {}
17 |     ltotal = 0.0
18 |     content = open(f_name,'rb').read().decode('utf-8')
19 |     for line in content.split("\n"):
20 |         word,freq,_ = line.split(" ")
21 |         freq = float(freq)
22 |         lfreq[word] = freq
23 |         ltotal+=freq
24 |         p = trie
25 |         for c in word:
26 |             if not c in p:
27 |                 p[c] ={}
28 |             p = p[c]
29 |         p['']='' #ending flag
30 |     return trie, lfreq,ltotal
31 | 
32 | 
33 | _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
34 | 
35 | print >> sys.stderr, "Building Trie..."
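# The block below runs at import time: if a marshal cache (jieba.cache in the
# temp dir) is newer than dict.txt, the tuple (trie, FREQ, total, min_freq) is
# loaded from it; otherwise dict.txt is parsed by gen_trie(), FREQ is
# normalized to relative frequencies, and the result is dumped back to the
# cache for the next import.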
"Building Trie..." 36 | t1 = time.time() 37 | cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache") 38 | load_from_cache_fail = True 39 | if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")): 40 | print >> sys.stderr, "loading model from cache" 41 | try: 42 | trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb')) 43 | load_from_cache_fail = False 44 | except: 45 | load_from_cache_fail = True 46 | 47 | if load_from_cache_fail: 48 | trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt")) 49 | FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize 50 | min_freq = min(FREQ.itervalues()) 51 | print >> sys.stderr, "dumping model to file cache" 52 | marshal.dump((trie,FREQ,total,min_freq),open(cache_file,'wb')) 53 | 54 | print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds." 55 | print >> sys.stderr, "Trie has been built succesfully." 56 | 57 | 58 | def __cut_all(sentence): 59 | dag = get_DAG(sentence) 60 | old_j = -1 61 | for k,L in dag.iteritems(): 62 | if len(L)==1 and k>old_j: 63 | yield sentence[k:L[0]+1] 64 | old_j = L[0] 65 | else: 66 | for j in L: 67 | if j>k: 68 | yield sentence[k:j+1] 69 | old_j = j 70 | 71 | def calc(sentence,DAG,idx,route): 72 | N = len(sentence) 73 | route[N] = (1.0,'') 74 | for idx in xrange(N-1,-1,-1): 75 | candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ] 76 | route[idx] = max(candidates) 77 | 78 | def get_DAG(sentence): 79 | N = len(sentence) 80 | i,j=0,0 81 | p = trie 82 | DAG = {} 83 | while i=N: 93 | i+=1 94 | j=i 95 | p=trie 96 | else: 97 | p = trie 98 | i+=1 99 | j=i 100 | for i in xrange(len(sentence)): 101 | if not i in DAG: 102 | DAG[i] =[i] 103 | return DAG 104 | 105 | def __cut_DAG(sentence): 106 | DAG = get_DAG(sentence) 107 | route ={} 108 | calc(sentence,DAG,0,route=route) 109 | x = 0 110 | buf =u'' 111 | N = len(sentence) 112 | while x0: 119 | if len(buf)==1: 120 | yield buf 121 | buf=u'' 122 | else: 123 | regognized = finalseg.cut(buf) 124 | for t in regognized: 125 | yield t 126 | buf=u'' 127 | yield l_word 128 | x =y 129 | 130 | if len(buf)>0: 131 | if len(buf)==1: 132 | yield buf 133 | else: 134 | regognized = finalseg.cut(buf) 135 | for t in regognized: 136 | yield t 137 | 138 | 139 | def cut(sentence,cut_all=False): 140 | if not ( type(sentence) is unicode): 141 | try: 142 | sentence = sentence.decode('utf-8') 143 | except: 144 | sentence = sentence.decode('gbk','ignore') 145 | 146 | blocks = re_han.split(sentence) 147 | cut_block = __cut_DAG 148 | if cut_all: 149 | cut_block = __cut_all 150 | for blk in blocks: 151 | if re_han.match(blk): 152 | #pprint.pprint(__cut_DAG(blk)) 153 | for word in cut_block(blk): 154 | yield word 155 | else: 156 | tmp = re_skip.split(blk) 157 | for x in tmp: 158 | if x!="": 159 | yield x 160 | 161 | def cut_for_search(sentence): 162 | words = cut(sentence) 163 | for w in words: 164 | if len(w)>2: 165 | for i in xrange(len(w)-1): 166 | gram2 = w[i:i+2] 167 | if gram2 in FREQ: 168 | yield gram2 169 | if len(w)>3: 170 | for i in xrange(len(w)-2): 171 | gram3 = w[i:i+3] 172 | if gram3 in FREQ: 173 | yield gram3 174 | yield w 175 | 176 | def load_userdict(f): 177 | global trie,total,FREQ 178 | if isinstance(f, (str, unicode)): 179 | f = open(f, 'rb') 180 | content = f.read().decode('utf-8') 181 | for line in content.split("\n"): 182 | if line.rstrip()=='': continue 183 | word,freq = line.split(" ") 184 | freq = float(freq) 185 | FREQ[word] = freq / total 
--------------------------------------------------------------------------------
/topic/jieba/analyse/__init__.py:
--------------------------------------------------------------------------------
1 | import jieba
2 | import os
3 | 
4 | _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
5 | f_name = os.path.join(_curpath,"idf.txt")
6 | content = open(f_name,'rb').read().decode('utf-8')
7 | 
8 | idf_freq = {}
9 | lines = content.split('\n')
10 | for line in lines:
11 |     word,freq = line.split(' ') # assumes every line of idf.txt is exactly "word freq", with no blank lines
12 |     idf_freq[word] = float(freq)
13 | max_idf = max(idf_freq.values())
14 | 
15 | def extract_tags(sentence,topK=20):
16 |     words = jieba.cut(sentence)
17 |     freq = {}
18 |     for w in words:
19 |         if len(w.strip())<2: continue
20 |         freq[w]=freq.get(w,0.0)+1.0
21 |     total = sum(freq.values())
22 |     freq = [(k,v/total) for k,v in freq.iteritems()] #term frequency
23 | 
24 |     tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq] # words missing from idf.txt get the maximum idf
25 |     st_list = sorted(tf_idf_list,reverse=True)
26 | 
27 |     top_tuples= st_list[:topK]
28 |     tags = [a[1] for a in top_tuples]
29 |     return tags
30 | 
--------------------------------------------------------------------------------
/topic/jieba/finalseg/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | 
4 | def load_model(f_name):
5 |     _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
6 |     prob_p_path = os.path.join(_curpath,f_name)
7 |     return eval(open(prob_p_path,"rb").read())
8 | 
9 | prob_start = load_model("prob_start.py")
10 | prob_trans = load_model("prob_trans.py")
11 | prob_emit = load_model("prob_emit.py")
12 | 
13 | 
14 | 
15 | def viterbi(obs, states, start_p, trans_p, emit_p):
16 |     V = [{}] #tabular
17 |     path = {}
18 |     for y in states: #init
19 |         V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
20 |         path[y] = [y]
21 |     for t in range(1,len(obs)):
22 |         V.append({})
23 |         newpath = {}
24 |         for y in states:
25 |             (prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states ])
26 |             V[t][y] =prob
27 |             newpath[y] = path[state] + [y]
28 |         path = newpath
29 | 
30 |     (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
31 | 
32 |     return (prob, path[state])
33 | 
34 | 
35 | def __cut(sentence):
36 |     prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit)
37 |     begin, next = 0,0
38 |     #print pos_list, sentence
39 |     for i,char in enumerate(sentence):
40 |         pos = pos_list[i]
41 |         if pos=='B':
42 |             begin = i
43 |         elif pos=='E':
44 |             yield sentence[begin:i+1]
45 |             next = i+1
46 |         elif pos=='S':
47 |             yield char
48 |             next = i+1
49 |     if next<len(sentence):
50 |         yield sentence[next:]
--------------------------------------------------------------------------------
/topic/jieba/posseg/__init__.py:
--------------------------------------------------------------------------------
98 |             if len(buf)>0:
99 |                 if len(buf)==1:
100 |                     yield pair(buf,word_tag_tab.get(buf,'x'))
101 |                     buf=u''
102 |                 else:
103 |                     recognized = __cut_detail(buf)
104 |                     for t in recognized:
105 |                         yield t
106 |                     buf=u''
107 |             yield pair(l_word,word_tag_tab.get(l_word,'x'))
108 |         x =y
109 | 
110 |     if len(buf)>0:
111 |         if len(buf)==1:
112 |             yield pair(buf,word_tag_tab.get(buf,'x'))
113 |         else:
114 |             recognized = __cut_detail(buf)
115 |             for t in recognized:
116 |                 yield t
117 | 
118 | 
119 | def cut(sentence):
120 |     if not ( type(sentence) is unicode):
121 |         try:
122 |             sentence = sentence.decode('utf-8')
123 |         except:
124 |             sentence = sentence.decode('gbk','ignore')
125 |     re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
126 | 
re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+") 127 | blocks = re_han.split(sentence) 128 | for blk in blocks: 129 | if re_han.match(blk): 130 | for word in __cut_DAG(blk): 131 | yield word 132 | else: 133 | tmp = re_skip.split(blk) 134 | for x in tmp: 135 | if x!="": 136 | if re_num.match(x): 137 | yield pair(x,'m') 138 | elif re_eng.match(x): 139 | yield pair(x,'eng') 140 | else: 141 | yield pair(x,'x') 142 | -------------------------------------------------------------------------------- /topic/jieba/posseg/prob_start.py: -------------------------------------------------------------------------------- 1 | {('B', 'a'): 0.008545886571090637, 2 | ('B', 'ad'): 0.0012556950477614949, 3 | ('B', 'ag'): 0.0, 4 | ('B', 'an'): 0.0001670724139577068, 5 | ('B', 'b'): 0.006615272009801582, 6 | ('B', 'bg'): 0.0, 7 | ('B', 'c'): 0.03258575057944956, 8 | ('B', 'd'): 0.018778408940230508, 9 | ('B', 'df'): 0.00013790104009207547, 10 | ('B', 'dg'): 0.0, 11 | ('B', 'e'): 0.00019093990166595064, 12 | ('B', 'en'): 0.0, 13 | ('B', 'f'): 0.004121119544290101, 14 | ('B', 'g'): 0.0, 15 | ('B', 'h'): 1.3259715393468796e-06, 16 | ('B', 'i'): 0.0022077426130125543, 17 | ('B', 'in'): 0.0, 18 | ('B', 'j'): 0.006360685474246981, 19 | ('B', 'jn'): 0.0, 20 | ('B', 'k'): 0.0, 21 | ('B', 'l'): 0.007402899104173628, 22 | ('B', 'ln'): 0.0, 23 | ('B', 'm'): 0.02592804748038888, 24 | ('B', 'mg'): 0.0, 25 | ('B', 'mq'): 0.0011284017799841944, 26 | ('B', 'n'): 0.18330097962777328, 27 | ('B', 'ng'): 0.0, 28 | ('B', 'nr'): 0.10741562843095136, 29 | ('B', 'nrfg'): 0.0028123856349547313, 30 | ('B', 'nrt'): 0.006835383285333164, 31 | ('B', 'ns'): 0.05943667425122387, 32 | ('B', 'nt'): 0.007859033313708954, 33 | ('B', 'nz'): 0.0193127754705873, 34 | ('B', 'o'): 0.00021745933245288822, 35 | ('B', 'p'): 0.014980826451541043, 36 | ('B', 'q'): 0.00091359439061, 37 | ('B', 'qe'): 0.0, 38 | ('B', 'qg'): 0.0, 39 | ('B', 'r'): 0.033047188675142274, 40 | ('B', 'rg'): 0.0, 41 | ('B', 'rr'): 3.977914618040638e-06, 42 | ('B', 'rz'): 0.0003540344010056168, 43 | ('B', 's'): 0.0039951522480521475, 44 | ('B', 't'): 0.03457072997385184, 45 | ('B', 'tg'): 0.0, 46 | ('B', 'u'): 0.00010475175160840347, 47 | ('B', 'ud'): 0.0, 48 | ('B', 'ug'): 0.0, 49 | ('B', 'uj'): 0.0, 50 | ('B', 'ul'): 0.0, 51 | ('B', 'uv'): 0.0, 52 | ('B', 'uz'): 0.0, 53 | ('B', 'v'): 0.06897173559066729, 54 | ('B', 'vd'): 0.00011801146700187228, 55 | ('B', 'vg'): 0.0, 56 | ('B', 'vi'): 3.977914618040638e-06, 57 | ('B', 'vn'): 0.01314700781262431, 58 | ('B', 'vq'): 5.303886157387518e-06, 59 | ('B', 'w'): 0.0, 60 | ('B', 'x'): 0.0, 61 | ('B', 'y'): 5.303886157387518e-05, 62 | ('B', 'yg'): 0.0, 63 | ('B', 'z'): 0.0008711633013508998, 64 | ('B', 'zg'): 0.0, 65 | ('E', 'a'): 0.0, 66 | ('E', 'ad'): 0.0, 67 | ('E', 'ag'): 0.0, 68 | ('E', 'an'): 0.0, 69 | ('E', 'b'): 0.0, 70 | ('E', 'bg'): 0.0, 71 | ('E', 'c'): 0.0, 72 | ('E', 'd'): 0.0, 73 | ('E', 'df'): 0.0, 74 | ('E', 'dg'): 0.0, 75 | ('E', 'e'): 0.0, 76 | ('E', 'en'): 0.0, 77 | ('E', 'f'): 0.0, 78 | ('E', 'g'): 0.0, 79 | ('E', 'h'): 0.0, 80 | ('E', 'i'): 0.0, 81 | ('E', 'in'): 0.0, 82 | ('E', 'j'): 0.0, 83 | ('E', 'jn'): 0.0, 84 | ('E', 'k'): 0.0, 85 | ('E', 'l'): 0.0, 86 | ('E', 'ln'): 0.0, 87 | ('E', 'm'): 0.0, 88 | ('E', 'mg'): 0.0, 89 | ('E', 'mq'): 0.0, 90 | ('E', 'n'): 0.0, 91 | ('E', 'ng'): 0.0, 92 | ('E', 'nr'): 0.0, 93 | ('E', 'nrfg'): 0.0, 94 | ('E', 'nrt'): 0.0, 95 | ('E', 'ns'): 0.0, 96 | ('E', 'nt'): 0.0, 97 | ('E', 'nz'): 0.0, 98 | ('E', 'o'): 0.0, 99 | ('E', 'p'): 0.0, 100 | ('E', 'q'): 0.0, 101 | 
('E', 'qe'): 0.0, 102 | ('E', 'qg'): 0.0, 103 | ('E', 'r'): 0.0, 104 | ('E', 'rg'): 0.0, 105 | ('E', 'rr'): 0.0, 106 | ('E', 'rz'): 0.0, 107 | ('E', 's'): 0.0, 108 | ('E', 't'): 0.0, 109 | ('E', 'tg'): 0.0, 110 | ('E', 'u'): 0.0, 111 | ('E', 'ud'): 0.0, 112 | ('E', 'ug'): 0.0, 113 | ('E', 'uj'): 0.0, 114 | ('E', 'ul'): 0.0, 115 | ('E', 'uv'): 0.0, 116 | ('E', 'uz'): 0.0, 117 | ('E', 'v'): 0.0, 118 | ('E', 'vd'): 0.0, 119 | ('E', 'vg'): 0.0, 120 | ('E', 'vi'): 0.0, 121 | ('E', 'vn'): 0.0, 122 | ('E', 'vq'): 0.0, 123 | ('E', 'w'): 0.0, 124 | ('E', 'x'): 0.0, 125 | ('E', 'y'): 0.0, 126 | ('E', 'yg'): 0.0, 127 | ('E', 'z'): 0.0, 128 | ('E', 'zg'): 0.0, 129 | ('M', 'a'): 0.0, 130 | ('M', 'ad'): 0.0, 131 | ('M', 'ag'): 0.0, 132 | ('M', 'an'): 0.0, 133 | ('M', 'b'): 0.0, 134 | ('M', 'bg'): 0.0, 135 | ('M', 'c'): 0.0, 136 | ('M', 'd'): 0.0, 137 | ('M', 'df'): 0.0, 138 | ('M', 'dg'): 0.0, 139 | ('M', 'e'): 0.0, 140 | ('M', 'en'): 0.0, 141 | ('M', 'f'): 0.0, 142 | ('M', 'g'): 0.0, 143 | ('M', 'h'): 0.0, 144 | ('M', 'i'): 0.0, 145 | ('M', 'in'): 0.0, 146 | ('M', 'j'): 0.0, 147 | ('M', 'jn'): 0.0, 148 | ('M', 'k'): 0.0, 149 | ('M', 'l'): 0.0, 150 | ('M', 'ln'): 0.0, 151 | ('M', 'm'): 0.0, 152 | ('M', 'mg'): 0.0, 153 | ('M', 'mq'): 0.0, 154 | ('M', 'n'): 0.0, 155 | ('M', 'ng'): 0.0, 156 | ('M', 'nr'): 0.0, 157 | ('M', 'nrfg'): 0.0, 158 | ('M', 'nrt'): 0.0, 159 | ('M', 'ns'): 0.0, 160 | ('M', 'nt'): 0.0, 161 | ('M', 'nz'): 0.0, 162 | ('M', 'o'): 0.0, 163 | ('M', 'p'): 0.0, 164 | ('M', 'q'): 0.0, 165 | ('M', 'qe'): 0.0, 166 | ('M', 'qg'): 0.0, 167 | ('M', 'r'): 0.0, 168 | ('M', 'rg'): 0.0, 169 | ('M', 'rr'): 0.0, 170 | ('M', 'rz'): 0.0, 171 | ('M', 's'): 0.0, 172 | ('M', 't'): 0.0, 173 | ('M', 'tg'): 0.0, 174 | ('M', 'u'): 0.0, 175 | ('M', 'ud'): 0.0, 176 | ('M', 'ug'): 0.0, 177 | ('M', 'uj'): 0.0, 178 | ('M', 'ul'): 0.0, 179 | ('M', 'uv'): 0.0, 180 | ('M', 'uz'): 0.0, 181 | ('M', 'v'): 0.0, 182 | ('M', 'vd'): 0.0, 183 | ('M', 'vg'): 0.0, 184 | ('M', 'vi'): 0.0, 185 | ('M', 'vn'): 0.0, 186 | ('M', 'vq'): 0.0, 187 | ('M', 'w'): 0.0, 188 | ('M', 'x'): 0.0, 189 | ('M', 'y'): 0.0, 190 | ('M', 'yg'): 0.0, 191 | ('M', 'z'): 0.0, 192 | ('M', 'zg'): 0.0, 193 | ('S', 'a'): 0.020190568629634933, 194 | ('S', 'ad'): 1.5911658472162552e-05, 195 | ('S', 'ag'): 0.0009546995083297532, 196 | ('S', 'an'): 2.651943078693759e-06, 197 | ('S', 'b'): 0.0015447568433391145, 198 | ('S', 'bg'): 0.0, 199 | ('S', 'c'): 0.008337709039413178, 200 | ('S', 'd'): 0.020162723227308648, 201 | ('S', 'df'): 0.0, 202 | ('S', 'dg'): 0.0001299452108559942, 203 | ('S', 'e'): 0.0026254236479068215, 204 | ('S', 'en'): 0.0, 205 | ('S', 'f'): 0.0055452129775486496, 206 | ('S', 'g'): 0.0014917179817652395, 207 | ('S', 'h'): 0.00017502824319378808, 208 | ('S', 'i'): 0.0, 209 | ('S', 'in'): 0.0, 210 | ('S', 'j'): 0.007357816071835834, 211 | ('S', 'jn'): 0.0, 212 | ('S', 'k'): 0.000967959223723222, 213 | ('S', 'l'): 0.0, 214 | ('S', 'ln'): 0.0, 215 | ('S', 'm'): 0.038036819577704585, 216 | ('S', 'mg'): 1.988957309020319e-05, 217 | ('S', 'mq'): 0.0, 218 | ('S', 'n'): 0.021170461597212278, 219 | ('S', 'ng'): 0.007347208299521059, 220 | ('S', 'nr'): 0.011291973629078026, 221 | ('S', 'nrfg'): 0.0, 222 | ('S', 'nrt'): 0.0, 223 | ('S', 'ns'): 0.0, 224 | ('S', 'nt'): 5.303886157387518e-06, 225 | ('S', 'nz'): 0.0, 226 | ('S', 'o'): 0.00021082947475615385, 227 | ('S', 'p'): 0.05044658721445203, 228 | ('S', 'q'): 0.007531518343490275, 229 | ('S', 'qe'): 0.0, 230 | ('S', 'qg'): 0.0, 231 | ('S', 'r'): 0.06306851029749498, 232 | ('S', 'rg'): 3.447526002301887e-05, 
233 | ('S', 'rr'): 0.0,
234 | ('S', 'rz'): 0.0,
235 | ('S', 's'): 0.0,
236 | ('S', 't'): 0.0,
237 | ('S', 'tg'): 0.0018868575004906095,
238 | ('S', 'u'): 0.000967959223723222,
239 | ('S', 'ud'): 0.000440222551063164,
240 | ('S', 'ug'): 0.0005317145872780986,
241 | ('S', 'uj'): 0.001056799316859463,
242 | ('S', 'ul'): 0.00022143724707092888,
243 | ('S', 'uv'): 0.00028640985249892595,
244 | ('S', 'uz'): 9.149203621493468e-05,
245 | ('S', 'v'): 0.04720326082920956,
246 | ('S', 'vd'): 0.0,
247 | ('S', 'vg'): 0.0026240976763674743,
248 | ('S', 'vi'): 0.0,
249 | ('S', 'vn'): 1.0607772314775036e-05,
250 | ('S', 'vq'): 0.0,
251 | ('S', 'w'): 0.0,
252 | ('S', 'x'): 0.0002187853039922351,
253 | ('S', 'y'): 0.00203536631289746,
254 | ('S', 'yg'): 1.3259715393468796e-06,
255 | ('S', 'z'): 0.0,
256 | ('S', 'zg'): 0.0}
257 | 
--------------------------------------------------------------------------------
/topic/jieba/posseg/viterbi.py:
--------------------------------------------------------------------------------
1 | import operator
2 | 
3 | def get_top_states(t_state_v,K=4):
4 |     items = t_state_v.items()
5 |     topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
6 |     return [x[0] for x in topK]
7 | 
8 | def viterbi(obs, states, start_p, trans_p, emit_p):
9 |     V = [{}] #tabular
10 |     mem_path = [{}]
11 |     all_states = trans_p.keys()
12 |     for y in states.get(obs[0],all_states): #init
13 |         V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
14 |         mem_path[0][y] = ''
15 |     for t in range(1,len(obs)):
16 |         V.append({})
17 |         mem_path.append({})
18 |         prev_states = get_top_states(V[t-1]) # beam-pruning attempt; overridden by the next line
19 |         prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ] # states reached at t-1 that have outgoing transitions
20 | 
21 |         prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
22 |         obs_states = states.get(obs[t],all_states)
23 |         obs_states = set(obs_states) & set(prev_states_expect_next)
24 | 
25 |         if len(obs_states)==0: obs_states = all_states
26 |         for y in obs_states:
27 |             (prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
28 |             V[t][y] =prob
29 |             mem_path[t][y] = state
30 | 
31 |     last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
32 |     #if len(last)==0:
33 |         #print obs
34 |     (prob, state) = max(last)
35 | 
36 |     route = [None] * len(obs)
37 |     i = len(obs)-1
38 |     while i>=0:
39 |         route[i] = state
40 |         state = mem_path[i][state]
41 |         i-=1
42 |     return (prob, route)
--------------------------------------------------------------------------------
/topic/nstatus_nkeyword.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackonYang/renren/a692152c6a1eecccc1b097550a3de5916fc95e31/topic/nstatus_nkeyword.png
--------------------------------------------------------------------------------