.+?
(?P[^<]+?)',re.DOTALL)
34 |
35 | res=dict()
36 | for stat in stats:
37 | m=_statprog.search(stat)
38 | if m is None:
39 | print('status parse error.stat={}'.format(stat))
40 | continue
41 | else:
42 | tmpStat=dict()
43 | status_id=m.group('id')
44 | tmpStat['renrenId1'],tmpStat['cur_name'],tmpStat['cur_content']=split_owner(_drop_status_urls(m.group('content')))
45 | tmpStat['orig_owner'],tmpStat['orig_name'],tmpStat['orig_content']=split_owner(_drop_status_urls(m.group('orig')))
46 | tmpStat['timestamp']=m.group('timestamp').strip()
47 | res[status_id]=tmpStat
48 | return res
49 |
50 |
51 | _pf_prog=None
52 | def profile_detail(content):
53 | """keys in record:
54 | edu_college/edu_senior/edu_junior/edu_primary,
55 | birth_year/birth_month/birth_day,
56 | gender,hometown"""
57 | if content is None:
58 | return None
59 | global _pf_prog
60 | if _pf_prog is None:
61 | import re
62 |         _pf_prog=re.compile(r'([^::]+)[::]?\s*[^<]*(.*?)',re.DOTALL)
63 | content=_drop_pf_extra(''.join(content),' ')
64 | #orig tag/value saved in orig_pf
65 | orig_pf=dict()
66 | for m in _pf_prog.finditer(content):
67 | orig_pf[_sub_space(m.group(1),r'')]=m.group(2).strip(' ')
68 | return orig_pf
69 |
70 | _pf_miniprog=None
71 | def profile_mini(content):
72 | #tl or basic mini
73 | if content is None:
74 | return None
75 | global _pf_miniprog
76 | if _pf_miniprog is None:
77 | import re
78 |         _pf_miniprog=re.compile(r'(.*?)',re.DOTALL)
79 | content=_drop_pf_extra(''.join(content),' ')
80 | orig_pf=dict()
81 | for m in _pf_miniprog.finditer(content):
82 | orig_pf[m.group(1)]=m.group(2).strip(' ')
83 | if 'birthday' in orig_pf:
84 | orig_pf['gender'],orig_pf['birthday']=orig_pf.get('birthday',',').replace(',',',').split(',')
85 | return orig_pf
86 |
87 | #---deprecated----profile deep parser-----------------
88 | #birth and gender
89 | _birthprog=None
90 | def _get_birth(content):
91 | if content is None:
92 | return {'birth_year':None,'birth_month':None,'birth_day':None}
93 | global _birthprog
94 | if _birthprog is None:
95 | import re
96 |         _birthprog=re.compile(r'(?:(?P<birth_year>\d+)[年后-])?(?P<birth_month>\d+)[月-](?P<birth_day>\d+)[日]?')
97 | m=_birthprog.search(_drop_pf_extra(content,r''))
98 | if m is None:
99 | return {'birth_year':'9999','birth_month':'99','birth_day':'99'}
100 | return m.groupdict('9999')
101 | def _get_gender(content):
102 | if content is None:
103 | return None
104 | if content.find('男')>-1:
105 | return 'm'
106 | elif content.find('女')>-1:
107 | return 'f'
108 | else:
109 | return 'u'
110 |
111 | #edu info
112 | _edu_highprog=None
113 | def _split_high_edu(content):
114 | global _edu_highprog
115 | if _edu_highprog is None:
116 | import re
117 |         _edu_highprog=re.compile(r'(?P<name>[^-<]+)-\s*(?P<year>\d+)\s*年\s*(?:-(?P<major>[^-<]+))?<br/>')
118 | return _split_edu(_edu_highprog,content)
119 |
120 | _edu_lowprog=None
121 | def _split_low_edu(content,level):
122 | global _edu_lowprog
123 | if _edu_lowprog is None:
124 | import re
125 |         _edu_lowprog=re.compile(r'(?P<name>[^-<]+)(?:-\s*(?P<year>\d+)\s*年)?')
126 | return _split_edu(_edu_lowprog,content,level)
127 |
128 | def _split_edu(prog,content,level=None):
129 | if content is None:
130 | return None
131 | if level is None:
132 | school_default={}
133 | else:
134 | school_default={'level':level}
135 | schools=[]
136 | for m in prog.finditer(_drop_pf_extra(content)):
137 |         school=dict(school_default)  # copy the defaults so each school gets its own dict
138 | for key,value in m.groupdict('').items():
139 | school[key]=value.strip(' ')
140 | schools.append(school)
141 | return schools
142 |
143 | #-----------------profile-----------------------
144 | #drop extra
145 | def _drop_pf_extra(content,target=r' '):
146 | return _sub_space(_drop_span((_drop_link(content))),target)
147 |
148 | _linkprog=None
149 | def _drop_link(content):
150 | if content is None:
151 | return None
152 | global _linkprog
153 | if _linkprog is None:
154 | import re
155 |         _linkprog=re.compile(r'<a[^>]+?>([^<]*?)</a>')
156 | return _linkprog.sub(r'\1',content)
157 |
158 | _spanprog=None
159 | def _drop_span(content):
160 | if content is None:
161 | return None
162 | global _spanprog
163 | if _spanprog is None:
164 | import re
165 |         _spanprog=re.compile(r'<span[^>]*>([^<]*?)</span>')
166 | return _spanprog.sub(r'\1',content)
167 |
168 | _spaceprog=None
169 | _space_likeprog=None
170 | def _sub_space(content,target=r''):
171 | if not isinstance(content,str):
172 | return None
173 | global _spaceprog
174 | global _space_likeprog
175 | if _spaceprog is None:
176 | import re
177 | _space_likeprog=re.compile(r'(?:\\n)|(?:\\t)|(?:\\u3000)|(?:\u3000)|(?: )')
178 | _spaceprog=re.compile(r'\s+')
179 | return _spaceprog.sub(target,_space_likeprog.sub(target,content)).strip(' ')
180 |
181 | #-----------------status------------------
182 |
183 | def _drop_status_urls(content):
184 | if content is None:
185 | return None
186 | else:
187 | return _sub_space(drop_rrurl(drop_img(drop_pf(drop_pubpf(drop_at(content))))),r' ')
188 |
189 | _pfprog=None
190 | def drop_pf(content):
191 | if content is None:
192 | return None
193 | global _pfprog
194 | if _pfprog is None:
195 | import re
196 |         _pfprog=re.compile(r'<a[^>]+?http://www.renren.com/profile.do\?id=(\d+)[^>]+>(.*?)</a>',re.DOTALL)
197 | return _pfprog.sub(r'(\1,\2)',content)
198 |
199 | _pubpfprog=None
200 | def drop_pubpf(content):
201 | if content is None:
202 | return None
203 | global _pubpfprog
204 | if _pubpfprog is None:
205 | import re
206 |         _pubpfprog=re.compile(r'<a[^>]+?http://page.renren.com/(\d+)[^>]+>(.*?)</a>',re.DOTALL)
207 | return _pubpfprog.sub(r'(\1,\2)',str(content))
208 |
209 | _atprog=None
210 | def drop_at(content):
211 | if content is None:
212 |         return None
213 | global _atprog
214 | if _atprog is None:
215 | import re
216 |         _atprog=re.compile(r"<a[^>]+?http://www.renren.com/g/(\d+)[^>]*>(@.*?)</a>",re.DOTALL)
217 | return _atprog.sub(r'\2(\1)',str(content))
218 |
219 | _imgprog=None
220 | def drop_img(content):
221 | if content is None:
222 | return None
223 | global _imgprog
224 | if _imgprog is None:
225 | import re
226 |         _imgprog=re.compile(r"<img[^>]+alt=\'([^>]*?)\'[^>]*?/>",re.DOTALL)
227 | return _imgprog.sub(r'(img\1img)',content)
228 |
229 | _rrurlprog=None
230 | def drop_rrurl(content):
231 | if content is None:
232 | return None
233 | global _rrurlprog
234 | if _rrurlprog is None:
235 | import re
236 |         _rrurlprog=re.compile(r"<a[^>]+title='([^>]+)'>[^<]+</a>",re.DOTALL)
237 | return _rrurlprog.sub(r'(\1)',content)
238 |
239 | def split_owner(content):
240 | if content is None:
241 | return None,None,None
242 | else:
243 | idx=content.replace(':',':').find(':')
244 | idx2=content.find(',')
245 | if (idx < 0) or (idx2 <0):
246 | return None,None,None
247 | return content[:idx2].strip('( '),content[idx2+1:idx].strip(') '),content[idx+1:].strip(' ')
248 |
--------------------------------------------------------------------------------
/repo_file.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8-*-
2 | import pickle
3 | import time
4 |
5 | save_period=240 # second
6 | class repo_file:
7 | def __init__(self, name_pre='test'):
8 | self.data_repo={}
9 | self.name_pre=name_pre
10 | self.last_saved=time.time()
11 |
12 | def __del__(self):
13 | self.save('__del__')
14 |
15 | def load(self, pageStyle):
16 | filename='{}_{}.p'.format(self.name_pre, pageStyle)
17 | try:
18 | with open(filename, 'rb') as f:
19 | self.data_repo[pageStyle]=pickle.load(f)
20 | except IOError:
21 | self.data_repo[pageStyle]=dict()
22 |
23 | def save(self,orig=None):
24 | for pageStyle, record in self.data_repo.items():
25 | filename='{}_{}.p'.format(self.name_pre, pageStyle)
26 | with open(filename, 'wb') as f:
27 | pickle.dump(record, f)
28 | self.last_saved = time.time()
29 | #print('save in {}, called by {}'.format(filename,orig))
30 |
31 | def save_friendList(self, record, rid, run_info=None):
32 | """save record and return rows affected.save nothing if empty.
33 | return None if input error"""
34 | return self._save_process('friendList', record, rid, run_info)
35 |
36 | def save_status(self, record, rid, run_info=None):
37 | """save record and return rows affected.save nothing if empty.
38 | return None if input error"""
39 | return self._save_process('status', record, rid, run_info)
40 |
41 | def save_profile(self, record, rid, run_info=None):
42 | """save profile and return rows affected.return None if input error"""
43 | return self._save_process('profile', record, rid, run_info)
44 |
45 | def _save_process(self, pageStyle, record, rid, run_info):
46 | if not isinstance(record,dict):
47 | return None
48 | if pageStyle not in self.data_repo:
49 | self.load(pageStyle)
50 | self.data_repo[pageStyle][rid]=record
51 | # save to file every n second
52 | global save_period
53 | if time.time() - self.last_saved > save_period:
54 | self.save('auto')
55 | return len(record)
56 |
57 | def getSearched(self, pageStyle):
58 | if pageStyle not in self.data_repo:
59 | self.load(pageStyle)
60 | return set(self.data_repo[pageStyle].keys())
61 |
62 | def getFriendList(self, renrenId):
63 | pageStyle='friendList'
64 | if pageStyle not in self.data_repo:
65 | self.load(pageStyle)
66 | return set(self.data_repo[pageStyle].get(renrenId,{}))
67 |
--------------------------------------------------------------------------------
/repo_mysql.py:
--------------------------------------------------------------------------------
1 | import MySQLdb
2 | from settings import db_connet_info as connect_info
3 |
4 |
5 | def _sql_log_status(rid, login_id, n_record):
6 | return "INSERT INTO stat_log_status (rid, login_id, n_record) VALUES ('%s', '%s', %d)" % (rid, login_id, n_record)
7 |
8 | def _sql_log_fl(rid, login_id, n_record):
9 | return "INSERT INTO stat_log_friends (rid, login_id, n_record) VALUES ('%s', '%s', %d)" % (rid, login_id, n_record)
10 |
11 | def _sql_fl(record, rid):
12 | val_fl = ','.join(["('%s', '%s')" % (rid, item[0]) for item in record if item[0] != rid])
13 | return "INSERT INTO friends (rid1, rid2) VALUES %s" % val_fl
14 |
15 | def _sql_name(record):
16 | val_name = ','.join(["('%s', '%s')" % item for item in record])
17 | return "INSERT INTO profile (rid, name) VALUES %s" % val_name
18 |
19 |
20 | class repo_mysql:
21 |
22 | def __init__(self):
23 | self.conn = MySQLdb.connect(**connect_info)
24 | self.cur = self.conn.cursor()
25 |
26 | def __del__(self):
27 | self.cur.close()
28 | self.conn.close()
29 |
30 | def save_fl(self, login_id, rid, fl_record):
31 | """save record and return rows affected.save nothing if empty.
32 | return None if input error"""
33 |
34 | n_name = 0
35 |
36 | try:
37 | if len(fl_record):
38 | n_fl = self.cur.execute(_sql_fl(fl_record, rid))
39 | n_name = self.cur.executemany("INSERT INTO profile (rid, name) VALUES (%s, %s) ON DUPLICATE KEY UPDATE rid=VALUES(rid)", fl_record)
40 | self.cur.execute(_sql_log_fl(rid, login_id, len(fl_record)))
41 | except Exception as e:
42 | print 'Error ID: %s' % rid
43 | print e
44 | else:
45 | self.conn.commit()
46 |
47 | return n_name
48 |
49 | def save_status(self, login_id, rid, status_record):
50 | """save record and return rows affected.save nothing if empty.
51 | return None if input error"""
52 |
53 | n_saved = 0
54 |
55 | try:
56 | if len(status_record):
57 | n_saved = self.cur.executemany("INSERT INTO status_raw (rid, status_id, content) VALUES (%s, %s, %s) ON DUPLICATE KEY UPDATE rid=VALUES(rid)", [(rid, item[0], item[1]) for item in status_record])
58 | self.cur.execute(_sql_log_status(rid, login_id, len(status_record)))
59 | except Exception as e:
60 | print 'Error ID: %s' % rid
61 | print e
62 | else:
63 | self.conn.commit()
64 |
65 | return n_saved
66 |
67 | def get_fl_searched(self, rid):
68 | self.cur.execute("SELECT rid FROM stat_log_friends where n_record>0 OR login_id=%s" % rid)
69 | return {item[0] for item in self.cur.fetchall()}
70 |
71 | def get_status_searched(self, rid):
72 | self.cur.execute("SELECT rid FROM stat_log_status where n_record>0 OR login_id=%s" % rid)
73 | return {item[0] for item in self.cur.fetchall()}
74 |
75 | def get_fl(self, rid):
76 | self.cur.execute("SELECT rid2 FROM friends where rid1='%s'" % rid)
77 | return {item[0] for item in self.cur.fetchall()}
78 |
79 | def get_status(self, rid):
80 | self.cur.execute("SELECT status_id FROM status_raw where rid='%s'" % rid)
81 | return {item[0] for item in self.cur.fetchall()}
82 |
83 | def _sql_profile(self,record,rid=None):
84 | pageStyle='profile'
85 | if len(record) == 0:
86 | return []
87 | pf_map=get_cfg_dict('profile_map',has_default=False)
88 | pf_ignore=pf_map.pop('ignore').split(',')
89 | #construct sql
90 | pfs="renrenId1='{}'".format(rid)
91 | for k,v in record.items():
92 | if k in pf_map.keys():
93 | pfs += ",{}='{}'".format(pf_map[k],v)
94 | elif k in pf_map.values():
95 | pfs += ",{}='{}'".format(k,v)
96 | elif k in pf_ignore:
97 | #print('ignore {}'.format(k))
98 | pass
99 | else:
100 | self.tag_exceed(rid,k,v)
101 | sql_pf="insert into {} set {}".format(self.table_name[pageStyle],pfs)
102 | return [sql_pf]
103 |
104 | def tag_exceed(self,rid,k,v):
105 | print('pf tag exceed. tag={},renrenId={},value={}'.format(k,rid,v))
106 |
107 | if __name__ == '__main__':
108 | from downloader import renren
109 | test_cookie = raw_input('Input cookie(document.cookie): ')
110 | rr = renren(test_cookie)
111 | rid = rr.renrenId()
112 | target_id = rid
113 | print rid
114 | # record = rr.friendList(target_id)
115 | record = rr.status(target_id)
116 | print '%d got' % len(record)
117 |
118 | repo = repo_mysql()
119 | # print repo.save_fl(rid, target_id, record)
120 | print repo.save_status(rid, target_id, record)
121 | #print 'friends of rid: %s' % len(repo.get_fl(target_id))
122 | print 'status of rid: %s' % len(repo.get_status(target_id))
123 | print 'friends searched: %s' % len(repo.get_fl_searched('233330059'))
124 | print 'friends searched: %s' % len(repo.get_fl_searched('23333005'))
125 | print 'status searched: %s' % len(repo.get_status_searched('233330059'))
126 | print 'status searched: %s' % len(repo.get_status_searched('23333005'))
127 |
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | httplib2
2 | mysql-python
3 |
--------------------------------------------------------------------------------
/settings.py:
--------------------------------------------------------------------------------
1 | account = {'email': 'yyttrr3242342@163.com',
2 | 'password': 'bmeB500bmeB500'
3 | }
4 |
5 | db_connet_info = {
6 | 'user': 'root',
7 | 'passwd': 'Kunth123',
8 | 'db': 'renren',
9 | 'host': '127.0.0.1',
10 | 'port': 3306
11 | }
12 |
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8-*-
2 | import time
3 | import os
4 | import logging
5 | import downloader
6 | import repo_mysql
7 |
8 | def debug_log(rel_path='log/spider'):
9 | path = os.path.join(os.path.dirname(__file__), rel_path)
10 | formatter = logging.Formatter('%(asctime)s|%(levelname)s|%(message)s|%(filename)s-%(lineno)s')
11 |
12 |     log = logging.getLogger('renren.spider')
13 | log.setLevel(logging.INFO)
14 | lvls = ['debug', 'info', 'warn', 'error']
15 |
16 | if not os.path.exists(path):
17 | os.makedirs(path)
18 |
19 | for lvl in lvls:
20 | logfile = os.path.join(path, '{}.log'.format(lvl.lower()))
21 | hdlr = logging.FileHandler(logfile)
22 | hdlr.setLevel(getattr(logging, lvl.upper()))
23 | hdlr.setFormatter(formatter)
24 | log.addHandler(hdlr)
25 | return log
26 |
27 |
28 | pf_sleep=2
29 | class spider:
30 | def __init__(self, cookie):
31 | self.dl = downloader.renren(cookie)
32 | self.repo = repo_mysql.repo_mysql()
33 | self.login_id = self.dl.renrenId()
34 | self.fl_searched = self.repo.get_fl_searched(self.login_id)
35 | self.status_searched = self.repo.get_status_searched(self.login_id)
36 | self.log = debug_log()
37 |
38 | def getNet1(self, orig_id):
39 | if not isinstance(orig_id, str):
40 | print('ERROR! str required. orig_id = %s' % orig_id)
41 | return None
42 | if orig_id not in self.fl_searched:
43 | print('{} download net1 of {}'.format(time.strftime('%H:%M:%S', time.localtime()), orig_id))
44 | record = self.dl.friendList(orig_id)
45 | if record is None:
46 |                 self.log.error('{}, fail to download friend list.'.format(orig_id))
47 | else:
48 | self.repo.save_fl(self.login_id, orig_id, record)
49 | return self.repo.get_fl(orig_id)
50 |
51 | def getNet2(self, orig_id):
52 | n_forbidden = 0
53 | friends = self.getNet1(orig_id)
54 | toSearch = friends - self.fl_searched
55 | print('{} get net2 of {}, toSearch/total: {}/{}'.format(time.strftime('%H:%M:%S',time.localtime()), orig_id, len(toSearch), len(friends)))
56 | for i, rid in zip(range(1, len(toSearch)+1), toSearch):
57 | record = self.dl.friendList(rid)
58 | if record is None:
59 | self.log.error('{}, fail to download friend list.'.format(rid))
60 | else:
61 | saved = self.repo.save_fl(self.login_id, rid, record)
62 | log_text = '{}/{}, newName/friends: {}/{}, friendlist of {}'.format(i, len(toSearch), saved, len(record), rid)
63 | if saved > 0:
64 | self.log.info(log_text)
65 | self.fl_searched.add(rid)
66 | else:
67 | n_forbidden += 1
68 | self.log.error(log_text)
69 | print('{} Done! net2 of {}, forbidden: {}'.format(time.strftime('%H:%M:%S',time.localtime()), orig_id, n_forbidden))
70 | return n_forbidden
71 |
72 | def getStatus_friend(self, orig_id):
73 | n_forbidden = 0
74 | friends = self.getNet1(orig_id)
75 | friends.add(orig_id)
76 | toSearch = friends - self.status_searched
77 |         print('{} {} of {}, toSearch/total: {}/{}'.format(time.strftime('%H:%M:%S', time.localtime()), 'friends\' status', orig_id, len(toSearch), len(friends)))
78 | for i, rid in zip(range(1, len(toSearch)+1), toSearch):
79 | record = self.dl.status(rid)
80 | if record is None:
81 | self.log.error('{}, fail to download status.'.format(rid))
82 | else:
83 | saved = self.repo.save_status(self.login_id, rid, record)
84 | log_text = '{}/{}, saved/download: {}/{}, status of {}'.format(i, len(toSearch), saved, len(record), rid)
85 | if saved > 0:
86 | self.log.info(log_text)
87 | self.status_searched.add(rid)
88 | else:
89 | n_forbidden += 1
90 | self.log.error(log_text)
91 | print('{} Done! friends\' status of {}, forbidden: {}'.format(time.strftime('%H:%M:%S',time.localtime()), orig_id, n_forbidden))
92 | return n_forbidden
93 |
94 | def getProfile_friend(self,orig_id='410941086'):
95 | pageStyle='profile'
96 | if pageStyle not in self.searched:
97 | self.searched[pageStyle]=self.repo.getSearched(pageStyle)
98 | friends=self.getNet1(orig_id)
99 | toSearch=(friends|{orig_id})-self.searched[pageStyle]
100 | print('{} {} of {},toSearch/total:{}/{}'.format(time.strftime('%H:%M:%S',time.localtime()),'friends\' profile',orig_id,len(toSearch),len(friends)+1))
101 | self.seq_process(toSearch,pageStyle)
102 |
103 |
104 | if __name__ == '__main__':
105 | test_cookie = raw_input('Input cookie(document.cookie): ')
106 |
107 | runner = spider(test_cookie)
108 |
109 | # start by login id
110 | #friends = runner.getNet1(runner.login_id)
111 | #for orig_id in friends:
112 | # runner.getNet2(orig_id)
113 |
114 | runner.getStatus_friend(runner.login_id)
115 |
--------------------------------------------------------------------------------
/test_net_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackonYang/renren/a692152c6a1eecccc1b097550a3de5916fc95e31/test_net_graph.png
--------------------------------------------------------------------------------
/test_parse.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8-*-
2 | import unittest
3 | import parse
4 | from browser import browser
5 |
6 | class test_parse(unittest.TestCase):
7 |
8 | def setUp(self):
9 | pass
10 | def tearDown(self):
11 | pass
12 |
13 | def test_friendList(self):
14 | pfHrefs=[
15 | {'王瑛',
16 | 'En.王哲'},
17 | {''},
18 | '~@%……',
19 | {},
20 | {'error'},
21 | None
22 | ]
23 | names=[
24 | {'6754031':'王瑛','331442':'En.王哲'},
25 | {'9439171':''},
26 | {'34134':'~@%……'},
27 | {},
28 | None,
29 | None]
30 | for pfHref,name in zip(pfHrefs,names):
31 | self.assertEquals(parse.friendList(pfHref),name)
32 |
33 | def test_profile_detail(self):
34 | contents={
35 | #some items with no space
36 | """性别:女,\
37 | 大学:北学-2013年-学院
理工大学-2011年-生命学院
,\
38 | 小学:一个小学-1991年青岛二小-2001年"""
39 | :{'性别': '女',
40 | '大学': '北学-2013年-学院
理工大学-2011年-生命学院
',
41 | '小学': '一个小学-1991年青岛二小-2001年'},
42 | #some items with space and \n \t
43 | """ 性 别 : 女 , \
44 | 大 学 : \n\\n\n\\n\n\\n北京中医药大学\n - \n 2013年\n - 东方学院
\
45 | 北京理工大学\t\t - 2011年 - 生命科学与技术学院六院
,\
46 | 小 学 : 一个小学 - 1991年 青岛二小 - 2001年 """
47 | :{'性别': '女',
48 | '大学': '北京中医药大学 - 2013年 - 东方学院
北京理工大学 - 2011年 - 生命科学与技术学院六院
',
49 | '小学': '一个小学 - 1991年 青岛二小 - 2001年'},
50 | #no items or None
51 | """no item""":{},None:None}
52 | for content,expt in contents.items():
53 | if content is not None:
54 | content=content.split(',')
55 | self.assertEquals(parse.profile_detail(content),expt)
56 |
57 | def test_profile_mini(self):
58 | contents={
59 | #full items with space
60 | """"""
65 | :{'school':'就读于西北大学',
66 | 'gender':'男生 ',
67 | 'birthday':'2月13日',
68 | 'hometown':'来自内蒙古 延安市',
69 | 'address':'现居 山南地区'},
70 | #full items with no space
71 | """"""
76 | :{'hometown':'来自内蒙古延安市','school':'就读于西北大学','birthday':'2月13日','gender':'男生','address':'现居山南地区'},
77 | #full items with space. basic
78 | """\
79 | - \n\t\\t\\n男生\t
\t\\t\n\\n\
80 | - \n\t\\n来自\\n\\n\n山东\n\\t\n\\n 烟台市\t\\t\n\\n
\
81 | - \n\\n在\t\\t\t\\tFachhochschule Aachen\t\\t\n\\t读书\\t
"""
82 | :{'gender': '男生', 'school': '在 Fachhochschule Aachen 读书', 'hometown': '来自 山东 烟台市'},
83 | #full items without space
84 | """- \
85 | 男生
\
86 | - 来自山东烟台市
\
87 | - 在Fachhochschule Aachen读书
"""
88 | :{'gender':'男生', 'school':'在Fachhochschule Aachen读书','hometown':'来自山东烟台市'},
89 | #no items or None
90 | """""":{},None:None}
91 | for content,expt in contents.items():
92 | self.assertEquals(parse.profile_mini(content),expt)
93 |
94 | #basic info
95 | def test_get_birth(self):
96 | contents={'80 后 10 月 12 日天秤座':{'birth_day': '12', 'birth_month': '10', 'birth_year': '80'},# xx后 and int(2)
97 | '2012年8月1日狮子座':{'birth_day': '1', 'birth_month': '8', 'birth_year': '2012'},# xx年 and int(1)
98 | ' 3 月 6 日 双鱼座':{'birth_day': '6', 'birth_month': '3', 'birth_year': '9999'},#no age info
99 | '1987年9月1日':{'birth_day': '1', 'birth_month': '9', 'birth_year': '1987'},#no star info
100 | '3 月 29 日':{'birth_day': '29', 'birth_month': '3', 'birth_year': '9999'},#no age or star info
101 | '3-29':{'birth_day': '29', 'birth_month': '3', 'birth_year': '9999'},#no age or star info
102 | '3 - 31':{'birth_day': '31', 'birth_month': '3', 'birth_year': '9999'},#no age or star info
103 | '2011-9-1':{'birth_day': '1', 'birth_month': '9', 'birth_year': '2011'},#no star info
104 | '1993 - 9 - 1':{'birth_day': '1', 'birth_month': '9', 'birth_year': '1993'},#no star info
105 | '9999-99-99':{'birth_day':'99','birth_month':'99','birth_year':'9999'},
106 | '男,':{'birth_day':'99','birth_month':'99','birth_year':'9999'},
107 | '':{'birth_day':'99','birth_month':'99','birth_year':'9999'},
108 | None:{'birth_year':None,'birth_month':None,'birth_day':None}
109 | }
110 | for content,expt in contents.items():
111 | self.assertEquals(parse._get_birth(content),expt)
112 | def test_get_gender(self):
113 | contents={'他是男生':'m','男生':'m','她是女生':'f','女生':'f','女':'f','男':'m','no match':'u',None:None}
114 | for content,expt in contents.items():
115 | self.assertEquals(parse._get_gender(content),expt)
116 |
117 | #edu info
118 | def test_split_high_edu(self):
119 | contents={
120 | # two item, full space
121 | ' Birmingam City - 2011 年 - 其它院系
西北大学 - 2012 年 - 其它院系
'
122 | :[{'major': '其它院系', 'name': 'Birmingam City', 'year': '2011'}, {'major': '其它院系', 'name': '西北大学', 'year': '2012'}],
123 | # two item, no space
124 | 'Birmingam City-2011年-其它院系
西北大学-2012年-其它院系
'
125 | :[{'major': '其它院系', 'name': 'Birmingam City', 'year': '2011'}, {'major': '其它院系', 'name': '西北大学', 'year': '2012'}],
126 | # one item, no space
127 | '西北大学-2010年-物理学系
':[{'major': '物理学系', 'name': '西北大学', 'year': '2010'}],
128 | # English with useful space. can't drop
129 | 'Lincoln University - 1970年
':[{'major': '', 'name': 'Lincoln University', 'year': '1970'}],
130 | 'no match':[],
131 | None:None
132 | }
133 | for content,expt in contents.items():
134 | self.assertEquals(parse._split_high_edu(content),expt)
135 | def test_split_low_edu(self):
136 | contents={
137 | # full space
138 | ' 万州上海中学 - 2009年 万州高级中学 - 2012年 '
139 | :[{'name': '万州上海中学', 'year': '2009'}, {'name': '万州高级中学', 'year': '2012'}],
140 | # no space
141 | '万州上海中学-2004年万州高级中学-2011年'
142 | :[{'name': '万州上海中学', 'year': '2004'}, {'name': '万州高级中学', 'year': '2011'}],
143 | #one item
144 | '三原县南郊中学- 2005年':
145 | [{'name': '三原县南郊中学', 'year': '2005'}],
146 | None:None
147 | }
148 | for content,expt in contents.items():
149 | #self.assertEquals(parse._split_low_edu(content),expt)
150 | print(parse._split_low_edu(content,'p'))
151 |
152 | #drops
153 | def test_sub_space(self):
154 | #replace space, and no effect on other word
155 | contents=['abcdefghijklmnopqrstuvwxyz0123456789 nntt003','\n\\n\t\\t \u3000\\u3000abcdefghijklmnopqrstuvwxyz0123456789 \\n\n\\n\t\\u3000\u3000 nntt003']
156 | expt1='abcdefghijklmnopqrstuvwxyz0123456789 nntt003'
157 | expt2='abcdefghijklmnopqrstuvwxyz0123456789nntt003'
158 | for content in contents:
159 | self.assertEquals(parse._sub_space(content,r' '),expt1)
160 | self.assertEquals(parse._sub_space(content,r''),expt2)
161 | def test_drop_pf_extra(self):
162 | #replace space, and no effect on other word
163 | contents=['abcdefghijklmnopqrstuvwxyz0123456789 nntt003','\n\\n\t\\t \u3000\\u3000abcdefghijklmnopqrstuvwxyz0123456789\\n\n\\n\t\\u3000\u3000 nntt003']
164 | expt1='abcdefghijklmnopqrstuvwxyz0123456789 nntt003'
165 | expt2='abcdefghijklmnopqrstuvwxyz0123456789nntt003'
166 | for content in contents:
167 | self.assertEquals(parse._drop_pf_extra(content,r' '),expt1)
168 | self.assertEquals(parse._drop_pf_extra(content,r''),expt2)
169 |
170 | def test_drop_href(self):
171 | contents={"""生日\n\\n\t\\t :摩羯座陕西 \t\\t\n\\n """:"""生日\n\\n\t\\t : 1994\n\\n\t\\t 年\n\\n\t\\t 摩羯座陕西 \t\\t\n\\n """,#all kinds of elements in and out
172 | """hellobirth""":"""hellobirth""",#no href
173 | """hello""":"""hello""",#start with \n\\n\t\\t男生boy123\n\\n\t\\t \n\\n\t\\t ,2月13日""":"""\n\\n\t\\t\n\\n\t\\t男生boy123\n\\n\t\\t \n\\n\t\\t ,2月13日""",#span with all kinds of items
181 | """\n\\n\t\\t\n\\n\t\\t男生boy123\n\\n\t\\t \n\\n\t\\t ,2月13日""":"""\n\\n\t\\t\n\\n\t\\t男生boy123\n\\n\t\\t \n\\n\t\\t ,2月13日""",#spanclasslink with all kinds of items
182 | """boy男生""":"""boy男生""",#multi
183 | """nospan""":"""nospan""",
184 | None:None
185 | }
186 | for content,expt in contents.items():
187 | self.assertEquals(parse.drop_span(content),expt)
188 |
189 | def test_drop_rrurl(self):
190 | contents={"http://rrurl.cn/pNVUbN ":'(http://lang-8.com/)',
191 | None:None,
192 | 'norrurl':'norrurl'
193 | }
194 | for content,expt in contents.items():
195 | self.assertEquals(parse.drop_rrurl(content),expt)
196 |
197 | def test_split_owner(self):
198 | contents={' (123456,name) : testcase':('123456','name','testcase'),None:(None,None,None),'no ptn':(None,None,None),'32:only':(None,None,None),'asdf,only':(None,None,None)}
199 | for content,expt in contents.items():
200 | self.assertEquals(parse.split_owner(content),expt)
201 |
202 | if __name__=='__main__':
203 | suite=unittest.TestSuite()
204 |
205 | #checked
206 | runner=unittest.TextTestRunner()
207 | runner.run(suite)
208 | suite.addTest(test_parse('test_friendList'))#full test
209 | suite.addTest(test_parse('test_profile_detail'))#full test
210 | suite.addTest(test_parse('test_profile_mini'))#full test
211 | #private method
212 | #suite.addTest(test_parse('test_get_birth'))#full test
213 | #suite.addTest(test_parse('test_get_gender'))#full test
214 | #suite.addTest(test_parse('test_split_high_edu'))#full test
215 | #suite.addTest(test_parse('test_split_low_edu'))#full test
216 | suite.addTest(test_parse('test_sub_space'))#full test
217 | #suite.addTest(test_parse('test_drop_link'))
218 | #suite.addTest(test_parse('test_drop_pf_extra'))
219 | #suite.addTest(test_parse('test_drop_href'))
220 | #suite.addTest(test_parse('test_drop_span'))
221 | #suite.addTest(test_parse('test_drop_rrurl'))
222 | #suite.addTest(test_parse('test_split_owner'))
223 | runner=unittest.TextTestRunner()
224 | runner.run(suite)
225 |
--------------------------------------------------------------------------------
/topic/README.md:
--------------------------------------------------------------------------------
1 | topic analysis
2 | ==============
3 |
4 | #### env
5 |
6 | python2.7
7 |
8 | #### usage
9 |
10 | `python demo.py`
11 |
--------------------------------------------------------------------------------
/topic/demo.py:
--------------------------------------------------------------------------------
1 | #encoding=utf-8
2 | import jieba
3 | import jieba.posseg as pseg
4 |
5 | pymysql=None
6 | def getStatus(rid,table_pre='orig_renren'):
7 | global pymysql
8 | if pymysql is None:
9 | import pymysql
10 | tablename='{}_{}'.format(table_pre,'status')
11 | conn=pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='Kunth123', db='data_bang',charset='utf8')
12 | cur=conn.cursor()
13 | cur.execute("select timestamp,cur_content from {} where renrenId1='{}'".format(tablename,rid))
14 | res={}
15 | for content in cur.fetchall():
16 | res[content[0]]=content[1]
17 | cur.close()
18 | conn.close()
19 | return res
20 |
21 | _ignore=None
22 | def drop_ignore(data):
23 | global _ignore
24 | if _ignore is None:
25 | _ignore={u'要',u'的',u'了',u'有',u'很',u'上',u'不',u'和',u'我',u'给',u'一个',u'在',u'被',u'是',u'就',u'到',u'现在',u'人',u'今天',u'又',u'啊',u'自己',u'这',u'还',u'去',u'也',u'你',u'好',u'可以',u'让',u'说',u'都',u'就是',u'转自',u'img',
26 | # 2000 up
27 | u'谁',u'一',u'吃',u'这么',u'一下',u'什么',u'把',u'再',u'小',u'得',u'大',u'如果',u'手机',u'多',u'我们',u'没',u'那',u'会',u'生活',u'还是',u'大笑',u'没有',u'个',u'明天',u'事',u'知道',u'着',u'过',u'等',u'不是',u'才',u'里',u'真的',u'这个',u'终于',u'比',u'他',u'怎么',u'呢',u'来',u'这是',u'大家',u'看',u'吧',u'下',u'走',u'想',u'中',u'请',u'对',u'已经',u'能',u'同学',u'看到',u'这样',
28 | # more
29 | u'做',u'跟',u'用',u'从',u'找',u'月',u'但是',u'开始',u'然后',u'以后',u'还有',u'貌似',u'不用',u'应该',u'感觉',u'发现',u'需要',u'各种',
30 | # time
31 | u'早晨',u'中午',u'上午',u'下午',u'晚上',u'今天',u'今天下午',u'今天上午',u'每天',u'刚刚',u'突然',u'经常'
32 | }
33 | for k in _ignore & set(data.keys()):
34 | data.pop(k)
35 |
36 | def _fix_little_age(kword):
37 | if (u'小时' in kword) and (u'时候' in kword):
38 | nxs=kword[u'小时']
39 | nsh=kword[u'时候']
40 | if nxs < nsh:
41 | kword.pop(u'小时')
42 | kword[u'小时候']=nxs
43 | kword[u'时候']=nsh-nxs
44 | elif nsh < nxs:
45 | kword.pop(u'时候')
46 | kword[u'小时候']=nsh
47 | kword[u'小时']=nxs-nsh
48 | else:
49 | kword.pop(u'时候')
50 | kword.pop(u'小时')
51 | kword[u'小时候']=nsh
52 |
53 | # extract keyword
54 | def extract_keyword(status):
55 | kword=dict()
56 | for timestamp,status in status.items():
57 | for word in jieba.cut(status,cut_all=False):
58 | # timestamp to be set() to avoid repeat word in the same status
59 | if word in kword:
60 | kword[word].add(timestamp)
61 | else:
62 | kword[word]={timestamp}
63 | _fix_little_age(kword)
64 | drop_ignore(kword)
65 | return kword
66 |
67 | def get_keyword(status):
68 | kword=dict()
69 | for timestamp,content in status.items():
70 | words = pseg.cut(content)
71 | for w in words:
72 | if w.flag in kword:
73 | kword[w.flag].add(w.word)
74 | else:
75 | kword[w.flag]={w.word}
76 | for flag,word in kword.items():
77 | print(u'{}:{}'.format(flag,word))
78 | print(kword.keys())
79 | # timestamp to be set() to avoid repeat word in the same status
80 | #if word in kword:
81 | # kword[word].add(timestamp)
82 | #else:
83 | # kword[word]={timestamp}
84 | #_fix_little_age(kword)
85 | #drop_ignore(kword)
86 | return kword
87 |
88 | def show_all_keyword(friend):
89 | rid='233330059'
90 | res=extract_keyword(getStatus(rid))
91 | for rid in friend:
92 | kword=extract_keyword(getStatus(rid))
93 | res.update(kword)
94 | print(len(res))
95 | sort_freq(res)
96 |
97 | def get_common_keyword(friend):
98 | # init res by someone whose keyword more than bound
99 | rid='233330059'
100 | res=set(extract_keyword(getStatus(rid)).keys())
101 | did=0
102 | undo=0
103 | for i,rid in zip(range(1,len(friend)+1),friend):
104 | kword=extract_keyword(getStatus(rid))
105 | if len(kword)>2000:
106 | did += 1
107 | res &= set(kword.keys())
108 | else:
109 | undo += 1
110 | print(u'{} number of keyword < 2000 {} {}'.format(undo,friend[rid],len(kword)))
111 | print(u"common keyword to add to ignore list: {}".format("',u'".join(res)))
112 | print(len(friend),did,undo)
113 |
114 | def _drop_single_word(kword):
115 | for k in kword.keys():
116 | if len(k) < 2:
117 | kword.pop(k)
118 |
119 | def sort_freq(kword):
120 | # fix
121 | _drop_single_word(kword)
122 | freq=[]
123 | for k,v in kword.items():
124 | if len(v) > 1:
125 | freq.append((len(v),k))
126 | freq.sort()
127 | for k,v in freq:
128 | print(u'{},{}'.format(k,v))
129 |
130 | def show_kword(rid):
131 | status=getStatus(rid)
132 | kword=extract_keyword(status)
133 | sort_freq(kword)
134 |
135 | def nstatus_nkeyword(friend):
136 | data=[]
137 | for i,rid in zip(range(1,len(friend)+1),friend):
138 | status=getStatus(rid)
139 | kword=extract_keyword(status)
140 | data.append((len(status),len(kword)))
141 | return data
142 |
143 | plt=None
144 | def plot_tuple(data):
145 | global plt
146 | if plt is None:
147 | import matplotlib.pyplot as plt
148 | data.sort()
149 | x=[]
150 | y=[]
151 | for a,b in data:
152 | x.append(a)
153 | y.append(b)
154 | fig=plt.figure()
155 | plt.plot(x,y,'o')
156 | plt.grid(True)
157 | plt.show()
158 |
159 | if __name__ == '__main__':
160 | import mytools
161 | friend=mytools.getFriend()
162 | #get_common_keyword(friend)
163 | #show_all_keyword(friend)
164 | # data=nstatus_nkeyword(friend)
165 | # plot_tuple(data)
166 | rid='233330059'
167 | #rid = '232279547'
168 | #show_kword(rid)
169 | status=getStatus(rid)
170 | get_keyword(status)
171 |
--------------------------------------------------------------------------------
/topic/jieba-master.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackonYang/renren/a692152c6a1eecccc1b097550a3de5916fc95e31/topic/jieba-master.zip
--------------------------------------------------------------------------------
/topic/jieba/README.md:
--------------------------------------------------------------------------------
1 | jieba
2 | ========
3 | "结巴"中文分词:做最好的Python中文分词组件
4 | "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
5 | - _Scroll down for English documentation._
6 |
7 | Feature
8 | ========
9 | * 支持三种分词模式:
10 | * 1)精确模式,试图将句子最精确地切开,适合文本分析;
11 | * 2)全模式,把句子中所有的可以成词的词语都扫描出来, 速度非常快,但是不能解决歧义;
12 | * 3) 搜索引擎模式,在精确模式的基础上,对长词再次切分,提高召回率,适合用于搜索引擎分词。
13 |
14 | Python Version
15 | ==============
16 | * 目前master分支是只支持Python2.x 的
17 | * Python3.x 版本的分支也已经基本可用: https://github.com/fxsjy/jieba/tree/jieba3k
18 |
19 | Usage
20 | ========
21 | * 全自动安装:`easy_install jieba` 或者 `pip install jieba`
22 | * 半自动安装:先下载http://pypi.python.org/pypi/jieba/ ,解压后运行python setup.py install
23 | * 手动安装:将jieba目录放置于当前目录或者site-packages目录
24 | * 通过import jieba 来引用 (第一次import时需要构建Trie树,需要几秒时间)
25 |
26 | Algorithm
27 | ========
28 | * 基于Trie树结构实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图(DAG)
29 | * 采用了动态规划查找最大概率路径, 找出基于词频的最大切分组合
30 | * 对于未登录词,采用了基于汉字成词能力的HMM模型,使用了Viterbi算法
31 |
32 | 功能 1):分词
33 | ==========
34 | * `jieba.cut`方法接受两个输入参数: 1) 第一个参数为需要分词的字符串 2)cut_all参数用来控制是否采用全模式
35 | * `jieba.cut_for_search`方法接受一个参数:需要分词的字符串,该方法适合用于搜索引擎构建倒排索引的分词,粒度比较细
36 | * 注意:待分词的字符串可以是gbk字符串、utf-8字符串或者unicode
37 | * `jieba.cut`以及`jieba.cut_for_search`返回的结构都是一个可迭代的generator,可以使用for循环来获得分词后得到的每一个词语(unicode),也可以用list(jieba.cut(...))转化为list
38 |
39 | 代码示例( 分词 )
40 |
41 | #encoding=utf-8
42 | import jieba
43 |
44 | seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
45 | print "Full Mode:", "/ ".join(seg_list) #全模式
46 |
47 | seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
48 | print "Default Mode:", "/ ".join(seg_list) #精确模式
49 |
50 | seg_list = jieba.cut("他来到了网易杭研大厦")
51 | print ", ".join(seg_list)
52 |
53 | seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
54 | print ", ".join(seg_list)
55 |
56 | Output:
57 |
58 | 【全模式】: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
59 |
60 | 【精确模式】: 我/ 来到/ 北京/ 清华大学
61 |
62 | 【新词识别】:他, 来到, 了, 网易, 杭研, 大厦 (此处,“杭研”并没有在词典中,但是也被Viterbi算法识别出来了)
63 |
64 | 【搜索引擎模式】: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在, 日本, 京都, 大学, 日本京都大学, 深造
65 |
66 | 功能 2) :添加自定义词典
67 | ================
68 |
69 | * 开发者可以指定自己自定义的词典,以便包含jieba词库里没有的词。虽然jieba有新词识别能力,但是自行添加新词可以保证更高的正确率
70 | * 用法: jieba.load_userdict(file_name) # file_name为自定义词典的路径
71 | * 词典格式和`analyse/idf.txt`一样,一个词占一行;每一行分为两部分,一部分为词语,另一部分为词频,用空格隔开
72 | * 范例:
73 |
74 | 云计算 5
75 | 李小福 2
76 | 创新办 3
77 |
78 | 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
79 |
80 | 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
81 |
82 | * 代码示例:"通过用户自定义词典来增强歧义纠错能力" --- https://github.com/fxsjy/jieba/issues/14
83 |
84 | 功能 3) :关键词提取
85 | ================
86 | * jieba.analyse.extract_tags(sentence,topK) #需要先import jieba.analyse
87 | * setence为待提取的文本
88 | * topK为返回几个TF/IDF权重最大的关键词,默认值为20
89 |
90 | 代码示例 (关键词提取)
91 |
92 | https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
93 |
94 | 功能 4) : 词性标注
95 | ================
96 | * 标注句子分词后每个词的词性,采用和ictclas兼容的标记法
97 | * 用法示例
98 |
99 | >>> import jieba.posseg as pseg
100 | >>> words =pseg.cut("我爱北京天安门")
101 | >>> for w in words:
102 | ... print w.word,w.flag
103 | ...
104 | 我 r
105 | 爱 v
106 | 北京 ns
107 | 天安门 ns
108 |
109 |
110 |
111 | 分词速度
112 | =========
113 | * 1.5 MB / Second in Full Mode
114 | * 400 KB / Second in Default Mode
115 | * Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt
116 |
117 | 在线演示
118 | =========
119 | http://209.222.69.242:9000/
120 |
121 | 常见问题
122 | =========
123 | 1)模型的数据是如何生成的?https://github.com/fxsjy/jieba/issues/7
124 |
125 | 2)这个库的授权是? https://github.com/fxsjy/jieba/issues/2
126 |
127 | 更多问题请点击:https://github.com/fxsjy/jieba/issues?sort=updated&state=closed
128 |
129 | Change Log
130 | ==========
131 | http://www.oschina.net/p/jieba/news#list
132 |
133 | jieba
134 | ========
135 | "Jieba" (Chinese for "to stutter") Chinese text segmentation: built to be the best Python Chinese word segmentation module.
136 |
137 | Features
138 | ========
139 | * Support three types of segmentation mode:
140 | * 1) Accurate Mode, attempt to cut the sentence into the most accurate segmentation, which is suitable for text analysis;
141 | * 2) Full Mode, scans out all the possible words in the sentence; very fast, but cannot resolve ambiguity
142 | * 3) Search Engine Mode, based on the Accurate Mode, with an attempt to cut the long words into several short words, which can enhance the recall rate
143 |
144 | Usage
145 | ========
146 | * Fully automatic installation: `easy_install jieba` or `pip install jieba`
147 | * Semi-automatic installation: Download http://pypi.python.org/pypi/jieba/ , after extracting run `python setup.py install`
148 | * Manual installation: place the `jieba` directory in the current directory or python site-packages directory.
149 | * Use `import jieba` to import, which will first build the Trie tree only on first import (takes a few seconds).
150 |
151 | Algorithm
152 | ========
153 | * Based on a Trie tree structure, achieves efficient word-graph scanning and builds a directed acyclic graph (DAG) of all possible word combinations of the Chinese characters in a sentence
154 | * Employs dynamic programming to find the most probable path, i.e. the most likely segmentation based on word frequency
155 | * For unknown words, a character-position HMM model is used, with the Viterbi algorithm
156 |
157 | Function 1): cut
158 | ==========
159 | * The `jieba.cut` method accepts two input parameters: 1) the first parameter is the string that requires segmentation, and 2) the second parameter is `cut_all`, a parameter used to control the segmentation pattern.
160 | * `jieba.cut` returned structure is an iterative generator, where you can use a `for` loop to get the word segmentation (in unicode), or `list(jieba.cut( ... ))` to create a list.
161 | * `jieba.cut_for_search` accepts only one parameter: the string that requires segmentation, and it will cut the sentence into short words
162 |
163 | Code example: segmentation
164 | ==========
165 |
166 | #encoding=utf-8
167 | import jieba
168 |
169 | seg_list = jieba.cut("我来到北京清华大学",cut_all=True)
170 | print "Full Mode:", "/ ".join(seg_list) #全模式
171 |
172 | seg_list = jieba.cut("我来到北京清华大学",cut_all=False)
173 | print "Default Mode:", "/ ".join(seg_list) #默认模式
174 |
175 | seg_list = jieba.cut("他来到了网易杭研大厦")
176 | print ", ".join(seg_list)
177 |
178 | seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造") #搜索引擎模式
179 | print ", ".join(seg_list)
180 |
181 | Output:
182 |
183 | [Full Mode]: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
184 |
185 | [Accurate Mode]: 我/ 来到/ 北京/ 清华大学
186 |
187 | [Unknown Words Recognize] 他, 来到, 了, 网易, 杭研, 大厦 (In this case, "杭研" is not in the dictionary, but is identified by the Viterbi algorithm)
188 |
189 | [Search Engine Mode]: 小明, 硕士, 毕业, 于, 中国, 科学, 学院, 科学院, 中国科学院, 计算, 计算所, 后, 在
190 | , 日本, 京都, 大学, 日本京都大学, 深造
191 |
192 |
193 | Function 2): Add a custom dictionary
194 | ==========
195 |
196 | * Developers can specify their own custom dictionary to include in the jieba thesaurus. jieba has the ability to identify new words, but adding your own new words can ensure a higher rate of correct segmentation.
197 | * Usage: `jieba.load_userdict(file_name) # file_name is a custom dictionary path`
198 | * The dictionary format is the same as that of `analyse/idf.txt`: one word per line; each line is divided into two parts, the first is the word itself, the other is the word frequency, separated by a space
199 | * Example:
200 |
201 | 云计算 5
202 | 李小福 2
203 | 创新办 3
204 |
205 | 之前: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
206 |
207 | 加载自定义词库后: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /
208 |
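A minimal usage sketch of `load_userdict` (the file name `userdict.txt` is only an example; the file should contain one "word frequency" pair per line, as described above):

    #encoding=utf-8
    import jieba

    jieba.load_userdict("userdict.txt")  # merge the custom entries into the dictionary in memory
    seg_list = jieba.cut("李小福是创新办主任也是云计算方面的专家")
    print "/ ".join(seg_list)  # 创新办 and 云计算 now come out as single words
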
209 | Function 3): Keyword Extraction
210 | ================
211 | * `jieba.analyse.extract_tags(sentence,topK) # needs to first import jieba.analyse`
212 | * `sentence`: the text to extract keywords from
213 | * `topK`: the number of keywords with the largest TF-IDF weights to return; the default value is 20
214 |
215 | Code sample (keyword extraction)
216 |
217 | https://github.com/fxsjy/jieba/blob/master/test/extract_tags.py
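
For reference, a minimal sketch of how `extract_tags` is typically called (the input file name below is hypothetical, not taken from the linked script):

    #encoding=utf-8
    import jieba.analyse

    content = open("some_text.txt", "rb").read()  # hypothetical utf-8 encoded input file
    tags = jieba.analyse.extract_tags(content, topK=10)  # 10 keywords with the highest TF-IDF weight
    print ",".join(tags)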
218 |
219 |
220 | Segmentation speed
221 | =========
222 | * 1.5 MB / Second in Full Mode
223 | * 400 KB / Second in Default Mode
224 | * Test Env: Intel(R) Core(TM) i7-2600 CPU @ 3.4GHz;《围城》.txt
225 |
226 | Online demo
227 | =========
228 | http://209.222.69.242:9000/
229 |
--------------------------------------------------------------------------------
/topic/jieba/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | import math
3 | import os,sys
4 | import pprint
5 | import finalseg
6 | import time
7 | import tempfile
8 | import marshal
9 |
10 | FREQ = {}
11 | total =0.0
12 | re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
13 |
14 | def gen_trie(f_name):
15 | lfreq = {}
16 | trie = {}
17 | ltotal = 0.0
18 | content = open(f_name,'rb').read().decode('utf-8')
19 | for line in content.split("\n"):
20 | word,freq,_ = line.split(" ")
21 | freq = float(freq)
22 | lfreq[word] = freq
23 | ltotal+=freq
24 | p = trie
25 | for c in word:
26 | if not c in p:
27 | p[c] ={}
28 | p = p[c]
29 | p['']='' #ending flag
30 | return trie, lfreq,ltotal
31 |
32 |
33 | _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
34 |
35 | print >> sys.stderr, "Building Trie..."
36 | t1 = time.time()
37 | cache_file = os.path.join(tempfile.gettempdir(),"jieba.cache")
38 | load_from_cache_fail = True
39 | if os.path.exists(cache_file) and os.path.getmtime(cache_file)>os.path.getmtime(os.path.join(_curpath,"dict.txt")):
40 | print >> sys.stderr, "loading model from cache"
41 | try:
42 | trie,FREQ,total,min_freq = marshal.load(open(cache_file,'rb'))
43 | load_from_cache_fail = False
44 | except:
45 | load_from_cache_fail = True
46 |
47 | if load_from_cache_fail:
48 | trie,FREQ,total = gen_trie(os.path.join(_curpath,"dict.txt"))
49 | FREQ = dict([(k,float(v)/total) for k,v in FREQ.iteritems()]) #normalize
50 | min_freq = min(FREQ.itervalues())
51 | print >> sys.stderr, "dumping model to file cache"
52 | marshal.dump((trie,FREQ,total,min_freq),open(cache_file,'wb'))
53 |
54 | print >> sys.stderr, "loading model cost ", time.time() - t1, "seconds."
55 | print >> sys.stderr, "Trie has been built successfully."
56 |
57 |
58 | def __cut_all(sentence):
59 | dag = get_DAG(sentence)
60 | old_j = -1
61 | for k,L in dag.iteritems():
62 | if len(L)==1 and k>old_j:
63 | yield sentence[k:L[0]+1]
64 | old_j = L[0]
65 | else:
66 | for j in L:
67 | if j>k:
68 | yield sentence[k:j+1]
69 | old_j = j
70 |
71 | def calc(sentence,DAG,idx,route):
72 | N = len(sentence)
73 | route[N] = (1.0,'')
74 | for idx in xrange(N-1,-1,-1):
75 | candidates = [ ( FREQ.get(sentence[idx:x+1],min_freq) * route[x+1][0],x ) for x in DAG[idx] ]
76 | route[idx] = max(candidates)
77 |
78 | def get_DAG(sentence):
79 | N = len(sentence)
80 | i,j=0,0
81 | p = trie
82 | DAG = {}
83 |     while i<N:
84 |         c = sentence[j]
85 |         if c in p:
86 |             p = p[c]
87 |             if '' in p: #word in dict
88 |                 if not i in DAG:
89 |                     DAG[i]=[]
90 |                 DAG[i].append(j)
91 |             j+=1
92 |             if j>=N:
93 |                 i+=1
94 |                 j=i
95 |                 p=trie
96 |         else:
97 |             p = trie
98 |             i+=1
99 |             j=i
100 | for i in xrange(len(sentence)):
101 | if not i in DAG:
102 | DAG[i] =[i]
103 | return DAG
104 |
105 | def __cut_DAG(sentence):
106 | DAG = get_DAG(sentence)
107 | route ={}
108 | calc(sentence,DAG,0,route=route)
109 | x = 0
110 | buf =u''
111 | N = len(sentence)
112 |     while x<N:
113 |         y = route[x][1]+1
114 |         l_word = sentence[x:y]
115 |         if y-x==1:
116 |             buf+= l_word
117 |         else:
118 |             if len(buf)>0:
119 |                 if len(buf)==1:
120 |                     yield buf
121 |                     buf=u''
122 |                 else:
123 |                     regognized = finalseg.cut(buf)
124 |                     for t in regognized:
125 |                         yield t
126 |                     buf=u''
127 |             yield l_word
128 |         x =y
129 |
130 | if len(buf)>0:
131 | if len(buf)==1:
132 | yield buf
133 | else:
134 | regognized = finalseg.cut(buf)
135 | for t in regognized:
136 | yield t
137 |
138 |
139 | def cut(sentence,cut_all=False):
140 | if not ( type(sentence) is unicode):
141 | try:
142 | sentence = sentence.decode('utf-8')
143 | except:
144 | sentence = sentence.decode('gbk','ignore')
145 |
146 | blocks = re_han.split(sentence)
147 | cut_block = __cut_DAG
148 | if cut_all:
149 | cut_block = __cut_all
150 | for blk in blocks:
151 | if re_han.match(blk):
152 | #pprint.pprint(__cut_DAG(blk))
153 | for word in cut_block(blk):
154 | yield word
155 | else:
156 | tmp = re_skip.split(blk)
157 | for x in tmp:
158 | if x!="":
159 | yield x
160 |
161 | def cut_for_search(sentence):
162 | words = cut(sentence)
163 | for w in words:
164 | if len(w)>2:
165 | for i in xrange(len(w)-1):
166 | gram2 = w[i:i+2]
167 | if gram2 in FREQ:
168 | yield gram2
169 | if len(w)>3:
170 | for i in xrange(len(w)-2):
171 | gram3 = w[i:i+3]
172 | if gram3 in FREQ:
173 | yield gram3
174 | yield w
175 |
176 | def load_userdict(f):
177 | global trie,total,FREQ
178 | if isinstance(f, (str, unicode)):
179 | f = open(f, 'rb')
180 | content = f.read().decode('utf-8')
181 | for line in content.split("\n"):
182 | if line.rstrip()=='': continue
183 | word,freq = line.split(" ")
184 | freq = float(freq)
185 | FREQ[word] = freq / total
186 | p = trie
187 | for c in word:
188 | if not c in p:
189 | p[c] ={}
190 | p = p[c]
191 | p['']='' #ending flag
192 |
--------------------------------------------------------------------------------
/topic/jieba/analyse/__init__.py:
--------------------------------------------------------------------------------
1 | import jieba
2 | import os
3 |
4 | _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
5 | f_name = os.path.join(_curpath,"idf.txt")
6 | content = open(f_name,'rb').read().decode('utf-8')
7 |
8 | idf_freq = {}
9 | lines = content.split('\n')
10 | for line in lines:
11 | word,freq = line.split(' ')
12 | idf_freq[word] = float(freq)
13 | max_idf = max(idf_freq.values())
14 |
15 | def extract_tags(sentence,topK=20):
16 | words = jieba.cut(sentence)
17 | freq = {}
18 | for w in words:
19 | if len(w.strip())<2: continue
20 | freq[w]=freq.get(w,0.0)+1.0
21 | total = sum(freq.values())
22 | freq = [(k,v/total) for k,v in freq.iteritems()]
23 |
24 | tf_idf_list = [(v * idf_freq.get(k,max_idf),k) for k,v in freq]
25 | st_list = sorted(tf_idf_list,reverse=True)
26 |
27 | top_tuples= st_list[:topK]
28 | tags = [a[1] for a in top_tuples]
29 | return tags
30 |
31 |
--------------------------------------------------------------------------------
/topic/jieba/finalseg/__init__.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 |
4 | def load_model(f_name):
5 | _curpath=os.path.normpath( os.path.join( os.getcwd(), os.path.dirname(__file__) ) )
6 | prob_p_path = os.path.join(_curpath,f_name)
7 | return eval(open(prob_p_path,"rb").read())
8 |
9 | prob_start = load_model("prob_start.py")
10 | prob_trans = load_model("prob_trans.py")
11 | prob_emit = load_model("prob_emit.py")
12 |
13 |
14 |
15 | def viterbi(obs, states, start_p, trans_p, emit_p):
16 | V = [{}] #tabular
17 | path = {}
18 | for y in states: #init
19 | V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
20 | path[y] = [y]
21 | for t in range(1,len(obs)):
22 | V.append({})
23 | newpath = {}
24 | for y in states:
25 | (prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in states ])
26 | V[t][y] =prob
27 | newpath[y] = path[state] + [y]
28 | path = newpath
29 |
30 | (prob, state) = max([(V[len(obs) - 1][y], y) for y in ('E','S')])
31 |
32 | return (prob, path[state])
33 |
34 |
35 | def __cut(sentence):
36 | prob, pos_list = viterbi(sentence,('B','M','E','S'), prob_start, prob_trans, prob_emit)
37 | begin, next = 0,0
38 | #print pos_list, sentence
39 | for i,char in enumerate(sentence):
40 | pos = pos_list[i]
41 | if pos=='B':
42 | begin = i
43 | elif pos=='E':
44 | yield sentence[begin:i+1]
45 | next = i+1
46 | elif pos=='S':
47 | yield char
48 | next = i+1
49 | if next0:
99 | if len(buf)==1:
100 | yield pair(buf,word_tag_tab.get(buf,'x'))
101 | buf=u''
102 | else:
103 | regognized = __cut_detail(buf)
104 | for t in regognized:
105 | yield t
106 | buf=u''
107 | yield pair(l_word,word_tag_tab.get(l_word,'x'))
108 | x =y
109 |
110 | if len(buf)>0:
111 | if len(buf)==1:
112 | yield pair(buf,word_tag_tab.get(buf,'x'))
113 | else:
114 | regognized = __cut_detail(buf)
115 | for t in regognized:
116 | yield t
117 |
118 |
119 | def cut(sentence):
120 | if not ( type(sentence) is unicode):
121 | try:
122 | sentence = sentence.decode('utf-8')
123 | except:
124 | sentence = sentence.decode('gbk','ignore')
125 | re_han, re_skip = re.compile(ur"([\u4E00-\u9FA5a-zA-Z0-9+#]+)"), re.compile(ur"[^\r\n]")
126 | re_eng,re_num = re.compile(ur"[a-zA-Z+#]+"), re.compile(ur"[0-9]+")
127 | blocks = re_han.split(sentence)
128 | for blk in blocks:
129 | if re_han.match(blk):
130 | for word in __cut_DAG(blk):
131 | yield word
132 | else:
133 | tmp = re_skip.split(blk)
134 | for x in tmp:
135 | if x!="":
136 | if re_num.match(x):
137 | yield pair(x,'m')
138 | elif re_eng.match(x):
139 | yield pair(x,'eng')
140 | else:
141 | yield pair(x,'x')
142 |
--------------------------------------------------------------------------------
/topic/jieba/posseg/prob_start.py:
--------------------------------------------------------------------------------
1 | {('B', 'a'): 0.008545886571090637,
2 | ('B', 'ad'): 0.0012556950477614949,
3 | ('B', 'ag'): 0.0,
4 | ('B', 'an'): 0.0001670724139577068,
5 | ('B', 'b'): 0.006615272009801582,
6 | ('B', 'bg'): 0.0,
7 | ('B', 'c'): 0.03258575057944956,
8 | ('B', 'd'): 0.018778408940230508,
9 | ('B', 'df'): 0.00013790104009207547,
10 | ('B', 'dg'): 0.0,
11 | ('B', 'e'): 0.00019093990166595064,
12 | ('B', 'en'): 0.0,
13 | ('B', 'f'): 0.004121119544290101,
14 | ('B', 'g'): 0.0,
15 | ('B', 'h'): 1.3259715393468796e-06,
16 | ('B', 'i'): 0.0022077426130125543,
17 | ('B', 'in'): 0.0,
18 | ('B', 'j'): 0.006360685474246981,
19 | ('B', 'jn'): 0.0,
20 | ('B', 'k'): 0.0,
21 | ('B', 'l'): 0.007402899104173628,
22 | ('B', 'ln'): 0.0,
23 | ('B', 'm'): 0.02592804748038888,
24 | ('B', 'mg'): 0.0,
25 | ('B', 'mq'): 0.0011284017799841944,
26 | ('B', 'n'): 0.18330097962777328,
27 | ('B', 'ng'): 0.0,
28 | ('B', 'nr'): 0.10741562843095136,
29 | ('B', 'nrfg'): 0.0028123856349547313,
30 | ('B', 'nrt'): 0.006835383285333164,
31 | ('B', 'ns'): 0.05943667425122387,
32 | ('B', 'nt'): 0.007859033313708954,
33 | ('B', 'nz'): 0.0193127754705873,
34 | ('B', 'o'): 0.00021745933245288822,
35 | ('B', 'p'): 0.014980826451541043,
36 | ('B', 'q'): 0.00091359439061,
37 | ('B', 'qe'): 0.0,
38 | ('B', 'qg'): 0.0,
39 | ('B', 'r'): 0.033047188675142274,
40 | ('B', 'rg'): 0.0,
41 | ('B', 'rr'): 3.977914618040638e-06,
42 | ('B', 'rz'): 0.0003540344010056168,
43 | ('B', 's'): 0.0039951522480521475,
44 | ('B', 't'): 0.03457072997385184,
45 | ('B', 'tg'): 0.0,
46 | ('B', 'u'): 0.00010475175160840347,
47 | ('B', 'ud'): 0.0,
48 | ('B', 'ug'): 0.0,
49 | ('B', 'uj'): 0.0,
50 | ('B', 'ul'): 0.0,
51 | ('B', 'uv'): 0.0,
52 | ('B', 'uz'): 0.0,
53 | ('B', 'v'): 0.06897173559066729,
54 | ('B', 'vd'): 0.00011801146700187228,
55 | ('B', 'vg'): 0.0,
56 | ('B', 'vi'): 3.977914618040638e-06,
57 | ('B', 'vn'): 0.01314700781262431,
58 | ('B', 'vq'): 5.303886157387518e-06,
59 | ('B', 'w'): 0.0,
60 | ('B', 'x'): 0.0,
61 | ('B', 'y'): 5.303886157387518e-05,
62 | ('B', 'yg'): 0.0,
63 | ('B', 'z'): 0.0008711633013508998,
64 | ('B', 'zg'): 0.0,
65 | ('E', 'a'): 0.0,
66 | ('E', 'ad'): 0.0,
67 | ('E', 'ag'): 0.0,
68 | ('E', 'an'): 0.0,
69 | ('E', 'b'): 0.0,
70 | ('E', 'bg'): 0.0,
71 | ('E', 'c'): 0.0,
72 | ('E', 'd'): 0.0,
73 | ('E', 'df'): 0.0,
74 | ('E', 'dg'): 0.0,
75 | ('E', 'e'): 0.0,
76 | ('E', 'en'): 0.0,
77 | ('E', 'f'): 0.0,
78 | ('E', 'g'): 0.0,
79 | ('E', 'h'): 0.0,
80 | ('E', 'i'): 0.0,
81 | ('E', 'in'): 0.0,
82 | ('E', 'j'): 0.0,
83 | ('E', 'jn'): 0.0,
84 | ('E', 'k'): 0.0,
85 | ('E', 'l'): 0.0,
86 | ('E', 'ln'): 0.0,
87 | ('E', 'm'): 0.0,
88 | ('E', 'mg'): 0.0,
89 | ('E', 'mq'): 0.0,
90 | ('E', 'n'): 0.0,
91 | ('E', 'ng'): 0.0,
92 | ('E', 'nr'): 0.0,
93 | ('E', 'nrfg'): 0.0,
94 | ('E', 'nrt'): 0.0,
95 | ('E', 'ns'): 0.0,
96 | ('E', 'nt'): 0.0,
97 | ('E', 'nz'): 0.0,
98 | ('E', 'o'): 0.0,
99 | ('E', 'p'): 0.0,
100 | ('E', 'q'): 0.0,
101 | ('E', 'qe'): 0.0,
102 | ('E', 'qg'): 0.0,
103 | ('E', 'r'): 0.0,
104 | ('E', 'rg'): 0.0,
105 | ('E', 'rr'): 0.0,
106 | ('E', 'rz'): 0.0,
107 | ('E', 's'): 0.0,
108 | ('E', 't'): 0.0,
109 | ('E', 'tg'): 0.0,
110 | ('E', 'u'): 0.0,
111 | ('E', 'ud'): 0.0,
112 | ('E', 'ug'): 0.0,
113 | ('E', 'uj'): 0.0,
114 | ('E', 'ul'): 0.0,
115 | ('E', 'uv'): 0.0,
116 | ('E', 'uz'): 0.0,
117 | ('E', 'v'): 0.0,
118 | ('E', 'vd'): 0.0,
119 | ('E', 'vg'): 0.0,
120 | ('E', 'vi'): 0.0,
121 | ('E', 'vn'): 0.0,
122 | ('E', 'vq'): 0.0,
123 | ('E', 'w'): 0.0,
124 | ('E', 'x'): 0.0,
125 | ('E', 'y'): 0.0,
126 | ('E', 'yg'): 0.0,
127 | ('E', 'z'): 0.0,
128 | ('E', 'zg'): 0.0,
129 | ('M', 'a'): 0.0,
130 | ('M', 'ad'): 0.0,
131 | ('M', 'ag'): 0.0,
132 | ('M', 'an'): 0.0,
133 | ('M', 'b'): 0.0,
134 | ('M', 'bg'): 0.0,
135 | ('M', 'c'): 0.0,
136 | ('M', 'd'): 0.0,
137 | ('M', 'df'): 0.0,
138 | ('M', 'dg'): 0.0,
139 | ('M', 'e'): 0.0,
140 | ('M', 'en'): 0.0,
141 | ('M', 'f'): 0.0,
142 | ('M', 'g'): 0.0,
143 | ('M', 'h'): 0.0,
144 | ('M', 'i'): 0.0,
145 | ('M', 'in'): 0.0,
146 | ('M', 'j'): 0.0,
147 | ('M', 'jn'): 0.0,
148 | ('M', 'k'): 0.0,
149 | ('M', 'l'): 0.0,
150 | ('M', 'ln'): 0.0,
151 | ('M', 'm'): 0.0,
152 | ('M', 'mg'): 0.0,
153 | ('M', 'mq'): 0.0,
154 | ('M', 'n'): 0.0,
155 | ('M', 'ng'): 0.0,
156 | ('M', 'nr'): 0.0,
157 | ('M', 'nrfg'): 0.0,
158 | ('M', 'nrt'): 0.0,
159 | ('M', 'ns'): 0.0,
160 | ('M', 'nt'): 0.0,
161 | ('M', 'nz'): 0.0,
162 | ('M', 'o'): 0.0,
163 | ('M', 'p'): 0.0,
164 | ('M', 'q'): 0.0,
165 | ('M', 'qe'): 0.0,
166 | ('M', 'qg'): 0.0,
167 | ('M', 'r'): 0.0,
168 | ('M', 'rg'): 0.0,
169 | ('M', 'rr'): 0.0,
170 | ('M', 'rz'): 0.0,
171 | ('M', 's'): 0.0,
172 | ('M', 't'): 0.0,
173 | ('M', 'tg'): 0.0,
174 | ('M', 'u'): 0.0,
175 | ('M', 'ud'): 0.0,
176 | ('M', 'ug'): 0.0,
177 | ('M', 'uj'): 0.0,
178 | ('M', 'ul'): 0.0,
179 | ('M', 'uv'): 0.0,
180 | ('M', 'uz'): 0.0,
181 | ('M', 'v'): 0.0,
182 | ('M', 'vd'): 0.0,
183 | ('M', 'vg'): 0.0,
184 | ('M', 'vi'): 0.0,
185 | ('M', 'vn'): 0.0,
186 | ('M', 'vq'): 0.0,
187 | ('M', 'w'): 0.0,
188 | ('M', 'x'): 0.0,
189 | ('M', 'y'): 0.0,
190 | ('M', 'yg'): 0.0,
191 | ('M', 'z'): 0.0,
192 | ('M', 'zg'): 0.0,
193 | ('S', 'a'): 0.020190568629634933,
194 | ('S', 'ad'): 1.5911658472162552e-05,
195 | ('S', 'ag'): 0.0009546995083297532,
196 | ('S', 'an'): 2.651943078693759e-06,
197 | ('S', 'b'): 0.0015447568433391145,
198 | ('S', 'bg'): 0.0,
199 | ('S', 'c'): 0.008337709039413178,
200 | ('S', 'd'): 0.020162723227308648,
201 | ('S', 'df'): 0.0,
202 | ('S', 'dg'): 0.0001299452108559942,
203 | ('S', 'e'): 0.0026254236479068215,
204 | ('S', 'en'): 0.0,
205 | ('S', 'f'): 0.0055452129775486496,
206 | ('S', 'g'): 0.0014917179817652395,
207 | ('S', 'h'): 0.00017502824319378808,
208 | ('S', 'i'): 0.0,
209 | ('S', 'in'): 0.0,
210 | ('S', 'j'): 0.007357816071835834,
211 | ('S', 'jn'): 0.0,
212 | ('S', 'k'): 0.000967959223723222,
213 | ('S', 'l'): 0.0,
214 | ('S', 'ln'): 0.0,
215 | ('S', 'm'): 0.038036819577704585,
216 | ('S', 'mg'): 1.988957309020319e-05,
217 | ('S', 'mq'): 0.0,
218 | ('S', 'n'): 0.021170461597212278,
219 | ('S', 'ng'): 0.007347208299521059,
220 | ('S', 'nr'): 0.011291973629078026,
221 | ('S', 'nrfg'): 0.0,
222 | ('S', 'nrt'): 0.0,
223 | ('S', 'ns'): 0.0,
224 | ('S', 'nt'): 5.303886157387518e-06,
225 | ('S', 'nz'): 0.0,
226 | ('S', 'o'): 0.00021082947475615385,
227 | ('S', 'p'): 0.05044658721445203,
228 | ('S', 'q'): 0.007531518343490275,
229 | ('S', 'qe'): 0.0,
230 | ('S', 'qg'): 0.0,
231 | ('S', 'r'): 0.06306851029749498,
232 | ('S', 'rg'): 3.447526002301887e-05,
233 | ('S', 'rr'): 0.0,
234 | ('S', 'rz'): 0.0,
235 | ('S', 's'): 0.0,
236 | ('S', 't'): 0.0,
237 | ('S', 'tg'): 0.0018868575004906095,
238 | ('S', 'u'): 0.000967959223723222,
239 | ('S', 'ud'): 0.000440222551063164,
240 | ('S', 'ug'): 0.0005317145872780986,
241 | ('S', 'uj'): 0.001056799316859463,
242 | ('S', 'ul'): 0.00022143724707092888,
243 | ('S', 'uv'): 0.00028640985249892595,
244 | ('S', 'uz'): 9.149203621493468e-05,
245 | ('S', 'v'): 0.04720326082920956,
246 | ('S', 'vd'): 0.0,
247 | ('S', 'vg'): 0.0026240976763674743,
248 | ('S', 'vi'): 0.0,
249 | ('S', 'vn'): 1.0607772314775036e-05,
250 | ('S', 'vq'): 0.0,
251 | ('S', 'w'): 0.0,
252 | ('S', 'x'): 0.0002187853039922351,
253 | ('S', 'y'): 0.00203536631289746,
254 | ('S', 'yg'): 1.3259715393468796e-06,
255 | ('S', 'z'): 0.0,
256 | ('S', 'zg'): 0.0}
257 |
--------------------------------------------------------------------------------
/topic/jieba/posseg/viterbi.py:
--------------------------------------------------------------------------------
1 | import operator
2 |
3 | def get_top_states(t_state_v,K=4):
4 | items = t_state_v.items()
5 | topK= sorted(items,key=operator.itemgetter(1),reverse=True)[:K]
6 | return [x[0] for x in topK]
7 |
8 | def viterbi(obs, states, start_p, trans_p, emit_p):
9 | V = [{}] #tabular
10 | mem_path = [{}]
11 | all_states = trans_p.keys()
12 | for y in states.get(obs[0],all_states): #init
13 | V[0][y] = start_p[y] * emit_p[y].get(obs[0],0)
14 | mem_path[0][y] = ''
15 | for t in range(1,len(obs)):
16 | V.append({})
17 | mem_path.append({})
18 | prev_states = get_top_states(V[t-1])
19 | prev_states =[ x for x in mem_path[t-1].keys() if len(trans_p[x])>0 ]
20 |
21 | prev_states_expect_next = set( (y for x in prev_states for y in trans_p[x].keys() ) )
22 | obs_states = states.get(obs[t],all_states)
23 | obs_states = set(obs_states) & set(prev_states_expect_next)
24 |
25 | if len(obs_states)==0: obs_states = all_states
26 | for y in obs_states:
27 | (prob,state ) = max([(V[t-1][y0] * trans_p[y0].get(y,0) * emit_p[y].get(obs[t],0) ,y0) for y0 in prev_states])
28 | V[t][y] =prob
29 | mem_path[t][y] = state
30 |
31 | last = [(V[-1][y], y) for y in mem_path[-1].keys() ]
32 | #if len(last)==0:
33 | #print obs
34 | (prob, state) = max(last)
35 |
36 | route = [None] * len(obs)
37 | i = len(obs)-1
38 | while i>=0:
39 | route[i] = state
40 | state = mem_path[i][state]
41 | i-=1
42 | return (prob, route)
--------------------------------------------------------------------------------
/topic/nstatus_nkeyword.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JackonYang/renren/a692152c6a1eecccc1b097550a3de5916fc95e31/topic/nstatus_nkeyword.png
--------------------------------------------------------------------------------