├── .idea
│   ├── W2VTextRank4ZH.iml
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── FastTextRank
│   ├── FastTextRank4Sentence.py
│   ├── FastTextRank4Word.py
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── FastTextRank4Sentence.cpython-36.pyc
│   │   ├── FastTextRank4Word.cpython-36.pyc
│   │   ├── W2VTextRank4Sentence.cpython-36.pyc
│   │   ├── W2VTextRank4Word.cpython-36.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   └── util.cpython-36.pyc
│   ├── stopwords.txt
│   ├── test
│   │   ├── KeyWord.py
│   │   ├── Sentence.py
│   │   ├── __init__.py
│   │   └── text1.txt
│   └── util.py
└── README.md
/FastTextRank/FastTextRank4Sentence.py:
--------------------------------------------------------------------------------
#-*- encoding:utf-8 -*-
from heapq import nlargest
from itertools import product, count
from gensim.models import Word2Vec
from FastTextRank import util
import numpy as np
import os
import codecs

class FastTextRank4Sentence(object):
    def __init__(self, use_stopword=False, stop_words_file=None, use_w2v=False, dict_path=None, max_iter=100, tol=0.0001):
        """
        :param use_stopword: whether to filter out stop words
        :param stop_words_file: path to a stop-words file
        :param use_w2v: whether to use word vectors to compute sentence similarity
        :param dict_path: path to a saved Word2Vec model file
        :param max_iter: maximum number of iterations
        :param tol: maximum tolerated error
        """
        if not use_w2v and dict_path is not None:
            raise RuntimeError("use_w2v must be set to True before a word-vector model can be used")
        self.__use_stopword = use_stopword
        self.__use_w2v = use_w2v
        self.__dict_path = dict_path
        self.__max_iter = max_iter
        self.__tol = tol
        if self.__use_w2v:
            self.__word2vec = Word2Vec.load(self.__dict_path)
        self.__stop_words = set()
        self.__stop_words_file = self.get_default_stop_words_file()
        if type(stop_words_file) is str:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            for word in codecs.open(self.__stop_words_file, 'r', 'utf-8', 'ignore'):
                self.__stop_words.add(word.strip())
        np.seterr(all='warn')  # print a RuntimeWarning for all types of floating-point errors

    def get_default_stop_words_file(self):
        d = os.path.dirname(os.path.realpath(__file__))
        return os.path.join(d, 'stopwords.txt')

    # Could be extended to also drop stop words and words with unwanted POS tags
    def filter_dictword(self, sents):
        """
        Remove words that are absent from the word-vector vocabulary
        :param sents:
        :return:
        """
        _sents = []
        dele = set()
        for sentence in sents:
            for word in sentence:
                if word not in self.__word2vec:  # relies on the gensim 3.x API, where the model itself is indexable
                    dele.add(word)
            if sentence:
                _sents.append([word for word in sentence if word not in dele])
        return _sents

    def summarize(self, text, n):
        text = text.replace('\n', '')
        text = text.replace('\r', '')
        text = util.as_text(text)  # normalize the encoding
        tokens = util.cut_sentences(text)
        # sentences keeps the original sentences; sents is used for all computations
        sentences, sents = util.cut_filter_words(tokens, self.__stop_words, self.__use_stopword)
        if self.__use_w2v:
            sents = self.filter_dictword(sents)
        graph = self.create_graph_sentence(sents, self.__use_w2v)
        scores = util.weight_map_rank(graph, self.__max_iter, self.__tol)
        sent_selected = nlargest(n, zip(scores, count()))
        sent_index = []
        for i in range(n):
            sent_index.append(sent_selected[i][1])  # record the selected sentence's index in the original article
        return [sentences[i] for i in sent_index]

    def create_graph_sentence(self, word_sent, use_w2v):
        """
        Take a list of tokenized sentences and return a graph of pairwise sentence similarities
        :param word_sent:
        :return:
        """
        num = len(word_sent)
        board = [[0.0 for _ in range(num)] for _ in range(num)]

        for i, j in product(range(num), repeat=2):
            if i != j:
                if use_w2v:
                    board[i][j] = self.compute_similarity_by_avg(word_sent[i], word_sent[j])
                else:
                    board[i][j] = util.two_sentences_similarity(word_sent[i], word_sent[j])
        return board

    def compute_similarity_by_avg(self, sents_1, sents_2):
        '''
        Compare two sentences by their average word vectors
        :param sents_1:
        :param sents_2:
        :return:
        '''
        if len(sents_1) == 0 or len(sents_2) == 0:
            return 0.0
        # Sum up all the word vectors in each sentence
        vec1 = self.__word2vec[sents_1[0]]
        for word1 in sents_1[1:]:
            vec1 = vec1 + self.__word2vec[word1]

        vec2 = self.__word2vec[sents_2[0]]
        for word2 in sents_2[1:]:
            vec2 = vec2 + self.__word2vec[word2]

        similarity = util.cosine_similarity(vec1 / len(sents_1), vec2 / len(sents_2))
        return similarity
--------------------------------------------------------------------------------
/FastTextRank/FastTextRank4Word.py:
--------------------------------------------------------------------------------
#-*- encoding:utf-8 -*-
from heapq import nlargest
from itertools import count
from FastTextRank import util
import numpy as np
import os
import codecs

class FastTextRank4Word(object):
    def __init__(self, use_stopword=False, stop_words_file=None, max_iter=100, tol=0.0001, window=2):
        """
        :param max_iter: maximum number of iterations
        :param tol: maximum tolerated error
        :param window: word co-occurrence window size
        :return:
        """
        self.__use_stopword = use_stopword
        self.__max_iter = max_iter
        self.__tol = tol
        self.__window = window
        self.__stop_words = set()
        self.__stop_words_file = self.get_default_stop_words_file()
        if type(stop_words_file) is str:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            for word in codecs.open(self.__stop_words_file, 'r', 'utf-8', 'ignore'):
                self.__stop_words.add(word.strip())
        # Print a RuntimeWarning for all types of floating-point errors
        np.seterr(all='warn')

    def get_default_stop_words_file(self):
        d = os.path.dirname(os.path.realpath(__file__))
        return os.path.join(d, 'stopwords.txt')

    def build_worddict(self, sents):
        """
        Build a one-to-one mapping between words and indices, in preparation for building the word graph
        :param sents:
        :return:
        """
        word_index = {}
        index_word = {}
        words_number = 0
        for word_list in sents:
            for word in word_list:
                if word not in word_index:
                    word_index[word] = words_number
                    index_word[words_number] = word
                    words_number += 1
        return word_index, index_word, words_number

    def build_word_graph(self, sents, words_number, word_index, window=2):
        graph = [[0.0 for _ in range(words_number)] for _ in range(words_number)]
        for word_list in sents:
            for w1, w2 in util.combine(word_list, window):
                if w1 in word_index and w2 in word_index:
                    index1 = word_index[w1]
                    index2 = word_index[w2]
                    graph[index1][index2] += 1.0
                    graph[index2][index1] += 1.0
        return graph

    def summarize(self, text, n):
        text = text.replace('\n', '')
        text = text.replace('\r', '')
        text = util.as_text(text)  # normalize the encoding
        tokens = util.cut_sentences(text)
        # sentences keeps the original sentences; sents is used for all computations
        sentences, sents = util.psegcut_filter_words(tokens, self.__stop_words, self.__use_stopword)

        word_index, index_word, words_number = self.build_worddict(sents)
        graph = self.build_word_graph(sents, words_number, word_index, window=self.__window)
        scores = util.weight_map_rank(graph, max_iter=self.__max_iter, tol=self.__tol)
        sent_selected = nlargest(n, zip(scores, count()))
        sent_index = []
        for i in range(n):
            sent_index.append(sent_selected[i][1])  # record the selected keyword's index
        return [index_word[i] for i in sent_index]
--------------------------------------------------------------------------------
/FastTextRank/__init__.py:
--------------------------------------------------------------------------------
#-*- encoding:utf-8 -*-
from __future__ import absolute_import
from .FastTextRank4Sentence import FastTextRank4Sentence
# from .TextRank4Sentence import TextRank4Sentence
from . import util

version = '0.2'
--------------------------------------------------------------------------------
/FastTextRank/stopwords.txt:
--------------------------------------------------------------------------------
1 | ?
2 | 、
3 | 。
4 | “
5 | ”
6 | 《
7 | 》
8 | !
9 | ,
10 | :
11 | ;
12 | ?
13 | 啊
14 | 阿
15 | 哎
16 | 哎呀
17 | 哎哟
18 | 唉
19 | 俺
20 | 俺们
21 | 按
22 | 按照
23 | 吧
24 | 吧哒
25 | 把
26 | 罢了
27 | 被
28 | 本
29 | 本着
30 | 比
31 | 比方
32 | 比如
33 | 鄙人
34 | 彼
35 | 彼此
36 | 边
37 | 别
38 | 别的
39 | 别说
40 | 并
41 | 并且
42 | 不比
43 | 不成
44 | 不单
45 | 不但
46 | 不独
47 | 不管
48 | 不光
49 | 不过
50 | 不仅
51 | 不拘
52 | 不论
53 | 不怕
54 | 不然
55 | 不如
56 | 不特
57 | 不惟
58 | 不问
59 | 不只
60 | 朝
61 | 朝着
62 | 趁
63 | 趁着
64 | 乘
65 | 冲
66 | 除
67 | 除此之外
68 | 除非
69 | 除了
70 | 此
71 | 此间
72 | 此外
73 | 从
74 | 从而
75 | 打
76 | 待
77 | 但
78 | 但是
79 | 当
80 | 当着
81 | 到
82 | 得
83 | 的
84 | 的话
85 | 等
86 | 等等
87 | 地
88 | 第
89 | 叮咚
90 | 对
91 | 对于
92 | 多
93 | 多少
94 | 而
95 | 而况
96 | 而且
97 | 而是
98 | 而外
99 | 而言
100 | 而已
101 | 尔后
102 | 反过来
103 | 反过来说
104 | 反之
105 | 非但
106 | 非徒
107 | 否则
108 | 嘎
109 | 嘎登
110 | 该
111 | 赶
112 | 个
113 | 各
114 | 各个
115 | 各位
116 | 各种
117 | 各自
118 | 给
119 | 根据
120 | 跟
121 | 故
122 | 故此
123 | 固然
124 | 关于
125 | 管
126 | 归
127 | 果然
128 | 果真
129 | 过
130 | 哈
131 | 哈哈
132 | 呵
133 | 和
134 | 何
135 | 何处
136 | 何况
137 | 何时
138 | 嘿
139 | 哼
140 | 哼唷
141 | 呼哧
142 | 乎
143 | 哗
144 | 还是
145 | 还有
146 | 换句话说
147 | 换言之
148 | 或
149 | 或是
150 | 或者
151 | 极了
152 | 及
153 | 及其
154 | 及至
155 | 即
156 | 即便
157 | 即或
158 | 即令
159 | 即若
160 | 即使
161 | 几
162 | 几时
163 | 己
164 | 既
165 | 既然
166 | 既是
167 | 继而
168 | 加之
169 | 假如
170 | 假若
171 | 假使
172 | 鉴于
173 | 将
174 | 较
175 | 较之
176 | 叫
177 | 接着
178 | 结果
179 | 借
180 | 紧接着
181 | 进而
182 | 尽
183 | 尽管
184 | 经
185 | 经过
186 | 就
187 | 就是
188 | 就是说
189 | 据
190 | 具体地说
191 | 具体说来
192 | 开始
193 | 开外
194 | 靠
195 | 咳
196 | 可
197 | 可见
198 | 可是
199 | 可以
200 | 况且
201 | 啦
202 | 来
203 | 来着
204 | 离
205 | 例如
206 | 哩
207 | 连
208 | 连同
209 | 两者
210 | 了
211 | 临
212 | 另
213 | 另外
214 | 另一方面
215 | 论
216 | 嘛
217 | 吗
218 | 慢说
219 | 漫说
220 | 冒
221 | 么
222 | 每
223 | 每当
224 | 们
225 | 莫若
226 | 某
227 | 某个
228 | 某些
229 | 拿
230 | 哪
231 | 哪边
232 | 哪儿
233 | 哪个
234 | 哪里
235 | 哪年
236 | 哪怕
237 | 哪天
238 | 哪些
239 | 哪样
240 | 那
241 | 那边
242 | 那儿
243 | 那个
244 | 那会儿
245 | 那里
246 | 那么
247 | 那么些
248 | 那么样
249 | 那时
250 | 那些
251 | 那样
252 | 乃
253 | 乃至
254 | 呢
255 | 能
256 | 你
257 | 你们
258 | 您
259 | 宁
260 | 宁可
261 | 宁肯
262 | 宁愿
263 | 哦
264 | 呕
265 | 啪达
266 | 旁人
267 | 呸
268 | 凭
269 | 凭借
270 | 其
271 | 其次
272 | 其二
273 | 其他
274 | 其它
275 | 其一
276 | 其余
277 | 其中
278 | 起
279 | 起见
280 | 起见
281 | 岂但
282 | 恰恰相反
283 | 前后
284 | 前者
285 | 且
286 | 然而
287 | 然后
288 | 然则
289 | 让
290 | 人家
291 | 任
292 | 任何
293 | 任凭
294 | 如
295 | 如此
296 | 如果
297 | 如何
298 | 如其
299 | 如若
300 | 如上所述
301 | 若
302 | 若非
303 | 若是
304 | 啥
305 | 上下
306 | 尚且
307 | 设若
308 | 设使
309 | 甚而
310 | 甚么
311 | 甚至
312 | 省得
313 | 时候
314 | 什么
315 | 什么样
316 | 使得
317 | 是
318 | 是的
319 | 首先
320 | 谁
321 | 谁知
322 | 顺
323 | 顺着
324 | 似的
325 | 虽
326 | 虽然
327 | 虽说
328 | 虽则
329 | 随
330 | 随着
331 | 所
332 | 所以
333 | 他
334 | 他们
335 | 他人
336 | 它
337 | 它们
338 | 她
339 | 她们
340 | 倘
341 | 倘或
342 | 倘然
343 | 倘若
344 | 倘使
345 | 腾
346 | 替
347 | 通过
348 | 同
349 | 同时
350 | 哇
351 | 万一
352 | 往
353 | 望
354 | 为
355 | 为何
356 | 为了
357 | 为什么
358 | 为着
359 | 喂
360 | 嗡嗡
361 | 我
362 | 我们
363 | 呜
364 | 呜呼
365 | 乌乎
366 | 无论
367 | 无宁
368 | 毋宁
369 | 嘻
370 | 吓
371 | 相对而言
372 | 像
373 | 向
374 | 向着
375 | 嘘
376 | 呀
377 | 焉
378 | 沿
379 | 沿着
380 | 要
381 | 要不
382 | 要不然
383 | 要不是
384 | 要么
385 | 要是
386 | 也
387 | 也罢
388 | 也好
389 | 一
390 | 一般
391 | 一旦
392 | 一方面
393 | 一来
394 | 一切
395 | 一样
396 | 一则
397 | 依
398 | 依照
399 | 矣
400 | 以
401 | 以便
402 | 以及
403 | 以免
404 | 以至
405 | 以至于
406 | 以致
407 | 抑或
408 | 因
409 | 因此
410 | 因而
411 | 因为
412 | 哟
413 | 用
414 | 由
415 | 由此可见
416 | 由于
417 | 有
418 | 有的
419 | 有关
420 | 有些
421 | 又
422 | 于
423 | 于是
424 | 于是乎
425 | 与
426 | 与此同时
427 | 与否
428 | 与其
429 | 越是
430 | 云云
431 | 哉
432 | 再说
433 | 再者
434 | 在
435 | 在下
436 | 咱
437 | 咱们
438 | 则
439 | 怎
440 | 怎么
441 | 怎么办
442 | 怎么样
443 | 怎样
444 | 咋
445 | 照
446 | 照着
447 | 者
448 | 这
449 | 这边
450 | 这儿
451 | 这个
452 | 这会儿
453 | 这就是说
454 | 这里
455 | 这么
456 | 这么点儿
457 | 这么些
458 | 这么样
459 | 这时
460 | 这些
461 | 这样
462 | 正如
463 | 吱
464 | 之
465 | 之类
466 | 之所以
467 | 之一
468 | 只是
469 | 只限
470 | 只要
471 | 只有
472 | 至
473 | 至于
474 | 诸位
475 | 着
476 | 着呢
477 | 自
478 | 自从
479 | 自个儿
480 | 自各儿
481 | 自己
482 | 自家
483 | 自身
484 | 综上所述
485 | 总的来看
486 | 总的来说
487 | 总的说来
488 | 总而言之
489 | 总之
490 | 纵
491 | 纵令
492 | 纵然
493 | 纵使
494 | 遵照
495 | 作为
496 | 兮
497 | 呃
498 | 呗
499 | 咚
500 | 咦
501 | 喏
502 | 啐
503 | 喔唷
504 | 嗬
505 | 嗯
506 | 嗳
507 | a
508 | able
509 | about
510 | above
511 | abroad
512 | according
513 | accordingly
514 | across
515 | actually
516 | adj
517 | after
518 | afterwards
519 | again
520 | against
521 | ago
522 | ahead
523 | ain't
524 | all
525 | allow
526 | allows
527 | almost
528 | alone
529 | along
530 | alongside
531 | already
532 | also
533 | although
534 | always
535 | am
536 | amid
537 | amidst
538 | among
539 | amongst
540 | an
541 | and
542 | another
543 | any
544 | anybody
545 | anyhow
546 | anyone
547 | anything
548 | anyway
549 | anyways
550 | anywhere
551 | apart
552 | appear
553 | appreciate
554 | appropriate
555 | are
556 | aren't
557 | around
558 | as
559 | a's
560 | aside
561 | ask
562 | asking
563 | associated
564 | at
565 | available
566 | away
567 | awfully
568 | b
569 | back
570 | backward
571 | backwards
572 | be
573 | became
574 | because
575 | become
576 | becomes
577 | becoming
578 | been
579 | before
580 | beforehand
581 | begin
582 | behind
583 | being
584 | believe
585 | below
586 | beside
587 | besides
588 | best
589 | better
590 | between
591 | beyond
592 | both
593 | brief
594 | but
595 | by
596 | c
597 | came
598 | can
599 | cannot
600 | cant
601 | can't
602 | caption
603 | cause
604 | causes
605 | certain
606 | certainly
607 | changes
608 | clearly
609 | c'mon
610 | co
611 | co.
612 | com
613 | come
614 | comes
615 | concerning
616 | consequently
617 | consider
618 | considering
619 | contain
620 | containing
621 | contains
622 | corresponding
623 | could
624 | couldn't
625 | course
626 | c's
627 | currently
628 | d
629 | dare
630 | daren't
631 | definitely
632 | described
633 | despite
634 | did
635 | didn't
636 | different
637 | directly
638 | do
639 | does
640 | doesn't
641 | doing
642 | done
643 | don't
644 | down
645 | downwards
646 | during
647 | e
648 | each
649 | edu
650 | eg
651 | eight
652 | eighty
653 | either
654 | else
655 | elsewhere
656 | end
657 | ending
658 | enough
659 | entirely
660 | especially
661 | et
662 | etc
663 | even
664 | ever
665 | evermore
666 | every
667 | everybody
668 | everyone
669 | everything
670 | everywhere
671 | ex
672 | exactly
673 | example
674 | except
675 | f
676 | fairly
677 | far
678 | farther
679 | few
680 | fewer
681 | fifth
682 | first
683 | five
684 | followed
685 | following
686 | follows
687 | for
688 | forever
689 | former
690 | formerly
691 | forth
692 | forward
693 | found
694 | four
695 | from
696 | further
697 | furthermore
698 | g
699 | get
700 | gets
701 | getting
702 | given
703 | gives
704 | go
705 | goes
706 | going
707 | gone
708 | got
709 | gotten
710 | greetings
711 | h
712 | had
713 | hadn't
714 | half
715 | happens
716 | hardly
717 | has
718 | hasn't
719 | have
720 | haven't
721 | having
722 | he
723 | he'd
724 | he'll
725 | hello
726 | help
727 | hence
728 | her
729 | here
730 | hereafter
731 | hereby
732 | herein
733 | here's
734 | hereupon
735 | hers
736 | herself
737 | he's
738 | hi
739 | him
740 | himself
741 | his
742 | hither
743 | hopefully
744 | how
745 | howbeit
746 | however
747 | hundred
748 | i
749 | i'd
750 | ie
751 | if
752 | ignored
753 | i'll
754 | i'm
755 | immediate
756 | in
757 | inasmuch
758 | inc
759 | inc.
760 | indeed
761 | indicate
762 | indicated
763 | indicates
764 | inner
765 | inside
766 | insofar
767 | instead
768 | into
769 | inward
770 | is
771 | isn't
772 | it
773 | it'd
774 | it'll
775 | its
776 | it's
777 | itself
778 | i've
779 | j
780 | just
781 | k
782 | keep
783 | keeps
784 | kept
785 | know
786 | known
787 | knows
788 | l
789 | last
790 | lately
791 | later
792 | latter
793 | latterly
794 | least
795 | less
796 | lest
797 | let
798 | let's
799 | like
800 | liked
801 | likely
802 | likewise
803 | little
804 | look
805 | looking
806 | looks
807 | low
808 | lower
809 | ltd
810 | m
811 | made
812 | mainly
813 | make
814 | makes
815 | many
816 | may
817 | maybe
818 | mayn't
819 | me
820 | mean
821 | meantime
822 | meanwhile
823 | merely
824 | might
825 | mightn't
826 | mine
827 | minus
828 | miss
829 | more
830 | moreover
831 | most
832 | mostly
833 | mr
834 | mrs
835 | much
836 | must
837 | mustn't
838 | my
839 | myself
840 | n
841 | name
842 | namely
843 | nd
844 | near
845 | nearly
846 | necessary
847 | need
848 | needn't
849 | needs
850 | neither
851 | never
852 | neverf
853 | neverless
854 | nevertheless
855 | new
856 | next
857 | nine
858 | ninety
859 | no
860 | nobody
861 | non
862 | none
863 | nonetheless
864 | noone
865 | no-one
866 | nor
867 | normally
868 | not
869 | nothing
870 | notwithstanding
871 | novel
872 | now
873 | nowhere
874 | o
875 | obviously
876 | of
877 | off
878 | often
879 | oh
880 | ok
881 | okay
882 | old
883 | on
884 | once
885 | one
886 | ones
887 | one's
888 | only
889 | onto
890 | opposite
891 | or
892 | other
893 | others
894 | otherwise
895 | ought
896 | oughtn't
897 | our
898 | ours
899 | ourselves
900 | out
901 | outside
902 | over
903 | overall
904 | own
905 | p
906 | particular
907 | particularly
908 | past
909 | per
910 | perhaps
911 | placed
912 | please
913 | plus
914 | possible
915 | presumably
916 | probably
917 | provided
918 | provides
919 | q
920 | que
921 | quite
922 | qv
923 | r
924 | rather
925 | rd
926 | re
927 | really
928 | reasonably
929 | recent
930 | recently
931 | regarding
932 | regardless
933 | regards
934 | relatively
935 | respectively
936 | right
937 | round
938 | s
939 | said
940 | same
941 | saw
942 | say
943 | saying
944 | says
945 | second
946 | secondly
947 | see
948 | seeing
949 | seem
950 | seemed
951 | seeming
952 | seems
953 | seen
954 | self
955 | selves
956 | sensible
957 | sent
958 | serious
959 | seriously
960 | seven
961 | several
962 | shall
963 | shan't
964 | she
965 | she'd
966 | she'll
967 | she's
968 | should
969 | shouldn't
970 | since
971 | six
972 | so
973 | some
974 | somebody
975 | someday
976 | somehow
977 | someone
978 | something
979 | sometime
980 | sometimes
981 | somewhat
982 | somewhere
983 | soon
984 | sorry
985 | specified
986 | specify
987 | specifying
988 | still
989 | sub
990 | such
991 | sup
992 | sure
993 | t
994 | take
995 | taken
996 | taking
997 | tell
998 | tends
999 | th
1000 | than
1001 | thank
1002 | thanks
1003 | thanx
1004 | that
1005 | that'll
1006 | thats
1007 | that's
1008 | that've
1009 | the
1010 | their
1011 | theirs
1012 | them
1013 | themselves
1014 | then
1015 | thence
1016 | there
1017 | thereafter
1018 | thereby
1019 | there'd
1020 | therefore
1021 | therein
1022 | there'll
1023 | there're
1024 | theres
1025 | there's
1026 | thereupon
1027 | there've
1028 | these
1029 | they
1030 | they'd
1031 | they'll
1032 | they're
1033 | they've
1034 | thing
1035 | things
1036 | think
1037 | third
1038 | thirty
1039 | this
1040 | thorough
1041 | thoroughly
1042 | those
1043 | though
1044 | three
1045 | through
1046 | throughout
1047 | thru
1048 | thus
1049 | till
1050 | to
1051 | together
1052 | too
1053 | took
1054 | toward
1055 | towards
1056 | tried
1057 | tries
1058 | truly
1059 | try
1060 | trying
1061 | t's
1062 | twice
1063 | two
1064 | u
1065 | un
1066 | under
1067 | underneath
1068 | undoing
1069 | unfortunately
1070 | unless
1071 | unlike
1072 | unlikely
1073 | until
1074 | unto
1075 | up
1076 | upon
1077 | upwards
1078 | us
1079 | use
1080 | used
1081 | useful
1082 | uses
1083 | using
1084 | usually
1085 | v
1086 | value
1087 | various
1088 | versus
1089 | very
1090 | via
1091 | viz
1092 | vs
1093 | w
1094 | want
1095 | wants
1096 | was
1097 | wasn't
1098 | way
1099 | we
1100 | we'd
1101 | welcome
1102 | well
1103 | we'll
1104 | went
1105 | were
1106 | we're
1107 | weren't
1108 | we've
1109 | what
1110 | whatever
1111 | what'll
1112 | what's
1113 | what've
1114 | when
1115 | whence
1116 | whenever
1117 | where
1118 | whereafter
1119 | whereas
1120 | whereby
1121 | wherein
1122 | where's
1123 | whereupon
1124 | wherever
1125 | whether
1126 | which
1127 | whichever
1128 | while
1129 | whilst
1130 | whither
1131 | who
1132 | who'd
1133 | whoever
1134 | whole
1135 | who'll
1136 | whom
1137 | whomever
1138 | who's
1139 | whose
1140 | why
1141 | will
1142 | willing
1143 | wish
1144 | with
1145 | within
1146 | without
1147 | wonder
1148 | won't
1149 | would
1150 | wouldn't
1151 | x
1152 | y
1153 | yes
1154 | yet
1155 | you
1156 | you'd
1157 | you'll
1158 | your
1159 | you're
1160 | yours
1161 | yourself
1162 | yourselves
1163 | you've
1164 | z
1165 | zero
--------------------------------------------------------------------------------
/FastTextRank/test/KeyWord.py:
--------------------------------------------------------------------------------
from FastTextRank.FastTextRank4Word import FastTextRank4Word
import codecs
import datetime

mod = FastTextRank4Word(tol=0.0001, window=2)
for i in range(1):
    text = codecs.open('text' + str(i + 1) + '.txt', 'r', 'utf-8').read()
    print('Keywords:')
    old_time = datetime.datetime.now()
    print(mod.summarize(text, 3))
    print(datetime.datetime.now() - old_time)
--------------------------------------------------------------------------------
/FastTextRank/test/Sentence.py:
--------------------------------------------------------------------------------
from FastTextRank.FastTextRank4Sentence import FastTextRank4Sentence
import codecs
import datetime

mod = FastTextRank4Sentence(use_w2v=False, tol=0.0001)
for i in range(1):
    text = codecs.open('text' + str(i + 1) + '.txt', 'r', 'utf-8').read()
    print('Summary ' + str(i + 1) + ':')
    old_time = datetime.datetime.now()
    print(mod.summarize(text, 1))
    print(datetime.datetime.now() - old_time)
--------------------------------------------------------------------------------
/FastTextRank/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ArtistScript/FastTextRank/0af1f353b4ff3180b8cac2953196d84fe012f7bc/FastTextRank/test/__init__.py
--------------------------------------------------------------------------------
/FastTextRank/test/text1.txt:
--------------------------------------------------------------------------------
1 | 亚马逊依然行进在快车道:一边疯狂投资,一边创造高额利润
2 | 腾讯科技07-2617:11
3 |
4 | 腾讯科技讯 据外媒报道,这些年,亚马逊一路高歌猛进,势不可挡。亚马逊的网络零售销售额占美国全部网络零售销售额的将近一半。而且,它通过全食超市、亚马逊书店和新的自助便利店不断扩大其实体店的版图。该公司的每个新的业务,从广告到云计算服务到卫生医疗,几乎都变成了10亿美元的业务。
5 |
6 | 华尔街预计,在周四收盘后亚马逊公布其第二季度财报的时候,它的统治地位可能仍然不会动摇。分析师预计在第二季度亚马逊的销售额将会超过530亿美元,较去年同期增长40%。
7 |
8 | 每当亚马逊做一些不可思议的事情的时候,它总有办法将它做成盈利的。分析师预计,在第二季度,亚马逊的每股收益将会达到2.50美元,较一年前增长40%,尽管该公司一直在重磅投资订单履行中心、新零售店以及原创内容。
9 |
10 | 金融服务公司Canaccord Genuity的分析师迈克尔-格雷厄姆(Michael Graham)本月在一份研究简报中称,在FANG公司(包括Facebook、亚马逊、Netflix和谷歌)中,亚马逊具有“最稳健、最持久的发展前景”。
11 |
12 | 亚马逊Prime会员服务仍然是推动其快速发展的核心。现在有超过1亿人付费成为亚马逊Prime会员,该服务包括两日免费送货上门服务以及不断丰富的影视节目库。在第二季度,亚马逊将Prime会费的价格从每年99美元提高到了119美元。
13 |
14 | “Prime仍然是亚马逊未来发展的重要推动力。”市场研究公司GBH Insights的分析师丹尼尔-艾维斯(Daniel Ives)在上周的一份投资者研究简报中说。亚马逊斥资137亿美元收购了全食超市,并利用各种打折活动和免费两小时送货上门服务吸引了更多Prime会员。
15 |
16 | 而且,亚马逊在每年7月还会举办国际购物节(Prime Day)活动。尽管今年长达36个小时的促销活动在一开始出现了一些技术上的瑕疵,但是它成为了Prime Day四年历史上最盛大的购物节,吸引的新Prime用户数量打破了记录。
17 |
18 | 但是,亚马逊绝不仅仅只依靠Prime。亚马逊云服务AWS尽管面临着来自微软和谷歌的挑战,但是仍然占据着统治地位。在第一季度,AWS的营收增长了50%,增长到了50亿美元以上。它也是保障亚马逊盈利的关键业务。
19 |
20 | 亚马逊在广告领域也获得了长足发展。在第一季度,该公司的“其他业务”(主要包括广告服务)的销售额超过了20亿美元,是一年前的一倍以上。德意志银行的分析师认为,“亚马逊是谷歌和Facebook的替代者”。
21 |
22 | 西诺佛信托公司的高级投资组合经理丹-摩根(Dan Morgan)称,这些“摇钱树”让亚马逊“一边疯狂地投资,一边创造高额利润”。
23 |
24 | 亚马逊的市值也有望赶超苹果,从而成为全球最有价值的公司。周四公布的强劲财报有望让亚马逊超过苹果,成为首家市值突破1万亿美元的公司。
--------------------------------------------------------------------------------
/FastTextRank/util.py:
--------------------------------------------------------------------------------
#-*- encoding:utf-8 -*-
import sys
import jieba
import math
import numpy as np
import jieba.posseg as pseg

sentence_delimiters = frozenset(u'。!?……')
allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns', 'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']

PY2 = sys.version_info[0] == 2
if not PY2:
    # Python 3.x and up
    text_type = str
    string_types = (str,)
    xrange = range

    def as_text(v):  # coerce the input to a unicode string
        if v is None:
            return None
        elif isinstance(v, bytes):
            return v.decode('utf-8', errors='ignore')
        elif isinstance(v, str):
            return v
        else:
            raise ValueError('Unknown type %r' % type(v))

    def is_text(v):
        return isinstance(v, text_type)

else:
    # Python 2.x
    text_type = unicode
    string_types = (str, unicode)
    xrange = xrange

    def as_text(v):
        if v is None:
            return None
        elif isinstance(v, unicode):
            return v
        elif isinstance(v, str):
            return v.decode('utf-8', errors='ignore')
        else:
            raise ValueError('Invalid type %r' % type(v))

    def is_text(v):
        return isinstance(v, text_type)

def cut_sentences(sentence):
    tmp = []
    for ch in sentence:  # iterate over every character in the string
        tmp.append(ch)
        if ch in sentence_delimiters:
            yield ''.join(tmp)
            tmp = []
    yield ''.join(tmp)

def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
    sentences = []
    sents = []
    for sent in cutted_sentences:
        sentences.append(sent)
        if use_stopwords:
            sents.append([word for word in jieba.cut(sent) if word and word not in stopwords])  # split the sentence into words
        else:
            sents.append([word for word in jieba.cut(sent) if word])
    return sentences, sents

def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True, use_speech_tags_filter=True):
    sents = []
    sentences = []
    for sent in cutted_sentences:
        sentences.append(sent)
        jieba_result = pseg.cut(sent)
        if use_speech_tags_filter == True:
            jieba_result = [w for w in jieba_result if w.flag in allow_speech_tags]
        else:
            jieba_result = [w for w in jieba_result]
        word_list = [w.word.strip() for w in jieba_result if w.flag != 'x']
        word_list = [word for word in word_list if len(word) > 0]
        if use_stopwords:
            word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
        sents.append(word_list)
    return sentences, sents

def weight_map_rank(weight_graph, max_iter, tol):
    '''
    Take a similarity graph (matrix) and return the score of every sentence
    :param weight_graph:
    :return:
    '''
    # Initialize every score to 0.5 and every old score to 0.0
    scores = [0.5 for _ in range(len(weight_graph))]
    old_scores = [0.0 for _ in range(len(weight_graph))]
    denominator = calculate_degree(weight_graph)

    # Iterate until the scores converge
    count = 0
    while different(scores, old_scores, tol):
        for i in range(len(weight_graph)):
            old_scores[i] = scores[i]
        # Compute the score of every sentence
        for i in range(len(weight_graph)):
            scores[i] = calculate_score(weight_graph, denominator, i)
        count += 1
        if count > max_iter:
            break
    return scores

def calculate_degree(weight_graph):
    length = len(weight_graph)
    denominator = [0.0 for _ in range(len(weight_graph))]
    for j in range(length):
        for k in range(length):
            denominator[j] += weight_graph[j][k]
        if denominator[j] == 0:
            denominator[j] = 1.0
    return denominator


def calculate_score(weight_graph, denominator, i):  # i is the index of the sentence being scored
    """
    Compute the score of sentence i in the graph
    :param weight_graph:
    :param denominator:
    :param i:
    :return:
    """
    length = len(weight_graph)
    d = 0.85
    added_score = 0.0

    for j in range(length):
        # The numerator: [j][i] is the edge from sentence j to sentence i
        fraction = weight_graph[j][i] * 1.0
        # Divide by the out-degree of j
        added_score += fraction / denominator[j]
    # The final damped score
    weighted_score = (1 - d) + d * added_score
    return weighted_score

def different(scores, old_scores, tol=0.0001):
    '''
    Check whether any score changed between iterations
    :param scores:
    :param old_scores:
    :return:
    '''
    flag = False
    for i in range(len(scores)):
        if math.fabs(scores[i] - old_scores[i]) >= tol:  # the default tolerance is 0.0001
            flag = True
            break
    return flag

def cosine_similarity(vec1, vec2):
    '''
    Compute the cosine similarity of two vectors
    :param vec1:
    :param vec2:
    :return:
    '''
    tx = np.array(vec1)
    ty = np.array(vec2)
    cos1 = np.sum(tx * ty)
    cos21 = np.sqrt(np.sum(tx ** 2))
    cos22 = np.sqrt(np.sum(ty ** 2))
    cosine_value = cos1 / float(cos21 * cos22)
    return cosine_value


def combine(word_list, window=2):
    """Generate word pairs within the given window, used to build the edges between words.

    Keyword arguments:
    word_list -- list of str, the words of one sentence.
    window -- int, the window size.
    """
    if window < 2: window = 2
    for x in xrange(1, window):
        if x >= len(word_list):
            break
        word_list2 = word_list[x:]
        res = zip(word_list, word_list2)
        for r in res:
            yield r


def two_sentences_similarity(sents_1, sents_2):
    '''
    Compute the similarity of two sentences by their common words
    :param sents_1:
    :param sents_2:
    :return:
    '''
    counter = 0
    for sent in sents_1:
        if sent in sents_2:
            counter += 1
    if counter == 0:
        return 0
    return counter / (math.log(len(sents_1) + len(sents_2)))
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Fast Text Summarization and Keyword Extraction
Extracts abstracts and keywords from Chinese text. The algorithm's time complexity has also been reduced: computing the maximum-weight node of the graph drops from O(n^2) to O(n). On a limited test set (10 articles), it runs about 8x faster than the textrank4zh package. [The algorithm is explained in this Zhihu article (in Chinese)](https://zhuanlan.zhihu.com/p/41241390)
## Installation
Requires Numpy>=1.14.5 and gensim>=3.5.0.
pip install FastTextRank==1.1

## Usage
See the ./FastTextRank/test folder for full examples; a minimal sketch follows below.
KeyWord.py: keyword extraction example
Sentence.py: abstract extraction example

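A minimal usage sketch, mirroring the two test scripts (the short sentence assigned to text is only a stand-in; any Chinese article works):

```python
from FastTextRank.FastTextRank4Sentence import FastTextRank4Sentence
from FastTextRank.FastTextRank4Word import FastTextRank4Word

text = '亚马逊的网络零售销售额占美国全部网络零售销售额的将近一半。'  # replace with a full article

summarizer = FastTextRank4Sentence(use_w2v=False, tol=0.0001)
print(summarizer.summarize(text, 1))  # the single highest-ranked sentence

extractor = FastTextRank4Word(tol=0.0001, window=2)
print(extractor.summarize(text, 3))   # the three highest-ranked keywords
```
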
## Extras
Optimizations are welcome as pull requests.
If you run into problems, please open an issue.

# FastTextRank
Extract abstracts and keywords from Chinese text, use *optimized iterative algorithms* to improve running **speed**, and *selectively use word vectors* to improve **accuracy**.
## PageRank
PageRank is a web-page ranking algorithm from Google.
It was originally used to calculate the importance of web pages: the whole web can be seen as a directed graph whose nodes are web pages.
The algorithm calculates every node's importance from its connections; the update rule used here is sketched below.
* My algorithm changes the iterative step to make ranking much faster: it costs 10ms per article, whereas TextRank4ZH costs 80ms on my data.
* My algorithm can also use word2vec to make the abstract more accurate, though this makes it slower: using word2vec costs 40ms per article on the same training data.

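A minimal sketch of one scoring pass, mirroring util.calculate_score (d is the damping factor; degree[j] is the out-degree of node j, clamped to 1.0 when a row sums to zero, as in util.calculate_degree):

```python
d = 0.85  # damping factor, as in util.calculate_score

def one_iteration(graph, degree):
    # graph[j][i] is the weight of the edge from node j to node i
    n = len(graph)
    return [(1 - d) + d * sum(graph[j][i] / degree[j] for j in range(n))
            for i in range(n)]

print(one_iteration([[0.0, 1.0], [1.0, 0.0]], [1.0, 1.0]))  # [1.0, 1.0]
```

Iteration stops as soon as no score moves by more than tol, or after max_iter rounds.
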
## FastTextRank4Sentence
### Introduction
1. Cut the article into sentences
2. Calculate the similarity between sentences (both measures are sketched after this list):
* using the cosine similarity of averaged word vectors, or
* using the words the two sentences have in common
3. Build a graph from the sentence similarities
4. Calculate the importance of each sentence with the improved iterative algorithm
5. Return the abstract
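
The two measures correspond to compute_similarity_by_avg and util.two_sentences_similarity; roughly, assuming w2v is a loaded gensim Word2Vec model:

```python
import math
import numpy as np

def w2v_similarity(sent1, sent2, w2v):
    # average each sentence's word vectors, then take the cosine (compute_similarity_by_avg)
    v1 = sum(w2v[w] for w in sent1) / len(sent1)
    v2 = sum(w2v[w] for w in sent2) / len(sent2)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

def common_word_similarity(sent1, sent2):
    # count shared words, normalized by the log of the combined length (two_sentences_similarity)
    common = sum(1 for w in sent1 if w in sent2)
    return common / math.log(len(sent1) + len(sent2)) if common else 0.0
```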
### API
* use_stopword: boolean, default False
* stop_words_file: str, default None.
The stop-words file you want to use. If it is None, the package's built-in stop-words list is used.
* use_w2v: boolean, default False
If it is True, you must also pass the dict_path parameter.
* dict_path: str, default None.
The path of a saved gensim Word2Vec model.
* max_iter: the maximum number of iteration rounds
* tol: the maximum tolerated error

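For example, a minimal sketch with word vectors enabled (./w2v.model is a hypothetical path to a gensim Word2Vec model saved with model.save):

```python
from FastTextRank.FastTextRank4Sentence import FastTextRank4Sentence

mod = FastTextRank4Sentence(use_stopword=True, use_w2v=True,
                            dict_path='./w2v.model',  # hypothetical model path
                            max_iter=100, tol=0.0001)
```
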
## FastTextRank4Word

### Introduction
1. Cut the article into words
2. Calculate the similarity between words (sketched after this list):
if two words both fall inside the window distance, the weight of the edge between them increases by 1.0. The window size is set by the user.
3. Build a graph from the word similarities
4. Calculate the importance of each word with the improved iterative algorithm
5. Return the keywords

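A minimal sketch of how the co-occurrence graph is accumulated, following util.combine and build_word_graph (the toy word_list is illustrative only):

```python
def window_pairs(words, window=2):
    # pair each word with the ones up to (window - 1) positions ahead, as util.combine does
    window = max(window, 2)
    for offset in range(1, window):
        for pair in zip(words, words[offset:]):
            yield pair

word_list = ['研究', '文本', '摘要', '算法']           # one tokenized sentence (toy example)
word_index = {w: i for i, w in enumerate(word_list)}  # word -> node index
n = len(word_index)

# every co-occurrence inside the window adds 1.0 to a symmetric edge weight
graph = [[0.0] * n for _ in range(n)]
for w1, w2 in window_pairs(word_list, window=2):
    i, j = word_index[w1], word_index[w2]
    graph[i][j] += 1.0
    graph[j][i] += 1.0
```
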
### API
* use_stopword: boolean, default False
* stop_words_file: str, default None.
The stop-words file you want to use. If it is None, the package's built-in stop-words list is used.
* max_iter: the maximum number of iteration rounds
* tol: the maximum tolerated error
* window: int, default 2
The window used to decide whether two words are related.
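
For example, mirroring the parameters used in test/KeyWord.py, with stop-word filtering switched on:

```python
from FastTextRank.FastTextRank4Word import FastTextRank4Word

# window=2 only links directly adjacent words; a larger window adds longer-range edges
mod = FastTextRank4Word(use_stopword=True, max_iter=100, tol=0.0001, window=2)
```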
--------------------------------------------------------------------------------