├── test ├── images │ ├── macropodus_logo.png │ └── __init__.py ├── __init__.py ├── evaluate │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ └── ambiguity.txt │ ├── tet_summarize.py │ ├── tet_evaluate.py │ ├── tet_macropodus.py │ └── tet_nlg_yongzhuo.py ├── style_data │ ├── __init__.py │ ├── pku_training.utf8 │ └── tag_seg_BMES.py ├── other │ ├── tools │ │ └── pkuseg.py │ └── pos_tagging_1998 │ │ └── compare_tags.py └── survey_report │ └── nlp_platfom_survey.md ├── requirements.txt ├── __init__.py ├── macropodus ├── logs │ └── __init__.py ├── base │ ├── __init__.py │ ├── word2vec.py │ └── seg_basic.py ├── conf │ ├── __init__.py │ ├── path_log.py │ └── path_config.py ├── data │ ├── __init__.py │ ├── cache │ │ └── __init__.py │ ├── model │ │ ├── __init__.py │ │ ├── ner_albert_people_1998 │ │ │ └── __init__.py │ │ └── tag_albert_people_1998 │ │ │ └── __init__.py │ ├── words_common │ │ └── __init__.py │ └── embedding │ │ └── albert_base_zh │ │ └── __init__.py ├── network │ ├── base │ │ └── __init__.py │ ├── graph │ │ ├── __init__.py │ │ ├── crf.py │ │ ├── bilstm.py │ │ └── bilstm_crf.py │ ├── layers │ │ ├── __init__.py │ │ ├── non_mask_layer.py │ │ ├── keras_lookahead.py │ │ ├── keras_radam.py │ │ └── crf.py │ ├── train │ │ └── __init__.py │ ├── __init__.py │ ├── preprocess │ │ └── __init__.py │ └── service │ │ ├── __init__.py │ │ ├── thread_manage.py │ │ ├── keras_dump.py │ │ ├── server_streamer.py │ │ └── server_streamer_flask.py ├── preprocess │ ├── __init__.py │ ├── tools_clear.py │ ├── tools_common.py │ └── tools_ml.py ├── tookit │ ├── han2zh │ │ ├── __init__.py │ │ └── han2zh.py │ ├── pinyin │ │ ├── __init__.py │ │ └── pinyin.py │ ├── number2roman │ │ ├── __init__.py │ │ └── ri.py │ ├── trie_tree │ │ ├── __init__.py │ │ └── trie_tree.py │ ├── calculator_sihui │ │ ├── __init__.py │ │ ├── calcultor_number.py │ │ ├── calcultor_formula.py │ │ └── calcultor_function.py │ ├── chinese2number │ │ └── __init__.py │ └── __init__.py ├── segment │ ├── seg_statistics │ │ ├── __init__.py │ │ ├── seg_forward.py │ │ ├── seg_bidirectional.py │ │ ├── seg_reverse.py │ │ └── seg_dag.py │ ├── word_discovery │ │ └── __init__.py │ └── __init__.py ├── summarize │ ├── feature_base │ │ ├── __init__.py │ │ ├── mmr.py │ │ ├── word_significance.py │ │ └── text_teaser.py │ ├── graph_base │ │ ├── __init__.py │ │ ├── textrank_sklearn.py │ │ ├── textrank.py │ │ └── textrank_word2vec.py │ ├── nous_base │ │ ├── __init__.py │ │ └── lead_3.py │ ├── topic_base │ │ ├── __init__.py │ │ ├── topic_lsi.py │ │ ├── topic_lda.py │ │ └── topic_nmf.py │ ├── yongzhuo_nlg │ │ ├── __init__.py │ │ └── README.md │ └── __init__.py ├── version.py ├── similarity │ ├── __init__.py │ └── similarity_word2vec_char.py ├── __init__.py └── __init_tf_keras.py ├── requirements-all.txt ├── LICENSE ├── .gitignore └── setup.py /test/images/macropodus_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/Macropodus/HEAD/test/images/macropodus_logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scikit-learn 4 | passlib==1.7.1 5 | gensim==3.7.1 6 | tqdm==4.31.1 7 | networkx==2.4 8 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # 
@time :2019/12/3 22:50 4 | # @author :Mo 5 | # @function : 6 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/4 21:04 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /macropodus/logs/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 0:20 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 10:38 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/images/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/20 21:54 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/base/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/12 23:04 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/conf/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 23:59 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/3 0:25 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/evaluate/data/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 10:41 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/style_data/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:11 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/model/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- 
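Note on the package stubs above and below: these __init__.py files are empty placeholders that only mark package boundaries; the callable surface is re-exported from macropodus/__init__.py, which appears later in this dump. A minimal usage sketch of that top-level API, mirroring the calls made in test/evaluate/tet_evaluate.py and test/evaluate/tet_summarize.py; the return shapes shown in comments and the single-string add_word signature are assumptions based on those test scripts, not guarantees.

import macropodus

# Dictionary-based segmentation, as called in test/evaluate/tet_evaluate.py
# (that script takes len() of the result, so a list of words is assumed).
words = macropodus.cut("研究生命科学")
print(words)

# User-dictionary update, re-exported from macropodus/segment/__init__.py;
# passing a single word string here is an assumption about its signature.
macropodus.add_word("生命科学")

# TextRank summary via the default interface, as in test/evaluate/tet_summarize.py.
print(macropodus.summarize("PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。"
                           "Google把从A页面到B页面的链接解释为A页面给B页面投票。"))

# Character word2vec + jaccard similarity, see macropodus/similarity/__init__.py.
print(macropodus.sim("大漠帝国", "macropodus"))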
/macropodus/network/base/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:32 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:32 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/3 20:43 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/train/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/20 22:18 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 10:39 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/han2zh/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/8 21:51 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/pinyin/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/7 19:59 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/words_common/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 20:29 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:26 4 | # @author : Mo 5 | # @function: 6 | 7 | -------------------------------------------------------------------------------- /macropodus/network/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:35 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/service/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 22:01 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/number2roman/__init__.py: 
-------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/2 9:13 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/trie_tree/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:25 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/segment/word_discovery/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/19 15:36 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/summarize/feature_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/12/25 21:41 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /macropodus/summarize/graph_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/25 21:42 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /macropodus/summarize/nous_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/25 21:44 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /macropodus/summarize/topic_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/29 20:35 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /macropodus/tookit/calculator_sihui/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/3 20:25 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/chinese2number/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:00 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/embedding/albert_base_zh/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/2 1:08 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- 
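The data/embedding and data/model stubs in this stretch are placeholders for binary assets; the self-contained utilities live under macropodus/tookit and are re-exported at the top level (see macropodus/tookit/__init__.py and macropodus/__init__.py later in this dump). A short sketch of those helpers follows; the expected outputs for the roman-numeral and pinyin calls are taken from the docstrings in ri.py and pinyin.py, while calling calculate with a plain arithmetic string is an assumption about calculator_sihui. chi2num, num2chi, han2zh and zh2han are exposed the same way.

import macropodus

print(macropodus.roman2num("IX"))     # 9, per the RI.roman2int docstring in ri.py
print(macropodus.num2roman(199))      # "CXCIX", per the RI.int2roman docstring in ri.py
print(macropodus.pinyin("大漠帝国"))    # ["da", "mo", "di", "guo"], per PinYin.pinyin in pinyin.py
print(macropodus.calculate("1+2*3"))  # assumption: calculator_sihui accepts an arithmetic expression string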
/macropodus/data/model/ner_albert_people_1998/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/model/tag_albert_people_1998/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/version.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 22:24 4 | # @author : Mo 5 | # @function: version of Macropodus 6 | 7 | 8 | __version__ = "0.0.7" 9 | -------------------------------------------------------------------------------- /macropodus/summarize/yongzhuo_nlg/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/5/14 21:11 4 | # @author : Mo 5 | # @function: nlg-yongzhuo 6 | 7 | 8 | from nlg_yongzhuo import * 9 | 10 | -------------------------------------------------------------------------------- /test/other/tools/pkuseg.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/10 15:00 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | import pkuseg 9 | 10 | ps = pkuseg.pkuseg() 11 | res = ps.cut("帝国主义要把我们的地瓜分掉") 12 | print(res) -------------------------------------------------------------------------------- /requirements-all.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.19.1 2 | pandas==0.23.4 3 | passlib==1.7.1 4 | gensim==3.7.1 5 | numpy==1.16.2 6 | tqdm==4.31.1 7 | networkx==2.4 8 | tensorflow-gpu==1.15.0 9 | keras-bert==0.80.0 10 | keras-adaptive-softmax==0.6.0 11 | nlg-yongzhuo==0.0.4 -------------------------------------------------------------------------------- /test/style_data/pku_training.utf8: -------------------------------------------------------------------------------- 1 | 迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 ) 2 | 中共中央 总书记 、 国家 主席 江 泽民 3 | ( 一九九七年 十二月 三十一日 ) 4 | 12月 31日 , 中共中央 总书记 、 国家 主席 江 泽民 发表 1998年 新年 讲话 《 迈向 充满 希望 的 新 世纪 》 。 5 | 同胞 们 、 朋友 们 、 女士 们 、 先生 们 : 6 | -------------------------------------------------------------------------------- /macropodus/similarity/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 22:04 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | from macropodus.similarity.similarity_word2vec_char import SimW2vChar 9 | import os 10 | 11 | # 词向量, 默认使用缓存 12 | use_cache = True 13 | if not os.environ.get("macropodus_use_w2v_cache", True): 14 | use_cache = False # 不使用缓存,重新加载 15 | # 文本相似度 16 | swc = SimW2vChar(use_cache) 17 | sim = swc.similarity 18 | -------------------------------------------------------------------------------- /macropodus/network/service/thread_manage.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 15:08 4 | # @author : Mo 5 | # @function: 
6 | 7 | 8 | from multiprocessing import Process, Manager 9 | 10 | def f(d, l): 11 | d[1] = '1' 12 | d['2'] = 2 13 | d[0.25] = None 14 | l.reverse() 15 | 16 | if __name__ == '__main__': 17 | manager = Manager() 18 | 19 | d = manager.dict() 20 | l = manager.list(range(10)) 21 | 22 | p = Process(target=f, args=(d, l)) 23 | p.start() 24 | p.join() 25 | 26 | print (d) 27 | print (l) -------------------------------------------------------------------------------- /macropodus/preprocess/tools_clear.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:02 4 | # @author : Mo 5 | # @function: clear text 6 | 7 | 8 | def is_total_num(text): 9 | """ 10 | 判断是否是数字的 11 | :param text: str 12 | :return: boolean, True or false 13 | """ 14 | try: 15 | text_clear = text.replace(" ", "").strip() 16 | number = 0 17 | for one in text_clear: 18 | if one.isdigit(): 19 | number += 1 20 | if number == len(text_clear): 21 | return True 22 | else: 23 | return False 24 | except: 25 | return False 26 | 27 | -------------------------------------------------------------------------------- /macropodus/network/layers/non_mask_layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/10 21:35 4 | # @author :Mo 5 | # @function :NonMaskingLayer of bert 6 | # @codefrom :https://github.com/jacoxu 7 | 8 | 9 | from __future__ import print_function, division 10 | from tensorflow.python.keras.layers import Layer 11 | 12 | 13 | class NonMaskingLayer(Layer): 14 | """ 15 | fix convolutional 1D can't receive masked input, 16 | detail: https://github.com/keras-team/keras/issues/4978 17 | """ 18 | 19 | def __init__(self, **kwargs): 20 | self.supports_masking = True 21 | super(NonMaskingLayer, self).__init__(**kwargs) 22 | 23 | def build(self, input_shape): 24 | pass 25 | 26 | def compute_mask(self, inputs, input_mask=None): 27 | # do not pass the mask to the next layers 28 | return None 29 | 30 | def call(self, x, mask=None): 31 | return x 32 | -------------------------------------------------------------------------------- /macropodus/segment/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 22:00 4 | # @author : Mo 5 | # @function: segment of sent 6 | 7 | 8 | from macropodus.segment.seg_statistics.seg_statistics import SegStatistics 9 | from macropodus.segment.word_discovery.word_discovery import WordDiscovery 10 | import os 11 | 12 | 13 | # 机械分词,默认使用缓存 14 | use_cache = True 15 | if not os.environ.get("macropodus_use_seg_cache", True): 16 | use_cache = False # 不使用缓存,重新加载 17 | segs = SegStatistics(use_cache) 18 | cut_bidirectional = segs.cut_bidirectional 19 | cut_forward = segs.cut_forward 20 | cut_reverse = segs.cut_reverse 21 | cut_search = segs.cut_search 22 | cut_dag = segs.cut_dag 23 | cut = segs.cut 24 | 25 | # 用户词典增删改查 26 | load_user_dict = segs.load_user_dict 27 | save_delete_words = segs.save_delete_words 28 | save_add_words = segs.save_add_words 29 | delete_word = segs.delete_word 30 | add_word = segs.add_word 31 | 32 | # 新词发现 33 | wd = WordDiscovery() 34 | find = wd.find_word 35 | -------------------------------------------------------------------------------- /macropodus/network/service/keras_dump.py: -------------------------------------------------------------------------------- 1 | # 
!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 22:34 4 | # @author : Mo 5 | # @function: dump of keras, error, no use. 6 | 7 | 8 | from tensorflow.python.keras.models import save_model, load_model, Model 9 | import tempfile 10 | import types 11 | 12 | 13 | def make_keras_picklable(): 14 | def __getstate__(self): 15 | model_str = "" 16 | with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd: 17 | save_model(self, fd.name, overwrite=True) 18 | model_str = fd.read() 19 | d = {'model_str': model_str} 20 | return d 21 | 22 | def __setstate__(self, state): 23 | with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd: 24 | fd.write(state['model_str']) 25 | fd.flush() 26 | model = load_model(fd.name) 27 | self.__dict__ = model.__dict__ 28 | 29 | cls = Model 30 | cls.__getstate__ = __getstate__ 31 | cls.__setstate__ = __setstate__ 32 | -------------------------------------------------------------------------------- /macropodus/tookit/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/28 20:49 4 | # @author : Mo 5 | # @function: tookit 6 | 7 | 8 | # tookit 9 | from macropodus.tookit.chinese2number.chinese2number import Chi2Num, Num2Chi 10 | from macropodus.tookit.calculator_sihui.calcultor_sihui import Calculator 11 | from macropodus.tookit.trie_tree.trie_tree import TrieTree 12 | from macropodus.tookit.han2zh.han2zh import Han2Zh 13 | from macropodus.tookit.pinyin.pinyin import PinYin 14 | from macropodus.tookit.number2roman.ri import RI 15 | 16 | # 常用工具(tookit, 计算器, 中文与阿拉伯数字转化, 前缀树, 中文与罗马数字相互转化, 中文转拼音, 繁简转化) 17 | Calcul = Calculator() 18 | Chi2num = Chi2Num() 19 | Num2chi = Num2Chi() 20 | Trie = TrieTree() 21 | hanzh = Han2Zh() 22 | piyi = PinYin() 23 | ri = RI() 24 | 25 | calculate = Calcul.calculator_sihui 26 | chi2num = Chi2num.compose_decimal 27 | num2chi = Num2chi.decimal_chinese 28 | roman2num = ri.roman2int 29 | num2roman = ri.int2roman 30 | han2zh = hanzh.han2zh 31 | zh2han = hanzh.zh2han 32 | pinyin = piyi.pinyin 33 | -------------------------------------------------------------------------------- /test/style_data/tag_seg_BMES.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:11 4 | # @author : Mo 5 | # @function: BMES标注法 6 | 7 | 8 | from macropodus.preprocess.tools_common import load_json, save_json 9 | from macropodus.preprocess.tools_common import txt_write, txt_read 10 | import json 11 | 12 | pku_training = txt_read("pku_training.utf8") 13 | file = open("pku_train.json", "w", encoding="utf-8") 14 | pku_ = [] 15 | for pku in pku_training: 16 | pkus = pku.split(" ") 17 | label_pkus = "" 18 | for pku_sig in pkus: 19 | len_pku = len(pku_sig) 20 | if len_pku==1: 21 | label_pkus += "S" 22 | elif len_pku==2: 23 | label_pkus += "BE" 24 | else: 25 | label_pkus += "B" + "M"*(len_pku-2) + "E" 26 | label_pkus_l = list(label_pkus) 27 | pku_res = {} 28 | pku_res["question"] = list("".join(pkus)) 29 | pku_res["label"] = label_pkus_l 30 | p_json = json.dumps(pku_res, ensure_ascii=False) 31 | file.write(p_json + "\n") 32 | # pku_.append(pku_res) 33 | # save_json(pku_, "pku_train.json") 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 yongzhuo 4 | 5 | Permission 
is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /test/evaluate/tet_summarize.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/5/14 0:11 4 | # @author : Mo 5 | # @function: test summarize of corpus 6 | 7 | 8 | import macropodus 9 | 10 | 11 | summary = "PageRank算法简介。" \ 12 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 13 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 14 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 15 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 16 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 17 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 18 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 19 | "和投票目标的等级来决定新的等级。简单的说, " \ 20 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 21 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 22 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 23 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 24 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 25 | 26 | 27 | # 文本摘要(summarize, 默认接口) 28 | sents = macropodus.summarize(summary) 29 | print(sents) 30 | 31 | # (summarization, 可定义方法, 提供9种文本摘要方法, 'lda', 'mmr', 'textrank', 'text_teaser' 32 | ttypes = ['text_pronouns', 'text_teaser', 'word_sign', 'textrank', 'lead3', 'mmr', 'lda', 'lsi', 'nmf'] 33 | 34 | for ttp in ttypes: 35 | sents = macropodus.summarization(text=summary, type_summarize=ttp) 36 | print("\n" + ttp + ": ") 37 | print(sents) 38 | -------------------------------------------------------------------------------- /test/evaluate/tet_evaluate.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 21:13 4 | # @author : Mo 5 | # @function: test evulate 6 | 7 | 8 | from macropodus.preprocess.tools_common import txt_write, txt_read 9 | import macropodus 10 | import time 11 | 12 | 13 | def evulate_file(path_file): 14 | """ 15 | 验证切词的各种指标 16 | :param path_file: str, like '/train.txt' 17 | :return: float 18 | """ 19 | # 读取数据 20 | sents = txt_read(path_file) 21 | # 初始化统计计数 22 | count_macropodus = 0 23 | count_real = 0 24 | count_true = 0 25 | count = 0 26 | # 切词与统计, true 27 | for sent in sents: 28 | sent_sp = sent.strip() 29 | res_real = sent_sp.split(' ') 30 | sentence = sent_sp.replace(' ','') 31 | res_macropodus = macropodus.cut(sentence) 32 | print(res_macropodus) 33 | count += 1 34 | count_real += len(res_real) 35 | count_macropodus += len(res_macropodus) 36 | for cm in res_macropodus: 37 | if cm in res_real: 38 | 
count_true += 1 39 | res_real.remove(cm) 40 | # precision, recall, f1 41 | precision = count_true / count_macropodus 42 | recall = count_true / count_real 43 | f1 = (precision * recall * 2) / (precision + recall) 44 | 45 | return precision, recall, f1 46 | 47 | 48 | if __name__ == "__main__": 49 | path_file = 'data/ambiguity.txt' 50 | time_start = time.time() 51 | precision, recall, f1 = evulate_file(path_file) 52 | print('time: ' + str(time.time()-time_start)) 53 | print('precision\t', 'recall\t', 'f1') 54 | print(precision, recall, f1) 55 | -------------------------------------------------------------------------------- /test/evaluate/data/ambiguity.txt: -------------------------------------------------------------------------------- 1 | 工信处 女 干事 每月 经过 下属 科室 都要 亲口 交代 24 口 交换机 等 技术性 器件 的 安装 工作 2 | 研究 生命科学 \t 研究 生命 科学 3 | 研究生 命令 本科生 4 | 我 从 马 上 下来 5 | 我 马上 下来 6 | 北京 大学生 喝 进口 红酒 7 | 在 北京大学 生活区 喝 进口 红酒 8 | 从小 学 电脑 9 | 从 小学 毕业 10 | 美军 中将 竟 公然 说 11 | 新建 地铁 中 将 禁止 商业 摊点 12 | 这块 地 面积 还真 不小 13 | 地面 积了 厚厚 的 雪 14 | 让 我们 以 爱心 和 平等 来 对待 动物 15 | 阿美 首脑 会议 将 讨论 巴以 和平 等 问题 16 | 锌 合金 把手 的 相关 求购 信息 17 | 别 把 手 伸进 别人 的 口袋 里 18 | 将 信息 技术 应用 于 教学 实践 19 | 信息 技术 应用 于 教学 中 的 哪个 方面 20 | 上级 解除 了 他 的 职务 21 | 方程 的 解 除了 零 以外 还有 … 22 | 我们 一起 去 故宫 23 | 一起 恶性 交通 事故 24 | 我 不想 吃 东西 25 | 你 就 不 想想 26 | 各 国有 企业 相继 倒闭 27 | 各国 有 各国 的 困难 28 | 老人家 身体 不错 29 | 老人 家中 很 干净 30 | 和服 务必 归还 31 | 技术 和 服务 32 | 他 站 起 身 33 | 他 起身 去 北京 34 | 问题 的 确定 35 | 这的 确定 不 下来 36 | 结合 成分 37 | 为 人民 工作 38 | 中国 产品 质量 39 | 原子 结合 成 分子 时 40 | 部分 居民 生活 水平 41 | 治理 解放 大道 路面 积水 42 | 这样 的 人 才能 经受 住 考验 43 | 他俩 儿 谈 恋爱 是 从 头年 元月 开始 的 44 | 在 这些 企业 中 国有 企业 有 十个 45 | 结婚 的 和 尚未 结婚 的 46 | 热海 景区 47 | 热海 景区 + 48 | 崔永元 炮轰 范冰冰 49 | 这 源自 萧红 写给 萧军 信中 的 一句话 50 | 阿里 大华 腾讯 百度 51 | 亲家公 亲家母 52 | 情侣 们 在 海南岛 上 海誓山盟 53 | 在于 不断 提高 人们 信以为真 的 情感 纪实 的 能力 。 54 | 四川 发改委 发文 取缔 p2p 和 P2p 55 | 字节跳动 是 今日头条 的 母公司 56 | 今日头条 白嫖 东风快递 令人喷饭 勿谓言之不预也 白嫖 口区 弓虽 口丕 我酸了 祖安人 迷惑行为 57 | 5G 996 007 1118 35 120 251 nmsl nsdd wdnmd CSGO 58 | 唱跳 rap 篮球 鸡你太美 cxk 59 | 盘它 撞梗 融梗 雨女无瓜 要你寡 60 | 刺激战场 绝地求生 61 | 狼灭 狼火 狼炎 狼焱 灵魂八问 硬核 奥力给 有内味了 awsl 影流之主 巨魔之王 62 | 帝国主义 要 把 我们 的 地 瓜分 掉 63 | 小米 上半年 业绩 稳健 增长 ,Q2 净利 大 超 市场 预期 64 | 陈建仁 请 辞任 蔡英文 副手 人选 台 ” 中研院 : 祝福 65 | 车易拍 获 北汽产投 注资 接盘 66 | 我 家 门前 有 条 水沟 很难 过 67 | 中华 人民 共和国 68 | 郑州 天和 服装厂 69 | 完成 千万 元 天使轮 投资 找 米斗 从 B2B 交易 切入 70 | 环球网 评 共建 共知 共享 , 以 社会 治理 提升 人民 获得感 71 | 董监高 频换 , 公司 毛利 下降 -------------------------------------------------------------------------------- /test/other/pos_tagging_1998/compare_tags.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/14 17:05 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | tags_res = ['m', 'vn', 'v', 'Yg', 'Tg', 'l', 'p', 'nt', 'y', 'Rg', 'e', 'i', 'an', 'q', 'k', 'nr', 'Ag', 'n', 'vvn', 'd', 'f', 'ad', 'vd', 'z', 'Mg', 'nx', 'a', 'h', 's', 'u', 'na', 'Bg', 'j', 'w', 'Ng', 'o', 'nz', 'ns', 'b', 'Vg', 'Dg', 'r', 't', 'c'] 9 | # ['Rg', 'nt', 'Ng', 'm', 'u', 'nx', 'an', 'na', 'b', 'd', 'c', 'vd', 'j', 'ns', 'ad', 's', 'z', 'Mg', 'vn', 'l', 't', 'f', 'v', 'vvn', 'n', 'r', 'Tg', 'Dg', 'Bg', 'i', 'nr', 'k', 'q', 'o', 'a', 'w', 'e', 'h', 'p', 'y', 'nz', 'Ag', 'Yg', 'Vg'] 10 | 11 | tags_res = [tr.upper() for tr in tags_res] 12 | 13 | from macropodus.preprocess.tools_common import txt_read 14 | 15 | tag_jiagus = txt_read("data/tag_jiagu.txt") 16 | tag_jiebas = txt_read("data/tag_jieba.txt") 17 | 18 | tgu = [] 19 | for tag_jiagu in tag_jiagus: 20 | tags = tag_jiagu.split("\u3000") 21 | tag = tags[0].strip() 22 | tgu.append(tag.upper()) 23 | 24 | tga 
= [] 25 | for tag_jieba in tag_jiebas: 26 | tags = tag_jieba.split("\t") 27 | tag = tags[0].strip() 28 | tga.append(tag.upper()) 29 | 30 | tgus = [] 31 | tgas = [] 32 | for tr in tags_res: 33 | if tr.upper() not in tgu: 34 | tgus.append(tr.upper()) 35 | if tr.upper() not in tga: 36 | tgas.append(tr.upper()) 37 | 38 | tgus.sort() 39 | tgas.sort() 40 | print("jiagu: ") 41 | print(tgus) 42 | print("jieba: ") 43 | print(tgas) 44 | 45 | bbc = ['AG', 'B', 'BG', 'DG', 'E', 'H', 'I', 'J', 'K', 'L', 'MG', 'NA', 'NG', 'NX', 'O', 'RG', 'TG', 'VG', 'VVN', 'Y', 'YG', 'Z'] 46 | gg = 0 47 | -------------------------------------------------------------------------------- /macropodus/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/12 22:57 4 | # @author : Mo 5 | # @function: init of macropodus (tookit, keras of tensorflow) 6 | 7 | 8 | # macropodus 9 | from macropodus.tookit import calculate, chi2num, num2chi, Trie, roman2num, num2roman, pinyin, zh2han, han2zh 10 | from macropodus.segment import cut_bidirectional, cut_forward, cut_reverse, cut_search, cut_dag, cut, find 11 | from macropodus.segment import load_user_dict, save_delete_words, save_add_words, delete_word, add_word 12 | from macropodus.summarize import keyword, textrank, summarization 13 | from macropodus.version import __version__ # 版本 14 | from macropodus.similarity import sim 15 | import os 16 | 17 | # 机械分词 18 | cut_bidirectional = cut_bidirectional 19 | cut_forward = cut_forward 20 | cut_reverse = cut_reverse 21 | cut_search = cut_search 22 | cut_dag = cut_dag 23 | cut = cut 24 | 25 | # 用户词典操作 26 | load_user_dict = load_user_dict 27 | save_delete_words = save_delete_words # 保存到用户词典的 28 | save_add_words = save_add_words 29 | delete_word = delete_word 30 | add_word = add_word 31 | 32 | # 新词发现 33 | find = find 34 | 35 | # 文本相似度 36 | sim = sim 37 | 38 | # 文本摘要, 关键词 39 | keyword = keyword 40 | summarize = textrank 41 | summarization = summarization 42 | 43 | # 常用工具(tookit, 计算器, 中文与阿拉伯数字转化, 前缀树, 罗马数字与阿拉伯数字转化) 44 | calculate = calculate 45 | chi2num = chi2num 46 | num2chi = num2chi 47 | roman2num = roman2num 48 | num2roman = num2roman 49 | han2zh = han2zh 50 | zh2han = zh2han 51 | pinyin = pinyin 52 | 53 | if os.environ.get("macropodus_use_dl", False)=="1": 54 | from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects 55 | -------------------------------------------------------------------------------- /macropodus/conf/path_log.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 23:59 4 | # @author : Mo 5 | # @function: logger of macropodus 6 | 7 | 8 | from macropodus.conf.path_config import path_log_basic 9 | from logging.handlers import RotatingFileHandler 10 | import logging 11 | import time 12 | import os 13 | 14 | 15 | logger_level = logging.INFO 16 | # log目录地址 17 | path_logs = path_log_basic # + '/logs' 18 | if not os.path.exists(path_logs): 19 | os.mkdir(path_logs) 20 | # 全局日志格式 21 | logging.basicConfig(level=logger_level, 22 | format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s') 23 | # 定义一个日志记录器 24 | logger = logging.getLogger("macropodus") 25 | logger.setLevel(level = logger_level) 26 | # 日志文件名,为启动时的日期 27 | log_file_name = time.strftime('macropodus-%Y-%m-%d', time.localtime(time.time())) + ".log" 28 | log_name_day = os.path.join(path_logs, log_file_name) 29 | # 文件输出, 
定义一个RotatingFileHandler,最多备份32个日志文件,每个日志文件最大32K 30 | fHandler = RotatingFileHandler(log_name_day, maxBytes = 32*1024, backupCount = 32) 31 | fHandler.setLevel(logger_level) 32 | # 日志输出格式 33 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 34 | fHandler.setFormatter(formatter) 35 | # # 控制台输出 36 | # console = logging.StreamHandler() 37 | # console.setLevel(logger_level) 38 | # console.setFormatter(formatter) 39 | # logger加到handel里边 40 | logger.addHandler(fHandler) 41 | # logger.addHandler(console) 42 | 43 | 44 | def get_logger_root(name="macropodus"): 45 | """ 46 | 全局日志引用 47 | :param name: str, name of logger 48 | :return: object, logging 49 | """ 50 | return logging.getLogger(name) 51 | -------------------------------------------------------------------------------- /macropodus/tookit/number2roman/ri.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/2 9:14 4 | # @author : Mo 5 | # @function: 罗马数字与阿拉伯数字相互转化 6 | 7 | 8 | class RI: 9 | def __init__(self): 10 | self.algorithm = "roman2int" 11 | 12 | def roman2int(self, roman: str) -> int: 13 | """ 14 | 罗马数字转阿拉伯数字 15 | :param roman: str, like "IX" 16 | :return: int, like 9 17 | """ 18 | roman2int_dict = {'I': 1, 'IV': 4, 'V': 5, 'IX': 9, 19 | 'X': 10, 'XL': 40, 'L': 50, 'XC': 90, 20 | 'C': 100, 'CD': 400, 'D': 500, 'CM': 900, 21 | 'M': 1000} 22 | nums = 0 23 | while roman: 24 | if roman[0:2] in roman2int_dict.keys(): 25 | nums += roman2int_dict[roman[0:2]] 26 | roman = roman[2:] 27 | elif roman[0:1] in roman2int_dict.keys(): 28 | nums += roman2int_dict[roman[0:1]] 29 | roman = roman[1:] 30 | return nums 31 | 32 | def int2roman(self, num: int) -> str: 33 | """ 34 | 阿拉伯数字转罗马数字 35 | :param num: int, like 199 36 | :return: str, like "CXCIX" 37 | """ 38 | int2roman_dict = {1: 'I', 4: 'IV', 5: 'V', 9: 'IX', 39 | 10: 'X', 40: 'XL', 50: 'L', 90: 'XC', 40 | 100: 'C', 400: 'CD', 500: 'D', 900: 'CM', 1000: 'M'} 41 | res = "" 42 | for key in sorted(int2roman_dict.keys())[::-1]: 43 | if (num == 0): 44 | break 45 | tmp = num // key 46 | if (tmp == 0): 47 | continue 48 | res += int2roman_dict[key] * (tmp) 49 | num -= key * (tmp) 50 | return res 51 | 52 | 53 | if __name__ == '__main__': 54 | ri = RI() 55 | roman = "LVIII" # "IX" # "LVIII" 56 | num = 199 57 | res1 = ri.roman2int(roman) 58 | res2 = ri.int2roman(num) 59 | print(res1) 60 | print(res2) 61 | -------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/seg_forward.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:54 4 | # @author : Mo 5 | # @function: cut sentences of forward of maxlength 6 | 7 | 8 | from macropodus.base.seg_basic import SegBasic 9 | 10 | 11 | class SegForward(SegBasic): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def cut(self, sentence, len_max=7): 16 | """ 17 | 正向最大切词 18 | :param sentence: str, like '大漠帝国' 19 | :param len_max: int, like 32 20 | :return: yield 21 | """ 22 | len_sen = len(sentence) 23 | i = 0 24 | while i < len_sen: # while判断条件 25 | flag = False # flag标志位,确定有没有在字典里边的单字词或多字词 26 | for j in range(min(len_sen+1, i+len_max), -i, -1): # 遍历从当前字到句子末尾可能成词的部分, 从最后i+len_max算起 27 | word_maybe = sentence[i:j] # 正向可能成词的语 28 | if word_maybe in self.dict_words_freq: # 是否在字典里边 29 | i = j # 成词前标志i向后移动 30 | flag = True # flag标志位变化 31 | yield word_maybe 32 | break # 
成词则跳出循环 33 | if not flag: # 未选中后单个字的情况 34 | yield sentence[i] 35 | i += 1 36 | 37 | if __name__ == '__main__': 38 | sf = SegForward() 39 | sentence = "macropodus是啥子呢" 40 | sentence = "方程的解除了零以外还有…" 41 | print(list(sf.cut(sentence))) 42 | 43 | # 测试性能 44 | from macropodus.preprocess.tools_common import txt_read, txt_write 45 | from macropodus.conf.path_config import path_root 46 | import time 47 | 48 | path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt" 49 | sentences = txt_read(path_wordseg_a) 50 | 51 | time_start = time.time() 52 | count = 0 53 | for i in range(10000): 54 | for sen in sentences: 55 | # print(sen) 56 | count += 1 57 | res = sf.cut(sen) 58 | # print(list(res)) 59 | time_end = time.time() 60 | print(time_end - time_start) 61 | print(count/(time_end - time_start)) 62 | 63 | # 10000/0.17*50 = 2831272(line/s) 64 | 65 | 66 | -------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/seg_bidirectional.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:55 4 | # @author : Mo 5 | # @function: cut sentences of forward of reverse of maxlength 6 | 7 | 8 | from macropodus.segment.seg_statistics.seg_forward import SegForward 9 | from macropodus.segment.seg_statistics.seg_reverse import SegReverse 10 | 11 | 12 | class SegBidirectional(object): 13 | def __init__(self): 14 | self.seg_forward = SegForward() 15 | self.seg_reverse = SegReverse() 16 | 17 | def cut(self, sentence): 18 | """ 19 | 最大双向词典切词, 即最大正向切词与最大反向切词合并, 选择词数小的那个返回 20 | :param sentence: str 21 | :return: 22 | """ 23 | res_forward = self.seg_forward.cut(sentence) 24 | res_reverse = self.seg_reverse.cut(sentence) 25 | res_forward_list = list(res_forward) 26 | res_reverse_list = list(res_reverse) 27 | len_res_forward = len(res_forward_list) 28 | len_res_reverse = len(res_reverse_list) 29 | if len_res_forward >= len_res_reverse: 30 | for rrl in res_reverse_list: 31 | yield rrl 32 | else: 33 | for rfl in res_forward_list: 34 | yield rfl 35 | 36 | 37 | if __name__ == '__main__': 38 | sb = SegBidirectional() 39 | sentence = "研究生命科学研究生命科学" 40 | print(list(sb.cut(sentence))) 41 | 42 | # 测试性能 43 | from macropodus.preprocess.tools_common import txt_read, txt_write 44 | from macropodus.conf.path_config import path_root 45 | import time 46 | 47 | path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt" 48 | sentences = txt_read(path_wordseg_a) 49 | 50 | time_start = time.time() 51 | count = 0 52 | for i in range(10000): 53 | for sen in sentences: 54 | count += 1 55 | res = sb.cut(sen) 56 | # print(list(res)) 57 | time_end = time.time() 58 | print(time_end - time_start) 59 | print(count/(time_end - time_start)) 60 | # yield 61 | # 10000/0.17*50 = 2500*50 = 2896810(line/s) 62 | # 50000/0.90*50 = 2500000/20 = 2763600(line/s) -------------------------------------------------------------------------------- /macropodus/tookit/pinyin/pinyin.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/8 21:40 4 | # @author : Mo 5 | # @function: 汉字转拼音(zh2pinyin) 6 | 7 | 8 | from macropodus.preprocess.tools_common import re_zh_cn, load_json 9 | from macropodus.preprocess.tools_ml import macropodus_cut 10 | from macropodus.conf.path_config import path_dict_pinyin 11 | from collections import defaultdict 12 | 13 | 14 | class PinYin: 15 | 
def __init__(self): 16 | self.algorithm = "pinyin" 17 | self.dict_pinyin = defaultdict() 18 | self.load_pinyin_dict() 19 | 20 | def load_pinyin_dict(self): 21 | """ 22 | 加载默认的拼音pinyin字典 23 | :return: None 24 | """ 25 | dict_pinyin = load_json(path_dict_pinyin)[0] # 加载json字典文件 26 | for k, v in dict_pinyin.items(): 27 | self.dict_pinyin[k] = v 28 | 29 | def pinyin(self, text): 30 | """ 31 | 中文(大陆)转拼音 32 | :param text: str, like "大漠帝国" 33 | :return: list, like ["da", "mo", "di", "guo"] 34 | """ 35 | res_pinyin = [] 36 | # 只选择中文(zh), split筛选 37 | text_re = re_zh_cn.split(text) 38 | for tr in text_re: 39 | if re_zh_cn.match(tr): 40 | # 切词 41 | tr_cut = macropodus_cut(tr) 42 | for trc in tr_cut: # 切词后的词语 43 | # get words from dict of default 44 | trc_pinyin = self.dict_pinyin.get(trc) 45 | if trc_pinyin: res_pinyin += trc_pinyin 46 | else: # 单个字的问题 47 | for trc_ in trc: 48 | # get trem from dict of default 49 | trc_pinyin = self.dict_pinyin.get(trc_) 50 | if trc_pinyin: res_pinyin += trc_pinyin 51 | return res_pinyin 52 | 53 | 54 | if __name__ == "__main__": 55 | text = "macropodus是一种中国产的淡水鱼,广泛分布于两广地区,abcdefghijklmnopqrstuvwxyz" 56 | py = PinYin() 57 | res = py.pinyin(text) 58 | print(res) 59 | while True: 60 | print("请输入:") 61 | ques = input() 62 | print(py.pinyin(ques)) 63 | -------------------------------------------------------------------------------- /macropodus/summarize/graph_base/textrank_sklearn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/21 22:01 4 | # @author :Mo 5 | # @function : textrank using tfidf of sklearn, pagerank of networkx 6 | 7 | 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | from macropodus.preprocess.tools_ml import cut_sentence 10 | from macropodus.preprocess.tools_ml import tdidf_sim 11 | import networkx as nx 12 | 13 | 14 | class TextrankSklearn: 15 | def __init__(self): 16 | self.algorithm = 'textrank_sklearn' 17 | 18 | def summarize(self, text, num=320): 19 | # 切句 20 | if type(text) == str: 21 | sentences = cut_sentence(text) 22 | elif type(text) == list: 23 | sentences = text 24 | else: 25 | raise RuntimeError("text type must be list or str") 26 | # tf-idf相似度 27 | matrix = tdidf_sim(sentences) 28 | matrix_norm = TfidfTransformer().fit_transform(matrix) 29 | # 构建相似度矩阵 30 | tfidf_sim = nx.from_scipy_sparse_matrix(matrix_norm * matrix_norm.T) 31 | # nx.pagerank 32 | sens_scores = nx.pagerank(tfidf_sim) 33 | # 得分排序 34 | sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True) 35 | # 保留topk个, 防止越界 36 | topk = min(len(sentences), num) 37 | # 返回原句子和得分 38 | return [(sr[1], sentences[sr[0]]) for sr in sen_rank][0:topk] 39 | 40 | 41 | if __name__ == '__main__': 42 | doc = "是上世纪90年代末提出的一种计算网页权重的算法。" \ 43 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \ 44 | "业界急需一种相对比较准确的网页重要性计算方法," \ 45 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 46 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 47 | "Google把从A页面到B页面的链接解释为A页面给B页面投票," \ 48 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面" \ 49 | "和投票目标的等级来决定新的等级。简单的说," \ 50 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 51 | "PageRank The PageRank Citation Ranking: Bringing Order to the Web,"\ 52 | "具体说来就是,PageRank有两个基本思想,也可以说是假设," \ 53 | "即数量假设:一个网页被越多的其他页面链接,就越重);" \ 54 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 55 | "总的来说就是一句话,从全局角度考虑,获取重要的信息。" 56 | doc = doc.encode('utf-8').decode('utf-8') 57 | ts = TextrankSklearn() 58 | textrank_tfidf = ts.summarize(doc, 32) 59 | for score_sen in textrank_tfidf: 60 | print(score_sen) 61 | 
-------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/seg_reverse.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:54 4 | # @author : Mo 5 | # @function: cut sentences of reverse of maxlength 6 | 7 | 8 | from macropodus.base.seg_basic import SegBasic 9 | 10 | 11 | class SegReverse(SegBasic): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def cut(self, sentence, len_max=7): 16 | """ 17 | 反向最大切词 18 | :param sentence: str, like '大漠帝国' 19 | :param len_max: int, like 32 20 | :return: yield 21 | """ 22 | len_sen = len(sentence) 23 | i = len_sen 24 | res = [] 25 | while i > 0: # while判断条件 26 | flag = False # flag标志位,确定有没有在字典里边的单字词或多字词 27 | for j in range(max(0, i - len_max), i): # 遍历从句子末尾向前可能成词的部分, 从最后i-len_max算起 28 | word_maybe = sentence[j:i] # 正向可能成词的语 29 | if word_maybe in self.dict_words_freq: # 是否在字典里边 30 | i = j # 成词前标志i向后移动 31 | flag = True # flag标志位变化 32 | res.append(word_maybe) 33 | # yield word_maybe 34 | break # 成词则跳出循环 35 | if not flag: # 未选中后单个字的情况 36 | i -= 1 37 | # yield sentence[i] 38 | res.append(sentence[i]) 39 | for i in range(len(res)-1, 0, -1): 40 | yield res[i] 41 | # return res 42 | 43 | 44 | if __name__ == '__main__': 45 | a = max(0,5) 46 | sf = SegReverse() 47 | sentence = "研究生命科学\t研究 生命 科学" 48 | print(list(sf.cut(sentence))) 49 | print(list(sf.cut(""))) 50 | 51 | # 测试性能 52 | from macropodus.preprocess.tools_common import txt_read, txt_write 53 | from macropodus.conf.path_config import path_root 54 | import time 55 | path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt" 56 | sentences = txt_read(path_wordseg_a) 57 | 58 | time_start = time.time() 59 | count = 0 60 | for i in range(50000): 61 | for sen in sentences: 62 | # print(sen) 63 | count += 1 64 | res = (sf.cut(sen)) 65 | # print(res) 66 | time_end = time.time() 67 | print(time_end-time_start) 68 | print(count/(time_end - time_start)) 69 | 70 | # 10000/0.18*50 = 2500*50 = 2784226(line/s) 71 | # 50000/0.98*50 = 2500000/20 = 2550109(line/s) 72 | 73 | -------------------------------------------------------------------------------- /macropodus/similarity/similarity_word2vec_char.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 14:50 4 | # @author : Mo 5 | # @function: similarity of sentence of word2vec 6 | 7 | 8 | from macropodus.base.word2vec import W2v 9 | 10 | 11 | class SimW2vChar(W2v): 12 | def __init__(self, use_cache=True): 13 | super().__init__(use_cache) 14 | 15 | def encode(self, sent, type_encode="other"): 16 | """ 17 | 生成句向量, 字符级别, char 18 | :param sent: str, like "大漠帝国" 19 | :param type_encode: str, like "avg", "other" 20 | :return: vector 21 | """ 22 | sentence_vec = self.w2v_char.wv[self.w2v_char.index2word[1]] * 0 23 | len_sent = len(sent) 24 | for i in range(len_sent): 25 | word = sent[i] 26 | try: 27 | sentence_vec = sentence_vec + self.w2v_char.wv[word] 28 | except Exception as e: 29 | sentence_vec = sentence_vec + 0.01 # unknow_know词加0.01 30 | if type_encode == "avg": 31 | sentence_vec = sentence_vec / len_sent 32 | return sentence_vec 33 | 34 | def similarity(self, sent1, sent2, type_sim="total", type_encode="avg"): 35 | """ 36 | 相似度计算, 默认余弦相似度+jaccard相似度 37 | :param sen1: str, like "大漠帝国" 38 | :param sen2: str, like "Macropodus" 39 | :param type_sim: str, like "total" or 
"cosine" 40 | :param type_encode: str, like "other" or "avg" 41 | :return: float, like 0.998 42 | """ 43 | if sent1 and sent2: 44 | encode_sen1 = self.encode(sent1, type_encode) 45 | encode_sen2 = self.encode(sent2, type_encode) 46 | score_res = self.cosine(encode_sen1, encode_sen2) 47 | else: 48 | score_res = 0.0 49 | if type_sim=="total": 50 | score_jaccard = self.jaccard(sent1, sent2) 51 | score_res = (score_res + score_jaccard)/2 52 | return score_res 53 | 54 | 55 | if __name__ == '__main__': 56 | 57 | sent1 = "大漠帝国" 58 | sent2 = "macropodus" 59 | swc = SimW2vChar(use_cache=True) 60 | sen_encede = swc.encode(sent1) 61 | score = swc.similarity(sent1, sent2) 62 | print(score) 63 | gg = 0 64 | while True: 65 | print("请输入sent1:") 66 | sent1 = input() 67 | print("请输入sent2:") 68 | sent2 = input() 69 | print(swc.similarity(sent1, sent2)) 70 | -------------------------------------------------------------------------------- /macropodus/summarize/nous_base/lead_3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/24 22:43 4 | # @author :Mo 5 | # @function :text_summary with lead-3 6 | 7 | 8 | from macropodus.preprocess.tools_ml import cut_sentence 9 | 10 | 11 | class Lead3Sum: 12 | def __init__(self): 13 | self.algorithm = 'lead_3' 14 | 15 | def summarize(self, text, num=320, type_l='mix'): 16 | """ 17 | lead-s 18 | :param sentences: list 19 | :param type: str, you can choose 'begin', 'end' or 'mix' 20 | :return: list 21 | """ 22 | # 切句 23 | if type(text) == str: 24 | sentences = cut_sentence(text) 25 | elif type(text) == list: 26 | sentences = text 27 | else: 28 | raise RuntimeError("text type must be list or str") 29 | # 最小句子数 30 | num_min = min(num, len(sentences)) 31 | if type_l=='begin': 32 | summers = sentences[0:num] 33 | elif type_l=='end': 34 | summers = sentences[-num:] 35 | else: 36 | summers = [sentences[0]] + [sentences[-1]] + sentences[1:num-1] 37 | summers_s = {} 38 | for i in range(len(summers)): # 得分计算 39 | if len(summers) - i == 1: 40 | summers_s[summers[i]] = (num - 0.75) / (num + 1) 41 | else: 42 | summers_s[summers[i]] = (num - i - 0.5) / (num + 1) 43 | score_sen = [(rc[1], rc[0]) for rc in sorted(summers_s.items(), key=lambda d: d[1], reverse=True)][0:num_min] 44 | return score_sen 45 | 46 | 47 | if __name__ == '__main__': 48 | doc = "是上世纪90年代末提出的一种计算网页权重的算法。" \ 49 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \ 50 | "业界急需一种相对比较准确的网页重要性计算方法," \ 51 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 52 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 53 | "Google把从A页面到B页面的链接解释为A页面给B页面投票," \ 54 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面" \ 55 | "和投票目标的等级来决定新的等级。简单的说," \ 56 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 57 | "PageRank The PageRank Citation Ranking: Bringing Order to the Web,"\ 58 | "具体说来就是,PageRank有两个基本思想,也可以说是假设," \ 59 | "即数量假设:一个网页被越多的其他页面链接,就越重);" \ 60 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 61 | "总的来说就是一句话,从全局角度考虑,获取重要的信息。" 62 | text = doc.encode('utf-8').decode('utf-8') 63 | l3 = Lead3Sum() 64 | for score_sen in l3.summarize(text, type_l='mix', num=320): 65 | print(score_sen) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 
| parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /macropodus/network/graph/crf.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/9 21:44 4 | # @author : Mo 5 | # @function: CRF 6 | 7 | 8 | from macropodus.network.base.graph import graph 9 | from macropodus.network.layers.crf import CRF 10 | import tensorflow as tf 11 | 12 | 13 | class CRFGraph(graph): 14 | def __init__(self, hyper_parameters): 15 | """ 16 | 初始化 17 | :param hyper_parameters: json,超参 18 | """ 19 | self.crf_mode = hyper_parameters["model"].get("crf_mode", "reg") # "reg", pad 20 | self.supports_masking = hyper_parameters["model"].get("supports_masking", True) # True or False 21 | super().__init__(hyper_parameters) 22 | 23 | def create_model(self, hyper_parameters): 24 | """ 25 | 构建神经网络 26 | :param hyper_parameters:json, hyper parameters of network 27 | :return: tensor, moedl 28 | """ 29 | super().create_model(hyper_parameters) 30 | x = self.word_embedding.output 31 | # TimeDistributed 32 | x_64 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(128, activation="softmax"), 33 | name='layer_time_distributed')(x) 34 | # dense to a smaller units 35 | tensor = tf.keras.layers.Dense(units=self.label, activation=self.activate_rnn, name="layer_dense_64")(x_64) 36 | # crf, 
"pad" or "reg" 37 | if self.crf_mode == "pad": 38 | # length of real sentence 39 | x_mask = tf.keras.layers.Input(shape=(1), dtype=tf.int32) 40 | self.crf = CRF(self.label, mode="pad", supports_masking=True, name="layer_crf") 41 | self.output = self.crf([tensor, x_mask]) 42 | if self.embedding_type in ["bert", "albert"]: 43 | self.inputs = [self.word_embedding.input[0], self.word_embedding.input[1], x_mask] 44 | else: 45 | self.inputs = [self.word_embedding.input, x_mask] 46 | else: 47 | self.crf = CRF(self.label, mode="reg", name="layer_crf") 48 | self.output = self.crf(tensor) 49 | self.inputs = self.word_embedding.input 50 | self.model = tf.keras.Model(self.inputs, self.output) 51 | self.model.summary(132) 52 | 53 | def create_compile(self): 54 | """ 55 | 构建优化器、损失函数和评价函数 56 | :return: 57 | """ 58 | self.loss = self.crf.loss 59 | self.metrics = self.crf.viterbi_accuracy 60 | super().create_compile() 61 | -------------------------------------------------------------------------------- /macropodus/base/word2vec.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:52 4 | # @author : Mo 5 | # @function: word2vec of gensim 6 | 7 | 8 | from macropodus.conf.path_config import path_embedding_word2vec_char, path_macropodus_w2v_char_cache 9 | from macropodus.conf.path_log import get_logger_root 10 | import numpy as np 11 | import gensim 12 | import pickle 13 | import time 14 | import os 15 | 16 | 17 | logger = get_logger_root() 18 | gensim.logger.level=40 # gensim只打印ERROR信息等 19 | logger.info("path of w2v cache is {}!".format(path_macropodus_w2v_char_cache)) 20 | 21 | 22 | class W2v: 23 | def __init__(self, use_cache=True): 24 | # time_start = time.time() 25 | # 存在缓存则直接读取, 序列化加速缓存读取速度 26 | if use_cache and os.path.exists(path_macropodus_w2v_char_cache): 27 | with open(path_macropodus_w2v_char_cache, "rb") as fpmc: 28 | self.w2v_char= pickle.load(fpmc) 29 | fpmc.close() 30 | # logger.info("word2vec: " + str(time.time() - time_start)) # 0.12 31 | else: 32 | # gensim加载词向量 33 | self.w2v_char = gensim.models.KeyedVectors.load_word2vec_format(path_embedding_word2vec_char) 34 | # logger.info("word2vec: " + str(time.time() - time_start)) # 0.99, 0.78 35 | # 第一次跑macropodus, 序列化需要的缓存 36 | if use_cache and not os.path.exists(path_macropodus_w2v_char_cache): 37 | with open(path_macropodus_w2v_char_cache, "wb") as fpmc: 38 | pickle.dump(self.w2v_char, fpmc) 39 | 40 | def cosine(self, sen_1, sen_2): 41 | """ 42 | 余弦距离 43 | :param sen_1: numpy.array 44 | :param sen_2: numpy.array 45 | :return: float, like 0.0 46 | """ 47 | if sen_1.all() and sen_2.all(): 48 | return np.dot(sen_1, sen_2) / (np.linalg.norm(sen_1) * np.linalg.norm(sen_2)) 49 | else: 50 | return 0.0 51 | 52 | def jaccard(self, sen_1, sen_2): 53 | """ 54 | jaccard距离 55 | :param sen1: str, like "大漠帝国" 56 | :param sen2: str, like "Macropodus" 57 | :return: float, like 0.998 58 | """ 59 | try: 60 | sent_intersection = list(set(list(sen_1)).intersection(set(list(sen_2)))) 61 | sent_union = list(set(list(sen_1)).union(set(list(sen_2)))) 62 | score_jaccard = float(len(sent_intersection) / len(sent_union)) 63 | except: 64 | score_jaccard = 0.0 65 | return score_jaccard 66 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/12/30 22:17 4 | # @author :Mo 5 | # 
@function :setup of Macropodus 6 | # @codes :fix it and copy reference from https://github.com/TianWenQAQ/Kashgari/blob/master/setup.py 7 | 8 | 9 | from macropodus.version import __version__ 10 | from setuptools import find_packages, setup 11 | import codecs 12 | 13 | 14 | # Package meta-data. 15 | NAME = 'Macropodus' 16 | DESCRIPTION = 'Macropodus: Tookit of Chinese Natural Language Processing' 17 | URL = 'https://github.com/yongzhuo/Macropodus' 18 | EMAIL = '1903865025@qq.com' 19 | AUTHOR = 'yongzhuo' 20 | LICENSE = 'MIT' 21 | 22 | with codecs.open('README.md', 'r', 'utf8') as reader: 23 | long_description = "\n".join(reader.readlines()) 24 | with codecs.open('requirements.txt', 'r', 'utf8') as reader: 25 | install_requires = list(map(lambda x: x.strip(), reader.readlines())) 26 | 27 | setup(name=NAME, 28 | version=__version__, 29 | description=DESCRIPTION, 30 | long_description=long_description, 31 | long_description_content_type="text/markdown", 32 | author=AUTHOR, 33 | author_email=EMAIL, 34 | url=URL, 35 | packages=find_packages(), # (exclude=('test')), 36 | package_data={'macropodus': ['*.*', 'data/*', 'data/dict/*', 37 | 'data/embedding/*', 'data/embedding/word2vec/*', 38 | 'data/model/*'] 39 | }, 40 | install_requires=install_requires, 41 | license=LICENSE, 42 | classifiers=['License :: OSI Approved :: MIT License', 43 | 'Programming Language :: Python :: 3.5', 44 | 'Programming Language :: Python :: 3.6', 45 | 'Programming Language :: Python :: 3.7', 46 | 'Programming Language :: Python :: 3.8', 47 | 'Programming Language :: Python :: 3.9', 48 | 'Programming Language :: Python :: Implementation :: CPython', 49 | 'Programming Language :: Python :: Implementation :: PyPy'], 50 | ) 51 | 52 | 53 | if __name__ == "__main__": 54 | print("setup ok!") 55 | 56 | # 说明, tensorflow>=1.13.0 or tensorflow-gpu>=1.13.0 57 | # 项目工程目录这里Macropodus, 实际上, 下边还要有一层macropodus, 也就是说, macropodus和setup同一层 58 | # data包里必须要有__init__.py, 否则文件不会生成, .py文件才能copy 59 | 60 | # anaconda3创建环境 61 | # conda remove -n py35 --all 62 | # conda create -n py351 python=3.5 63 | 64 | # 编译的2种方案: 65 | 66 | # 方案一 67 | # 打开cmd 68 | # 到达安装目录 69 | # python setup.py build 70 | # python setup.py install 71 | 72 | # 方案二 73 | # python setup.py bdist_wheel --universal 74 | # twine upload dist/* 75 | 76 | 77 | -------------------------------------------------------------------------------- /macropodus/network/graph/bilstm.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/19 22:30 4 | # @author : Mo 5 | # @function: Bi-LSTM 6 | 7 | 8 | from macropodus.network.base.graph import graph 9 | import tensorflow as tf 10 | 11 | 12 | class BiLSTMGraph(graph): 13 | def __init__(self, hyper_parameters): 14 | """ 15 | 初始化 16 | :param hyper_parameters: json,超参 17 | """ 18 | self.filters = hyper_parameters['model'].get('filters', [2, 3, 4]) 19 | self.num_rnn_layers = hyper_parameters['model'].get('num_rnn_layers', 1) 20 | self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM') 21 | self.rnn_units = hyper_parameters['model'].get('rnn_units', 256) 22 | super().__init__(hyper_parameters) 23 | 24 | def create_model(self, hyper_parameters): 25 | """ 26 | 构建神经网络 27 | :param hyper_parameters:json, hyper parameters of network 28 | :return: tensor, moedl 29 | """ 30 | super().create_model(hyper_parameters) 31 | self.rnn_layer = {'LSTM':tf.keras.layers.LSTM, 'GRU':tf.keras.layers.GRU}[self.rnn_type] 32 | embedding = self.word_embedding.output 33 
| # 提取n-gram特征和最大池化, 一般不用平均池化 34 | conv_pools = [embedding] 35 | for filter in self.filters: 36 | conv = tf.keras.layers.Conv1D(filters=self.filters_num, 37 | kernel_size=filter, 38 | padding='same', 39 | kernel_initializer='normal', 40 | activation='relu', )(embedding) 41 | pooled = tf.keras.layers.MaxPool1D(pool_size=2, 42 | strides=1, 43 | padding='same', )(conv) 44 | conv_pools.append(pooled) 45 | # 拼接 46 | x = tf.keras.layers.Concatenate(axis=-1)(conv_pools) 47 | # Bi-LSTM 48 | for nrl in range(self.num_rnn_layers): 49 | x = tf.keras.layers.Bidirectional(self.rnn_layer(units=self.rnn_units, 50 | return_sequences=True, 51 | activation=self.activate_rnn, 52 | kernel_regularizer=tf.keras.regularizers.l2(self.l2), 53 | recurrent_regularizer=tf.keras.regularizers.l2(self.l2) 54 | ))(x) 55 | x = tf.keras.layers.Dropout(self.dropout)(x) 56 | x = tf.keras.layers.Dense(self.label, activation=self.activate_classify, name='layer_dense_3')(x) 57 | self.output = x 58 | self.model = tf.keras.Model(self.word_embedding.input, self.output) 59 | self.model.summary(132) 60 | -------------------------------------------------------------------------------- /macropodus/conf/path_config.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 23:59 4 | # @author : Mo 5 | # @function: path of macropodus 6 | 7 | 8 | import sys 9 | import os 10 | path_root = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) 11 | sys.path.append(path_root) 12 | 13 | 14 | # path of basic of segnment 15 | path_dict_macropodus = os.path.join(path_root, "data/dict/macropodus.dict") 16 | path_dict_user = os.path.join(path_root, "data/dict/user.dict") 17 | path_log_basic = os.path.join(path_root, "logs") 18 | 19 | # path of cache 20 | path_macropodus_w2v_char_cache = os.path.join(path_root, 'data/cache/word2vec_char.cache') 21 | path_macropodus_dict_freq_cache = os.path.join(path_root, 'data/cache/macropodus.cache') 22 | 23 | # path of basic of tookit 24 | path_dict_pinyin = os.path.join(path_root, "data/dict/pinyin.dict") 25 | path_dict_zh2han = os.path.join(path_root, "data/dict/zh2han.dict") 26 | 27 | # path of embedding 28 | path_embedding_word2vec_char = os.path.join(path_root, 'data/embedding/word2vec/w2v_model_wiki_char.vec') 29 | path_embedding_bert = os.path.join(path_root, 'data/embedding/chinese_L-12_H-768_A-12/') 30 | path_embedding_random_char = os.path.join(path_root, 'data/embedding/term_char.txt') 31 | path_embedding_random_word = os.path.join(path_root, 'data/embedding/term_word.txt') 32 | path_embedding_albert = os.path.join(path_root, 'data/embedding/albert_base_zh') 33 | 34 | # path of train data of ner people 1998 35 | path_ner_people_1998_train = os.path.join(path_root, "data/corpus/ner_people_1998/train.json") 36 | path_ner_people_1998_valid = os.path.join(path_root, "data/corpus/ner_people_1998/dev.json") 37 | # path of train data of seg pku 1998 38 | path_seg_pku_1998_train = os.path.join(path_root, "data/corpus/seg_pku_1998/train.json") 39 | path_seg_pku_1998_bi_train = os.path.join(path_root, "data/corpus/seg_pku_1998/train_BI_126.json") 40 | # path of train data of tag people 1998 41 | path_tag_people_1998_train = os.path.join(path_root, "data/corpus/tag_people_1998/train.json") 42 | # path of train data of tag people 2014 43 | path_tag_people_2014_train = os.path.join(path_root, "data/corpus/tag_people_2014/train.json") 44 | path_tag_people_2014_valid = os.path.join(path_root, 
"data/corpus/tag_people_2014/dev.json") 45 | # path of ccks_2020 46 | path_ccks_2020 = os.path.join(path_root, "data/ccks_8_data_v2_ner") 47 | 48 | path_ccks_2020_ner = os.path.join(path_root, "data/ccks_8_data_v2_ner/ccks_2020_ner.json") 49 | path_ccks_2020_ner_train = os.path.join(path_root, "data/ccks_8_data_v2_ner/train.json") 50 | path_ccks_2020_ner_dev = os.path.join(path_root, "data/ccks_8_data_v2_ner/dev.json") 51 | 52 | # path of training model save dir 53 | path_model_dir = os.path.join(path_root, "data", "model") 54 | path_hyper_parameters = os.path.join(path_model_dir, "params.json") 55 | path_model_l2i_i2l = os.path.join(path_model_dir, "l2i_i2l.json") 56 | path_fineture = os.path.join(path_model_dir, "embedding.h5") 57 | path_model = os.path.join(path_model_dir, "model.h5") 58 | -------------------------------------------------------------------------------- /macropodus/summarize/graph_base/textrank.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/29 22:39 4 | # @author :Mo 5 | # @function :textrank of textrank4zh, sklearn or gensim 6 | 7 | 8 | from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec 9 | from macropodus.summarize.graph_base.textrank_gensim import TextrankGensimSum 10 | from macropodus.summarize.graph_base.textrank_sklearn import TextrankSklearn 11 | import os 12 | 13 | # 词向量, 默认使用缓存 14 | use_cache = True 15 | if not os.environ.get("macropodus_use_w2v_cache", True): 16 | use_cache = False # 不使用缓存,重新加载 17 | # textrank of gensim 18 | trgs = TextrankGensimSum() 19 | # textrank of word2vec 20 | trwv = TextrankWord2vec() 21 | # textrank of sklearn 22 | trsk = TextrankSklearn() 23 | 24 | 25 | class TextRankSum: 26 | def __init__(self): 27 | self.algorithm = 'textrank' 28 | 29 | def summarize(self, text, num=6, model_type="textrank_word2vec"): 30 | """ 31 | 文本摘要 32 | :param text:str, like "你好!大漠帝国!" 33 | :param num: int, like 3 34 | :param model_type: str, like "textrank_sklearn" 35 | :return: list 36 | """ 37 | if model_type=="textrank_sklearn": 38 | res = trsk.summarize(text, num=num) 39 | elif model_type=="textrank_gensim": 40 | res = trgs.summarize(text, num=num) 41 | elif model_type=="textrank_word2vec": 42 | res = trwv.summarize(text, num=num) 43 | else: 44 | raise RuntimeError(" model_type must be 'textrank_textrank4zh', 'text_rank_sklearn' or 'textrank_gensim' ") 45 | 46 | return res 47 | 48 | 49 | class TextRankKey: 50 | def __init__(self): 51 | self.algorithm = 'keyword' 52 | 53 | def keyword(self, text, num=6, score_min=0.025, model_type="keywor_word2vec"): 54 | if model_type=="keywor_word2vec": 55 | res = trwv.keyword(text, num=num, score_min=score_min) 56 | else: 57 | raise RuntimeError(" model_type must be 'keywor_word2vec'") 58 | 59 | return res 60 | 61 | 62 | 63 | if __name__ == '__main__': 64 | 65 | doc = "和投票目标的等级来决定新的等级.简单的说。" \ 66 | "是上世纪90年代末提出的一种计算网页权重的算法!" 
\ 67 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \ 68 | "业界急需一种相对比较准确的网页重要性计算方法。" \ 69 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 70 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 71 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。" \ 72 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \ 73 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 74 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。" \ 75 | "即数量假设:一个网页被越多的其他页面链接,就越重)。" \ 76 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 77 | "总的来说就是一句话,从全局角度考虑,获取重要的信。" 78 | 79 | text = doc.encode('utf-8').decode('utf-8') 80 | 81 | tr = TextRankSum() 82 | kw = TextRankKey() 83 | score_ques = tr.summarize(text, num=100, model_type="textrank_gensim") # "text_rank_sklearn") 84 | for sq in score_ques: 85 | print(sq) 86 | 87 | score_ques = kw.keyword(text, num=100, model_type="keywor_word2vec") # "text_rank_sklearn") 88 | for sq in score_ques: 89 | print(sq) 90 | -------------------------------------------------------------------------------- /macropodus/__init_tf_keras.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/20 22:22 4 | # @author : Mo 5 | # @function: init of keras of tensorflow 6 | 7 | 8 | from macropodus.conf.path_log import get_logger_root 9 | 10 | 11 | logger = get_logger_root() 12 | 13 | 14 | try: 15 | #####################(tensorflow, keras)############################ 16 | import sys 17 | import os 18 | 19 | path_root = os.path.abspath(os.path.dirname(__file__)) 20 | sys.path.append(path_root) # 环境引入根目录 21 | # 默认cpu环境, tensorflow 22 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 25 | os.environ['TF_KERAS'] = '1' 26 | 27 | # tensorflow.python.keras 28 | from macropodus.network.service.server_prdeict import AlbertBilstmPredict 29 | from keras_adaptive_softmax import AdaptiveEmbedding, AdaptiveSoftmax 30 | from macropodus.network.layers.non_mask_layer import NonMaskingLayer 31 | from macropodus.conf.path_config import path_model_dir 32 | from macropodus.network.layers.crf import CRF 33 | import tensorflow.python.keras as keras 34 | import tensorflow as tf 35 | import keras_bert 36 | 37 | 38 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 39 | 40 | # custom_objects 41 | custom_objects = keras_bert.get_custom_objects() 42 | custom_objects['AdaptiveEmbedding'] = AdaptiveEmbedding 43 | custom_objects['AdaptiveSoftmax'] = AdaptiveSoftmax 44 | custom_objects['NonMaskingLayer'] = NonMaskingLayer 45 | custom_objects['CRF'] = CRF 46 | 47 | # init model of dl(deep learning) 48 | # 加载训练好的模型, 命名实体提取 49 | try: 50 | path_ner_albert_bilstm_crf = os.path.join(path_model_dir, 'ner_albert_people_1998') 51 | ner_albert_bilstm_crf = AlbertBilstmPredict(path_ner_albert_bilstm_crf, custom_objects) 52 | ner = ner_albert_bilstm_crf.predict_single 53 | ners = ner_albert_bilstm_crf.predict 54 | except Exception as e: 55 | logger.info(str(e)) 56 | 57 | # 加载训练好的模型, 词性标注 58 | try: 59 | path_tag_albert_bilstm_crf = os.path.join(path_model_dir, 'tag_albert_people_1998') 60 | tag_albert_bilstm_crf = AlbertBilstmPredict(path_tag_albert_bilstm_crf, custom_objects) 61 | postag = tag_albert_bilstm_crf.pos_tag 62 | postags = tag_albert_bilstm_crf.pos_tags 63 | except Exception as e: 64 | logger.info(str(e)) 65 | # # layers 66 | # preprocessing = keras.preprocessing 67 | # applications = keras.applications 68 | # regularizers = keras.regularizers 69 | # initializers = keras.initializers 70 | # activations = keras.activations 71 | # constraints = 
keras.constraints 72 | # optimizers = keras.optimizers 73 | # callbacks = keras.callbacks 74 | # datasets = keras.datasets 75 | # wrappers = keras.wrappers 76 | # metrics = keras.metrics 77 | # backend = keras.backend 78 | # engine = keras.engine 79 | # layers = keras.layers 80 | # models = keras.models 81 | # losses = keras.losses 82 | # utils = keras.utils 83 | except Exception as e: 84 | logger.info(str(e)) 85 | -------------------------------------------------------------------------------- /macropodus/network/layers/keras_lookahead.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/12 21:14 4 | # @author : Mo 5 | # @function: lookahead of keras 6 | # @codefrom: https://github.com/bojone/keras_lookahead 7 | 8 | 9 | import tensorflow.python.keras.backend as K 10 | 11 | 12 | class Lookahead(object): 13 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 14 | """ 15 | 16 | def __init__(self, k=5, alpha=0.5): 17 | self.k = k 18 | self.alpha = alpha 19 | self.count = 0 20 | 21 | def inject(self, model): 22 | """Inject the Lookahead algorithm for the given model. 23 | The following code is modified from keras's _make_train_function method. 24 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 25 | """ 26 | if not hasattr(model, 'train_function'): 27 | raise RuntimeError('You must compile your model before using it.') 28 | 29 | model._check_trainable_weights_consistency() 30 | 31 | if model.train_function is None: 32 | inputs = (model._feed_inputs + 33 | model._feed_targets + 34 | model._feed_sample_weights) 35 | if model._uses_dynamic_learning_phase(): 36 | inputs += [K.learning_phase()] 37 | fast_params = model._collected_trainable_weights 38 | 39 | with K.name_scope('training'): 40 | with K.name_scope(model.optimizer.__class__.__name__): 41 | training_updates = model.optimizer.get_updates( 42 | params=fast_params, 43 | loss=model.total_loss) 44 | slow_params = [K.variable(p) for p in fast_params] 45 | fast_updates = (model.updates + 46 | training_updates + 47 | model.metrics_updates) 48 | 49 | slow_updates, copy_updates = [], [] 50 | for p, q in zip(fast_params, slow_params): 51 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 52 | copy_updates.append(K.update(p, q)) 53 | 54 | # Gets loss and metrics. Updates weights at each call. 
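            # Lookahead bookkeeping (see slow_updates / copy_updates built above):
            #   - the fast weights p are updated by the wrapped optimizer on every batch;
            #   - every self.k calls, the slow weights q move towards p: q <- q + alpha * (p - q),
            #     and the fast weights p are then reset to q.
            # The K.function below only runs the fast updates; F() further down triggers the
            # slow/copy updates once every self.k calls.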
55 | fast_train_function = K.function( 56 | inputs, 57 | [model.total_loss] + model.metrics_tensors, 58 | updates=fast_updates, 59 | name='fast_train_function', 60 | **model._function_kwargs) 61 | 62 | def F(inputs): 63 | self.count += 1 64 | R = fast_train_function(inputs) 65 | if self.count % self.k == 0: 66 | K.batch_get_value(slow_updates) 67 | K.batch_get_value(copy_updates) 68 | return R 69 | 70 | model.train_function = F 71 | 72 | if __name__ == '__main__': 73 | gg = 0 74 | # # useage 75 | # model.compile(optimizer=Adam(1e-3), loss='mse') # Any optimizer 76 | # lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead 77 | # lookahead.inject(model) # add into model 78 | -------------------------------------------------------------------------------- /macropodus/summarize/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 22:10 4 | # @author : Mo 5 | # @function: text summarize 6 | 7 | 8 | # text_summarize of extractive 9 | from macropodus.summarize.feature_base.word_significance import WordSignificanceSum 10 | from macropodus.summarize.feature_base.text_pronouns import TextPronounsSum 11 | from macropodus.summarize.graph_base.textrank import TextRankSum, TextRankKey 12 | from macropodus.summarize.feature_base.text_teaser import TextTeaserSum 13 | from macropodus.summarize.feature_base.mmr import MMRSum 14 | 15 | from macropodus.summarize.topic_base.topic_lda import LDASum 16 | from macropodus.summarize.topic_base.topic_lsi import LSISum 17 | from macropodus.summarize.topic_base.topic_nmf import NMFSum 18 | 19 | from macropodus.summarize.nous_base.lead_3 import Lead3Sum 20 | 21 | # feature 22 | wss = WordSignificanceSum() 23 | tps = TextPronounsSum() 24 | tts = TextTeaserSum() 25 | mms = MMRSum() 26 | 27 | # graph-3 28 | trs = TextRankSum() 29 | trk = TextRankKey() 30 | 31 | # nous 32 | l3s = Lead3Sum() 33 | 34 | # topic 35 | lds = LDASum() 36 | lss = LSISum() 37 | nms = NMFSum() 38 | 39 | # summarization 40 | text_pronouns = tps.summarize 41 | text_teaser = tts.summarize 42 | word_sign = wss.summarize 43 | textrank = trs.summarize 44 | lead3 = l3s.summarize 45 | mmr = mms.summarize 46 | lda = lds.summarize 47 | lsi = lss.summarize 48 | nmf = nms.summarize 49 | 50 | # keyword 51 | keyword = trk.keyword 52 | 53 | # 函数接口 54 | def summarization(text, num=320, type_summarize="lda", topic_min=6, judge_topic=False, alpha=0.6, type_l='mix', model_type="textrank_sklearn", title=None): 55 | """ 56 | 文本摘要汇总 57 | :param text: str, like "你是。大漠帝国。不是吧错了。哈哈。我的。" 58 | :param num: int, like 32 59 | :param type_summarize: str, like "lda", must in ['text_pronouns', 'text_teaser', 'word_sign', 'textrank', 'lead3', 'mmr', 'lda', 'lsi', 'nmf'] 60 | :return: 61 | """ 62 | 63 | if type_summarize=="text_pronouns": # title, str, 可填标题, like "震惊,MacropodusXXX" 64 | res = text_pronouns(text, num, title) 65 | elif type_summarize=="text_teaser": # title, str, 可填标题, like "震惊,MacropodusXXX" 66 | res = text_teaser(text, num, title) 67 | elif type_summarize=="word_sign": # 68 | res = word_sign(text, num) 69 | elif type_summarize=="textrank": # model_type 可填 'textrank_textrank4zh', 'text_rank_sklearn' or 'textrank_gensim' 70 | res = textrank(text, num) 71 | elif type_summarize=="lead3": 72 | res = lead3(text, num, type_l) # type_l 可填 'begin', 'end' or 'mix' 73 | elif type_summarize=="mmr": 74 | res = mmr(text, num, alpha) # alpha 可填 0-1 75 | elif type_summarize=="lda": # topic_min>1, 
judge_topic=True or False 76 | res = lda(text, num, topic_min, judge_topic) 77 | elif type_summarize=="lsi": # topic_min>1, judge_topic=True or False 78 | res = lsi(text, num, topic_min, judge_topic) 79 | elif type_summarize=="nmf": # topic_min>1, judge_topic=True or False 80 | res = nmf(text, num, topic_min, judge_topic) 81 | else: 82 | raise RuntimeError("your input type_summarize is wrong, it must be in " 83 | "['text_pronouns', 'text_teaser', 'word_sign', " 84 | "'textrank', 'lead3', 'mmr', 'lda', 'lsi', 'nmf']") 85 | return res 86 | -------------------------------------------------------------------------------- /macropodus/summarize/feature_base/mmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/28 20:16 4 | # @author :Mo 5 | # @function :mmr 6 | 7 | 8 | from macropodus.preprocess.tools_ml import extract_chinese, cut_sentence 9 | from macropodus.preprocess.tools_ml import macropodus_cut, tfidf_fit 10 | from macropodus.data.words_common.stop_words import stop_words 11 | import copy 12 | 13 | 14 | class MMRSum: 15 | def __init__(self): 16 | self.stop_words = stop_words.values() 17 | self.algorithm = 'mmr' 18 | 19 | def summarize(self, text, num=8, alpha=0.6): 20 | """ 21 | 22 | :param text: str 23 | :param num: int 24 | :return: list 25 | """ 26 | # 切句 27 | if type(text) == str: 28 | self.sentences = cut_sentence(text) 29 | elif type(text) == list: 30 | self.sentences = text 31 | else: 32 | raise RuntimeError("text type must be list or str") 33 | # 切词 34 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 35 | if word.strip()] for sentence in self.sentences] 36 | # 去除停用词等 37 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 38 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 39 | # # 计算每个句子的词语个数 40 | # sen_word_len = [len(sc)+1 for sc in sentences_cut] 41 | # 计算每个句子的tfidf 42 | sen_tfidf = tfidf_fit(self.sentences_cut) 43 | # 矩阵中两两句子相似度 44 | SimMatrix = (sen_tfidf * sen_tfidf.T).A # 例如: SimMatrix[1, 3] # "第2篇与第4篇的相似度" 45 | # 输入文本句子长度 46 | len_sen = len(self.sentences) 47 | # 句子标号 48 | sen_idx = [i for i in range(len_sen)] 49 | summary_set = [] 50 | mmr = {} 51 | for i in range(len_sen): 52 | if not self.sentences[i] in summary_set: 53 | sen_idx_pop = copy.deepcopy(sen_idx) 54 | sen_idx_pop.pop(i) 55 | # 两两句子相似度 56 | sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop] 57 | score_tfidf = sen_tfidf[i].toarray()[0].sum() # / sen_word_len[i], 如果除以词语个数就不准确 58 | mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j) 59 | summary_set.append(self.sentences[i]) 60 | score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)] 61 | if len(mmr) > num: 62 | score_sen = score_sen[0:num] 63 | return score_sen 64 | 65 | 66 | if __name__ == '__main__': 67 | mmr_sum = MMRSum() 68 | doc = "PageRank算法简介。" \ 69 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 70 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 71 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 72 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 73 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 74 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 75 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 76 | "和投票目标的等级来决定新的等级。简单的说, " \ 77 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 78 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 79 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 80 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 81 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 82 | sum = mmr_sum.summarize(doc) 83 | for i in sum: 84 | print(i) 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /macropodus/network/graph/bilstm_crf.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/6 20:45 4 | # @author : Mo 5 | # @function: Bi-LSTM-CRF 6 | 7 | 8 | from macropodus.network.base.graph import graph 9 | from macropodus.network.layers.crf import CRF 10 | import tensorflow as tf 11 | 12 | 13 | class BilstmCRFGraph(graph): 14 | def __init__(self, hyper_parameters): 15 | """ 16 | 初始化 17 | :param hyper_parameters: json,超参 18 | """ 19 | self.num_rnn_layers = hyper_parameters['model'].get('num_rnn_layers', 1) # 1, 2, 3 20 | self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM') # 'LSTM', 'GRU' 21 | self.rnn_units = hyper_parameters['model'].get('rnn_units', 512) # 128, 256, 512, 768, 1024 22 | self.crf_mode = hyper_parameters['model'].get('crf_mode', 'reg') # "reg", pad 23 | self.supports_masking = hyper_parameters['model'].get('supports_masking', True) # True or False 24 | super().__init__(hyper_parameters) 25 | 26 | def create_model(self, hyper_parameters): 27 | """ 28 | 构建神经网络 29 | :param hyper_parameters:json, hyper parameters of network 30 | :return: tensor, moedl 31 | """ 32 | super().create_model(hyper_parameters) 33 | # LSTM or GRU 34 | self.rnn_layer = {'LSTM':tf.keras.layers.LSTM, 'GRU':tf.keras.layers.GRU}[self.rnn_type] 35 | x = self.word_embedding.output 36 | # Bi-LSTM 37 | for nrl in range(self.num_rnn_layers): 38 | x = tf.keras.layers.Bidirectional(self.rnn_layer(units=self.rnn_units, 39 | return_sequences=True, 40 | activation=self.activate_rnn, 41 | kernel_regularizer=tf.keras.regularizers.l2(self.l2 * 0.1), 42 | recurrent_regularizer=tf.keras.regularizers.l2(self.l2) 43 | ))(x) 44 | x = tf.keras.layers.Dropout(self.dropout)(x) 45 | x = tf.keras.layers.Dense(units=self.rnn_units, activation=self.activate_rnn,)(x) 46 | # crf, 'pad' or 'reg' 47 | if self.crf_mode == "pad": 48 | # length of real sentence 49 | x_mask = tf.keras.layers.Input(shape=(1), dtype=tf.int32) 50 | self.crf = CRF(self.label, mode='pad', supports_masking=True, name='crf') 51 | tensor = tf.keras.layers.Dense(self.label, name='crf_dense')(x) 52 | self.output = self.crf([tensor, x_mask]) 53 | if self.embedding_type in ["bert", "albert"]: 54 | self.inputs = [self.word_embedding.input[0], self.word_embedding.input[1], x_mask] 55 | else: 56 | self.inputs = [self.word_embedding.input, x_mask] 57 | else: 58 | self.crf = CRF(self.label, mode='reg', name='crf') 59 | tensor = tf.keras.layers.Dense(self.label, name='crf_dense')(x) 60 | self.output = self.crf(tensor) 61 | if self.embedding_type in ["bert", "albert"]: 62 | self.inputs = self.word_embedding.input 63 | else: 64 | self.inputs = self.word_embedding.input 65 | self.model = tf.keras.Model(self.inputs, self.output) 66 | self.model.summary(132) 67 | 68 | def create_compile(self): 69 | """ 70 | 构建优化器、损失函数和评价函数 
71 | :return: 72 | """ 73 | self.loss = self.crf.loss 74 | self.metrics = self.crf.viterbi_accuracy 75 | super().create_compile() 76 | -------------------------------------------------------------------------------- /macropodus/tookit/han2zh/han2zh.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/8 15:51 4 | # @author : Mo 5 | # @function: 中文繁简转化 6 | 7 | 8 | from macropodus.tookit.han2zh.zh_wiki import zh2han, han2zh, cn2zh, sg2zh 9 | from collections import defaultdict 10 | 11 | 12 | class Han2Zh: 13 | def __init__(self): 14 | self.algorithm = "han2zh" 15 | # dict转为defaultdict 16 | self.han2zhs = self.load_han_zh_dict([han2zh, cn2zh, sg2zh]) 17 | self.zh2hans = self.load_han_zh_dict([zh2han]) 18 | 19 | def load_han_zh_dict(self, dicts): 20 | """ 21 | 多个dict转为一个defaultdict 22 | :param dicts: list, like [{"丟": "丢"}, {"並": "并"}] 23 | :return: dict, like {"丟": "丢", "並": "并"} 24 | """ 25 | dict_han_zh = defaultdict() 26 | for ds in dicts: 27 | for k, v in ds.items(): 28 | dict_han_zh[k] = v 29 | return dict_han_zh 30 | 31 | def han2zh(self, text, len_max=11): 32 | """ 33 | 繁体字转简体字, 反向最大切词 34 | :param sentence: str, like '雪鐵龍' 35 | :param len_max: int, like 9 36 | :return: str, like '雪铁龙' 37 | """ 38 | len_sen = len(text) 39 | i = len_sen 40 | res = [""] 41 | while i > 0: # while判断条件 42 | flag = False # flag标志位,确定有没有在字典里边的单字词或多字词 43 | for j in range(max(0, i - len_max), i): # 遍历从句子末尾向前可能成词的部分, 从最后i-len_max算起 44 | word_maybe = text[j:i] # 正向可能成词的语 45 | if word_maybe in self.han2zhs: # 是否在字典里边 46 | i = j # 成词前标志i向后移动 47 | flag = True # flag标志位变化 48 | res.append(self.han2zhs.get(word_maybe)) 49 | break # 成词则跳出循环 50 | if not flag: # 未选中后单个字的情况 51 | i -= 1 52 | res_i = self.han2zhs.get(text[i]) 53 | if res_i: 54 | res.append(res_i) 55 | else: 56 | res.append(text[i]) 57 | res.reverse() 58 | return "".join(res) 59 | 60 | def zh2han(self, text, len_max=5): 61 | """ 62 | 简体字转繁体字, 反向最大切词 63 | :param sentence: str, like '大漠帝国' 64 | :param len_max: int, like 32 65 | :return: yield 66 | """ 67 | len_sen = len(text) 68 | i = len_sen 69 | res = [""] 70 | while i > 0: # while判断条件 71 | flag = False # flag标志位,确定有没有在字典里边的单字词或多字词 72 | for j in range(max(0, i - len_max), i): # 遍历从句子末尾向前可能成词的部分, 从最后i-len_max算起 73 | word_maybe = text[j:i] # 正向可能成词的语 74 | if word_maybe in self.zh2hans: # 是否在字典里边 75 | i = j # 成词前标志i向后移动 76 | flag = True # flag标志位变化 77 | res.append(self.zh2hans.get(word_maybe)) 78 | break # 成词则跳出循环 79 | if not flag: # 未选中后单个字的情况 80 | i -= 1 81 | res_i = self.zh2hans.get(text[i]) 82 | if res_i: 83 | res.append(res_i) 84 | else: 85 | res.append(text[i]) 86 | res.reverse() 87 | return "".join(res) 88 | 89 | 90 | if __name__ == '__main__': 91 | hz = Han2Zh() 92 | text = "" 93 | res_han2zh = hz.han2zh(text) 94 | res_zh2han = hz.zh2han(text) 95 | print(res_han2zh) 96 | print(res_zh2han) 97 | while True: 98 | print("请输入:") 99 | ques = input() 100 | print(hz.han2zh(ques)) 101 | print(hz.zh2han(ques)) 102 | 103 | 104 | -------------------------------------------------------------------------------- /macropodus/network/layers/keras_radam.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/12 20:12 4 | # @author : Mo 5 | # @function: radam of keras 6 | # @codefrom: https://github.com/bojone/keras_radam 7 | 8 | 9 | from tensorflow.python.keras.optimizers import Optimizer 10 | # from 
tensorflow.python.keras.legacy import interfaces 11 | import tensorflow.python.keras.backend as K 12 | 13 | 14 | class RAdam(Optimizer): 15 | """RAdam optimizer. 16 | Default parameters follow those provided in the original Adam paper. 17 | # Arguments 18 | lr: float >= 0. Learning rate. 19 | beta_1: float, 0 < beta < 1. Generally close to 1. 20 | beta_2: float, 0 < beta < 1. Generally close to 1. 21 | epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. 22 | decay: float >= 0. Learning rate decay over each update. 23 | amsgrad: boolean. Whether to apply the AMSGrad variant of this 24 | algorithm from the paper "On the Convergence of Adam and 25 | Beyond". 26 | # References 27 | - [RAdam - A Method for Stochastic Optimization] 28 | (https://arxiv.org/abs/1908.03265) 29 | - [On The Variance Of The Adaptive Learning Rate And Beyond] 30 | (https://arxiv.org/abs/1908.03265) 31 | """ 32 | 33 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, 34 | epsilon=None, decay=0., **kwargs): 35 | super(RAdam, self).__init__(**kwargs) 36 | with K.name_scope(self.__class__.__name__): 37 | self.iterations = K.variable(0, dtype='int64', name='iterations') 38 | self.lr = K.variable(lr, name='lr') 39 | self.beta_1 = K.variable(beta_1, name='beta_1') 40 | self.beta_2 = K.variable(beta_2, name='beta_2') 41 | self.decay = K.variable(decay, name='decay') 42 | if epsilon is None: 43 | epsilon = K.epsilon() 44 | self.epsilon = epsilon 45 | self.initial_decay = decay 46 | 47 | # @interfaces.legacy_get_updates_support 48 | def get_updates(self, loss, params): 49 | grads = self.get_gradients(loss, params) 50 | self.updates = [K.update_add(self.iterations, 1)] 51 | 52 | lr = self.lr 53 | if self.initial_decay > 0: 54 | lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, 55 | K.dtype(self.decay)))) 56 | 57 | t = K.cast(self.iterations, K.floatx()) + 1 58 | beta_1_t = K.pow(self.beta_1, t) 59 | beta_2_t = K.pow(self.beta_2, t) 60 | rho = 2 / (1 - self.beta_2) - 1 61 | rho_t = rho - 2 * t * beta_2_t / (1 - beta_2_t) 62 | r_t = K.sqrt( 63 | K.relu(rho_t - 4) * K.relu(rho_t - 2) * rho / ((rho - 4) * (rho - 2) * rho_t) 64 | ) 65 | flag = K.cast(rho_t > 4, K.floatx()) 66 | 67 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 68 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 69 | self.weights = [self.iterations] + ms + vs 70 | 71 | for p, g, m, v in zip(params, grads, ms, vs): 72 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 73 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 74 | mhat_t = m_t / (1 - beta_1_t) 75 | vhat_t = K.sqrt(v_t / (1 - beta_2_t)) 76 | p_t = p - lr * mhat_t * (flag * r_t / (vhat_t + self.epsilon) + (1 - flag)) 77 | 78 | self.updates.append(K.update(m, m_t)) 79 | self.updates.append(K.update(v, v_t)) 80 | new_p = p_t 81 | 82 | # Apply constraints. 
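            # (Keras weight constraints, e.g. max_norm, are applied to the proposed value
            # new_p before it is written back. p_t above already contains the RAdam switch:
            # when rho_t > 4 the variance-rectified adaptive step r_t / (vhat_t + epsilon)
            # is used, otherwise a plain momentum-only step, following the RAdam paper.)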
83 | if getattr(p, 'constraint', None) is not None: 84 | new_p = p.constraint(new_p) 85 | 86 | self.updates.append(K.update(p, new_p)) 87 | return self.updates 88 | 89 | def get_config(self): 90 | config = {'lr': float(K.get_value(self.lr)), 91 | 'beta_1': float(K.get_value(self.beta_1)), 92 | 'beta_2': float(K.get_value(self.beta_2)), 93 | 'decay': float(K.get_value(self.decay)), 94 | 'epsilon': self.epsilon} 95 | base_config = super(RAdam, self).get_config() 96 | return dict(list(base_config.items()) + list(config.items())) -------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/seg_dag.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:58 4 | # @author : Mo 5 | # @function: segmentation of maximum probability using dictionary 6 | 7 | 8 | from macropodus.preprocess.tools_common import re_continue 9 | from macropodus.base.seg_basic import SegBasic 10 | from math import log 11 | 12 | 13 | class SegDAG(SegBasic): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def build_dag(self, sentence, len_word_max=105): 18 | """ 19 | 构建句子的词典概率有向图; 20 | jieba使用的是前缀字典替代前缀树,内存比前缀树小,且比前缀树快; 21 | 基本思想是构建'大漠帝国:132','大漠帝','大漠:640','大':1024等,没有则置为0, 22 | 搜索时候前缀不存在就跳出,不用继续下去了 23 | :param sentence: str, like '大漠帝国是谁' 24 | :param sentence: int, like 132 25 | :return: dict, like {0:[0,1], 1:[1]} 26 | """ 27 | len_sen = len(sentence) 28 | dag_sen = {} 29 | for i in range(len_sen): # 前向遍历, 全切分 30 | enum_j = [i] # 单个字就是它本身 31 | for j in range(i+1, min(len_sen, i+len_word_max)): # 遍历从当前字到句子末尾可能成词的部分, 当前的不取, 设置最大成词长度为132 32 | word_maybe = sentence[i:j+1] 33 | if word_maybe in self.dict_words_freq: 34 | enum_j.append(j) 35 | dag_sen[i] = enum_j 36 | return dag_sen 37 | 38 | def calculate_prob(self, sentence, DAG, route): 39 | """ 40 | 动态规划求取最大概率, 代码来自jieba项目 41 | code from: https://github.com/fxsjy/jieba 42 | :param sentence: str, input of sentence, like "大漠帝国是谁?" 43 | :param DAG: dict, 44 | :param route: dict, 45 | :return: None 46 | """ 47 | len_sen = len(sentence) 48 | route[len_sen] = (0, 0) 49 | log_total = log(self.num_words) 50 | for index in range(len_sen - 1, -1, -1): # 动态规划 51 | route[index] = max((log(self.dict_words_freq.get(sentence[index:x + 1]) or 1) 52 | - log_total + route[x + 1][0], x) for x in DAG[index]) 53 | 54 | def cut(self, sentence): 55 | """ 56 | seg_dag字典最大概率切词, 代码来自jieba项目 57 | code from: https://github.com/fxsjy/jieba 58 | :param sentence: str, input of sentence, like "大漠帝国是谁?" 
59 | :return: None 60 | """ 61 | len_sen = len(sentence) 62 | word_temp = '' 63 | route = {} 64 | i = 0 65 | DAG = self.build_dag(sentence) # 根据sentence构建有向图dag 66 | self.calculate_prob(sentence, DAG, route) # 动态规划计算概率最大的路径 67 | while i < len_sen: 68 | j = route[i][1] + 1 # 获取index, i为成词的begin, j为成词的end 69 | word_ch = sentence[i:j] # 概率成词 70 | if (j-i<2) and re_continue.match(word_ch): # 单个字判断是否为连续, 字母-数字-.-@等为连续 71 | word_temp += word_ch 72 | i = j 73 | else: # 成词后返回一个yield可迭代对象, yield后转list有点耗时 74 | if word_temp: # 有word_temp的情况下 word_ch也没有迭代返回 75 | yield word_temp 76 | word_temp = '' 77 | yield word_ch 78 | i = j 79 | if word_temp: # 最后一个成词为"字母-数字-.-@等为连续"的情况 80 | yield word_temp 81 | 82 | 83 | if __name__ == '__main__': 84 | sd = SegDAG() 85 | sd.add_word(str('知识图谱')) 86 | 87 | # for i in range(50000): 88 | sd_enum = sd.cut(sentence='apple_pir大漠帝国我再也找不到了') 89 | print(list(sd_enum)) 90 | 91 | # 测试性能 92 | from macropodus.preprocess.tools_common import txt_read, txt_write 93 | from macropodus.conf.path_config import path_root 94 | import time 95 | path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt" 96 | sentences = txt_read(path_wordseg_a) 97 | 98 | time_start = time.time() 99 | count = 0 100 | for i in range(10000): 101 | for sen in sentences: 102 | # print("原句:"+sen) 103 | count += 1 104 | res = sd.cut(sen) 105 | # print(list(res)) 106 | time_end = time.time() 107 | print(time_end-time_start) 108 | print(count/(time_end - time_start)) 109 | 110 | while True: 111 | print("请输入:") 112 | sen = input() 113 | print(list(sd.cut(sen))) 114 | # win10测试, i7 8th + 16G RAM 115 | # 10000/0.17*50 = 2864136(line/s) 116 | # 50000/0.87*50 = 2872092(line/s) 117 | 118 | 119 | -------------------------------------------------------------------------------- /macropodus/summarize/topic_base/topic_lsi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/2 21:03 4 | # @author :Mo 5 | # @function :topic model of LSI 6 | # @paper :Text summarization using Latent Semantic Analysis 7 | 8 | 9 | from macropodus.preprocess.tools_ml import cut_sentence, macropodus_cut 10 | from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit 11 | from macropodus.data.words_common.stop_words import stop_words 12 | # sklearn 13 | from sklearn.decomposition import TruncatedSVD 14 | import numpy as np 15 | 16 | 17 | class LSISum: 18 | def __init__(self): 19 | self.stop_words = stop_words.values() 20 | self.algorithm = 'lsi' 21 | 22 | def summarize(self, text, num=320, topic_min=5, judge_topic='all'): 23 | """ 24 | 25 | :param text: 26 | :param num: 27 | :return: 28 | """ 29 | # 切句 30 | if type(text) == str: 31 | self.sentences = cut_sentence(text) 32 | elif type(text) == list: 33 | self.sentences = text 34 | else: 35 | raise RuntimeError("text type must be list or str") 36 | len_sentences_cut = len(self.sentences) 37 | # 切词 38 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 39 | if word.strip()] for sentence in self.sentences] 40 | # 去除停用词等 41 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 42 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 43 | # 计算每个句子的tfidf 44 | sen_tfidf = tfidf_fit(self.sentences_cut) 45 | # 主题数, 经验判断 46 | topic_num = min(topic_min, int(len(sentences_cut)/2)) # 设定最小主题数为3 47 | svd_tfidf = TruncatedSVD(n_components=topic_num, n_iter=32) 48 | res_svd_u = 
svd_tfidf.fit_transform(sen_tfidf.T) 49 | res_svd_v = svd_tfidf.components_ 50 | 51 | if judge_topic: 52 | ### 方案一, 获取最大那个主题的k个句子 53 | ################################################################################## 54 | topic_t_score = np.sum(res_svd_v, axis=-1) 55 | # 对每列(一个句子topic_num个主题),得分进行排序,0为最大 56 | res_nmf_h_soft = res_svd_v.argsort(axis=0)[-topic_num:][::-1] 57 | # 统计为最大每个主题的句子个数 58 | exist = (res_nmf_h_soft <= 0) * 1.0 59 | factor = np.ones(res_nmf_h_soft.shape[1]) 60 | topic_t_count = np.dot(exist, factor) 61 | # 标准化 62 | topic_t_count /= np.sum(topic_t_count, axis=-1) 63 | topic_t_score /= np.sum(topic_t_score, axis=-1) 64 | # 主题最大个数占比, 与主题总得分占比选择最大的主题 65 | topic_t_tc = topic_t_count + topic_t_score 66 | topic_t_tc_argmax = np.argmax(topic_t_tc) 67 | # 最后得分选择该最大主题的 68 | res_nmf_h_soft_argmax = res_svd_v[topic_t_tc_argmax].tolist() 69 | res_combine = {} 70 | for l in range(len_sentences_cut): 71 | res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l] 72 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 73 | ##################################################################################### 74 | else: 75 | ### 方案二, 获取最大主题概率的句子, 不分主题 76 | res_combine = {} 77 | for i in range(len_sentences_cut): 78 | res_row_i = res_svd_v[:, i] 79 | res_row_i_argmax = np.argmax(res_row_i) 80 | res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax] 81 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 82 | num_min = min(num, int(len_sentences_cut * 0.6)) 83 | return score_sen[0:num_min] 84 | 85 | 86 | if __name__ == '__main__': 87 | lsi = LSISum() 88 | doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 89 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 90 | "该基金认缴出资总规模为人民币3.01亿元。" \ 91 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 92 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 93 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 94 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 95 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 96 | sum = lsi.summarize(doc, num=8) 97 | for i in sum: 98 | print(i) 99 | -------------------------------------------------------------------------------- /test/evaluate/tet_macropodus.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 21:13 4 | # @author : Mo 5 | # @function: test macropodus 6 | 7 | # import os 8 | # os.environ['TF_KERAS'] = '1' 9 | 10 | 11 | import time 12 | time_start = time.time() 13 | import macropodus 14 | print('macropodus初始化耗时: ' + str(time.time()-time_start) + 's') 15 | 16 | # import sys 17 | # import os 18 | # print(os.name) 19 | # print(sys.platform) 20 | 21 | # macropodus.load_user_dict(path_user="user.json", type_user="json") 22 | macropodus.add_word(word="斗鱼属") 23 | macropodus.add_word(word="斗鱼科") 24 | macropodus.add_word(word="鲈形目") 25 | macropodus.save_add_words(word_freqs={"喜斗":32, "护卵":64, "护幼":132}) 26 | macropodus.add_word(word="坑爹的平衡性基金") 27 | macropodus.save_add_words(word_freqs={"BBC":132}) 28 | 29 | # sent = "今日头条 白嫖 东风快递 令人喷饭 勿谓言之不预也 白嫖 口区 弓虽 口丕 我酸了 祖安人 迷惑行为 5G 996 007 1118 35 120 251 nmsl nsdd wdnmd CSGO 唱跳 rap 篮球 鸡你太美 cxk 盘它 撞梗 融梗 雨女无瓜 要你寡 刺激战场 绝地求生" 30 | # sent = "狼灭 狼火 狼炎 狼焱 灵魂八问 硬核 奥力给 有内味了 awsl 影流之主 巨魔之王" 31 | # words = sent.split(" ") 32 | # word_dict = {} 33 | # for w in words: 34 | # word_dict[w] = 132 35 | # 
macropodus.save_add_words(word_freqs=word_dict) 36 | 37 | print(macropodus.cut("坑爹的平衡性基金啊,坑爹呀斗鱼属,Macropodus (Lacépède, 1801),鲈形目斗鱼科的一属鱼类。" 38 | "本属鱼类通称斗鱼。因喜斗而得名。分布于亚洲东南部。中国有2种,即叉尾斗鱼,分布于长江及以南各省;" 39 | "叉尾斗鱼,分布于辽河到珠江流域。其喜栖居于小溪、河沟、池塘、稻田等缓流或静水中。" 40 | "雄鱼好斗,产卵期集草成巢,雄鱼口吐粘液泡沫,雌鱼产卵其中,卵浮性,受精卵在泡沫内孵化。雄鱼尚有护卵和护幼现象。" 41 | )) 42 | 43 | sen_calculate = "23 + 13 * (25+(-9-2-5-2*3-6/3-40*4/(2-3)/5+6*3))加根号144你算得几多" 44 | sen_chi2num = "三千零七十八亿三千零十五万零三百一十二点一九九四" 45 | sen_num2chi = 1994.1994 46 | sen_roman2int = "IX" 47 | sen_int2roman = 132 48 | # sent1 = "PageRank算法简介" 49 | # sent2 = "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" 50 | sent1 = "香蕉的翻译" 51 | sent2 = "用英语说香蕉" 52 | summary = "四川发文取缔全部不合规p2p。字节跳动与今日头条。成都日报,成都市,李太白与杜甫"\ 53 | "PageRank算法简介。" \ 54 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 55 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 56 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 57 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 58 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 59 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 60 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 61 | "和投票目标的等级来决定新的等级。简单的说, " \ 62 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 63 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 64 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 65 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 66 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 67 | 68 | # 分词(词典最大概率分词DAG) 69 | words = macropodus.cut(summary) 70 | print(words) 71 | # 新词发现 72 | new_words = macropodus.find(summary) 73 | print(new_words) 74 | # 文本摘要 75 | sum = macropodus.summarize(summary) 76 | print(sum) 77 | # 关键词抽取 78 | keyword = macropodus.keyword(summary) 79 | print(keyword) 80 | # 文本相似度 81 | sim = macropodus.sim(sent1, sent2) 82 | print(sim) 83 | # tookit 84 | # 计算器 85 | score_calcul = macropodus.calculate(sen_calculate) 86 | print(score_calcul) 87 | # 中文数字与阿拉伯数字相互转化 88 | res_chi2num = macropodus.chi2num(sen_chi2num) 89 | print(res_chi2num) 90 | res_num2chi = macropodus.num2chi(sen_num2chi) 91 | print(res_num2chi) 92 | # 阿拉伯数字与罗马数字相互转化 93 | res_roman2int = macropodus.roman2num(sen_roman2int) 94 | print(res_roman2int) 95 | res_int2roman = macropodus.num2roman(sen_int2roman) 96 | print(res_int2roman) 97 | # 中文汉字转拼音 98 | res_pinyin = macropodus.pinyin(summary) 99 | print(res_pinyin) 100 | # 中文繁简转化 101 | res_zh2han = macropodus.zh2han(summary) 102 | print(res_zh2han) 103 | res_han2zh = macropodus.han2zh(res_zh2han) 104 | print(res_han2zh) 105 | 106 | # 命名实体提取, 107 | # ner, albert+bilstm+crf网络架构, 最大支持126个字符; 108 | # 需要安装tensorflow==1.15.0(pip安装不默认下载, 1.15.0以下未实验, 1.13以上应该可以) 109 | # 需要下载模型(pip安装不默认下载, 将ner_albert_people_1998覆盖到安装目录macropodus/data/model); 110 | summary = ["美丽的广西是我国华南地区的一颗璀璨的明珠,山清水秀生态美,风生水起万象新。", "广西壮族自治区,简称“桂”,是中华人民共和国省级行政区"] 111 | res_ner = macropodus.ner(summary[0]) 112 | print(res_ner) 113 | res_ners = macropodus.ners(summary) 114 | print(res_ners) 115 | 116 | # 词性标注, 117 | # pos tag, albert+bilstm+crf网络架构, 最大支持126个字符; 118 | # 需要安装tensorflow==1.15.0(pip安装不默认下载, 1.15.0以下未实验, 1.13以上应该可以) 119 | # 需要下载模型(pip安装不默认下载, 将tag_albert_people_1998覆盖到安装目录macropodus/data/model); 120 | res_postag = macropodus.postag(summary[0]) 121 | print(res_postag) 122 | res_postags = macropodus.postags(summary) 123 | print(res_postags) 124 | 125 | -------------------------------------------------------------------------------- /macropodus/summarize/feature_base/word_significance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/26 23:42 4 | # @author :Mo 5 | # @function :text summarize of extraction of word significance 6 | # @paper :The Automatic 
Creation of Literature Abstracts* 7 | # @url :http://courses.ischool.berkeley.edu/i256/f06/papers/luhn58.pdf 8 | 9 | 10 | from macropodus.data.words_common.stop_words import stop_words 11 | from macropodus.preprocess.tools_ml import extract_chinese 12 | from macropodus.preprocess.tools_ml import macropodus_cut 13 | from macropodus.preprocess.tools_ml import cut_sentence 14 | from collections import Counter 15 | 16 | 17 | class WordSignificanceSum: 18 | def __init__(self): 19 | """ 20 | features: 21 | 1. words mix in title and sentence 22 | 2. keywords in sentence 23 | 3. Position of sentence 24 | 4. Length of sentence 25 | """ 26 | self.algorithm = 'word_significance' 27 | self.stop_words = stop_words.values() 28 | self.num = 0 29 | 30 | def summarize(self, text, num=320): 31 | """ 32 | 根据词语意义确定中心句 33 | :param text: str 34 | :param num: int 35 | :return: list 36 | """ 37 | # 切句 38 | if type(text) == str: 39 | self.sentences = cut_sentence(text) 40 | elif type(text) == list: 41 | self.sentences = text 42 | else: 43 | raise RuntimeError("text type must be list or str") 44 | # 切词 45 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 46 | if word.strip()] for sentence in self.sentences] 47 | # 去除停用词等 48 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 49 | # 词频统计 50 | self.words = [] 51 | for sen in self.sentences_cut: 52 | self.words = self.words + sen 53 | self.word_count = dict(Counter(self.words)) 54 | self.word_count_rank = sorted(self.word_count.items(), key=lambda f: f[1], reverse=True) 55 | # 最小句子数 56 | num_min = min(num, int(len(self.word_count)*0.6)) 57 | # 词语排序, 按照词频 58 | self.word_rank = [wcr[0] for wcr in self.word_count_rank][0:num_min] 59 | res_sentence = [] 60 | # 抽取句子, 顺序, 如果词频高的词语在句子里, 则抽取 61 | for word in self.word_rank: 62 | for i in range(0, len(self.sentences)): 63 | # 当返回关键句子到达一定量, 则结束返回 64 | if len(res_sentence) < num_min: 65 | added = False 66 | for sent in res_sentence: 67 | if sent == self.sentences[i]: added = True 68 | if (added == False and word in self.sentences[i]): 69 | res_sentence.append(self.sentences[i]) 70 | break 71 | # 只是计算各得分,没什么用 72 | len_sentence = len(self.sentences) 73 | res_sentence = [(1-1/(len_sentence+len_sentence/(k+1)), rs) for k, rs in enumerate(res_sentence)] 74 | return res_sentence 75 | 76 | 77 | if __name__ == "__main__": 78 | doc = "PageRank算法简介。" \ 79 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 80 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 81 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 82 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 83 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 84 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 85 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 86 | "和投票目标的等级来决定新的等级。简单的说, " \ 87 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 88 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 89 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 90 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 91 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 92 | 93 | doc1 = "多知网. 
"\ 94 | "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 95 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 96 | "该基金认缴出资总规模为人民币3.01亿元。" \ 97 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 98 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 99 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 100 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 101 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 102 | 103 | ws = WordSignificanceSum() 104 | res = ws.summarize(doc, num=6) 105 | for r in res: 106 | print(r) 107 | -------------------------------------------------------------------------------- /test/survey_report/nlp_platfom_survey.md: -------------------------------------------------------------------------------- 1 | # 中文自然语言处理(nlp)工具调研与汇总(截至2019.11.16) 2 | 3 | 4 | ## 1.常见平台与功能 5 | 平台|语言|star|year|中文分词|词性标注|依存句法|实体识别|关键词提取|文本摘要|文本聚类|情感识别|文本相似|关系抽取|free| 6 | ---|---|---|---|---|---|---|---|---|---|---|---|---|---|--- 7 | jieba|python|20.8k|7/0.5|是|是|否|否|是|否|否|是|否|否|MIT 8 | THULAC-Python|python|1.2k|4/1|是|是|否|否|否|否|否|否|否|否|MIT 9 | pkuseg-python|python|4.3k|0.9/0.5|是|是|否|否|否|否|否|否|否|否|MIT 10 | snownlp|python|4.4k|6/3/*|是|是|否|否|是|是|否|是|是|否|MIT 11 | deepnlp|python|1.3k|2/2/!|是|是|是|是|是|是|否|否|否|否|MIT 12 | fastNLP|python|0.9k|2/0|是|是|否|是|否|否|否|是|否|否|MIT 13 | Jiagu|python|0.97k|0.9/0|是|是|是|是|是|是|是|是|否|是|MIT 14 | YaYaNLP|python|0.05k|4/4/!|是|是|否|是|否|否|否|否|否|否|MIT 15 | HanLP|java|16.4k|0.9/0|是|是|是|是|是|是|是|是|否|否|MIT 16 | ansj-seg|java|5.2k|3/0.4|是|是|是|是|是|是|否|是|否|否|Apache-2.0 17 | word|java|1.4k|5/1|是|是|否|是|否|否|否|否|是|否|Apache-2.0 18 | Jcseg|java|0.69k|3/0|是|是|是|是|是|是|否|否|否|否|Apache-2.0 19 | ik-analyzer|java|0.53k|9/9/!|是|是|是|否|否|否|否|否|否|否|LGPL-3.0 20 | CoreNLP|java|6.7k|9/9/!|是|是|是|是|是|否|否|否|否|否|GUN2.0 21 | fnlp|java|2.2k|6/0.9/!|是|是|是|是|是|是|是|否|否|否|LGPL-3.0 22 | NLPIR|java|2.5k|?/1/!|是|是|否|否|否|否|是|否|否|否|not open 23 | sego|go|1.2k|6/1/!|是|是|否|否|否|否|是|否|否|否|Apache-2.0 24 | ltp|c++|2.3k|6/1/!|是|是|是|是|是|是|是|否|否|否|LGPL-3.0 25 | PaddleNLP|c++|3.4k|6/1/!|是|是|是|是|是|是|是|是|是|是|Apache-2.0 26 | 27 | 28 | ##备注 29 | * 1.year中"6/3/*"表示"项目开始时间/最近更新时间/在维护";!表示不维护,超过一年不维护,不回复issiue则认为放弃; 30 | * 2.其他功能 31 | * snownlp: 拼音转换,繁简转换,tf-idf计算,切句子 32 | * deepnlp: tensorflow1.4训练的各种模型 33 | * NLPIR: 检索,敏感信息,文档去重,编码转换 34 | * Ltp: 事件抽取,srl,时间抽取, 35 | * HanLP: 人民日报2014分词,文本推荐(相似度),索引分词 36 | * ansj-seg: 比较混乱,主页没有调用说明,词典是个大杂烩 37 | * word: 词频统计、词性标注、同义标注、反义标注、拼音标注 38 | * ltp: 特征裁剪策略,语义角色标注 39 | * PaddleNLP: Paddle训练,以及基础包,enienr生成等各种任务 40 | * 3.更多的统计学习方法 41 | 摘要,情感识别(酸甜苦辣),新词发现,实体与关系抽取,领域分类,生成 42 | 43 | 44 | ##分词算法 45 | * 1.jieba 46 | * 1.1 基于前缀词典实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图 (DAG) 47 | * 1.2 采用了动态规划查找最大概率路径, 找出基于词频的最大切分组合 48 | * 1.3 对于未登录词,采用了基于汉字成词能力的 HMM 模型,使用了 Viterbi 算法 49 | * 2.THULAC,pkuseg,Jiagu,fastNLP 50 | * 2.1 CRF(char,word,elmo,bert) 51 | * 2.2 feature+CRF 52 | * 3.ansj-seg 53 | * 3.1 n-Gram+CRF+HMM 54 | * 4.HanLP 55 | * 4.1 n-Gram, CRF 56 | * 5.sego 57 | * 5.1 基于词频的最短路径加动态规划 58 | * 6.Ltp 59 | * 6.1 bilstm+crf 60 | * 6.2 英文、URI一类特殊词识别规则 61 | 利用空格等自然标注线索 62 | 在统计模型中融入词典信息 63 | 从大规模未标注数据中统计的字间互信息、上下文丰富程度 64 | * 7.PaddleNLP 65 | * 7.1 gru+crf 66 | * 8.word(最大匹配法、最大概率法、最短路径法) 67 | * 8.1 正向最大匹配算法,逆向最大匹配算法,正向最小匹配算法,逆向最小匹配算法 68 | * 8.2 双向最大匹配算法,双向最小匹配算法,双向最大最小匹配算法 69 | * 8.3 全切分算法,最少词数算法,最大Ngram分值算法,最短路径法 70 | * 8.4 语义切分:扩充转移网络法、知识分词语义分析法、邻接约束法、综合匹配法、后缀分词法、特征词库法、矩阵约束法、语法分析法 71 | 72 | 73 | ## 工具包地址 74 | * jiba:[https://github.com/fxsjy/jieba](https://github.com/fxsjy/jieba) 75 | * HanLP:[https://github.com/hankcs/HanLP](https://github.com/hankcs/HanLP) 76 | 
* CoreNLP:[https://github.com/stanfordnlp/CoreNLP](https://github.com/stanfordnlp/CoreNLP) 77 | * ansj-seg:[https://github.com/lionsoul2014/jcseg](https://github.com/lionsoul2014/jcseg) 78 | * THULAC-Python:[https://github.com/thunlp/THULAC-Python](https://github.com/thunlp/THULAC-Python) 79 | * pkuseg-python:[https://github.com/lancopku/pkuseg-python](https://github.com/lancopku/pkuseg-python) 80 | * snownlp:[https://github.com/isnowfy/snownlp](https://github.com/isnowfy/snownlp) 81 | * deepnlp:[https://github.com/rockingdingo/deepnlp](https://github.com/rockingdingo/deepnlp) 82 | * fastNLP:[https://github.com/fastnlp/fastNLP](https://github.com/fastnlp/fastNLP) 83 | * Jiagu:[https://github.com/ownthink/Jiagu](https://github.com/ownthink/Jiagu) 84 | * xmnlp:[https://github.com/SeanLee97/xmnlp](https://github.com/SeanLee97/xmnlp) 85 | * word:[https://github.com/ysc/word](https://github.com/ysc/word) 86 | * jcseg:[https://github.com/lionsoul2014/jcseg](https://github.com/lionsoul2014/jcseg) 87 | * paddleNLP:[https://github.com/PaddlePaddle/models](https://github.com/PaddlePaddle/models) 88 | * sego:[https://github.com/huichen/sego](https://github.com/huichen/sego) 89 | * ik-analyzer:[https://github.com/wks/ik-analyzer](https://github.com/wks/ik-analyzer) 90 | * fnlp:[https://github.com/FudanNLP/fnlp](https://github.com/FudanNLP/fnlp) 91 | * NLPIR:[https://github.com/NLPIR-team/NLPIR](https://github.com/NLPIR-team/NLPIR) 92 | 93 | ### 94 | 新词发现: 95 | 1. Matrix67: The Aha Moments的信息熵方法: [互联网时代的社会语言学:基于SNS的文本数据挖掘](http://www.matrix67.com/blog/archives/5044) 96 | 1.词频、左右熵(丰度,字符组合左右邻字的丰富程度, -p*log(p))、 97 | 2.互信息(凝固度,内部凝聚程度, pmi = p(x,y)*log(p(x,y)/(p(x)*p(y))))等构建得分函数 98 | 2. HanLP的长短语构造方法: [基于互信息和左右信息熵的短语提取识别](https://www.hankcs.com/nlp/extraction-and-identification-of-mutual-information-about-the-phrase-based-on-information-entropy.html) 99 | 1.切词(只统计词典),统计词语共现(一阶、二阶、三阶) 100 | 2.左右熵、互信息。合并词典词语,构建短语 101 | 3. 
SmoothNLP:["新词发现"算法探讨与优化-SmoothNLP](https://zhuanlan.zhihu.com/p/80385615) 102 | 1.左右熵权重: Ew =log((El*e^Er+Er*e^EL)/|Er-El|) 103 | 2.平均互信息AMI:(1/n) * log(p(w)/(p(1)p(2)...p(n))) 104 | 3.过滤条件:对在candidate ngram中, 首字或者尾字出现次数特别多的进行筛选, 如"XX的,美丽的,漂亮的"剔出字典 105 | 106 | -------------------------------------------------------------------------------- /macropodus/preprocess/tools_common.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 0:15 4 | # @author : Mo 5 | # @function: common tools of macropodus 6 | 7 | 8 | from macropodus.conf.path_log import get_logger_root 9 | import json 10 | import os 11 | import re 12 | 13 | 14 | re_continue = re.compile("[A-Za-z0-9.@_]", re.U) 15 | re_zh_cn = re.compile("([\u4E00-\u9FD5]+)", re.U) 16 | 17 | 18 | logger = get_logger_root() 19 | 20 | 21 | __all__ = ["txt_read", 22 | "txt_write", 23 | "save_json", 24 | "load_json", 25 | "delete_file"] 26 | 27 | 28 | def txt_read(path_file, encode_type='utf-8'): 29 | """ 30 | 读取txt文件,默认utf8格式, 不能有空行 31 | :param file_path: str, 文件路径 32 | :param encode_type: str, 编码格式 33 | :return: list 34 | """ 35 | list_line = [] 36 | try: 37 | file = open(path_file, 'r', encoding=encode_type) 38 | while True: 39 | line = file.readline().strip() 40 | if not line: 41 | break 42 | list_line.append(line) 43 | file.close() 44 | except Exception as e: 45 | logger.info(str(e)) 46 | finally: 47 | return list_line 48 | 49 | 50 | def txt_write(list_line, file_path, type='w', encode_type='utf-8'): 51 | """ 52 | txt写入list文件 53 | :param listLine:list, list文件,写入要带"\n" 54 | :param filePath:str, 写入文件的路径 55 | :param type: str, 写入类型, w, a等 56 | :param encode_type: 57 | :return: 58 | """ 59 | try: 60 | file = open(file_path, type, encoding=encode_type) 61 | file.writelines(list_line) 62 | file.close() 63 | except Exception as e: 64 | logger.info(str(e)) 65 | 66 | 67 | def save_json(json_lines, json_path, encoding='utf-8', indent=4): 68 | """ 69 | 保存json, 70 | :param json_lines: json 71 | :param path: str 72 | :return: None 73 | """ 74 | with open(json_path, 'w', encoding=encoding) as fj: 75 | fj.write(json.dumps(json_lines, ensure_ascii=False, indent=indent)) 76 | fj.close() 77 | 78 | 79 | def load_json(path, encoding="utf-8"): 80 | """ 81 | 获取json, json存储为[{}]格式, like [{'大漠帝国':132}] 82 | :param path: str 83 | :return: json 84 | """ 85 | with open(path, 'r', encoding=encoding) as fj: 86 | model_json = json.load(fj) 87 | return model_json 88 | 89 | 90 | def delete_file(path): 91 | """ 92 | 删除一个目录下的所有文件 93 | :param path: str, dir path 94 | :return: None 95 | """ 96 | for i in os.listdir(path): 97 | # 取文件或者目录的绝对路径 98 | path_children = os.path.join(path, i) 99 | if os.path.isfile(path_children): 100 | if path_children.endswith(".h5") or path_children.endswith(".json") or "events" in path_children or "trace" in path_children: 101 | os.remove(path_children) 102 | else:# 递归, 删除目录下的所有文件 103 | delete_file(path_children) 104 | 105 | 106 | def get_dir_files(path_dir): 107 | """ 108 | 递归获取某个目录下的所有文件(单层) 109 | :param path_dir: str, like '/home/data' 110 | :return: list, like ['2019_12_5.txt'] 111 | """ 112 | 113 | def get_dir_files_func(file_list, dir_list, root_path=path_dir): 114 | """ 115 | 递归获取某个目录下的所有文件 116 | :param root_path: str, like '/home/data' 117 | :param file_list: list, like [] 118 | :param dir_list: list, like [] 119 | :return: None 120 | """ 121 | # 获取该目录下所有的文件名称和目录名称 122 | dir_or_files = os.listdir(root_path) 123 | for dir_file in dir_or_files: 
124 | # 获取目录或者文件的路径 125 | dir_file_path = os.path.join(root_path, dir_file) 126 | # 判断该路径为文件还是路径 127 | if os.path.isdir(dir_file_path): 128 | dir_list.append(dir_file_path) 129 | # 递归获取所有文件和目录的路径 130 | get_dir_files_func(dir_file_path, file_list, dir_list) 131 | else: 132 | file_list.append(dir_file_path) 133 | 134 | # 用来存放所有的文件路径 135 | _files = [] 136 | # 用来存放所有的目录路径 137 | dir_list = [] 138 | get_dir_files_func(_files, dir_list, path_dir) 139 | return _files 140 | 141 | 142 | def get_all_dirs_files(path_dir): 143 | """ 144 | 递归获取某个目录下的所有文件(所有层, 包括子目录) 145 | :param path_dir: str, like '/home/data' 146 | :return: list, like ['2020_01_08.txt'] 147 | """ 148 | path_files = [] 149 | def get_path_files(path_dir): 150 | """ 151 | 递归函数, 获取某个目录下的所有文件 152 | :param path_dir: str, like '/home/data' 153 | :return: list, like ['2020_01_08.txt'] 154 | """ 155 | for root, dirs, files in os.walk(path_dir): 156 | for fi in files: # 递归的终止条件 157 | path_file = os.path.join(root, fi) 158 | path_files.append(path_file) 159 | for di in dirs: # 语间目录便继续递归 160 | path_dir = os.path.join(root, di) 161 | get_path_files(path_dir) 162 | get_path_files(path_dir) 163 | return path_files 164 | -------------------------------------------------------------------------------- /macropodus/tookit/trie_tree/trie_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/19 20:40 4 | # @author :Mo 5 | # @function :TrieTree of keywords find, 只返回查全的情况, 查找句子中的关键词(例如影视名、人名、关键词、实体等) 6 | 7 | 8 | from macropodus.conf.path_log import get_logger_root 9 | 10 | 11 | logger = get_logger_root() 12 | 13 | 14 | class TrieNode: 15 | """ 16 | 前缀树节点-链表 17 | """ 18 | def __init__(self): 19 | self.child = {} 20 | 21 | 22 | class TrieTree: 23 | """ 24 | 前缀树构建, 新增关键词, 关键词词语查找等 25 | """ 26 | def __init__(self): 27 | self.algorithm = "trietree" 28 | self.root = TrieNode() 29 | 30 | def add_keyword(self, keyword): 31 | """ 32 | 新增一个关键词 33 | :param keyword: str, 构建的关键词 34 | :return: None 35 | """ 36 | node_curr = self.root 37 | for word in keyword: 38 | if node_curr.child.get(word) is None: 39 | node_next = TrieNode() 40 | node_curr.child[word] = node_next 41 | node_curr = node_curr.child[word] 42 | # 每个关键词词后边, 加入end标志位 43 | if node_curr.child.get('[END]') is None: 44 | node_next = TrieNode() 45 | node_curr.child['[END]'] = node_next 46 | node_curr = node_curr.child['[END]'] 47 | logger.info("add {} success!".format("".join(keyword))) 48 | 49 | def delete_keyword(self, keyword): 50 | """ 51 | 删除一个关键词 52 | :param keyword: str, 构建的关键词 53 | :return: None 54 | """ 55 | node_curr = self.root 56 | flag = 1 57 | for word in keyword: 58 | if node_curr.child.get(word) is not None: 59 | node_curr = node_curr.child[word] 60 | else: 61 | flag = 0 62 | # 每个关键词词后边, 加入end标志位 63 | if node_curr.child.get('[END]') is not None and flag == 1: 64 | node_curr.child.pop('[END]') 65 | else: 66 | logger.info("{} is not in trietree, delete keyword faild!".format("".join(keyword))) 67 | 68 | def add_keywords_from_list(self, keywords): 69 | """ 70 | 新增关键词s, 格式为list 71 | :param keyword: list, 构建的关键词 72 | :return: None 73 | """ 74 | for keyword in keywords: 75 | self.add_keyword(keyword) 76 | 77 | def find_keyword(self, sentence): 78 | """ 79 | 从句子中提取关键词, 可提取多个 80 | :param sentence: str, 输入的句子 81 | :return: list, 提取到的关键词 82 | """ 83 | assert type(sentence) == str 84 | if not sentence: # 空格字符不取 85 | return [] 86 | 87 | node_curr = self.root # 关键词的头, 每遍历完一遍后需要重新初始化 88 | index_last = 
len(sentence) 89 | keyword_list = [] 90 | keyword = '' 91 | count = 0 92 | for word in sentence: 93 | count += 1 94 | if node_curr.child.get(word) is None: # 查看有无后缀, 即匹配到一个关键词最后一个字符的时候 95 | if keyword: # 提取到的关键词(也可能是前面的几位) 96 | if node_curr.child.get('[END]') is not None: # 取以end结尾的关键词 97 | keyword_list.append(keyword) 98 | if self.root.child.get(word) is not None: # 处理连续的关键词情况, 如"第九区流浪地球" 99 | keyword = word 100 | node_curr = self.root.child[word] 101 | else: # 102 | keyword = '' 103 | node_curr = self.root # 重新初始化 104 | else: # 有后缀就加到name里边 105 | keyword = keyword + word 106 | node_curr = node_curr.child[word] 107 | if count == index_last: # 实体结尾的情况 108 | if node_curr.child.get('[END]') is not None: 109 | keyword_list.append(keyword) 110 | return keyword_list 111 | 112 | def match_keyword(self, keyword): 113 | """ 114 | 判断keyword在不在trietree里边 115 | :param keyword: str, input word 116 | :return: boolean, True or False 117 | """ 118 | node = self.root 119 | for kw in keyword: 120 | if not node.child.get(kw): 121 | return False 122 | node = node.child[kw] 123 | if not node.child.get('[END]'): 124 | return False 125 | return True 126 | 127 | 128 | def get_trie_tree_class(keywords): 129 | """ 130 | 根据list关键词,初始化trie树 131 | :param keywords: list, input 132 | :return: objext, 返回实例化的trie 133 | """ 134 | trie = TrieTree() 135 | trie.add_keywords_from_list(keywords) 136 | return trie 137 | 138 | 139 | if __name__ == "__main__": 140 | print("".join("你好呀")) 141 | # 测试1, class实例 142 | trie = TrieTree() 143 | keywords = ['英雄', '人在囧途', '那些年,我们一起追过的女孩', '流浪地球', '华娱', 144 | '犬夜叉', '火影', '名侦探柯南', '约会大作战', '名作之壁', '动漫', 145 | '乃木坂46', 'akb48', '飘', '最后的武士', '约会', '英雄2', '日娱', 146 | '2012', '第九区', '星球大战', '侏罗纪公园', '泰坦尼克号', 'Speed'] 147 | keywords = [list(keyword.strip()) for keyword in keywords] 148 | trie.add_keywords_from_list(keywords) # 创建树 149 | keyword = trie.find_keyword('第九区约会, 侏罗纪公园和泰坦尼克号泰坦尼克号') 150 | print(keyword) 151 | gg = trie.delete_keyword('英雄') 152 | gg = trie.delete_keyword('英雄3') 153 | 154 | keyword = trie.match_keyword('英雄') 155 | keyword2 = trie.match_keyword('英雄2') 156 | 157 | print(keyword) 158 | 159 | 160 | # 测试2, get树 161 | trie_tree = get_trie_tree_class(keywords) # 创建树并返回实例化class 162 | while True: 163 | print("sihui请你输入:") 164 | input_ques = input() 165 | keywords = trie_tree.find_keyword(input_ques) 166 | print(keywords) 167 | -------------------------------------------------------------------------------- /macropodus/summarize/topic_base/topic_lda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/31 21:33 4 | # @author :Mo 5 | # @function :topic model of LDA 6 | # @paper :Latent Dirichlet Allocation 7 | 8 | 9 | from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit 10 | from macropodus.data.words_common.stop_words import stop_words 11 | from macropodus.preprocess.tools_ml import macropodus_cut 12 | from macropodus.preprocess.tools_ml import cut_sentence 13 | # sklearn 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | from sklearn.decomposition import LatentDirichletAllocation 16 | import numpy as np 17 | 18 | 19 | class LDASum: 20 | def __init__(self): 21 | self.stop_words = stop_words.values() 22 | self.algorithm = 'lda' 23 | 24 | def summarize(self, text, num=8, topic_min=6, judge_topic=None): 25 | """ 26 | LDA 27 | :param text: str 28 | :param num: int 29 | :param topic_min: int 30 | :param judge_topic: boolean 31 | :return: 32 | """ 33 
| # 切句 34 | if type(text) == str: 35 | self.sentences = cut_sentence(text) 36 | elif type(text) == list: 37 | self.sentences = text 38 | else: 39 | raise RuntimeError("text type must be list or str") 40 | len_sentences_cut = len(self.sentences) 41 | # 切词 42 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 43 | if word.strip()] for sentence in self.sentences] 44 | # 去除停用词等 45 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 46 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 47 | # # 计算每个句子的tf 48 | # vector_c = CountVectorizer(ngram_range=(1, 2), stop_words=self.stop_words) 49 | # tf_ngram = vector_c.fit_transform(self.sentences_cut) 50 | # 计算每个句子的tfidf 51 | tf_ngram = tfidf_fit(self.sentences_cut) 52 | # 主题数, 经验判断 53 | topic_num = min(topic_min, int(len(sentences_cut) / 2)) # 设定最小主题数为3 54 | lda = LatentDirichletAllocation(n_components=topic_num, max_iter=32, 55 | learning_method='online', 56 | learning_offset=50., 57 | random_state=2019) 58 | res_lda_u = lda.fit_transform(tf_ngram.T) 59 | res_lda_v = lda.components_ 60 | 61 | if judge_topic: 62 | ### 方案一, 获取最大那个主题的k个句子 63 | ################################################################################## 64 | topic_t_score = np.sum(res_lda_v, axis=-1) 65 | # 对每列(一个句子topic_num个主题),得分进行排序,0为最大 66 | res_nmf_h_soft = res_lda_v.argsort(axis=0)[-topic_num:][::-1] 67 | # 统计为最大每个主题的句子个数 68 | exist = (res_nmf_h_soft <= 0) * 1.0 69 | factor = np.ones(res_nmf_h_soft.shape[1]) 70 | topic_t_count = np.dot(exist, factor) 71 | # 标准化 72 | topic_t_count /= np.sum(topic_t_count, axis=-1) 73 | topic_t_score /= np.sum(topic_t_score, axis=-1) 74 | # 主题最大个数占比, 与主题总得分占比选择最大的主题 75 | topic_t_tc = topic_t_count + topic_t_score 76 | topic_t_tc_argmax = np.argmax(topic_t_tc) 77 | # 最后得分选择该最大主题的 78 | res_nmf_h_soft_argmax = res_lda_v[topic_t_tc_argmax].tolist() 79 | res_combine = {} 80 | for l in range(len_sentences_cut): 81 | res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l] 82 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 83 | ##################################################################################### 84 | else: 85 | ### 方案二, 获取最大主题概率的句子, 不分主题 86 | res_combine = {} 87 | for i in range(len_sentences_cut): 88 | res_row_i = res_lda_v[:, i] 89 | res_row_i_argmax = np.argmax(res_row_i) 90 | res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax] 91 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 92 | num_min = min(num, int(len_sentences_cut * 0.6)) 93 | return score_sen[0:num_min] 94 | 95 | 96 | if __name__ == '__main__': 97 | lda = LDASum() 98 | doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 99 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 100 | "该基金认缴出资总规模为人民币3.01亿元。" \ 101 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 102 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 103 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 104 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 105 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 106 | 107 | doc = "PageRank算法简介。" \ 108 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 109 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 110 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 111 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 112 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 113 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 114 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 115 | "和投票目标的等级来决定新的等级。简单的说, " \ 116 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 117 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 118 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 119 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 120 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 121 | 122 | sum = lda.summarize(doc, num=8) 123 | for i in sum: 124 | print(i) 125 | -------------------------------------------------------------------------------- /test/evaluate/tet_nlg_yongzhuo.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/5/14 21:42 4 | # @author : Mo 5 | # @function: nlg-yongzhuo 6 | 7 | 8 | from macropodus.summarize.yongzhuo_nlg import * 9 | 10 | doc = """PageRank算法简介。" \ 11 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 12 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 13 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 14 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 15 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 16 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 17 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 18 | "和投票目标的等级来决定新的等级。简单的说, " \ 19 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 20 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 21 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 22 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 23 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 """.replace(" ", "").replace('"', '') 24 | 25 | # 是否使用多进程, fs可以填其中一个或几个 text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf 26 | res_score = text_summarize(doc, multi_process=False, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]) 27 | for rs in res_score: 28 | print(rs) 29 | 30 | 31 | docs ="和投票目标的等级来决定新的等级.简单的说。" \ 32 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 33 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \ 34 | "业界急需一种相对比较准确的网页重要性计算方法。" \ 35 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 36 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 37 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。" \ 38 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \ 39 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 40 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。" \ 41 | "即数量假设:一个网页被越多的其他页面链接,就越重)。" \ 42 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 43 | "总的来说就是一句话,从全局角度考虑,获取重要的信。" 44 | # 1. word_significance 45 | sums_word_significance = word_significance.summarize(docs, num=6) 46 | print("word_significance:") 47 | for sum_ in sums_word_significance: 48 | print(sum_) 49 | 50 | # 2. text_pronouns 51 | sums_text_pronouns = text_pronouns.summarize(docs, num=6) 52 | print("text_pronouns:") 53 | for sum_ in sums_text_pronouns: 54 | print(sum_) 55 | 56 | # 3. text_teaser 57 | sums_text_teaser = text_teaser.summarize(docs, num=6) 58 | print("text_teaser:") 59 | for sum_ in sums_text_teaser: 60 | print(sum_) 61 | # 4. mmr 62 | sums_mmr = mmr.summarize(docs, num=6) 63 | print("mmr:") 64 | for sum_ in sums_mmr: 65 | print(sum_) 66 | # 5.text_rank 67 | sums_text_rank = text_rank.summarize(docs, num=6) 68 | print("text_rank:") 69 | for sum_ in sums_text_rank: 70 | print(sum_) 71 | # 6. lda 72 | sums_lda = lda.summarize(docs, num=6) 73 | print("lda:") 74 | for sum_ in sums_lda: 75 | print(sum_) 76 | # 7. lsi 77 | sums_lsi = lsi.summarize(docs, num=6) 78 | print("mmr:") 79 | for sum_ in sums_lsi: 80 | print(sum_) 81 | # 8. nmf 82 | sums_nmf = nmf.summarize(docs, num=6) 83 | print("nmf:") 84 | for sum_ in sums_nmf: 85 | print(sum_) 86 | # 9. 
lead3 87 | sums_lead3 = lead3.summarize(docs, num=6) 88 | print("lead3:") 89 | for sum_ in sums_lead3: 90 | print(sum_) 91 | 92 | 93 | 94 | 95 | 96 | docs = """AutoML机器学习自动化与NNI 97 | 原创大漠帝国 最后发布于2020-02-29 19:46:21 阅读数 221 收藏 98 | 编辑 展开 99 | 一、AutoML简介 100 | 101 |         AutoML(Automated Machine Learning),中文可以翻译为自动机器学习,我比较喜欢叫它“机器学习自动化”,更加接近人们所津津乐道的通用人工智能吧。 102 | 103 |         人们一直有个朴素的想法,可以有一个通用的AI系统,它包罗万象,能够对整个宇宙进行建模,对我们遇到的一切问题,都给出解决办法。这在幻想书籍中数见不新鲜,比如漫威电影中钢铁侠的人工智能贾维斯,又比如说芯片系统流派的网络小说。不过这些大概可以算是人工智能的高级模式了吧,人们还是很宽容的,没有期待一步到位。 104 | 105 |        现在算是AI的高潮期,尤其是以深度学习DL为代表的当代人工智能技术的成功,给以人类以无限的想象空间。那么,降低要求,以DL技术为基础,去开发一个低配版通用人工智能,也是可以的吧。所以,随着人工智能的火爆,2014年以来,AutoML也越发火热起来。 106 | 107 |        深度学习时代的鲜明特征是大数据量、深层次网络、特征学习与端到端学习。我们希望能够从数据一步得到模型,而不需要其他的什么人为参与过程。如果再加上语音助手什么的,或许我们就能达到浅层次通用人工智能的目标呢。在深度学习DL模型架构难以取得更大突破的时候,给它再开辟一条道路呢。一如蒸馏模型,又如MobileNet。 108 | 109 |         工程化和应用级市场,更能带来意想不到的惊喜。这一点,从近年来微软开源的AutoML工具NNI大受欢迎中,可以管中窥豹。 110 | 111 |   112 | 113 | 二、AutoML特性 114 | 115 |         从比较出名的开源Auto平台、互联网大厂AutoML云产品,以及AI公司的AutoML软件来看,一般包括特征工程(FE,Auto feature engine)、神经网络搜索(NAS,Neural Architecture Search) 和超参数优化(HPO,Hyper-parameter optimization) 等功能,如下图所示: 116 | 117 | 118 | 119 |         可能还存在其他一些小功能,如数据增强(几何,颜色), 激活函数(swish,Hybrid DNN), 归一化方法(Switchable Normalization, BN, IN, LN, GN), 优化方法(Neural Optimizer Search, sgd,rmsprop,adam, 衰减, 函数的组合), 优化目标(AM-LFS, Learning to teach with dynamic loss functions), 模型剪枝(AMC), 模型量化(HAQ), 部署上线等。 120 | 121 |         AutoML优点:可用于传统机器学习、图像等较成熟领域,自动化摒弃了人为因素的干扰、增强泛化性; 122 | 123 |                      缺点:消耗资源大、优化方法可能达不到经验模型甚至是严重偏向。 124 | 125 |   126 | 127 | 三、 NNI 128 | 129 |         NNI (Neural Network Intelligence,[翻译为神经网络智能?]) 是微软开源的自动机器学习(AutoML)的Python工具包。NNI 通过 nni_manager模块 等管理 AutoML 的 Experiment (实验),调度并运行各种调优算法生成的 Trial (尝试) 任务,来完成搜索最优神经网络架构、超参数等。同时支持本机,远程服务器,单机,多机,OpenPAI,Kubeflow,K8S和其它云服务等训练环境。 130 | 131 |         对比其他开源项目,或大公司产品可以发现,NNI支持的神经网络结构搜索、超参数优化等调优算法更多,功能最强大。 132 | 133 |         以我的使用体验来看,NNI更像一个黑盒,浅度用户使用可能比较舒服。使用nni的SDK可以完美嵌入自己的网络结构进行超参数优化,详情如下: 134 | 135 | 136 | 137 |         超参数优化需要定义搜索空间search_space.json,NNI配置config.yml,以及主程序调用main.py函数。 138 | 139 |         此外,NNI还需要用特定命令行启动,自由度似乎不太够。 140 | 141 | 希望对你有所帮助! 142 | ———————————————— 143 | 版权声明:本文为CSDN博主「大漠帝国」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。 144 | 原文链接:https://blog.csdn.net/rensihui/article/details/104578756""" 145 | 146 | 147 | sums_textrank_textrank4zh = text_rank.summarize(docs, num=6, model_type="textrank_textrank4zh") 148 | print("textrank_textrank4zh:") 149 | for sum_ in sums_textrank_textrank4zh: 150 | print(sum_) 151 | 152 | sums_textrank_sklearn = text_rank.summarize(docs, num=6, model_type="textrank_sklearn") 153 | print("textrank_sklearn:") 154 | for sum_ in sums_textrank_sklearn: 155 | print(sum_) 156 | 157 | # gensim自带的textrank只支持英文, 分隔符为". ", "? ", "! 
" 158 | sums_textrank_gensim = text_rank.summarize(docs, num=100, model_type="textrank_gensim") 159 | print("textrank_gensim:") 160 | for sum_ in sums_textrank_gensim: 161 | print(sum_) 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /macropodus/network/layers/crf.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | # author: BrikerMan 4 | # contact: eliyar917@gmail.com 5 | # blog: https://eliyar.biz 6 | # code from: 7 | 8 | # file: crf.py 9 | # time: 2019-06-28 14:33 10 | 11 | 12 | import tensorflow as tf 13 | 14 | 15 | class CRF(tf.keras.layers.Layer): 16 | """ 17 | Conditional Random Field layer (tf.keras) 18 | `CRF` can be used as the last layer in a network (as a classifier). Input shape (features) 19 | must be equal to the number of classes the CRF can predict (a linear layer is recommended). 20 | Note: the loss and accuracy functions of networks using `CRF` must 21 | use the provided loss and accuracy functions (denoted as loss and viterbi_accuracy) 22 | as the classification of sequences are used with the layers internal weights. 23 | Args: 24 | output_dim (int): the number of labels to tag each temporal input. 25 | Input shape: 26 | nD tensor with shape `(batch_size, sentence length, num_classes)`. 27 | Output shape: 28 | nD tensor with shape: `(batch_size, sentence length, num_classes)`. 29 | """ 30 | 31 | def __init__(self, 32 | output_dim, 33 | mode='reg', 34 | supports_masking=False, 35 | transitions=None, 36 | **kwargs): 37 | self.transitions = None 38 | super(CRF, self).__init__(**kwargs) 39 | self.output_dim = int(output_dim) 40 | self.mode = mode 41 | if self.mode == 'pad': 42 | self.input_spec = [tf.keras.layers.InputSpec(min_ndim=3), tf.keras.layers.InputSpec(min_ndim=2)] 43 | elif self.mode == 'reg': 44 | self.input_spec = tf.keras.layers.InputSpec(min_ndim=3) 45 | else: 46 | raise ValueError 47 | self.supports_masking = supports_masking 48 | self.sequence_lengths = None 49 | 50 | def get_config(self): 51 | config = { 52 | 'output_dim': self.output_dim, 53 | 'mode': self.mode, 54 | 'supports_masking': self.supports_masking, 55 | 'transitions': tf.keras.backend.eval(self.transitions) 56 | } 57 | base_config = super(CRF, self).get_config() 58 | return dict(list(base_config.items()) + list(config.items())) 59 | 60 | def build(self, input_shape): 61 | if self.mode == 'pad': 62 | assert len(input_shape) == 2 63 | assert len(input_shape[0]) == 3 64 | assert len(input_shape[1]) == 2 65 | f_shape = tf.TensorShape(input_shape[0]) 66 | input_spec = [tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]}), 67 | tf.keras.layers.InputSpec(min_ndim=2, axes={-1: 1}, dtype=tf.int32)] 68 | else: 69 | assert len(input_shape) == 3 70 | f_shape = tf.TensorShape(input_shape) 71 | input_spec = tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]}) 72 | 73 | if f_shape[-1] is None: 74 | raise ValueError('The last dimension of the inputs to `CRF` should be defined. Found `None`.') 75 | if f_shape[-1] != self.output_dim: 76 | raise ValueError('The last dimension of the input shape must be equal to output shape. 
' 77 | 'Use a linear layer if needed.') 78 | self.input_spec = input_spec 79 | self.transitions = self.add_weight(name='transitions', 80 | shape=[self.output_dim, self.output_dim], 81 | initializer='glorot_uniform', 82 | trainable=True) 83 | self.built = True 84 | 85 | def call(self, inputs, **kwargs): 86 | if self.mode == 'pad': 87 | sequences = tf.convert_to_tensor(inputs[0], dtype=self.dtype) 88 | self.sequence_lengths = tf.keras.backend.flatten(inputs[-1]) 89 | else: 90 | sequences = tf.convert_to_tensor(inputs, dtype=self.dtype) 91 | shape = tf.shape(inputs) 92 | self.sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1]) 93 | viterbi_sequence, _ = tf.contrib.crf.crf_decode(sequences, self.transitions, 94 | self.sequence_lengths) 95 | output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim) 96 | return tf.keras.backend.in_train_phase(sequences, output) 97 | 98 | def loss(self, y_true, y_pred): 99 | y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype) 100 | log_likelihood, self.transitions = tf.contrib.crf.crf_log_likelihood(y_pred, 101 | tf.cast(tf.keras.backend.argmax(y_true), 102 | dtype=tf.int32), 103 | self.sequence_lengths, 104 | transition_params=self.transitions) 105 | # loss_crf = tf.reduce_mean(-log_likelihood) 106 | # return tf.math.log(loss_crf) 107 | return tf.reduce_mean(-log_likelihood) 108 | 109 | def compute_output_shape(self, input_shape): 110 | if self.mode == 'pad': 111 | data_shape = input_shape[0] 112 | else: 113 | data_shape = input_shape 114 | tf.TensorShape(data_shape).assert_has_rank(3) 115 | return data_shape[:2] + (self.output_dim,) 116 | 117 | @property 118 | def viterbi_accuracy(self): 119 | def accuracy(y_true, y_pred): 120 | shape = tf.shape(y_pred) 121 | sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1]) 122 | viterbi_sequence, _ = tf.contrib.crf.crf_decode(y_pred, self.transitions, sequence_lengths) 123 | output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim) 124 | return tf.keras.metrics.categorical_accuracy(y_true, output) 125 | 126 | accuracy.func_name = 'viterbi_accuracy' 127 | return accuracy 128 | 129 | -------------------------------------------------------------------------------- /macropodus/preprocess/tools_ml.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 20:23 4 | # @author : Mo 5 | # @function: data utils of ml, text_summarization 6 | 7 | 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | import macropodus 11 | import re 12 | 13 | 14 | __all__ = ["extract_chinese", 15 | "macropodus_cut", 16 | "jieba_tag_cut", 17 | "cut_sentence", 18 | "remove_urls", 19 | "tfidf_fit", 20 | "tfidf_sim" 21 | ] 22 | 23 | 24 | def extract_chinese(text): 25 | """ 26 | 只提取出中文、字母和数字 27 | :param text: str, input of sentence 28 | :return: str 29 | """ 30 | chinese_exttract = ''.join(re.findall(u"([\u4e00-\u9fa5A-Za-z0-9@. 
])", text)) 31 | return chinese_exttract 32 | 33 | 34 | def jieba_tag_cut(text): 35 | """ 36 | jieba cut and tagged 37 | :param text:str 38 | :return: dict 39 | """ 40 | import jieba.posseg as pseg 41 | words = pseg.cut(text) 42 | return dict(words) 43 | 44 | 45 | def macropodus_cut(text): 46 | """ 47 | Macropodus cut 48 | :param text: input sentence 49 | :return: list 50 | """ 51 | return macropodus.cut(text) 52 | 53 | 54 | def cut_sentence(text, use_type="summarize"): 55 | """ 56 | 分句(文本摘要) 57 | :param sentence:str, like "大漠帝国" 58 | :param use_type:str, like "summarize" or "new-word-discovery" 59 | :return:list 60 | """ 61 | if use_type=="summarize": 62 | re_sen = re.compile('[:;!?。:;?!\n\r]') #.不加是因为不确定.是小数还是英文句号(中文省略号......) 63 | elif use_type=="new-word-discovery": 64 | re_sen = re.compile('[,,"“”、<>《》{}【】:;!?。:;?!\n\r]') #.不加是因为不确定.是小数还是英文句号(中文省略号......) 65 | else: 66 | raise RuntimeError("use_type must be 'summarize' or 'new-word-discovery'") 67 | sentences = re_sen.split(text) 68 | sen_cuts = [] 69 | for sen in sentences: 70 | if sen and str(sen).strip(): 71 | sen_cuts.append(sen) 72 | return sen_cuts 73 | 74 | 75 | def remove_urls(text): 76 | """ 77 | 删除https/http等无用url 78 | :param text: str 79 | :return: str 80 | """ 81 | text_remove_url = re.sub(r'(全文:)?(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', 82 | '', text, flags=re.MULTILINE) 83 | return text_remove_url 84 | 85 | 86 | def gram_uni_bi_tri(text): 87 | """ 88 | 获取文本的unigram, trugram, bigram等特征 89 | :param text: str 90 | :return: list 91 | """ 92 | len_text = len(text) 93 | gram_uni = [] 94 | gram_bi = [] 95 | gram_tri = [] 96 | for i in range(len_text): 97 | if i + 3 <= len_text: 98 | gram_uni.append(text[i]) 99 | gram_bi.append(text[i:i+2]) 100 | gram_tri.append(text[i:i+3]) 101 | elif i + 2 <= len_text: 102 | gram_uni.append(text[i]) 103 | gram_bi.append(text[i:i+2]) 104 | elif i + 1 <= len_text: 105 | gram_uni.append(text[i]) 106 | else: 107 | break 108 | return gram_uni, gram_bi, gram_tri 109 | 110 | 111 | def get_ngrams(text, ns=[1], use_type="summarize", len_max=7): 112 | """ 113 | 获取文本的ngram等特征 114 | :param text: str, like "大漠帝国" 115 | :param ns: list, like [1, 2, 3] 116 | :param type: str, like "summarize" or "new-word-discovery" 117 | :param type: int, like 6, 7 118 | :return: list or list 119 | """ 120 | if type(ns) != list: 121 | raise RuntimeError("ns of function get_ngram() must be list!") 122 | for n in ns: 123 | if n < 1: 124 | raise RuntimeError("enum of ns must '>1'!") 125 | len_text = len(text) 126 | ngrams = [] 127 | if use_type == "summarize": # 分别返回uni, bi, tri... 
128 | for n in ns: 129 | ngram_n = [] 130 | for i in range(len_text): 131 | if i + n <= len_text: 132 | ngram_n.append(text[i:i + n]) 133 | else: 134 | break 135 | if not ngram_n: 136 | ngram_n.append(text) 137 | ngrams.append(ngram_n) 138 | else: # 只返回一个list 139 | for i in range(len_text): 140 | ngrams += [text[i: j + i] 141 | for j in range(1, min(len_max + 1, len_text - i + 1))] 142 | return ngrams 143 | 144 | 145 | def tfidf_fit(sentences): 146 | """ 147 | tfidf相似度 148 | :param sentences: str 149 | :return: list, list, list 150 | """ 151 | # tfidf计算 152 | model = TfidfVectorizer(ngram_range=(1, 2), # 3,5 153 | stop_words=[' ', '\t', '\n'], # 停用词 154 | max_features=10000, 155 | token_pattern=r"(?u)\b\w+\b", # 过滤停用词 156 | min_df=1, 157 | max_df=0.9, 158 | use_idf=1, # 光滑 159 | smooth_idf=1, # 光滑 160 | sublinear_tf=1, ) # 光滑 161 | matrix = model.fit_transform(sentences) 162 | return matrix 163 | 164 | 165 | def tdidf_sim(sentences): 166 | """ 167 | tfidf相似度 168 | :param sentences: 169 | :return: 170 | """ 171 | # tfidf计算 172 | model = TfidfVectorizer(tokenizer=macropodus_cut, 173 | ngram_range=(1, 2), # 3,5 174 | stop_words=[' ', '\t', '\n'], # 停用词 175 | max_features=10000, 176 | token_pattern=r"(?u)\b\w+\b", # 过滤停用词 177 | min_df=1, 178 | max_df=0.9, 179 | use_idf=1, # 光滑 180 | smooth_idf=1, # 光滑 181 | sublinear_tf=1, ) # 光滑 182 | matrix = model.fit_transform(sentences) 183 | matrix_norm = TfidfTransformer().fit_transform(matrix) 184 | return matrix_norm 185 | 186 | 187 | if __name__ == '__main__': 188 | text = "你喜欢谁,小老弟,你好烦哇。" 189 | # gg = jieba_tag_cut("我不再喜欢你,正如你的不喜欢我") 190 | grams = get_ngrams(text, use_type="new-word-discovery", len_max=7) 191 | # print(gg) 192 | print(grams) 193 | -------------------------------------------------------------------------------- /macropodus/summarize/yongzhuo_nlg/README.md: -------------------------------------------------------------------------------- 1 | # nlg, API(联合调用, 整合几种算法) 2 | ```bash 3 | from nlg_yongzhuo import * 4 | 5 | doc = """PageRank算法简介。" \ 6 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 7 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 8 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 9 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 10 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 11 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 12 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 13 | "和投票目标的等级来决定新的等级。简单的说, " \ 14 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 15 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 16 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 17 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 18 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 """.replace(" ", "").replace('"', '') 19 | 20 | # 是否使用多进程, fs可以填其中一个或几个 text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf 21 | res_score = text_summarize(doc, multi_process=True, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]) 22 | for rs in res_score: 23 | print(rs) 24 | 25 | ``` 26 | 27 | # nlg, 单个方法 28 | ``` 29 | # feature_base 30 | from nlg_yongzhuo import word_significance 31 | from nlg_yongzhuo import text_pronouns 32 | from nlg_yongzhuo import text_teaser 33 | from nlg_yongzhuo import mmr 34 | # graph_base 35 | from nlg_yongzhuo import text_rank 36 | # topic_base 37 | from nlg_yongzhuo import lda 38 | from nlg_yongzhuo import lsi 39 | from nlg_yongzhuo import nmf 40 | # nous_base 41 | from nlg_yongzhuo import lead3 42 | 43 | 44 | docs ="和投票目标的等级来决定新的等级.简单的说。" \ 45 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 46 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \ 47 | "业界急需一种相对比较准确的网页重要性计算方法。" \ 48 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 49 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 50 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。" \ 51 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \ 52 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 53 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。" \ 54 | "即数量假设:一个网页被越多的其他页面链接,就越重)。" \ 55 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 56 | "总的来说就是一句话,从全局角度考虑,获取重要的信。" 57 | # 1. word_significance 58 | sums_word_significance = word_significance.summarize(docs, num=6) 59 | print("word_significance:") 60 | for sum_ in sums_word_significance: 61 | print(sum_) 62 | 63 | # 2. text_pronouns 64 | sums_text_pronouns = text_pronouns.summarize(docs, num=6) 65 | print("text_pronouns:") 66 | for sum_ in sums_text_pronouns: 67 | print(sum_) 68 | 69 | # 3. text_teaser 70 | sums_text_teaser = text_teaser.summarize(docs, num=6) 71 | print("text_teaser:") 72 | for sum_ in sums_text_teaser: 73 | print(sum_) 74 | # 4. mmr 75 | sums_mmr = mmr.summarize(docs, num=6) 76 | print("mmr:") 77 | for sum_ in sums_mmr: 78 | print(sum_) 79 | # 5.text_rank 80 | sums_text_rank = text_rank.summarize(docs, num=6) 81 | print("text_rank:") 82 | for sum_ in sums_text_rank: 83 | print(sum_) 84 | # 6. lda 85 | sums_lda = lda.summarize(docs, num=6) 86 | print("lda:") 87 | for sum_ in sums_lda: 88 | print(sum_) 89 | # 7. lsi 90 | sums_lsi = lsi.summarize(docs, num=6) 91 | print("mmr:") 92 | for sum_ in sums_lsi: 93 | print(sum_) 94 | # 8. nmf 95 | sums_nmf = nmf.summarize(docs, num=6) 96 | print("nmf:") 97 | for sum_ in sums_nmf: 98 | print(sum_) 99 | # 9. lead3 100 | sums_lead3 = lead3.summarize(docs, num=6) 101 | print("lead3:") 102 | for sum_ in sums_lead3: 103 | print(sum_) 104 | ``` 105 | 106 | # nlg, sklearn 107 | ``` 108 | docs = """AutoML机器学习自动化与NNI 109 | 原创大漠帝国 最后发布于2020-02-29 19:46:21 阅读数 221 收藏 110 | 编辑 展开 111 | 一、AutoML简介 112 | 113 |         AutoML(Automated Machine Learning),中文可以翻译为自动机器学习,我比较喜欢叫它“机器学习自动化”,更加接近人们所津津乐道的通用人工智能吧。 114 | 115 |         人们一直有个朴素的想法,可以有一个通用的AI系统,它包罗万象,能够对整个宇宙进行建模,对我们遇到的一切问题,都给出解决办法。这在幻想书籍中数见不新鲜,比如漫威电影中钢铁侠的人工智能贾维斯,又比如说芯片系统流派的网络小说。不过这些大概可以算是人工智能的高级模式了吧,人们还是很宽容的,没有期待一步到位。 116 | 117 |        现在算是AI的高潮期,尤其是以深度学习DL为代表的当代人工智能技术的成功,给以人类以无限的想象空间。那么,降低要求,以DL技术为基础,去开发一个低配版通用人工智能,也是可以的吧。所以,随着人工智能的火爆,2014年以来,AutoML也越发火热起来。 118 | 119 |        深度学习时代的鲜明特征是大数据量、深层次网络、特征学习与端到端学习。我们希望能够从数据一步得到模型,而不需要其他的什么人为参与过程。如果再加上语音助手什么的,或许我们就能达到浅层次通用人工智能的目标呢。在深度学习DL模型架构难以取得更大突破的时候,给它再开辟一条道路呢。一如蒸馏模型,又如MobileNet。 120 | 121 |         工程化和应用级市场,更能带来意想不到的惊喜。这一点,从近年来微软开源的AutoML工具NNI大受欢迎中,可以管中窥豹。 122 | 123 |   124 | 125 | 二、AutoML特性 126 | 127 |         从比较出名的开源Auto平台、互联网大厂AutoML云产品,以及AI公司的AutoML软件来看,一般包括特征工程(FE,Auto feature engine)、神经网络搜索(NAS,Neural Architecture Search) 和超参数优化(HPO,Hyper-parameter optimization) 等功能,如下图所示: 128 | 129 | 130 | 131 |         可能还存在其他一些小功能,如数据增强(几何,颜色), 激活函数(swish,Hybrid DNN), 归一化方法(Switchable Normalization, BN, IN, LN, GN), 优化方法(Neural Optimizer Search, sgd,rmsprop,adam, 衰减, 函数的组合), 优化目标(AM-LFS, Learning to teach with dynamic loss functions), 模型剪枝(AMC), 模型量化(HAQ), 部署上线等。 132 | 133 |         AutoML优点:可用于传统机器学习、图像等较成熟领域,自动化摒弃了人为因素的干扰、增强泛化性; 134 | 135 |                      缺点:消耗资源大、优化方法可能达不到经验模型甚至是严重偏向。 136 | 137 |   138 | 139 | 三、 NNI 140 | 141 |         NNI (Neural Network Intelligence,[翻译为神经网络智能?]) 是微软开源的自动机器学习(AutoML)的Python工具包。NNI 通过 nni_manager模块 等管理 AutoML 的 Experiment (实验),调度并运行各种调优算法生成的 Trial (尝试) 任务,来完成搜索最优神经网络架构、超参数等。同时支持本机,远程服务器,单机,多机,OpenPAI,Kubeflow,K8S和其它云服务等训练环境。 142 | 143 |         对比其他开源项目,或大公司产品可以发现,NNI支持的神经网络结构搜索、超参数优化等调优算法更多,功能最强大。 144 
| 145 |         以我的使用体验来看,NNI更像一个黑盒,浅度用户使用可能比较舒服。使用nni的SDK可以完美嵌入自己的网络结构进行超参数优化,详情如下: 146 | 147 | 148 | 149 |         超参数优化需要定义搜索空间search_space.json,NNI配置config.yml,以及主程序调用main.py函数。 150 | 151 |         此外,NNI还需要用特定命令行启动,自由度似乎不太够。 152 | 153 | 希望对你有所帮助! 154 | ———————————————— 155 | 版权声明:本文为CSDN博主「大漠帝国」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。 156 | 原文链接:https://blog.csdn.net/rensihui/article/details/104578756""" 157 | 158 | 159 | sums_textrank_textrank4zh = text_rank.summarize(docs, num=6, model_type="textrank_textrank4zh") 160 | print("textrank_textrank4zh:") 161 | for sum_ in sums_textrank_textrank4zh: 162 | print(sum_) 163 | 164 | sums_textrank_sklearn = text_rank.summarize(docs, num=6, model_type="textrank_sklearn") 165 | print("textrank_sklearn:") 166 | for sum_ in sums_textrank_sklearn: 167 | print(sum_) 168 | 169 | # gensim自带的textrank只支持英文, 分隔符为". ", "? ", "! " 170 | sums_textrank_gensim = text_rank.summarize(docs, num=100, model_type="textrank_gensim") 171 | print("textrank_gensim:") 172 | for sum_ in sums_textrank_gensim: 173 | print(sum_) 174 | ``` 175 | -------------------------------------------------------------------------------- /macropodus/base/seg_basic.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/28 20:17 4 | # @author : Mo 5 | # @function: basic of segment, dictionary 6 | 7 | 8 | from macropodus.preprocess.tools_common import load_json, save_json, txt_read 9 | from macropodus.conf.path_config import path_dict_macropodus, path_dict_user 10 | from macropodus.conf.path_config import path_macropodus_dict_freq_cache 11 | from macropodus.conf.path_log import get_logger_root 12 | from collections import defaultdict 13 | import pickle 14 | import time 15 | import os 16 | 17 | 18 | logger = get_logger_root() 19 | logger.info("path of dict cache is {}!".format(path_macropodus_dict_freq_cache)) 20 | 21 | 22 | class SegBasic: 23 | def __init__(self, use_cache=True): 24 | # time_start = time.time() 25 | # 存在缓存则直接读取, 序列化加速缓存读取速度 26 | if use_cache and os.path.exists(path_macropodus_dict_freq_cache): 27 | with open(path_macropodus_dict_freq_cache, "rb") as fpmc: 28 | [self.dict_words_freq, self.num_words, self.dict_user] = pickle.load(fpmc) 29 | fpmc.close() 30 | # logger.info("seg: " + str(time.time()-time_start)) # 5.29, 5.26 31 | else: 32 | self.dict_words_freq = defaultdict() 33 | self.dict_user = {} 34 | self.load_macropodus_dict() # 默认字典 35 | self.load_user_dict() # 用户字典 36 | # logger.info("seg: " + str(time.time() - time_start)) # 10.13, 10.33 37 | # 第一次跑macropodus, 序列化需要的缓存 38 | if use_cache and not os.path.exists(path_macropodus_dict_freq_cache): 39 | with open(path_macropodus_dict_freq_cache, "wb") as fpmc: 40 | pickle.dump([self.dict_words_freq, self.num_words, self.dict_user], fpmc) 41 | 42 | def load_macropodus_dict(self): 43 | """ 44 | 加载默认的基础字典 45 | :return: None 46 | """ 47 | dict_macropodus = load_json(path_dict_macropodus)[0] # (path_dict_jiagu)[0] # (path_dict_macropodus)[0] # 加载json字典文件 48 | dict_macropodus_def = defaultdict() # 转为defaultdict 49 | for k,v in dict_macropodus.items(): 50 | dict_macropodus_def[k] = v 51 | self.dict_words_freq = dict_macropodus_def # {}词-词频字典 52 | 53 | def load_user_dict(self, path_user=path_dict_user, type_user="json"): 54 | """ 55 | 加载用户词典 56 | :param path_user:str, like '/home/user.dict' 57 | :return: None 58 | """ 59 | if not os.path.exists(path_user): 60 | raise RuntimeError("your path_user is not exist!") 61 | if 
type_user == "json": 62 | self.dict_user = load_json(path_user)[0] # 加载json字典文件 63 | for k, v in self.dict_user.items(): 64 | if k not in self.dict_words_freq: 65 | self.dict_words_freq[k] = v # 更新到总字典, words_freq 66 | else: 67 | self.dict_words_freq[k] = self.dict_words_freq[k] + v # 更新到总字典, words_freq 68 | self.num_words = sum(self.dict_words_freq.values()) 69 | elif type_user == "txt": 70 | words_all = txt_read(path_user) 71 | for word_freq in words_all: 72 | wf = word_freq.split(" ") # 空格' '区分带不带词频的情况 73 | if len(wf) == 2: 74 | word = wf[0] 75 | freq = wf[1] 76 | else: 77 | word = wf[0] 78 | freq = 132 79 | if word not in self.dict_words_freq: 80 | self.dict_words_freq[word] = freq # 更新到总字典, words_freq 81 | else: 82 | self.dict_words_freq[word] = self.dict_words_freq[word] + freq # 更新到总字典, words_freq 83 | self.num_words = sum(self.dict_words_freq.values()) 84 | elif type_user == "csv": 85 | words_all = txt_read(path_user) 86 | for word_freq in words_all: 87 | wf = word_freq.split(",") # 逗号','区分带不带词频的情况 88 | if len(wf)==2: 89 | word = wf[0] 90 | freq = wf[1] 91 | else: 92 | word = wf[0] 93 | freq = 132 94 | if word not in self.dict_words_freq: 95 | self.dict_words_freq[word] = freq # 更新到总字典, words_freq 96 | else: 97 | self.dict_words_freq[word] = self.dict_words_freq[word] + freq # 更新到总字典, words_freq 98 | self.num_words = sum(self.dict_words_freq.values()) 99 | else: 100 | raise EOFError 101 | 102 | def add_word(self, word, freq=132): 103 | """ 104 | 新增词典到词语, 不可持久化, 重载消失 105 | :param word: str, like '大漠帝国' 106 | :param freq: int, like 132 107 | :return: None 108 | """ 109 | assert type(word) == str 110 | if word in self.dict_words_freq: 111 | self.dict_words_freq[word] = self.dict_words_freq[word] if freq !=132 else freq 112 | else: 113 | self.dict_words_freq[word] = freq 114 | self.num_words += freq 115 | 116 | def delete_word(self, word): 117 | """ 118 | 删除词语, 不可持久化, 重载消失 119 | :param word_freqs: str, like '大漠帝国' 120 | :return: None 121 | """ 122 | assert type(word) == str 123 | if word in self.dict_words_freq: 124 | self.num_words -= self.dict_words_freq[word] 125 | self.dict_words_freq.pop(word) 126 | 127 | def save_add_words(self, word_freqs): 128 | """ 129 | 新增词语到用户词典, 可持久化, 重载有效 130 | :param word_freqs: dict, like {'大漠帝国':132} 131 | :return: None 132 | """ 133 | assert type(word_freqs) == dict 134 | for k, v in word_freqs.items(): 135 | self.add_word(k, v) # 新增到总字典, 不持久化 136 | self.dict_user[k] = v # 新增到用户字典, 持久化 137 | save_json([self.dict_user], path_dict_user) 138 | 139 | def save_delete_words(self, words): 140 | """ 141 | 删除词语到用户词典, 可持久化, 重载有效 142 | :param word_freqs: list, like ['大漠帝国'] 143 | :return: None 144 | """ 145 | assert type(words) == list 146 | for w in words: 147 | self.delete_word(w) # 删除到总字典, 不持久化 148 | if w in self.dict_user: self.dict_user.pop(w) # 删除到用户字典, 持久化 149 | save_json([self.dict_user], path_dict_user) 150 | -------------------------------------------------------------------------------- /macropodus/summarize/topic_base/topic_nmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/12/2 20:33 4 | # @author :Mo 5 | # @function :topic model of NMF 6 | 7 | 8 | from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit 9 | from macropodus.data.words_common.stop_words import stop_words 10 | from macropodus.preprocess.tools_ml import macropodus_cut 11 | from macropodus.preprocess.tools_ml import cut_sentence 12 | # sklearn 13 | from 
sklearn.decomposition import NMF 14 | import numpy as np 15 | 16 | 17 | class NMFSum: 18 | def __init__(self): 19 | self.stop_words = stop_words.values() 20 | self.algorithm = 'lsi' 21 | 22 | def summarize(self, text, num=320, topic_min=5, judge_topic="all"): 23 | """ 24 | 25 | :param text: text or list, input docs 26 | :param num: int, number or amount of return 27 | :param topic_min: int, topic number 28 | :param judge_topic: str, calculate ways of topic 29 | :return: 30 | """ 31 | # 切句 32 | if type(text) == str: 33 | self.sentences = cut_sentence(text) 34 | elif type(text) == list: 35 | self.sentences = text 36 | else: 37 | raise RuntimeError("text type must be list or str") 38 | # 切词 39 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 40 | if word.strip()] for sentence in self.sentences] 41 | len_sentences_cut = len(sentences_cut) 42 | # 去除停用词等 43 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 44 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 45 | # 计算每个句子的tfidf 46 | sen_tfidf = tfidf_fit(self.sentences_cut) 47 | # 主题数, 经验判断 48 | topic_num = min(topic_min, int(len(sentences_cut) / 2)) # 设定最小主题数为3 49 | nmf_tfidf = NMF(n_components=topic_num, max_iter=320) 50 | res_nmf_w = nmf_tfidf.fit_transform(sen_tfidf.T) # 基矩阵 or 权重矩阵 51 | res_nmf_h = nmf_tfidf.components_ # 系数矩阵 or 降维矩阵 52 | 53 | if judge_topic: 54 | ### 方案一, 获取最大那个主题的k个句子 55 | ################################################################################## 56 | topic_t_score = np.sum(res_nmf_h, axis=-1) 57 | # 对每列(一个句子topic_num个主题),得分进行排序,0为最大 58 | res_nmf_h_soft = res_nmf_h.argsort(axis=0)[-topic_num:][::-1] 59 | # 统计为最大每个主题的句子个数 60 | exist = (res_nmf_h_soft <= 0) * 1.0 61 | factor = np.ones(res_nmf_h_soft.shape[1]) 62 | topic_t_count = np.dot(exist, factor) 63 | # 标准化 64 | topic_t_count /= np.sum(topic_t_count, axis=-1) 65 | topic_t_score /= np.sum(topic_t_score, axis=-1) 66 | # 主题最大个数占比, 与主题总得分占比选择最大的主题 67 | topic_t_tc = topic_t_count + topic_t_score 68 | topic_t_tc_argmax = np.argmax(topic_t_tc) 69 | # 最后得分选择该最大主题的 70 | res_nmf_h_soft_argmax = res_nmf_h[topic_t_tc_argmax].tolist() 71 | res_combine = {} 72 | for l in range(len_sentences_cut): 73 | res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l] 74 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 75 | ##################################################################################### 76 | else: 77 | ### 方案二, 获取最大主题概率的句子, 不分主题 78 | res_combine = {} 79 | for i in range(len_sentences_cut): 80 | res_row_i = res_nmf_h[:, i] 81 | res_row_i_argmax = np.argmax(res_row_i) 82 | res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax] 83 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 84 | num_min = min(num, int(len_sentences_cut * 0.6)) 85 | return score_sen[0:num_min] 86 | 87 | 88 | if __name__ == '__main__': 89 | nmf = NMFSum() 90 | doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 91 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 92 | "该基金认缴出资总规模为人民币3.01亿元。" \ 93 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 94 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 95 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 96 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 97 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 98 | 99 | doc = "和投票目标的等级来决定新的等级.简单的说。" \ 100 | 
"是上世纪90年代末提出的一种计算网页权重的算法! " \ 101 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \ 102 | "业界急需一种相对比较准确的网页重要性计算方法。" \ 103 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 104 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 105 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。" \ 106 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \ 107 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 108 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。" \ 109 | "即数量假设:一个网页被越多的其他页面链接,就越重)。" \ 110 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 111 | "总的来说就是一句话,从全局角度考虑,获取重要的信。" 112 | 113 | doc = '早年林志颖带kimi上《爸爸去哪儿》的时候,当时遮遮掩掩的林志颖老婆低调探班,总让人觉得格外神秘,大概是特别不喜欢' \ 114 | '在公众面前曝光自己日常的那种人。可能这么些年过去,心态不断调整过了,至少在微博上,陈若仪越来越放得开,晒自己带' \ 115 | '娃照顾双子星的点滴,也晒日常自己的护肤心得,时不时安利一些小东西。都快晚上十点半,睡美容觉的最佳时候,结果才带' \ 116 | '完一天娃的陈若仪还是不忘先保养自己,敷起了面膜。泡完澡,这次用的是一个稍微平价的面膜,脸上、甚至仔细到脖子上都' \ 117 | '抹上了。陈若仪也是多此一举,特别说自己不是裸体,是裹着浴巾的,谁在意这个呀,目光完全被你那又长又扑闪的睫毛给吸' \ 118 | '引住了。这也太吓人吧,怎么能够长那么长那么密那么翘。嫉妒地说一句,真的很像种的假睫毛呐。陈若仪的睫毛应该是天生' \ 119 | '的基础好吧,要不然也不会遗传给小孩,一家子都是睫毛精,几个儿子现在这么小都是长睫毛。只是陈若仪现在这个完美状态,' \ 120 | '一定是后天再经过悉心的呵护培养。网友已经迫不及待让她教教怎么弄睫毛了,陈若仪也是答应地好好的。各种私人物品主动' \ 121 | '揭秘,安利一些品牌给大家,虽然一再强调是自己的日常小物,还是很让人怀疑,陈若仪是不是在做微商当网红呐,网友建议' \ 122 | '她开个店,看这回复,也是很有意愿了。她应该不缺这个钱才对。隔三差五介绍下自己用的小刷子之类,陈若仪乐于向大家传' \ 123 | '授自己的保养呵护之道。她是很容易就被晒出斑的肤质,去海岛参加婚礼,都要必备这几款超爱用的防晒隔离。日常用的、太' \ 124 | '阳大时候用的,好几个种类,活得相当精致。你们按照自己的需要了解一下。画眉毛,最爱用的是intergrate的眉笔。也是个' \ 125 | '念旧的人,除了Dior,陈若仪的另一个眉粉其中一个是她高中就开始用的Kate。一般都是大学才开始化妆修饰自己,感受得到' \ 126 | '陈若仪从小就很爱美。各种小零小碎的化妆品,已经买过七八次的粉红胡椒抛光美体油,每天洗完澡陈若仪都会喷在肚子、大' \ 127 | '腿、屁股和膝盖手肘,说是能保持肌肤的平滑紧致程度。每安利一样东西,总有网友要在下面问其他问题咋个办,真是相当信' \ 128 | '任陈若仪了。每次她也很耐心的解答,"去黑头我用的是SUQQU洁面去角质按摩膏磨砂洁面洗面奶,"一定要先按摩再用。她自己' \ 129 | '已经回购过好几次,意思是你们再了解一下。了解归了解,买不买随意。毕竟像她另一个爱用的达尔肤面膜,效果好是好,价' \ 130 | '格据说比sk2都还要贵,不是大多数人日常能够消费得起的,大家就看个热闹就好了,还是多买多试多用才能找到最适合自己的' \ 131 | '护肤方法。' 132 | 133 | sum = nmf.summarize(doc, num=320) 134 | for i in sum: 135 | print(i) 136 | 137 | 138 | -------------------------------------------------------------------------------- /macropodus/network/service/server_streamer.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 22:18 4 | # @author : Mo 5 | # @function: service streamer of multiprocessing 6 | 7 | 8 | # 多进程, win10必须加, 否则报错 9 | import platform 10 | sys = platform.system() 11 | if sys == "Windows": 12 | import multiprocessing as mp 13 | mp.freeze_support() 14 | mp.set_start_method("spawn", force=True) 15 | 16 | from macropodus.network.service.server_base import Streamer, ThreadedStreamer 17 | from macropodus.preprocess.tools_ml import extract_chinese 18 | from tensorflow.python.keras.models import model_from_json 19 | from macropodus.preprocess.tools_common import load_json 20 | from keras_bert import Tokenizer 21 | import numpy as np 22 | import macropodus 23 | import codecs 24 | import os 25 | 26 | 27 | # 常规 28 | class AlbertBilstmPredict: 29 | def __init__(self, path_dir): 30 | self.path_dir = path_dir 31 | self.tokenizer_init() 32 | self.l2i_i2l_init() 33 | self.params_init() 34 | self.model_init() 35 | 36 | def model_init(self): 37 | """模型初始化""" 38 | # import tensorflow as tf 39 | # self.model = None 40 | # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1) # gpu_memory_fraction 41 | # config = tf.ConfigProto(gpu_options=gpu_options) 42 | # self.graph = tf.Graph() 43 | # self.sess = tf.Session(graph=self.graph, config=config) 44 | # with self.sess.as_default(): 45 | # with self.graph.as_default(): 46 | # self.model = None 47 | # graph = tf.get_default_graph() 48 | # sess = tf.Session(graph=graph) 49 | # with sess.as_default(): 50 | # with 
graph.as_default(): 51 | # tf.global_variables_initializer().run() 52 | path_graph = os.path.join(self.path_dir, "graph.json") 53 | path_model = os.path.join(self.path_dir, "model.h5") 54 | # 加载模型结构 55 | self.model = model_from_json(open(path_graph, "r", encoding="utf-8").read(), 56 | custom_objects=macropodus.custom_objects) 57 | # 加载模型权重 58 | self.model.load_weights(path_model) 59 | 60 | def tokenizer_init(self): 61 | """字典""" 62 | # reader tokenizer 63 | token_dict = {} 64 | path_dict = os.path.join(self.path_dir, "vocab.txt") 65 | with codecs.open(path_dict, 'r', 'utf8') as reader: 66 | for line in reader: 67 | token = line.strip() 68 | token_dict[token] = len(token_dict) 69 | # vocab_size = len(token_dict) 70 | self.tokenizer = Tokenizer(token_dict) 71 | 72 | def params_init(self): 73 | """超参数初始化""" 74 | # params 75 | path_params = os.path.join(self.path_dir, "params.json") 76 | self.params = load_json(path_params) 77 | self.len_max = self.params["len_max"] 78 | 79 | def l2i_i2l_init(self): 80 | """类别与数字项目转化""" 81 | # l2i_i2l 82 | path_l2i_i2l = os.path.join(self.path_dir, "l2i_i2l.json") 83 | self.l2i_i2l = load_json(path_l2i_i2l) 84 | 85 | def sentence2idx(self, text, second_text=None): 86 | """数据预处理""" 87 | text = extract_chinese(str(text).upper()) 88 | input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max) 89 | input_mask = len([1 for ids in input_id if ids != 0]) 90 | return [input_id, input_type_id, input_mask] 91 | 92 | def predict(self, quess): 93 | """预测多个问句""" 94 | quess_encode = [self.sentence2idx(ques) for ques in quess] 95 | x_ = np.array(quess_encode) 96 | x_1 = np.array([x[0] for x in x_]) 97 | x_2 = np.array([x[1] for x in x_]) 98 | x_3 = np.array([x[2] for x in x_]) 99 | ress = self.model.predict([x_1, x_2, x_3]) 100 | ress_idxs = [[np.argmax(rl) for rl in res_list] for res_list in ress.tolist()] 101 | ress_label = [[self.l2i_i2l["i2l"][str(ri)] if str(ri) in self.l2i_i2l["i2l"] else "O" for ri in res_idxs] 102 | for res_idxs in ress_idxs] 103 | ress_select = [ress_label[i][1:len(quess[i]) + 1] for i in range(len(quess))] 104 | return ress_select 105 | 106 | 107 | # 一个进程多个线程&多进程等 108 | class ServiceNer: 109 | def __init__(self, path_abs, cuda_devices="0", stream_type="processing", 110 | max_latency=0.1, worker_num=1, batch_size=32): 111 | self.algorithm = 'albert-ner-bilstm-crf' 112 | self.cuda_devices = cuda_devices 113 | self.stream_type = stream_type 114 | self.max_latency = max_latency 115 | self.worker_num = worker_num 116 | self.batch_size = batch_size 117 | self.path_abs = path_abs 118 | self.streamer_init() 119 | 120 | def streamer_init(self): 121 | """ 122 | ner初始化 123 | :param model: class, like "ner_model" 124 | :param cuda_devices: str, like "processing", "thread" 125 | :param stream_type: str, like "0,1" 126 | :param batch_size: int, like 32 127 | :param max_latency: float, 0-1, like 0.01 128 | :param worker_num: int, like 2 129 | :return: 130 | """ 131 | model = AlbertBilstmPredict(self.path_abs) 132 | if self.stream_type == "thread": 133 | self.streamer = ThreadedStreamer(model, self.batch_size, self.max_latency) 134 | else: 135 | self.streamer = Streamer(predict_function_or_model=model, 136 | cuda_devices=self.cuda_devices, 137 | max_latency=self.max_latency, 138 | worker_num=self.worker_num, 139 | batch_size=self.batch_size) 140 | self.streamer._wait_for_worker_ready() 141 | 142 | # def predict(self, text): 143 | # """ 144 | # 预测返回 145 | # :param text: str, like "桂林" 146 | # :return: list, like ["B-LOC", 
"I-LOC"] 147 | # """ 148 | # return self.streamer.predict(text) 149 | 150 | 151 | # 模型加载 152 | # path = "D:/workspace/pythonMyCode/Macropodus/macropodus/data/tag_seg_pku_1998_w2v_16" 153 | path = "D:/workspace/pythonMyCode/Macropodus/macropodus/data/ner_people_1998_mix_albert_1" 154 | model_server = ServiceNer(path, stream_type="thread", cuda_devices="-1", max_latency=0.1, worker_num=1, batch_size=32).streamer 155 | 156 | 157 | if __name__ == '__main__': 158 | ques = "北京欢迎您, 南宁2020东盟博览会" 159 | res = model_server.predict([ques]) 160 | print(res) 161 | while True: 162 | print("请输入:") 163 | ques = input() 164 | res = model_server.predict([ques]) 165 | print(res) 166 | -------------------------------------------------------------------------------- /macropodus/network/service/server_streamer_flask.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 22:18 4 | # @author : Mo 5 | # @function: service streamer of multiprocessing 6 | 7 | 8 | import platform 9 | 10 | # 多进程, win10必须加, 否则报错 11 | sys = platform.system() 12 | if sys == "Windows": 13 | import multiprocessing as mp 14 | mp.freeze_support() 15 | mp.set_start_method("spawn", force=True) 16 | 17 | from macropodus.network.service.server_base import ThreadedStreamer, Streamer 18 | from macropodus.preprocess.tools_ml import extract_chinese 19 | from tensorflow.python.keras.models import model_from_json 20 | from macropodus.preprocess.tools_common import load_json 21 | from macropodus.conf.path_log import get_logger_root 22 | from keras_bert import Tokenizer 23 | import numpy as np 24 | import macropodus 25 | import codecs 26 | import os 27 | 28 | # flask 29 | from flask import Flask, request, jsonify 30 | logger = get_logger_root() 31 | 32 | 33 | # 常规 34 | class AlbertBilstmPredict: 35 | def __init__(self, path_dir): 36 | self.path_dir = path_dir 37 | self.tokenizer_init() 38 | self.l2i_i2l_init() 39 | self.params_init() 40 | self.model_init() 41 | 42 | def model_init(self): 43 | """模型初始化""" 44 | path_graph = os.path.join(self.path_dir, "graph.json") 45 | path_model = os.path.join(self.path_dir, "model.h5") 46 | # 加载模型结构 47 | self.model = model_from_json(open(path_graph, "r", encoding="utf-8").read(), 48 | custom_objects=macropodus.custom_objects) 49 | # 加载模型权重 50 | self.model.load_weights(path_model) 51 | 52 | def tokenizer_init(self): 53 | """字典""" 54 | # reader tokenizer 55 | token_dict = {} 56 | path_dict = os.path.join(self.path_dir, "vocab.txt") 57 | with codecs.open(path_dict, 'r', 'utf8') as reader: 58 | for line in reader: 59 | token = line.strip() 60 | token_dict[token] = len(token_dict) 61 | # vocab_size = len(token_dict) 62 | self.tokenizer = Tokenizer(token_dict) 63 | 64 | def params_init(self): 65 | """超参数初始化""" 66 | # params 67 | path_params = os.path.join(self.path_dir, "params.json") 68 | self.params = load_json(path_params) 69 | self.len_max = self.params["len_max"] 70 | 71 | def l2i_i2l_init(self): 72 | """类别与数字项目转化""" 73 | # l2i_i2l 74 | path_l2i_i2l = os.path.join(self.path_dir, "l2i_i2l.json") 75 | self.l2i_i2l = load_json(path_l2i_i2l) 76 | 77 | def sentence2idx(self, text, second_text=None): 78 | """数据预处理""" 79 | text = extract_chinese(str(text).upper()) 80 | input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max) 81 | input_mask = len([1 for ids in input_id if ids != 0]) 82 | # return input_id, input_type_id, input_mask 83 | # x_ = np.array((input_id, input_type_id, 
input_mask)) 84 | x = [[input_id, input_type_id, input_mask]] 85 | x_ = np.array(x) 86 | x_1 = np.array([x[0] for x in x_]) 87 | x_2 = np.array([x[1] for x in x_]) 88 | x_3 = np.array([x[2] for x in x_]) 89 | return [x_1, x_2, x_3] 90 | 91 | def predict(self, ques): 92 | """预测""" 93 | mode_input = self.sentence2idx(ques) 94 | res = self.model.predict(mode_input) 95 | res_list = res.tolist()[0] 96 | res_idxs = [np.argmax(rl) for rl in res_list] 97 | res_label = [self.l2i_i2l["i2l"][str(ri)] if str(ri) in self.l2i_i2l["i2l"] else "O" for ri in res_idxs] 98 | return res_label[1:len(ques) + 1] 99 | 100 | 101 | # 一个进程多个线程等 102 | class ServiceNer: 103 | def __init__(self, path_model_dir, cuda_devices="0", stream_type="processing", 104 | max_latency=0.1, worker_num=1, batch_size=32): 105 | self.path_model_dir = path_model_dir 106 | self.cuda_devices = cuda_devices 107 | self.stream_type = stream_type 108 | self.max_latency = max_latency 109 | self.worker_num = worker_num 110 | self.batch_size = batch_size 111 | self.algorithm = 'albert-ner-bilstm-crf' 112 | self.streamer_init(self.path_model_dir, cuda_devices=self.cuda_devices, stream_type=self.stream_type, 113 | max_latency=self.max_latency, worker_num=self.worker_num, 114 | batch_size=self.batch_size) 115 | 116 | def streamer_init(self, path_abs, cuda_devices="0", stream_type="processing", 117 | max_latency=0.1, worker_num=1, batch_size=32): 118 | """ 119 | ner初始化 120 | :param path_abs: str, like "ner_model" 121 | :param cuda_devices: str, like "processing", "thread" 122 | :param stream_type: str, like "0,1" 123 | :param batch_size: int, like 32 124 | :param max_latency: float, 0-1, like 0.01 125 | :param worker_num: int, like 2 126 | :return: 127 | """ 128 | abp = AlbertBilstmPredict(path_abs) 129 | if stream_type == "thread": 130 | self.streamer = ThreadedStreamer(abp, batch_size, max_latency) 131 | else: 132 | self.streamer = Streamer(predict_function_or_model=abp, 133 | cuda_devices=cuda_devices, 134 | max_latency=max_latency, 135 | worker_num=worker_num, 136 | batch_size=batch_size) 137 | 138 | def predict(self, text): 139 | """ 140 | 预测返回 141 | :param text: str, like "桂林" 142 | :return: list, like ["B-LOC", "I-LOC"] 143 | """ 144 | return self.streamer.predict(text) 145 | 146 | 147 | def streamer_predict(streamer_real): 148 | """ 149 | 复合使函数方法通用 150 | :return: 151 | """ 152 | params = request.form if request.form else request.json 153 | sentences = params.get('texts', '') 154 | res = [] 155 | try: 156 | res = streamer_real.predict(sentences) 157 | except Exception as e: 158 | logger.info(str(e)) 159 | return res 160 | 161 | 162 | # 模型加载 163 | path = "D:/workspace/pythonMyCode/Macropodus/macropodus/data/ner_people_1998_mix_albert_1" 164 | sn = ServiceNer(path, cuda_devices="0,1", max_latency=0.1, worker_num=1, batch_size=32) 165 | app = Flask(__name__) 166 | 167 | 168 | @app.route('/ner/predict', methods=["POST, GET"]) 169 | def ner_predict_3(): 170 | res = streamer_predict(sn) 171 | return jsonify(content=res, 172 | content_type='charset = utf-8; application/json', 173 | reason='success', 174 | charset='utf-8', 175 | status='200') 176 | 177 | 178 | if __name__ == '__main__': 179 | 180 | app.run(port=8080, threaded=True, host='0.0.0.0', debug=False) 181 | 182 | -------------------------------------------------------------------------------- /macropodus/summarize/graph_base/textrank_word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time 
:2019/12/20 20:39 4 | # @author :Mo 5 | # @function :textrank of word2vec, keyword and sentence 6 | 7 | 8 | from macropodus.similarity.similarity_word2vec_char import SimW2vChar 9 | from macropodus.data.words_common.stop_words import stop_words 10 | from macropodus.preprocess.tools_ml import macropodus_cut 11 | from macropodus.preprocess.tools_ml import cut_sentence 12 | import networkx as nx 13 | import numpy as np 14 | 15 | 16 | class TextrankWord2vec(SimW2vChar): 17 | def __init__(self, use_cache=True): 18 | self.algorithm = 'textrank_word2vec' 19 | self.stop_words = stop_words 20 | super().__init__(use_cache) # self.w2v_char 21 | 22 | def cut_window(self, sent_words, win_size=2): 23 | """ 24 | 滑动窗口切词 25 | :param sent_words: list, like ["我", "是", "大漠帝国"] 26 | :param win_size: int, like 3 27 | :return: yield 28 | """ 29 | if win_size < 2: 30 | win_size = 2 31 | for i in range(1, win_size): 32 | if i >= len(sent_words): 33 | break 34 | sent_terms = sent_words[i:] # 后面的 35 | sent_zip = zip(sent_words, sent_terms) # 候选词对 36 | for sz in sent_zip: 37 | yield sz 38 | 39 | def keyword(self, text, num=6, score_min=0.025, win_size=3, type_sim="total", type_encode="avg", config={"alpha": 0.86, "max_iter":100}): 40 | """ 41 | 关键词抽取, textrank of word2vec cosine 42 | :param text: str, doc. like "大漠帝国是历史上存在的国家吗?你知不知道?嗯。" 43 | :param num: int, length of sentence like 6 44 | :param win_size: int, windows size of combine. like 2 45 | :param type_sim: str, type of simiilarity. like "total", "cosine" 46 | :param config: dict, config of pagerank. like {"alpha": 0.86, "max_iter":100} 47 | :return: list, result of keyword. like [(0.020411696169510562, '手机'), (0.016149784106276977, '夏普')] 48 | """ 49 | # 切句 50 | if type(text) == str: 51 | self.sentences = cut_sentence(text) 52 | elif type(text) == list: 53 | self.sentences = text 54 | else: 55 | raise RuntimeError("text type must be list or str") 56 | # macropodus_cut 切词 57 | self.macropodus_word = [macropodus_cut(sentence) for sentence in self.sentences] 58 | # 去除停用词等 59 | self.sentences_word = [[w for w in mw if w not in self.stop_words.values()] for mw in self.macropodus_word] 60 | # 构建图的顶点 61 | word2index = {} 62 | index2word = {} 63 | word_index = 0 64 | for sent_words in self.sentences_word: 65 | for word in sent_words: 66 | if not word in word2index: # index 67 | word2index[word] = word_index 68 | index2word[word_index] = word 69 | word_index += 1 70 | graph_words = np.zeros((word_index, word_index)) 71 | # 构建图的边, 以两个词语的余弦相似度为基础 72 | for sent_words in self.sentences_word: 73 | for cw_1, cw_2 in self.cut_window(sent_words, win_size=win_size): 74 | if cw_1 in word2index and cw_2 in word2index: 75 | idx_1, idx_2 = word2index[cw_1], word2index[cw_2] 76 | score_w2v_cosine = self.similarity(cw_1, cw_2, type_sim=type_sim, 77 | type_encode=type_encode) 78 | graph_words[idx_1][idx_2] = score_w2v_cosine 79 | graph_words[idx_2][idx_1] = score_w2v_cosine 80 | # 构建相似度矩阵 81 | w2v_cosine_sim = nx.from_numpy_matrix(graph_words) 82 | # nx.pagerank 83 | sens_scores = nx.pagerank(w2v_cosine_sim, **config) 84 | # 得分排序 85 | sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True) 86 | # 保留topk个, 防止越界 87 | topk = min(len(sen_rank), num) 88 | # 返回原句子和得分 89 | return [(sr[1], index2word[sr[0]]) for sr in sen_rank if len(index2word[sr[0]])>1 and score_min<=sr[1]][0:topk] 90 | 91 | def summarize(self, text, num=320, type_sim="cosine", type_encode="avg", config={"alpha": 0.33, "max_iter":100}): 92 | """ 93 | 文本摘要抽取, textrank of word2vec cosine 94 | :param text: 
str, doc. like "大漠帝国是历史上存在的国家吗?你知不知道?嗯。" 95 | :param num: int, length of sentence like 6 96 | :param type_sim: str, type of simiilarity. like "total", "cosine" 97 | :param config: dict, config of pagerank. like {"alpha": 0.86, "max_iter":100} 98 | :return: list, result of keyword. like [(0.06900223298930287, 'PageRank The PageRank Citation Ranking'), (0.08698940285163381, 'PageRank通过网络浩瀚的超链接关系来确定一个页面的等级')] 99 | """ 100 | # 切句 101 | if type(text) == str: 102 | self.sentences = cut_sentence(text) 103 | elif type(text) == list: 104 | self.sentences = text 105 | else: 106 | raise RuntimeError("text type must be list or str") 107 | # 输入文本句子长度 108 | len_sen = len(self.sentences) 109 | # 构建图的顶点 110 | sent2idx = {} 111 | idx2sent = {} 112 | sent_idx = 0 113 | for sent in self.sentences: 114 | sent2idx[sent] = sent_idx 115 | idx2sent[sent_idx] = sent 116 | sent_idx += 1 117 | graph_sents = np.zeros((sent_idx, sent_idx)) 118 | # 构建图的边, 以两个句子的余弦相似度为基础 119 | for i in range(len_sen): 120 | for j in range(len_sen): 121 | score_w2v_cosine = self.similarity(self.sentences[i], self.sentences[j], 122 | type_sim=type_sim, type_encode=type_encode) 123 | graph_sents[i][j] = score_w2v_cosine 124 | graph_sents[j][i] = score_w2v_cosine 125 | # 构建相似度矩阵 126 | w2v_cosine_sim = nx.from_numpy_matrix(graph_sents) 127 | # nx.pagerank 128 | sens_scores = nx.pagerank(w2v_cosine_sim, **config) 129 | # 得分排序 130 | sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True) 131 | # 保留topk个, 防止越界 132 | topk = min(len(sen_rank), num) 133 | # 返回原句子和得分 134 | return [(sr[1], self.sentences[sr[0]]) for sr in sen_rank][0:topk] 135 | 136 | 137 | if __name__ == '__main__': 138 | text = "是上世纪90年代末提出的一种计算网页权重的算法。" \ 139 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \ 140 | "业界急需一种相对比较准确的网页重要性计算方法," \ 141 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 142 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 143 | "Google把从A页面到B页面的链接解释为A页面给B页面投票," \ 144 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面" \ 145 | "和投票目标的等级来决定新的等级。简单的说," \ 146 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 147 | "PageRank The PageRank Citation Ranking: Bringing Order to the Web," \ 148 | "具体说来就是,PageRank有两个基本思想,也可以说是假设," \ 149 | "即数量假设:一个网页被越多的其他页面链接,就越重);" \ 150 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 151 | "总的来说就是一句话,从全局角度考虑,获取重要的信息。" 152 | trww = TextrankWord2vec() 153 | keyword = trww.keyword(text, num=8) 154 | summary = trww.summarize(text, num=32) 155 | print(keyword) 156 | print(summary) 157 | 158 | -------------------------------------------------------------------------------- /macropodus/tookit/calculator_sihui/calcultor_number.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/22 20:09 4 | # @author :Mo 5 | # @function :extract number from sentence of chinese or mix。提取数字,中文,或者混合中文-阿拉伯数字 6 | 7 | 8 | # import regex as re 9 | import re 10 | 11 | 12 | # * 字符串预处理模块,为分析器TimeNormalizer提供相应的字符串预处理服务 13 | class StringPreHandler: 14 | # @Author : zhm 15 | # @codes : code from github: https://github.com/zhanzecheng/Time_NLP 16 | # @function :StringPreHandler.py 17 | @classmethod 18 | def delKeyword(cls, target, rules): 19 | """ 20 | 该方法删除一字符串中所有匹配某一规则字串 21 | 可用于清理一个字符串中的空白符和语气助词 22 | :param target: 待处理字符串 23 | :param rules: 删除规则 24 | :return: 清理工作完成后的字符串 25 | """ 26 | pattern = re.compile(rules) 27 | res = pattern.sub('', target) 28 | # print res 29 | return res 30 | 31 | 32 | @classmethod 33 | def numberTranslator(cls, target): 34 | """ 35 | 该方法可以将字符串中所有的用汉字表示的数字转化为用阿拉伯数字表示的数字 36 | 如"这里有一千两百个人,六百零五个来自中国"可以转化为 37 
| "这里有1200个人,605个来自中国" 38 | 此外添加支持了部分不规则表达方法 39 | 如两万零六百五可转化为20650 40 | 两百一十四和两百十四都可以转化为214 41 | 一六零加一五八可以转化为160+158 42 | 该方法目前支持的正确转化范围是0-99999999 43 | 该功能模块具有良好的复用性 44 | :param target: 待转化的字符串 45 | :return: 转化完毕后的字符串 46 | """ 47 | pattern = re.compile(u"[一二两三四五六七八九123456789]万[一二两三四五六七八九123456789](?!(千|百|十))") 48 | match = pattern.finditer(target) 49 | for m in match: 50 | group = m.group() 51 | s = group.split(u"万") 52 | s = list(filter(None, s)) 53 | num = 0 54 | if len(s) == 2: 55 | num += cls.wordToNumber(s[0]) * 10000 + cls.wordToNumber(s[1]) * 1000 56 | target = pattern.sub(str(num), target, 1) 57 | 58 | pattern = re.compile(u"[一二两三四五六七八九123456789]千[一二两三四五六七八九123456789](?!(百|十))") 59 | match = pattern.finditer(target) 60 | for m in match: 61 | group = m.group() 62 | s = group.split(u"千") 63 | s = list(filter(None, s)) 64 | num = 0 65 | if len(s) == 2: 66 | num += cls.wordToNumber(s[0]) * 1000 + cls.wordToNumber(s[1]) * 100 67 | target = pattern.sub(str(num), target, 1) 68 | 69 | pattern = re.compile(u"[一二两三四五六七八九123456789]百[一二两三四五六七八九123456789](?!十)") 70 | match = pattern.finditer(target) 71 | for m in match: 72 | group = m.group() 73 | s = group.split(u"百") 74 | s = list(filter(None, s)) 75 | num = 0 76 | if len(s) == 2: 77 | num += cls.wordToNumber(s[0]) * 100 + cls.wordToNumber(s[1]) * 10 78 | target = pattern.sub(str(num), target, 1) 79 | 80 | pattern = re.compile(u"[零一二两三四五六七八九]") 81 | match = pattern.finditer(target) 82 | for m in match: 83 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 84 | 85 | # pattern = re.compile(u"(?<=(周|星期))[末天日]") 86 | pattern = re.compile(u"((?<=周)[末天日])|((?<=星期)[末天日])") 87 | match = pattern.finditer(target) 88 | for m in match: 89 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 90 | 91 | # pattern = re.compile(u"(? 
202 | """ 203 | res = sph.numberTranslator(target=sentence) 204 | find_list = [] 205 | for i in re.finditer('(\d+(\.\d+)?)', res): 206 | find_list.append(i.group()) 207 | return find_list 208 | 209 | 210 | if __name__ == '__main__': 211 | sen = "1000.一加1等于几,周末和星期天,星期一星期二" 212 | res = extract_number(sen) 213 | print(res) 214 | -------------------------------------------------------------------------------- /macropodus/tookit/calculator_sihui/calcultor_formula.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/21 23:38 4 | # @author :Mo 5 | # @function :calcultor of text, not filter and redundancy 6 | 7 | 8 | from macropodus.conf.path_log import get_logger_root 9 | import re 10 | 11 | 12 | logger = get_logger_root() 13 | 14 | 15 | def change_symbol(formula): 16 | """ 17 | 提取负号 18 | eg:-9-2-5-2*3-5/3-40*4/-1.0/5+6*3 ===> -(9+2+5+2*3+5/3+40*4/1.0/5-6*3) 19 | :param formula: 20 | :return: 21 | """ 22 | def primary_change(for_str): # 把算式中的全角 + - 对应换成 - + 23 | temp = for_str.split("+") 24 | new_formula = [] 25 | for value in temp: 26 | value = value.replace("-", "+") 27 | new_formula.append(value) 28 | return "-".join(new_formula) 29 | 30 | if formula.startswith("-"): 31 | formula = formula.replace("-", "", 1) 32 | formula = primary_change(formula) 33 | formula = formula.join(["-(", ")"]) 34 | elif formula.startswith("+"): 35 | formula = primary_change(formula) 36 | formula = formula.join(["-(", ")"]) 37 | else: 38 | formula = primary_change(formula) 39 | formula = formula.join(["-(-", ")"]) 40 | return formula 41 | 42 | 43 | def remove_repeat(formula): 44 | """ 45 | 去掉连续的重复的运算符 46 | :param formula: str, like: "1++2" 47 | :return: str, like:"1+2" 48 | """ 49 | temp = formula.replace("++", "+") 50 | temp = temp.replace("+-", "-") 51 | temp = temp.replace("-+", "-") 52 | temp = temp.replace("--", "+") 53 | temp = temp.replace("*+", "*") 54 | temp = temp.replace("+*", "*") 55 | temp = temp.replace("/+", "/") 56 | temp = temp.replace("+/", "/") 57 | return temp 58 | 59 | 60 | def has_special_operator(formula, special_operator): 61 | """ 62 | 判断是否有 *+ +- /- 之类的运算符 63 | :param formula: 64 | :param special_operator: 65 | :return: 66 | """ 67 | for operator in special_operator: 68 | if formula.find(operator) != -1: 69 | return operator 70 | return "" 71 | 72 | 73 | def handle_special_operator(formula, operator): 74 | """ 75 | 如果有 "*-", "-*", "/-", "-/" 这些运算符, 76 | 提取负号,去掉重复的运算符 77 | :param formula: 78 | :param operator: 79 | :return: 80 | """ 81 | temp = "" 82 | regex = "\d*[.]?\d+" 83 | opera = operator.replace("*", "[*]") 84 | ret = re.compile(opera.join([regex, regex])) 85 | while ret.search(formula): 86 | search_res = ret.search(formula).group() 87 | if operator.find("*") != -1: 88 | temp = search_res.replace(operator, "*") 89 | elif operator.find("/") != -1: 90 | temp = search_res.replace(operator, "/") 91 | temp = "-".join(["", temp]) 92 | formula = formula.replace(search_res, temp, 1) 93 | return formula 94 | 95 | 96 | def has_parentheses(formula): 97 | """ 98 | 判断是否还有括号 99 | :param formula: str 100 | :return: boolean 101 | """ 102 | if re.search("[()]", formula): 103 | return True 104 | return False 105 | 106 | 107 | def judge_illegal(formula): 108 | """ 109 | 判断括号是否匹配完全,运算符是否合法 110 | 没有考虑 ** // 的计算 111 | :param formula: str 112 | :return: str 113 | """ 114 | if len(re.findall("[(]", formula)) != len(re.findall("[)]", formula)): 115 | return True 116 | if formula.startswith("*") or 
formula.startswith("/"): 117 | return True 118 | if has_special_operator(formula, ["*/", "/*", "**", "//"]): 119 | return True 120 | return False 121 | 122 | 123 | def calculator_formula(formula): 124 | """ 125 | 计算算式,这里计算的是不带括号的算式 126 | 计算次序是 / * - + 127 | 计算过程中出现括号则停止计算,返回当前的算式 128 | :param formula: 129 | :return: 130 | """ 131 | def primary_operator(for_str, operation): 132 | try: 133 | primary_result = 0 134 | regex = "\d*[.]?\d*" 135 | ret = re.compile(operation.join(["[", "]"]).join([regex, regex])) 136 | while ret.search(for_str): 137 | ret_opera = has_special_operator(for_str, ["*-", "-*", "/-", "-/"]) 138 | while ret_opera: 139 | for_str = handle_special_operator(for_str, ret_opera) 140 | ret_opera = has_special_operator(for_str, ["*-", "-*", "/-", "-/"]) 141 | while has_special_operator(for_str, ["+-", "-+", "++", "--", "+*", "*+", "+/", "/+"]): 142 | for_str = remove_repeat(for_str) 143 | # print("primary_operator:", for_str) 144 | if has_parentheses(for_str): 145 | return for_str 146 | if for_str.startswith("-"): 147 | temp = re.findall("^-\d*[.]?\d*$", for_str) 148 | if temp: 149 | return temp[0] 150 | return change_symbol(for_str) 151 | if for_str.startswith("+"): 152 | for_str = for_str.replace("+", "", 1) 153 | if not ret.search(for_str): 154 | continue 155 | search_res = ret.search(for_str).group() 156 | operand_list = search_res.split(operation) 157 | if operation == "/": 158 | primary_result = float(operand_list[0]) / float(operand_list[1]) 159 | elif operation == "*": 160 | primary_result = float(operand_list[0]) * float(operand_list[1]) 161 | elif operation == "-": 162 | primary_result = float(operand_list[0]) - float(operand_list[1]) 163 | elif operation == "+": 164 | primary_result = float(operand_list[0]) + float(operand_list[1]) 165 | for_str = for_str.replace(search_res, '%f' % (primary_result), 1) 166 | return for_str 167 | except Exception as e: 168 | logger.info(str(e)) 169 | return None 170 | try: 171 | formula = primary_operator(formula, "/") 172 | formula = primary_operator(formula, "*") 173 | formula = primary_operator(formula, "-") 174 | formula = primary_operator(formula, "+") 175 | except Exception as e: 176 | logger.info(str(e)) 177 | return None 178 | return formula 179 | 180 | 181 | def remove_parentheses(formula): 182 | """ 183 | 去掉算式的括号,计算括号里算式 184 | :param formula: 185 | :return: 186 | """ 187 | parentheses = re.compile("\([^()]+\)") 188 | while parentheses.search(formula): 189 | search_res = parentheses.search(formula).group() 190 | for_str = re.sub("[()]", "", search_res) 191 | if judge_illegal(for_str): 192 | return "" 193 | for_str = calculator_formula(for_str) 194 | formula = formula.replace(search_res, for_str, 1) 195 | """ 196 | 会有去掉所有括号算式还没算完的情况 197 | eg:1-2*65 198 | 需要再计算一遍算式 199 | """ 200 | formula = calculator_formula(formula) 201 | return formula 202 | 203 | 204 | def result_formula(formula): 205 | """ 206 | 简单计算器, 纯粹四则运算 207 | 去完括号后额外计算的那一次若再次出现括号, 208 | 则重复去括号运算,直至再没有括号 209 | :param formula: str 210 | :return: str 211 | """ 212 | 213 | def remove_space(formula): 214 | """ 215 | 去掉算式的空格 216 | :param formula: str 217 | :return: str 218 | """ 219 | return formula.replace(" ", "") 220 | 221 | def first_calculator(for_str): 222 | """ 223 | 先计算括号里边的 224 | :param for_str: 225 | :return: 226 | """ 227 | if judge_illegal(for_str): 228 | return None 229 | return remove_parentheses(for_str) 230 | 231 | formula = remove_space(formula) 232 | 233 | formula = first_calculator(formula) 234 | if not formula: 235 | return None 236 | while 
has_parentheses(formula): 237 | formula = first_calculator(formula) 238 | # print("calculator_result:", formula) 239 | if not formula: 240 | return None 241 | return formula 242 | 243 | 244 | if __name__ == '__main__': 245 | cal = result_formula("1+1+2+3*(35+1-5*7-10/5)/2*2") 246 | print(cal) 247 | -------------------------------------------------------------------------------- /macropodus/summarize/feature_base/text_teaser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/26 20:02 4 | # @author :Mo 5 | # @function :text summary of feature-base of TextTeaser 6 | # @paper :Automatic Text Summarization for Indonesian Language Using TextTeaser(2013) 7 | # @url :using Google Scholar 8 | 9 | 10 | from macropodus.data.words_common.stop_words import stop_words 11 | from macropodus.preprocess.tools_ml import extract_chinese 12 | from macropodus.preprocess.tools_ml import macropodus_cut 13 | from macropodus.preprocess.tools_ml import cut_sentence 14 | from collections import Counter 15 | 16 | 17 | class TextTeaserSum: 18 | def __init__(self): 19 | self.algorithm = 'text_teaser' 20 | self.stop_words = stop_words.values() 21 | self.len_ideal = 18 # 中心句子长度, 默认 22 | 23 | def score_position(self): 24 | """ 25 | 文本句子位置得分 26 | :param sentence: 27 | :return: 28 | """ 29 | score_position = [] 30 | for i, sen in enumerate(self.sentences): 31 | score_standard = i / (len(self.sentences)) 32 | if score_standard >= 0 and score_standard <= 0.1: 33 | score_position.append(0.17) 34 | elif score_standard > 0.1 and score_standard <= 0.2: 35 | score_position.append(0.23) 36 | elif score_standard > 0.2 and score_standard <= 0.3: 37 | score_position.append(0.14) 38 | elif score_standard > 0.3 and score_standard <= 0.4: 39 | score_position.append(0.08) 40 | elif score_standard > 0.4 and score_standard <= 0.5: 41 | score_position.append(0.05) 42 | elif score_standard > 0.5 and score_standard <= 0.6: 43 | score_position.append(0.04) 44 | elif score_standard > 0.6 and score_standard <= 0.7: 45 | score_position.append(0.06) 46 | elif score_standard > 0.7 and score_standard <= 0.8: 47 | score_position.append(0.04) 48 | elif score_standard > 0.8 and score_standard <= 0.9: 49 | score_position.append(0.04) 50 | elif score_standard > 0.9 and score_standard <= 1.0: 51 | score_position.append(0.15) 52 | else: 53 | score_position.append(0) 54 | return score_position 55 | 56 | def score_length(self, sentence): 57 | """ 58 | 文本长度得分 59 | :param sentence: 60 | :return: 61 | """ 62 | score_length = 1 - min(abs(self.len_ideal - len(sentence)), self.len_ideal) / self.len_ideal 63 | return score_length 64 | 65 | def score_sbs(self, words): 66 | """ 67 | 单个句子的sbs分数 68 | :param words: 69 | :return: 70 | """ 71 | score_sbs = 0.0 72 | for word in words: 73 | if word in self.word_freqs: 74 | score_sbs += self.word_freqs[word] 75 | return ((1.0 / abs(len(words))) if len(words) else 1e-9) * score_sbs 76 | 77 | def score_dbs(self, words): 78 | """ 79 | 单个句子的dbs分数 80 | :param words: 81 | :return: 82 | """ 83 | words_all = list(self.word_freqs.keys()) 84 | pun = len(set(words)&set(words_all)) + 1 85 | score_dbs = 0.0 86 | wf_first = [] 87 | for i, word in enumerate(words): 88 | if word in words_all: 89 | index = words_all.index(word) 90 | if not wf_first: 91 | wf_first = [index, self.word_freqs[word]] 92 | else: 93 | score_dbs += wf_first[1]*self.word_freqs[word] / (((wf_first[0] - index) if (wf_first[0] - index)!=0 else self.len_words)**2) 94 | 
score_dbs = score_dbs if score_dbs !=0 else 1e-9 95 | return (1.0 / pun * (pun + 1.0)) * score_dbs 96 | 97 | def score_title(self, words): 98 | """ 99 | 与标题重合部分词语 100 | :param words: 101 | :return: 102 | """ 103 | mix_word = [word for word in words if word in self.title] 104 | len_mix_word = len(mix_word) 105 | len_title_word = len(self.title) 106 | return (len_mix_word + 1.0) / (len_mix_word + 2.0) / len_title_word 107 | 108 | def summarize(self, text, num=320, title=None): 109 | # 切句 110 | if type(text) == str: 111 | self.sentences = cut_sentence(text) 112 | elif type(text) == list: 113 | self.sentences = text 114 | else: 115 | raise RuntimeError("text type must be list or str") 116 | self.title = title 117 | if self.title: 118 | self.title = macropodus_cut(title) 119 | # 切词 120 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 121 | if word.strip()] for sentence in self.sentences] 122 | # 去除停用词等 123 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 124 | # 词频统计 125 | self.words = [] 126 | for sen in self.sentences_cut: 127 | self.words = self.words + sen 128 | self.word_count = dict(Counter(self.words)) 129 | # word_count_rank = sorted(word_count.items(), key=lambda f:f[1], reverse=True) 130 | # self.word_freqs = [{'word':wcr[0], 'freq':wcr[1]} for wcr in word_count_rank] 131 | # 按频次计算词语的得分, 得到self.word_freq=[{'word':, 'freq':, 'score':}] 132 | self.word_freqs = {} 133 | self.len_words = len(self.words) 134 | for k, v in self.word_count.items(): 135 | self.word_freqs[k] = v * 0.5 / self.len_words 136 | # 句子位置打分 137 | scores_posi = self.score_position() 138 | res_rank = {} 139 | self.res_score = [] 140 | for i in range(len(sentences_cut)): 141 | sen = self.sentences[i] # 句子 142 | sen_cut = self.sentences_cut[i] # 句子中的词语 143 | score_sbs = self.score_sbs(sen_cut) # 句子中的词语打分1 144 | score_dbs = self.score_dbs(sen_cut) # 句子中的词语打分2 145 | score_word = (score_sbs + score_dbs) * 10.0 / 2.0 # 句子中的词语打分mix 146 | score_length = self.score_length(sen) # 句子文本长度打分 147 | score_posi = scores_posi[i] 148 | if self.title: # 有标题的文本打分合并 149 | score_title = self.score_title(sen_cut) 150 | score_total = (score_title * 0.5 + score_word * 2.0 + score_length * 0.5 + score_posi * 1.0) / 4.0 151 | # 可查阅各部分得分统计 152 | self.res_score.append(["score_total", "score_sbs", "score_dbs", "score_word", "score_length", "score_posi", "score_title", "sentences"]) 153 | self.res_score.append([score_total, score_sbs, score_dbs, score_word, score_length, score_posi, score_title, self.sentences[i]]) 154 | else: # 无标题的文本打分合并 155 | score_total = (score_word * 2.0 + score_length * 0.5 + score_posi * 1.0) / 3.5 156 | self.res_score.append(["score_total", "score_sbs", "score_dbs", "score_word", "score_length", "score_posi", "sentences"]) 157 | self.res_score.append([score_total, score_sbs, score_dbs, score_word, score_length, score_posi, self.sentences[i].strip()]) 158 | res_rank[self.sentences[i].strip()] = score_total 159 | # 最小句子数 160 | num_min = min(num, int(len(self.word_count) * 0.6)) 161 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_rank.items(), key=lambda d: d[1], reverse=True)][0:num_min] 162 | return score_sen 163 | 164 | 165 | if __name__ == '__main__': 166 | doc1 = "PageRank算法简介。" \ 167 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 168 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 169 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 170 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 171 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 172 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 173 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 174 | "和投票目标的等级来决定新的等级。简单的说, " \ 175 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 176 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 177 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 178 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 179 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 180 | title = "方直科技等公司合伙设立教育投资基金" 181 | doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 182 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 183 | "该基金认缴出资总规模为人民币3.01亿元。" \ 184 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 185 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 186 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 187 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 188 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 189 | tt = TextTeaserSum() 190 | res_ = tt.summarize(doc) 191 | for res in res_: 192 | print(res) 193 | gg = 0 194 | -------------------------------------------------------------------------------- /macropodus/tookit/calculator_sihui/calcultor_function.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/21 23:36 4 | # @author :Mo 5 | # @function :function of some basic Extraction Scientific Computing 6 | 7 | 8 | from macropodus.tookit.calculator_sihui.calcultor_number import extract_number 9 | from macropodus.conf.path_log import get_logger_root 10 | import math 11 | import re 12 | 13 | 14 | logger = get_logger_root() 15 | 16 | 17 | def rackets_replace(rackets_char, myformula): 18 | """ 19 | 将2(3换成2*(3, 3)4换成3)*4 20 | :param rackets_char: 21 | :param myformula: 22 | :return: 23 | """ 24 | if rackets_char in myformula: # "("在算式里边 25 | if rackets_char =="(": 26 | rackets_re = r'\(' 27 | else: 28 | rackets_re = r'\)' 29 | pos_rackets = re.finditer(rackets_re, myformula) 30 | count = 0 31 | for pos in pos_rackets: 32 | pos_single = pos.start() + count 33 | if pos_single != 0 and rackets_char =="(": 34 | if myformula[pos_single-1] in '零一二两三四五六七八九0123456789百十千万亿': 35 | myformula = myformula[:pos_single] + "*" + myformula[pos_single:] 36 | count += 1 37 | if pos_single != len(myformula)-1 and rackets_char ==")": 38 | if myformula[pos_single+1] in '零一二两三四五六七八九0123456789百十千万亿': 39 | myformula = myformula[:pos_single+1] + "*" + myformula[pos_single+1:] 40 | count += 1 41 | return myformula 42 | else: 43 | return myformula 44 | 45 | 46 | 47 | def reagan(words, wordsminus): 48 | """ 49 | 求平方根,立方根,n次方根 50 | :param words: str, 原句 51 | :param wordsminus:str , 处理后的句子 52 | :return: 53 | """ 54 | try: 55 | if '根号' in words: 56 | reagan = wordsminus.replace("开", "").replace("根号", "").replace("的", "") 57 | radicalaa = float(extract_number(reagan)[0]) 58 | if radicalaa < 0.0: 59 | return 'illegal math' 60 | radicalbb = math.sqrt(radicalaa) 61 | results = str(radicalbb) 62 | elif "平方根" in words: 63 | reagan = wordsminus.replace("开", "").replace("平方根", "").replace("平方", "").replace("的", "") 64 | reagan = extract_number(reagan)[0] 65 | squarerootaa = float(reagan) 66 | if squarerootaa < 0.0: 67 | return 'illegal math' 68 | squarerootbb = math.sqrt(squarerootaa) 69 | results = str(squarerootbb) 70 | elif "立方根" in words: 71 | reagan = wordsminus.replace("开", "").replace("立方根", "").replace("立方", "").replace("的", "") 72 | reagan = 
extract_number(reagan)[0] 73 | squarerootaa = float(reagan) 74 | squarerootbb = math.pow(squarerootaa, 1.0 / 3) 75 | results = str(squarerootbb) 76 | elif "次方根" in words: 77 | reagan = wordsminus.replace("开", "").replace("次方根", "").replace("次方", "") 78 | squareroot = reagan.split("的") 79 | squarerootaa = float(extract_number(squareroot[0])[0]) 80 | squarerootbb = float(extract_number(squareroot[1])[0]) 81 | if squarerootaa % 2 == 0 and squarerootbb < 0.0: 82 | return 'illegal math' 83 | squarerootcc = math.pow(squarerootaa, 1.0 / squarerootbb) 84 | results = str(squarerootcc) 85 | else: 86 | results = words 87 | return results 88 | except Exception as e: 89 | logger.info(str(e)) 90 | return words 91 | 92 | 93 | def power(words, wordsminus): 94 | """ 95 | 求指数,求平方 96 | :param words: 97 | :param wordsminus: 98 | :return: 99 | """ 100 | try: 101 | if "平方根" not in words and "平方" in words: 102 | reagan = wordsminus.replace("平方", "").replace("开", "").replace("的", "") 103 | reagan = extract_number(reagan)[0] 104 | square = float(reagan) 105 | radicalbb = math.pow(square, 2) 106 | results = str(radicalbb) 107 | elif "立方根" not in words and "立方" in words: 108 | reagan = wordsminus.replace("立方", "").replace("开", "").replace("的", "") 109 | reagan = extract_number(reagan)[0] 110 | square = float(reagan) 111 | radicalbb = math.pow(square, 3) 112 | results = str(radicalbb) 113 | elif (("次方" in words or "次幂" in words) and "次方根" not in words and "次幂根" not in words): 114 | reagan = wordsminus.replace("次方", "").replace("开", "").replace("次幂", "") 115 | squareroot = reagan.split("的") 116 | squarerootaa = float(extract_number(squareroot[0])[0]) 117 | squarerootbb = float(extract_number(squareroot[1])[0]) 118 | squarerootcc = math.pow(squarerootaa, squarerootbb) 119 | results = str(squarerootcc) 120 | else: 121 | results = words 122 | return results 123 | except Exception as e: 124 | logger.info(str(e)) 125 | return words 126 | 127 | 128 | def logarithm(words, wordsminus): 129 | """ 130 | 求对数 131 | :param words: 132 | :param wordsminus: 133 | :return: 134 | """ 135 | try: 136 | if "LG" in words or "LOG" in words: 137 | Lg = wordsminus.replace("LOG", "").replace("LG", "").replace(" ", "").replace("的", "") 138 | Lg = float(extract_number(Lg)[0]) 139 | if Lg <= 0.0: 140 | return 'illegal math' 141 | lgbb = math.log(Lg) 142 | results = str(lgbb) 143 | elif "对数" in words: 144 | Logg = wordsminus.replace("以", "").replace("对数", "").replace("的对数", "").replace(" ", "").replace("的", "") 145 | root = Logg.split("为底") 146 | rootaa = float(extract_number(root[0])[0]) 147 | rootbb = float(extract_number(root[1])[0]) 148 | if rootaa <= 0.0 or rootbb <= 0.0: 149 | return 'illegal math' 150 | rootcc = math.log(rootbb) / math.log(rootaa) 151 | results = str(rootcc) 152 | else: 153 | results = words 154 | return results 155 | except Exception as e: 156 | logger.info(str(e)) 157 | return words 158 | 159 | 160 | def fraction(words, wordsminus): 161 | """ 162 | 求分数 163 | :param words: 164 | :param wordsminus: 165 | :return: 166 | """ 167 | try: 168 | if "fenzhi" in words: 169 | fenzhi = wordsminus.replace("fenzhi", "/").replace(" ", "").replace("的", "") 170 | root = fenzhi.split("/") 171 | rootaa = float(extract_number(root[0])[0]) 172 | rootbb = float(extract_number(root[1])[0]) 173 | rootcc = rootbb / rootaa 174 | results = str(rootcc) 175 | else: 176 | results = words 177 | return results 178 | except Exception as e: 179 | logger.info(str(e)) 180 | return words 181 | 182 | 183 | def fractiontwo(words, wordsminus): 184 | """ 185 | 
取分数 186 | :param words: 187 | :param wordsminus: 188 | :return: 189 | """ 190 | try: 191 | if "fenzhi" in words: 192 | fenzhi = wordsminus.replace("fenzhi", "/").replace(" ", "").replace("的", "") 193 | root = fenzhi.split("/") 194 | rootaa = float(extract_number(root[0])[0]) 195 | rootbb = float(extract_number(root[1])[0]) 196 | results = str(rootaa/rootbb) 197 | else: 198 | results = words 199 | return results 200 | except Exception as e: 201 | logger.info(str(e)) 202 | return words 203 | 204 | 205 | def factorial(words, wordsminus): 206 | """ 207 | 求阶乘 208 | :param words: 209 | :param wordsminus: 210 | :return: 211 | """ 212 | results = words 213 | try: 214 | if "jiecheng的" in words: 215 | factory = wordsminus.replace("jiecheng的", "").replace("的", "").replace(" ", "") 216 | fact = float(extract_number(factory)[0]) 217 | if fact <= 10000: 218 | results = str(math.factorial(fact)) 219 | else: 220 | results = words 221 | return results 222 | except Exception as e: 223 | logger.info(str(e)) 224 | return words 225 | 226 | 227 | if __name__ == '__main__': 228 | res = reagan("根号4", "根号4") 229 | print(res) 230 | res = reagan("27的3次方根是多少", "27的3次方根") 231 | print(res) 232 | res = power("9的平方", "9的平方") 233 | print(res) 234 | res = power("27的立方是几", "9的立方") 235 | print(res) 236 | res = power("3的3次方是几", "3的3次方实") 237 | print(res) 238 | res = logarithm("LG8", "LG8") 239 | print(res) 240 | res = logarithm("以2为底64的对数", "以2为底64的对数") 241 | print(res) 242 | res = fraction("1fenzhi6是多少", "1fenzhi6") 243 | print(res) 244 | res = factorial("10jiecheng的", "10jiecheng的") 245 | print(res) 246 | --------------------------------------------------------------------------------
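A minimal usage sketch tying the files above together. The import paths come from the dashed file headers in this dump and the calls mirror the `__main__` blocks of calcultor_formula.py, calcultor_number.py and textrank_word2vec.py; it assumes the macropodus package installs from this source tree and that the bundled character word2vec and stop-word data are present, and it is not taken from the library's own documentation.

# quick tour, assuming `macropodus` is importable and its data files are available
from macropodus.tookit.calculator_sihui.calcultor_formula import result_formula
from macropodus.tookit.calculator_sihui.calcultor_number import extract_number
from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec

# plain four-rule calculator: parentheses are reduced first, then / * - +, result returned as a string
print(result_formula("1+1+2+3*(35+1-5*7-10/5)/2*2"))

# pulls numbers out of mixed Chinese/Arabic text after numberTranslator rewrites the Chinese numerals
print(extract_number("1000.一加1等于几"))

# TextRank over word2vec similarities; constructing the class loads the SimW2vChar cache
trww = TextrankWord2vec()
text = "PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。Google把从A页面到B页面的链接解释为A页面给B页面投票。"
print(trww.summarize(text, num=2))   # list of (score, sentence) pairs ranked by pagerank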