├── test ├── images │ ├── macropodus_logo.png │ └── __init__.py ├── __init__.py ├── evaluate │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ └── ambiguity.txt │ ├── tet_summarize.py │ ├── tet_evaluate.py │ ├── tet_macropodus.py │ └── tet_nlg_yongzhuo.py ├── style_data │ ├── __init__.py │ ├── pku_training.utf8 │ └── tag_seg_BMES.py ├── other │ ├── tools │ │ └── pkuseg.py │ └── pos_tagging_1998 │ │ └── compare_tags.py └── survey_report │ └── nlp_platfom_survey.md ├── requirements.txt ├── __init__.py ├── macropodus ├── logs │ └── __init__.py ├── base │ ├── __init__.py │ ├── word2vec.py │ └── seg_basic.py ├── conf │ ├── __init__.py │ ├── path_log.py │ └── path_config.py ├── data │ ├── __init__.py │ ├── cache │ │ └── __init__.py │ ├── model │ │ ├── __init__.py │ │ ├── ner_albert_people_1998 │ │ │ └── __init__.py │ │ └── tag_albert_people_1998 │ │ │ └── __init__.py │ ├── words_common │ │ └── __init__.py │ └── embedding │ │ └── albert_base_zh │ │ └── __init__.py ├── network │ ├── base │ │ └── __init__.py │ ├── graph │ │ ├── __init__.py │ │ ├── crf.py │ │ ├── bilstm.py │ │ └── bilstm_crf.py │ ├── layers │ │ ├── __init__.py │ │ ├── non_mask_layer.py │ │ ├── keras_lookahead.py │ │ ├── keras_radam.py │ │ └── crf.py │ ├── train │ │ └── __init__.py │ ├── __init__.py │ ├── preprocess │ │ └── __init__.py │ └── service │ │ ├── __init__.py │ │ ├── thread_manage.py │ │ ├── keras_dump.py │ │ ├── server_streamer.py │ │ └── server_streamer_flask.py ├── preprocess │ ├── __init__.py │ ├── tools_clear.py │ ├── tools_common.py │ └── tools_ml.py ├── tookit │ ├── han2zh │ │ ├── __init__.py │ │ └── han2zh.py │ ├── pinyin │ │ ├── __init__.py │ │ └── pinyin.py │ ├── number2roman │ │ ├── __init__.py │ │ └── ri.py │ ├── trie_tree │ │ ├── __init__.py │ │ └── trie_tree.py │ ├── calculator_sihui │ │ ├── __init__.py │ │ ├── calcultor_number.py │ │ ├── calcultor_formula.py │ │ └── calcultor_function.py │ ├── chinese2number │ │ └── __init__.py │ └── __init__.py ├── segment │ ├── seg_statistics │ │ ├── __init__.py │ │ ├── seg_forward.py │ │ ├── seg_bidirectional.py │ │ ├── seg_reverse.py │ │ └── seg_dag.py │ ├── word_discovery │ │ └── __init__.py │ └── __init__.py ├── summarize │ ├── feature_base │ │ ├── __init__.py │ │ ├── mmr.py │ │ ├── word_significance.py │ │ └── text_teaser.py │ ├── graph_base │ │ ├── __init__.py │ │ ├── textrank_sklearn.py │ │ ├── textrank.py │ │ └── textrank_word2vec.py │ ├── nous_base │ │ ├── __init__.py │ │ └── lead_3.py │ ├── topic_base │ │ ├── __init__.py │ │ ├── topic_lsi.py │ │ ├── topic_lda.py │ │ └── topic_nmf.py │ ├── yongzhuo_nlg │ │ ├── __init__.py │ │ └── README.md │ └── __init__.py ├── version.py ├── similarity │ ├── __init__.py │ └── similarity_word2vec_char.py ├── __init__.py └── __init_tf_keras.py ├── requirements-all.txt ├── LICENSE ├── .gitignore └── setup.py /test/images/macropodus_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/Macropodus/HEAD/test/images/macropodus_logo.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | scikit-learn 4 | passlib==1.7.1 5 | gensim==3.7.1 6 | tqdm==4.31.1 7 | networkx==2.4 8 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # 
@time :2019/12/3 22:50 4 | # @author :Mo 5 | # @function : 6 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/4 21:04 4 | # @author : Mo 5 | # @function: 6 | -------------------------------------------------------------------------------- /macropodus/logs/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 0:20 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 10:38 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/images/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/20 21:54 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/base/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/12 23:04 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/conf/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 23:59 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/cache/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/3 0:25 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/evaluate/data/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 10:41 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /test/style_data/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:11 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/model/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- 
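Note on the package stubs above and below: these __init__.py files are empty placeholders that only mark package boundaries; the callable surface is re-exported from macropodus/__init__.py, which appears later in this dump. A minimal usage sketch of that top-level API, mirroring the calls made in test/evaluate/tet_evaluate.py and test/evaluate/tet_summarize.py; the return shapes shown in comments and the single-string add_word signature are assumptions based on those test scripts, not guarantees.

import macropodus

# Dictionary-based segmentation, as called in test/evaluate/tet_evaluate.py
# (that script takes len() of the result, so a list of words is assumed).
words = macropodus.cut("研究生命科学")
print(words)

# User-dictionary update, re-exported from macropodus/segment/__init__.py;
# passing a single word string here is an assumption about its signature.
macropodus.add_word("生命科学")

# TextRank summary via the default interface, as in test/evaluate/tet_summarize.py.
print(macropodus.summarize("PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。"
                           "Google把从A页面到B页面的链接解释为A页面给B页面投票。"))

# Character word2vec + jaccard similarity, see macropodus/similarity/__init__.py.
print(macropodus.sim("大漠帝国", "macropodus"))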
/macropodus/network/base/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:32 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:32 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/3 20:43 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/train/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/20 22:18 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 10:39 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/han2zh/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/8 21:51 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/pinyin/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/7 19:59 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/words_common/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 20:29 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:26 4 | # @author : Mo 5 | # @function: 6 | 7 | -------------------------------------------------------------------------------- /macropodus/network/preprocess/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:35 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/network/service/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 22:01 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/number2roman/__init__.py: 
-------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/2 9:13 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/trie_tree/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:25 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/segment/word_discovery/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/19 15:36 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/summarize/feature_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/12/25 21:41 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /macropodus/summarize/graph_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/25 21:42 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /macropodus/summarize/nous_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/25 21:44 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /macropodus/summarize/topic_base/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/29 20:35 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /macropodus/tookit/calculator_sihui/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/3 20:25 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/tookit/chinese2number/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:00 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/embedding/albert_base_zh/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/2 1:08 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- 
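The data/embedding and data/model stubs in this stretch are placeholders for binary assets; the self-contained utilities live under macropodus/tookit and are re-exported at the top level (see macropodus/tookit/__init__.py and macropodus/__init__.py later in this dump). A short sketch of those helpers follows; the expected outputs for the roman-numeral and pinyin calls are taken from the docstrings in ri.py and pinyin.py, while calling calculate with a plain arithmetic string is an assumption about calculator_sihui. chi2num, num2chi, han2zh and zh2han are exposed the same way.

import macropodus

print(macropodus.roman2num("IX"))     # 9, per the RI.roman2int docstring in ri.py
print(macropodus.num2roman(199))      # "CXCIX", per the RI.int2roman docstring in ri.py
print(macropodus.pinyin("大漠帝国"))    # ["da", "mo", "di", "guo"], per PinYin.pinyin in pinyin.py
print(macropodus.calculate("1+2*3"))  # assumption: calculator_sihui accepts an arithmetic expression string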
/macropodus/data/model/ner_albert_people_1998/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/data/model/tag_albert_people_1998/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /macropodus/version.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 22:24 4 | # @author : Mo 5 | # @function: version of Macropodus 6 | 7 | 8 | __version__ = "0.0.7" 9 | -------------------------------------------------------------------------------- /macropodus/summarize/yongzhuo_nlg/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/5/14 21:11 4 | # @author : Mo 5 | # @function: nlg-yongzhuo 6 | 7 | 8 | from nlg_yongzhuo import * 9 | 10 | -------------------------------------------------------------------------------- /test/other/tools/pkuseg.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/10 15:00 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | import pkuseg 9 | 10 | ps = pkuseg.pkuseg() 11 | res = ps.cut("帝国主义要把我们的地瓜分掉") 12 | print(res) -------------------------------------------------------------------------------- /requirements-all.txt: -------------------------------------------------------------------------------- 1 | scikit-learn==0.19.1 2 | pandas==0.23.4 3 | passlib==1.7.1 4 | gensim==3.7.1 5 | numpy==1.16.2 6 | tqdm==4.31.1 7 | networkx==2.4 8 | tensorflow-gpu==1.15.0 9 | keras-bert==0.80.0 10 | keras-adaptive-softmax==0.6.0 11 | nlg-yongzhuo==0.0.4 -------------------------------------------------------------------------------- /test/style_data/pku_training.utf8: -------------------------------------------------------------------------------- 1 | 迈向 充满 希望 的 新 世纪 —— 一九九八年 新年 讲话 ( 附 图片 1 张 ) 2 | 中共中央 总书记 、 国家 主席 江 泽民 3 | ( 一九九七年 十二月 三十一日 ) 4 | 12月 31日 , 中共中央 总书记 、 国家 主席 江 泽民 发表 1998年 新年 讲话 《 迈向 充满 希望 的 新 世纪 》 。 5 | 同胞 们 、 朋友 们 、 女士 们 、 先生 们 : 6 | -------------------------------------------------------------------------------- /macropodus/similarity/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 22:04 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | from macropodus.similarity.similarity_word2vec_char import SimW2vChar 9 | import os 10 | 11 | # 词向量, 默认使用缓存 12 | use_cache = True 13 | if not os.environ.get("macropodus_use_w2v_cache", True): 14 | use_cache = False # 不使用缓存,重新加载 15 | # 文本相似度 16 | swc = SimW2vChar(use_cache) 17 | sim = swc.similarity 18 | -------------------------------------------------------------------------------- /macropodus/network/service/thread_manage.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 15:08 4 | # @author : Mo 5 | # @function: 
6 | 7 | 8 | from multiprocessing import Process, Manager 9 | 10 | def f(d, l): 11 | d[1] = '1' 12 | d['2'] = 2 13 | d[0.25] = None 14 | l.reverse() 15 | 16 | if __name__ == '__main__': 17 | manager = Manager() 18 | 19 | d = manager.dict() 20 | l = manager.list(range(10)) 21 | 22 | p = Process(target=f, args=(d, l)) 23 | p.start() 24 | p.join() 25 | 26 | print (d) 27 | print (l) -------------------------------------------------------------------------------- /macropodus/preprocess/tools_clear.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:02 4 | # @author : Mo 5 | # @function: clear text 6 | 7 | 8 | def is_total_num(text): 9 | """ 10 | 判断是否是数字的 11 | :param text: str 12 | :return: boolean, True or false 13 | """ 14 | try: 15 | text_clear = text.replace(" ", "").strip() 16 | number = 0 17 | for one in text_clear: 18 | if one.isdigit(): 19 | number += 1 20 | if number == len(text_clear): 21 | return True 22 | else: 23 | return False 24 | except: 25 | return False 26 | 27 | -------------------------------------------------------------------------------- /macropodus/network/layers/non_mask_layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/10 21:35 4 | # @author :Mo 5 | # @function :NonMaskingLayer of bert 6 | # @codefrom :https://github.com/jacoxu 7 | 8 | 9 | from __future__ import print_function, division 10 | from tensorflow.python.keras.layers import Layer 11 | 12 | 13 | class NonMaskingLayer(Layer): 14 | """ 15 | fix convolutional 1D can't receive masked input, 16 | detail: https://github.com/keras-team/keras/issues/4978 17 | """ 18 | 19 | def __init__(self, **kwargs): 20 | self.supports_masking = True 21 | super(NonMaskingLayer, self).__init__(**kwargs) 22 | 23 | def build(self, input_shape): 24 | pass 25 | 26 | def compute_mask(self, inputs, input_mask=None): 27 | # do not pass the mask to the next layers 28 | return None 29 | 30 | def call(self, x, mask=None): 31 | return x 32 | -------------------------------------------------------------------------------- /macropodus/segment/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 22:00 4 | # @author : Mo 5 | # @function: segment of sent 6 | 7 | 8 | from macropodus.segment.seg_statistics.seg_statistics import SegStatistics 9 | from macropodus.segment.word_discovery.word_discovery import WordDiscovery 10 | import os 11 | 12 | 13 | # 机械分词,默认使用缓存 14 | use_cache = True 15 | if not os.environ.get("macropodus_use_seg_cache", True): 16 | use_cache = False # 不使用缓存,重新加载 17 | segs = SegStatistics(use_cache) 18 | cut_bidirectional = segs.cut_bidirectional 19 | cut_forward = segs.cut_forward 20 | cut_reverse = segs.cut_reverse 21 | cut_search = segs.cut_search 22 | cut_dag = segs.cut_dag 23 | cut = segs.cut 24 | 25 | # 用户词典增删改查 26 | load_user_dict = segs.load_user_dict 27 | save_delete_words = segs.save_delete_words 28 | save_add_words = segs.save_add_words 29 | delete_word = segs.delete_word 30 | add_word = segs.add_word 31 | 32 | # 新词发现 33 | wd = WordDiscovery() 34 | find = wd.find_word 35 | -------------------------------------------------------------------------------- /macropodus/network/service/keras_dump.py: -------------------------------------------------------------------------------- 1 | # 
!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 22:34 4 | # @author : Mo 5 | # @function: dump of keras, error, no use. 6 | 7 | 8 | from tensorflow.python.keras.models import save_model, load_model, Model 9 | import tempfile 10 | import types 11 | 12 | 13 | def make_keras_picklable(): 14 | def __getstate__(self): 15 | model_str = "" 16 | with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd: 17 | save_model(self, fd.name, overwrite=True) 18 | model_str = fd.read() 19 | d = {'model_str': model_str} 20 | return d 21 | 22 | def __setstate__(self, state): 23 | with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd: 24 | fd.write(state['model_str']) 25 | fd.flush() 26 | model = load_model(fd.name) 27 | self.__dict__ = model.__dict__ 28 | 29 | cls = Model 30 | cls.__getstate__ = __getstate__ 31 | cls.__setstate__ = __setstate__ 32 | -------------------------------------------------------------------------------- /macropodus/tookit/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/28 20:49 4 | # @author : Mo 5 | # @function: tookit 6 | 7 | 8 | # tookit 9 | from macropodus.tookit.chinese2number.chinese2number import Chi2Num, Num2Chi 10 | from macropodus.tookit.calculator_sihui.calcultor_sihui import Calculator 11 | from macropodus.tookit.trie_tree.trie_tree import TrieTree 12 | from macropodus.tookit.han2zh.han2zh import Han2Zh 13 | from macropodus.tookit.pinyin.pinyin import PinYin 14 | from macropodus.tookit.number2roman.ri import RI 15 | 16 | # 常用工具(tookit, 计算器, 中文与阿拉伯数字转化, 前缀树, 中文与罗马数字相互转化, 中文转拼音, 繁简转化) 17 | Calcul = Calculator() 18 | Chi2num = Chi2Num() 19 | Num2chi = Num2Chi() 20 | Trie = TrieTree() 21 | hanzh = Han2Zh() 22 | piyi = PinYin() 23 | ri = RI() 24 | 25 | calculate = Calcul.calculator_sihui 26 | chi2num = Chi2num.compose_decimal 27 | num2chi = Num2chi.decimal_chinese 28 | roman2num = ri.roman2int 29 | num2roman = ri.int2roman 30 | han2zh = hanzh.han2zh 31 | zh2han = hanzh.zh2han 32 | pinyin = piyi.pinyin 33 | -------------------------------------------------------------------------------- /test/style_data/tag_seg_BMES.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/21 23:11 4 | # @author : Mo 5 | # @function: BMES标注法 6 | 7 | 8 | from macropodus.preprocess.tools_common import load_json, save_json 9 | from macropodus.preprocess.tools_common import txt_write, txt_read 10 | import json 11 | 12 | pku_training = txt_read("pku_training.utf8") 13 | file = open("pku_train.json", "w", encoding="utf-8") 14 | pku_ = [] 15 | for pku in pku_training: 16 | pkus = pku.split(" ") 17 | label_pkus = "" 18 | for pku_sig in pkus: 19 | len_pku = len(pku_sig) 20 | if len_pku==1: 21 | label_pkus += "S" 22 | elif len_pku==2: 23 | label_pkus += "BE" 24 | else: 25 | label_pkus += "B" + "M"*(len_pku-2) + "E" 26 | label_pkus_l = list(label_pkus) 27 | pku_res = {} 28 | pku_res["question"] = list("".join(pkus)) 29 | pku_res["label"] = label_pkus_l 30 | p_json = json.dumps(pku_res, ensure_ascii=False) 31 | file.write(p_json + "\n") 32 | # pku_.append(pku_res) 33 | # save_json(pku_, "pku_train.json") 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 yongzhuo 4 | 5 | Permission 
is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /test/evaluate/tet_summarize.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/5/14 0:11 4 | # @author : Mo 5 | # @function: test summarize of corpus 6 | 7 | 8 | import macropodus 9 | 10 | 11 | summary = "PageRank算法简介。" \ 12 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 13 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 14 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 15 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 16 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 17 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 18 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 19 | "和投票目标的等级来决定新的等级。简单的说, " \ 20 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 21 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 22 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 23 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 24 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 25 | 26 | 27 | # 文本摘要(summarize, 默认接口) 28 | sents = macropodus.summarize(summary) 29 | print(sents) 30 | 31 | # (summarization, 可定义方法, 提供9种文本摘要方法, 'lda', 'mmr', 'textrank', 'text_teaser' 32 | ttypes = ['text_pronouns', 'text_teaser', 'word_sign', 'textrank', 'lead3', 'mmr', 'lda', 'lsi', 'nmf'] 33 | 34 | for ttp in ttypes: 35 | sents = macropodus.summarization(text=summary, type_summarize=ttp) 36 | print("\n" + ttp + ": ") 37 | print(sents) 38 | -------------------------------------------------------------------------------- /test/evaluate/tet_evaluate.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 21:13 4 | # @author : Mo 5 | # @function: test evulate 6 | 7 | 8 | from macropodus.preprocess.tools_common import txt_write, txt_read 9 | import macropodus 10 | import time 11 | 12 | 13 | def evulate_file(path_file): 14 | """ 15 | 验证切词的各种指标 16 | :param path_file: str, like '/train.txt' 17 | :return: float 18 | """ 19 | # 读取数据 20 | sents = txt_read(path_file) 21 | # 初始化统计计数 22 | count_macropodus = 0 23 | count_real = 0 24 | count_true = 0 25 | count = 0 26 | # 切词与统计, true 27 | for sent in sents: 28 | sent_sp = sent.strip() 29 | res_real = sent_sp.split(' ') 30 | sentence = sent_sp.replace(' ','') 31 | res_macropodus = macropodus.cut(sentence) 32 | print(res_macropodus) 33 | count += 1 34 | count_real += len(res_real) 35 | count_macropodus += len(res_macropodus) 36 | for cm in res_macropodus: 37 | if cm in res_real: 38 | 
count_true += 1 39 | res_real.remove(cm) 40 | # precision, recall, f1 41 | precision = count_true / count_macropodus 42 | recall = count_true / count_real 43 | f1 = (precision * recall * 2) / (precision + recall) 44 | 45 | return precision, recall, f1 46 | 47 | 48 | if __name__ == "__main__": 49 | path_file = 'data/ambiguity.txt' 50 | time_start = time.time() 51 | precision, recall, f1 = evulate_file(path_file) 52 | print('time: ' + str(time.time()-time_start)) 53 | print('precision\t', 'recall\t', 'f1') 54 | print(precision, recall, f1) 55 | -------------------------------------------------------------------------------- /test/evaluate/data/ambiguity.txt: -------------------------------------------------------------------------------- 1 | 工信处 女 干事 每月 经过 下属 科室 都要 亲口 交代 24 口 交换机 等 技术性 器件 的 安装 工作 2 | 研究 生命科学 \t 研究 生命 科学 3 | 研究生 命令 本科生 4 | 我 从 马 上 下来 5 | 我 马上 下来 6 | 北京 大学生 喝 进口 红酒 7 | 在 北京大学 生活区 喝 进口 红酒 8 | 从小 学 电脑 9 | 从 小学 毕业 10 | 美军 中将 竟 公然 说 11 | 新建 地铁 中 将 禁止 商业 摊点 12 | 这块 地 面积 还真 不小 13 | 地面 积了 厚厚 的 雪 14 | 让 我们 以 爱心 和 平等 来 对待 动物 15 | 阿美 首脑 会议 将 讨论 巴以 和平 等 问题 16 | 锌 合金 把手 的 相关 求购 信息 17 | 别 把 手 伸进 别人 的 口袋 里 18 | 将 信息 技术 应用 于 教学 实践 19 | 信息 技术 应用 于 教学 中 的 哪个 方面 20 | 上级 解除 了 他 的 职务 21 | 方程 的 解 除了 零 以外 还有 … 22 | 我们 一起 去 故宫 23 | 一起 恶性 交通 事故 24 | 我 不想 吃 东西 25 | 你 就 不 想想 26 | 各 国有 企业 相继 倒闭 27 | 各国 有 各国 的 困难 28 | 老人家 身体 不错 29 | 老人 家中 很 干净 30 | 和服 务必 归还 31 | 技术 和 服务 32 | 他 站 起 身 33 | 他 起身 去 北京 34 | 问题 的 确定 35 | 这的 确定 不 下来 36 | 结合 成分 37 | 为 人民 工作 38 | 中国 产品 质量 39 | 原子 结合 成 分子 时 40 | 部分 居民 生活 水平 41 | 治理 解放 大道 路面 积水 42 | 这样 的 人 才能 经受 住 考验 43 | 他俩 儿 谈 恋爱 是 从 头年 元月 开始 的 44 | 在 这些 企业 中 国有 企业 有 十个 45 | 结婚 的 和 尚未 结婚 的 46 | 热海 景区 47 | 热海 景区 + 48 | 崔永元 炮轰 范冰冰 49 | 这 源自 萧红 写给 萧军 信中 的 一句话 50 | 阿里 大华 腾讯 百度 51 | 亲家公 亲家母 52 | 情侣 们 在 海南岛 上 海誓山盟 53 | 在于 不断 提高 人们 信以为真 的 情感 纪实 的 能力 。 54 | 四川 发改委 发文 取缔 p2p 和 P2p 55 | 字节跳动 是 今日头条 的 母公司 56 | 今日头条 白嫖 东风快递 令人喷饭 勿谓言之不预也 白嫖 口区 弓虽 口丕 我酸了 祖安人 迷惑行为 57 | 5G 996 007 1118 35 120 251 nmsl nsdd wdnmd CSGO 58 | 唱跳 rap 篮球 鸡你太美 cxk 59 | 盘它 撞梗 融梗 雨女无瓜 要你寡 60 | 刺激战场 绝地求生 61 | 狼灭 狼火 狼炎 狼焱 灵魂八问 硬核 奥力给 有内味了 awsl 影流之主 巨魔之王 62 | 帝国主义 要 把 我们 的 地 瓜分 掉 63 | 小米 上半年 业绩 稳健 增长 ,Q2 净利 大 超 市场 预期 64 | 陈建仁 请 辞任 蔡英文 副手 人选 台 ” 中研院 : 祝福 65 | 车易拍 获 北汽产投 注资 接盘 66 | 我 家 门前 有 条 水沟 很难 过 67 | 中华 人民 共和国 68 | 郑州 天和 服装厂 69 | 完成 千万 元 天使轮 投资 找 米斗 从 B2B 交易 切入 70 | 环球网 评 共建 共知 共享 , 以 社会 治理 提升 人民 获得感 71 | 董监高 频换 , 公司 毛利 下降 -------------------------------------------------------------------------------- /test/other/pos_tagging_1998/compare_tags.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/14 17:05 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | tags_res = ['m', 'vn', 'v', 'Yg', 'Tg', 'l', 'p', 'nt', 'y', 'Rg', 'e', 'i', 'an', 'q', 'k', 'nr', 'Ag', 'n', 'vvn', 'd', 'f', 'ad', 'vd', 'z', 'Mg', 'nx', 'a', 'h', 's', 'u', 'na', 'Bg', 'j', 'w', 'Ng', 'o', 'nz', 'ns', 'b', 'Vg', 'Dg', 'r', 't', 'c'] 9 | # ['Rg', 'nt', 'Ng', 'm', 'u', 'nx', 'an', 'na', 'b', 'd', 'c', 'vd', 'j', 'ns', 'ad', 's', 'z', 'Mg', 'vn', 'l', 't', 'f', 'v', 'vvn', 'n', 'r', 'Tg', 'Dg', 'Bg', 'i', 'nr', 'k', 'q', 'o', 'a', 'w', 'e', 'h', 'p', 'y', 'nz', 'Ag', 'Yg', 'Vg'] 10 | 11 | tags_res = [tr.upper() for tr in tags_res] 12 | 13 | from macropodus.preprocess.tools_common import txt_read 14 | 15 | tag_jiagus = txt_read("data/tag_jiagu.txt") 16 | tag_jiebas = txt_read("data/tag_jieba.txt") 17 | 18 | tgu = [] 19 | for tag_jiagu in tag_jiagus: 20 | tags = tag_jiagu.split("\u3000") 21 | tag = tags[0].strip() 22 | tgu.append(tag.upper()) 23 | 24 | tga 
= [] 25 | for tag_jieba in tag_jiebas: 26 | tags = tag_jieba.split("\t") 27 | tag = tags[0].strip() 28 | tga.append(tag.upper()) 29 | 30 | tgus = [] 31 | tgas = [] 32 | for tr in tags_res: 33 | if tr.upper() not in tgu: 34 | tgus.append(tr.upper()) 35 | if tr.upper() not in tga: 36 | tgas.append(tr.upper()) 37 | 38 | tgus.sort() 39 | tgas.sort() 40 | print("jiagu: ") 41 | print(tgus) 42 | print("jieba: ") 43 | print(tgas) 44 | 45 | bbc = ['AG', 'B', 'BG', 'DG', 'E', 'H', 'I', 'J', 'K', 'L', 'MG', 'NA', 'NG', 'NX', 'O', 'RG', 'TG', 'VG', 'VVN', 'Y', 'YG', 'Z'] 46 | gg = 0 47 | -------------------------------------------------------------------------------- /macropodus/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/12 22:57 4 | # @author : Mo 5 | # @function: init of macropodus (tookit, keras of tensorflow) 6 | 7 | 8 | # macropodus 9 | from macropodus.tookit import calculate, chi2num, num2chi, Trie, roman2num, num2roman, pinyin, zh2han, han2zh 10 | from macropodus.segment import cut_bidirectional, cut_forward, cut_reverse, cut_search, cut_dag, cut, find 11 | from macropodus.segment import load_user_dict, save_delete_words, save_add_words, delete_word, add_word 12 | from macropodus.summarize import keyword, textrank, summarization 13 | from macropodus.version import __version__ # 版本 14 | from macropodus.similarity import sim 15 | import os 16 | 17 | # 机械分词 18 | cut_bidirectional = cut_bidirectional 19 | cut_forward = cut_forward 20 | cut_reverse = cut_reverse 21 | cut_search = cut_search 22 | cut_dag = cut_dag 23 | cut = cut 24 | 25 | # 用户词典操作 26 | load_user_dict = load_user_dict 27 | save_delete_words = save_delete_words # 保存到用户词典的 28 | save_add_words = save_add_words 29 | delete_word = delete_word 30 | add_word = add_word 31 | 32 | # 新词发现 33 | find = find 34 | 35 | # 文本相似度 36 | sim = sim 37 | 38 | # 文本摘要, 关键词 39 | keyword = keyword 40 | summarize = textrank 41 | summarization = summarization 42 | 43 | # 常用工具(tookit, 计算器, 中文与阿拉伯数字转化, 前缀树, 罗马数字与阿拉伯数字转化) 44 | calculate = calculate 45 | chi2num = chi2num 46 | num2chi = num2chi 47 | roman2num = roman2num 48 | num2roman = num2roman 49 | han2zh = han2zh 50 | zh2han = zh2han 51 | pinyin = pinyin 52 | 53 | if os.environ.get("macropodus_use_dl", False)=="1": 54 | from macropodus.__init_tf_keras import * # tf.python.keras, custom_objects 55 | -------------------------------------------------------------------------------- /macropodus/conf/path_log.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 23:59 4 | # @author : Mo 5 | # @function: logger of macropodus 6 | 7 | 8 | from macropodus.conf.path_config import path_log_basic 9 | from logging.handlers import RotatingFileHandler 10 | import logging 11 | import time 12 | import os 13 | 14 | 15 | logger_level = logging.INFO 16 | # log目录地址 17 | path_logs = path_log_basic # + '/logs' 18 | if not os.path.exists(path_logs): 19 | os.mkdir(path_logs) 20 | # 全局日志格式 21 | logging.basicConfig(level=logger_level, 22 | format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s') 23 | # 定义一个日志记录器 24 | logger = logging.getLogger("macropodus") 25 | logger.setLevel(level = logger_level) 26 | # 日志文件名,为启动时的日期 27 | log_file_name = time.strftime('macropodus-%Y-%m-%d', time.localtime(time.time())) + ".log" 28 | log_name_day = os.path.join(path_logs, log_file_name) 29 | # 文件输出, 
定义一个RotatingFileHandler,最多备份32个日志文件,每个日志文件最大32K 30 | fHandler = RotatingFileHandler(log_name_day, maxBytes = 32*1024, backupCount = 32) 31 | fHandler.setLevel(logger_level) 32 | # 日志输出格式 33 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 34 | fHandler.setFormatter(formatter) 35 | # # 控制台输出 36 | # console = logging.StreamHandler() 37 | # console.setLevel(logger_level) 38 | # console.setFormatter(formatter) 39 | # logger加到handel里边 40 | logger.addHandler(fHandler) 41 | # logger.addHandler(console) 42 | 43 | 44 | def get_logger_root(name="macropodus"): 45 | """ 46 | 全局日志引用 47 | :param name: str, name of logger 48 | :return: object, logging 49 | """ 50 | return logging.getLogger(name) 51 | -------------------------------------------------------------------------------- /macropodus/tookit/number2roman/ri.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/2 9:14 4 | # @author : Mo 5 | # @function: 罗马数字与阿拉伯数字相互转化 6 | 7 | 8 | class RI: 9 | def __init__(self): 10 | self.algorithm = "roman2int" 11 | 12 | def roman2int(self, roman: str) -> int: 13 | """ 14 | 罗马数字转阿拉伯数字 15 | :param roman: str, like "IX" 16 | :return: int, like 9 17 | """ 18 | roman2int_dict = {'I': 1, 'IV': 4, 'V': 5, 'IX': 9, 19 | 'X': 10, 'XL': 40, 'L': 50, 'XC': 90, 20 | 'C': 100, 'CD': 400, 'D': 500, 'CM': 900, 21 | 'M': 1000} 22 | nums = 0 23 | while roman: 24 | if roman[0:2] in roman2int_dict.keys(): 25 | nums += roman2int_dict[roman[0:2]] 26 | roman = roman[2:] 27 | elif roman[0:1] in roman2int_dict.keys(): 28 | nums += roman2int_dict[roman[0:1]] 29 | roman = roman[1:] 30 | return nums 31 | 32 | def int2roman(self, num: int) -> str: 33 | """ 34 | 阿拉伯数字转罗马数字 35 | :param num: int, like 199 36 | :return: str, like "CXCIX" 37 | """ 38 | int2roman_dict = {1: 'I', 4: 'IV', 5: 'V', 9: 'IX', 39 | 10: 'X', 40: 'XL', 50: 'L', 90: 'XC', 40 | 100: 'C', 400: 'CD', 500: 'D', 900: 'CM', 1000: 'M'} 41 | res = "" 42 | for key in sorted(int2roman_dict.keys())[::-1]: 43 | if (num == 0): 44 | break 45 | tmp = num // key 46 | if (tmp == 0): 47 | continue 48 | res += int2roman_dict[key] * (tmp) 49 | num -= key * (tmp) 50 | return res 51 | 52 | 53 | if __name__ == '__main__': 54 | ri = RI() 55 | roman = "LVIII" # "IX" # "LVIII" 56 | num = 199 57 | res1 = ri.roman2int(roman) 58 | res2 = ri.int2roman(num) 59 | print(res1) 60 | print(res2) 61 | -------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/seg_forward.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:54 4 | # @author : Mo 5 | # @function: cut sentences of forward of maxlength 6 | 7 | 8 | from macropodus.base.seg_basic import SegBasic 9 | 10 | 11 | class SegForward(SegBasic): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def cut(self, sentence, len_max=7): 16 | """ 17 | 正向最大切词 18 | :param sentence: str, like '大漠帝国' 19 | :param len_max: int, like 32 20 | :return: yield 21 | """ 22 | len_sen = len(sentence) 23 | i = 0 24 | while i < len_sen: # while判断条件 25 | flag = False # flag标志位,确定有没有在字典里边的单字词或多字词 26 | for j in range(min(len_sen+1, i+len_max), -i, -1): # 遍历从当前字到句子末尾可能成词的部分, 从最后i+len_max算起 27 | word_maybe = sentence[i:j] # 正向可能成词的语 28 | if word_maybe in self.dict_words_freq: # 是否在字典里边 29 | i = j # 成词前标志i向后移动 30 | flag = True # flag标志位变化 31 | yield word_maybe 32 | break # 
成词则跳出循环 33 | if not flag: # 未选中后单个字的情况 34 | yield sentence[i] 35 | i += 1 36 | 37 | if __name__ == '__main__': 38 | sf = SegForward() 39 | sentence = "macropodus是啥子呢" 40 | sentence = "方程的解除了零以外还有…" 41 | print(list(sf.cut(sentence))) 42 | 43 | # 测试性能 44 | from macropodus.preprocess.tools_common import txt_read, txt_write 45 | from macropodus.conf.path_config import path_root 46 | import time 47 | 48 | path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt" 49 | sentences = txt_read(path_wordseg_a) 50 | 51 | time_start = time.time() 52 | count = 0 53 | for i in range(10000): 54 | for sen in sentences: 55 | # print(sen) 56 | count += 1 57 | res = sf.cut(sen) 58 | # print(list(res)) 59 | time_end = time.time() 60 | print(time_end - time_start) 61 | print(count/(time_end - time_start)) 62 | 63 | # 10000/0.17*50 = 2831272(line/s) 64 | 65 | 66 | -------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/seg_bidirectional.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:55 4 | # @author : Mo 5 | # @function: cut sentences of forward of reverse of maxlength 6 | 7 | 8 | from macropodus.segment.seg_statistics.seg_forward import SegForward 9 | from macropodus.segment.seg_statistics.seg_reverse import SegReverse 10 | 11 | 12 | class SegBidirectional(object): 13 | def __init__(self): 14 | self.seg_forward = SegForward() 15 | self.seg_reverse = SegReverse() 16 | 17 | def cut(self, sentence): 18 | """ 19 | 最大双向词典切词, 即最大正向切词与最大反向切词合并, 选择词数小的那个返回 20 | :param sentence: str 21 | :return: 22 | """ 23 | res_forward = self.seg_forward.cut(sentence) 24 | res_reverse = self.seg_reverse.cut(sentence) 25 | res_forward_list = list(res_forward) 26 | res_reverse_list = list(res_reverse) 27 | len_res_forward = len(res_forward_list) 28 | len_res_reverse = len(res_reverse_list) 29 | if len_res_forward >= len_res_reverse: 30 | for rrl in res_reverse_list: 31 | yield rrl 32 | else: 33 | for rfl in res_forward_list: 34 | yield rfl 35 | 36 | 37 | if __name__ == '__main__': 38 | sb = SegBidirectional() 39 | sentence = "研究生命科学研究生命科学" 40 | print(list(sb.cut(sentence))) 41 | 42 | # 测试性能 43 | from macropodus.preprocess.tools_common import txt_read, txt_write 44 | from macropodus.conf.path_config import path_root 45 | import time 46 | 47 | path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt" 48 | sentences = txt_read(path_wordseg_a) 49 | 50 | time_start = time.time() 51 | count = 0 52 | for i in range(10000): 53 | for sen in sentences: 54 | count += 1 55 | res = sb.cut(sen) 56 | # print(list(res)) 57 | time_end = time.time() 58 | print(time_end - time_start) 59 | print(count/(time_end - time_start)) 60 | # yield 61 | # 10000/0.17*50 = 2500*50 = 2896810(line/s) 62 | # 50000/0.90*50 = 2500000/20 = 2763600(line/s) -------------------------------------------------------------------------------- /macropodus/tookit/pinyin/pinyin.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/8 21:40 4 | # @author : Mo 5 | # @function: 汉字转拼音(zh2pinyin) 6 | 7 | 8 | from macropodus.preprocess.tools_common import re_zh_cn, load_json 9 | from macropodus.preprocess.tools_ml import macropodus_cut 10 | from macropodus.conf.path_config import path_dict_pinyin 11 | from collections import defaultdict 12 | 13 | 14 | class PinYin: 15 | 
def __init__(self): 16 | self.algorithm = "pinyin" 17 | self.dict_pinyin = defaultdict() 18 | self.load_pinyin_dict() 19 | 20 | def load_pinyin_dict(self): 21 | """ 22 | 加载默认的拼音pinyin字典 23 | :return: None 24 | """ 25 | dict_pinyin = load_json(path_dict_pinyin)[0] # 加载json字典文件 26 | for k, v in dict_pinyin.items(): 27 | self.dict_pinyin[k] = v 28 | 29 | def pinyin(self, text): 30 | """ 31 | 中文(大陆)转拼音 32 | :param text: str, like "大漠帝国" 33 | :return: list, like ["da", "mo", "di", "guo"] 34 | """ 35 | res_pinyin = [] 36 | # 只选择中文(zh), split筛选 37 | text_re = re_zh_cn.split(text) 38 | for tr in text_re: 39 | if re_zh_cn.match(tr): 40 | # 切词 41 | tr_cut = macropodus_cut(tr) 42 | for trc in tr_cut: # 切词后的词语 43 | # get words from dict of default 44 | trc_pinyin = self.dict_pinyin.get(trc) 45 | if trc_pinyin: res_pinyin += trc_pinyin 46 | else: # 单个字的问题 47 | for trc_ in trc: 48 | # get trem from dict of default 49 | trc_pinyin = self.dict_pinyin.get(trc_) 50 | if trc_pinyin: res_pinyin += trc_pinyin 51 | return res_pinyin 52 | 53 | 54 | if __name__ == "__main__": 55 | text = "macropodus是一种中国产的淡水鱼,广泛分布于两广地区,abcdefghijklmnopqrstuvwxyz" 56 | py = PinYin() 57 | res = py.pinyin(text) 58 | print(res) 59 | while True: 60 | print("请输入:") 61 | ques = input() 62 | print(py.pinyin(ques)) 63 | -------------------------------------------------------------------------------- /macropodus/summarize/graph_base/textrank_sklearn.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/21 22:01 4 | # @author :Mo 5 | # @function : textrank using tfidf of sklearn, pagerank of networkx 6 | 7 | 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | from macropodus.preprocess.tools_ml import cut_sentence 10 | from macropodus.preprocess.tools_ml import tdidf_sim 11 | import networkx as nx 12 | 13 | 14 | class TextrankSklearn: 15 | def __init__(self): 16 | self.algorithm = 'textrank_sklearn' 17 | 18 | def summarize(self, text, num=320): 19 | # 切句 20 | if type(text) == str: 21 | sentences = cut_sentence(text) 22 | elif type(text) == list: 23 | sentences = text 24 | else: 25 | raise RuntimeError("text type must be list or str") 26 | # tf-idf相似度 27 | matrix = tdidf_sim(sentences) 28 | matrix_norm = TfidfTransformer().fit_transform(matrix) 29 | # 构建相似度矩阵 30 | tfidf_sim = nx.from_scipy_sparse_matrix(matrix_norm * matrix_norm.T) 31 | # nx.pagerank 32 | sens_scores = nx.pagerank(tfidf_sim) 33 | # 得分排序 34 | sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True) 35 | # 保留topk个, 防止越界 36 | topk = min(len(sentences), num) 37 | # 返回原句子和得分 38 | return [(sr[1], sentences[sr[0]]) for sr in sen_rank][0:topk] 39 | 40 | 41 | if __name__ == '__main__': 42 | doc = "是上世纪90年代末提出的一种计算网页权重的算法。" \ 43 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \ 44 | "业界急需一种相对比较准确的网页重要性计算方法," \ 45 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 46 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 47 | "Google把从A页面到B页面的链接解释为A页面给B页面投票," \ 48 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面" \ 49 | "和投票目标的等级来决定新的等级。简单的说," \ 50 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 51 | "PageRank The PageRank Citation Ranking: Bringing Order to the Web,"\ 52 | "具体说来就是,PageRank有两个基本思想,也可以说是假设," \ 53 | "即数量假设:一个网页被越多的其他页面链接,就越重);" \ 54 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 55 | "总的来说就是一句话,从全局角度考虑,获取重要的信息。" 56 | doc = doc.encode('utf-8').decode('utf-8') 57 | ts = TextrankSklearn() 58 | textrank_tfidf = ts.summarize(doc, 32) 59 | for score_sen in textrank_tfidf: 60 | print(score_sen) 61 | 
-------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/seg_reverse.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:54 4 | # @author : Mo 5 | # @function: cut sentences of reverse of maxlength 6 | 7 | 8 | from macropodus.base.seg_basic import SegBasic 9 | 10 | 11 | class SegReverse(SegBasic): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def cut(self, sentence, len_max=7): 16 | """ 17 | 反向最大切词 18 | :param sentence: str, like '大漠帝国' 19 | :param len_max: int, like 32 20 | :return: yield 21 | """ 22 | len_sen = len(sentence) 23 | i = len_sen 24 | res = [] 25 | while i > 0: # while判断条件 26 | flag = False # flag标志位,确定有没有在字典里边的单字词或多字词 27 | for j in range(max(0, i - len_max), i): # 遍历从句子末尾向前可能成词的部分, 从最后i-len_max算起 28 | word_maybe = sentence[j:i] # 正向可能成词的语 29 | if word_maybe in self.dict_words_freq: # 是否在字典里边 30 | i = j # 成词前标志i向后移动 31 | flag = True # flag标志位变化 32 | res.append(word_maybe) 33 | # yield word_maybe 34 | break # 成词则跳出循环 35 | if not flag: # 未选中后单个字的情况 36 | i -= 1 37 | # yield sentence[i] 38 | res.append(sentence[i]) 39 | for i in range(len(res)-1, 0, -1): 40 | yield res[i] 41 | # return res 42 | 43 | 44 | if __name__ == '__main__': 45 | a = max(0,5) 46 | sf = SegReverse() 47 | sentence = "研究生命科学\t研究 生命 科学" 48 | print(list(sf.cut(sentence))) 49 | print(list(sf.cut(""))) 50 | 51 | # 测试性能 52 | from macropodus.preprocess.tools_common import txt_read, txt_write 53 | from macropodus.conf.path_config import path_root 54 | import time 55 | path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt" 56 | sentences = txt_read(path_wordseg_a) 57 | 58 | time_start = time.time() 59 | count = 0 60 | for i in range(50000): 61 | for sen in sentences: 62 | # print(sen) 63 | count += 1 64 | res = (sf.cut(sen)) 65 | # print(res) 66 | time_end = time.time() 67 | print(time_end-time_start) 68 | print(count/(time_end - time_start)) 69 | 70 | # 10000/0.18*50 = 2500*50 = 2784226(line/s) 71 | # 50000/0.98*50 = 2500000/20 = 2550109(line/s) 72 | 73 | -------------------------------------------------------------------------------- /macropodus/similarity/similarity_word2vec_char.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 14:50 4 | # @author : Mo 5 | # @function: similarity of sentence of word2vec 6 | 7 | 8 | from macropodus.base.word2vec import W2v 9 | 10 | 11 | class SimW2vChar(W2v): 12 | def __init__(self, use_cache=True): 13 | super().__init__(use_cache) 14 | 15 | def encode(self, sent, type_encode="other"): 16 | """ 17 | 生成句向量, 字符级别, char 18 | :param sent: str, like "大漠帝国" 19 | :param type_encode: str, like "avg", "other" 20 | :return: vector 21 | """ 22 | sentence_vec = self.w2v_char.wv[self.w2v_char.index2word[1]] * 0 23 | len_sent = len(sent) 24 | for i in range(len_sent): 25 | word = sent[i] 26 | try: 27 | sentence_vec = sentence_vec + self.w2v_char.wv[word] 28 | except Exception as e: 29 | sentence_vec = sentence_vec + 0.01 # unknow_know词加0.01 30 | if type_encode == "avg": 31 | sentence_vec = sentence_vec / len_sent 32 | return sentence_vec 33 | 34 | def similarity(self, sent1, sent2, type_sim="total", type_encode="avg"): 35 | """ 36 | 相似度计算, 默认余弦相似度+jaccard相似度 37 | :param sen1: str, like "大漠帝国" 38 | :param sen2: str, like "Macropodus" 39 | :param type_sim: str, like "total" or 
"cosine" 40 | :param type_encode: str, like "other" or "avg" 41 | :return: float, like 0.998 42 | """ 43 | if sent1 and sent2: 44 | encode_sen1 = self.encode(sent1, type_encode) 45 | encode_sen2 = self.encode(sent2, type_encode) 46 | score_res = self.cosine(encode_sen1, encode_sen2) 47 | else: 48 | score_res = 0.0 49 | if type_sim=="total": 50 | score_jaccard = self.jaccard(sent1, sent2) 51 | score_res = (score_res + score_jaccard)/2 52 | return score_res 53 | 54 | 55 | if __name__ == '__main__': 56 | 57 | sent1 = "大漠帝国" 58 | sent2 = "macropodus" 59 | swc = SimW2vChar(use_cache=True) 60 | sen_encede = swc.encode(sent1) 61 | score = swc.similarity(sent1, sent2) 62 | print(score) 63 | gg = 0 64 | while True: 65 | print("请输入sent1:") 66 | sent1 = input() 67 | print("请输入sent2:") 68 | sent2 = input() 69 | print(swc.similarity(sent1, sent2)) 70 | -------------------------------------------------------------------------------- /macropodus/summarize/nous_base/lead_3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/24 22:43 4 | # @author :Mo 5 | # @function :text_summary with lead-3 6 | 7 | 8 | from macropodus.preprocess.tools_ml import cut_sentence 9 | 10 | 11 | class Lead3Sum: 12 | def __init__(self): 13 | self.algorithm = 'lead_3' 14 | 15 | def summarize(self, text, num=320, type_l='mix'): 16 | """ 17 | lead-s 18 | :param sentences: list 19 | :param type: str, you can choose 'begin', 'end' or 'mix' 20 | :return: list 21 | """ 22 | # 切句 23 | if type(text) == str: 24 | sentences = cut_sentence(text) 25 | elif type(text) == list: 26 | sentences = text 27 | else: 28 | raise RuntimeError("text type must be list or str") 29 | # 最小句子数 30 | num_min = min(num, len(sentences)) 31 | if type_l=='begin': 32 | summers = sentences[0:num] 33 | elif type_l=='end': 34 | summers = sentences[-num:] 35 | else: 36 | summers = [sentences[0]] + [sentences[-1]] + sentences[1:num-1] 37 | summers_s = {} 38 | for i in range(len(summers)): # 得分计算 39 | if len(summers) - i == 1: 40 | summers_s[summers[i]] = (num - 0.75) / (num + 1) 41 | else: 42 | summers_s[summers[i]] = (num - i - 0.5) / (num + 1) 43 | score_sen = [(rc[1], rc[0]) for rc in sorted(summers_s.items(), key=lambda d: d[1], reverse=True)][0:num_min] 44 | return score_sen 45 | 46 | 47 | if __name__ == '__main__': 48 | doc = "是上世纪90年代末提出的一种计算网页权重的算法。" \ 49 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \ 50 | "业界急需一种相对比较准确的网页重要性计算方法," \ 51 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 52 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 53 | "Google把从A页面到B页面的链接解释为A页面给B页面投票," \ 54 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面" \ 55 | "和投票目标的等级来决定新的等级。简单的说," \ 56 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 57 | "PageRank The PageRank Citation Ranking: Bringing Order to the Web,"\ 58 | "具体说来就是,PageRank有两个基本思想,也可以说是假设," \ 59 | "即数量假设:一个网页被越多的其他页面链接,就越重);" \ 60 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 61 | "总的来说就是一句话,从全局角度考虑,获取重要的信息。" 62 | text = doc.encode('utf-8').decode('utf-8') 63 | l3 = Lead3Sum() 64 | for score_sen in l3.summarize(text, type_l='mix', num=320): 65 | print(score_sen) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 
| parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /macropodus/network/graph/crf.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/9 21:44 4 | # @author : Mo 5 | # @function: CRF 6 | 7 | 8 | from macropodus.network.base.graph import graph 9 | from macropodus.network.layers.crf import CRF 10 | import tensorflow as tf 11 | 12 | 13 | class CRFGraph(graph): 14 | def __init__(self, hyper_parameters): 15 | """ 16 | 初始化 17 | :param hyper_parameters: json,超参 18 | """ 19 | self.crf_mode = hyper_parameters["model"].get("crf_mode", "reg") # "reg", pad 20 | self.supports_masking = hyper_parameters["model"].get("supports_masking", True) # True or False 21 | super().__init__(hyper_parameters) 22 | 23 | def create_model(self, hyper_parameters): 24 | """ 25 | 构建神经网络 26 | :param hyper_parameters:json, hyper parameters of network 27 | :return: tensor, moedl 28 | """ 29 | super().create_model(hyper_parameters) 30 | x = self.word_embedding.output 31 | # TimeDistributed 32 | x_64 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(128, activation="softmax"), 33 | name='layer_time_distributed')(x) 34 | # dense to a smaller units 35 | tensor = tf.keras.layers.Dense(units=self.label, activation=self.activate_rnn, name="layer_dense_64")(x_64) 36 | # crf, 
"pad" or "reg" 37 | if self.crf_mode == "pad": 38 | # length of real sentence 39 | x_mask = tf.keras.layers.Input(shape=(1), dtype=tf.int32) 40 | self.crf = CRF(self.label, mode="pad", supports_masking=True, name="layer_crf") 41 | self.output = self.crf([tensor, x_mask]) 42 | if self.embedding_type in ["bert", "albert"]: 43 | self.inputs = [self.word_embedding.input[0], self.word_embedding.input[1], x_mask] 44 | else: 45 | self.inputs = [self.word_embedding.input, x_mask] 46 | else: 47 | self.crf = CRF(self.label, mode="reg", name="layer_crf") 48 | self.output = self.crf(tensor) 49 | self.inputs = self.word_embedding.input 50 | self.model = tf.keras.Model(self.inputs, self.output) 51 | self.model.summary(132) 52 | 53 | def create_compile(self): 54 | """ 55 | 构建优化器、损失函数和评价函数 56 | :return: 57 | """ 58 | self.loss = self.crf.loss 59 | self.metrics = self.crf.viterbi_accuracy 60 | super().create_compile() 61 | -------------------------------------------------------------------------------- /macropodus/base/word2vec.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 22:52 4 | # @author : Mo 5 | # @function: word2vec of gensim 6 | 7 | 8 | from macropodus.conf.path_config import path_embedding_word2vec_char, path_macropodus_w2v_char_cache 9 | from macropodus.conf.path_log import get_logger_root 10 | import numpy as np 11 | import gensim 12 | import pickle 13 | import time 14 | import os 15 | 16 | 17 | logger = get_logger_root() 18 | gensim.logger.level=40 # gensim只打印ERROR信息等 19 | logger.info("path of w2v cache is {}!".format(path_macropodus_w2v_char_cache)) 20 | 21 | 22 | class W2v: 23 | def __init__(self, use_cache=True): 24 | # time_start = time.time() 25 | # 存在缓存则直接读取, 序列化加速缓存读取速度 26 | if use_cache and os.path.exists(path_macropodus_w2v_char_cache): 27 | with open(path_macropodus_w2v_char_cache, "rb") as fpmc: 28 | self.w2v_char= pickle.load(fpmc) 29 | fpmc.close() 30 | # logger.info("word2vec: " + str(time.time() - time_start)) # 0.12 31 | else: 32 | # gensim加载词向量 33 | self.w2v_char = gensim.models.KeyedVectors.load_word2vec_format(path_embedding_word2vec_char) 34 | # logger.info("word2vec: " + str(time.time() - time_start)) # 0.99, 0.78 35 | # 第一次跑macropodus, 序列化需要的缓存 36 | if use_cache and not os.path.exists(path_macropodus_w2v_char_cache): 37 | with open(path_macropodus_w2v_char_cache, "wb") as fpmc: 38 | pickle.dump(self.w2v_char, fpmc) 39 | 40 | def cosine(self, sen_1, sen_2): 41 | """ 42 | 余弦距离 43 | :param sen_1: numpy.array 44 | :param sen_2: numpy.array 45 | :return: float, like 0.0 46 | """ 47 | if sen_1.all() and sen_2.all(): 48 | return np.dot(sen_1, sen_2) / (np.linalg.norm(sen_1) * np.linalg.norm(sen_2)) 49 | else: 50 | return 0.0 51 | 52 | def jaccard(self, sen_1, sen_2): 53 | """ 54 | jaccard距离 55 | :param sen1: str, like "大漠帝国" 56 | :param sen2: str, like "Macropodus" 57 | :return: float, like 0.998 58 | """ 59 | try: 60 | sent_intersection = list(set(list(sen_1)).intersection(set(list(sen_2)))) 61 | sent_union = list(set(list(sen_1)).union(set(list(sen_2)))) 62 | score_jaccard = float(len(sent_intersection) / len(sent_union)) 63 | except: 64 | score_jaccard = 0.0 65 | return score_jaccard 66 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/12/30 22:17 4 | # @author :Mo 5 | # 
@function :setup of Macropodus 6 | # @codes :fix it and copy reference from https://github.com/TianWenQAQ/Kashgari/blob/master/setup.py 7 | 8 | 9 | from macropodus.version import __version__ 10 | from setuptools import find_packages, setup 11 | import codecs 12 | 13 | 14 | # Package meta-data. 15 | NAME = 'Macropodus' 16 | DESCRIPTION = 'Macropodus: Tookit of Chinese Natural Language Processing' 17 | URL = 'https://github.com/yongzhuo/Macropodus' 18 | EMAIL = '1903865025@qq.com' 19 | AUTHOR = 'yongzhuo' 20 | LICENSE = 'MIT' 21 | 22 | with codecs.open('README.md', 'r', 'utf8') as reader: 23 | long_description = "\n".join(reader.readlines()) 24 | with codecs.open('requirements.txt', 'r', 'utf8') as reader: 25 | install_requires = list(map(lambda x: x.strip(), reader.readlines())) 26 | 27 | setup(name=NAME, 28 | version=__version__, 29 | description=DESCRIPTION, 30 | long_description=long_description, 31 | long_description_content_type="text/markdown", 32 | author=AUTHOR, 33 | author_email=EMAIL, 34 | url=URL, 35 | packages=find_packages(), # (exclude=('test')), 36 | package_data={'macropodus': ['*.*', 'data/*', 'data/dict/*', 37 | 'data/embedding/*', 'data/embedding/word2vec/*', 38 | 'data/model/*'] 39 | }, 40 | install_requires=install_requires, 41 | license=LICENSE, 42 | classifiers=['License :: OSI Approved :: MIT License', 43 | 'Programming Language :: Python :: 3.5', 44 | 'Programming Language :: Python :: 3.6', 45 | 'Programming Language :: Python :: 3.7', 46 | 'Programming Language :: Python :: 3.8', 47 | 'Programming Language :: Python :: 3.9', 48 | 'Programming Language :: Python :: Implementation :: CPython', 49 | 'Programming Language :: Python :: Implementation :: PyPy'], 50 | ) 51 | 52 | 53 | if __name__ == "__main__": 54 | print("setup ok!") 55 | 56 | # 说明, tensorflow>=1.13.0 or tensorflow-gpu>=1.13.0 57 | # 项目工程目录这里Macropodus, 实际上, 下边还要有一层macropodus, 也就是说, macropodus和setup同一层 58 | # data包里必须要有__init__.py, 否则文件不会生成, .py文件才能copy 59 | 60 | # anaconda3创建环境 61 | # conda remove -n py35 --all 62 | # conda create -n py351 python=3.5 63 | 64 | # 编译的2种方案: 65 | 66 | # 方案一 67 | # 打开cmd 68 | # 到达安装目录 69 | # python setup.py build 70 | # python setup.py install 71 | 72 | # 方案二 73 | # python setup.py bdist_wheel --universal 74 | # twine upload dist/* 75 | 76 | 77 | -------------------------------------------------------------------------------- /macropodus/network/graph/bilstm.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/19 22:30 4 | # @author : Mo 5 | # @function: Bi-LSTM 6 | 7 | 8 | from macropodus.network.base.graph import graph 9 | import tensorflow as tf 10 | 11 | 12 | class BiLSTMGraph(graph): 13 | def __init__(self, hyper_parameters): 14 | """ 15 | 初始化 16 | :param hyper_parameters: json,超参 17 | """ 18 | self.filters = hyper_parameters['model'].get('filters', [2, 3, 4]) 19 | self.num_rnn_layers = hyper_parameters['model'].get('num_rnn_layers', 1) 20 | self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM') 21 | self.rnn_units = hyper_parameters['model'].get('rnn_units', 256) 22 | super().__init__(hyper_parameters) 23 | 24 | def create_model(self, hyper_parameters): 25 | """ 26 | 构建神经网络 27 | :param hyper_parameters:json, hyper parameters of network 28 | :return: tensor, moedl 29 | """ 30 | super().create_model(hyper_parameters) 31 | self.rnn_layer = {'LSTM':tf.keras.layers.LSTM, 'GRU':tf.keras.layers.GRU}[self.rnn_type] 32 | embedding = self.word_embedding.output 33 
| # 提取n-gram特征和最大池化, 一般不用平均池化 34 | conv_pools = [embedding] 35 | for filter in self.filters: 36 | conv = tf.keras.layers.Conv1D(filters=self.filters_num, 37 | kernel_size=filter, 38 | padding='same', 39 | kernel_initializer='normal', 40 | activation='relu', )(embedding) 41 | pooled = tf.keras.layers.MaxPool1D(pool_size=2, 42 | strides=1, 43 | padding='same', )(conv) 44 | conv_pools.append(pooled) 45 | # 拼接 46 | x = tf.keras.layers.Concatenate(axis=-1)(conv_pools) 47 | # Bi-LSTM 48 | for nrl in range(self.num_rnn_layers): 49 | x = tf.keras.layers.Bidirectional(self.rnn_layer(units=self.rnn_units, 50 | return_sequences=True, 51 | activation=self.activate_rnn, 52 | kernel_regularizer=tf.keras.regularizers.l2(self.l2), 53 | recurrent_regularizer=tf.keras.regularizers.l2(self.l2) 54 | ))(x) 55 | x = tf.keras.layers.Dropout(self.dropout)(x) 56 | x = tf.keras.layers.Dense(self.label, activation=self.activate_classify, name='layer_dense_3')(x) 57 | self.output = x 58 | self.model = tf.keras.Model(self.word_embedding.input, self.output) 59 | self.model.summary(132) 60 | -------------------------------------------------------------------------------- /macropodus/conf/path_config.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 23:59 4 | # @author : Mo 5 | # @function: path of macropodus 6 | 7 | 8 | import sys 9 | import os 10 | path_root = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) 11 | sys.path.append(path_root) 12 | 13 | 14 | # path of basic of segnment 15 | path_dict_macropodus = os.path.join(path_root, "data/dict/macropodus.dict") 16 | path_dict_user = os.path.join(path_root, "data/dict/user.dict") 17 | path_log_basic = os.path.join(path_root, "logs") 18 | 19 | # path of cache 20 | path_macropodus_w2v_char_cache = os.path.join(path_root, 'data/cache/word2vec_char.cache') 21 | path_macropodus_dict_freq_cache = os.path.join(path_root, 'data/cache/macropodus.cache') 22 | 23 | # path of basic of tookit 24 | path_dict_pinyin = os.path.join(path_root, "data/dict/pinyin.dict") 25 | path_dict_zh2han = os.path.join(path_root, "data/dict/zh2han.dict") 26 | 27 | # path of embedding 28 | path_embedding_word2vec_char = os.path.join(path_root, 'data/embedding/word2vec/w2v_model_wiki_char.vec') 29 | path_embedding_bert = os.path.join(path_root, 'data/embedding/chinese_L-12_H-768_A-12/') 30 | path_embedding_random_char = os.path.join(path_root, 'data/embedding/term_char.txt') 31 | path_embedding_random_word = os.path.join(path_root, 'data/embedding/term_word.txt') 32 | path_embedding_albert = os.path.join(path_root, 'data/embedding/albert_base_zh') 33 | 34 | # path of train data of ner people 1998 35 | path_ner_people_1998_train = os.path.join(path_root, "data/corpus/ner_people_1998/train.json") 36 | path_ner_people_1998_valid = os.path.join(path_root, "data/corpus/ner_people_1998/dev.json") 37 | # path of train data of seg pku 1998 38 | path_seg_pku_1998_train = os.path.join(path_root, "data/corpus/seg_pku_1998/train.json") 39 | path_seg_pku_1998_bi_train = os.path.join(path_root, "data/corpus/seg_pku_1998/train_BI_126.json") 40 | # path of train data of tag people 1998 41 | path_tag_people_1998_train = os.path.join(path_root, "data/corpus/tag_people_1998/train.json") 42 | # path of train data of tag people 2014 43 | path_tag_people_2014_train = os.path.join(path_root, "data/corpus/tag_people_2014/train.json") 44 | path_tag_people_2014_valid = os.path.join(path_root, 
"data/corpus/tag_people_2014/dev.json") 45 | # path of ccks_2020 46 | path_ccks_2020 = os.path.join(path_root, "data/ccks_8_data_v2_ner") 47 | 48 | path_ccks_2020_ner = os.path.join(path_root, "data/ccks_8_data_v2_ner/ccks_2020_ner.json") 49 | path_ccks_2020_ner_train = os.path.join(path_root, "data/ccks_8_data_v2_ner/train.json") 50 | path_ccks_2020_ner_dev = os.path.join(path_root, "data/ccks_8_data_v2_ner/dev.json") 51 | 52 | # path of training model save dir 53 | path_model_dir = os.path.join(path_root, "data", "model") 54 | path_hyper_parameters = os.path.join(path_model_dir, "params.json") 55 | path_model_l2i_i2l = os.path.join(path_model_dir, "l2i_i2l.json") 56 | path_fineture = os.path.join(path_model_dir, "embedding.h5") 57 | path_model = os.path.join(path_model_dir, "model.h5") 58 | -------------------------------------------------------------------------------- /macropodus/summarize/graph_base/textrank.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/29 22:39 4 | # @author :Mo 5 | # @function :textrank of textrank4zh, sklearn or gensim 6 | 7 | 8 | from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec 9 | from macropodus.summarize.graph_base.textrank_gensim import TextrankGensimSum 10 | from macropodus.summarize.graph_base.textrank_sklearn import TextrankSklearn 11 | import os 12 | 13 | # 词向量, 默认使用缓存 14 | use_cache = True 15 | if not os.environ.get("macropodus_use_w2v_cache", True): 16 | use_cache = False # 不使用缓存,重新加载 17 | # textrank of gensim 18 | trgs = TextrankGensimSum() 19 | # textrank of word2vec 20 | trwv = TextrankWord2vec() 21 | # textrank of sklearn 22 | trsk = TextrankSklearn() 23 | 24 | 25 | class TextRankSum: 26 | def __init__(self): 27 | self.algorithm = 'textrank' 28 | 29 | def summarize(self, text, num=6, model_type="textrank_word2vec"): 30 | """ 31 | 文本摘要 32 | :param text:str, like "你好!大漠帝国!" 33 | :param num: int, like 3 34 | :param model_type: str, like "textrank_sklearn" 35 | :return: list 36 | """ 37 | if model_type=="textrank_sklearn": 38 | res = trsk.summarize(text, num=num) 39 | elif model_type=="textrank_gensim": 40 | res = trgs.summarize(text, num=num) 41 | elif model_type=="textrank_word2vec": 42 | res = trwv.summarize(text, num=num) 43 | else: 44 | raise RuntimeError(" model_type must be 'textrank_textrank4zh', 'text_rank_sklearn' or 'textrank_gensim' ") 45 | 46 | return res 47 | 48 | 49 | class TextRankKey: 50 | def __init__(self): 51 | self.algorithm = 'keyword' 52 | 53 | def keyword(self, text, num=6, score_min=0.025, model_type="keywor_word2vec"): 54 | if model_type=="keywor_word2vec": 55 | res = trwv.keyword(text, num=num, score_min=score_min) 56 | else: 57 | raise RuntimeError(" model_type must be 'keywor_word2vec'") 58 | 59 | return res 60 | 61 | 62 | 63 | if __name__ == '__main__': 64 | 65 | doc = "和投票目标的等级来决定新的等级.简单的说。" \ 66 | "是上世纪90年代末提出的一种计算网页权重的算法!" 
\ 67 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \ 68 | "业界急需一种相对比较准确的网页重要性计算方法。" \ 69 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 70 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 71 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。" \ 72 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \ 73 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 74 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。" \ 75 | "即数量假设:一个网页被越多的其他页面链接,就越重)。" \ 76 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 77 | "总的来说就是一句话,从全局角度考虑,获取重要的信。" 78 | 79 | text = doc.encode('utf-8').decode('utf-8') 80 | 81 | tr = TextRankSum() 82 | kw = TextRankKey() 83 | score_ques = tr.summarize(text, num=100, model_type="textrank_gensim") # "text_rank_sklearn") 84 | for sq in score_ques: 85 | print(sq) 86 | 87 | score_ques = kw.keyword(text, num=100, model_type="keywor_word2vec") # "text_rank_sklearn") 88 | for sq in score_ques: 89 | print(sq) 90 | -------------------------------------------------------------------------------- /macropodus/__init_tf_keras.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/20 22:22 4 | # @author : Mo 5 | # @function: init of keras of tensorflow 6 | 7 | 8 | from macropodus.conf.path_log import get_logger_root 9 | 10 | 11 | logger = get_logger_root() 12 | 13 | 14 | try: 15 | #####################(tensorflow, keras)############################ 16 | import sys 17 | import os 18 | 19 | path_root = os.path.abspath(os.path.dirname(__file__)) 20 | sys.path.append(path_root) # 环境引入根目录 21 | # 默认cpu环境, tensorflow 22 | os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 23 | os.environ["CUDA_VISIBLE_DEVICES"] = "0" 24 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 25 | os.environ['TF_KERAS'] = '1' 26 | 27 | # tensorflow.python.keras 28 | from macropodus.network.service.server_prdeict import AlbertBilstmPredict 29 | from keras_adaptive_softmax import AdaptiveEmbedding, AdaptiveSoftmax 30 | from macropodus.network.layers.non_mask_layer import NonMaskingLayer 31 | from macropodus.conf.path_config import path_model_dir 32 | from macropodus.network.layers.crf import CRF 33 | import tensorflow.python.keras as keras 34 | import tensorflow as tf 35 | import keras_bert 36 | 37 | 38 | tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR) 39 | 40 | # custom_objects 41 | custom_objects = keras_bert.get_custom_objects() 42 | custom_objects['AdaptiveEmbedding'] = AdaptiveEmbedding 43 | custom_objects['AdaptiveSoftmax'] = AdaptiveSoftmax 44 | custom_objects['NonMaskingLayer'] = NonMaskingLayer 45 | custom_objects['CRF'] = CRF 46 | 47 | # init model of dl(deep learning) 48 | # 加载训练好的模型, 命名实体提取 49 | try: 50 | path_ner_albert_bilstm_crf = os.path.join(path_model_dir, 'ner_albert_people_1998') 51 | ner_albert_bilstm_crf = AlbertBilstmPredict(path_ner_albert_bilstm_crf, custom_objects) 52 | ner = ner_albert_bilstm_crf.predict_single 53 | ners = ner_albert_bilstm_crf.predict 54 | except Exception as e: 55 | logger.info(str(e)) 56 | 57 | # 加载训练好的模型, 词性标注 58 | try: 59 | path_tag_albert_bilstm_crf = os.path.join(path_model_dir, 'tag_albert_people_1998') 60 | tag_albert_bilstm_crf = AlbertBilstmPredict(path_tag_albert_bilstm_crf, custom_objects) 61 | postag = tag_albert_bilstm_crf.pos_tag 62 | postags = tag_albert_bilstm_crf.pos_tags 63 | except Exception as e: 64 | logger.info(str(e)) 65 | # # layers 66 | # preprocessing = keras.preprocessing 67 | # applications = keras.applications 68 | # regularizers = keras.regularizers 69 | # initializers = keras.initializers 70 | # activations = keras.activations 71 | # constraints = 
keras.constraints 72 | # optimizers = keras.optimizers 73 | # callbacks = keras.callbacks 74 | # datasets = keras.datasets 75 | # wrappers = keras.wrappers 76 | # metrics = keras.metrics 77 | # backend = keras.backend 78 | # engine = keras.engine 79 | # layers = keras.layers 80 | # models = keras.models 81 | # losses = keras.losses 82 | # utils = keras.utils 83 | except Exception as e: 84 | logger.info(str(e)) 85 | -------------------------------------------------------------------------------- /macropodus/network/layers/keras_lookahead.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/12 21:14 4 | # @author : Mo 5 | # @function: lookahead of keras 6 | # @codefrom: https://github.com/bojone/keras_lookahead 7 | 8 | 9 | import tensorflow.python.keras.backend as K 10 | 11 | 12 | class Lookahead(object): 13 | """Add the [Lookahead Optimizer](https://arxiv.org/abs/1907.08610) functionality for [keras](https://keras.io/). 14 | """ 15 | 16 | def __init__(self, k=5, alpha=0.5): 17 | self.k = k 18 | self.alpha = alpha 19 | self.count = 0 20 | 21 | def inject(self, model): 22 | """Inject the Lookahead algorithm for the given model. 23 | The following code is modified from keras's _make_train_function method. 24 | See: https://github.com/keras-team/keras/blob/master/keras/engine/training.py#L497 25 | """ 26 | if not hasattr(model, 'train_function'): 27 | raise RuntimeError('You must compile your model before using it.') 28 | 29 | model._check_trainable_weights_consistency() 30 | 31 | if model.train_function is None: 32 | inputs = (model._feed_inputs + 33 | model._feed_targets + 34 | model._feed_sample_weights) 35 | if model._uses_dynamic_learning_phase(): 36 | inputs += [K.learning_phase()] 37 | fast_params = model._collected_trainable_weights 38 | 39 | with K.name_scope('training'): 40 | with K.name_scope(model.optimizer.__class__.__name__): 41 | training_updates = model.optimizer.get_updates( 42 | params=fast_params, 43 | loss=model.total_loss) 44 | slow_params = [K.variable(p) for p in fast_params] 45 | fast_updates = (model.updates + 46 | training_updates + 47 | model.metrics_updates) 48 | 49 | slow_updates, copy_updates = [], [] 50 | for p, q in zip(fast_params, slow_params): 51 | slow_updates.append(K.update(q, q + self.alpha * (p - q))) 52 | copy_updates.append(K.update(p, q)) 53 | 54 | # Gets loss and metrics. Updates weights at each call. 
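            # Lookahead bookkeeping (see slow_updates / copy_updates built above):
            #   - the fast weights p are updated by the wrapped optimizer on every batch;
            #   - every self.k calls, the slow weights q move towards p: q <- q + alpha * (p - q),
            #     and the fast weights p are then reset to q.
            # The K.function below only runs the fast updates; F() further down triggers the
            # slow/copy updates once every self.k calls.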
55 | fast_train_function = K.function( 56 | inputs, 57 | [model.total_loss] + model.metrics_tensors, 58 | updates=fast_updates, 59 | name='fast_train_function', 60 | **model._function_kwargs) 61 | 62 | def F(inputs): 63 | self.count += 1 64 | R = fast_train_function(inputs) 65 | if self.count % self.k == 0: 66 | K.batch_get_value(slow_updates) 67 | K.batch_get_value(copy_updates) 68 | return R 69 | 70 | model.train_function = F 71 | 72 | if __name__ == '__main__': 73 | gg = 0 74 | # # useage 75 | # model.compile(optimizer=Adam(1e-3), loss='mse') # Any optimizer 76 | # lookahead = Lookahead(k=5, alpha=0.5) # Initialize Lookahead 77 | # lookahead.inject(model) # add into model 78 | -------------------------------------------------------------------------------- /macropodus/summarize/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/18 22:10 4 | # @author : Mo 5 | # @function: text summarize 6 | 7 | 8 | # text_summarize of extractive 9 | from macropodus.summarize.feature_base.word_significance import WordSignificanceSum 10 | from macropodus.summarize.feature_base.text_pronouns import TextPronounsSum 11 | from macropodus.summarize.graph_base.textrank import TextRankSum, TextRankKey 12 | from macropodus.summarize.feature_base.text_teaser import TextTeaserSum 13 | from macropodus.summarize.feature_base.mmr import MMRSum 14 | 15 | from macropodus.summarize.topic_base.topic_lda import LDASum 16 | from macropodus.summarize.topic_base.topic_lsi import LSISum 17 | from macropodus.summarize.topic_base.topic_nmf import NMFSum 18 | 19 | from macropodus.summarize.nous_base.lead_3 import Lead3Sum 20 | 21 | # feature 22 | wss = WordSignificanceSum() 23 | tps = TextPronounsSum() 24 | tts = TextTeaserSum() 25 | mms = MMRSum() 26 | 27 | # graph-3 28 | trs = TextRankSum() 29 | trk = TextRankKey() 30 | 31 | # nous 32 | l3s = Lead3Sum() 33 | 34 | # topic 35 | lds = LDASum() 36 | lss = LSISum() 37 | nms = NMFSum() 38 | 39 | # summarization 40 | text_pronouns = tps.summarize 41 | text_teaser = tts.summarize 42 | word_sign = wss.summarize 43 | textrank = trs.summarize 44 | lead3 = l3s.summarize 45 | mmr = mms.summarize 46 | lda = lds.summarize 47 | lsi = lss.summarize 48 | nmf = nms.summarize 49 | 50 | # keyword 51 | keyword = trk.keyword 52 | 53 | # 函数接口 54 | def summarization(text, num=320, type_summarize="lda", topic_min=6, judge_topic=False, alpha=0.6, type_l='mix', model_type="textrank_sklearn", title=None): 55 | """ 56 | 文本摘要汇总 57 | :param text: str, like "你是。大漠帝国。不是吧错了。哈哈。我的。" 58 | :param num: int, like 32 59 | :param type_summarize: str, like "lda", must in ['text_pronouns', 'text_teaser', 'word_sign', 'textrank', 'lead3', 'mmr', 'lda', 'lsi', 'nmf'] 60 | :return: 61 | """ 62 | 63 | if type_summarize=="text_pronouns": # title, str, 可填标题, like "震惊,MacropodusXXX" 64 | res = text_pronouns(text, num, title) 65 | elif type_summarize=="text_teaser": # title, str, 可填标题, like "震惊,MacropodusXXX" 66 | res = text_teaser(text, num, title) 67 | elif type_summarize=="word_sign": # 68 | res = word_sign(text, num) 69 | elif type_summarize=="textrank": # model_type 可填 'textrank_textrank4zh', 'text_rank_sklearn' or 'textrank_gensim' 70 | res = textrank(text, num) 71 | elif type_summarize=="lead3": 72 | res = lead3(text, num, type_l) # type_l 可填 'begin', 'end' or 'mix' 73 | elif type_summarize=="mmr": 74 | res = mmr(text, num, alpha) # alpha 可填 0-1 75 | elif type_summarize=="lda": # topic_min>1, 
judge_topic=True or False 76 | res = lda(text, num, topic_min, judge_topic) 77 | elif type_summarize=="lsi": # topic_min>1, judge_topic=True or False 78 | res = lsi(text, num, topic_min, judge_topic) 79 | elif type_summarize=="nmf": # topic_min>1, judge_topic=True or False 80 | res = nmf(text, num, topic_min, judge_topic) 81 | else: 82 | raise RuntimeError("your input type_summarize is wrong, it must be in " 83 | "['text_pronouns', 'text_teaser', 'word_sign', " 84 | "'textrank', 'lead3', 'mmr', 'lda', 'lsi', 'nmf']") 85 | return res 86 | -------------------------------------------------------------------------------- /macropodus/summarize/feature_base/mmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/28 20:16 4 | # @author :Mo 5 | # @function :mmr 6 | 7 | 8 | from macropodus.preprocess.tools_ml import extract_chinese, cut_sentence 9 | from macropodus.preprocess.tools_ml import macropodus_cut, tfidf_fit 10 | from macropodus.data.words_common.stop_words import stop_words 11 | import copy 12 | 13 | 14 | class MMRSum: 15 | def __init__(self): 16 | self.stop_words = stop_words.values() 17 | self.algorithm = 'mmr' 18 | 19 | def summarize(self, text, num=8, alpha=0.6): 20 | """ 21 | 22 | :param text: str 23 | :param num: int 24 | :return: list 25 | """ 26 | # 切句 27 | if type(text) == str: 28 | self.sentences = cut_sentence(text) 29 | elif type(text) == list: 30 | self.sentences = text 31 | else: 32 | raise RuntimeError("text type must be list or str") 33 | # 切词 34 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 35 | if word.strip()] for sentence in self.sentences] 36 | # 去除停用词等 37 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 38 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 39 | # # 计算每个句子的词语个数 40 | # sen_word_len = [len(sc)+1 for sc in sentences_cut] 41 | # 计算每个句子的tfidf 42 | sen_tfidf = tfidf_fit(self.sentences_cut) 43 | # 矩阵中两两句子相似度 44 | SimMatrix = (sen_tfidf * sen_tfidf.T).A # 例如: SimMatrix[1, 3] # "第2篇与第4篇的相似度" 45 | # 输入文本句子长度 46 | len_sen = len(self.sentences) 47 | # 句子标号 48 | sen_idx = [i for i in range(len_sen)] 49 | summary_set = [] 50 | mmr = {} 51 | for i in range(len_sen): 52 | if not self.sentences[i] in summary_set: 53 | sen_idx_pop = copy.deepcopy(sen_idx) 54 | sen_idx_pop.pop(i) 55 | # 两两句子相似度 56 | sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop] 57 | score_tfidf = sen_tfidf[i].toarray()[0].sum() # / sen_word_len[i], 如果除以词语个数就不准确 58 | mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j) 59 | summary_set.append(self.sentences[i]) 60 | score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)] 61 | if len(mmr) > num: 62 | score_sen = score_sen[0:num] 63 | return score_sen 64 | 65 | 66 | if __name__ == '__main__': 67 | mmr_sum = MMRSum() 68 | doc = "PageRank算法简介。" \ 69 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 70 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 71 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 72 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 73 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 74 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 75 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 76 | "和投票目标的等级来决定新的等级。简单的说, " \ 77 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 78 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 79 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 80 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 81 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 82 | sum = mmr_sum.summarize(doc) 83 | for i in sum: 84 | print(i) 85 | 86 | 87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /macropodus/network/graph/bilstm_crf.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/6 20:45 4 | # @author : Mo 5 | # @function: Bi-LSTM-CRF 6 | 7 | 8 | from macropodus.network.base.graph import graph 9 | from macropodus.network.layers.crf import CRF 10 | import tensorflow as tf 11 | 12 | 13 | class BilstmCRFGraph(graph): 14 | def __init__(self, hyper_parameters): 15 | """ 16 | 初始化 17 | :param hyper_parameters: json,超参 18 | """ 19 | self.num_rnn_layers = hyper_parameters['model'].get('num_rnn_layers', 1) # 1, 2, 3 20 | self.rnn_type = hyper_parameters['model'].get('rnn_type', 'LSTM') # 'LSTM', 'GRU' 21 | self.rnn_units = hyper_parameters['model'].get('rnn_units', 512) # 128, 256, 512, 768, 1024 22 | self.crf_mode = hyper_parameters['model'].get('crf_mode', 'reg') # "reg", pad 23 | self.supports_masking = hyper_parameters['model'].get('supports_masking', True) # True or False 24 | super().__init__(hyper_parameters) 25 | 26 | def create_model(self, hyper_parameters): 27 | """ 28 | 构建神经网络 29 | :param hyper_parameters:json, hyper parameters of network 30 | :return: tensor, moedl 31 | """ 32 | super().create_model(hyper_parameters) 33 | # LSTM or GRU 34 | self.rnn_layer = {'LSTM':tf.keras.layers.LSTM, 'GRU':tf.keras.layers.GRU}[self.rnn_type] 35 | x = self.word_embedding.output 36 | # Bi-LSTM 37 | for nrl in range(self.num_rnn_layers): 38 | x = tf.keras.layers.Bidirectional(self.rnn_layer(units=self.rnn_units, 39 | return_sequences=True, 40 | activation=self.activate_rnn, 41 | kernel_regularizer=tf.keras.regularizers.l2(self.l2 * 0.1), 42 | recurrent_regularizer=tf.keras.regularizers.l2(self.l2) 43 | ))(x) 44 | x = tf.keras.layers.Dropout(self.dropout)(x) 45 | x = tf.keras.layers.Dense(units=self.rnn_units, activation=self.activate_rnn,)(x) 46 | # crf, 'pad' or 'reg' 47 | if self.crf_mode == "pad": 48 | # length of real sentence 49 | x_mask = tf.keras.layers.Input(shape=(1), dtype=tf.int32) 50 | self.crf = CRF(self.label, mode='pad', supports_masking=True, name='crf') 51 | tensor = tf.keras.layers.Dense(self.label, name='crf_dense')(x) 52 | self.output = self.crf([tensor, x_mask]) 53 | if self.embedding_type in ["bert", "albert"]: 54 | self.inputs = [self.word_embedding.input[0], self.word_embedding.input[1], x_mask] 55 | else: 56 | self.inputs = [self.word_embedding.input, x_mask] 57 | else: 58 | self.crf = CRF(self.label, mode='reg', name='crf') 59 | tensor = tf.keras.layers.Dense(self.label, name='crf_dense')(x) 60 | self.output = self.crf(tensor) 61 | if self.embedding_type in ["bert", "albert"]: 62 | self.inputs = self.word_embedding.input 63 | else: 64 | self.inputs = self.word_embedding.input 65 | self.model = tf.keras.Model(self.inputs, self.output) 66 | self.model.summary(132) 67 | 68 | def create_compile(self): 69 | """ 70 | 构建优化器、损失函数和评价函数 
71 | :return: 72 | """ 73 | self.loss = self.crf.loss 74 | self.metrics = self.crf.viterbi_accuracy 75 | super().create_compile() 76 | -------------------------------------------------------------------------------- /macropodus/tookit/han2zh/han2zh.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/8 15:51 4 | # @author : Mo 5 | # @function: 中文繁简转化 6 | 7 | 8 | from macropodus.tookit.han2zh.zh_wiki import zh2han, han2zh, cn2zh, sg2zh 9 | from collections import defaultdict 10 | 11 | 12 | class Han2Zh: 13 | def __init__(self): 14 | self.algorithm = "han2zh" 15 | # dict转为defaultdict 16 | self.han2zhs = self.load_han_zh_dict([han2zh, cn2zh, sg2zh]) 17 | self.zh2hans = self.load_han_zh_dict([zh2han]) 18 | 19 | def load_han_zh_dict(self, dicts): 20 | """ 21 | 多个dict转为一个defaultdict 22 | :param dicts: list, like [{"丟": "丢"}, {"並": "并"}] 23 | :return: dict, like {"丟": "丢", "並": "并"} 24 | """ 25 | dict_han_zh = defaultdict() 26 | for ds in dicts: 27 | for k, v in ds.items(): 28 | dict_han_zh[k] = v 29 | return dict_han_zh 30 | 31 | def han2zh(self, text, len_max=11): 32 | """ 33 | 繁体字转简体字, 反向最大切词 34 | :param sentence: str, like '雪鐵龍' 35 | :param len_max: int, like 9 36 | :return: str, like '雪铁龙' 37 | """ 38 | len_sen = len(text) 39 | i = len_sen 40 | res = [""] 41 | while i > 0: # while判断条件 42 | flag = False # flag标志位,确定有没有在字典里边的单字词或多字词 43 | for j in range(max(0, i - len_max), i): # 遍历从句子末尾向前可能成词的部分, 从最后i-len_max算起 44 | word_maybe = text[j:i] # 正向可能成词的语 45 | if word_maybe in self.han2zhs: # 是否在字典里边 46 | i = j # 成词前标志i向后移动 47 | flag = True # flag标志位变化 48 | res.append(self.han2zhs.get(word_maybe)) 49 | break # 成词则跳出循环 50 | if not flag: # 未选中后单个字的情况 51 | i -= 1 52 | res_i = self.han2zhs.get(text[i]) 53 | if res_i: 54 | res.append(res_i) 55 | else: 56 | res.append(text[i]) 57 | res.reverse() 58 | return "".join(res) 59 | 60 | def zh2han(self, text, len_max=5): 61 | """ 62 | 简体字转繁体字, 反向最大切词 63 | :param sentence: str, like '大漠帝国' 64 | :param len_max: int, like 32 65 | :return: yield 66 | """ 67 | len_sen = len(text) 68 | i = len_sen 69 | res = [""] 70 | while i > 0: # while判断条件 71 | flag = False # flag标志位,确定有没有在字典里边的单字词或多字词 72 | for j in range(max(0, i - len_max), i): # 遍历从句子末尾向前可能成词的部分, 从最后i-len_max算起 73 | word_maybe = text[j:i] # 正向可能成词的语 74 | if word_maybe in self.zh2hans: # 是否在字典里边 75 | i = j # 成词前标志i向后移动 76 | flag = True # flag标志位变化 77 | res.append(self.zh2hans.get(word_maybe)) 78 | break # 成词则跳出循环 79 | if not flag: # 未选中后单个字的情况 80 | i -= 1 81 | res_i = self.zh2hans.get(text[i]) 82 | if res_i: 83 | res.append(res_i) 84 | else: 85 | res.append(text[i]) 86 | res.reverse() 87 | return "".join(res) 88 | 89 | 90 | if __name__ == '__main__': 91 | hz = Han2Zh() 92 | text = "" 93 | res_han2zh = hz.han2zh(text) 94 | res_zh2han = hz.zh2han(text) 95 | print(res_han2zh) 96 | print(res_zh2han) 97 | while True: 98 | print("请输入:") 99 | ques = input() 100 | print(hz.han2zh(ques)) 101 | print(hz.zh2han(ques)) 102 | 103 | 104 | -------------------------------------------------------------------------------- /macropodus/network/layers/keras_radam.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/12 20:12 4 | # @author : Mo 5 | # @function: radam of keras 6 | # @codefrom: https://github.com/bojone/keras_radam 7 | 8 | 9 | from tensorflow.python.keras.optimizers import Optimizer 10 | # from 
tensorflow.python.keras.legacy import interfaces 11 | import tensorflow.python.keras.backend as K 12 | 13 | 14 | class RAdam(Optimizer): 15 | """RAdam optimizer. 16 | Default parameters follow those provided in the original Adam paper. 17 | # Arguments 18 | lr: float >= 0. Learning rate. 19 | beta_1: float, 0 < beta < 1. Generally close to 1. 20 | beta_2: float, 0 < beta < 1. Generally close to 1. 21 | epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`. 22 | decay: float >= 0. Learning rate decay over each update. 23 | amsgrad: boolean. Whether to apply the AMSGrad variant of this 24 | algorithm from the paper "On the Convergence of Adam and 25 | Beyond". 26 | # References 27 | - [RAdam - A Method for Stochastic Optimization] 28 | (https://arxiv.org/abs/1908.03265) 29 | - [On The Variance Of The Adaptive Learning Rate And Beyond] 30 | (https://arxiv.org/abs/1908.03265) 31 | """ 32 | 33 | def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, 34 | epsilon=None, decay=0., **kwargs): 35 | super(RAdam, self).__init__(**kwargs) 36 | with K.name_scope(self.__class__.__name__): 37 | self.iterations = K.variable(0, dtype='int64', name='iterations') 38 | self.lr = K.variable(lr, name='lr') 39 | self.beta_1 = K.variable(beta_1, name='beta_1') 40 | self.beta_2 = K.variable(beta_2, name='beta_2') 41 | self.decay = K.variable(decay, name='decay') 42 | if epsilon is None: 43 | epsilon = K.epsilon() 44 | self.epsilon = epsilon 45 | self.initial_decay = decay 46 | 47 | # @interfaces.legacy_get_updates_support 48 | def get_updates(self, loss, params): 49 | grads = self.get_gradients(loss, params) 50 | self.updates = [K.update_add(self.iterations, 1)] 51 | 52 | lr = self.lr 53 | if self.initial_decay > 0: 54 | lr = lr * (1. / (1. + self.decay * K.cast(self.iterations, 55 | K.dtype(self.decay)))) 56 | 57 | t = K.cast(self.iterations, K.floatx()) + 1 58 | beta_1_t = K.pow(self.beta_1, t) 59 | beta_2_t = K.pow(self.beta_2, t) 60 | rho = 2 / (1 - self.beta_2) - 1 61 | rho_t = rho - 2 * t * beta_2_t / (1 - beta_2_t) 62 | r_t = K.sqrt( 63 | K.relu(rho_t - 4) * K.relu(rho_t - 2) * rho / ((rho - 4) * (rho - 2) * rho_t) 64 | ) 65 | flag = K.cast(rho_t > 4, K.floatx()) 66 | 67 | ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 68 | vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params] 69 | self.weights = [self.iterations] + ms + vs 70 | 71 | for p, g, m, v in zip(params, grads, ms, vs): 72 | m_t = (self.beta_1 * m) + (1. - self.beta_1) * g 73 | v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g) 74 | mhat_t = m_t / (1 - beta_1_t) 75 | vhat_t = K.sqrt(v_t / (1 - beta_2_t)) 76 | p_t = p - lr * mhat_t * (flag * r_t / (vhat_t + self.epsilon) + (1 - flag)) 77 | 78 | self.updates.append(K.update(m, m_t)) 79 | self.updates.append(K.update(v, v_t)) 80 | new_p = p_t 81 | 82 | # Apply constraints. 
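            # (Keras weight constraints, e.g. max_norm, are applied to the proposed value
            # new_p before it is written back. p_t above already contains the RAdam switch:
            # when rho_t > 4 the variance-rectified adaptive step r_t / (vhat_t + epsilon)
            # is used, otherwise a plain momentum-only step, following the RAdam paper.)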
83 | if getattr(p, 'constraint', None) is not None: 84 | new_p = p.constraint(new_p) 85 | 86 | self.updates.append(K.update(p, new_p)) 87 | return self.updates 88 | 89 | def get_config(self): 90 | config = {'lr': float(K.get_value(self.lr)), 91 | 'beta_1': float(K.get_value(self.beta_1)), 92 | 'beta_2': float(K.get_value(self.beta_2)), 93 | 'decay': float(K.get_value(self.decay)), 94 | 'epsilon': self.epsilon} 95 | base_config = super(RAdam, self).get_config() 96 | return dict(list(base_config.items()) + list(config.items())) -------------------------------------------------------------------------------- /macropodus/segment/seg_statistics/seg_dag.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 9:58 4 | # @author : Mo 5 | # @function: segmentation of maximum probability using dictionary 6 | 7 | 8 | from macropodus.preprocess.tools_common import re_continue 9 | from macropodus.base.seg_basic import SegBasic 10 | from math import log 11 | 12 | 13 | class SegDAG(SegBasic): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def build_dag(self, sentence, len_word_max=105): 18 | """ 19 | 构建句子的词典概率有向图; 20 | jieba使用的是前缀字典替代前缀树,内存比前缀树小,且比前缀树快; 21 | 基本思想是构建'大漠帝国:132','大漠帝','大漠:640','大':1024等,没有则置为0, 22 | 搜索时候前缀不存在就跳出,不用继续下去了 23 | :param sentence: str, like '大漠帝国是谁' 24 | :param sentence: int, like 132 25 | :return: dict, like {0:[0,1], 1:[1]} 26 | """ 27 | len_sen = len(sentence) 28 | dag_sen = {} 29 | for i in range(len_sen): # 前向遍历, 全切分 30 | enum_j = [i] # 单个字就是它本身 31 | for j in range(i+1, min(len_sen, i+len_word_max)): # 遍历从当前字到句子末尾可能成词的部分, 当前的不取, 设置最大成词长度为132 32 | word_maybe = sentence[i:j+1] 33 | if word_maybe in self.dict_words_freq: 34 | enum_j.append(j) 35 | dag_sen[i] = enum_j 36 | return dag_sen 37 | 38 | def calculate_prob(self, sentence, DAG, route): 39 | """ 40 | 动态规划求取最大概率, 代码来自jieba项目 41 | code from: https://github.com/fxsjy/jieba 42 | :param sentence: str, input of sentence, like "大漠帝国是谁?" 43 | :param DAG: dict, 44 | :param route: dict, 45 | :return: None 46 | """ 47 | len_sen = len(sentence) 48 | route[len_sen] = (0, 0) 49 | log_total = log(self.num_words) 50 | for index in range(len_sen - 1, -1, -1): # 动态规划 51 | route[index] = max((log(self.dict_words_freq.get(sentence[index:x + 1]) or 1) 52 | - log_total + route[x + 1][0], x) for x in DAG[index]) 53 | 54 | def cut(self, sentence): 55 | """ 56 | seg_dag字典最大概率切词, 代码来自jieba项目 57 | code from: https://github.com/fxsjy/jieba 58 | :param sentence: str, input of sentence, like "大漠帝国是谁?" 
59 | :return: None 60 | """ 61 | len_sen = len(sentence) 62 | word_temp = '' 63 | route = {} 64 | i = 0 65 | DAG = self.build_dag(sentence) # 根据sentence构建有向图dag 66 | self.calculate_prob(sentence, DAG, route) # 动态规划计算概率最大的路径 67 | while i < len_sen: 68 | j = route[i][1] + 1 # 获取index, i为成词的begin, j为成词的end 69 | word_ch = sentence[i:j] # 概率成词 70 | if (j-i<2) and re_continue.match(word_ch): # 单个字判断是否为连续, 字母-数字-.-@等为连续 71 | word_temp += word_ch 72 | i = j 73 | else: # 成词后返回一个yield可迭代对象, yield后转list有点耗时 74 | if word_temp: # 有word_temp的情况下 word_ch也没有迭代返回 75 | yield word_temp 76 | word_temp = '' 77 | yield word_ch 78 | i = j 79 | if word_temp: # 最后一个成词为"字母-数字-.-@等为连续"的情况 80 | yield word_temp 81 | 82 | 83 | if __name__ == '__main__': 84 | sd = SegDAG() 85 | sd.add_word(str('知识图谱')) 86 | 87 | # for i in range(50000): 88 | sd_enum = sd.cut(sentence='apple_pir大漠帝国我再也找不到了') 89 | print(list(sd_enum)) 90 | 91 | # 测试性能 92 | from macropodus.preprocess.tools_common import txt_read, txt_write 93 | from macropodus.conf.path_config import path_root 94 | import time 95 | path_wordseg_a = path_root.replace("macropodus", "") + "/test/tet/ambiguity.txt" 96 | sentences = txt_read(path_wordseg_a) 97 | 98 | time_start = time.time() 99 | count = 0 100 | for i in range(10000): 101 | for sen in sentences: 102 | # print("原句:"+sen) 103 | count += 1 104 | res = sd.cut(sen) 105 | # print(list(res)) 106 | time_end = time.time() 107 | print(time_end-time_start) 108 | print(count/(time_end - time_start)) 109 | 110 | while True: 111 | print("请输入:") 112 | sen = input() 113 | print(list(sd.cut(sen))) 114 | # win10测试, i7 8th + 16G RAM 115 | # 10000/0.17*50 = 2864136(line/s) 116 | # 50000/0.87*50 = 2872092(line/s) 117 | 118 | 119 | -------------------------------------------------------------------------------- /macropodus/summarize/topic_base/topic_lsi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/2 21:03 4 | # @author :Mo 5 | # @function :topic model of LSI 6 | # @paper :Text summarization using Latent Semantic Analysis 7 | 8 | 9 | from macropodus.preprocess.tools_ml import cut_sentence, macropodus_cut 10 | from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit 11 | from macropodus.data.words_common.stop_words import stop_words 12 | # sklearn 13 | from sklearn.decomposition import TruncatedSVD 14 | import numpy as np 15 | 16 | 17 | class LSISum: 18 | def __init__(self): 19 | self.stop_words = stop_words.values() 20 | self.algorithm = 'lsi' 21 | 22 | def summarize(self, text, num=320, topic_min=5, judge_topic='all'): 23 | """ 24 | 25 | :param text: 26 | :param num: 27 | :return: 28 | """ 29 | # 切句 30 | if type(text) == str: 31 | self.sentences = cut_sentence(text) 32 | elif type(text) == list: 33 | self.sentences = text 34 | else: 35 | raise RuntimeError("text type must be list or str") 36 | len_sentences_cut = len(self.sentences) 37 | # 切词 38 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 39 | if word.strip()] for sentence in self.sentences] 40 | # 去除停用词等 41 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 42 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 43 | # 计算每个句子的tfidf 44 | sen_tfidf = tfidf_fit(self.sentences_cut) 45 | # 主题数, 经验判断 46 | topic_num = min(topic_min, int(len(sentences_cut)/2)) # 设定最小主题数为3 47 | svd_tfidf = TruncatedSVD(n_components=topic_num, n_iter=32) 48 | res_svd_u = 
svd_tfidf.fit_transform(sen_tfidf.T) 49 | res_svd_v = svd_tfidf.components_ 50 | 51 | if judge_topic: 52 | ### 方案一, 获取最大那个主题的k个句子 53 | ################################################################################## 54 | topic_t_score = np.sum(res_svd_v, axis=-1) 55 | # 对每列(一个句子topic_num个主题),得分进行排序,0为最大 56 | res_nmf_h_soft = res_svd_v.argsort(axis=0)[-topic_num:][::-1] 57 | # 统计为最大每个主题的句子个数 58 | exist = (res_nmf_h_soft <= 0) * 1.0 59 | factor = np.ones(res_nmf_h_soft.shape[1]) 60 | topic_t_count = np.dot(exist, factor) 61 | # 标准化 62 | topic_t_count /= np.sum(topic_t_count, axis=-1) 63 | topic_t_score /= np.sum(topic_t_score, axis=-1) 64 | # 主题最大个数占比, 与主题总得分占比选择最大的主题 65 | topic_t_tc = topic_t_count + topic_t_score 66 | topic_t_tc_argmax = np.argmax(topic_t_tc) 67 | # 最后得分选择该最大主题的 68 | res_nmf_h_soft_argmax = res_svd_v[topic_t_tc_argmax].tolist() 69 | res_combine = {} 70 | for l in range(len_sentences_cut): 71 | res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l] 72 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 73 | ##################################################################################### 74 | else: 75 | ### 方案二, 获取最大主题概率的句子, 不分主题 76 | res_combine = {} 77 | for i in range(len_sentences_cut): 78 | res_row_i = res_svd_v[:, i] 79 | res_row_i_argmax = np.argmax(res_row_i) 80 | res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax] 81 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 82 | num_min = min(num, int(len_sentences_cut * 0.6)) 83 | return score_sen[0:num_min] 84 | 85 | 86 | if __name__ == '__main__': 87 | lsi = LSISum() 88 | doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 89 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 90 | "该基金认缴出资总规模为人民币3.01亿元。" \ 91 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 92 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 93 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 94 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 95 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 96 | sum = lsi.summarize(doc, num=8) 97 | for i in sum: 98 | print(i) 99 | -------------------------------------------------------------------------------- /test/evaluate/tet_macropodus.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/17 21:13 4 | # @author : Mo 5 | # @function: test macropodus 6 | 7 | # import os 8 | # os.environ['TF_KERAS'] = '1' 9 | 10 | 11 | import time 12 | time_start = time.time() 13 | import macropodus 14 | print('macropodus初始化耗时: ' + str(time.time()-time_start) + 's') 15 | 16 | # import sys 17 | # import os 18 | # print(os.name) 19 | # print(sys.platform) 20 | 21 | # macropodus.load_user_dict(path_user="user.json", type_user="json") 22 | macropodus.add_word(word="斗鱼属") 23 | macropodus.add_word(word="斗鱼科") 24 | macropodus.add_word(word="鲈形目") 25 | macropodus.save_add_words(word_freqs={"喜斗":32, "护卵":64, "护幼":132}) 26 | macropodus.add_word(word="坑爹的平衡性基金") 27 | macropodus.save_add_words(word_freqs={"BBC":132}) 28 | 29 | # sent = "今日头条 白嫖 东风快递 令人喷饭 勿谓言之不预也 白嫖 口区 弓虽 口丕 我酸了 祖安人 迷惑行为 5G 996 007 1118 35 120 251 nmsl nsdd wdnmd CSGO 唱跳 rap 篮球 鸡你太美 cxk 盘它 撞梗 融梗 雨女无瓜 要你寡 刺激战场 绝地求生" 30 | # sent = "狼灭 狼火 狼炎 狼焱 灵魂八问 硬核 奥力给 有内味了 awsl 影流之主 巨魔之王" 31 | # words = sent.split(" ") 32 | # word_dict = {} 33 | # for w in words: 34 | # word_dict[w] = 132 35 | # 
macropodus.save_add_words(word_freqs=word_dict) 36 | 37 | print(macropodus.cut("坑爹的平衡性基金啊,坑爹呀斗鱼属,Macropodus (Lacépède, 1801),鲈形目斗鱼科的一属鱼类。" 38 | "本属鱼类通称斗鱼。因喜斗而得名。分布于亚洲东南部。中国有2种,即叉尾斗鱼,分布于长江及以南各省;" 39 | "叉尾斗鱼,分布于辽河到珠江流域。其喜栖居于小溪、河沟、池塘、稻田等缓流或静水中。" 40 | "雄鱼好斗,产卵期集草成巢,雄鱼口吐粘液泡沫,雌鱼产卵其中,卵浮性,受精卵在泡沫内孵化。雄鱼尚有护卵和护幼现象。" 41 | )) 42 | 43 | sen_calculate = "23 + 13 * (25+(-9-2-5-2*3-6/3-40*4/(2-3)/5+6*3))加根号144你算得几多" 44 | sen_chi2num = "三千零七十八亿三千零十五万零三百一十二点一九九四" 45 | sen_num2chi = 1994.1994 46 | sen_roman2int = "IX" 47 | sen_int2roman = 132 48 | # sent1 = "PageRank算法简介" 49 | # sent2 = "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" 50 | sent1 = "香蕉的翻译" 51 | sent2 = "用英语说香蕉" 52 | summary = "四川发文取缔全部不合规p2p。字节跳动与今日头条。成都日报,成都市,李太白与杜甫"\ 53 | "PageRank算法简介。" \ 54 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 55 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 56 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 57 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 58 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 59 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 60 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 61 | "和投票目标的等级来决定新的等级。简单的说, " \ 62 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 63 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 64 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 65 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 66 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 67 | 68 | # 分词(词典最大概率分词DAG) 69 | words = macropodus.cut(summary) 70 | print(words) 71 | # 新词发现 72 | new_words = macropodus.find(summary) 73 | print(new_words) 74 | # 文本摘要 75 | sum = macropodus.summarize(summary) 76 | print(sum) 77 | # 关键词抽取 78 | keyword = macropodus.keyword(summary) 79 | print(keyword) 80 | # 文本相似度 81 | sim = macropodus.sim(sent1, sent2) 82 | print(sim) 83 | # tookit 84 | # 计算器 85 | score_calcul = macropodus.calculate(sen_calculate) 86 | print(score_calcul) 87 | # 中文数字与阿拉伯数字相互转化 88 | res_chi2num = macropodus.chi2num(sen_chi2num) 89 | print(res_chi2num) 90 | res_num2chi = macropodus.num2chi(sen_num2chi) 91 | print(res_num2chi) 92 | # 阿拉伯数字与罗马数字相互转化 93 | res_roman2int = macropodus.roman2num(sen_roman2int) 94 | print(res_roman2int) 95 | res_int2roman = macropodus.num2roman(sen_int2roman) 96 | print(res_int2roman) 97 | # 中文汉字转拼音 98 | res_pinyin = macropodus.pinyin(summary) 99 | print(res_pinyin) 100 | # 中文繁简转化 101 | res_zh2han = macropodus.zh2han(summary) 102 | print(res_zh2han) 103 | res_han2zh = macropodus.han2zh(res_zh2han) 104 | print(res_han2zh) 105 | 106 | # 命名实体提取, 107 | # ner, albert+bilstm+crf网络架构, 最大支持126个字符; 108 | # 需要安装tensorflow==1.15.0(pip安装不默认下载, 1.15.0以下未实验, 1.13以上应该可以) 109 | # 需要下载模型(pip安装不默认下载, 将ner_albert_people_1998覆盖到安装目录macropodus/data/model); 110 | summary = ["美丽的广西是我国华南地区的一颗璀璨的明珠,山清水秀生态美,风生水起万象新。", "广西壮族自治区,简称“桂”,是中华人民共和国省级行政区"] 111 | res_ner = macropodus.ner(summary[0]) 112 | print(res_ner) 113 | res_ners = macropodus.ners(summary) 114 | print(res_ners) 115 | 116 | # 词性标注, 117 | # pos tag, albert+bilstm+crf网络架构, 最大支持126个字符; 118 | # 需要安装tensorflow==1.15.0(pip安装不默认下载, 1.15.0以下未实验, 1.13以上应该可以) 119 | # 需要下载模型(pip安装不默认下载, 将tag_albert_people_1998覆盖到安装目录macropodus/data/model); 120 | res_postag = macropodus.postag(summary[0]) 121 | print(res_postag) 122 | res_postags = macropodus.postags(summary) 123 | print(res_postags) 124 | 125 | -------------------------------------------------------------------------------- /macropodus/summarize/feature_base/word_significance.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/26 23:42 4 | # @author :Mo 5 | # @function :text summarize of extraction of word significance 6 | # @paper :The Automatic 
Creation of Literature Abstracts* 7 | # @url :http://courses.ischool.berkeley.edu/i256/f06/papers/luhn58.pdf 8 | 9 | 10 | from macropodus.data.words_common.stop_words import stop_words 11 | from macropodus.preprocess.tools_ml import extract_chinese 12 | from macropodus.preprocess.tools_ml import macropodus_cut 13 | from macropodus.preprocess.tools_ml import cut_sentence 14 | from collections import Counter 15 | 16 | 17 | class WordSignificanceSum: 18 | def __init__(self): 19 | """ 20 | features: 21 | 1. words mix in title and sentence 22 | 2. keywords in sentence 23 | 3. Position of sentence 24 | 4. Length of sentence 25 | """ 26 | self.algorithm = 'word_significance' 27 | self.stop_words = stop_words.values() 28 | self.num = 0 29 | 30 | def summarize(self, text, num=320): 31 | """ 32 | 根据词语意义确定中心句 33 | :param text: str 34 | :param num: int 35 | :return: list 36 | """ 37 | # 切句 38 | if type(text) == str: 39 | self.sentences = cut_sentence(text) 40 | elif type(text) == list: 41 | self.sentences = text 42 | else: 43 | raise RuntimeError("text type must be list or str") 44 | # 切词 45 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 46 | if word.strip()] for sentence in self.sentences] 47 | # 去除停用词等 48 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 49 | # 词频统计 50 | self.words = [] 51 | for sen in self.sentences_cut: 52 | self.words = self.words + sen 53 | self.word_count = dict(Counter(self.words)) 54 | self.word_count_rank = sorted(self.word_count.items(), key=lambda f: f[1], reverse=True) 55 | # 最小句子数 56 | num_min = min(num, int(len(self.word_count)*0.6)) 57 | # 词语排序, 按照词频 58 | self.word_rank = [wcr[0] for wcr in self.word_count_rank][0:num_min] 59 | res_sentence = [] 60 | # 抽取句子, 顺序, 如果词频高的词语在句子里, 则抽取 61 | for word in self.word_rank: 62 | for i in range(0, len(self.sentences)): 63 | # 当返回关键句子到达一定量, 则结束返回 64 | if len(res_sentence) < num_min: 65 | added = False 66 | for sent in res_sentence: 67 | if sent == self.sentences[i]: added = True 68 | if (added == False and word in self.sentences[i]): 69 | res_sentence.append(self.sentences[i]) 70 | break 71 | # 只是计算各得分,没什么用 72 | len_sentence = len(self.sentences) 73 | res_sentence = [(1-1/(len_sentence+len_sentence/(k+1)), rs) for k, rs in enumerate(res_sentence)] 74 | return res_sentence 75 | 76 | 77 | if __name__ == "__main__": 78 | doc = "PageRank算法简介。" \ 79 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 80 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 81 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 82 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 83 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 84 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 85 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 86 | "和投票目标的等级来决定新的等级。简单的说, " \ 87 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 88 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 89 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 90 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 91 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 92 | 93 | doc1 = "多知网. 
"\ 94 | "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 95 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 96 | "该基金认缴出资总规模为人民币3.01亿元。" \ 97 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 98 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 99 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 100 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 101 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 102 | 103 | ws = WordSignificanceSum() 104 | res = ws.summarize(doc, num=6) 105 | for r in res: 106 | print(r) 107 | -------------------------------------------------------------------------------- /test/survey_report/nlp_platfom_survey.md: -------------------------------------------------------------------------------- 1 | # 中文自然语言处理(nlp)工具调研与汇总(截至2019.11.16) 2 | 3 | 4 | ## 1.常见平台与功能 5 | 平台|语言|star|year|中文分词|词性标注|依存句法|实体识别|关键词提取|文本摘要|文本聚类|情感识别|文本相似|关系抽取|free| 6 | ---|---|---|---|---|---|---|---|---|---|---|---|---|---|--- 7 | jieba|python|20.8k|7/0.5|是|是|否|否|是|否|否|是|否|否|MIT 8 | THULAC-Python|python|1.2k|4/1|是|是|否|否|否|否|否|否|否|否|MIT 9 | pkuseg-python|python|4.3k|0.9/0.5|是|是|否|否|否|否|否|否|否|否|MIT 10 | snownlp|python|4.4k|6/3/*|是|是|否|否|是|是|否|是|是|否|MIT 11 | deepnlp|python|1.3k|2/2/!|是|是|是|是|是|是|否|否|否|否|MIT 12 | fastNLP|python|0.9k|2/0|是|是|否|是|否|否|否|是|否|否|MIT 13 | Jiagu|python|0.97k|0.9/0|是|是|是|是|是|是|是|是|否|是|MIT 14 | YaYaNLP|python|0.05k|4/4/!|是|是|否|是|否|否|否|否|否|否|MIT 15 | HanLP|java|16.4k|0.9/0|是|是|是|是|是|是|是|是|否|否|MIT 16 | ansj-seg|java|5.2k|3/0.4|是|是|是|是|是|是|否|是|否|否|Apache-2.0 17 | word|java|1.4k|5/1|是|是|否|是|否|否|否|否|是|否|Apache-2.0 18 | Jcseg|java|0.69k|3/0|是|是|是|是|是|是|否|否|否|否|Apache-2.0 19 | ik-analyzer|java|0.53k|9/9/!|是|是|是|否|否|否|否|否|否|否|LGPL-3.0 20 | CoreNLP|java|6.7k|9/9/!|是|是|是|是|是|否|否|否|否|否|GUN2.0 21 | fnlp|java|2.2k|6/0.9/!|是|是|是|是|是|是|是|否|否|否|LGPL-3.0 22 | NLPIR|java|2.5k|?/1/!|是|是|否|否|否|否|是|否|否|否|not open 23 | sego|go|1.2k|6/1/!|是|是|否|否|否|否|是|否|否|否|Apache-2.0 24 | ltp|c++|2.3k|6/1/!|是|是|是|是|是|是|是|否|否|否|LGPL-3.0 25 | PaddleNLP|c++|3.4k|6/1/!|是|是|是|是|是|是|是|是|是|是|Apache-2.0 26 | 27 | 28 | ##备注 29 | * 1.year中"6/3/*"表示"项目开始时间/最近更新时间/在维护";!表示不维护,超过一年不维护,不回复issiue则认为放弃; 30 | * 2.其他功能 31 | * snownlp: 拼音转换,繁简转换,tf-idf计算,切句子 32 | * deepnlp: tensorflow1.4训练的各种模型 33 | * NLPIR: 检索,敏感信息,文档去重,编码转换 34 | * Ltp: 事件抽取,srl,时间抽取, 35 | * HanLP: 人民日报2014分词,文本推荐(相似度),索引分词 36 | * ansj-seg: 比较混乱,主页没有调用说明,词典是个大杂烩 37 | * word: 词频统计、词性标注、同义标注、反义标注、拼音标注 38 | * ltp: 特征裁剪策略,语义角色标注 39 | * PaddleNLP: Paddle训练,以及基础包,enienr生成等各种任务 40 | * 3.更多的统计学习方法 41 | 摘要,情感识别(酸甜苦辣),新词发现,实体与关系抽取,领域分类,生成 42 | 43 | 44 | ##分词算法 45 | * 1.jieba 46 | * 1.1 基于前缀词典实现高效的词图扫描,生成句子中汉字所有可能成词情况所构成的有向无环图 (DAG) 47 | * 1.2 采用了动态规划查找最大概率路径, 找出基于词频的最大切分组合 48 | * 1.3 对于未登录词,采用了基于汉字成词能力的 HMM 模型,使用了 Viterbi 算法 49 | * 2.THULAC,pkuseg,Jiagu,fastNLP 50 | * 2.1 CRF(char,word,elmo,bert) 51 | * 2.2 feature+CRF 52 | * 3.ansj-seg 53 | * 3.1 n-Gram+CRF+HMM 54 | * 4.HanLP 55 | * 4.1 n-Gram, CRF 56 | * 5.sego 57 | * 5.1 基于词频的最短路径加动态规划 58 | * 6.Ltp 59 | * 6.1 bilstm+crf 60 | * 6.2 英文、URI一类特殊词识别规则 61 | 利用空格等自然标注线索 62 | 在统计模型中融入词典信息 63 | 从大规模未标注数据中统计的字间互信息、上下文丰富程度 64 | * 7.PaddleNLP 65 | * 7.1 gru+crf 66 | * 8.word(最大匹配法、最大概率法、最短路径法) 67 | * 8.1 正向最大匹配算法,逆向最大匹配算法,正向最小匹配算法,逆向最小匹配算法 68 | * 8.2 双向最大匹配算法,双向最小匹配算法,双向最大最小匹配算法 69 | * 8.3 全切分算法,最少词数算法,最大Ngram分值算法,最短路径法 70 | * 8.4 语义切分:扩充转移网络法、知识分词语义分析法、邻接约束法、综合匹配法、后缀分词法、特征词库法、矩阵约束法、语法分析法 71 | 72 | 73 | ## 工具包地址 74 | * jiba:[https://github.com/fxsjy/jieba](https://github.com/fxsjy/jieba) 75 | * HanLP:[https://github.com/hankcs/HanLP](https://github.com/hankcs/HanLP) 76 | 
* CoreNLP:[https://github.com/stanfordnlp/CoreNLP](https://github.com/stanfordnlp/CoreNLP) 77 | * ansj-seg:[https://github.com/lionsoul2014/jcseg](https://github.com/lionsoul2014/jcseg) 78 | * THULAC-Python:[https://github.com/thunlp/THULAC-Python](https://github.com/thunlp/THULAC-Python) 79 | * pkuseg-python:[https://github.com/lancopku/pkuseg-python](https://github.com/lancopku/pkuseg-python) 80 | * snownlp:[https://github.com/isnowfy/snownlp](https://github.com/isnowfy/snownlp) 81 | * deepnlp:[https://github.com/rockingdingo/deepnlp](https://github.com/rockingdingo/deepnlp) 82 | * fastNLP:[https://github.com/fastnlp/fastNLP](https://github.com/fastnlp/fastNLP) 83 | * Jiagu:[https://github.com/ownthink/Jiagu](https://github.com/ownthink/Jiagu) 84 | * xmnlp:[https://github.com/SeanLee97/xmnlp](https://github.com/SeanLee97/xmnlp) 85 | * word:[https://github.com/ysc/word](https://github.com/ysc/word) 86 | * jcseg:[https://github.com/lionsoul2014/jcseg](https://github.com/lionsoul2014/jcseg) 87 | * paddleNLP:[https://github.com/PaddlePaddle/models](https://github.com/PaddlePaddle/models) 88 | * sego:[https://github.com/huichen/sego](https://github.com/huichen/sego) 89 | * ik-analyzer:[https://github.com/wks/ik-analyzer](https://github.com/wks/ik-analyzer) 90 | * fnlp:[https://github.com/FudanNLP/fnlp](https://github.com/FudanNLP/fnlp) 91 | * NLPIR:[https://github.com/NLPIR-team/NLPIR](https://github.com/NLPIR-team/NLPIR) 92 | 93 | ### 94 | 新词发现: 95 | 1. Matrix67: The Aha Moments的信息熵方法: [互联网时代的社会语言学:基于SNS的文本数据挖掘](http://www.matrix67.com/blog/archives/5044) 96 | 1.词频、左右熵(丰度,字符组合左右邻字的丰富程度, -p*log(p))、 97 | 2.互信息(凝固度,内部凝聚程度, pmi = p(x,y)*log(p(x,y)/(p(x)*p(y))))等构建得分函数 98 | 2. HanLP的长短语构造方法: [基于互信息和左右信息熵的短语提取识别](https://www.hankcs.com/nlp/extraction-and-identification-of-mutual-information-about-the-phrase-based-on-information-entropy.html) 99 | 1.切词(只统计词典),统计词语共现(一阶、二阶、三阶) 100 | 2.左右熵、互信息。合并词典词语,构建短语 101 | 3. 
SmoothNLP:["新词发现"算法探讨与优化-SmoothNLP](https://zhuanlan.zhihu.com/p/80385615) 102 | 1.左右熵权重: Ew =log((El*e^Er+Er*e^EL)/|Er-El|) 103 | 2.平均互信息AMI:(1/n) * log(p(w)/(p(1)p(2)...p(n))) 104 | 3.过滤条件:对在candidate ngram中, 首字或者尾字出现次数特别多的进行筛选, 如"XX的,美丽的,漂亮的"剔出字典 105 | 106 | -------------------------------------------------------------------------------- /macropodus/preprocess/tools_common.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/19 0:15 4 | # @author : Mo 5 | # @function: common tools of macropodus 6 | 7 | 8 | from macropodus.conf.path_log import get_logger_root 9 | import json 10 | import os 11 | import re 12 | 13 | 14 | re_continue = re.compile("[A-Za-z0-9.@_]", re.U) 15 | re_zh_cn = re.compile("([\u4E00-\u9FD5]+)", re.U) 16 | 17 | 18 | logger = get_logger_root() 19 | 20 | 21 | __all__ = ["txt_read", 22 | "txt_write", 23 | "save_json", 24 | "load_json", 25 | "delete_file"] 26 | 27 | 28 | def txt_read(path_file, encode_type='utf-8'): 29 | """ 30 | 读取txt文件,默认utf8格式, 不能有空行 31 | :param file_path: str, 文件路径 32 | :param encode_type: str, 编码格式 33 | :return: list 34 | """ 35 | list_line = [] 36 | try: 37 | file = open(path_file, 'r', encoding=encode_type) 38 | while True: 39 | line = file.readline().strip() 40 | if not line: 41 | break 42 | list_line.append(line) 43 | file.close() 44 | except Exception as e: 45 | logger.info(str(e)) 46 | finally: 47 | return list_line 48 | 49 | 50 | def txt_write(list_line, file_path, type='w', encode_type='utf-8'): 51 | """ 52 | txt写入list文件 53 | :param listLine:list, list文件,写入要带"\n" 54 | :param filePath:str, 写入文件的路径 55 | :param type: str, 写入类型, w, a等 56 | :param encode_type: 57 | :return: 58 | """ 59 | try: 60 | file = open(file_path, type, encoding=encode_type) 61 | file.writelines(list_line) 62 | file.close() 63 | except Exception as e: 64 | logger.info(str(e)) 65 | 66 | 67 | def save_json(json_lines, json_path, encoding='utf-8', indent=4): 68 | """ 69 | 保存json, 70 | :param json_lines: json 71 | :param path: str 72 | :return: None 73 | """ 74 | with open(json_path, 'w', encoding=encoding) as fj: 75 | fj.write(json.dumps(json_lines, ensure_ascii=False, indent=indent)) 76 | fj.close() 77 | 78 | 79 | def load_json(path, encoding="utf-8"): 80 | """ 81 | 获取json, json存储为[{}]格式, like [{'大漠帝国':132}] 82 | :param path: str 83 | :return: json 84 | """ 85 | with open(path, 'r', encoding=encoding) as fj: 86 | model_json = json.load(fj) 87 | return model_json 88 | 89 | 90 | def delete_file(path): 91 | """ 92 | 删除一个目录下的所有文件 93 | :param path: str, dir path 94 | :return: None 95 | """ 96 | for i in os.listdir(path): 97 | # 取文件或者目录的绝对路径 98 | path_children = os.path.join(path, i) 99 | if os.path.isfile(path_children): 100 | if path_children.endswith(".h5") or path_children.endswith(".json") or "events" in path_children or "trace" in path_children: 101 | os.remove(path_children) 102 | else:# 递归, 删除目录下的所有文件 103 | delete_file(path_children) 104 | 105 | 106 | def get_dir_files(path_dir): 107 | """ 108 | 递归获取某个目录下的所有文件(单层) 109 | :param path_dir: str, like '/home/data' 110 | :return: list, like ['2019_12_5.txt'] 111 | """ 112 | 113 | def get_dir_files_func(file_list, dir_list, root_path=path_dir): 114 | """ 115 | 递归获取某个目录下的所有文件 116 | :param root_path: str, like '/home/data' 117 | :param file_list: list, like [] 118 | :param dir_list: list, like [] 119 | :return: None 120 | """ 121 | # 获取该目录下所有的文件名称和目录名称 122 | dir_or_files = os.listdir(root_path) 123 | for dir_file in dir_or_files: 
124 | # 获取目录或者文件的路径 125 | dir_file_path = os.path.join(root_path, dir_file) 126 | # 判断该路径为文件还是路径 127 | if os.path.isdir(dir_file_path): 128 | dir_list.append(dir_file_path) 129 | # 递归获取所有文件和目录的路径 130 | get_dir_files_func(dir_file_path, file_list, dir_list) 131 | else: 132 | file_list.append(dir_file_path) 133 | 134 | # 用来存放所有的文件路径 135 | _files = [] 136 | # 用来存放所有的目录路径 137 | dir_list = [] 138 | get_dir_files_func(_files, dir_list, path_dir) 139 | return _files 140 | 141 | 142 | def get_all_dirs_files(path_dir): 143 | """ 144 | 递归获取某个目录下的所有文件(所有层, 包括子目录) 145 | :param path_dir: str, like '/home/data' 146 | :return: list, like ['2020_01_08.txt'] 147 | """ 148 | path_files = [] 149 | def get_path_files(path_dir): 150 | """ 151 | 递归函数, 获取某个目录下的所有文件 152 | :param path_dir: str, like '/home/data' 153 | :return: list, like ['2020_01_08.txt'] 154 | """ 155 | for root, dirs, files in os.walk(path_dir): 156 | for fi in files: # 递归的终止条件 157 | path_file = os.path.join(root, fi) 158 | path_files.append(path_file) 159 | for di in dirs: # 语间目录便继续递归 160 | path_dir = os.path.join(root, di) 161 | get_path_files(path_dir) 162 | get_path_files(path_dir) 163 | return path_files 164 | -------------------------------------------------------------------------------- /macropodus/tookit/trie_tree/trie_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/19 20:40 4 | # @author :Mo 5 | # @function :TrieTree of keywords find, 只返回查全的情况, 查找句子中的关键词(例如影视名、人名、关键词、实体等) 6 | 7 | 8 | from macropodus.conf.path_log import get_logger_root 9 | 10 | 11 | logger = get_logger_root() 12 | 13 | 14 | class TrieNode: 15 | """ 16 | 前缀树节点-链表 17 | """ 18 | def __init__(self): 19 | self.child = {} 20 | 21 | 22 | class TrieTree: 23 | """ 24 | 前缀树构建, 新增关键词, 关键词词语查找等 25 | """ 26 | def __init__(self): 27 | self.algorithm = "trietree" 28 | self.root = TrieNode() 29 | 30 | def add_keyword(self, keyword): 31 | """ 32 | 新增一个关键词 33 | :param keyword: str, 构建的关键词 34 | :return: None 35 | """ 36 | node_curr = self.root 37 | for word in keyword: 38 | if node_curr.child.get(word) is None: 39 | node_next = TrieNode() 40 | node_curr.child[word] = node_next 41 | node_curr = node_curr.child[word] 42 | # 每个关键词词后边, 加入end标志位 43 | if node_curr.child.get('[END]') is None: 44 | node_next = TrieNode() 45 | node_curr.child['[END]'] = node_next 46 | node_curr = node_curr.child['[END]'] 47 | logger.info("add {} success!".format("".join(keyword))) 48 | 49 | def delete_keyword(self, keyword): 50 | """ 51 | 删除一个关键词 52 | :param keyword: str, 构建的关键词 53 | :return: None 54 | """ 55 | node_curr = self.root 56 | flag = 1 57 | for word in keyword: 58 | if node_curr.child.get(word) is not None: 59 | node_curr = node_curr.child[word] 60 | else: 61 | flag = 0 62 | # 每个关键词词后边, 加入end标志位 63 | if node_curr.child.get('[END]') is not None and flag == 1: 64 | node_curr.child.pop('[END]') 65 | else: 66 | logger.info("{} is not in trietree, delete keyword faild!".format("".join(keyword))) 67 | 68 | def add_keywords_from_list(self, keywords): 69 | """ 70 | 新增关键词s, 格式为list 71 | :param keyword: list, 构建的关键词 72 | :return: None 73 | """ 74 | for keyword in keywords: 75 | self.add_keyword(keyword) 76 | 77 | def find_keyword(self, sentence): 78 | """ 79 | 从句子中提取关键词, 可提取多个 80 | :param sentence: str, 输入的句子 81 | :return: list, 提取到的关键词 82 | """ 83 | assert type(sentence) == str 84 | if not sentence: # 空格字符不取 85 | return [] 86 | 87 | node_curr = self.root # 关键词的头, 每遍历完一遍后需要重新初始化 88 | index_last = 
len(sentence) 89 | keyword_list = [] 90 | keyword = '' 91 | count = 0 92 | for word in sentence: 93 | count += 1 94 | if node_curr.child.get(word) is None: # 查看有无后缀, 即匹配到一个关键词最后一个字符的时候 95 | if keyword: # 提取到的关键词(也可能是前面的几位) 96 | if node_curr.child.get('[END]') is not None: # 取以end结尾的关键词 97 | keyword_list.append(keyword) 98 | if self.root.child.get(word) is not None: # 处理连续的关键词情况, 如"第九区流浪地球" 99 | keyword = word 100 | node_curr = self.root.child[word] 101 | else: # 102 | keyword = '' 103 | node_curr = self.root # 重新初始化 104 | else: # 有后缀就加到name里边 105 | keyword = keyword + word 106 | node_curr = node_curr.child[word] 107 | if count == index_last: # 实体结尾的情况 108 | if node_curr.child.get('[END]') is not None: 109 | keyword_list.append(keyword) 110 | return keyword_list 111 | 112 | def match_keyword(self, keyword): 113 | """ 114 | 判断keyword在不在trietree里边 115 | :param keyword: str, input word 116 | :return: boolean, True or False 117 | """ 118 | node = self.root 119 | for kw in keyword: 120 | if not node.child.get(kw): 121 | return False 122 | node = node.child[kw] 123 | if not node.child.get('[END]'): 124 | return False 125 | return True 126 | 127 | 128 | def get_trie_tree_class(keywords): 129 | """ 130 | 根据list关键词,初始化trie树 131 | :param keywords: list, input 132 | :return: objext, 返回实例化的trie 133 | """ 134 | trie = TrieTree() 135 | trie.add_keywords_from_list(keywords) 136 | return trie 137 | 138 | 139 | if __name__ == "__main__": 140 | print("".join("你好呀")) 141 | # 测试1, class实例 142 | trie = TrieTree() 143 | keywords = ['英雄', '人在囧途', '那些年,我们一起追过的女孩', '流浪地球', '华娱', 144 | '犬夜叉', '火影', '名侦探柯南', '约会大作战', '名作之壁', '动漫', 145 | '乃木坂46', 'akb48', '飘', '最后的武士', '约会', '英雄2', '日娱', 146 | '2012', '第九区', '星球大战', '侏罗纪公园', '泰坦尼克号', 'Speed'] 147 | keywords = [list(keyword.strip()) for keyword in keywords] 148 | trie.add_keywords_from_list(keywords) # 创建树 149 | keyword = trie.find_keyword('第九区约会, 侏罗纪公园和泰坦尼克号泰坦尼克号') 150 | print(keyword) 151 | gg = trie.delete_keyword('英雄') 152 | gg = trie.delete_keyword('英雄3') 153 | 154 | keyword = trie.match_keyword('英雄') 155 | keyword2 = trie.match_keyword('英雄2') 156 | 157 | print(keyword) 158 | 159 | 160 | # 测试2, get树 161 | trie_tree = get_trie_tree_class(keywords) # 创建树并返回实例化class 162 | while True: 163 | print("sihui请你输入:") 164 | input_ques = input() 165 | keywords = trie_tree.find_keyword(input_ques) 166 | print(keywords) 167 | -------------------------------------------------------------------------------- /macropodus/summarize/topic_base/topic_lda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/31 21:33 4 | # @author :Mo 5 | # @function :topic model of LDA 6 | # @paper :Latent Dirichlet Allocation 7 | 8 | 9 | from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit 10 | from macropodus.data.words_common.stop_words import stop_words 11 | from macropodus.preprocess.tools_ml import macropodus_cut 12 | from macropodus.preprocess.tools_ml import cut_sentence 13 | # sklearn 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | from sklearn.decomposition import LatentDirichletAllocation 16 | import numpy as np 17 | 18 | 19 | class LDASum: 20 | def __init__(self): 21 | self.stop_words = stop_words.values() 22 | self.algorithm = 'lda' 23 | 24 | def summarize(self, text, num=8, topic_min=6, judge_topic=None): 25 | """ 26 | LDA 27 | :param text: str 28 | :param num: int 29 | :param topic_min: int 30 | :param judge_topic: boolean 31 | :return: 32 | """ 33 
| # 切句 34 | if type(text) == str: 35 | self.sentences = cut_sentence(text) 36 | elif type(text) == list: 37 | self.sentences = text 38 | else: 39 | raise RuntimeError("text type must be list or str") 40 | len_sentences_cut = len(self.sentences) 41 | # 切词 42 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 43 | if word.strip()] for sentence in self.sentences] 44 | # 去除停用词等 45 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 46 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 47 | # # 计算每个句子的tf 48 | # vector_c = CountVectorizer(ngram_range=(1, 2), stop_words=self.stop_words) 49 | # tf_ngram = vector_c.fit_transform(self.sentences_cut) 50 | # 计算每个句子的tfidf 51 | tf_ngram = tfidf_fit(self.sentences_cut) 52 | # 主题数, 经验判断 53 | topic_num = min(topic_min, int(len(sentences_cut) / 2)) # 设定最小主题数为3 54 | lda = LatentDirichletAllocation(n_components=topic_num, max_iter=32, 55 | learning_method='online', 56 | learning_offset=50., 57 | random_state=2019) 58 | res_lda_u = lda.fit_transform(tf_ngram.T) 59 | res_lda_v = lda.components_ 60 | 61 | if judge_topic: 62 | ### 方案一, 获取最大那个主题的k个句子 63 | ################################################################################## 64 | topic_t_score = np.sum(res_lda_v, axis=-1) 65 | # 对每列(一个句子topic_num个主题),得分进行排序,0为最大 66 | res_nmf_h_soft = res_lda_v.argsort(axis=0)[-topic_num:][::-1] 67 | # 统计为最大每个主题的句子个数 68 | exist = (res_nmf_h_soft <= 0) * 1.0 69 | factor = np.ones(res_nmf_h_soft.shape[1]) 70 | topic_t_count = np.dot(exist, factor) 71 | # 标准化 72 | topic_t_count /= np.sum(topic_t_count, axis=-1) 73 | topic_t_score /= np.sum(topic_t_score, axis=-1) 74 | # 主题最大个数占比, 与主题总得分占比选择最大的主题 75 | topic_t_tc = topic_t_count + topic_t_score 76 | topic_t_tc_argmax = np.argmax(topic_t_tc) 77 | # 最后得分选择该最大主题的 78 | res_nmf_h_soft_argmax = res_lda_v[topic_t_tc_argmax].tolist() 79 | res_combine = {} 80 | for l in range(len_sentences_cut): 81 | res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l] 82 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 83 | ##################################################################################### 84 | else: 85 | ### 方案二, 获取最大主题概率的句子, 不分主题 86 | res_combine = {} 87 | for i in range(len_sentences_cut): 88 | res_row_i = res_lda_v[:, i] 89 | res_row_i_argmax = np.argmax(res_row_i) 90 | res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax] 91 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 92 | num_min = min(num, int(len_sentences_cut * 0.6)) 93 | return score_sen[0:num_min] 94 | 95 | 96 | if __name__ == '__main__': 97 | lda = LDASum() 98 | doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 99 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 100 | "该基金认缴出资总规模为人民币3.01亿元。" \ 101 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 102 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 103 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 104 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 105 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 106 | 107 | doc = "PageRank算法简介。" \ 108 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 109 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 110 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 111 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 112 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 113 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 114 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 115 | "和投票目标的等级来决定新的等级。简单的说, " \ 116 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 117 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 118 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 119 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 120 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 121 | 122 | sum = lda.summarize(doc, num=8) 123 | for i in sum: 124 | print(i) 125 | -------------------------------------------------------------------------------- /test/evaluate/tet_nlg_yongzhuo.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/5/14 21:42 4 | # @author : Mo 5 | # @function: nlg-yongzhuo 6 | 7 | 8 | from macropodus.summarize.yongzhuo_nlg import * 9 | 10 | doc = """PageRank算法简介。" \ 11 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 12 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 13 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 14 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 15 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 16 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 17 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 18 | "和投票目标的等级来决定新的等级。简单的说, " \ 19 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 20 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 21 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 22 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 23 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 """.replace(" ", "").replace('"', '') 24 | 25 | # 是否使用多进程, fs可以填其中一个或几个 text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf 26 | res_score = text_summarize(doc, multi_process=False, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]) 27 | for rs in res_score: 28 | print(rs) 29 | 30 | 31 | docs ="和投票目标的等级来决定新的等级.简单的说。" \ 32 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 33 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \ 34 | "业界急需一种相对比较准确的网页重要性计算方法。" \ 35 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 36 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 37 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。" \ 38 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \ 39 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 40 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。" \ 41 | "即数量假设:一个网页被越多的其他页面链接,就越重)。" \ 42 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 43 | "总的来说就是一句话,从全局角度考虑,获取重要的信。" 44 | # 1. word_significance 45 | sums_word_significance = word_significance.summarize(docs, num=6) 46 | print("word_significance:") 47 | for sum_ in sums_word_significance: 48 | print(sum_) 49 | 50 | # 2. text_pronouns 51 | sums_text_pronouns = text_pronouns.summarize(docs, num=6) 52 | print("text_pronouns:") 53 | for sum_ in sums_text_pronouns: 54 | print(sum_) 55 | 56 | # 3. text_teaser 57 | sums_text_teaser = text_teaser.summarize(docs, num=6) 58 | print("text_teaser:") 59 | for sum_ in sums_text_teaser: 60 | print(sum_) 61 | # 4. mmr 62 | sums_mmr = mmr.summarize(docs, num=6) 63 | print("mmr:") 64 | for sum_ in sums_mmr: 65 | print(sum_) 66 | # 5.text_rank 67 | sums_text_rank = text_rank.summarize(docs, num=6) 68 | print("text_rank:") 69 | for sum_ in sums_text_rank: 70 | print(sum_) 71 | # 6. lda 72 | sums_lda = lda.summarize(docs, num=6) 73 | print("lda:") 74 | for sum_ in sums_lda: 75 | print(sum_) 76 | # 7. lsi 77 | sums_lsi = lsi.summarize(docs, num=6) 78 | print("mmr:") 79 | for sum_ in sums_lsi: 80 | print(sum_) 81 | # 8. nmf 82 | sums_nmf = nmf.summarize(docs, num=6) 83 | print("nmf:") 84 | for sum_ in sums_nmf: 85 | print(sum_) 86 | # 9. 
lead3 87 | sums_lead3 = lead3.summarize(docs, num=6) 88 | print("lead3:") 89 | for sum_ in sums_lead3: 90 | print(sum_) 91 | 92 | 93 | 94 | 95 | 96 | docs = """AutoML机器学习自动化与NNI 97 | 原创大漠帝国 最后发布于2020-02-29 19:46:21 阅读数 221 收藏 98 | 编辑 展开 99 | 一、AutoML简介 100 | 101 |         AutoML(Automated Machine Learning),中文可以翻译为自动机器学习,我比较喜欢叫它“机器学习自动化”,更加接近人们所津津乐道的通用人工智能吧。 102 | 103 |         人们一直有个朴素的想法,可以有一个通用的AI系统,它包罗万象,能够对整个宇宙进行建模,对我们遇到的一切问题,都给出解决办法。这在幻想书籍中数见不新鲜,比如漫威电影中钢铁侠的人工智能贾维斯,又比如说芯片系统流派的网络小说。不过这些大概可以算是人工智能的高级模式了吧,人们还是很宽容的,没有期待一步到位。 104 | 105 |        现在算是AI的高潮期,尤其是以深度学习DL为代表的当代人工智能技术的成功,给以人类以无限的想象空间。那么,降低要求,以DL技术为基础,去开发一个低配版通用人工智能,也是可以的吧。所以,随着人工智能的火爆,2014年以来,AutoML也越发火热起来。 106 | 107 |        深度学习时代的鲜明特征是大数据量、深层次网络、特征学习与端到端学习。我们希望能够从数据一步得到模型,而不需要其他的什么人为参与过程。如果再加上语音助手什么的,或许我们就能达到浅层次通用人工智能的目标呢。在深度学习DL模型架构难以取得更大突破的时候,给它再开辟一条道路呢。一如蒸馏模型,又如MobileNet。 108 | 109 |         工程化和应用级市场,更能带来意想不到的惊喜。这一点,从近年来微软开源的AutoML工具NNI大受欢迎中,可以管中窥豹。 110 | 111 |   112 | 113 | 二、AutoML特性 114 | 115 |         从比较出名的开源Auto平台、互联网大厂AutoML云产品,以及AI公司的AutoML软件来看,一般包括特征工程(FE,Auto feature engine)、神经网络搜索(NAS,Neural Architecture Search) 和超参数优化(HPO,Hyper-parameter optimization) 等功能,如下图所示: 116 | 117 | 118 | 119 |         可能还存在其他一些小功能,如数据增强(几何,颜色), 激活函数(swish,Hybrid DNN), 归一化方法(Switchable Normalization, BN, IN, LN, GN), 优化方法(Neural Optimizer Search, sgd,rmsprop,adam, 衰减, 函数的组合), 优化目标(AM-LFS, Learning to teach with dynamic loss functions), 模型剪枝(AMC), 模型量化(HAQ), 部署上线等。 120 | 121 |         AutoML优点:可用于传统机器学习、图像等较成熟领域,自动化摒弃了人为因素的干扰、增强泛化性; 122 | 123 |                      缺点:消耗资源大、优化方法可能达不到经验模型甚至是严重偏向。 124 | 125 |   126 | 127 | 三、 NNI 128 | 129 |         NNI (Neural Network Intelligence,[翻译为神经网络智能?]) 是微软开源的自动机器学习(AutoML)的Python工具包。NNI 通过 nni_manager模块 等管理 AutoML 的 Experiment (实验),调度并运行各种调优算法生成的 Trial (尝试) 任务,来完成搜索最优神经网络架构、超参数等。同时支持本机,远程服务器,单机,多机,OpenPAI,Kubeflow,K8S和其它云服务等训练环境。 130 | 131 |         对比其他开源项目,或大公司产品可以发现,NNI支持的神经网络结构搜索、超参数优化等调优算法更多,功能最强大。 132 | 133 |         以我的使用体验来看,NNI更像一个黑盒,浅度用户使用可能比较舒服。使用nni的SDK可以完美嵌入自己的网络结构进行超参数优化,详情如下: 134 | 135 | 136 | 137 |         超参数优化需要定义搜索空间search_space.json,NNI配置config.yml,以及主程序调用main.py函数。 138 | 139 |         此外,NNI还需要用特定命令行启动,自由度似乎不太够。 140 | 141 | 希望对你有所帮助! 142 | ———————————————— 143 | 版权声明:本文为CSDN博主「大漠帝国」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。 144 | 原文链接:https://blog.csdn.net/rensihui/article/details/104578756""" 145 | 146 | 147 | sums_textrank_textrank4zh = text_rank.summarize(docs, num=6, model_type="textrank_textrank4zh") 148 | print("textrank_textrank4zh:") 149 | for sum_ in sums_textrank_textrank4zh: 150 | print(sum_) 151 | 152 | sums_textrank_sklearn = text_rank.summarize(docs, num=6, model_type="textrank_sklearn") 153 | print("textrank_sklearn:") 154 | for sum_ in sums_textrank_sklearn: 155 | print(sum_) 156 | 157 | # gensim自带的textrank只支持英文, 分隔符为". ", "? ", "! 
" 158 | sums_textrank_gensim = text_rank.summarize(docs, num=100, model_type="textrank_gensim") 159 | print("textrank_gensim:") 160 | for sum_ in sums_textrank_gensim: 161 | print(sum_) 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /macropodus/network/layers/crf.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | 3 | # author: BrikerMan 4 | # contact: eliyar917@gmail.com 5 | # blog: https://eliyar.biz 6 | # code from: 7 | 8 | # file: crf.py 9 | # time: 2019-06-28 14:33 10 | 11 | 12 | import tensorflow as tf 13 | 14 | 15 | class CRF(tf.keras.layers.Layer): 16 | """ 17 | Conditional Random Field layer (tf.keras) 18 | `CRF` can be used as the last layer in a network (as a classifier). Input shape (features) 19 | must be equal to the number of classes the CRF can predict (a linear layer is recommended). 20 | Note: the loss and accuracy functions of networks using `CRF` must 21 | use the provided loss and accuracy functions (denoted as loss and viterbi_accuracy) 22 | as the classification of sequences are used with the layers internal weights. 23 | Args: 24 | output_dim (int): the number of labels to tag each temporal input. 25 | Input shape: 26 | nD tensor with shape `(batch_size, sentence length, num_classes)`. 27 | Output shape: 28 | nD tensor with shape: `(batch_size, sentence length, num_classes)`. 29 | """ 30 | 31 | def __init__(self, 32 | output_dim, 33 | mode='reg', 34 | supports_masking=False, 35 | transitions=None, 36 | **kwargs): 37 | self.transitions = None 38 | super(CRF, self).__init__(**kwargs) 39 | self.output_dim = int(output_dim) 40 | self.mode = mode 41 | if self.mode == 'pad': 42 | self.input_spec = [tf.keras.layers.InputSpec(min_ndim=3), tf.keras.layers.InputSpec(min_ndim=2)] 43 | elif self.mode == 'reg': 44 | self.input_spec = tf.keras.layers.InputSpec(min_ndim=3) 45 | else: 46 | raise ValueError 47 | self.supports_masking = supports_masking 48 | self.sequence_lengths = None 49 | 50 | def get_config(self): 51 | config = { 52 | 'output_dim': self.output_dim, 53 | 'mode': self.mode, 54 | 'supports_masking': self.supports_masking, 55 | 'transitions': tf.keras.backend.eval(self.transitions) 56 | } 57 | base_config = super(CRF, self).get_config() 58 | return dict(list(base_config.items()) + list(config.items())) 59 | 60 | def build(self, input_shape): 61 | if self.mode == 'pad': 62 | assert len(input_shape) == 2 63 | assert len(input_shape[0]) == 3 64 | assert len(input_shape[1]) == 2 65 | f_shape = tf.TensorShape(input_shape[0]) 66 | input_spec = [tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]}), 67 | tf.keras.layers.InputSpec(min_ndim=2, axes={-1: 1}, dtype=tf.int32)] 68 | else: 69 | assert len(input_shape) == 3 70 | f_shape = tf.TensorShape(input_shape) 71 | input_spec = tf.keras.layers.InputSpec(min_ndim=3, axes={-1: f_shape[-1]}) 72 | 73 | if f_shape[-1] is None: 74 | raise ValueError('The last dimension of the inputs to `CRF` should be defined. Found `None`.') 75 | if f_shape[-1] != self.output_dim: 76 | raise ValueError('The last dimension of the input shape must be equal to output shape. 
' 77 | 'Use a linear layer if needed.') 78 | self.input_spec = input_spec 79 | self.transitions = self.add_weight(name='transitions', 80 | shape=[self.output_dim, self.output_dim], 81 | initializer='glorot_uniform', 82 | trainable=True) 83 | self.built = True 84 | 85 | def call(self, inputs, **kwargs): 86 | if self.mode == 'pad': 87 | sequences = tf.convert_to_tensor(inputs[0], dtype=self.dtype) 88 | self.sequence_lengths = tf.keras.backend.flatten(inputs[-1]) 89 | else: 90 | sequences = tf.convert_to_tensor(inputs, dtype=self.dtype) 91 | shape = tf.shape(inputs) 92 | self.sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1]) 93 | viterbi_sequence, _ = tf.contrib.crf.crf_decode(sequences, self.transitions, 94 | self.sequence_lengths) 95 | output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim) 96 | return tf.keras.backend.in_train_phase(sequences, output) 97 | 98 | def loss(self, y_true, y_pred): 99 | y_pred = tf.convert_to_tensor(y_pred, dtype=self.dtype) 100 | log_likelihood, self.transitions = tf.contrib.crf.crf_log_likelihood(y_pred, 101 | tf.cast(tf.keras.backend.argmax(y_true), 102 | dtype=tf.int32), 103 | self.sequence_lengths, 104 | transition_params=self.transitions) 105 | # loss_crf = tf.reduce_mean(-log_likelihood) 106 | # return tf.math.log(loss_crf) 107 | return tf.reduce_mean(-log_likelihood) 108 | 109 | def compute_output_shape(self, input_shape): 110 | if self.mode == 'pad': 111 | data_shape = input_shape[0] 112 | else: 113 | data_shape = input_shape 114 | tf.TensorShape(data_shape).assert_has_rank(3) 115 | return data_shape[:2] + (self.output_dim,) 116 | 117 | @property 118 | def viterbi_accuracy(self): 119 | def accuracy(y_true, y_pred): 120 | shape = tf.shape(y_pred) 121 | sequence_lengths = tf.ones(shape[0], dtype=tf.int32) * (shape[1]) 122 | viterbi_sequence, _ = tf.contrib.crf.crf_decode(y_pred, self.transitions, sequence_lengths) 123 | output = tf.keras.backend.one_hot(viterbi_sequence, self.output_dim) 124 | return tf.keras.metrics.categorical_accuracy(y_true, output) 125 | 126 | accuracy.func_name = 'viterbi_accuracy' 127 | return accuracy 128 | 129 | -------------------------------------------------------------------------------- /macropodus/preprocess/tools_ml.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/12/5 20:23 4 | # @author : Mo 5 | # @function: data utils of ml, text_summarization 6 | 7 | 8 | from sklearn.feature_extraction.text import TfidfTransformer 9 | from sklearn.feature_extraction.text import TfidfVectorizer 10 | import macropodus 11 | import re 12 | 13 | 14 | __all__ = ["extract_chinese", 15 | "macropodus_cut", 16 | "jieba_tag_cut", 17 | "cut_sentence", 18 | "remove_urls", 19 | "tfidf_fit", 20 | "tfidf_sim" 21 | ] 22 | 23 | 24 | def extract_chinese(text): 25 | """ 26 | 只提取出中文、字母和数字 27 | :param text: str, input of sentence 28 | :return: str 29 | """ 30 | chinese_exttract = ''.join(re.findall(u"([\u4e00-\u9fa5A-Za-z0-9@. 
])", text)) 31 | return chinese_exttract 32 | 33 | 34 | def jieba_tag_cut(text): 35 | """ 36 | jieba cut and tagged 37 | :param text:str 38 | :return: dict 39 | """ 40 | import jieba.posseg as pseg 41 | words = pseg.cut(text) 42 | return dict(words) 43 | 44 | 45 | def macropodus_cut(text): 46 | """ 47 | Macropodus cut 48 | :param text: input sentence 49 | :return: list 50 | """ 51 | return macropodus.cut(text) 52 | 53 | 54 | def cut_sentence(text, use_type="summarize"): 55 | """ 56 | 分句(文本摘要) 57 | :param sentence:str, like "大漠帝国" 58 | :param use_type:str, like "summarize" or "new-word-discovery" 59 | :return:list 60 | """ 61 | if use_type=="summarize": 62 | re_sen = re.compile('[:;!?。:;?!\n\r]') #.不加是因为不确定.是小数还是英文句号(中文省略号......) 63 | elif use_type=="new-word-discovery": 64 | re_sen = re.compile('[,,"“”、<>《》{}【】:;!?。:;?!\n\r]') #.不加是因为不确定.是小数还是英文句号(中文省略号......) 65 | else: 66 | raise RuntimeError("use_type must be 'summarize' or 'new-word-discovery'") 67 | sentences = re_sen.split(text) 68 | sen_cuts = [] 69 | for sen in sentences: 70 | if sen and str(sen).strip(): 71 | sen_cuts.append(sen) 72 | return sen_cuts 73 | 74 | 75 | def remove_urls(text): 76 | """ 77 | 删除https/http等无用url 78 | :param text: str 79 | :return: str 80 | """ 81 | text_remove_url = re.sub(r'(全文:)?(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', 82 | '', text, flags=re.MULTILINE) 83 | return text_remove_url 84 | 85 | 86 | def gram_uni_bi_tri(text): 87 | """ 88 | 获取文本的unigram, trugram, bigram等特征 89 | :param text: str 90 | :return: list 91 | """ 92 | len_text = len(text) 93 | gram_uni = [] 94 | gram_bi = [] 95 | gram_tri = [] 96 | for i in range(len_text): 97 | if i + 3 <= len_text: 98 | gram_uni.append(text[i]) 99 | gram_bi.append(text[i:i+2]) 100 | gram_tri.append(text[i:i+3]) 101 | elif i + 2 <= len_text: 102 | gram_uni.append(text[i]) 103 | gram_bi.append(text[i:i+2]) 104 | elif i + 1 <= len_text: 105 | gram_uni.append(text[i]) 106 | else: 107 | break 108 | return gram_uni, gram_bi, gram_tri 109 | 110 | 111 | def get_ngrams(text, ns=[1], use_type="summarize", len_max=7): 112 | """ 113 | 获取文本的ngram等特征 114 | :param text: str, like "大漠帝国" 115 | :param ns: list, like [1, 2, 3] 116 | :param type: str, like "summarize" or "new-word-discovery" 117 | :param type: int, like 6, 7 118 | :return: list or list 119 | """ 120 | if type(ns) != list: 121 | raise RuntimeError("ns of function get_ngram() must be list!") 122 | for n in ns: 123 | if n < 1: 124 | raise RuntimeError("enum of ns must '>1'!") 125 | len_text = len(text) 126 | ngrams = [] 127 | if use_type == "summarize": # 分别返回uni, bi, tri... 
128 | for n in ns: 129 | ngram_n = [] 130 | for i in range(len_text): 131 | if i + n <= len_text: 132 | ngram_n.append(text[i:i + n]) 133 | else: 134 | break 135 | if not ngram_n: 136 | ngram_n.append(text) 137 | ngrams.append(ngram_n) 138 | else: # 只返回一个list 139 | for i in range(len_text): 140 | ngrams += [text[i: j + i] 141 | for j in range(1, min(len_max + 1, len_text - i + 1))] 142 | return ngrams 143 | 144 | 145 | def tfidf_fit(sentences): 146 | """ 147 | tfidf相似度 148 | :param sentences: str 149 | :return: list, list, list 150 | """ 151 | # tfidf计算 152 | model = TfidfVectorizer(ngram_range=(1, 2), # 3,5 153 | stop_words=[' ', '\t', '\n'], # 停用词 154 | max_features=10000, 155 | token_pattern=r"(?u)\b\w+\b", # 过滤停用词 156 | min_df=1, 157 | max_df=0.9, 158 | use_idf=1, # 光滑 159 | smooth_idf=1, # 光滑 160 | sublinear_tf=1, ) # 光滑 161 | matrix = model.fit_transform(sentences) 162 | return matrix 163 | 164 | 165 | def tdidf_sim(sentences): 166 | """ 167 | tfidf相似度 168 | :param sentences: 169 | :return: 170 | """ 171 | # tfidf计算 172 | model = TfidfVectorizer(tokenizer=macropodus_cut, 173 | ngram_range=(1, 2), # 3,5 174 | stop_words=[' ', '\t', '\n'], # 停用词 175 | max_features=10000, 176 | token_pattern=r"(?u)\b\w+\b", # 过滤停用词 177 | min_df=1, 178 | max_df=0.9, 179 | use_idf=1, # 光滑 180 | smooth_idf=1, # 光滑 181 | sublinear_tf=1, ) # 光滑 182 | matrix = model.fit_transform(sentences) 183 | matrix_norm = TfidfTransformer().fit_transform(matrix) 184 | return matrix_norm 185 | 186 | 187 | if __name__ == '__main__': 188 | text = "你喜欢谁,小老弟,你好烦哇。" 189 | # gg = jieba_tag_cut("我不再喜欢你,正如你的不喜欢我") 190 | grams = get_ngrams(text, use_type="new-word-discovery", len_max=7) 191 | # print(gg) 192 | print(grams) 193 | -------------------------------------------------------------------------------- /macropodus/summarize/yongzhuo_nlg/README.md: -------------------------------------------------------------------------------- 1 | # nlg, API(联合调用, 整合几种算法) 2 | ```bash 3 | from nlg_yongzhuo import * 4 | 5 | doc = """PageRank算法简介。" \ 6 | "是上世纪90年代末提出的一种计算网页权重的算法! " \ 7 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 8 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 9 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 10 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 11 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 12 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 13 | "和投票目标的等级来决定新的等级。简单的说, " \ 14 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 15 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 16 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 17 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 18 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 """.replace(" ", "").replace('"', '') 19 | 20 | # 是否使用多进程, fs可以填其中一个或几个 text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf 21 | res_score = text_summarize(doc, multi_process=True, fs=[text_pronouns, text_teaser, mmr, text_rank, lead3, lda, lsi, nmf]) 22 | for rs in res_score: 23 | print(rs) 24 | 25 | ``` 26 | 27 | # nlg, 单个方法 28 | ``` 29 | # feature_base 30 | from nlg_yongzhuo import word_significance 31 | from nlg_yongzhuo import text_pronouns 32 | from nlg_yongzhuo import text_teaser 33 | from nlg_yongzhuo import mmr 34 | # graph_base 35 | from nlg_yongzhuo import text_rank 36 | # topic_base 37 | from nlg_yongzhuo import lda 38 | from nlg_yongzhuo import lsi 39 | from nlg_yongzhuo import nmf 40 | # nous_base 41 | from nlg_yongzhuo import lead3 42 | 43 | 44 | docs ="和投票目标的等级来决定新的等级.简单的说。" \ 45 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 46 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \ 47 | "业界急需一种相对比较准确的网页重要性计算方法。" \ 48 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 49 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 50 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。" \ 51 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \ 52 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 53 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。" \ 54 | "即数量假设:一个网页被越多的其他页面链接,就越重)。" \ 55 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 56 | "总的来说就是一句话,从全局角度考虑,获取重要的信。" 57 | # 1. word_significance 58 | sums_word_significance = word_significance.summarize(docs, num=6) 59 | print("word_significance:") 60 | for sum_ in sums_word_significance: 61 | print(sum_) 62 | 63 | # 2. text_pronouns 64 | sums_text_pronouns = text_pronouns.summarize(docs, num=6) 65 | print("text_pronouns:") 66 | for sum_ in sums_text_pronouns: 67 | print(sum_) 68 | 69 | # 3. text_teaser 70 | sums_text_teaser = text_teaser.summarize(docs, num=6) 71 | print("text_teaser:") 72 | for sum_ in sums_text_teaser: 73 | print(sum_) 74 | # 4. mmr 75 | sums_mmr = mmr.summarize(docs, num=6) 76 | print("mmr:") 77 | for sum_ in sums_mmr: 78 | print(sum_) 79 | # 5.text_rank 80 | sums_text_rank = text_rank.summarize(docs, num=6) 81 | print("text_rank:") 82 | for sum_ in sums_text_rank: 83 | print(sum_) 84 | # 6. lda 85 | sums_lda = lda.summarize(docs, num=6) 86 | print("lda:") 87 | for sum_ in sums_lda: 88 | print(sum_) 89 | # 7. lsi 90 | sums_lsi = lsi.summarize(docs, num=6) 91 | print("mmr:") 92 | for sum_ in sums_lsi: 93 | print(sum_) 94 | # 8. nmf 95 | sums_nmf = nmf.summarize(docs, num=6) 96 | print("nmf:") 97 | for sum_ in sums_nmf: 98 | print(sum_) 99 | # 9. lead3 100 | sums_lead3 = lead3.summarize(docs, num=6) 101 | print("lead3:") 102 | for sum_ in sums_lead3: 103 | print(sum_) 104 | ``` 105 | 106 | # nlg, sklearn 107 | ``` 108 | docs = """AutoML机器学习自动化与NNI 109 | 原创大漠帝国 最后发布于2020-02-29 19:46:21 阅读数 221 收藏 110 | 编辑 展开 111 | 一、AutoML简介 112 | 113 |         AutoML(Automated Machine Learning),中文可以翻译为自动机器学习,我比较喜欢叫它“机器学习自动化”,更加接近人们所津津乐道的通用人工智能吧。 114 | 115 |         人们一直有个朴素的想法,可以有一个通用的AI系统,它包罗万象,能够对整个宇宙进行建模,对我们遇到的一切问题,都给出解决办法。这在幻想书籍中数见不新鲜,比如漫威电影中钢铁侠的人工智能贾维斯,又比如说芯片系统流派的网络小说。不过这些大概可以算是人工智能的高级模式了吧,人们还是很宽容的,没有期待一步到位。 116 | 117 |        现在算是AI的高潮期,尤其是以深度学习DL为代表的当代人工智能技术的成功,给以人类以无限的想象空间。那么,降低要求,以DL技术为基础,去开发一个低配版通用人工智能,也是可以的吧。所以,随着人工智能的火爆,2014年以来,AutoML也越发火热起来。 118 | 119 |        深度学习时代的鲜明特征是大数据量、深层次网络、特征学习与端到端学习。我们希望能够从数据一步得到模型,而不需要其他的什么人为参与过程。如果再加上语音助手什么的,或许我们就能达到浅层次通用人工智能的目标呢。在深度学习DL模型架构难以取得更大突破的时候,给它再开辟一条道路呢。一如蒸馏模型,又如MobileNet。 120 | 121 |         工程化和应用级市场,更能带来意想不到的惊喜。这一点,从近年来微软开源的AutoML工具NNI大受欢迎中,可以管中窥豹。 122 | 123 |   124 | 125 | 二、AutoML特性 126 | 127 |         从比较出名的开源Auto平台、互联网大厂AutoML云产品,以及AI公司的AutoML软件来看,一般包括特征工程(FE,Auto feature engine)、神经网络搜索(NAS,Neural Architecture Search) 和超参数优化(HPO,Hyper-parameter optimization) 等功能,如下图所示: 128 | 129 | 130 | 131 |         可能还存在其他一些小功能,如数据增强(几何,颜色), 激活函数(swish,Hybrid DNN), 归一化方法(Switchable Normalization, BN, IN, LN, GN), 优化方法(Neural Optimizer Search, sgd,rmsprop,adam, 衰减, 函数的组合), 优化目标(AM-LFS, Learning to teach with dynamic loss functions), 模型剪枝(AMC), 模型量化(HAQ), 部署上线等。 132 | 133 |         AutoML优点:可用于传统机器学习、图像等较成熟领域,自动化摒弃了人为因素的干扰、增强泛化性; 134 | 135 |                      缺点:消耗资源大、优化方法可能达不到经验模型甚至是严重偏向。 136 | 137 |   138 | 139 | 三、 NNI 140 | 141 |         NNI (Neural Network Intelligence,[翻译为神经网络智能?]) 是微软开源的自动机器学习(AutoML)的Python工具包。NNI 通过 nni_manager模块 等管理 AutoML 的 Experiment (实验),调度并运行各种调优算法生成的 Trial (尝试) 任务,来完成搜索最优神经网络架构、超参数等。同时支持本机,远程服务器,单机,多机,OpenPAI,Kubeflow,K8S和其它云服务等训练环境。 142 | 143 |         对比其他开源项目,或大公司产品可以发现,NNI支持的神经网络结构搜索、超参数优化等调优算法更多,功能最强大。 144 
| 145 |         以我的使用体验来看,NNI更像一个黑盒,浅度用户使用可能比较舒服。使用nni的SDK可以完美嵌入自己的网络结构进行超参数优化,详情如下: 146 | 147 | 148 | 149 |         超参数优化需要定义搜索空间search_space.json,NNI配置config.yml,以及主程序调用main.py函数。 150 | 151 |         此外,NNI还需要用特定命令行启动,自由度似乎不太够。 152 | 153 | 希望对你有所帮助! 154 | ———————————————— 155 | 版权声明:本文为CSDN博主「大漠帝国」的原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接及本声明。 156 | 原文链接:https://blog.csdn.net/rensihui/article/details/104578756""" 157 | 158 | 159 | sums_textrank_textrank4zh = text_rank.summarize(docs, num=6, model_type="textrank_textrank4zh") 160 | print("textrank_textrank4zh:") 161 | for sum_ in sums_textrank_textrank4zh: 162 | print(sum_) 163 | 164 | sums_textrank_sklearn = text_rank.summarize(docs, num=6, model_type="textrank_sklearn") 165 | print("textrank_sklearn:") 166 | for sum_ in sums_textrank_sklearn: 167 | print(sum_) 168 | 169 | # gensim自带的textrank只支持英文, 分隔符为". ", "? ", "! " 170 | sums_textrank_gensim = text_rank.summarize(docs, num=100, model_type="textrank_gensim") 171 | print("textrank_gensim:") 172 | for sum_ in sums_textrank_gensim: 173 | print(sum_) 174 | ``` 175 | -------------------------------------------------------------------------------- /macropodus/base/seg_basic.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2019/11/28 20:17 4 | # @author : Mo 5 | # @function: basic of segment, dictionary 6 | 7 | 8 | from macropodus.preprocess.tools_common import load_json, save_json, txt_read 9 | from macropodus.conf.path_config import path_dict_macropodus, path_dict_user 10 | from macropodus.conf.path_config import path_macropodus_dict_freq_cache 11 | from macropodus.conf.path_log import get_logger_root 12 | from collections import defaultdict 13 | import pickle 14 | import time 15 | import os 16 | 17 | 18 | logger = get_logger_root() 19 | logger.info("path of dict cache is {}!".format(path_macropodus_dict_freq_cache)) 20 | 21 | 22 | class SegBasic: 23 | def __init__(self, use_cache=True): 24 | # time_start = time.time() 25 | # 存在缓存则直接读取, 序列化加速缓存读取速度 26 | if use_cache and os.path.exists(path_macropodus_dict_freq_cache): 27 | with open(path_macropodus_dict_freq_cache, "rb") as fpmc: 28 | [self.dict_words_freq, self.num_words, self.dict_user] = pickle.load(fpmc) 29 | fpmc.close() 30 | # logger.info("seg: " + str(time.time()-time_start)) # 5.29, 5.26 31 | else: 32 | self.dict_words_freq = defaultdict() 33 | self.dict_user = {} 34 | self.load_macropodus_dict() # 默认字典 35 | self.load_user_dict() # 用户字典 36 | # logger.info("seg: " + str(time.time() - time_start)) # 10.13, 10.33 37 | # 第一次跑macropodus, 序列化需要的缓存 38 | if use_cache and not os.path.exists(path_macropodus_dict_freq_cache): 39 | with open(path_macropodus_dict_freq_cache, "wb") as fpmc: 40 | pickle.dump([self.dict_words_freq, self.num_words, self.dict_user], fpmc) 41 | 42 | def load_macropodus_dict(self): 43 | """ 44 | 加载默认的基础字典 45 | :return: None 46 | """ 47 | dict_macropodus = load_json(path_dict_macropodus)[0] # (path_dict_jiagu)[0] # (path_dict_macropodus)[0] # 加载json字典文件 48 | dict_macropodus_def = defaultdict() # 转为defaultdict 49 | for k,v in dict_macropodus.items(): 50 | dict_macropodus_def[k] = v 51 | self.dict_words_freq = dict_macropodus_def # {}词-词频字典 52 | 53 | def load_user_dict(self, path_user=path_dict_user, type_user="json"): 54 | """ 55 | 加载用户词典 56 | :param path_user:str, like '/home/user.dict' 57 | :return: None 58 | """ 59 | if not os.path.exists(path_user): 60 | raise RuntimeError("your path_user is not exist!") 61 | if 
type_user == "json": 62 | self.dict_user = load_json(path_user)[0] # 加载json字典文件 63 | for k, v in self.dict_user.items(): 64 | if k not in self.dict_words_freq: 65 | self.dict_words_freq[k] = v # 更新到总字典, words_freq 66 | else: 67 | self.dict_words_freq[k] = self.dict_words_freq[k] + v # 更新到总字典, words_freq 68 | self.num_words = sum(self.dict_words_freq.values()) 69 | elif type_user == "txt": 70 | words_all = txt_read(path_user) 71 | for word_freq in words_all: 72 | wf = word_freq.split(" ") # 空格' '区分带不带词频的情况 73 | if len(wf) == 2: 74 | word = wf[0] 75 | freq = wf[1] 76 | else: 77 | word = wf[0] 78 | freq = 132 79 | if word not in self.dict_words_freq: 80 | self.dict_words_freq[word] = freq # 更新到总字典, words_freq 81 | else: 82 | self.dict_words_freq[word] = self.dict_words_freq[word] + freq # 更新到总字典, words_freq 83 | self.num_words = sum(self.dict_words_freq.values()) 84 | elif type_user == "csv": 85 | words_all = txt_read(path_user) 86 | for word_freq in words_all: 87 | wf = word_freq.split(",") # 逗号','区分带不带词频的情况 88 | if len(wf)==2: 89 | word = wf[0] 90 | freq = wf[1] 91 | else: 92 | word = wf[0] 93 | freq = 132 94 | if word not in self.dict_words_freq: 95 | self.dict_words_freq[word] = freq # 更新到总字典, words_freq 96 | else: 97 | self.dict_words_freq[word] = self.dict_words_freq[word] + freq # 更新到总字典, words_freq 98 | self.num_words = sum(self.dict_words_freq.values()) 99 | else: 100 | raise EOFError 101 | 102 | def add_word(self, word, freq=132): 103 | """ 104 | 新增词典到词语, 不可持久化, 重载消失 105 | :param word: str, like '大漠帝国' 106 | :param freq: int, like 132 107 | :return: None 108 | """ 109 | assert type(word) == str 110 | if word in self.dict_words_freq: 111 | self.dict_words_freq[word] = self.dict_words_freq[word] if freq !=132 else freq 112 | else: 113 | self.dict_words_freq[word] = freq 114 | self.num_words += freq 115 | 116 | def delete_word(self, word): 117 | """ 118 | 删除词语, 不可持久化, 重载消失 119 | :param word_freqs: str, like '大漠帝国' 120 | :return: None 121 | """ 122 | assert type(word) == str 123 | if word in self.dict_words_freq: 124 | self.num_words -= self.dict_words_freq[word] 125 | self.dict_words_freq.pop(word) 126 | 127 | def save_add_words(self, word_freqs): 128 | """ 129 | 新增词语到用户词典, 可持久化, 重载有效 130 | :param word_freqs: dict, like {'大漠帝国':132} 131 | :return: None 132 | """ 133 | assert type(word_freqs) == dict 134 | for k, v in word_freqs.items(): 135 | self.add_word(k, v) # 新增到总字典, 不持久化 136 | self.dict_user[k] = v # 新增到用户字典, 持久化 137 | save_json([self.dict_user], path_dict_user) 138 | 139 | def save_delete_words(self, words): 140 | """ 141 | 删除词语到用户词典, 可持久化, 重载有效 142 | :param word_freqs: list, like ['大漠帝国'] 143 | :return: None 144 | """ 145 | assert type(words) == list 146 | for w in words: 147 | self.delete_word(w) # 删除到总字典, 不持久化 148 | if w in self.dict_user: self.dict_user.pop(w) # 删除到用户字典, 持久化 149 | save_json([self.dict_user], path_dict_user) 150 | -------------------------------------------------------------------------------- /macropodus/summarize/topic_base/topic_nmf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/12/2 20:33 4 | # @author :Mo 5 | # @function :topic model of NMF 6 | 7 | 8 | from macropodus.preprocess.tools_ml import extract_chinese, tfidf_fit 9 | from macropodus.data.words_common.stop_words import stop_words 10 | from macropodus.preprocess.tools_ml import macropodus_cut 11 | from macropodus.preprocess.tools_ml import cut_sentence 12 | # sklearn 13 | from 
sklearn.decomposition import NMF 14 | import numpy as np 15 | 16 | 17 | class NMFSum: 18 | def __init__(self): 19 | self.stop_words = stop_words.values() 20 | self.algorithm = 'lsi' 21 | 22 | def summarize(self, text, num=320, topic_min=5, judge_topic="all"): 23 | """ 24 | 25 | :param text: text or list, input docs 26 | :param num: int, number or amount of return 27 | :param topic_min: int, topic number 28 | :param judge_topic: str, calculate ways of topic 29 | :return: 30 | """ 31 | # 切句 32 | if type(text) == str: 33 | self.sentences = cut_sentence(text) 34 | elif type(text) == list: 35 | self.sentences = text 36 | else: 37 | raise RuntimeError("text type must be list or str") 38 | # 切词 39 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 40 | if word.strip()] for sentence in self.sentences] 41 | len_sentences_cut = len(sentences_cut) 42 | # 去除停用词等 43 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 44 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 45 | # 计算每个句子的tfidf 46 | sen_tfidf = tfidf_fit(self.sentences_cut) 47 | # 主题数, 经验判断 48 | topic_num = min(topic_min, int(len(sentences_cut) / 2)) # 设定最小主题数为3 49 | nmf_tfidf = NMF(n_components=topic_num, max_iter=320) 50 | res_nmf_w = nmf_tfidf.fit_transform(sen_tfidf.T) # 基矩阵 or 权重矩阵 51 | res_nmf_h = nmf_tfidf.components_ # 系数矩阵 or 降维矩阵 52 | 53 | if judge_topic: 54 | ### 方案一, 获取最大那个主题的k个句子 55 | ################################################################################## 56 | topic_t_score = np.sum(res_nmf_h, axis=-1) 57 | # 对每列(一个句子topic_num个主题),得分进行排序,0为最大 58 | res_nmf_h_soft = res_nmf_h.argsort(axis=0)[-topic_num:][::-1] 59 | # 统计为最大每个主题的句子个数 60 | exist = (res_nmf_h_soft <= 0) * 1.0 61 | factor = np.ones(res_nmf_h_soft.shape[1]) 62 | topic_t_count = np.dot(exist, factor) 63 | # 标准化 64 | topic_t_count /= np.sum(topic_t_count, axis=-1) 65 | topic_t_score /= np.sum(topic_t_score, axis=-1) 66 | # 主题最大个数占比, 与主题总得分占比选择最大的主题 67 | topic_t_tc = topic_t_count + topic_t_score 68 | topic_t_tc_argmax = np.argmax(topic_t_tc) 69 | # 最后得分选择该最大主题的 70 | res_nmf_h_soft_argmax = res_nmf_h[topic_t_tc_argmax].tolist() 71 | res_combine = {} 72 | for l in range(len_sentences_cut): 73 | res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l] 74 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 75 | ##################################################################################### 76 | else: 77 | ### 方案二, 获取最大主题概率的句子, 不分主题 78 | res_combine = {} 79 | for i in range(len_sentences_cut): 80 | res_row_i = res_nmf_h[:, i] 81 | res_row_i_argmax = np.argmax(res_row_i) 82 | res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax] 83 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_combine.items(), key=lambda d: d[1], reverse=True)] 84 | num_min = min(num, int(len_sentences_cut * 0.6)) 85 | return score_sen[0:num_min] 86 | 87 | 88 | if __name__ == '__main__': 89 | nmf = NMFSum() 90 | doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 91 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 92 | "该基金认缴出资总规模为人民币3.01亿元。" \ 93 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 94 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 95 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 96 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 97 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 98 | 99 | doc = "和投票目标的等级来决定新的等级.简单的说。" \ 100 | 
"是上世纪90年代末提出的一种计算网页权重的算法! " \ 101 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。" \ 102 | "业界急需一种相对比较准确的网页重要性计算方法。" \ 103 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 104 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 105 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。" \ 106 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。" \ 107 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 108 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。" \ 109 | "即数量假设:一个网页被越多的其他页面链接,就越重)。" \ 110 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 111 | "总的来说就是一句话,从全局角度考虑,获取重要的信。" 112 | 113 | doc = '早年林志颖带kimi上《爸爸去哪儿》的时候,当时遮遮掩掩的林志颖老婆低调探班,总让人觉得格外神秘,大概是特别不喜欢' \ 114 | '在公众面前曝光自己日常的那种人。可能这么些年过去,心态不断调整过了,至少在微博上,陈若仪越来越放得开,晒自己带' \ 115 | '娃照顾双子星的点滴,也晒日常自己的护肤心得,时不时安利一些小东西。都快晚上十点半,睡美容觉的最佳时候,结果才带' \ 116 | '完一天娃的陈若仪还是不忘先保养自己,敷起了面膜。泡完澡,这次用的是一个稍微平价的面膜,脸上、甚至仔细到脖子上都' \ 117 | '抹上了。陈若仪也是多此一举,特别说自己不是裸体,是裹着浴巾的,谁在意这个呀,目光完全被你那又长又扑闪的睫毛给吸' \ 118 | '引住了。这也太吓人吧,怎么能够长那么长那么密那么翘。嫉妒地说一句,真的很像种的假睫毛呐。陈若仪的睫毛应该是天生' \ 119 | '的基础好吧,要不然也不会遗传给小孩,一家子都是睫毛精,几个儿子现在这么小都是长睫毛。只是陈若仪现在这个完美状态,' \ 120 | '一定是后天再经过悉心的呵护培养。网友已经迫不及待让她教教怎么弄睫毛了,陈若仪也是答应地好好的。各种私人物品主动' \ 121 | '揭秘,安利一些品牌给大家,虽然一再强调是自己的日常小物,还是很让人怀疑,陈若仪是不是在做微商当网红呐,网友建议' \ 122 | '她开个店,看这回复,也是很有意愿了。她应该不缺这个钱才对。隔三差五介绍下自己用的小刷子之类,陈若仪乐于向大家传' \ 123 | '授自己的保养呵护之道。她是很容易就被晒出斑的肤质,去海岛参加婚礼,都要必备这几款超爱用的防晒隔离。日常用的、太' \ 124 | '阳大时候用的,好几个种类,活得相当精致。你们按照自己的需要了解一下。画眉毛,最爱用的是intergrate的眉笔。也是个' \ 125 | '念旧的人,除了Dior,陈若仪的另一个眉粉其中一个是她高中就开始用的Kate。一般都是大学才开始化妆修饰自己,感受得到' \ 126 | '陈若仪从小就很爱美。各种小零小碎的化妆品,已经买过七八次的粉红胡椒抛光美体油,每天洗完澡陈若仪都会喷在肚子、大' \ 127 | '腿、屁股和膝盖手肘,说是能保持肌肤的平滑紧致程度。每安利一样东西,总有网友要在下面问其他问题咋个办,真是相当信' \ 128 | '任陈若仪了。每次她也很耐心的解答,"去黑头我用的是SUQQU洁面去角质按摩膏磨砂洁面洗面奶,"一定要先按摩再用。她自己' \ 129 | '已经回购过好几次,意思是你们再了解一下。了解归了解,买不买随意。毕竟像她另一个爱用的达尔肤面膜,效果好是好,价' \ 130 | '格据说比sk2都还要贵,不是大多数人日常能够消费得起的,大家就看个热闹就好了,还是多买多试多用才能找到最适合自己的' \ 131 | '护肤方法。' 132 | 133 | sum = nmf.summarize(doc, num=320) 134 | for i in sum: 135 | print(i) 136 | 137 | 138 | -------------------------------------------------------------------------------- /macropodus/network/service/server_streamer.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 22:18 4 | # @author : Mo 5 | # @function: service streamer of multiprocessing 6 | 7 | 8 | # 多进程, win10必须加, 否则报错 9 | import platform 10 | sys = platform.system() 11 | if sys == "Windows": 12 | import multiprocessing as mp 13 | mp.freeze_support() 14 | mp.set_start_method("spawn", force=True) 15 | 16 | from macropodus.network.service.server_base import Streamer, ThreadedStreamer 17 | from macropodus.preprocess.tools_ml import extract_chinese 18 | from tensorflow.python.keras.models import model_from_json 19 | from macropodus.preprocess.tools_common import load_json 20 | from keras_bert import Tokenizer 21 | import numpy as np 22 | import macropodus 23 | import codecs 24 | import os 25 | 26 | 27 | # 常规 28 | class AlbertBilstmPredict: 29 | def __init__(self, path_dir): 30 | self.path_dir = path_dir 31 | self.tokenizer_init() 32 | self.l2i_i2l_init() 33 | self.params_init() 34 | self.model_init() 35 | 36 | def model_init(self): 37 | """模型初始化""" 38 | # import tensorflow as tf 39 | # self.model = None 40 | # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1) # gpu_memory_fraction 41 | # config = tf.ConfigProto(gpu_options=gpu_options) 42 | # self.graph = tf.Graph() 43 | # self.sess = tf.Session(graph=self.graph, config=config) 44 | # with self.sess.as_default(): 45 | # with self.graph.as_default(): 46 | # self.model = None 47 | # graph = tf.get_default_graph() 48 | # sess = tf.Session(graph=graph) 49 | # with sess.as_default(): 50 | # with 
graph.as_default(): 51 | # tf.global_variables_initializer().run() 52 | path_graph = os.path.join(self.path_dir, "graph.json") 53 | path_model = os.path.join(self.path_dir, "model.h5") 54 | # 加载模型结构 55 | self.model = model_from_json(open(path_graph, "r", encoding="utf-8").read(), 56 | custom_objects=macropodus.custom_objects) 57 | # 加载模型权重 58 | self.model.load_weights(path_model) 59 | 60 | def tokenizer_init(self): 61 | """字典""" 62 | # reader tokenizer 63 | token_dict = {} 64 | path_dict = os.path.join(self.path_dir, "vocab.txt") 65 | with codecs.open(path_dict, 'r', 'utf8') as reader: 66 | for line in reader: 67 | token = line.strip() 68 | token_dict[token] = len(token_dict) 69 | # vocab_size = len(token_dict) 70 | self.tokenizer = Tokenizer(token_dict) 71 | 72 | def params_init(self): 73 | """超参数初始化""" 74 | # params 75 | path_params = os.path.join(self.path_dir, "params.json") 76 | self.params = load_json(path_params) 77 | self.len_max = self.params["len_max"] 78 | 79 | def l2i_i2l_init(self): 80 | """类别与数字项目转化""" 81 | # l2i_i2l 82 | path_l2i_i2l = os.path.join(self.path_dir, "l2i_i2l.json") 83 | self.l2i_i2l = load_json(path_l2i_i2l) 84 | 85 | def sentence2idx(self, text, second_text=None): 86 | """数据预处理""" 87 | text = extract_chinese(str(text).upper()) 88 | input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max) 89 | input_mask = len([1 for ids in input_id if ids != 0]) 90 | return [input_id, input_type_id, input_mask] 91 | 92 | def predict(self, quess): 93 | """预测多个问句""" 94 | quess_encode = [self.sentence2idx(ques) for ques in quess] 95 | x_ = np.array(quess_encode) 96 | x_1 = np.array([x[0] for x in x_]) 97 | x_2 = np.array([x[1] for x in x_]) 98 | x_3 = np.array([x[2] for x in x_]) 99 | ress = self.model.predict([x_1, x_2, x_3]) 100 | ress_idxs = [[np.argmax(rl) for rl in res_list] for res_list in ress.tolist()] 101 | ress_label = [[self.l2i_i2l["i2l"][str(ri)] if str(ri) in self.l2i_i2l["i2l"] else "O" for ri in res_idxs] 102 | for res_idxs in ress_idxs] 103 | ress_select = [ress_label[i][1:len(quess[i]) + 1] for i in range(len(quess))] 104 | return ress_select 105 | 106 | 107 | # 一个进程多个线程&多进程等 108 | class ServiceNer: 109 | def __init__(self, path_abs, cuda_devices="0", stream_type="processing", 110 | max_latency=0.1, worker_num=1, batch_size=32): 111 | self.algorithm = 'albert-ner-bilstm-crf' 112 | self.cuda_devices = cuda_devices 113 | self.stream_type = stream_type 114 | self.max_latency = max_latency 115 | self.worker_num = worker_num 116 | self.batch_size = batch_size 117 | self.path_abs = path_abs 118 | self.streamer_init() 119 | 120 | def streamer_init(self): 121 | """ 122 | ner初始化 123 | :param model: class, like "ner_model" 124 | :param cuda_devices: str, like "processing", "thread" 125 | :param stream_type: str, like "0,1" 126 | :param batch_size: int, like 32 127 | :param max_latency: float, 0-1, like 0.01 128 | :param worker_num: int, like 2 129 | :return: 130 | """ 131 | model = AlbertBilstmPredict(self.path_abs) 132 | if self.stream_type == "thread": 133 | self.streamer = ThreadedStreamer(model, self.batch_size, self.max_latency) 134 | else: 135 | self.streamer = Streamer(predict_function_or_model=model, 136 | cuda_devices=self.cuda_devices, 137 | max_latency=self.max_latency, 138 | worker_num=self.worker_num, 139 | batch_size=self.batch_size) 140 | self.streamer._wait_for_worker_ready() 141 | 142 | # def predict(self, text): 143 | # """ 144 | # 预测返回 145 | # :param text: str, like "桂林" 146 | # :return: list, like ["B-LOC", 
"I-LOC"] 147 | # """ 148 | # return self.streamer.predict(text) 149 | 150 | 151 | # 模型加载 152 | # path = "D:/workspace/pythonMyCode/Macropodus/macropodus/data/tag_seg_pku_1998_w2v_16" 153 | path = "D:/workspace/pythonMyCode/Macropodus/macropodus/data/ner_people_1998_mix_albert_1" 154 | model_server = ServiceNer(path, stream_type="thread", cuda_devices="-1", max_latency=0.1, worker_num=1, batch_size=32).streamer 155 | 156 | 157 | if __name__ == '__main__': 158 | ques = "北京欢迎您, 南宁2020东盟博览会" 159 | res = model_server.predict([ques]) 160 | print(res) 161 | while True: 162 | print("请输入:") 163 | ques = input() 164 | res = model_server.predict([ques]) 165 | print(res) 166 | -------------------------------------------------------------------------------- /macropodus/network/service/server_streamer_flask.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/1/16 22:18 4 | # @author : Mo 5 | # @function: service streamer of multiprocessing 6 | 7 | 8 | import platform 9 | 10 | # 多进程, win10必须加, 否则报错 11 | sys = platform.system() 12 | if sys == "Windows": 13 | import multiprocessing as mp 14 | mp.freeze_support() 15 | mp.set_start_method("spawn", force=True) 16 | 17 | from macropodus.network.service.server_base import ThreadedStreamer, Streamer 18 | from macropodus.preprocess.tools_ml import extract_chinese 19 | from tensorflow.python.keras.models import model_from_json 20 | from macropodus.preprocess.tools_common import load_json 21 | from macropodus.conf.path_log import get_logger_root 22 | from keras_bert import Tokenizer 23 | import numpy as np 24 | import macropodus 25 | import codecs 26 | import os 27 | 28 | # flask 29 | from flask import Flask, request, jsonify 30 | logger = get_logger_root() 31 | 32 | 33 | # 常规 34 | class AlbertBilstmPredict: 35 | def __init__(self, path_dir): 36 | self.path_dir = path_dir 37 | self.tokenizer_init() 38 | self.l2i_i2l_init() 39 | self.params_init() 40 | self.model_init() 41 | 42 | def model_init(self): 43 | """模型初始化""" 44 | path_graph = os.path.join(self.path_dir, "graph.json") 45 | path_model = os.path.join(self.path_dir, "model.h5") 46 | # 加载模型结构 47 | self.model = model_from_json(open(path_graph, "r", encoding="utf-8").read(), 48 | custom_objects=macropodus.custom_objects) 49 | # 加载模型权重 50 | self.model.load_weights(path_model) 51 | 52 | def tokenizer_init(self): 53 | """字典""" 54 | # reader tokenizer 55 | token_dict = {} 56 | path_dict = os.path.join(self.path_dir, "vocab.txt") 57 | with codecs.open(path_dict, 'r', 'utf8') as reader: 58 | for line in reader: 59 | token = line.strip() 60 | token_dict[token] = len(token_dict) 61 | # vocab_size = len(token_dict) 62 | self.tokenizer = Tokenizer(token_dict) 63 | 64 | def params_init(self): 65 | """超参数初始化""" 66 | # params 67 | path_params = os.path.join(self.path_dir, "params.json") 68 | self.params = load_json(path_params) 69 | self.len_max = self.params["len_max"] 70 | 71 | def l2i_i2l_init(self): 72 | """类别与数字项目转化""" 73 | # l2i_i2l 74 | path_l2i_i2l = os.path.join(self.path_dir, "l2i_i2l.json") 75 | self.l2i_i2l = load_json(path_l2i_i2l) 76 | 77 | def sentence2idx(self, text, second_text=None): 78 | """数据预处理""" 79 | text = extract_chinese(str(text).upper()) 80 | input_id, input_type_id = self.tokenizer.encode(first=text, second=second_text, max_len=self.len_max) 81 | input_mask = len([1 for ids in input_id if ids != 0]) 82 | # return input_id, input_type_id, input_mask 83 | # x_ = np.array((input_id, input_type_id, 
input_mask)) 84 | x = [[input_id, input_type_id, input_mask]] 85 | x_ = np.array(x) 86 | x_1 = np.array([x[0] for x in x_]) 87 | x_2 = np.array([x[1] for x in x_]) 88 | x_3 = np.array([x[2] for x in x_]) 89 | return [x_1, x_2, x_3] 90 | 91 | def predict(self, ques): 92 | """预测""" 93 | mode_input = self.sentence2idx(ques) 94 | res = self.model.predict(mode_input) 95 | res_list = res.tolist()[0] 96 | res_idxs = [np.argmax(rl) for rl in res_list] 97 | res_label = [self.l2i_i2l["i2l"][str(ri)] if str(ri) in self.l2i_i2l["i2l"] else "O" for ri in res_idxs] 98 | return res_label[1:len(ques) + 1] 99 | 100 | 101 | # 一个进程多个线程等 102 | class ServiceNer: 103 | def __init__(self, path_model_dir, cuda_devices="0", stream_type="processing", 104 | max_latency=0.1, worker_num=1, batch_size=32): 105 | self.path_model_dir = path_model_dir 106 | self.cuda_devices = cuda_devices 107 | self.stream_type = stream_type 108 | self.max_latency = max_latency 109 | self.worker_num = worker_num 110 | self.batch_size = batch_size 111 | self.algorithm = 'albert-ner-bilstm-crf' 112 | self.streamer_init(self.path_model_dir, cuda_devices=self.cuda_devices, stream_type=self.stream_type, 113 | max_latency=self.max_latency, worker_num=self.worker_num, 114 | batch_size=self.batch_size) 115 | 116 | def streamer_init(self, path_abs, cuda_devices="0", stream_type="processing", 117 | max_latency=0.1, worker_num=1, batch_size=32): 118 | """ 119 | ner初始化 120 | :param path_abs: str, like "ner_model" 121 | :param cuda_devices: str, like "processing", "thread" 122 | :param stream_type: str, like "0,1" 123 | :param batch_size: int, like 32 124 | :param max_latency: float, 0-1, like 0.01 125 | :param worker_num: int, like 2 126 | :return: 127 | """ 128 | abp = AlbertBilstmPredict(path_abs) 129 | if stream_type == "thread": 130 | self.streamer = ThreadedStreamer(abp, batch_size, max_latency) 131 | else: 132 | self.streamer = Streamer(predict_function_or_model=abp, 133 | cuda_devices=cuda_devices, 134 | max_latency=max_latency, 135 | worker_num=worker_num, 136 | batch_size=batch_size) 137 | 138 | def predict(self, text): 139 | """ 140 | 预测返回 141 | :param text: str, like "桂林" 142 | :return: list, like ["B-LOC", "I-LOC"] 143 | """ 144 | return self.streamer.predict(text) 145 | 146 | 147 | def streamer_predict(streamer_real): 148 | """ 149 | 复合使函数方法通用 150 | :return: 151 | """ 152 | params = request.form if request.form else request.json 153 | sentences = params.get('texts', '') 154 | res = [] 155 | try: 156 | res = streamer_real.predict(sentences) 157 | except Exception as e: 158 | logger.info(str(e)) 159 | return res 160 | 161 | 162 | # 模型加载 163 | path = "D:/workspace/pythonMyCode/Macropodus/macropodus/data/ner_people_1998_mix_albert_1" 164 | sn = ServiceNer(path, cuda_devices="0,1", max_latency=0.1, worker_num=1, batch_size=32) 165 | app = Flask(__name__) 166 | 167 | 168 | @app.route('/ner/predict', methods=["POST, GET"]) 169 | def ner_predict_3(): 170 | res = streamer_predict(sn) 171 | return jsonify(content=res, 172 | content_type='charset = utf-8; application/json', 173 | reason='success', 174 | charset='utf-8', 175 | status='200') 176 | 177 | 178 | if __name__ == '__main__': 179 | 180 | app.run(port=8080, threaded=True, host='0.0.0.0', debug=False) 181 | 182 | -------------------------------------------------------------------------------- /macropodus/summarize/graph_base/textrank_word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time 
:2019/12/20 20:39 4 | # @author :Mo 5 | # @function :textrank of word2vec, keyword and sentence 6 | 7 | 8 | from macropodus.similarity.similarity_word2vec_char import SimW2vChar 9 | from macropodus.data.words_common.stop_words import stop_words 10 | from macropodus.preprocess.tools_ml import macropodus_cut 11 | from macropodus.preprocess.tools_ml import cut_sentence 12 | import networkx as nx 13 | import numpy as np 14 | 15 | 16 | class TextrankWord2vec(SimW2vChar): 17 | def __init__(self, use_cache=True): 18 | self.algorithm = 'textrank_word2vec' 19 | self.stop_words = stop_words 20 | super().__init__(use_cache) # self.w2v_char 21 | 22 | def cut_window(self, sent_words, win_size=2): 23 | """ 24 | 滑动窗口切词 25 | :param sent_words: list, like ["我", "是", "大漠帝国"] 26 | :param win_size: int, like 3 27 | :return: yield 28 | """ 29 | if win_size < 2: 30 | win_size = 2 31 | for i in range(1, win_size): 32 | if i >= len(sent_words): 33 | break 34 | sent_terms = sent_words[i:] # 后面的 35 | sent_zip = zip(sent_words, sent_terms) # 候选词对 36 | for sz in sent_zip: 37 | yield sz 38 | 39 | def keyword(self, text, num=6, score_min=0.025, win_size=3, type_sim="total", type_encode="avg", config={"alpha": 0.86, "max_iter":100}): 40 | """ 41 | 关键词抽取, textrank of word2vec cosine 42 | :param text: str, doc. like "大漠帝国是历史上存在的国家吗?你知不知道?嗯。" 43 | :param num: int, length of sentence like 6 44 | :param win_size: int, windows size of combine. like 2 45 | :param type_sim: str, type of simiilarity. like "total", "cosine" 46 | :param config: dict, config of pagerank. like {"alpha": 0.86, "max_iter":100} 47 | :return: list, result of keyword. like [(0.020411696169510562, '手机'), (0.016149784106276977, '夏普')] 48 | """ 49 | # 切句 50 | if type(text) == str: 51 | self.sentences = cut_sentence(text) 52 | elif type(text) == list: 53 | self.sentences = text 54 | else: 55 | raise RuntimeError("text type must be list or str") 56 | # macropodus_cut 切词 57 | self.macropodus_word = [macropodus_cut(sentence) for sentence in self.sentences] 58 | # 去除停用词等 59 | self.sentences_word = [[w for w in mw if w not in self.stop_words.values()] for mw in self.macropodus_word] 60 | # 构建图的顶点 61 | word2index = {} 62 | index2word = {} 63 | word_index = 0 64 | for sent_words in self.sentences_word: 65 | for word in sent_words: 66 | if not word in word2index: # index 67 | word2index[word] = word_index 68 | index2word[word_index] = word 69 | word_index += 1 70 | graph_words = np.zeros((word_index, word_index)) 71 | # 构建图的边, 以两个词语的余弦相似度为基础 72 | for sent_words in self.sentences_word: 73 | for cw_1, cw_2 in self.cut_window(sent_words, win_size=win_size): 74 | if cw_1 in word2index and cw_2 in word2index: 75 | idx_1, idx_2 = word2index[cw_1], word2index[cw_2] 76 | score_w2v_cosine = self.similarity(cw_1, cw_2, type_sim=type_sim, 77 | type_encode=type_encode) 78 | graph_words[idx_1][idx_2] = score_w2v_cosine 79 | graph_words[idx_2][idx_1] = score_w2v_cosine 80 | # 构建相似度矩阵 81 | w2v_cosine_sim = nx.from_numpy_matrix(graph_words) 82 | # nx.pagerank 83 | sens_scores = nx.pagerank(w2v_cosine_sim, **config) 84 | # 得分排序 85 | sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True) 86 | # 保留topk个, 防止越界 87 | topk = min(len(sen_rank), num) 88 | # 返回原句子和得分 89 | return [(sr[1], index2word[sr[0]]) for sr in sen_rank if len(index2word[sr[0]])>1 and score_min<=sr[1]][0:topk] 90 | 91 | def summarize(self, text, num=320, type_sim="cosine", type_encode="avg", config={"alpha": 0.33, "max_iter":100}): 92 | """ 93 | 文本摘要抽取, textrank of word2vec cosine 94 | :param text: 
str, doc. like "大漠帝国是历史上存在的国家吗?你知不知道?嗯。" 95 | :param num: int, length of sentence like 6 96 | :param type_sim: str, type of simiilarity. like "total", "cosine" 97 | :param config: dict, config of pagerank. like {"alpha": 0.86, "max_iter":100} 98 | :return: list, result of keyword. like [(0.06900223298930287, 'PageRank The PageRank Citation Ranking'), (0.08698940285163381, 'PageRank通过网络浩瀚的超链接关系来确定一个页面的等级')] 99 | """ 100 | # 切句 101 | if type(text) == str: 102 | self.sentences = cut_sentence(text) 103 | elif type(text) == list: 104 | self.sentences = text 105 | else: 106 | raise RuntimeError("text type must be list or str") 107 | # 输入文本句子长度 108 | len_sen = len(self.sentences) 109 | # 构建图的顶点 110 | sent2idx = {} 111 | idx2sent = {} 112 | sent_idx = 0 113 | for sent in self.sentences: 114 | sent2idx[sent] = sent_idx 115 | idx2sent[sent_idx] = sent 116 | sent_idx += 1 117 | graph_sents = np.zeros((sent_idx, sent_idx)) 118 | # 构建图的边, 以两个句子的余弦相似度为基础 119 | for i in range(len_sen): 120 | for j in range(len_sen): 121 | score_w2v_cosine = self.similarity(self.sentences[i], self.sentences[j], 122 | type_sim=type_sim, type_encode=type_encode) 123 | graph_sents[i][j] = score_w2v_cosine 124 | graph_sents[j][i] = score_w2v_cosine 125 | # 构建相似度矩阵 126 | w2v_cosine_sim = nx.from_numpy_matrix(graph_sents) 127 | # nx.pagerank 128 | sens_scores = nx.pagerank(w2v_cosine_sim, **config) 129 | # 得分排序 130 | sen_rank = sorted(sens_scores.items(), key=lambda x: x[1], reverse=True) 131 | # 保留topk个, 防止越界 132 | topk = min(len(sen_rank), num) 133 | # 返回原句子和得分 134 | return [(sr[1], self.sentences[sr[0]]) for sr in sen_rank][0:topk] 135 | 136 | 137 | if __name__ == '__main__': 138 | text = "是上世纪90年代末提出的一种计算网页权重的算法。" \ 139 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长," \ 140 | "业界急需一种相对比较准确的网页重要性计算方法," \ 141 | "是人们能够从海量互联网世界中找出自己需要的信息。" \ 142 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。" \ 143 | "Google把从A页面到B页面的链接解释为A页面给B页面投票," \ 144 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面" \ 145 | "和投票目标的等级来决定新的等级。简单的说," \ 146 | "一个高等级的页面可以使其他低等级页面的等级提升。" \ 147 | "PageRank The PageRank Citation Ranking: Bringing Order to the Web," \ 148 | "具体说来就是,PageRank有两个基本思想,也可以说是假设," \ 149 | "即数量假设:一个网页被越多的其他页面链接,就越重);" \ 150 | "质量假设:一个网页越是被高质量的网页链接,就越重要。" \ 151 | "总的来说就是一句话,从全局角度考虑,获取重要的信息。" 152 | trww = TextrankWord2vec() 153 | keyword = trww.keyword(text, num=8) 154 | summary = trww.summarize(text, num=32) 155 | print(keyword) 156 | print(summary) 157 | 158 | -------------------------------------------------------------------------------- /macropodus/tookit/calculator_sihui/calcultor_number.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/22 20:09 4 | # @author :Mo 5 | # @function :extract number from sentence of chinese or mix。提取数字,中文,或者混合中文-阿拉伯数字 6 | 7 | 8 | # import regex as re 9 | import re 10 | 11 | 12 | # * 字符串预处理模块,为分析器TimeNormalizer提供相应的字符串预处理服务 13 | class StringPreHandler: 14 | # @Author : zhm 15 | # @codes : code from github: https://github.com/zhanzecheng/Time_NLP 16 | # @function :StringPreHandler.py 17 | @classmethod 18 | def delKeyword(cls, target, rules): 19 | """ 20 | 该方法删除一字符串中所有匹配某一规则字串 21 | 可用于清理一个字符串中的空白符和语气助词 22 | :param target: 待处理字符串 23 | :param rules: 删除规则 24 | :return: 清理工作完成后的字符串 25 | """ 26 | pattern = re.compile(rules) 27 | res = pattern.sub('', target) 28 | # print res 29 | return res 30 | 31 | 32 | @classmethod 33 | def numberTranslator(cls, target): 34 | """ 35 | 该方法可以将字符串中所有的用汉字表示的数字转化为用阿拉伯数字表示的数字 36 | 如"这里有一千两百个人,六百零五个来自中国"可以转化为 37 
| "这里有1200个人,605个来自中国" 38 | 此外添加支持了部分不规则表达方法 39 | 如两万零六百五可转化为20650 40 | 两百一十四和两百十四都可以转化为214 41 | 一六零加一五八可以转化为160+158 42 | 该方法目前支持的正确转化范围是0-99999999 43 | 该功能模块具有良好的复用性 44 | :param target: 待转化的字符串 45 | :return: 转化完毕后的字符串 46 | """ 47 | pattern = re.compile(u"[一二两三四五六七八九123456789]万[一二两三四五六七八九123456789](?!(千|百|十))") 48 | match = pattern.finditer(target) 49 | for m in match: 50 | group = m.group() 51 | s = group.split(u"万") 52 | s = list(filter(None, s)) 53 | num = 0 54 | if len(s) == 2: 55 | num += cls.wordToNumber(s[0]) * 10000 + cls.wordToNumber(s[1]) * 1000 56 | target = pattern.sub(str(num), target, 1) 57 | 58 | pattern = re.compile(u"[一二两三四五六七八九123456789]千[一二两三四五六七八九123456789](?!(百|十))") 59 | match = pattern.finditer(target) 60 | for m in match: 61 | group = m.group() 62 | s = group.split(u"千") 63 | s = list(filter(None, s)) 64 | num = 0 65 | if len(s) == 2: 66 | num += cls.wordToNumber(s[0]) * 1000 + cls.wordToNumber(s[1]) * 100 67 | target = pattern.sub(str(num), target, 1) 68 | 69 | pattern = re.compile(u"[一二两三四五六七八九123456789]百[一二两三四五六七八九123456789](?!十)") 70 | match = pattern.finditer(target) 71 | for m in match: 72 | group = m.group() 73 | s = group.split(u"百") 74 | s = list(filter(None, s)) 75 | num = 0 76 | if len(s) == 2: 77 | num += cls.wordToNumber(s[0]) * 100 + cls.wordToNumber(s[1]) * 10 78 | target = pattern.sub(str(num), target, 1) 79 | 80 | pattern = re.compile(u"[零一二两三四五六七八九]") 81 | match = pattern.finditer(target) 82 | for m in match: 83 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 84 | 85 | # pattern = re.compile(u"(?<=(周|星期))[末天日]") 86 | pattern = re.compile(u"((?<=周)[末天日])|((?<=星期)[末天日])") 87 | match = pattern.finditer(target) 88 | for m in match: 89 | target = pattern.sub(str(cls.wordToNumber(m.group())), target, 1) 90 | 91 | # pattern = re.compile(u"(? 
202 | """ 203 | res = sph.numberTranslator(target=sentence) 204 | find_list = [] 205 | for i in re.finditer('(\d+(\.\d+)?)', res): 206 | find_list.append(i.group()) 207 | return find_list 208 | 209 | 210 | if __name__ == '__main__': 211 | sen = "1000.一加1等于几,周末和星期天,星期一星期二" 212 | res = extract_number(sen) 213 | print(res) 214 | -------------------------------------------------------------------------------- /macropodus/tookit/calculator_sihui/calcultor_formula.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/21 23:38 4 | # @author :Mo 5 | # @function :calcultor of text, not filter and redundancy 6 | 7 | 8 | from macropodus.conf.path_log import get_logger_root 9 | import re 10 | 11 | 12 | logger = get_logger_root() 13 | 14 | 15 | def change_symbol(formula): 16 | """ 17 | 提取负号 18 | eg:-9-2-5-2*3-5/3-40*4/-1.0/5+6*3 ===> -(9+2+5+2*3+5/3+40*4/1.0/5-6*3) 19 | :param formula: 20 | :return: 21 | """ 22 | def primary_change(for_str): # 把算式中的全角 + - 对应换成 - + 23 | temp = for_str.split("+") 24 | new_formula = [] 25 | for value in temp: 26 | value = value.replace("-", "+") 27 | new_formula.append(value) 28 | return "-".join(new_formula) 29 | 30 | if formula.startswith("-"): 31 | formula = formula.replace("-", "", 1) 32 | formula = primary_change(formula) 33 | formula = formula.join(["-(", ")"]) 34 | elif formula.startswith("+"): 35 | formula = primary_change(formula) 36 | formula = formula.join(["-(", ")"]) 37 | else: 38 | formula = primary_change(formula) 39 | formula = formula.join(["-(-", ")"]) 40 | return formula 41 | 42 | 43 | def remove_repeat(formula): 44 | """ 45 | 去掉连续的重复的运算符 46 | :param formula: str, like: "1++2" 47 | :return: str, like:"1+2" 48 | """ 49 | temp = formula.replace("++", "+") 50 | temp = temp.replace("+-", "-") 51 | temp = temp.replace("-+", "-") 52 | temp = temp.replace("--", "+") 53 | temp = temp.replace("*+", "*") 54 | temp = temp.replace("+*", "*") 55 | temp = temp.replace("/+", "/") 56 | temp = temp.replace("+/", "/") 57 | return temp 58 | 59 | 60 | def has_special_operator(formula, special_operator): 61 | """ 62 | 判断是否有 *+ +- /- 之类的运算符 63 | :param formula: 64 | :param special_operator: 65 | :return: 66 | """ 67 | for operator in special_operator: 68 | if formula.find(operator) != -1: 69 | return operator 70 | return "" 71 | 72 | 73 | def handle_special_operator(formula, operator): 74 | """ 75 | 如果有 "*-", "-*", "/-", "-/" 这些运算符, 76 | 提取负号,去掉重复的运算符 77 | :param formula: 78 | :param operator: 79 | :return: 80 | """ 81 | temp = "" 82 | regex = "\d*[.]?\d+" 83 | opera = operator.replace("*", "[*]") 84 | ret = re.compile(opera.join([regex, regex])) 85 | while ret.search(formula): 86 | search_res = ret.search(formula).group() 87 | if operator.find("*") != -1: 88 | temp = search_res.replace(operator, "*") 89 | elif operator.find("/") != -1: 90 | temp = search_res.replace(operator, "/") 91 | temp = "-".join(["", temp]) 92 | formula = formula.replace(search_res, temp, 1) 93 | return formula 94 | 95 | 96 | def has_parentheses(formula): 97 | """ 98 | 判断是否还有括号 99 | :param formula: str 100 | :return: boolean 101 | """ 102 | if re.search("[()]", formula): 103 | return True 104 | return False 105 | 106 | 107 | def judge_illegal(formula): 108 | """ 109 | 判断括号是否匹配完全,运算符是否合法 110 | 没有考虑 ** // 的计算 111 | :param formula: str 112 | :return: str 113 | """ 114 | if len(re.findall("[(]", formula)) != len(re.findall("[)]", formula)): 115 | return True 116 | if formula.startswith("*") or 
formula.startswith("/"): 117 | return True 118 | if has_special_operator(formula, ["*/", "/*", "**", "//"]): 119 | return True 120 | return False 121 | 122 | 123 | def calculator_formula(formula): 124 | """ 125 | 计算算式,这里计算的是不带括号的算式 126 | 计算次序是 / * - + 127 | 计算过程中出现括号则停止计算,返回当前的算式 128 | :param formula: 129 | :return: 130 | """ 131 | def primary_operator(for_str, operation): 132 | try: 133 | primary_result = 0 134 | regex = "\d*[.]?\d*" 135 | ret = re.compile(operation.join(["[", "]"]).join([regex, regex])) 136 | while ret.search(for_str): 137 | ret_opera = has_special_operator(for_str, ["*-", "-*", "/-", "-/"]) 138 | while ret_opera: 139 | for_str = handle_special_operator(for_str, ret_opera) 140 | ret_opera = has_special_operator(for_str, ["*-", "-*", "/-", "-/"]) 141 | while has_special_operator(for_str, ["+-", "-+", "++", "--", "+*", "*+", "+/", "/+"]): 142 | for_str = remove_repeat(for_str) 143 | # print("primary_operator:", for_str) 144 | if has_parentheses(for_str): 145 | return for_str 146 | if for_str.startswith("-"): 147 | temp = re.findall("^-\d*[.]?\d*$", for_str) 148 | if temp: 149 | return temp[0] 150 | return change_symbol(for_str) 151 | if for_str.startswith("+"): 152 | for_str = for_str.replace("+", "", 1) 153 | if not ret.search(for_str): 154 | continue 155 | search_res = ret.search(for_str).group() 156 | operand_list = search_res.split(operation) 157 | if operation == "/": 158 | primary_result = float(operand_list[0]) / float(operand_list[1]) 159 | elif operation == "*": 160 | primary_result = float(operand_list[0]) * float(operand_list[1]) 161 | elif operation == "-": 162 | primary_result = float(operand_list[0]) - float(operand_list[1]) 163 | elif operation == "+": 164 | primary_result = float(operand_list[0]) + float(operand_list[1]) 165 | for_str = for_str.replace(search_res, '%f' % (primary_result), 1) 166 | return for_str 167 | except Exception as e: 168 | logger.info(str(e)) 169 | return None 170 | try: 171 | formula = primary_operator(formula, "/") 172 | formula = primary_operator(formula, "*") 173 | formula = primary_operator(formula, "-") 174 | formula = primary_operator(formula, "+") 175 | except Exception as e: 176 | logger.info(str(e)) 177 | return None 178 | return formula 179 | 180 | 181 | def remove_parentheses(formula): 182 | """ 183 | 去掉算式的括号,计算括号里算式 184 | :param formula: 185 | :return: 186 | """ 187 | parentheses = re.compile("\([^()]+\)") 188 | while parentheses.search(formula): 189 | search_res = parentheses.search(formula).group() 190 | for_str = re.sub("[()]", "", search_res) 191 | if judge_illegal(for_str): 192 | return "" 193 | for_str = calculator_formula(for_str) 194 | formula = formula.replace(search_res, for_str, 1) 195 | """ 196 | 会有去掉所有括号算式还没算完的情况 197 | eg:1-2*65 198 | 需要再计算一遍算式 199 | """ 200 | formula = calculator_formula(formula) 201 | return formula 202 | 203 | 204 | def result_formula(formula): 205 | """ 206 | 简单计算器, 纯粹四则运算 207 | 去完括号后额外计算的那一次若再次出现括号, 208 | 则重复去括号运算,直至再没有括号 209 | :param formula: str 210 | :return: str 211 | """ 212 | 213 | def remove_space(formula): 214 | """ 215 | 去掉算式的空格 216 | :param formula: str 217 | :return: str 218 | """ 219 | return formula.replace(" ", "") 220 | 221 | def first_calculator(for_str): 222 | """ 223 | 先计算括号里边的 224 | :param for_str: 225 | :return: 226 | """ 227 | if judge_illegal(for_str): 228 | return None 229 | return remove_parentheses(for_str) 230 | 231 | formula = remove_space(formula) 232 | 233 | formula = first_calculator(formula) 234 | if not formula: 235 | return None 236 | while 
has_parentheses(formula): 237 | formula = first_calculator(formula) 238 | # print("calculator_result:", formula) 239 | if not formula: 240 | return None 241 | return formula 242 | 243 | 244 | if __name__ == '__main__': 245 | cal = result_formula("1+1+2+3*(35+1-5*7-10/5)/2*2") 246 | print(cal) 247 | -------------------------------------------------------------------------------- /macropodus/summarize/feature_base/text_teaser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/26 20:02 4 | # @author :Mo 5 | # @function :text summary of feature-base of TextTeaser 6 | # @paper :Automatic Text Summarization for Indonesian Language Using TextTeaser(2013) 7 | # @url :using Google Scholar 8 | 9 | 10 | from macropodus.data.words_common.stop_words import stop_words 11 | from macropodus.preprocess.tools_ml import extract_chinese 12 | from macropodus.preprocess.tools_ml import macropodus_cut 13 | from macropodus.preprocess.tools_ml import cut_sentence 14 | from collections import Counter 15 | 16 | 17 | class TextTeaserSum: 18 | def __init__(self): 19 | self.algorithm = 'text_teaser' 20 | self.stop_words = stop_words.values() 21 | self.len_ideal = 18 # 中心句子长度, 默认 22 | 23 | def score_position(self): 24 | """ 25 | 文本句子位置得分 26 | :param sentence: 27 | :return: 28 | """ 29 | score_position = [] 30 | for i, sen in enumerate(self.sentences): 31 | score_standard = i / (len(self.sentences)) 32 | if score_standard >= 0 and score_standard <= 0.1: 33 | score_position.append(0.17) 34 | elif score_standard > 0.1 and score_standard <= 0.2: 35 | score_position.append(0.23) 36 | elif score_standard > 0.2 and score_standard <= 0.3: 37 | score_position.append(0.14) 38 | elif score_standard > 0.3 and score_standard <= 0.4: 39 | score_position.append(0.08) 40 | elif score_standard > 0.4 and score_standard <= 0.5: 41 | score_position.append(0.05) 42 | elif score_standard > 0.5 and score_standard <= 0.6: 43 | score_position.append(0.04) 44 | elif score_standard > 0.6 and score_standard <= 0.7: 45 | score_position.append(0.06) 46 | elif score_standard > 0.7 and score_standard <= 0.8: 47 | score_position.append(0.04) 48 | elif score_standard > 0.8 and score_standard <= 0.9: 49 | score_position.append(0.04) 50 | elif score_standard > 0.9 and score_standard <= 1.0: 51 | score_position.append(0.15) 52 | else: 53 | score_position.append(0) 54 | return score_position 55 | 56 | def score_length(self, sentence): 57 | """ 58 | 文本长度得分 59 | :param sentence: 60 | :return: 61 | """ 62 | score_length = 1 - min(abs(self.len_ideal - len(sentence)), self.len_ideal) / self.len_ideal 63 | return score_length 64 | 65 | def score_sbs(self, words): 66 | """ 67 | 单个句子的sbs分数 68 | :param words: 69 | :return: 70 | """ 71 | score_sbs = 0.0 72 | for word in words: 73 | if word in self.word_freqs: 74 | score_sbs += self.word_freqs[word] 75 | return ((1.0 / abs(len(words))) if len(words) else 1e-9) * score_sbs 76 | 77 | def score_dbs(self, words): 78 | """ 79 | 单个句子的dbs分数 80 | :param words: 81 | :return: 82 | """ 83 | words_all = list(self.word_freqs.keys()) 84 | pun = len(set(words)&set(words_all)) + 1 85 | score_dbs = 0.0 86 | wf_first = [] 87 | for i, word in enumerate(words): 88 | if word in words_all: 89 | index = words_all.index(word) 90 | if not wf_first: 91 | wf_first = [index, self.word_freqs[word]] 92 | else: 93 | score_dbs += wf_first[1]*self.word_freqs[word] / (((wf_first[0] - index) if (wf_first[0] - index)!=0 else self.len_words)**2) 94 | 
score_dbs = score_dbs if score_dbs !=0 else 1e-9 95 | return (1.0 / pun * (pun + 1.0)) * score_dbs 96 | 97 | def score_title(self, words): 98 | """ 99 | 与标题重合部分词语 100 | :param words: 101 | :return: 102 | """ 103 | mix_word = [word for word in words if word in self.title] 104 | len_mix_word = len(mix_word) 105 | len_title_word = len(self.title) 106 | return (len_mix_word + 1.0) / (len_mix_word + 2.0) / len_title_word 107 | 108 | def summarize(self, text, num=320, title=None): 109 | # 切句 110 | if type(text) == str: 111 | self.sentences = cut_sentence(text) 112 | elif type(text) == list: 113 | self.sentences = text 114 | else: 115 | raise RuntimeError("text type must be list or str") 116 | self.title = title 117 | if self.title: 118 | self.title = macropodus_cut(title) 119 | # 切词 120 | sentences_cut = [[word for word in macropodus_cut(extract_chinese(sentence)) 121 | if word.strip()] for sentence in self.sentences] 122 | # 去除停用词等 123 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 124 | # 词频统计 125 | self.words = [] 126 | for sen in self.sentences_cut: 127 | self.words = self.words + sen 128 | self.word_count = dict(Counter(self.words)) 129 | # word_count_rank = sorted(word_count.items(), key=lambda f:f[1], reverse=True) 130 | # self.word_freqs = [{'word':wcr[0], 'freq':wcr[1]} for wcr in word_count_rank] 131 | # 按频次计算词语的得分, 得到self.word_freq=[{'word':, 'freq':, 'score':}] 132 | self.word_freqs = {} 133 | self.len_words = len(self.words) 134 | for k, v in self.word_count.items(): 135 | self.word_freqs[k] = v * 0.5 / self.len_words 136 | # 句子位置打分 137 | scores_posi = self.score_position() 138 | res_rank = {} 139 | self.res_score = [] 140 | for i in range(len(sentences_cut)): 141 | sen = self.sentences[i] # 句子 142 | sen_cut = self.sentences_cut[i] # 句子中的词语 143 | score_sbs = self.score_sbs(sen_cut) # 句子中的词语打分1 144 | score_dbs = self.score_dbs(sen_cut) # 句子中的词语打分2 145 | score_word = (score_sbs + score_dbs) * 10.0 / 2.0 # 句子中的词语打分mix 146 | score_length = self.score_length(sen) # 句子文本长度打分 147 | score_posi = scores_posi[i] 148 | if self.title: # 有标题的文本打分合并 149 | score_title = self.score_title(sen_cut) 150 | score_total = (score_title * 0.5 + score_word * 2.0 + score_length * 0.5 + score_posi * 1.0) / 4.0 151 | # 可查阅各部分得分统计 152 | self.res_score.append(["score_total", "score_sbs", "score_dbs", "score_word", "score_length", "score_posi", "score_title", "sentences"]) 153 | self.res_score.append([score_total, score_sbs, score_dbs, score_word, score_length, score_posi, score_title, self.sentences[i]]) 154 | else: # 无标题的文本打分合并 155 | score_total = (score_word * 2.0 + score_length * 0.5 + score_posi * 1.0) / 3.5 156 | self.res_score.append(["score_total", "score_sbs", "score_dbs", "score_word", "score_length", "score_posi", "sentences"]) 157 | self.res_score.append([score_total, score_sbs, score_dbs, score_word, score_length, score_posi, self.sentences[i].strip()]) 158 | res_rank[self.sentences[i].strip()] = score_total 159 | # 最小句子数 160 | num_min = min(num, int(len(self.word_count) * 0.6)) 161 | score_sen = [(rc[1], rc[0]) for rc in sorted(res_rank.items(), key=lambda d: d[1], reverse=True)][0:num_min] 162 | return score_sen 163 | 164 | 165 | if __name__ == '__main__': 166 | doc1 = "PageRank算法简介。" \ 167 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 168 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 169 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 170 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 171 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 172 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 173 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 174 | "和投票目标的等级来决定新的等级。简单的说, " \ 175 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 176 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 177 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 178 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 179 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 180 | title = "方直科技等公司合伙设立教育投资基金" 181 | doc = "多知网5月26日消息,今日,方直科技发公告,拟用自有资金人民币1.2亿元," \ 182 | "与深圳嘉道谷投资管理有限公司、深圳嘉道功程股权投资基金(有限合伙)共同发起设立嘉道方直教育产业投资基金(暂定名)。" \ 183 | "该基金认缴出资总规模为人民币3.01亿元。" \ 184 | "基金的出资方式具体如下:出资进度方面,基金合伙人的出资应于基金成立之日起四年内分四期缴足,每期缴付7525万元;" \ 185 | "各基金合伙人每期按其出资比例缴付。合伙期限为11年,投资目标为教育领域初创期或成长期企业。" \ 186 | "截止公告披露日,深圳嘉道谷投资管理有限公司股权结构如下:截止公告披露日,深圳嘉道功程股权投资基金产权结构如下:" \ 187 | "公告还披露,方直科技将探索在中小学教育、在线教育、非学历教育、学前教育、留学咨询等教育行业其他分支领域的投资。" \ 188 | "方直科技2016年营业收入9691万元,营业利润1432万元,归属于普通股股东的净利润1847万元。(多知网 黎珊)}}" 189 | tt = TextTeaserSum() 190 | res_ = tt.summarize(doc) 191 | for res in res_: 192 | print(res) 193 | gg = 0 194 | -------------------------------------------------------------------------------- /macropodus/tookit/calculator_sihui/calcultor_function.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/11/21 23:36 4 | # @author :Mo 5 | # @function :function of some basic Extraction Scientific Computing 6 | 7 | 8 | from macropodus.tookit.calculator_sihui.calcultor_number import extract_number 9 | from macropodus.conf.path_log import get_logger_root 10 | import math 11 | import re 12 | 13 | 14 | logger = get_logger_root() 15 | 16 | 17 | def rackets_replace(rackets_char, myformula): 18 | """ 19 | 将2(3换成2*(3, 3)4换成3)*4 20 | :param rackets_char: 21 | :param myformula: 22 | :return: 23 | """ 24 | if rackets_char in myformula: # "("在算式里边 25 | if rackets_char =="(": 26 | rackets_re = r'\(' 27 | else: 28 | rackets_re = r'\)' 29 | pos_rackets = re.finditer(rackets_re, myformula) 30 | count = 0 31 | for pos in pos_rackets: 32 | pos_single = pos.start() + count 33 | if pos_single != 0 and rackets_char =="(": 34 | if myformula[pos_single-1] in '零一二两三四五六七八九0123456789百十千万亿': 35 | myformula = myformula[:pos_single] + "*" + myformula[pos_single:] 36 | count += 1 37 | if pos_single != len(myformula)-1 and rackets_char ==")": 38 | if myformula[pos_single+1] in '零一二两三四五六七八九0123456789百十千万亿': 39 | myformula = myformula[:pos_single+1] + "*" + myformula[pos_single+1:] 40 | count += 1 41 | return myformula 42 | else: 43 | return myformula 44 | 45 | 46 | 47 | def reagan(words, wordsminus): 48 | """ 49 | 求平方根,立方根,n次方根 50 | :param words: str, 原句 51 | :param wordsminus:str , 处理后的句子 52 | :return: 53 | """ 54 | try: 55 | if '根号' in words: 56 | reagan = wordsminus.replace("开", "").replace("根号", "").replace("的", "") 57 | radicalaa = float(extract_number(reagan)[0]) 58 | if radicalaa < 0.0: 59 | return 'illegal math' 60 | radicalbb = math.sqrt(radicalaa) 61 | results = str(radicalbb) 62 | elif "平方根" in words: 63 | reagan = wordsminus.replace("开", "").replace("平方根", "").replace("平方", "").replace("的", "") 64 | reagan = extract_number(reagan)[0] 65 | squarerootaa = float(reagan) 66 | if squarerootaa < 0.0: 67 | return 'illegal math' 68 | squarerootbb = math.sqrt(squarerootaa) 69 | results = str(squarerootbb) 70 | elif "立方根" in words: 71 | reagan = wordsminus.replace("开", "").replace("立方根", "").replace("立方", "").replace("的", "") 72 | reagan = 
extract_number(reagan)[0] 73 | squarerootaa = float(reagan) 74 | squarerootbb = math.pow(squarerootaa, 1.0 / 3) 75 | results = str(squarerootbb) 76 | elif "次方根" in words: 77 | reagan = wordsminus.replace("开", "").replace("次方根", "").replace("次方", "") 78 | squareroot = reagan.split("的") 79 | squarerootaa = float(extract_number(squareroot[0])[0]) 80 | squarerootbb = float(extract_number(squareroot[1])[0]) 81 | if squarerootaa % 2 == 0 and squarerootbb < 0.0: 82 | return 'illegal math' 83 | squarerootcc = math.pow(squarerootaa, 1.0 / squarerootbb) 84 | results = str(squarerootcc) 85 | else: 86 | results = words 87 | return results 88 | except Exception as e: 89 | logger.info(str(e)) 90 | return words 91 | 92 | 93 | def power(words, wordsminus): 94 | """ 95 | 求指数,求平方 96 | :param words: 97 | :param wordsminus: 98 | :return: 99 | """ 100 | try: 101 | if "平方根" not in words and "平方" in words: 102 | reagan = wordsminus.replace("平方", "").replace("开", "").replace("的", "") 103 | reagan = extract_number(reagan)[0] 104 | square = float(reagan) 105 | radicalbb = math.pow(square, 2) 106 | results = str(radicalbb) 107 | elif "立方根" not in words and "立方" in words: 108 | reagan = wordsminus.replace("立方", "").replace("开", "").replace("的", "") 109 | reagan = extract_number(reagan)[0] 110 | square = float(reagan) 111 | radicalbb = math.pow(square, 3) 112 | results = str(radicalbb) 113 | elif (("次方" in words or "次幂" in words) and "次方根" not in words and "次幂根" not in words): 114 | reagan = wordsminus.replace("次方", "").replace("开", "").replace("次幂", "") 115 | squareroot = reagan.split("的") 116 | squarerootaa = float(extract_number(squareroot[0])[0]) 117 | squarerootbb = float(extract_number(squareroot[1])[0]) 118 | squarerootcc = math.pow(squarerootaa, squarerootbb) 119 | results = str(squarerootcc) 120 | else: 121 | results = words 122 | return results 123 | except Exception as e: 124 | logger.info(str(e)) 125 | return words 126 | 127 | 128 | def logarithm(words, wordsminus): 129 | """ 130 | 求对数 131 | :param words: 132 | :param wordsminus: 133 | :return: 134 | """ 135 | try: 136 | if "LG" in words or "LOG" in words: 137 | Lg = wordsminus.replace("LOG", "").replace("LG", "").replace(" ", "").replace("的", "") 138 | Lg = float(extract_number(Lg)[0]) 139 | if Lg <= 0.0: 140 | return 'illegal math' 141 | lgbb = math.log(Lg) 142 | results = str(lgbb) 143 | elif "对数" in words: 144 | Logg = wordsminus.replace("以", "").replace("对数", "").replace("的对数", "").replace(" ", "").replace("的", "") 145 | root = Logg.split("为底") 146 | rootaa = float(extract_number(root[0])[0]) 147 | rootbb = float(extract_number(root[1])[0]) 148 | if rootaa <= 0.0 or rootbb <= 0.0: 149 | return 'illegal math' 150 | rootcc = math.log(rootbb) / math.log(rootaa) 151 | results = str(rootcc) 152 | else: 153 | results = words 154 | return results 155 | except Exception as e: 156 | logger.info(str(e)) 157 | return words 158 | 159 | 160 | def fraction(words, wordsminus): 161 | """ 162 | 求分数 163 | :param words: 164 | :param wordsminus: 165 | :return: 166 | """ 167 | try: 168 | if "fenzhi" in words: 169 | fenzhi = wordsminus.replace("fenzhi", "/").replace(" ", "").replace("的", "") 170 | root = fenzhi.split("/") 171 | rootaa = float(extract_number(root[0])[0]) 172 | rootbb = float(extract_number(root[1])[0]) 173 | rootcc = rootbb / rootaa 174 | results = str(rootcc) 175 | else: 176 | results = words 177 | return results 178 | except Exception as e: 179 | logger.info(str(e)) 180 | return words 181 | 182 | 183 | def fractiontwo(words, wordsminus): 184 | """ 185 | 
取分数 186 | :param words: 187 | :param wordsminus: 188 | :return: 189 | """ 190 | try: 191 | if "fenzhi" in words: 192 | fenzhi = wordsminus.replace("fenzhi", "/").replace(" ", "").replace("的", "") 193 | root = fenzhi.split("/") 194 | rootaa = float(extract_number(root[0])[0]) 195 | rootbb = float(extract_number(root[1])[0]) 196 | results = str(rootaa/rootbb) 197 | else: 198 | results = words 199 | return results 200 | except Exception as e: 201 | logger.info(str(e)) 202 | return words 203 | 204 | 205 | def factorial(words, wordsminus): 206 | """ 207 | 求阶乘 208 | :param words: 209 | :param wordsminus: 210 | :return: 211 | """ 212 | results = words 213 | try: 214 | if "jiecheng的" in words: 215 | factory = wordsminus.replace("jiecheng的", "").replace("的", "").replace(" ", "") 216 | fact = float(extract_number(factory)[0]) 217 | if fact <= 10000: 218 | results = str(math.factorial(fact)) 219 | else: 220 | results = words 221 | return results 222 | except Exception as e: 223 | logger.info(str(e)) 224 | return words 225 | 226 | 227 | if __name__ == '__main__': 228 | res = reagan("根号4", "根号4") 229 | print(res) 230 | res = reagan("27的3次方根是多少", "27的3次方根") 231 | print(res) 232 | res = power("9的平方", "9的平方") 233 | print(res) 234 | res = power("27的立方是几", "9的立方") 235 | print(res) 236 | res = power("3的3次方是几", "3的3次方实") 237 | print(res) 238 | res = logarithm("LG8", "LG8") 239 | print(res) 240 | res = logarithm("以2为底64的对数", "以2为底64的对数") 241 | print(res) 242 | res = fraction("1fenzhi6是多少", "1fenzhi6") 243 | print(res) 244 | res = factorial("10jiecheng的", "10jiecheng的") 245 | print(res) 246 | --------------------------------------------------------------------------------
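A minimal usage sketch tying the files above together. The import paths come from the dashed file headers in this dump and the calls mirror the `__main__` blocks of calcultor_formula.py, calcultor_number.py and textrank_word2vec.py; it assumes the macropodus package installs from this source tree and that the bundled character word2vec and stop-word data are present, and it is not taken from the library's own documentation.

# quick tour, assuming `macropodus` is importable and its data files are available
from macropodus.tookit.calculator_sihui.calcultor_formula import result_formula
from macropodus.tookit.calculator_sihui.calcultor_number import extract_number
from macropodus.summarize.graph_base.textrank_word2vec import TextrankWord2vec

# plain four-rule calculator: parentheses are reduced first, then / * - +, result returned as a string
print(result_formula("1+1+2+3*(35+1-5*7-10/5)/2*2"))

# pulls numbers out of mixed Chinese/Arabic text after numberTranslator rewrites the Chinese numerals
print(extract_number("1000.一加1等于几"))

# TextRank over word2vec similarities; constructing the class loads the SimW2vChar cache
trww = TextrankWord2vec()
text = "PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。Google把从A页面到B页面的链接解释为A页面给B页面投票。"
print(trww.summarize(text, num=2))   # list of (score, sentence) pairs ranked by pagerank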