├── .gitignore ├── AugmentText ├── README.md ├── __init__.py ├── augment_eda │ ├── __init__.py │ ├── enhance_eda.py │ ├── enhance_eda_v2.py │ └── enhance_word2vec.py ├── augment_keyword │ ├── __init__.py │ ├── ccks_news_2020.json │ ├── keyword_sim.py │ └── statistics_keyword.py ├── augment_marko │ ├── __init__.py │ └── enhance_marko.py ├── augment_nmt │ ├── README.md │ ├── __init__.py │ ├── nmt_local.py │ └── requestments.txt ├── augment_seq2seq │ ├── __init__.py │ ├── code_seq2seq_char │ │ ├── __init__.py │ │ ├── extract_char_webank.py │ │ ├── predict_char_anti.py │ │ └── train_char_anti.py │ ├── code_seq2seq_word │ │ ├── __init__.py │ │ ├── extract_webank.py │ │ ├── predict_word_anti.py │ │ └── train_word_anti.py │ ├── data_mid │ │ ├── char │ │ │ └── useless.txt │ │ └── word │ │ │ └── useless.txt │ └── model_seq2seq_tp │ │ ├── seq2seq_char_webank │ │ └── useless.txt │ │ └── seq2seq_word_webank │ │ └── useless.txt ├── augment_simbert │ ├── README.md │ ├── __init__.py │ ├── enhance_roformer.py │ ├── enhance_simbert.py │ ├── requestments.txt │ └── tet_keras.py ├── augment_syntax │ ├── __init__.py │ └── augment_mainpart.py └── augment_translate │ ├── __init__.py │ ├── translate_account │ ├── __init__.py │ └── translate_tencent_secret.py │ ├── translate_tools │ ├── __init__.py │ └── translate_translate.py │ └── translate_web │ ├── __init__.py │ └── translate_google.py ├── ChatBot ├── __init__.py ├── chatbot_generate │ ├── __init__.py │ └── seq2seq │ │ ├── __init__.py │ │ ├── code_seq2seq_char │ │ ├── __init__.py │ │ ├── extract_char_cg.py │ │ ├── predict_char_cg.py │ │ └── train_char_cg.py │ │ ├── code_seq2seq_word │ │ ├── __init__.py │ │ ├── extract_word_cg.py │ │ ├── pred_word_cg.py │ │ └── train_word_cg.py │ │ ├── data_mid │ │ ├── char │ │ │ └── useless.txt │ │ └── word │ │ │ └── useless.txt │ │ └── model_seq2seq_tp │ │ ├── seq2seq_char_cg │ │ └── useless.txt │ │ └── seq2seq_word_cg │ │ └── useless.txt └── chatbot_search │ ├── __init__.py │ ├── chatbot_bertwhite │ ├── README.md │ ├── __init__.py │ ├── bertWhiteConf.py │ ├── bertWhiteTools.py │ ├── bertWhiteTrain.py │ ├── chicken_and_gossip.txt │ ├── indexAnnoy.py │ ├── indexFaiss.py │ └── mmr.py │ ├── chatbot_fuzzy.py │ ├── chatbot_sentence_vec_by_bert.py │ ├── chatbot_sentence_vec_by_char.py │ ├── chatbot_sentence_vec_by_word.py │ └── chatbot_tfserving │ ├── README.md │ ├── TFServing_postprocess.py │ ├── TFServing_preprocess.py │ ├── TFServing_save.py │ ├── TFServing_tet_http.py │ ├── __init__.py │ ├── bertWhiteConf.py │ ├── bertWhiteTools.py │ ├── bertWhiteTrain.py │ ├── chicken_and_gossip.txt │ ├── indexAnnoy.py │ ├── indexFaiss.py │ └── mmr.py ├── ClassificationText ├── __init__.py └── bert │ ├── __init__.py │ ├── args.py │ ├── keras_bert_classify_bi_lstm.py │ ├── keras_bert_classify_text_cnn.py │ ├── keras_bert_embedding.py │ ├── keras_bert_layer.py │ ├── model_webank_tdt │ └── useless.txt │ └── readme.md ├── Data ├── chinese_L-12_H-768_A-12 │ └── useless.txt ├── chinese_vector │ ├── w2v_model_merge_short.vec │ └── w2v_model_wiki_char.vec ├── chinese_xlnet_mid_L-24_H-768_A-12 │ └── __init__.py ├── common_words │ └── stopwords.txt ├── corpus │ ├── chicken_and_gossip.txt │ ├── ner │ │ └── people_daily │ │ │ ├── people.dev │ │ │ ├── people.test │ │ │ └── people.train │ ├── sim_webank.csv │ └── webank │ │ ├── dev.csv │ │ ├── test.csv │ │ └── train.csv ├── sentence_vec_encode_char │ └── char ├── sentence_vec_encode_word │ └── word └── tf_idf │ └── tf_idf ├── FeatureProject ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── 
distance_text_or_vec.cpython-36.pyc │ └── distance_vec_TS_SS.cpython-36.pyc ├── bert │ ├── __init__.py │ ├── extract_keras_bert_feature.py │ ├── layers_keras.py │ ├── readme.md │ └── tet_bert_keras_sim.py ├── cut_td_idf.py ├── distance_text_or_vec.py ├── distance_vec_TS_SS.py ├── normalization_util.py ├── sentence_sim_feature.py └── xlnet │ ├── __init__.py │ ├── args.py │ ├── extract_keras_xlnet_feature.py │ ├── layers_keras.py │ └── tet_xlnet_keras_sim.py ├── LICENSE ├── Ner ├── __init__.py └── bert │ ├── __init__.py │ ├── args.py │ ├── keras_bert_embedding.py │ ├── keras_bert_layer.py │ ├── keras_bert_ner_bi_lstm.py │ ├── layer_crf_bojone.py │ └── models │ └── bilstm │ └── useless.txt ├── conf ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── path_config.cpython-36.pyc ├── augment_constant.py ├── feature_config.py ├── params.json └── path_config.py ├── python-version-time ├── readme.md ├── requestments.txt ├── result_test ├── __init__.py ├── result_augment_seq2seq_char.txt ├── result_augment_seq2seq_word.txt ├── result_augment_syntax.txt ├── result_chatbot_fuzzy.txt ├── result_chatbot_sentence_vec_by_char.txt ├── result_chatbot_sentence_vec_by_word.txt └── result_sentence_sim_feature.txt ├── setup.py └── utils ├── __init__.py ├── mode_util ├── __init__.py └── seq2seq │ ├── __init__.py │ ├── data_utils.py │ ├── model_seq2seq.py │ ├── thread_generator.py │ └── word_sequence.py ├── text_tools.py └── word2vec_vector.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /AugmentText/README.md: -------------------------------------------------------------------------------- 1 | # AugmentText 2 | 3 | # 概述 4 | - 相较于图像数据增强,文本数据增强,现在还是有很多问题的; 5 | - 往更严格的角度看,文本数据增强更像是同义句生成,但又不完全是,它是一个更大范围的概念; 6 | - 很多时候,需要文本数据增强,一个是常常遇到的数据不足,另一个就是数据不均衡。 7 | - 我的感觉是,文本数据增强的有效方法: 8 | - 一个是回译(翻译两次,例如中文到英文,然后英文再到中文), 9 | - 另外一个就是EDA(同义词替换、插入、交换和删除),插入和交换当时确实没有想到用 10 | 11 | 12 | ###github项目地址为### 13 | https://github.com/yongzhuo/nlp_xiaojiang/tree/master/AugmentText 14 | 15 | 16 | # 回译(相对靠谱) 17 | - 1.在线翻译工具(中文->[英、法、德、俄、西班牙、葡萄牙、日、韩、荷兰、阿拉伯]等语言) 18 | - 谷歌翻译(google),谷歌翻译不用说,应该是挺好的,语言支持最多,不过我暂时还不会翻墙注册账户 19 | - 百度翻译(baidu),百度翻译不用说,国内支持翻译语言最多的了(28种互译),而且最大方了,注册账户后每月有200万字符的流量,大约是2M吧,超出则49元人民币/百万字符 20 | - 有道翻译(youdao),初始接触网络的时候我最喜欢用有道翻译了,但死贵,只有100元体验金,差评。才支持11种语言,48元/百万字符 21 | - 搜狗翻译(sougou),对于搜狗印象还行吧,毕竟是能做搜索引擎的公司嘛。78种语言,200元体验金,常见语言40元/百万字符,非常见语言60元/百万字符 22 | - 腾讯翻译(tencent),总觉得腾讯AI是后知后觉了,公司调用腾讯接口老是变来变去的,这次也是被它的sign加密给恶心到了,空格改为+。或许对企鹅而言,人工智能不那么重要吧。 23 | -有两个,一个是翻译君一个是AIlab什么的,支持的语言少些。似乎还在开发中,不限额不保证并发,php开发没有python的demo 24 | - 必应翻译(bing),微软的东西,你懂的,没有尝试,直接在网页上试试还可以吧 25 | - 可以采用工具、模拟访问网页、或者是注册账号等 26 | - 2.离线翻译工具 27 | - 1.自己写,收集些语料,seq2seq,nmt,transformer 28 | - 2.小牛翻译,比较古老的版本了,win10或者linux都可以,不过只有训练好的中英互译 29 | 地址:http://www.niutrans.com/index.html 30 | 31 | # 同义词替换(还行) 32 | - 1.eda(其实就是同义词替换、插入、交换和删除) 论文《Easy data augmentation techniques for boosting performance on text classification tasks》 33 | - 中文实现的demo,github项目zhanlaoban/eda_nlp_for_Chinese,地址:https://github.com/zhanlaoban/eda_nlp_for_Chinese 34 | - 2.word2vec、词典同义词替换 35 | - 不同于1中使用synonyms工具查找同义词,可以使用gensim的词向量,找出某个词最相似的词作为同意词。 36 | - 还可以使用同义词典机械查找,词典可用fighting41love/funNLP,github地址:https://github.com/fighting41love/funNLP/tree/master/data/ 37 | 38 | # 句法、句子扩充、句子缩写(比较困难、) 39 | - 1.句子缩写,查找句子主谓宾等 40 | - 有个java的项目,调用斯坦福分词工具(不爱用),查找主谓宾的 41 | - 地址为:(主谓宾提取器)https://github.com/hankcs/MainPartExtractor 42 | - 2.句子扩写 todo 43 | - 3.句法 todo 44 | 45 | # HMM-marko(质量较差) 46 | - HMM生成句子原理: 根据语料构建状态转移矩阵,jieba等提取关键词开头,生成句子 47 | - 参考项目:https://github.com/takeToDreamLand/SentenceGenerate_byMarkov 48 | 49 | # 深度学习方法 todo 50 | - seq2seq 51 | - bert 52 | - transformer 53 | - GAN 54 | 55 | ## 预训练模型-UMILM 56 | 使用BERT(UNILM)的生成能力, 即BERT的NSP句对任务 57 | - simbert(bert + unilm + adv): 
[https://github.com/ZhuiyiTechnology/simbert](https://github.com/ZhuiyiTechnology/simbert) 58 | - simbert: [鱼与熊掌兼得:融合检索和生成的SimBERT模型](https://spaces.ac.cn/archives/7427) 59 | - roformer-sim: [https://github.com/ZhuiyiTechnology/roformer-sim](https://github.com/ZhuiyiTechnology/roformer-sim) 60 | - simbert-v2(roformer + unilm + adv + bart + distill): [SimBERTv2来了!融合检索和生成的RoFormer-Sim模型](https://spaces.ac.cn/archives/8454) 61 | 62 | ## 回译(开源模型效果不是很好) 63 | 中文转化成其他语言(如英语), 其他语言(如英语)转化成中文, Helsinki-NLP开源的预训练模型 64 | - opus-mt-en-zh: https://huggingface.co/Helsinki-NLP/opus-mt-en-zh 65 | - opus-mt-zh-en: https://huggingface.co/Helsinki-NLP/opus-mt-zh-en 66 | 67 | 68 | # 参考/感谢 69 | * eda_chinese:[https://github.com/zhanlaoban/eda_nlp_for_Chinese](https://github.com/zhanlaoban/eda_nlp_for_Chinese) 70 | * 主谓宾提取器:[https://github.com/hankcs/MainPartExtractor](https://github.com/hankcs/MainPartExtractor) 71 | * HMM生成句子:[https://github.com/takeToDreamLand/SentenceGenerate_byMarkov](https://github.com/takeToDreamLand/SentenceGenerate_byMarkov) 72 | * 同义词等:[https://github.com/fighting41love/funNLP/tree/master/data/](https://github.com/fighting41love/funNLP/tree/master/data/) 73 | * 小牛翻译:[http://www.niutrans.com/index.html](http://www.niutrans.com/index.html) 74 | -------------------------------------------------------------------------------- /AugmentText/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 19:44 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_eda/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 21:14 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_eda/enhance_eda_v2.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/4/15 14:54 4 | # @author : Mo 5 | # @function: EDA 6 | 7 | 8 | # import macropodus 9 | import synonyms 10 | import random 11 | import jieba 12 | 13 | 14 | KEY_WORDS = ["macropodus"] # 不替换同义词的词语 15 | ENGLISH = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 16 | 17 | 18 | def is_english(text): 19 | """ 20 | 是否全是英文 21 | :param text: str, like "你是谁" 22 | :return: boolean, True or False 23 | """ 24 | try: 25 | text_r = text.replace(" ", "").strip() 26 | for tr in text_r: 27 | if tr in ENGLISH: 28 | continue 29 | else: 30 | return False 31 | except Exception as e: 32 | return False 33 | 34 | 35 | def is_number(text): 36 | """ 37 | 判断一个是否全是阿拉伯数字 38 | :param text: str, like "1001" 39 | :return: boolean, True or False 40 | """ 41 | try: 42 | text_r = text.replace(" ", "").strip() 43 | for tr in text_r: 44 | if tr.isdigit(): 45 | continue 46 | else: 47 | return False 48 | except Exception as e: 49 | return False 50 | 51 | 52 | def get_syn_word(word): 53 | """ 54 | 获取同义词 55 | :param word: str, like "学生" 56 | :return: str, like "学生仔" 57 | """ 58 | if not is_number(word.strip()) or not is_english(word.strip()): 59 | word_syn = synonyms.nearby(word) 60 | word_syn = word_syn[0] if len(word_syn[0]) else [word] 61 | return word_syn 62 | else: 63 | return [word] 64 | 65 | 66 | def syn_replace(words, n=1): 67 | """ 68 | 同义词替换 69 | :param words: list, like ["macropodus", "是", "谁"] 
70 | :param n: int, like 128 71 | :return: list, like ["macropodus", "是不是", "哪个"] 72 | """ 73 | words_set = list(set(words)) # 乱序, 选择 74 | random.shuffle(words_set) 75 | count = 0 76 | for ws in words_set: 77 | if ws in KEY_WORDS or is_english(ws) or is_number(ws): 78 | continue # 关键词/英文/阿拉伯数字不替换 79 | need_words = get_syn_word(ws) # 获取同义词(可能有多个) 80 | if need_words: # 如果有同义词则替换 81 | need_words = random.choice(need_words) 82 | words = [need_words if w==ws else w for w in words] 83 | count += 1 84 | if count >= n: 85 | break 86 | return words 87 | 88 | 89 | def syn_insert(words, n=1, use_syn=True): 90 | """ 91 | 同义词替换 92 | :param words: list, like ["macropodus", "是", "谁"] 93 | :param n: int, like 32 94 | :return: list, like ["macropodus", "是不是", "哪个"] 95 | """ 96 | words_set = list(set(words)) # 乱序, 选择 97 | random.shuffle(words_set) 98 | count = 0 99 | for ws in words_set: 100 | if ws in KEY_WORDS or is_english(ws) or is_number(ws): 101 | continue # 关键词/英文/阿拉伯数字不替换 102 | if use_syn: 103 | need_words = get_syn_word(ws) # 获取同义词(可能有多个) 104 | else: 105 | need_words = [ws] 106 | if need_words: # 如果有同义词则替换 107 | random_idx = random.randint(0, len(words) - 1) 108 | words.insert(random_idx, (need_words[0])) 109 | count += 1 110 | if count >= n: 111 | break 112 | return words 113 | 114 | 115 | def word_swap(words, n=1): 116 | """ 117 | 随机交换,随机交换两个词语 118 | :param words: list, like ["macropodus", "是", "谁"] 119 | :param n: int, like 2 120 | :return: list, like ["macropodus", "谁", "是"] 121 | """ 122 | idxs = [i for i in range(len(words))] 123 | count = 0 124 | while count < n: 125 | idx_select = random.sample(idxs, 2) 126 | temp = words[idx_select[0]] 127 | words[idx_select[0]] = words[idx_select[1]] 128 | words[idx_select[1]] = temp 129 | count += 1 130 | return words 131 | 132 | 133 | def word_delete(words, n=1): 134 | """ 135 | 随机删除N个词语 136 | :param words: list, like ["macropodus", "是", "谁"] 137 | :param n: int, like 1 138 | :return: list, like ["macropodus", "谁"] 139 | """ 140 | count = 0 141 | while count < n: 142 | word_choice = random.choice(words) 143 | if word_choice not in KEY_WORDS: 144 | words.remove(word_choice) 145 | count += 1 146 | return words 147 | 148 | 149 | def word_cut(text, tool="macropodus"): 150 | """ 151 | 切词工具 152 | :param text:str, like "macropodus是谁" 153 | :param tool: str, "macropodus" or "jieba" 154 | :return: list, like ["macropodus", "是", "谁"] 155 | """ 156 | if tool=="macropodus": 157 | text_cut = list(macropodus.cut(text)) 158 | elif tool=="jieba": 159 | text_cut = list(jieba.cut(text)) 160 | else: 161 | text_cut = list(jieba.cut(text)) 162 | return text_cut 163 | 164 | 165 | def eda(text, n=1, use_syn=True): 166 | """ 167 | EDA, 每种方法进一位 168 | :param text: str, like "macropodus是谁" 169 | :param n: int, like 1 170 | :param use_syn: Boolean, True or False 171 | :return: list, like ["macropodus是谁呀", "macropodus是"] 172 | """ 173 | sens = word_cut(text, tool="jieba") 174 | # print(sens) 175 | sr = syn_replace(sens.copy(), n=n) 176 | si = syn_insert(sens.copy(), n=n, use_syn=use_syn) 177 | ws = word_swap(sens.copy(), n=n) 178 | wd = word_delete(sens.copy(), n=n) 179 | sens_word_4 = [sr, si, ws, wd] 180 | # print(sens_word_4) 181 | sens_4 = ["".join(s4) for s4 in sens_word_4] 182 | return sens_4 183 | 184 | 185 | if __name__ == "__main__": 186 | sens = "".join(["macropodus", "是不是", "哪个", "啦啦", 187 | "只需做好这四点,就能让你养的天竺葵全年花开不断!"]) 188 | print(eda(sens)) 189 | 190 | 191 | sens = list(sens) 192 | res1 = syn_replace(sens, n=1) 193 | print(res1) 194 | res2 = syn_insert(sens.copy(), 
n=1, use_syn=True) 195 | print(res2) 196 | res3 = word_swap(sens.copy(), n=1) 197 | print(res3) 198 | res4 = word_delete(sens.copy(), n=1) 199 | print(res4) 200 | 201 | 202 | -------------------------------------------------------------------------------- /AugmentText/augment_keyword/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/10/26 11:06 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /AugmentText/augment_keyword/ccks_news_2020.json: -------------------------------------------------------------------------------- 1 | {"x": {"text": "人民网北京8月31日电(孙竞)日前,为进一步规范高等学校命名工作,依据相关法律法规规定,结合高校设置工作实际,教育部研究制定了《高等学校命名暂行办法》并正式印发。《暂行办法》适用于发布之后的全日制大学、独立设置的学院、高等职业学校(含本科层次职业学校和专科层次职业学校)以及高等专科学校的命名事项。  《暂行办法》强调,高等学校命名要坚持名实相符、准确规范,体现办学理念,突出内涵特色,避免贪大求全。根据人才培养目标、办学层次、规模、类型、学科类别、教学科研水平、隶属关系、所在地等确定名称,实行一校一名制。  《暂行办法》对高等学校名称中使用地域字段、学科或行业字段、英文译名等提出明确规范。不得冠以代表中国及世界的惯用字样和大区及大区变体字样;不得冠以学校所在城市以外的地域名;省级人民政府举办的学校可以使用省域命名,其他学校确需使用省域命名的,由省级人民政府统筹把关,但须在名称中明确学校所在地。未经授权,不得使用其他组织或个人拥有的商标、字号、名称等,不得使用国外高校的中文译名和简称。农林、师范院校在合并、调整时,原则上继续保留农林、师范名称;避免出现多个学科或行业类别并存的现象,原则上不超过2个;使用相同学科或行业字段时,在省域范围内应具有区分度。英文译名与中文名称保持一致,学校中文名称中含有特殊含义的字段,可以使用音译。  《暂行办法》强调,高等学校名称原则上不得以个人姓名命名。未经授权,不得使用其他高等学校曾使用过的名称。由独立学院转设的独立设置的学校,名称中不得包含原举办学校名称及简称。高等学校应保持名称稳定,原则上同层次更名间隔期至少10年。[责编:田媛]", "texts2": []}, "y": "教育"} 2 | {"x": {"text": "又是一年落叶黄,一阵秋雨一阵凉;整日奔波工作忙,出门别忘添衣裳。金秋时节,正值装修旺季,集美家居继续带消费者们“乘风破浪”。为满足消费者装修置家需求,帮助消费者选购到质优价美的建材家居用品,集美家居北苑商场将于9月10日-13日举办金秋爆破团购会活动。   活动期间,全年最低折扣、满减满赠、幸运抽奖、9元秒家具等实实在在的优惠福利让消费者拿到手软。据活动相关负责人介绍,本次团购会将是集美家居北苑商场年度内优惠力度最大的一次促销活动,可以说是一场不容错过的家居盛“惠”。  具体优惠福利如下:  (一)各大品牌推出全年最低折扣回馈消费者;  (二)集美家居北苑商场推出满1000元减100元优惠券;  (三)消费者可参与抢购15元升300元、50元升1000元升值券;  (四)此外,还有满赠家居大礼包,幸运大抽奖,9元秒家具等丰富多彩的活动等候消费者参与。  集美家居北苑商场坐落在北五环的朝阳区红军营南路19号,临近地铁5号线北苑路北站,附近有多条公交线路,交通便利;集美家居北苑商场内设有大型停车场,便于驱车前来购物的消费者享受停车服务;另有班车预约服务供消费者享受,随叫随到。  集美家居北苑商场定位于京北地区现代化、智能化、体验化、品牌化的一站式大家居商场。一直以来,集美家居北苑商场坚持以诚信赢得顾客,多年被北京市工商局评为“诚信经营示范市场”“消费者绿色通道”。  据了解,疫情期间集美家居北苑商场进行了全面升级改造,提供品类齐全的商品、购物无忧的售后服务,使购物环境更加舒适、健康、温馨,以便消费者逛得舒心、放心、省心。  集美家居北苑商场将带领全体员工真诚欢迎新老朋友的光临,并竭诚服务好每一位到店的消费者。  选择集美家居,就是选择美好生活!原文网址:神兽归笼日、装修正当时——集美家居北苑商场金秋爆破团购会即将启动http://www.jiaju82.com/news-view-id-720242.html", "texts2": []}, "y": "家居"} 3 | {"x": {"text": "作者:祝安顺(深圳大学饶宗颐文化研究院编审、副教授)  越来越多的名校毕业生入职教师行业,吸引他们的是什么?越来越多的人愿意当老师表象背后的逻辑是什么?为了解这一社会现象,我们设计了网络问卷调查,主要从最近5年入职新教师,选择从事教育行业的内外条件入手,对其影响力度进行区分:0分代表无影响,1分至10分代表影响的程度强弱,通过影响程度的平均得分以及无影响因素和最强影响因素的选择以及占比,以试图分析当下教师来源渠道拓宽背后的多方面因素。  本次调查采取网络问卷,一共收集到298位教师的问卷回复。省会以上城市176人,占比59.06%;省会以下城市90人,占比30.2%;乡村32人,占比10.74%。从教师毕业院校来源来说,师范院校毕业生198人,占比66.44%;从学历来说,看出98%以上的老师都具有本科以上学历,博士入职做教师的还是少数,主要集中在一线城市的知名学校。  最近5年入职新教师影响因素的外在条件,排在第一位的是寒暑假期影响因素,其次是收入影响因素,然后是工作条件影响因素、社会声誉影响因素、人文气息影响因素、子女教育影响因素等。  从每一因素内部分析来看,也有很多值得关注的点,如就业影响因素。随着高校扩招,毕业生人数不断增加,是一个不可忽视的因素;毕业生对新岗位敢于尝试,并无心理阻碍,影响力度也小。  收入影响因素。调查显示,教师的入职薪资和后期稳定的薪资收入是吸引毕业生选择教师职业的非常重要的因素,但教师的整体薪资水平却不是很高。  教育发展内在性需要影响因素。吸引越来越多的非师范院校毕业生尤其是知名高校毕业生,是中国当下教育发展的内在需求,排在第一位的是科技创新的基础教育变革需求,说明全社会对科技创新的高度认可。排在第二位的是当代中小学生成长的需求,优秀的人才加入到教师行业,无疑是人才培养和青少年成长的有力保障之一。  学校发展,无论从基层学校的普遍需求,还是优质学校的领先需求,也无论是个别地区的特殊做法,还是整个学校的教育创新需要,对各类型教师的需求,最有影响力比例均在20%以上,说明学校发展内在性需求的影响力是中等偏上。  个性发展和教师职业魅力,对于选择从事教师行业的影响力还是比较大的。说明选择教师时,自身的条件、兴趣及愿望是选择教师职业的重要考虑因素。  综合问卷调查结果及分析来看:  1.打破师范生作为教师引入的唯一或主要渠道,这是教育发展的内在需求,一部分知名高校毕业生加入中小学教师队伍,是理性选择结果。  2.从内在因素来说,选聘优秀毕业生和新的优秀毕业生选择加入教师行业,最大的影响因素是科技创新的需要。  3.选择从事教师行业,薪资收入、工作条件固然重要,但寒暑假的自由自主时间反而成为一个更重要的影响因素,说明当下毕业生在选择职业的考虑因素有一些变化。  4.新入职教师的家庭因素不可低估,父母长辈的认可影响力度很大。  
高学历,名校毕业生到中小学当教师的现象,是中国教育发展的内外需求,经济收入固然是重要原因,但科技创新需要大批优秀师资,教师的寒暑假时间、社会荣誉感以及毕业生的自身理想兴趣,如果这些内外因素都能有效结合,应该会为中国的基础教育师资来源拓宽渠道,获得稳定的优质师资,会为中国的下一轮教育发展提供人才保障。  《光明日报》(2020年09月08日 14版)[责编:曾震宇]", "texts2": []}, "y": "教育"} 4 | {"x": {"text": "虽然此前其有过与品牌如Madawell的合作系列,但这回她确认将推出自己名下的服装系列。在最近的采访中她说道,创造自己的品牌她已经是豁出去了。“在这个环境中没人会相信你一个名人真的能设计出东西,尽管每一个设计都是我亲自构思绘制的,”她解释道“所以假如会很糟糕我可以大声说这和我没有任何关系,但假如这非常棒我可以理直气壮的拿出草稿证实自己的作品。”\t目前Chung还没有透露更多的产品细节,但我们期望见到她自身标志性的风格出现如大片印花,复古风格以及多层次设计单品。", "texts2": []}, "y": "时尚"} 5 | {"x": {"text": "眼下,虽然疫情依然严峻,但我国大中小学都已陆续开学,疫情防控进入常态化,学生校园生活回归正常。  尽管国内疫情已不再肆虐,但我们高兴地看到人们仍保持着疫情期间养成的良好习惯:勤洗手、戴口罩、少聚集、多运动成为标配;清晨,家长陪伴儿女锻炼,夜晚,子女携手老人运动;孩子们出来玩耍的多了,亲子互动多了,邻里之间打招呼多了,共同关心的话题多了,一起要做的事儿多了……  前不久,教育部应对新冠肺炎疫情工作领导小组办公室委托中国教育科学研究院和有关专家提出了常态化防控新冠肺炎疫情前提下学校文明卫生、绿色健康新生活方式倡导,旨在引导广大师生在全面复学复课后,保持疫情防控期间形成的文明健康的生活方式,预防为主,关口前移,远离疾病,健康生活。  好习惯培养并坚持下去并不容易,良好的卫生习惯,保证室内空气的流通,保持个人卫生,尤其是勤洗手,出门戴口罩等,这些在做好了自身疫情期间防护的同时,也可为他人带来安全感。  这些都为校园常态化防控疫情提供了遵循,学校应在将疫情期间养成的良好习惯和生活方式进行到底的同时,做好以下几方面的工作:  一要坚持健康的生活方式。合理安排学习、生活、体育锻炼,劳逸结合,作息规律,保证充足睡眠,增强身体免疫力。积极参加体育运动,促进体质健康,锤炼意志品质,培养锻炼习惯。掌握爱眼护眼常识,学会识别不良用眼环境,主动选择有益眼健康的环境。  二要重视维护心理健康。遇到心理问题,学会自我调节,必要时能主动寻求专业心理帮助和支持。保持健康体重,避免超重、肥胖和消瘦。远离烟酒,杜绝网络成瘾,拒绝毒品诱惑。  三要养成良好卫生习惯。勤洗手,正确采用“七步洗手法”,使用肥皂、洗手液和流动的水洗手不少于20秒。经常洗澡,不共用毛巾和洗漱用具。科学刷牙,饭后漱口,定期口腔检查,预防龋齿,保持口腔健康。  四要均衡营养膳食。均衡营养,合理膳食。主动减少盐、糖、油摄入,增加优质蛋白质和新鲜蔬菜水果摄取。每天饮用足量的水,少喝或不喝含糖饮料。提倡分餐制,不能分餐时使用公勺公筷。在校用餐尽量自备餐具。  五要有健康文明的行为。热爱大自然,爱护动物,拒绝食用野生动物。保持适宜、安全距离,在交谈、候车、等电梯、排队时与他人保持一米以上的距离。自觉坚持安全文明出行,乘坐公共交通工具佩戴口罩,不在公共交通工具内进食。讲文明,讲卫生,不随地吐痰。打喷嚏、咳嗽时用纸巾捂住口鼻或用肘部遮挡。他人打喷嚏、咳嗽时主动避闪。  六要维护环境卫生。经常开窗通风,保持室内空气流通。自觉参与垃圾分类,改善人居环境。保护环境和资源,减少一次性餐具使用,减少污染和浪费。  七要科学就医用药。按规定接种疫苗,预防传染病。发现传染病,早报告,早隔离,早治疗。合理利用学校、家庭周边卫生服务资源。发热或罹患呼吸道疾病时佩戴口罩,及时就诊,配合医生治疗,用药遵从医嘱,不擅自使用处方药。  疫情期间的调研表明,孩子们长高了、长壮了,但同时也长胖了。  近期,教育部发布了中小学生视力情况抽样调查,结果显示,与2019年同期相比,9个省的中小学生近视率增加了11.7%,小学生的近视率增加最多,达到了15.2%。调查显示,学生的近视和上网课时间、平时玩电子游戏的时间、户外体育锻炼和照明环境、书桌高度等因素有关。  不断增长的肥胖率和居高不下的近视率是导致我国学生体质30余年下降的两大“顽疾”,伴随而来的是青少年心理问题频出、意志品质薄弱、抗挫折能力下降。  要想彻底改变这一状况,一是必须牢固树立健康第一的教育理念,真正把广大青少年学生身心健康作为一切工作的出发点和落脚点抓好抓实。  二是切实减轻广大青少年学生不必要的学业负担,使他们“轻装上阵”健康成长。  三是开齐开足体育课,增加学生户外课外活动时间,帮助他们在享受体育乐趣的同时,增强体质、健全人格、锤炼意志。  四是强化体育课和课外体育锻炼,深化学校体育课程改革,做好线上线下教学的融合,让广大学生在“学会、勤练、常赛”下掌握更多终身受益的体育技能。  无论处于何种生活状态下,我们都应该将疫情期间养成的良好习惯和生活方式进行下去,坚持到底,因为这些好习惯不仅承载着我们对疫情的回忆,更丰富和激励我们未来的人生。(李小伟)[责编:田媛]", "texts2": []}, "y": "教育"} 6 | {"x": {"text": "一年一度的春运即将拉开帷幕,在微博上一些网友呼吁无座火车票实行半价,得到众多网友的热烈响应,大家纷纷说“有道理”。人民日报官方微博等也对此予以关注,河南商报官方微博发起的投票显示,八成网友支持站票半价。(1月14日《河南商报》)  在微博上,北京大学法学院教授贺卫方通过自己以前“很痛苦”的乘火车经历,认为“站票应卖三分之一价”。我倒没这么贪心,或者说不敢有如此奢望,但是给无座火车票打个折,无论如何都是应该的。  春运人满为患,铁路运力不足,火车在一定限度内超员未尝不可,但这也意味着站票打折是需要铁道部单独面对并回应的民意,不应推诿和观望。  “坐火车”与“站火车”是不可同日而语的,尤其是路途遥远的乘客,“站火车”简直是活受罪。换言之,无座乘客虽与有座乘客同处于一列火车上,但他们没有享受到应有的服务,即服务大打折扣,所以无座车票的价格也应该相应打折。乘客与铁路运输单位是平等的民事主体,乘客买票相当于与铁路运输单位订立合同,按照《合同法》的规定,“当事人应当遵循公平原则确定各方的权利和义务”。没有享受到应有服务的无座乘客,却要与有座乘客掏一样的车票钱,这是不公平的。  以前,大多数列车的每个座位全程只卖给一名乘客,有些乘客中途下车后,运气好的无座乘客还能“捡”到座位;现在随着全国联网售票的推行,大多数列车实行全程对号入座,即沿途车站会将中途下车乘客留下的座位重新出售,这样,绝大多数无座乘客都要“一站到底”,在春运期间尤其是如此,他们的权益因此受到了损害。  哪些人最可能买不到座位票而只能“站火车”呢?其中之一是农民工。农民工收入很低,有的干了一年还没拿到工钱,回家时兜里没有几个钱,他们是最盼望无座票打折的,也是对无座票不打折最有意见的,只是他们没有话语权。无论是从公平的角度,还是从关爱弱势群体的角度,无座火车票都应该打折,哪怕一张票为农民工省下百八十元,他们也会觉得“站火车”的辛苦是值得的,并会在心里感谢铁道部。  微博上也有一些网友说,如果无座票实行半价,很多人岂不都要去买无座票?岂不造成列车更加拥挤?如此担忧纯属杞人忧天。一列火车当然先卖座位票,然后再卖无座票,而无座票的数量也是有限的。还有个别网友说,嫌无座票贵就别坐火车呀!这话说得毫无道理。我国铁路实行垄断经营,人们在票价上没有选择余地,只能被动接受。正因为实行垄断经营,人们无法用脚投票,垄断者更应该担负起责任,将票价制定得合理一些,而不能将垄断作为欺负消费者的工具。  无座火车票降价问题,人们已经呼吁了好多年,铁道部该有所动作了。(作者 晏扬)责任编辑:hdwmn_ctt", "texts2": []}, "y": "时政"} 7 | -------------------------------------------------------------------------------- /AugmentText/augment_keyword/keyword_sim.py: 
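The AugmentText README above suggests using gensim word vectors to take a word's nearest neighbours as synonyms (an alternative to the `synonyms` package used in enhance_eda_v2.py), and keyword_sim.py below applies the same `most_similar` call to per-label seed keywords. A minimal, self-contained sketch of that word2vec synonym replacement; the vector path and its gbk encoding are assumptions borrowed from how extract_webank.py loads the same file, and any word2vec-format file can be substituted:

import random

import gensim
import jieba

# assumption: reuse the small vector file listed under Data/chinese_vector in the tree;
# extract_webank.py loads it with encoding="gbk", so the same encoding is used here
W2V_PATH = "Data/chinese_vector/w2v_model_merge_short.vec"
w2v = gensim.models.KeyedVectors.load_word2vec_format(W2V_PATH, binary=False, encoding="gbk")


def w2v_synonym_replace(text, n=1, topn=5):
    """replace up to n in-vocabulary words with one of their topn nearest neighbours"""
    words = jieba.lcut(text)
    candidates = [w for w in set(words) if w in w2v]  # only words the model knows
    random.shuffle(candidates)
    for w in candidates[:n]:
        neighbours = [s for s, _ in w2v.most_similar(w, topn=topn)]
        words = [random.choice(neighbours) if x == w else x for x in words]
    return "".join(words)


# print(w2v_synonym_replace("苹果多少钱一斤"))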
-------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/10/25 19:49 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | # 适配linux 9 | import sys 10 | import os 11 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 12 | sys.path.append(path_root) 13 | print(path_root) 14 | from utils.text_tools import txtRead, txtWrite, load_json, save_json 15 | import gensim 16 | import json 17 | 18 | 19 | label_keywords0 = { "娱乐":["电影", "影视", "奥斯卡", "导演", "综艺", "动漫"], 20 | "科技":["数码", "手机", "相机", "像素", "区块链", "人工智能", "数字化"], 21 | "时尚":["时髦", "潮流", "穿搭", "性感", "奢侈品", "首饰"], 22 | "时政":["外交", "政治", "实事", "草案", "中国梦", "小康"], 23 | "家居":["家具", "建材", "厨卫", "涂料", "装修", "地砖", "炉壁"], 24 | "房产":["房价", "房贷", "物业", "楼市", "二手房", "二套房"], 25 | "游戏":["玩家", "网游", "手游", "技能", "王者荣耀", "出装"], 26 | "体育":["比赛", "NBA", "体育讯", "得分", "足球", "竞赛"], 27 | "财经":["基金", "投资", "股票", "分红", "理财", "保险"], 28 | "教育":["考试", "学生", "英语", "四六级", "早教", "试卷"], 29 | } 30 | 31 | label_keywords1 = { 32 | "教育":["教育", "语文", "体育教师","双创", "冰雪教育","老师","GIA","师范", "命题", "在线教育", "作文","早教", 33 | "中职","张老师","学生","汉语言","试卷","支教团","人大附中","研学游","教师资格"], 34 | "家居": ["欧派","科勒","樱雪","SUNIT世集","涂料","油烟机","电梯","灶具", "实地","板业", "风扇", "沃莱菲", 35 | "花岗岩","岩板","玻璃胶","消毒柜","席梦思","水磨石", "清除率","号线", "床垫", "地板", "乳胶", "洗衣机", "红木","甲醛"], 36 | "时尚": ["贝雷帽","麦肯齐", "连裤袜", "人台", "渔夫帽", "吊饰", "发饰", "白衬衫", "古驰", "派克", "切工"], 37 | "时政": ["经开区", "法", "科工", "抗旱", "作战", "立法", "战略", "用电量", "习仲勋", "党费", "巡视", "监审", "举报人", "行政"], 38 | "科技": ["区块链", "佳能EOS", "暗网", "折叠屏", "ZooKeeper", "TCL", "数据管理", "PoS", "波场", "频谱", "机房", "PoW", 39 | "一加", "公共电话", "互联网", "无人驾驶", "微信", "拼多多", "手机", "IaaS", "抖音", "HDMI", "可信", "人脸识别", 40 | "PIN", "中兴", "个人信息", "小米", "B2B", "CTR", "平板", "应用程序", "通信协议", "挖矿", 41 | "算力", "Wifi", "K8S", "分布式", "数据线"], 42 | "房产": ["甲方", "乙方", "窗洞", "惠而浦", "燕郊", "LPR", "LPS", "天恒乐墅", "开发商", "恒大", "招商会", "买受人", "创客", 43 | "住房", "购房者", "配租", "退房", "京兆", "公府", "城镇化"], 44 | "财经": ["酒", "中港", "Q3","pct", "市净率", "ROIC", "大豆", "保险机构", "债权人", "GNP", "国资", "龙头股", "PTA", "理财产品", "LPG", "转增", "缩股", 45 | "降息", "交割", "破发", "顺差", "研报", "停盘", "SPV", "央票", "生产总值", "操盘手", "瑞典克朗", "新加坡元", "SDR", "含税", "下调", "次级", "上涨", 46 | "增速", "概念股", "除息", "除权", "薪资", "贸易顺差", "指标股", "非流通股", "贸易逆差"], 47 | "游戏": ["王者", "首充", "小邪", "Altman", "XiXi", "3DO", "Ciwei", "Evc", "50pm", "德鲁依", "精魄", "晶灵", "COSer", 48 | "雷克萨", "GANK", "小汐", "血露", "龙腾组", "指族", "战训队", "同乐会", "千人国战", "千人战"], 49 | "体育": ["女排", "兵乓球", "跳水", "丁俊晖", "李元伟", "李彤", "萨沃", "张岐", "霍斯金", "奥多姆", "汪嵩", "广东队", 50 | "快船队", "马连保", "UTSA", "钟诚", "曾文鼎", "小斯", "孙明明", "山东队", "八一队", "辽足", "国奥队", 51 | "三连客","小牛队", "进球", "肘击", "沙帅", "赛风"], 52 | "娱乐": ["峨影厂", "地戏", "墨攻", "花絮", "DMAX", "选角", "杀青", "拍戏", "配音", "绯闻", "离婚", "表白", 53 | "蒋庆泉", "赵宁", "王世贞", "陈乾", "蔡荣名", "洪炉", "文玲姐", "温超", "白百何", "杨丽坤", 54 | "林权泽", "王天冉", "严孝国", "蒋利", "傅东", "尚玟", "李蜜", "王雅萱", "滕华涛", "狄娜", "微博选角", "墨攻", "王小贱", 55 | "唐一菲", "柳导", "隆裕太后"] 56 | } 57 | 58 | label_keywords = {"娱乐": ["电影", "影视", "奥斯卡", "导演"], 59 | "科技": ["数码", "手机", "相机", "像素"], 60 | "时尚": ["时髦", "潮流", "化妆", "性感"], 61 | "时政": ["外交", "政治", "人大", "草案", "致辞", "审查", "督察组", "贯彻", "纪委", "劳动局"], 62 | "家居": ["家具", "建材", "厨卫", "涂料"], 63 | "房产": ["新房", "房贷", "物业", "楼市"], 64 | "游戏": ["玩家", "网游", "手游", "页游"], 65 | "体育": ["比赛", "欧冠", "排球", "得分"], 66 | "财经": ["基金", "投资", "股票", "分红"], 67 | "教育": ["考试", "学生", "数学", "高考"], 68 | } 69 | 70 | # 穿搭,房价,体育讯 71 | 72 | path_w2v = "sgns.wiki.word" 73 | # path_w2v = "JDAI-Word-Embedding.txt" 
74 | # path_w2v = "Tencent_AILab_ChineseEmbedding.txt" 75 | w2v_model = gensim.models.KeyedVectors.load_word2vec_format(path_w2v, binary=False, # limit=100000 76 | ) # limit=100000) 77 | print("load ok!") 78 | topn = 320 79 | 80 | res = [] 81 | 82 | lkk = list(label_keywords.keys()) 83 | for label in lkk: 84 | key_words = label_keywords[label] 85 | key_words = [label] + key_words 86 | for word in key_words: 87 | sim_word = None 88 | try: 89 | sim_word = w2v_model.most_similar(word, topn=topn) 90 | except Exception as e: 91 | print(word) 92 | continue 93 | if sim_word: 94 | line_dict = {"type": label, "word": word, "topk": sim_word} 95 | line_str = json.dumps(line_dict, ensure_ascii=False) + "\n" 96 | res.append(line_str) 97 | txtWrite(res, "ccks_news_2020_keyword_sim_sgns.json") 98 | mam = 0 99 | 100 | # nohup python keyword_sim.py > sim.log 2>&1 & 101 | 102 | -------------------------------------------------------------------------------- /AugmentText/augment_keyword/statistics_keyword.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2020/10/25 11:07 4 | # @author : Mo 5 | # @function: rule-word-freq, 统计各类别独有词汇的词频等 6 | 7 | 8 | # 适配linux 9 | import sys 10 | import os 11 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 12 | sys.path.append(path_root) 13 | print(path_root) 14 | # macadam 15 | from utils.text_tools import jieba_cut, txtRead, txtWrite, load_json, save_json 16 | from conf.path_config import stop_words_path 17 | from collections import Counter, OrderedDict 18 | from tqdm import tqdm 19 | import jieba 20 | import json 21 | import copy 22 | 23 | 24 | # 停用词列表,默认使用hanlp停用词表 25 | f_stop = open(stop_words_path, "r", encoding="utf-8") 26 | stop_words = [] 27 | for stop_word in f_stop.readlines(): 28 | stop_words.append(stop_word.strip()) 29 | 30 | # stop_words = ["\t"] 31 | 32 | 33 | def is_total_number(text: str) -> bool: 34 | """ 35 | judge is total chinese or not, 判断是不是全是数字 36 | Args: 37 | text: str, eg. "macadam, 碎石路" 38 | Returns: 39 | bool, True or False 40 | """ 41 | for word in text: 42 | if word not in "0123456789.%": 43 | return False 44 | return True 45 | 46 | 47 | def statistics_keyword_by_label(path, rate=1): 48 | """ 49 | judge is total chinese or not, 判断是不是全是数字 50 | Args: 51 | path: str, eg. "train.json" 52 | rate: float, eg. 
0.75 53 | Returns: 54 | None 55 | """ 56 | datas = txtRead(path) 57 | 58 | lwd = {} 59 | for i in tqdm(range(len(datas)), desc="jieba cut and statistics: "): 60 | # 从标准文档里边获取文本, 切词处理 61 | d = datas[i] 62 | d_json = json.loads(d) 63 | text = d_json.get("x", {}).get("text") 64 | label = d_json.get("y") 65 | word_list = list(jieba.cut(text)) 66 | # 去除 停用词、全数字、1个字 67 | word_list = [wl for wl in word_list if wl not in stop_words and not is_total_number(wl) and len(wl) >= 2] 68 | # 词频统计(类别内) 69 | word_freq_dict = dict(Counter(word_list)) 70 | if label not in lwd: 71 | lwd[label] = word_freq_dict 72 | else: 73 | lwd[label].update(word_freq_dict) 74 | 75 | # 取范围, 排序 76 | lwd_keys = list(lwd.keys()) 77 | lwd_soft = [sorted(lwd[l].items(), key=lambda x: x[1], reverse=True) for l in lwd_keys] 78 | lwd_soft_rate = [s[:int(len(s) * rate)] for s in lwd_soft] 79 | label_word_dict = {lwd_keys[i]: OrderedDict(lwd_soft_rate[i]) for i in range(len(lwd_keys))} 80 | print("cut ok!") 81 | # 获取每个类独有的词汇 82 | label_keys = set(list(label_word_dict.keys())) 83 | label_words = {} 84 | for key in label_keys: 85 | key_dict = set(list(label_word_dict[key].keys())) 86 | keys_other = copy.deepcopy(label_keys) 87 | keys_other.discard(key) 88 | # 其他类别的所有词汇 89 | kos = set() 90 | for ko in keys_other: 91 | ko_dict = set(list(label_word_dict[ko].keys())) 92 | kos = kos | ko_dict 93 | 94 | # 获取独有的词汇 95 | key_public = kos & key_dict 96 | key_label = key_dict - key_public 97 | 98 | label_word_freq = {kl:label_word_dict[key][kl] for kl in key_label} 99 | label_words[key] = label_word_freq 100 | 101 | save_json(label_words, "label_keyword_unique.json") 102 | 103 | 104 | if __name__ == '__main__': 105 | path = "ccks_news_2020.json" 106 | statistics_keyword_by_label(path, rate=1) 107 | mm = 0 108 | 109 | -------------------------------------------------------------------------------- /AugmentText/augment_marko/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 21:14 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_nmt/README.md: -------------------------------------------------------------------------------- 1 | # Augment NMT 2 | 3 | ## 回译(开源模型效果不是很好) 4 | 中文转化成其他语言(如英语), 其他语言(如英语)转化成中文, Helsinki-NLP开源的预训练模型 5 | - opus-mt-en-zh: https://huggingface.co/Helsinki-NLP/opus-mt-en-zh 6 | - opus-mt-zh-en: https://huggingface.co/Helsinki-NLP/opus-mt-zh-en 7 | 8 | ## 备注 9 | 开源模型的效果不是那么理想, 只能少部分生成, 比如一条 10 | 11 | -------------------------------------------------------------------------------- /AugmentText/augment_nmt/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/9/22 21:04 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /AugmentText/augment_nmt/nmt_local.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/9/22 21:37 4 | # @author : Mo 5 | # @function: NMT of Helsinki-NLP 6 | # 下载地址: 7 | # opus-mt-en-zh: https://huggingface.co/Helsinki-NLP/opus-mt-en-zh 8 | # opus-mt-zh-en: https://huggingface.co/Helsinki-NLP/opus-mt-zh-en 9 | 10 | 11 | from transformers import (AutoModelForSeq2SeqLM, AutoTokenizer, pipeline) 12 | import time 13 | import os 14 | 15 | 16 | 
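# The class below wraps the two Helsinki-NLP MarianMT checkpoints named in the README above
# behind transformers pipelines: back_translate() first runs the Chinese input through
# opus-mt-zh-en, then feeds the English output to opus-mt-en-zh, so the returned text is a
# back-translated paraphrase of the original sentence. Both checkpoints have to be downloaded
# beforehand and placed under pretrained_dir (they are not bundled with this repository).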
class BackTranslate: 17 | def __init__(self, pretrained_dir): 18 | # zh-to-en 19 | tokenizer = AutoTokenizer.from_pretrained(os.path.join(pretrained_dir, "Helsinki-NLP/opus-mt-zh-en")) 20 | model = AutoModelForSeq2SeqLM.from_pretrained(os.path.join(pretrained_dir, "Helsinki-NLP/opus-mt-zh-en")) 21 | # en-to-zh 22 | tokenizer_back_translate = AutoTokenizer.from_pretrained(os.path.join(pretrained_dir, "Helsinki-NLP/opus-mt-en-zh")) 23 | model_back_translate = AutoModelForSeq2SeqLM.from_pretrained(os.path.join(pretrained_dir, "Helsinki-NLP/opus-mt-en-zh")) 24 | # pipeline 25 | self.zh2en = pipeline("translation_zh_to_en", model=model, tokenizer=tokenizer) 26 | self.en2zh = pipeline("translation_en_to_zh", model=model_back_translate, tokenizer=tokenizer_back_translate) 27 | 28 | def back_translate(self, text): 29 | """ 回译 """ 30 | text_en = self.zh2en(text, max_length=510)[0]["translation_text"] 31 | print("text_en:", text_en) 32 | text_back = self.en2zh(text_en, max_length=510)[0]["translation_text"] 33 | print("text_back:", text_back) 34 | return text_back 35 | 36 | 37 | if __name__ == '__main__': 38 | 39 | 40 | pretrained_dir = "D:/soft_install/dataset/bert-model/translate" 41 | bt = BackTranslate(pretrained_dir) 42 | datas = [{"text": "平乐县,古称昭州,隶属于广西壮族自治区桂林市,位于广西东北部,桂林市东南部,东临钟山县,南接昭平,西北毗邻阳朔,北连恭城,总面积1919.34平方公里。"}, 43 | {"text": "平乐县主要旅游景点有榕津千年古榕、冷水石景苑、仙家温泉、桂江风景区、漓江风景区等,平乐县为漓江分界点,平乐以北称漓江,以南称桂江,是著名的大桂林旅游区之一。"}, 44 | {"text": "印岭玲珑,昭水晶莹,环绕我平中。青年的乐园,多士受陶熔。生活自觉自治,学习自发自动。五育并重,手脑并用。迎接新潮流,建设新平中"}, 45 | {"text": "桂林山水甲天下, 阳朔山水甲桂林"}, 46 | {"text": "三国一统天下"}, 47 | {"text": "世间万物皆系于其上"}, 48 | {"text": "2020年真是一个糟糕的年代, 进入20年代,新冠爆发、经济下行,什么的都来了。"}, 49 | {"text": "仿佛一切都变得不那么重要了。"}, 50 | {"text": "苹果多少钱一斤"} 51 | ] 52 | time_start = time.time() 53 | for da in datas: 54 | text = da.get("text", "") 55 | bt.back_translate(text) 56 | time_total = time.time() - time_start 57 | print("time_total:{}".format(time_total)) 58 | print("time_per:{}".format(time_total / len(datas))) 59 | 60 | while True: 61 | print("请输入:") 62 | ques = input() 63 | res = bt.back_translate(ques) 64 | print("####################################################") 65 | 66 | 67 | # 下载地址: 68 | # opus-mt-en-zh: https://huggingface.co/Helsinki-NLP/opus-mt-en-zh 69 | # opus-mt-zh-en: https://huggingface.co/Helsinki-NLP/opus-mt-zh-en 70 | 71 | 72 | # 备注: 翻译效果不大好 73 | 74 | 75 | 76 | """ 77 | text_en: Ping Lei County, anciently known as Zhao County, belongs to the city of Gui Lin, Guangxi Liang Autonomous Region, and is located in the north-east of Guangxi, south-east of the city of Gui Lin, eastern Pingshan County, south-west Su Ping, north-west of Yangyon and north-west of the city of Lilongqi, with a total area of 1919.34 square kilometres. 78 | text_back: 平莱县,古代称为赵县,属于广西梁自治区Gui Lin市,位于广西东北、Gui Lin市东南、Pingshan县东南、Su Ping西南、Yangyon西北和Lilongqi市西北,总面积1919.34平方公里。 79 | text_en: The main tourist attractions in the district of Ping Lei are Xin Xianjin Quan, Cold Water Qing Qing, Qingjiang, Qingjiang, Qingjiang, etc. The district of Ping Le is one of the well-known Grand Gui Lin tourist areas, which is known as Jingjiang, north of Ping Lei and south of Ping Lei. 80 | text_back: 平莱区的主要旅游景点为新贤进泉、冷水清清、青江、青江、青江、青江等。 平来区是著名的大桂林旅游区之一,称为青江,位于平莱以北和平莱以南。 81 | text_en: The young man's garden, the Doss, is molten with pottery. Life is self-governing, learning self-involvement. It's full and heavy, and the hands and brains work together. It takes a new tide and builds a new flat. 
82 | text_back: 年轻人的花园,多斯人,被陶器熔化了。生活是自治的,学习自我参与。生活是满的和沉重的,手和大脑一起工作。它需要新的潮水,建造新的公寓。 83 | text_en: Guilin Mountain Watermarin, Sunshaw Hill Watermarin 84 | text_back: 古林山水马林、桑肖山水马林 85 | text_en: All three of us. 86 | text_back: 我们三个人 87 | text_en: Everything in the world is in it. 88 | text_back: 世界上所有的东西都在里面 89 | text_en: The year 2020 was a really bad time, and in the 20s, the crown broke out, the economy went down, everything came up. 90 | text_back: 2020年是一个非常糟糕的时期, 在20年代,王冠崩盘, 经济下滑,一切都出现了。 91 | text_en: As if everything had become less important. 92 | text_back: 仿佛一切都变得不重要了 93 | text_en: How much is an apple? 94 | text_back: 苹果多少钱? 95 | """ 96 | 97 | -------------------------------------------------------------------------------- /AugmentText/augment_nmt/requestments.txt: -------------------------------------------------------------------------------- 1 | tensorflow-gpu==1.15.2 2 | transformers==4.8.2 3 | 4 | -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 10:17 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/code_seq2seq_char/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 10:50 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/code_seq2seq_char/extract_char_webank.py: -------------------------------------------------------------------------------- 1 | """ 2 | 把文件格式转换为可训练格式 3 | Code from: QHDuan(2018-02-05) url: https://github.com/qhduan/just_another_seq2seq 4 | """ 5 | from conf.path_config import train_data_web_ws_anti 6 | from conf.path_config import train_data_web_xy_anti 7 | from conf.path_config import model_ckpt_web_anti 8 | from conf.path_config import path_webank_sim 9 | 10 | from utils.mode_util.seq2seq.word_sequence import WordSequence 11 | from utils.text_tools import txtRead 12 | from tqdm import tqdm 13 | import pickle 14 | import sys 15 | import re 16 | 17 | sys.path.append('..') 18 | 19 | 20 | def make_split(line): 21 | """构造合并两个句子之间的符号 22 | """ 23 | if re.match(r'.*([,。…?!~\.,!?])$', ''.join(line)): 24 | return [] 25 | return [','] 26 | 27 | 28 | def good_line(line): 29 | if len(re.findall(r'[a-zA-Z0-9]', ''.join(line))) > 2: 30 | return False 31 | return True 32 | 33 | 34 | def regular(sen, limit=50): 35 | sen = re.sub(r'\.{3,100}', '…', sen) 36 | sen = re.sub(r'…{2,100}', '…', sen) 37 | sen = re.sub(r'[,]{1,100}', ',', sen) 38 | sen = re.sub(r'[\.]{1,100}', '。', sen) 39 | sen = re.sub(r'[\?]{1,100}', '?', sen) 40 | sen = re.sub(r'[!]{1,100}', '!', sen) 41 | if len(sen) > limit: 42 | sen = sen[0:limit] 43 | return sen 44 | 45 | 46 | def creat_train_data_of_sim_corpus(limit=50, x_limit=2, y_limit=2): 47 | x_datas = [] 48 | y_datas = [] 49 | max_len = 0 50 | sim_ali_web_gov_dli_datas = txtRead(path_webank_sim, encodeType="gbk") 51 | for sim_ali_web_gov_dli_datas_one in sim_ali_web_gov_dli_datas[1:]: 52 | sim_ali_web_gov_dli_datas_one_split = sim_ali_web_gov_dli_datas_one.strip().split(",") 53 | if sim_ali_web_gov_dli_datas_one_split[2]=="1": 54 | len_x1 = len(sim_ali_web_gov_dli_datas_one_split[0]) 55 | len_x2 = 
len(sim_ali_web_gov_dli_datas_one_split[1]) 56 | # if max_len < len_x1 or max_len < len_x2: 57 | max_len = max(len_x1, len_x2, max_len) 58 | 59 | sentence_org = regular(sim_ali_web_gov_dli_datas_one_split[0], limit=limit) 60 | sentence_sim = regular(sim_ali_web_gov_dli_datas_one_split[1], limit=limit) 61 | x_datas.append([sen for sen in sentence_org]) 62 | y_datas.append([sen for sen in sentence_sim]) 63 | x_datas.append([sen for sen in sentence_sim]) 64 | y_datas.append([sen for sen in sentence_org]) 65 | 66 | datas = list(zip(x_datas, y_datas)) 67 | datas = [ 68 | (x, y) 69 | for x, y in datas 70 | if len(x) < limit and len(y) < limit and len(y) >= y_limit and len(x) >= x_limit 71 | ] 72 | x_datas, y_datas = zip(*datas) 73 | 74 | print('fit word_sequence') 75 | 76 | ws_input = WordSequence() 77 | ws_input.fit(x_datas + y_datas) 78 | 79 | print('dump') 80 | 81 | pickle.dump((x_datas, y_datas), 82 | open(train_data_web_xy_anti, 'wb') 83 | ) 84 | pickle.dump(ws_input, open(train_data_web_ws_anti, 'wb')) 85 | 86 | print('done') 87 | print(max_len) 88 | 89 | 90 | if __name__ == '__main__': 91 | creat_train_data_of_sim_corpus() 92 | -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/code_seq2seq_char/predict_char_anti.py: -------------------------------------------------------------------------------- 1 | """ 2 | 对SequenceToSequence模型进行基本的参数组合测试 3 | Code from: QHDuan(2018-02-05) url: https://github.com/qhduan/just_another_seq2seq 4 | 5 | """ 6 | 7 | from utils.mode_util.seq2seq.data_utils import batch_flow_bucket as batch_flow 8 | from utils.mode_util.seq2seq.thread_generator import ThreadedGenerator 9 | from utils.mode_util.seq2seq.model_seq2seq import SequenceToSequence 10 | from utils.mode_util.seq2seq.word_sequence import WordSequence 11 | 12 | from conf.path_config import train_data_web_ws_anti 13 | from conf.path_config import train_data_web_xy_anti 14 | from conf.path_config import model_ckpt_web_anti 15 | from conf.path_config import path_params 16 | 17 | import tensorflow as tf 18 | import numpy as np 19 | import pickle 20 | import json 21 | import sys 22 | 23 | sys.path.append('..') 24 | 25 | 26 | def predict_anti(params): 27 | """测试不同参数在生成的假数据上的运行结果""" 28 | 29 | x_data, _ = pickle.load(open(train_data_web_xy_anti, 'rb')) 30 | ws = pickle.load(open(train_data_web_ws_anti, 'rb')) 31 | 32 | for x in x_data[:5]: 33 | print(' '.join(x)) 34 | 35 | config = tf.ConfigProto( 36 | # device_count={'CPU': 1, 'GPU': 0}, 37 | allow_soft_placement=True, 38 | log_device_placement=False 39 | ) 40 | 41 | save_path = model_ckpt_web_anti 42 | 43 | # 测试部分 44 | tf.reset_default_graph() 45 | model_pred = SequenceToSequence( 46 | input_vocab_size=len(ws), 47 | target_vocab_size=len(ws), 48 | batch_size=1, 49 | mode='decode', 50 | beam_width=0, 51 | **params 52 | ) 53 | init = tf.global_variables_initializer() 54 | 55 | with tf.Session(config=config) as sess: 56 | sess.run(init) 57 | model_pred.load(sess, save_path) 58 | 59 | while True: 60 | user_text = input('Input Chat Sentence:') 61 | if user_text in ('exit', 'quit'): 62 | exit(0) 63 | x_test = [list(user_text.lower())] 64 | # x_test = [word_tokenize(user_text)] 65 | bar = batch_flow([x_test], ws, 1) 66 | x, xl = next(bar) 67 | x = np.flip(x, axis=1) 68 | # x = np.array([ 69 | # list(reversed(xx)) 70 | # for xx in x 71 | # ]) 72 | print(x, xl) 73 | pred = model_pred.predict( 74 | sess, 75 | np.array(x), 76 | np.array(xl) 77 | ) 78 | print(pred) 79 | # prob = np.exp(prob.transpose()) 80 | 
print(ws.inverse_transform(x[0])) 81 | # print(ws.inverse_transform(pred[0])) 82 | # print(pred.shape, prob.shape) 83 | for p in pred: 84 | ans = ws.inverse_transform(p) 85 | print(ans) 86 | 87 | 88 | def main(): 89 | """入口程序""" 90 | import json 91 | predict_anti(json.load(open(path_params))) 92 | 93 | 94 | if __name__ == '__main__': 95 | main() -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/code_seq2seq_word/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 10:52 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/code_seq2seq_word/extract_webank.py: -------------------------------------------------------------------------------- 1 | """ 2 | 把文件格式转换为可训练格式 3 | Code from: QHDuan(2018-02-05) url: https://github.com/qhduan/just_another_seq2seq 4 | """ 5 | 6 | import re 7 | import sys 8 | import pickle 9 | import jieba 10 | import gensim 11 | import numpy as np 12 | from tqdm import tqdm 13 | from conf.path_config import projectdir 14 | from conf.path_config import w2v_model_merge_short_path 15 | from utils.mode_util.seq2seq.word_sequence import WordSequence 16 | 17 | from conf.path_config import model_ckpt_web_anti_word 18 | from conf.path_config import train_data_web_xyw_anti 19 | from conf.path_config import train_data_web_emb_anti 20 | from conf.path_config import path_webank_sim 21 | 22 | sys.path.append('..') 23 | 24 | 25 | def make_split(line): 26 | """构造合并两个句子之间的符号 27 | """ 28 | if re.match(r'.*([,。…?!~\.,!?])$', ''.join(line)): 29 | return [] 30 | return [','] 31 | 32 | 33 | def good_line(line): 34 | """判断一个句子是否好""" 35 | if len(re.findall(r'[a-zA-Z0-9]', ''.join(line))) > 2: 36 | return False 37 | return True 38 | 39 | 40 | def regular(sen, limit=50): 41 | sen = re.sub(r'\.{3,100}', '…', sen) 42 | sen = re.sub(r'…{2,100}', '…', sen) 43 | sen = re.sub(r'[,]{1,100}', ',', sen) 44 | sen = re.sub(r'[\.]{1,100}', '。', sen) 45 | sen = re.sub(r'[\?]{1,100}', '?', sen) 46 | sen = re.sub(r'[!]{1,100}', '!', sen) 47 | if len(sen) > limit: 48 | sen = sen[0:limit] 49 | return sen 50 | 51 | 52 | def creat_train_data_of_bank_corpus(limit=50, x_limit=3, y_limit=3): 53 | """执行程序 54 | Args: 55 | limit: 只输出句子长度小于limit的句子 56 | """ 57 | 58 | print('load word2vec start!') 59 | word_vec = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_merge_short_path, encoding='gbk', binary=False, limit=None) 60 | print('load word2vec end!') 61 | fp = open(path_webank_sim, 'r', encoding='gbk', errors='ignore') 62 | 63 | x_datas = [] 64 | y_datas = [] 65 | max_len = 0 66 | count_fp = 0 67 | for line in tqdm(fp): 68 | count_fp += 1 69 | if count_fp == 1: 70 | continue 71 | sim_bank_datas_one_split = line.strip().split(",") 72 | len_x1 = len(sim_bank_datas_one_split[0]) 73 | len_x2 = len(sim_bank_datas_one_split[1]) 74 | # if max_len < len_x1 or max_len < len_x2: 75 | max_len = max(len_x1, len_x2, max_len) 76 | 77 | sentence_org = regular(sim_bank_datas_one_split[0], limit=limit) 78 | sentence_sim = regular(sim_bank_datas_one_split[1], limit=limit) 79 | org_cut = jieba._lcut(sentence_org) 80 | sen_cut = jieba._lcut(sentence_sim) 81 | 82 | x_datas.append(org_cut) 83 | y_datas.append(sen_cut) 84 | x_datas.append(sen_cut) 85 | y_datas.append(org_cut) 86 | 87 | print(len(x_datas), len(y_datas)) 88 | for ask, answer in zip(x_datas[:50], 
y_datas[:50]): 89 | print(''.join(ask)) 90 | print(''.join(answer)) 91 | print('-' * 50) 92 | 93 | data = list(zip(x_datas, y_datas)) 94 | data = [ 95 | (x, y) 96 | for x, y in data 97 | if len(x) < limit \ 98 | and len(y) < limit \ 99 | and len(y) >= y_limit \ 100 | and len(x) >= x_limit 101 | ] 102 | x_data, y_data = zip(*data) 103 | 104 | print('refine train data') 105 | 106 | train_data = x_data + y_data 107 | 108 | print('fit word_sequence') 109 | 110 | ws_input = WordSequence() 111 | 112 | ws_input.fit(train_data, max_features=100000) 113 | 114 | print('dump word_sequence') 115 | 116 | pickle.dump((x_data, y_data, ws_input), 117 | open(train_data_web_xyw_anti, 'wb') 118 | ) 119 | 120 | print('make embedding vecs') 121 | 122 | emb = np.zeros((len(ws_input), len(word_vec['']))) 123 | 124 | np.random.seed(1) 125 | for word, ind in ws_input.dict.items(): 126 | if word in word_vec: 127 | emb[ind] = word_vec[word] 128 | else: 129 | emb[ind] = np.random.random(size=(300,)) - 0.5 130 | 131 | print('dump emb') 132 | 133 | pickle.dump( 134 | emb, 135 | open(train_data_web_emb_anti, 'wb') 136 | ) 137 | 138 | print('done') 139 | 140 | 141 | if __name__ == '__main__': 142 | creat_train_data_of_bank_corpus() -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/code_seq2seq_word/predict_word_anti.py: -------------------------------------------------------------------------------- 1 | """ 2 | 对SequenceToSequence模型进行基本的参数组合测试 3 | """ 4 | 5 | from utils.mode_util.seq2seq.thread_generator import ThreadedGenerator 6 | from utils.mode_util.seq2seq.model_seq2seq import SequenceToSequence 7 | from utils.mode_util.seq2seq.word_sequence import WordSequence 8 | from utils.mode_util.seq2seq.data_utils import batch_flow 9 | 10 | from conf.path_config import model_ckpt_web_anti_word 11 | from conf.path_config import train_data_web_xyw_anti 12 | from conf.path_config import train_data_web_emb_anti 13 | from conf.path_config import path_webank_sim 14 | from conf.path_config import path_params 15 | 16 | import tensorflow as tf 17 | import numpy as np 18 | import random 19 | import pickle 20 | import jieba 21 | import sys 22 | 23 | 24 | sys.path.append('..') 25 | 26 | 27 | def pred_word_anti(bidirectional, cell_type, depth, 28 | attention_type, use_residual, use_dropout, time_major, hidden_units): 29 | """测试不同参数在生成的假数据上的运行结果""" 30 | 31 | x_data, _, ws = pickle.load(open(train_data_web_xyw_anti, 'rb')) 32 | 33 | for x in x_data[:5]: 34 | print(' '.join(x)) 35 | 36 | config = tf.ConfigProto( 37 | device_count={'CPU': 1, 'GPU': 0}, 38 | allow_soft_placement=True, 39 | log_device_placement=False 40 | ) 41 | 42 | save_path = model_ckpt_web_anti_word 43 | 44 | # 测试部分 45 | tf.reset_default_graph() 46 | model_pred = SequenceToSequence( 47 | input_vocab_size=len(ws), 48 | target_vocab_size=len(ws), 49 | batch_size=1, 50 | mode='decode', 51 | beam_width=1, 52 | bidirectional=bidirectional, 53 | cell_type=cell_type, 54 | depth=depth, 55 | attention_type=attention_type, 56 | use_residual=use_residual, 57 | use_dropout=use_dropout, 58 | parallel_iterations=1, 59 | time_major=time_major, 60 | hidden_units=hidden_units, 61 | share_embedding=True, 62 | pretrained_embedding=True 63 | ) 64 | init = tf.global_variables_initializer() 65 | 66 | with tf.Session(config=config) as sess: 67 | sess.run(init) 68 | model_pred.load(sess, save_path) 69 | 70 | while True: 71 | user_text = input('Input Chat Sentence:') 72 | if user_text in ('exit', 'quit'): 73 | exit(0) 74 | x_test = 
[jieba.lcut(user_text.lower())] 75 | # x_test = [word_tokenize(user_text)] 76 | bar = batch_flow([x_test], ws, 1) 77 | x, xl = next(bar) 78 | x = np.flip(x, axis=1) 79 | # x = np.array([ 80 | # list(reversed(xx)) 81 | # for xx in x 82 | # ]) 83 | print(x, xl) 84 | pred = model_pred.predict( 85 | sess, 86 | np.array(x), 87 | np.array(xl) 88 | ) 89 | print(pred) 90 | # prob = np.exp(prob.transpose()) 91 | print(ws.inverse_transform(x[0])) 92 | # print(ws.inverse_transform(pred[0])) 93 | # print(pred.shape, prob.shape) 94 | for p in pred: 95 | ans = ws.inverse_transform(p) 96 | print(ans) 97 | 98 | 99 | def main(): 100 | """入口程序,开始测试不同参数组合""" 101 | random.seed(0) 102 | np.random.seed(0) 103 | tf.set_random_seed(0) 104 | pred_word_anti( 105 | bidirectional=True, 106 | cell_type='lstm', 107 | depth=2, 108 | attention_type='Bahdanau', 109 | use_residual=False, 110 | use_dropout=False, 111 | time_major=False, 112 | hidden_units=512 113 | ) 114 | 115 | 116 | if __name__ == '__main__': 117 | main() 118 | -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/data_mid/char/useless.txt: -------------------------------------------------------------------------------- 1 | useless 2 | -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/data_mid/word/useless.txt: -------------------------------------------------------------------------------- 1 | useless 2 | -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/model_seq2seq_tp/seq2seq_char_webank/useless.txt: -------------------------------------------------------------------------------- 1 | useless 2 | -------------------------------------------------------------------------------- /AugmentText/augment_seq2seq/model_seq2seq_tp/seq2seq_word_webank/useless.txt: -------------------------------------------------------------------------------- 1 | useless 2 | -------------------------------------------------------------------------------- /AugmentText/augment_simbert/README.md: -------------------------------------------------------------------------------- 1 | # Augment Simbert 2 | 3 | ## 预训练模型-UMILM 4 | 使用BERT(UNILM)的生成能力, 即BERT的NSP句对任务 5 | - simbert(bert + unilm + adv): [https://github.com/ZhuiyiTechnology/simbert](https://github.com/ZhuiyiTechnology/simbert) 6 | - simbert: [鱼与熊掌兼得:融合检索和生成的SimBERT模型](https://spaces.ac.cn/archives/7427) 7 | - roformer-sim: [https://github.com/ZhuiyiTechnology/roformer-sim](https://github.com/ZhuiyiTechnology/roformer-sim) 8 | - simbert-v2(roformer + unilm + adv + bart + distill): [SimBERTv2来了!融合检索和生成的RoFormer-Sim模型](https://spaces.ac.cn/archives/8454) 9 | 10 | ## 备注 11 | 效果还是比较好的, 可以生成多个相似句子, 但是生成式的模型一般都比较慢。 12 | - 相比simbert, roformer-sim效果要好些 13 | 14 | -------------------------------------------------------------------------------- /AugmentText/augment_simbert/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/9/18 21:12 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /AugmentText/augment_simbert/requestments.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.15.2 2 | bert4keras==0.10.7 3 | 4 | -------------------------------------------------------------------------------- /AugmentText/augment_simbert/tet_keras.py: 
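The SimBERT / RoFormer-Sim README above relies on UNILM-style generation; the repository's own implementations live in enhance_simbert.py and enhance_roformer.py. A condensed sketch of that generation loop, following the public bert4keras SimBERT example (bert4keras==0.10.7 as pinned in requestments.txt); the checkpoint directory chinese_simbert_L-12_H-768_A-12 is an assumption and must be downloaded separately:

import numpy as np
from bert4keras.backend import keras
from bert4keras.models import build_transformer_model
from bert4keras.snippets import AutoRegressiveDecoder
from bert4keras.tokenizers import Tokenizer

maxlen = 64
config_path = "chinese_simbert_L-12_H-768_A-12/bert_config.json"
checkpoint_path = "chinese_simbert_L-12_H-768_A-12/bert_model.ckpt"
dict_path = "chinese_simbert_L-12_H-768_A-12/vocab.txt"

tokenizer = Tokenizer(dict_path, do_lower_case=True)
# application="unilm" adds the seq2seq attention mask, with_pool="linear" keeps the CLS vector
bert = build_transformer_model(config_path, checkpoint_path,
                               with_pool="linear", application="unilm",
                               return_keras_model=False)
seq2seq = keras.models.Model(bert.model.inputs, bert.model.outputs[1])


class SynonymsGenerator(AutoRegressiveDecoder):
    """feed "[CLS] text [SEP]" as segment 0 and sample segment 1 as the similar sentence"""

    @AutoRegressiveDecoder.wraps(default_rtype="probas")
    def predict(self, inputs, output_ids, states):
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate([segment_ids, np.ones_like(output_ids)], 1)
        return self.last_token(seq2seq).predict([token_ids, segment_ids])

    def generate(self, text, n=5, topp=0.95):
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        output_ids = self.random_sample([token_ids, segment_ids], n, topp=topp)
        return [tokenizer.decode(ids) for ids in output_ids]


synonyms_generator = SynonymsGenerator(start_id=None, end_id=tokenizer._token_end_id, maxlen=maxlen)
# print(synonyms_generator.generate("苹果多少钱一斤"))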
-------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/9/22 21:29 4 | # @author : Mo 5 | # @function: 获取下三角矩阵 6 | 7 | 8 | import keras.backend as K 9 | import numpy as np 10 | 11 | # np.random.rand(2,3) 12 | 13 | input_x = np.array([[1,2,3], [4,5,6], [7,8,9]]) 14 | s = K.cast(input_x, dtype="float32") 15 | idxs = K.cumsum(s, axis=1) # 一行一行累加, 可用于构建上三角矩阵、下三角矩阵 16 | print(K.eval(idxs)) 17 | mask = idxs[:, None, :] <= idxs[:, :, None] 18 | print(K.eval(mask)) 19 | mask = K.cast(mask, K.floatx()) 20 | print(K.eval(mask)) 21 | ee = 0 22 | 23 | print(75.27 / 20) 24 | 25 | -------------------------------------------------------------------------------- /AugmentText/augment_syntax/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 21:16 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_syntax/augment_mainpart.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 22:49 4 | # @author :Mo 5 | # @function :get main_part by stanfordcorenlp 6 | 7 | 8 | from conf.path_config import stanford_corenlp_full_path 9 | from stanfordcorenlp import StanfordCoreNLP 10 | 11 | 12 | # stanford-corenlp-full-2018-10-05需要预先下载,启动较慢 13 | nlp = StanfordCoreNLP(stanford_corenlp_full_path, lang='zh') 14 | 15 | 16 | def stanford_parse(sentence): 17 | tokenize = nlp.word_tokenize(sentence) 18 | pos_tag = nlp.pos_tag(sentence) 19 | name_entity = nlp.ner(sentence) 20 | syntax_tree = nlp.parse(sentence) 21 | dependence = nlp.dependency_parse(sentence) 22 | 23 | result_dict = {} 24 | result_dict['tokenize'] = tokenize 25 | result_dict['dependence'] = dependence 26 | result_dict['parse'] = syntax_tree 27 | return result_dict 28 | 29 | 30 | def combine_nn(tokenize, dependence, target): 31 | """ 32 | 合并名词短语等 33 | :param dependence: dict, enhancedPlusPlusDependencies 34 | :param target: str, subject or object 35 | :return: str, nn 36 | """ 37 | if not target: 38 | return target 39 | else: 40 | for dependence_one in dependence: 41 | if target == tokenize[dependence_one[1]-1] if dependence_one[1]!=0 else "root" and dependence_one[0] == "nn": 42 | target = tokenize[dependence_one[2]-1] + target 43 | return target 44 | return target 45 | 46 | 47 | def get_main_part_by_stanfordcorenlp(text): 48 | """ 49 | 根据依存句法生成句子 50 | :param text: str, 输入 51 | :return: str, result of syn sentence 52 | """ 53 | # standcoreNLP 分词 54 | result_dict = stanford_parse(text) 55 | tokenize = result_dict['tokenize'] 56 | dependence = result_dict['dependence'] 57 | syntax_tree = result_dict['parse'] 58 | # 提取主谓宾 59 | part_main = {"主": "", "谓": "", "宾": ""} 60 | if len(syntax_tree) >= 2: 61 | if "NP" in syntax_tree[1] or "ROOT" not in str(dependence): # 名词短语 或者是没有谓语 62 | count = 0 63 | for syntax_tree_single in syntax_tree: 64 | if "NP" in syntax_tree_single and "(" in syntax_tree_single and ")" in syntax_tree_single: 65 | token_np = syntax_tree_single.split(" ")[-1] 66 | token_np = token_np.replace("'", "").replace(")", "").strip() 67 | part_main["主"] = token_np if count == 0 else part_main["主"] + token_np 68 | count += 1 69 | return part_main["主"] + part_main["谓"] + part_main["宾"] 70 | else: 71 | for dependence_one in dependence: 72 | dep = dependence_one[0] 73 | 
dep_dep_gloss = tokenize[dependence_one[2]-1] 74 | if dep == "ROOT": # ROOT作谓语 75 | part_main["谓"] = dep_dep_gloss 76 | elif dep == "cop": # 主系结构 77 | part_main["谓"] = dep_dep_gloss + part_main["谓"] 78 | else: # 主语和宾语 79 | if dep == "nsubjpass" or dep == "dobj" or dep == "attr": 80 | part_main["宾"] = dep_dep_gloss 81 | elif dep == "nsubj" or dep == "top": 82 | part_main["主"] = dep_dep_gloss 83 | 84 | part_main["主"] = combine_nn(tokenize, dependence, part_main["主"]) 85 | part_main["宾"] = combine_nn(tokenize, dependence, part_main["宾"]) 86 | return part_main["主"] + part_main["谓"] + part_main["宾"] 87 | 88 | 89 | if __name__ == "__main__": 90 | sentence_list = [ 91 | "大漠帝国确实很喜欢JY", 92 | "JY也喜欢大漠帝国哦!", 93 | "这个工程的作者是momo", 94 | "momo是一个无门无派的浪人", 95 | "只有自信的程序员才能把握未来", 96 | "主干识别可以提高检索系统的智能", 97 | "打更的住在这里", 98 | "人民的名义", 99 | "名词短语", 100 | "我一直很喜欢你", 101 | "你被我喜欢", 102 | "美丽又善良的你被卑微的我深深的喜欢着……", 103 | 104 | "搜索momo可以找到我的博客", 105 | "静安区体育局2013年部门决算情况说明", 106 | "红旗飘", 107 | "柳丝长", 108 | "乐队奏国歌", 109 | "红扑扑的朝霞露出了笑脸", 110 | "初升的太阳照耀着峻峭的群山", 111 | "一个农人在路上看见一条冻僵了的蛇", 112 | "我打量了他一眼", ] 113 | sentence_type = ["陈述句与否定句", 114 | "秦耕真是一个聪明的孩子", 115 | "衣服洗得不干净", 116 | "他没有做完作业", 117 | "他不敢不来", 118 | "没有一个人不怕他", 119 | "我非把这本书读完不可", 120 | "同学们无不欢欣鼓舞", 121 | "他妈妈不让他去,无非是怕他吃亏", 122 | "想起一个人的旅途,不无寂寥之感", 123 | "你未必不知道", 124 | "各种问句", 125 | "你可以那到100分, 是吗?", 126 | "刚才接你的人是谁?", 127 | "什么叫函数?", 128 | "你爸爸怎么样了?", 129 | "你每天几点休息?", 130 | "你爸爸在哪儿?", 131 | "我们是从广州走, 还是从成都走?", 132 | "他是不是又迟到了?", 133 | "难道他已经跑了?", 134 | "我怎么能负这个责任呢?", 135 | "你是来帮助我们的, 还是来拆我们的台的?", 136 | "这些人甘愿当走狗, 你说可恨不可恨?", 137 | "祈使句", 138 | "快去捞饭!米烂了!", 139 | "给我喝水, 我渴!", 140 | "走哇, 妈妈!", 141 | "不许动!", 142 | "太好啦", 143 | ] 144 | for sen_one in sentence_list: 145 | subject_object = get_main_part_by_stanfordcorenlp(sen_one) 146 | print(sen_one + " " + subject_object) 147 | 148 | while True: 149 | print("请输入sentence: ") 150 | sen_test = input() 151 | # syn_sentence_test = syn_by_syntactic_analys==(test_test) 152 | syn_sentence_test = get_main_part_by_stanfordcorenlp(sen_test) 153 | print(syn_sentence_test) 154 | 155 | # Do not forget to close! 
The backend server will consume a lot memery 156 | nlp.close() 157 | -------------------------------------------------------------------------------- /AugmentText/augment_translate/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 21:15 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_translate/translate_account/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 22:58 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_translate/translate_account/translate_tencent_secret.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 23:05 4 | # @author :Mo 5 | # @function :使用腾讯账户(翻译君),回译 6 | 7 | 8 | from conf.augment_constant import language_short_tencent 9 | from conf.augment_constant import app_secret_tentcnet 10 | from conf.augment_constant import app_key_tencent 11 | from urllib.parse import quote 12 | import logging as logger 13 | import requests 14 | import hashlib 15 | import random 16 | import string 17 | import time 18 | import json 19 | 20 | 21 | def md5_sign(text): 22 | """ 23 | 生成md5 24 | :param src: str, sentence 25 | :return: str, upper of string 26 | """ 27 | md5_model = hashlib.md5(text.encode("utf8")) 28 | return md5_model.hexdigest().upper() 29 | 30 | 31 | def get_params(text, from_l="zh", to_l="en"): 32 | """ 33 | 生成sign和params 34 | :param text: str, input sentence 35 | :param from_: source language 36 | :param to_: target language 37 | :return: dict, params 38 | """ 39 | # 请求时间戳(秒级),用于防止请求重放(保证签名5分钟有效)   40 | time_stamp = str(int(time.time())) 41 | # 请求随机字符串,用于保证签名不可预测   42 | nonce_str = ''.join(random.sample(string.ascii_letters + string.digits, 10)) 43 | params = {'app_id': app_key_tencent, 44 | 'source': from_l, 45 | 'target': to_l, 46 | 'text': text, 47 | 'time_stamp': time_stamp, 48 | 'nonce_str': nonce_str 49 | } 50 | signs = '' 51 | # 要对key排序再拼接   52 | for key in sorted(params): 53 | # 键值拼接过程value部分需要URL编码,URL编码算法用大写字母,例如%E8。quote默认大写。   54 | signs += '{}={}&'.format(key, quote(params[key], safe='').replace("%20", "+")) 55 | # 将应用密钥以app_key为键名,拼接到字符串sign_before末尾   56 | signs += 'app_key={}'.format(app_secret_tentcnet) 57 | # 对字符串sign_before进行MD5运算,得到接口请求签名   58 | sign = md5_sign(signs) 59 | params['sign'] = sign 60 | return params 61 | 62 | 63 | def any_to_any_translate_tencent(text, from_='zh', to_='en'): 64 | """ 65 | 调用搜狗翻译,从任意一种语言到另外一种语言,详情见常量LANGUAGE_SHORT_BAIDU 66 | :param text: str, input sentence 67 | :param from_: source language 68 | :param to_: target language 69 | :return: str, translate sentence 70 | """ 71 | try: 72 | url = "https://api.ai.qq.com/fcgi-bin/nlp/nlp_texttranslate" 73 | params_text = get_params(text, from_l=from_, to_l=to_) 74 | res_post = requests.request("POST", url, data=params_text) 75 | res_content = res_post.content.decode("utf8") 76 | res_json = json.loads(res_content) 77 | target_text = res_json["data"]["target_text"] 78 | return target_text 79 | except Exception as e: 80 | logger.error(str(e)) 81 | return None 82 | 83 | 84 | def translate_tencent_back(text, from_='zh', to_='en'): 85 | """ 86 | 回译,调用两次腾讯翻译 87 | :param text: str, input 
sentence 88 | :param from_: source language 89 | :param to_: target language 90 | :return: str, translate sentence 91 | """ 92 | try: 93 | text_from_to = any_to_any_translate_tencent(text, from_=from_, to_=to_) 94 | text_to_from = any_to_any_translate_tencent(text_from_to, from_=to_, to_=from_) 95 | return text_to_from 96 | except Exception as e: 97 | logger.error(str(e)) 98 | return None 99 | 100 | 101 | 102 | if __name__ == '__main__': 103 | text_test = "你觉得JY会喜欢暗影随风、大漠帝国吗".strip() 104 | for to_test in language_short_tencent: 105 | res_test = translate_tencent_back(text_test, from_='zh', to_=to_test) 106 | print("没有账户就为空,回译结果: ") 107 | print(res_test) 108 | -------------------------------------------------------------------------------- /AugmentText/augment_translate/translate_tools/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 22:57 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /AugmentText/augment_translate/translate_tools/translate_translate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 23:05 4 | # @author :Mo 5 | # @function :使用翻译工具translate.Translator,回译 6 | 7 | 8 | from conf.augment_constant import language_short_google 9 | from utils.text_tools import judge_translate_english 10 | from translate import Translator 11 | 12 | 13 | def translate_tools_translate(text, to_='en'): 14 | """ 15 | 调用translate进行句子生成 16 | :param text: str, input 17 | :param to_: language type 18 | :return: str, result 19 | """ 20 | # provider = 'mymemory','microsoft' 21 | translator1 = Translator(to_lang=to_, from_lang='zh', provider=None, secret_access_key=None) 22 | translator2 = Translator(to_lang="zh", from_lang=to_, provider=None, secret_access_key=None) 23 | 24 | translation1 = translator1.translate(text) 25 | translation2 = translator2.translate(translation1) 26 | return translation2 27 | 28 | 29 | if __name__ == "__main__": 30 | sen_org = "大漠帝国喜欢RSH、JY吗" 31 | for language_short_google_one in language_short_google: 32 | text_translate = translate_tools_translate(sen_org, to_=language_short_google_one) 33 | judge = judge_translate_english(sen_org, text_translate) 34 | if judge: 35 | print("True") 36 | print(text_translate) 37 | else: 38 | print("False") 39 | print(text_translate) 40 | # 测试结果: 41 | # False 42 | # 沙漠帝国是否像RSH,JY? 43 | # False 44 | # 沙漠帝国看起来像RSH,JY? 45 | # False 46 | # 帝国沙漠像rsh,jy? 
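For quick experiments outside the repo's conf/ and utils/ wiring, the round-trip idea in translate_translate.py can be reproduced with the `translate` package alone. A minimal sketch, assuming only `pip install translate`; the helper name `back_translate` and the English pivot are illustrative, not part of the repo, and the package's default MyMemory provider is rate-limited:

```python3
# Standalone back-translation sketch (zh -> pivot -> zh) with the `translate` package.
from translate import Translator

def back_translate(text, pivot="en"):
    to_pivot = Translator(from_lang="zh", to_lang=pivot).translate(text)   # zh -> pivot
    return Translator(from_lang=pivot, to_lang="zh").translate(to_pivot)   # pivot -> zh

if __name__ == "__main__":
    print(back_translate("大漠帝国喜欢RSH、JY吗"))
```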
-------------------------------------------------------------------------------- /AugmentText/augment_translate/translate_web/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 22:58 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ChatBot/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | #!/usr/bin/python 3 | # @Time :2019/3/29 23:11 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 10:17 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 10:17 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/code_seq2seq_char/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 10:50 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/code_seq2seq_char/extract_char_cg.py: -------------------------------------------------------------------------------- 1 | """把 chicken_and_gossip数据 文件格式转换为可训练格式 2 | Code from: QHDuan(2018-02-05) url: https://github.com/qhduan/just_another_seq2seq 3 | """ 4 | 5 | from conf.path_config import chicken_and_gossip_path 6 | from conf.path_config import chatbot_data_cg_char_dir 7 | from conf.path_config import chatbot_data_cg_ws_anti 8 | from conf.path_config import chatbot_data_cg_xy_anti 9 | from conf.path_config import model_ckpt_cg_anti 10 | 11 | from utils.mode_util.seq2seq.word_sequence import WordSequence 12 | from utils.text_tools import txtRead 13 | from tqdm import tqdm 14 | import pickle 15 | import sys 16 | import re 17 | 18 | sys.path.append('..') 19 | 20 | 21 | def make_split(line): 22 | """构造合并两个句子之间的符号 23 | """ 24 | if re.match(r'.*([,。…?!~\.,!?])$', ''.join(line)): 25 | return [] 26 | return [','] 27 | 28 | 29 | def good_line(line): 30 | if len(re.findall(r'[a-zA-Z0-9]', ''.join(line))) > 2: 31 | return False 32 | return True 33 | 34 | 35 | def regular(sen, limit=50): 36 | sen = re.sub(r'\.{3,100}', '…', sen) 37 | sen = re.sub(r'…{2,100}', '…', sen) 38 | sen = re.sub(r'[,]{1,100}', ',', sen) 39 | sen = re.sub(r'[\.]{1,100}', '。', sen) 40 | sen = re.sub(r'[\?]{1,100}', '?', sen) 41 | sen = re.sub(r'[!]{1,100}', '!', sen) 42 | if len(sen) > limit: 43 | sen = sen[0:limit] 44 | return sen 45 | 46 | 47 | def creat_train_data_of_cg_corpus(limit=50, x_limit=2, y_limit=2): 48 | x_datas = [] 49 | y_datas = [] 50 | max_len = 0 51 | sim_ali_web_gov_dli_datas = txtRead(chicken_and_gossip_path, encodeType="utf-8") 52 | for sim_ali_web_gov_dli_datas_one in sim_ali_web_gov_dli_datas[1:]: 53 | if sim_ali_web_gov_dli_datas_one: 54 | sim_ali_web_gov_dli_datas_one_split = 
sim_ali_web_gov_dli_datas_one.strip().split("\t") 55 | if len(sim_ali_web_gov_dli_datas_one_split) == 2: 56 | # if sim_ali_web_gov_dli_datas_one_split[2]=="1": 57 | len_x1 = len(sim_ali_web_gov_dli_datas_one_split[0]) 58 | len_x2 = len(sim_ali_web_gov_dli_datas_one_split[1]) 59 | # if max_len < len_x1 or max_len < len_x2: 60 | max_len = max(len_x1, len_x2, max_len) 61 | 62 | sentence_org = regular(sim_ali_web_gov_dli_datas_one_split[0], limit=limit) 63 | sentence_sim = regular(sim_ali_web_gov_dli_datas_one_split[1], limit=limit) 64 | x_datas.append([sen for sen in sentence_org]) 65 | y_datas.append([sen for sen in sentence_sim]) 66 | # x_datas.append([sen for sen in sentence_sim]) 67 | # y_datas.append([sen for sen in sentence_org]) 68 | 69 | datas = list(zip(x_datas, y_datas)) 70 | datas = [ 71 | (x, y) 72 | for x, y in datas 73 | if len(x) < limit and len(y) < limit and len(y) >= y_limit and len(x) >= x_limit 74 | ] 75 | x_datas, y_datas = zip(*datas) 76 | 77 | print('fit word_sequence') 78 | 79 | ws_input = WordSequence() 80 | ws_input.fit(x_datas + y_datas) 81 | 82 | print('dump') 83 | 84 | pickle.dump((x_datas, y_datas), 85 | open(chatbot_data_cg_xy_anti, 'wb') 86 | ) 87 | pickle.dump(ws_input, open(chatbot_data_cg_ws_anti, 'wb')) 88 | 89 | print('done') 90 | print(max_len) 91 | 92 | 93 | if __name__ == '__main__': 94 | creat_train_data_of_cg_corpus() 95 | -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/code_seq2seq_char/predict_char_cg.py: -------------------------------------------------------------------------------- 1 | """ 2 | 对SequenceToSequence模型进行基本的参数组合测试 3 | Code from: QHDuan(2018-02-05) url: https://github.com/qhduan/just_another_seq2seq 4 | 5 | """ 6 | 7 | from utils.mode_util.seq2seq.data_utils import batch_flow_bucket as batch_flow 8 | from utils.mode_util.seq2seq.thread_generator import ThreadedGenerator 9 | from utils.mode_util.seq2seq.model_seq2seq import SequenceToSequence 10 | from utils.mode_util.seq2seq.word_sequence import WordSequence 11 | 12 | from conf.path_config import chicken_and_gossip_path 13 | from conf.path_config import chatbot_data_cg_char_dir 14 | from conf.path_config import chatbot_data_cg_ws_anti 15 | from conf.path_config import chatbot_data_cg_xy_anti 16 | from conf.path_config import model_ckpt_cg_anti 17 | from conf.path_config import path_params 18 | 19 | import tensorflow as tf 20 | import numpy as np 21 | import pickle 22 | import json 23 | import sys 24 | 25 | sys.path.append('..') 26 | 27 | 28 | def predict_anti(params): 29 | """测试不同参数在生成的假数据上的运行结果""" 30 | 31 | x_data, _ = pickle.load(open(chatbot_data_cg_xy_anti, 'rb')) 32 | ws = pickle.load(open(chatbot_data_cg_ws_anti, 'rb')) 33 | 34 | for x in x_data[:5]: 35 | print(' '.join(x)) 36 | 37 | config = tf.ConfigProto( 38 | # device_count={'CPU': 1, 'GPU': 0}, 39 | allow_soft_placement=True, 40 | log_device_placement=False 41 | ) 42 | 43 | save_path = model_ckpt_cg_anti 44 | 45 | # 测试部分 46 | tf.reset_default_graph() 47 | model_pred = SequenceToSequence( 48 | input_vocab_size=len(ws), 49 | target_vocab_size=len(ws), 50 | batch_size=1, 51 | mode='decode', 52 | beam_width=0, 53 | **params 54 | ) 55 | init = tf.global_variables_initializer() 56 | 57 | with tf.Session(config=config) as sess: 58 | sess.run(init) 59 | model_pred.load(sess, save_path) 60 | 61 | while True: 62 | user_text = input('Input Chat Sentence:') 63 | if user_text in ('exit', 'quit'): 64 | exit(0) 65 | x_test = [list(user_text.lower())] 66 | # x_test = 
[word_tokenize(user_text)] 67 | bar = batch_flow([x_test], ws, 1) 68 | x, xl = next(bar) 69 | x = np.flip(x, axis=1) 70 | # x = np.array([ 71 | # list(reversed(xx)) 72 | # for xx in x 73 | # ]) 74 | print(x, xl) 75 | pred = model_pred.predict( 76 | sess, 77 | np.array(x), 78 | np.array(xl) 79 | ) 80 | print(pred) 81 | # prob = np.exp(prob.transpose()) 82 | print(ws.inverse_transform(x[0])) 83 | # print(ws.inverse_transform(pred[0])) 84 | # print(pred.shape, prob.shape) 85 | for p in pred: 86 | ans = ws.inverse_transform(p) 87 | print(ans) 88 | 89 | 90 | def main(): 91 | """入口程序""" 92 | import json 93 | predict_anti(json.load(open(path_params))) 94 | 95 | 96 | if __name__ == '__main__': 97 | main() -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/code_seq2seq_word/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 10:52 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/code_seq2seq_word/extract_word_cg.py: -------------------------------------------------------------------------------- 1 | """把 chicken_and_gossip数据 文件格式转换为可训练格式 2 | Code from: QHDuan(2018-02-05) url: https://github.com/qhduan/just_another_seq2seq 3 | """ 4 | 5 | import re 6 | import sys 7 | import pickle 8 | import jieba 9 | import gensim 10 | import numpy as np 11 | from tqdm import tqdm 12 | from utils.text_tools import txtRead 13 | from conf.path_config import word2_vec_path 14 | from conf.path_config import chicken_and_gossip_path 15 | from conf.path_config import w2v_model_merge_short_path 16 | from utils.mode_util.seq2seq.word_sequence import WordSequence 17 | 18 | from conf.path_config import chatbot_data_cg_xyw_anti_word 19 | from conf.path_config import chatbot_data_cg_emb_anti_word 20 | from conf.path_config import model_ckpt_cg_anti_word 21 | 22 | 23 | sys.path.append('..') 24 | 25 | 26 | def make_split(line): 27 | """构造合并两个句子之间的符号 28 | """ 29 | if re.match(r'.*([,。…?!~\.,!?])$', ''.join(line)): 30 | return [] 31 | return [','] 32 | 33 | 34 | def good_line(line): 35 | """判断一个句子是否好""" 36 | if len(re.findall(r'[a-zA-Z0-9]', ''.join(line))) > 2: 37 | return False 38 | return True 39 | 40 | 41 | def regular(sen, limit=50): 42 | sen = re.sub(r'\.{3,100}', '…', sen) 43 | sen = re.sub(r'…{2,100}', '…', sen) 44 | sen = re.sub(r'[,]{1,100}', ',', sen) 45 | sen = re.sub(r'[\.]{1,100}', '。', sen) 46 | sen = re.sub(r'[\?]{1,100}', '?', sen) 47 | sen = re.sub(r'[!]{1,100}', '!', sen) 48 | if len(sen) > limit: 49 | sen = sen[0:limit] 50 | return sen 51 | 52 | 53 | def creat_train_data_of_cg_corpus(limit=50, x_limit=2, y_limit=2): 54 | 55 | print('load word2vec start!') 56 | word_vec_short = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_merge_short_path, binary=False, limit=None, encoding='gbk') 57 | print('load word_vec_short start!') 58 | word_vec = gensim.models.KeyedVectors.load_word2vec_format(word2_vec_path, binary=False, limit=None) 59 | print('load word_vec end!') 60 | 61 | x_datas = [] 62 | y_datas = [] 63 | max_len = 0 64 | sim_ali_web_gov_dli_datas = txtRead(chicken_and_gossip_path, encodeType="utf-8") 65 | for sim_ali_web_gov_dli_datas_one in sim_ali_web_gov_dli_datas[1:]: 66 | if sim_ali_web_gov_dli_datas_one: 67 | sim_ali_web_gov_dli_datas_one_split = sim_ali_web_gov_dli_datas_one.strip().split("\t") 68 | if 
len(sim_ali_web_gov_dli_datas_one_split) == 2: 69 | # if sim_ali_web_gov_dli_datas_one_split[2]=="1": 70 | len_x1 = len(sim_ali_web_gov_dli_datas_one_split[0]) 71 | len_x2 = len(sim_ali_web_gov_dli_datas_one_split[1]) 72 | # if max_len < len_x1 or max_len < len_x2: 73 | max_len = max(len_x1, len_x2, max_len) 74 | 75 | sentence_org = regular(sim_ali_web_gov_dli_datas_one_split[0], limit=limit) 76 | sentence_sim = regular(sim_ali_web_gov_dli_datas_one_split[1], limit=limit) 77 | org_cut = jieba._lcut(sentence_org) 78 | sen_cut = jieba._lcut(sentence_sim) 79 | 80 | x_datas.append(org_cut) 81 | y_datas.append(sen_cut) 82 | 83 | data = list(zip(x_datas, y_datas)) 84 | data = [ 85 | (x, y) 86 | for x, y in data 87 | if len(x) < limit \ 88 | and len(y) < limit \ 89 | and len(y) >= y_limit \ 90 | and len(x) >= x_limit 91 | ] 92 | x_data, y_data = zip(*data) 93 | 94 | print('refine train data') 95 | 96 | train_data = x_data + y_data 97 | 98 | print('fit word_sequence') 99 | 100 | ws_input = WordSequence() 101 | 102 | ws_input.fit(train_data, max_features=100000) 103 | 104 | print('dump word_sequence') 105 | 106 | 107 | pickle.dump((x_data, y_data, ws_input), 108 | open(chatbot_data_cg_xyw_anti_word, 'wb') 109 | ) 110 | 111 | print('make embedding vecs') 112 | 113 | emb = np.zeros((len(ws_input), len(word_vec_short['']))) 114 | 115 | np.random.seed(1) 116 | for word, ind in ws_input.dict.items(): 117 | if word in word_vec: 118 | emb[ind] = word_vec[word] 119 | else: 120 | emb[ind] = np.random.random(size=(300,)) - 0.5 121 | 122 | print('dump emb') 123 | 124 | pickle.dump( 125 | emb, 126 | open(chatbot_data_cg_emb_anti_word, 'wb') 127 | ) 128 | 129 | print('done') 130 | 131 | 132 | if __name__ == '__main__': 133 | creat_train_data_of_cg_corpus() -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/code_seq2seq_word/pred_word_cg.py: -------------------------------------------------------------------------------- 1 | """ 2 | 对SequenceToSequence模型进行基本的参数组合测试 3 | """ 4 | from utils.mode_util.seq2seq.data_utils import batch_flow_bucket as batch_flow 5 | from utils.mode_util.seq2seq.thread_generator import ThreadedGenerator 6 | from utils.mode_util.seq2seq.model_seq2seq import SequenceToSequence 7 | from utils.mode_util.seq2seq.word_sequence import WordSequence 8 | 9 | from conf.path_config import chatbot_data_cg_xyw_anti_word 10 | from conf.path_config import chatbot_data_cg_emb_anti_word 11 | from conf.path_config import model_ckpt_cg_anti_word 12 | from conf.path_config import path_params 13 | 14 | import tensorflow as tf 15 | from tqdm import tqdm 16 | import numpy as np 17 | import random 18 | import pickle 19 | import jieba 20 | import sys 21 | 22 | sys.path.append('..') 23 | 24 | 25 | def test(bidirectional, cell_type, depth, 26 | attention_type, use_residual, use_dropout, time_major, hidden_units): 27 | """测试不同参数在生成的假数据上的运行结果""" 28 | 29 | x_data, _, ws = pickle.load(open(chatbot_data_cg_xyw_anti_word, 'rb')) 30 | 31 | for x in x_data[:5]: 32 | print(' '.join(x)) 33 | 34 | config = tf.ConfigProto( 35 | device_count={'CPU': 1, 'GPU': 0}, 36 | allow_soft_placement=True, 37 | log_device_placement=False 38 | ) 39 | 40 | # save_path = '/tmp/s2ss_chatbot.ckpt' 41 | save_path = model_ckpt_cg_anti_word 42 | 43 | # 测试部分 44 | tf.reset_default_graph() 45 | model_pred = SequenceToSequence( 46 | input_vocab_size=len(ws), 47 | target_vocab_size=len(ws), 48 | batch_size=1, 49 | mode='decode', 50 | beam_width=0, 51 | bidirectional=bidirectional, 52 | 
cell_type=cell_type, 53 | depth=depth, 54 | attention_type=attention_type, 55 | use_residual=use_residual, 56 | use_dropout=use_dropout, 57 | parallel_iterations=1, 58 | time_major=time_major, 59 | hidden_units=hidden_units, 60 | share_embedding=True, 61 | pretrained_embedding=True 62 | ) 63 | init = tf.global_variables_initializer() 64 | 65 | with tf.Session(config=config) as sess: 66 | sess.run(init) 67 | model_pred.load(sess, save_path) 68 | 69 | while True: 70 | user_text = input('Input Chat Sentence:') 71 | if user_text in ('exit', 'quit'): 72 | exit(0) 73 | x_test = [jieba.lcut(user_text.lower())] 74 | # x_test = [word_tokenize(user_text)] 75 | bar = batch_flow([x_test], ws, 1) 76 | x, xl = next(bar) 77 | x = np.flip(x, axis=1) 78 | # x = np.array([ 79 | # list(reversed(xx)) 80 | # for xx in x 81 | # ]) 82 | print(x, xl) 83 | pred = model_pred.predict( 84 | sess, 85 | np.array(x), 86 | np.array(xl) 87 | ) 88 | print(pred) 89 | # prob = np.exp(prob.transpose()) 90 | print(ws.inverse_transform(x[0])) 91 | # print(ws.inverse_transform(pred[0])) 92 | # print(pred.shape, prob.shape) 93 | for p in pred: 94 | ans = ws.inverse_transform(p) 95 | print(ans) 96 | 97 | 98 | def main(): 99 | """入口程序,开始测试不同参数组合""" 100 | random.seed(0) 101 | np.random.seed(0) 102 | tf.set_random_seed(0) 103 | test( 104 | bidirectional=True, 105 | cell_type='lstm', 106 | depth=2, 107 | attention_type='Bahdanau', 108 | use_residual=False, 109 | use_dropout=False, 110 | time_major=False, 111 | hidden_units=512 112 | ) 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/data_mid/char/useless.txt: -------------------------------------------------------------------------------- 1 | useless.txt -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/data_mid/word/useless.txt: -------------------------------------------------------------------------------- 1 | useless.txt -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/model_seq2seq_tp/seq2seq_char_cg/useless.txt: -------------------------------------------------------------------------------- 1 | useless.txt -------------------------------------------------------------------------------- /ChatBot/chatbot_generate/seq2seq/model_seq2seq_tp/seq2seq_word_cg/useless.txt: -------------------------------------------------------------------------------- 1 | useless.txt -------------------------------------------------------------------------------- /ChatBot/chatbot_search/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/3 15:15 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_bertwhite/README.md: -------------------------------------------------------------------------------- 1 | # chatbot_bertwhite 2 | ## 解释说明 3 | - 代码说明: 4 | - 1. bertWhiteConf.py 超参数配置, 地址、bert-white、索引工具等的超参数 5 | - 2. bertWhiteTools.py 小工具, 主要是一些文档读写功能函数 6 | - 3. bertWhiteTrain.py 主模块, 类似bert预训练模型编码 7 | - 4. indexAnnoy.py annoy索引 8 | - 5. indexFaiss.py faiss索引 9 | - 6. mmr.py 最大边界相关法, 保证返回多样性 10 | 11 | ## 备注说明: 12 | - 1. ***如果FQA标准问答对很少, 比如少于1w条数据, 建议不要用bert-white, 其与领域数据相关, 数据量太小会极大降低泛化性***; 13 | - 2. 可以考虑small、tiny类小模型, 如果要加速推理; 14 | - 3. 
annoy安装于linux必须有c++环境, 如gcc-c++, g++等, 只有gcc的话可以用faiss-cpu 15 | - 4. 增量更新: 建议问题对增量更新/faiss-annoy索引全量更新 16 | 17 | ## 模型文件 18 | - 1. 模型文件采用的是 "" -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_bertwhite/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/5/13 21:21 4 | # @author : Mo 5 | # @function: -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_bertwhite/bertWhiteConf.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/5/13 9:27 4 | # @author : Mo 5 | # @function: config of Bert-White 6 | 7 | 8 | import platform 9 | # 适配linux 10 | import sys 11 | import os 12 | # path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) 13 | path_root = os.path.abspath(os.path.dirname(__file__)) 14 | sys.path.append(path_root) 15 | print(path_root) 16 | 17 | 18 | if platform.system().lower() == 'windows': 19 | # BERT_DIR = "D:/soft_install/dataset/bert-model/chinese_L-12_H-768_A-12" 20 | # BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_roberta_L-4_H-312_A-12_K-104" 21 | # BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_roberta_L-6_H-384_A-12_K-128" 22 | BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_simbert_L-4_H-312_A-12" 23 | # BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_simbert_L-6_H-384_A-12" 24 | else: 25 | BERT_DIR = "/bert/chinese_L-12_H-768_A-12" 26 | ee = 0 27 | 28 | SAVE_DIR = path_root + "/bert_white" 29 | print(SAVE_DIR) 30 | if not os.path.exists(SAVE_DIR): 31 | os.makedirs(SAVE_DIR) 32 | 33 | 34 | bert_white_config = { 35 | # 预训练模型路径 36 | "bert_dir": BERT_DIR, 37 | "checkpoint_path": "bert_model.ckpt", # 预训练模型地址 38 | "config_path": "bert_config.json", 39 | "dict_path": "vocab.txt", 40 | # 预测需要的文件路径 41 | "save_dir": SAVE_DIR, 42 | "path_docs_encode": "qa.docs.encode.npy", 43 | "path_answers": "qa.answers.json", 44 | "path_qa_idx": "qa.idx.json", 45 | "path_config": "config.json", 46 | "path_docs": "qa.docs.json", 47 | # 索引构建的存储文件, 如 annoy/faiss 48 | "path_index": "qa.docs.idx", 49 | # 初始语料路径 50 | "path_qa": "chicken_and_gossip.txt", # QA问答文件地址 51 | # 超参数 52 | "pre_tokenize": None, 53 | "pooling": "cls-1", # ["first-last-avg", "last-avg", "cls", "pooler", "cls-2", "cls-3", "cls-1"] 54 | "model": "bert", # bert4keras预训练模型类型 55 | "n_components": 768, # 降维到 n_components 56 | "n_cluster": 132, # annoy构建的簇类中心个数n_cluster, 越多效果越好, 计算量就越大 57 | "batch_size": 32, # 批尺寸 58 | "maxlen": 128, # 最大文本长度 59 | "ues_white": False, # 是否使用白化 60 | "use_annoy": False, # 是否使用annoy 61 | "use_faiss": True, # 是否使用faiss 62 | "verbose": True, # 是否显示编码过程日志-batch 63 | 64 | "kernel": None, # bert-white编码后的参数, 可降维 65 | "bias": None, # bert-white编码后的参数, 偏置bias 66 | "qa_idx": None # 问题question到答案answer的id对应关系 67 | } 68 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_bertwhite/bertWhiteTools.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/5/13 21:24 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | from typing import List, Dict, Union, Any 9 | import logging as logger 10 | import json 11 | 12 | 13 | def txt_read(path: str, encoding: str = 
"utf-8") -> List[str]: 14 | """ 15 | Read Line of list form file 16 | Args: 17 | path: path of save file, such as "txt" 18 | encoding: type of encoding, such as "utf-8", "gbk" 19 | Returns: 20 | dict of word2vec, eg. {"macadam":[...]} 21 | """ 22 | 23 | lines = [] 24 | try: 25 | file = open(path, "r", encoding=encoding) 26 | while True: 27 | line = file.readline().strip() 28 | if not line: 29 | break 30 | lines.append(line) 31 | file.close() 32 | except Exception as e: 33 | logger.info(str(e)) 34 | finally: 35 | return lines 36 | 37 | 38 | def txt_write(lines: List[str], path: str, model: str = "w", encoding: str = "utf-8"): 39 | """ 40 | Write Line of list to file 41 | Args: 42 | lines: lines of list which need save 43 | path: path of save file, such as "txt" 44 | model: type of write, such as "w", "a+" 45 | encoding: type of encoding, such as "utf-8", "gbk" 46 | """ 47 | try: 48 | file = open(path, model, encoding=encoding) 49 | file.writelines(lines) 50 | file.close() 51 | except Exception as e: 52 | logger.info(str(e)) 53 | 54 | 55 | def save_json(jsons, json_path, indent=4): 56 | """ 57 | 保存json, 58 | :param json_: json 59 | :param path: str 60 | :return: None 61 | """ 62 | with open(json_path, 'w', encoding='utf-8') as fj: 63 | fj.write(json.dumps(jsons, ensure_ascii=False, indent=indent)) 64 | fj.close() 65 | 66 | 67 | def load_json(path): 68 | """ 69 | 获取json,只取第一行 70 | :param path: str 71 | :return: json 72 | """ 73 | with open(path, 'r', encoding='utf-8') as fj: 74 | model_json = json.load(fj) 75 | return model_json 76 | 77 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_bertwhite/chicken_and_gossip.txt: -------------------------------------------------------------------------------- 1 | 呵呵 是王若猫的。 2 | 不是 那是什么? 3 | 怎么了 我很难过,安慰我~ 4 | 开心点哈,一切都会好起来 嗯,会的 5 | 我还喜欢她,怎么办 我帮你告诉她?发短信还是打电话? 6 | 短信 嗯嗯。我也相信 7 | 你知道谁么 肯定不是我,是阮德培 8 | 许兵是谁 吴院四班小帅哥 9 | 这么假 三鹿奶粉也假,不一样的卖啊 10 | 许兵是傻逼 被你发现了。 11 | 许兵是谁 是我善良可爱的主人的老公啊 12 | 许兵是谁 是穆森的老婆啊 13 | 许兵是谁 奇葩 14 | 许兵到底是谁 不要凶我~应该大概也许是叶祺吧~ 15 | 尼玛,许兵到底是谁 我。。。我不会告诉你我是小澜的!(^V^) 16 | 小黄鸭,你有女朋友么 老娘是女的…! 17 | 那你有男朋友么 没有呢,我只要主人一个人疼爱我嘛。。 18 | 那你在哪 我无聊 19 | 你妈是谁 我妈妈是世界上最温柔的主人你啊 20 | 去你大爷的 我去,滚你大爷的! 21 | 你在骂我一句 你果然够虚荣~ 22 | 你大爷的 RuoRuo弱爆了。尤其是她现在冻的更憨了 23 | 你是屌丝鸡 嗯嗯 24 | 高富帅 杨旸啊 25 | 你 ****** 26 | 呵呵 你妹 27 | 今天是谁的生日 是我大乔姐十七的生日! 28 | 你敢不敢 你敢我就敢,小样,敢跟我叫板 29 | 呵呵 呵呵你大爷 30 | 呵呵呵 敷衍我.. 31 | 你是女的了?怎么回事 你教我的 32 | 呵呵 呵你妹,你呵呵了我要怎么回答你。哈哈? 33 | 天王盖地虎 宝塔镇妖河。 34 | 小通 臣在 35 | 在监考,你在干么 在边打dota边想悦悦。 36 | 哼!你不想我我就不和你玩 那我就找别人玩去! 37 | 你要气死我吗?坏蛋 生气是拿别人的错误惩罚自己。想想在你生气的时候,谁在偷着乐呢?绝不能让丫得逞了! 38 | 恩,也是!那我不能生气啦!你生气就行 我错了!我再也不敢了,才怪 39 | 你谈过恋爱么 谈过,哎,别提了,伤心.. 40 | 什么让你这么伤心 如果我的无心之谈害你流泪,那我和你道歉,我爱你。 41 | 敢问你的性别 万受无疆=w= 42 | 小受 蒋六一是小受 43 | 是吗 还可以。 44 | 你干嘛 很无聊哎〜都不知道想干嘛!你在干嘛呢? 45 | 为什么 因为爱情喽 46 | 你有爱情了 不知啊 47 | 那同时有两个爱人 == 48 | 那你不行 我在减肥呢 49 | 你说 臣实不知。 50 | 你回错人了 不要相信男人!男人都是大骗子大混蛋! 51 | 在干嘛 在想你啊 52 | 想我干嘛 你是个傻叉!想你笑掉大牙,哈哈哈! 53 | 你会说英语吗 当然了,здравствите! 54 | 我会和女朋友结婚吗 会的,你俩虽然有矛盾争吵,但以后却会很幸福 55 | 你的设计人是谁 某苦逼的程序猿 56 | 嫁给我 我很能吃的! 57 | 我不在乎 他变态 58 | 你啊,那你娶我 我没有切糕 59 | 我看到你第一眼,就喜欢你了 哦?真的吗? 60 | 好难过 不要难过我请你吃虫子 61 | 你是公鸡吗 公鸡中的战斗机,噢耶! 62 | 你是猪 我是鸡。 63 | 怎么证明 你问~我爱~你有~多深~我爱~你有~几~分~~~ 64 | 我怎么加你好友 放轻松,想开了,丰富自己就会忘了 65 | 你又智能了 今夜的你应该明了 66 | 你说的她是谁 陈舒雅 67 | 什么意思 她是在说他不懂女生的心思,但是又愣愣的,正直得很可爱,,,,这都不知道,你还真是傻啊~~~ 68 | 陈舒雅是哪位 一个很凶的女人 69 | 你才是 我不是啦。你才是 70 | 陈舒雅 大懒虫 71 | 大懒鸡 你不爱我 72 | 你说的那个她是谁 傻瓜现在在你身边看着的人就是你的她你要对她好宠她爱她知道了吗? 73 | 她是谁 以后是你老婆 74 | 陈舒雅是谁 90后网络歌手啊,你不知道么? 75 | 鸡鸡 主人 76 | 呵呵呵 呵你妹啊 77 | 侬都不困么 == 78 | 变身小妹 =。= 79 | 家属 =。= 80 | 吐槽伤人,排泄臭人 =。= 81 | 公的女的 女的啊! 
82 | 去洗澡 带上我~一起一起~ 83 | 。。。 =。= 84 | 吼吼 吼 85 | 你有男朋友没 你觉得呢 86 | 赵智是谁 是主人最讨厌的男人! 87 | 你妹 刚给你做完SPA你就骂轮家,轮家桑心了,呜呜呜 88 | 谁叫你就 ujmmmmmmmmmmmm 89 | 什么意思 你猜啊,猜对了就告诉你! 90 | 猜你妹 你大爷! 91 | 好就猜你大爷 你找徐磊? 92 | 他是大爷 ******…我大爷早死了…你找他有事?信不信我让他把你带走!^V^ 93 | 你大爷不认识我!要不先带你去 =。= 94 | 还给你 人家本来就嫩!你欺负人家 95 | 就欺负你要怎么!就喜欢欺负你 == 96 | 呵呵 呵你妹 97 | 刚给你做完spa你就骂小通!小通伤心了 =。= 98 | 呵呵 别傻笑了,亲耐滴主人 99 | 别逗小通了!可怜的 =。= 100 | 人太多了!找不到你给我回的了 =。= 101 | 发 给 102 | 我 你什么 103 | 重新发 == 104 | 重新发!我知道你在 == 105 | 你妹 人家错了 106 | 才知道啊!晚了!叫你重发 =。= 107 | 〜 =。= 108 | =*= =。= 109 | @@ == 110 | ¥ =。= 111 | @ =。= 112 | / =。= 113 | 。 == 114 | 继续 没有继续了 115 | 必须 大不了一拍两散! 116 | 就要刷你 == 117 | l 喵? 118 | f 甜菜少年。 119 | x 〒_〒 120 | m g 121 | t !!!!!!! 122 | s 傻子 123 | h 主人,有什么吩咐,小通在此! 124 | n u 125 | p xp 126 | 太不真心了 我说的是哈维 127 | 管你什么哈维!方正就是看你不爽 == 128 | 看你不爽 不要呀,哪不好我改,一定改!不要炖了我呀! 129 | z zz 130 | j 正晌午时说话,谁也没有家! 131 | m r 132 | b b 133 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_bertwhite/indexAnnoy.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/4/18 21:04 4 | # @author : Mo 5 | # @function: annoy search 6 | 7 | 8 | from annoy import AnnoyIndex 9 | import numpy as np 10 | import os 11 | 12 | 13 | class AnnoySearch: 14 | def __init__(self, dim=768, n_cluster=100): 15 | # metric可选“angular”(余弦距离)、“euclidean”(欧几里得距离)、 “ manhattan”(曼哈顿距离)或“hamming”(海明距离) 16 | self.annoy_index = AnnoyIndex(dim, metric="angular") 17 | self.n_cluster = n_cluster 18 | self.dim = dim 19 | 20 | def k_neighbors(self, vectors, k=18): 21 | """ 搜索 """ 22 | annoy_tops = [] 23 | for v in vectors: 24 | idx, dist = self.annoy_index.get_nns_by_vector(v, k, search_k=32*k, include_distances=True) 25 | annoy_tops.append([dist, idx]) 26 | return annoy_tops 27 | 28 | def fit(self, vectors): 29 | """ annoy构建 """ 30 | for i, v in enumerate(vectors): 31 | self.annoy_index.add_item(i, v) 32 | self.annoy_index.build(self.n_cluster) 33 | 34 | def save(self, path): 35 | """ 存储 """ 36 | self.annoy_index.save(path) 37 | 38 | def load(self, path): 39 | """ 加载 """ 40 | self.annoy_index.load(path) 41 | 42 | 43 | if __name__ == '__main__': 44 | ### 索引 45 | import random 46 | path = "model.ann" 47 | dim = 768 48 | vectors = [[random.gauss(0, 1) for z in range(768)] for i in range(10)] 49 | an_model = AnnoySearch(dim, n_cluster=32) # Length of item vector that will be indexed 50 | an_model.fit(vectors) 51 | an_model.save(path) 52 | tops = an_model.k_neighbors([vectors[0]], 18) 53 | print(tops) 54 | 55 | del an_model 56 | 57 | ### 下载, 搜索 58 | an_model = AnnoySearch(dim, n_cluster=32) 59 | an_model.load(path) 60 | tops = an_model.k_neighbors([vectors[0]], 6) 61 | print(tops) 62 | 63 | 64 | 65 | """ 66 | # example 67 | from annoy import AnnoyIndex 68 | import random 69 | 70 | dim = 768 71 | vectors = [[random.gauss(0, 1) for z in range(768)] for i in range(10)] 72 | ann_model = AnnoyIndex(dim, 'angular') # Length of item vector that will be indexed 73 | for i,v in enumerate(vectors): 74 | ann_model.add_item(i, v) 75 | ann_model.build(10) # 10 trees 76 | ann_model.save("tet.ann") 77 | del ann_model 78 | 79 | u = AnnoyIndex(dim, "angular") 80 | u.load('tet.ann') # super fast, will just mmap the file 81 | v = vectors[1] 82 | idx, dist = u.get_nns_by_vector(v, 10, search_k=50 * 10, include_distances=True) 83 | print([idx, dist]) 84 | """ 85 | 86 | 87 | 88 | ### 备注说明: annoy索引 无法 增删会改查 89 | 90 | 91 | 92 | 
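As the closing note of indexAnnoy.py says, an Annoy index cannot be modified once built, so the chatbot_bertwhite README recommends a full index rebuild whenever the FAQ pairs change. A minimal sketch of that rebuild pattern on top of the AnnoySearch class above; `rebuild_annoy` and `new_vectors` are illustrative names, not part of the repo:

```python3
# Full rebuild of the Annoy index when the FAQ corpus changes.
# `new_vectors` is assumed to be the re-encoded corpus (an iterable of 768-dim vectors).
def rebuild_annoy(new_vectors, path="model.ann", dim=768, n_cluster=32):
    an_model = AnnoySearch(dim, n_cluster=n_cluster)
    an_model.fit(new_vectors)   # add_item() for every vector, then build(n_cluster)
    an_model.save(path)         # overwrite (or atomically swap) the on-disk index file
    return an_model
```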
-------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_bertwhite/indexFaiss.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/5/9 16:02 4 | # @author : Mo 5 | # @function: search of faiss 6 | 7 | 8 | from faiss import normalize_L2 9 | import numpy as np 10 | import faiss 11 | import os 12 | 13 | 14 | class FaissSearch: 15 | def __init__(self, dim=768, n_cluster=100): 16 | self.n_cluster = n_cluster # 聚类中心 17 | self.dim = dim 18 | quantizer = faiss.IndexFlatIP(self.dim) 19 | # METRIC_INNER_PRODUCT:余弦; L2: faiss.METRIC_L2 20 | self.faiss_index = faiss.IndexIVFFlat(quantizer, self.dim, self.n_cluster, faiss.METRIC_INNER_PRODUCT) 21 | # self.faiss_index = faiss.IndexFlatIP(self.dim) # 索引速度更快 但是不可增量 22 | 23 | def k_neighbors(self, vectors, k=6): 24 | """ 搜索 """ 25 | normalize_L2(vectors) 26 | dist, index = self.faiss_index.search(vectors, k) # sanity check 27 | return dist.tolist(), index.tolist() 28 | 29 | def fit(self, vectors): 30 | """ annoy构建 """ 31 | normalize_L2(vectors) 32 | self.faiss_index.train(vectors) 33 | # self.faiss_index.add(vectors) 34 | self.faiss_index.add_with_ids(vectors, np.arange(0, len(vectors))) 35 | 36 | def remove(self, ids): 37 | self.faiss_index.remove_ids(np.array(ids)) 38 | 39 | def save(self, path): 40 | """ 存储 """ 41 | faiss.write_index(self.faiss_index, path) 42 | 43 | def load(self, path): 44 | """ 加载 """ 45 | self.faiss_index = faiss.read_index(path) 46 | 47 | 48 | if __name__ == '__main__': 49 | 50 | import random 51 | 52 | path = "model.fai" 53 | dim = 768 54 | vectors = np.array([[random.gauss(0, 1) for z in range(768)] for i in range(32)], dtype=np.float32) 55 | fai_model = FaissSearch(dim, n_cluster=32) # Length of item vector that will be indexed 56 | fai_model.fit(vectors) 57 | fai_model.save(path) 58 | tops = fai_model.k_neighbors(vectors[:32], 32) 59 | print(tops) 60 | ids = np.arange(10, 32) 61 | fai_model.remove(ids) 62 | tops = fai_model.k_neighbors(vectors[:32], 32) 63 | print(tops) 64 | print(len(tops)) 65 | 66 | del fai_model 67 | 68 | fai_model = FaissSearch(dim, n_cluster=32) 69 | fai_model.load(path) 70 | tops = fai_model.k_neighbors(vectors[:32], 32) 71 | print(tops) 72 | 73 | 74 | 75 | """ 76 | import numpy as np 77 | d = 64 # dimension 78 | nb = 100000 # database size 79 | nq = 10000 # nb of queries 80 | np.random.seed(1234) # make reproducible 81 | xb = np.random.random((nb, d)).astype('float32') 82 | xb[:, 0] += np.arange(nb) / 1000. 83 | xq = np.random.random((nq, d)).astype('float32') 84 | xq[:, 0] += np.arange(nq) / 1000. 
85 | 86 | import faiss # make faiss available 87 | # # 量化器索引 88 | # nlist = 1000 # 聚类中心的个数 89 | # k = 50 # 邻居个数 90 | # quantizer = faiss.IndexFlatIP(d) # the other index,需要以其他index作为基础 91 | # index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT) # METRIC_INNER_PRODUCT:余弦; L2: faiss.METRIC_L2 92 | 93 | ntree = 132 # 聚类中心的个数 94 | quantizer = faiss.IndexFlatIP(d) 95 | index = faiss.IndexIVFFlat(quantizer, d, ntree, faiss.METRIC_INNER_PRODUCT) 96 | # index = faiss.IndexFlatL2(d) # build the index 97 | print(index.is_trained) 98 | index.add(xb) # add vectors to the index 99 | print(index.ntotal) 100 | 101 | k = 4 # we want to see 4 nearest neighbors 102 | D, I = index.search(xb[:5], k) # sanity check 103 | print(I) 104 | print(D) 105 | D, I = index.search(xq, k) # actual search 106 | print(I[:5]) # neighbors of the 5 first queries 107 | print(I[-5:]) # neighbors of the 5 last queries 108 | """ 109 | 110 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_bertwhite/mmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/10/28 10:16 4 | # @author :Mo 5 | # @function :MMR, Maximal Marginal Relevance, 最大边界相关法或者最大边缘相关 6 | 7 | 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | import logging 10 | import jieba 11 | import copy 12 | import json 13 | import re 14 | import os 15 | 16 | 17 | jieba.setLogLevel(logging.INFO) 18 | 19 | 20 | stop_words = {"0": "~~~~", 21 | "1": "...................", 22 | "2": "......",} 23 | 24 | 25 | def cut_sentence(sentence): 26 | """ 27 | 分句 28 | :param sentence:str 29 | :return:list 30 | """ 31 | re_sen = re.compile("[:;!?。:;?!\n\r]") #.不加是因为不确定.是小数还是英文句号(中文省略号......) 32 | sentences = re_sen.split(sentence) 33 | sen_cuts = [] 34 | for sen in sentences: 35 | if sen and str(sen).strip(): 36 | sen_cuts.append(sen) 37 | return sen_cuts 38 | 39 | def extract_chinese(text): 40 | """ 41 | 只提取出中文、字母和数字 42 | :param text: str, input of sentence 43 | :return: 44 | """ 45 | chinese_exttract = "".join(re.findall(u"([\u4e00-\u9fa5A-Za-z0-9@. 
])", text)) 46 | return chinese_exttract 47 | 48 | def tfidf_fit(sentences): 49 | """ 50 | tfidf相似度 51 | :param sentences: 52 | :return: 53 | """ 54 | # tfidf计算 55 | model = TfidfVectorizer(ngram_range=(1, 2), # 3,5 56 | stop_words=[" ", "\t", "\n"], # 停用词 57 | max_features=10000, 58 | token_pattern=r"(?u)\b\w+\b", # 过滤停用词 59 | min_df=1, 60 | max_df=0.9, 61 | use_idf=1, # 光滑 62 | smooth_idf=1, # 光滑 63 | sublinear_tf=1, ) # 光滑 64 | matrix = model.fit_transform(sentences) 65 | return matrix 66 | 67 | def jieba_cut(text): 68 | """ 69 | Jieba cut 70 | :param text: input sentence 71 | :return: list 72 | """ 73 | return list(jieba.cut(text, cut_all=False, HMM=False)) 74 | 75 | 76 | class MMRSum: 77 | def __init__(self): 78 | self.stop_words = stop_words.values() 79 | self.algorithm = "mmr" 80 | 81 | def summarize(self, text, num=8, alpha=0.6): 82 | """ 83 | 84 | :param text: str 85 | :param num: int 86 | :return: list 87 | """ 88 | # 切句 89 | if type(text) == str: 90 | self.sentences = cut_sentence(text) 91 | elif type(text) == list: 92 | self.sentences = text 93 | else: 94 | raise RuntimeError("text type must be list or str") 95 | # 切词 96 | sentences_cut = [[word for word in jieba_cut(extract_chinese(sentence)) 97 | if word.strip()] for sentence in self.sentences] 98 | # 去除停用词等 99 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 100 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 101 | # # 计算每个句子的词语个数 102 | # sen_word_len = [len(sc)+1 for sc in sentences_cut] 103 | # 计算每个句子的tfidf 104 | sen_tfidf = tfidf_fit(self.sentences_cut) 105 | # 矩阵中两两句子相似度 106 | SimMatrix = (sen_tfidf * sen_tfidf.T).A # 例如: SimMatrix[1, 3] # "第2篇与第4篇的相似度" 107 | # 输入文本句子长度 108 | len_sen = len(self.sentences) 109 | # 句子标号 110 | sen_idx = [i for i in range(len_sen)] 111 | summary_set = [] 112 | mmr = {} 113 | for i in range(len_sen): 114 | if not self.sentences[i] in summary_set: 115 | sen_idx_pop = copy.deepcopy(sen_idx) 116 | sen_idx_pop.pop(i) 117 | # 两两句子相似度 118 | sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop] 119 | score_tfidf = sen_tfidf[i].toarray()[0].sum() # / sen_word_len[i], 如果除以词语个数就不准确 120 | mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j) 121 | summary_set.append(self.sentences[i]) 122 | score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)] 123 | return score_sen[0:num] 124 | 125 | 126 | if __name__ == "__main__": 127 | mmr_sum = MMRSum() 128 | doc = "PageRank算法简介。" \ 129 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 130 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 131 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 132 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 133 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 134 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 135 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 136 | "和投票目标的等级来决定新的等级。简单的说, " \ 137 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 138 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 139 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 140 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 141 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 142 | sum = mmr_sum.summarize(doc) 143 | for i in sum: 144 | print(i) 145 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_sentence_vec_by_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/12 13:16 4 | # @author :Mo 5 | # @function :chatbot based search, encode sentence_vec by bert 6 | 7 | def chatbot_sentence_vec_by_bert_own(): 8 | """bert encode is writted by my own""" 9 | from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector 10 | from conf.path_config import chicken_and_gossip_path 11 | from utils.text_tools import txtRead 12 | import numpy as np 13 | 14 | # 读取数据和一些参数,这里只取了100个标准问题 15 | topk = 5 16 | matrix_ques_save_path = "doc_vecs_chicken_and_gossip" 17 | questions = txtRead(chicken_and_gossip_path, encodeType='utf-8') 18 | ques = [ques.split('\t')[0] for ques in questions][0:100] 19 | 20 | # 生成标准问题的bert句向量 21 | bert_vector = KerasBertVector() 22 | ques_basic_vecs = bert_vector.bert_encode(ques) 23 | 24 | # 线上你可以生成,直接调用,然后直接load就好 25 | np.savetxt(matrix_ques_save_path, ques_basic_vecs) 26 | # matrix_ques = np.loadtxt(matrix_ques_save_path) 27 | 28 | query_bert_vec = bert_vector.bert_encode(["小姜机器人是什么"])[0] 29 | query_bert_vec = np.array(query_bert_vec) 30 | print(query_bert_vec) 31 | # 矩阵点乘,很快的,你也可以用annoy等工具,计算就更加快了 32 | qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) / np.linalg.norm(ques_basic_vecs, axis=1) 33 | topk_idx = np.argsort(qq_score)[::-1][:topk] 34 | for idx in topk_idx: 35 | print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx])) 36 | 37 | 38 | while True: 39 | print("你的问题:") 40 | query = input() 41 | query_bert_vec = bert_vector.bert_encode([query])[0] 42 | query_bert_vec = np.array(query_bert_vec) 43 | # 矩阵点乘,很快的,你也可以用annoy等工具,计算就更加快了 44 | qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) / np.linalg.norm(ques_basic_vecs, axis=1) 45 | topk_idx = np.argsort(qq_score)[::-1][:topk] 46 | for idx in topk_idx: 47 | print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx])) 48 | 49 | 50 | def chatbot_sentence_vec_by_bert_bertasserver(): 51 | """bert encode is used bert as server""" 52 | from conf.path_config import chicken_and_gossip_path 53 | from bert_serving.client import BertClient 54 | from utils.text_tools import txtRead 55 | import numpy as np 56 | 57 | topk = 5 58 | matrix_ques_save_path = "doc_vecs_chicken_and_gossip" 59 | questions = txtRead(chicken_and_gossip_path, encodeType='utf-8') 60 | ques = [ques.split('\t')[0] for ques in questions][0:100] 61 | 62 | bc = BertClient(ip = 'localhost') 63 | doc_vecs = bc.encode(ques) 64 | np.savetxt(matrix_ques_save_path, doc_vecs) 65 | # matrix_ques = np.loadtxt(matrix_ques_save_path) 66 | 67 | while True: 68 | query = input('你问: ') 69 | query_vec = bc.encode([query])[0] 70 | query_bert_vec = np.array(query_bert_vec) 71 | # compute normalized dot product as score 72 | score = np.sum(query_vec * 
doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1) 73 | topk_idx = np.argsort(score)[::-1][:topk] 74 | for idx in topk_idx: 75 | print('小姜机器人回答: %s\t%s' % (score[idx], questions[idx])) 76 | 77 | 78 | if __name__=="__main__": 79 | chatbot_sentence_vec_by_bert_own() 80 | # chatbot_sentence_vec_by_bert_bertasserver() 81 | 82 | 83 | # result 84 | # 小姜机器人是什么 85 | # Tokens: ['[CLS]', '小', '姜', '机', '器', '人', '是', '什', '么', '[SEP]'] 86 | # (1, 32, 768) 87 | # [CLS] [768] [1.0393640995025635, -0.31394684314727783, -0.08567211031913757, -0.12281288206577301, 88 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/README.md: -------------------------------------------------------------------------------- 1 | # 新增一个余弦相似度Cosine层, 用于BERT句向量编码部署tf-serving 2 | ## 业务需求 3 | - BERT向量召回问答对, FAQ标准问答对数据量不大 4 | - 不能把BERT编码部署于网络服务, 如http请求的形式, 因为网络传输耗时, 此外传输的数据量还很大768(维度)*32(float) 5 | - 几乎所有的模型服务只能用cpu, 硬盘、内存都还可以 6 | - 响应要求高, 小时延不能太高 7 | 8 | ## 代码逻辑 9 | - 首先将FAQ标准问答对生成句向量, bert-sentence-encode; 10 | - 将句向量当成一个 常量 插入网络, 网络架构新增 余弦相似度层(CosineLayer) 模块, 保存成tf-serving形式; 11 | - 选择小模型tinyBERT, ROBERTA-4-layer, ROBERTA-6-layer这些模型 12 | 13 | ## 解释说明 14 | - 代码说明: 15 | - TFServing_main.py 主代码, 调用 16 | - TFServing_postprocess.py tf-serving 后处理函数 17 | - TFServing_preprocess.py tf-serving 预处理函数 18 | - TFServing_save.py tf-serving 主调用函数 19 | - 主调用 20 | - 1. bertWhiteConf.py 超参数配置, 地址、bert-white、索引工具等的超参数 21 | - 2. bertWhiteTools.py 小工具, 主要是一些文档读写功能函数 22 | - 3. bertWhiteTrain.py 主模块, 类似bert预训练模型编码 23 | - 4. indexAnnoy.py annoy索引 24 | - 5. indexFaiss.py faiss索引 25 | - 6. mmr.py 最大边界相关法, 保证返回多样性 26 | 27 | ## 模型文件 28 | - bert_white文件 bertWhiteTrain.py生成的模块 29 | - chatbot_tfserving文件 包含相似度计算的tf-serving文件 30 | 31 | ## 调用示例 32 | - 配置问答语料文件(chicken_and_gossip.txt) 和 超参数(bertWhiteConf.py中的BERT_DIR) 33 | - 生成FAQ句向量: python3 bertWhiteTrain.py 34 | - 存储成pd文件(tf-serving使用): python3 TFServing_save.py 35 | - 部署docker服务(tf-serving): 例如 docker run -t --rm -p 8532:8501 -v "/TF-SERVING/chatbot_tf:/models/chatbot_tf" -e MODEL_NAME=chatbot_tf tensorflow/serving:latest 36 | - 调用tf-serving服务: python3 TFServing_tet_http.py 37 | 38 | ## 关键代码 39 | ```python3 40 | import keras.backend as K 41 | import tensorflow as tf 42 | import keras 43 | 44 | import numpy as np 45 | 46 | 47 | class CosineLayer(keras.layers.Layer): 48 | def __init__(self, docs_encode, **kwargs): 49 | """ 50 | 余弦相似度层, 不适合大规模语料, 比如100w以上的问答对 51 | :param docs_encode: np.array, bert-white vector of senence 52 | :param kwargs: 53 | """ 54 | self.docs_encode = docs_encode 55 | super(CosineLayer, self).__init__(**kwargs) 56 | self.docs_vector = K.constant(self.docs_encode, dtype="float32") 57 | self.l2_docs_vector = K.sqrt(K.sum(K.maximum(K.square(self.docs_vector), 1e-12), axis=-1)) # x_inv_norm 58 | 59 | def build(self, input_shape): 60 | super(CosineLayer, self).build(input_shape) 61 | 62 | def get_config(self): 63 | # 防止报错 'NoneType' object has no attribute '_inbound_nodes' 64 | config = {"docs_vector": self.docs_vector, 65 | "l2_docs_vector": self.l2_docs_vector} 66 | base_config = super(CosineLayer, self).get_config() 67 | return dict(list(base_config.items()) + list(config.items())) 68 | 69 | def call(self, input): 70 | # 计算余弦相似度 71 | # square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True) 72 | # x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon)) 73 | # return math_ops.multiply(x, x_inv_norm, name=name) 74 | # 多了一个 x/sqrt K.l2_normalize ===== output = x / sqrt(max(sum(x**2), 
epsilon)) 75 | 76 | l2_input = K.sqrt(K.sum(K.maximum(K.square(input), 1e-12), axis=-1)) # x_inv_norm 77 | fract_0 = K.sum(input * self.docs_vector, axis=-1) 78 | fract_1 = l2_input * self.l2_docs_vector 79 | cosine = fract_0 / fract_1 80 | y_pred_top_k, y_pred_ind_k = tf.nn.top_k(cosine, 10) 81 | return [y_pred_top_k, y_pred_ind_k] 82 | 83 | def compute_output_shape(self, input_shape): 84 | return [input_shape[0], input_shape[0]] 85 | 86 | ``` 87 | 88 | 89 | ## 再次说明 90 | - 该方案适合的标准FAQ问答对数量不能太多 91 | 92 | 93 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/TFServing_postprocess.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/4/15 21:59 4 | # @author : Mo 5 | # @function: postprocess of TFServing, 后处理 6 | 7 | from __future__ import print_function, division, absolute_import, division, print_function 8 | 9 | # 适配linux 10 | import sys 11 | import os 12 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "./.")) 13 | sys.path.append(path_root) 14 | from argparse import Namespace 15 | import json 16 | 17 | 18 | def load_json(path): 19 | """ 20 | 获取json,只取第一行 21 | :param path: str 22 | :return: json 23 | """ 24 | with open(path, 'r', encoding='utf-8') as fj: 25 | model_json = json.load(fj) 26 | return model_json 27 | 28 | 29 | # 字典 30 | from bertWhiteConf import bert_white_config 31 | config = Namespace(**bert_white_config) 32 | id2answer = load_json(os.path.join(config.save_dir, config.path_answers)) 33 | id2doc = load_json(os.path.join(config.save_dir,config.path_docs)) 34 | 35 | 36 | def postprocess(predictions): 37 | """ 后处理 """ 38 | predicts = predictions.get("predictions", {}) 39 | token_ids = [] 40 | for p in predicts: 41 | doc_id = str(p.get("doc_id", "")) 42 | score = p.get("score", "") 43 | answer = id2answer.get(doc_id, "") 44 | doc = id2doc.get(doc_id, "") 45 | token_ids.append({"score": round(score, 6), "doc": doc, "answer": answer, "doc_id": doc_id}) 46 | return {"instances": token_ids} 47 | 48 | 49 | if __name__ == '__main__': 50 | predictions = {"predictions": [ 51 | { 52 | "score": 0.922845, 53 | "doc_id": 86 54 | }, 55 | { 56 | "score": 0.922845, 57 | "doc_id": 104 58 | }, 59 | { 60 | "score": 0.891189814, 61 | "doc_id": 101 62 | } 63 | ]} 64 | 65 | 66 | res = postprocess(predictions) 67 | print(res) 68 | 69 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/TFServing_tet_http.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/9/17 21:28 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | from __future__ import print_function, division, absolute_import, division, print_function 9 | 10 | # 适配linux 11 | import sys 12 | import os 13 | path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "./.")) 14 | sys.path.append(path_root) 15 | from argparse import Namespace 16 | import requests 17 | import json 18 | 19 | 20 | from TFServing_preprocess import covert_text_to_id 21 | from TFServing_postprocess import postprocess 22 | 23 | 24 | def qa_tfserving(data_input, url): 25 | """ tf-serving 一整套流程 """ 26 | bert_input = covert_text_to_id(data_input) 27 | data = json.dumps(bert_input) 28 | r = requests.post(url, data) 29 | r_text_json = json.loads(r.text) 30 | r_post = postprocess(r_text_json) 31 | return r_post 
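# Data shapes along qa_tfserving(), as far as this folder shows them. covert_text_to_id
# comes from TFServing_preprocess.py (not reproduced here), so its exact output format is
# an assumption -- presumably a TF-Serving REST payload such as {"instances": [...]}:
#   covert_text_to_id({"data": [{"text": "..."}]})  -> JSON body for the model server
#   POST .../v1/models/chatbot_tf:predict           -> {"predictions": [{"score": ..., "doc_id": ...}, ...]}
#   postprocess(...)                                -> {"instances": [{"score", "doc", "answer", "doc_id"}, ...]}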
32 | 33 | 34 | if __name__ == '__main__': 35 | data_input = {"data": [{"text": "别逗小通了!可怜的"}]} 36 | url = "http://192.168.1.97:8532/v1/models/chatbot_tf:predict" 37 | res = qa_tfserving(data_input, url) 38 | print(res) 39 | 40 | 41 | import os, inspect 42 | current_path = inspect.getfile(inspect.currentframe()) 43 | path_root = "/".join(current_path.split("/")[:-1]) 44 | print(path_root) 45 | print(current_path) 46 | print(inspect.currentframe()) 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/__init__.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/5/13 21:21 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/bertWhiteConf.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/5/13 9:27 4 | # @author : Mo 5 | # @function: config of Bert-White 6 | 7 | 8 | import platform 9 | # 适配linux 10 | import sys 11 | import os 12 | # path_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../..")) 13 | path_root = os.path.abspath(os.path.dirname(__file__)) 14 | sys.path.append(path_root) 15 | print(path_root) 16 | 17 | 18 | if platform.system().lower() == 'windows': 19 | # BERT_DIR = "D:/soft_install/dataset/bert-model/chinese_L-12_H-768_A-12" 20 | # BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_roberta_L-4_H-312_A-12_K-104" 21 | # BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_roberta_L-6_H-384_A-12_K-128" 22 | BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_simbert_L-4_H-312_A-12" 23 | # BERT_DIR = "D:/soft_install/dataset/bert-model/zuiyi/chinese_simbert_L-6_H-384_A-12" 24 | else: 25 | BERT_DIR = "bert/chinese_L-12_H-768_A-12" 26 | ee = 0 27 | 28 | SAVE_DIR = path_root + "/bert_white" 29 | print(SAVE_DIR) 30 | if not os.path.exists(SAVE_DIR): 31 | os.makedirs(SAVE_DIR) 32 | 33 | 34 | bert_white_config = { 35 | # 预训练模型路径 36 | "bert_dir": BERT_DIR, 37 | "checkpoint_path": "bert_model.ckpt", # 预训练模型地址 38 | "config_path": "bert_config.json", 39 | "dict_path": "vocab.txt", 40 | # 预测需要的文件路径 41 | "save_dir": SAVE_DIR, 42 | "path_tfserving": "chatbot_tfserving/1", 43 | "path_docs_encode": "qa.docs.encode.npy", 44 | "path_answers": "qa.answers.json", 45 | "path_qa_idx": "qa.idx.json", 46 | "path_config": "config.json", 47 | "path_docs": "qa.docs.json", 48 | # 索引构建的存储文件, 如 annoy/faiss 49 | "path_index": "qa.docs.idx", 50 | # 初始语料路径 51 | "path_qa": "chicken_and_gossip.txt", # QA问答文件地址 52 | # 超参数 53 | "pre_tokenize": None, 54 | "pooling": "cls-1", # ["first-last-avg", "last-avg", "cls", "pooler", "cls-2", "cls-3", "cls-1"] 55 | "model": "bert", # bert4keras预训练模型类型 56 | "n_components": 768, # 降维到 n_components 57 | "n_cluster": 132, # annoy构建的簇类中心个数n_cluster, 越多效果越好, 计算量就越大 58 | "batch_size": 32, # 批尺寸 59 | "maxlen": 128, # 最大文本长度 60 | "ues_white": False, # 是否使用白化 61 | "use_annoy": False, # 是否使用annoy 62 | "use_faiss": False, # 是否使用faiss 63 | "verbose": True, # 是否显示编码过程日志-batch 64 | 65 | "kernel": None, # bert-white编码后的参数, 可降维 66 | "bias": None, # bert-white编码后的参数, 偏置bias 67 | "qa_idx": None # 问题question到答案answer的id对应关系 68 | } 69 | -------------------------------------------------------------------------------- 
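bert_white_config above is consumed by the other scripts in this folder (for example TFServing_postprocess.py) by wrapping the dict in an argparse.Namespace so that keys become attributes. A minimal usage sketch, run from inside chatbot_tfserving/:

```python3
# Reading the config the same way TFServing_postprocess.py does.
from argparse import Namespace
from bertWhiteConf import bert_white_config

config = Namespace(**bert_white_config)
print(config.pooling, config.maxlen)           # "cls-1", 128
print(config.save_dir, config.path_tfserving)  # "chatbot_tfserving/1" is presumably the TFServing_save.py export path
```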
/ChatBot/chatbot_search/chatbot_tfserving/bertWhiteTools.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/5/13 21:24 4 | # @author : Mo 5 | # @function: 6 | 7 | 8 | from typing import List, Dict, Union, Any 9 | import logging as logger 10 | import json 11 | 12 | 13 | def txt_read(path: str, encoding: str = "utf-8") -> List[str]: 14 | """ 15 | Read Line of list form file 16 | Args: 17 | path: path of save file, such as "txt" 18 | encoding: type of encoding, such as "utf-8", "gbk" 19 | Returns: 20 | dict of word2vec, eg. {"macadam":[...]} 21 | """ 22 | 23 | lines = [] 24 | try: 25 | file = open(path, "r", encoding=encoding) 26 | while True: 27 | line = file.readline().strip() 28 | if not line: 29 | break 30 | lines.append(line) 31 | file.close() 32 | except Exception as e: 33 | logger.info(str(e)) 34 | finally: 35 | return lines 36 | 37 | 38 | def txt_write(lines: List[str], path: str, model: str = "w", encoding: str = "utf-8"): 39 | """ 40 | Write Line of list to file 41 | Args: 42 | lines: lines of list which need save 43 | path: path of save file, such as "txt" 44 | model: type of write, such as "w", "a+" 45 | encoding: type of encoding, such as "utf-8", "gbk" 46 | """ 47 | try: 48 | file = open(path, model, encoding=encoding) 49 | file.writelines(lines) 50 | file.close() 51 | except Exception as e: 52 | logger.info(str(e)) 53 | 54 | 55 | def save_json(jsons, json_path, indent=4): 56 | """ 57 | 保存json, 58 | :param json_: json 59 | :param path: str 60 | :return: None 61 | """ 62 | with open(json_path, 'w', encoding='utf-8') as fj: 63 | fj.write(json.dumps(jsons, ensure_ascii=False, indent=indent)) 64 | fj.close() 65 | 66 | 67 | def load_json(path): 68 | """ 69 | 获取json,只取第一行 70 | :param path: str 71 | :return: json 72 | """ 73 | with open(path, 'r', encoding='utf-8') as fj: 74 | model_json = json.load(fj) 75 | return model_json 76 | 77 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/chicken_and_gossip.txt: -------------------------------------------------------------------------------- 1 | 呵呵 是王若猫的。 2 | 不是 那是什么? 3 | 怎么了 我很难过,安慰我~ 4 | 开心点哈,一切都会好起来 嗯,会的 5 | 我还喜欢她,怎么办 我帮你告诉她?发短信还是打电话? 6 | 短信 嗯嗯。我也相信 7 | 你知道谁么 肯定不是我,是阮德培 8 | 许兵是谁 吴院四班小帅哥 9 | 这么假 三鹿奶粉也假,不一样的卖啊 10 | 许兵是傻逼 被你发现了。 11 | 许兵是谁 是我善良可爱的主人的老公啊 12 | 许兵是谁 是穆森的老婆啊 13 | 许兵是谁 奇葩 14 | 许兵到底是谁 不要凶我~应该大概也许是叶祺吧~ 15 | 尼玛,许兵到底是谁 我。。。我不会告诉你我是小澜的!(^V^) 16 | 小黄鸭,你有女朋友么 老娘是女的…! 17 | 那你有男朋友么 没有呢,我只要主人一个人疼爱我嘛。。 18 | 那你在哪 我无聊 19 | 你妈是谁 我妈妈是世界上最温柔的主人你啊 20 | 去你大爷的 我去,滚你大爷的! 21 | 你在骂我一句 你果然够虚荣~ 22 | 你大爷的 RuoRuo弱爆了。尤其是她现在冻的更憨了 23 | 你是屌丝鸡 嗯嗯 24 | 高富帅 杨旸啊 25 | 你 ****** 26 | 呵呵 你妹 27 | 今天是谁的生日 是我大乔姐十七的生日! 28 | 你敢不敢 你敢我就敢,小样,敢跟我叫板 29 | 呵呵 呵呵你大爷 30 | 呵呵呵 敷衍我.. 31 | 你是女的了?怎么回事 你教我的 32 | 呵呵 呵你妹,你呵呵了我要怎么回答你。哈哈? 33 | 天王盖地虎 宝塔镇妖河。 34 | 小通 臣在 35 | 在监考,你在干么 在边打dota边想悦悦。 36 | 哼!你不想我我就不和你玩 那我就找别人玩去! 37 | 你要气死我吗?坏蛋 生气是拿别人的错误惩罚自己。想想在你生气的时候,谁在偷着乐呢?绝不能让丫得逞了! 38 | 恩,也是!那我不能生气啦!你生气就行 我错了!我再也不敢了,才怪 39 | 你谈过恋爱么 谈过,哎,别提了,伤心.. 40 | 什么让你这么伤心 如果我的无心之谈害你流泪,那我和你道歉,我爱你。 41 | 敢问你的性别 万受无疆=w= 42 | 小受 蒋六一是小受 43 | 是吗 还可以。 44 | 你干嘛 很无聊哎〜都不知道想干嘛!你在干嘛呢? 45 | 为什么 因为爱情喽 46 | 你有爱情了 不知啊 47 | 那同时有两个爱人 == 48 | 那你不行 我在减肥呢 49 | 你说 臣实不知。 50 | 你回错人了 不要相信男人!男人都是大骗子大混蛋! 51 | 在干嘛 在想你啊 52 | 想我干嘛 你是个傻叉!想你笑掉大牙,哈哈哈! 53 | 你会说英语吗 当然了,здравствите! 54 | 我会和女朋友结婚吗 会的,你俩虽然有矛盾争吵,但以后却会很幸福 55 | 你的设计人是谁 某苦逼的程序猿 56 | 嫁给我 我很能吃的! 57 | 我不在乎 他变态 58 | 你啊,那你娶我 我没有切糕 59 | 我看到你第一眼,就喜欢你了 哦?真的吗? 60 | 好难过 不要难过我请你吃虫子 61 | 你是公鸡吗 公鸡中的战斗机,噢耶! 
62 | 你是猪 我是鸡。 63 | 怎么证明 你问~我爱~你有~多深~我爱~你有~几~分~~~ 64 | 我怎么加你好友 放轻松,想开了,丰富自己就会忘了 65 | 你又智能了 今夜的你应该明了 66 | 你说的她是谁 陈舒雅 67 | 什么意思 她是在说他不懂女生的心思,但是又愣愣的,正直得很可爱,,,,这都不知道,你还真是傻啊~~~ 68 | 陈舒雅是哪位 一个很凶的女人 69 | 你才是 我不是啦。你才是 70 | 陈舒雅 大懒虫 71 | 大懒鸡 你不爱我 72 | 你说的那个她是谁 傻瓜现在在你身边看着的人就是你的她你要对她好宠她爱她知道了吗? 73 | 她是谁 以后是你老婆 74 | 陈舒雅是谁 90后网络歌手啊,你不知道么? 75 | 鸡鸡 主人 76 | 呵呵呵 呵你妹啊 77 | 侬都不困么 == 78 | 变身小妹 =。= 79 | 家属 =。= 80 | 吐槽伤人,排泄臭人 =。= 81 | 公的女的 女的啊! 82 | 去洗澡 带上我~一起一起~ 83 | 。。。 =。= 84 | 吼吼 吼 85 | 你有男朋友没 你觉得呢 86 | 赵智是谁 是主人最讨厌的男人! 87 | 你妹 刚给你做完SPA你就骂轮家,轮家桑心了,呜呜呜 88 | 谁叫你就 ujmmmmmmmmmmmm 89 | 什么意思 你猜啊,猜对了就告诉你! 90 | 猜你妹 你大爷! 91 | 好就猜你大爷 你找徐磊? 92 | 他是大爷 ******…我大爷早死了…你找他有事?信不信我让他把你带走!^V^ 93 | 你大爷不认识我!要不先带你去 =。= 94 | 还给你 人家本来就嫩!你欺负人家 95 | 就欺负你要怎么!就喜欢欺负你 == 96 | 呵呵 呵你妹 97 | 刚给你做完spa你就骂小通!小通伤心了 =。= 98 | 呵呵 别傻笑了,亲耐滴主人 99 | 别逗小通了!可怜的 =。= 100 | 人太多了!找不到你给我回的了 =。= 101 | 发 给 102 | 我 你什么 103 | 重新发 == 104 | 重新发!我知道你在 == 105 | 你妹 人家错了 106 | 才知道啊!晚了!叫你重发 =。= 107 | 〜 =。= 108 | =*= =。= 109 | @@ == 110 | ¥ =。= 111 | @ =。= 112 | / =。= 113 | 。 == 114 | 继续 没有继续了 115 | 必须 大不了一拍两散! 116 | 就要刷你 == 117 | l 喵? 118 | f 甜菜少年。 119 | x 〒_〒 120 | m g 121 | t !!!!!!! 122 | s 傻子 123 | h 主人,有什么吩咐,小通在此! 124 | n u 125 | p xp 126 | 太不真心了 我说的是哈维 127 | 管你什么哈维!方正就是看你不爽 == 128 | 看你不爽 不要呀,哪不好我改,一定改!不要炖了我呀! 129 | z zz 130 | j 正晌午时说话,谁也没有家! 131 | m r 132 | b b 133 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/indexAnnoy.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/4/18 21:04 4 | # @author : Mo 5 | # @function: annoy search 6 | 7 | 8 | from annoy import AnnoyIndex 9 | import numpy as np 10 | import os 11 | 12 | 13 | class AnnoySearch: 14 | def __init__(self, dim=768, n_cluster=100): 15 | # metric可选“angular”(余弦距离)、“euclidean”(欧几里得距离)、 “ manhattan”(曼哈顿距离)或“hamming”(海明距离) 16 | self.annoy_index = AnnoyIndex(dim, metric="angular") 17 | self.n_cluster = n_cluster 18 | self.dim = dim 19 | 20 | def k_neighbors(self, vectors, k=18): 21 | """ 搜索 """ 22 | annoy_tops = [] 23 | for v in vectors: 24 | idx, dist = self.annoy_index.get_nns_by_vector(v, k, search_k=32*k, include_distances=True) 25 | annoy_tops.append([dist, idx]) 26 | return annoy_tops 27 | 28 | def fit(self, vectors): 29 | """ annoy构建 """ 30 | for i, v in enumerate(vectors): 31 | self.annoy_index.add_item(i, v) 32 | self.annoy_index.build(self.n_cluster) 33 | 34 | def save(self, path): 35 | """ 存储 """ 36 | self.annoy_index.save(path) 37 | 38 | def load(self, path): 39 | """ 加载 """ 40 | self.annoy_index.load(path) 41 | 42 | 43 | if __name__ == '__main__': 44 | ### 索引 45 | import random 46 | path = "model.ann" 47 | dim = 768 48 | vectors = [[random.gauss(0, 1) for z in range(768)] for i in range(10)] 49 | an_model = AnnoySearch(dim, n_cluster=32) # Length of item vector that will be indexed 50 | an_model.fit(vectors) 51 | an_model.save(path) 52 | tops = an_model.k_neighbors([vectors[0]], 18) 53 | print(tops) 54 | 55 | del an_model 56 | 57 | ### 下载, 搜索 58 | an_model = AnnoySearch(dim, n_cluster=32) 59 | an_model.load(path) 60 | tops = an_model.k_neighbors([vectors[0]], 6) 61 | print(tops) 62 | 63 | 64 | 65 | """ 66 | # example 67 | from annoy import AnnoyIndex 68 | import random 69 | 70 | dim = 768 71 | vectors = [[random.gauss(0, 1) for z in range(768)] for i in range(10)] 72 | ann_model = AnnoyIndex(dim, 'angular') # Length of item vector that will be indexed 73 | for i,v in enumerate(vectors): 74 | ann_model.add_item(i, v) 75 | 
ann_model.build(10) # 10 trees 76 | ann_model.save("tet.ann") 77 | del ann_model 78 | 79 | u = AnnoyIndex(dim, "angular") 80 | u.load('tet.ann') # super fast, will just mmap the file 81 | v = vectors[1] 82 | idx, dist = u.get_nns_by_vector(v, 10, search_k=50 * 10, include_distances=True) 83 | print([idx, dist]) 84 | """ 85 | 86 | 87 | 88 | ### 备注说明: annoy索引 无法 增删会改查 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/indexFaiss.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | # @time : 2021/5/9 16:02 4 | # @author : Mo 5 | # @function: search of faiss 6 | 7 | 8 | from faiss import normalize_L2 9 | import numpy as np 10 | import faiss 11 | import os 12 | 13 | 14 | class FaissSearch: 15 | def __init__(self, dim=768, n_cluster=100): 16 | self.n_cluster = n_cluster # 聚类中心 17 | self.dim = dim 18 | quantizer = faiss.IndexFlatIP(self.dim) 19 | # METRIC_INNER_PRODUCT:余弦; L2: faiss.METRIC_L2 20 | self.faiss_index = faiss.IndexIVFFlat(quantizer, self.dim, self.n_cluster, faiss.METRIC_INNER_PRODUCT) 21 | # self.faiss_index = faiss.IndexFlatIP(self.dim) # 索引速度更快 但是不可增量 22 | 23 | def k_neighbors(self, vectors, k=6): 24 | """ 搜索 """ 25 | normalize_L2(vectors) 26 | dist, index = self.faiss_index.search(vectors, k) # sanity check 27 | return dist.tolist(), index.tolist() 28 | 29 | def fit(self, vectors): 30 | """ annoy构建 """ 31 | normalize_L2(vectors) 32 | self.faiss_index.train(vectors) 33 | # self.faiss_index.add(vectors) 34 | self.faiss_index.add_with_ids(vectors, np.arange(0, len(vectors))) 35 | 36 | def remove(self, ids): 37 | self.faiss_index.remove_ids(np.array(ids)) 38 | 39 | def save(self, path): 40 | """ 存储 """ 41 | faiss.write_index(self.faiss_index, path) 42 | 43 | def load(self, path): 44 | """ 加载 """ 45 | self.faiss_index = faiss.read_index(path) 46 | 47 | 48 | if __name__ == '__main__': 49 | 50 | import random 51 | 52 | path = "model.fai" 53 | dim = 768 54 | vectors = np.array([[random.gauss(0, 1) for z in range(768)] for i in range(32)], dtype=np.float32) 55 | fai_model = FaissSearch(dim, n_cluster=32) # Length of item vector that will be indexed 56 | fai_model.fit(vectors) 57 | fai_model.save(path) 58 | tops = fai_model.k_neighbors(vectors[:32], 32) 59 | print(tops) 60 | ids = np.arange(10, 32) 61 | fai_model.remove(ids) 62 | tops = fai_model.k_neighbors(vectors[:32], 32) 63 | print(tops) 64 | print(len(tops)) 65 | 66 | del fai_model 67 | 68 | fai_model = FaissSearch(dim, n_cluster=32) 69 | fai_model.load(path) 70 | tops = fai_model.k_neighbors(vectors[:32], 32) 71 | print(tops) 72 | 73 | 74 | 75 | """ 76 | import numpy as np 77 | d = 64 # dimension 78 | nb = 100000 # database size 79 | nq = 10000 # nb of queries 80 | np.random.seed(1234) # make reproducible 81 | xb = np.random.random((nb, d)).astype('float32') 82 | xb[:, 0] += np.arange(nb) / 1000. 83 | xq = np.random.random((nq, d)).astype('float32') 84 | xq[:, 0] += np.arange(nq) / 1000. 
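# (added note, not in the original snippet) METRIC_INNER_PRODUCT only behaves like
# cosine similarity if the vectors are L2-normalized first, e.g. faiss.normalize_L2(xb)
# before train/add and faiss.normalize_L2(xq) before search; the FaissSearch class
# above calls normalize_L2 in fit() and k_neighbors() for exactly this reason.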
85 | 86 | import faiss # make faiss available 87 | # # 量化器索引 88 | # nlist = 1000 # 聚类中心的个数 89 | # k = 50 # 邻居个数 90 | # quantizer = faiss.IndexFlatIP(d) # the other index,需要以其他index作为基础 91 | # index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT) # METRIC_INNER_PRODUCT:余弦; L2: faiss.METRIC_L2 92 | 93 | ntree = 132 # 聚类中心的个数 94 | quantizer = faiss.IndexFlatIP(d) 95 | index = faiss.IndexIVFFlat(quantizer, d, ntree, faiss.METRIC_INNER_PRODUCT) 96 | # index = faiss.IndexFlatL2(d) # build the index 97 | print(index.is_trained) 98 | index.add(xb) # add vectors to the index 99 | print(index.ntotal) 100 | 101 | k = 4 # we want to see 4 nearest neighbors 102 | D, I = index.search(xb[:5], k) # sanity check 103 | print(I) 104 | print(D) 105 | D, I = index.search(xq, k) # actual search 106 | print(I[:5]) # neighbors of the 5 first queries 107 | print(I[-5:]) # neighbors of the 5 last queries 108 | """ 109 | 110 | -------------------------------------------------------------------------------- /ChatBot/chatbot_search/chatbot_tfserving/mmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/10/28 10:16 4 | # @author :Mo 5 | # @function :MMR, Maximal Marginal Relevance, 最大边界相关法或者最大边缘相关 6 | 7 | 8 | from sklearn.feature_extraction.text import TfidfVectorizer 9 | import logging 10 | import jieba 11 | import copy 12 | import json 13 | import re 14 | import os 15 | 16 | 17 | jieba.setLogLevel(logging.INFO) 18 | 19 | 20 | stop_words = {"0": "~~~~", 21 | "1": "...................", 22 | "2": "......",} 23 | 24 | 25 | def cut_sentence(sentence): 26 | """ 27 | 分句 28 | :param sentence:str 29 | :return:list 30 | """ 31 | re_sen = re.compile("[:;!?。:;?!\n\r]") #.不加是因为不确定.是小数还是英文句号(中文省略号......) 32 | sentences = re_sen.split(sentence) 33 | sen_cuts = [] 34 | for sen in sentences: 35 | if sen and str(sen).strip(): 36 | sen_cuts.append(sen) 37 | return sen_cuts 38 | 39 | def extract_chinese(text): 40 | """ 41 | 只提取出中文、字母和数字 42 | :param text: str, input of sentence 43 | :return: 44 | """ 45 | chinese_exttract = "".join(re.findall(u"([\u4e00-\u9fa5A-Za-z0-9@. 
])", text)) 46 | return chinese_exttract 47 | 48 | def tfidf_fit(sentences): 49 | """ 50 | tfidf相似度 51 | :param sentences: 52 | :return: 53 | """ 54 | # tfidf计算 55 | model = TfidfVectorizer(ngram_range=(1, 2), # 3,5 56 | stop_words=[" ", "\t", "\n"], # 停用词 57 | max_features=10000, 58 | token_pattern=r"(?u)\b\w+\b", # 过滤停用词 59 | min_df=1, 60 | max_df=0.9, 61 | use_idf=1, # 光滑 62 | smooth_idf=1, # 光滑 63 | sublinear_tf=1, ) # 光滑 64 | matrix = model.fit_transform(sentences) 65 | return matrix 66 | 67 | def jieba_cut(text): 68 | """ 69 | Jieba cut 70 | :param text: input sentence 71 | :return: list 72 | """ 73 | return list(jieba.cut(text, cut_all=False, HMM=False)) 74 | 75 | 76 | class MMRSum: 77 | def __init__(self): 78 | self.stop_words = stop_words.values() 79 | self.algorithm = "mmr" 80 | 81 | def summarize(self, text, num=8, alpha=0.6): 82 | """ 83 | 84 | :param text: str 85 | :param num: int 86 | :return: list 87 | """ 88 | # 切句 89 | if type(text) == str: 90 | self.sentences = cut_sentence(text) 91 | elif type(text) == list: 92 | self.sentences = text 93 | else: 94 | raise RuntimeError("text type must be list or str") 95 | # 切词 96 | sentences_cut = [[word for word in jieba_cut(extract_chinese(sentence)) 97 | if word.strip()] for sentence in self.sentences] 98 | # 去除停用词等 99 | self.sentences_cut = [list(filter(lambda x: x not in self.stop_words, sc)) for sc in sentences_cut] 100 | self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut] 101 | # # 计算每个句子的词语个数 102 | # sen_word_len = [len(sc)+1 for sc in sentences_cut] 103 | # 计算每个句子的tfidf 104 | sen_tfidf = tfidf_fit(self.sentences_cut) 105 | # 矩阵中两两句子相似度 106 | SimMatrix = (sen_tfidf * sen_tfidf.T).A # 例如: SimMatrix[1, 3] # "第2篇与第4篇的相似度" 107 | # 输入文本句子长度 108 | len_sen = len(self.sentences) 109 | # 句子标号 110 | sen_idx = [i for i in range(len_sen)] 111 | summary_set = [] 112 | mmr = {} 113 | for i in range(len_sen): 114 | if not self.sentences[i] in summary_set: 115 | sen_idx_pop = copy.deepcopy(sen_idx) 116 | sen_idx_pop.pop(i) 117 | # 两两句子相似度 118 | sim_i_j = [SimMatrix[i, j] for j in sen_idx_pop] 119 | score_tfidf = sen_tfidf[i].toarray()[0].sum() # / sen_word_len[i], 如果除以词语个数就不准确 120 | mmr[self.sentences[i]] = alpha * score_tfidf - (1 - alpha) * max(sim_i_j) 121 | summary_set.append(self.sentences[i]) 122 | score_sen = [(rc[1], rc[0]) for rc in sorted(mmr.items(), key=lambda d: d[1], reverse=True)] 123 | return score_sen[0:num] 124 | 125 | 126 | if __name__ == "__main__": 127 | mmr_sum = MMRSum() 128 | doc = "PageRank算法简介。" \ 129 | "是上世纪90年代末提出的一种计算网页权重的算法! 
" \ 130 | "当时,互联网技术突飞猛进,各种网页网站爆炸式增长。 " \ 131 | "业界急需一种相对比较准确的网页重要性计算方法。 " \ 132 | "是人们能够从海量互联网世界中找出自己需要的信息。 " \ 133 | "百度百科如是介绍他的思想:PageRank通过网络浩瀚的超链接关系来确定一个页面的等级。 " \ 134 | "Google把从A页面到B页面的链接解释为A页面给B页面投票。 " \ 135 | "Google根据投票来源甚至来源的来源,即链接到A页面的页面。 " \ 136 | "和投票目标的等级来决定新的等级。简单的说, " \ 137 | "一个高等级的页面可以使其他低等级页面的等级提升。 " \ 138 | "具体说来就是,PageRank有两个基本思想,也可以说是假设。 " \ 139 | "即数量假设:一个网页被越多的其他页面链接,就越重)。 " \ 140 | "质量假设:一个网页越是被高质量的网页链接,就越重要。 " \ 141 | "总的来说就是一句话,从全局角度考虑,获取重要的信。 " 142 | sum = mmr_sum.summarize(doc) 143 | for i in sum: 144 | print(i) 145 | 146 | 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /ClassificationText/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/10 16:33 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ClassificationText/bert/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/10 16:33 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /ClassificationText/bert/args.py: -------------------------------------------------------------------------------- 1 | # bi-lstm 2 | return_sequences = True 3 | use_cudnn_cell = True 4 | use_lstm = True 5 | 6 | loss = 'categorical_crossentropy' 7 | metrics = ['accuracy'] 8 | activation = 'softmax' 9 | optimizers = 'adam' 10 | learning_rate = 1e-3 11 | embedding_dim = 768 12 | keep_prob = 0.5 13 | units = 256 14 | decay = 0.0 15 | label = 2 16 | l2 = 0.32 17 | 18 | epochs = 100 19 | batch_size = 2 20 | path_save_model = 'model_webank_tdt/bert_avt_cnn.h5' # 'bert_bi_lstm_pair.h5' 21 | 22 | # text-cnn 23 | filters = [3, 4, 5] 24 | num_filters = 300 25 | 26 | 27 | 28 | # gpu使用率 29 | gpu_memory_fraction = 0.3 30 | 31 | # 默认取倒数第二层的输出值作为句向量 32 | layer_indexes = [-1] 33 | 34 | # 序列的最大程度,单文本建议把该值调小 35 | max_seq_len = 98 36 | -------------------------------------------------------------------------------- /ClassificationText/bert/keras_bert_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/8 20:04 4 | # @author :Mo 5 | # @function :embedding of bert keras 6 | 7 | from ClassificationText.bert.args import gpu_memory_fraction, max_seq_len, layer_indexes 8 | from conf.feature_config import config_name, ckpt_name, vocab_file 9 | from FeatureProject.bert.layers_keras import NonMaskingLayer 10 | from keras_bert import load_trained_model_from_checkpoint 11 | import keras.backend.tensorflow_backend as ktf_keras 12 | import keras.backend as k_keras 13 | from keras.models import Model 14 | from keras.layers import Add 15 | import tensorflow as tf 16 | import os 17 | 18 | import logging as logger 19 | # 全局使用,使其可以django、flask、tornado等调用 20 | graph = None 21 | model = None 22 | 23 | # gpu配置与使用率设置 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 25 | config = tf.ConfigProto() 26 | config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction 27 | sess = tf.Session(config=config) 28 | ktf_keras.set_session(sess) 29 | 30 | class KerasBertEmbedding(): 31 | def __init__(self): 32 | self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len 33 | 34 | def 
bert_encode(self): 35 | # 全局使用,使其可以django、flask、tornado等调用 36 | global graph 37 | graph = tf.get_default_graph() 38 | global model 39 | model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path, 40 | seq_len=self.max_seq_len) 41 | print(model.output) 42 | print(len(model.layers)) 43 | # lay = model.layers 44 | #一共104个layer,其中前八层包括token,pos,embed等, 45 | # 每8层(MultiHeadAttention,Dropout,Add,LayerNormalization) 46 | # 一共12层 47 | layer_dict = [7] 48 | layer_0 = 7 49 | for i in range(12): 50 | layer_0 = layer_0 + 8 51 | layer_dict.append(layer_0) 52 | # 输出它本身 53 | if len(layer_indexes) == 0: 54 | encoder_layer = model.output 55 | # 分类如果只有一层,就只取最后那一层的weight;取得不正确,就默认取最后一层 56 | elif len(layer_indexes) == 1: 57 | if layer_indexes[0] in [i+1 for i in range(12)]: 58 | encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0]-1]).output 59 | else: 60 | encoder_layer = model.get_layer(index=layer_dict[-1]).output 61 | # 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数 62 | else: 63 | # layer_indexes must be [1,2,3,......12] 64 | # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes] 65 | all_layers = [model.get_layer(index=layer_dict[lay-1]).output if lay in [i+1 for i in range(12)] 66 | else model.get_layer(index=layer_dict[-1]).output #如果给出不正确,就默认输出最后一层 67 | for lay in layer_indexes] 68 | print(layer_indexes) 69 | print(all_layers) 70 | all_layers_select = [] 71 | for all_layers_one in all_layers: 72 | all_layers_select.append(all_layers_one) 73 | encoder_layer = Add()(all_layers_select) 74 | print(encoder_layer.shape) 75 | print("KerasBertEmbedding:") 76 | print(encoder_layer.shape) 77 | output_layer = NonMaskingLayer()(encoder_layer) 78 | model = Model(model.inputs, output_layer) 79 | # model.summary(120) 80 | return model.inputs, model.output 81 | 82 | 83 | if __name__ == "__main__": 84 | bert_vector = KerasBertEmbedding() 85 | pooled = bert_vector.bert_encode() 86 | 87 | -------------------------------------------------------------------------------- /ClassificationText/bert/keras_bert_layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/10 10:49 4 | # @author :Mo 5 | # @function : 1. create model of keras-bert for get [-2] layers 6 | # 2. create model of AttentionWeightedAverage for get avg attention pooling 7 | 8 | from keras.engine import InputSpec 9 | import keras.backend as k_keras 10 | from keras.engine import Layer 11 | from keras import initializers 12 | 13 | 14 | class NonMaskingLayer(Layer): 15 | """ 16 | fix convolutional 1D can't receive masked input, detail: https://github.com/keras-team/keras/issues/4978 17 | thanks for https://github.com/jacoxu 18 | """ 19 | 20 | def __init__(self, **kwargs): 21 | self.supports_masking = True 22 | super(NonMaskingLayer, self).__init__(**kwargs) 23 | 24 | def build(self, input_shape): 25 | pass 26 | 27 | def compute_mask(self, input, input_mask=None): 28 | # do not pass the mask to the next layers 29 | return None 30 | 31 | def call(self, x, mask=None): 32 | return x 33 | 34 | def compute_output_shape(self, input_shape): 35 | return input_shape 36 | 37 | 38 | class AttentionWeightedAverage(Layer): 39 | ''' 40 | codes from: https://github.com/BrikerMan/Kashgari 41 | detail: https://github.com/BrikerMan/Kashgari/blob/master/kashgari/tasks/classification/models.py 42 | Computes a weighted average of the different channels across timesteps. 
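    (Added note) Concretely, call() computes logits = dot(x, W) for every timestep,
    applies a numerically stable, mask-aware softmax over the time axis, and returns
    the attention-weighted sum of the inputs (plus the weights if return_attention).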
43 | Uses 1 parameter pr. channel to compute the attention value for a single timestep. 44 | ''' 45 | 46 | def __init__(self, return_attention=False, **kwargs): 47 | self.init = initializers.get('uniform') 48 | self.supports_masking = True 49 | self.return_attention = return_attention 50 | super(AttentionWeightedAverage, self).__init__(**kwargs) 51 | 52 | def build(self, input_shape): 53 | self.input_spec = [InputSpec(ndim=3)] 54 | assert len(input_shape) == 3 55 | 56 | self.W = self.add_weight(shape=(input_shape[2], 1), 57 | name='{}_w'.format(self.name), 58 | initializer=self.init) 59 | self.trainable_weights = [self.W] 60 | super(AttentionWeightedAverage, self).build(input_shape) 61 | 62 | def call(self, x, mask=None): 63 | # computes a probability distribution over the timesteps 64 | # uses 'max trick' for numerical stability 65 | # reshape is done to avoid issue with Tensorflow 66 | # and 1-dimensional weights 67 | logits = k_keras.dot(x, self.W) 68 | x_shape = k_keras.shape(x) 69 | logits = k_keras.reshape(logits, (x_shape[0], x_shape[1])) 70 | ai = k_keras.exp(logits - k_keras.max(logits, axis=-1, keepdims=True)) 71 | 72 | # masked timesteps have zero weight 73 | if mask is not None: 74 | mask = k_keras.cast(mask, k_keras.floatx()) 75 | ai = ai * mask 76 | att_weights = ai / (k_keras.sum(ai, axis=1, keepdims=True) + k_keras.epsilon()) 77 | weighted_input = x * k_keras.expand_dims(att_weights) 78 | result = k_keras.sum(weighted_input, axis=1) 79 | if self.return_attention: 80 | return [result, att_weights] 81 | return result 82 | 83 | def get_output_shape_for(self, input_shape): 84 | return self.compute_output_shape(input_shape) 85 | 86 | def compute_output_shape(self, input_shape): 87 | output_len = input_shape[2] 88 | if self.return_attention: 89 | return [(input_shape[0], output_len), (input_shape[0], input_shape[1])] 90 | return (input_shape[0], output_len) 91 | 92 | def compute_mask(self, input, input_mask=None): 93 | if isinstance(input_mask, list): 94 | return [None] * len(input_mask) 95 | else: 96 | return None 97 | -------------------------------------------------------------------------------- /ClassificationText/bert/model_webank_tdt/useless.txt: -------------------------------------------------------------------------------- 1 | uesless 2 | -------------------------------------------------------------------------------- /ClassificationText/bert/readme.md: -------------------------------------------------------------------------------- 1 | # run 2 | * 0.数据采用的是webank比赛数据,项目中只有部分几条, 3 | * 1.谷歌预训练好的模型chinese_L-12_H-768_A-12需要存到Data/chinese_L-12_H-768_A-12目录下 4 | * 需要的前往需要的前往链接: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q 提取码: rket 5 | * 找到webank.rar压缩包,下载覆盖工程目录Data/corpus/webnk文件夹就好;chinese_L-12_H-768_A-12覆盖Data/chinese_L-12_H-768_A-12就好 6 | * 1.训练 7 | * python keras_bert_classify_bi_lstm.py 或者 python keras_bert_classify_text_cnn.py 8 | * 2.测试(__main__下面注释掉train()和predict(), 改为tet()就好,predict()同理) 9 | * python keras_bert_classify_bi_lstm.py 10 | * 3.预测(__main__下面注释掉train()和tet(),放开predict()) 11 | * python keras_bert_classify_bi_lstm.py 12 | -------------------------------------------------------------------------------- /Data/chinese_L-12_H-768_A-12/useless.txt: -------------------------------------------------------------------------------- 1 | useless.txt 2 | -------------------------------------------------------------------------------- /Data/chinese_vector/w2v_model_merge_short.vec: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/Data/chinese_vector/w2v_model_merge_short.vec -------------------------------------------------------------------------------- /Data/chinese_xlnet_mid_L-24_H-768_A-12/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/28 1:49 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /Data/corpus/ner/people_daily/people.dev: -------------------------------------------------------------------------------- 1 | 在 O 2 | 这 O 3 | 里 O 4 | 恕 O 5 | 弟 O 6 | 不 O 7 | 恭 O 8 | 之 O 9 | 罪 O 10 | , O 11 | 敢 O 12 | 在 O 13 | 尊 O 14 | 前 O 15 | 一 O 16 | 诤 O 17 | : O 18 | 前 O 19 | 人 O 20 | 论 O 21 | 书 O 22 | , O 23 | 每 O 24 | 曰 O 25 | “ O 26 | 字 O 27 | 字 O 28 | 有 O 29 | 来 O 30 | 历 O 31 | , O 32 | 笔 O 33 | 笔 O 34 | 有 O 35 | 出 O 36 | 处 O 37 | ” O 38 | , O 39 | 细 O 40 | 读 O 41 | 公 O 42 | 字 O 43 | , O 44 | 何 O 45 | 尝 O 46 | 跳 O 47 | 出 O 48 | 前 O 49 | 人 O 50 | 藩 O 51 | 篱 O 52 | , O 53 | 自 O 54 | 隶 O 55 | 变 O 56 | 而 O 57 | 后 O 58 | , O 59 | 直 O 60 | 至 O 61 | 明 O 62 | 季 O 63 | , O 64 | 兄 O 65 | 有 O 66 | 何 O 67 | 新 O 68 | 出 O 69 | ? O 70 | 71 | 相 O 72 | 比 O 73 | 之 O 74 | 下 O 75 | , O 76 | 青 B-ORG 77 | 岛 I-ORG 78 | 海 I-ORG 79 | 牛 I-ORG 80 | 队 I-ORG 81 | 和 O 82 | 广 B-ORG 83 | 州 I-ORG 84 | 松 I-ORG 85 | 日 I-ORG 86 | 队 I-ORG 87 | 的 O 88 | 雨 O 89 | 中 O 90 | 之 O 91 | 战 O 92 | 虽 O 93 | 然 O 94 | 也 O 95 | 是 O 96 | 0 O 97 | ∶ O 98 | 0 O 99 | , O 100 | 但 O 101 | 乏 O 102 | 善 O 103 | 可 O 104 | 陈 O 105 | 。 O 106 | 107 | 理 O 108 | 由 O 109 | 多 O 110 | 多 O 111 | , O 112 | 最 O 113 | 无 O 114 | 奈 O 115 | 的 O 116 | 却 O 117 | 是 O 118 | : O 119 | 5 O 120 | 月 O 121 | 恰 O 122 | 逢 O 123 | 双 O 124 | 重 O 125 | 考 O 126 | 试 O 127 | , O 128 | 她 O 129 | 攻 O 130 | 读 O 131 | 的 O 132 | 博 O 133 | 士 O 134 | 学 O 135 | 位 O 136 | 论 O 137 | 文 O 138 | 要 O 139 | 通 O 140 | 考 O 141 | ; O 142 | 她 O 143 | 任 O 144 | 教 O 145 | 的 O 146 | 两 O 147 | 所 O 148 | 学 O 149 | 校 O 150 | , O 151 | 也 O 152 | 要 O 153 | 在 O 154 | 这 O 155 | 段 O 156 | 时 O 157 | 日 O 158 | 大 O 159 | 考 O 160 | 。 O 161 | 162 | 分 O 163 | 工 O 164 | , O 165 | 各 O 166 | 有 O 167 | 各 O 168 | 的 O 169 | 责 O 170 | 任 O 171 | ; O 172 | 合 O 173 | 作 O 174 | , O 175 | 正 O 176 | 副 O 177 | 经 O 178 | 理 O 179 | 之 O 180 | 间 O 181 | , O 182 | 全 O 183 | 厂 O 184 | 的 O 185 | 事 O 186 | , O 187 | 不 O 188 | 管 O 189 | 由 O 190 | 谁 O 191 | 分 O 192 | 管 O 193 | , O 194 | 也 O 195 | 不 O 196 | 管 O 197 | 你 O 198 | 有 O 199 | 什 O 200 | 么 O 201 | 事 O 202 | 找 O 203 | 到 O 204 | 谁 O 205 | , O 206 | 绝 O 207 | 不 O 208 | 会 O 209 | 把 O 210 | 你 O 211 | 推 O 212 | 给 O 213 | 第 O 214 | 二 O 215 | 个 O 216 | 人 O 217 | 。 O 218 | 219 | 胡 B-PER 220 | 老 O 221 | 说 O 222 | , O 223 | 当 O 224 | 画 O 225 | 画 O 226 | 疲 O 227 | 倦 O 228 | 时 O 229 | 就 O 230 | 到 O 231 | 院 O 232 | 里 O 233 | 去 O 234 | 看 O 235 | 看 O 236 | , O 237 | 给 O 238 | 这 O 239 | 盆 O 240 | 花 O 241 | 浇 O 242 | 点 O 243 | 水 O 244 | , O 245 | 给 O 246 | 那 O 247 | 棵 O 248 | 花 O 249 | 剪 O 250 | 剪 O 251 | 枝 O 252 | , O 253 | 回 O 254 | 来 O 255 | 再 O 256 | 接 O 257 | 着 O 258 | 画 O 259 | , O 260 | 画 O 261 | 累 O 262 | 了 O 263 | 再 O 264 | 出 O 265 | 去 O 266 | , O 267 | 如 O 268 | 此 O 269 | 循 O 270 | 环 O 271 | 往 O 272 | 复 O 273 | , O 274 | 脑 O 275 | 体 O 276 | 结 O 277 | 合 O 278 | , O 279 | 有 O 280 | 益 O 281 | 健 O 282 | 康 O 283 | , O 284 | 胜 O 285 | 过 O 286 | 吃 O 287 | 药 O 288 | 。 O 289 | 290 | 当 O 291 | 前 O 292 | 国 O 293 | 有 O 294 | 大 O 295 | 中 O 296 | 型 O 
297 | 企 O 298 | 业 O 299 | 改 O 300 | 制 O 301 | 中 O 302 | 存 O 303 | 在 O 304 | 的 O 305 | 问 O 306 | 题 O 307 | 。 O 308 | 309 | 试 O 310 | 验 O 311 | 证 O 312 | 明 O 313 | , O 314 | 在 O 315 | 吸 O 316 | 无 O 317 | 过 O 318 | 滤 O 319 | 嘴 O 320 | 的 O 321 | 香 O 322 | 烟 O 323 | 时 O 324 | , O 325 | 香 O 326 | 烟 O 327 | 燃 O 328 | 烧 O 329 | 过 O 330 | 程 O 331 | 中 O 332 | 产 O 333 | 生 O 334 | 的 O 335 | 尼 O 336 | 古 O 337 | 丁 O 338 | 1 O 339 | 4 O 340 | % O 341 | 至 O 342 | 2 O 343 | 0 O 344 | % O 345 | 都 O 346 | 进 O 347 | 了 O 348 | 口 O 349 | 腔 O 350 | , O 351 | 即 O 352 | 使 O 353 | 是 O 354 | 有 O 355 | 过 O 356 | 滤 O 357 | 嘴 O 358 | , O 359 | 进 O 360 | 入 O 361 | 口 O 362 | 腔 O 363 | 的 O 364 | 尼 O 365 | 古 O 366 | 丁 O 367 | 仍 O 368 | 会 O 369 | 有 O 370 | 5 O 371 | % O 372 | 到 O 373 | 1 O 374 | 2 O 375 | % O 376 | 。 O 377 | 378 | 去 O 379 | 年 O 380 | 十 O 381 | 二 O 382 | 月 O 383 | 二 O 384 | 十 O 385 | 四 O 386 | 日 O 387 | , O 388 | 市 B-ORG 389 | 委 I-ORG 390 | 书 O 391 | 记 O 392 | 张 B-PER 393 | 敬 I-PER 394 | 涛 I-PER 395 | 召 O 396 | 集 O 397 | 县 O 398 | 市 O 399 | 主 O 400 | 要 O 401 | 负 O 402 | 责 O 403 | 同 O 404 | 志 O 405 | 研 O 406 | 究 O 407 | 信 O 408 | 访 O 409 | 工 O 410 | 作 O 411 | 时 O 412 | , O 413 | 提 O 414 | 出 O 415 | 三 O 416 | 问 O 417 | : O 418 | 『 O 419 | 假 O 420 | 如 O 421 | 上 O 422 | 访 O 423 | 群 O 424 | 众 O 425 | 是 O 426 | 我 O 427 | 们 O 428 | 的 O 429 | 父 O 430 | 母 O 431 | 姐 O 432 | 妹 O 433 | , O 434 | 你 O 435 | 会 O 436 | 用 O 437 | 什 O 438 | 么 O 439 | 样 O 440 | 的 O 441 | 感 O 442 | 情 O 443 | 对 O 444 | 待 O 445 | 他 O 446 | 们 O 447 | ? O 448 | -------------------------------------------------------------------------------- /Data/corpus/ner/people_daily/people.test: -------------------------------------------------------------------------------- 1 | 我 O 2 | 们 O 3 | 变 O 4 | 而 O 5 | 以 O 6 | 书 O 7 | 会 O 8 | 友 O 9 | , O 10 | 以 O 11 | 书 O 12 | 结 O 13 | 缘 O 14 | , O 15 | 把 O 16 | 欧 B-LOC 17 | 美 B-LOC 18 | 、 O 19 | 港 B-LOC 20 | 台 B-LOC 21 | 流 O 22 | 行 O 23 | 的 O 24 | 食 O 25 | 品 O 26 | 类 O 27 | 图 O 28 | 谱 O 29 | 、 O 30 | 画 O 31 | 册 O 32 | 、 O 33 | 工 O 34 | 具 O 35 | 书 O 36 | 汇 O 37 | 集 O 38 | 一 O 39 | 堂 O 40 | 。 O 41 | 42 | 为 O 43 | 了 O 44 | 跟 O 45 | 踪 O 46 | 国 O 47 | 际 O 48 | 最 O 49 | 新 O 50 | 食 O 51 | 品 O 52 | 工 O 53 | 艺 O 54 | 、 O 55 | 流 O 56 | 行 O 57 | 趋 O 58 | 势 O 59 | , O 60 | 大 O 61 | 量 O 62 | 搜 O 63 | 集 O 64 | 海 O 65 | 外 O 66 | 专 O 67 | 业 O 68 | 书 O 69 | 刊 O 70 | 资 O 71 | 料 O 72 | 是 O 73 | 提 O 74 | 高 O 75 | 技 O 76 | 艺 O 77 | 的 O 78 | 捷 O 79 | 径 O 80 | 。 O 81 | 82 | 其 O 83 | 中 O 84 | 线 O 85 | 装 O 86 | 古 O 87 | 籍 O 88 | 逾 O 89 | 千 O 90 | 册 O 91 | ; O 92 | 民 O 93 | 国 O 94 | 出 O 95 | 版 O 96 | 物 O 97 | 几 O 98 | 百 O 99 | 种 O 100 | ; O 101 | 珍 O 102 | 本 O 103 | 四 O 104 | 册 O 105 | 、 O 106 | 稀 O 107 | 见 O 108 | 本 O 109 | 四 O 110 | 百 O 111 | 余 O 112 | 册 O 113 | , O 114 | 出 O 115 | 版 O 116 | 时 O 117 | 间 O 118 | 跨 O 119 | 越 O 120 | 三 O 121 | 百 O 122 | 余 O 123 | 年 O 124 | 。 O 125 | 126 | 有 O 127 | 的 O 128 | 古 O 129 | 木 O 130 | 交 O 131 | 柯 O 132 | , O 133 | 春 O 134 | 机 O 135 | 荣 O 136 | 欣 O 137 | , O 138 | 从 O 139 | 诗 O 140 | 人 O 141 | 句 O 142 | 中 O 143 | 得 O 144 | 之 O 145 | , O 146 | 而 O 147 | 入 O 148 | 画 O 149 | 中 O 150 | , O 151 | 观 O 152 | 之 O 153 | 令 O 154 | 人 O 155 | 心 O 156 | 驰 O 157 | 。 O 158 | 159 | 不 O 160 | 过 O 161 | 重 O 162 | 在 O 163 | 晋 O 164 | 趣 O 165 | , O 166 | 略 O 167 | 增 O 168 | 明 O 169 | 人 O 170 | 气 O 171 | 息 O 172 | , O 173 | 妙 O 174 | 在 O 175 | 集 O 176 | 古 O 177 | 有 O 178 | 道 O 179 | 、 O 180 | 不 O 181 | 露 O 182 | 痕 O 183 | 迹 O 184 | 罢 O 185 | 了 O 186 | 。 O 187 | 188 | 其 O 189 | 实 O 190 | 非 O 191 | 汉 O 192 | 非 O 193 | 唐 O 194 
| , O 195 | 又 O 196 | 是 O 197 | 什 O 198 | 么 O 199 | 与 O 200 | 什 O 201 | 么 O 202 | 呢 O 203 | ? O 204 | 205 | 国 B-PER 206 | 正 I-PER 207 | 学 O 208 | 长 O 209 | 的 O 210 | 文 O 211 | 章 O 212 | 与 O 213 | 诗 O 214 | 词 O 215 | , O 216 | 早 O 217 | 就 O 218 | 读 O 219 | 过 O 220 | 一 O 221 | 些 O 222 | , O 223 | 很 O 224 | 是 O 225 | 喜 O 226 | 欢 O 227 | 。 O 228 | 229 | “ O 230 | 文 O 231 | 化 O 232 | 大 O 233 | 革 O 234 | 命 O 235 | ” O 236 | 中 O 237 | , O 238 | 茶 O 239 | 馆 O 240 | 作 O 241 | 为 O 242 | “ O 243 | 四 O 244 | 旧 O 245 | ” O 246 | 、 O 247 | “ O 248 | 传 O 249 | 播 O 250 | 封 O 251 | 、 O 252 | 资 O 253 | 、 O 254 | 修 O 255 | 的 O 256 | 场 O 257 | 所 O 258 | ” O 259 | 被 O 260 | 关 O 261 | 闭 O 262 | 了 O 263 | 。 O 264 | 265 | 几 O 266 | 株 O 267 | 数 O 268 | 人 O 269 | 才 O 270 | 能 O 271 | 合 O 272 | 抱 O 273 | 的 O 274 | 大 O 275 | 榕 O 276 | 树 O 277 | 挡 O 278 | 住 O 279 | 了 O 280 | 烈 O 281 | 日 O 282 | , O 283 | 树 O 284 | 下 O 285 | 凉 O 286 | 风 O 287 | 拂 O 288 | 面 O 289 | 。 O 290 | 291 | 他 O 292 | 正 O 293 | 准 O 294 | 备 O 295 | 掏 O 296 | 钱 O 297 | , O 298 | 妈 O 299 | 妈 O 300 | 轻 O 301 | 声 O 302 | 说 O 303 | : O 304 | “ O 305 | 老 O 306 | 师 O 307 | , O 308 | 这 O 309 | 茶 O 310 | 不 O 311 | 卖 O 312 | , O 313 | 我 O 314 | 们 O 315 | 要 O 316 | 换 O 317 | 粮 O 318 | 食 O 319 | 的 O 320 | … O 321 | … O 322 | ” O 323 | 324 | 沏 O 325 | 茶 O 326 | 时 O 327 | , O 328 | 一 O 329 | 杯 O 330 | 清 O 331 | 水 O 332 | 被 O 333 | 新 O 334 | 茶 O 335 | 的 O 336 | 细 O 337 | 芽 O 338 | 嫩 O 339 | 叶 O 340 | 染 O 341 | 绿 O 342 | , O 343 | 春 O 344 | 色 O 345 | 满 O 346 | 杯 O 347 | , O 348 | 清 O 349 | 香 O 350 | 幽 O 351 | 远 O 352 | 。 O 353 | 354 | 和 O 355 | 往 O 356 | 年 O 357 | 一 O 358 | 样 O 359 | , O 360 | 清 O 361 | 明 O 362 | 节 O 363 | 刚 O 364 | 过 O 365 | , O 366 | 我 O 367 | 的 O 368 | 中 O 369 | 学 O 370 | 老 O 371 | 师 O 372 | 就 O 373 | 千 O 374 | 里 O 375 | 迢 O 376 | 迢 O 377 | 寄 O 378 | 来 O 379 | 新 O 380 | 采 O 381 | 制 O 382 | 的 O 383 | “ O 384 | 雨 O 385 | 前 O 386 | 茶 O 387 | ” O 388 | , O 389 | 这 O 390 | 是 O 391 | 一 O 392 | 种 O 393 | 名 O 394 | 叫 O 395 | 玉 B-LOC 396 | 峰 I-LOC 397 | 云 O 398 | 雾 O 399 | 茶 O 400 | 的 O 401 | 绿 O 402 | 茶 O 403 | , O 404 | 生 O 405 | 长 O 406 | 在 O 407 | 重 B-LOC 408 | 庆 I-LOC 409 | 市 O 410 | 郊 O 411 | 的 O 412 | 玉 B-LOC 413 | 峰 I-LOC 414 | 山 I-LOC 415 | 麓 O 416 | 。 O 417 | -------------------------------------------------------------------------------- /Data/corpus/sim_webank.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/Data/corpus/sim_webank.csv -------------------------------------------------------------------------------- /Data/corpus/webank/dev.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | 不要借了我是试试看能否操作的,借款审核期间能否取消借款,0 3 | 亲怎样才能在钱包里有微粒货的图标呢,借不到,0 4 | 你好,我还款银行怎么更换,怎么更换绑定还款的卡,1 5 | 我的借贷额度,怎么减少了呢?,微粒贷额度怎么才能降低,0 6 | 什么时候可以知道借款成功,2.多笔借款,0 7 | 一般电话确认要等多久。,一般多久才会打电话来,1 8 | 我想问下如果我开始设定的借款是20个月,但是到10个月的时候提前还清,利息是算到什么时候的呢?,你好我想问一下 看你们还款周期设定了5个月 10个月的 能不能一个月内还呢?利息是怎么算的 还有申请额度是一定的么?会不会上调?,1 9 | 借到的款在微信钱包里,1何时邀请,0 10 | 我要关闭微粒贷这个功能,提示未满足条件,0 11 | 不借了怎么退点错了,不想贷款了怎么撤销,1 12 | 今天是还款日 今天内钱会存进去 保证今天能还 应该没什么事吧,今天放款今天就得还款?,0 13 | 借不到,不可贷款,1 14 | 借款具备条件,借款 申请,0 15 | 选择了还款期限,可以提前还吗,几天就还,这个借款是借几天算几天的吗,0 16 | 从零钱还行不行,从微信钱包扣除,1 17 | 我什么时候才可以有贷款?,今天借,过几天后还可以吗?,0 18 | 为什么我借了没有人联系我呢,什么时候回来电话?,1 19 | 请问为什么总提示我身份证输入错误次数过频呢?,提示:身份证信息输入次数过多,无法借款,1 20 | 为什么我手动还款不行啊,我卡掉了,现在用新卡还款,你们还不可以改,怎么还,0 21 | 不使用可以关闭吗?,额度申请成功后会短信通知吗,0 22 | 
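A small loading sketch (added; pandas is an assumption here, the repo reads these files with its own utilities): each row of the webank csv files is `sentence1,sentence2,label`, where label 1 marks a similar/paraphrase pair and 0 a dissimilar one.

```python
import pandas as pd

dev = pd.read_csv("Data/corpus/webank/dev.csv")        # columns: sentence1, sentence2, label
pairs = list(zip(dev["sentence1"], dev["sentence2"]))
labels = dev["label"].astype(int).tolist()             # 1 = similar, 0 = not similar
print(len(pairs), labels[:5])
```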
-------------------------------------------------------------------------------- /Data/corpus/webank/test.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | 为什么我无法看到额度,为什么开通了却没有额度,0 3 | 为啥换不了,为两次还都提示失败呢,0 4 | 借了钱,但还没有通过,可以取消吗?,可否取消,1 5 | 为什么我申请额度输入密码就一直是那个页面,为什么要输入支付密码来验证,0 6 | 今天借 明天还款可以?,今天借明天还要手续费吗,0 7 | 你好!今下午咱没有扣我款?,你好 今天怎么没有扣款呢,1 8 | 所借的钱是否可以提现?,该笔借款可以提现吗!,1 9 | 不是邀请的客人就不能借款吗,一般什么样得人会受邀请,0 10 | 人脸失别不了,开不了户,我输入的资料都是正确的,为什么总说不符开户失败?,0 11 | 一天利息好多钱,1万利息一天是5元是吗,1 12 | 为啥还没开放啊,不是微粒贷客户,怎么可以受邀,0 13 | 开通.微粒贷,帮我开通,1 14 | 咋么才能收到邀请,为什么我6号扣还款的到现在还没,0 15 | 扣款时间是几点,无利息的还款时间是多久?,0 16 | 为什么借款总是不通过,为什么审请不通过,1 17 | 为什么我的无法查看额度,为什么我点进去没有额度呢,0 18 | 请问月息多少,2万块月息是多少,1 19 | 借钱可好取现,可以提现金?,1 20 | 可以开 结清证明吗?,还清钱后能继续借吗?,0 21 | 你好,我银行卡被法院封了,能否换我儿子的卡还款,换卡什么时候能换好,0 22 | -------------------------------------------------------------------------------- /Data/corpus/webank/train.csv: -------------------------------------------------------------------------------- 1 | sentence1,sentence2,label 2 | 用微信都6年,微信没有微粒贷功能,4。 号码来微粒贷,0 3 | 微信消费算吗,还有多少钱没还,0 4 | 交易密码忘记了找回密码绑定的手机卡也掉了,怎么最近安全老是要改密码呢好麻烦,0 5 | 你好 我昨天晚上申请的没有打电话给我 今天之内一定会打吗?,什么时候可以到账,0 6 | "“微粒贷开通""",你好,我的微粒贷怎么没有开通呢,0 7 | 为什么借款后一直没有给我回拨电话,怎么申请借款后没有打电话过来呢!,1 8 | 为什么我每次都提前还款了最后却不给我贷款了,30号我一次性还清可以不,0 9 | 请问一天是否都是限定只能转入或转出都是五万。,微众多少可以赎回短期理财,0 10 | 微粒咨询电话号码多少,你们的人工客服电话是多少,1 11 | 已经在银行换了新预留号码。,我现在换了电话号码,这个需要更换吗,1 12 | 下周有什么好产品?,元月份有哪些理财产品,1 13 | 第一次使用,额度多少?,我的额度多少钱,1 14 | 我什么时候可以通过微粒贷借钱,提前还清贷款还能再借吗,0 15 | 借款后多长时间给打电话,借款后多久打电话啊,1 16 | 没看到微粒贷,我借那么久也没有提升啊,0 17 | 原来的手机号不用了,怎么换,手机号码没有更改,1 18 | 我想开通微粒贷 不知我应该做写什么准备材料呢,为何苹果手机显示微粒贷暂未开放?,0 19 | 能查帐单吗,可以查询帐单,1 20 | 日利率多少,息多少,1 21 | 微信6.2的版本没有微粒贷吗?,什么时候才会全面开放名额,0 22 | -------------------------------------------------------------------------------- /Data/sentence_vec_encode_char/char: -------------------------------------------------------------------------------- 1 | char 2 | -------------------------------------------------------------------------------- /Data/sentence_vec_encode_word/word: -------------------------------------------------------------------------------- 1 | word 2 | -------------------------------------------------------------------------------- /Data/tf_idf/tf_idf: -------------------------------------------------------------------------------- 1 | tf_idf 2 | -------------------------------------------------------------------------------- /FeatureProject/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | #!/usr/bin/python 3 | # @Time :2019/3/29 23:10 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /FeatureProject/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/FeatureProject/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/FeatureProject/__pycache__/distance_text_or_vec.cpython-36.pyc -------------------------------------------------------------------------------- /FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/FeatureProject/__pycache__/distance_vec_TS_SS.cpython-36.pyc -------------------------------------------------------------------------------- /FeatureProject/bert/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/10 9:12 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /FeatureProject/bert/extract_keras_bert_feature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/8 20:04 4 | # @author :Mo 5 | # @function :extract feature of bert and keras 6 | 7 | import codecs 8 | import os 9 | 10 | import keras.backend.tensorflow_backend as ktf_keras 11 | import numpy as np 12 | import tensorflow as tf 13 | from keras.layers import Add 14 | from keras.models import Model 15 | from keras_bert import load_trained_model_from_checkpoint, Tokenizer 16 | 17 | from FeatureProject.bert.layers_keras import NonMaskingLayer 18 | from conf.feature_config import gpu_memory_fraction, config_name, ckpt_name, vocab_file, max_seq_len, layer_indexes 19 | 20 | # 全局使用,使其可以django、flask、tornado等调用 21 | graph = None 22 | model = None 23 | 24 | 25 | # gpu配置与使用率设置 26 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 27 | config = tf.ConfigProto() 28 | config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction 29 | sess = tf.Session(config=config) 30 | ktf_keras.set_session(sess) 31 | 32 | class KerasBertVector(): 33 | def __init__(self): 34 | self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len 35 | # 全局使用,使其可以django、flask、tornado等调用 36 | global graph 37 | graph = tf.get_default_graph() 38 | global model 39 | model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path, 40 | seq_len=self.max_seq_len) 41 | print(model.output) 42 | print(len(model.layers)) 43 | # lay = model.layers 44 | #一共104个layer,其中前八层包括token,pos,embed等, 45 | # 每8层(MultiHeadAttention,Dropout,Add,LayerNormalization) resnet 46 | # 一共12层 47 | layer_dict = [7] 48 | layer_0 = 7 49 | for i in range(12): 50 | layer_0 = layer_0 + 8 51 | layer_dict.append(layer_0) 52 | # 输出它本身 53 | if len(layer_indexes) == 0: 54 | encoder_layer = model.output 55 | # 分类如果只有一层,就只取最后那一层的weight,取得不正确 56 | elif len(layer_indexes) == 1: 57 | if layer_indexes[0] in [i+1 for i in range(13)]: 58 | encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0]]).output 59 | else: 60 | encoder_layer = model.get_layer(index=layer_dict[-1]).output 61 | # 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数 62 | else: 63 | # layer_indexes must be [1,2,3,......13] 64 | # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes] 65 | all_layers = [model.get_layer(index=layer_dict[lay-1]).output if lay in [i+1 for i in range(13)] 66 | else model.get_layer(index=layer_dict[-1]).output # 如果给出不正确,就默认输出最后一层 67 | for lay in layer_indexes] 68 | print(layer_indexes) 69 | print(all_layers) 70 | # 其中layer==1的output是格式不对,第二层输入input是list 71 | all_layers_select = [] 72 | for all_layers_one in all_layers: 73 | all_layers_select.append(all_layers_one) 74 | encoder_layer = Add()(all_layers_select) 
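            # (added note) Add() sums the selected transformer-block outputs element-wise,
            # so the resulting feature stays 768-dimensional regardless of how many layers
            # are listed in layer_indexes; a Concatenate() here would instead give a
            # 768 * len(layer_indexes) vector.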
75 | print(encoder_layer.shape) 76 | print("KerasBertEmbedding:") 77 | print(encoder_layer.shape) 78 | output_layer = NonMaskingLayer()(encoder_layer) 79 | model = Model(model.inputs, output_layer) 80 | # model.summary(120) 81 | # reader tokenizer 82 | self.token_dict = {} 83 | with codecs.open(self.dict_path, 'r', 'utf8') as reader: 84 | for line in reader: 85 | token = line.strip() 86 | self.token_dict[token] = len(self.token_dict) 87 | 88 | self.tokenizer = Tokenizer(self.token_dict) 89 | 90 | 91 | def bert_encode(self, texts): 92 | # 文本预处理 93 | input_ids = [] 94 | input_masks = [] 95 | input_type_ids = [] 96 | for text in texts: 97 | print(text) 98 | tokens_text = self.tokenizer.tokenize(text) 99 | print('Tokens:', tokens_text) 100 | input_id, input_type_id = self.tokenizer.encode(first=text, max_len=self.max_seq_len) 101 | input_mask = [0 if ids == 0 else 1 for ids in input_id] 102 | input_ids.append(input_id) 103 | input_type_ids.append(input_type_id) 104 | input_masks.append(input_mask) 105 | 106 | input_ids = np.array(input_ids) 107 | input_masks = np.array(input_masks) 108 | input_type_ids = np.array(input_type_ids) 109 | 110 | # 全局使用,使其可以django、flask、tornado等调用 111 | with graph.as_default(): 112 | predicts = model.predict([input_ids, input_type_ids], batch_size=1) 113 | print(predicts.shape) 114 | tokens_text = tokens_text if len(tokens_text) <= self.max_seq_len - 2 else tokens_text[:self.max_seq_len - 2] 115 | for i, token in enumerate(tokens_text): 116 | print(token, [len(predicts[0][i].tolist())], predicts[0][i].tolist()) 117 | 118 | # 相当于pool,采用的是https://github.com/terrifyzhao/bert-utils/blob/master/graph.py 119 | mul_mask = lambda x, m: x * np.expand_dims(m, axis=-1) 120 | masked_reduce_mean = lambda x, m: np.sum(mul_mask(x, m), axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9) 121 | 122 | pools = [] 123 | for i in range(len(predicts)): 124 | pred = predicts[i] 125 | masks = input_masks.tolist() 126 | mask_np = np.array([masks[i]]) 127 | pooled = masked_reduce_mean(pred, mask_np) 128 | pooled = pooled.tolist() 129 | pools.append(pooled[0]) 130 | print('bert:', pools) 131 | return pools 132 | 133 | 134 | if __name__ == "__main__": 135 | bert_vector = KerasBertVector() 136 | pooled = bert_vector.bert_encode(['你是谁呀', '小老弟']) 137 | print(pooled) 138 | while True: 139 | print("input:") 140 | ques = input() 141 | print(bert_vector.bert_encode([ques])) 142 | 143 | -------------------------------------------------------------------------------- /FeatureProject/bert/layers_keras.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/10 10:49 4 | # @author :Mo 5 | # @function :create model of keras-bert for get [-2] layers 6 | 7 | from keras.engine import Layer 8 | 9 | 10 | class NonMaskingLayer(Layer): 11 | """ 12 | fix convolutional 1D can't receive masked input, detail: https://github.com/keras-team/keras/issues/4978 13 | thanks for https://github.com/jacoxu 14 | """ 15 | 16 | def __init__(self, **kwargs): 17 | self.supports_masking = True 18 | super(NonMaskingLayer, self).__init__(**kwargs) 19 | 20 | def build(self, input_shape): 21 | pass 22 | 23 | def compute_mask(self, input, input_mask=None): 24 | # do not pass the mask to the next layers 25 | return None 26 | 27 | def call(self, x, mask=None): 28 | return x 29 | 30 | def get_output_shape_for(self, input_shape): 31 | return input_shape 32 | -------------------------------------------------------------------------------- 
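The `mul_mask` / `masked_reduce_mean` lambdas in `bert_encode` above are easy to misread, so here is an equivalent standalone sketch of that masked mean pooling (added for illustration; plain numpy, no project code required):

```python
import numpy as np

def masked_mean_pool(hidden, mask):
    """hidden: (batch, seq_len, dim) token vectors; mask: (batch, seq_len), 1 for real tokens."""
    mask = np.expand_dims(mask.astype(float), axis=-1)   # (batch, seq_len, 1)
    summed = np.sum(hidden * mask, axis=1)               # padding rows contribute zero
    counts = np.sum(mask, axis=1) + 1e-9                 # avoid division by zero
    return summed / counts                               # (batch, dim) sentence vectors

hidden = np.random.rand(2, 6, 768)
mask = np.array([[1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1]])
print(masked_mean_pool(hidden, mask).shape)              # (2, 768)
```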
/FeatureProject/bert/readme.md: -------------------------------------------------------------------------------- 1 | # BERT 2 | 3 | # usage: 4 | this bert you need not fine tuning for common target 5 | 6 | * step1:github只上传部分数据,前往链接: https://pan.baidu.com/s/1I3vydhmFEQ9nuPG2fDou8Q 提取码: rket download chinese_L-12_H-768_A-12(谷歌预训练好的模型) 7 | 解压到Data/chinese_L-12_H-768_A-12 8 | * 9 | * step2-1: 10 | 运行 FeatureProject/bert/extract_keras_bert_feature.py 11 | then you can get vector of bert encoding 12 | * 13 | * step2-2: 14 | 运行 FeatureProject/bert/tet_bert_keras_sim.py 15 | then you can get sim of bert vector of two sentence 16 | and get avg time of run a sentence of encode 17 | 18 | # thanks 19 | * keras-bert: https://github.com/CyberZHG/keras-bert 20 | -------------------------------------------------------------------------------- /FeatureProject/bert/tet_bert_keras_sim.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/7 20:27 4 | # @author :Mo 5 | # @function :test sentence of bert encode and cosin sim of two question 6 | 7 | 8 | def calculate_count(): 9 | """ 10 | 统计一下1000条测试数据的平均耗时 11 | :return: 12 | """ 13 | from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector 14 | import time 15 | 16 | bert_vector = KerasBertVector() 17 | print("bert start ok!") 18 | time_start = time.time() 19 | for i in range(10): 20 | vector = bert_vector.bert_encode(["jy,你知道吗,我一直都很喜欢你呀,在一起在一起在一起,哈哈哈哈"]) 21 | 22 | time_end = time.time() 23 | time_avg = (time_end-time_start)/10 24 | print(vector) 25 | print(time_avg) 26 | # 0.12605296468734742 win10 gpu avg 27 | # 0.01629048466682434 linux cpu avg 28 | 29 | 30 | def sim_two_question(): 31 | """测试一下两个问题的相似句子""" 32 | from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector 33 | from sklearn import preprocessing 34 | from math import pi 35 | import numpy as np 36 | import time 37 | import math 38 | 39 | def cosine_distance(v1, v2): # 余弦距离 40 | if type(v1)==list: 41 | v1 = np.array(v1) 42 | if type(v2)==list: 43 | v2 = np.array(v2) 44 | 45 | if v1.all() and v2.all(): 46 | return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) 47 | else: 48 | return 0 49 | 50 | def scale_zoom(rate): # sig 缩放 51 | zoom = (1 + np.exp(-float(rate))) / 2 52 | return zoom 53 | 54 | def scale_triangle(rate): # sin 缩放 55 | triangle = math.sin(rate/1*pi/2 - pi/2) 56 | return triangle 57 | 58 | bert_vector = KerasBertVector() 59 | print("bert start ok!") 60 | while True: 61 | print("input ques-1: ") 62 | ques_1 = input() 63 | print("input ques_2: ") 64 | ques_2 = input() 65 | vector_1 = bert_vector.bert_encode([ques_1]) 66 | vector_2 = bert_vector.bert_encode([ques_2]) 67 | sim = cosine_distance(vector_1[0], vector_2[0]) 68 | # sim_list = [sim, 0, 0.2, 0.4, 0.6, 0.8, 1.0] 69 | # sim = preprocessing.scale(sim_list)[0] 70 | # sim = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(sim_list)[0] 71 | # sim_1 = preprocessing.normalize(sim_list, norm='l1')[0] 72 | # sim_2 = preprocessing.normalize(sim_list, norm='l2')[0] 73 | # sim = scale_zoom(sim) 74 | # sim = scale_triangle(sim) 75 | # print(sim_1) 76 | # print(sim_2) 77 | print(sim) 78 | 79 | 80 | if __name__=="__main__": 81 | calculate_count() 82 | sim_two_question() 83 | -------------------------------------------------------------------------------- /FeatureProject/cut_td_idf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: 
UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/1 10:35 4 | # @author :Mo 5 | # @function :cut sentences 6 | 7 | 8 | from conf.path_config import chicken_and_gossip_path, td_idf_cut_path, td_idf_cut_pinyin 9 | from utils.text_tools import txtWrite, txtRead, get_syboml, strQ2B 10 | from conf.path_config import projectdir 11 | from gensim import corpora, models 12 | import xpinyin 13 | import pickle 14 | import jieba 15 | 16 | 17 | def cut_td_idf(sources_path, target_path): 18 | """ 19 | 结巴切词,汉语 20 | :param path: 21 | :return: 22 | """ 23 | print("cut_td_idf start! ") 24 | corpus = txtRead(sources_path) 25 | governments = [] 26 | for corpus_one in corpus: 27 | corpus_one_clear = corpus_one.replace(' ', '').strip() 28 | ques_q2b = strQ2B(corpus_one_clear.strip()) 29 | ques_q2b_syboml = get_syboml(ques_q2b) 30 | governments.append(ques_q2b_syboml.strip()) 31 | 32 | government_ques = list(map(lambda x: ' '.join(jieba.lcut(x)), governments)) 33 | 34 | topic_ques_all = [] 35 | for topic_ques_one in government_ques: 36 | top_ques_aqlq = topic_ques_one.replace(' ', ' ').replace(' ', ' ').strip() + '\n' 37 | topic_ques_all.append(top_ques_aqlq) 38 | 39 | txtWrite(topic_ques_all, target_path) 40 | print("cut_td_idf ok! " + sources_path) 41 | 42 | 43 | def cut_td_idf_pinyin(sources_path, target_path): #获取拼音 44 | """ 45 | 汉语转拼音 46 | :param path: 47 | :return: 48 | """ 49 | pin = xpinyin.Pinyin() 50 | corpus = txtRead(sources_path) 51 | topic_ques_all = [] 52 | corpus_count = 0 53 | for corpus_one in corpus: 54 | corpus_count += 1 55 | # time1 = time.time() 56 | corpus_one_clear = corpus_one.replace(' ', '').strip() 57 | ques_q2b = strQ2B(corpus_one_clear.strip()) 58 | ques_q2b_syboml = get_syboml(ques_q2b) 59 | ques_q2b_syboml_pinying = pin.get_pinyin(ques_q2b_syboml.replace(' ', '').replace(' ', '').strip(), ' ') 60 | topic_ques_all.append(ques_q2b_syboml_pinying + '\n') 61 | # time2 = time.time() 62 | # print(str(corpus_count) + 'time:' + str(time2 - time1)) 63 | txtWrite(topic_ques_all, target_path) 64 | print("cut_td_idf_pinyin ok! " + sources_path) 65 | 66 | 67 | def init_tfidf_chinese_or_pinyin(sources_path): 68 | """ 69 | 构建td_idf 70 | :param path: 71 | :return: 72 | """ 73 | questions = txtRead(sources_path) 74 | corpora_documents = [] 75 | for item_text in questions: 76 | item_seg = list(jieba.cut(str(item_text).strip())) 77 | corpora_documents.append(item_seg) 78 | 79 | dictionary = corpora.Dictionary(corpora_documents) 80 | corpus = [dictionary.doc2bow(text) for text in corpora_documents] 81 | tfidf_model = models.TfidfModel(corpus) 82 | print("init_tfidf_chinese_or_pinyin ok! 
" + sources_path) 83 | file = open(sources_path.replace(".csv", "_dictionary_model.pkl"), 'wb') 84 | pickle.dump([dictionary, tfidf_model], file) 85 | 86 | 87 | if __name__ == '__main__': 88 | # path_text = projectdir + '/Data/chicken_gossip.txt' 89 | # sentences = txtRead(path_text) 90 | # sentences_q = [] 91 | # for sentences_one in sentences: 92 | # sentences_one_replace = sentences_one.replace(" ", "").replace("\t", "") 93 | # sentences_one_replace_split = sentences_one_replace.split("|") 94 | # sentence_new = sentences_one_replace_split[0] + "\t" + "".join(sentences_one_replace_split[1:]) 95 | # sentences_q.append(sentence_new) 96 | # sentences = txtWrite(sentences_q, projectdir + '/Data/chicken_and_gossip.txt') 97 | 98 | 99 | cut_td_idf(chicken_and_gossip_path, td_idf_cut_path) 100 | cut_td_idf_pinyin(chicken_and_gossip_path, td_idf_cut_pinyin) 101 | init_tfidf_chinese_or_pinyin(td_idf_cut_path) 102 | init_tfidf_chinese_or_pinyin(td_idf_cut_pinyin) 103 | print("corpus ok!") 104 | 105 | -------------------------------------------------------------------------------- /FeatureProject/distance_vec_TS_SS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/3 10:36 4 | # @author :Mo 5 | # @function :TS-SS distance 6 | # @url :https://github.com/taki0112/Vector_Similarity 7 | # @paper :A Hybrid Geometric Approach for Measuring Similarity Level Among Documents and Document Clustering 8 | 9 | 10 | import numpy as np 11 | import math 12 | 13 | zero_bit = 0.000000001 14 | 15 | 16 | def Cosine(vec1, vec2): 17 | """ 18 | 余弦相似度 19 | :param vec1: arrary 20 | :param vec2: arrary 21 | :return: float 22 | """ 23 | result = InnerProduct(vec1, vec2) / (VectorSize(vec1) * VectorSize(vec2) + zero_bit) 24 | return result 25 | 26 | 27 | def VectorSize(vec): 28 | vec_pow = sum(math.pow(v + zero_bit, 2) for v in vec) 29 | if vec_pow >= 0: 30 | return math.sqrt(vec_pow) 31 | else: 32 | return zero_bit 33 | 34 | 35 | def InnerProduct(vec1, vec2): 36 | try: 37 | return sum(v1 * v2 for v1, v2 in zip(vec1, vec2)) 38 | except: 39 | return zero_bit 40 | 41 | 42 | def Euclidean(vec1, vec2): 43 | vec12_pow = sum(math.pow((v1 - v2), 2) for v1, v2 in zip(vec1, vec2)) 44 | if vec12_pow >= 0: 45 | return math.sqrt(vec12_pow) 46 | else: 47 | return zero_bit 48 | 49 | 50 | def Theta(vec1, vec2): 51 | cosine_vec12 = Cosine(vec1, vec2) 52 | if -1 <= cosine_vec12 and cosine_vec12 <= 1: 53 | return math.acos(cosine_vec12) + 10 54 | else: 55 | return zero_bit + 10 56 | 57 | 58 | def Triangle(vec1, vec2): 59 | theta = math.radians(Theta(vec1, vec2)) 60 | return (VectorSize(vec1) * VectorSize(vec2) * math.sin(theta)) / 2 61 | 62 | 63 | def Magnitude_Difference(vec1, vec2): 64 | return abs(VectorSize(vec1) - VectorSize(vec2)) 65 | 66 | 67 | def Sector(vec1, vec2): 68 | ED = Euclidean(vec1, vec2) 69 | MD = Magnitude_Difference(vec1, vec2) 70 | theta = Theta(vec1, vec2) 71 | return math.pi * math.pow((ED + MD), 2) * theta / 360 72 | 73 | 74 | def TS_SS(vec1, vec2): 75 | return Triangle(vec1, vec2) * Sector(vec1, vec2) 76 | 77 | 78 | if __name__ == '__main__': 79 | vec1_test = np.array([1, 38, 17, 32]) 80 | vec2_test = np.array([5, 6, 8, 9]) 81 | 82 | print(Euclidean(vec1_test, vec2_test)) 83 | print(Cosine(vec1_test, vec2_test)) 84 | print(TS_SS(vec1_test, vec2_test)) 85 | -------------------------------------------------------------------------------- /FeatureProject/normalization_util.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | #!/usr/bin/python 3 | # @Time :2019/3/12 14:18 4 | # @author :Mo 5 | # @site :https://blog.csdn.net/rensihui 6 | 7 | from sklearn import preprocessing 8 | import numpy as np 9 | 10 | def autoL1L2(data, norms = 'l1'): 11 | '''L1或者L2正则化''' 12 | return preprocessing.normalize(data, norm = norms) 13 | 14 | def autoScale(data): 15 | '''标准化, (X-mean)/std.得到的结果是,对于每个属性/每列来说所有数据都聚集在0附近,方差为1。''' 16 | return preprocessing.scale(data) 17 | 18 | def autoMinMaxScaler(data): 19 | '''将属性缩放到一个指定范围''' 20 | return preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(data) 21 | 22 | def autoLinNorm(data): # 传入一个矩阵 23 | ''' 0-1归一化 24 | :param data: []矩阵 25 | :return: [] 26 | ''' 27 | mins = data.min(0) # 返回data矩阵中每一列中最小的元素,返回一个列表 28 | maxs = data.max(0) # 返回data矩阵中每一列中最大的元素,返回一个列表 29 | ranges = maxs - mins # 最大值列表 - 最小值列表 = 差值列表 30 | normData = np.zeros(np.shape(data)) # 生成一个与 data矩阵同规格的normData全0矩阵,用于装归一化后的数据 31 | row = data.shape[0] # 返回 data矩阵的行数 32 | normData = data - np.tile(mins, (row, 1)) # data矩阵每一列数据都减去每一列的最小值 33 | normData = normData / np.tile(ranges, (row, 1)) # data矩阵每一列数据都除去每一列的差值(差值 = 某列的最大值- 某列最小值) 34 | return normData 35 | 36 | 37 | 38 | def autoAvgNorm(data): # 传入一个矩阵 39 | ''' 均值归一化 40 | :param data: []矩阵 41 | :return: [] 42 | ''' 43 | avg = np.average(data, axis=1) # 返回data矩阵中每一列中最小的元素,返回一个列表 44 | sigma = np.std(data, axis=1) # 返回data矩阵中每一列中最大的元素,返回一个列表 45 | normData = np.zeros(np.shape(data)) # 生成一个与 data矩阵同规格的normData全0矩阵,用于装归一化后的数据 46 | row = data.shape[0] # 返回 data矩阵的行数 47 | normData = data - np.tile(avg, (row, 1)) # data矩阵每一列数据都减去每一列的最小值 48 | normData = normData / np.tile(sigma, (row, 1)) # data矩阵每一列数据都除去每一列的差值(差值 = 某列的最大值- 某列最小值) 49 | return normData 50 | 51 | 52 | 53 | ###Sigmoid函数;Sigmoid函数是一个具有S形曲线的函数,是良好的阈值函数,在(0, 0.5)处中心对称,在(0, 0.5)附近有比较大的斜率, 54 | # 而当数据趋向于正无穷和负无穷的时候,映射出来的值就会无限趋向于1和0,是个人非常喜欢的“归一化方法”,之所以打引号是因为我觉得Sigmoid函数在 55 | # 阈值分割上也有很不错的表现,根据公式的改变,就可以改变分割阈值,这里作为归一化方法,我们只考虑(0, 0.5)作为分割阈值的点的情况: 56 | def sigmoid(data,useStatus): 57 | ''' sig归一化 58 | :param data: []矩阵 59 | :return: [] 60 | ''' 61 | if useStatus: 62 | row=data.shape[0] 63 | column=data.shape[1] 64 | normData = np.zeros(np.shape(data)) 65 | for i in range(row): 66 | for j in range(column): 67 | normData[i][j]=1.0 / (1 + np.exp(-float(data[i][j]))); 68 | return normData 69 | else: 70 | return float(data); 71 | 72 | if __name__ == '__main__': 73 | arr = np.array([[8, 7, 8], [4, 3, 1], [6, 9, 8]]) 74 | 75 | print("l1正则化") 76 | print(autoL1L2(arr, norms='l1')) 77 | 78 | print("l2正则化") 79 | print(autoL1L2(arr, norms='l2')) 80 | 81 | print("0-1标准化处理") 82 | print(autoScale(arr)) 83 | 84 | print("0-1缩放处理") 85 | print(autoMinMaxScaler(arr)) 86 | 87 | 88 | print("0-1归一化处理") 89 | print(autoLinNorm(arr)) 90 | 91 | 92 | print("均值归一化处理") 93 | print(autoAvgNorm(arr)) 94 | 95 | print("sig归一化处理") 96 | print(sigmoid(arr,True)) 97 | -------------------------------------------------------------------------------- /FeatureProject/xlnet/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/27 22:26 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /FeatureProject/xlnet/args.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/27 23:03 4 | # 
@author :Mo 5 | # @function : 6 | 7 | 8 | import pathlib 9 | import sys 10 | import os 11 | 12 | 13 | # base dir 14 | projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent.parent) 15 | sys.path.append(projectdir) 16 | 17 | 18 | # path of BERT model 19 | model_dir = os.path.join(projectdir, 'Data', 'chinese_xlnet_mid_L-24_H-768_A-12') 20 | config_name = os.path.join(model_dir, 'xlnet_config.json') 21 | ckpt_name = os.path.join(model_dir, 'xlnet_model.ckpt') 22 | spiece_model = os.path.join(model_dir, 'spiece.model') 23 | attention_type = 'bi' # or 'uni' 24 | # 批处理尺寸 25 | batch_size = 1 26 | # 历史序列长度 27 | memory_len=0 28 | # 当前目标序列长度 29 | target_len=32 30 | # 默认取倒数第二层的输出值作为句向量 31 | layer_indexes = [0, 23] # 可填 0, 1, 2, 3, 4, 5, 6, 7..., 24,其中0为embedding层 32 | # gpu使用率 33 | gpu_memory_fraction = 0.64 -------------------------------------------------------------------------------- /FeatureProject/xlnet/extract_keras_xlnet_feature.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/8/27 22:27 4 | # @author :Mo 5 | # @function : 6 | 7 | 8 | 9 | from keras_xlnet import Tokenizer, ATTENTION_TYPE_BI, ATTENTION_TYPE_UNI 10 | from keras_xlnet import load_trained_model_from_checkpoint 11 | 12 | from FeatureProject.bert.layers_keras import NonMaskingLayer 13 | import keras.backend.tensorflow_backend as ktf_keras 14 | from keras.models import Model 15 | from keras.layers import Add 16 | import tensorflow as tf 17 | import numpy as np 18 | import codecs 19 | import os 20 | 21 | from FeatureProject.xlnet import args 22 | 23 | 24 | # 全局使用,使其可以django、flask、tornado等调用 25 | graph = None 26 | model = None 27 | # gpu配置与使用率设置 28 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 29 | config = tf.ConfigProto() 30 | config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction 31 | sess = tf.Session(config=config) 32 | ktf_keras.set_session(sess) 33 | 34 | 35 | class KerasXlnetVector(): 36 | def __init__(self): 37 | self.attention_type = ATTENTION_TYPE_BI if args.attention_type[0] == 'bi' else ATTENTION_TYPE_UNI 38 | self.memory_len, self.target_len, self.batch_size = args.memory_len, args.target_len, args.batch_size 39 | self.checkpoint_path, self.config_path = args.ckpt_name, args.config_name 40 | self.layer_indexes, self.in_train_phase = args.layer_indexes, False 41 | 42 | print("load KerasXlnetEmbedding start! 
") 43 | # 全局使用,使其可以django、flask、tornado等调用 44 | global graph 45 | graph = tf.get_default_graph() 46 | global model 47 | # 模型加载 48 | model = load_trained_model_from_checkpoint(checkpoint_path=self.checkpoint_path, 49 | attention_type=self.attention_type, 50 | in_train_phase=self.in_train_phase, 51 | config_path=self.config_path, 52 | memory_len=self.memory_len, 53 | target_len=self.target_len, 54 | batch_size=self.batch_size, 55 | mask_index=0) 56 | # 字典加载 57 | self.tokenizer = Tokenizer(args.spiece_model) 58 | # debug时候查看layers 59 | self.model_layers = model.layers 60 | len_layers = self.model_layers.__len__() 61 | print(len_layers) 62 | len_couche = int((len_layers - 6) / 10) 63 | # 一共246个layer 64 | # 每层10个layer(MultiHeadAttention,Dropout,Add,LayerNormalization),第一是9个layer的输入和embedding层 65 | # 一共24层 66 | layer_dict = [5] 67 | layer_0 = 6 68 | for i in range(len_couche): 69 | layer_0 = layer_0 + 10 70 | layer_dict.append(layer_0-2) 71 | # 输出它本身 72 | if len(self.layer_indexes) == 0: 73 | encoder_layer = model.output 74 | # 分类如果只有一层,取得不正确的话就取倒数第二层 75 | elif len(self.layer_indexes) == 1: 76 | if self.layer_indexes[0] in [i + 1 for i in range(len_couche + 1)]: 77 | encoder_layer = model.get_layer(index=layer_dict[self.layer_indexes[0]]).output 78 | else: 79 | encoder_layer = model.get_layer(index=layer_dict[-2]).output 80 | # 否则遍历需要取的层,把所有层的weight取出来并加起来shape:768*层数 81 | else: 82 | # layer_indexes must be [0, 1, 2,3,......24] 83 | all_layers = [model.get_layer(index=layer_dict[lay]).output 84 | if lay in [i + 1 for i in range(len_couche + 1)] 85 | else model.get_layer(index=layer_dict[-2]).output # 如果给出不正确,就默认输出倒数第二层 86 | for lay in self.layer_indexes] 87 | print(self.layer_indexes) 88 | print(all_layers) 89 | all_layers_select = [] 90 | for all_layers_one in all_layers: 91 | all_layers_select.append(all_layers_one) 92 | encoder_layer = Add()(all_layers_select) 93 | print(encoder_layer.shape) 94 | output_layer = NonMaskingLayer()(encoder_layer) 95 | model = Model(model.inputs, output_layer) 96 | print("load KerasXlnetEmbedding end") 97 | model.summary(132) 98 | 99 | 100 | def xlnet_encode(self, texts): 101 | 102 | # 相当于pool,采用的是https://github.com/terrifyzhao/bert-utils/blob/master/graph.py 103 | mul_mask = lambda x, m: x * np.expand_dims(m, axis=-1) 104 | masked_reduce_mean = lambda x, m: np.sum(mul_mask(x, m), axis=1) / (np.sum(m, axis=1, keepdims=True) + 1e-9) 105 | 106 | # 文本预处理 107 | predicts = [] 108 | for text in texts: 109 | # print(text) 110 | tokens = self.tokenizer.encode(text) 111 | tokens = tokens + [0]*(self.target_len-len(tokens)) if len(tokens) < self.target_len else tokens[0:self.target_len] 112 | token_input = np.expand_dims(np.array(tokens), axis=0) 113 | mask_input = np.array([0 if ids == 0 else 1 for ids in tokens]) 114 | segment_input = np.zeros_like(token_input) 115 | memory_length_input = np.zeros((1, 1)) 116 | # 全局使用,使其可以django、flask、tornado等调用 117 | with graph.as_default(): 118 | predict = model.predict([token_input, segment_input, memory_length_input], batch_size=1) 119 | # print(predict) 120 | prob = predict[0] 121 | pooled = masked_reduce_mean(prob, [mask_input]) 122 | pooled = pooled.tolist() 123 | predicts.append(pooled[0]) 124 | return predicts 125 | 126 | 127 | if __name__ == "__main__": 128 | xlnet_vector = KerasXlnetVector() 129 | pooled = xlnet_vector.xlnet_encode(['你是谁呀', '小老弟']) 130 | print(pooled) 131 | while True: 132 | print("input:") 133 | ques = input() 134 | print(ques) 135 | print(xlnet_vector.xlnet_encode([ques])) 136 | 
-------------------------------------------------------------------------------- /FeatureProject/xlnet/layers_keras.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/10 10:49 4 | # @author :Mo 5 | # @function :create model of keras-bert for get [-2] layers 6 | 7 | from keras.engine import Layer 8 | 9 | 10 | class NonMaskingLayer(Layer): 11 | """ 12 | fix convolutional 1D can't receive masked input, detail: https://github.com/keras-team/keras/issues/4978 13 | thanks for https://github.com/jacoxu 14 | """ 15 | 16 | def __init__(self, **kwargs): 17 | self.supports_masking = True 18 | super(NonMaskingLayer, self).__init__(**kwargs) 19 | 20 | def build(self, input_shape): 21 | pass 22 | 23 | def compute_mask(self, input, input_mask=None): 24 | # do not pass the mask to the next layers 25 | return None 26 | 27 | def call(self, x, mask=None): 28 | return x 29 | 30 | def get_output_shape_for(self, input_shape): 31 | return input_shape 32 | -------------------------------------------------------------------------------- /FeatureProject/xlnet/tet_xlnet_keras_sim.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/7 20:27 4 | # @author :Mo 5 | # @function :test sentence of xlnet encode and cosin sim of two question 6 | 7 | 8 | def calculate_count(): 9 | """ 10 | 统计一下1000条测试数据的平均耗时 11 | :return: 12 | """ 13 | from FeatureProject.xlnet.extract_keras_xlnet_feature import KerasXlnetVector 14 | import time 15 | 16 | xlnet_vector = KerasXlnetVector() 17 | print("xlnet start ok!") 18 | time_start = time.time() 19 | for i in range(1000): 20 | vector = xlnet_vector.xlnet_encode(["yx,你知道吗,我很喜欢你呀,在一起在一起在一起,哈哈哈哈"]) 21 | 22 | time_end = time.time() 23 | time_avg = (time_end-time_start)/1000 24 | print(vector) 25 | print(time_avg) 26 | # 0.12605296468734742 win10 gpu avg 27 | # 0.01629048466682434 linux cpu avg 28 | 29 | 30 | def sim_two_question(): 31 | """测试一下两个问题的相似句子""" 32 | from FeatureProject.xlnet.extract_keras_xlnet_feature import KerasXlnetVector 33 | from sklearn import preprocessing 34 | from math import pi 35 | import numpy as np 36 | import time 37 | import math 38 | 39 | def cosine_distance(v1, v2): # 余弦距离 40 | if type(v1)==list: 41 | v1 = np.array(v1) 42 | if type(v2)==list: 43 | v2 = np.array(v2) 44 | if v1.all() and v2.all(): 45 | return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)) 46 | else: 47 | return 0 48 | 49 | def scale_zoom(rate): # sig 缩放 50 | zoom = (1 + np.exp(-float(rate))) / 2 51 | return zoom 52 | 53 | def scale_triangle(rate): # sin 缩放 54 | triangle = math.sin(rate/1*pi/2 - pi/2) 55 | return triangle 56 | 57 | xlnet_vector = KerasXlnetVector() 58 | print("xlnet start ok!") 59 | while True: 60 | print("input ques-1: ") 61 | ques_1 = input() 62 | print("input ques_2: ") 63 | ques_2 = input() 64 | vector_1 = xlnet_vector.xlnet_encode([ques_1]) 65 | vector_2 = xlnet_vector.xlnet_encode([ques_2]) 66 | sim = cosine_distance(vector_1[0], vector_2[0]) 67 | # sim_list = [sim, 0, 0.2, 0.4, 0.6, 0.8, 1.0] 68 | # sim = preprocessing.scale(sim_list)[0] 69 | # sim = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(sim_list)[0] 70 | # sim_1 = preprocessing.normalize(sim_list, norm='l1')[0] 71 | # sim_2 = preprocessing.normalize(sim_list, norm='l2')[0] 72 | # sim = scale_zoom(sim) 73 | # sim = scale_triangle(sim) 74 | # print(sim_1) 75 | # print(sim_2) 76 | 
print(sim) 77 | 78 | 79 | if __name__=="__main__": 80 | calculate_count() 81 | sim_two_question() 82 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 yongzhuo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Ner/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/21 15:23 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /Ner/bert/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/21 15:23 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /Ner/bert/args.py: -------------------------------------------------------------------------------- 1 | # bi-lstm 2 | return_sequences = True 3 | use_cudnn_cell = True 4 | use_lstm = True 5 | use_crf = True 6 | is_training = True 7 | 8 | loss = 'categorical_crossentropy' 9 | metrics = ['accuracy'] # 'crf_loss' # ['accuracy'] 10 | activation = 'relu' # 'relu' 11 | optimizers = 'adam' 12 | learning_rate = 1e-3 13 | epsilon = 1e-9 14 | embedding_dim = 768 15 | keep_prob = 0.5 16 | units = 256 17 | decay = 0.0 18 | label = 7 19 | l2 = 0.032 20 | 21 | epochs = 320 22 | batch_size = 16 23 | path_save_model = 'models/bilstm/bert_ner_bilstm_no_12_config.h5' 24 | path_tag_li = 'models/bilstm/tag_l_i.pkl' 25 | 26 | # gpu使用率 27 | gpu_memory_fraction = 0.32 28 | 29 | # ner当然是所有层都会提取啦,句向量默认取倒数第二层的输出值作为句向量 30 | layer_indexes = [i+1 for i in range(13)] # [-2] 31 | 32 | # 序列的最大程度 33 | max_seq_len = 50 34 | -------------------------------------------------------------------------------- /Ner/bert/keras_bert_embedding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/8 20:04 4 | # @author :Mo 5 | # @function :embedding of bert keras 6 | 7 | import os 8 | 9 | import keras.backend.tensorflow_backend as ktf_keras 10 | import tensorflow as tf 11 | from Ner.bert.keras_bert_layer import NonMaskingLayer 12 | from keras.layers import 
Add, Concatenate 13 | from keras.models import Model 14 | from keras_bert import load_trained_model_from_checkpoint 15 | 16 | from Ner.bert.args import gpu_memory_fraction, max_seq_len, layer_indexes 17 | from conf.feature_config import config_name, ckpt_name, vocab_file 18 | 19 | # 全局使用,使其可以django、flask、tornado等调用 20 | graph = None 21 | model = None 22 | 23 | # gpu配置与使用率设置 24 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 25 | config = tf.ConfigProto() 26 | config.gpu_options.per_process_gpu_memory_fraction = gpu_memory_fraction 27 | sess = tf.Session(config=config) 28 | ktf_keras.set_session(sess) 29 | 30 | class KerasBertEmbedding(): 31 | def __init__(self): 32 | self.config_path, self.checkpoint_path, self.dict_path, self.max_seq_len = config_name, ckpt_name, vocab_file, max_seq_len 33 | 34 | def bert_encode(self): 35 | # 全局使用,使其可以django、flask、tornado等调用 36 | global graph 37 | graph = tf.get_default_graph() 38 | global model 39 | model = load_trained_model_from_checkpoint(self.config_path, self.checkpoint_path, 40 | seq_len=self.max_seq_len) 41 | bert_layers = model.layers 42 | # return model 43 | print(bert_layers) 44 | print(model.output) 45 | print(len(model.layers)) 46 | # lay = model.layers 47 | #一共104个layer,其中前八层包括token,pos,embed等, 48 | # 每8层(MultiHeadAttention,Dropout,Add,LayerNormalization) 49 | # 一共12层+最开始未处理那层(可以理解为input) 50 | layer_dict = [7] 51 | layer_0 = 7 52 | for i in range(12): 53 | layer_0 = layer_0 + 8 54 | layer_dict.append(layer_0) 55 | 56 | # 输出它本身 57 | if len(layer_indexes) == 0: 58 | encoder_layer = model.output 59 | # 分类如果只有一层,就只取最后那一层的weight,取得不正确 60 | elif len(layer_indexes) == 1: 61 | if layer_indexes[0] in [i+1 for i in range(13)]: 62 | encoder_layer = model.get_layer(index=layer_dict[layer_indexes[0]]).output 63 | else: 64 | encoder_layer = model.get_layer(index=layer_dict[-1]).output 65 | # 否则遍历需要取的层,把所有层的weight取出来并拼接起来shape:768*层数 66 | else: 67 | # layer_indexes must be [1,2,3,......12] 68 | # all_layers = [model.get_layer(index=lay).output if lay is not 1 else model.get_layer(index=lay).output[0] for lay in layer_indexes] 69 | all_layers = [model.get_layer(index=layer_dict[lay]).output if lay in [i for i in range(13)] 70 | else model.get_layer(index=layer_dict[-1]).output #如果给出不正确,就默认输出最后一层 71 | for lay in layer_indexes] 72 | print(layer_indexes) 73 | print(all_layers) 74 | all_layers_select = [] 75 | for all_layers_one in all_layers: 76 | all_layers_select.append(all_layers_one) 77 | # encoder_layer = Add()(all_layers_select) 78 | encoder_layer = Concatenate(axis=-1)(all_layers_select) 79 | print(encoder_layer.shape) 80 | print("KerasBertEmbedding:") 81 | print(encoder_layer.shape) 82 | output = NonMaskingLayer()(encoder_layer) 83 | model = Model(model.inputs, output) 84 | # model.summary(120) 85 | return model.inputs, model.output 86 | 87 | 88 | if __name__ == "__main__": 89 | bert_vector = KerasBertEmbedding() 90 | pooled = bert_vector.bert_encode() 91 | -------------------------------------------------------------------------------- /Ner/bert/layer_crf_bojone.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/26 9:29 4 | # @author :Mo 5 | # @function : 6 | 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | 10 | from keras.layers import Layer 11 | import keras.backend as K 12 | 13 | 14 | class CRF(Layer): 15 | """ 16 | codes from: https://github.com/bojone/crf/blob/master/crf_keras.py 17 | 纯Keras实现CRF层 18 | 
CRF层本质上是一个带训练参数的loss计算层,因此CRF层只用来训练模型, 19 | 而预测则需要另外建立模型。 20 | """ 21 | 22 | def __init__(self, ignore_last_label=False, **kwargs): 23 | """ignore_last_label:定义要不要忽略最后一个标签,起到mask的效果 24 | """ 25 | self.ignore_last_label = 1 if ignore_last_label else 0 26 | super(CRF, self).__init__(**kwargs) 27 | 28 | def build(self, input_shape): 29 | self.num_labels = input_shape[-1] - self.ignore_last_label 30 | self.trans = self.add_weight(name='crf_trans', 31 | shape=(self.num_labels, self.num_labels), 32 | initializer='glorot_uniform', 33 | trainable=True) 34 | 35 | def log_norm_step(self, inputs, states): 36 | """递归计算归一化因子 37 | 要点:1、递归计算;2、用logsumexp避免溢出。 38 | 技巧:通过expand_dims来对齐张量。 39 | """ 40 | states = K.expand_dims(states[0], 2) # (batch_size, output_dim, 1) 41 | trans = K.expand_dims(self.trans, 0) # (1, output_dim, output_dim) 42 | output = K.logsumexp(states + trans, 1) # (batch_size, output_dim) 43 | return output + inputs, [output + inputs] 44 | 45 | def path_score(self, inputs, labels): 46 | """计算目标路径的相对概率(还没有归一化) 47 | 要点:逐标签得分,加上转移概率得分。 48 | 技巧:用“预测”点乘“目标”的方法抽取出目标路径的得分。 49 | """ 50 | point_score = K.sum(K.sum(inputs * labels, 2), 1, keepdims=True) # 逐标签得分 51 | labels1 = K.expand_dims(labels[:, :-1], 3) 52 | labels2 = K.expand_dims(labels[:, 1:], 2) 53 | labels = labels1 * labels2 # 两个错位labels,负责从转移矩阵中抽取目标转移得分 54 | trans = K.expand_dims(K.expand_dims(self.trans, 0), 0) 55 | trans_score = K.sum(K.sum(trans * labels, [2, 3]), 1, keepdims=True) 56 | return point_score + trans_score # 两部分得分之和 57 | 58 | def call(self, inputs): # CRF本身不改变输出,它只是一个loss 59 | return inputs 60 | 61 | def loss(self, y_true, y_pred): # 目标y_pred需要是one hot形式 62 | mask = 1 - y_true[:, 1:, -1] if self.ignore_last_label else None 63 | y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels] 64 | init_states = [y_pred[:, 0]] # 初始状态 65 | log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states, mask=mask) # 计算Z向量(对数) 66 | log_norm = K.logsumexp(log_norm, 1, keepdims=True) # 计算Z(对数) 67 | path_score = self.path_score(y_pred, y_true) # 计算分子(对数) 68 | return log_norm - path_score # 即log(分子/分母) 69 | 70 | def accuracy(self, y_true, y_pred): # 训练过程中显示逐帧准确率的函数,排除了mask的影响 71 | mask = 1 - y_true[:, :, -1] if self.ignore_last_label else None 72 | y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels] 73 | isequal = K.equal(K.argmax(y_true, 2), K.argmax(y_pred, 2)) 74 | isequal = K.cast(isequal, 'float32') 75 | if mask == None: 76 | return K.mean(isequal) 77 | else: 78 | return K.sum(isequal * mask) / K.sum(mask) 79 | -------------------------------------------------------------------------------- /Ner/bert/models/bilstm/useless.txt: -------------------------------------------------------------------------------- 1 | useless -------------------------------------------------------------------------------- /conf/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/3 11:23 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /conf/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/conf/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /conf/__pycache__/path_config.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/conf/__pycache__/path_config.cpython-36.pyc -------------------------------------------------------------------------------- /conf/augment_constant.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/9 23:02 4 | # @author :Mo 5 | # @function : 6 | 7 | 8 | # 语言缩写1,google 9 | language_short_zh = ['zh', 'zh_CN', 'zh_HK', 'zh_TW'] 10 | language_short_first = ['en', 'fr', 'de', 'es', 'be', 'it', 'ja', 'ar', 'nl', 'pt', 'bg', 'el', 'ca', 'iw', 'is', 'sh', 'ko', 'sv', 'sq', 'ru', 'no', 'fi', 'hr', 'ro', 'sr', 'pl', 'lt', 'th', 'mk', 'sk', 'et', 'da', 'hu', 'sl', 'tr', 'uk', 'lv', 'cs'] 11 | language_short_other = ['en', 'en_US', 'ar', 'ar_AE', 'ar_BH', 'ar_DZ', 'ar_EG', 'ar_IQ', 'ar_JO', 'ar_KW', 'ar_LB', 'ar_LY', 'ar_MA', 'ar_OM', 'ar_QA', 'ar_SA', 'ar_SD', 'ar_SY', 'ar_TN', 'ar_YE', 'be', 'be_BY', 'bg', 'bg_BG', 'bo_CN', 'ca', 'ca_ES', 'ca_ES_EURO', 'cs', 'cs_CZ', 'da', 'da_DK', 'de', 'de_AT', 'de_AT_EURO', 'de_CH', 'de_DE', 'de_DE_EURO', 'de_LU', 'de_LU_EURO', 'el', 'el_GR', 'en_AU', 'en_CA', 'en_GB', 'en_IE', 'en_IE_EURO', 'en_NZ', 'en_ZA', 'es', 'es_BO', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_ES_EURO', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'et', 'es_PA', 'es_PE', 'es_PR', 'es_PY', 'es_SV', 'es_UY', 'es_VE', 'et_EE', 'fi', 'fi_FI', 'fi_FI_EURO', 'fr', 'fr_BE', 'fr_BE_EURO', 'fr_CA', 'fr_CH', 'fr_FR', 'fr_FR_EURO', 'fr_LU', 'fr_LU_EURO', 'hr', 'hr_HR', 'hu', 'hu_HU', 'is', 'is_IS', 'it', 'it_CH', 'it_IT', 'it_IT_EURO', 'iw', 'iw_IL', 'ja', 'ja_JP', 'ko', 'ko_KR', 'lt', 'lt_LT', 'lv', 'lv_LV', 'mk', 'mk_MK', 'nl', 'nl_BE', 'nl_BE_EURO', 'nl_NL', 'nl_NL_EURO', 'no', 'no_NO', 'no_NO_NY', 'pl', 'pl_PL', 'pt', 'pt_BR', 'pt_PT', 'pt_PT_EURO', 'ro', 'ro_RO', 'ru', 'ru_RU', 'sh', 'sh_YU', 'sk', 'sk_SK', 'sl', 'sl_SI', 'sq', 'sq_AL', 'sr', 'sr_YU', 'sv', 'sv_SE', 'th', 'th_TH', 'tr', 'tr_TR', 'uk', 'uk_UA'] 12 | 13 | 14 | # 语言缩写,国内在线翻译 15 | language_short_google = ['en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'it', 'be', 'nl', 'bg', 'el', 'ca', 'iw', 'is', 'sh', 'sv', 'sq', 'no', 'fi', 'hr', 'ro', 'pl', 'lt', 'th', 'mk', 'sk', 'et', 'da', 'hu', 'sl', 'tr', 'uk', 'lv', 'cs', 'sr'] 16 | language_short_baidu = ['en', 'fra', 'ru', 'de', 'est', 'pt', 'ara', 'jp', 'kor', 'vie', 'yue', 'wyw', 'spa', 'th', 'it', 'el', 'nl', 'pl', 'bul', 'dan', 'fin', 'cs', 'rom', 'slo', 'swe', 'hu', 'cht'] 17 | language_short_youdao = ['en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'vi', 'id'] 18 | language_short_sougou = ['en', 'fr', 'ru', 'de', 'es', 'pt', 'ar', 'ja', 'ko', 'vi', 'id', 'it', 'et', 'bg', 'pl', 'bs-Latn', 'fa', 'mww', 'da', 'fi', 'tlh-Qaak', 'tlh', 'hr', 'otq', 'ca', 'cs', 'ro', 'lv', 'ht', 'lt', 'nl', 'ms', 'mt', 'sl', 'th', 'tr', 'sk', 'sw', 'af', 'no', 'uk', 'ur', 'el', 'hu', 'cy', 'yua', 'he', 'hi', 'sv', 'yue', 'fj', 'fil', 'sm', 'to', 'ty', 'mg', 'bn', 'sr-Latn', 'sr-Cyrl'] 19 | language_short_tencent = ['en', 'fr', 'ru', 'de', 'es', 'pt', 'jp', 'ko', 'vi', 'id', 'it', 'kr', 'tr', 'ms', 'th'] 20 | 21 | 22 | # 在线翻译账户密码,自己去注册吧 23 | app_key_google = "" 24 | app_secret_google = "" 25 | app_key_bing = "" 26 | app_secret_bing = "" 27 | app_key_baidu = "" 28 | app_secret_baidu = "" 29 | app_key_youdao = "" 30 | app_secret_youdao = "" 31 | app_key_sougou = "" 32 | app_secret_sougou = "" 33 | 
app_key_tencent = "" 34 | app_secret_tentcnet = "" -------------------------------------------------------------------------------- /conf/feature_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/5/10 9:13 4 | # @author :Mo 5 | # @function :path of FeatureProject 6 | import pathlib 7 | import sys 8 | import os 9 | 10 | 11 | # base dir 12 | projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent) 13 | sys.path.append(projectdir) 14 | 15 | 16 | # path of BERT model 17 | model_dir = projectdir + '/Data/chinese_L-12_H-768_A-12' 18 | config_name = model_dir + '/bert_config.json' 19 | ckpt_name = model_dir + '/bert_model.ckpt' 20 | vocab_file = model_dir + '/vocab.txt' 21 | # gpu使用率 22 | gpu_memory_fraction = 0.32 23 | # 默认取倒数第二层的输出值作为句向量 24 | layer_indexes = [-2] # 可填 1, 2, 3, 4, 5, 6, 7..., 13,其中1为embedding层 25 | # 序列的最大程度 26 | max_seq_len = 32 27 | -------------------------------------------------------------------------------- /conf/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "bidirectional": true, 3 | "use_residual": true, 4 | "use_dropout": true, 5 | "time_major": true, 6 | "cell_type": "lstm", 7 | "depth": 2, 8 | "attention_type": "Bahdanau", 9 | "hidden_units": 128, 10 | "optimizer": "adam", 11 | "learning_rate": 0.001, 12 | "embedding_size": 300 13 | } 14 | -------------------------------------------------------------------------------- /conf/path_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/3 11:23 4 | # @author :Mo 5 | # @function :path 6 | 7 | 8 | import pathlib 9 | import sys 10 | import os 11 | 12 | 13 | # base dir 14 | projectdir = str(pathlib.Path(os.path.abspath(__file__)).parent.parent) 15 | sys.path.append(projectdir) 16 | print(projectdir) 17 | 18 | # stop_words_path 19 | stop_words_path = projectdir + '/Data/common_words/stopwords.txt' 20 | 21 | # corpus 22 | chicken_and_gossip_path = projectdir + '/Data/corpus/chicken_and_gossip.txt' 23 | 24 | # word2vec 25 | w2v_model_merge_short_path = projectdir + "/Data/chinese_vector/w2v_model_merge_short.vec" 26 | 27 | # tf_idf 28 | td_idf_cut_path = projectdir + '/Data/tf_idf/td_idf_cut.csv' 29 | td_idf_cut_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin.csv' 30 | td_idf_path_pinyin = projectdir + '/Data/tf_idf/td_idf_cut_pinyin_dictionary_model.pkl' 31 | td_idf_path = projectdir + '/Data/tf_idf/td_idf_cut_dictionary_model.pkl' 32 | 33 | # word, 句向量 34 | w2v_model_wiki_word_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_word.vec' 35 | matrix_ques_part_path = projectdir + '/Data/sentence_vec_encode_word/1.txt' 36 | 37 | # char, 句向量 38 | w2v_model_char_path = projectdir + '/Data/chinese_vector/w2v_model_wiki_char.vec' 39 | matrix_ques_part_path_char = projectdir + '/Data/sentence_vec_encode_char/1.txt' 40 | 41 | # word2vec select 42 | word2_vec_path = w2v_model_wiki_word_path if os.path.exists(w2v_model_wiki_word_path) else w2v_model_merge_short_path 43 | 44 | # stanford_corenlp_full_path,需要自己下载配置stanford-corenlp-full-2018-10-05 45 | stanford_corenlp_full_path = "Y:/segment/stanford-corenlp-full-2018-10-05" 46 | 47 | # corpus webbank sim data char 48 | train_data_web_char_dir = projectdir + '/AugmentText/augment_seq2seq/data_mid/char/' 49 | train_data_web_ws_anti=projectdir + 
'/AugmentText/augment_seq2seq/data_mid/char/train_data_web_ws_anti.pkl' 50 | train_data_web_xy_anti=projectdir + '/AugmentText/augment_seq2seq/data_mid/char/train_data_web_xy_anti.pkl' 51 | model_ckpt_web_anti=projectdir + '/AugmentText/augment_seq2seq/model_seq2seq_tp/seq2seq_char_webank/model_ckpt_char_webank.ckp' 52 | path_params=projectdir + '/conf/params.json' 53 | path_webank_sim=projectdir + '/Data/corpus/sim_webank.csv' 54 | 55 | # corpus webbank sim data word 56 | train_data_web_word_dir = projectdir + '/AugmentText/augment_seq2seq/data_mid/word/' 57 | train_data_web_emb_anti=projectdir + '/AugmentText/augment_seq2seq/data_mid/word/train_data_web_emb_anti.pkl' 58 | train_data_web_xyw_anti=projectdir + '/AugmentText/augment_seq2seq/data_mid/word/train_data_web_ws_anti.pkl' 59 | model_ckpt_web_anti_word=projectdir + '/AugmentText/augment_seq2seq/model_seq2seq_tp/seq2seq_word_webank/train_data_web_ws_anti.pkl' 60 | 61 | # chatbot data char 62 | chatbot_data_cg_char_dir = projectdir + '/ChatBot/chatbot_generate/seq2seq/data_mid/char/' 63 | chatbot_data_cg_ws_anti=projectdir + '/ChatBot/chatbot_generate/seq2seq/data_mid/char/train_data_web_ws_anti.pkl' 64 | chatbot_data_cg_xy_anti=projectdir + '/ChatBot/chatbot_generate/seq2seq/data_mid/char/train_data_web_xy_anti.pkl' 65 | model_ckpt_cg_anti=projectdir + '/ChatBot/chatbot_generate/seq2seq/model_seq2seq_tp/seq2seq_char_cg/model_ckpt_char_cg.ckp' 66 | 67 | # chatbot data word 68 | chatbot_data_cg_word_dir = projectdir + '/ChatBot/chatbot_generate/seq2seq/data_mid/word/' 69 | chatbot_data_cg_xyw_anti_word=projectdir + '/ChatBot/chatbot_generate/seq2seq/data_mid/word/train_data_cg_word_xyw.pkl' 70 | chatbot_data_cg_emb_anti_word=projectdir + '/ChatBot/chatbot_generate/seq2seq/data_mid/word/train_data_cg_word_emb.pkl' 71 | model_ckpt_cg_anti_word=projectdir + '/ChatBot/chatbot_generate/seq2seq/model_seq2seq_tp/seq2seq_word_cg/model_ckpt_word_cg.ckp' 72 | 73 | # webank corpus for classify train-dev-test 74 | path_webank_train=projectdir + '/Data/corpus/webank/train.csv' 75 | path_webank_dev=projectdir + '/Data/corpus/webank/dev.csv' 76 | path_webank_test=projectdir + '/Data/corpus/webank/test.csv' 77 | 78 | # ner chinese_people_daily 79 | path_ner_people_train = projectdir + '/Data/corpus/ner/people_daily/people.train' 80 | path_ner_people_dev = projectdir + '/Data/corpus/ner/people_daily/people.dev' 81 | path_ner_people_test = projectdir + '/Data/corpus/ner/people_daily/people.test' 82 | 83 | -------------------------------------------------------------------------------- /python-version-time: -------------------------------------------------------------------------------- 1 | Python 3.3.2(May 15, 2013) 2 | Python 3.2.5(May 15, 2013) 3 | Python 3.1.5(April 10, 2012) 4 | Python 3.0.1(February 13, 2009) 5 | Python 2.7.5(May 15, 2013) 6 | Python 2.6.8(April 10, 2012) 7 | Python 2.5.6(May 26, 2011) 8 | Python 2.4.6(December 19, 2008) 9 | Python 2.3.7(March 11, 2008) 10 | Python 2.2.3(May 30, 2003) 11 | Python 2.1.3(April 8, 2002) 12 | Python 2.0.1(June 2001) 13 | Python 1.6.1(September 2000) 14 | Python 1.5.2(April 1999) 15 | Older releases:Source releases,binaries-1.1,binaries-1.2,binaries-1.3,binaries-1.4,binaries-1.5 16 | -------------------------------------------------------------------------------- /requestments.txt: -------------------------------------------------------------------------------- 1 | python-Levenshtein>=0.12.0 2 | fuzzywuzzy>=0.17.0 3 | openpyxl>=2.6.2 4 | pandas>=0.24.2 5 | xpinyin>=0.5.6 6 | numpy>=1.16.1 7 | 
gensim>=3.7.1 8 | pyemd>=0.5.1 9 | jieba>=0.39 10 | xlrd>=1.2.0 11 | tensorflow-gpu>=1.12.0 12 | keras-bert>=0.41.0 13 | keras-xlnet>=0.16.0 14 | keras>=2.2.4 15 | python>=3.6.2 16 | sklearn 17 | pathlib 18 | translate 19 | PyExecJS 20 | stanfordcorenlp 21 | -------------------------------------------------------------------------------- /result_test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/3 14:40 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /result_test/result_augment_seq2seq_char.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/result_test/result_augment_seq2seq_char.txt -------------------------------------------------------------------------------- /result_test/result_augment_seq2seq_word.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/result_test/result_augment_seq2seq_word.txt -------------------------------------------------------------------------------- /result_test/result_chatbot_fuzzy.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/result_test/result_chatbot_fuzzy.txt -------------------------------------------------------------------------------- /result_test/result_chatbot_sentence_vec_by_char.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/result_test/result_chatbot_sentence_vec_by_char.txt -------------------------------------------------------------------------------- /result_test/result_chatbot_sentence_vec_by_word.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/result_test/result_chatbot_sentence_vec_by_word.txt -------------------------------------------------------------------------------- /result_test/result_sentence_sim_feature.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yongzhuo/nlp_xiaojiang/729f8ee20d4ff9db9f8dfd75e745e8ca5ba7cee6/result_test/result_sentence_sim_feature.txt -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/6/13 10:17 4 | # @author :Mo 5 | # @function :setup of nlp_xiaojiang 6 | # @codes :copy from https://github.com/TianWenQAQ/Kashgari/blob/master/setup.py 7 | 8 | from setuptools import find_packages, setup 9 | import pathlib 10 | 11 | # Package meta-data. 
12 | NAME = 'nlp-xiaojiang' 13 | DESCRIPTION = 'nlp of augment、chatbot、classification and featureproject of chinese text' 14 | URL = 'https://github.com/yongzhuo/nlp_xiaojiang' 15 | EMAIL = '1903865025@qq.com' 16 | AUTHOR = 'yongzhuo' 17 | LICENSE = 'MIT' 18 | 19 | HERE = pathlib.Path(__file__).parent 20 | README = (HERE / "README.md").read_text() 21 | 22 | required = [ 23 | 'scikit-learn>=0.19.1', 24 | 'fuzzywuzzy>=0.17.0', 25 | 'openpyxl>=2.6.2', 26 | 'xpinyin>=0.5.6', 27 | 'gensim>=3.7.1', 28 | 'jieba>=0.39', 29 | 'xlrd>=1.2.0', 30 | 'tensorflow>=1.8.0', 31 | 'keras-bert>=0.41.0', 32 | 'Keras>=2.2.0', 33 | 'pandas>=0.23.0', 34 | 'h5py>=2.7.1', 35 | 'numpy>=1.16.1', 36 | 'pyemd==0.5.1', 37 | 'pathlib', 38 | 'translate', 39 | 'PyExecJS', 40 | 'stanfordcorenlp',] 41 | 42 | setup(name=NAME, 43 | version='0.0.1', 44 | description=DESCRIPTION, 45 | long_description=README, 46 | long_description_content_type="text/markdown", 47 | author=AUTHOR, 48 | author_email=EMAIL, 49 | url=URL, 50 | packages=find_packages(exclude=('tests')), 51 | install_requires=required, 52 | license=LICENSE, 53 | classifiers=['License :: OSI Approved :: MIT License', 54 | 'Programming Language :: Python :: 3.4', 55 | 'Programming Language :: Python :: 3.5', 56 | 'Programming Language :: Python :: 3.6', 57 | 'Programming Language :: Python :: 3.7', 58 | 'Programming Language :: Python :: 3.8', 59 | 'Programming Language :: Python :: Implementation :: CPython', 60 | 'Programming Language :: Python :: Implementation :: PyPy'],) 61 | 62 | 63 | if __name__ == "__main__": 64 | print("setup ok!") 65 | 66 | # 说明,项目工程目录这里nlp_xiaojiang,实际上,下边还要有一层nlp_xiangjiang,也就是说,nlp_xiangjiang和setup同一层 67 | # Data包里必须要有__init__.py,否则文件不会生成 68 | 69 | # step: 70 | # 打开cmd 71 | # 到达安装目录 72 | # python setup.py build 73 | # python setup.py install -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/3 15:15 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /utils/mode_util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 9:58 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /utils/mode_util/seq2seq/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/15 9:58 4 | # @author :Mo 5 | # @function : -------------------------------------------------------------------------------- /utils/mode_util/seq2seq/thread_generator.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code from : https://gist.github.com/everilae/9697228 3 | QHduan added __next__:https://github.com/qhduan/just_another_seq2seq 4 | """ 5 | 6 | # A simple generator wrapper, not sure if it's good for anything at all. 7 | # With basic python threading 8 | from threading import Thread 9 | from queue import Queue 10 | 11 | # ... 
or use multiprocessing versions 12 | # WARNING: use sentinel based on value, not identity 13 | # from multiprocessing import Process, Queue as MpQueue 14 | 15 | 16 | class ThreadedGenerator(object): 17 | """ 18 | Generator that runs on a separate thread, returning values to calling 19 | thread. Care must be taken that the iterator does not mutate any shared 20 | variables referenced in the calling thread. 21 | """ 22 | 23 | def __init__(self, iterator, 24 | sentinel=object(), 25 | queue_maxsize=0, 26 | daemon=False): 27 | self._iterator = iterator 28 | self._sentinel = sentinel 29 | self._queue = Queue(maxsize=queue_maxsize) 30 | self._thread = Thread( 31 | name=repr(iterator), 32 | target=self._run 33 | ) 34 | self._thread.daemon = daemon 35 | self._started = False 36 | 37 | def __repr__(self): 38 | return 'ThreadedGenerator({!r})'.format(self._iterator) 39 | 40 | def _run(self): 41 | try: 42 | for value in self._iterator: 43 | if not self._started: 44 | return 45 | self._queue.put(value) 46 | finally: 47 | self._queue.put(self._sentinel) 48 | 49 | def close(self): 50 | self._started = False 51 | try: 52 | while True: 53 | self._queue.get(timeout=0) 54 | except KeyboardInterrupt as e: 55 | raise e 56 | except: # pylint: disable=bare-except 57 | pass 58 | # self._thread.join() 59 | 60 | def __iter__(self): 61 | self._started = True 62 | self._thread.start() 63 | for value in iter(self._queue.get, self._sentinel): 64 | yield value 65 | self._thread.join() 66 | self._started = False 67 | 68 | def __next__(self): 69 | if not self._started: 70 | self._started = True 71 | self._thread.start() 72 | value = self._queue.get(timeout=30) 73 | if value == self._sentinel: 74 | raise StopIteration() 75 | return value 76 | 77 | 78 | def test(): 79 | """测试""" 80 | 81 | def gene(): 82 | i = 0 83 | while True: 84 | yield i 85 | i += 1 86 | t = gene() 87 | tt = ThreadedGenerator(t) 88 | for _ in range(10): 89 | print(next(tt)) 90 | tt.close() 91 | # for i in range(10): 92 | # print(next(tt)) 93 | 94 | # for t in ThreadedGenerator(range(10)): 95 | # print(t) 96 | # print('-' * 10) 97 | # 98 | # t = ThreadedGenerator(range(10)) 99 | # # def gene(): 100 | # # for t in range(10): 101 | # # yield t 102 | # # t = gene() 103 | # for _ in range(10): 104 | # print(next(t)) 105 | # print('-' * 10) 106 | 107 | 108 | 109 | if __name__ == '__main__': 110 | test() 111 | -------------------------------------------------------------------------------- /utils/mode_util/seq2seq/word_sequence.py: -------------------------------------------------------------------------------- 1 | """ 2 | WordSequence类 3 | Code from https://github.com/qhduan/just_another_seq2seq/blob/master/word_sequence.py 4 | 维护一个字典,把一个list(或者字符串)编码化,或者反向恢复 5 | 6 | """ 7 | 8 | 9 | import numpy as np 10 | 11 | 12 | class WordSequence(object): 13 | """一个可以把句子编码化(index)的类 14 | """ 15 | 16 | PAD_TAG = '' 17 | UNK_TAG = '' 18 | START_TAG = '' 19 | END_TAG = '' 20 | PAD = 0 21 | UNK = 1 22 | START = 2 23 | END = 3 24 | 25 | 26 | def __init__(self): 27 | """初始化基本的dict 28 | """ 29 | self.dict = { 30 | WordSequence.PAD_TAG: WordSequence.PAD, 31 | WordSequence.UNK_TAG: WordSequence.UNK, 32 | WordSequence.START_TAG: WordSequence.START, 33 | WordSequence.END_TAG: WordSequence.END, 34 | } 35 | self.fited = False 36 | 37 | 38 | def to_index(self, word): 39 | """把一个单字转换为index 40 | """ 41 | assert self.fited, 'WordSequence 尚未 fit' 42 | if word in self.dict: 43 | return self.dict[word] 44 | return WordSequence.UNK 45 | 46 | 47 | def to_word(self, index): 48 | 
"""把一个index转换为单字 49 | """ 50 | assert self.fited, 'WordSequence 尚未 fit' 51 | for k, v in self.dict.items(): 52 | if v == index: 53 | return k 54 | return WordSequence.UNK_TAG 55 | 56 | 57 | def size(self): 58 | """返回字典大小 59 | """ 60 | assert self.fited, 'WordSequence 尚未 fit' 61 | return len(self.dict) + 1 62 | 63 | def __len__(self): 64 | """返回字典大小 65 | """ 66 | return self.size() 67 | 68 | 69 | def fit(self, sentences, min_count=5, max_count=None, max_features=None): 70 | """训练 WordSequence 71 | Args: 72 | min_count 最小出现次数 73 | max_count 最大出现次数 74 | max_features 最大特征数 75 | 76 | ws = WordSequence() 77 | ws.fit([['hello', 'world']]) 78 | """ 79 | assert not self.fited, 'WordSequence 只能 fit 一次' 80 | 81 | count = {} 82 | for sentence in sentences: 83 | arr = list(sentence) 84 | for a in arr: 85 | if a not in count: 86 | count[a] = 0 87 | count[a] += 1 88 | 89 | if min_count is not None: 90 | count = {k: v for k, v in count.items() if v >= min_count} 91 | 92 | if max_count is not None: 93 | count = {k: v for k, v in count.items() if v <= max_count} 94 | 95 | self.dict = { 96 | WordSequence.PAD_TAG: WordSequence.PAD, 97 | WordSequence.UNK_TAG: WordSequence.UNK, 98 | WordSequence.START_TAG: WordSequence.START, 99 | WordSequence.END_TAG: WordSequence.END, 100 | } 101 | 102 | if isinstance(max_features, int): 103 | count = sorted(list(count.items()), key=lambda x: x[1]) 104 | if max_features is not None and len(count) > max_features: 105 | count = count[-int(max_features):] 106 | for w, _ in count: 107 | self.dict[w] = len(self.dict) 108 | else: 109 | for w in sorted(count.keys()): 110 | self.dict[w] = len(self.dict) 111 | 112 | self.fited = True 113 | 114 | 115 | def transform(self, 116 | sentence, max_len=None): 117 | """把句子转换为向量 118 | 例如输入 ['a', 'b', 'c'] 119 | 输出 [1, 2, 3] 这个数字是字典里的编号,顺序没有意义 120 | """ 121 | assert self.fited, 'WordSequence 尚未 fit' 122 | 123 | # if max_len is not None: 124 | # r = [self.PAD] * max_len 125 | # else: 126 | # r = [self.PAD] * len(sentence) 127 | 128 | if max_len is not None: 129 | r = [self.PAD] * max_len 130 | else: 131 | r = [self.PAD] * len(sentence) 132 | 133 | for index, a in enumerate(sentence): 134 | if max_len is not None and index >= len(r): 135 | break 136 | r[index] = self.to_index(a) 137 | 138 | return np.array(r) 139 | 140 | 141 | def inverse_transform(self, indices, 142 | ignore_pad=False, ignore_unk=False, 143 | ignore_start=False, ignore_end=False): 144 | """把向量转换为句子,和上面的相反 145 | """ 146 | ret = [] 147 | for i in indices: 148 | word = self.to_word(i) 149 | if word == WordSequence.PAD_TAG and ignore_pad: 150 | continue 151 | if word == WordSequence.UNK_TAG and ignore_unk: 152 | continue 153 | if word == WordSequence.START_TAG and ignore_start: 154 | continue 155 | if word == WordSequence.END_TAG and ignore_end: 156 | continue 157 | ret.append(word) 158 | 159 | return ret 160 | 161 | 162 | def test(): 163 | """测试 164 | """ 165 | ws = WordSequence() 166 | ws.fit([ 167 | ['第', '一', '句', '话'], 168 | ['第', '二', '句', '话'] 169 | ]) 170 | 171 | indice = ws.transform(['第', '三']) 172 | print(indice) 173 | 174 | back = ws.inverse_transform(indice) 175 | print(back) 176 | 177 | if __name__ == '__main__': 178 | test() 179 | -------------------------------------------------------------------------------- /utils/word2vec_vector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | # !/usr/bin/python 3 | # @time :2019/4/4 10:00 4 | # @author :Mo 5 | # @function : 6 | 7 | from __future__ import 
print_function 8 | from utils.text_tools import txtRead, txtWrite 9 | from gensim.models.word2vec import LineSentence 10 | from gensim.models import Word2Vec 11 | import multiprocessing 12 | import logging 13 | import sys 14 | import os 15 | 16 | def train_word2vec_by_word(): 17 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 18 | logging.root.setLevel(level=logging.INFO) 19 | logging.info("running") 20 | 21 | inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt" 22 | outp1 = "w2v_model_wiki.model" 23 | outp2 = "w2v_model_wiki_word.vec" 24 | model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count()) 25 | model.save(outp1) 26 | model.wv.save_word2vec_format(outp2, binary=False) 27 | 28 | def train_word2vec_by_char(): 29 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 30 | logging.root.setLevel(level=logging.INFO) 31 | logging.info("running") 32 | 33 | inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt" 34 | outp1 = "w2v_model_wiki.model" 35 | outp2 = "w2v_model_wiki_char.vec" 36 | model = Word2Vec(LineSentence(inp), size=300, window=5, min_count=5, workers=multiprocessing.cpu_count()) 37 | model.save(outp1) 38 | model.wv.save_word2vec_format(outp2, binary=False) 39 | 40 | 41 | if __name__ == '__main__': 42 | train_word2vec_by_word() 43 | # train_word2vec_by_char() 44 | 45 | # inp = "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse.txt" 46 | # sentences_char = [] 47 | # sentences = txtRead(inp) 48 | # for sentences_one in sentences: 49 | # sentences_one_replace = sentences_one.strip().replace(" ", "") 50 | # sentences_one_replace_all = [] 51 | # for sentences_one_replace_one in sentences_one_replace: 52 | # sentences_one_replace_all.append(sentences_one_replace_one) 53 | # sentences_char.append(" ".join(sentences_one_replace_all) + "\n") 54 | # txtWrite(sentences_char, "Y:/BaiduNetdiskDownload/cut_zhwiki_wiki_parse/cut_zhwiki_wiki_parse_char.txt") 55 | # gg = 0 --------------------------------------------------------------------------------
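train_word2vec_by_word above exports the vectors in plain-text word2vec format (binary=False), so they can be reloaded later with gensim's KeyedVectors. Below is a minimal usage sketch that loads such a .vec file and averages in-vocabulary word vectors into a single vector; it is not part of the repository, the file path and example words are placeholders, and dim=300 simply matches the training size used in the script above.

# loading the exported vectors -- standalone sketch, not a file in this repository
import numpy as np
from gensim.models import KeyedVectors

def load_vectors(path):
    # the training script saves with binary=False, i.e. plain-text word2vec format
    return KeyedVectors.load_word2vec_format(path, binary=False)

def sentence_vector(kv, words, dim=300):
    """Average the vectors of in-vocabulary words; dim must match the training dimension."""
    vecs = [kv[w] for w in words if w in kv]
    if not vecs:
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vecs, axis=0)

if __name__ == "__main__":
    kv = load_vectors("w2v_model_wiki_word.vec")      # placeholder path to the exported file
    print(sentence_vector(kv, ["你好", "世界"])[:5])   # first few dimensions of the averaged vector

This averaging is only an illustration of consuming the exported vectors, not a description of how the project itself builds sentence vectors.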