├── README.md ├── __pycache__ ├── baidu_search.cpython-37.pyc ├── config.cpython-36.pyc ├── config.cpython-37.pyc ├── create_wordcloud.cpython-37.pyc ├── exe_01.cpython-36.pyc └── exe_01.cpython-37.pyc ├── app.py ├── frontend └── link.txt ├── image ├── Matrix Admin 运行演示.png ├── 导航菜单.png ├── 文本关键信息提取1.png ├── 文本关键信息提取2.png ├── 文本分类.gif ├── 文本生成.png ├── 新词挖掘.png ├── 用户画像分析.png ├── 用户评价情感分析.gif ├── 竞品分析.gif ├── 自动生成词云.gif ├── 项目文件目录结构.png └── 首页.png └── src ├── __pycache__ ├── __init__.cpython-36.pyc ├── config.cpython-36.pyc ├── config.cpython-37.pyc ├── exe_01.cpython-36.pyc ├── exe_01.cpython-37.pyc ├── exe_02.cpython-36.pyc ├── exe_02.cpython-37.pyc ├── exe_03.cpython-36.pyc ├── exe_05.cpython-36.pyc ├── exe_06.cpython-36.pyc └── exe_06.cpython-37.pyc ├── background ├── china.jpg ├── oval.png └── profile.png ├── config.py ├── data ├── ProductData.csv ├── README.md ├── UserReviewData.csv ├── data_out_proucts_details.xls ├── glove.6B.100d.txt ├── idf.txt ├── new_proucts_details.csv ├── new_proucts_details.xls ├── new_users_comments.csv ├── new_users_comments.xls ├── save_article.txt ├── simhei.ttf ├── stop_words_ch.txt ├── stopwords.txt ├── text.txt ├── token_vector.bin └── users_comments.xls ├── exe ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── exe_01.cpython-36.pyc │ ├── exe_01.cpython-37.pyc │ ├── exe_02.cpython-36.pyc │ ├── exe_02.cpython-37.pyc │ ├── exe_03.cpython-36.pyc │ ├── exe_03.cpython-37.pyc │ ├── exe_05.cpython-36.pyc │ └── exe_06.cpython-36.pyc ├── exe_01.py ├── exe_02.py ├── exe_03.py ├── exe_05.py ├── exe_06.py ├── key_info_extraction │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-37.pyc │ │ ├── abstract_textrank.cpython-36.pyc │ │ ├── abstract_textrank.cpython-37.pyc │ │ ├── compute_keywords_tfidf.cpython-36.pyc │ │ ├── create_wordcloud.cpython-36.pyc │ │ ├── keywords_textrank.cpython-36.pyc │ │ ├── keywords_textrank.cpython-37.pyc │ │ ├── sentence_similarity.cpython-36.pyc │ │ ├── sentence_similarity.cpython-37.pyc │ │ ├── textrank.cpython-36.pyc │ │ ├── textrank.cpython-37.pyc │ │ └── topic_cluster_lda.cpython-36.pyc │ ├── abstract_textrank.py │ ├── compute_keywords_tfidf.py │ ├── create_wordcloud.py │ ├── keywords_textrank.py │ ├── sentence_similarity.py │ ├── textrank.py │ └── topic_cluster_lda.py ├── review_sentiment │ ├── business.py │ ├── main.py │ ├── model_training.py │ ├── sentence.py │ └── stopwords.txt ├── sentiment_analysis │ ├── __init__.py │ ├── __pycache__ │ │ ├── bert_embedding_extend.cpython-36.pyc │ │ ├── bert_embedding_extend.cpython-37.pyc │ │ ├── embedding_manager_cyd.cpython-36.pyc │ │ ├── embedding_manager_cyd.cpython-37.pyc │ │ ├── glove_embedding.cpython-36.pyc │ │ ├── glove_embedding.cpython-37.pyc │ │ ├── review_sentiment_analysis.cpython-36.pyc │ │ ├── review_sentiment_analysis.cpython-37.pyc │ │ ├── sentiment_model.cpython-36.pyc │ │ ├── sentiment_model.cpython-37.pyc │ │ ├── utils.cpython-36.pyc │ │ └── utils.cpython-37.pyc │ ├── bert_embedding_extend.py │ ├── embedding_manager_cyd.py │ ├── glove_embedding.py │ ├── review_sentiment_analysis.py │ ├── sentiment_model.py │ └── utils.py └── worddiscovery │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-37.pyc │ ├── entropy_based.cpython-36.pyc │ ├── trie.cpython-36.pyc │ └── trie.cpython-37.pyc │ ├── entropy_based.py │ ├── test.txt │ └── trie.py ├── image └── wordcloud_62068.png ├── model ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── 
abstract_textrank.cpython-36.pyc │ ├── bert_embedding_extend.cpython-36.pyc │ ├── compute_keywords_tfidf.cpython-36.pyc │ ├── config.cpython-36.pyc │ ├── create_wordcloud.cpython-36.pyc │ ├── embedding_manager_cyd.cpython-36.pyc │ ├── glove_embedding.cpython-36.pyc │ ├── keywords_textrank.cpython-36.pyc │ ├── review_sentiment_analysis.cpython-36.pyc │ ├── sentence_similarity.cpython-36.pyc │ ├── sentiment_model.cpython-36.pyc │ ├── textrank.cpython-36.pyc │ ├── topic_cluster_lda.cpython-36.pyc │ └── utils.cpython-36.pyc ├── logistic_reg_clf_model.pkl ├── svm_clf.pkl └── svm_clf_model.pkl ├── save ├── keyinfo_from_input_file.txt ├── keyinfo_from_input_text.txt ├── keyinfo_from_url.txt ├── keyinfo_input_file.txt ├── keyinfo_input_text.txt ├── keyinfo_input_url.txt ├── new_word_discovery_input_file.txt ├── new_word_discovery_output.txt ├── review_summary.txt ├── save_article.txt ├── test_article.txt ├── testtext.txt ├── text.txt ├── topic_input_file.txt ├── topic_keywords_dist.txt ├── user_input_product_id_name.txt ├── userfile.txt ├── usertext.txt ├── userurl.txt ├── wordcloud_from_input_file.txt ├── wordcloud_from_input_text.txt └── wordcloud_from_url.txt ├── static.zip ├── templates.zip └── utils └── data_prepare.py /README.md: --------------------------------------------------------------------------------
1 | # NLP Visualization System
2 | 
3 | This project integrates, in a single web application, visual analysis of the product and user data crawled from Capterra, key-information extraction from text, and user sentiment analysis. The Flask web framework is combined with the pyecharts visualization library to display the data dynamically.
4 | 
5 | ## 1. Environment
6 | 
7 | Language: Python 3
8 | 
9 | Editor: Anaconda (Spyder)
10 | 
11 | Web framework: Flask
12 | 
13 | Data visualization: Pyecharts
14 | 
15 | ## 2. Source Structure
16 | 
17 | The project is developed with separate front end and back end. The complete source structure is shown in the figure below:
18 | 
19 | ![项目文件目录结构](./image/项目文件目录结构.png)
20 | 
21 | Under the NLPVisualizationSystem project, the main files and folders are: app.py, src (exe, save, utils, data, image, background, static and templates), and frontend.
22 | 
23 | - app.py runs the whole project and serves it,
24 | - exe holds the business-logic modules,
25 | - data holds the data-access scripts and the crawled data,
26 | - image holds the images used by the project,
27 | - background holds the default background images for the word clouds,
28 | - static is the static-resource directory,
29 | - save stores the input data submitted from the front-end pages,
30 | - templates is the page-template directory,
31 | - model holds the models trained by the project modules,
32 | - utils holds the data-preprocessing helpers,
33 | - frontend holds the front-end theme template used by the project.
34 | 
35 | ## 3. Installation and Running
36 | 
37 | Python packages required before running:
38 | 
39 | | Package | Purpose | Version |
40 | | ----------------- | -------------------------------- | ------ |
41 | | pandas | structured data analysis | 0.24.2 |
42 | | numpy | scientific computing | 1.19.4 |
43 | | Flask | web framework | 1.0.2 |
44 | | pyecharts | data visualization | 1.8.1 |
45 | | werkzeug | WSGI web application toolkit | 0.14.1 |
46 | | collections | Python standard library, common data structures | built-in |
47 | | newspaper3k | crawler library, well suited to news pages | 0.2.8 |
48 | | imageio | image I/O | installed via pip (not built-in) |
49 | | snapshot_selenium | renders charts to images | 0.0.2 |
50 | | jieba | Chinese word segmentation | 0.39 |
51 | | re | regular expressions | built-in |
52 | | random | pseudo-random number generation | built-in |
53 | | gensim | NLP toolkit | 3.7.3 |
54 | | bert_embedding | pre-trained BERT models and word representations | 1.0.1 |
55 | | nltk | NLP toolkit | 3.4 |
56 | | mxnet | MXNet deep-learning framework | 1.4.0 |
57 | | sklearn | machine-learning toolkit (scikit-learn) | 0.19.2 |
58 | 
59 | 
60 | To run:
61 | 
62 | 1. First unzip the static and templates archives inside the src folder.
63 | 2. Open src/data/glove.6B.100d.txt, download the real glove.6B.100d.txt from the link inside it, and replace the placeholder file with it.
64 | 3. Run app.py directly with `python app.py`.
65 | 
66 | This starts a Flask application on the local server. Open http://127.0.0.1:5000/ in a browser to see the NLP visualization system; a screenshot is shown below:
67 | 
68 | ![首页](./image/首页.png)
69 | 
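app.py itself is not included in this dump, so as a quick orientation here is a minimal sketch of how a Flask route can serve a chart built with pyecharts, in the same style as the chart-building functions in src/exe/exe_01.py. The route path and the sample data below are illustrative assumptions, not the project's actual ones.

```python
# Minimal sketch (assumed route and data): serving a pyecharts chart from Flask.
from flask import Flask
from pyecharts.charts import Bar
from pyecharts import options as opts

app = Flask(__name__)

@app.route("/demo/chart")
def demo_chart():
    # Build a small Bar chart the same way the *_base() functions in exe_01.py do.
    bar = (
        Bar()
        .add_xaxis(["Cloud", "On-premise"])
        .add_yaxis("Deployment", [62.5, 37.5])
        .set_global_opts(title_opts=opts.TitleOpts(title="Demo chart"))
    )
    # render_embed() returns the chart as an HTML fragment the browser can display.
    return bar.render_embed()

if __name__ == "__main__":
    app.run(host="127.0.0.1", port=5000)
```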
70 | ## 4. Front-End Page Design
71 | 
72 | ### 4.1 Theme Template Selection
73 | 
74 | This project uses the free open-source edition of the Bootstrap theme template [Matrix Admin](https://www.matrixadmin.wrappixel.com/). Matrix Admin comes in an open-source edition and a commercial edition; the open-source edition can be downloaded from [http://matrixadmin.wrappixel.com/matrix-admin-package-full.zip](http://matrixadmin.wrappixel.com/matrix-admin-package-full.zip). The download is matrix-admin-package-full.zip; unzipping it step by step yields the matrix-admin-bt4 folder.
75 | 
76 | Unzip order:
77 | 
78 | | Step | Archive | After unzipping |
79 | | ---- | ----------------------------- | ------------------------------------------------------------ |
80 | | 1 | matrix-admin-package-full.zip | matrix-admin-package-full ( matrix-admin-package.zip, matriz-admin-old.zip ) |
81 | | 2 | matrix-admin-package.zip | matrix-admin-package ( matrix-admin-bt4.zip, matriz-admin-old.zip ) |
82 | | 3 | matrix-admin-bt4.zip | matrix-admin-bt4 ( assets, dist, html ) |
83 | 
84 | The Matrix Admin directory contains three folders: assets, dist and html.
85 | 
86 | - assets holds the third-party dependencies,
87 | - dist holds the page resource files,
88 | - html holds the sample pages.
89 | 
90 | ### 4.2 Navigation Menu Design
91 | 
92 | ![导航菜单](./image/导航菜单.png)
93 | 
94 | The navigation bar currently has seven sections: DashBoard, competitor analysis, automatic word-cloud generation, text key-information extraction, text classification, user review sentiment analysis, and user profiling.
95 | 
96 | - DashBoard: lists a few real-time data indicators.
97 | - Competitor analysis: competitive analysis of the EDC products on the market, with data analysis and visualization of product pricing, deployment, features, training and other aspects.
98 | - Automatic word-cloud generation: users can generate a word cloud in three ways: from text collected at a URL they enter; from text content they type in; or from a text file they upload.
99 | - Text key-information extraction: single-document analysis, with TextRank-based summary extraction and keyword extraction; multi-document analysis, with LDA-based exploration of topic distributions across documents.
100 | - Text classification: text-classification tasks such as book classification and sentiment classification.
101 | - User review sentiment analysis: crawls users' review data for the products, extracts product-related aspects from the review text, and summarizes positive and negative opinions.
102 | - User profiling: collects data on the users of EDC products and analyzes and visualizes it by user industry, company size, occupation and so on.
103 | 
104 | 
105 | 
106 | 
107 | 
108 | ## 5. Back-End Application Design
109 | 
110 | ### 5.1 Service Interface Design
111 | 
112 | The service interfaces include **page request endpoints** and **data request endpoints**:
113 | 
114 | - a **page request endpoint** is the address the browser uses to access the corresponding page,
115 | - a **data request endpoint** is the address from which a chart object requests its data.
116 | 
117 | ### 5.2 Exception Handling
118 | 
119 | Error-prompt pages are designed for common errors and exceptions. The templates folder holds the prepared error-page templates: error-403.html, error-404.html, error-405.html and error-500.html.
120 | 
121 | 
122 | 
123 | 
124 | 
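Since app.py is not part of this dump, the split described in 5.1 between page requests and data requests, together with the custom error pages of 5.2, can be sketched in Flask roughly as follows. The route paths and the competitor.html template name are illustrative assumptions; only the error-page template names come from the project.

```python
# Sketch only (assumed routes): one page endpoint, one data endpoint, one error handler.
from flask import Flask, render_template, jsonify

app = Flask(__name__)

@app.route("/competitor")
def competitor_page():
    # Page request endpoint: the browser opens this URL and receives the HTML page.
    return render_template("competitor.html")

@app.route("/competitor/data")
def competitor_data():
    # Data request endpoint: the chart object on the page fetches its data here.
    return jsonify({"x": ["Cloud", "On-premise"], "y": [62.5, 37.5]})

@app.errorhandler(404)
def page_not_found(error):
    # Custom error page, rendered from error-404.html in the templates folder.
    return render_template("error-404.html"), 404
```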
125 | ## 6. Demo
126 | 
127 | ### 6.0 DashBoard
128 | 
129 | ![首页](./image/首页.png)
130 | 
131 | ### 6.1 Competitor Analysis
132 | 
133 | ![竞品分析](./image/竞品分析.gif)
134 | 
135 | ### 6.2 Text Preprocessing
136 | 
137 | #### 6.2.1 Automatic Word-Cloud Generation
138 | 
139 | ![自动生成词云](./image/自动生成词云.gif)
140 | 
141 | #### 6.2.2 Single-Document Analysis: Keyword Extraction
142 | ![1](./image/文本关键信息提取1.png)
143 | 
144 | #### 6.2.3 Multi-Document Analysis: Topic Analysis
145 | 
146 | ![2](./image/文本关键信息提取2.png)
147 | 
148 | #### 6.2.4 New Word Discovery
149 | 
150 | ![新词挖掘](./image/新词挖掘.png)
151 | 
152 | #### 6.2.5 Text Data Augmentation
153 | 
154 | 
155 | 
156 | ### 6.3 Text Classification
157 | 
158 | ![文本分类](./image/文本分类.gif)
159 | 
160 | 
161 | 
162 | ### 6.4 Text Generation
163 | 
164 | ![文本生成](./image/文本生成.png)
165 | 
166 | ### 6.5 User Analysis
167 | 
168 | #### 6.5.1 User Profiling
169 | 
170 | ![用户画像分析](./image/用户画像分析.png)
171 | 
172 | #### 6.5.2 User Review Sentiment Analysis
173 | 
174 | ![用户评价情感分析](./image/用户评价情感分析.gif)
175 | 
176 | 
177 | 
178 | 
-------------------------------------------------------------------------------- /__pycache__/baidu_search.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/baidu_search.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/create_wordcloud.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/create_wordcloud.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/exe_01.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/exe_01.cpython-36.pyc -------------------------------------------------------------------------------- /__pycache__/exe_01.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/exe_01.cpython-37.pyc -------------------------------------------------------------------------------- /frontend/link.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/frontend/link.txt -------------------------------------------------------------------------------- /image/Matrix Admin 运行演示.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/Matrix Admin 运行演示.png -------------------------------------------------------------------------------- /image/导航菜单.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/导航菜单.png -------------------------------------------------------------------------------- /image/文本关键信息提取1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/文本关键信息提取1.png -------------------------------------------------------------------------------- /image/文本关键信息提取2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/文本关键信息提取2.png -------------------------------------------------------------------------------- /image/文本分类.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/文本分类.gif -------------------------------------------------------------------------------- /image/文本生成.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/文本生成.png -------------------------------------------------------------------------------- /image/新词挖掘.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/新词挖掘.png -------------------------------------------------------------------------------- /image/用户画像分析.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/用户画像分析.png -------------------------------------------------------------------------------- /image/用户评价情感分析.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/用户评价情感分析.gif -------------------------------------------------------------------------------- /image/竞品分析.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/竞品分析.gif -------------------------------------------------------------------------------- /image/自动生成词云.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/自动生成词云.gif -------------------------------------------------------------------------------- /image/项目文件目录结构.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/项目文件目录结构.png 
-------------------------------------------------------------------------------- /image/首页.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/首页.png -------------------------------------------------------------------------------- /src/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/exe_01.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_01.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/exe_01.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_01.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/exe_02.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_02.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/exe_02.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_02.cpython-37.pyc -------------------------------------------------------------------------------- /src/__pycache__/exe_03.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_03.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/exe_05.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_05.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/exe_06.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_06.cpython-36.pyc -------------------------------------------------------------------------------- /src/__pycache__/exe_06.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_06.cpython-37.pyc -------------------------------------------------------------------------------- /src/background/china.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/background/china.jpg -------------------------------------------------------------------------------- /src/background/oval.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/background/oval.png -------------------------------------------------------------------------------- /src/background/profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/background/profile.png -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Oct 29 15:58:11 2020 4 | 5 | @author: Xu 6 | """ 7 | import os 8 | import sys 9 | 10 | curPath = os.path.abspath(os.path.dirname(__file__)) 11 | rootPath = os.path.split(curPath)[0] 12 | sys.path.append(curPath) 13 | sys.path.append(rootPath) 14 | 15 | # main directory 16 | 17 | # curPath = r'D:\Github\NLPVisualizationSystem' 18 | data_dir = os.path.join(curPath, 'data') 19 | image_dir = os.path.join(curPath, 'image') 20 | template_dir = os.path.join(curPath, 'templates') 21 | background_dir = os.path.join(curPath, "background") 22 | static_dir = os.path.join(curPath, "static") 23 | run_dir = os.path.join(curPath, "run") 24 | html_dir = os.path.join(curPath,"html") 25 | save_dir = os.path.join(curPath, "save") 26 | model_dir = os.path.join(curPath,'model') 27 | 28 | 29 | 30 | # data path 31 | user_data_path = os.path.join(data_dir, 'new_users_comments.csv') 32 | product_data_path = os.path.join(data_dir, "new_proucts_details.csv") 33 | user_data_path_1 = os.path.join(data_dir, 'users_comments.xls') 34 | product_data_path_2 = os.path.join(data_dir, "data_out_proucts_details.xls") 35 | #stopwords_path = os.path.join(data_dir, "stopwords.txt") 36 | bg_pic = os.path.join(background_dir, "oval.png") 37 | 38 | # WordCloud Generation 39 | # save user input data 40 | wc_input_url_path = os.path.join(save_dir, 'userurl.txt') 41 | wc_input_text_path = os.path.join(save_dir, 'usertext.txt') 42 | wc_input_file_save_path = os.path.join(save_dir, 'userfile.txt') 43 | # save wordcloud picture 44 | pic_wc_input_url_save_path = os.path.join(save_dir, 'wordcloud_from_url.txt') 45 | pic_wc_input_text_save_path = os.path.join(save_dir, 'wordcloud_from_input_text.txt') 46 | pic_wc_input_file_save_path = os.path.join(save_dir, 'wordcloud_from_input_file.txt') 47 | 48 | 49 | 
50 | # Text Extraction: configuration 51 | # KeyInfo Extraction 52 | # data path 53 | idf_path = os.path.join(data_dir, 'idf.txt') 54 | token_vector_path = os.path.join(data_dir, 'token_vector.bin') 55 | # save user input data 56 | keyinfo_input_url_path = os.path.join(save_dir, 'keyinfo_input_url.txt') 57 | keyinfo_input_text_path = os.path.join(save_dir, 'keyinfo_input_text.txt') 58 | keyinfo_input_file_save_path = os.path.join(save_dir, 'keyinfo_input_file.txt') 59 | # download text extraction result 60 | download_keyinfo_input_url_save_path = os.path.join(save_dir, 'keyinfo_from_url.txt') 61 | download_keyinfo_input_text_save_path = os.path.join(save_dir, 'keyinfo_from_input_text.txt') 62 | download_keyinfo_input_file_save_path = os.path.join(save_dir, 'keyinfo_from_input_file.txt') 63 | 64 | # Topic CLuster 65 | # data path 66 | StopWords_path = os.path.join(data_dir, "stop_words_ch.txt") 67 | # save user input data 68 | topic_input_file_save_path = os.path.join(save_dir, 'topic_input_file.txt') 69 | # download topic keywords result 70 | download_topic_input_file_save_path = os.path.join(save_dir, 'topic_keywords_dist.txt') 71 | 72 | # New Word Discovery 73 | # save user input data 74 | new_word_discovery_file_save_path = os.path.join(save_dir, 'new_word_discovery_input_file.txt') 75 | # download new words result 76 | download_new_word_output_file_save_path = os.path.join(save_dir, 'new_word_discovery_output.txt') 77 | 78 | 79 | 80 | # Review Sentiment Analysis 81 | # data path 82 | en_stopwords_path = os.path.join(data_dir, 'stopwords.txt') 83 | review_data_path = os.path.join(data_dir, 'UserReviewData.csv') 84 | business_data_path = os.path.join(data_dir, 'ProductData.csv') 85 | glove_embedding_path = os.path.join(data_dir, 'glove.6B.100d.txt') 86 | 87 | # model path 88 | svm_model_save_path = os.path.join(model_dir, 'svm_clf_model.pkl') 89 | lr_model_save_path = os.path.join(model_dir, 'logistic_reg_clf_model.pkl') 90 | 91 | # review summary save path 92 | user_input_id_name_path = os.path.join(save_dir, 'user_input_product_id_name.txt') 93 | review_summary_save_path = os.path.join(save_dir, 'review_summary.txt') 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /src/data/ProductData.csv: -------------------------------------------------------------------------------- 1 | product_name,review_count,product_id 2 | ABBYY FlexiCapture,25.0,8 3 | Castor EDC,132.0,1 4 | Clinical Studio,41.0,3 5 | Data+ Platform,29.0,4 6 | DataFax,8.0,18 7 | DataLabs,4.0,25 8 | Debut,5.0,21 9 | DocuPhase,146.0,5 10 | Ephesoft Transact,67.0,6 11 | Flex Databases,6.0,16 12 | Formation,10.0,12 13 | Grooper,12.0,15 14 | Kofax Kapow,8.0,22 15 | MPS IntelliVector,4.0,24 16 | MainEDC,6.0,23 17 | OnlineCRF,5.0,2 18 | Poimapper,18.0,9 19 | SMART-TRIAL,23.0,11 20 | Square 9 GlobalCapture,43.0,7 21 | Ultimate Forms,7.0,20 22 | VKS,14.0,14 23 | Viedoc,50.0,0 24 | iCapture,26.0,10 25 | iMedNet EDC,8.0,19 26 | naturalFORMS,11.0,17 27 | Aetiol,0.0,13 28 | BizEfficiency,0.0,26 29 | Clinion EDC & CDM,0.0,27 30 | Clipboard To Contact - Turn Text Into Contacts,0.0,28 31 | CTMS,0.0,29 32 | Datacap Taskmaster Capture,0.0,30 33 | Dharma,0.0,31 34 | eCaseLink,0.0,32 35 | ExamineYou,0.0,33 36 | AcuFill,0.0,34 37 | LabChart Pro,0.0,35 38 | OnCore,0.0,36 39 | QureClinical,0.0,37 40 | CareRecord,0.0,38 41 | Timaeus,0.0,39 42 | AcqKnowledge,0.0,40 43 | ActiView software,0.0,41 44 | agCapture,0.0,42 45 | ALPHADAS,0.0,43 46 | 
Appliance,0.0,44 47 | AQ2 Remittance,0.0,45 48 | Ascerteon,0.0,46 49 | BioClinica Express,0.0,47 50 | BSCAN Capture,0.0,48 51 | Captricity,0.0,49 52 | Clear Clinica,0.0,50 53 | clincase,0.0,51 54 | ClinicalAnalytics,0.0,52 55 | DADOS,0.0,53 56 | Data Scan,0.0,54 57 | Data-Scan,0.0,55 58 | DDi-mEDC,0.0,56 59 | DealMatrix,0.0,57 60 | Digitalis Clinical Data Collection,0.0,58 61 | Docsumo,0.0,59 62 | EDC Made Easy,0.0,60 63 | Entrypoint i4,0.0,61 64 | eplansoft REVIEW,0.0,62 65 | eResearch,0.0,63 66 | FileStore EDM,0.0,64 67 | FormFoundry,0.0,65 68 | Fusion eClinical Suite,0.0,66 69 | Gather,0.0,67 70 | GoResearch,0.0,68 71 | i-CDMS,0.0,69 72 | idtPlans Review,0.0,70 73 | Improve,0.0,71 74 | INKWRX,0.0,72 75 | Intelligent Data Capture,0.0,73 76 | iQapture,0.0,74 77 | ixtract,0.0,75 78 | KnowledgeLake Capture,0.0,76 79 | Magpi,0.0,77 80 | MailThis.to,0.0,78 81 | MATRIX EDC/IWRS,0.0,79 82 | Med-Quest,0.0,80 83 | MedSciNet Builder,0.0,81 84 | MetricWire,0.0,82 85 | MyoResearch XP Master,0.0,83 86 | Net Station,0.0,84 87 | OCR for AnyDoc,0.0,85 88 | PaperLess,0.0,86 89 | PaperSurvey,0.0,87 90 | PhysioQ,0.0,88 91 | Protocol First,0.0,89 92 | Returnable Forms,0.0,90 93 | Scout,0.0,91 94 | secuTrial,0.0,92 95 | SymmetricDS,0.0,93 96 | Teamscope,0.0,94 97 | The Data Center,0.0,95 98 | Thread Learning,0.0,96 99 | TWin PSG,0.0,97 100 | VISION EDC/CTMS,0.0,98 101 | -------------------------------------------------------------------------------- /src/data/README.md: -------------------------------------------------------------------------------- 1 | 把数据解压到data文件夹里 2 | 3 | 下载地址 4 | https://pan.baidu.com/s/1hSFBjQHLhYDw9jPBDNH6pw&shfl=sharepset 5 | 6 | 将glove.6B.100d加到data文件夹下。 -------------------------------------------------------------------------------- /src/data/data_out_proucts_details.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/data_out_proucts_details.xls -------------------------------------------------------------------------------- /src/data/glove.6B.100d.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/glove.6B.100d.txt -------------------------------------------------------------------------------- /src/data/new_proucts_details.xls: -------------------------------------------------------------------------------- 1 | product_name,review_count,product_id 2 | ABBYY FlexiCapture,25.0,8 3 | Castor EDC,132.0,1 4 | Clinical Studio,41.0,3 5 | Data+ Platform,29.0,4 6 | DataFax,8.0,18 7 | DataLabs,4.0,25 8 | Debut,5.0,21 9 | DocuPhase,146.0,5 10 | Ephesoft Transact,67.0,6 11 | Flex Databases,6.0,16 12 | Formation,10.0,12 13 | Grooper,12.0,15 14 | Kofax Kapow,8.0,22 15 | MPS IntelliVector,4.0,24 16 | MainEDC,6.0,23 17 | OnlineCRF,5.0,2 18 | Poimapper,18.0,9 19 | SMART-TRIAL,23.0,11 20 | Square 9 GlobalCapture,43.0,7 21 | Ultimate Forms,7.0,20 22 | VKS,14.0,14 23 | Viedoc,50.0,0 24 | iCapture,26.0,10 25 | iMedNet EDC,8.0,19 26 | naturalFORMS,11.0,17 27 | Aetiol,0.0,13 28 | BizEfficiency,0.0,26 29 | Clinion EDC & CDM,0.0,27 30 | Clipboard To Contact - Turn Text Into Contacts,0.0,28 31 | CTMS,0.0,29 32 | Datacap Taskmaster Capture,0.0,30 33 | Dharma,0.0,31 34 | eCaseLink,0.0,32 35 | ExamineYou,0.0,33 36 | AcuFill,0.0,34 37 | LabChart Pro,0.0,35 38 | OnCore,0.0,36 39 | QureClinical,0.0,37 40 | 
CareRecord,0.0,38 41 | Timaeus,0.0,39 42 | AcqKnowledge,0.0,40 43 | ActiView software,0.0,41 44 | agCapture,0.0,42 45 | ALPHADAS,0.0,43 46 | Appliance,0.0,44 47 | AQ2 Remittance,0.0,45 48 | Ascerteon,0.0,46 49 | BioClinica Express,0.0,47 50 | BSCAN Capture,0.0,48 51 | Captricity,0.0,49 52 | Clear Clinica,0.0,50 53 | clincase,0.0,51 54 | ClinicalAnalytics,0.0,52 55 | DADOS,0.0,53 56 | Data Scan,0.0,54 57 | Data-Scan,0.0,55 58 | DDi-mEDC,0.0,56 59 | DealMatrix,0.0,57 60 | Digitalis Clinical Data Collection,0.0,58 61 | Docsumo,0.0,59 62 | EDC Made Easy,0.0,60 63 | Entrypoint i4,0.0,61 64 | eplansoft REVIEW,0.0,62 65 | eResearch,0.0,63 66 | FileStore EDM,0.0,64 67 | FormFoundry,0.0,65 68 | Fusion eClinical Suite,0.0,66 69 | Gather,0.0,67 70 | GoResearch,0.0,68 71 | i-CDMS,0.0,69 72 | idtPlans Review,0.0,70 73 | Improve,0.0,71 74 | INKWRX,0.0,72 75 | Intelligent Data Capture,0.0,73 76 | iQapture,0.0,74 77 | ixtract,0.0,75 78 | KnowledgeLake Capture,0.0,76 79 | Magpi,0.0,77 80 | MailThis.to,0.0,78 81 | MATRIX EDC/IWRS,0.0,79 82 | Med-Quest,0.0,80 83 | MedSciNet Builder,0.0,81 84 | MetricWire,0.0,82 85 | MyoResearch XP Master,0.0,83 86 | Net Station,0.0,84 87 | OCR for AnyDoc,0.0,85 88 | PaperLess,0.0,86 89 | PaperSurvey,0.0,87 90 | PhysioQ,0.0,88 91 | Protocol First,0.0,89 92 | Returnable Forms,0.0,90 93 | Scout,0.0,91 94 | secuTrial,0.0,92 95 | SymmetricDS,0.0,93 96 | Teamscope,0.0,94 97 | The Data Center,0.0,95 98 | Thread Learning,0.0,96 99 | TWin PSG,0.0,97 100 | VISION EDC/CTMS,0.0,98 101 | -------------------------------------------------------------------------------- /src/data/new_users_comments.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/new_users_comments.xls -------------------------------------------------------------------------------- /src/data/simhei.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/simhei.ttf -------------------------------------------------------------------------------- /src/data/stop_words_ch.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/stop_words_ch.txt -------------------------------------------------------------------------------- /src/data/stopwords.txt: -------------------------------------------------------------------------------- 1 | 'd 2 | 'll 3 | 'm 4 | 're 5 | 's 6 | 't 7 | 've 8 | ZT 9 | ZZ 10 | a 11 | a's 12 | able 13 | about 14 | above 15 | abst 16 | accordance 17 | according 18 | accordingly 19 | across 20 | act 21 | actually 22 | added 23 | adj 24 | adopted 25 | affected 26 | affecting 27 | affects 28 | after 29 | afterwards 30 | again 31 | against 32 | ah 33 | ain't 34 | all 35 | allow 36 | allows 37 | almost 38 | alone 39 | along 40 | already 41 | also 42 | although 43 | always 44 | am 45 | among 46 | amongst 47 | an 48 | and 49 | announce 50 | another 51 | any 52 | anybody 53 | anyhow 54 | anymore 55 | anyone 56 | anything 57 | anyway 58 | anyways 59 | anywhere 60 | apart 61 | apparently 62 | appear 63 | appreciate 64 | appropriate 65 | approximately 66 | are 67 | area 68 | areas 69 | aren 70 | aren't 71 | arent 72 | arise 73 | around 74 | as 75 | aside 76 | ask 77 | asked 78 | 
asking 79 | asks 80 | associated 81 | at 82 | auth 83 | available 84 | away 85 | awfully 86 | b 87 | back 88 | backed 89 | backing 90 | backs 91 | be 92 | became 93 | because 94 | become 95 | becomes 96 | becoming 97 | been 98 | before 99 | beforehand 100 | began 101 | begin 102 | beginning 103 | beginnings 104 | begins 105 | behind 106 | being 107 | beings 108 | believe 109 | below 110 | beside 111 | besides 112 | best 113 | better 114 | between 115 | beyond 116 | big 117 | biol 118 | both 119 | brief 120 | briefly 121 | but 122 | by 123 | c 124 | c'mon 125 | c's 126 | ca 127 | came 128 | can 129 | can't 130 | cannot 131 | cant 132 | case 133 | cases 134 | cause 135 | causes 136 | certain 137 | certainly 138 | changes 139 | clear 140 | clearly 141 | co 142 | com 143 | come 144 | comes 145 | concerning 146 | consequently 147 | consider 148 | considering 149 | contain 150 | containing 151 | contains 152 | corresponding 153 | could 154 | couldn't 155 | couldnt 156 | course 157 | currently 158 | d 159 | date 160 | definitely 161 | describe 162 | described 163 | despite 164 | did 165 | didn't 166 | differ 167 | different 168 | differently 169 | discuss 170 | do 171 | does 172 | doesn't 173 | doing 174 | don't 175 | done 176 | down 177 | downed 178 | downing 179 | downs 180 | downwards 181 | due 182 | during 183 | e 184 | each 185 | early 186 | ed 187 | edu 188 | effect 189 | eg 190 | eight 191 | eighty 192 | either 193 | else 194 | elsewhere 195 | end 196 | ended 197 | ending 198 | ends 199 | enough 200 | entirely 201 | especially 202 | et 203 | et-al 204 | etc 205 | even 206 | evenly 207 | ever 208 | every 209 | everybody 210 | everyone 211 | everything 212 | everywhere 213 | ex 214 | exactly 215 | example 216 | except 217 | f 218 | face 219 | faces 220 | fact 221 | facts 222 | far 223 | felt 224 | few 225 | ff 226 | fifth 227 | find 228 | finds 229 | first 230 | five 231 | fix 232 | followed 233 | following 234 | follows 235 | for 236 | former 237 | formerly 238 | forth 239 | found 240 | four 241 | from 242 | full 243 | fully 244 | further 245 | furthered 246 | furthering 247 | furthermore 248 | furthers 249 | g 250 | gave 251 | general 252 | generally 253 | get 254 | gets 255 | getting 256 | give 257 | given 258 | gives 259 | giving 260 | go 261 | goes 262 | going 263 | gone 264 | good 265 | goods 266 | got 267 | gotten 268 | great 269 | greater 270 | greatest 271 | greetings 272 | group 273 | grouped 274 | grouping 275 | groups 276 | h 277 | had 278 | hadn't 279 | happens 280 | hardly 281 | has 282 | hasn't 283 | have 284 | haven't 285 | having 286 | he 287 | he's 288 | hed 289 | hello 290 | help 291 | hence 292 | her 293 | here 294 | here's 295 | hereafter 296 | hereby 297 | herein 298 | heres 299 | hereupon 300 | hers 301 | herself 302 | hes 303 | hi 304 | hid 305 | high 306 | higher 307 | highest 308 | him 309 | himself 310 | his 311 | hither 312 | home 313 | hopefully 314 | how 315 | howbeit 316 | however 317 | hundred 318 | i 319 | i'd 320 | i'll 321 | i'm 322 | i've 323 | id 324 | ie 325 | if 326 | ignored 327 | im 328 | immediate 329 | immediately 330 | importance 331 | important 332 | in 333 | inasmuch 334 | inc 335 | include 336 | indeed 337 | index 338 | indicate 339 | indicated 340 | indicates 341 | information 342 | inner 343 | insofar 344 | instead 345 | interest 346 | interested 347 | interesting 348 | interests 349 | into 350 | invention 351 | inward 352 | is 353 | isn't 354 | it 355 | it'd 356 | it'll 357 | it's 358 | itd 359 | its 360 | itself 361 | j 362 | just 363 | k 
364 | keep 365 | keeps 366 | kept 367 | keys 368 | kg 369 | kind 370 | km 371 | knew 372 | know 373 | known 374 | knows 375 | l 376 | large 377 | largely 378 | last 379 | lately 380 | later 381 | latest 382 | latter 383 | latterly 384 | least 385 | less 386 | lest 387 | let 388 | let's 389 | lets 390 | like 391 | liked 392 | likely 393 | line 394 | little 395 | long 396 | longer 397 | longest 398 | look 399 | looking 400 | looks 401 | ltd 402 | m 403 | made 404 | mainly 405 | make 406 | makes 407 | making 408 | man 409 | many 410 | may 411 | maybe 412 | me 413 | mean 414 | means 415 | meantime 416 | meanwhile 417 | member 418 | members 419 | men 420 | merely 421 | mg 422 | might 423 | million 424 | miss 425 | ml 426 | more 427 | moreover 428 | most 429 | mostly 430 | mr 431 | mrs 432 | much 433 | mug 434 | must 435 | my 436 | myself 437 | n 438 | n't 439 | na 440 | name 441 | namely 442 | nay 443 | nd 444 | near 445 | nearly 446 | necessarily 447 | necessary 448 | need 449 | needed 450 | needing 451 | needs 452 | neither 453 | never 454 | nevertheless 455 | new 456 | newer 457 | newest 458 | next 459 | nine 460 | ninety 461 | no 462 | nobody 463 | non 464 | none 465 | nonetheless 466 | noone 467 | nor 468 | normally 469 | nos 470 | not 471 | noted 472 | nothing 473 | novel 474 | now 475 | nowhere 476 | number 477 | numbers 478 | o 479 | obtain 480 | obtained 481 | obviously 482 | of 483 | off 484 | often 485 | oh 486 | ok 487 | okay 488 | old 489 | older 490 | oldest 491 | omitted 492 | on 493 | once 494 | one 495 | ones 496 | only 497 | onto 498 | open 499 | opened 500 | opening 501 | opens 502 | or 503 | ord 504 | order 505 | ordered 506 | ordering 507 | orders 508 | other 509 | others 510 | otherwise 511 | ought 512 | our 513 | ours 514 | ourselves 515 | out 516 | outside 517 | over 518 | overall 519 | owing 520 | own 521 | p 522 | page 523 | pages 524 | part 525 | parted 526 | particular 527 | particularly 528 | parting 529 | parts 530 | past 531 | per 532 | perhaps 533 | place 534 | placed 535 | places 536 | please 537 | plus 538 | point 539 | pointed 540 | pointing 541 | points 542 | poorly 543 | possible 544 | possibly 545 | potentially 546 | pp 547 | predominantly 548 | present 549 | presented 550 | presenting 551 | presents 552 | presumably 553 | previously 554 | primarily 555 | probably 556 | problem 557 | problems 558 | promptly 559 | proud 560 | provides 561 | put 562 | puts 563 | q 564 | que 565 | quickly 566 | quite 567 | qv 568 | r 569 | ran 570 | rather 571 | rd 572 | re 573 | readily 574 | really 575 | reasonably 576 | recent 577 | recently 578 | ref 579 | refs 580 | regarding 581 | regardless 582 | regards 583 | related 584 | relatively 585 | research 586 | respectively 587 | resulted 588 | resulting 589 | results 590 | right 591 | room 592 | rooms 593 | run 594 | s 595 | said 596 | same 597 | saw 598 | say 599 | saying 600 | says 601 | sec 602 | second 603 | secondly 604 | seconds 605 | section 606 | see 607 | seeing 608 | seem 609 | seemed 610 | seeming 611 | seems 612 | seen 613 | sees 614 | self 615 | selves 616 | sensible 617 | sent 618 | serious 619 | seriously 620 | seven 621 | several 622 | shall 623 | she 624 | she'll 625 | shed 626 | shes 627 | should 628 | shouldn't 629 | show 630 | showed 631 | showing 632 | shown 633 | showns 634 | shows 635 | side 636 | sides 637 | significant 638 | significantly 639 | similar 640 | similarly 641 | since 642 | six 643 | slightly 644 | small 645 | smaller 646 | smallest 647 | so 648 | some 649 | somebody 650 | somehow 651 | 
someone 652 | somethan 653 | something 654 | sometime 655 | sometimes 656 | somewhat 657 | somewhere 658 | soon 659 | sorry 660 | specifically 661 | specified 662 | specify 663 | specifying 664 | state 665 | states 666 | still 667 | stop 668 | strongly 669 | sub 670 | substantially 671 | successfully 672 | such 673 | sufficiently 674 | suggest 675 | sup 676 | sure 677 | t 678 | t's 679 | take 680 | taken 681 | taking 682 | tell 683 | tends 684 | th 685 | than 686 | thank 687 | thanks 688 | thanx 689 | that 690 | that'll 691 | that's 692 | that've 693 | thats 694 | the 695 | their 696 | theirs 697 | them 698 | themselves 699 | then 700 | thence 701 | there 702 | there'll 703 | there's 704 | there've 705 | thereafter 706 | thereby 707 | thered 708 | therefore 709 | therein 710 | thereof 711 | therere 712 | theres 713 | thereto 714 | thereupon 715 | these 716 | they 717 | they'd 718 | they'll 719 | they're 720 | they've 721 | theyd 722 | theyre 723 | thing 724 | things 725 | think 726 | thinks 727 | third 728 | this 729 | thorough 730 | thoroughly 731 | those 732 | thou 733 | though 734 | thoughh 735 | thought 736 | thoughts 737 | thousand 738 | three 739 | throug 740 | through 741 | throughout 742 | thru 743 | thus 744 | til 745 | tip 746 | to 747 | today 748 | together 749 | too 750 | took 751 | toward 752 | towards 753 | tried 754 | tries 755 | truly 756 | try 757 | trying 758 | ts 759 | turn 760 | turned 761 | turning 762 | turns 763 | twice 764 | two 765 | u 766 | un 767 | under 768 | unfortunately 769 | unless 770 | unlike 771 | unlikely 772 | until 773 | unto 774 | up 775 | upon 776 | ups 777 | us 778 | use 779 | used 780 | useful 781 | usefully 782 | usefulness 783 | uses 784 | using 785 | usually 786 | uucp 787 | v 788 | value 789 | various 790 | very 791 | via 792 | viz 793 | vol 794 | vols 795 | vs 796 | w 797 | want 798 | wanted 799 | wanting 800 | wants 801 | was 802 | wasn't 803 | way 804 | ways 805 | we 806 | we'd 807 | we'll 808 | we're 809 | we've 810 | wed 811 | welcome 812 | well 813 | wells 814 | went 815 | were 816 | weren't 817 | what 818 | what'll 819 | what's 820 | whatever 821 | whats 822 | when 823 | whence 824 | whenever 825 | where 826 | where's 827 | whereafter 828 | whereas 829 | whereby 830 | wherein 831 | wheres 832 | whereupon 833 | wherever 834 | whether 835 | which 836 | while 837 | whim 838 | whither 839 | who 840 | who'll 841 | who's 842 | whod 843 | whoever 844 | whole 845 | whom 846 | whomever 847 | whos 848 | whose 849 | why 850 | widely 851 | will 852 | willing 853 | wish 854 | with 855 | within 856 | without 857 | won't 858 | wonder 859 | words 860 | work 861 | worked 862 | working 863 | works 864 | world 865 | would 866 | wouldn't 867 | www 868 | x 869 | y 870 | year 871 | years 872 | yes 873 | yet 874 | you 875 | you'd 876 | you'll 877 | you're 878 | you've 879 | youd 880 | young 881 | younger 882 | youngest 883 | your 884 | youre 885 | yours 886 | yourself 887 | yourselves 888 | z 889 | zero 890 | zt 891 | zz 892 | -------------------------------------------------------------------------------- /src/data/text.txt: -------------------------------------------------------------------------------- 1 | 人工智能技术在防疫抗疫工作中大显身手 发布时间:2020-02-25 来源:人工智能实验室 近期,新型冠状病毒肺炎(简称“新冠肺炎”)的疫情突如其来,让人们有些措手不及。但是为了实现更好的防疫抗疫效果,不少研究人员纷纷应用诸多技术手段来抗击疫情。其中人工智能技术已成为这场防疫抗疫攻坚战的有力武器之一;它在疫情防控、图像分析、辅助诊断、疫苗研发、新药研制等方面助力防疫抗疫工作。 在疫情防控方面 
新冠肺炎来势汹汹,但是它依然可防可控。采取有效的措施预防,戴口罩、勤洗手、居家隔离等都是非常行之有效的方法。例如戴口罩是预防传染病最重要、最有效的防控手段之一,可以有效降低感染新冠肺炎的风险。又如体温筛检是此次疫情中筛查排查可疑病例的一个手段。人工智能技术在疫情防控的各个应用场景中都可发挥重要作用,这些应用场景都能直接为患者或者潜在的患者人群带来切实好处。 北京旷视科技有限公司最近推出一套用于发热及潜在被感染对象识别、筛查与分析的人工智能新系统“明骥”。该系统通过前端红外相机,鉴别人流中的高温人员,再根据疑似发烧者的人体、人脸信息,利用人工智能技术辅助工作人员快速定位体温异常者;做到了在佩戴口罩的情况下,也能精准锁定。目前,“明骥”已应用在地铁、火车站、机尝集中办公区等人流量较大的区域。 在图像分析方面 医疗影像数据是医疗数据的重要组成部分,人工智能技术能够通过快速准确地标记新冠肺炎的特定异常结构来提高图像分析的效率,以供放射科医生参考。提高图像分析效率,可让放射科医生腾出更多的时间聚焦在需要更多解读或判断的内容审阅上,从而有望缓解他们供给缺口问题。另外,这还可避免放射科医生以及临床医生被别人感染,降低他们的安全风险。 上海人工智能研究院与杭州健培科技有限公司联合研发的新冠肺炎影像云检测平台最近正式上线,对全国医院进行免费影像云诊断服务,并对所有医疗机构和各级政府免费开放,将高效、准确地为放射科医生以及临床医生提供决策依据,助力疫情防控。新冠肺炎影像云检测平台上线后,能够为临床一线抗疫医生疫情评估、肺炎性质判定、治疗方案制定提供高效精确的支撑依据。 在辅助诊断方面 医疗诊断是一个综合考虑各种影响因素的判断过程;利用人工智能技术辅助诊断新冠肺炎,能够在短时间内精准地预判病情,对提高患者预后具有重要作用。人工智能技术辅助诊断的功能既可以精确分割CT扫描部位的病灶;还可以对病灶的CT影像做分析,找出疑似病变和组织结构的异常,并给出诊断方向。在质控及病变识别方面,具有更为宽泛的使用范围。 在CT影像快速诊断方面,北京推想科技与武汉同济医院、深圳市第三人民医院合作研发针对新冠肺炎特别版,该版利用人工智能技术的深度学习、图像识别等对检出的病灶进行测量、密度分析,支持患者前后片对照,提供量化数据对比结果,帮助医生更快完成疑似患者诊断。北京安德医智联合解放军总医院正在研发新冠肺炎CT影像人工智能辅助诊断系统,免费提供给全国各级医院使用。 在疫苗研发方面 随着疫情持续,很多民众非常关心新冠肺炎的疫苗研发进展。据介绍,无论是对病毒进行基因测序,找到病毒来源以及传播宿主,还是研发病毒疫苗,人工智能技术都大有用武之地。例如传统的疫苗研发需在实验室中对数百种药物成分进行生物测试,这一过程往往要耗费不少时间;而人工智能技术可以极大加速这个过程,能够让更多的人获得疫苗的保护。 浙江大学研究团队最近利用人工智能技术在已有的药物中找到两种抗击疫情药物,从而使疫苗的研发工作取得了阶段性的成果。这两种药物有可能成为新冠肺炎候选疫苗,目前正在进行临床试验。据了解,将人工智能技术用于筛选和研发疫苗,能够帮助研究人员在已有的药物中快速找到可能对预防新冠肺炎有效的生物制品。 在新药研制方面 新冠肺炎的临床表现以发热﹑乏力﹑干咳为主要表现;而随着疾病的进展会出现急性呼吸窘迫综合征、难以纠正的代谢性酸中毒等,需要给予积极有效的治疗。但是目前还没有明确的特效药能够治疗新冠肺炎,只能根据患者的一般情况进行对症治疗,预防继发的感染,及时进行器官的功能支持。不过研究人员正在利用人工智能技术研制针对该病的特效药,新药很快就会问世。 美国麻省理工学院研究团队近日利用人工智能技术发现一种新型抗生素,它可以杀灭多种致病细菌,包括一些对所有已知抗生素都具耐药性的细菌菌株。研究人员通过让机器学习算法在几天内充分筛查庞大数据库中逾1亿种化合物,终于发现了这种抗生素;该抗生素被认为能有效抑制大肠杆菌,对治疗新冠肺炎也有效。 由上可知,人工智能技术正在新冠肺炎的防疫抗疫工作中大显身手。可以预料,作为一种综合性极强的技术,人工智能将在医疗健康领域内得到越来越多的应用,并将成为影响医学行业发展的重要科技手段。正如我国著名学者周海中教授曾经指出的那样:“随着社会的发展和科技的进步,人工智能技术将在医疗健康领域大显身手;其成果会不断涌现,应用前景令人期待。” 2 | -------------------------------------------------------------------------------- /src/data/users_comments.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/users_comments.xls -------------------------------------------------------------------------------- /src/exe/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 26 21:47:15 2021 4 | 5 | @author: Xu 6 | """ 7 | import sys 8 | import os 9 | curPath = os.path.abspath(os.path.dirname(__file__)) 10 | rootPath = os.path.split(curPath)[0] 11 | sys.path.append(os.path.split(rootPath)[0]) -------------------------------------------------------------------------------- /src/exe/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/exe_01.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_01.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/exe_01.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_01.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/exe_02.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_02.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/exe_02.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_02.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/exe_03.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_03.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/exe_03.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_03.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/exe_05.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_05.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/__pycache__/exe_06.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_06.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/exe_01.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Oct 29 10:26:02 2020 4 | 5 | @author: Xu 6 | 7 | 1.竞品分析: 8 | 负责响应数据查询请求,调用数据逻辑程序。 9 | 基于数据逻辑查询结果,业务逻辑程序组装出竞品分析数据并返回给前端页面。 10 | 11 | """ 12 | import __init__ 13 | import numpy as np 14 | from src import config 15 | import pandas as pd 16 | import collections 17 | from pyecharts.charts import Bar, Timeline, Radar, Tab, Boxplot 18 | from pyecharts import options as opts 19 | 20 | 21 | 22 | # 1. 竞品分析 23 | # 数据逻辑: 24 | def itemcount(dictionary): 25 | """ 26 | 此函数用于计算字典重复的key个数 。 27 | 28 | Parameters 29 | ---------- 30 | dictionary : TYPE-dict 31 | 32 | Returns 33 | ------- 34 | count : TYPE-dict 35 | new dictionary. 
36 | 37 | """ 38 | count = {} 39 | try: 40 | del dictionary[np.nan] 41 | except: 42 | None 43 | for i in dictionary.keys(): 44 | items = i.split(',') 45 | for j in items: 46 | j = j.strip() 47 | if j in count.keys(): 48 | count[j] += dictionary[i] 49 | else: 50 | count[j] = dictionary[i] 51 | return count 52 | 53 | def product_feature_query(): 54 | """ 55 | 此函数用于返回产品功能的分组统计数据。 56 | 57 | Returns 58 | ------- 59 | dataX: TYPE-list all product feature names. 60 | dataY: TYPE-list Product feature percentage(%). 61 | 62 | """ 63 | 64 | # load data 65 | data = pd.read_csv(config.product_data_path) 66 | # select data 67 | num_product = data.shape[0] 68 | cates = collections.Counter(data['Category_Features']) 69 | cates1 = itemcount(dict(cates)) 70 | dataX = list(cates1.keys()) 71 | dataY = list(cates1.values()) 72 | dataY = [round(i/num_product*100,2) for i in dataY] 73 | return dataX, dataY 74 | 75 | 76 | def product_edc_feature_query(): 77 | """ 78 | 此函数用于获取EDC产品功能的分组统计数据。 79 | 80 | Returns 81 | ------- 82 | dataX : TYPE-list 83 | EDC feature names. 84 | dataY : TYPE-list 85 | EDC feature percentage(%). 86 | 87 | """ 88 | # load data 89 | data = pd.read_csv(config.product_data_path) 90 | # select data 91 | num_product = data.shape[0] 92 | cate_list = [list(j.strip() for j in i.split(',')) for i in data['Category_Features']] 93 | cate_list_edc = [i.index("Electronic Data Capture Features") for i in cate_list] 94 | feats = data['Category_Features_List'] 95 | edc=dict(zip(list(eval(feats[0]).keys()),[0]*len(eval(feats[0]).keys()))) 96 | for i in range(len(feats)): 97 | if len(cate_list[i])==1: 98 | for j in dict(eval(feats[i])).keys(): 99 | if dict(eval(feats[i]))[j]=='able': 100 | edc[j]+=1 101 | else: 102 | edc[j]=edc[j] 103 | else: 104 | for j in edc.keys(): 105 | if list(eval(feats[i]))[cate_list_edc[i]]=='able': 106 | edc[j]+=1 107 | else: 108 | edc[j]=edc[j] 109 | dataX = list(edc.keys()) 110 | dataY = list(edc.values()) 111 | dataY = [round(i/num_product*100,2) for i in dataY] 112 | return dataX, dataY 113 | 114 | 115 | def product_deployment_query(): 116 | """ 117 | 此函数用于获取EDC产品安装的分组统计数据。 118 | 119 | Returns 120 | ------- 121 | dataX : TYPE-list 122 | EDC deployment method names. 123 | dataY : TYPE-list 124 | each EDC deployment percentage(%). 125 | 126 | """ 127 | # load data 128 | data = pd.read_csv(config.product_data_path) 129 | # select data 130 | num_product = data.shape[0] 131 | dp = collections.Counter(data['deployment']) 132 | dp1 = itemcount(dict(dp)) 133 | dataX = list(dp1.keys()) 134 | dataY = list(dp1.values()) 135 | dataY = [round(i/num_product*100,2) for i in dataY] 136 | return dataX, dataY 137 | 138 | 139 | def product_train_query(): 140 | """ 141 | 此函数用于获取EDC产品培训的分组统计数据。 142 | 143 | Returns 144 | ------- 145 | dataX : TYPE-list 146 | EDC training method names. 147 | dataY : TYPE-list 148 | each EDC training percentage(%). 149 | 150 | 151 | """ 152 | 153 | # load data 154 | data = pd.read_csv(config.product_data_path) 155 | # select data 156 | num_product = data.shape[0] 157 | tr = collections.Counter(data['training']) 158 | tr1 = itemcount(dict(tr)) 159 | dataX = list(tr1.keys()) 160 | dataY = list(tr1.values()) 161 | dataY = [round(i/num_product*100,2) for i in dataY] 162 | return dataX, dataY 163 | 164 | def product_support_query(): 165 | """ 166 | 此函数用于获取EDC产品售后支持的分组统计数据。 167 | 168 | Returns 169 | ------- 170 | dataX : TYPE-list 171 | EDC support method names. 172 | dataY : TYPE-list 173 | each EDC support percentage(%). 
174 | 175 | 176 | """ 177 | 178 | # load data 179 | data = pd.read_csv(config.product_data_path) 180 | # select data 181 | num_product = data.shape[0] 182 | sp = collections.Counter(data['support']) 183 | sp1 = itemcount(dict(sp)) 184 | dataX = list(sp1.keys()) 185 | dataY = list(sp1.values()) 186 | dataY = [round(i/num_product*100,2) for i in dataY] 187 | return dataX, dataY 188 | 189 | 190 | 191 | def product_price_query(): 192 | """ 193 | 此函数用于获取EDC产品价格的分组统计数据。 194 | 195 | Returns 196 | ------- 197 | dataX : TYPE-list 198 | EDC product pricing. 199 | dataY : TYPE-list 200 | EDC product pricing 201 | Data for each EDC pricing. 202 | 203 | """ 204 | # load data 205 | data = pd.read_csv(config.product_data_path) 206 | # select data 207 | dataX = list(collections.Counter(data['starting_price_method_fill']).keys()) 208 | dataY = [] 209 | for m in dataX: 210 | m_val = data.loc[(data['starting_price_method_fill']==m),'starting_price_num_fill'].values.tolist() 211 | dataY.append(m_val) 212 | return dataX, dataY 213 | 214 | 215 | 216 | def product_rating_query(): 217 | """ 218 | It is used to get the average score of each product rating. 219 | 220 | Returns 221 | ------- 222 | dataX : TYPE-list 223 | rating index: i.e. 'rating_overall', 'rating_ease_of_use'. 224 | products : TYPE-list 225 | all product names. 226 | list_dataY : TYPE-array 227 | each product's average rating score. 228 | 229 | """ 230 | # load data 231 | data = pd.read_csv(config.user_data_path) 232 | # select data 233 | rate_overall_mean = round(data.groupby('product_name')['rating_overall'].mean(),2) 234 | rate_use_mean = round(data.groupby('product_name')['rating_ease_of_use'].mean(),2) 235 | rate_fun_mean = round(data.groupby('product_name')['rating_features_functionality'].mean(),2) 236 | rate_mn_mean = round(data.groupby('product_name')['rating_value_for_money'].mean(),2) 237 | rate_sp_mean = round(data.groupby('product_name')['rating_customer_support'].mean(),2) 238 | rate_rec_mean = round(data.groupby('product_name')['rating_likelihood_to_recommend'].mean(),2) 239 | df_rate = pd.concat([rate_overall_mean, 240 | rate_use_mean, 241 | rate_fun_mean, 242 | rate_mn_mean, 243 | rate_sp_mean, 244 | rate_rec_mean],axis=1) 245 | dataX = list(df_rate.columns) 246 | products = list(df_rate.index) 247 | list_dataY = df_rate.values 248 | return dataX, products, list_dataY 249 | 250 | 251 | def rt_index_query(): 252 | """ 253 | It is used to get the real-time index about the product and user analysis. 254 | 255 | Returns 256 | ------- 257 | product_sum : TYPE-int 258 | number of products. 259 | edc_feature_sum : TYPE-int 260 | number of features of EDC product. 261 | user_sum : TYPE-int 262 | number of users participating in the survey. 263 | 264 | """ 265 | # load data 266 | data = pd.read_csv(config.product_data_path) 267 | data2 = pd.read_csv(config.user_data_path) 268 | # select data 269 | product_sum = data.shape[0] 270 | edc_feature_sum = len(product_edc_feature_query()[0]) 271 | user_sum = data2.shape[0] 272 | return product_sum, edc_feature_sum, user_sum 273 | 274 | 275 | 276 | # 业务逻辑: 277 | colorList = ['#bcd3bb', '#e88f70', '#9dc5c8', '#e1e8c8', 278 | '#7b7c68', '#e5b5b5', '#f0b489', '#928ea8', 279 | '#bda29a', '#376956', '#c3bed4', '#495a80', 280 | '#9966cc', '#bdb76a', '#eee8ab', '#a35015', 281 | '#04dd98', '#d9b3e6', '#b6c3fc','#315dbc'] 282 | 283 | def rt_index_base(): 284 | """ 285 | It is used to get the data of realtime index. 286 | 287 | Returns 288 | ------- 289 | cur : TYPE-dict 290 | realt-time index. 
291 | 292 | """ 293 | product_sum, edc_feature_sum, user_sum = rt_index_query() 294 | cur = {"product_sum": product_sum, "edc_feature_sum": edc_feature_sum, "user_sum":user_sum} 295 | return cur 296 | 297 | 298 | def hist_product_feature_base(): 299 | """ 300 | 此函数用于获取产品功能柱状图的参数。 301 | 302 | Returns 303 | ------- 304 | c : TYPE-echarts parameters 305 | return echarts parameters. 306 | 307 | """ 308 | # data query 309 | dataX, dataY = product_feature_query() 310 | # Declare objects, render pictures 311 | c = ( 312 | Bar() 313 | .add_xaxis(dataX) 314 | .add_yaxis("Product category",dataY,color='#4150d8') 315 | .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"), 316 | yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"), 317 | title_opts=opts.TitleOpts(title="Distribution of different product function category"), 318 | datazoom_opts=[opts.DataZoomOpts()], 319 | legend_opts=opts.LegendOpts( pos_left="80%"), 320 | ) 321 | 322 | ) 323 | return c 324 | 325 | 326 | 327 | def hist_product_edc_feature_base(): 328 | """ 329 | 此函数用于获取产品EDC功能柱状图的参数。 330 | 331 | Returns 332 | ------- 333 | c : TYPE-echarts parameters 334 | return echarts parameters. 335 | 336 | """ 337 | # data query 338 | dataX, dataY = product_edc_feature_query() 339 | # Declare objects, render pictures 340 | c = ( 341 | Bar() 342 | .add_xaxis(dataX) 343 | .add_yaxis("EDC Features",dataY,color='#28bf7e') 344 | .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"), 345 | yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"), 346 | title_opts=opts.TitleOpts(title="Distribution of Electronic Data Capture Features"), 347 | legend_opts=opts.LegendOpts( pos_left="80%"), 348 | ) 349 | 350 | ) 351 | return c 352 | 353 | 354 | 355 | def hist_product_deployment_base(): 356 | """ 357 | 此函数用于获取产品安装的柱状图的参数。 358 | 359 | Returns 360 | ------- 361 | c : TYPE-echarts parameters 362 | return echarts parameters. 363 | 364 | """ 365 | # data query 366 | dataX, dataY = product_deployment_query() 367 | # Declare objects, render pictures 368 | c = ( 369 | Bar() 370 | .add_xaxis(dataX) 371 | .add_yaxis("Deployment",dataY,color='#ed7c2f') 372 | .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"), #name="Product Deployment Method", name_location="center", name_gap=25 373 | yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"), 374 | title_opts=opts.TitleOpts(title="Distribution of Product Deployment") 375 | ) 376 | 377 | ) 378 | 379 | return c 380 | 381 | 382 | 383 | def hist_product_train_base(): 384 | """ 385 | 此函数用于获取产品培训的柱状图的参数。 386 | 387 | Returns 388 | ------- 389 | c : TYPE-echarts parameters 390 | return echarts parameters. 391 | 392 | """ 393 | # data query 394 | dataX, dataY = product_train_query() 395 | # Declare objects, render pictures 396 | c = ( 397 | Bar() 398 | .add_xaxis(dataX) 399 | .add_yaxis("Training",dataY,color='#b6c2ff') 400 | .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category", ),#name="Product Training Method", name_location="center", name_gap=25 401 | yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"), 402 | title_opts=opts.TitleOpts(title="Distribution of Product Training Method") 403 | ) 404 | 405 | ) 406 | return c 407 | 408 | def boxplot_product_price_base(): 409 | """ 410 | 此函数用于获取产品价格的箱线图的参数。 411 | 412 | Returns 413 | ------- 414 | c : TYPE-echarts parameters 415 | return echarts parameters. 
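        Note: the body below passes the raw price lists through pyecharts'
        Boxplot.prepare_data(), which reduces each list to the five-number
        summary (min, Q1, median, Q3, max) expected by the boxplot renderer,
        so dataY does not need to be pre-aggregated.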
416 | 417 | """ 418 | # data query 419 | dataX, dataY = product_price_query() 420 | # Declare objects, render pictures 421 | c = Boxplot() 422 | c.add_xaxis(dataX) 423 | c.add_yaxis("Pricing",c.prepare_data(dataY)) 424 | c.set_global_opts(title_opts=opts.TitleOpts(title="Distribution of Product Pricing", subtitle="Unit:$")) 425 | return c 426 | 427 | 428 | 429 | 430 | def hist_product_support_base(): 431 | """ 432 | 此函数用于获取产品售后支持的柱状图的参数。 433 | 434 | Returns 435 | ------- 436 | c : TYPE-echarts parameters 437 | return echarts parameters. 438 | 439 | """ 440 | # data query 441 | dataX, dataY = product_support_query() 442 | # Declare objects, render pictures 443 | c = ( 444 | Bar() 445 | .add_xaxis(dataX) 446 | .add_yaxis("Support",dataY,color='#f2a93b') 447 | .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category", ),#name="Product Support Method", name_location="center", name_gap=25 448 | yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"), 449 | title_opts=opts.TitleOpts(title="Distribution of Product Support Method") 450 | ) 451 | 452 | ) 453 | 454 | return c 455 | 456 | 457 | 458 | 459 | def radar_product_rating_base(): 460 | """ 461 | 此函数用于获取产品多维评价的雷达图的参数。 462 | 463 | Returns 464 | ------- 465 | tl : TYPE-echarts parameters 466 | return echarts parameters. 467 | """ 468 | 469 | # data query 470 | dataX, products, list_dataY = product_rating_query() 471 | c_schema = [] 472 | for i in range(len(dataX)): 473 | c_schema.append({"name":dataX[i],"max":5,"min":0}) 474 | 475 | # Declare objects, render pictures 476 | tl = Timeline() 477 | for i in range(len(products)): 478 | c = ( 479 | Radar(init_opts=opts.InitOpts(width="1280px", height="720px")) 480 | .add_schema(schema=c_schema, 481 | splitarea_opt=opts.SplitAreaOpts(is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)), 482 | textstyle_opts=opts.TextStyleOpts(color="#fff"), 483 | ) 484 | .add(series_name=products[i], data=[list(list_dataY[i])], 485 | linestyle_opts=opts.LineStyleOpts(color="#CD0000"), 486 | ) 487 | .set_series_opts(label_opts=opts.LabelOpts(is_show=True)) 488 | .set_global_opts(title_opts=opts.TitleOpts(title="Multi-dimensional analysis of product performance"), 489 | legend_opts=opts.LegendOpts(pos_left="80%", pos_top="50%")) 490 | ) 491 | tl.add(c, "{}".format(products[i])) 492 | return tl 493 | 494 | 495 | def tab_product_base(): 496 | """ 497 | It is used to respond to requests for chart parameters. 498 | 499 | Returns 500 | ------- 501 | tab : TYPE-echarts parameters 502 | return echarts parameters. 
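        A minimal sketch of how this Tab could be exposed through a Flask route
        (hypothetical route and view names; the real wiring lives in app.py, and
        this assumes pyecharts' Tab exposes render_embed() like the chart classes do):

            @app.route('/product_analysis')
            def product_analysis_view():
                return tab_product_base().render_embed()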
503 | 504 | """ 505 | tab = Tab() 506 | tab.add(hist_product_feature_base(), "Product Module") 507 | tab.add(hist_product_edc_feature_base(), "EDC Feature") 508 | tab.add(hist_product_deployment_base(), "Product Deployment") 509 | tab.add(hist_product_support_base(),"Product Support") 510 | tab.add(hist_product_train_base(), "Product Train") 511 | tab.add(radar_product_rating_base(),"Product Rating") 512 | return tab 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | -------------------------------------------------------------------------------- /src/exe/exe_02.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Nov 2 17:34:12 2020 4 | 5 | @author: Xu 6 | 7 | 8 | 2.自动生成词云图: 9 | 负责响应数据查询请求,调用数据逻辑程序。 10 | 基于数据逻辑查询结果,业务逻辑程序组装出词云图并返回给前端页面。 11 | 12 | 三种类型的高频词和关键词可视化: 13 | 14 | 根据用户输入指定网址,通过采集该网址文本进行处理。 15 | 根据用户输入文本字符串进行处理。 16 | 根据用户输入载入本地文本进行处理,用户将所需要处理文本文件放入text文本夹中,指定文件名进行处理。 17 | 18 | """ 19 | import __init__ 20 | import os 21 | import numpy as np 22 | from src import config 23 | from collections import Counter 24 | from pyecharts import options as opts 25 | from pyecharts.charts import WordCloud 26 | from pyecharts.render import make_snapshot 27 | from jieba import posseg 28 | # import wordcloud 29 | from newspaper import Article 30 | # from imageio import imread 31 | from snapshot_selenium import snapshot 32 | 33 | # 2. 自动生成词云 34 | # 数据逻辑: 35 | def save_to_file(filepath, content): 36 | f = open(filepath, 'w', encoding='utf-8') 37 | f.write(content) 38 | f.close() 39 | 40 | def extract_words(content): 41 | """ 42 | Statistical word frequency. 43 | 44 | Parameters 45 | ---------- 46 | content : TYPE-str 47 | DESCRIPTION: text. 48 | 49 | Returns 50 | ------- 51 | word_dict : TYPE-dictionary 52 | DESCRIPTION: dictionary like {Word1: Frequency1, Word2: Frequency2} . 53 | 54 | """ 55 | words = [] 56 | pos_filters = ['n', 'v', 'a'] 57 | for line in content.split('\n'): 58 | line = line.strip() 59 | if not line: 60 | continue 61 | words += [w.word for w in posseg.cut(line) if w.flag[0] in pos_filters and len(w.word) > 1] 62 | word_dict = {i[0]: i[1] for i in Counter(words).most_common()} 63 | return word_dict 64 | 65 | 66 | def read_file(filepath): 67 | """ 68 | Read the local file and transform to text. 69 | 70 | Parameters 71 | ---------- 72 | filepath : TYPE-str 73 | DESCRIPTION: the text file path. 74 | 75 | Returns 76 | ------- 77 | content : TYPE-str 78 | DESCRIPTION:The preprocessed news text. 79 | 80 | """ 81 | f = open(filepath,'r',encoding='utf-8') 82 | content = f.read() 83 | f.close() 84 | return content 85 | 86 | 87 | def get_sorted_dict(dic, topn=100): 88 | """ 89 | Sort the dictionary by value and take the top n with the largest value. 90 | 91 | Parameters: 92 | dic: TYPE-dictionary 93 | DESCRIPTION: Dictionary to be sorted. 94 | topn: TYPE-integer 95 | DESCRIPTION: Select the N (key: value) with the largest value (default = 100). 96 | 97 | Returns 98 | ------- 99 | res : TYPE-list 100 | DESCRIPTION: A list composed of the top N key-value combinations after sorting, 101 | like [('a', 101),('b',78),...]. 102 | 103 | """ 104 | res = sorted(dic.items(), key=lambda item: item[1], reverse=True) 105 | if topn > len(dic): 106 | return res 107 | else: 108 | return res[0: topn] 109 | 110 | 111 | def text_wordfreq_by_url_query(): 112 | """ 113 | According to the user's input to specify the URL, the text of the URL is collected, 114 | and a word cloud image is automatically generated. 
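        Only nouns, verbs and adjectives longer than one character are kept by
        extract_words() above, so a hypothetical result looks like
        {'数据': 12, '分析': 9, '可视化': 7}.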
115 | 116 | Returns 117 | ------- 118 | word_dict:TYPE-dictionary 119 | DESCRIPTION: dictionary like {Word1: Frequency1, Word2: Frequency2} . 120 | 121 | """ 122 | 123 | def get_webcontent(url): 124 | """ 125 | Online mode: According to the URL, grab the text content of the news. 126 | 127 | Parameters 128 | ---------- 129 | url : TYPE-str 130 | DESCRIPTION: news online URL. 131 | 132 | Returns 133 | ------- 134 | content : TYPE-str 135 | DESCRIPTION:The preprocessed news text. 136 | 137 | """ 138 | news = Article(url, language='zh') 139 | news.download() 140 | news.parse() 141 | content = news.text 142 | return content 143 | 144 | url = read_file(config.wc_input_url_path) 145 | content = get_webcontent(url) 146 | word_dict = extract_words(content) 147 | return word_dict 148 | 149 | 150 | 151 | def text_wordfreq_by_input_query(): 152 | """ 153 | According to the text input by the user, a word cloud image is automatically generated. 154 | 155 | Parameters: 156 | input_text: TYPE-str 157 | DESCRIPTION: the text input by the user. 158 | 159 | Returns 160 | ------- 161 | word_dict:TYPE-dictionary 162 | DESCRIPTION: dictionary like {Word1: Frequency1, Word2: Frequency2} . 163 | 164 | """ 165 | input_text = read_file(config.wc_input_text_path) 166 | word_dict = extract_words(input_text) 167 | return word_dict 168 | 169 | 170 | 171 | 172 | 173 | def text_wordfreq_by_import_file_query(): 174 | """ 175 | According to the local file imported by the user, the word cloud image 176 | is automatically generated. 177 | 178 | Parameters: 179 | textfile: TYPE-str 180 | DESCRIPTION: the text file imported by user. 181 | 182 | 183 | Returns 184 | ------- 185 | word_dict:TYPE-dictionary 186 | DESCRIPTION: dictionary like {Word1: Frequency1, Word2: Frequency2} . 187 | 188 | """ 189 | path = read_file(config.wc_input_file_save_path).strip() 190 | content = read_file(path) 191 | word_dict = extract_words(content) 192 | return word_dict 193 | 194 | 195 | 196 | 197 | # 业务逻辑: 198 | def pic_rt_user_url_base(): 199 | path = config.pic_wc_input_url_save_path 200 | file_dir, filename = os.path.split(read_file(path).strip()) 201 | return file_dir, filename 202 | 203 | 204 | 205 | def rt_user_url_base(): 206 | """ 207 | It is used to return the requested real-time data. 208 | 209 | Returns 210 | ------- 211 | curinput : TYPE-dictionary 212 | return the frontend requested real-time data. 213 | 214 | """ 215 | userurl = read_file(config.wc_input_url_path) 216 | curinput = {'userurl': userurl} 217 | return curinput 218 | 219 | def pic_rt_user_input_text_base(): 220 | path = config.pic_wc_input_text_save_path 221 | file_dir, filename = os.path.split(read_file(path).strip()) 222 | return file_dir, filename 223 | 224 | 225 | 226 | def rt_user_input_text_base(): 227 | """ 228 | It is used to return the requested real-time data. 229 | 230 | Returns 231 | ------- 232 | curinput : TYPE-dictionary 233 | return the frontend requested real-time data. 234 | 235 | """ 236 | usertext = read_file(config.wc_input_text_path) 237 | curinput = {'usertext':usertext } 238 | return curinput 239 | 240 | def pic_rt_user_import_file_base(): 241 | path = config.pic_wc_input_file_save_path 242 | file_dir, filename = os.path.split(read_file(path).strip()) 243 | return file_dir, filename 244 | 245 | 246 | 247 | def rt_user_import_file_base(): 248 | """ 249 | It is used to return the requested real-time data. 250 | 251 | Returns 252 | ------- 253 | curinput : TYPE-dictionary 254 | return the frontend requested real-time data. 
255 | 256 | """ 257 | path = read_file(config.wc_input_file_save_path).strip() 258 | filename = os.path.split(path)[-1] 259 | curinput = {'userfile':filename} 260 | return curinput 261 | 262 | 263 | def generate_random_filename(prefix, suffix): 264 | """ 265 | According to the specified prefix name and suffix name, 266 | this function is used to generate a random file name. 267 | Parameters: 268 | prefix: TYPE-str 269 | DESCRIPTION: Specified prefix name. 270 | suffix: TYPE-str 271 | DESCRIPTION: Specified suffix name. 272 | Returns 273 | ------- 274 | random_filename : TYPE-str 275 | return the random file name. 276 | 277 | """ 278 | randnum = np.random.randint(1, 100000) 279 | random_filename = prefix + '_' + str(randnum) + suffix 280 | return random_filename 281 | 282 | 283 | 284 | 285 | 286 | def wordcloud_text_by_url_base(): 287 | """ 288 | It is used to respond to requests for chart parameters. 289 | 290 | Returns 291 | ------- 292 | c : TYPE-echarts parameters 293 | return echarts parameters. 294 | 295 | """ 296 | 297 | # data query 298 | data = text_wordfreq_by_url_query() 299 | data_pair = get_sorted_dict(data, topn=1000) 300 | # generate wordcloud picture 301 | # font_path = r'C:\Windows\Fonts\simhei.ttf' 302 | # back_color = imread(config.bg_pic) 303 | # wc = wordcloud.WordCloud(font_path=font_path, mask=back_color, width=1200,height=800,min_font_size=10,max_font_size=66,max_words=1000,background_color="white") 304 | # wc.generate_from_frequencies(data) 305 | # wc.to_file(config.pic_wc_input_url_path) 306 | # Declare objects, render pictures 307 | c = ( 308 | WordCloud() 309 | .add(series_name="", data_pair=data_pair, word_size_range=[10,66],word_gap=10, 310 | shape="cicle", width="1200", height="800",) 311 | .set_global_opts( 312 | title_opts=opts.TitleOpts(title="WordCloud Chart\n", pos_left="center", 313 | title_textstyle_opts=opts.TextStyleOpts(font_size=25)), 314 | tooltip_opts=opts.TooltipOpts(is_show=True), 315 | ) 316 | ) 317 | # generate wordcloud picture 318 | rand_filename = generate_random_filename('wordcloud', '.png') 319 | pic_save_path = os.path.join(config.image_dir, rand_filename) 320 | save_to_file(config.pic_wc_input_url_save_path, pic_save_path) 321 | make_snapshot(snapshot, c.render(), pic_save_path, is_remove_html=True) 322 | return c 323 | 324 | 325 | 326 | def wordcloud_text_by_input_base(): 327 | """ 328 | It is used to respond to requests for chart parameters. 329 | 330 | Returns 331 | ------- 332 | c : TYPE-echarts parameters 333 | return echarts parameters. 
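        Besides returning the chart object, the body below also renders the word
        cloud to a PNG with a randomly generated file name under config.image_dir
        (via snapshot_selenium) and records that path in
        config.pic_wc_input_text_save_path, which pic_rt_user_input_text_base()
        above reads back for the frontend.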
334 | 335 | """ 336 | 337 | # data query 338 | data = text_wordfreq_by_input_query() 339 | data_pair = get_sorted_dict(data, topn=1000) 340 | # generate wordcloud picture 341 | # font_path = r'C:\Windows\Fonts\simhei.ttf' 342 | # back_color = imread(config.bg_pic) 343 | # wc = wordcloud.WordCloud(font_path=font_path, mask=back_color, width=1200,height=800,min_font_size=10,max_font_size=66,max_words=1000,background_color="white") 344 | # wc.generate_from_frequencies(data) 345 | # wc.to_file(config.pic_wc_input_text_path) 346 | # Declare objects, render pictures 347 | c = ( 348 | WordCloud() 349 | .add(series_name="", data_pair=data_pair, word_size_range=[10,66], word_gap=8, 350 | shape="cicle", width="1200", height="800",is_draw_out_of_bound=False,) 351 | .set_global_opts( 352 | title_opts=opts.TitleOpts(title="WordCloud Chart\n", pos_left="center", 353 | title_textstyle_opts=opts.TextStyleOpts(font_size=25)), 354 | tooltip_opts=opts.TooltipOpts(is_show=True), 355 | ) 356 | ) 357 | # generate wordcloud picture 358 | rand_filename = generate_random_filename('wordcloud', '.png') 359 | pic_save_path = os.path.join(config.image_dir, rand_filename) 360 | save_to_file(config.pic_wc_input_text_save_path, pic_save_path) 361 | make_snapshot(snapshot, c.render(), pic_save_path, is_remove_html=True) 362 | return c 363 | 364 | 365 | def wordcloud_text_by_import_file_base(): 366 | """ 367 | It is used to respond to requests for chart parameters. 368 | 369 | Returns 370 | ------- 371 | c : TYPE-echarts parameters 372 | return echarts parameters. 373 | 374 | """ 375 | 376 | # data query 377 | data = text_wordfreq_by_import_file_query() 378 | data_pair = get_sorted_dict(data, topn=1000) 379 | # generate wordcloud picture 380 | # font_path = r'C:\Windows\Fonts\simhei.ttf' 381 | # back_color = imread(config.bg_pic) 382 | # wc = wordcloud.WordCloud(font_path=font_path, mask=back_color, width=1200,height=800,min_font_size=10,max_font_size=66,max_words=1000,background_color="white") 383 | # wc.generate_from_frequencies(data) 384 | # wc.to_file(config.pic_wc_input_file_path) 385 | # Declare objects, render pictures 386 | c = ( 387 | WordCloud() 388 | .add(series_name="", data_pair=data_pair, word_size_range=[10,66], word_gap=8, 389 | shape="cicle", width="1200", height="800",is_draw_out_of_bound=False,) 390 | .set_global_opts( 391 | title_opts=opts.TitleOpts(title="WordCloud Chart\n", pos_left="center", 392 | title_textstyle_opts=opts.TextStyleOpts(font_size=25)), 393 | tooltip_opts=opts.TooltipOpts(is_show=True), 394 | ) 395 | ) 396 | # generate wordcloud picture 397 | rand_filename = generate_random_filename('wordcloud', '.png') 398 | pic_save_path = os.path.join(config.image_dir, rand_filename) 399 | save_to_file(config.pic_wc_input_file_save_path, pic_save_path) 400 | make_snapshot(snapshot, c.render(), pic_save_path, is_remove_html=True) 401 | return c 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | -------------------------------------------------------------------------------- /src/exe/exe_03.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Nov 6 10:01:58 2020 4 | 5 | @author: Xu 6 | 7 | 3.文本预处理 8 | 3.1 关键信息提取--单文本分析--关键词提取: 9 | 基于TextRank的算法的单文本摘要提取与关键词抽取。 10 | 11 | 3.2 多文本分析--主题分布: 12 | 基于LDA的多文档主题分布探索。 13 | 14 | 3.3 新词挖掘: 15 | 基于统计信息的新词挖掘: 16 | 由语料的N-gram片段建立Trie树和逆序Trie树 17 | 由Trie树计算片段的出现频次、PMI和左邻字熵 18 | 由逆序Trie树计算片段的右邻字熵 19 | 计算片段成词的得分 20 
| 21 | 22 | 23 | 负责响应数据查询请求,调用数据逻辑程序。 24 | 基于数据逻辑查询结果,业务逻辑程序组装出文本关键信息并返回给前端页面。 25 | 26 | 27 | """ 28 | import __init__ 29 | import os 30 | import re 31 | from src import config 32 | from newspaper import Article 33 | from key_info_extraction.keywords_textrank import TextRank 34 | from key_info_extraction.abstract_textrank import AbstarctTextrank 35 | from key_info_extraction.topic_cluster_lda import lda_model 36 | from worddiscovery.entropy_based import EntropyBasedWorddiscovery 37 | 38 | 39 | # 3. 文本预处理--- Part 3.1 关键词提取--单文本分析 40 | # 数据逻辑: 41 | def save_to_file(filepath, content): 42 | """ 43 | Write the text to the local file. 44 | 45 | Parameters 46 | ---------- 47 | filepath : TYPE-str 48 | DESCRIPTION: the file save path. 49 | 50 | Returns 51 | ------- 52 | content : TYPE-str 53 | DESCRIPTION: the text. 54 | 55 | """ 56 | f = open(filepath, 'w', encoding='utf-8') 57 | f.write(content) 58 | f.close() 59 | 60 | 61 | def read_file(filepath): 62 | """ 63 | Read the local file and transform to text. 64 | 65 | Parameters 66 | ---------- 67 | filepath : TYPE-str 68 | DESCRIPTION: the text file path. 69 | 70 | Returns 71 | ------- 72 | content : TYPE-str 73 | DESCRIPTION:The preprocessed news text. 74 | 75 | """ 76 | f = open(filepath,'r',encoding='utf-8') 77 | content = f.read() 78 | f.close() 79 | return content 80 | 81 | 82 | def get_webcontent(url): 83 | """ 84 | Online mode: According to the URL, grab the text content of the news. 85 | 86 | Parameters 87 | ---------- 88 | url : TYPE-str 89 | DESCRIPTION: news online URL. 90 | 91 | Returns 92 | ------- 93 | content : TYPE-str 94 | DESCRIPTION:The preprocessed news text. 95 | 96 | """ 97 | news = Article(url, language='zh') 98 | news.download() 99 | news.parse() 100 | content = news.text 101 | return content 102 | 103 | 104 | def get_abstract(text): 105 | """ 106 | Use Textrank algorithm to extract text summaries/abstract. 107 | 108 | Parameters 109 | ---------- 110 | text : TYPE-str 111 | DESCRIPTION: the text content to be extracted. 112 | 113 | Returns 114 | ------- 115 | abstract : TYPE-str 116 | DESCRIPTION: the abstract extracted from text. 117 | 118 | """ 119 | abstracter = AbstarctTextrank() 120 | keysentences = abstracter.extract_abstract(text, 3) 121 | abstract = [] 122 | for sent in keysentences: 123 | abstract.append(sent[0]) 124 | return abstract 125 | 126 | def get_keywords(text): 127 | """ 128 | Use Textrank algorithm to extract text keywords. 129 | 130 | Parameters 131 | ---------- 132 | text : TYPE-str 133 | DESCRIPTION: the text content to be extracted. 134 | 135 | Returns 136 | ------- 137 | words : TYPE-str 138 | DESCRIPTION: the keywords extracted from text. 139 | 140 | """ 141 | keywords_textanker = TextRank() 142 | keywords = keywords_textanker.extract_keywords(text, 10) 143 | words = [] 144 | for word in keywords: 145 | words.append(word[0]) 146 | return words 147 | 148 | 149 | 150 | 151 | def keyinfo_by_url_query(): 152 | """ 153 | According to the user's input to specify the URL, the text of the URL is collected, 154 | and an abstract and keywords are automatically generated. 155 | 156 | Returns 157 | ------- 158 | abstract:TYPE-strs 159 | DESCRIPTION: the abstract extracted from text. 160 | keywords:TYPE-strs 161 | DESCRIPTION: the keywords extracted from text. 
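        The same result is also written to
        config.download_keyinfo_input_url_save_path as plain text, with the
        abstract placed after a '摘要:' header and the keywords after a
        '关键词:' header (see the body below), presumably so the frontend can
        offer it as a download.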
162 | 163 | """ 164 | url = read_file(config.keyinfo_input_url_path) 165 | content = get_webcontent(url) 166 | abstract = get_abstract(content) 167 | keywords = get_keywords(content) 168 | abstract = '。 '.join(abstract) + '。' 169 | keywords = ', '.join(keywords) 170 | wr_to_file = '摘要:\n' + abstract + '\n关键词:\n' + keywords 171 | save_to_file(config.download_keyinfo_input_url_save_path, wr_to_file) 172 | return abstract, keywords 173 | 174 | 175 | def keyinfo_by_input_text_query(): 176 | """ 177 | According to the text input by the user, an abstract and keywords 178 | are automatically generated. 179 | 180 | Returns 181 | ------- 182 | abstract:TYPE-strs 183 | DESCRIPTION: the abstract extracted from text. 184 | keywords:TYPE-strs 185 | DESCRIPTION: the keywords extracted from text. 186 | 187 | """ 188 | input_text = read_file(config.keyinfo_input_text_path) 189 | abstract = get_abstract(input_text) 190 | keywords = get_keywords(input_text) 191 | abstract = '。 '.join(abstract) + '。' 192 | keywords = ', '.join(keywords) 193 | wr_to_file = '摘要:\n' + abstract + '\n关键词:\n' + keywords 194 | save_to_file(config.download_keyinfo_input_text_save_path, wr_to_file) 195 | return abstract, keywords 196 | 197 | 198 | def keyinfo_by_import_file_query(): 199 | """ 200 | According to the local file imported by the user, an abstract and keywords 201 | are automatically generated. 202 | 203 | Returns 204 | ------- 205 | abstract:TYPE-strs 206 | DESCRIPTION: the abstract extracted from text. 207 | keywords:TYPE-strs 208 | DESCRIPTION: the keywords extracted from text. 209 | 210 | """ 211 | path = read_file(config.keyinfo_input_file_save_path).strip() 212 | content = read_file(path) 213 | abstract = get_abstract(content) 214 | keywords = get_keywords(content) 215 | abstract = '。 '.join(abstract) + '。' 216 | keywords = ', '.join(keywords) 217 | wr_to_file = '摘要:\n' + abstract + '\n关键词:\n' + keywords 218 | save_to_file(config.download_keyinfo_input_file_save_path, wr_to_file) 219 | return abstract, keywords 220 | 221 | 222 | 223 | 224 | # 业务逻辑: 225 | 226 | def rt_keyinfo_url_base(): 227 | """ 228 | It is used to return the requested real-time data. 229 | 230 | Returns 231 | ------- 232 | curinput : TYPE-dictionary 233 | return the frontend requested real-time data. 234 | 235 | """ 236 | url = read_file(config.keyinfo_input_url_path) 237 | abstract, keywords = keyinfo_by_url_query() 238 | curinput = {'url': url, 'abstract': abstract, 'keywords': keywords} 239 | return curinput 240 | 241 | 242 | 243 | 244 | def rt_keyinfo_input_text_base(): 245 | """ 246 | It is used to return the requested real-time data. 247 | 248 | Returns 249 | ------- 250 | curinput : TYPE-dictionary 251 | return the frontend requested real-time data. 252 | 253 | """ 254 | input_text = read_file(config.keyinfo_input_text_path) 255 | abstract, keywords = keyinfo_by_input_text_query() 256 | curinput = {'input_text':input_text, 'abstract': abstract, 'keywords': keywords } 257 | return curinput 258 | 259 | def download_rt_keyinfo_import_file_base(): 260 | path = read_file(config.keyinfo_input_file_save_path).strip() 261 | file_dir, filename = os.path.split(path) 262 | return file_dir, filename 263 | 264 | 265 | def rt_keyinfo_import_file_base(): 266 | """ 267 | It is used to return the requested real-time data. 268 | 269 | Returns 270 | ------- 271 | curinput : TYPE-dictionary 272 | return the frontend requested real-time data. 
273 | 274 | """ 275 | path = read_file(config.keyinfo_input_file_save_path).strip() 276 | filename = os.path.split(path)[-1] 277 | abstract, keywords = keyinfo_by_import_file_query() 278 | curinput = {'filename':filename, 'abstract': abstract, 'keywords': keywords} 279 | return curinput 280 | 281 | 282 | 283 | # 3. 文本预处理--- Part 3.2 主题分析——多文本分析 284 | # 数据逻辑: 285 | def lda_topics_query(): 286 | """ 287 | It is used to get the optimal number of topics and save the topic keywords and 288 | topic distribution of documents to a file based on the file imported by the user. 289 | 290 | Returns 291 | ------- 292 | num_topics: type-integer 293 | return the number of topics. 294 | 295 | 296 | """ 297 | # data prepare 298 | filepath = read_file(config.topic_input_file_save_path).strip() 299 | f = open(filepath, 'r', encoding='utf-8') 300 | content = f.readlines() 301 | f.close() 302 | data = [text for text in content if len(re.sub(r'\s','',text))>5] 303 | # get optimal number of topics 304 | # write topic keywords to file 305 | num_topics, output_topic_keywords, output_topic_dist = lda_model(data, config.download_topic_input_file_save_path) 306 | return num_topics, output_topic_keywords, output_topic_dist 307 | 308 | 309 | 310 | # 业务逻辑: 311 | def rt_topic_import_file_base(): 312 | """ 313 | It is used to return the requested real-time data. 314 | 315 | Returns 316 | ------- 317 | curinput : TYPE-dictionary 318 | return the frontend requested real-time data. 319 | 320 | """ 321 | path = read_file(config.topic_input_file_save_path).strip() 322 | filename = os.path.split(path)[-1] 323 | num_topics, topic_keywords, topic_dist = lda_topics_query() 324 | curinput = {'filename':filename, 'num_topics':num_topics, 'topic_keywords': topic_keywords, 'topic_dist': topic_dist} 325 | return curinput 326 | 327 | 328 | 329 | # 3. 文本关键信息提取--- Part 3.3 新词挖掘 330 | # 数据逻辑: 331 | def new_word_discovery_query(): 332 | """ 333 | It is used to get the user input: source text, maximum number of new words and maximum of new word length. 334 | 335 | Returns 336 | ------- 337 | new_words: type-list 338 | return the list of new words. 339 | 340 | 341 | """ 342 | # data prepare 343 | content = read_file(config.new_word_discovery_file_save_path).split('\n, ') 344 | text, word_count, max_word_len = content[0].strip(),int(content[1].strip()),int(content[2].strip()) 345 | # word discovery 346 | discovery = EntropyBasedWorddiscovery(word_max_len=max_word_len) 347 | discovery.parse(text, debug=True) 348 | new_words = discovery.get_new_words(word_count) 349 | new_words = [str(i+1) + ' ' + new_words[i] for i in range(len(new_words))] 350 | new_words = '\n'.join(new_words) 351 | save_to_file(config.download_new_word_output_file_save_path, new_words) 352 | return text, word_count, max_word_len, new_words 353 | 354 | 355 | # 业务逻辑: 356 | def rt_new_word_discovery_base(): 357 | """ 358 | It is used to return the requested real-time data. 359 | 360 | Returns 361 | ------- 362 | curinput : TYPE-dictionary 363 | return the frontend requested real-time data. 
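        Note on the expected input format: new_word_discovery_query() above reads
        the saved input file and splits it into the source text, the number of new
        words to return, and the maximum word length (in that order), using a
        newline followed by ', ' as the separator. The dictionary returned here
        has the keys 'inputtext', 'wordcount', 'wordlength' and 'findwords'.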
364 | 365 | """ 366 | text, word_count, max_word_len, new_words= new_word_discovery_query() 367 | curinput = {'inputtext':text, 'wordcount':word_count, 'wordlength': max_word_len, 'findwords': new_words} 368 | return curinput 369 | 370 | def download_rt_new_word_discovery_base(): 371 | file_dir, filename = os.path.split(config.download_new_word_output_file_save_path) 372 | return file_dir, filename 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | -------------------------------------------------------------------------------- /src/exe/exe_05.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Nov 12 12:25:50 2020 4 | 5 | @author: Xu 6 | 7 | 8 | 5.用户评价情感分析 9 | 10 | 根据 11 | 12 | 13 | 负责响应数据查询请求,调用数据逻辑程序。 14 | 基于数据逻辑查询结果,业务逻辑程序组装出用户评价文本的分析数据并返回给前端页面。 15 | 16 | """ 17 | import __init__ 18 | import os 19 | import pandas as pd 20 | from src import config 21 | from sentiment_analysis.review_sentiment_analysis import review_summary 22 | 23 | 24 | # 5.用户评价情感分析 25 | # 数据逻辑: 26 | def save_to_file(filepath, content): 27 | """ 28 | Write the text to the local file. 29 | 30 | Parameters 31 | ---------- 32 | filepath : TYPE-str 33 | DESCRIPTION: the file save path. 34 | 35 | Returns 36 | ------- 37 | content : TYPE-str 38 | DESCRIPTION: the text. 39 | 40 | """ 41 | f = open(filepath, 'w', encoding='utf-8') 42 | f.write(content) 43 | f.close() 44 | 45 | def read_file(filepath): 46 | """ 47 | Read the local file and transform to text. 48 | 49 | Parameters 50 | ---------- 51 | filepath : TYPE-str 52 | DESCRIPTION: the text file path. 53 | 54 | Returns 55 | ------- 56 | content : TYPE-str 57 | DESCRIPTION:The preprocessed news text. 58 | 59 | """ 60 | f = open(filepath,'r',encoding='utf-8') 61 | content = f.read() 62 | f.close() 63 | return content 64 | 65 | def rt_index_query(): 66 | """ 67 | It is used to return the requested real-time data. 68 | 69 | Returns 70 | ------- 71 | curinput : TYPE-dictionary 72 | return the frontend requested real-time data. 73 | 74 | """ 75 | # load data 76 | data = pd.read_csv(config.business_data_path) 77 | data2 = pd.read_csv(config.review_data_path) 78 | # select data 79 | product_sum = data.shape[0] 80 | user_sum = len(data2['user_name'].unique()) 81 | review_sum = data2.shape[0] 82 | return product_sum, user_sum, review_sum 83 | 84 | 85 | def review_summary_query(): 86 | """ 87 | According to the query information entered by the user, 88 | the product review summary is returned. 89 | 90 | Returns 91 | ------- 92 | curinput : TYPE-dictionary 93 | return the frontend requested real-time data. 94 | 95 | """ 96 | data = pd.read_csv(config.review_data_path) 97 | query_word = read_file(config.user_input_id_name_path).strip() 98 | if query_word.isnumeric()==False: 99 | product_id = data.loc[(data['product_name']==query_word), 'product_id'].values[0] 100 | product_name, product_basic, review_result = review_summary(product_id, config.review_summary_save_path) 101 | else: 102 | product_name, product_basic, review_result = review_summary(int(query_word), config.review_summary_save_path) 103 | return product_name, product_basic, review_result 104 | 105 | 106 | 107 | # 业务逻辑: 108 | 109 | def rt_index_base(): 110 | """ 111 | It is used to get the data of realtime index. 112 | 113 | Returns 114 | ------- 115 | cur : TYPE-dict 116 | realt-time index. 
117 | 118 | """ 119 | product_sum, user_sum, review_sum = rt_index_query() 120 | cur = {"product_sum": product_sum, "user_sum": user_sum, "review_sum": review_sum} 121 | return cur 122 | 123 | 124 | def review_summary_base(): 125 | product_name, product_basic, review_result = review_summary_query() 126 | cur = {'product_name': product_name, 'product_basic': product_basic, 'review_result': review_result} 127 | return cur 128 | 129 | 130 | def download_review_summary_base(): 131 | path = config.review_summary_save_path 132 | file_dir, filename = os.path.split(path) 133 | return file_dir, filename 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /src/exe/exe_06.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Oct 30 09:20:06 2020 4 | 5 | @author: Xu 6 | 7 | 6.用户画像 8 | 9 | 负责响应数据查询请求,调用数据逻辑程序。 10 | 基于数据逻辑查询结果,业务逻辑程序组装出用户画像数据并返回给前端页面。 11 | 12 | """ 13 | 14 | 15 | import __init__ 16 | import re 17 | import os 18 | import pandas as pd 19 | import numpy as np 20 | import collections 21 | from src import config 22 | from pyecharts import options as opts 23 | from pyecharts.charts import Bar, WordCloud, Page, Tab 24 | 25 | # 6. 用户画像分析 26 | # 数据逻辑: 27 | def user_industry_query(): 28 | """ 29 | It is used for data query of user industry. 30 | 31 | Returns 32 | ------- 33 | dataX : TYPE-list 34 | DESCRIPTION:all user industry choices. 35 | dataY : TYPE-list 36 | DESCRIPTION:number of users in different industries. 37 | 38 | """ 39 | # load data 40 | data = pd.read_csv(config.user_data_path) 41 | # select data 42 | dataX = list(collections.Counter(data['user_industry']).keys()) 43 | dataY = list(collections.Counter(data['user_industry']).values()) 44 | remove_nan_indx = dataX.index(np.nan) 45 | dataX.remove(np.nan) 46 | dataY.pop(remove_nan_indx) 47 | return dataX,dataY 48 | 49 | 50 | def user_product_use_time_query(): 51 | """ 52 | It is used for data query of how long users use the product. 53 | 54 | Returns 55 | ------- 56 | dataX : TYPE-list 57 | DESCRIPTION:all product use time choices. 58 | dataY : TYPE-list 59 | DESCRIPTION:number of users in different usage time. 60 | 61 | """ 62 | # load data 63 | data = pd.read_csv(config.user_data_path) 64 | # select data 65 | dataX = list(collections.Counter(data['used_years']).keys()) 66 | dataY = list(collections.Counter(data['used_years']).values()) 67 | remove_nan_indx = dataX.index(np.nan) 68 | dataX.remove(np.nan) 69 | dataY.pop(remove_nan_indx) 70 | return dataX,dataY 71 | 72 | 73 | def user_company_size_query(): 74 | """ 75 | It is used for data query of user company size. 76 | 77 | Returns 78 | ------- 79 | dataX : TYPE-list 80 | DESCRIPTION:all company size. 81 | dataY : TYPE-list 82 | DESCRIPTION:number of users in different companies. 83 | 84 | """ 85 | # load data 86 | data = pd.read_csv(config.user_data_path) 87 | # select data 88 | dataX = list(collections.Counter(data['user_company_size']).keys()) 89 | dataY = list(collections.Counter(data['user_company_size']).values()) 90 | remove_nan_indx = dataX.index(np.nan) 91 | dataX.remove(np.nan) 92 | dataY.pop(remove_nan_indx) 93 | return dataX,dataY 94 | 95 | 96 | 97 | def user_job_query(): 98 | """ 99 | It is used for data query of user job title. 100 | 101 | Returns 102 | ------- 103 | dataX : TYPE-list 104 | DESCRIPTION:all jobs. 105 | dataY : TYPE-list 106 | DESCRIPTION:number of users in different jobs. 
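        NaN entries (reviews with no job title recorded) are dropped from both
        lists before returning, mirroring the other user_*_query functions above.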
107 | 108 | """ 109 | # load data 110 | data = pd.read_csv(config.user_data_path) 111 | # select data 112 | dataX = list(collections.Counter(data['user_job_title']).keys()) 113 | dataY = list(collections.Counter(data['user_job_title']).values()) 114 | remove_nan_indx = dataX.index(np.nan) 115 | dataX.remove(np.nan) 116 | dataY.pop(remove_nan_indx) 117 | return dataX,dataY 118 | 119 | def user_job_wordcloud_freq_query(): 120 | """ 121 | It is used to count the frequency of words for user job title. 122 | 123 | Returns 124 | ------- 125 | data_pair : TYPE-list(tuple) 126 | DESCRIPTION: Count the frequency of words,like [(word1, count1), (word2, count2),...]. 127 | 128 | """ 129 | 130 | # load data 131 | dataX, dataY = user_job_query() 132 | # select data 133 | token = [ re.split(r'\W+',job) for job in dataX] 134 | vocab = {} 135 | for i in range(len(token)): 136 | for word in token[i]: 137 | if word not in vocab: 138 | vocab[word] = dataY[i] 139 | else: 140 | vocab[word] += dataY[i] 141 | data_pair = list(vocab.items()) 142 | return data_pair 143 | 144 | 145 | 146 | # 业务逻辑: 147 | def hist_user_industry_base(): 148 | """ 149 | It is used to respond to requests for chart parameters. 150 | 151 | Returns 152 | ------- 153 | c : TYPE-echarts parameters 154 | return echarts parameters. 155 | 156 | """ 157 | 158 | # data query 159 | dataX, dataY = user_industry_query() 160 | # Declare objects, render pictures 161 | c = ( 162 | Bar() 163 | .add_xaxis(dataX) 164 | .add_yaxis("Industry", dataY,color='#b6c2ff') 165 | .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"), 166 | yaxis_opts=opts.AxisOpts(type_="value", name="number of users"), 167 | title_opts=opts.TitleOpts(title="User Industry Distribution",pos_left="center"), 168 | legend_opts=opts.LegendOpts(pos_left="80%",), 169 | datazoom_opts=[opts.DataZoomOpts()], 170 | ) 171 | 172 | ) 173 | return c 174 | 175 | 176 | def hist_user_company_size_base(): 177 | """ 178 | It is used to respond to requests for chart parameters. 179 | 180 | Returns 181 | ------- 182 | c : TYPE-echarts parameters 183 | return echarts parameters. 184 | 185 | """ 186 | 187 | # data query 188 | dataX, dataY = user_company_size_query() 189 | # Declare objects, render pictures 190 | c = ( 191 | Bar() 192 | .add_xaxis(dataX) 193 | .add_yaxis("Company Size", dataY, color='#ed7c2f') 194 | .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"), 195 | yaxis_opts=opts.AxisOpts(type_="value", name="number of users"), 196 | title_opts=opts.TitleOpts(title="User Company Size Distribution", pos_left="center"), 197 | legend_opts=opts.LegendOpts(pos_left="70%",pos_top="5%"), 198 | ) 199 | 200 | ) 201 | return c 202 | 203 | 204 | def hist_user_product_use_time_base(): 205 | """ 206 | It is used to respond to requests for chart parameters. 207 | 208 | Returns 209 | ------- 210 | c : TYPE-echarts parameters 211 | return echarts parameters. 
212 | 213 | """ 214 | 215 | # data query 216 | dataX, dataY = user_product_use_time_query() 217 | # Declare objects, render pictures 218 | c = ( 219 | Bar() 220 | .add_xaxis(dataX) 221 | .add_yaxis("Product Usage Time", dataY, color='#28bf7e') 222 | .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"), 223 | yaxis_opts=opts.AxisOpts(type_="value", name="number of users"), 224 | title_opts=opts.TitleOpts(title="Distribution of cumulative product usage time by users",pos_left="center"), 225 | legend_opts=opts.LegendOpts(pos_left="70%",pos_top="5%"), 226 | ) 227 | 228 | ) 229 | return c 230 | 231 | def wordcloud_user_job_base(): 232 | """ 233 | It is used to respond to requests for chart parameters. 234 | 235 | Returns 236 | ------- 237 | c : TYPE-echarts parameters 238 | return echarts parameters. 239 | 240 | """ 241 | 242 | # data query 243 | data_pair = user_job_wordcloud_freq_query() 244 | # Declare objects, render pictures 245 | c = ( 246 | WordCloud() 247 | .add(series_name="Occupation", data_pair=data_pair, word_size_range=[10,100], 248 | shape="cicle", width="1200", height="800",) 249 | .set_global_opts( 250 | title_opts=opts.TitleOpts(title="User Occupation Analysis", pos_left="center", 251 | title_textstyle_opts=opts.TextStyleOpts(font_size=23)), 252 | tooltip_opts=opts.TooltipOpts(is_show=True), 253 | ) 254 | ) 255 | return c 256 | 257 | 258 | def pic_wordcloud_user_jov_base(): 259 | """ 260 | It is used to respond to requests for chart parameters. 261 | 262 | Returns 263 | ------- 264 | image : TYPE-IMAGE Component parameters 265 | DESCRIPTION:IMAGE html parameters. 266 | 267 | """ 268 | 269 | # get picture 270 | from wordcloud import WordCloud 271 | 272 | data_pair = user_job_wordcloud_freq_query() 273 | wc = WordCloud(width=1200,height=800,min_font_size=10,max_font_size=100,font_step=2,max_words=10000,background_color="white") 274 | wc.generate_from_frequencies(dict(data_pair)) 275 | wc.to_file(os.path.join(config.image_dir,"wordcloud_user_job.png")) 276 | # render picture 277 | from pyecharts.components import Image 278 | from pyecharts.options import ComponentTitleOpts 279 | 280 | image = Image() 281 | img_src = (os.path.join(config.image_dir,"wordcloud_user_job.png")) 282 | image.add(src=img_src, 283 | style_opts={"width": "1200px", "height": "800px", "style": "margin-top: 20px"}, 284 | ) 285 | image.set_global_opts(title_opts=ComponentTitleOpts(title="User Occupation Analysis")) 286 | return image 287 | 288 | 289 | 290 | def page_user_analysis_base(): 291 | """ 292 | It is used to respond to requests for chart parameters. 293 | 294 | Returns 295 | ------- 296 | page : TYPE-echarts parameters 297 | return echarts parameters. 298 | 299 | """ 300 | page = Page(interval=10, layout=Page.SimplePageLayout) 301 | page.add( 302 | hist_user_industry_base(), 303 | hist_user_company_size_base(), 304 | hist_user_product_use_time_base(), 305 | wordcloud_user_job_base(), 306 | ) 307 | return page 308 | 309 | 310 | def tab_user_analysis_base(): 311 | """ 312 | It is used to respond to requests for chart parameters. 313 | 314 | Returns 315 | ------- 316 | tab : TYPE-echarts parameters 317 | return echarts parameters. 
318 | 319 | """ 320 | tab = Tab() 321 | tab.add(hist_user_industry_base(), "User Industry") 322 | tab.add(hist_user_company_size_base(), "User Company") 323 | tab.add(hist_user_product_use_time_base(), "Product Usage Time") 324 | tab.add(wordcloud_user_job_base(), "User Occupation") 325 | return tab 326 | 327 | 328 | -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 26 21:50:26 2021 4 | 5 | @author: Xu 6 | """ 7 | import sys 8 | import os 9 | curPath = os.path.abspath(os.path.dirname(__file__)) 10 | rootPath = os.path.split(curPath)[0] 11 | sys.path.append(os.path.split(rootPath)[0]) -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/abstract_textrank.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/abstract_textrank.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/abstract_textrank.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/abstract_textrank.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/compute_keywords_tfidf.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/compute_keywords_tfidf.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/create_wordcloud.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/create_wordcloud.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/keywords_textrank.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/keywords_textrank.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/keywords_textrank.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/keywords_textrank.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/sentence_similarity.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/sentence_similarity.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/sentence_similarity.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/sentence_similarity.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/textrank.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/textrank.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/textrank.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/textrank.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/__pycache__/topic_cluster_lda.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/topic_cluster_lda.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/key_info_extraction/abstract_textrank.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 11 11:08:56 2020 4 | 5 | @author: Xu 6 | """ 7 | import __init__ 8 | from collections import defaultdict 9 | from jieba import posseg as pseg 10 | from key_info_extraction.textrank import textrank_graph 11 | from key_info_extraction.sentence_similarity import SimilarityCompute 12 | import re 13 | from key_info_extraction.create_wordcloud import CreateWordCloud 14 | 15 | class AbstarctTextrank: 16 | def __init__(self): 17 | self.span = 3 18 | self.similer = SimilarityCompute() 19 | self.sim_score = 0.5 #句子相似度阈值,用于构建句子之间的边 20 | 21 | def sentence_split(self, text): 22 | sentence_dict = {} 23 | sentences = [sentence.strip() 
for sentence in re.split(r'[?!。;;\n\r]', text) if sentence] 24 | for index, sentence in enumerate(sentences): 25 | sentence_dict[index] = [sentence, [word.word for word in pseg.cut(sentence) if word.flag[0] not in ['x', 'u', 'p', 'w']]] 26 | return sentence_dict 27 | 28 | def extract_abstract(self, text, num_sentences): 29 | sentence_dict = self.sentence_split(text) 30 | g = textrank_graph() 31 | cm = defaultdict(int) 32 | for i, s1 in sentence_dict.items(): 33 | for j, s2 in sentence_dict.items(): 34 | sim_score = self.similer.similarity_cosine(s1[1], s2[1]) 35 | if sim_score >= 0.5: 36 | cm[(s1[0], s2[0])] += 1 37 | for terms, w in cm.items(): 38 | g.addEdge(terms[0], terms[1], w) 39 | nodes_rank = g.rank() 40 | nodes_rank = sorted(nodes_rank.items(), key=lambda asd: asd[1], reverse=True) 41 | return nodes_rank[:num_sentences] 42 | 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/exe/key_info_extraction/compute_keywords_tfidf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 4 10:54:09 2020 4 | 5 | @author: Xu 6 | 7 | 关键词抽取之TFIDF: 8 | 这个部分的任务是对输入文本构建词表,通过计算TF-IDF进行关键词抽取。 9 | 10 | 计算TF-IDF值: 11 | 一般有3种方法来实现: 12 | 1)用gensim库计算TFIDF值, 13 | from gensim import models 14 | tfidf = models.TfidfModel(corpus) 15 | 2)用sklearn库计算TFIDF值, 16 | from sklearn.feature_extraction.text import TfidfVectorizer 17 | tfidf_vec = TfidefVectorizer() 18 | tfidf_matrix = tfidf_vec.fit_transform(corpus) 19 | 3)用python手动实现TFIDF值, 20 | i.对语料进行分词 21 | ii.统计词频 22 | iii.定义计算tfidf函数 23 | iv.计算每个单词的tfidf值 24 | 25 | 实现: 26 | 1.传入词性限制集合:调用词性标注接口,对输入句子进行词性标注,得到分词及对应的词性; 27 | 2.遍历分词结果:如果该词的词性不再词性限制集合中,则跳过; 28 | 如果词的长度小于2,或者词为停用词,则跳过; 29 | 将满足条件的词添加到词频词典中,出现的次数加1; 30 | 3.遍历词频词典,根据idf词典得到每个词的idf值,并除以词频词典中的次数总和,得到每个词的tf*idf值; 31 | 4.根据tf-idf值对词频词典中的词进行降序排序,输出topK个词作为关键词。 32 | 33 | 34 | """ 35 | import __init__ 36 | from src import config 37 | from jieba import posseg 38 | 39 | 40 | class TFIDF: 41 | def __init__(self): 42 | self.idf_file = config.idf_path 43 | self.idf_dict, self.common_idf = self.load_idf() 44 | 45 | def load_idf(self): 46 | idf_dict = {} 47 | for line in open(self.idf_file, 'r', encoding='utf-8').readlines(): 48 | try: 49 | word, freq = line.strip().split(' ') 50 | except: 51 | word, freq = line.strip().split('\t') 52 | idf_dict[word] = float(freq) 53 | common_idf = sum(idf_dict.values())/len(idf_dict) 54 | 55 | return idf_dict, common_idf 56 | 57 | 58 | def build_wordsdict(self, text): 59 | word_dict = {} 60 | candi_words = [] 61 | candi_dict = {} 62 | for word in posseg.cut(text): 63 | if word.flag[0] in ['n', 'v', 'a'] and len(word.word) > 1: 64 | candi_words.append(word.word) 65 | if word.word not in word_dict: 66 | word_dict[word.word] = 1 67 | else: 68 | word_dict[word.word] += 1 69 | count_total = sum(word_dict.values()) 70 | for word, word_count in word_dict.items(): 71 | if word in candi_words: 72 | candi_dict[word] = word_count/count_total 73 | else: 74 | continue 75 | 76 | return candi_dict 77 | 78 | def extract_keywords(self, text, num_keywords): 79 | keywords_dict = {} 80 | candi_dict = self.build_wordsdict(text) 81 | for word, word_tf in candi_dict.items(): 82 | word_idf = self.idf_dict.get(word, self.common_idf) 83 | word_tfidf = word_idf * word_tf 84 | keywords_dict[word] = word_tfidf 85 | keywords_dict = sorted(keywords_dict.items(), key=lambda asd:asd[1], reverse=True) 86 | 87 | return keywords_dict[:num_keywords] 88 | 89 | 90 | 91 | 
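# A minimal usage sketch (assumes config.idf_path points to a valid IDF file
# with one "word frequency" pair per line, as load_idf() above expects; the
# demo text and printed scores are hypothetical):
if __name__ == '__main__':
    tfidf = TFIDF()
    demo_text = '这是一段用于演示关键词抽取的测试文本,包含数据分析与自然语言处理等内容。'
    for word, score in tfidf.extract_keywords(demo_text, 5):
        print(word, round(score, 4))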
92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /src/exe/key_info_extraction/create_wordcloud.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 4 10:24:59 2020 4 | 5 | @author: Xu 6 | 7 | 项目介绍:自动词云生成 8 | 对给定的文本进行关键词和高频词统计并根据指定图片形状来生成词云图。 9 | 10 | 11 | 项目功能: 12 | 三种类型的高频词和关键词可视化: 13 | 1. 根据用户输入指定网址,通过采集该网址文本进行处理。 14 | 2. 根据用户输入文本字符串进行处理。 15 | 3. 根据用户输入载入本地文本进行处理,用户将所需要处理文本文件放入text文本夹中,指定文件名进行处理。 16 | 17 | 18 | 项目实现: 19 | 主要通过调用wordcloud这个可视化组件来完成任务。 20 | 21 | 输入用户给定参数: 22 | 1) textfile: 放于text文件夹中, 为用户需要分析的文本 23 | 2) picturefile: 放于background文件夹中, 为用户给定的图片源文件 24 | 3) url: 用户需要进行分析网页文本的url 25 | 4) content: 用户需要分析的文本字符串 26 | 5) save_name: 用户对当前分析目标的命名 27 | 6) word_num: 用户希望展示的词数 28 | 29 | 输出: 在output文件夹下会生成以save_name开头的高频词云图和关键词云图 30 | 31 | """ 32 | 33 | import __init__ 34 | import os 35 | import numpy as np 36 | from PIL import Image 37 | import matplotlib.pyplot as plt 38 | from key_info_extraction.compute_keywords_tfidf import TFIDF 39 | from collections import Counter 40 | from jieba import posseg 41 | import urllib.request 42 | from wordcloud import WordCloud, ImageColorGenerator 43 | from newspaper import Article 44 | 45 | 46 | class CreateWordCloud: 47 | def __init__(self): 48 | cur = 'D:\\NLP\\My projects\\Capterra信息提取\\Key Info Extraction' 49 | self.textdir = os.path.join(cur, 'text') 50 | self.background = os.path.join(cur, 'background') 51 | self.fontpath = os.path.join(cur, 'data\\simhei.ttf') 52 | self.outpath = os.path.join(cur, 'output') 53 | self.pos_filters = ['n', 'v', 'a'] 54 | self.limit_words = 100 55 | self.Keyworder = TFIDF() 56 | return 57 | 58 | '''获取搜索页''' 59 | def get_html(self, url): 60 | headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"} 61 | req = urllib.request.Request(url, headers=headers) 62 | html = urllib.request.urlopen(req).read().decode('utf-8') 63 | return html 64 | 65 | '''读取本地文件进行处理''' 66 | def read_local_file(self, textfile): 67 | textpath = os.path.join(self.textdir, textfile) 68 | content = open(textpath,'r',encoding='utf-8').read() 69 | return content 70 | 71 | '''统计词频''' 72 | def extract_words(self, content): 73 | words = [] 74 | for line in content.split('\n'): 75 | line = line.strip() 76 | if not line: 77 | continue 78 | words += [w.word for w in posseg.cut(line) if w.flag[0] in self.pos_filters and len(w.word) > 1] 79 | word_dict = {i[0]: i[1] for i in Counter(words).most_common()} 80 | return word_dict 81 | 82 | '''抽取关键词''' 83 | def extract_keywords(self, content, words_num=20): 84 | keywords_dict = {} 85 | keywords = self.Keyworder.extract_keywords(content, words_num) 86 | for key in keywords: 87 | word = key[0] 88 | value = int(key[1]*1000) 89 | keywords_dict[word] = value 90 | return keywords_dict 91 | 92 | '''创建关键词云图''' 93 | def show_cloud(self, word_dict, max_words, picturefile, save_name): 94 | self.backimage = os.path.join(self.background, picturefile) 95 | saveimage = os.path.join(self.outpath, save_name + '.jpg') 96 | backgroud_Image = np.array(Image.open(self.backimage)) 97 | plt.figure(figsize=(15,10)) 98 | cloud = WordCloud(font_path=self.fontpath, 99 | background_color='white', 
100 | # width=800, 101 | # height=600, 102 | max_words= max_words, 103 | max_font_size=500, 104 | mask=backgroud_Image, 105 | random_state=50 106 | ) 107 | 108 | word_cloud = cloud.generate_from_frequencies(word_dict) 109 | # img_colors = ImageColorGenerator(backgroud_Image) 110 | # word_cloud.recolor(color_func=img_colors) 111 | plt.imshow(word_cloud) 112 | plt.axis('off') 113 | plt.savefig(saveimage) 114 | # plt.show() 115 | # plt.close() 116 | 117 | 118 | 119 | '''展示关键词云图''' 120 | def show_keywords(self, content, picturefile, words_num=20, save_name = 'test'): 121 | keywords_text = self.extract_keywords(content, words_num) 122 | self.show_cloud(keywords_text, words_num, picturefile, save_name) 123 | return 124 | 125 | '''展示高频词云图''' 126 | def show_topwords(self, content, picturefile, words_num=50, save_name = 'test'): 127 | topwords_text = self.extract_words(content) 128 | self.show_cloud(topwords_text, words_num, picturefile, save_name) 129 | return 130 | 131 | '''在线模式抓取新闻进行既定形状可视化''' 132 | def get_webcontent(self, url): 133 | news = Article(url, language='zh') 134 | news.download() 135 | news.parse() 136 | content = news.text 137 | return content 138 | 139 | 140 | '''根据用户输入url进行处理''' 141 | def show_wordcloud_online(self, url, picturefile, words_num, save_name): 142 | content = self.get_webcontent(url) 143 | self.show_main(content, picturefile, words_num, save_name) 144 | return 145 | 146 | '''根据用户输入文本进行处理''' 147 | def show_wordcloud_input(self, content, picturefile, words_num, save_name): 148 | self.show_main(content, picturefile, words_num, save_name) 149 | return 150 | 151 | '''根据用户输入载入本地文本进行处理''' 152 | def show_wordcloud_offline(self, textfile, picturefile, words_num, save_name): 153 | content = self.read_local_file(textfile) 154 | self.show_main(content, picturefile, words_num, save_name) 155 | return 156 | 157 | '''分别执行绘制关键词和高频词''' 158 | def show_main(self, content, picturefile, words_num, save_name): 159 | name = save_name + '-topwords' 160 | print('正在生成该文本的高频词云图.....') 161 | self.show_topwords(content, picturefile, words_num, name) 162 | print('已完成该文本的高频词云图.....') 163 | print('正在生成该文本的关键词云图.....') 164 | name = save_name + '-keywords' 165 | self.show_keywords(content, picturefile, words_num, name) 166 | print('已完成该文本的关键词云图.....') 167 | 168 | def test(): 169 | print('*'*10 +'根据输入文本进行处理:'+ '*'*10) 170 | with open('text\\test_article.txt', 'r', encoding='utf-8') as f: 171 | content = f.readlines()[0] 172 | print(content[:100]+'...') 173 | picturefile = 'china.jpg' 174 | save_name = 'test' 175 | words_num = 50 176 | handler = CreateWordCloud() 177 | handler.show_wordcloud_input(content, picturefile, words_num, save_name) 178 | f.close() 179 | 180 | print('*'*10 +'根据输入url进行处理:'+ '*'*10) 181 | with open('text\\test_url.txt', 'r', encoding='utf-8') as f1: 182 | url = f1.readlines()[-1].strip() 183 | print(url) 184 | picturefile = 'oval.png' 185 | save_name = 'test1' 186 | words_num = 50 187 | handler = CreateWordCloud() 188 | handler.show_wordcloud_online(url, picturefile, words_num, save_name) 189 | f1.close() 190 | 191 | print('*'*10 +'根据输入载入本地文本进行处理:'+ '*'*10) 192 | textfile = 'test_article.txt' 193 | picturefile = 'profile.png' 194 | save_name = 'test2' 195 | words_num = 50 196 | handler = CreateWordCloud() 197 | print('load text file from {}'.format(os.path.join(handler.textdir, textfile))) 198 | handler.show_wordcloud_offline(textfile, picturefile, words_num, save_name) 199 | 200 | 201 | 202 | 203 | if __name__ == '__main__': 204 | test() 205 | 206 | 
-------------------------------------------------------------------------------- /src/exe/key_info_extraction/keywords_textrank.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 11 11:10:17 2020 4 | 5 | @author: Xu 6 | """ 7 | import __init__ 8 | from collections import defaultdict 9 | from jieba import posseg as pseg 10 | from key_info_extraction.textrank import textrank_graph 11 | 12 | class TextRank: 13 | def __init__(self): 14 | self.candi_pos = ['n', 'v', 'a'] 15 | self.span = 5 16 | 17 | def extract_keywords(self, text, num_keywords): 18 | g = textrank_graph() 19 | cm = defaultdict(int) 20 | word_list = [[word.word, word.flag] for word in pseg.cut(text)] 21 | for i, word in enumerate(word_list): 22 | if word[1][0] in self.candi_pos and len(word[0]) > 1: 23 | for j in range(i + 1, i + self.span): 24 | if j >= len(word_list): 25 | break 26 | if word_list[j][1][0] not in self.candi_pos or len(word_list[j][0]) < 2: 27 | continue 28 | pair = tuple((word[0], word_list[j][0])) 29 | cm[(pair)] += 1 30 | 31 | for terms, w in cm.items(): 32 | g.addEdge(terms[0], terms[1], w) 33 | nodes_rank = g.rank() 34 | nodes_rank = sorted(nodes_rank.items(), key=lambda asd:asd[1], reverse=True) 35 | 36 | return nodes_rank[:num_keywords] 37 | 38 | -------------------------------------------------------------------------------- /src/exe/key_info_extraction/sentence_similarity.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 11 11:06:16 2020 4 | 5 | @author: Xu 6 | """ 7 | import __init__ 8 | from gensim import models 9 | from src import config 10 | import numpy as np 11 | 12 | # import os 13 | # import sys 14 | # curPath = os.path.abspath(os.path.dirname(__file__)) 15 | # rootPath = os.path.split(curPath)[0] 16 | # sys.append(os.path.split(rootPath)[0]) 17 | 18 | 19 | class SimilarityCompute: 20 | def __init__(self): 21 | self.embedding_file = config.token_vector_path 22 | self.model = models.KeyedVectors.load_word2vec_format(self.embedding_file, binary=False) 23 | 24 | def get_wordvector(self, word): 25 | try: 26 | return self.model[word] 27 | except: 28 | return np.zeros(200) 29 | 30 | def similarity_cosine(self, word_list1,word_list2): 31 | simalrity = 0 32 | vector1 = np.zeros(200) 33 | for word in word_list1: 34 | vector1 += self.get_wordvector(word) 35 | 36 | vector1 = vector1/len(word_list1) 37 | vector2 = np.zeros(200) 38 | 39 | for word in word_list2: 40 | vector2 += self.get_wordvector(word) 41 | 42 | vector2 = vector2/len(word_list2) 43 | cos1 = np.sum(vector1*vector2) 44 | cos21 = np.sqrt(sum(vector1**2)) 45 | cos22 = np.sqrt(sum(vector2**2)) 46 | similarity = cos1/float(cos21*cos22) 47 | return similarity 48 | -------------------------------------------------------------------------------- /src/exe/key_info_extraction/textrank.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon May 11 11:05:07 2020 4 | 5 | @author: Xu 6 | """ 7 | import __init__ 8 | from collections import defaultdict 9 | import sys 10 | 11 | 12 | class textrank_graph: 13 | def __init__(self): 14 | self.graph = defaultdict(list) 15 | self.d = 0.85 #d是阻尼系数,一般设置为0.85 16 | self.min_diff = 1e-5 #设定收敛阈值 17 | 18 | #添加节点之间的边 19 | def addEdge(self, start, end, weight): 20 | self.graph[start].append((start, end, weight)) 21 | self.graph[end].append((end, start, 
weight)) 22 | 23 | #节点排序 24 | def rank(self): 25 | #一共有14个节点 26 | print(len(self.graph)) 27 | #默认初始化权重 28 | weight_deault = 1.0 / (len(self.graph) or 1.0) 29 | #nodeweight_dict, 存储节点的权重 30 | nodeweight_dict = defaultdict(float) 31 | #outsum,存储节点的出度权重 32 | outsum_node_dict = defaultdict(float) 33 | #根据图中的边,更新节点权重 34 | for node, out_edge in self.graph.items(): 35 | #是 [('是', '全国', 1), ('是', '调查', 1), ('是', '失业率', 1), ('是', '城镇', 1)] 36 | nodeweight_dict[node] = weight_deault 37 | outsum_node_dict[node] = sum((edge[2] for edge in out_edge), 0.0) 38 | #初始状态下的textrank重要性权重 39 | sorted_keys = sorted(self.graph.keys()) 40 | #设定迭代次数, 41 | step_dict = [0] 42 | for step in range(1, 1000): 43 | for node in sorted_keys: 44 | s = 0 45 | #计算公式:(edge_weight/outsum_node_dict[edge_node])*node_weight[edge_node] 46 | for e in self.graph[node]: 47 | s += e[2] / outsum_node_dict[e[1]] * nodeweight_dict[e[1]] 48 | #计算公式:(1-d) + d*s 49 | nodeweight_dict[node] = (1 - self.d) + self.d * s 50 | step_dict.append(sum(nodeweight_dict.values())) 51 | 52 | if abs(step_dict[step] - step_dict[step - 1]) <= self.min_diff: 53 | break 54 | 55 | #利用Z-score进行权重归一化,也称为离差标准化,是对原始数据的线性变换,使结果值映射到[0 - 1]之间。 56 | #先设定最大值与最小值均为系统存储的最大值和最小值 57 | (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3]) 58 | for w in nodeweight_dict.values(): 59 | if w < min_rank: 60 | min_rank = w 61 | if w > max_rank: 62 | max_rank = w 63 | 64 | for n, w in nodeweight_dict.items(): 65 | nodeweight_dict[n] = (w - min_rank/10.0) / (max_rank - min_rank/10.0) 66 | 67 | return nodeweight_dict 68 | 69 | -------------------------------------------------------------------------------- /src/exe/key_info_extraction/topic_cluster_lda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Nov 7 15:05:43 2020 4 | 5 | @author: Xu 6 | 7 | 8 | 基于Lda模型的多文档主题聚类,输入多篇文档,输出每个主题的关键词与相应文本,可用于主题发现与热点分析. 9 | 10 | """ 11 | import __init__ 12 | import math 13 | import re 14 | import random 15 | import numpy as np 16 | import pandas as pd 17 | import jieba 18 | import config 19 | from gensim import corpora, models,similarities 20 | 21 | 22 | # 数据处理 23 | def data_process(data): 24 | """ 25 | Process the multiple document input: remove non-text characters, remove stop words, word segmentation, etc., 26 | to generate dictionary and corpus vector. 27 | 28 | Parameters: 29 | data: type-list, the contents of multiple documents entered, like [doc1, doc2, ...,docn] 30 | 31 | Return: 32 | dictionary: type-dict, Generate dictionary based on input documents. 33 | corpus: type-iterator, is an iterator that returns the BOW vector 34 | corpus_tfidf: type-array, Calculate the TFIDF value for each feature that appears in the corpus. 
35 | 36 | 37 | """ 38 | # 去掉非文本字符 39 | 40 | data_new = [re.sub(r'[^\u4e00-\u9fa5]+', '', d).strip() for d in data] 41 | # data_new = [re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]", '', d).strip() for d in data] 42 | #分词 43 | data_new = [list(jieba.cut(d)) for d in data_new] 44 | # 去停用词 45 | stopwords = open(config.StopWords_path,'r').read() 46 | stoplist = stopwords.split('\n') 47 | data_new = [[word for word in d if word not in stoplist] for d in data_new ] 48 | # 过滤长度<=1的词 49 | data_new = [[word for word in d if len(word)>1 ] for d in data_new ] 50 | #对文本进行处理,得到文本集合中的词典 51 | dictionary = corpora.Dictionary(data_new) 52 | print('number of docs: ', dictionary.num_docs) 53 | print('number of words: ', dictionary.num_pos) 54 | #利用词典,对文本进行bow表示,生成词袋 55 | corpus = [dictionary.doc2bow(text) for text in data_new] 56 | #利用bow,对文本进行tfidf表示 57 | tfidf = models.TfidfModel(corpus) 58 | corpus_tfidf = tfidf[corpus] 59 | return dictionary,corpus,corpus_tfidf 60 | 61 | 62 | # 计算困惑度 63 | def preplexity(ldamodel,testset,dictionary,size_dictionary,num_topics): 64 | ''' 65 | Calculate the preplexity of a lda-model. 66 | Parameters: 67 | ldamodel: a LDA Model 68 | testset: corpus data 69 | dictionary: vocabulary, like {7822:'deferment', 1841:'circuitry',19202:'fabianism'...} 70 | size_dictionary: type: integer, the size of vocabulary 71 | num_topics: type: integer, number of tipics 72 | ---------- 73 | Return: 74 | prep: type-float, preplexity of a lda-model 75 | 76 | ''' 77 | print('\n') 78 | print('The info of this lda-model: ') 79 | print('num of the testset: %s; size_dictionary: %s; num of topics: %s' %(len(testset),size_dictionary,num_topics)) 80 | prep=0.0 81 | prob_doc_sum=0.0 82 | topic_word_list=[] #store the prabability of topic-word 83 | for topic_id in range(num_topics): 84 | topic_word=ldamodel.show_topic(topic_id,size_dictionary) 85 | dic={} 86 | for word,probability in topic_word: 87 | dic[word]=probability 88 | topic_word_list.append(dic) 89 | doc_topic_list=[] #store the doc-topic tuples:[(0, 0.0006211180124223594),(1, 0.0006211180124223594),...] 90 | for doc in testset: 91 | doc_topic_list.append(ldamodel.get_document_topics(doc,minimum_probability=0)) 92 | testset_word_num=0 93 | for i in range(len(testset)): 94 | prob_doc=0.0 # the probability of the doc 95 | doc=testset[i] 96 | doc_word_num=0 # the number of words in the doc 97 | for word_id,num in doc: #doc.items() if testset is a dic else list 98 | prob_word=0.0 # the probability of word 99 | doc_word_num+=num 100 | word=dictionary[word_id] 101 | for topic_id in range(num_topics): 102 | # calculate p(w): p(w)=sumz(p(z)*p(w|z)) 103 | prob_topic=doc_topic_list[i][topic_id][1] 104 | prob_topic_word=topic_word_list[topic_id][word] 105 | prob_word+=prob_topic*prob_topic_word 106 | prob_doc+=math.log(prob_word) # p(d)=sum(log(p(w))) 107 | prob_doc_sum+=prob_doc 108 | testset_word_num+=doc_word_num 109 | prep=math.exp(-prob_doc_sum/testset_word_num) # perplexity=exp(-sum(p(d))/sum(Nd)) 110 | print('the perplexity of this lda-model is: %s' %prep) 111 | return prep 112 | 113 | # 确定主题个数 114 | def get_best_num_topics(data, max_num_topics): 115 | """ 116 | This is the optimal number of topics obtained through perplexity assessment. 
117 | 118 | Parameters: 119 | data: type: array or dataframe or matrix or list, the raw text data 120 | max_num_topics: type: integer, maximum of number of tipics 121 | 122 | Return: 123 | best_num_topics: best number of topics 124 | 125 | """ 126 | from scipy.signal import argrelextrema 127 | 128 | # random.shuffle(data) 129 | train_data = data[:int(len(data)*0.6)] 130 | val_data = data[int(len(data)*0.6): int(len(data)*0.8)] 131 | test_data = data[int(len(data)*0.8):] 132 | 133 | # 计算模型在训练和验证集上的困惑度 134 | preplexity_list_val=[] 135 | preplexity_list_train=[] 136 | 137 | for k in range(2, max_num_topics+1): 138 | dictionary,corpus,corpus_tfidf = data_process(train_data+val_data) 139 | ldamodel = models.LdaModel(corpus_tfidf,id2word=dictionary,num_topics=k) 140 | preplexity_list_train.append(preplexity(ldamodel,corpus,dictionary,len(dictionary.keys()),k)) 141 | val_corpus = corpus[int(len(data)*0.6):] 142 | preplexity_list_val.append(preplexity(ldamodel,val_corpus,dictionary,len(dictionary.keys()),k)) 143 | 144 | # 选择拐点(最佳主题个数) 145 | y1 = np.array(preplexity_list_val) 146 | y2 = np.array(preplexity_list_train) 147 | best_num_topics = argrelextrema(np.abs(y1-y2), np.less)[0][0] 148 | 149 | return best_num_topics 150 | 151 | 152 | # 使用lda模型,获取主题分布 153 | def lda_model(data, file_path): 154 | # data prepare 155 | dictionary,corpus,corpus_tfidf = data_process(data) 156 | # num of topics 157 | num_topics = get_best_num_topics(data, 20) 158 | # lda model 159 | lda = models.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics) 160 | # save topic keywords to the file 161 | output_topic_keywords = '' 162 | for topic in lda.print_topics(num_topics=num_topics,num_words=25): 163 | keywords = re.sub(r'[\d\.\d+\*]|\"','', topic[1]) 164 | output_topic_keywords += 'Topic ' + str(topic[0]) + '\t' + keywords + '\n' 165 | # topic prediction/classification 166 | output_topic_dist = '' 167 | topics = lda.get_document_topics(corpus) 168 | topics_label = {} 169 | for i in range(20): 170 | topics_label[i] = 'Topic ' + str(i) 171 | # save topic distribution to the file 172 | for i in range(len(corpus)): 173 | dist = str(topics[i]) 174 | output_topic_dist += 'Document ' + str(i+1) + '\t' + dist + '\n' 175 | # save 176 | f = open(file_path, 'w', encoding='utf-8') 177 | f.write('Number of topics' + '\t' + str(num_topics) + '\n') 178 | f.write(output_topic_keywords) 179 | f.write(output_topic_dist) 180 | f.close() 181 | 182 | return num_topics, output_topic_keywords, output_topic_dist 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | -------------------------------------------------------------------------------- /src/exe/review_sentiment/business.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import nltk 4 | import sentiment_model 5 | from sentiment_model import SentimentModel 6 | 7 | 8 | # 自定义打印方法 9 | def print_format(str, a): 10 | print(str + '\n{0}\n'.format(a)) 11 | 12 | 13 | # review.json对应的数据类 14 | class ReviewDataItem(object): 15 | def __init__(self, review_id, user_id, business_id, stars, text): 16 | self.review_id = review_id 17 | self.user_id = user_id 18 | self.business_id = business_id 19 | self.stars = stars 20 | self.text = text 21 | 22 | 23 | # business.json对应的数据类 24 | class BusinessDataItem(object): 25 | def __init__(self, business_id, name, review_count): 26 | self.business_id = business_id 27 | self.name = 
name 28 | self.review_count = review_count 29 | 30 | 31 | class Business(object): 32 | """ 33 | 用来表示跟business相关的变量和函数 34 | """ 35 | 36 | def __init__(self): 37 | # 初始化变量以及函数 38 | # self.aspect_filter = ["salsa"] 39 | self.aspect_filter = [] 40 | self.dic_business_id = {} 41 | self.dic_business_data = {} 42 | print("step1 加载模型==================") 43 | self.sentimentModel = SentimentModel() # 把已经训练好的模型存放在文件里,并导入进来 44 | print("step2 读取数据开始==================") 45 | self.read_data() 46 | print("step2 读取数据结束==================") 47 | 48 | def read_data(self): 49 | 50 | json_file_business_path = './data/business.json' 51 | json_file_review_path = './data/review.json' 52 | 53 | with open(json_file_business_path, 'r', encoding='utf-8') as fin: 54 | for line in fin: 55 | line_contents = json.loads(line) 56 | business_id = line_contents["business_id"] 57 | name = line_contents["name"] 58 | review_count = line_contents["review_count"] 59 | if review_count >= 100: 60 | self.dic_business_id[business_id] = [] 61 | business_DataItem = BusinessDataItem(business_id, name, review_count) 62 | self.dic_business_data[business_id] = business_DataItem 63 | 64 | with open(json_file_review_path, 'r', encoding='utf-8') as fin: 65 | for line in fin: 66 | line_contents = json.loads(line) 67 | business_id = line_contents["business_id"] 68 | if business_id in self.dic_business_id: 69 | review_id = line_contents["review_id"] 70 | user_id = line_contents["user_id"] 71 | stars = line_contents["stars"] 72 | text = line_contents["text"] 73 | review_DataItem = ReviewDataItem(review_id, user_id, business_id, stars, text) 74 | self.dic_business_id[business_id].append(review_DataItem) 75 | 76 | def aspect_based_summary(self, business_id): 77 | """ 78 | 返回一个business的summary. 针对于每一个aspect计算出它的正面负面情感以及TOP reviews. 
79 | 具体细节请看给定的文档。 80 | """ 81 | 82 | aspects_dic = self.extract_aspects(business_id) 83 | # print(aspects_dic) 84 | business_name = self.dic_business_data[business_id].name 85 | # print(business_name) 86 | 87 | pos_aspect_dic = {} 88 | neg_aspect_dic = {} 89 | review_segment_dic = {} 90 | 91 | for aspect, reviews in aspects_dic.items(): 92 | for review in reviews: 93 | review_text = review.text 94 | if review_text == None or str.strip(review_text) == '': 95 | continue 96 | review_segment = self.get_segment(review_text, aspect, aspects_dic) 97 | # 粗略筛选一下 98 | if len(str.strip(review_segment)) > len(aspect) + 3: 99 | # print(review_segment) 100 | key = review.review_id + "_" + aspect 101 | review_segment_dic[key] = review_segment 102 | 103 | score = self.sentimentModel.predict_prob(review_segment) 104 | 105 | if score > 0.75: 106 | if aspect not in pos_aspect_dic: 107 | pos_aspect_dic[aspect] = [] 108 | pos_aspect_dic[aspect].append([key, score]) 109 | else: 110 | if aspect not in neg_aspect_dic: 111 | neg_aspect_dic[aspect] = [] 112 | neg_aspect_dic[aspect].append([key, score]) 113 | 114 | dic_aspect_summary = {} 115 | for aspect, reviews in aspects_dic.items(): 116 | if aspect not in dic_aspect_summary: 117 | dic_aspect_summary[aspect] = {} 118 | 119 | # 算某个aspect的得分 120 | pos_aspect_review_nums = len(pos_aspect_dic[aspect]) 121 | pos_aspect_total_scores = 0 122 | for item in pos_aspect_dic[aspect]: 123 | pos_aspect_total_scores += item[1] 124 | 125 | neg_aspect_review_nums = len(neg_aspect_dic[aspect]) 126 | neg_aspect_total_scores = 0 127 | for item in neg_aspect_dic[aspect]: 128 | neg_aspect_total_scores += item[1] 129 | 130 | aspect_review_nums = pos_aspect_review_nums +neg_aspect_review_nums 131 | aspect_score = (pos_aspect_total_scores + neg_aspect_total_scores) / aspect_review_nums 132 | 133 | dic_aspect_summary[aspect]["rating"] = aspect_score 134 | 135 | # TOP 5 正面 136 | aspects_pos_sorted = sorted(pos_aspect_dic[aspect], key=lambda x: x[1], reverse=True) 137 | aspects_pos_contents = {} 138 | dic_aspect_summary[aspect]["pos"] = [] 139 | for index, item in enumerate(aspects_pos_sorted): 140 | if len(dic_aspect_summary[aspect]["pos"]) >= 5: 141 | break 142 | review_content = review_segment_dic[item[0]] 143 | if review_content not in aspects_pos_contents: 144 | dic_aspect_summary[aspect]["pos"].append(review_content) 145 | aspects_pos_contents[review_content] = None 146 | 147 | # TOP 5 负面 148 | aspects_neg_sorted = sorted(neg_aspect_dic[aspect], key=lambda x: x[1], reverse=False) 149 | aspects_neg_contents = {} 150 | dic_aspect_summary[aspect]["neg"] = [] 151 | for index, item in enumerate(aspects_neg_sorted): 152 | if len(dic_aspect_summary[aspect]["neg"]) >= 5: 153 | break 154 | review_content = review_segment_dic[item[0]] 155 | if review_content not in aspects_neg_contents: 156 | dic_aspect_summary[aspect]["neg"].append(review_content) 157 | aspects_neg_contents[review_content] = None 158 | 159 | all_aspect_scores = 0 160 | for item in dic_aspect_summary.items(): 161 | all_aspect_scores += item[1]["rating"] 162 | 163 | business_rating = all_aspect_scores / len(dic_aspect_summary.items()) 164 | 165 | return {'business_id':business_id, 166 | 'business_name':business_name, 167 | 'business_rating':business_rating, 168 | 'aspect_summary':dic_aspect_summary 169 | } 170 | 171 | def get_segment(self, review_text, aspect, aspects_dic): 172 | 173 | if self.is_review_only_one_aspect(review_text): 174 | return review_text 175 | 176 | cur_aspect_index = review_text.index(aspect) 177 | 
cur_aspect_end_index_begin = cur_aspect_index + len(aspect) 178 | cur_aspect_end_index_end = cur_aspect_end_index_begin 179 | end_pos = len(review_text) - 1 180 | 181 | stop_punct_map = {c: None for c in ',.!?;'} 182 | relation_punct_list = ["and", "when", "but"] 183 | 184 | # next_aspect = self.get_next_aspect(review_text[cur_aspect_end_index_begin:end_pos]) 185 | cur_aspect_des = self.get_cur_aspect_adj(review_text[cur_aspect_end_index_begin:end_pos]) 186 | 187 | while cur_aspect_end_index_end <= end_pos: 188 | # 在标点符号处截取 189 | cur_str = review_text[cur_aspect_end_index_end:min(cur_aspect_end_index_end + 1, end_pos)] 190 | if cur_str in stop_punct_map: 191 | break 192 | 193 | # 在转移符号处截取 194 | cur_strs = review_text[cur_aspect_end_index_begin:cur_aspect_end_index_end] 195 | relation_store = "" 196 | for relation in relation_punct_list: 197 | if relation in cur_strs.lower(): 198 | relation_store = relation 199 | break 200 | 201 | if relation_store != "": 202 | cur_aspect_end_index_end -= len(relation_store) 203 | break 204 | 205 | # 在下一个aspect截取 206 | # if next_aspect != None: 207 | # if next_aspect in aspects_dic and next_aspect in cur_strs: 208 | # cur_aspect_end_index_end -= len(next_aspect) 209 | # break 210 | 211 | # 在aspect最近的形容词截取 212 | if cur_aspect_des != None: 213 | if cur_aspect_des in cur_strs: 214 | break 215 | 216 | cur_aspect_end_index_end += 1 217 | 218 | cur_aspect_end_index_end = min(cur_aspect_end_index_end, end_pos) 219 | return review_text[cur_aspect_index:cur_aspect_end_index_end] 220 | 221 | def get_next_aspect(self, text): 222 | tokens = nltk.word_tokenize(text) 223 | tag_tuples = nltk.pos_tag(tokens) 224 | for (word, tag) in tag_tuples: 225 | if tag == "NN": 226 | return word 227 | return None 228 | 229 | def get_cur_aspect_adj(self, text): 230 | tokens = nltk.word_tokenize(text) 231 | tag_tuples = nltk.pos_tag(tokens) 232 | for (word, tag) in tag_tuples: 233 | if tag == "JJ" or tag == "ADJ": 234 | return word 235 | return None 236 | 237 | def is_review_only_one_aspect(self, review_text): 238 | ''' 239 | 判断评论里面是否只包含一个方面 240 | :param review: 241 | :return: 242 | ''' 243 | 244 | tagged_words = [] 245 | tokens = nltk.word_tokenize(review_text) 246 | tag_tuples = nltk.pos_tag(tokens) 247 | for (word, tag) in tag_tuples: 248 | if tag == "NN": 249 | tagged_words.append(word) 250 | 251 | if len(tagged_words) <= 1: 252 | return True 253 | 254 | return False 255 | 256 | def extract_aspects(self, business_id): 257 | """ 258 | 从一个business的review中抽取aspects 259 | """ 260 | 261 | # print("step3 extract_aspects begin==================") 262 | 263 | if business_id not in self.dic_business_id: 264 | print("business_id not exit") 265 | return None 266 | 267 | review_list = self.dic_business_id[business_id] 268 | aspects_dic = {} 269 | for review_data in review_list: 270 | sentence = review_data.text 271 | if sentence == None or str.strip(sentence) == '': 272 | continue 273 | tagged_words = [] 274 | tokens = nltk.word_tokenize(sentence) 275 | tag_tuples = nltk.pos_tag(tokens) 276 | for (word, tag) in tag_tuples: 277 | if tag == "NN": 278 | # token = {'word': string, 'pos': tag} 279 | # tagged_words.append(word) 280 | if word not in aspects_dic: 281 | aspects_dic[word] = [] 282 | aspects_dic[word].append(review_data) 283 | 284 | # 对字典进行排序 285 | aspects_sorted = sorted(aspects_dic.items(), key=lambda x: len(x[1]), reverse=True) 286 | aspects_dic = {} 287 | for index, item in enumerate(aspects_sorted): 288 | if item[0] in self.aspect_filter: 289 | continue 290 | 291 | if 
len(aspects_dic.items()) < 5: 292 | aspects_dic[item[0]] = item[1] 293 | 294 | # print("step3 extract_aspects end==================") 295 | return aspects_dic 296 | -------------------------------------------------------------------------------- /src/exe/review_sentiment/main.py: -------------------------------------------------------------------------------- 1 | import time 2 | import business 3 | from business import Business 4 | 5 | ''' 6 | 运行前需要从以下百度网盘链接下载 7 | 8 | https://pan.baidu.com/s/1hSFBjQHLhYDw9jPBDNH6pw&shfl=sharepset 9 | 这个路径包含data文件夹下的内容 10 | business.json 11 | glove.6B.100d.txt 12 | review.json 13 | 14 | https://pan.baidu.com/s/1gnANAnoGv5GHQKWwAIVE4Q&shfl=sharepset 15 | 这个路径包含data文件夹下的模型文件 16 | svm_clf.pkl 17 | 18 | 19 | ''' 20 | 21 | 22 | def get_review_summary_for_business(biz_id): 23 | # 获取每一个business的评论总结 24 | return business_module.aspect_based_summary(biz_id) 25 | 26 | def main(): 27 | 28 | bus_ids = ["tstimHoMcYbkSC4eBA1wEg","gnKjwL_1w79qoiV3IC_xQQ"] # 指定几个business ids 29 | 30 | for bus_id in bus_ids: 31 | # print ("Working on biz_id %s" % bus_id) 32 | start = time.time() 33 | 34 | summary = get_review_summary_for_business(bus_id) 35 | 36 | print("\n") 37 | 38 | normal_print_list = ["business_id","business_name","business_rating", "rating"] 39 | for item in summary.items(): 40 | if item[0] in normal_print_list: 41 | print(str(item[0]) + ": " + str(item[1])) 42 | else: 43 | print(str(item[0]) + ": ") 44 | # for content in item[1]: 45 | # print(content) 46 | for data in item[1].items(): 47 | # print(str(data[0]) + ": " + str(data[1])) 48 | print("------------------" + str(data[0]) + "------------------") 49 | for data_1 in data[1].items(): 50 | if data_1[0] in normal_print_list: 51 | print(str(data_1[0]) + ": " + str(data_1[1])) 52 | else: 53 | review_list = [] 54 | for item_1 in data_1[1]: 55 | review_list.append(item_1) 56 | print(str(data_1[0]) + ": " + "; ".join(review_list)) 57 | 58 | if __name__ == "__main__": 59 | business_module = Business() 60 | main() 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /src/exe/review_sentiment/model_training.py: -------------------------------------------------------------------------------- 1 | # 此文件包含模型的训练。 给定数据集,训练出情感分类模型,并把模型文件存放在 model文件夹里。 2 | import json 3 | import numpy as np 4 | import pandas as pd 5 | import business 6 | from sklearn.feature_extraction.text import TfidfVectorizer 7 | from nltk import pos_tag, word_tokenize 8 | # from glove_embedding import GloveEmbedding 9 | from sklearn.model_selection import train_test_split 10 | from sklearn.metrics import roc_auc_score 11 | import matplotlib.pyplot as plt 12 | from sklearn.svm import SVC 13 | import utils 14 | from utils import handle_text 15 | from embedding_manager_cyd import EmbeddingManagerCyd 16 | from embedding_manager_cyd import Embedding_Type 17 | 18 | SEED = 222 19 | np.random.seed(SEED) 20 | 21 | dic_business_id = {} 22 | 23 | def main(): 24 | 25 | embeddingManagerCyd = EmbeddingManagerCyd() 26 | 27 | json_file_business_path = './data/business.json' 28 | json_file_review_path = './data/review.json' 29 | 30 | with open(json_file_business_path, 'r', encoding='utf-8') as fin: 31 | for line in fin: 32 | line_contents = json.loads(line) 33 | business_id = line_contents["business_id"] 34 | name = line_contents["name"] 35 | review_count = line_contents["review_count"] 36 | dic_business_id[business_id] = [] 37 | 38 | tags = [] 39 | reviewList = [] 40 | sentiment = 0 41 | index = 0 42 | 43 | with 
open(json_file_review_path, 'r', encoding='utf-8') as fin: 44 | for line in fin: 45 | line_contents = json.loads(line) 46 | business_id = line_contents["business_id"] 47 | if business_id in dic_business_id: 48 | review_id = line_contents["review_id"] 49 | user_id = line_contents["user_id"] 50 | stars = line_contents["stars"] 51 | text = line_contents["text"] 52 | if stars >= 4: 53 | sentiment = 1 54 | else: 55 | sentiment = 0 56 | tags.append(sentiment) 57 | # reviewList.append([index,text]) 58 | # reviewList.append(handle_text(text)) 59 | reviewList.append(embeddingManagerCyd.getEmbedding(text, Embedding_Type.glove, True, False)) 60 | index += 1 61 | print(index) 62 | # if index >= 20000: 63 | # break 64 | 65 | 66 | # glove_embedding = GloveEmbedding() 67 | # # gloveVectors = [glove_embedding.getSentenceVectorCommon(item[1], isUseAveragePooling=True) for item in tokenizedWords.items()] 68 | # gloveVectors = [glove_embedding.getSentenceVectorCommon(item, isUseAveragePooling=True) for item in 69 | # reviewList] 70 | # features = np.array(gloveVectors, dtype=np.float16) 71 | 72 | features = np.array(reviewList, dtype=np.float16) 73 | tags = np.array(tags) 74 | 75 | classification_svm(features, tags) 76 | # classification_logistic(features, tags) 77 | 78 | # print("step4=================") 79 | 80 | def get_train_test(features,tags,test_size=0.3): 81 | return train_test_split(features, tags, test_size=test_size, random_state=SEED) 82 | 83 | from sklearn.pipeline import make_pipeline 84 | from sklearn.model_selection import GridSearchCV 85 | 86 | from sklearn.externals import joblib #jbolib模块 87 | from sklearn.linear_model import LogisticRegression 88 | from sklearn.model_selection import KFold 89 | 90 | def classification_logistic(features, tags): 91 | xtrain, xtest, ytrain, ytest = get_train_test(features, tags) 92 | 93 | # cross_validator = KFold(n_splits=10, shuffle=False, random_state=None) 94 | 95 | # lr = LogisticRegression(penalty = "l1") 96 | # 97 | # # params = {"penalty":["l1","l2"], 98 | # # "C":[0.1,1.0,10.0,100.0]}, 99 | # 100 | # params = {"C":[100, 120,150]}, 101 | 102 | # grid = GridSearchCV(estimator=lr, param_grid = params) 103 | # grid.fit(xtrain, ytrain) 104 | # print("最优参数为:",grid.best_params_) 105 | # model = grid.best_estimator_ 106 | # predict_value = model.predict(xtest) 107 | # proba_value = model.predict_proba(xtest) 108 | # p = proba_value[:,1] 109 | # print("Logistic=========== ROC-AUC score: %.3f" % roc_auc_score(ytest, p)) 110 | # 111 | # joblib.dump(model, 'model/logistic_clf.pkl') 112 | 113 | model = LogisticRegression(penalty="l1",C = 100, solver='liblinear') 114 | model.fit(xtrain, ytrain) 115 | predict_value = model.predict(xtest) 116 | proba_value = model.predict_proba(xtest) 117 | p = proba_value[:,1] 118 | print("Logistic=========== ROC-AUC score: %.3f" % roc_auc_score(ytest, p)) 119 | joblib.dump(model, 'model/logistic_clf.pkl') 120 | 121 | #100000 0.889 122 | #50000 0.889 123 | 124 | def classification_svm(features, tags): 125 | xtrain, xtest, ytrain, ytest = get_train_test(features, tags) 126 | 127 | # svc = SVC(kernel='rbf',probability = True) 128 | # model = make_pipeline(svc) 129 | # 130 | # # 使用GridSearchCV选择参数 131 | # # param_grid = {'svc__C': [1, 5, 10,0.5,20,30], 132 | # # 'svc__gamma': [0.0001, 0.0005, 0.001, 0.005,0.006,0.007,0.008,0.009,0.01]} 133 | # param_grid = {'svc__C': [1, 5, 10, 15], 134 | # 'svc__gamma': [0.0005, 0.001, 0.005]} 135 | # grid = GridSearchCV(model, param_grid) 136 | # 137 | # grid.fit(xtrain, ytrain) 138 | # 
print("最优参数为:",grid.best_params_) 139 | # model = grid.best_estimator_ 140 | # predict_value = model.predict(xtest) 141 | # proba_value = model.predict_proba(xtest) 142 | # p = proba_value[:,1] 143 | # print("SVM=========== ROC-AUC score: %.3f" % roc_auc_score(ytest, p)) 144 | 145 | model =SVC(kernel='rbf',probability = True, C=20, gamma=0.005) 146 | model.fit(xtrain, ytrain) 147 | proba_value = model.predict_proba(xtest) 148 | p = proba_value[:, 1] 149 | print("SVM=========== ROC-AUC score: %.3f" % roc_auc_score(ytest, p)) 150 | joblib.dump(model, 'model/svm_clf.pkl') 151 | 152 | #20000 0.883 153 | #500000 0.887 154 | 155 | if __name__ == "__main__": 156 | main() 157 | 158 | -------------------------------------------------------------------------------- /src/exe/review_sentiment/sentence.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | class Sentence(object): 5 | 6 | # WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False) 7 | 8 | # LEMMATIZER = WordNetLemmatizer() 9 | 10 | # 针对于每一句话抽取aspects 11 | # ASP_EXTRACTOR = 12 | 13 | def __init__(self): 14 | pass 15 | 16 | 17 | def word_tokenize(self): 18 | pass 19 | 20 | 21 | def pos_tag(self): 22 | pass 23 | 24 | def lemmatize(self): 25 | pass 26 | 27 | def contain_aspect(self): 28 | pass -------------------------------------------------------------------------------- /src/exe/review_sentiment/stopwords.txt: -------------------------------------------------------------------------------- 1 | 'd 2 | 'll 3 | 'm 4 | 're 5 | 's 6 | 't 7 | 've 8 | ZT 9 | ZZ 10 | a 11 | a's 12 | able 13 | about 14 | above 15 | abst 16 | accordance 17 | according 18 | accordingly 19 | across 20 | act 21 | actually 22 | added 23 | adj 24 | adopted 25 | affected 26 | affecting 27 | affects 28 | after 29 | afterwards 30 | again 31 | against 32 | ah 33 | ain't 34 | all 35 | allow 36 | allows 37 | almost 38 | alone 39 | along 40 | already 41 | also 42 | although 43 | always 44 | am 45 | among 46 | amongst 47 | an 48 | and 49 | announce 50 | another 51 | any 52 | anybody 53 | anyhow 54 | anymore 55 | anyone 56 | anything 57 | anyway 58 | anyways 59 | anywhere 60 | apart 61 | apparently 62 | appear 63 | appreciate 64 | appropriate 65 | approximately 66 | are 67 | area 68 | areas 69 | aren 70 | aren't 71 | arent 72 | arise 73 | around 74 | as 75 | aside 76 | ask 77 | asked 78 | asking 79 | asks 80 | associated 81 | at 82 | auth 83 | available 84 | away 85 | awfully 86 | b 87 | back 88 | backed 89 | backing 90 | backs 91 | be 92 | became 93 | because 94 | become 95 | becomes 96 | becoming 97 | been 98 | before 99 | beforehand 100 | began 101 | begin 102 | beginning 103 | beginnings 104 | begins 105 | behind 106 | being 107 | beings 108 | believe 109 | below 110 | beside 111 | besides 112 | best 113 | better 114 | between 115 | beyond 116 | big 117 | biol 118 | both 119 | brief 120 | briefly 121 | but 122 | by 123 | c 124 | c'mon 125 | c's 126 | ca 127 | came 128 | can 129 | can't 130 | cannot 131 | cant 132 | case 133 | cases 134 | cause 135 | causes 136 | certain 137 | certainly 138 | changes 139 | clear 140 | clearly 141 | co 142 | com 143 | come 144 | comes 145 | concerning 146 | consequently 147 | consider 148 | considering 149 | contain 150 | containing 151 | contains 152 | corresponding 153 | could 154 | couldn't 155 | couldnt 156 | course 157 | currently 158 | d 159 | date 160 | definitely 161 | describe 162 | described 163 | despite 164 | did 165 | didn't 166 | differ 167 | different 168 | differently 169 | discuss 
170 | do 171 | does 172 | doesn't 173 | doing 174 | don't 175 | done 176 | down 177 | downed 178 | downing 179 | downs 180 | downwards 181 | due 182 | during 183 | e 184 | each 185 | early 186 | ed 187 | edu 188 | effect 189 | eg 190 | eight 191 | eighty 192 | either 193 | else 194 | elsewhere 195 | end 196 | ended 197 | ending 198 | ends 199 | enough 200 | entirely 201 | especially 202 | et 203 | et-al 204 | etc 205 | even 206 | evenly 207 | ever 208 | every 209 | everybody 210 | everyone 211 | everything 212 | everywhere 213 | ex 214 | exactly 215 | example 216 | except 217 | f 218 | face 219 | faces 220 | fact 221 | facts 222 | far 223 | felt 224 | few 225 | ff 226 | fifth 227 | find 228 | finds 229 | first 230 | five 231 | fix 232 | followed 233 | following 234 | follows 235 | for 236 | former 237 | formerly 238 | forth 239 | found 240 | four 241 | from 242 | full 243 | fully 244 | further 245 | furthered 246 | furthering 247 | furthermore 248 | furthers 249 | g 250 | gave 251 | general 252 | generally 253 | get 254 | gets 255 | getting 256 | give 257 | given 258 | gives 259 | giving 260 | go 261 | goes 262 | going 263 | gone 264 | good 265 | goods 266 | got 267 | gotten 268 | great 269 | greater 270 | greatest 271 | greetings 272 | group 273 | grouped 274 | grouping 275 | groups 276 | h 277 | had 278 | hadn't 279 | happens 280 | hardly 281 | has 282 | hasn't 283 | have 284 | haven't 285 | having 286 | he 287 | he's 288 | hed 289 | hello 290 | help 291 | hence 292 | her 293 | here 294 | here's 295 | hereafter 296 | hereby 297 | herein 298 | heres 299 | hereupon 300 | hers 301 | herself 302 | hes 303 | hi 304 | hid 305 | high 306 | higher 307 | highest 308 | him 309 | himself 310 | his 311 | hither 312 | home 313 | hopefully 314 | how 315 | howbeit 316 | however 317 | hundred 318 | i 319 | i'd 320 | i'll 321 | i'm 322 | i've 323 | id 324 | ie 325 | if 326 | ignored 327 | im 328 | immediate 329 | immediately 330 | importance 331 | important 332 | in 333 | inasmuch 334 | inc 335 | include 336 | indeed 337 | index 338 | indicate 339 | indicated 340 | indicates 341 | information 342 | inner 343 | insofar 344 | instead 345 | interest 346 | interested 347 | interesting 348 | interests 349 | into 350 | invention 351 | inward 352 | is 353 | isn't 354 | it 355 | it'd 356 | it'll 357 | it's 358 | itd 359 | its 360 | itself 361 | j 362 | just 363 | k 364 | keep 365 | keeps 366 | kept 367 | keys 368 | kg 369 | kind 370 | km 371 | knew 372 | know 373 | known 374 | knows 375 | l 376 | large 377 | largely 378 | last 379 | lately 380 | later 381 | latest 382 | latter 383 | latterly 384 | least 385 | less 386 | lest 387 | let 388 | let's 389 | lets 390 | like 391 | liked 392 | likely 393 | line 394 | little 395 | long 396 | longer 397 | longest 398 | look 399 | looking 400 | looks 401 | ltd 402 | m 403 | made 404 | mainly 405 | make 406 | makes 407 | making 408 | man 409 | many 410 | may 411 | maybe 412 | me 413 | mean 414 | means 415 | meantime 416 | meanwhile 417 | member 418 | members 419 | men 420 | merely 421 | mg 422 | might 423 | million 424 | miss 425 | ml 426 | more 427 | moreover 428 | most 429 | mostly 430 | mr 431 | mrs 432 | much 433 | mug 434 | must 435 | my 436 | myself 437 | n 438 | n't 439 | na 440 | name 441 | namely 442 | nay 443 | nd 444 | near 445 | nearly 446 | necessarily 447 | necessary 448 | need 449 | needed 450 | needing 451 | needs 452 | neither 453 | never 454 | nevertheless 455 | new 456 | newer 457 | newest 458 | next 459 | nine 460 | ninety 461 | no 462 | nobody 463 | non 
464 | none 465 | nonetheless 466 | noone 467 | nor 468 | normally 469 | nos 470 | not 471 | noted 472 | nothing 473 | novel 474 | now 475 | nowhere 476 | number 477 | numbers 478 | o 479 | obtain 480 | obtained 481 | obviously 482 | of 483 | off 484 | often 485 | oh 486 | ok 487 | okay 488 | old 489 | older 490 | oldest 491 | omitted 492 | on 493 | once 494 | one 495 | ones 496 | only 497 | onto 498 | open 499 | opened 500 | opening 501 | opens 502 | or 503 | ord 504 | order 505 | ordered 506 | ordering 507 | orders 508 | other 509 | others 510 | otherwise 511 | ought 512 | our 513 | ours 514 | ourselves 515 | out 516 | outside 517 | over 518 | overall 519 | owing 520 | own 521 | p 522 | page 523 | pages 524 | part 525 | parted 526 | particular 527 | particularly 528 | parting 529 | parts 530 | past 531 | per 532 | perhaps 533 | place 534 | placed 535 | places 536 | please 537 | plus 538 | point 539 | pointed 540 | pointing 541 | points 542 | poorly 543 | possible 544 | possibly 545 | potentially 546 | pp 547 | predominantly 548 | present 549 | presented 550 | presenting 551 | presents 552 | presumably 553 | previously 554 | primarily 555 | probably 556 | problem 557 | problems 558 | promptly 559 | proud 560 | provides 561 | put 562 | puts 563 | q 564 | que 565 | quickly 566 | quite 567 | qv 568 | r 569 | ran 570 | rather 571 | rd 572 | re 573 | readily 574 | really 575 | reasonably 576 | recent 577 | recently 578 | ref 579 | refs 580 | regarding 581 | regardless 582 | regards 583 | related 584 | relatively 585 | research 586 | respectively 587 | resulted 588 | resulting 589 | results 590 | right 591 | room 592 | rooms 593 | run 594 | s 595 | said 596 | same 597 | saw 598 | say 599 | saying 600 | says 601 | sec 602 | second 603 | secondly 604 | seconds 605 | section 606 | see 607 | seeing 608 | seem 609 | seemed 610 | seeming 611 | seems 612 | seen 613 | sees 614 | self 615 | selves 616 | sensible 617 | sent 618 | serious 619 | seriously 620 | seven 621 | several 622 | shall 623 | she 624 | she'll 625 | shed 626 | shes 627 | should 628 | shouldn't 629 | show 630 | showed 631 | showing 632 | shown 633 | showns 634 | shows 635 | side 636 | sides 637 | significant 638 | significantly 639 | similar 640 | similarly 641 | since 642 | six 643 | slightly 644 | small 645 | smaller 646 | smallest 647 | so 648 | some 649 | somebody 650 | somehow 651 | someone 652 | somethan 653 | something 654 | sometime 655 | sometimes 656 | somewhat 657 | somewhere 658 | soon 659 | sorry 660 | specifically 661 | specified 662 | specify 663 | specifying 664 | state 665 | states 666 | still 667 | stop 668 | strongly 669 | sub 670 | substantially 671 | successfully 672 | such 673 | sufficiently 674 | suggest 675 | sup 676 | sure 677 | t 678 | t's 679 | take 680 | taken 681 | taking 682 | tell 683 | tends 684 | th 685 | than 686 | thank 687 | thanks 688 | thanx 689 | that 690 | that'll 691 | that's 692 | that've 693 | thats 694 | the 695 | their 696 | theirs 697 | them 698 | themselves 699 | then 700 | thence 701 | there 702 | there'll 703 | there's 704 | there've 705 | thereafter 706 | thereby 707 | thered 708 | therefore 709 | therein 710 | thereof 711 | therere 712 | theres 713 | thereto 714 | thereupon 715 | these 716 | they 717 | they'd 718 | they'll 719 | they're 720 | they've 721 | theyd 722 | theyre 723 | thing 724 | things 725 | think 726 | thinks 727 | third 728 | this 729 | thorough 730 | thoroughly 731 | those 732 | thou 733 | though 734 | thoughh 735 | thought 736 | thoughts 737 | thousand 738 | three 739 
| throug 740 | through 741 | throughout 742 | thru 743 | thus 744 | til 745 | tip 746 | to 747 | today 748 | together 749 | too 750 | took 751 | toward 752 | towards 753 | tried 754 | tries 755 | truly 756 | try 757 | trying 758 | ts 759 | turn 760 | turned 761 | turning 762 | turns 763 | twice 764 | two 765 | u 766 | un 767 | under 768 | unfortunately 769 | unless 770 | unlike 771 | unlikely 772 | until 773 | unto 774 | up 775 | upon 776 | ups 777 | us 778 | use 779 | used 780 | useful 781 | usefully 782 | usefulness 783 | uses 784 | using 785 | usually 786 | uucp 787 | v 788 | value 789 | various 790 | very 791 | via 792 | viz 793 | vol 794 | vols 795 | vs 796 | w 797 | want 798 | wanted 799 | wanting 800 | wants 801 | was 802 | wasn't 803 | way 804 | ways 805 | we 806 | we'd 807 | we'll 808 | we're 809 | we've 810 | wed 811 | welcome 812 | well 813 | wells 814 | went 815 | were 816 | weren't 817 | what 818 | what'll 819 | what's 820 | whatever 821 | whats 822 | when 823 | whence 824 | whenever 825 | where 826 | where's 827 | whereafter 828 | whereas 829 | whereby 830 | wherein 831 | wheres 832 | whereupon 833 | wherever 834 | whether 835 | which 836 | while 837 | whim 838 | whither 839 | who 840 | who'll 841 | who's 842 | whod 843 | whoever 844 | whole 845 | whom 846 | whomever 847 | whos 848 | whose 849 | why 850 | widely 851 | will 852 | willing 853 | wish 854 | with 855 | within 856 | without 857 | won't 858 | wonder 859 | words 860 | work 861 | worked 862 | working 863 | works 864 | world 865 | would 866 | wouldn't 867 | www 868 | x 869 | y 870 | year 871 | years 872 | yes 873 | yet 874 | you 875 | you'd 876 | you'll 877 | you're 878 | you've 879 | youd 880 | young 881 | younger 882 | youngest 883 | your 884 | youre 885 | yours 886 | yourself 887 | yourselves 888 | z 889 | zero 890 | zt 891 | zz 892 | -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jan 26 22:20:08 2021 4 | 5 | @author: Xu 6 | """ 7 | 8 | import sys 9 | import os 10 | curPath = os.path.abspath(os.path.dirname(__file__)) 11 | rootPath = os.path.split(curPath)[0] 12 | sys.path.append(os.path.split(rootPath)[0]) -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/bert_embedding_extend.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/bert_embedding_extend.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/bert_embedding_extend.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/bert_embedding_extend.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/embedding_manager_cyd.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/embedding_manager_cyd.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/embedding_manager_cyd.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/embedding_manager_cyd.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/glove_embedding.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/glove_embedding.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/glove_embedding.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/glove_embedding.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/review_sentiment_analysis.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/review_sentiment_analysis.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/review_sentiment_analysis.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/review_sentiment_analysis.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/sentiment_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/sentiment_model.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/sentiment_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/sentiment_model.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/utils.cpython-36.pyc 
-------------------------------------------------------------------------------- /src/exe/sentiment_analysis/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/bert_embedding_extend.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from bert_embedding import BertEmbedding 5 | import mxnet as mx 6 | import numpy as np 7 | from sentiment_analysis.utils import handle_text 8 | 9 | class BertEmbeddingExtend(object): 10 | def __init__(self): 11 | # self.bert_embed = BertEmbedding(model='bert_12_768_12', ctx = mx.gpu(0)) 12 | self.bert_embed = BertEmbedding(model='bert_12_768_12') 13 | 14 | def getSenetnceEmbedding(self, sentence, isUseAveragePooling, isUseStopwords): 15 | 16 | if isUseStopwords: 17 | new_words_list = handle_text(sentence, isUseStopwords) 18 | if len(new_words_list) == 0: 19 | return np.zeros(768) 20 | sentence = " ".join(new_words_list) 21 | 22 | result = self.bert_embed(sentence.split('\n')) 23 | first_sentence = result[0] 24 | 25 | if first_sentence[1] == None or len(first_sentence[1]) == 0: 26 | return np.zeros(768) 27 | 28 | w_v = np.array(first_sentence[1]) 29 | total_effect_count = w_v.shape[0] 30 | 31 | if isUseAveragePooling: 32 | w_v = np.sum(w_v, axis=0) / total_effect_count 33 | else: 34 | w_v = np.max(w_v, axis=0) 35 | 36 | return w_v 37 | -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/embedding_manager_cyd.py: -------------------------------------------------------------------------------- 1 | from sentiment_analysis.glove_embedding import GloveEmbedding 2 | from sentiment_analysis.bert_embedding_extend import BertEmbeddingExtend 3 | from enum import Enum 4 | 5 | class Embedding_Type(Enum): 6 | glove = 0, 7 | bert = 1 8 | 9 | class EmbeddingManagerCyd(object): 10 | def __init__(self): 11 | self.gloveEmbedding = GloveEmbedding() 12 | self.bertEmbedding = BertEmbeddingExtend() 13 | 14 | def getEmbedding(self, sentence, type, isUseAveragePooling, isUseStopwords): 15 | if type == Embedding_Type.glove: 16 | return self.gloveEmbedding.getSentenceVectorCommon(sentence, isUseAveragePooling, isUseStopwords) 17 | elif type == Embedding_Type.bert: 18 | return self.bertEmbedding.getSenetnceEmbedding(sentence, isUseAveragePooling, isUseStopwords) 19 | 20 | return None 21 | -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/glove_embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sentiment_analysis.utils import handle_text 3 | from src import config 4 | 5 | 6 | class GloveEmbedding(object): 7 | 8 | def __init__(self): 9 | ''' 10 | 初始化函数 11 | ''' 12 | self.embeddings_index = {} 13 | self.embedding_dim_glove = 100 14 | self.init_data() 15 | 16 | def init_data(self): 17 | ''' 18 | 初始化数据 19 | :return: 20 | ''' 21 | glovefile = open(config.glove_embedding_path, "r", encoding="utf-8") 22 | 23 | for line in glovefile: 24 | values = line.split() 25 | word = values[0] 26 | coefs = np.asarray(values[1:], dtype='float16') 27 | self.embeddings_index[word] = coefs 28 | glovefile.close() 29 | 30 | 31 | def 
get_embedding_matrix_glove(self, word): 32 | """ 33 | 获取glove词向量 34 | :param word: 35 | :return: 36 | """ 37 | embedding_vector = self.embeddings_index.get(word) 38 | if embedding_vector is not None: 39 | return embedding_vector[:self.embedding_dim_glove] 40 | return np.zeros(self.embedding_dim_glove) 41 | 42 | def getSentenceVectorCommon(self, sentence, isUseAveragePooling, isUseStopwords): 43 | tokens = handle_text(sentence,isUseStopwords) 44 | total_effect_count = 0 45 | w_v = [] 46 | for word in tokens: 47 | if word in self.embeddings_index: 48 | total_effect_count += 1 49 | w_v.append(self.embeddings_index[word]) 50 | 51 | w_v = np.array(w_v) 52 | 53 | is_effect = total_effect_count > 0 54 | if is_effect: 55 | if isUseAveragePooling: 56 | w_v = np.sum(w_v, axis=0) / total_effect_count 57 | else: 58 | w_v = np.max(w_v, axis=0) 59 | else: 60 | w_v = np.zeros(self.embedding_dim_glove) 61 | 62 | return np.array(w_v) 63 | 64 | 65 | -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/sentiment_model.py: -------------------------------------------------------------------------------- 1 | import sentiment_analysis.glove_embedding 2 | from sentiment_analysis.embedding_manager_cyd import EmbeddingManagerCyd 3 | from sentiment_analysis.embedding_manager_cyd import Embedding_Type 4 | from sklearn.externals import joblib #jbolib模块 5 | from nltk import pos_tag, word_tokenize 6 | import sentiment_analysis.utils 7 | from sentiment_analysis.utils import handle_text 8 | import numpy as np 9 | from src import config 10 | 11 | class SentimentModel(object): 12 | def __init__(self): 13 | self.model = joblib.load(config.svm_model_save_path) 14 | self.embeddingManagerCyd = EmbeddingManagerCyd() 15 | 16 | def predict_prob(self, review_segment): 17 | vectors = [self.embeddingManagerCyd.getEmbedding(review_segment, Embedding_Type.glove, True, False)] 18 | features = np.array(vectors, dtype=np.float16) 19 | proba_value = self.model.predict_proba(features) 20 | score = proba_value[:, 1] 21 | return np.float16(score[0]) 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/exe/sentiment_analysis/utils.py: -------------------------------------------------------------------------------- 1 | from nltk import pos_tag, word_tokenize 2 | from nltk.corpus import stopwords 3 | import config 4 | 5 | stopwords = {line.rstrip().lower(): None for line in open(config.en_stopwords_path)} 6 | 7 | def handle_text(text, isUseStopWords): 8 | if isUseStopWords: 9 | # new_word_list = [word for word in word_tokenize(text) if word not in stopwords.words('english')] 10 | new_word_list = [word for word in word_tokenize(text) if word.lower() not in stopwords] 11 | else: 12 | new_word_list = [word for word in word_tokenize(text)] 13 | 14 | return new_word_list 15 | -------------------------------------------------------------------------------- /src/exe/worddiscovery/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__init__.py -------------------------------------------------------------------------------- /src/exe/worddiscovery/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/worddiscovery/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/worddiscovery/__pycache__/entropy_based.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/entropy_based.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/worddiscovery/__pycache__/trie.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/trie.cpython-36.pyc -------------------------------------------------------------------------------- /src/exe/worddiscovery/__pycache__/trie.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/trie.cpython-37.pyc -------------------------------------------------------------------------------- /src/exe/worddiscovery/entropy_based.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Author: lujiaying93@foxmail.com 3 | # Algorithm source from: http://www.matrix67.com/blog/archives/5044 4 | 5 | from __future__ import division 6 | import os 7 | import sys 8 | cur_dir = os.path.dirname(os.path.realpath(__file__)) 9 | sys.path.append("%s/../" % (cur_dir)) 10 | 11 | import math 12 | import re 13 | import time 14 | from collections import defaultdict 15 | from worddiscovery.trie import CharTrie 16 | 17 | import logging 18 | log_console = logging.StreamHandler(sys.stderr) 19 | default_logger = logging.getLogger(__name__) 20 | default_logger.setLevel(logging.DEBUG) 21 | default_logger.addHandler(log_console) 22 | 23 | MAX_INT = 9223372036854775807 24 | RE_SENTENCE_SEPERATOR = r'[\n\r]\s*' 25 | RE_PUNCTUATION_TO_CLEAN = r'[.:;?!\~,\-_()[\]<>。:;?!~,、——()【】《》#*=+/|‘’“”¥#*=+\\|\'"^$%`]' 26 | 27 | 28 | class EntropyBasedWorddiscovery(object): 29 | def __init__(self, word_max_len=6): 30 | self._trie = CharTrie() 31 | self._trie_reversed = CharTrie() # for left char entropy calculate 32 | self._word_info = defaultdict(dict) 33 | self.word_max_len = word_max_len 34 | 35 | self.WORD_MIN_LEN = 2 36 | self.WORD_MIN_FREQ = 2 37 | self.WORD_MIN_PMI = 6 38 | self.WORD_MIN_NEIGHBOR_ENTROPY = 0 39 | 40 | def clear(self): 41 | self._trie.clear() 42 | self._trie_reversed.clear() 43 | self._word_info = defaultdict(dict) 44 | 45 | def parse_file(self, file_name, debug=False): 46 | with open(file_name) as fopen: 47 | document_text = fopen.read() 48 | self.parse(document_text, debug) 49 | 50 | def parse(self, document_text, debug=False): 51 | self.clear() 52 | sentences = 
self._preprocess(document_text) 53 | self._build_trie(sentences) 54 | self.cal_aggregation(debug) 55 | self.cal_neighbor_char_entropy(debug) 56 | self.cal_score(debug) 57 | 58 | def get_new_words(self, top=20): 59 | default_logger.debug("Start sorting to get new words...") 60 | start_t = time.time() 61 | sorted_word_info = sorted(self._word_info.items(), key=lambda _: _[1]['score_freq'], reverse=True) 62 | default_logger.debug("Get new words, which cost %.3f seconds" % (time.time()-start_t)) 63 | top_new_words = [_[0] for _ in sorted_word_info[:top]] 64 | return top_new_words 65 | 66 | def cal_aggregation(self, debug): 67 | default_logger.debug("Calculating word internal aggregation score...") 68 | start_t = time.time() 69 | for word, count in self._trie.get_all_words(): 70 | if len(word) < self.WORD_MIN_LEN or count < self.WORD_MIN_FREQ: 71 | continue 72 | pmi = self._cal_word_aggregation(word, count) 73 | if debug: 74 | self._word_info[word]['aggreg'] = self._cal_word_aggregation(word, count) 75 | else: 76 | if pmi > self.WORD_MIN_PMI: 77 | self._word_info[word]['aggreg'] = self._cal_word_aggregation(word, count) 78 | default_logger.debug("Internal aggregation has been calculated succesfully, which costs %.3f seconds" % (time.time()-start_t)) 79 | 80 | def cal_neighbor_char_entropy(self, debug): 81 | default_logger.debug("Calculating word neighbor entropy score...") 82 | start_t = time.time() 83 | for word, count in self._trie.get_all_words(): 84 | if len(word) < self.WORD_MIN_LEN or count < self.WORD_MIN_FREQ: 85 | continue 86 | if not debug: 87 | if word not in self._word_info: # to speed up 88 | continue 89 | rc_entropy = self._cal_word_neighbor_char_entropy(self._trie, word) 90 | if not debug: 91 | if rc_entropy <= self.WORD_MIN_NEIGHBOR_ENTROPY: # to speed up 92 | self._word_info.pop(word) 93 | continue 94 | lc_entropy = self._cal_word_neighbor_char_entropy(self._trie_reversed, word[::-1]) 95 | neighbor_entropy = min(rc_entropy, lc_entropy) 96 | if debug: 97 | self._word_info[word]['nbr_entropy'] = neighbor_entropy 98 | self._word_info[word]['rc_entropy'] = rc_entropy 99 | self._word_info[word]['lc_entropy'] = lc_entropy 100 | else: 101 | if neighbor_entropy > self.WORD_MIN_NEIGHBOR_ENTROPY: 102 | self._word_info[word]['nbr_entropy'] = neighbor_entropy 103 | else: 104 | self._word_info.pop(word) 105 | default_logger.debug("Neighbor entropy has been calculated succesfully, which costs %.3f seconds" % (time.time()-start_t)) 106 | 107 | def cal_score(self, debug): 108 | for word, d in self._word_info.items(): 109 | self._word_info[word]['score'] = d['aggreg'] + d['nbr_entropy'] 110 | if debug: 111 | if d['nbr_entropy'] <= self.WORD_MIN_NEIGHBOR_ENTROPY: 112 | self._word_info[word]['score'] = 0.0 113 | self._word_info[word]['score_freq'] = d['score'] * self._trie.find(word) 114 | 115 | def _build_trie(self, sentences): 116 | default_logger.debug("Building trie tree...") 117 | start_t = time.time() 118 | for s in sentences: 119 | for n_grams in range(1, min(self.word_max_len+1, len(s)) + 1): 120 | if len(s) <= n_grams: 121 | self._trie.insert(s) 122 | self._trie_reversed.insert(s[::-1]) 123 | else: 124 | for end_pos in range(n_grams, len(s) + 1): 125 | self._trie.insert(s[end_pos-n_grams:end_pos]) 126 | self._trie_reversed.insert(s[end_pos-n_grams:end_pos][::-1]) 127 | default_logger.debug("Trie tree has been built succesfully, which costs %.3f seconds" % (time.time()-start_t)) 128 | 129 | def _preprocess(self, document_text): 130 | global RE_SENTENCE_SEPERATOR 131 | global 
RE_PUNCTUATION_TO_CLEAN 132 | # split to sentence 133 | sentences = re.split(RE_SENTENCE_SEPERATOR, document_text) 134 | # clean 135 | sentences_clean = [] 136 | for s in sentences: 137 | s = re.sub(RE_PUNCTUATION_TO_CLEAN, '', s) 138 | if not s: 139 | continue 140 | sentences_clean.append(s) 141 | return sentences_clean 142 | 143 | def _cal_word_aggregation(self, word, word_count): 144 | min_aggregation = MAX_INT 145 | for frag1, frag2 in self._generate_word_fragment(word): 146 | frag1_count = self._trie.find(frag1) 147 | frag2_count = self._trie.find(frag2) 148 | aggregation = word_count * self._trie.total_word_count / frag1_count / frag2_count 149 | min_aggregation = min(min_aggregation, aggregation) 150 | return math.log2(min_aggregation) 151 | 152 | def _generate_word_fragment(self, word): 153 | for pos in range(1, len(word)): 154 | yield (word[0:pos], word[pos:len(word)]) 155 | 156 | def _cal_word_neighbor_char_entropy(self, trie_tree, word): 157 | children_count_list = [] 158 | for char, char_count in trie_tree.get_children_char_count(word): 159 | children_count_list.append(char_count) 160 | total_word_count = sum(children_count_list) 161 | entropy = sum(map(lambda c: -(c/total_word_count)*math.log2(c/total_word_count), children_count_list)) 162 | return entropy 163 | 164 | if __name__ == '__main__': 165 | discover = EntropyBasedWorddiscovery(word_max_len=6) 166 | 167 | discover.parse(""" 168 | 自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此,这一领域的研究将涉及自然语言,即人们日常使用的语言,所以它与语言学的研究有着密切的联系,但又有重要的区别。自然语言处理并不是一般地研究自然语言,而在于研制能有效地实现自然语言通信的计算机系统,特别是其中的软件系统。因而它是计算机科学的一部分。 169 | 自然语言处理(NLP)是计算机科学,人工智能,语言学关注计算机和人类(自然)语言之间的相互作用的领域。 170 | """, debug=True) 171 | 172 | #for word, count in discover._trie.get_all_words(): 173 | # print(word, count) 174 | #for node, prefix in discover._trie.traverse(): 175 | # print(node, prefix) 176 | for word, d in discover._word_info.items(): 177 | print(word, d['aggreg'], d['nbr_entropy'], discover._trie.find(word)) 178 | 179 | print('\n'.join(discover.get_new_words(10))) 180 | -------------------------------------------------------------------------------- /src/exe/worddiscovery/test.txt: -------------------------------------------------------------------------------- 1 | 自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此,这一领域的研究将涉及自然语言,即人们日常使用的语言,所以它与语言学的研究有着密切的联系,但又有重要的区别。自然语言处理并不是一般地研究自然语言,而在于研制能有效地实现自然语言通信的计算机系统,特别是其中的软件系统。因而它是计算机科学的一部分。 2 | , 10 3 | , 5 -------------------------------------------------------------------------------- /src/exe/worddiscovery/trie.py: -------------------------------------------------------------------------------- 1 | _SENTINEL = object() 2 | 3 | class _TrieNode(object): 4 | __slots__ = ('children', 'value', 'count') 5 | 6 | def __init__(self): 7 | self.children = {} 8 | self.value = _SENTINEL 9 | self.count = 0 10 | 11 | def __repr__(self): 12 | return '_TrieNode<%s>: value[%s], count[%d]' % (id(self), self.value, self.count) 13 | 14 | 15 | class CharTrie(object): 16 | def __init__(self): 17 | self._root = _TrieNode() 18 | self.total_word_count = 0 19 | 20 | def insert(self, text): 21 | node = self._root 22 | for c in text: 23 | if c not in node.children: 24 | node.children[c] = _TrieNode() 25 | node.children[c].value = c 26 | node = node.children[c] 27 | node.count += 1 28 | self.total_word_count += 1 29 | 30 | def delete(self, text): 31 | pass 32 | 33 | def find(self, text): 34 | """ 35 | Args: 36 | text: string 37 | Returns: 38 | count: 
int, frequent of text 39 | """ 40 | is_in = True 41 | node = self._root 42 | for c in text: 43 | if c not in node.children: 44 | is_in = False 45 | break 46 | node = node.children[c] 47 | if is_in: 48 | return node.count 49 | else: 50 | return -1 51 | 52 | def traverse(self): 53 | Q = [(self._root, '')] 54 | 55 | while Q: 56 | node, prefix = Q.pop(0) 57 | for child in node.children.values(): 58 | yield (child, prefix) 59 | Q.append((child, prefix+child.value)) 60 | 61 | def get_all_words(self): 62 | for node, prefix in self.traverse(): 63 | yield (prefix+node.value, node.count) 64 | 65 | def get_children_char_count(self, text): 66 | """ 67 | function for entropy based word discovery 68 | """ 69 | is_in = True 70 | node = self._root 71 | for c in text: 72 | if c not in node.children: 73 | is_in = False 74 | break 75 | node = node.children[c] 76 | 77 | children = [] 78 | if is_in: 79 | for child in node.children.values(): 80 | children.append((child.value, child.count)) 81 | return children 82 | 83 | def clear(self): 84 | self._root = _TrieNode() 85 | -------------------------------------------------------------------------------- /src/image/wordcloud_62068.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/image/wordcloud_62068.png -------------------------------------------------------------------------------- /src/model/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/abstract_textrank.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/abstract_textrank.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/bert_embedding_extend.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/bert_embedding_extend.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/compute_keywords_tfidf.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/compute_keywords_tfidf.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/create_wordcloud.cpython-36.pyc: -------------------------------------------------------------------------------- 
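
entropy_based.py and trie.py above implement the classic unsupervised new-word discovery scheme credited in the header to matrix67's blog post: every character n-gram up to word_max_len is counted in a CharTrie (plus a reversed trie for left-hand neighbours), and a candidate word is kept only if its internal aggregation, log2 of the PMI-style ratio count(w) * N / (count(left) * count(right)) minimised over all split points, exceeds WORD_MIN_PMI, and the smaller of its left/right neighbour-character entropies exceeds WORD_MIN_NEIGHBOR_ENTROPY; ranking then uses (aggregation + entropy) multiplied by the word's frequency. The standalone sketch below reproduces the two scores by hand for a single candidate; all counts are invented for illustration, whereas the real class reads them from its tries.

```python
# Standalone sketch of the two scores used by EntropyBasedWorddiscovery,
# computed by hand for one candidate word; the counts below are illustrative only.
import math

total_ngram_count = 1000                      # plays the role of trie.total_word_count
count = {"自然": 12, "自": 30, "然": 25}       # n-gram frequencies from the CharTrie

# Internal aggregation: log2 of the PMI-style ratio, minimised over split points
# (the two-character word "自然" has the single split "自" + "然").
aggregation = math.log2(count["自然"] * total_ngram_count /
                        (count["自"] * count["然"]))

# Boundary entropy: -sum p*log2(p) over the characters seen next to the word;
# the discovery code takes min(right entropy, left entropy).
right_neighbor_counts = [6, 3, 2, 1]          # e.g. how often four different chars follow "自然"
total = sum(right_neighbor_counts)
right_entropy = -sum(c / total * math.log2(c / total) for c in right_neighbor_counts)

score = aggregation + right_entropy           # final ranking also multiplies by frequency
print(round(aggregation, 3), round(right_entropy, 3), round(score, 3))
```

End to end, the module is driven exactly as its own `__main__` block shows: `EntropyBasedWorddiscovery(word_max_len=6).parse(text)` followed by `get_new_words(top)`.
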
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/create_wordcloud.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/embedding_manager_cyd.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/embedding_manager_cyd.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/glove_embedding.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/glove_embedding.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/keywords_textrank.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/keywords_textrank.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/review_sentiment_analysis.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/review_sentiment_analysis.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/sentence_similarity.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/sentence_similarity.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/sentiment_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/sentiment_model.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/textrank.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/textrank.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/topic_cluster_lda.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/topic_cluster_lda.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /src/model/logistic_reg_clf_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/logistic_reg_clf_model.pkl -------------------------------------------------------------------------------- /src/model/svm_clf.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/svm_clf.pkl -------------------------------------------------------------------------------- /src/model/svm_clf_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/svm_clf_model.pkl -------------------------------------------------------------------------------- /src/save/keyinfo_from_input_file.txt: -------------------------------------------------------------------------------- 1 | 摘要: 2 | 人工智能技术在疫情防控的各个应用场景中都可发挥重要作用,这些应用场景都能直接为患者或者潜在的患者人群带来切实好处。 另外,这还可避免放射科医生以及临床医生被别人感染,降低他们的安全风险。 人工智能技术在防疫抗疫工作中大显身手 发布时间:2020-02-25 来源:人工智能实验室 近期,新型冠状病毒肺炎(简称“新冠肺炎”)的疫情突如其来,让人们有些措手不及。 3 | 关键词: 4 | 人工智能, 肺炎, 技术, 新冠, 疫情, 诊断, 疫苗, 进行, 医生, 利用 -------------------------------------------------------------------------------- /src/save/keyinfo_from_input_text.txt: -------------------------------------------------------------------------------- 1 | 摘要: 2 | 新小分子先导化合物生成中的 deep learning。 总结 在中国此次抗击疫情过程中,人工智能和大数据发挥了巨大的作用。 前面提到的AI新药研发公司Insilico Medicine,2020年2月6日,在官网上公开了其AI算法设计的6种可以阻止病毒复制的新分子结构。 3 | 关键词: 4 | 药物, 人工智能, 技术, 筛选, 靶点, 发现, 病毒, 数据, 利用, 临床 -------------------------------------------------------------------------------- /src/save/keyinfo_from_url.txt: -------------------------------------------------------------------------------- 1 | 摘要: 2 | 民航局于今日下发的通知,旨在响应1月25日中共中央办公厅、国务院办公厅下发的《关于做好人民群众就地过年服务保障工作的通知》,进一步扩大了免费退改政策的适用范围,落实春节假期非必要不流动的号召。 此前,民航局1月2日下发了《关于切实做好疫情常态化防控形势下客票退改服务工作的通知》,针对国内出现局部聚集性疫情的地区,要求各航空公司及时制定发布客票免费退改方案,并加强信息告知和宣传。 具体退改规则为:乘机日期在1月28日至2月3日的旅客,自1月27日0时起至航班起飞前可提出退票或改期申请。 3 | 关键词: 4 | 通知, 退改, 疫情, 民航局, 下发, 退票, 改期, 流动, 具体, 免费 -------------------------------------------------------------------------------- /src/save/keyinfo_input_file.txt: -------------------------------------------------------------------------------- 1 | D:\PythonJupyterNootebook\My NLP projects\My projects\NLPVisualizationSystem\save\text.txt -------------------------------------------------------------------------------- /src/save/keyinfo_input_text.txt: -------------------------------------------------------------------------------- 1 | 原标题:AI技术在新冠肺炎药物发现中的应用 背景 2020年2月26日,《麻省理工学院技术评论》发布的2020年“全球十大突破性技术”中,人工智能筛选分子入选。那么什么是筛选分子呢? 
伴随着新型冠状病毒肺炎(COVID-19)的爆发,相信大家对这个概念并不陌生。最早提到这个概念的消息如下:2020年1月25日,中国科学院上海药物研究所和上海科技大学联合研究团队综合利用虚拟筛选和酶学测试相结合的策略,发现了一批可能对新型肺炎有治疗作用的老药和中药。其中包括后来人人都耳熟能详的洛匹那韦和瑞德西韦。 分子筛选是药物开发流程(图1)中的一个步骤:是指从大量化合物中选择对某一特定靶点有活性化合物的过程。由图1可见,新药研发过程复杂漫长,面对突然爆发的新型冠状病毒肺炎,全新药物设计很不现实。因此利用人工智能技术从已有药物中发现对新型冠状病毒(SARS-CoV-2)有抑制作用的药物显得非常迫切。 人工智能与新药之路 人工智能(Artificial Intelligent, AI)用于药物发现是基于计算机辅助药物设计(Computer-aided Drug Design,CADD),然后结合化学信息、生物信息中的大量数据建立优质的机器学习模型,在靶点筛选、分子结构/化学空间分析、配体-受体相互作用模拟、药物三维定量构效关系(3D-QSAR)分析等过程中指导先导化合物的发现和优化。 另外,在药物临床阶段及批准上市后也有人工智能的使用。比如诺华利用人工智能从多个内部数据源抓取临床数据,用于预测和监控临床试验的患者招募、花费和质量。诺华宣称该技术的应用,使得患者招募时间缩短了10-15%。 图2是人工智能医药公司Insilico Medicine在药物研发各环节中人工智能技术的应用:包括在靶点发现阶段的deep feature selection,NLP;新小分子先导化合物生成中的 deep learning;以及对小分子临床结果的预测等。 图2 Insilico Medicine 的AI药物研发之路 新型冠状病毒肺炎药物中的人工智能 由图1可知,新药研发过程复杂漫长,面对突然爆发的疫情,人工智能技术的应用变得非常重要。自疫情爆发以来,已公开了大量关于利用人工智能和大数据发现有效药物的研究报导。这些研究主要集中在靶点发现、疾病网络构建和药物筛选。 2020年1月29日,燧坤智能应急小组利用人工智能文本挖掘技术,完成了对13139个已有药物分子,2000余万篇文献和1960万摘要的挖掘,输出了数十个已报道对SARS、MERS等冠状病毒有抑制效果的药物化合物。 2月3日,华中科技大学同济医学院等医院和研究所与华为云联合科研团队宣布,筛选出五种可能对2019新型冠状病毒(2019-nCoV)有效的抗病毒药物。分别是Beclabuvir,沙奎那韦(Saquinavir),比特拉韦(Bictegravir),洛匹那韦(Lopinavir),多替拉韦(Dolutegravir)。联合科研团队针对SARS-CoV-2的多个靶标蛋白对8506种上市或者正在进行临床试验的药物中进行超大规模计算机辅助药物筛选工作,并在一周内取得了第一阶段成果。 某合资医药公司首先通过序列相似性对比找到同源性较大的病毒序列,以此为关键词在公共平台中寻找已发表文献或相关靶点数据,利用数据挖掘进行实体识别和关系抽取;同时通过传统的数据库检索,查找相关靶点;然后将两部分的结果做加权,最后输出高置信靶点结果86个。然后构建病毒特异性网络,挖掘高置信度信号通路24条及病毒作用的核心模块:T细胞受体途径、内吞作用、趋化因子途径、C型凝集受体途径、JAK-STAT途径。最后,对超过8000种已知药物与病毒网络结合起来进行药物筛选、过滤,共得到78个对SARS-CoV-2有抑制作用的药物,包括氯喹、阿巴卡韦、穿心莲内酯等。 2月14日,广东省钟南山医学基金会、广州呼吸健康研究院与阿里云达成合作,加速推进新冠病毒的临床救治关键技术、有效药物和疫苗研发等工作。阿里云将提供超大规模计算力、AI算法等技术,支持钟南山团队的科研人员加快开展对新冠病毒的新药研发、病毒基因测序、蛋白筛选等相关工作。 此外,除了靶点发现、疾病网络构建和药物筛选,也有利用人工智能技术生成新的小分子。前面提到的AI新药研发公司Insilico Medicine,2020年2月6日,在官网上公开了其AI算法设计的6种可以阻止病毒复制的新分子结构。 总结 在中国此次抗击疫情过程中,人工智能和大数据发挥了巨大的作用。也使更多的企业和科研院所进一步认识到了人工智能技术对药物发现不可或缺的作用。 事实上,从本次疫情可以看到中国在算法和硬件方面已经达到世界先进水平,全球健康药物研发中心GHDDI正与阿里云合作开发人工智能药物研发和大数据平台:针对SARS/MERS等冠状病毒的历史药物研发进行数据挖掘与集成,开放相关临床前和临床数据资源,计算靶点和药物分子性质,并跟进新型冠状病毒最新科研动态。 或许,本次疫情会改变国内药企对人工智能的态度,翻开人工智能在中国助力药物研发的新篇章。返回搜狐,查看更多 责任编辑: -------------------------------------------------------------------------------- /src/save/keyinfo_input_url.txt: -------------------------------------------------------------------------------- 1 | https://baijiahao.baidu.com/s?id=1689928103313263522&wfr=spider&for=pc -------------------------------------------------------------------------------- /src/save/new_word_discovery_input_file.txt: -------------------------------------------------------------------------------- 1 | 自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此,这一领域的研究将涉及自然语言,即人们日常使用的语言,所以它与语言学的研究有着密切的联系,但又有重要的区别。自然语言处理并不是一般地研究自然语言,而在于研制能有效地实现自然语言通信的计算机系统,特别是其中的软件系统。因而它是计算机科学的一部分。 2 | , 10 3 | , 5 -------------------------------------------------------------------------------- /src/save/new_word_discovery_output.txt: -------------------------------------------------------------------------------- 1 | 1 语言 2 | 2 自然语言 3 | 3 计算机 4 | 4 研究 5 | 5 科学 6 | 6 领域 7 | 7 计算机科学 8 | 8 重要 9 | 9 实现 10 | 10 系统 -------------------------------------------------------------------------------- /src/save/review_summary.txt: -------------------------------------------------------------------------------- 1 | business_id: 1 2 | business_name: Castor EDC 3 | business_rating: 0.6000312926277281 4 | average_user_rating: {'rating_overall': 4.72093023255814, 'rating_ease_of_use': 4.651162790697675, 'rating_customer_support': 4.953488372093024, 'rating_features_functionality': 4.5813953488372094, 
'rating_value_for_money': 4.790697674418603, 'rating_likelihood_to_recommend': 4.697674418604652} 5 | aspect_summary: 6 | ------------------research------------------ 7 | rating: 0.47914341517857145 8 | pos: research we selected Castor EDC because it seemed easy; research easier 9 | neg: research studies; research data managemen; research forms ; research with substantial; Overall: We used it to collect data for medical research in several centers 10 | ------------------software------------------ 11 | rating: 0.7059326171875 12 | pos: software is easy; software is very intuitive; software is really intuitive; software keeps being developed 13 | neg: software with those characteristics so that collaborators won't have problems; Comments: We have just started using the software. We found it very easy to set up on our own and use it. There are some possibilities we would like to see added, but it might be we just missed them. ; software have occure; Pros: The software is very practical and userfriendly.; software for research 14 | ------------------database------------------ 15 | rating: 0.6578369140625 16 | pos: database is easy; database builder to create your own; Clear and user friendly database with many features 17 | neg: databases to new; database in a very structured; database that will also create a user-friendly; database to hav 18 | ------------------system------------------ 19 | rating: 0.734222412109375 20 | pos: Very user friendly EDC system; system is easy; system can perform 21 | neg: system nxt to the other; system that is affordable; Love this system! 22 | ------------------study------------------ 23 | rating: 0.4230211046006944 24 | pos: None 25 | neg: study on Castor; study visit ; study fell on me; study ID numbering is a bit clunky; study very easily without any complex 26 | -------------------------------------------------------------------------------- /src/save/testtext.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/save/testtext.txt -------------------------------------------------------------------------------- /src/save/text.txt: -------------------------------------------------------------------------------- 1 | 人工智能技术在防疫抗疫工作中大显身手 发布时间:2020-02-25 来源:人工智能实验室 近期,新型冠状病毒肺炎(简称“新冠肺炎”)的疫情突如其来,让人们有些措手不及。但是为了实现更好的防疫抗疫效果,不少研究人员纷纷应用诸多技术手段来抗击疫情。其中人工智能技术已成为这场防疫抗疫攻坚战的有力武器之一;它在疫情防控、图像分析、辅助诊断、疫苗研发、新药研制等方面助力防疫抗疫工作。 在疫情防控方面 新冠肺炎来势汹汹,但是它依然可防可控。采取有效的措施预防,戴口罩、勤洗手、居家隔离等都是非常行之有效的方法。例如戴口罩是预防传染病最重要、最有效的防控手段之一,可以有效降低感染新冠肺炎的风险。又如体温筛检是此次疫情中筛查排查可疑病例的一个手段。人工智能技术在疫情防控的各个应用场景中都可发挥重要作用,这些应用场景都能直接为患者或者潜在的患者人群带来切实好处。 北京旷视科技有限公司最近推出一套用于发热及潜在被感染对象识别、筛查与分析的人工智能新系统“明骥”。该系统通过前端红外相机,鉴别人流中的高温人员,再根据疑似发烧者的人体、人脸信息,利用人工智能技术辅助工作人员快速定位体温异常者;做到了在佩戴口罩的情况下,也能精准锁定。目前,“明骥”已应用在地铁、火车站、机尝集中办公区等人流量较大的区域。 在图像分析方面 医疗影像数据是医疗数据的重要组成部分,人工智能技术能够通过快速准确地标记新冠肺炎的特定异常结构来提高图像分析的效率,以供放射科医生参考。提高图像分析效率,可让放射科医生腾出更多的时间聚焦在需要更多解读或判断的内容审阅上,从而有望缓解他们供给缺口问题。另外,这还可避免放射科医生以及临床医生被别人感染,降低他们的安全风险。 上海人工智能研究院与杭州健培科技有限公司联合研发的新冠肺炎影像云检测平台最近正式上线,对全国医院进行免费影像云诊断服务,并对所有医疗机构和各级政府免费开放,将高效、准确地为放射科医生以及临床医生提供决策依据,助力疫情防控。新冠肺炎影像云检测平台上线后,能够为临床一线抗疫医生疫情评估、肺炎性质判定、治疗方案制定提供高效精确的支撑依据。 在辅助诊断方面 医疗诊断是一个综合考虑各种影响因素的判断过程;利用人工智能技术辅助诊断新冠肺炎,能够在短时间内精准地预判病情,对提高患者预后具有重要作用。人工智能技术辅助诊断的功能既可以精确分割CT扫描部位的病灶;还可以对病灶的CT影像做分析,找出疑似病变和组织结构的异常,并给出诊断方向。在质控及病变识别方面,具有更为宽泛的使用范围。 在CT影像快速诊断方面,北京推想科技与武汉同济医院、深圳市第三人民医院合作研发针对新冠肺炎特别版,该版利用人工智能技术的深度学习、图像识别等对检出的病灶进行测量、密度分析,支持患者前后片对照,提供量化数据对比结果,帮助医生更快完成疑似患者诊断。北京安德医智联合解放军总医院正在研发新冠肺炎CT影像人工智能辅助诊断系统,免费提供给全国各级医院使用。 
在疫苗研发方面 随着疫情持续,很多民众非常关心新冠肺炎的疫苗研发进展。据介绍,无论是对病毒进行基因测序,找到病毒来源以及传播宿主,还是研发病毒疫苗,人工智能技术都大有用武之地。例如传统的疫苗研发需在实验室中对数百种药物成分进行生物测试,这一过程往往要耗费不少时间;而人工智能技术可以极大加速这个过程,能够让更多的人获得疫苗的保护。 浙江大学研究团队最近利用人工智能技术在已有的药物中找到两种抗击疫情药物,从而使疫苗的研发工作取得了阶段性的成果。这两种药物有可能成为新冠肺炎候选疫苗,目前正在进行临床试验。据了解,将人工智能技术用于筛选和研发疫苗,能够帮助研究人员在已有的药物中快速找到可能对预防新冠肺炎有效的生物制品。 在新药研制方面 新冠肺炎的临床表现以发热﹑乏力﹑干咳为主要表现;而随着疾病的进展会出现急性呼吸窘迫综合征、难以纠正的代谢性酸中毒等,需要给予积极有效的治疗。但是目前还没有明确的特效药能够治疗新冠肺炎,只能根据患者的一般情况进行对症治疗,预防继发的感染,及时进行器官的功能支持。不过研究人员正在利用人工智能技术研制针对该病的特效药,新药很快就会问世。 美国麻省理工学院研究团队近日利用人工智能技术发现一种新型抗生素,它可以杀灭多种致病细菌,包括一些对所有已知抗生素都具耐药性的细菌菌株。研究人员通过让机器学习算法在几天内充分筛查庞大数据库中逾1亿种化合物,终于发现了这种抗生素;该抗生素被认为能有效抑制大肠杆菌,对治疗新冠肺炎也有效。 由上可知,人工智能技术正在新冠肺炎的防疫抗疫工作中大显身手。可以预料,作为一种综合性极强的技术,人工智能将在医疗健康领域内得到越来越多的应用,并将成为影响医学行业发展的重要科技手段。正如我国著名学者周海中教授曾经指出的那样:“随着社会的发展和科技的进步,人工智能技术将在医疗健康领域大显身手;其成果会不断涌现,应用前景令人期待。” 2 | -------------------------------------------------------------------------------- /src/save/topic_input_file.txt: -------------------------------------------------------------------------------- 1 | D:\Github\NLPVisualizationSystem\src\save\save_article.txt -------------------------------------------------------------------------------- /src/save/topic_keywords_dist.txt: -------------------------------------------------------------------------------- 1 | Number of topics 2 2 | Topic 0 病变 药物 病毒 公司 疫苗 患者 技术 诊断 制药 人工智能 平安 氯喹 西韦 瑞德 辅助 一种 新药 医药 科技 治疗 感染 新型 成本 细胞 影像 3 | Topic 1 药物 平安 公司 病变 诊断 患者 制药 这种 瑞德 一种 疫苗 新药 西韦 人工智能 技术 郭佑民 辅助 能够 病毒 试验 找到 肺部 风湿性关节炎 抑制 诊断系统 4 | Document 1 [(1, 0.9984122)] 5 | Document 2 [(1, 0.99875927)] 6 | Document 3 [(1, 0.998737)] 7 | Document 4 [(1, 0.998796)] 8 | Document 5 [(1, 0.99632406)] 9 | Document 6 [(1, 0.991446)] 10 | Document 7 [(0, 0.03731014), (1, 0.9626899)] 11 | Document 8 [(0, 0.1481864), (1, 0.8518136)] 12 | Document 9 [(0, 0.2686373), (1, 0.7313627)] 13 | Document 10 [(0, 0.18562442), (1, 0.81437564)] 14 | Document 11 [(0, 0.18398936), (1, 0.8160106)] 15 | Document 12 [(0, 0.32994798), (1, 0.67005205)] 16 | Document 13 [(0, 0.052803226), (1, 0.9471968)] 17 | Document 14 [(1, 0.99649817)] 18 | Document 15 [(1, 0.9962901)] 19 | Document 16 [(0, 0.016117718), (1, 0.98388225)] 20 | Document 17 [(1, 0.9963944)] 21 | Document 18 [(1, 0.99775714)] 22 | Document 19 [(0, 0.33902404), (1, 0.660976)] 23 | Document 20 [(1, 0.99305475)] 24 | Document 21 [(1, 0.99877584)] 25 | Document 22 [(0, 0.0373046), (1, 0.9626954)] 26 | Document 23 [(1, 0.99497014)] 27 | Document 24 [(1, 0.99875236)] 28 | Document 25 [(1, 0.99881166)] 29 | Document 26 [(1, 0.99876165)] 30 | Document 27 [(0, 0.27710757), (1, 0.72289246)] 31 | Document 28 [(0, 0.973271), (1, 0.026728965)] 32 | Document 29 [(0, 0.9737816), (1, 0.02621842)] 33 | Document 30 [(0, 0.9732813), (1, 0.02671867)] 34 | Document 31 [(1, 0.99455696)] 35 | Document 32 [(0, 0.43477303), (1, 0.565227)] 36 | Document 33 [(0, 0.09642063), (1, 0.90357935)] 37 | Document 34 [(0, 0.013210406), (1, 0.9867896)] 38 | Document 35 [(0, 0.99591917)] 39 | Document 36 [(0, 0.6590466), (1, 0.3409534)] 40 | Document 37 [(1, 0.99653476)] 41 | Document 38 [(0, 0.8884582), (1, 0.11154182)] 42 | Document 39 [(1, 0.9984031)] 43 | Document 40 [(1, 0.99875927)] 44 | Document 41 [(1, 0.9987372)] 45 | Document 42 [(1, 0.99879587)] 46 | Document 43 [(1, 0.9963238)] 47 | Document 44 [(1, 0.9919473)] 48 | Document 45 [(0, 0.048582092), (1, 0.9514179)] 49 | Document 46 [(0, 0.18444301), (1, 0.815557)] 50 | Document 47 [(0, 0.29741976), (1, 0.7025802)] 51 | Document 48 [(0, 0.14375831), (1, 0.8562417)] 52 | Document 49 [(0, 0.21951777), (1, 0.78048223)] 53 | Document 50 [(0, 
0.20557976), (1, 0.79442024)] 54 | Document 51 [(0, 0.04226604), (1, 0.957734)] 55 | Document 52 [(1, 0.9964978)] 56 | Document 53 [(1, 0.99628973)] 57 | Document 54 [(0, 0.01421434), (1, 0.98578566)] 58 | Document 55 [(1, 0.9964239)] 59 | Document 56 [(1, 0.99775696)] 60 | Document 57 [(0, 0.25308257), (1, 0.7469175)] 61 | Document 58 [(1, 0.9926701)] 62 | Document 59 [(1, 0.9987756)] 63 | Document 60 [(0, 0.047722455), (1, 0.9522776)] 64 | Document 61 [(1, 0.99478036)] 65 | Document 62 [(1, 0.99875253)] 66 | Document 63 [(1, 0.99881274)] 67 | Document 64 [(1, 0.998761)] 68 | Document 65 [(0, 0.25111613), (1, 0.7488839)] 69 | Document 66 [(0, 0.95668525), (1, 0.043314744)] 70 | Document 67 [(0, 0.97325945), (1, 0.026740566)] 71 | Document 68 [(1, 0.9944634)] 72 | Document 69 [(0, 0.4053822), (1, 0.5946178)] 73 | Document 70 [(0, 0.113431476), (1, 0.88656855)] 74 | Document 71 [(0, 0.6606846), (1, 0.33931538)] 75 | Document 72 [(0, 0.013217282), (1, 0.98678267)] 76 | Document 73 [(0, 0.99607855)] 77 | Document 74 [(0, 0.6522202), (1, 0.34777978)] 78 | Document 75 [(1, 0.99647295)] 79 | Document 76 [(0, 0.91441846), (1, 0.08558151)] 80 | Document 77 [(0, 0.96741784), (1, 0.032582123)] 81 | Document 78 [(0, 0.1464981), (1, 0.8535019)] 82 | Document 79 [(1, 0.99072385)] 83 | Document 80 [(0, 0.24954157), (1, 0.7504584)] 84 | Document 81 [(0, 0.7606445), (1, 0.23935558)] 85 | Document 82 [(1, 0.9988935)] 86 | Document 83 [(1, 0.99625957)] 87 | Document 84 [(0, 0.39040664), (1, 0.60959333)] 88 | Document 85 [(1, 0.9976985)] 89 | Document 86 [(1, 0.9961334)] 90 | Document 87 [(0, 0.17439792), (1, 0.82560205)] 91 | Document 88 [(0, 0.037328243), (1, 0.9626718)] 92 | Document 89 [(0, 0.37466863), (1, 0.62533134)] 93 | Document 90 [(0, 0.104917355), (1, 0.89508265)] 94 | Document 91 [(1, 0.9970127)] 95 | Document 92 [(1, 0.9965256)] 96 | Document 93 [(0, 0.06389082), (1, 0.9361092)] 97 | Document 94 [(0, 0.12760386), (1, 0.8723961)] 98 | Document 95 [(0, 0.8994774), (1, 0.10052254)] 99 | Document 96 [(1, 0.99544394)] 100 | Document 97 [(0, 0.3909931), (1, 0.60900694)] 101 | Document 98 [(0, 0.22720024), (1, 0.77279973)] 102 | Document 99 [(0, 0.168411), (1, 0.831589)] 103 | Document 100 [(0, 0.9206198), (1, 0.07938017)] 104 | Document 101 [(0, 0.109019205), (1, 0.89098084)] 105 | Document 102 [(0, 0.80498064), (1, 0.19501936)] 106 | Document 103 [(0, 0.01335223), (1, 0.9866478)] 107 | Document 104 [(1, 0.9921854)] 108 | Document 105 [(1, 0.9976654)] 109 | Document 106 [(0, 0.35281095), (1, 0.6471891)] 110 | Document 107 [(0, 0.015055568), (1, 0.9849444)] 111 | Document 108 [(0, 0.19415398), (1, 0.805846)] 112 | Document 109 [(1, 0.99857014)] 113 | Document 110 [(0, 0.687761), (1, 0.312239)] 114 | Document 111 [(0, 0.0134905195), (1, 0.9865095)] 115 | Document 112 [(0, 0.9023951), (1, 0.09760485)] 116 | Document 113 [(0, 0.92524976), (1, 0.074750245)] 117 | Document 114 [(1, 0.9969824)] 118 | Document 115 [(0, 0.07226383), (1, 0.9277361)] 119 | Document 116 [(1, 0.9922549)] 120 | Document 117 [(0, 0.49086607), (1, 0.5091339)] 121 | Document 118 [(0, 0.047327124), (1, 0.9526729)] 122 | Document 119 [(0, 0.9909708)] 123 | Document 120 [(0, 0.8767328), (1, 0.123267174)] 124 | Document 121 [(0, 0.015171723), (1, 0.9848283)] 125 | Document 122 [(1, 0.9901243)] 126 | Document 123 [(1, 0.99853384)] 127 | Document 124 [(0, 0.9940778)] 128 | -------------------------------------------------------------------------------- /src/save/user_input_product_id_name.txt: 
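
topic_keywords_dist.txt above is the saved output of the LDA topic-clustering step (the repo's topic_cluster_lda.py, whose source is not reproduced in this listing): the topic count, the top keywords per topic, and one [(topic_id, probability)] list per document, which is the shape gensim's LdaModel produces. The sketch below is only an illustration, under assumptions, of how output in this shape is typically generated with gensim; the three toy documents and all parameter values are invented and are not the repo's actual script.

```python
# Illustrative gensim sketch producing output in the same shape as topic_keywords_dist.txt.
# `tokenized_docs` is a placeholder for the segmented documents (e.g. produced with jieba).
from gensim import corpora, models

tokenized_docs = [["药物", "筛选", "靶点"], ["疫苗", "病毒", "临床"], ["人工智能", "技术", "诊断"]]
dictionary = corpora.Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=2, random_state=1)
for topic_id in range(2):
    print("Topic", topic_id, [word for word, _ in lda.show_topic(topic_id, topn=5)])
for i, bow in enumerate(corpus, start=1):
    print("Document", i, lda.get_document_topics(bow))   # [(topic_id, probability)] per document
```
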
-------------------------------------------------------------------------------- 1 | Castor EDC -------------------------------------------------------------------------------- /src/save/userfile.txt: -------------------------------------------------------------------------------- 1 | D:\PythonJupyterNootebook\My NLP projects\My projects\NLPVisualizationSystem\save\text.txt -------------------------------------------------------------------------------- /src/save/usertext.txt: -------------------------------------------------------------------------------- 1 | 人工智能技术在防疫抗疫工作中大显身手 发布时间:2020-02-25 来源:人工智能实验室 近期,新型冠状病毒肺炎(简称“新冠肺炎”)的疫情突如其来,让人们有些措手不及。但是为了实现更好的防疫抗疫效果,不少研究人员纷纷应用诸多技术手段来抗击疫情。其中人工智能技术已成为这场防疫抗疫攻坚战的有力武器之一;它在疫情防控、图像分析、辅助诊断、疫苗研发、新药研制等方面助力防疫抗疫工作。 在疫情防控方面 新冠肺炎来势汹汹,但是它依然可防可控。采取有效的措施预防,戴口罩、勤洗手、居家隔离等都是非常行之有效的方法。例如戴口罩是预防传染病最重要、最有效的防控手段之一,可以有效降低感染新冠肺炎的风险。又如体温筛检是此次疫情中筛查排查可疑病例的一个手段。人工智能技术在疫情防控的各个应用场景中都可发挥重要作用,这些应用场景都能直接为患者或者潜在的患者人群带来切实好处。 北京旷视科技有限公司最近推出一套用于发热及潜在被感染对象识别、筛查与分析的人工智能新系统“明骥”。该系统通过前端红外相机,鉴别人流中的高温人员,再根据疑似发烧者的人体、人脸信息,利用人工智能技术辅助工作人员快速定位体温异常者;做到了在佩戴口罩的情况下,也能精准锁定。目前,“明骥”已应用在地铁、火车站、机尝集中办公区等人流量较大的区域。 在图像分析方面 医疗影像数据是医疗数据的重要组成部分,人工智能技术能够通过快速准确地标记新冠肺炎的特定异常结构来提高图像分析的效率,以供放射科医生参考。提高图像分析效率,可让放射科医生腾出更多的时间聚焦在需要更多解读或判断的内容审阅上,从而有望缓解他们供给缺口问题。另外,这还可避免放射科医生以及临床医生被别人感染,降低他们的安全风险。 上海人工智能研究院与杭州健培科技有限公司联合研发的新冠肺炎影像云检测平台最近正式上线,对全国医院进行免费影像云诊断服务,并对所有医疗机构和各级政府免费开放,将高效、准确地为放射科医生以及临床医生提供决策依据,助力疫情防控。新冠肺炎影像云检测平台上线后,能够为临床一线抗疫医生疫情评估、肺炎性质判定、治疗方案制定提供高效精确的支撑依据。 在辅助诊断方面 医疗诊断是一个综合考虑各种影响因素的判断过程;利用人工智能技术辅助诊断新冠肺炎,能够在短时间内精准地预判病情,对提高患者预后具有重要作用。人工智能技术辅助诊断的功能既可以精确分割CT扫描部位的病灶;还可以对病灶的CT影像做分析,找出疑似病变和组织结构的异常,并给出诊断方向。在质控及病变识别方面,具有更为宽泛的使用范围。 在CT影像快速诊断方面,北京推想科技与武汉同济医院、深圳市第三人民医院合作研发针对新冠肺炎特别版,该版利用人工智能技术的深度学习、图像识别等对检出的病灶进行测量、密度分析,支持患者前后片对照,提供量化数据对比结果,帮助医生更快完成疑似患者诊断。北京安德医智联合解放军总医院正在研发新冠肺炎CT影像人工智能辅助诊断系统,免费提供给全国各级医院使用。 在疫苗研发方面 随着疫情持续,很多民众非常关心新冠肺炎的疫苗研发进展。据介绍,无论是对病毒进行基因测序,找到病毒来源以及传播宿主,还是研发病毒疫苗,人工智能技术都大有用武之地。例如传统的疫苗研发需在实验室中对数百种药物成分进行生物测试,这一过程往往要耗费不少时间;而人工智能技术可以极大加速这个过程,能够让更多的人获得疫苗的保护。 浙江大学研究团队最近利用人工智能技术在已有的药物中找到两种抗击疫情药物,从而使疫苗的研发工作取得了阶段性的成果。这两种药物有可能成为新冠肺炎候选疫苗,目前正在进行临床试验。据了解,将人工智能技术用于筛选和研发疫苗,能够帮助研究人员在已有的药物中快速找到可能对预防新冠肺炎有效的生物制品。 在新药研制方面 新冠肺炎的临床表现以发热﹑乏力﹑干咳为主要表现;而随着疾病的进展会出现急性呼吸窘迫综合征、难以纠正的代谢性酸中毒等,需要给予积极有效的治疗。但是目前还没有明确的特效药能够治疗新冠肺炎,只能根据患者的一般情况进行对症治疗,预防继发的感染,及时进行器官的功能支持。不过研究人员正在利用人工智能技术研制针对该病的特效药,新药很快就会问世。 美国麻省理工学院研究团队近日利用人工智能技术发现一种新型抗生素,它可以杀灭多种致病细菌,包括一些对所有已知抗生素都具耐药性的细菌菌株。研究人员通过让机器学习算法在几天内充分筛查庞大数据库中逾1亿种化合物,终于发现了这种抗生素;该抗生素被认为能有效抑制大肠杆菌,对治疗新冠肺炎也有效。 由上可知,人工智能技术正在新冠肺炎的防疫抗疫工作中大显身手。可以预料,作为一种综合性极强的技术,人工智能将在医疗健康领域内得到越来越多的应用,并将成为影响医学行业发展的重要科技手段。正如我国著名学者周海中教授曾经指出的那样:“随着社会的发展和科技的进步,人工智能技术将在医疗健康领域大显身手;其成果会不断涌现,应用前景令人期待。” 2 | -------------------------------------------------------------------------------- /src/save/userurl.txt: -------------------------------------------------------------------------------- 1 | https://baijiahao.baidu.com/s?id=1689928103313263522&wfr=spider&for=pc -------------------------------------------------------------------------------- /src/save/wordcloud_from_input_file.txt: -------------------------------------------------------------------------------- 1 | D:\PythonJupyterNootebook\My NLP projects\My projects\NLPVisualizationSystem\image\wordcloud_40304.png -------------------------------------------------------------------------------- /src/save/wordcloud_from_input_text.txt: -------------------------------------------------------------------------------- 1 | D:\PythonJupyterNootebook\My NLP projects\My projects\NLPVisualizationSystem\image\wordcloud_30399.png -------------------------------------------------------------------------------- 
/src/save/wordcloud_from_url.txt: -------------------------------------------------------------------------------- 1 | D:\Github\NLPVisualizationSystem\src\image\wordcloud_62068.png -------------------------------------------------------------------------------- /src/static.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/static.zip -------------------------------------------------------------------------------- /src/templates.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/templates.zip -------------------------------------------------------------------------------- /src/utils/data_prepare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Oct 30 10:33:25 2020 4 | 5 | @author: Xu 6 | """ 7 | # data prepare 8 | import pandas as pd 9 | import re 10 | from src import config 11 | import os 12 | import numpy as np 13 | from collections import Counter 14 | 15 | 16 | 17 | ## part 2: DATA 2 (product) 18 | df = pd.read_excel(config.product_data_path_2) 19 | 20 | # add column: product id 21 | df['product_id'] = df.index 22 | 23 | # remove extra spaces 24 | df['deployment'] = df['deployment'].astype(str).map(lambda x: re.sub(r'\s+','',x)) 25 | df['deployment'] = df['deployment'].map(lambda x: np.nan if x=='nan' else x) 26 | 27 | 28 | df['training'] = df['training'].astype(str).map(lambda x: re.sub(r'\s+','',x)) 29 | df['training'] = df['training'].map(lambda x: np.nan if x=='nan' else x) 30 | 31 | df['support'] = df['support'].astype(str).map(lambda x: re.sub(r'\s+','',x)) 32 | df['support'] = df['support'].map(lambda x: np.nan if x=='nan' else x) 33 | 34 | 35 | # add new feature:price 36 | # handle unclearly defined fields 37 | import locale 38 | locale.setlocale(locale.LC_ALL,'English_US') 39 | 40 | df['starting_price_method'] = df['starting_price'].astype(str).map(lambda x: re.split(r'/',x,maxsplit=1)[1] if len(re.split(r'/',x,maxsplit=1))>1 else 'Not provided by vendor') 41 | df['starting_price_num'] = df['starting_price'].astype(str).map(lambda x: locale.atof(re.sub(r'\$', '',re.split(r'/',x,maxsplit=1)[0])) if re.match(r'^\$\S+',re.split(r'/',x,maxsplit=1)[0])!=None else float('inf')) 42 | 43 | # missing data process:'Not provided by vendor' 44 | 45 | import random 46 | 47 | def get_num_by_prob(num_list, prob_list): 48 | x = random.uniform(0, 1) 49 | cum_pro = 0.0 50 | for num, pro in zip(num_list, prob_list): 51 | cum_pro += pro 52 | if x < cum_pro: 53 | return num 54 | 55 | def get_num_by_prob_range(num_range_list, prob_list): 56 | x = random.uniform(0, 1) 57 | cum_pro = 0.0 58 | for num_range, pro in zip(num_range_list, prob_list): 59 | cum_pro += pro 60 | if x < cum_pro: 61 | num = np.random.uniform(num_range[0],num_range[1]) 62 | return num 63 | 64 | def get_uniform_random_num(low, high): 65 | num = np.random.uniform(low, high) 66 | return round(num,2) 67 | 68 | def fill_missing_data(dist, dtype='str'): 69 | """ 70 | It is used to generate random number or string based on 71 | your given distribution. 
Here specifically refers to the frequency distribution,like: 72 | - dist = {'a':0.1,'b':0.2,'c':0.7}, dtype='str': return a random string,like 'c' 73 | - dist = {[0,10]: 0.8,[11,100]:0.2}, dtype='int':return a random integer, like 7 74 | - dist = {[0,10]: 0.8,[11,100]:0.2}, dtype='float':return a random decimal, like 20.5 75 | 76 | Parameters 77 | ---------- 78 | dist: TYPE-dictionary 79 | DESCRIPTION: specifically refers tofrequency distribution. 80 | The keys of the dictionary represent all possible random values, 81 | and the values represent the probability of obtaining each key. 82 | i.e. dist = {'a':0.1,'b':0.2,'c':0.7}, dist = {'a':1,'b':2,'c':7} 83 | If the value (frequency) is not a decimal, it is automatically converted to a decimal. 84 | 85 | dtype: TYPE-str 86 | DESCRIPTION: the data type of random value, default is 'str', options='str','int','float'. 87 | 88 | Returns 89 | ------- 90 | result: Type-depends on 'dtype' 91 | DESCRIPTION: a random value 92 | 93 | """ 94 | num_list = list(dist.keys()) 95 | prob_list = list(dist.values()) 96 | if sum(prob_list)>1: 97 | prob_list = [sum(prob_list)-p for p in prob_list] 98 | prob_list = [p/sum(prob_list) for p in prob_list] 99 | 100 | if dtype=='str': 101 | return get_num_by_prob(num_list, prob_list) 102 | elif dtype=='int': 103 | return int(get_num_by_prob(num_list, prob_list)) 104 | else: 105 | return round(get_num_by_prob(num_list, prob_list),2) 106 | 107 | # missing data process:'Not provided by vendor' 108 | # fill the starting price method 109 | starting_price_method_dic = dict(Counter(df['starting_price_method'])) 110 | del starting_price_method_dic['Not provided by vendor'] 111 | df['starting_price_method_fill'] = df['starting_price_method'].map(lambda x: fill_missing_data(starting_price_method_dic, dtype='str') if x=='Not provided by vendor' else x) 112 | 113 | # prices interval 114 | prices_range = {} 115 | for m in starting_price_method_dic.keys(): 116 | price_list = list(df.loc[(df['starting_price_method']==m),'starting_price_num'].values) 117 | if len(price_list)>1: 118 | prange = [min(price_list), max(price_list)] 119 | else: 120 | prange = [price_list[0]*0.05, price_list[0]*1.5] 121 | prices_range[m] = prange 122 | 123 | # fill the starting price num 124 | import copy 125 | 126 | df['starting_price_num_fill'] = copy.copy(df['starting_price_num']) 127 | for i in df.index: 128 | k = df.loc[i, 'starting_price_method_fill'] 129 | if df.loc[i, 'starting_price_num_fill'] == float('inf'): 130 | df.loc[i, 'starting_price_num_fill'] = get_uniform_random_num(prices_range[k][0], prices_range[k][1]) 131 | 132 | # save new data 133 | df.to_csv(config.product_data_path, index=False) 134 | 135 | 136 | # --------------------------------------------------------------------------------- 137 | 138 | ## part 1: DATA 1 (user) 139 | data = pd.read_excel(config.user_data_path_1) 140 | ## missing data process 141 | data['rating_overall'] = data['rating_overall'].map(lambda x: float(re.sub('/5','',x))) 142 | data['rating_likelihood_to_recommend'] = data['rating_likelihood_to_recommend'].fillna(method='pad') 143 | data['rating_likelihood_to_recommend'] = data['rating_likelihood_to_recommend'].map(lambda x: float(x.split('/')[0])/2) 144 | 145 | for i in ['rating_ease_of_use','rating_customer_support','rating_value_for_money','rating_features_functionality']: 146 | data[i] = data[i].fillna(data[i].mode()[0]) 147 | data[i] = data[i].map(lambda x: float(x)) 148 | 149 | 150 | ## check data features 151 | # remove wrong field 152 | p = 
re.compile(r'(\d+\-\d+ \w+)') 153 | data['user_industry']= data['user_industry'].astype(str).map(lambda x: p.sub('nan',x)) 154 | data['user_industry'] = data['user_industry'].map(lambda x: np.nan if x=='nan' else x) 155 | 156 | # remove wrong field 157 | #删除这列中错误的字段'Wellness and Fitness' 158 | # data.index[data['user_company_size']=='Wellness and Fitness'].tolist() 159 | data['user_company_size'].where(cond=data['user_company_size']!='Wellness and Fitness',other=np.nan,inplace=True) 160 | 161 | # remove wrong field 162 | data['user_job_title'] = data['user_job_title'].astype(str).map(lambda x: 'nan' if re.match(r'\W+',x)!=None else x) 163 | data['user_job_title'] = data['user_job_title'].map(lambda x: np.nan if x=='nan' else x) 164 | 165 | # add product id 166 | data = pd.merge(left=data, right=df[['product_id','product_name']], how='left', on='product_name') 167 | 168 | # save new data 169 | data.to_csv(config.user_data_path, index=False) 170 | 171 | 172 | 173 | 174 | # --------------------------------------------------------------------------------- 175 | # review data 176 | def get_review_data(): 177 | """ 178 | It is used to read data and build a specific data frame, includes 179 | review data items and product data items. 180 | 181 | Return 182 | ------- 183 | review_data_item: type-dataframe, columns: review_id, product_name, user_name, text, 184 | like ['product_1', 'user_name_1', 4.5, '....'], .... 185 | product_data_item: type-dataframe, columns: product_id, product_name, review_count, rate 186 | like ['product_1', 12], .... 187 | 188 | """ 189 | 190 | df = pd.read_csv(config.user_data_path) 191 | 192 | # review data 193 | review_data_item = df[['product_name', 'user_name', 'review_title']] 194 | review_data_item = review_data_item.rename(columns={'review_title':'review'}).dropna(subset=['review']) 195 | for col in ['software_pros', 'software_cons', 'software_overall_experience', 196 | 'software_comments', 'software_recommendations']: 197 | df1 = df[['product_name', 'user_name', col]] 198 | df1 = df1.rename(columns={col:'review'}).dropna(subset=['review']) 199 | review_data_item = pd.concat([review_data_item, df1]) 200 | 201 | # product data 202 | product_basic = review_data_item.groupby('product_name').count()['user_name'].reset_index(name="review_count") 203 | product_rate = df.groupby('product_name').mean()[['rating_overall', 'rating_ease_of_use', 204 | 'rating_customer_support', 'rating_features_functionality', 205 | 'rating_value_for_money', 'rating_likelihood_to_recommend']].reset_index() 206 | product_data_item = pd.merge(left=product_basic,right=product_rate,on='product_name') 207 | 208 | # save review data 209 | ReviewData = pd.merge(left = review_data_item, right = product_data_item, how='inner', on='product_name') 210 | ReviewData = pd.merge(left = ReviewData, right = df[['product_name','product_id']].drop_duplicates(), how='left', on='product_name') 211 | ReviewData.to_csv(config.review_data_path, index_label='review_id') 212 | 213 | # save product data 214 | df2 = pd.read_csv(config.product_data_path) 215 | ProductData = pd.merge(left = product_basic, right = df2[['product_name','product_id']].drop_duplicates(), how='outer', on='product_name') 216 | ProductData = ProductData.fillna(0) 217 | ProductData.to_csv(config.business_data_path, index=False) 218 | 219 | get_review_data() 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | --------------------------------------------------------------------------------
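
data_prepare.py fills the "Not provided by vendor" gaps in the product data in two ways: categorical fields are imputed by sampling from the frequency distribution of the observed values (fill_missing_data together with get_num_by_prob), and missing prices are drawn uniformly from the price range observed for the corresponding pricing method (get_uniform_random_num). The sketch below is a self-contained restatement of that strategy; the frequency values and the price range are illustrative, and sample_from_distribution is a local stand-in for get_num_by_prob, not a function from the repo.

```python
# Self-contained sketch of the imputation strategy used in data_prepare.py:
# categorical gaps are filled by sampling from the observed frequency
# distribution, numeric gaps by a uniform draw from the observed range.
import random
import numpy as np

def sample_from_distribution(dist):
    """Inverse-CDF sampling over a {value: probability} dict (mirrors get_num_by_prob)."""
    x = random.uniform(0, 1)
    cumulative = 0.0
    for value, prob in dist.items():
        cumulative += prob
        if x < cumulative:
            return value
    return value   # safety net: the probabilities may not sum to exactly 1.0

random.seed(0)
np.random.seed(0)

method_dist = {"/month": 0.6, "/year": 0.3, "/one-time": 0.1}   # illustrative frequencies
print(sample_from_distribution(method_dist))                    # fills a missing pricing method
print(round(float(np.random.uniform(10.0, 250.0)), 2))          # fills a missing starting price
```

Seeding random and numpy.random, as above, makes the imputation reproducible across runs; the original script leaves both generators unseeded, so each run produces a slightly different filled dataset.
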
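
The starting_price field itself arrives as a single string such as "$1,200.00/per year" (an illustrative value) or "Not provided by vendor". data_prepare.py splits it on the first "/", keeps the remainder as the pricing method, strips the "$" and parses the amount with locale.atof so that thousands separators survive, and uses float('inf') as the sentinel for unparsable prices. A small sketch of that parsing follows, with split_price as an illustrative helper name; the script itself does this inline with two map calls.

```python
# Sketch of the starting_price normalisation in data_prepare.py: split the raw
# string on the first '/', strip the '$', and parse the number with locale.atof
# so thousands separators ("$1,200.00") are handled.
import locale
import re

try:
    locale.setlocale(locale.LC_ALL, 'English_US')    # Windows locale name used in data_prepare.py
except locale.Error:
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')   # common POSIX equivalent

def split_price(raw):
    parts = re.split(r'/', raw, maxsplit=1)
    method = parts[1] if len(parts) > 1 else 'Not provided by vendor'
    head = parts[0]
    amount = locale.atof(re.sub(r'\$', '', head)) if re.match(r'^\$\S+', head) else float('inf')
    return amount, method

print(split_price('$1,200.00/per year'))        # (1200.0, 'per year')
print(split_price('Not provided by vendor'))    # (inf, 'Not provided by vendor')
```
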