├── README.md
├── __pycache__
    ├── baidu_search.cpython-37.pyc
    ├── config.cpython-36.pyc
    ├── config.cpython-37.pyc
    ├── create_wordcloud.cpython-37.pyc
    ├── exe_01.cpython-36.pyc
    └── exe_01.cpython-37.pyc
├── app.py
├── frontend
    └── link.txt
├── image
    ├── Matrix Admin 运行演示.png
    ├── 导航菜单.png
    ├── 文本关键信息提取1.png
    ├── 文本关键信息提取2.png
    ├── 文本分类.gif
    ├── 文本生成.png
    ├── 新词挖掘.png
    ├── 用户画像分析.png
    ├── 用户评价情感分析.gif
    ├── 竞品分析.gif
    ├── 自动生成词云.gif
    ├── 项目文件目录结构.png
    └── 首页.png
└── src
    ├── __pycache__
        ├── __init__.cpython-36.pyc
        ├── config.cpython-36.pyc
        ├── config.cpython-37.pyc
        ├── exe_01.cpython-36.pyc
        ├── exe_01.cpython-37.pyc
        ├── exe_02.cpython-36.pyc
        ├── exe_02.cpython-37.pyc
        ├── exe_03.cpython-36.pyc
        ├── exe_05.cpython-36.pyc
        ├── exe_06.cpython-36.pyc
        └── exe_06.cpython-37.pyc
    ├── background
        ├── china.jpg
        ├── oval.png
        └── profile.png
    ├── config.py
    ├── data
        ├── ProductData.csv
        ├── README.md
        ├── UserReviewData.csv
        ├── data_out_proucts_details.xls
        ├── glove.6B.100d.txt
        ├── idf.txt
        ├── new_proucts_details.csv
        ├── new_proucts_details.xls
        ├── new_users_comments.csv
        ├── new_users_comments.xls
        ├── save_article.txt
        ├── simhei.ttf
        ├── stop_words_ch.txt
        ├── stopwords.txt
        ├── text.txt
        ├── token_vector.bin
        └── users_comments.xls
    ├── exe
        ├── __init__.py
        ├── __pycache__
        │   ├── __init__.cpython-36.pyc
        │   ├── __init__.cpython-37.pyc
        │   ├── exe_01.cpython-36.pyc
        │   ├── exe_01.cpython-37.pyc
        │   ├── exe_02.cpython-36.pyc
        │   ├── exe_02.cpython-37.pyc
        │   ├── exe_03.cpython-36.pyc
        │   ├── exe_03.cpython-37.pyc
        │   ├── exe_05.cpython-36.pyc
        │   └── exe_06.cpython-36.pyc
        ├── exe_01.py
        ├── exe_02.py
        ├── exe_03.py
        ├── exe_05.py
        ├── exe_06.py
        ├── key_info_extraction
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── __init__.cpython-36.pyc
        │   │   ├── __init__.cpython-37.pyc
        │   │   ├── abstract_textrank.cpython-36.pyc
        │   │   ├── abstract_textrank.cpython-37.pyc
        │   │   ├── compute_keywords_tfidf.cpython-36.pyc
        │   │   ├── create_wordcloud.cpython-36.pyc
        │   │   ├── keywords_textrank.cpython-36.pyc
        │   │   ├── keywords_textrank.cpython-37.pyc
        │   │   ├── sentence_similarity.cpython-36.pyc
        │   │   ├── sentence_similarity.cpython-37.pyc
        │   │   ├── textrank.cpython-36.pyc
        │   │   ├── textrank.cpython-37.pyc
        │   │   └── topic_cluster_lda.cpython-36.pyc
        │   ├── abstract_textrank.py
        │   ├── compute_keywords_tfidf.py
        │   ├── create_wordcloud.py
        │   ├── keywords_textrank.py
        │   ├── sentence_similarity.py
        │   ├── textrank.py
        │   └── topic_cluster_lda.py
        ├── review_sentiment
        │   ├── business.py
        │   ├── main.py
        │   ├── model_training.py
        │   ├── sentence.py
        │   └── stopwords.txt
        ├── sentiment_analysis
        │   ├── __init__.py
        │   ├── __pycache__
        │   │   ├── bert_embedding_extend.cpython-36.pyc
        │   │   ├── bert_embedding_extend.cpython-37.pyc
        │   │   ├── embedding_manager_cyd.cpython-36.pyc
        │   │   ├── embedding_manager_cyd.cpython-37.pyc
        │   │   ├── glove_embedding.cpython-36.pyc
        │   │   ├── glove_embedding.cpython-37.pyc
        │   │   ├── review_sentiment_analysis.cpython-36.pyc
        │   │   ├── review_sentiment_analysis.cpython-37.pyc
        │   │   ├── sentiment_model.cpython-36.pyc
        │   │   ├── sentiment_model.cpython-37.pyc
        │   │   ├── utils.cpython-36.pyc
        │   │   └── utils.cpython-37.pyc
        │   ├── bert_embedding_extend.py
        │   ├── embedding_manager_cyd.py
        │   ├── glove_embedding.py
        │   ├── review_sentiment_analysis.py
        │   ├── sentiment_model.py
        │   └── utils.py
        └── worddiscovery
        │   ├── __init__.py
        │   ├── __pycache__
        │       ├── __init__.cpython-36.pyc
        │       ├── __init__.cpython-37.pyc
        │       ├── entropy_based.cpython-36.pyc
        │       ├── trie.cpython-36.pyc
        │       └── trie.cpython-37.pyc
        │   ├── entropy_based.py
        │   ├── test.txt
        │   └── trie.py
    ├── image
        └── wordcloud_62068.png
    ├── model
        ├── __pycache__
        │   ├── __init__.cpython-36.pyc
        │   ├── abstract_textrank.cpython-36.pyc
        │   ├── bert_embedding_extend.cpython-36.pyc
        │   ├── compute_keywords_tfidf.cpython-36.pyc
        │   ├── config.cpython-36.pyc
        │   ├── create_wordcloud.cpython-36.pyc
        │   ├── embedding_manager_cyd.cpython-36.pyc
        │   ├── glove_embedding.cpython-36.pyc
        │   ├── keywords_textrank.cpython-36.pyc
        │   ├── review_sentiment_analysis.cpython-36.pyc
        │   ├── sentence_similarity.cpython-36.pyc
        │   ├── sentiment_model.cpython-36.pyc
        │   ├── textrank.cpython-36.pyc
        │   ├── topic_cluster_lda.cpython-36.pyc
        │   └── utils.cpython-36.pyc
        ├── logistic_reg_clf_model.pkl
        ├── svm_clf.pkl
        └── svm_clf_model.pkl
    ├── save
        ├── keyinfo_from_input_file.txt
        ├── keyinfo_from_input_text.txt
        ├── keyinfo_from_url.txt
        ├── keyinfo_input_file.txt
        ├── keyinfo_input_text.txt
        ├── keyinfo_input_url.txt
        ├── new_word_discovery_input_file.txt
        ├── new_word_discovery_output.txt
        ├── review_summary.txt
        ├── save_article.txt
        ├── test_article.txt
        ├── testtext.txt
        ├── text.txt
        ├── topic_input_file.txt
        ├── topic_keywords_dist.txt
        ├── user_input_product_id_name.txt
        ├── userfile.txt
        ├── usertext.txt
        ├── userurl.txt
        ├── wordcloud_from_input_file.txt
        ├── wordcloud_from_input_text.txt
        └── wordcloud_from_url.txt
    ├── static.zip
    ├── templates.zip
    └── utils
        └── data_prepare.py


/README.md:
--------------------------------------------------------------------------------
  1 | # NLP可视化系统搭建
  2 | 
  3 | 本项目将Capterra上爬取的产品和用户数据的可视化分析，文本关键信息的挖掘，用户的情感分析等集成到Web里面。在Web框架Flask中使用可视化工具pyecharts来动态展示可视化数据。
  4 | 
  5 | ## 1. 环境配置
  6 | 
  7 | 语言：Python3
  8 | 
  9 | 编辑器：Anaconda(Spyder)
 10 | 
 11 | Web框架：Flask
 12 | 
 13 | 数据可视化：Pyecharts
 14 | 
 15 | ## 2. 源码结构
 16 | 
 17 | 本项目开发过程采用前后端分离的模式，完整的源码结构如下图所示：
 18 | 
 19 | ![项目文件目录结构](D:\Github\NLPVisualizationSystem\image\项目文件目录结构.png)
 20 | 
 21 | 在项目NLPVisualizationSystem下，主要的文件及文件夹包括：app.py、src（exe、save、utils、data、image、background、static 和  templates）、frontend。
 22 | 
 23 | - app.py 是运行整个项目并发布到线上，
 24 | - exe是业务逻辑处理模块文件目录，
 25 | - data 是数据库操作脚本文件、爬取到的数据目录，
 26 | - image是项目涉及的图片文件目录，
 27 | - background是项目里词云图的默认背景文件目录，
 28 | - static 是资源文件目录，
 29 | - save是保存前端页面的输入数据的目录，
 30 | - templates 是模板文件目录，
 31 | - model是项目模块训练的模型，
 32 | - utils是帮助数据预处理，
 33 | - frontend是项目用到的前端主题样式模板。
 34 | 
 35 | ## 3. 安装运行
 36 | 
 37 | 运行前需要安装的python包：
 38 | 
 39 | | Package           | 用途说明                         | 版本   |
 40 | | ----------------- | -------------------------------- | ------ |
 41 | | pandas            | 结构化数据分析                   | 0.24.2 |
 42 | | numpy             | 科学计算                         | 1.19.4 |
 43 | | Flask             | web框架整合                      | 1.0.2  |
 44 | | pyecharts         | 数据可视化                       | 1.8.1  |
 45 | | werkzeug          | web应用工具包                    | 0.14.1 |
 46 | | collections       | python标准库，数据结构常用模块   | 内置   |
 47 | | newspaper3k       | python爬虫框架，适合抓取新闻网页 | 0.2.8  |
 48 | | imageio           | 图像处理包                       | 内置   |
 49 | | snapshot_selenium | 渲染图片                         | 0.0.2  |
 50 | | jieba             | 结巴分词                         | 0.39   |
 51 | | re                | 正则化表达                       | 内置   |
 52 | | random            | 用于生成伪随机数                 | 内置   |
 53 | | gensim            | NLP工具包                        | 3.7.3  |
 54 | | bert_embedding    | 提供BERT预训练模型及词表示       | 1.0.1  |
 55 | | nltk              | NLP工具包                        | 3.4    |
 56 | | mxnet             | MXNet深度学习框架                | 1.4.0  |
 57 | | sklearn           | 机器学习工具包                   | 0.19.2 |
 58 | |                   |                                  |        |
 59 | 
 60 | 运行：
 61 | 
 62 | 1. 先解压src文件夹内的static和templates文件夹，
 63 | 2. 找到src\data\glove.6B.100d.txt文件根据里面的链接下载并保存真正的glove.6B.100d.txt替换旧的。
 64 | 3. 找到app.py，可以直接`python app.py`运行。
 65 | 
 66 | 运行后在本地服务器启动一个 Flask 应用程序。在浏览器中输入地址：  http://127.0.0.1:5000/ 可以看到NLP可视化系统的页面，截图如下所示：
 67 | 
 68 | ![首页](.\image\首页.png)
 69 | 
 70 | ## 4. 前端页面设计
 71 | 
 72 | ### 4.1 主题模板选择
 73 | 
 74 | 本项目中，选择 Bootstrap 的主题样式模板：[Matrix Admin](https://www.matrixadmin.wrappixel.com/) 开源免费版本。Matrix Admin 分为开源版本和商业版本，开源版本的下载地址为：[http://matrixadmin.wrappixel.com/matrix-admin-package-full.zip](http://matrixadmin.wrappixel.com/matrix-admin-package-full.zip)。下载后得到matrix-admin-package-full.zip，依次解压得到matrix-admin-bt4文件。
 75 | 
 76 | 解压顺序：
 77 | 
 78 | | 顺序 | 压缩包                        | 解压后                                                       |
 79 | | ---- | ----------------------------- | ------------------------------------------------------------ |
 80 | | 1    | matrix-admin-package-full.zip | matrix-admin-package-full ( matrix-admin-package.zip, matriz-admin-old.zip ) |
 81 | | 2    | matrix-admin-package.zip      | matrix-admin-package ( matrix-admin-bt4.zip, matriz-admin-old.zip ) |
 82 | | 3    | matrix-admin-bt4.zip          | matrix-admin-bt4 ( assets, dist,  html )                     |
 83 | 
 84 | Matrix Admin 的文件目录，共分为 3 个文件夹：asserts、dist 和 html。
 85 | 
 86 | - asserts 是第三方资源依赖文件目录，
 87 | - dist 存储的是页面资源文件，
 88 | - html 存储的是示例程序。
 89 | 
 90 | ### 4.2 导航菜单设计
 91 | 
 92 | ![导航菜单](.\image\导航菜单.png)
 93 | 
 94 | 导航栏目前分了7块内容，有DashBoard、竞品分析、自动生成词云图、文本关键信息提取、文本情感分析、用户评价分析、用户画像。
 95 | 
 96 | - DashBoard：罗列一些实时数据指标。
 97 | - 竞品分析：对市场里的EDC产品做竞品分析， 分别从产品定价、安装、功能、培训等几方面进行数据分析与可视化的展示。
 98 | - 自动生成词云图：用户可以通过3种方式生成词云图，一是根据用户输入的网址来采集文本生成词云；二是根据用户输入的文本内容来生成词云；三是根据用户上传的文本文件来生成词云图。
 99 | - 文本关键信息提取：单文本分析：基于TextRank的算法的单文本摘要提取与关键词抽取。多文本分析：基于LDA的多文档主题分布探索。
100 | - 文本分类：主要是文本分类项目，如图书分类，情感分类等。
101 | - 用户评价情感分析：抓取用户对产品的评价数据，从评价文本中提取产品相关的aspects，归纳用户的正面和负面评价。
102 | - 用户画像：采集使用EDC产品的用户数据，从用户所在行业、公司规模、职业等方面进行数据分析与可视化的展示。
103 | 
104 | 
105 | 
106 | 
107 | 
108 | ## 5. 后台应用设计
109 | 
110 | ### 5.1 服务接口设计
111 | 
112 | 服务接口设计包括**页面请求接口和数据请求接**口:
113 | 
114 | - **页面请求接口**是浏览器对应的该页面的访问地址，
115 | - **数据请求接口**对应的是图表对象的数据请求地址。
116 | 
117 | ### 5.2 异常请求设计
118 | 
119 | 针对常见错误和异常，设计异常提示程序。在templates文件夹，存放设计好的异常页面模板：error-403.html，error-404.html，error-405.html，error-500.html。
120 | 
121 | 
122 | 
123 | 
124 | 
125 | ## 6. 运行演示
126 | 
127 | ### 6.0 DashBoard
128 | 
129 | ![首页](D:\Github\NLPVisualizationSystem\image\首页.png)
130 | 
131 | ### 6.1 竞品分析
132 | 
133 | ![竞品分析](D:\Github\NLPVisualizationSystem\image\竞品分析.gif)
134 | 
135 | ### 6.2 文本预处理
136 | 
137 | #### 6.2.1 自动生成词云图
138 | 
139 | ![自动生成词云](.\image\自动生成词云.gif)
140 | 
141 | #### 6.2.2 单文档分析——关键词提取
142 | ![1](.\image\文本关键信息提取1.png)
143 | 
144 | #### 6.2.3 多文档分析——主题分析
145 | 
146 | ![2](.\image\文本关键信息提取2.png)
147 | 
148 | #### 6.2.4 新词挖掘
149 | 
150 | ![新词挖掘](D:\Github\NLPVisualizationSystem\image\新词挖掘.png)
151 | 
152 | #### 6.2.5 文本数据增强
153 | 
154 | 
155 | 
156 | ### 6.3 文本分类
157 | 
158 | ![文本分类](D:\Github\NLPVisualizationSystem\image\文本分类.gif)
159 | 
160 | 
161 | 
162 | ### 6.4 文本生成
163 | 
164 | ![文本生成](D:\Github\NLPVisualizationSystem\image\文本生成.png)
165 | 
166 | ### 6.5 用户分析
167 | 
168 | #### 6.5.1 用户画像
169 | 
170 | ![用户画像分析](.\image\用户画像分析.png)
171 | 
172 | #### 6.5.2 用户评价情感分析
173 | 
174 | ![用户评价情感分析](.\image\用户评价情感分析.gif)
175 | 
176 | 
177 | 
178 | 


--------------------------------------------------------------------------------
/__pycache__/baidu_search.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/baidu_search.cpython-37.pyc


--------------------------------------------------------------------------------
/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/config.cpython-36.pyc


--------------------------------------------------------------------------------
/__pycache__/config.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/config.cpython-37.pyc


--------------------------------------------------------------------------------
/__pycache__/create_wordcloud.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/create_wordcloud.cpython-37.pyc


--------------------------------------------------------------------------------
/__pycache__/exe_01.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/exe_01.cpython-36.pyc


--------------------------------------------------------------------------------
/__pycache__/exe_01.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/__pycache__/exe_01.cpython-37.pyc


--------------------------------------------------------------------------------
/frontend/link.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/frontend/link.txt


--------------------------------------------------------------------------------
/image/Matrix Admin 运行演示.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/Matrix Admin 运行演示.png


--------------------------------------------------------------------------------
/image/导航菜单.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/导航菜单.png


--------------------------------------------------------------------------------
/image/文本关键信息提取1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/文本关键信息提取1.png


--------------------------------------------------------------------------------
/image/文本关键信息提取2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/文本关键信息提取2.png


--------------------------------------------------------------------------------
/image/文本分类.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/文本分类.gif


--------------------------------------------------------------------------------
/image/文本生成.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/文本生成.png


--------------------------------------------------------------------------------
/image/新词挖掘.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/新词挖掘.png


--------------------------------------------------------------------------------
/image/用户画像分析.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/用户画像分析.png


--------------------------------------------------------------------------------
/image/用户评价情感分析.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/用户评价情感分析.gif


--------------------------------------------------------------------------------
/image/竞品分析.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/竞品分析.gif


--------------------------------------------------------------------------------
/image/自动生成词云.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/自动生成词云.gif


--------------------------------------------------------------------------------
/image/项目文件目录结构.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/项目文件目录结构.png


--------------------------------------------------------------------------------
/image/首页.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/image/首页.png


--------------------------------------------------------------------------------
/src/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/src/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/config.cpython-36.pyc


--------------------------------------------------------------------------------
/src/__pycache__/config.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/config.cpython-37.pyc


--------------------------------------------------------------------------------
/src/__pycache__/exe_01.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_01.cpython-36.pyc


--------------------------------------------------------------------------------
/src/__pycache__/exe_01.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_01.cpython-37.pyc


--------------------------------------------------------------------------------
/src/__pycache__/exe_02.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_02.cpython-36.pyc


--------------------------------------------------------------------------------
/src/__pycache__/exe_02.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_02.cpython-37.pyc


--------------------------------------------------------------------------------
/src/__pycache__/exe_03.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_03.cpython-36.pyc


--------------------------------------------------------------------------------
/src/__pycache__/exe_05.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_05.cpython-36.pyc


--------------------------------------------------------------------------------
/src/__pycache__/exe_06.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_06.cpython-36.pyc


--------------------------------------------------------------------------------
/src/__pycache__/exe_06.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/__pycache__/exe_06.cpython-37.pyc


--------------------------------------------------------------------------------
/src/background/china.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/background/china.jpg


--------------------------------------------------------------------------------
/src/background/oval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/background/oval.png


--------------------------------------------------------------------------------
/src/background/profile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/background/profile.png


--------------------------------------------------------------------------------
/src/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Thu Oct 29 15:58:11 2020
  4 | 
  5 | @author: Xu
  6 | """
  7 | import os
  8 | import sys
  9 | 
 10 | curPath = os.path.abspath(os.path.dirname(__file__))
 11 | rootPath = os.path.split(curPath)[0]
 12 | sys.path.append(curPath)
 13 | sys.path.append(rootPath)
 14 | 
 15 | # main directory
 16 | 
 17 | # curPath = r'D:\Github\NLPVisualizationSystem'  
 18 | data_dir =  os.path.join(curPath, 'data')   
 19 | image_dir =  os.path.join(curPath, 'image')
 20 | template_dir = os.path.join(curPath, 'templates')
 21 | background_dir = os.path.join(curPath, "background")
 22 | static_dir = os.path.join(curPath, "static")
 23 | run_dir = os.path.join(curPath, "run")
 24 | html_dir = os.path.join(curPath,"html")
 25 | save_dir = os.path.join(curPath, "save")
 26 | model_dir = os.path.join(curPath,'model')
 27 | 
 28 |    
 29 | 
 30 | # data path
 31 | user_data_path = os.path.join(data_dir, 'new_users_comments.csv')
 32 | product_data_path = os.path.join(data_dir, "new_proucts_details.csv")
 33 | user_data_path_1 = os.path.join(data_dir, 'users_comments.xls')
 34 | product_data_path_2 = os.path.join(data_dir, "data_out_proucts_details.xls")
 35 | #stopwords_path = os.path.join(data_dir, "stopwords.txt")
 36 | bg_pic = os.path.join(background_dir, "oval.png")
 37 | 
 38 | # WordCloud Generation
 39 | # save user input data
 40 | wc_input_url_path = os.path.join(save_dir, 'userurl.txt')
 41 | wc_input_text_path = os.path.join(save_dir, 'usertext.txt')
 42 | wc_input_file_save_path = os.path.join(save_dir, 'userfile.txt')
 43 | # save wordcloud picture
 44 | pic_wc_input_url_save_path = os.path.join(save_dir, 'wordcloud_from_url.txt')
 45 | pic_wc_input_text_save_path = os.path.join(save_dir, 'wordcloud_from_input_text.txt')
 46 | pic_wc_input_file_save_path = os.path.join(save_dir, 'wordcloud_from_input_file.txt')
 47 | 
 48 | 
 49 | 
 50 | # Text Extraction: configuration 
 51 | # KeyInfo Extraction
 52 | # data path
 53 | idf_path = os.path.join(data_dir, 'idf.txt')
 54 | token_vector_path = os.path.join(data_dir, 'token_vector.bin')
 55 | # save user input data
 56 | keyinfo_input_url_path = os.path.join(save_dir, 'keyinfo_input_url.txt')
 57 | keyinfo_input_text_path = os.path.join(save_dir, 'keyinfo_input_text.txt')
 58 | keyinfo_input_file_save_path = os.path.join(save_dir, 'keyinfo_input_file.txt')
 59 | # download text extraction result 
 60 | download_keyinfo_input_url_save_path = os.path.join(save_dir, 'keyinfo_from_url.txt')
 61 | download_keyinfo_input_text_save_path = os.path.join(save_dir, 'keyinfo_from_input_text.txt')
 62 | download_keyinfo_input_file_save_path = os.path.join(save_dir, 'keyinfo_from_input_file.txt')
 63 | 
 64 | # Topic CLuster
 65 | # data path
 66 | StopWords_path = os.path.join(data_dir, "stop_words_ch.txt")
 67 | # save user input data
 68 | topic_input_file_save_path = os.path.join(save_dir, 'topic_input_file.txt')
 69 | # download topic keywords result
 70 | download_topic_input_file_save_path = os.path.join(save_dir, 'topic_keywords_dist.txt')
 71 | 
 72 | # New Word Discovery
 73 | # save user input data
 74 | new_word_discovery_file_save_path = os.path.join(save_dir, 'new_word_discovery_input_file.txt')
 75 | # download new words result
 76 | download_new_word_output_file_save_path = os.path.join(save_dir, 'new_word_discovery_output.txt')
 77 | 
 78 | 
 79 | 
 80 | # Review Sentiment Analysis
 81 | # data path
 82 | en_stopwords_path = os.path.join(data_dir, 'stopwords.txt')
 83 | review_data_path = os.path.join(data_dir, 'UserReviewData.csv')
 84 | business_data_path = os.path.join(data_dir, 'ProductData.csv')
 85 | glove_embedding_path = os.path.join(data_dir, 'glove.6B.100d.txt')
 86 | 
 87 | # model path 
 88 | svm_model_save_path = os.path.join(model_dir, 'svm_clf_model.pkl')
 89 | lr_model_save_path = os.path.join(model_dir, 'logistic_reg_clf_model.pkl')
 90 | 
 91 | # review summary save path
 92 | user_input_id_name_path = os.path.join(save_dir, 'user_input_product_id_name.txt')
 93 | review_summary_save_path = os.path.join(save_dir, 'review_summary.txt')
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/src/data/ProductData.csv:
--------------------------------------------------------------------------------
  1 | product_name,review_count,product_id
  2 | ABBYY FlexiCapture,25.0,8
  3 | Castor EDC,132.0,1
  4 | Clinical Studio,41.0,3
  5 | Data+ Platform,29.0,4
  6 | DataFax,8.0,18
  7 | DataLabs,4.0,25
  8 | Debut,5.0,21
  9 | DocuPhase,146.0,5
 10 | Ephesoft Transact,67.0,6
 11 | Flex Databases,6.0,16
 12 | Formation,10.0,12
 13 | Grooper,12.0,15
 14 | Kofax Kapow,8.0,22
 15 | MPS IntelliVector,4.0,24
 16 | MainEDC,6.0,23
 17 | OnlineCRF,5.0,2
 18 | Poimapper,18.0,9
 19 | SMART-TRIAL,23.0,11
 20 | Square 9 GlobalCapture,43.0,7
 21 | Ultimate Forms,7.0,20
 22 | VKS,14.0,14
 23 | Viedoc,50.0,0
 24 | iCapture,26.0,10
 25 | iMedNet EDC,8.0,19
 26 | naturalFORMS,11.0,17
 27 | Aetiol,0.0,13
 28 | BizEfficiency,0.0,26
 29 | Clinion EDC & CDM,0.0,27
 30 | Clipboard To Contact - Turn Text Into Contacts,0.0,28
 31 | CTMS,0.0,29
 32 | Datacap Taskmaster Capture,0.0,30
 33 | Dharma,0.0,31
 34 | eCaseLink,0.0,32
 35 | ExamineYou,0.0,33
 36 | AcuFill,0.0,34
 37 | LabChart Pro,0.0,35
 38 | OnCore,0.0,36
 39 | QureClinical,0.0,37
 40 | CareRecord,0.0,38
 41 | Timaeus,0.0,39
 42 | AcqKnowledge,0.0,40
 43 | ActiView software,0.0,41
 44 | agCapture,0.0,42
 45 | ALPHADAS,0.0,43
 46 | Appliance,0.0,44
 47 | AQ2 Remittance,0.0,45
 48 | Ascerteon,0.0,46
 49 | BioClinica Express,0.0,47
 50 | BSCAN Capture,0.0,48
 51 | Captricity,0.0,49
 52 | Clear Clinica,0.0,50
 53 | clincase,0.0,51
 54 | ClinicalAnalytics,0.0,52
 55 | DADOS,0.0,53
 56 | Data Scan,0.0,54
 57 | Data-Scan,0.0,55
 58 | DDi-mEDC,0.0,56
 59 | DealMatrix,0.0,57
 60 | Digitalis Clinical Data Collection,0.0,58
 61 | Docsumo,0.0,59
 62 | EDC Made Easy,0.0,60
 63 | Entrypoint i4,0.0,61
 64 | eplansoft REVIEW,0.0,62
 65 | eResearch,0.0,63
 66 | FileStore EDM,0.0,64
 67 | FormFoundry,0.0,65
 68 | Fusion eClinical Suite,0.0,66
 69 | Gather,0.0,67
 70 | GoResearch,0.0,68
 71 | i-CDMS,0.0,69
 72 | idtPlans Review,0.0,70
 73 | Improve,0.0,71
 74 | INKWRX,0.0,72
 75 | Intelligent Data Capture,0.0,73
 76 | iQapture,0.0,74
 77 | ixtract,0.0,75
 78 | KnowledgeLake Capture,0.0,76
 79 | Magpi,0.0,77
 80 | MailThis.to,0.0,78
 81 | MATRIX EDC/IWRS,0.0,79
 82 | Med-Quest,0.0,80
 83 | MedSciNet Builder,0.0,81
 84 | MetricWire,0.0,82
 85 | MyoResearch XP Master,0.0,83
 86 | Net Station,0.0,84
 87 | OCR for AnyDoc,0.0,85
 88 | PaperLess,0.0,86
 89 | PaperSurvey,0.0,87
 90 | PhysioQ,0.0,88
 91 | Protocol First,0.0,89
 92 | Returnable Forms,0.0,90
 93 | Scout,0.0,91
 94 | secuTrial,0.0,92
 95 | SymmetricDS,0.0,93
 96 | Teamscope,0.0,94
 97 | The Data Center,0.0,95
 98 | Thread Learning,0.0,96
 99 | TWin PSG,0.0,97
100 | VISION EDC/CTMS,0.0,98
101 | 


--------------------------------------------------------------------------------
/src/data/README.md:
--------------------------------------------------------------------------------
1 | 把数据解压到data文件夹里
2 | 
3 | 下载地址
4 | https://pan.baidu.com/s/1hSFBjQHLhYDw9jPBDNH6pw&shfl=sharepset
5 | 
6 | 将glove.6B.100d加到data文件夹下。


--------------------------------------------------------------------------------
/src/data/data_out_proucts_details.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/data_out_proucts_details.xls


--------------------------------------------------------------------------------
/src/data/glove.6B.100d.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/glove.6B.100d.txt


--------------------------------------------------------------------------------
/src/data/new_proucts_details.xls:
--------------------------------------------------------------------------------
  1 | product_name,review_count,product_id
  2 | ABBYY FlexiCapture,25.0,8
  3 | Castor EDC,132.0,1
  4 | Clinical Studio,41.0,3
  5 | Data+ Platform,29.0,4
  6 | DataFax,8.0,18
  7 | DataLabs,4.0,25
  8 | Debut,5.0,21
  9 | DocuPhase,146.0,5
 10 | Ephesoft Transact,67.0,6
 11 | Flex Databases,6.0,16
 12 | Formation,10.0,12
 13 | Grooper,12.0,15
 14 | Kofax Kapow,8.0,22
 15 | MPS IntelliVector,4.0,24
 16 | MainEDC,6.0,23
 17 | OnlineCRF,5.0,2
 18 | Poimapper,18.0,9
 19 | SMART-TRIAL,23.0,11
 20 | Square 9 GlobalCapture,43.0,7
 21 | Ultimate Forms,7.0,20
 22 | VKS,14.0,14
 23 | Viedoc,50.0,0
 24 | iCapture,26.0,10
 25 | iMedNet EDC,8.0,19
 26 | naturalFORMS,11.0,17
 27 | Aetiol,0.0,13
 28 | BizEfficiency,0.0,26
 29 | Clinion EDC & CDM,0.0,27
 30 | Clipboard To Contact - Turn Text Into Contacts,0.0,28
 31 | CTMS,0.0,29
 32 | Datacap Taskmaster Capture,0.0,30
 33 | Dharma,0.0,31
 34 | eCaseLink,0.0,32
 35 | ExamineYou,0.0,33
 36 | AcuFill,0.0,34
 37 | LabChart Pro,0.0,35
 38 | OnCore,0.0,36
 39 | QureClinical,0.0,37
 40 | CareRecord,0.0,38
 41 | Timaeus,0.0,39
 42 | AcqKnowledge,0.0,40
 43 | ActiView software,0.0,41
 44 | agCapture,0.0,42
 45 | ALPHADAS,0.0,43
 46 | Appliance,0.0,44
 47 | AQ2 Remittance,0.0,45
 48 | Ascerteon,0.0,46
 49 | BioClinica Express,0.0,47
 50 | BSCAN Capture,0.0,48
 51 | Captricity,0.0,49
 52 | Clear Clinica,0.0,50
 53 | clincase,0.0,51
 54 | ClinicalAnalytics,0.0,52
 55 | DADOS,0.0,53
 56 | Data Scan,0.0,54
 57 | Data-Scan,0.0,55
 58 | DDi-mEDC,0.0,56
 59 | DealMatrix,0.0,57
 60 | Digitalis Clinical Data Collection,0.0,58
 61 | Docsumo,0.0,59
 62 | EDC Made Easy,0.0,60
 63 | Entrypoint i4,0.0,61
 64 | eplansoft REVIEW,0.0,62
 65 | eResearch,0.0,63
 66 | FileStore EDM,0.0,64
 67 | FormFoundry,0.0,65
 68 | Fusion eClinical Suite,0.0,66
 69 | Gather,0.0,67
 70 | GoResearch,0.0,68
 71 | i-CDMS,0.0,69
 72 | idtPlans Review,0.0,70
 73 | Improve,0.0,71
 74 | INKWRX,0.0,72
 75 | Intelligent Data Capture,0.0,73
 76 | iQapture,0.0,74
 77 | ixtract,0.0,75
 78 | KnowledgeLake Capture,0.0,76
 79 | Magpi,0.0,77
 80 | MailThis.to,0.0,78
 81 | MATRIX EDC/IWRS,0.0,79
 82 | Med-Quest,0.0,80
 83 | MedSciNet Builder,0.0,81
 84 | MetricWire,0.0,82
 85 | MyoResearch XP Master,0.0,83
 86 | Net Station,0.0,84
 87 | OCR for AnyDoc,0.0,85
 88 | PaperLess,0.0,86
 89 | PaperSurvey,0.0,87
 90 | PhysioQ,0.0,88
 91 | Protocol First,0.0,89
 92 | Returnable Forms,0.0,90
 93 | Scout,0.0,91
 94 | secuTrial,0.0,92
 95 | SymmetricDS,0.0,93
 96 | Teamscope,0.0,94
 97 | The Data Center,0.0,95
 98 | Thread Learning,0.0,96
 99 | TWin PSG,0.0,97
100 | VISION EDC/CTMS,0.0,98
101 | 


--------------------------------------------------------------------------------
/src/data/new_users_comments.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/new_users_comments.xls


--------------------------------------------------------------------------------
/src/data/simhei.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/simhei.ttf


--------------------------------------------------------------------------------
/src/data/stop_words_ch.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/stop_words_ch.txt


--------------------------------------------------------------------------------
/src/data/stopwords.txt:
--------------------------------------------------------------------------------
  1 | 'd  
  2 | 'll  
  3 | 'm  
  4 | 're  
  5 | 's  
  6 | 't  
  7 | 've  
  8 | ZT  
  9 | ZZ  
 10 | a  
 11 | a's  
 12 | able  
 13 | about  
 14 | above  
 15 | abst  
 16 | accordance  
 17 | according  
 18 | accordingly  
 19 | across  
 20 | act  
 21 | actually  
 22 | added  
 23 | adj  
 24 | adopted  
 25 | affected  
 26 | affecting  
 27 | affects  
 28 | after  
 29 | afterwards  
 30 | again  
 31 | against  
 32 | ah  
 33 | ain't  
 34 | all  
 35 | allow  
 36 | allows  
 37 | almost  
 38 | alone  
 39 | along  
 40 | already  
 41 | also  
 42 | although  
 43 | always  
 44 | am  
 45 | among  
 46 | amongst  
 47 | an  
 48 | and  
 49 | announce  
 50 | another  
 51 | any  
 52 | anybody  
 53 | anyhow  
 54 | anymore  
 55 | anyone  
 56 | anything  
 57 | anyway  
 58 | anyways  
 59 | anywhere  
 60 | apart  
 61 | apparently  
 62 | appear  
 63 | appreciate  
 64 | appropriate  
 65 | approximately  
 66 | are  
 67 | area  
 68 | areas  
 69 | aren  
 70 | aren't  
 71 | arent  
 72 | arise  
 73 | around  
 74 | as  
 75 | aside  
 76 | ask  
 77 | asked  
 78 | asking  
 79 | asks  
 80 | associated  
 81 | at  
 82 | auth  
 83 | available  
 84 | away  
 85 | awfully  
 86 | b  
 87 | back  
 88 | backed  
 89 | backing  
 90 | backs  
 91 | be  
 92 | became  
 93 | because  
 94 | become  
 95 | becomes  
 96 | becoming  
 97 | been  
 98 | before  
 99 | beforehand  
100 | began  
101 | begin  
102 | beginning  
103 | beginnings  
104 | begins  
105 | behind  
106 | being  
107 | beings  
108 | believe  
109 | below  
110 | beside  
111 | besides  
112 | best  
113 | better  
114 | between  
115 | beyond  
116 | big  
117 | biol  
118 | both  
119 | brief  
120 | briefly  
121 | but  
122 | by  
123 | c  
124 | c'mon  
125 | c's  
126 | ca  
127 | came  
128 | can  
129 | can't  
130 | cannot  
131 | cant  
132 | case  
133 | cases  
134 | cause  
135 | causes  
136 | certain  
137 | certainly  
138 | changes  
139 | clear  
140 | clearly  
141 | co  
142 | com  
143 | come  
144 | comes  
145 | concerning  
146 | consequently  
147 | consider  
148 | considering  
149 | contain  
150 | containing  
151 | contains  
152 | corresponding  
153 | could  
154 | couldn't  
155 | couldnt  
156 | course  
157 | currently  
158 | d  
159 | date  
160 | definitely  
161 | describe  
162 | described  
163 | despite  
164 | did  
165 | didn't  
166 | differ  
167 | different  
168 | differently  
169 | discuss  
170 | do  
171 | does  
172 | doesn't  
173 | doing  
174 | don't  
175 | done  
176 | down  
177 | downed  
178 | downing  
179 | downs  
180 | downwards  
181 | due  
182 | during  
183 | e  
184 | each  
185 | early  
186 | ed  
187 | edu  
188 | effect  
189 | eg  
190 | eight  
191 | eighty  
192 | either  
193 | else  
194 | elsewhere  
195 | end  
196 | ended  
197 | ending  
198 | ends  
199 | enough  
200 | entirely  
201 | especially  
202 | et  
203 | et-al  
204 | etc  
205 | even  
206 | evenly  
207 | ever  
208 | every  
209 | everybody  
210 | everyone  
211 | everything  
212 | everywhere  
213 | ex  
214 | exactly  
215 | example  
216 | except  
217 | f  
218 | face  
219 | faces  
220 | fact  
221 | facts  
222 | far  
223 | felt  
224 | few  
225 | ff  
226 | fifth  
227 | find  
228 | finds  
229 | first  
230 | five  
231 | fix  
232 | followed  
233 | following  
234 | follows  
235 | for  
236 | former  
237 | formerly  
238 | forth  
239 | found  
240 | four  
241 | from  
242 | full  
243 | fully  
244 | further  
245 | furthered  
246 | furthering  
247 | furthermore  
248 | furthers  
249 | g  
250 | gave  
251 | general  
252 | generally  
253 | get  
254 | gets  
255 | getting  
256 | give  
257 | given  
258 | gives  
259 | giving  
260 | go  
261 | goes  
262 | going  
263 | gone  
264 | good  
265 | goods  
266 | got  
267 | gotten  
268 | great  
269 | greater  
270 | greatest  
271 | greetings  
272 | group  
273 | grouped  
274 | grouping  
275 | groups  
276 | h  
277 | had  
278 | hadn't  
279 | happens  
280 | hardly  
281 | has  
282 | hasn't  
283 | have  
284 | haven't  
285 | having  
286 | he  
287 | he's  
288 | hed  
289 | hello  
290 | help  
291 | hence  
292 | her  
293 | here  
294 | here's  
295 | hereafter  
296 | hereby  
297 | herein  
298 | heres  
299 | hereupon  
300 | hers  
301 | herself  
302 | hes  
303 | hi  
304 | hid  
305 | high  
306 | higher  
307 | highest  
308 | him  
309 | himself  
310 | his  
311 | hither  
312 | home  
313 | hopefully  
314 | how  
315 | howbeit  
316 | however  
317 | hundred  
318 | i  
319 | i'd  
320 | i'll  
321 | i'm  
322 | i've  
323 | id  
324 | ie  
325 | if  
326 | ignored  
327 | im  
328 | immediate  
329 | immediately  
330 | importance  
331 | important  
332 | in  
333 | inasmuch  
334 | inc  
335 | include  
336 | indeed  
337 | index  
338 | indicate  
339 | indicated  
340 | indicates  
341 | information  
342 | inner  
343 | insofar  
344 | instead  
345 | interest  
346 | interested  
347 | interesting  
348 | interests  
349 | into  
350 | invention  
351 | inward  
352 | is  
353 | isn't  
354 | it  
355 | it'd  
356 | it'll  
357 | it's  
358 | itd  
359 | its  
360 | itself  
361 | j  
362 | just  
363 | k  
364 | keep  
365 | keeps  
366 | kept  
367 | keys  
368 | kg  
369 | kind  
370 | km  
371 | knew  
372 | know  
373 | known  
374 | knows  
375 | l  
376 | large  
377 | largely  
378 | last  
379 | lately  
380 | later  
381 | latest  
382 | latter  
383 | latterly  
384 | least  
385 | less  
386 | lest  
387 | let  
388 | let's  
389 | lets  
390 | like  
391 | liked  
392 | likely  
393 | line  
394 | little  
395 | long  
396 | longer  
397 | longest  
398 | look  
399 | looking  
400 | looks  
401 | ltd  
402 | m  
403 | made  
404 | mainly  
405 | make  
406 | makes  
407 | making  
408 | man  
409 | many  
410 | may  
411 | maybe  
412 | me  
413 | mean  
414 | means  
415 | meantime  
416 | meanwhile  
417 | member  
418 | members  
419 | men  
420 | merely  
421 | mg  
422 | might  
423 | million  
424 | miss  
425 | ml  
426 | more  
427 | moreover  
428 | most  
429 | mostly  
430 | mr  
431 | mrs  
432 | much  
433 | mug  
434 | must  
435 | my  
436 | myself  
437 | n  
438 | n't  
439 | na  
440 | name  
441 | namely  
442 | nay  
443 | nd  
444 | near  
445 | nearly  
446 | necessarily  
447 | necessary  
448 | need  
449 | needed  
450 | needing  
451 | needs  
452 | neither  
453 | never  
454 | nevertheless  
455 | new  
456 | newer  
457 | newest  
458 | next  
459 | nine  
460 | ninety  
461 | no  
462 | nobody  
463 | non  
464 | none  
465 | nonetheless  
466 | noone  
467 | nor  
468 | normally  
469 | nos  
470 | not  
471 | noted  
472 | nothing  
473 | novel  
474 | now  
475 | nowhere  
476 | number  
477 | numbers  
478 | o  
479 | obtain  
480 | obtained  
481 | obviously  
482 | of  
483 | off  
484 | often  
485 | oh  
486 | ok  
487 | okay  
488 | old  
489 | older  
490 | oldest  
491 | omitted  
492 | on  
493 | once  
494 | one  
495 | ones  
496 | only  
497 | onto  
498 | open  
499 | opened  
500 | opening  
501 | opens  
502 | or  
503 | ord  
504 | order  
505 | ordered  
506 | ordering  
507 | orders  
508 | other  
509 | others  
510 | otherwise  
511 | ought  
512 | our  
513 | ours  
514 | ourselves  
515 | out  
516 | outside  
517 | over  
518 | overall  
519 | owing  
520 | own  
521 | p  
522 | page  
523 | pages  
524 | part  
525 | parted  
526 | particular  
527 | particularly  
528 | parting  
529 | parts  
530 | past  
531 | per  
532 | perhaps  
533 | place  
534 | placed  
535 | places  
536 | please  
537 | plus  
538 | point  
539 | pointed  
540 | pointing  
541 | points  
542 | poorly  
543 | possible  
544 | possibly  
545 | potentially  
546 | pp  
547 | predominantly  
548 | present  
549 | presented  
550 | presenting  
551 | presents  
552 | presumably  
553 | previously  
554 | primarily  
555 | probably  
556 | problem  
557 | problems  
558 | promptly  
559 | proud  
560 | provides  
561 | put  
562 | puts  
563 | q  
564 | que  
565 | quickly  
566 | quite  
567 | qv  
568 | r  
569 | ran  
570 | rather  
571 | rd  
572 | re  
573 | readily  
574 | really  
575 | reasonably  
576 | recent  
577 | recently  
578 | ref  
579 | refs  
580 | regarding  
581 | regardless  
582 | regards  
583 | related  
584 | relatively  
585 | research  
586 | respectively  
587 | resulted  
588 | resulting  
589 | results  
590 | right  
591 | room  
592 | rooms  
593 | run  
594 | s  
595 | said  
596 | same  
597 | saw  
598 | say  
599 | saying  
600 | says  
601 | sec  
602 | second  
603 | secondly  
604 | seconds  
605 | section  
606 | see  
607 | seeing  
608 | seem  
609 | seemed  
610 | seeming  
611 | seems  
612 | seen  
613 | sees  
614 | self  
615 | selves  
616 | sensible  
617 | sent  
618 | serious  
619 | seriously  
620 | seven  
621 | several  
622 | shall  
623 | she  
624 | she'll  
625 | shed  
626 | shes  
627 | should  
628 | shouldn't  
629 | show  
630 | showed  
631 | showing  
632 | shown  
633 | showns  
634 | shows  
635 | side  
636 | sides  
637 | significant  
638 | significantly  
639 | similar  
640 | similarly  
641 | since  
642 | six  
643 | slightly  
644 | small  
645 | smaller  
646 | smallest  
647 | so  
648 | some  
649 | somebody  
650 | somehow  
651 | someone  
652 | somethan  
653 | something  
654 | sometime  
655 | sometimes  
656 | somewhat  
657 | somewhere  
658 | soon  
659 | sorry  
660 | specifically  
661 | specified  
662 | specify  
663 | specifying  
664 | state  
665 | states  
666 | still  
667 | stop  
668 | strongly  
669 | sub  
670 | substantially  
671 | successfully  
672 | such  
673 | sufficiently  
674 | suggest  
675 | sup  
676 | sure  
677 | t  
678 | t's  
679 | take  
680 | taken  
681 | taking  
682 | tell  
683 | tends  
684 | th  
685 | than  
686 | thank  
687 | thanks  
688 | thanx  
689 | that  
690 | that'll  
691 | that's  
692 | that've  
693 | thats  
694 | the  
695 | their  
696 | theirs  
697 | them  
698 | themselves  
699 | then  
700 | thence  
701 | there  
702 | there'll  
703 | there's  
704 | there've  
705 | thereafter  
706 | thereby  
707 | thered  
708 | therefore  
709 | therein  
710 | thereof  
711 | therere  
712 | theres  
713 | thereto  
714 | thereupon  
715 | these  
716 | they  
717 | they'd  
718 | they'll  
719 | they're  
720 | they've  
721 | theyd  
722 | theyre  
723 | thing  
724 | things  
725 | think  
726 | thinks  
727 | third  
728 | this  
729 | thorough  
730 | thoroughly  
731 | those  
732 | thou  
733 | though  
734 | thoughh  
735 | thought  
736 | thoughts  
737 | thousand  
738 | three  
739 | throug  
740 | through  
741 | throughout  
742 | thru  
743 | thus  
744 | til  
745 | tip  
746 | to  
747 | today  
748 | together  
749 | too  
750 | took  
751 | toward  
752 | towards  
753 | tried  
754 | tries  
755 | truly  
756 | try  
757 | trying  
758 | ts  
759 | turn  
760 | turned  
761 | turning  
762 | turns  
763 | twice  
764 | two  
765 | u  
766 | un  
767 | under  
768 | unfortunately  
769 | unless  
770 | unlike  
771 | unlikely  
772 | until  
773 | unto  
774 | up  
775 | upon  
776 | ups  
777 | us  
778 | use  
779 | used  
780 | useful  
781 | usefully  
782 | usefulness  
783 | uses  
784 | using  
785 | usually  
786 | uucp  
787 | v  
788 | value  
789 | various  
790 | very  
791 | via  
792 | viz  
793 | vol  
794 | vols  
795 | vs  
796 | w  
797 | want  
798 | wanted  
799 | wanting  
800 | wants  
801 | was  
802 | wasn't  
803 | way  
804 | ways  
805 | we  
806 | we'd  
807 | we'll  
808 | we're  
809 | we've  
810 | wed  
811 | welcome  
812 | well  
813 | wells  
814 | went  
815 | were  
816 | weren't  
817 | what  
818 | what'll  
819 | what's  
820 | whatever  
821 | whats  
822 | when  
823 | whence  
824 | whenever  
825 | where  
826 | where's  
827 | whereafter  
828 | whereas  
829 | whereby  
830 | wherein  
831 | wheres  
832 | whereupon  
833 | wherever  
834 | whether  
835 | which  
836 | while  
837 | whim  
838 | whither  
839 | who  
840 | who'll  
841 | who's  
842 | whod  
843 | whoever  
844 | whole  
845 | whom  
846 | whomever  
847 | whos  
848 | whose  
849 | why  
850 | widely  
851 | will  
852 | willing  
853 | wish  
854 | with  
855 | within  
856 | without  
857 | won't  
858 | wonder  
859 | words  
860 | work  
861 | worked  
862 | working  
863 | works  
864 | world  
865 | would  
866 | wouldn't  
867 | www  
868 | x  
869 | y  
870 | year  
871 | years  
872 | yes  
873 | yet  
874 | you  
875 | you'd  
876 | you'll  
877 | you're  
878 | you've  
879 | youd  
880 | young  
881 | younger  
882 | youngest  
883 | your  
884 | youre  
885 | yours  
886 | yourself  
887 | yourselves  
888 | z  
889 | zero  
890 | zt  
891 | zz
892 | 


--------------------------------------------------------------------------------
/src/data/text.txt:
--------------------------------------------------------------------------------
1 | 人工智能技术在防疫抗疫工作中大显身手  发布时间：2020-02-25 来源：人工智能实验室  近期，新型冠状病毒肺炎（简称“新冠肺炎”）的疫情突如其来，让人们有些措手不及。但是为了实现更好的防疫抗疫效果，不少研究人员纷纷应用诸多技术手段来抗击疫情。其中人工智能技术已成为这场防疫抗疫攻坚战的有力武器之一；它在疫情防控、图像分析、辅助诊断、疫苗研发、新药研制等方面助力防疫抗疫工作。  在疫情防控方面  新冠肺炎来势汹汹，但是它依然可防可控。采取有效的措施预防，戴口罩、勤洗手、居家隔离等都是非常行之有效的方法。例如戴口罩是预防传染病最重要、最有效的防控手段之一，可以有效降低感染新冠肺炎的风险。又如体温筛检是此次疫情中筛查排查可疑病例的一个手段。人工智能技术在疫情防控的各个应用场景中都可发挥重要作用，这些应用场景都能直接为患者或者潜在的患者人群带来切实好处。  北京旷视科技有限公司最近推出一套用于发热及潜在被感染对象识别、筛查与分析的人工智能新系统“明骥”。该系统通过前端红外相机，鉴别人流中的高温人员，再根据疑似发烧者的人体、人脸信息，利用人工智能技术辅助工作人员快速定位体温异常者；做到了在佩戴口罩的情况下，也能精准锁定。目前，“明骥”已应用在地铁、火车站、机尝集中办公区等人流量较大的区域。  在图像分析方面  医疗影像数据是医疗数据的重要组成部分，人工智能技术能够通过快速准确地标记新冠肺炎的特定异常结构来提高图像分析的效率，以供放射科医生参考。提高图像分析效率，可让放射科医生腾出更多的时间聚焦在需要更多解读或判断的内容审阅上，从而有望缓解他们供给缺口问题。另外，这还可避免放射科医生以及临床医生被别人感染，降低他们的安全风险。  上海人工智能研究院与杭州健培科技有限公司联合研发的新冠肺炎影像云检测平台最近正式上线，对全国医院进行免费影像云诊断服务，并对所有医疗机构和各级政府免费开放，将高效、准确地为放射科医生以及临床医生提供决策依据，助力疫情防控。新冠肺炎影像云检测平台上线后，能够为临床一线抗疫医生疫情评估、肺炎性质判定、治疗方案制定提供高效精确的支撑依据。  在辅助诊断方面  医疗诊断是一个综合考虑各种影响因素的判断过程；利用人工智能技术辅助诊断新冠肺炎，能够在短时间内精准地预判病情，对提高患者预后具有重要作用。人工智能技术辅助诊断的功能既可以精确分割CT扫描部位的病灶；还可以对病灶的CT影像做分析，找出疑似病变和组织结构的异常，并给出诊断方向。在质控及病变识别方面，具有更为宽泛的使用范围。  在CT影像快速诊断方面，北京推想科技与武汉同济医院、深圳市第三人民医院合作研发针对新冠肺炎特别版，该版利用人工智能技术的深度学习、图像识别等对检出的病灶进行测量、密度分析，支持患者前后片对照，提供量化数据对比结果，帮助医生更快完成疑似患者诊断。北京安德医智联合解放军总医院正在研发新冠肺炎CT影像人工智能辅助诊断系统，免费提供给全国各级医院使用。  在疫苗研发方面  随着疫情持续，很多民众非常关心新冠肺炎的疫苗研发进展。据介绍，无论是对病毒进行基因测序，找到病毒来源以及传播宿主，还是研发病毒疫苗，人工智能技术都大有用武之地。例如传统的疫苗研发需在实验室中对数百种药物成分进行生物测试，这一过程往往要耗费不少时间；而人工智能技术可以极大加速这个过程，能够让更多的人获得疫苗的保护。  浙江大学研究团队最近利用人工智能技术在已有的药物中找到两种抗击疫情药物，从而使疫苗的研发工作取得了阶段性的成果。这两种药物有可能成为新冠肺炎候选疫苗，目前正在进行临床试验。据了解,将人工智能技术用于筛选和研发疫苗,能够帮助研究人员在已有的药物中快速找到可能对预防新冠肺炎有效的生物制品。  在新药研制方面  新冠肺炎的临床表现以发热﹑乏力﹑干咳为主要表现；而随着疾病的进展会出现急性呼吸窘迫综合征、难以纠正的代谢性酸中毒等，需要给予积极有效的治疗。但是目前还没有明确的特效药能够治疗新冠肺炎，只能根据患者的一般情况进行对症治疗，预防继发的感染，及时进行器官的功能支持。不过研究人员正在利用人工智能技术研制针对该病的特效药，新药很快就会问世。  美国麻省理工学院研究团队近日利用人工智能技术发现一种新型抗生素，它可以杀灭多种致病细菌，包括一些对所有已知抗生素都具耐药性的细菌菌株。研究人员通过让机器学习算法在几天内充分筛查庞大数据库中逾1亿种化合物，终于发现了这种抗生素；该抗生素被认为能有效抑制大肠杆菌，对治疗新冠肺炎也有效。  由上可知，人工智能技术正在新冠肺炎的防疫抗疫工作中大显身手。可以预料，作为一种综合性极强的技术，人工智能将在医疗健康领域内得到越来越多的应用,并将成为影响医学行业发展的重要科技手段。正如我国著名学者周海中教授曾经指出的那样：“随着社会的发展和科技的进步，人工智能技术将在医疗健康领域大显身手；其成果会不断涌现，应用前景令人期待。”
2 | 


--------------------------------------------------------------------------------
/src/data/users_comments.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/data/users_comments.xls


--------------------------------------------------------------------------------
/src/exe/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Tue Jan 26 21:47:15 2021
 4 | 
 5 | @author: Xu
 6 | """
 7 | import sys
 8 | import os
 9 | curPath = os.path.abspath(os.path.dirname(__file__))
10 | rootPath = os.path.split(curPath)[0]
11 | sys.path.append(os.path.split(rootPath)[0])


--------------------------------------------------------------------------------
/src/exe/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/exe_01.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_01.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/exe_01.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_01.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/exe_02.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_02.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/exe_02.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_02.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/exe_03.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_03.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/exe_03.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_03.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/exe_05.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_05.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/__pycache__/exe_06.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/__pycache__/exe_06.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/exe_01.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Thu Oct 29 10:26:02 2020
  4 | 
  5 | @author: Xu
  6 | 
  7 | 1.竞品分析：
  8 | 负责响应数据查询请求，调用数据逻辑程序。
  9 | 基于数据逻辑查询结果，业务逻辑程序组装出竞品分析数据并返回给前端页面。
 10 | 
 11 | """
 12 | import __init__
 13 | import numpy as np
 14 | from src import config
 15 | import pandas as pd
 16 | import collections 
 17 | from pyecharts.charts import Bar, Timeline, Radar, Tab, Boxplot
 18 | from pyecharts import options as opts
 19 | 
 20 | 
 21 | 
 22 | # 1. 竞品分析
 23 | # 数据逻辑： 
 24 | def itemcount(dictionary):
 25 |     """
 26 |     此函数用于计算字典重复的key个数 。
 27 |     
 28 |      Parameters
 29 |     ----------
 30 |     dictionary : TYPE-dict
 31 |        
 32 |     Returns
 33 |     -------
 34 |     count : TYPE-dict
 35 |         new dictionary.
 36 | 
 37 |     """
 38 |     count = {}  
 39 |     try:
 40 |         del dictionary[np.nan]  
 41 |     except:
 42 |         None
 43 |     for i in dictionary.keys():
 44 |         items = i.split(',')
 45 |         for j in items:
 46 |             j = j.strip()
 47 |             if j in count.keys():
 48 |                 count[j] += dictionary[i]
 49 |             else:
 50 |                 count[j] = dictionary[i]
 51 |     return count
 52 |      
 53 | def product_feature_query():
 54 |     """
 55 |     此函数用于返回产品功能的分组统计数据。
 56 | 
 57 |     Returns
 58 |     -------
 59 |     dataX: TYPE-list all product feature names.
 60 |     dataY: TYPE-list Product feature percentage(%).
 61 | 
 62 |     """
 63 | 
 64 |     # load data
 65 |     data = pd.read_csv(config.product_data_path)
 66 |     # select data
 67 |     num_product = data.shape[0]
 68 |     cates = collections.Counter(data['Category_Features'])
 69 |     cates1 = itemcount(dict(cates))
 70 |     dataX = list(cates1.keys())
 71 |     dataY = list(cates1.values())
 72 |     dataY = [round(i/num_product*100,2) for i in dataY] 
 73 |     return dataX, dataY
 74 | 
 75 | 
 76 | def product_edc_feature_query():
 77 |     """
 78 |     此函数用于获取EDC产品功能的分组统计数据。
 79 | 
 80 |     Returns
 81 |     -------
 82 |     dataX : TYPE-list
 83 |         EDC feature names.
 84 |     dataY : TYPE-list
 85 |         EDC feature percentage(%).
 86 | 
 87 |     """
 88 |     # load data
 89 |     data = pd.read_csv(config.product_data_path)
 90 |     # select data
 91 |     num_product = data.shape[0]
 92 |     cate_list = [list(j.strip() for j in i.split(',')) for i in data['Category_Features']]
 93 |     cate_list_edc = [i.index("Electronic Data Capture Features") for i in cate_list]
 94 |     feats = data['Category_Features_List']
 95 |     edc=dict(zip(list(eval(feats[0]).keys()),[0]*len(eval(feats[0]).keys())))   
 96 |     for i in range(len(feats)):
 97 |         if len(cate_list[i])==1:
 98 |             for j in dict(eval(feats[i])).keys():
 99 |                 if dict(eval(feats[i]))[j]=='able':
100 |                     edc[j]+=1
101 |                 else:
102 |                     edc[j]=edc[j]
103 |         else:
104 |             for j in edc.keys():
105 |                 if list(eval(feats[i]))[cate_list_edc[i]]=='able':
106 |                     edc[j]+=1
107 |                 else:
108 |                     edc[j]=edc[j]
109 |     dataX = list(edc.keys())
110 |     dataY = list(edc.values())
111 |     dataY = [round(i/num_product*100,2) for i in dataY]
112 |     return dataX, dataY
113 | 
114 |     
115 | def product_deployment_query():
116 |     """
117 |     此函数用于获取EDC产品安装的分组统计数据。
118 | 
119 |     Returns
120 |     -------
121 |     dataX : TYPE-list
122 |         EDC deployment method names.
123 |     dataY : TYPE-list
124 |         each EDC deployment percentage(%).
125 | 
126 |     """
127 |     # load data
128 |     data = pd.read_csv(config.product_data_path)
129 |     # select data
130 |     num_product = data.shape[0]
131 |     dp = collections.Counter(data['deployment'])
132 |     dp1 = itemcount(dict(dp))
133 |     dataX = list(dp1.keys())
134 |     dataY = list(dp1.values())
135 |     dataY = [round(i/num_product*100,2) for i in dataY] 
136 |     return dataX, dataY
137 |     
138 |     
139 | def product_train_query(): 
140 |     """
141 |     此函数用于获取EDC产品培训的分组统计数据。
142 | 
143 |     Returns
144 |     -------
145 |     dataX : TYPE-list
146 |         EDC training method names.
147 |     dataY : TYPE-list
148 |         each EDC training percentage(%).
149 | 
150 |     
151 |     """  
152 |     
153 |     # load data
154 |     data = pd.read_csv(config.product_data_path)
155 |     # select data
156 |     num_product = data.shape[0]
157 |     tr = collections.Counter(data['training'])
158 |     tr1 = itemcount(dict(tr))
159 |     dataX = list(tr1.keys())
160 |     dataY = list(tr1.values())
161 |     dataY = [round(i/num_product*100,2) for i in dataY] 
162 |     return dataX, dataY
163 |     
164 | def product_support_query(): 
165 |     """
166 |     此函数用于获取EDC产品售后支持的分组统计数据。
167 | 
168 |     Returns
169 |     -------
170 |     dataX : TYPE-list
171 |         EDC support method names.
172 |     dataY : TYPE-list
173 |         each EDC support percentage(%).
174 | 
175 |     
176 |     """
177 |     
178 |     # load data
179 |     data = pd.read_csv(config.product_data_path)
180 |     # select data
181 |     num_product = data.shape[0]
182 |     sp = collections.Counter(data['support'])
183 |     sp1 = itemcount(dict(sp))
184 |     dataX = list(sp1.keys())
185 |     dataY = list(sp1.values())
186 |     dataY = [round(i/num_product*100,2) for i in dataY] 
187 |     return dataX, dataY   
188 | 
189 | 
190 | 
191 | def product_price_query():
192 |     """
193 |     此函数用于获取EDC产品价格的分组统计数据。
194 | 
195 |     Returns
196 |     -------
197 |     dataX : TYPE-list
198 |         EDC product pricing.
199 |     dataY : TYPE-list
200 |         EDC product pricing
201 |         Data for each EDC pricing.
202 | 
203 |     """
204 |     # load data
205 |     data = pd.read_csv(config.product_data_path)
206 |     # select data
207 |     dataX = list(collections.Counter(data['starting_price_method_fill']).keys())
208 |     dataY = []
209 |     for m in dataX:     
210 |         m_val = data.loc[(data['starting_price_method_fill']==m),'starting_price_num_fill'].values.tolist()
211 |         dataY.append(m_val)
212 |     return dataX, dataY
213 |     
214 |     
215 | 
216 | def product_rating_query():
217 |     """
218 |     It is used to get the average score of each product rating.
219 | 
220 |     Returns
221 |     -------
222 |     dataX : TYPE-list
223 |         rating index: i.e. 'rating_overall', 'rating_ease_of_use'.
224 |     products : TYPE-list
225 |         all product names.
226 |     list_dataY : TYPE-array
227 |         each product's average rating score.
228 | 
229 |     """
230 |     # load data
231 |     data = pd.read_csv(config.user_data_path)
232 |     # select data
233 |     rate_overall_mean = round(data.groupby('product_name')['rating_overall'].mean(),2)
234 |     rate_use_mean = round(data.groupby('product_name')['rating_ease_of_use'].mean(),2)
235 |     rate_fun_mean = round(data.groupby('product_name')['rating_features_functionality'].mean(),2)
236 |     rate_mn_mean = round(data.groupby('product_name')['rating_value_for_money'].mean(),2)
237 |     rate_sp_mean = round(data.groupby('product_name')['rating_customer_support'].mean(),2)
238 |     rate_rec_mean = round(data.groupby('product_name')['rating_likelihood_to_recommend'].mean(),2)
239 |     df_rate = pd.concat([rate_overall_mean,
240 |                          rate_use_mean,
241 |                          rate_fun_mean,
242 |                          rate_mn_mean,
243 |                          rate_sp_mean,
244 |                          rate_rec_mean],axis=1)
245 |     dataX = list(df_rate.columns)
246 |     products = list(df_rate.index)
247 |     list_dataY = df_rate.values
248 |     return dataX, products, list_dataY
249 | 
250 | 
251 | def rt_index_query():
252 |     """
253 |     It is used to get the real-time index about the product and user analysis.
254 |     
255 |     Returns
256 |     -------
257 |     product_sum : TYPE-int
258 |         number of products.
259 |     edc_feature_sum : TYPE-int
260 |         number of features of EDC product.
261 |     user_sum : TYPE-int
262 |         number of users participating in the survey.
263 | 
264 |     """
265 |     # load data
266 |     data = pd.read_csv(config.product_data_path)
267 |     data2 = pd.read_csv(config.user_data_path)
268 |     # select data
269 |     product_sum = data.shape[0]
270 |     edc_feature_sum = len(product_edc_feature_query()[0])
271 |     user_sum = data2.shape[0]
272 |     return product_sum, edc_feature_sum, user_sum
273 |     
274 | 
275 | 
276 | # 业务逻辑：
277 | colorList = ['#bcd3bb', '#e88f70', '#9dc5c8', '#e1e8c8',
278 |             '#7b7c68', '#e5b5b5', '#f0b489', '#928ea8',
279 |             '#bda29a', '#376956', '#c3bed4', '#495a80',
280 |             '#9966cc', '#bdb76a', '#eee8ab', '#a35015',
281 |             '#04dd98', '#d9b3e6', '#b6c3fc','#315dbc']
282 | 
283 | def rt_index_base():
284 |     """
285 |     It is used to get the data of realtime index.
286 | 
287 |     Returns
288 |     -------
289 |     cur : TYPE-dict
290 |         realt-time index.
291 | 
292 |     """
293 |     product_sum, edc_feature_sum, user_sum = rt_index_query()
294 |     cur = {"product_sum": product_sum, "edc_feature_sum": edc_feature_sum, "user_sum":user_sum}
295 |     return cur
296 | 
297 | 
298 | def hist_product_feature_base():  
299 |     """
300 |     此函数用于获取产品功能柱状图的参数。
301 | 
302 |     Returns
303 |     -------
304 |     c : TYPE-echarts parameters
305 |         return echarts parameters.
306 | 
307 |     """
308 |     # data query
309 |     dataX, dataY = product_feature_query()
310 |     # Declare objects, render pictures
311 |     c = (
312 |         Bar()
313 |         .add_xaxis(dataX)
314 |         .add_yaxis("Product category",dataY,color='#4150d8')
315 |         .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"),
316 |                          yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"),
317 |                          title_opts=opts.TitleOpts(title="Distribution of different product function category"),
318 |                          datazoom_opts=[opts.DataZoomOpts()],
319 |                          legend_opts=opts.LegendOpts( pos_left="80%"),
320 |                          )
321 |         
322 |         )
323 |     return c
324 |     
325 |     
326 | 
327 | def hist_product_edc_feature_base():  
328 |     """
329 |     此函数用于获取产品EDC功能柱状图的参数。
330 | 
331 |     Returns
332 |     -------
333 |     c : TYPE-echarts parameters
334 |         return echarts parameters.
335 | 
336 |     """
337 |     # data query
338 |     dataX, dataY = product_edc_feature_query()
339 |     # Declare objects, render pictures
340 |     c = (
341 |         Bar()
342 |         .add_xaxis(dataX) 
343 |         .add_yaxis("EDC Features",dataY,color='#28bf7e')
344 |         .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"),
345 |                          yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"),
346 |                          title_opts=opts.TitleOpts(title="Distribution of Electronic Data Capture Features"),
347 |                          legend_opts=opts.LegendOpts( pos_left="80%"),
348 |                          )
349 |         
350 |         )
351 |     return c
352 |     
353 | 
354 | 
355 | def hist_product_deployment_base():  
356 |     """
357 |     此函数用于获取产品安装的柱状图的参数。
358 | 
359 |     Returns
360 |     -------
361 |     c : TYPE-echarts parameters
362 |         return echarts parameters.
363 | 
364 |     """
365 |     # data query
366 |     dataX, dataY = product_deployment_query()
367 |     # Declare objects, render pictures
368 |     c = (
369 |         Bar()
370 |         .add_xaxis(dataX)
371 |         .add_yaxis("Deployment",dataY,color='#ed7c2f')
372 |         .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"), #name="Product Deployment Method", name_location="center", name_gap=25
373 |                          yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"),
374 |                          title_opts=opts.TitleOpts(title="Distribution of Product Deployment")
375 |                          )
376 |         
377 |         )
378 |     
379 |     return c
380 |     
381 |    
382 |     
383 | def hist_product_train_base():  
384 |     """
385 |     此函数用于获取产品培训的柱状图的参数。
386 | 
387 |     Returns
388 |     -------
389 |     c : TYPE-echarts parameters
390 |         return echarts parameters.
391 | 
392 |     """
393 |     # data query
394 |     dataX, dataY = product_train_query()
395 |     # Declare objects, render pictures
396 |     c = (
397 |         Bar()
398 |         .add_xaxis(dataX)
399 |         .add_yaxis("Training",dataY,color='#b6c2ff')
400 |         .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category", ),#name="Product Training Method", name_location="center", name_gap=25
401 |                          yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"),
402 |                          title_opts=opts.TitleOpts(title="Distribution of Product Training Method")
403 |                          )
404 |         
405 |         )
406 |     return c
407 |     
408 | def boxplot_product_price_base():
409 |     """
410 |     此函数用于获取产品价格的箱线图的参数。
411 | 
412 |     Returns
413 |     -------
414 |     c : TYPE-echarts parameters
415 |         return echarts parameters.
416 | 
417 |     """
418 |     # data query
419 |     dataX, dataY = product_price_query()
420 |     # Declare objects, render pictures
421 |     c = Boxplot()
422 |     c.add_xaxis(dataX)
423 |     c.add_yaxis("Pricing",c.prepare_data(dataY))
424 |     c.set_global_opts(title_opts=opts.TitleOpts(title="Distribution of Product Pricing", subtitle="Unit:$"))
425 |     return c
426 |     
427 |     
428 | 
429 | 
430 | def hist_product_support_base():  
431 |     """
432 |     此函数用于获取产品售后支持的柱状图的参数。
433 | 
434 |     Returns
435 |     -------
436 |     c : TYPE-echarts parameters
437 |         return echarts parameters.
438 | 
439 |     """
440 |     # data query
441 |     dataX, dataY = product_support_query()
442 |     # Declare objects, render pictures
443 |     c = (
444 |         Bar()
445 |         .add_xaxis(dataX)
446 |         .add_yaxis("Support",dataY,color='#f2a93b')
447 |         .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category", ),#name="Product Support Method", name_location="center", name_gap=25
448 |                          yaxis_opts=opts.AxisOpts(type_="value", name="Proportion of products(%)"),
449 |                          title_opts=opts.TitleOpts(title="Distribution of Product Support Method")
450 |                          )
451 |         
452 |         )
453 |     
454 |     return c
455 |     
456 |     
457 | 
458 | 
459 | def radar_product_rating_base():
460 |     """
461 |     此函数用于获取产品多维评价的雷达图的参数。
462 | 
463 |     Returns
464 |     -------
465 |     tl : TYPE-echarts parameters
466 |         return echarts parameters.
467 |     """
468 |     
469 |     # data query
470 |     dataX, products, list_dataY = product_rating_query()
471 |     c_schema = []
472 |     for i in range(len(dataX)):
473 |         c_schema.append({"name":dataX[i],"max":5,"min":0})
474 |         
475 |     # Declare objects, render pictures
476 |     tl = Timeline()
477 |     for i in range(len(products)):
478 |         c = (
479 |             Radar(init_opts=opts.InitOpts(width="1280px", height="720px"))
480 |             .add_schema(schema=c_schema,
481 |                         splitarea_opt=opts.SplitAreaOpts(is_show=True, areastyle_opts=opts.AreaStyleOpts(opacity=1)),
482 |                         textstyle_opts=opts.TextStyleOpts(color="#fff"),
483 |                         )
484 |             .add(series_name=products[i], data=[list(list_dataY[i])], 
485 |                  linestyle_opts=opts.LineStyleOpts(color="#CD0000"),
486 |                  )
487 |             .set_series_opts(label_opts=opts.LabelOpts(is_show=True))
488 |             .set_global_opts(title_opts=opts.TitleOpts(title="Multi-dimensional analysis of product performance"),
489 |                              legend_opts=opts.LegendOpts(pos_left="80%", pos_top="50%"))
490 |         )
491 |         tl.add(c, "{}".format(products[i]))
492 |     return tl
493 | 
494 | 
495 | def tab_product_base():
496 |     """
497 |     It is used to respond to requests for chart parameters.
498 | 
499 |     Returns
500 |     -------
501 |     tab : TYPE-echarts parameters
502 |         return echarts parameters.
503 | 
504 |     """
505 |     tab = Tab()
506 |     tab.add(hist_product_feature_base(), "Product Module")
507 |     tab.add(hist_product_edc_feature_base(), "EDC Feature")
508 |     tab.add(hist_product_deployment_base(), "Product Deployment")
509 |     tab.add(hist_product_support_base(),"Product Support")
510 |     tab.add(hist_product_train_base(), "Product Train")
511 |     tab.add(radar_product_rating_base(),"Product Rating")
512 |     return tab
513 | 
514 | 
515 | 
516 | 
517 | 
518 | 
519 | 
520 | 
521 | 
522 | 
523 | 
524 | 


--------------------------------------------------------------------------------
/src/exe/exe_02.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Mon Nov  2 17:34:12 2020
  4 | 
  5 | @author: Xu
  6 | 
  7 | 
  8 | 2.自动生成词云图：
  9 | 负责响应数据查询请求，调用数据逻辑程序。
 10 | 基于数据逻辑查询结果，业务逻辑程序组装出词云图并返回给前端页面。
 11 | 
 12 | 三种类型的高频词和关键词可视化：
 13 | 
 14 | 根据用户输入指定网址，通过采集该网址文本进行处理。
 15 | 根据用户输入文本字符串进行处理。
 16 | 根据用户输入载入本地文本进行处理，用户将所需要处理文本文件放入text文本夹中，指定文件名进行处理。
 17 | 
 18 | """
 19 | import __init__
 20 | import os
 21 | import numpy as np
 22 | from src import config
 23 | from collections import Counter
 24 | from pyecharts import options as opts
 25 | from pyecharts.charts import WordCloud
 26 | from pyecharts.render import make_snapshot
 27 | from jieba import posseg 
 28 | # import wordcloud 
 29 | from newspaper import Article
 30 | # from imageio import imread
 31 | from snapshot_selenium import snapshot
 32 | 
 33 | # 2. 自动生成词云
 34 | # 数据逻辑：
 35 | def save_to_file(filepath, content):
 36 |     f = open(filepath, 'w', encoding='utf-8') 
 37 |     f.write(content)
 38 |     f.close()
 39 |     
 40 | def extract_words(content):
 41 |     """
 42 |     Statistical word frequency.
 43 | 
 44 |     Parameters
 45 |     ----------
 46 |     content : TYPE-str
 47 |         DESCRIPTION: text.
 48 | 
 49 |     Returns
 50 |     -------
 51 |     word_dict : TYPE-dictionary
 52 |         DESCRIPTION: dictionary like {Word1: Frequency1, Word2: Frequency2} .
 53 | 
 54 |     """
 55 |     words = []
 56 |     pos_filters = ['n', 'v', 'a']
 57 |     for line in content.split('\n'):
 58 |         line = line.strip()
 59 |         if not line:
 60 |             continue
 61 |         words += [w.word for w in posseg.cut(line) if w.flag[0] in pos_filters and len(w.word) > 1]
 62 |     word_dict = {i[0]: i[1] for i in Counter(words).most_common()}
 63 |     return word_dict
 64 |  
 65 |     
 66 | def read_file(filepath):
 67 |         """
 68 |         Read the local file and transform to text.
 69 | 
 70 |         Parameters
 71 |         ----------
 72 |         filepath : TYPE-str
 73 |             DESCRIPTION: the text file path.
 74 | 
 75 |         Returns
 76 |         -------
 77 |         content : TYPE-str
 78 |             DESCRIPTION:The preprocessed news text.
 79 | 
 80 |         """
 81 |         f = open(filepath,'r',encoding='utf-8')
 82 |         content = f.read()
 83 |         f.close()
 84 |         return content   
 85 | 
 86 | 
 87 | def get_sorted_dict(dic, topn=100):
 88 |     """
 89 |     Sort the dictionary by value and take the top n with the largest value.
 90 |     
 91 |     Parameters:
 92 |         dic: TYPE-dictionary
 93 |             DESCRIPTION: Dictionary to be sorted.
 94 |         topn: TYPE-integer
 95 |             DESCRIPTION: Select the N (key: value) with the largest value (default = 100).
 96 | 
 97 |     Returns
 98 |     -------
 99 |     res : TYPE-list
100 |         DESCRIPTION: A list composed of the top N key-value combinations after sorting, 
101 |         like [('a', 101),('b',78),...].
102 |     
103 |     """
104 |     res = sorted(dic.items(), key=lambda item: item[1], reverse=True)
105 |     if topn > len(dic):
106 |          return res
107 |     else: 
108 |          return res[0: topn]
109 |     
110 |     
111 | def text_wordfreq_by_url_query():
112 |     """
113 |     According to the user's input to specify the URL, the text of the URL is collected, 
114 |     and a word cloud image is automatically generated.
115 | 
116 |     Returns
117 |     -------
118 |         word_dict:TYPE-dictionary
119 |             DESCRIPTION: dictionary like {Word1: Frequency1, Word2: Frequency2} .
120 | 
121 |     """
122 |     
123 |     def get_webcontent(url):
124 |         """
125 |         Online mode: According to the URL, grab the text content of the news.
126 | 
127 |         Parameters
128 |         ----------
129 |         url : TYPE-str
130 |             DESCRIPTION: news online URL.
131 | 
132 |         Returns
133 |         -------
134 |         content : TYPE-str
135 |             DESCRIPTION:The preprocessed news text.
136 | 
137 |         """
138 |         news = Article(url, language='zh')
139 |         news.download()
140 |         news.parse()
141 |         content = news.text
142 |         return content
143 |     
144 |     url = read_file(config.wc_input_url_path)
145 |     content = get_webcontent(url)
146 |     word_dict = extract_words(content)
147 |     return word_dict
148 |     
149 |     
150 |  
151 | def text_wordfreq_by_input_query():
152 |     """
153 |     According to the text input by the user, a word cloud image is automatically generated.
154 |     
155 |     Parameters:
156 |         input_text: TYPE-str
157 |             DESCRIPTION: the text input by the user.
158 | 
159 |     Returns
160 |     -------
161 |         word_dict:TYPE-dictionary
162 |             DESCRIPTION: dictionary like {Word1: Frequency1, Word2: Frequency2} .
163 | 
164 |     """
165 |     input_text = read_file(config.wc_input_text_path)
166 |     word_dict = extract_words(input_text)
167 |     return word_dict
168 |     
169 |     
170 |     
171 |     
172 |     
173 | def text_wordfreq_by_import_file_query():    
174 |     """
175 |     According to the local file imported by the user, the word cloud image 
176 |     is automatically generated.
177 |     
178 |     Parameters:
179 |         textfile: TYPE-str
180 |             DESCRIPTION: the text file imported by user.
181 | 
182 | 
183 |     Returns
184 |     -------
185 |         word_dict:TYPE-dictionary
186 |             DESCRIPTION: dictionary like {Word1: Frequency1, Word2: Frequency2} .
187 | 
188 |     """
189 |     path = read_file(config.wc_input_file_save_path).strip()
190 |     content = read_file(path)
191 |     word_dict = extract_words(content)
192 |     return word_dict
193 |     
194 |     
195 |     
196 | 
197 | # 业务逻辑：
198 | def pic_rt_user_url_base():
199 |     path = config.pic_wc_input_url_save_path
200 |     file_dir, filename = os.path.split(read_file(path).strip())
201 |     return file_dir, filename
202 |     
203 | 
204 |     
205 | def rt_user_url_base():
206 |     """
207 |     It is used to return the requested real-time data.
208 | 
209 |     Returns
210 |     -------
211 |     curinput : TYPE-dictionary
212 |         return the frontend requested real-time data.
213 |     
214 |     """
215 |     userurl = read_file(config.wc_input_url_path)
216 |     curinput = {'userurl': userurl}
217 |     return curinput
218 | 
219 | def pic_rt_user_input_text_base():
220 |     path = config.pic_wc_input_text_save_path
221 |     file_dir, filename = os.path.split(read_file(path).strip())
222 |     return file_dir, filename
223 | 
224 | 
225 | 
226 | def rt_user_input_text_base():
227 |     """
228 |     It is used to return the requested real-time data.
229 | 
230 |     Returns
231 |     -------
232 |     curinput : TYPE-dictionary
233 |         return the frontend requested real-time data.
234 |     
235 |     """
236 |     usertext = read_file(config.wc_input_text_path)
237 |     curinput = {'usertext':usertext }
238 |     return curinput
239 | 
240 | def pic_rt_user_import_file_base():
241 |     path = config.pic_wc_input_file_save_path
242 |     file_dir, filename = os.path.split(read_file(path).strip())
243 |     return file_dir, filename
244 | 
245 | 
246 | 
247 | def rt_user_import_file_base():
248 |     """
249 |     It is used to return the requested real-time data.
250 | 
251 |     Returns
252 |     -------
253 |     curinput : TYPE-dictionary
254 |         return the frontend requested real-time data.
255 |     
256 |     """
257 |     path = read_file(config.wc_input_file_save_path).strip()
258 |     filename = os.path.split(path)[-1]
259 |     curinput = {'userfile':filename}
260 |     return curinput
261 | 
262 | 
263 | def generate_random_filename(prefix, suffix):
264 |     """
265 |     According to the specified prefix name and suffix name, 
266 |     this function is used to generate a random file name.
267 |     Parameters:
268 |         prefix: TYPE-str
269 |             DESCRIPTION: Specified prefix name.
270 |         suffix: TYPE-str
271 |             DESCRIPTION: Specified suffix name.
272 |     Returns
273 |     -------
274 |     random_filename : TYPE-str
275 |         return the random file name.
276 |     
277 |     """
278 |     randnum = np.random.randint(1, 100000)
279 |     random_filename = prefix + '_' + str(randnum) + suffix
280 |     return random_filename
281 |     
282 | 
283 | 
284 | 
285 | 
286 | def wordcloud_text_by_url_base():
287 |     """
288 |     It is used to respond to requests for chart parameters.
289 | 
290 |     Returns
291 |     -------
292 |     c : TYPE-echarts parameters
293 |         return echarts parameters.
294 | 
295 |     """
296 |     
297 |     # data query    
298 |     data = text_wordfreq_by_url_query()
299 |     data_pair = get_sorted_dict(data, topn=1000)
300 |     # generate wordcloud picture
301 | #    font_path = r'C:\Windows\Fonts\simhei.ttf'
302 | #    back_color = imread(config.bg_pic)
303 | #    wc = wordcloud.WordCloud(font_path=font_path, mask=back_color, width=1200,height=800,min_font_size=10,max_font_size=66,max_words=1000,background_color="white")
304 | #    wc.generate_from_frequencies(data)
305 | #    wc.to_file(config.pic_wc_input_url_path)
306 |     # Declare objects, render pictures
307 |     c = (
308 |         WordCloud()
309 |         .add(series_name="", data_pair=data_pair, word_size_range=[10,66],word_gap=10,
310 |              shape="cicle", width="1200", height="800",)
311 |         .set_global_opts(
312 |                          title_opts=opts.TitleOpts(title="WordCloud Chart\n", pos_left="center",
313 |                                                    title_textstyle_opts=opts.TextStyleOpts(font_size=25)),
314 |                          tooltip_opts=opts.TooltipOpts(is_show=True),
315 |                          )
316 |         )
317 |     # generate wordcloud picture
318 |     rand_filename = generate_random_filename('wordcloud', '.png')
319 |     pic_save_path = os.path.join(config.image_dir, rand_filename)
320 |     save_to_file(config.pic_wc_input_url_save_path, pic_save_path)
321 |     make_snapshot(snapshot, c.render(), pic_save_path, is_remove_html=True) 
322 |     return c
323 |     
324 | 
325 | 
326 | def wordcloud_text_by_input_base():
327 |     """
328 |     It is used to respond to requests for chart parameters.
329 | 
330 |     Returns
331 |     -------
332 |     c : TYPE-echarts parameters
333 |         return echarts parameters.
334 | 
335 |     """
336 |     
337 |     # data query
338 |     data = text_wordfreq_by_input_query()
339 |     data_pair = get_sorted_dict(data, topn=1000)
340 |     # generate wordcloud picture     
341 | #    font_path = r'C:\Windows\Fonts\simhei.ttf'
342 | #    back_color = imread(config.bg_pic)
343 | #    wc = wordcloud.WordCloud(font_path=font_path, mask=back_color, width=1200,height=800,min_font_size=10,max_font_size=66,max_words=1000,background_color="white")
344 | #    wc.generate_from_frequencies(data)
345 | #    wc.to_file(config.pic_wc_input_text_path)
346 |     # Declare objects, render pictures
347 |     c = (
348 |         WordCloud()
349 |         .add(series_name="", data_pair=data_pair, word_size_range=[10,66], word_gap=8,
350 |              shape="cicle", width="1200", height="800",is_draw_out_of_bound=False,)
351 |         .set_global_opts(
352 |                          title_opts=opts.TitleOpts(title="WordCloud Chart\n", pos_left="center",
353 |                                                    title_textstyle_opts=opts.TextStyleOpts(font_size=25)),
354 |                          tooltip_opts=opts.TooltipOpts(is_show=True),
355 |                          )
356 |         )
357 |     # generate wordcloud picture                     
358 |     rand_filename = generate_random_filename('wordcloud', '.png')
359 |     pic_save_path = os.path.join(config.image_dir, rand_filename)
360 |     save_to_file(config.pic_wc_input_text_save_path, pic_save_path)
361 |     make_snapshot(snapshot, c.render(), pic_save_path, is_remove_html=True) 
362 |     return c
363 | 
364 | 
365 | def wordcloud_text_by_import_file_base():
366 |     """
367 |     It is used to respond to requests for chart parameters.
368 | 
369 |     Returns
370 |     -------
371 |     c : TYPE-echarts parameters
372 |         return echarts parameters.
373 | 
374 |     """
375 |     
376 |     # data query
377 |     data = text_wordfreq_by_import_file_query()
378 |     data_pair = get_sorted_dict(data, topn=1000)
379 |     # generate wordcloud picture
380 | #    font_path = r'C:\Windows\Fonts\simhei.ttf'
381 | #    back_color = imread(config.bg_pic)
382 | #    wc = wordcloud.WordCloud(font_path=font_path, mask=back_color, width=1200,height=800,min_font_size=10,max_font_size=66,max_words=1000,background_color="white")
383 | #    wc.generate_from_frequencies(data)
384 | #    wc.to_file(config.pic_wc_input_file_path)
385 |     # Declare objects, render pictures
386 |     c = (
387 |         WordCloud()
388 |         .add(series_name="", data_pair=data_pair, word_size_range=[10,66], word_gap=8,
389 |              shape="cicle", width="1200", height="800",is_draw_out_of_bound=False,)
390 |         .set_global_opts(
391 |                          title_opts=opts.TitleOpts(title="WordCloud Chart\n", pos_left="center",
392 |                                                    title_textstyle_opts=opts.TextStyleOpts(font_size=25)),
393 |                          tooltip_opts=opts.TooltipOpts(is_show=True),
394 |                          )
395 |         )
396 |     # generate wordcloud picture                     
397 |     rand_filename = generate_random_filename('wordcloud', '.png')
398 |     pic_save_path = os.path.join(config.image_dir, rand_filename)
399 |     save_to_file(config.pic_wc_input_file_save_path, pic_save_path)
400 |     make_snapshot(snapshot, c.render(), pic_save_path, is_remove_html=True)                         
401 |     return c
402 | 
403 | 
404 | 
405 | 
406 | 
407 | 
408 | 
409 | 
410 | 
411 | 
412 | 
413 | 
414 | 
415 | 
416 | 
417 | 
418 | 
419 | 
420 | 
421 | 


--------------------------------------------------------------------------------
/src/exe/exe_03.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Fri Nov  6 10:01:58 2020
  4 | 
  5 | @author: Xu
  6 | 
  7 | 3.文本预处理
  8 | 3.1 关键信息提取--单文本分析--关键词提取：
  9 | 基于TextRank的算法的单文本摘要提取与关键词抽取。
 10 | 
 11 | 3.2 多文本分析--主题分布：
 12 | 基于LDA的多文档主题分布探索。
 13 | 
 14 | 3.3 新词挖掘：
 15 | 基于统计信息的新词挖掘：
 16 | 由语料的N-gram片段建立Trie树和逆序Trie树
 17 | 由Trie树计算片段的出现频次、PMI和左邻字熵
 18 | 由逆序Trie树计算片段的右邻字熵
 19 | 计算片段成词的得分
 20 | 
 21 | 
 22 | 
 23 | 负责响应数据查询请求，调用数据逻辑程序。
 24 | 基于数据逻辑查询结果，业务逻辑程序组装出文本关键信息并返回给前端页面。
 25 | 
 26 | 
 27 | """
 28 | import __init__
 29 | import os
 30 | import re
 31 | from src import config
 32 | from newspaper import Article
 33 | from key_info_extraction.keywords_textrank import TextRank
 34 | from key_info_extraction.abstract_textrank import AbstarctTextrank
 35 | from key_info_extraction.topic_cluster_lda import lda_model
 36 | from worddiscovery.entropy_based import EntropyBasedWorddiscovery
 37 | 
 38 | 
 39 | # 3. 文本预处理--- Part 3.1 关键词提取--单文本分析 
 40 | # 数据逻辑：
 41 | def save_to_file(filepath, content):
 42 |     """
 43 |     Write the text to the local file.
 44 | 
 45 |     Parameters
 46 |     ----------
 47 |     filepath : TYPE-str
 48 |         DESCRIPTION: the file save path.
 49 | 
 50 |     Returns
 51 |     -------
 52 |     content : TYPE-str
 53 |         DESCRIPTION: the text.
 54 | 
 55 |     """
 56 |     f = open(filepath, 'w', encoding='utf-8') 
 57 |     f.write(content)
 58 |     f.close()
 59 | 
 60 | 
 61 | def read_file(filepath):
 62 |     """
 63 |     Read the local file and transform to text.
 64 | 
 65 |     Parameters
 66 |     ----------
 67 |     filepath : TYPE-str
 68 |         DESCRIPTION: the text file path.
 69 | 
 70 |     Returns
 71 |     -------
 72 |     content : TYPE-str
 73 |         DESCRIPTION:The preprocessed news text.
 74 | 
 75 |     """
 76 |     f = open(filepath,'r',encoding='utf-8')
 77 |     content = f.read()
 78 |     f.close()
 79 |     return content   
 80 | 
 81 | 
 82 | def get_webcontent(url):
 83 |         """
 84 |         Online mode: According to the URL, grab the text content of the news.
 85 | 
 86 |         Parameters
 87 |         ----------
 88 |         url : TYPE-str
 89 |             DESCRIPTION: news online URL.
 90 | 
 91 |         Returns
 92 |         -------
 93 |         content : TYPE-str
 94 |             DESCRIPTION:The preprocessed news text.
 95 | 
 96 |         """
 97 |         news = Article(url, language='zh')
 98 |         news.download()
 99 |         news.parse()
100 |         content = news.text
101 |         return content
102 | 
103 | 
104 | def get_abstract(text):  
105 |     """
106 |     Use Textrank algorithm to extract text summaries/abstract.
107 |     
108 |     Parameters
109 |     ----------
110 |     text : TYPE-str
111 |         DESCRIPTION: the text content to be extracted.
112 | 
113 |     Returns
114 |     -------
115 |     abstract : TYPE-str
116 |         DESCRIPTION: the abstract extracted from text.
117 |     
118 |     """
119 |     abstracter = AbstarctTextrank()
120 |     keysentences = abstracter.extract_abstract(text, 3)
121 |     abstract = []
122 |     for sent in keysentences:
123 |         abstract.append(sent[0])     
124 |     return abstract
125 | 
126 | def get_keywords(text):
127 |     """
128 |     Use Textrank algorithm to extract text keywords.
129 |     
130 |     Parameters
131 |     ----------
132 |     text : TYPE-str
133 |         DESCRIPTION: the text content to be extracted.
134 | 
135 |     Returns
136 |     -------
137 |     words : TYPE-str
138 |         DESCRIPTION: the keywords extracted from text.
139 |     
140 |     """
141 |     keywords_textanker = TextRank()
142 |     keywords = keywords_textanker.extract_keywords(text, 10)
143 |     words = []
144 |     for word in keywords:
145 |         words.append(word[0])
146 |     return words 
147 | 
148 | 
149 | 
150 | 
151 | def keyinfo_by_url_query():
152 |     """
153 |     According to the user's input to specify the URL, the text of the URL is collected, 
154 |     and an abstract and keywords are automatically generated.
155 | 
156 |     Returns
157 |     -------
158 |         abstract:TYPE-strs
159 |             DESCRIPTION: the abstract extracted from text.
160 |         keywords:TYPE-strs
161 |             DESCRIPTION: the keywords extracted from text.
162 | 
163 |     """
164 |     url = read_file(config.keyinfo_input_url_path)
165 |     content = get_webcontent(url)
166 |     abstract = get_abstract(content)
167 |     keywords = get_keywords(content)
168 |     abstract = '。 '.join(abstract) + '。'
169 |     keywords = ', '.join(keywords)
170 |     wr_to_file = '摘要：\n' + abstract + '\n关键词：\n' + keywords
171 |     save_to_file(config.download_keyinfo_input_url_save_path, wr_to_file)
172 |     return abstract, keywords
173 |     
174 | 
175 | def keyinfo_by_input_text_query():
176 |     """
177 |     According to the text input by the user, an abstract and keywords 
178 |     are automatically generated.
179 | 
180 |     Returns
181 |     -------
182 |         abstract:TYPE-strs
183 |             DESCRIPTION: the abstract extracted from text.
184 |         keywords:TYPE-strs
185 |             DESCRIPTION: the keywords extracted from text.
186 | 
187 |     """
188 |     input_text = read_file(config.keyinfo_input_text_path)
189 |     abstract = get_abstract(input_text)
190 |     keywords = get_keywords(input_text)
191 |     abstract = '。 '.join(abstract) + '。'
192 |     keywords = ', '.join(keywords)
193 |     wr_to_file = '摘要：\n' + abstract + '\n关键词：\n' + keywords
194 |     save_to_file(config.download_keyinfo_input_text_save_path, wr_to_file)
195 |     return abstract, keywords
196 | 
197 | 
198 | def keyinfo_by_import_file_query():
199 |     """
200 |     According to the local file imported by the user, an abstract and keywords 
201 |     are automatically generated.
202 | 
203 |     Returns
204 |     -------
205 |         abstract:TYPE-strs
206 |             DESCRIPTION: the abstract extracted from text.
207 |         keywords:TYPE-strs
208 |             DESCRIPTION: the keywords extracted from text.
209 | 
210 |     """
211 |     path = read_file(config.keyinfo_input_file_save_path).strip()
212 |     content = read_file(path)
213 |     abstract = get_abstract(content)
214 |     keywords = get_keywords(content)
215 |     abstract = '。 '.join(abstract) + '。'
216 |     keywords = ', '.join(keywords)
217 |     wr_to_file = '摘要：\n' + abstract + '\n关键词：\n' + keywords
218 |     save_to_file(config.download_keyinfo_input_file_save_path, wr_to_file)
219 |     return abstract, keywords
220 | 
221 | 
222 | 
223 | 
224 | # 业务逻辑：
225 |    
226 | def rt_keyinfo_url_base():
227 |     """
228 |     It is used to return the requested real-time data.
229 | 
230 |     Returns
231 |     -------
232 |     curinput : TYPE-dictionary
233 |         return the frontend requested real-time data.
234 |     
235 |     """
236 |     url = read_file(config.keyinfo_input_url_path)
237 |     abstract, keywords = keyinfo_by_url_query()
238 |     curinput = {'url': url, 'abstract': abstract, 'keywords': keywords}
239 |     return curinput
240 | 
241 | 
242 | 
243 | 
244 | def rt_keyinfo_input_text_base():
245 |     """
246 |     It is used to return the requested real-time data.
247 | 
248 |     Returns
249 |     -------
250 |     curinput : TYPE-dictionary
251 |         return the frontend requested real-time data.
252 |     
253 |     """
254 |     input_text = read_file(config.keyinfo_input_text_path)
255 |     abstract, keywords = keyinfo_by_input_text_query()
256 |     curinput = {'input_text':input_text, 'abstract': abstract, 'keywords': keywords }
257 |     return curinput
258 | 
259 | def download_rt_keyinfo_import_file_base():
260 |     path = read_file(config.keyinfo_input_file_save_path).strip()
261 |     file_dir, filename = os.path.split(path)
262 |     return file_dir, filename
263 | 
264 | 
265 | def rt_keyinfo_import_file_base():
266 |     """
267 |     It is used to return the requested real-time data.
268 | 
269 |     Returns
270 |     -------
271 |     curinput : TYPE-dictionary
272 |         return the frontend requested real-time data.
273 |     
274 |     """
275 |     path = read_file(config.keyinfo_input_file_save_path).strip()
276 |     filename = os.path.split(path)[-1]
277 |     abstract, keywords = keyinfo_by_import_file_query()
278 |     curinput = {'filename':filename, 'abstract': abstract, 'keywords': keywords}
279 |     return curinput
280 | 
281 | 
282 | 
283 | # 3. 文本预处理--- Part 3.2 主题分析——多文本分析 
284 | # 数据逻辑：
285 | def lda_topics_query():
286 |     """
287 |     It is used to get the optimal number of topics  and save the topic keywords  and 
288 |     topic distribution of documents to a file based on the file imported by the user.
289 |     
290 |     Returns
291 |     -------
292 |     num_topics: type-integer
293 |         return the number of topics.
294 |     
295 |     
296 |     """
297 |     # data prepare
298 |     filepath = read_file(config.topic_input_file_save_path).strip()
299 |     f = open(filepath, 'r', encoding='utf-8')
300 |     content = f.readlines()
301 |     f.close()
302 |     data = [text for text in content if len(re.sub(r'\s','',text))>5]
303 |     # get optimal number of topics 
304 |     # write topic keywords to file 
305 |     num_topics, output_topic_keywords,  output_topic_dist = lda_model(data, config.download_topic_input_file_save_path)                     
306 |     return num_topics, output_topic_keywords,  output_topic_dist
307 | 
308 | 
309 | 
310 | # 业务逻辑：
311 | def rt_topic_import_file_base():
312 |     """
313 |     It is used to return the requested real-time data.
314 | 
315 |     Returns
316 |     -------
317 |     curinput : TYPE-dictionary
318 |         return the frontend requested real-time data.
319 |     
320 |     """
321 |     path = read_file(config.topic_input_file_save_path).strip()
322 |     filename = os.path.split(path)[-1]
323 |     num_topics, topic_keywords,  topic_dist = lda_topics_query()
324 |     curinput = {'filename':filename, 'num_topics':num_topics, 'topic_keywords': topic_keywords, 'topic_dist': topic_dist}
325 |     return curinput
326 | 
327 | 
328 | 
329 | # 3. 文本关键信息提取--- Part 3.3 新词挖掘
330 | # 数据逻辑：
331 | def new_word_discovery_query():
332 |     """
333 |     It is used to get the user input: source text, maximum number of new words and maximum of new word length.
334 |     
335 |     Returns
336 |     -------
337 |     new_words: type-list
338 |         return the list of new words.
339 |     
340 |     
341 |     """
342 |     # data prepare
343 |     content = read_file(config.new_word_discovery_file_save_path).split('\n, ')
344 |     text, word_count, max_word_len = content[0].strip(),int(content[1].strip()),int(content[2].strip())
345 |     # word discovery
346 |     discovery = EntropyBasedWorddiscovery(word_max_len=max_word_len)   
347 |     discovery.parse(text, debug=True)
348 |     new_words = discovery.get_new_words(word_count)
349 |     new_words = [str(i+1) + ' ' + new_words[i] for i in range(len(new_words))]
350 |     new_words = '\n'.join(new_words) 
351 |     save_to_file(config.download_new_word_output_file_save_path, new_words)                
352 |     return text, word_count, max_word_len, new_words
353 | 
354 | 
355 | # 业务逻辑：
356 | def rt_new_word_discovery_base():
357 |     """
358 |     It is used to return the requested real-time data.
359 | 
360 |     Returns
361 |     -------
362 |     curinput : TYPE-dictionary
363 |         return the frontend requested real-time data.
364 |     
365 |     """
366 |     text, word_count, max_word_len, new_words= new_word_discovery_query()
367 |     curinput = {'inputtext':text, 'wordcount':word_count, 'wordlength': max_word_len, 'findwords': new_words}
368 |     return curinput
369 | 
370 | def download_rt_new_word_discovery_base():
371 |     file_dir, filename = os.path.split(config.download_new_word_output_file_save_path)
372 |     return file_dir, filename
373 | 
374 | 
375 | 
376 | 
377 | 
378 | 
379 | 
380 | 
381 | 
382 | 
383 | 
384 | 
385 | 
386 | 
387 | 
388 | 
389 | 
390 | 


--------------------------------------------------------------------------------
/src/exe/exe_05.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Thu Nov 12 12:25:50 2020
  4 | 
  5 | @author: Xu
  6 | 
  7 | 
  8 | 5.用户评价情感分析
  9 | 
 10 | 根据
 11 | 
 12 | 
 13 | 负责响应数据查询请求，调用数据逻辑程序。
 14 | 基于数据逻辑查询结果，业务逻辑程序组装出用户评价文本的分析数据并返回给前端页面。
 15 | 
 16 | """
 17 | import __init__
 18 | import os
 19 | import pandas as pd
 20 | from src import config
 21 | from sentiment_analysis.review_sentiment_analysis import review_summary
 22 | 
 23 | 
 24 | # 5.用户评价情感分析
 25 | # 数据逻辑：
 26 | def save_to_file(filepath, content):
 27 |     """
 28 |     Write the text to the local file.
 29 | 
 30 |     Parameters
 31 |     ----------
 32 |     filepath : TYPE-str
 33 |         DESCRIPTION: the file save path.
 34 | 
 35 |     Returns
 36 |     -------
 37 |     content : TYPE-str
 38 |         DESCRIPTION: the text.
 39 | 
 40 |     """
 41 |     f = open(filepath, 'w', encoding='utf-8') 
 42 |     f.write(content)
 43 |     f.close()
 44 | 
 45 | def read_file(filepath):
 46 |     """
 47 |     Read the local file and transform to text.
 48 | 
 49 |     Parameters
 50 |     ----------
 51 |     filepath : TYPE-str
 52 |         DESCRIPTION: the text file path.
 53 | 
 54 |     Returns
 55 |     -------
 56 |     content : TYPE-str
 57 |         DESCRIPTION:The preprocessed news text.
 58 | 
 59 |     """
 60 |     f = open(filepath,'r',encoding='utf-8')
 61 |     content = f.read()
 62 |     f.close()
 63 |     return content   
 64 |     
 65 | def rt_index_query():
 66 |     """
 67 |     It is used to return the requested real-time data.
 68 | 
 69 |     Returns
 70 |     -------
 71 |     curinput : TYPE-dictionary
 72 |         return the frontend requested real-time data.
 73 |     
 74 |     """
 75 |     # load data
 76 |     data = pd.read_csv(config.business_data_path)
 77 |     data2 = pd.read_csv(config.review_data_path)
 78 |     # select data
 79 |     product_sum = data.shape[0]
 80 |     user_sum = len(data2['user_name'].unique())
 81 |     review_sum = data2.shape[0]
 82 |     return product_sum, user_sum, review_sum
 83 | 
 84 | 
 85 | def review_summary_query():
 86 |     """
 87 |     According to the query information entered by the user, 
 88 |     the product review summary is returned.
 89 | 
 90 |     Returns
 91 |     -------
 92 |     curinput : TYPE-dictionary
 93 |         return the frontend requested real-time data.
 94 |     
 95 |     """
 96 |     data = pd.read_csv(config.review_data_path)
 97 |     query_word = read_file(config.user_input_id_name_path).strip()
 98 |     if query_word.isnumeric()==False:
 99 |         product_id = data.loc[(data['product_name']==query_word), 'product_id'].values[0]
100 |         product_name, product_basic, review_result = review_summary(product_id, config.review_summary_save_path)
101 |     else:
102 |         product_name, product_basic, review_result = review_summary(int(query_word), config.review_summary_save_path)
103 |     return  product_name, product_basic, review_result
104 |         
105 |         
106 | 
107 | # 业务逻辑：
108 | 
109 | def rt_index_base():
110 |     """
111 |     It is used to get the data of realtime index.
112 | 
113 |     Returns
114 |     -------
115 |     cur : TYPE-dict
116 |         realt-time index.
117 | 
118 |     """
119 |     product_sum, user_sum, review_sum = rt_index_query()
120 |     cur = {"product_sum": product_sum, "user_sum": user_sum, "review_sum": review_sum}
121 |     return cur
122 | 
123 | 
124 | def review_summary_base():
125 |     product_name, product_basic, review_result = review_summary_query()
126 |     cur = {'product_name': product_name, 'product_basic': product_basic, 'review_result': review_result}
127 |     return cur
128 |     
129 | 
130 | def download_review_summary_base():
131 |     path = config.review_summary_save_path
132 |     file_dir, filename = os.path.split(path)
133 |     return file_dir, filename
134 | 
135 | 
136 | 
137 | 
138 | 
139 | 
140 | 
141 | 
142 | 
143 | 


--------------------------------------------------------------------------------
/src/exe/exe_06.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Fri Oct 30 09:20:06 2020
  4 | 
  5 | @author: Xu
  6 | 
  7 | 6.用户画像
  8 | 
  9 | 负责响应数据查询请求，调用数据逻辑程序。
 10 | 基于数据逻辑查询结果，业务逻辑程序组装出用户画像数据并返回给前端页面。
 11 | 
 12 | """
 13 | 
 14 | 
 15 | import __init__
 16 | import re
 17 | import os
 18 | import pandas as pd
 19 | import numpy as np
 20 | import collections 
 21 | from src import config
 22 | from pyecharts import options as opts
 23 | from pyecharts.charts import Bar, WordCloud, Page, Tab
 24 | 
 25 | # 6. 用户画像分析
 26 | # 数据逻辑： 
 27 | def user_industry_query():
 28 |     """
 29 |     It is used for data query of user industry. 
 30 | 
 31 |     Returns
 32 |     -------
 33 |     dataX : TYPE-list
 34 |         DESCRIPTION:all user industry choices.
 35 |     dataY : TYPE-list
 36 |         DESCRIPTION:number of users in different industries.
 37 | 
 38 |     """
 39 |     # load data
 40 |     data = pd.read_csv(config.user_data_path)
 41 |     # select data
 42 |     dataX = list(collections.Counter(data['user_industry']).keys())
 43 |     dataY = list(collections.Counter(data['user_industry']).values())
 44 |     remove_nan_indx = dataX.index(np.nan)
 45 |     dataX.remove(np.nan)
 46 |     dataY.pop(remove_nan_indx)
 47 |     return dataX,dataY
 48 | 
 49 | 
 50 | def user_product_use_time_query():
 51 |     """
 52 |     It is used for data query of how long users use the product. 
 53 | 
 54 |     Returns
 55 |     -------
 56 |     dataX : TYPE-list
 57 |         DESCRIPTION:all product use time choices.
 58 |     dataY : TYPE-list
 59 |         DESCRIPTION:number of users in different usage time.
 60 | 
 61 |     """
 62 |     # load data
 63 |     data = pd.read_csv(config.user_data_path)
 64 |     # select data
 65 |     dataX = list(collections.Counter(data['used_years']).keys())
 66 |     dataY = list(collections.Counter(data['used_years']).values())
 67 |     remove_nan_indx = dataX.index(np.nan)
 68 |     dataX.remove(np.nan)
 69 |     dataY.pop(remove_nan_indx)
 70 |     return dataX,dataY
 71 | 
 72 | 
 73 | def user_company_size_query():
 74 |     """
 75 |     It is used for data query of user company size. 
 76 | 
 77 |     Returns
 78 |     -------
 79 |     dataX : TYPE-list
 80 |         DESCRIPTION:all company size.
 81 |     dataY : TYPE-list
 82 |         DESCRIPTION:number of users in different companies.
 83 | 
 84 |     """
 85 |     # load data
 86 |     data = pd.read_csv(config.user_data_path)
 87 |     # select data
 88 |     dataX = list(collections.Counter(data['user_company_size']).keys())
 89 |     dataY = list(collections.Counter(data['user_company_size']).values())
 90 |     remove_nan_indx = dataX.index(np.nan)
 91 |     dataX.remove(np.nan)
 92 |     dataY.pop(remove_nan_indx)
 93 |     return dataX,dataY
 94 | 
 95 | 
 96 | 
 97 | def user_job_query():
 98 |     """
 99 |     It is used for data query of user job title. 
100 | 
101 |     Returns
102 |     -------
103 |     dataX : TYPE-list
104 |         DESCRIPTION:all jobs.
105 |     dataY : TYPE-list
106 |         DESCRIPTION:number of users in different jobs.
107 | 
108 |     """
109 |     # load data
110 |     data = pd.read_csv(config.user_data_path)
111 |     # select data
112 |     dataX = list(collections.Counter(data['user_job_title']).keys())
113 |     dataY = list(collections.Counter(data['user_job_title']).values())
114 |     remove_nan_indx = dataX.index(np.nan)
115 |     dataX.remove(np.nan)
116 |     dataY.pop(remove_nan_indx)
117 |     return dataX,dataY
118 | 
119 | def user_job_wordcloud_freq_query():
120 |     """
121 |     It is used to count the frequency of words for user job title.
122 | 
123 |     Returns
124 |     -------
125 |     data_pair : TYPE-list(tuple)
126 |         DESCRIPTION: Count the frequency of words,like [(word1, count1), (word2, count2),...].
127 | 
128 |     """
129 |     
130 |     # load data
131 |     dataX, dataY = user_job_query()
132 |     # select data
133 |     token = [ re.split(r'\W+',job) for job in dataX]
134 |     vocab = {}
135 |     for i in range(len(token)):
136 |         for word in token[i]:
137 |             if word not in vocab:
138 |                 vocab[word] = dataY[i]
139 |             else:
140 |                 vocab[word] += dataY[i]
141 |     data_pair = list(vocab.items())
142 |     return data_pair
143 |         
144 |     
145 | 
146 | # 业务逻辑：
147 | def hist_user_industry_base():
148 |     """
149 |     It is used to respond to requests for chart parameters.
150 | 
151 |     Returns
152 |     -------
153 |     c : TYPE-echarts parameters
154 |         return echarts parameters.
155 | 
156 |     """
157 |     
158 |     # data query
159 |     dataX, dataY = user_industry_query()
160 |     # Declare objects, render pictures
161 |     c = (
162 |         Bar()
163 |         .add_xaxis(dataX)
164 |         .add_yaxis("Industry", dataY,color='#b6c2ff')
165 |         .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"),
166 |                          yaxis_opts=opts.AxisOpts(type_="value", name="number of users"),
167 |                          title_opts=opts.TitleOpts(title="User Industry Distribution",pos_left="center"), 
168 |                          legend_opts=opts.LegendOpts(pos_left="80%",),                                              
169 |                          datazoom_opts=[opts.DataZoomOpts()],
170 |                          )
171 |         
172 |         )
173 |     return c
174 |     
175 | 
176 | def hist_user_company_size_base():
177 |     """
178 |     It is used to respond to requests for chart parameters.
179 | 
180 |     Returns
181 |     -------
182 |     c : TYPE-echarts parameters
183 |         return echarts parameters.
184 | 
185 |     """
186 |     
187 |     # data query
188 |     dataX, dataY = user_company_size_query()
189 |     # Declare objects, render pictures
190 |     c = (
191 |         Bar()
192 |         .add_xaxis(dataX)
193 |         .add_yaxis("Company Size", dataY, color='#ed7c2f')
194 |         .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"),
195 |                          yaxis_opts=opts.AxisOpts(type_="value", name="number of users"),
196 |                          title_opts=opts.TitleOpts(title="User Company Size Distribution", pos_left="center"), 
197 |                          legend_opts=opts.LegendOpts(pos_left="70%",pos_top="5%"),                                                                          
198 |                          )
199 |         
200 |         )
201 |     return c
202 |  
203 |    
204 | def hist_user_product_use_time_base():
205 |     """
206 |     It is used to respond to requests for chart parameters.
207 | 
208 |     Returns
209 |     -------
210 |     c : TYPE-echarts parameters
211 |         return echarts parameters.
212 | 
213 |     """
214 |     
215 |     # data query
216 |     dataX, dataY = user_product_use_time_query()
217 |     # Declare objects, render pictures
218 |     c = (
219 |         Bar()
220 |         .add_xaxis(dataX)
221 |         .add_yaxis("Product Usage Time", dataY, color='#28bf7e')
222 |         .set_global_opts(xaxis_opts=opts.AxisOpts(type_="category"),
223 |                          yaxis_opts=opts.AxisOpts(type_="value", name="number of users"),
224 |                          title_opts=opts.TitleOpts(title="Distribution of cumulative product usage time by users",pos_left="center"),                                                                      
225 |                          legend_opts=opts.LegendOpts(pos_left="70%",pos_top="5%"),
226 |                          )
227 |         
228 |         )
229 |     return c
230 | 
231 | def wordcloud_user_job_base():
232 |     """
233 |     It is used to respond to requests for chart parameters.
234 | 
235 |     Returns
236 |     -------
237 |     c : TYPE-echarts parameters
238 |         return echarts parameters.
239 | 
240 |     """
241 |     
242 |     # data query
243 |     data_pair = user_job_wordcloud_freq_query()
244 |     # Declare objects, render pictures
245 |     c = (
246 |         WordCloud()
247 |         .add(series_name="Occupation", data_pair=data_pair, word_size_range=[10,100],
248 |              shape="cicle", width="1200", height="800",)
249 |         .set_global_opts(
250 |                          title_opts=opts.TitleOpts(title="User Occupation Analysis", pos_left="center",
251 |                                                    title_textstyle_opts=opts.TextStyleOpts(font_size=23)),
252 |                          tooltip_opts=opts.TooltipOpts(is_show=True),
253 |                          )
254 |         )
255 |     return c
256 | 
257 | 
258 | def pic_wordcloud_user_jov_base():
259 |     """
260 |     It is used to respond to requests for chart parameters.
261 | 
262 |     Returns
263 |     -------
264 |     image : TYPE-IMAGE Component parameters
265 |         DESCRIPTION:IMAGE html parameters.
266 | 
267 |     """
268 |     
269 |     # get picture
270 |     from wordcloud import WordCloud
271 |     
272 |     data_pair = user_job_wordcloud_freq_query()
273 |     wc = WordCloud(width=1200,height=800,min_font_size=10,max_font_size=100,font_step=2,max_words=10000,background_color="white")
274 |     wc.generate_from_frequencies(dict(data_pair))
275 |     wc.to_file(os.path.join(config.image_dir,"wordcloud_user_job.png"))
276 |     # render picture
277 |     from pyecharts.components import Image
278 |     from pyecharts.options import ComponentTitleOpts
279 |     
280 |     image = Image()
281 |     img_src = (os.path.join(config.image_dir,"wordcloud_user_job.png"))
282 |     image.add(src=img_src,
283 |               style_opts={"width": "1200px", "height": "800px", "style": "margin-top: 20px"},
284 |               )
285 |     image.set_global_opts(title_opts=ComponentTitleOpts(title="User Occupation Analysis"))
286 |     return image
287 | 
288 | 
289 | 
290 | def page_user_analysis_base():
291 |     """
292 |     It is used to respond to requests for chart parameters.
293 | 
294 |     Returns
295 |     -------
296 |     page : TYPE-echarts parameters
297 |         return echarts parameters.
298 | 
299 |     """
300 |     page = Page(interval=10, layout=Page.SimplePageLayout)
301 |     page.add(
302 |         hist_user_industry_base(),
303 |         hist_user_company_size_base(),
304 |         hist_user_product_use_time_base(),
305 |         wordcloud_user_job_base(),
306 |         ) 
307 |     return page
308 | 
309 | 
310 | def tab_user_analysis_base():
311 |     """
312 |     It is used to respond to requests for chart parameters.
313 | 
314 |     Returns
315 |     -------
316 |     tab : TYPE-echarts parameters
317 |         return echarts parameters.
318 | 
319 |     """
320 |     tab = Tab()
321 |     tab.add(hist_user_industry_base(), "User Industry")
322 |     tab.add(hist_user_company_size_base(), "User Company")
323 |     tab.add(hist_user_product_use_time_base(), "Product Usage Time")
324 |     tab.add(wordcloud_user_job_base(), "User Occupation")
325 |     return tab
326 |     
327 | 
328 | 


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Tue Jan 26 21:50:26 2021
 4 | 
 5 | @author: Xu
 6 | """
 7 | import sys
 8 | import os
 9 | curPath = os.path.abspath(os.path.dirname(__file__))
10 | rootPath = os.path.split(curPath)[0]
11 | sys.path.append(os.path.split(rootPath)[0])


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/abstract_textrank.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/abstract_textrank.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/abstract_textrank.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/abstract_textrank.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/compute_keywords_tfidf.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/compute_keywords_tfidf.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/create_wordcloud.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/create_wordcloud.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/keywords_textrank.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/keywords_textrank.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/keywords_textrank.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/keywords_textrank.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/sentence_similarity.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/sentence_similarity.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/sentence_similarity.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/sentence_similarity.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/textrank.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/textrank.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/textrank.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/textrank.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/__pycache__/topic_cluster_lda.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/key_info_extraction/__pycache__/topic_cluster_lda.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/abstract_textrank.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon May 11 11:08:56 2020
 4 | 
 5 | @author: Xu
 6 | """
 7 | import __init__
 8 | from collections import defaultdict
 9 | from jieba import posseg as pseg
10 | from key_info_extraction.textrank import textrank_graph
11 | from key_info_extraction.sentence_similarity import SimilarityCompute
12 | import re
13 | from key_info_extraction.create_wordcloud import CreateWordCloud
14 | 
15 | class AbstarctTextrank:
16 |     def __init__(self):
17 |         self.span = 3
18 |         self.similer = SimilarityCompute()
19 |         self.sim_score = 0.5 #句子相似度阈值，用于构建句子之间的边
20 | 
21 |     def sentence_split(self, text):
22 |         sentence_dict = {}
23 |         sentences = [sentence.strip() for sentence in re.split(r'[？！。;；\n\r]', text) if sentence]
24 |         for index, sentence in enumerate(sentences):
25 |             sentence_dict[index] = [sentence, [word.word for word in pseg.cut(sentence) if word.flag[0] not in ['x', 'u', 'p', 'w']]]
26 |         return sentence_dict
27 | 
28 |     def extract_abstract(self, text, num_sentences):
29 |         sentence_dict = self.sentence_split(text)
30 |         g = textrank_graph()
31 |         cm = defaultdict(int)
32 |         for i, s1 in sentence_dict.items():
33 |             for j, s2 in sentence_dict.items():
34 |                 sim_score = self.similer.similarity_cosine(s1[1], s2[1])
35 |                 if sim_score >= 0.5:
36 |                     cm[(s1[0], s2[0])] += 1
37 |         for terms, w in cm.items():
38 |             g.addEdge(terms[0], terms[1], w)
39 |         nodes_rank = g.rank()
40 |         nodes_rank = sorted(nodes_rank.items(), key=lambda asd: asd[1], reverse=True)
41 |         return nodes_rank[:num_sentences]
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/compute_keywords_tfidf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Mon May  4 10:54:09 2020
  4 | 
  5 | @author: Xu
  6 | 
  7 | 关键词抽取之TFIDF：
  8 | 这个部分的任务是对输入文本构建词表，通过计算TF-IDF进行关键词抽取。
  9 | 
 10 | 计算TF-IDF值：
 11 | 一般有3种方法来实现：
 12 | 1）用gensim库计算TFIDF值, 
 13 | from gensim import models 
 14 | tfidf = models.TfidfModel(corpus)
 15 | 2）用sklearn库计算TFIDF值, 
 16 | from sklearn.feature_extraction.text import TfidfVectorizer
 17 | tfidf_vec = TfidefVectorizer()
 18 | tfidf_matrix = tfidf_vec.fit_transform(corpus)
 19 | 3）用python手动实现TFIDF值,
 20 | i.对语料进行分词
 21 | ii.统计词频
 22 | iii.定义计算tfidf函数
 23 | iv.计算每个单词的tfidf值
 24 | 
 25 | 实现：
 26 | 1.传入词性限制集合：调用词性标注接口，对输入句子进行词性标注，得到分词及对应的词性；
 27 | 2.遍历分词结果：如果该词的词性不再词性限制集合中，则跳过；
 28 | 如果词的长度小于2，或者词为停用词，则跳过；
 29 | 将满足条件的词添加到词频词典中，出现的次数加1；
 30 | 3.遍历词频词典，根据idf词典得到每个词的idf值，并除以词频词典中的次数总和，得到每个词的tf*idf值；
 31 | 4.根据tf-idf值对词频词典中的词进行降序排序，输出topK个词作为关键词。
 32 | 
 33 | 
 34 | """
 35 | import __init__
 36 | from src import config
 37 | from jieba import posseg
 38 | 
 39 | 
 40 | class TFIDF:
 41 |     def __init__(self):
 42 |         self.idf_file = config.idf_path
 43 |         self.idf_dict, self.common_idf = self.load_idf()
 44 |         
 45 |     def load_idf(self):
 46 |         idf_dict = {}
 47 |         for line in open(self.idf_file, 'r', encoding='utf-8').readlines():
 48 |             try:
 49 |                 word, freq = line.strip().split(' ')
 50 |             except:
 51 |                 word, freq = line.strip().split('\t')
 52 |             idf_dict[word] = float(freq)
 53 |         common_idf = sum(idf_dict.values())/len(idf_dict)
 54 | 
 55 |         return idf_dict, common_idf
 56 |     
 57 | 
 58 |     def build_wordsdict(self, text):
 59 |         word_dict = {}
 60 |         candi_words = []
 61 |         candi_dict = {}
 62 |         for word in posseg.cut(text):
 63 |             if word.flag[0] in ['n', 'v', 'a'] and len(word.word) > 1:
 64 |                 candi_words.append(word.word)
 65 |             if word.word not in word_dict:
 66 |                 word_dict[word.word] = 1
 67 |             else:
 68 |                 word_dict[word.word] += 1
 69 |         count_total = sum(word_dict.values())
 70 |         for word, word_count in word_dict.items():
 71 |             if word in candi_words:
 72 |                 candi_dict[word] = word_count/count_total
 73 |             else:
 74 |                 continue
 75 | 
 76 |         return candi_dict
 77 | 
 78 |     def extract_keywords(self, text, num_keywords):
 79 |         keywords_dict = {}
 80 |         candi_dict = self.build_wordsdict(text)
 81 |         for word, word_tf in candi_dict.items():
 82 |             word_idf = self.idf_dict.get(word, self.common_idf)
 83 |             word_tfidf = word_idf * word_tf
 84 |             keywords_dict[word] = word_tfidf
 85 |         keywords_dict = sorted(keywords_dict.items(), key=lambda asd:asd[1], reverse=True)
 86 | 
 87 |         return keywords_dict[:num_keywords]
 88 | 
 89 |     
 90 | 
 91 | 
 92 | 
 93 | 
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | 
108 | 
109 | 
110 | 
111 | 
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
118 | 
119 | 
120 | 
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 
128 | 
129 | 
130 | 
131 | 
132 | 


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/create_wordcloud.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Mon May  4 10:24:59 2020
  4 | 
  5 | @author: Xu
  6 | 
  7 | 项目介绍：自动词云生成
  8 | 对给定的文本进行关键词和高频词统计并根据指定图片形状来生成词云图。
  9 | 
 10 | 
 11 | 项目功能：
 12 | 三种类型的高频词和关键词可视化：
 13 | 1. 根据用户输入指定网址，通过采集该网址文本进行处理。
 14 | 2. 根据用户输入文本字符串进行处理。
 15 | 3. 根据用户输入载入本地文本进行处理，用户将所需要处理文本文件放入text文本夹中，指定文件名进行处理。
 16 | 
 17 | 
 18 | 项目实现：
 19 | 主要通过调用wordcloud这个可视化组件来完成任务。
 20 | 
 21 | 输入用户给定参数:
 22 | 1) textfile: 放于text文件夹中, 为用户需要分析的文本
 23 | 2) picturefile: 放于background文件夹中, 为用户给定的图片源文件
 24 | 3) url: 用户需要进行分析网页文本的url
 25 | 4) content: 用户需要分析的文本字符串
 26 | 5) save_name: 用户对当前分析目标的命名
 27 | 6) word_num: 用户希望展示的词数
 28 | 
 29 | 输出: 在output文件夹下会生成以save_name开头的高频词云图和关键词云图
 30 | 
 31 | """
 32 | 
 33 | import __init__
 34 | import os
 35 | import numpy as np
 36 | from PIL import Image
 37 | import matplotlib.pyplot as plt
 38 | from key_info_extraction.compute_keywords_tfidf import TFIDF
 39 | from collections import Counter
 40 | from jieba import posseg 
 41 | import urllib.request
 42 | from wordcloud import WordCloud, ImageColorGenerator
 43 | from newspaper import Article
 44 | 
 45 | 
 46 | class CreateWordCloud:
 47 |     def __init__(self):
 48 |         cur = 'D:\\NLP\\My projects\\Capterra信息提取\\Key Info Extraction'
 49 |         self.textdir = os.path.join(cur, 'text')
 50 |         self.background = os.path.join(cur, 'background')
 51 |         self.fontpath = os.path.join(cur, 'data\\simhei.ttf')
 52 |         self.outpath = os.path.join(cur, 'output')
 53 |         self.pos_filters = ['n', 'v', 'a']
 54 |         self.limit_words = 100
 55 |         self.Keyworder =  TFIDF()
 56 |         return
 57 | 
 58 |     '''获取搜索页'''
 59 |     def get_html(self, url):
 60 |         headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17"}
 61 |         req = urllib.request.Request(url, headers=headers)
 62 |         html = urllib.request.urlopen(req).read().decode('utf-8')
 63 |         return html
 64 | 
 65 |     '''读取本地文件进行处理'''
 66 |     def read_local_file(self, textfile):
 67 |         textpath = os.path.join(self.textdir, textfile)
 68 |         content = open(textpath,'r',encoding='utf-8').read()
 69 |         return content
 70 | 
 71 |     '''统计词频'''
 72 |     def extract_words(self, content):
 73 |         words = []
 74 |         for line in content.split('\n'):
 75 |             line = line.strip()
 76 |             if not line:
 77 |                 continue
 78 |             words += [w.word for w in posseg.cut(line) if w.flag[0] in self.pos_filters and len(w.word) > 1]
 79 |         word_dict = {i[0]: i[1] for i in Counter(words).most_common()}
 80 |         return word_dict
 81 | 
 82 |     '''抽取关键词'''
 83 |     def extract_keywords(self, content, words_num=20):
 84 |         keywords_dict = {}
 85 |         keywords = self.Keyworder.extract_keywords(content, words_num)
 86 |         for key in keywords:
 87 |             word = key[0]
 88 |             value = int(key[1]*1000)
 89 |             keywords_dict[word] = value
 90 |         return keywords_dict
 91 | 
 92 |     '''创建关键词云图'''
 93 |     def show_cloud(self, word_dict, max_words, picturefile, save_name):
 94 |         self.backimage = os.path.join(self.background, picturefile)
 95 |         saveimage = os.path.join(self.outpath, save_name + '.jpg')
 96 |         backgroud_Image = np.array(Image.open(self.backimage))
 97 |         plt.figure(figsize=(15,10))
 98 |         cloud = WordCloud(font_path=self.fontpath,
 99 |                           background_color='white',
100 | #                          width=800,
101 | #                          height=600,
102 |                           max_words= max_words,
103 |                           max_font_size=500,
104 |                           mask=backgroud_Image,
105 |                           random_state=50
106 |                           )
107 | 
108 |         word_cloud = cloud.generate_from_frequencies(word_dict)
109 | #        img_colors = ImageColorGenerator(backgroud_Image)
110 | #        word_cloud.recolor(color_func=img_colors)
111 |         plt.imshow(word_cloud)
112 |         plt.axis('off')
113 |         plt.savefig(saveimage)
114 |         # plt.show()
115 |         # plt.close()
116 | 
117 | 
118 | 
119 |     '''展示关键词云图'''
120 |     def show_keywords(self, content, picturefile, words_num=20, save_name = 'test'):
121 |         keywords_text = self.extract_keywords(content, words_num)
122 |         self.show_cloud(keywords_text, words_num, picturefile, save_name)
123 |         return
124 | 
125 |     '''展示高频词云图'''
126 |     def show_topwords(self, content, picturefile, words_num=50, save_name = 'test'):
127 |         topwords_text = self.extract_words(content)
128 |         self.show_cloud(topwords_text, words_num, picturefile, save_name)
129 |         return
130 | 
131 |     '''在线模式抓取新闻进行既定形状可视化'''
132 |     def get_webcontent(self, url):
133 |         news = Article(url, language='zh')
134 |         news.download()
135 |         news.parse()
136 |         content = news.text
137 |         return content
138 | 
139 |     
140 |     '''根据用户输入url进行处理'''
141 |     def show_wordcloud_online(self, url, picturefile, words_num, save_name):
142 |         content = self.get_webcontent(url)
143 |         self.show_main(content, picturefile, words_num, save_name)
144 |         return
145 | 
146 |     '''根据用户输入文本进行处理'''
147 |     def show_wordcloud_input(self, content, picturefile, words_num, save_name):
148 |         self.show_main(content, picturefile, words_num, save_name)
149 |         return
150 | 
151 |     '''根据用户输入载入本地文本进行处理'''
152 |     def show_wordcloud_offline(self, textfile, picturefile, words_num, save_name):
153 |         content = self.read_local_file(textfile)
154 |         self.show_main(content, picturefile, words_num, save_name)
155 |         return
156 | 
157 |     '''分别执行绘制关键词和高频词'''
158 |     def show_main(self, content, picturefile, words_num, save_name):
159 |         name = save_name + '-topwords'
160 |         print('正在生成该文本的高频词云图.....')
161 |         self.show_topwords(content, picturefile, words_num, name)
162 |         print('已完成该文本的高频词云图.....')
163 |         print('正在生成该文本的关键词云图.....')
164 |         name = save_name + '-keywords'
165 |         self.show_keywords(content, picturefile, words_num, name)
166 |         print('已完成该文本的关键词云图.....')
167 | 
168 | def test():
169 |     print('*'*10 +'根据输入文本进行处理：'+ '*'*10)
170 |     with open('text\\test_article.txt', 'r', encoding='utf-8') as f:
171 |         content = f.readlines()[0]
172 |         print(content[:100]+'...')
173 |         picturefile = 'china.jpg'
174 |         save_name = 'test'
175 |         words_num = 50
176 |         handler = CreateWordCloud()
177 |         handler.show_wordcloud_input(content, picturefile, words_num, save_name)
178 |     f.close()
179 |     
180 |     print('*'*10 +'根据输入url进行处理：'+ '*'*10)
181 |     with open('text\\test_url.txt', 'r', encoding='utf-8') as f1:
182 |         url = f1.readlines()[-1].strip()
183 |         print(url)
184 |         picturefile = 'oval.png'
185 |         save_name = 'test1'
186 |         words_num = 50
187 |         handler = CreateWordCloud()
188 |         handler.show_wordcloud_online(url, picturefile, words_num, save_name)
189 |     f1.close()
190 |     
191 |     print('*'*10 +'根据输入载入本地文本进行处理：'+ '*'*10)
192 |     textfile = 'test_article.txt'
193 |     picturefile = 'profile.png'
194 |     save_name = 'test2'
195 |     words_num = 50
196 |     handler = CreateWordCloud()
197 |     print('load text file from {}'.format(os.path.join(handler.textdir, textfile)))
198 |     handler.show_wordcloud_offline(textfile, picturefile, words_num, save_name)
199 |     
200 |     
201 |     
202 | 
203 | if __name__ == '__main__':
204 |     test()
205 | 
206 | 


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/keywords_textrank.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon May 11 11:10:17 2020
 4 | 
 5 | @author: Xu
 6 | """
 7 | import __init__
 8 | from collections import defaultdict
 9 | from jieba import posseg as pseg
10 | from key_info_extraction.textrank import textrank_graph
11 | 
12 | class TextRank:
13 |     def __init__(self):
14 |         self.candi_pos = ['n', 'v', 'a']
15 |         self.span = 5
16 | 
17 |     def extract_keywords(self, text, num_keywords):
18 |         g = textrank_graph()
19 |         cm = defaultdict(int)
20 |         word_list = [[word.word, word.flag] for word in pseg.cut(text)]
21 |         for i, word in enumerate(word_list):
22 |             if word[1][0] in self.candi_pos and len(word[0]) > 1:
23 |                 for j in range(i + 1, i + self.span):
24 |                     if j >= len(word_list):
25 |                         break
26 |                     if word_list[j][1][0] not in self.candi_pos or len(word_list[j][0]) < 2:
27 |                         continue
28 |                     pair = tuple((word[0], word_list[j][0]))
29 |                     cm[(pair)] +=  1
30 | 
31 |         for terms, w in cm.items():
32 |             g.addEdge(terms[0], terms[1], w)
33 |         nodes_rank = g.rank()
34 |         nodes_rank = sorted(nodes_rank.items(), key=lambda asd:asd[1], reverse=True)
35 | 
36 |         return nodes_rank[:num_keywords]
37 | 
38 | 


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/sentence_similarity.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon May 11 11:06:16 2020
 4 | 
 5 | @author: Xu
 6 | """
 7 | import __init__
 8 | from gensim import models
 9 | from src import config
10 | import numpy as np
11 | 
12 | # import os
13 | # import sys 
14 | # curPath = os.path.abspath(os.path.dirname(__file__))
15 | # rootPath = os.path.split(curPath)[0]
16 | # sys.append(os.path.split(rootPath)[0])
17 | 
18 | 
19 | class SimilarityCompute:
20 |     def __init__(self):
21 |         self.embedding_file = config.token_vector_path
22 |         self.model = models.KeyedVectors.load_word2vec_format(self.embedding_file, binary=False)
23 | 
24 |     def get_wordvector(self, word):
25 |         try:
26 |             return self.model[word]
27 |         except:
28 |             return np.zeros(200)
29 | 
30 |     def similarity_cosine(self, word_list1,word_list2):
31 |         simalrity = 0
32 |         vector1 = np.zeros(200)
33 |         for word in word_list1:
34 |             vector1 += self.get_wordvector(word)
35 | 
36 |         vector1 = vector1/len(word_list1)
37 |         vector2 = np.zeros(200)
38 | 
39 |         for word in word_list2:
40 |             vector2 += self.get_wordvector(word)
41 | 
42 |         vector2 = vector2/len(word_list2)
43 |         cos1 = np.sum(vector1*vector2)
44 |         cos21 = np.sqrt(sum(vector1**2))
45 |         cos22 = np.sqrt(sum(vector2**2))
46 |         similarity = cos1/float(cos21*cos22)
47 |         return similarity
48 | 


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/textrank.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Mon May 11 11:05:07 2020
 4 | 
 5 | @author: Xu
 6 | """
 7 | import __init__
 8 | from collections import defaultdict
 9 | import sys
10 | 
11 | 
12 | class textrank_graph:
13 |     def __init__(self):
14 |         self.graph = defaultdict(list)
15 |         self.d = 0.85 #d是阻尼系数，一般设置为0.85
16 |         self.min_diff = 1e-5 #设定收敛阈值
17 | 
18 |     #添加节点之间的边
19 |     def addEdge(self, start, end, weight):
20 |         self.graph[start].append((start, end, weight))
21 |         self.graph[end].append((end, start, weight))
22 | 
23 |     #节点排序
24 |     def rank(self):
25 |         #一共有14个节点
26 |         print(len(self.graph))
27 |         #默认初始化权重
28 |         weight_deault = 1.0 / (len(self.graph) or 1.0)
29 |         #nodeweight_dict, 存储节点的权重
30 |         nodeweight_dict = defaultdict(float)
31 |         #outsum，存储节点的出度权重
32 |         outsum_node_dict = defaultdict(float)
33 |         #根据图中的边，更新节点权重
34 |         for node, out_edge in self.graph.items():
35 |             #是 [('是', '全国', 1), ('是', '调查', 1), ('是', '失业率', 1), ('是', '城镇', 1)]
36 |             nodeweight_dict[node] = weight_deault
37 |             outsum_node_dict[node] = sum((edge[2] for edge in out_edge), 0.0)
38 |         #初始状态下的textrank重要性权重
39 |         sorted_keys = sorted(self.graph.keys())
40 |         #设定迭代次数，
41 |         step_dict = [0]
42 |         for step in range(1, 1000):
43 |             for node in sorted_keys:
44 |                 s = 0
45 |                 #计算公式：(edge_weight/outsum_node_dict[edge_node])*node_weight[edge_node]
46 |                 for e in self.graph[node]:
47 |                     s += e[2] / outsum_node_dict[e[1]] * nodeweight_dict[e[1]]
48 |                 #计算公式：(1-d) + d*s
49 |                 nodeweight_dict[node] = (1 - self.d) + self.d * s
50 |             step_dict.append(sum(nodeweight_dict.values()))
51 | 
52 |             if abs(step_dict[step] - step_dict[step - 1]) <= self.min_diff:
53 |                 break
54 | 
55 |         #利用Z-score进行权重归一化，也称为离差标准化，是对原始数据的线性变换，使结果值映射到[0 - 1]之间。
56 |         #先设定最大值与最小值均为系统存储的最大值和最小值
57 |         (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
58 |         for w in nodeweight_dict.values():
59 |             if w < min_rank:
60 |                 min_rank = w
61 |             if w > max_rank:
62 |                 max_rank = w
63 | 
64 |         for n, w in nodeweight_dict.items():
65 |             nodeweight_dict[n] = (w - min_rank/10.0) / (max_rank - min_rank/10.0)
66 | 
67 |         return nodeweight_dict
68 | 
69 | 


--------------------------------------------------------------------------------
/src/exe/key_info_extraction/topic_cluster_lda.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Sat Nov  7 15:05:43 2020
  4 | 
  5 | @author: Xu
  6 | 
  7 | 
  8 | 基于Lda模型的多文档主题聚类,输入多篇文档,输出每个主题的关键词与相应文本,可用于主题发现与热点分析.
  9 | 
 10 | """
 11 | import __init__
 12 | import math
 13 | import re
 14 | import random
 15 | import numpy as np
 16 | import pandas as pd
 17 | import jieba
 18 | import config
 19 | from gensim import corpora, models,similarities
 20 | 
 21 | 
 22 | # 数据处理
 23 | def data_process(data):
 24 |     """
 25 |     Process the multiple document input: remove non-text characters, remove stop words, word segmentation, etc.,
 26 |     to generate dictionary and corpus vector.
 27 |     
 28 |     Parameters:
 29 |         data: type-list, the contents of multiple documents entered, like [doc1, doc2, ...,docn]
 30 |         
 31 |     Return:
 32 |         dictionary: type-dict, Generate dictionary based on input documents.
 33 |         corpus: type-iterator, is an iterator that returns the BOW vector
 34 |         corpus_tfidf: type-array, Calculate the TFIDF value for each feature that appears in the corpus.
 35 |         
 36 |     
 37 |     """
 38 |     # 去掉非文本字符
 39 |     
 40 |     data_new  = [re.sub(r'[^\u4e00-\u9fa5]+', '', d).strip() for d in data]
 41 | #    data_new  = [re.sub(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]", '', d).strip() for d in data]
 42 |     #分词
 43 |     data_new = [list(jieba.cut(d)) for d in data_new]
 44 |     # 去停用词
 45 |     stopwords = open(config.StopWords_path,'r').read()
 46 |     stoplist = stopwords.split('\n')
 47 |     data_new = [[word for word in d if word not in stoplist] for d in data_new ]
 48 |     # 过滤长度<=1的词
 49 |     data_new = [[word for word in d if len(word)>1 ] for d in data_new ]
 50 |     #对文本进行处理，得到文本集合中的词典
 51 |     dictionary = corpora.Dictionary(data_new)
 52 |     print('number of docs: ', dictionary.num_docs)
 53 |     print('number of words: ', dictionary.num_pos)
 54 |     #利用词典，对文本进行bow表示，生成词袋
 55 |     corpus = [dictionary.doc2bow(text) for text in data_new]
 56 |     #利用bow，对文本进行tfidf表示
 57 |     tfidf = models.TfidfModel(corpus)
 58 |     corpus_tfidf = tfidf[corpus]
 59 |     return dictionary,corpus,corpus_tfidf
 60 | 
 61 | 
 62 | #  计算困惑度
 63 | def preplexity(ldamodel,testset,dictionary,size_dictionary,num_topics):
 64 |     '''
 65 |     Calculate the preplexity of a lda-model.
 66 |     Parameters:
 67 |         ldamodel: a LDA Model
 68 |         testset: corpus data
 69 |         dictionary: vocabulary, like {7822:'deferment', 1841:'circuitry',19202:'fabianism'...}
 70 |         size_dictionary: type: integer, the size of vocabulary
 71 |         num_topics: type: integer, number of tipics
 72 |     ----------
 73 |     Return:
 74 |         prep: type-float, preplexity of a lda-model
 75 |         
 76 |     '''    
 77 |     print('\n')
 78 |     print('The info of this lda-model: ')
 79 |     print('num of the testset: %s; size_dictionary: %s; num of topics: %s' %(len(testset),size_dictionary,num_topics))
 80 |     prep=0.0
 81 |     prob_doc_sum=0.0
 82 |     topic_word_list=[]  #store the prabability of topic-word
 83 |     for topic_id in range(num_topics):
 84 |         topic_word=ldamodel.show_topic(topic_id,size_dictionary)
 85 |         dic={}
 86 |         for word,probability in topic_word:
 87 |             dic[word]=probability
 88 |         topic_word_list.append(dic)
 89 |     doc_topic_list=[]  #store the doc-topic tuples:[(0, 0.0006211180124223594),(1, 0.0006211180124223594),...]
 90 |     for doc in testset:
 91 |         doc_topic_list.append(ldamodel.get_document_topics(doc,minimum_probability=0))
 92 |     testset_word_num=0
 93 |     for i in range(len(testset)):
 94 |         prob_doc=0.0     # the probability of the doc
 95 |         doc=testset[i]
 96 |         doc_word_num=0   # the number of words in the doc
 97 |         for word_id,num in doc:  #doc.items() if testset is a dic else list
 98 |             prob_word=0.0    # the probability of word
 99 |             doc_word_num+=num
100 |             word=dictionary[word_id]
101 |             for topic_id in range(num_topics):
102 |                 # calculate p(w): p(w)=sumz(p(z)*p(w|z))
103 |                 prob_topic=doc_topic_list[i][topic_id][1]
104 |                 prob_topic_word=topic_word_list[topic_id][word]
105 |                 prob_word+=prob_topic*prob_topic_word
106 |             prob_doc+=math.log(prob_word)   # p(d)=sum(log(p(w)))
107 |         prob_doc_sum+=prob_doc
108 |         testset_word_num+=doc_word_num
109 |     prep=math.exp(-prob_doc_sum/testset_word_num)  # perplexity=exp(-sum(p(d))/sum(Nd))
110 |     print('the perplexity of this lda-model is: %s' %prep)
111 |     return prep
112 | 
113 | # 确定主题个数
114 | def get_best_num_topics(data, max_num_topics):
115 |     """
116 |     This is the optimal number of topics obtained through perplexity assessment.
117 |     
118 |     Parameters:
119 |         data: type: array or dataframe or matrix or list, the raw text data 
120 |         max_num_topics: type: integer, maximum of number of tipics
121 |     
122 |     Return:
123 |         best_num_topics: best number of topics
124 |        
125 |     """
126 |     from scipy.signal import argrelextrema
127 |     
128 | #    random.shuffle(data)
129 |     train_data = data[:int(len(data)*0.6)]      
130 |     val_data = data[int(len(data)*0.6): int(len(data)*0.8)]
131 |     test_data = data[int(len(data)*0.8):]
132 |        
133 |     # 计算模型在训练和验证集上的困惑度
134 |     preplexity_list_val=[]
135 |     preplexity_list_train=[]
136 | 
137 |     for k in range(2, max_num_topics+1):
138 |         dictionary,corpus,corpus_tfidf = data_process(train_data+val_data)
139 |         ldamodel = models.LdaModel(corpus_tfidf,id2word=dictionary,num_topics=k)
140 |         preplexity_list_train.append(preplexity(ldamodel,corpus,dictionary,len(dictionary.keys()),k))
141 |         val_corpus = corpus[int(len(data)*0.6):]
142 |         preplexity_list_val.append(preplexity(ldamodel,val_corpus,dictionary,len(dictionary.keys()),k))
143 |     
144 |     # 选择拐点（最佳主题个数）
145 |     y1 = np.array(preplexity_list_val)
146 |     y2 = np.array(preplexity_list_train)
147 |     best_num_topics = argrelextrema(np.abs(y1-y2), np.less)[0][0]
148 |     
149 |     return  best_num_topics
150 |     
151 |     
152 | # 使用lda模型，获取主题分布
153 | def lda_model(data, file_path): 
154 |     # data prepare 
155 |     dictionary,corpus,corpus_tfidf = data_process(data)
156 |     # num of topics
157 |     num_topics = get_best_num_topics(data, 20)
158 |     # lda model
159 |     lda = models.LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=num_topics)
160 |     # save topic keywords to the file
161 |     output_topic_keywords = ''
162 |     for topic in lda.print_topics(num_topics=num_topics,num_words=25):
163 |         keywords = re.sub(r'[\d\.\d+\*]|\"','', topic[1])
164 |         output_topic_keywords += 'Topic ' + str(topic[0]) + '\t' + keywords + '\n'
165 |     # topic prediction/classification
166 |     output_topic_dist = ''
167 |     topics = lda.get_document_topics(corpus)
168 |     topics_label = {}
169 |     for i in range(20):
170 |         topics_label[i] = 'Topic ' + str(i)
171 |     # save topic distribution to the file
172 |     for i in range(len(corpus)):
173 |         dist = str(topics[i])
174 |         output_topic_dist += 'Document ' + str(i+1) + '\t' + dist + '\n'
175 |     # save     
176 |     f = open(file_path, 'w', encoding='utf-8')
177 |     f.write('Number of topics' + '\t' + str(num_topics) + '\n')
178 |     f.write(output_topic_keywords)
179 |     f.write(output_topic_dist)
180 |     f.close()
181 |     
182 |     return num_topics, output_topic_keywords,  output_topic_dist  
183 |     
184 |     
185 |     
186 | 
187 |     
188 |     
189 |     
190 |     
191 |     
192 |     
193 |     
194 |     
195 |     
196 |     
197 |     
198 |     
199 |     
200 |     
201 |     
202 |     
203 |     
204 |     
205 | 
206 | 


--------------------------------------------------------------------------------
/src/exe/review_sentiment/business.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import pandas as pd
  3 | import nltk
  4 | import sentiment_model
  5 | from sentiment_model import SentimentModel
  6 | 
  7 | 
  8 | # 自定义打印方法
  9 | def print_format(str, a):
 10 |     print(str + '\n{0}\n'.format(a))
 11 | 
 12 | 
 13 | # review.json对应的数据类
 14 | class ReviewDataItem(object):
 15 |     def __init__(self, review_id, user_id, business_id, stars, text):
 16 |         self.review_id = review_id
 17 |         self.user_id = user_id
 18 |         self.business_id = business_id
 19 |         self.stars = stars
 20 |         self.text = text
 21 | 
 22 | 
 23 | # business.json对应的数据类
 24 | class BusinessDataItem(object):
 25 |     def __init__(self, business_id, name, review_count):
 26 |         self.business_id = business_id
 27 |         self.name = name
 28 |         self.review_count = review_count
 29 | 
 30 | 
 31 | class Business(object):
 32 |     """
 33 |     用来表示跟business相关的变量和函数
 34 |     """
 35 | 
 36 |     def __init__(self):
 37 |         # 初始化变量以及函数
 38 |         # self.aspect_filter = ["salsa"]
 39 |         self.aspect_filter = []
 40 |         self.dic_business_id = {}
 41 |         self.dic_business_data = {}
 42 |         print("step1 加载模型==================")
 43 |         self.sentimentModel = SentimentModel()  # 把已经训练好的模型存放在文件里，并导入进来
 44 |         print("step2 读取数据开始==================")
 45 |         self.read_data()
 46 |         print("step2 读取数据结束==================")
 47 | 
 48 |     def read_data(self):
 49 | 
 50 |         json_file_business_path = './data/business.json'
 51 |         json_file_review_path = './data/review.json'
 52 | 
 53 |         with open(json_file_business_path, 'r', encoding='utf-8') as fin:
 54 |             for line in fin:
 55 |                 line_contents = json.loads(line)
 56 |                 business_id = line_contents["business_id"]
 57 |                 name = line_contents["name"]
 58 |                 review_count = line_contents["review_count"]
 59 |                 if review_count >= 100:
 60 |                     self.dic_business_id[business_id] = []
 61 |                     business_DataItem = BusinessDataItem(business_id, name, review_count)
 62 |                     self.dic_business_data[business_id] = business_DataItem
 63 | 
 64 |         with open(json_file_review_path, 'r', encoding='utf-8') as fin:
 65 |             for line in fin:
 66 |                 line_contents = json.loads(line)
 67 |                 business_id = line_contents["business_id"]
 68 |                 if business_id in self.dic_business_id:
 69 |                     review_id = line_contents["review_id"]
 70 |                     user_id = line_contents["user_id"]
 71 |                     stars = line_contents["stars"]
 72 |                     text = line_contents["text"]
 73 |                     review_DataItem = ReviewDataItem(review_id, user_id, business_id, stars, text)
 74 |                     self.dic_business_id[business_id].append(review_DataItem)
 75 | 
 76 |     def aspect_based_summary(self, business_id):
 77 |         """
 78 |         返回一个business的summary. 针对于每一个aspect计算出它的正面负面情感以及TOP reviews.
 79 |         具体细节请看给定的文档。
 80 |         """
 81 | 
 82 |         aspects_dic = self.extract_aspects(business_id)
 83 |         # print(aspects_dic)
 84 |         business_name = self.dic_business_data[business_id].name
 85 |         # print(business_name)
 86 | 
 87 |         pos_aspect_dic = {}
 88 |         neg_aspect_dic = {}
 89 |         review_segment_dic = {}
 90 | 
 91 |         for aspect, reviews in aspects_dic.items():
 92 |             for review in reviews:
 93 |                 review_text = review.text
 94 |                 if review_text == None or str.strip(review_text) == '':
 95 |                     continue
 96 |                 review_segment = self.get_segment(review_text, aspect, aspects_dic)
 97 |                 # 粗略筛选一下
 98 |                 if len(str.strip(review_segment)) > len(aspect) + 3:
 99 |                     # print(review_segment)
100 |                     key = review.review_id + "_" + aspect
101 |                     review_segment_dic[key] = review_segment
102 | 
103 |                     score = self.sentimentModel.predict_prob(review_segment)
104 | 
105 |                     if score > 0.75:
106 |                         if aspect not in pos_aspect_dic:
107 |                             pos_aspect_dic[aspect] = []
108 |                         pos_aspect_dic[aspect].append([key, score])
109 |                     else:
110 |                         if aspect not in neg_aspect_dic:
111 |                             neg_aspect_dic[aspect] = []
112 |                         neg_aspect_dic[aspect].append([key, score])
113 | 
114 |         dic_aspect_summary = {}
115 |         for aspect, reviews in aspects_dic.items():
116 |             if aspect not in dic_aspect_summary:
117 |                 dic_aspect_summary[aspect] = {}
118 | 
119 |             # 算某个aspect的得分
120 |             pos_aspect_review_nums = len(pos_aspect_dic[aspect])
121 |             pos_aspect_total_scores = 0
122 |             for item in pos_aspect_dic[aspect]:
123 |                 pos_aspect_total_scores += item[1]
124 | 
125 |             neg_aspect_review_nums = len(neg_aspect_dic[aspect])
126 |             neg_aspect_total_scores = 0
127 |             for item in neg_aspect_dic[aspect]:
128 |                 neg_aspect_total_scores += item[1]
129 | 
130 |             aspect_review_nums = pos_aspect_review_nums +neg_aspect_review_nums
131 |             aspect_score = (pos_aspect_total_scores + neg_aspect_total_scores) / aspect_review_nums
132 | 
133 |             dic_aspect_summary[aspect]["rating"] = aspect_score
134 | 
135 |             # TOP 5 正面
136 |             aspects_pos_sorted = sorted(pos_aspect_dic[aspect], key=lambda x: x[1], reverse=True)
137 |             aspects_pos_contents = {}
138 |             dic_aspect_summary[aspect]["pos"] = []
139 |             for index, item in enumerate(aspects_pos_sorted):
140 |                 if len(dic_aspect_summary[aspect]["pos"]) >= 5:
141 |                     break
142 |                 review_content = review_segment_dic[item[0]]
143 |                 if review_content not in aspects_pos_contents:
144 |                     dic_aspect_summary[aspect]["pos"].append(review_content)
145 |                     aspects_pos_contents[review_content] = None
146 | 
147 |             # TOP 5 负面
148 |             aspects_neg_sorted = sorted(neg_aspect_dic[aspect], key=lambda x: x[1], reverse=False)
149 |             aspects_neg_contents = {}
150 |             dic_aspect_summary[aspect]["neg"] = []
151 |             for index, item in enumerate(aspects_neg_sorted):
152 |                 if len(dic_aspect_summary[aspect]["neg"]) >= 5:
153 |                     break
154 |                 review_content = review_segment_dic[item[0]]
155 |                 if review_content not in aspects_neg_contents:
156 |                     dic_aspect_summary[aspect]["neg"].append(review_content)
157 |                     aspects_neg_contents[review_content] = None
158 | 
159 |         all_aspect_scores = 0
160 |         for item in dic_aspect_summary.items():
161 |             all_aspect_scores += item[1]["rating"]
162 | 
163 |         business_rating = all_aspect_scores / len(dic_aspect_summary.items())
164 | 
165 |         return {'business_id':business_id,
166 |             'business_name':business_name,
167 |             'business_rating':business_rating,
168 |             'aspect_summary':dic_aspect_summary
169 |             }
170 | 
171 |     def get_segment(self, review_text, aspect, aspects_dic):
172 | 
173 |         if self.is_review_only_one_aspect(review_text):
174 |             return review_text
175 | 
176 |         cur_aspect_index = review_text.index(aspect)
177 |         cur_aspect_end_index_begin = cur_aspect_index + len(aspect)
178 |         cur_aspect_end_index_end = cur_aspect_end_index_begin
179 |         end_pos = len(review_text) - 1
180 | 
181 |         stop_punct_map = {c: None for c in ',.!?;'}
182 |         relation_punct_list = ["and", "when", "but"]
183 | 
184 |         # next_aspect = self.get_next_aspect(review_text[cur_aspect_end_index_begin:end_pos])
185 |         cur_aspect_des = self.get_cur_aspect_adj(review_text[cur_aspect_end_index_begin:end_pos])
186 | 
187 |         while cur_aspect_end_index_end <= end_pos:
188 |             # 在标点符号处截取
189 |             cur_str = review_text[cur_aspect_end_index_end:min(cur_aspect_end_index_end + 1, end_pos)]
190 |             if cur_str in stop_punct_map:
191 |                 break
192 | 
193 |             # 在转移符号处截取
194 |             cur_strs = review_text[cur_aspect_end_index_begin:cur_aspect_end_index_end]
195 |             relation_store = ""
196 |             for relation in relation_punct_list:
197 |                 if relation in cur_strs.lower():
198 |                     relation_store = relation
199 |                     break
200 | 
201 |             if relation_store != "":
202 |                 cur_aspect_end_index_end -= len(relation_store)
203 |                 break
204 | 
205 |             # 在下一个aspect截取
206 |             # if next_aspect != None:
207 |             # 	if next_aspect in aspects_dic and next_aspect in cur_strs:
208 |             # 		cur_aspect_end_index_end -= len(next_aspect)
209 |             # 		break
210 | 
211 |             # 在aspect最近的形容词截取
212 |             if cur_aspect_des != None:
213 |                 if cur_aspect_des in cur_strs:
214 |                     break
215 | 
216 |             cur_aspect_end_index_end += 1
217 | 
218 |         cur_aspect_end_index_end = min(cur_aspect_end_index_end, end_pos)
219 |         return review_text[cur_aspect_index:cur_aspect_end_index_end]
220 | 
221 |     def get_next_aspect(self, text):
222 |         tokens = nltk.word_tokenize(text)
223 |         tag_tuples = nltk.pos_tag(tokens)
224 |         for (word, tag) in tag_tuples:
225 |             if tag == "NN":
226 |                 return word
227 |         return None
228 | 
229 |     def get_cur_aspect_adj(self, text):
230 |         tokens = nltk.word_tokenize(text)
231 |         tag_tuples = nltk.pos_tag(tokens)
232 |         for (word, tag) in tag_tuples:
233 |             if tag == "JJ" or tag == "ADJ":
234 |                 return word
235 |         return None
236 | 
237 |     def is_review_only_one_aspect(self, review_text):
238 |         '''
239 |         判断评论里面是否只包含一个方面
240 |         :param review:
241 |         :return:
242 |         '''
243 | 
244 |         tagged_words = []
245 |         tokens = nltk.word_tokenize(review_text)
246 |         tag_tuples = nltk.pos_tag(tokens)
247 |         for (word, tag) in tag_tuples:
248 |             if tag == "NN":
249 |                 tagged_words.append(word)
250 | 
251 |         if len(tagged_words) <= 1:
252 |             return True
253 | 
254 |         return False
255 | 
256 |     def extract_aspects(self, business_id):
257 |         """
258 |         从一个business的review中抽取aspects
259 |         """
260 | 
261 |         # print("step3 extract_aspects begin==================")
262 | 
263 |         if business_id not in self.dic_business_id:
264 |             print("business_id not exit")
265 |             return None
266 | 
267 |         review_list = self.dic_business_id[business_id]
268 |         aspects_dic = {}
269 |         for review_data in review_list:
270 |             sentence = review_data.text
271 |             if sentence == None or str.strip(sentence) == '':
272 |                 continue
273 |             tagged_words = []
274 |             tokens = nltk.word_tokenize(sentence)
275 |             tag_tuples = nltk.pos_tag(tokens)
276 |             for (word, tag) in tag_tuples:
277 |                 if tag == "NN":
278 |                     # token = {'word': string, 'pos': tag}
279 |                     # tagged_words.append(word)
280 |                     if word not in aspects_dic:
281 |                         aspects_dic[word] = []
282 |                     aspects_dic[word].append(review_data)
283 | 
284 |         # 对字典进行排序
285 |         aspects_sorted = sorted(aspects_dic.items(), key=lambda x: len(x[1]), reverse=True)
286 |         aspects_dic = {}
287 |         for index, item in enumerate(aspects_sorted):
288 |             if item[0] in self.aspect_filter:
289 |                 continue
290 | 
291 |             if len(aspects_dic.items()) < 5:
292 |                 aspects_dic[item[0]] = item[1]
293 | 
294 |         # print("step3 extract_aspects end==================")
295 |         return aspects_dic
296 | 


--------------------------------------------------------------------------------
/src/exe/review_sentiment/main.py:
--------------------------------------------------------------------------------
 1 | import time
 2 | import business
 3 | from business import Business
 4 | 
 5 | '''
 6 | 运行前需要从以下百度网盘链接下载
 7 | 
 8 | https://pan.baidu.com/s/1hSFBjQHLhYDw9jPBDNH6pw&shfl=sharepset
 9 | 这个路径包含data文件夹下的内容
10 | business.json
11 | glove.6B.100d.txt
12 | review.json
13 | 
14 | https://pan.baidu.com/s/1gnANAnoGv5GHQKWwAIVE4Q&shfl=sharepset
15 | 这个路径包含data文件夹下的模型文件
16 | svm_clf.pkl
17 | 
18 | 
19 | '''
20 | 
21 | 
22 | def get_review_summary_for_business(biz_id):
23 | 	# 获取每一个business的评论总结
24 | 	return business_module.aspect_based_summary(biz_id)
25 | 
26 | def main():
27 | 
28 | 	bus_ids = ["tstimHoMcYbkSC4eBA1wEg","gnKjwL_1w79qoiV3IC_xQQ"]  # 指定几个business ids
29 | 
30 | 	for bus_id in bus_ids:
31 | 		# print ("Working on biz_id %s" % bus_id)
32 | 		start = time.time()
33 | 
34 | 		summary = get_review_summary_for_business(bus_id)
35 | 
36 | 		print("\n")
37 | 
38 | 		normal_print_list = ["business_id","business_name","business_rating", "rating"]
39 | 		for item in summary.items():
40 | 			if item[0] in normal_print_list:
41 | 				print(str(item[0]) + ": " + str(item[1]))
42 | 			else:
43 | 				print(str(item[0]) + ": ")
44 | 				# for content in item[1]:
45 | 				# 	print(content)
46 | 				for data in item[1].items():
47 | 					# print(str(data[0]) + ": " + str(data[1]))
48 | 					print("------------------" + str(data[0]) + "------------------")
49 | 					for data_1 in data[1].items():
50 | 						if data_1[0] in normal_print_list:
51 | 							print(str(data_1[0]) + ": " + str(data_1[1]))
52 | 						else:
53 | 							review_list = []
54 | 							for item_1 in data_1[1]:
55 | 								review_list.append(item_1)
56 | 							print(str(data_1[0]) + ": " + ";  ".join(review_list))
57 | 
58 | if __name__ == "__main__":
59 | 	business_module = Business()
60 | 	main()
61 | 
62 | 
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/src/exe/review_sentiment/model_training.py:
--------------------------------------------------------------------------------
  1 | # 此文件包含模型的训练。 给定数据集，训练出情感分类模型，并把模型文件存放在 model文件夹里。
  2 | import json
  3 | import numpy as np
  4 | import pandas as pd
  5 | import business
  6 | from sklearn.feature_extraction.text import TfidfVectorizer
  7 | from nltk import pos_tag, word_tokenize
  8 | # from glove_embedding import GloveEmbedding
  9 | from sklearn.model_selection import train_test_split
 10 | from sklearn.metrics import roc_auc_score
 11 | import matplotlib.pyplot as plt
 12 | from sklearn.svm import SVC
 13 | import utils
 14 | from utils import handle_text
 15 | from embedding_manager_cyd import EmbeddingManagerCyd
 16 | from embedding_manager_cyd import Embedding_Type
 17 | 
 18 | SEED = 222
 19 | np.random.seed(SEED)
 20 | 
 21 | dic_business_id = {}
 22 | 
 23 | def main():
 24 | 
 25 |     embeddingManagerCyd = EmbeddingManagerCyd()
 26 | 
 27 |     json_file_business_path = './data/business.json'
 28 |     json_file_review_path = './data/review.json'
 29 | 
 30 |     with open(json_file_business_path, 'r', encoding='utf-8') as fin:
 31 |         for line in fin:
 32 |             line_contents = json.loads(line)
 33 |             business_id = line_contents["business_id"]
 34 |             name = line_contents["name"]
 35 |             review_count = line_contents["review_count"]
 36 |             dic_business_id[business_id] = []
 37 | 
 38 |     tags = []
 39 |     reviewList = []
 40 |     sentiment = 0
 41 |     index = 0
 42 | 
 43 |     with open(json_file_review_path, 'r', encoding='utf-8') as fin:
 44 |         for line in fin:
 45 |             line_contents = json.loads(line)
 46 |             business_id = line_contents["business_id"]
 47 |             if business_id in dic_business_id:
 48 |                 review_id = line_contents["review_id"]
 49 |                 user_id = line_contents["user_id"]
 50 |                 stars = line_contents["stars"]
 51 |                 text = line_contents["text"]
 52 |                 if stars >= 4:
 53 |                     sentiment = 1
 54 |                 else:
 55 |                     sentiment = 0
 56 |                 tags.append(sentiment)
 57 |                 # reviewList.append([index,text])
 58 |                 # reviewList.append(handle_text(text))
 59 |                 reviewList.append(embeddingManagerCyd.getEmbedding(text, Embedding_Type.glove, True, False))
 60 |                 index += 1
 61 |                 print(index)
 62 |                 # if index >= 20000:
 63 |                 #     break
 64 | 
 65 | 
 66 |     # glove_embedding = GloveEmbedding()
 67 |     # # gloveVectors = [glove_embedding.getSentenceVectorCommon(item[1], isUseAveragePooling=True) for item in tokenizedWords.items()]
 68 |     # gloveVectors = [glove_embedding.getSentenceVectorCommon(item, isUseAveragePooling=True) for item in
 69 |     #                 reviewList]
 70 |     # features = np.array(gloveVectors, dtype=np.float16)
 71 | 
 72 |     features = np.array(reviewList, dtype=np.float16)
 73 |     tags = np.array(tags)
 74 | 
 75 |     classification_svm(features, tags)
 76 |     # classification_logistic(features, tags)
 77 | 
 78 |     # print("step4=================")
 79 | 
 80 | def get_train_test(features,tags,test_size=0.3):
 81 |     return train_test_split(features, tags, test_size=test_size, random_state=SEED)
 82 | 
 83 | from sklearn.pipeline import make_pipeline
 84 | from sklearn.model_selection import GridSearchCV
 85 | 
 86 | from sklearn.externals import joblib #jbolib模块
 87 | from sklearn.linear_model import LogisticRegression
 88 | from sklearn.model_selection import KFold
 89 | 
 90 | def classification_logistic(features, tags):
 91 |     xtrain, xtest, ytrain, ytest = get_train_test(features, tags)
 92 | 
 93 |     # cross_validator = KFold(n_splits=10, shuffle=False, random_state=None)
 94 | 
 95 |     # lr = LogisticRegression(penalty = "l1")
 96 |     #
 97 |     # # params = {"penalty":["l1","l2"],
 98 |     # #              "C":[0.1,1.0,10.0,100.0]},
 99 |     #
100 |     # params = {"C":[100, 120,150]},
101 | 
102 |     # grid = GridSearchCV(estimator=lr, param_grid = params)
103 |     # grid.fit(xtrain, ytrain)
104 |     # print("最优参数为：",grid.best_params_)
105 |     # model = grid.best_estimator_
106 |     # predict_value = model.predict(xtest)
107 |     # proba_value = model.predict_proba(xtest)
108 |     # p = proba_value[:,1]
109 |     # print("Logistic=========== ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
110 |     #
111 |     # joblib.dump(model, 'model/logistic_clf.pkl')
112 | 
113 |     model = LogisticRegression(penalty="l1",C = 100, solver='liblinear')
114 |     model.fit(xtrain, ytrain)
115 |     predict_value = model.predict(xtest)
116 |     proba_value = model.predict_proba(xtest)
117 |     p = proba_value[:,1]
118 |     print("Logistic=========== ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
119 |     joblib.dump(model, 'model/logistic_clf.pkl')
120 | 
121 |     #100000 0.889
122 |     #50000 0.889
123 | 
124 | def classification_svm(features, tags):
125 |     xtrain, xtest, ytrain, ytest = get_train_test(features, tags)
126 | 
127 |     # svc = SVC(kernel='rbf',probability = True)
128 |     # model = make_pipeline(svc)
129 |     #
130 |     # # 使用GridSearchCV选择参数
131 |     # # param_grid = {'svc__C': [1, 5, 10,0.5,20,30],
132 |     # #               'svc__gamma': [0.0001, 0.0005, 0.001, 0.005,0.006,0.007,0.008,0.009,0.01]}
133 |     # param_grid = {'svc__C': [1, 5, 10, 15],
134 |     #               'svc__gamma': [0.0005, 0.001, 0.005]}
135 |     # grid = GridSearchCV(model, param_grid)
136 |     #
137 |     # grid.fit(xtrain, ytrain)
138 |     # print("最优参数为：",grid.best_params_)
139 |     # model = grid.best_estimator_
140 |     # predict_value = model.predict(xtest)
141 |     # proba_value = model.predict_proba(xtest)
142 |     # p = proba_value[:,1]
143 |     # print("SVM=========== ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
144 | 
145 |     model =SVC(kernel='rbf',probability = True, C=20, gamma=0.005)
146 |     model.fit(xtrain, ytrain)
147 |     proba_value = model.predict_proba(xtest)
148 |     p = proba_value[:, 1]
149 |     print("SVM=========== ROC-AUC score: %.3f" % roc_auc_score(ytest, p))
150 |     joblib.dump(model, 'model/svm_clf.pkl')
151 | 
152 |     #20000 0.883
153 |     #500000 0.887
154 | 
155 | if __name__ == "__main__":
156 |     main()
157 | 
158 | 


--------------------------------------------------------------------------------
/src/exe/review_sentiment/sentence.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | class Sentence(object):
 5 | 	
 6 | 	# WORD_TOKENIZER = MyPottsTokenizer(preserve_case=False)
 7 | 	
 8 | 	# LEMMATIZER = WordNetLemmatizer()
 9 | 
10 | 	# 针对于每一句话抽取aspects
11 | 	# ASP_EXTRACTOR =
12 | 
13 | 	def __init__(self):
14 | 		pass
15 | 		
16 | 
17 | 	def word_tokenize(self):
18 | 		pass
19 | 	
20 | 
21 | 	def pos_tag(self):
22 | 		pass
23 | 
24 | 	def lemmatize(self):
25 | 		pass
26 | 
27 | 	def contain_aspect(self):
28 | 		pass


--------------------------------------------------------------------------------
/src/exe/review_sentiment/stopwords.txt:
--------------------------------------------------------------------------------
  1 | 'd  
  2 | 'll  
  3 | 'm  
  4 | 're  
  5 | 's  
  6 | 't  
  7 | 've  
  8 | ZT  
  9 | ZZ  
 10 | a  
 11 | a's  
 12 | able  
 13 | about  
 14 | above  
 15 | abst  
 16 | accordance  
 17 | according  
 18 | accordingly  
 19 | across  
 20 | act  
 21 | actually  
 22 | added  
 23 | adj  
 24 | adopted  
 25 | affected  
 26 | affecting  
 27 | affects  
 28 | after  
 29 | afterwards  
 30 | again  
 31 | against  
 32 | ah  
 33 | ain't  
 34 | all  
 35 | allow  
 36 | allows  
 37 | almost  
 38 | alone  
 39 | along  
 40 | already  
 41 | also  
 42 | although  
 43 | always  
 44 | am  
 45 | among  
 46 | amongst  
 47 | an  
 48 | and  
 49 | announce  
 50 | another  
 51 | any  
 52 | anybody  
 53 | anyhow  
 54 | anymore  
 55 | anyone  
 56 | anything  
 57 | anyway  
 58 | anyways  
 59 | anywhere  
 60 | apart  
 61 | apparently  
 62 | appear  
 63 | appreciate  
 64 | appropriate  
 65 | approximately  
 66 | are  
 67 | area  
 68 | areas  
 69 | aren  
 70 | aren't  
 71 | arent  
 72 | arise  
 73 | around  
 74 | as  
 75 | aside  
 76 | ask  
 77 | asked  
 78 | asking  
 79 | asks  
 80 | associated  
 81 | at  
 82 | auth  
 83 | available  
 84 | away  
 85 | awfully  
 86 | b  
 87 | back  
 88 | backed  
 89 | backing  
 90 | backs  
 91 | be  
 92 | became  
 93 | because  
 94 | become  
 95 | becomes  
 96 | becoming  
 97 | been  
 98 | before  
 99 | beforehand  
100 | began  
101 | begin  
102 | beginning  
103 | beginnings  
104 | begins  
105 | behind  
106 | being  
107 | beings  
108 | believe  
109 | below  
110 | beside  
111 | besides  
112 | best  
113 | better  
114 | between  
115 | beyond  
116 | big  
117 | biol  
118 | both  
119 | brief  
120 | briefly  
121 | but  
122 | by  
123 | c  
124 | c'mon  
125 | c's  
126 | ca  
127 | came  
128 | can  
129 | can't  
130 | cannot  
131 | cant  
132 | case  
133 | cases  
134 | cause  
135 | causes  
136 | certain  
137 | certainly  
138 | changes  
139 | clear  
140 | clearly  
141 | co  
142 | com  
143 | come  
144 | comes  
145 | concerning  
146 | consequently  
147 | consider  
148 | considering  
149 | contain  
150 | containing  
151 | contains  
152 | corresponding  
153 | could  
154 | couldn't  
155 | couldnt  
156 | course  
157 | currently  
158 | d  
159 | date  
160 | definitely  
161 | describe  
162 | described  
163 | despite  
164 | did  
165 | didn't  
166 | differ  
167 | different  
168 | differently  
169 | discuss  
170 | do  
171 | does  
172 | doesn't  
173 | doing  
174 | don't  
175 | done  
176 | down  
177 | downed  
178 | downing  
179 | downs  
180 | downwards  
181 | due  
182 | during  
183 | e  
184 | each  
185 | early  
186 | ed  
187 | edu  
188 | effect  
189 | eg  
190 | eight  
191 | eighty  
192 | either  
193 | else  
194 | elsewhere  
195 | end  
196 | ended  
197 | ending  
198 | ends  
199 | enough  
200 | entirely  
201 | especially  
202 | et  
203 | et-al  
204 | etc  
205 | even  
206 | evenly  
207 | ever  
208 | every  
209 | everybody  
210 | everyone  
211 | everything  
212 | everywhere  
213 | ex  
214 | exactly  
215 | example  
216 | except  
217 | f  
218 | face  
219 | faces  
220 | fact  
221 | facts  
222 | far  
223 | felt  
224 | few  
225 | ff  
226 | fifth  
227 | find  
228 | finds  
229 | first  
230 | five  
231 | fix  
232 | followed  
233 | following  
234 | follows  
235 | for  
236 | former  
237 | formerly  
238 | forth  
239 | found  
240 | four  
241 | from  
242 | full  
243 | fully  
244 | further  
245 | furthered  
246 | furthering  
247 | furthermore  
248 | furthers  
249 | g  
250 | gave  
251 | general  
252 | generally  
253 | get  
254 | gets  
255 | getting  
256 | give  
257 | given  
258 | gives  
259 | giving  
260 | go  
261 | goes  
262 | going  
263 | gone  
264 | good  
265 | goods  
266 | got  
267 | gotten  
268 | great  
269 | greater  
270 | greatest  
271 | greetings  
272 | group  
273 | grouped  
274 | grouping  
275 | groups  
276 | h  
277 | had  
278 | hadn't  
279 | happens  
280 | hardly  
281 | has  
282 | hasn't  
283 | have  
284 | haven't  
285 | having  
286 | he  
287 | he's  
288 | hed  
289 | hello  
290 | help  
291 | hence  
292 | her  
293 | here  
294 | here's  
295 | hereafter  
296 | hereby  
297 | herein  
298 | heres  
299 | hereupon  
300 | hers  
301 | herself  
302 | hes  
303 | hi  
304 | hid  
305 | high  
306 | higher  
307 | highest  
308 | him  
309 | himself  
310 | his  
311 | hither  
312 | home  
313 | hopefully  
314 | how  
315 | howbeit  
316 | however  
317 | hundred  
318 | i  
319 | i'd  
320 | i'll  
321 | i'm  
322 | i've  
323 | id  
324 | ie  
325 | if  
326 | ignored  
327 | im  
328 | immediate  
329 | immediately  
330 | importance  
331 | important  
332 | in  
333 | inasmuch  
334 | inc  
335 | include  
336 | indeed  
337 | index  
338 | indicate  
339 | indicated  
340 | indicates  
341 | information  
342 | inner  
343 | insofar  
344 | instead  
345 | interest  
346 | interested  
347 | interesting  
348 | interests  
349 | into  
350 | invention  
351 | inward  
352 | is  
353 | isn't  
354 | it  
355 | it'd  
356 | it'll  
357 | it's  
358 | itd  
359 | its  
360 | itself  
361 | j  
362 | just  
363 | k  
364 | keep  
365 | keeps  
366 | kept  
367 | keys  
368 | kg  
369 | kind  
370 | km  
371 | knew  
372 | know  
373 | known  
374 | knows  
375 | l  
376 | large  
377 | largely  
378 | last  
379 | lately  
380 | later  
381 | latest  
382 | latter  
383 | latterly  
384 | least  
385 | less  
386 | lest  
387 | let  
388 | let's  
389 | lets  
390 | like  
391 | liked  
392 | likely  
393 | line  
394 | little  
395 | long  
396 | longer  
397 | longest  
398 | look  
399 | looking  
400 | looks  
401 | ltd  
402 | m  
403 | made  
404 | mainly  
405 | make  
406 | makes  
407 | making  
408 | man  
409 | many  
410 | may  
411 | maybe  
412 | me  
413 | mean  
414 | means  
415 | meantime  
416 | meanwhile  
417 | member  
418 | members  
419 | men  
420 | merely  
421 | mg  
422 | might  
423 | million  
424 | miss  
425 | ml  
426 | more  
427 | moreover  
428 | most  
429 | mostly  
430 | mr  
431 | mrs  
432 | much  
433 | mug  
434 | must  
435 | my  
436 | myself  
437 | n  
438 | n't  
439 | na  
440 | name  
441 | namely  
442 | nay  
443 | nd  
444 | near  
445 | nearly  
446 | necessarily  
447 | necessary  
448 | need  
449 | needed  
450 | needing  
451 | needs  
452 | neither  
453 | never  
454 | nevertheless  
455 | new  
456 | newer  
457 | newest  
458 | next  
459 | nine  
460 | ninety  
461 | no  
462 | nobody  
463 | non  
464 | none  
465 | nonetheless  
466 | noone  
467 | nor  
468 | normally  
469 | nos  
470 | not  
471 | noted  
472 | nothing  
473 | novel  
474 | now  
475 | nowhere  
476 | number  
477 | numbers  
478 | o  
479 | obtain  
480 | obtained  
481 | obviously  
482 | of  
483 | off  
484 | often  
485 | oh  
486 | ok  
487 | okay  
488 | old  
489 | older  
490 | oldest  
491 | omitted  
492 | on  
493 | once  
494 | one  
495 | ones  
496 | only  
497 | onto  
498 | open  
499 | opened  
500 | opening  
501 | opens  
502 | or  
503 | ord  
504 | order  
505 | ordered  
506 | ordering  
507 | orders  
508 | other  
509 | others  
510 | otherwise  
511 | ought  
512 | our  
513 | ours  
514 | ourselves  
515 | out  
516 | outside  
517 | over  
518 | overall  
519 | owing  
520 | own  
521 | p  
522 | page  
523 | pages  
524 | part  
525 | parted  
526 | particular  
527 | particularly  
528 | parting  
529 | parts  
530 | past  
531 | per  
532 | perhaps  
533 | place  
534 | placed  
535 | places  
536 | please  
537 | plus  
538 | point  
539 | pointed  
540 | pointing  
541 | points  
542 | poorly  
543 | possible  
544 | possibly  
545 | potentially  
546 | pp  
547 | predominantly  
548 | present  
549 | presented  
550 | presenting  
551 | presents  
552 | presumably  
553 | previously  
554 | primarily  
555 | probably  
556 | problem  
557 | problems  
558 | promptly  
559 | proud  
560 | provides  
561 | put  
562 | puts  
563 | q  
564 | que  
565 | quickly  
566 | quite  
567 | qv  
568 | r  
569 | ran  
570 | rather  
571 | rd  
572 | re  
573 | readily  
574 | really  
575 | reasonably  
576 | recent  
577 | recently  
578 | ref  
579 | refs  
580 | regarding  
581 | regardless  
582 | regards  
583 | related  
584 | relatively  
585 | research  
586 | respectively  
587 | resulted  
588 | resulting  
589 | results  
590 | right  
591 | room  
592 | rooms  
593 | run  
594 | s  
595 | said  
596 | same  
597 | saw  
598 | say  
599 | saying  
600 | says  
601 | sec  
602 | second  
603 | secondly  
604 | seconds  
605 | section  
606 | see  
607 | seeing  
608 | seem  
609 | seemed  
610 | seeming  
611 | seems  
612 | seen  
613 | sees  
614 | self  
615 | selves  
616 | sensible  
617 | sent  
618 | serious  
619 | seriously  
620 | seven  
621 | several  
622 | shall  
623 | she  
624 | she'll  
625 | shed  
626 | shes  
627 | should  
628 | shouldn't  
629 | show  
630 | showed  
631 | showing  
632 | shown  
633 | showns  
634 | shows  
635 | side  
636 | sides  
637 | significant  
638 | significantly  
639 | similar  
640 | similarly  
641 | since  
642 | six  
643 | slightly  
644 | small  
645 | smaller  
646 | smallest  
647 | so  
648 | some  
649 | somebody  
650 | somehow  
651 | someone  
652 | somethan  
653 | something  
654 | sometime  
655 | sometimes  
656 | somewhat  
657 | somewhere  
658 | soon  
659 | sorry  
660 | specifically  
661 | specified  
662 | specify  
663 | specifying  
664 | state  
665 | states  
666 | still  
667 | stop  
668 | strongly  
669 | sub  
670 | substantially  
671 | successfully  
672 | such  
673 | sufficiently  
674 | suggest  
675 | sup  
676 | sure  
677 | t  
678 | t's  
679 | take  
680 | taken  
681 | taking  
682 | tell  
683 | tends  
684 | th  
685 | than  
686 | thank  
687 | thanks  
688 | thanx  
689 | that  
690 | that'll  
691 | that's  
692 | that've  
693 | thats  
694 | the  
695 | their  
696 | theirs  
697 | them  
698 | themselves  
699 | then  
700 | thence  
701 | there  
702 | there'll  
703 | there's  
704 | there've  
705 | thereafter  
706 | thereby  
707 | thered  
708 | therefore  
709 | therein  
710 | thereof  
711 | therere  
712 | theres  
713 | thereto  
714 | thereupon  
715 | these  
716 | they  
717 | they'd  
718 | they'll  
719 | they're  
720 | they've  
721 | theyd  
722 | theyre  
723 | thing  
724 | things  
725 | think  
726 | thinks  
727 | third  
728 | this  
729 | thorough  
730 | thoroughly  
731 | those  
732 | thou  
733 | though  
734 | thoughh  
735 | thought  
736 | thoughts  
737 | thousand  
738 | three  
739 | throug  
740 | through  
741 | throughout  
742 | thru  
743 | thus  
744 | til  
745 | tip  
746 | to  
747 | today  
748 | together  
749 | too  
750 | took  
751 | toward  
752 | towards  
753 | tried  
754 | tries  
755 | truly  
756 | try  
757 | trying  
758 | ts  
759 | turn  
760 | turned  
761 | turning  
762 | turns  
763 | twice  
764 | two  
765 | u  
766 | un  
767 | under  
768 | unfortunately  
769 | unless  
770 | unlike  
771 | unlikely  
772 | until  
773 | unto  
774 | up  
775 | upon  
776 | ups  
777 | us  
778 | use  
779 | used  
780 | useful  
781 | usefully  
782 | usefulness  
783 | uses  
784 | using  
785 | usually  
786 | uucp  
787 | v  
788 | value  
789 | various  
790 | very  
791 | via  
792 | viz  
793 | vol  
794 | vols  
795 | vs  
796 | w  
797 | want  
798 | wanted  
799 | wanting  
800 | wants  
801 | was  
802 | wasn't  
803 | way  
804 | ways  
805 | we  
806 | we'd  
807 | we'll  
808 | we're  
809 | we've  
810 | wed  
811 | welcome  
812 | well  
813 | wells  
814 | went  
815 | were  
816 | weren't  
817 | what  
818 | what'll  
819 | what's  
820 | whatever  
821 | whats  
822 | when  
823 | whence  
824 | whenever  
825 | where  
826 | where's  
827 | whereafter  
828 | whereas  
829 | whereby  
830 | wherein  
831 | wheres  
832 | whereupon  
833 | wherever  
834 | whether  
835 | which  
836 | while  
837 | whim  
838 | whither  
839 | who  
840 | who'll  
841 | who's  
842 | whod  
843 | whoever  
844 | whole  
845 | whom  
846 | whomever  
847 | whos  
848 | whose  
849 | why  
850 | widely  
851 | will  
852 | willing  
853 | wish  
854 | with  
855 | within  
856 | without  
857 | won't  
858 | wonder  
859 | words  
860 | work  
861 | worked  
862 | working  
863 | works  
864 | world  
865 | would  
866 | wouldn't  
867 | www  
868 | x  
869 | y  
870 | year  
871 | years  
872 | yes  
873 | yet  
874 | you  
875 | you'd  
876 | you'll  
877 | you're  
878 | you've  
879 | youd  
880 | young  
881 | younger  
882 | youngest  
883 | your  
884 | youre  
885 | yours  
886 | yourself  
887 | yourselves  
888 | z  
889 | zero  
890 | zt  
891 | zz
892 | 


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | """
 3 | Created on Tue Jan 26 22:20:08 2021
 4 | 
 5 | @author: Xu
 6 | """
 7 | 
 8 | import sys
 9 | import os
10 | curPath = os.path.abspath(os.path.dirname(__file__))
11 | rootPath = os.path.split(curPath)[0]
12 | sys.path.append(os.path.split(rootPath)[0])


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/bert_embedding_extend.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/bert_embedding_extend.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/bert_embedding_extend.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/bert_embedding_extend.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/embedding_manager_cyd.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/embedding_manager_cyd.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/embedding_manager_cyd.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/embedding_manager_cyd.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/glove_embedding.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/glove_embedding.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/glove_embedding.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/glove_embedding.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/review_sentiment_analysis.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/review_sentiment_analysis.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/review_sentiment_analysis.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/review_sentiment_analysis.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/sentiment_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/sentiment_model.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/sentiment_model.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/sentiment_model.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/utils.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/sentiment_analysis/__pycache__/utils.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/bert_embedding_extend.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | from bert_embedding import BertEmbedding
 5 | import mxnet as mx
 6 | import numpy as np
 7 | from sentiment_analysis.utils import handle_text
 8 | 
 9 | class BertEmbeddingExtend(object):
10 |     def __init__(self):
11 |         # self.bert_embed = BertEmbedding(model='bert_12_768_12', ctx = mx.gpu(0))
12 |         self.bert_embed = BertEmbedding(model='bert_12_768_12')
13 | 
14 |     def getSenetnceEmbedding(self, sentence, isUseAveragePooling, isUseStopwords):
15 | 
16 |         if isUseStopwords:
17 |             new_words_list = handle_text(sentence, isUseStopwords)
18 |             if len(new_words_list) == 0:
19 |                  return np.zeros(768)
20 |             sentence = " ".join(new_words_list)
21 | 
22 |         result = self.bert_embed(sentence.split('\n'))
23 |         first_sentence = result[0]
24 | 
25 |         if first_sentence[1] == None or len(first_sentence[1]) == 0:
26 |             return np.zeros(768)
27 | 
28 |         w_v = np.array(first_sentence[1])
29 |         total_effect_count = w_v.shape[0]
30 | 
31 |         if isUseAveragePooling:
32 |             w_v = np.sum(w_v, axis=0) / total_effect_count
33 |         else:
34 |             w_v = np.max(w_v, axis=0)
35 | 
36 |         return w_v
37 | 


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/embedding_manager_cyd.py:
--------------------------------------------------------------------------------
 1 | from sentiment_analysis.glove_embedding import GloveEmbedding
 2 | from  sentiment_analysis.bert_embedding_extend import BertEmbeddingExtend
 3 | from enum import Enum
 4 | 
 5 | class Embedding_Type(Enum):
 6 |     glove = 0,
 7 |     bert = 1
 8 | 
 9 | class EmbeddingManagerCyd(object):
10 |     def __init__(self):
11 |         self.gloveEmbedding = GloveEmbedding()
12 |         self.bertEmbedding = BertEmbeddingExtend()
13 | 
14 |     def getEmbedding(self, sentence, type, isUseAveragePooling, isUseStopwords):
15 |         if type == Embedding_Type.glove:
16 |             return self.gloveEmbedding.getSentenceVectorCommon(sentence, isUseAveragePooling, isUseStopwords)
17 |         elif type == Embedding_Type.bert:
18 |             return self.bertEmbedding.getSenetnceEmbedding(sentence, isUseAveragePooling, isUseStopwords)
19 | 
20 |         return None
21 | 


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/glove_embedding.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from sentiment_analysis.utils import handle_text
 3 | from src import config
 4 | 
 5 | 
 6 | class GloveEmbedding(object):
 7 | 
 8 |     def __init__(self):
 9 |         '''
10 |         初始化函数
11 |         '''
12 |         self.embeddings_index = {}
13 |         self.embedding_dim_glove = 100
14 |         self.init_data()
15 | 
16 |     def init_data(self):
17 |         '''
18 |         初始化数据
19 |         :return:
20 |         '''
21 |         glovefile = open(config.glove_embedding_path, "r", encoding="utf-8")
22 | 
23 |         for line in glovefile:
24 |             values = line.split()
25 |             word = values[0]
26 |             coefs = np.asarray(values[1:], dtype='float16')
27 |             self.embeddings_index[word] = coefs
28 |         glovefile.close()
29 | 
30 | 
31 |     def get_embedding_matrix_glove(self, word):
32 |         """
33 |         获取glove词向量
34 |         :param word:
35 |         :return:
36 |         """
37 |         embedding_vector = self.embeddings_index.get(word)
38 |         if embedding_vector is not None:
39 |             return embedding_vector[:self.embedding_dim_glove]
40 |         return np.zeros(self.embedding_dim_glove)
41 | 
42 |     def getSentenceVectorCommon(self, sentence, isUseAveragePooling, isUseStopwords):
43 |         tokens = handle_text(sentence,isUseStopwords)
44 |         total_effect_count = 0
45 |         w_v = []
46 |         for word in tokens:
47 |             if word in self.embeddings_index:
48 |                 total_effect_count += 1
49 |                 w_v.append(self.embeddings_index[word])
50 | 
51 |         w_v = np.array(w_v)
52 | 
53 |         is_effect = total_effect_count > 0
54 |         if  is_effect:
55 |             if isUseAveragePooling:
56 |                 w_v = np.sum(w_v, axis=0) / total_effect_count
57 |             else:
58 |                 w_v = np.max(w_v, axis=0)
59 |         else:
60 |             w_v = np.zeros(self.embedding_dim_glove)
61 | 
62 |         return np.array(w_v)
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/sentiment_model.py:
--------------------------------------------------------------------------------
 1 | import sentiment_analysis.glove_embedding
 2 | from sentiment_analysis.embedding_manager_cyd import EmbeddingManagerCyd
 3 | from sentiment_analysis.embedding_manager_cyd import Embedding_Type
 4 | from sklearn.externals import joblib #jbolib模块
 5 | from nltk import pos_tag, word_tokenize
 6 | import sentiment_analysis.utils
 7 | from sentiment_analysis.utils import handle_text
 8 | import numpy as np
 9 | from src import config
10 | 
11 | class SentimentModel(object):
12 |     def __init__(self):
13 |         self.model = joblib.load(config.svm_model_save_path)
14 |         self.embeddingManagerCyd = EmbeddingManagerCyd()
15 | 
16 |     def predict_prob(self, review_segment):
17 |         vectors = [self.embeddingManagerCyd.getEmbedding(review_segment, Embedding_Type.glove, True, False)]
18 |         features = np.array(vectors, dtype=np.float16)
19 |         proba_value = self.model.predict_proba(features)
20 |         score = proba_value[:, 1]
21 |         return np.float16(score[0])
22 | 
23 | 
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/src/exe/sentiment_analysis/utils.py:
--------------------------------------------------------------------------------
 1 | from nltk import pos_tag, word_tokenize
 2 | from nltk.corpus import stopwords
 3 | import config
 4 | 
 5 | stopwords = {line.rstrip().lower(): None for line in open(config.en_stopwords_path)}
 6 | 
 7 | def handle_text(text, isUseStopWords):
 8 |     if isUseStopWords:
 9 | #        new_word_list = [word for word in word_tokenize(text) if word not in stopwords.words('english')]
10 |         new_word_list = [word for word in word_tokenize(text) if word.lower() not in stopwords]
11 |     else:
12 |         new_word_list = [word for word in word_tokenize(text)]
13 | 
14 |     return new_word_list
15 | 


--------------------------------------------------------------------------------
/src/exe/worddiscovery/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__init__.py


--------------------------------------------------------------------------------
/src/exe/worddiscovery/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/worddiscovery/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/__init__.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/worddiscovery/__pycache__/entropy_based.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/entropy_based.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/worddiscovery/__pycache__/trie.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/trie.cpython-36.pyc


--------------------------------------------------------------------------------
/src/exe/worddiscovery/__pycache__/trie.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/exe/worddiscovery/__pycache__/trie.cpython-37.pyc


--------------------------------------------------------------------------------
/src/exe/worddiscovery/entropy_based.py:
--------------------------------------------------------------------------------
  1 | # -*- encoding: utf-8 -*-
  2 | # Author: lujiaying93@foxmail.com
  3 | # Algorithm source from: http://www.matrix67.com/blog/archives/5044
  4 | 
  5 | from __future__ import division
  6 | import os
  7 | import sys
  8 | cur_dir = os.path.dirname(os.path.realpath(__file__))
  9 | sys.path.append("%s/../" % (cur_dir))
 10 | 
 11 | import math
 12 | import re
 13 | import time
 14 | from collections import defaultdict
 15 | from worddiscovery.trie import CharTrie
 16 | 
 17 | import logging
 18 | log_console = logging.StreamHandler(sys.stderr)
 19 | default_logger = logging.getLogger(__name__)
 20 | default_logger.setLevel(logging.DEBUG)
 21 | default_logger.addHandler(log_console)
 22 | 
 23 | MAX_INT = 9223372036854775807
 24 | RE_SENTENCE_SEPERATOR = r'[\n\r]\s*'
 25 | RE_PUNCTUATION_TO_CLEAN = r'[.:;?!\~,\-_()[\]<>。：；？！~，、——（）【】《》＃＊＝＋/｜‘’“”￥#*=+\\|\'"^$%`]'
 26 | 
 27 | 
 28 | class EntropyBasedWorddiscovery(object):
 29 |     def __init__(self, word_max_len=6):
 30 |         self._trie = CharTrie()
 31 |         self._trie_reversed = CharTrie()  # for left char entropy calculate
 32 |         self._word_info = defaultdict(dict)
 33 |         self.word_max_len = word_max_len
 34 | 
 35 |         self.WORD_MIN_LEN = 2
 36 |         self.WORD_MIN_FREQ = 2
 37 |         self.WORD_MIN_PMI = 6
 38 |         self.WORD_MIN_NEIGHBOR_ENTROPY = 0
 39 | 
 40 |     def clear(self):
 41 |         self._trie.clear()
 42 |         self._trie_reversed.clear()
 43 |         self._word_info = defaultdict(dict)
 44 | 
 45 |     def parse_file(self, file_name, debug=False):
 46 |         with open(file_name) as fopen:
 47 |             document_text = fopen.read()
 48 |             self.parse(document_text, debug)
 49 | 
 50 |     def parse(self, document_text, debug=False):
 51 |         self.clear()
 52 |         sentences = self._preprocess(document_text)
 53 |         self._build_trie(sentences)
 54 |         self.cal_aggregation(debug)
 55 |         self.cal_neighbor_char_entropy(debug)
 56 |         self.cal_score(debug)
 57 | 
 58 |     def get_new_words(self, top=20):
 59 |         default_logger.debug("Start sorting to get new words...")
 60 |         start_t = time.time()
 61 |         sorted_word_info = sorted(self._word_info.items(), key=lambda _: _[1]['score_freq'], reverse=True)
 62 |         default_logger.debug("Get new words, which cost %.3f seconds" % (time.time()-start_t))
 63 |         top_new_words = [_[0] for _ in sorted_word_info[:top]]
 64 |         return top_new_words
 65 | 
 66 |     def cal_aggregation(self, debug):
 67 |         default_logger.debug("Calculating word internal aggregation score...")
 68 |         start_t = time.time()
 69 |         for word, count in self._trie.get_all_words():
 70 |             if len(word) < self.WORD_MIN_LEN or count < self.WORD_MIN_FREQ:
 71 |                 continue
 72 |             pmi = self._cal_word_aggregation(word, count)
 73 |             if debug:
 74 |                 self._word_info[word]['aggreg'] = self._cal_word_aggregation(word, count)
 75 |             else:
 76 |                 if pmi > self.WORD_MIN_PMI:
 77 |                     self._word_info[word]['aggreg'] = self._cal_word_aggregation(word, count)
 78 |         default_logger.debug("Internal aggregation has been calculated succesfully, which costs %.3f seconds" % (time.time()-start_t))
 79 | 
 80 |     def cal_neighbor_char_entropy(self, debug):
 81 |         default_logger.debug("Calculating word neighbor entropy score...")
 82 |         start_t = time.time()
 83 |         for word, count in self._trie.get_all_words():
 84 |             if len(word) < self.WORD_MIN_LEN or count < self.WORD_MIN_FREQ:
 85 |                 continue
 86 |             if not debug:
 87 |                 if word not in self._word_info:  # to speed up
 88 |                     continue
 89 |             rc_entropy = self._cal_word_neighbor_char_entropy(self._trie, word)
 90 |             if not debug:
 91 |                 if rc_entropy <= self.WORD_MIN_NEIGHBOR_ENTROPY:   # to speed up
 92 |                     self._word_info.pop(word)
 93 |                     continue
 94 |             lc_entropy = self._cal_word_neighbor_char_entropy(self._trie_reversed, word[::-1])
 95 |             neighbor_entropy = min(rc_entropy, lc_entropy)
 96 |             if debug:
 97 |                 self._word_info[word]['nbr_entropy'] = neighbor_entropy
 98 |                 self._word_info[word]['rc_entropy'] = rc_entropy
 99 |                 self._word_info[word]['lc_entropy'] = lc_entropy
100 |             else:
101 |                 if neighbor_entropy > self.WORD_MIN_NEIGHBOR_ENTROPY:
102 |                     self._word_info[word]['nbr_entropy'] = neighbor_entropy
103 |                 else:
104 |                     self._word_info.pop(word)
105 |         default_logger.debug("Neighbor entropy has been calculated succesfully, which costs %.3f seconds" % (time.time()-start_t))
106 | 
107 |     def cal_score(self, debug):
108 |         for word, d in self._word_info.items():
109 |             self._word_info[word]['score'] = d['aggreg'] + d['nbr_entropy']
110 |             if debug:
111 |                 if d['nbr_entropy'] <= self.WORD_MIN_NEIGHBOR_ENTROPY:
112 |                     self._word_info[word]['score'] = 0.0
113 |             self._word_info[word]['score_freq'] = d['score'] * self._trie.find(word)
114 | 
115 |     def _build_trie(self, sentences):
116 |         default_logger.debug("Building trie tree...")
117 |         start_t = time.time()
118 |         for s in sentences:
119 |             for n_grams in range(1, min(self.word_max_len+1, len(s)) + 1):
120 |                 if len(s) <= n_grams:
121 |                     self._trie.insert(s)
122 |                     self._trie_reversed.insert(s[::-1])
123 |                 else:
124 |                     for end_pos in range(n_grams, len(s) + 1):
125 |                         self._trie.insert(s[end_pos-n_grams:end_pos])
126 |                         self._trie_reversed.insert(s[end_pos-n_grams:end_pos][::-1])
127 |         default_logger.debug("Trie tree has been built succesfully, which costs %.3f seconds" % (time.time()-start_t))
128 | 
129 |     def _preprocess(self, document_text):
130 |         global RE_SENTENCE_SEPERATOR
131 |         global RE_PUNCTUATION_TO_CLEAN
132 |         # split to sentence
133 |         sentences = re.split(RE_SENTENCE_SEPERATOR, document_text)
134 |         # clean
135 |         sentences_clean = []
136 |         for s in sentences:
137 |             s = re.sub(RE_PUNCTUATION_TO_CLEAN, '', s)
138 |             if not s:
139 |                 continue
140 |             sentences_clean.append(s)
141 |         return sentences_clean
142 | 
143 |     def _cal_word_aggregation(self, word, word_count):
144 |         min_aggregation = MAX_INT
145 |         for frag1, frag2 in self._generate_word_fragment(word):
146 |             frag1_count = self._trie.find(frag1)
147 |             frag2_count = self._trie.find(frag2)
148 |             aggregation = word_count * self._trie.total_word_count / frag1_count / frag2_count
149 |             min_aggregation = min(min_aggregation, aggregation)
150 |         return math.log2(min_aggregation)
151 | 
152 |     def _generate_word_fragment(self, word):
153 |         for pos in range(1, len(word)):
154 |             yield (word[0:pos], word[pos:len(word)])
155 | 
156 |     def _cal_word_neighbor_char_entropy(self, trie_tree, word):
157 |         children_count_list = []
158 |         for char, char_count in trie_tree.get_children_char_count(word):
159 |             children_count_list.append(char_count)
160 |         total_word_count = sum(children_count_list)
161 |         entropy = sum(map(lambda c: -(c/total_word_count)*math.log2(c/total_word_count), children_count_list))
162 |         return entropy
163 | 
164 | if __name__ == '__main__':
165 |     discover = EntropyBasedWorddiscovery(word_max_len=6)
166 | 
167 |     discover.parse("""
168 |     自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此，这一领域的研究将涉及自然语言，即人们日常使用的语言，所以它与语言学的研究有着密切的联系，但又有重要的区别。自然语言处理并不是一般地研究自然语言，而在于研制能有效地实现自然语言通信的计算机系统，特别是其中的软件系统。因而它是计算机科学的一部分。
169 | 自然语言处理（NLP）是计算机科学，人工智能，语言学关注计算机和人类（自然）语言之间的相互作用的领域。
170 |    """, debug=True)
171 | 
172 |     #for word, count in discover._trie.get_all_words():
173 |     #    print(word, count)
174 |     #for node, prefix in discover._trie.traverse():
175 |     #    print(node, prefix)
176 |     for word, d in discover._word_info.items():
177 |         print(word, d['aggreg'], d['nbr_entropy'], discover._trie.find(word))
178 | 
179 |     print('\n'.join(discover.get_new_words(10)))
180 | 


--------------------------------------------------------------------------------
/src/exe/worddiscovery/test.txt:
--------------------------------------------------------------------------------
1 | 自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此，这一领域的研究将涉及自然语言，即人们日常使用的语言，所以它与语言学的研究有着密切的联系，但又有重要的区别。自然语言处理并不是一般地研究自然语言，而在于研制能有效地实现自然语言通信的计算机系统，特别是其中的软件系统。因而它是计算机科学的一部分。
2 | , 10
3 | , 5


--------------------------------------------------------------------------------
/src/exe/worddiscovery/trie.py:
--------------------------------------------------------------------------------
 1 | _SENTINEL = object()
 2 | 
 3 | class _TrieNode(object):
 4 |     __slots__ = ('children', 'value', 'count')
 5 | 
 6 |     def __init__(self):
 7 |         self.children = {}
 8 |         self.value = _SENTINEL
 9 |         self.count = 0
10 | 
11 |     def __repr__(self):
12 |         return '_TrieNode<%s>: value[%s], count[%d]' % (id(self), self.value, self.count)
13 | 
14 | 
15 | class CharTrie(object):
16 |     def __init__(self):
17 |         self._root = _TrieNode()
18 |         self.total_word_count = 0
19 | 
20 |     def insert(self, text):
21 |         node = self._root
22 |         for c in text:
23 |             if c not in node.children:
24 |                 node.children[c] = _TrieNode()
25 |                 node.children[c].value = c
26 |             node = node.children[c]
27 |         node.count += 1
28 |         self.total_word_count += 1
29 | 
30 |     def delete(self, text):
31 |         pass
32 | 
33 |     def find(self, text):
34 |         """
35 |         Args:
36 |             text: string
37 |         Returns:
38 |             count: int, frequent of text
39 |         """
40 |         is_in = True
41 |         node = self._root
42 |         for c in text:
43 |             if c not in node.children:
44 |                 is_in = False
45 |                 break
46 |             node = node.children[c]
47 |         if is_in:
48 |             return node.count
49 |         else:
50 |             return -1
51 | 
52 |     def traverse(self):
53 |         Q = [(self._root, '')]
54 | 
55 |         while Q:
56 |             node, prefix = Q.pop(0)
57 |             for child in node.children.values():
58 |                 yield (child, prefix)
59 |                 Q.append((child, prefix+child.value))
60 | 
61 |     def get_all_words(self):
62 |         for node, prefix in self.traverse():
63 |             yield (prefix+node.value, node.count)
64 | 
65 |     def get_children_char_count(self, text):
66 |         """
67 |         function for entropy based word discovery
68 |         """
69 |         is_in = True
70 |         node = self._root
71 |         for c in text:
72 |             if c not in node.children:
73 |                 is_in = False
74 |                 break
75 |             node = node.children[c]
76 | 
77 |         children = []
78 |         if is_in:
79 |             for child in node.children.values():
80 |                 children.append((child.value, child.count))
81 |         return children
82 | 
83 |     def clear(self):
84 |         self._root = _TrieNode()
85 | 


--------------------------------------------------------------------------------
/src/image/wordcloud_62068.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/image/wordcloud_62068.png


--------------------------------------------------------------------------------
/src/model/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/__init__.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/abstract_textrank.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/abstract_textrank.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/bert_embedding_extend.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/bert_embedding_extend.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/compute_keywords_tfidf.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/compute_keywords_tfidf.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/config.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/config.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/create_wordcloud.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/create_wordcloud.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/embedding_manager_cyd.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/embedding_manager_cyd.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/glove_embedding.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/glove_embedding.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/keywords_textrank.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/keywords_textrank.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/review_sentiment_analysis.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/review_sentiment_analysis.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/sentence_similarity.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/sentence_similarity.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/sentiment_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/sentiment_model.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/textrank.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/textrank.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/topic_cluster_lda.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/topic_cluster_lda.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/__pycache__/utils.cpython-36.pyc


--------------------------------------------------------------------------------
/src/model/logistic_reg_clf_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/logistic_reg_clf_model.pkl


--------------------------------------------------------------------------------
/src/model/svm_clf.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/svm_clf.pkl


--------------------------------------------------------------------------------
/src/model/svm_clf_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/model/svm_clf_model.pkl


--------------------------------------------------------------------------------
/src/save/keyinfo_from_input_file.txt:
--------------------------------------------------------------------------------
1 | 摘要：
2 | 人工智能技术在疫情防控的各个应用场景中都可发挥重要作用，这些应用场景都能直接为患者或者潜在的患者人群带来切实好处。 另外，这还可避免放射科医生以及临床医生被别人感染，降低他们的安全风险。 人工智能技术在防疫抗疫工作中大显身手  发布时间：2020-02-25 来源：人工智能实验室  近期，新型冠状病毒肺炎（简称“新冠肺炎”）的疫情突如其来，让人们有些措手不及。
3 | 关键词：
4 | 人工智能, 肺炎, 技术, 新冠, 疫情, 诊断, 疫苗, 进行, 医生, 利用


--------------------------------------------------------------------------------
/src/save/keyinfo_from_input_text.txt:
--------------------------------------------------------------------------------
1 | 摘要：
2 | 新小分子先导化合物生成中的 deep learning。 总结  在中国此次抗击疫情过程中，人工智能和大数据发挥了巨大的作用。 前面提到的AI新药研发公司Insilico Medicine，2020年2月6日，在官网上公开了其AI算法设计的6种可以阻止病毒复制的新分子结构。
3 | 关键词：
4 | 药物, 人工智能, 技术, 筛选, 靶点, 发现, 病毒, 数据, 利用, 临床


--------------------------------------------------------------------------------
/src/save/keyinfo_from_url.txt:
--------------------------------------------------------------------------------
1 | 摘要：
2 | 民航局于今日下发的通知，旨在响应1月25日中共中央办公厅、国务院办公厅下发的《关于做好人民群众就地过年服务保障工作的通知》，进一步扩大了免费退改政策的适用范围，落实春节假期非必要不流动的号召。 此前，民航局1月2日下发了《关于切实做好疫情常态化防控形势下客票退改服务工作的通知》，针对国内出现局部聚集性疫情的地区，要求各航空公司及时制定发布客票免费退改方案，并加强信息告知和宣传。 具体退改规则为：乘机日期在1月28日至2月3日的旅客，自1月27日0时起至航班起飞前可提出退票或改期申请。
3 | 关键词：
4 | 通知, 退改, 疫情, 民航局, 下发, 退票, 改期, 流动, 具体, 免费


--------------------------------------------------------------------------------
/src/save/keyinfo_input_file.txt:
--------------------------------------------------------------------------------
1 | D:\PythonJupyterNootebook\My NLP projects\My projects\NLPVisualizationSystem\save\text.txt


--------------------------------------------------------------------------------
/src/save/keyinfo_input_text.txt:
--------------------------------------------------------------------------------
1 | 原标题：AI技术在新冠肺炎药物发现中的应用  背景  2020年2月26日，《麻省理工学院技术评论》发布的2020年“全球十大突破性技术”中，人工智能筛选分子入选。那么什么是筛选分子呢？  伴随着新型冠状病毒肺炎（COVID-19）的爆发，相信大家对这个概念并不陌生。最早提到这个概念的消息如下：2020年1月25日，中国科学院上海药物研究所和上海科技大学联合研究团队综合利用虚拟筛选和酶学测试相结合的策略，发现了一批可能对新型肺炎有治疗作用的老药和中药。其中包括后来人人都耳熟能详的洛匹那韦和瑞德西韦。  分子筛选是药物开发流程（图1）中的一个步骤：是指从大量化合物中选择对某一特定靶点有活性化合物的过程。由图1可见，新药研发过程复杂漫长，面对突然爆发的新型冠状病毒肺炎，全新药物设计很不现实。因此利用人工智能技术从已有药物中发现对新型冠状病毒(SARS-CoV-2)有抑制作用的药物显得非常迫切。  人工智能与新药之路  人工智能（Artificial Intelligent, AI）用于药物发现是基于计算机辅助药物设计（Computer-aided Drug Design,CADD），然后结合化学信息、生物信息中的大量数据建立优质的机器学习模型，在靶点筛选、分子结构/化学空间分析、配体-受体相互作用模拟、药物三维定量构效关系（3D-QSAR）分析等过程中指导先导化合物的发现和优化。  另外，在药物临床阶段及批准上市后也有人工智能的使用。比如诺华利用人工智能从多个内部数据源抓取临床数据，用于预测和监控临床试验的患者招募、花费和质量。诺华宣称该技术的应用，使得患者招募时间缩短了10-15%。  图2是人工智能医药公司Insilico Medicine在药物研发各环节中人工智能技术的应用：包括在靶点发现阶段的deep feature selection,NLP；新小分子先导化合物生成中的 deep learning；以及对小分子临床结果的预测等。  图2 Insilico Medicine 的AI药物研发之路  新型冠状病毒肺炎药物中的人工智能  由图1可知，新药研发过程复杂漫长，面对突然爆发的疫情，人工智能技术的应用变得非常重要。自疫情爆发以来，已公开了大量关于利用人工智能和大数据发现有效药物的研究报导。这些研究主要集中在靶点发现、疾病网络构建和药物筛选。  2020年1月29日，燧坤智能应急小组利用人工智能文本挖掘技术，完成了对13139个已有药物分子，2000余万篇文献和1960万摘要的挖掘，输出了数十个已报道对SARS、MERS等冠状病毒有抑制效果的药物化合物。  2月3日，华中科技大学同济医学院等医院和研究所与华为云联合科研团队宣布，筛选出五种可能对2019新型冠状病毒（2019-nCoV）有效的抗病毒药物。分别是Beclabuvir，沙奎那韦（Saquinavir），比特拉韦（Bictegravir），洛匹那韦（Lopinavir），多替拉韦（Dolutegravir）。联合科研团队针对SARS-CoV-2的多个靶标蛋白对8506种上市或者正在进行临床试验的药物中进行超大规模计算机辅助药物筛选工作，并在一周内取得了第一阶段成果。  某合资医药公司首先通过序列相似性对比找到同源性较大的病毒序列，以此为关键词在公共平台中寻找已发表文献或相关靶点数据，利用数据挖掘进行实体识别和关系抽取；同时通过传统的数据库检索，查找相关靶点；然后将两部分的结果做加权，最后输出高置信靶点结果86个。然后构建病毒特异性网络，挖掘高置信度信号通路24条及病毒作用的核心模块：T细胞受体途径、内吞作用、趋化因子途径、C型凝集受体途径、JAK-STAT途径。最后，对超过8000种已知药物与病毒网络结合起来进行药物筛选、过滤，共得到78个对SARS-CoV-2有抑制作用的药物，包括氯喹、阿巴卡韦、穿心莲内酯等。  2月14日，广东省钟南山医学基金会、广州呼吸健康研究院与阿里云达成合作，加速推进新冠病毒的临床救治关键技术、有效药物和疫苗研发等工作。阿里云将提供超大规模计算力、AI算法等技术，支持钟南山团队的科研人员加快开展对新冠病毒的新药研发、病毒基因测序、蛋白筛选等相关工作。  此外，除了靶点发现、疾病网络构建和药物筛选，也有利用人工智能技术生成新的小分子。前面提到的AI新药研发公司Insilico Medicine，2020年2月6日，在官网上公开了其AI算法设计的6种可以阻止病毒复制的新分子结构。  总结  在中国此次抗击疫情过程中，人工智能和大数据发挥了巨大的作用。也使更多的企业和科研院所进一步认识到了人工智能技术对药物发现不可或缺的作用。  事实上，从本次疫情可以看到中国在算法和硬件方面已经达到世界先进水平，全球健康药物研发中心GHDDI正与阿里云合作开发人工智能药物研发和大数据平台：针对SARS/MERS等冠状病毒的历史药物研发进行数据挖掘与集成，开放相关临床前和临床数据资源，计算靶点和药物分子性质，并跟进新型冠状病毒最新科研动态。  或许，本次疫情会改变国内药企对人工智能的态度，翻开人工智能在中国助力药物研发的新篇章。返回搜狐，查看更多  责任编辑：


--------------------------------------------------------------------------------
/src/save/keyinfo_input_url.txt:
--------------------------------------------------------------------------------
1 | https://baijiahao.baidu.com/s?id=1689928103313263522&wfr=spider&for=pc


--------------------------------------------------------------------------------
/src/save/new_word_discovery_input_file.txt:
--------------------------------------------------------------------------------
1 | 自然语言处理是计算机科学领域与人工智能领域中的一个重要方向。它研究能实现人与计算机之间用自然语言进行有效通信的各种理论和方法。自然语言处理是一门融语言学、计算机科学、数学于一体的科学。因此，这一领域的研究将涉及自然语言，即人们日常使用的语言，所以它与语言学的研究有着密切的联系，但又有重要的区别。自然语言处理并不是一般地研究自然语言，而在于研制能有效地实现自然语言通信的计算机系统，特别是其中的软件系统。因而它是计算机科学的一部分。
2 | , 10
3 | , 5


--------------------------------------------------------------------------------
/src/save/new_word_discovery_output.txt:
--------------------------------------------------------------------------------
 1 | 1 语言
 2 | 2 自然语言
 3 | 3 计算机
 4 | 4 研究
 5 | 5 科学
 6 | 6 领域
 7 | 7 计算机科学
 8 | 8 重要
 9 | 9 实现
10 | 10 系统


--------------------------------------------------------------------------------
/src/save/review_summary.txt:
--------------------------------------------------------------------------------
 1 | business_id: 1
 2 | business_name: Castor EDC
 3 | business_rating: 0.6000312926277281
 4 | average_user_rating: {'rating_overall': 4.72093023255814, 'rating_ease_of_use': 4.651162790697675, 'rating_customer_support': 4.953488372093024, 'rating_features_functionality': 4.5813953488372094, 'rating_value_for_money': 4.790697674418603, 'rating_likelihood_to_recommend': 4.697674418604652}
 5 | aspect_summary: 
 6 | ------------------research------------------
 7 | rating: 0.47914341517857145
 8 | pos: research we selected Castor EDC because it seemed easy;  research easier
 9 | neg: research studies;  research data managemen;  research forms ;  research with substantial;  Overall: We used it to collect data for medical research in several centers
10 | ------------------software------------------
11 | rating: 0.7059326171875
12 | pos: software is easy;  software is very intuitive;  software is really intuitive;  software keeps being developed 
13 | neg: software with those characteristics so that collaborators won't have problems;  Comments: We have just started using the software. We found it very easy to set up on our own and use it. There are some possibilities we would like to see added, but it might be we just missed them. ;  software have occure;  Pros: The software is very practical and userfriendly.;  software for research
14 | ------------------database------------------
15 | rating: 0.6578369140625
16 | pos: database is easy;  database builder to create your own;  Clear and user friendly database with many features 
17 | neg: databases to new;  database in a very structured;  database that will also create a user-friendly;  database to hav
18 | ------------------system------------------
19 | rating: 0.734222412109375
20 | pos: Very user friendly EDC system;  system is easy;  system can perform
21 | neg: system nxt to the other;  system that is affordable;  Love this system!
22 | ------------------study------------------
23 | rating: 0.4230211046006944
24 | pos: None
25 | neg: study on Castor;  study visit ;  study fell on me;  study ID numbering is a bit clunky;  study very easily without any complex
26 | 


--------------------------------------------------------------------------------
/src/save/testtext.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/save/testtext.txt


--------------------------------------------------------------------------------
/src/save/text.txt:
--------------------------------------------------------------------------------
1 | 人工智能技术在防疫抗疫工作中大显身手  发布时间：2020-02-25 来源：人工智能实验室  近期，新型冠状病毒肺炎（简称“新冠肺炎”）的疫情突如其来，让人们有些措手不及。但是为了实现更好的防疫抗疫效果，不少研究人员纷纷应用诸多技术手段来抗击疫情。其中人工智能技术已成为这场防疫抗疫攻坚战的有力武器之一；它在疫情防控、图像分析、辅助诊断、疫苗研发、新药研制等方面助力防疫抗疫工作。  在疫情防控方面  新冠肺炎来势汹汹，但是它依然可防可控。采取有效的措施预防，戴口罩、勤洗手、居家隔离等都是非常行之有效的方法。例如戴口罩是预防传染病最重要、最有效的防控手段之一，可以有效降低感染新冠肺炎的风险。又如体温筛检是此次疫情中筛查排查可疑病例的一个手段。人工智能技术在疫情防控的各个应用场景中都可发挥重要作用，这些应用场景都能直接为患者或者潜在的患者人群带来切实好处。  北京旷视科技有限公司最近推出一套用于发热及潜在被感染对象识别、筛查与分析的人工智能新系统“明骥”。该系统通过前端红外相机，鉴别人流中的高温人员，再根据疑似发烧者的人体、人脸信息，利用人工智能技术辅助工作人员快速定位体温异常者；做到了在佩戴口罩的情况下，也能精准锁定。目前，“明骥”已应用在地铁、火车站、机尝集中办公区等人流量较大的区域。  在图像分析方面  医疗影像数据是医疗数据的重要组成部分，人工智能技术能够通过快速准确地标记新冠肺炎的特定异常结构来提高图像分析的效率，以供放射科医生参考。提高图像分析效率，可让放射科医生腾出更多的时间聚焦在需要更多解读或判断的内容审阅上，从而有望缓解他们供给缺口问题。另外，这还可避免放射科医生以及临床医生被别人感染，降低他们的安全风险。  上海人工智能研究院与杭州健培科技有限公司联合研发的新冠肺炎影像云检测平台最近正式上线，对全国医院进行免费影像云诊断服务，并对所有医疗机构和各级政府免费开放，将高效、准确地为放射科医生以及临床医生提供决策依据，助力疫情防控。新冠肺炎影像云检测平台上线后，能够为临床一线抗疫医生疫情评估、肺炎性质判定、治疗方案制定提供高效精确的支撑依据。  在辅助诊断方面  医疗诊断是一个综合考虑各种影响因素的判断过程；利用人工智能技术辅助诊断新冠肺炎，能够在短时间内精准地预判病情，对提高患者预后具有重要作用。人工智能技术辅助诊断的功能既可以精确分割CT扫描部位的病灶；还可以对病灶的CT影像做分析，找出疑似病变和组织结构的异常，并给出诊断方向。在质控及病变识别方面，具有更为宽泛的使用范围。  在CT影像快速诊断方面，北京推想科技与武汉同济医院、深圳市第三人民医院合作研发针对新冠肺炎特别版，该版利用人工智能技术的深度学习、图像识别等对检出的病灶进行测量、密度分析，支持患者前后片对照，提供量化数据对比结果，帮助医生更快完成疑似患者诊断。北京安德医智联合解放军总医院正在研发新冠肺炎CT影像人工智能辅助诊断系统，免费提供给全国各级医院使用。  在疫苗研发方面  随着疫情持续，很多民众非常关心新冠肺炎的疫苗研发进展。据介绍，无论是对病毒进行基因测序，找到病毒来源以及传播宿主，还是研发病毒疫苗，人工智能技术都大有用武之地。例如传统的疫苗研发需在实验室中对数百种药物成分进行生物测试，这一过程往往要耗费不少时间；而人工智能技术可以极大加速这个过程，能够让更多的人获得疫苗的保护。  浙江大学研究团队最近利用人工智能技术在已有的药物中找到两种抗击疫情药物，从而使疫苗的研发工作取得了阶段性的成果。这两种药物有可能成为新冠肺炎候选疫苗，目前正在进行临床试验。据了解,将人工智能技术用于筛选和研发疫苗,能够帮助研究人员在已有的药物中快速找到可能对预防新冠肺炎有效的生物制品。  在新药研制方面  新冠肺炎的临床表现以发热﹑乏力﹑干咳为主要表现；而随着疾病的进展会出现急性呼吸窘迫综合征、难以纠正的代谢性酸中毒等，需要给予积极有效的治疗。但是目前还没有明确的特效药能够治疗新冠肺炎，只能根据患者的一般情况进行对症治疗，预防继发的感染，及时进行器官的功能支持。不过研究人员正在利用人工智能技术研制针对该病的特效药，新药很快就会问世。  美国麻省理工学院研究团队近日利用人工智能技术发现一种新型抗生素，它可以杀灭多种致病细菌，包括一些对所有已知抗生素都具耐药性的细菌菌株。研究人员通过让机器学习算法在几天内充分筛查庞大数据库中逾1亿种化合物，终于发现了这种抗生素；该抗生素被认为能有效抑制大肠杆菌，对治疗新冠肺炎也有效。  由上可知，人工智能技术正在新冠肺炎的防疫抗疫工作中大显身手。可以预料，作为一种综合性极强的技术，人工智能将在医疗健康领域内得到越来越多的应用,并将成为影响医学行业发展的重要科技手段。正如我国著名学者周海中教授曾经指出的那样：“随着社会的发展和科技的进步，人工智能技术将在医疗健康领域大显身手；其成果会不断涌现，应用前景令人期待。”
2 | 


--------------------------------------------------------------------------------
/src/save/topic_input_file.txt:
--------------------------------------------------------------------------------
1 | D:\Github\NLPVisualizationSystem\src\save\save_article.txt


--------------------------------------------------------------------------------
/src/save/topic_keywords_dist.txt:
--------------------------------------------------------------------------------
  1 | Number of topics	2
  2 | Topic 0	病变  药物  病毒  公司  疫苗  患者  技术  诊断  制药  人工智能  平安  氯喹  西韦  瑞德  辅助  一种  新药  医药  科技  治疗  感染  新型  成本  细胞  影像
  3 | Topic 1	药物  平安  公司  病变  诊断  患者  制药  这种  瑞德  一种  疫苗  新药  西韦  人工智能  技术  郭佑民  辅助  能够  病毒  试验  找到  肺部  风湿性关节炎  抑制  诊断系统
  4 | Document 1	[(1, 0.9984122)]
  5 | Document 2	[(1, 0.99875927)]
  6 | Document 3	[(1, 0.998737)]
  7 | Document 4	[(1, 0.998796)]
  8 | Document 5	[(1, 0.99632406)]
  9 | Document 6	[(1, 0.991446)]
 10 | Document 7	[(0, 0.03731014), (1, 0.9626899)]
 11 | Document 8	[(0, 0.1481864), (1, 0.8518136)]
 12 | Document 9	[(0, 0.2686373), (1, 0.7313627)]
 13 | Document 10	[(0, 0.18562442), (1, 0.81437564)]
 14 | Document 11	[(0, 0.18398936), (1, 0.8160106)]
 15 | Document 12	[(0, 0.32994798), (1, 0.67005205)]
 16 | Document 13	[(0, 0.052803226), (1, 0.9471968)]
 17 | Document 14	[(1, 0.99649817)]
 18 | Document 15	[(1, 0.9962901)]
 19 | Document 16	[(0, 0.016117718), (1, 0.98388225)]
 20 | Document 17	[(1, 0.9963944)]
 21 | Document 18	[(1, 0.99775714)]
 22 | Document 19	[(0, 0.33902404), (1, 0.660976)]
 23 | Document 20	[(1, 0.99305475)]
 24 | Document 21	[(1, 0.99877584)]
 25 | Document 22	[(0, 0.0373046), (1, 0.9626954)]
 26 | Document 23	[(1, 0.99497014)]
 27 | Document 24	[(1, 0.99875236)]
 28 | Document 25	[(1, 0.99881166)]
 29 | Document 26	[(1, 0.99876165)]
 30 | Document 27	[(0, 0.27710757), (1, 0.72289246)]
 31 | Document 28	[(0, 0.973271), (1, 0.026728965)]
 32 | Document 29	[(0, 0.9737816), (1, 0.02621842)]
 33 | Document 30	[(0, 0.9732813), (1, 0.02671867)]
 34 | Document 31	[(1, 0.99455696)]
 35 | Document 32	[(0, 0.43477303), (1, 0.565227)]
 36 | Document 33	[(0, 0.09642063), (1, 0.90357935)]
 37 | Document 34	[(0, 0.013210406), (1, 0.9867896)]
 38 | Document 35	[(0, 0.99591917)]
 39 | Document 36	[(0, 0.6590466), (1, 0.3409534)]
 40 | Document 37	[(1, 0.99653476)]
 41 | Document 38	[(0, 0.8884582), (1, 0.11154182)]
 42 | Document 39	[(1, 0.9984031)]
 43 | Document 40	[(1, 0.99875927)]
 44 | Document 41	[(1, 0.9987372)]
 45 | Document 42	[(1, 0.99879587)]
 46 | Document 43	[(1, 0.9963238)]
 47 | Document 44	[(1, 0.9919473)]
 48 | Document 45	[(0, 0.048582092), (1, 0.9514179)]
 49 | Document 46	[(0, 0.18444301), (1, 0.815557)]
 50 | Document 47	[(0, 0.29741976), (1, 0.7025802)]
 51 | Document 48	[(0, 0.14375831), (1, 0.8562417)]
 52 | Document 49	[(0, 0.21951777), (1, 0.78048223)]
 53 | Document 50	[(0, 0.20557976), (1, 0.79442024)]
 54 | Document 51	[(0, 0.04226604), (1, 0.957734)]
 55 | Document 52	[(1, 0.9964978)]
 56 | Document 53	[(1, 0.99628973)]
 57 | Document 54	[(0, 0.01421434), (1, 0.98578566)]
 58 | Document 55	[(1, 0.9964239)]
 59 | Document 56	[(1, 0.99775696)]
 60 | Document 57	[(0, 0.25308257), (1, 0.7469175)]
 61 | Document 58	[(1, 0.9926701)]
 62 | Document 59	[(1, 0.9987756)]
 63 | Document 60	[(0, 0.047722455), (1, 0.9522776)]
 64 | Document 61	[(1, 0.99478036)]
 65 | Document 62	[(1, 0.99875253)]
 66 | Document 63	[(1, 0.99881274)]
 67 | Document 64	[(1, 0.998761)]
 68 | Document 65	[(0, 0.25111613), (1, 0.7488839)]
 69 | Document 66	[(0, 0.95668525), (1, 0.043314744)]
 70 | Document 67	[(0, 0.97325945), (1, 0.026740566)]
 71 | Document 68	[(1, 0.9944634)]
 72 | Document 69	[(0, 0.4053822), (1, 0.5946178)]
 73 | Document 70	[(0, 0.113431476), (1, 0.88656855)]
 74 | Document 71	[(0, 0.6606846), (1, 0.33931538)]
 75 | Document 72	[(0, 0.013217282), (1, 0.98678267)]
 76 | Document 73	[(0, 0.99607855)]
 77 | Document 74	[(0, 0.6522202), (1, 0.34777978)]
 78 | Document 75	[(1, 0.99647295)]
 79 | Document 76	[(0, 0.91441846), (1, 0.08558151)]
 80 | Document 77	[(0, 0.96741784), (1, 0.032582123)]
 81 | Document 78	[(0, 0.1464981), (1, 0.8535019)]
 82 | Document 79	[(1, 0.99072385)]
 83 | Document 80	[(0, 0.24954157), (1, 0.7504584)]
 84 | Document 81	[(0, 0.7606445), (1, 0.23935558)]
 85 | Document 82	[(1, 0.9988935)]
 86 | Document 83	[(1, 0.99625957)]
 87 | Document 84	[(0, 0.39040664), (1, 0.60959333)]
 88 | Document 85	[(1, 0.9976985)]
 89 | Document 86	[(1, 0.9961334)]
 90 | Document 87	[(0, 0.17439792), (1, 0.82560205)]
 91 | Document 88	[(0, 0.037328243), (1, 0.9626718)]
 92 | Document 89	[(0, 0.37466863), (1, 0.62533134)]
 93 | Document 90	[(0, 0.104917355), (1, 0.89508265)]
 94 | Document 91	[(1, 0.9970127)]
 95 | Document 92	[(1, 0.9965256)]
 96 | Document 93	[(0, 0.06389082), (1, 0.9361092)]
 97 | Document 94	[(0, 0.12760386), (1, 0.8723961)]
 98 | Document 95	[(0, 0.8994774), (1, 0.10052254)]
 99 | Document 96	[(1, 0.99544394)]
100 | Document 97	[(0, 0.3909931), (1, 0.60900694)]
101 | Document 98	[(0, 0.22720024), (1, 0.77279973)]
102 | Document 99	[(0, 0.168411), (1, 0.831589)]
103 | Document 100	[(0, 0.9206198), (1, 0.07938017)]
104 | Document 101	[(0, 0.109019205), (1, 0.89098084)]
105 | Document 102	[(0, 0.80498064), (1, 0.19501936)]
106 | Document 103	[(0, 0.01335223), (1, 0.9866478)]
107 | Document 104	[(1, 0.9921854)]
108 | Document 105	[(1, 0.9976654)]
109 | Document 106	[(0, 0.35281095), (1, 0.6471891)]
110 | Document 107	[(0, 0.015055568), (1, 0.9849444)]
111 | Document 108	[(0, 0.19415398), (1, 0.805846)]
112 | Document 109	[(1, 0.99857014)]
113 | Document 110	[(0, 0.687761), (1, 0.312239)]
114 | Document 111	[(0, 0.0134905195), (1, 0.9865095)]
115 | Document 112	[(0, 0.9023951), (1, 0.09760485)]
116 | Document 113	[(0, 0.92524976), (1, 0.074750245)]
117 | Document 114	[(1, 0.9969824)]
118 | Document 115	[(0, 0.07226383), (1, 0.9277361)]
119 | Document 116	[(1, 0.9922549)]
120 | Document 117	[(0, 0.49086607), (1, 0.5091339)]
121 | Document 118	[(0, 0.047327124), (1, 0.9526729)]
122 | Document 119	[(0, 0.9909708)]
123 | Document 120	[(0, 0.8767328), (1, 0.123267174)]
124 | Document 121	[(0, 0.015171723), (1, 0.9848283)]
125 | Document 122	[(1, 0.9901243)]
126 | Document 123	[(1, 0.99853384)]
127 | Document 124	[(0, 0.9940778)]
128 | 


--------------------------------------------------------------------------------
/src/save/user_input_product_id_name.txt:
--------------------------------------------------------------------------------
1 | Castor EDC


--------------------------------------------------------------------------------
/src/save/userfile.txt:
--------------------------------------------------------------------------------
1 | D:\PythonJupyterNootebook\My NLP projects\My projects\NLPVisualizationSystem\save\text.txt


--------------------------------------------------------------------------------
/src/save/usertext.txt:
--------------------------------------------------------------------------------
1 | 人工智能技术在防疫抗疫工作中大显身手  发布时间：2020-02-25 来源：人工智能实验室  近期，新型冠状病毒肺炎（简称“新冠肺炎”）的疫情突如其来，让人们有些措手不及。但是为了实现更好的防疫抗疫效果，不少研究人员纷纷应用诸多技术手段来抗击疫情。其中人工智能技术已成为这场防疫抗疫攻坚战的有力武器之一；它在疫情防控、图像分析、辅助诊断、疫苗研发、新药研制等方面助力防疫抗疫工作。  在疫情防控方面  新冠肺炎来势汹汹，但是它依然可防可控。采取有效的措施预防，戴口罩、勤洗手、居家隔离等都是非常行之有效的方法。例如戴口罩是预防传染病最重要、最有效的防控手段之一，可以有效降低感染新冠肺炎的风险。又如体温筛检是此次疫情中筛查排查可疑病例的一个手段。人工智能技术在疫情防控的各个应用场景中都可发挥重要作用，这些应用场景都能直接为患者或者潜在的患者人群带来切实好处。  北京旷视科技有限公司最近推出一套用于发热及潜在被感染对象识别、筛查与分析的人工智能新系统“明骥”。该系统通过前端红外相机，鉴别人流中的高温人员，再根据疑似发烧者的人体、人脸信息，利用人工智能技术辅助工作人员快速定位体温异常者；做到了在佩戴口罩的情况下，也能精准锁定。目前，“明骥”已应用在地铁、火车站、机尝集中办公区等人流量较大的区域。  在图像分析方面  医疗影像数据是医疗数据的重要组成部分，人工智能技术能够通过快速准确地标记新冠肺炎的特定异常结构来提高图像分析的效率，以供放射科医生参考。提高图像分析效率，可让放射科医生腾出更多的时间聚焦在需要更多解读或判断的内容审阅上，从而有望缓解他们供给缺口问题。另外，这还可避免放射科医生以及临床医生被别人感染，降低他们的安全风险。  上海人工智能研究院与杭州健培科技有限公司联合研发的新冠肺炎影像云检测平台最近正式上线，对全国医院进行免费影像云诊断服务，并对所有医疗机构和各级政府免费开放，将高效、准确地为放射科医生以及临床医生提供决策依据，助力疫情防控。新冠肺炎影像云检测平台上线后，能够为临床一线抗疫医生疫情评估、肺炎性质判定、治疗方案制定提供高效精确的支撑依据。  在辅助诊断方面  医疗诊断是一个综合考虑各种影响因素的判断过程；利用人工智能技术辅助诊断新冠肺炎，能够在短时间内精准地预判病情，对提高患者预后具有重要作用。人工智能技术辅助诊断的功能既可以精确分割CT扫描部位的病灶；还可以对病灶的CT影像做分析，找出疑似病变和组织结构的异常，并给出诊断方向。在质控及病变识别方面，具有更为宽泛的使用范围。  在CT影像快速诊断方面，北京推想科技与武汉同济医院、深圳市第三人民医院合作研发针对新冠肺炎特别版，该版利用人工智能技术的深度学习、图像识别等对检出的病灶进行测量、密度分析，支持患者前后片对照，提供量化数据对比结果，帮助医生更快完成疑似患者诊断。北京安德医智联合解放军总医院正在研发新冠肺炎CT影像人工智能辅助诊断系统，免费提供给全国各级医院使用。  在疫苗研发方面  随着疫情持续，很多民众非常关心新冠肺炎的疫苗研发进展。据介绍，无论是对病毒进行基因测序，找到病毒来源以及传播宿主，还是研发病毒疫苗，人工智能技术都大有用武之地。例如传统的疫苗研发需在实验室中对数百种药物成分进行生物测试，这一过程往往要耗费不少时间；而人工智能技术可以极大加速这个过程，能够让更多的人获得疫苗的保护。  浙江大学研究团队最近利用人工智能技术在已有的药物中找到两种抗击疫情药物，从而使疫苗的研发工作取得了阶段性的成果。这两种药物有可能成为新冠肺炎候选疫苗，目前正在进行临床试验。据了解,将人工智能技术用于筛选和研发疫苗,能够帮助研究人员在已有的药物中快速找到可能对预防新冠肺炎有效的生物制品。  在新药研制方面  新冠肺炎的临床表现以发热﹑乏力﹑干咳为主要表现；而随着疾病的进展会出现急性呼吸窘迫综合征、难以纠正的代谢性酸中毒等，需要给予积极有效的治疗。但是目前还没有明确的特效药能够治疗新冠肺炎，只能根据患者的一般情况进行对症治疗，预防继发的感染，及时进行器官的功能支持。不过研究人员正在利用人工智能技术研制针对该病的特效药，新药很快就会问世。  美国麻省理工学院研究团队近日利用人工智能技术发现一种新型抗生素，它可以杀灭多种致病细菌，包括一些对所有已知抗生素都具耐药性的细菌菌株。研究人员通过让机器学习算法在几天内充分筛查庞大数据库中逾1亿种化合物，终于发现了这种抗生素；该抗生素被认为能有效抑制大肠杆菌，对治疗新冠肺炎也有效。  由上可知，人工智能技术正在新冠肺炎的防疫抗疫工作中大显身手。可以预料，作为一种综合性极强的技术，人工智能将在医疗健康领域内得到越来越多的应用,并将成为影响医学行业发展的重要科技手段。正如我国著名学者周海中教授曾经指出的那样：“随着社会的发展和科技的进步，人工智能技术将在医疗健康领域大显身手；其成果会不断涌现，应用前景令人期待。”
2 | 


--------------------------------------------------------------------------------
/src/save/userurl.txt:
--------------------------------------------------------------------------------
1 | https://baijiahao.baidu.com/s?id=1689928103313263522&wfr=spider&for=pc


--------------------------------------------------------------------------------
/src/save/wordcloud_from_input_file.txt:
--------------------------------------------------------------------------------
1 | D:\PythonJupyterNootebook\My NLP projects\My projects\NLPVisualizationSystem\image\wordcloud_40304.png


--------------------------------------------------------------------------------
/src/save/wordcloud_from_input_text.txt:
--------------------------------------------------------------------------------
1 | D:\PythonJupyterNootebook\My NLP projects\My projects\NLPVisualizationSystem\image\wordcloud_30399.png


--------------------------------------------------------------------------------
/src/save/wordcloud_from_url.txt:
--------------------------------------------------------------------------------
1 | D:\Github\NLPVisualizationSystem\src\image\wordcloud_62068.png


--------------------------------------------------------------------------------
/src/static.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/static.zip


--------------------------------------------------------------------------------
/src/templates.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SoulDGXu/NLPVisualizationSystem/789975cc8d42f54dd153dcc965760ccd3ced25b0/src/templates.zip


--------------------------------------------------------------------------------
/src/utils/data_prepare.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Fri Oct 30 10:33:25 2020
  4 | 
  5 | @author: Xu
  6 | """
  7 | # data prepare
  8 | import pandas as pd
  9 | import re
 10 | from src import config
 11 | import os
 12 | import numpy as np
 13 | from collections import Counter
 14 | 
 15 | 
 16 | 
 17 | ## part 2: DATA 2 (product)
 18 | df = pd.read_excel(config.product_data_path_2)
 19 | 
 20 | # add column: product id 
 21 | df['product_id'] = df.index
 22 | 
 23 | # remove extra spaces
 24 | df['deployment'] = df['deployment'].astype(str).map(lambda x: re.sub(r'\s+','',x))
 25 | df['deployment'] = df['deployment'].map(lambda x: np.nan if x=='nan' else x)
 26 | 
 27 | 
 28 | df['training'] = df['training'].astype(str).map(lambda x: re.sub(r'\s+','',x))
 29 | df['training'] = df['training'].map(lambda x: np.nan if x=='nan' else x)
 30 | 
 31 | df['support'] = df['support'].astype(str).map(lambda x: re.sub(r'\s+','',x))
 32 | df['support'] = df['support'].map(lambda x: np.nan if x=='nan' else x)
 33 | 
 34 | 
 35 | # add new feature:price
 36 | # handle unclearly defined fields
 37 | import locale
 38 | locale.setlocale(locale.LC_ALL,'English_US')
 39 | 
 40 | df['starting_price_method'] = df['starting_price'].astype(str).map(lambda x: re.split(r'/',x,maxsplit=1)[1] if len(re.split(r'/',x,maxsplit=1))>1 else 'Not provided by vendor')
 41 | df['starting_price_num'] =  df['starting_price'].astype(str).map(lambda x: locale.atof(re.sub(r'\$', '',re.split(r'/',x,maxsplit=1)[0])) if re.match(r'^\$\S+',re.split(r'/',x,maxsplit=1)[0])!=None else float('inf'))                                                           
 42 | 
 43 | # missing data process:'Not provided by vendor'
 44 | 
 45 | import random
 46 | 
 47 | def get_num_by_prob(num_list, prob_list):
 48 |         x = random.uniform(0, 1)
 49 |         cum_pro = 0.0
 50 |         for num, pro in zip(num_list, prob_list):
 51 |             cum_pro += pro
 52 |             if x < cum_pro:
 53 |                 return num
 54 |             
 55 | def get_num_by_prob_range(num_range_list, prob_list):
 56 |     x = random.uniform(0, 1)
 57 |     cum_pro = 0.0
 58 |     for num_range, pro in zip(num_range_list, prob_list):
 59 |         cum_pro += pro
 60 |         if x < cum_pro:
 61 |             num = np.random.uniform(num_range[0],num_range[1])
 62 |             return num
 63 |         
 64 | def get_uniform_random_num(low, high):
 65 |     num = np.random.uniform(low, high)
 66 |     return round(num,2)
 67 |         
 68 | def fill_missing_data(dist, dtype='str'):
 69 |     """
 70 |     It is used to generate random number or string based on 
 71 |     your given distribution. Here specifically refers to the frequency distribution,like:
 72 |     - dist = {'a':0.1,'b':0.2,'c':0.7}, dtype='str': return a random string,like 'c'
 73 |     - dist = {[0,10]: 0.8,[11,100]:0.2}, dtype='int':return a random integer, like 7
 74 |     - dist = {[0,10]: 0.8,[11,100]:0.2}, dtype='float':return a random decimal, like 20.5
 75 |     
 76 |     Parameters
 77 |     ----------
 78 |     dist: TYPE-dictionary
 79 |         DESCRIPTION: specifically refers tofrequency distribution. 
 80 |         The keys of the dictionary represent all possible random values,
 81 |         and the values represent the probability of obtaining each key.
 82 |         i.e. dist = {'a':0.1,'b':0.2,'c':0.7}, dist = {'a':1,'b':2,'c':7}
 83 |         If the value (frequency) is not a decimal, it is automatically converted to a decimal.
 84 |         
 85 |     dtype: TYPE-str
 86 |         DESCRIPTION: the data type of random value, default is 'str', options='str','int','float'.
 87 | 
 88 |     Returns
 89 |     -------
 90 |     result: Type-depends on 'dtype'
 91 |         DESCRIPTION: a random value
 92 | 
 93 |     """    
 94 |     num_list = list(dist.keys())
 95 |     prob_list = list(dist.values())
 96 |     if sum(prob_list)>1:
 97 |         prob_list = [sum(prob_list)-p for p in prob_list]
 98 |         prob_list = [p/sum(prob_list) for p in prob_list]
 99 |         
100 |     if dtype=='str':
101 |        return get_num_by_prob(num_list, prob_list)    
102 |     elif dtype=='int':
103 |         return int(get_num_by_prob(num_list, prob_list))
104 |     else:
105 |         return round(get_num_by_prob(num_list, prob_list),2)
106 |     
107 | # missing data process:'Not provided by vendor'
108 | # fill the starting price method 
109 | starting_price_method_dic = dict(Counter(df['starting_price_method']))
110 | del starting_price_method_dic['Not provided by vendor']
111 | df['starting_price_method_fill'] = df['starting_price_method'].map(lambda x: fill_missing_data(starting_price_method_dic, dtype='str') if x=='Not provided by vendor' else x)
112 | 
113 | # prices interval
114 | prices_range = {}
115 | for m in starting_price_method_dic.keys():
116 |     price_list = list(df.loc[(df['starting_price_method']==m),'starting_price_num'].values)
117 |     if len(price_list)>1:
118 |         prange = [min(price_list), max(price_list)]
119 |     else:
120 |         prange = [price_list[0]*0.05, price_list[0]*1.5]
121 |     prices_range[m] = prange
122 | 
123 | # fill the starting price num
124 | import copy 
125 | 
126 | df['starting_price_num_fill'] = copy.copy(df['starting_price_num'])
127 | for i in df.index:
128 |     k = df.loc[i, 'starting_price_method_fill']
129 |     if df.loc[i, 'starting_price_num_fill'] == float('inf'):
130 |         df.loc[i, 'starting_price_num_fill'] = get_uniform_random_num(prices_range[k][0], prices_range[k][1]) 
131 | 
132 | # save new data
133 | df.to_csv(config.product_data_path, index=False)  
134 | 
135 | 
136 | # ---------------------------------------------------------------------------------
137 | 
138 | ## part 1: DATA 1 (user)
139 | data = pd.read_excel(config.user_data_path_1)
140 | ## missing data process
141 | data['rating_overall'] = data['rating_overall'].map(lambda x: float(re.sub('/5','',x)))
142 | data['rating_likelihood_to_recommend'] = data['rating_likelihood_to_recommend'].fillna(method='pad')
143 | data['rating_likelihood_to_recommend'] = data['rating_likelihood_to_recommend'].map(lambda x: float(x.split('/')[0])/2)
144 | 
145 | for i in ['rating_ease_of_use','rating_customer_support','rating_value_for_money','rating_features_functionality']:
146 |     data[i] = data[i].fillna(data[i].mode()[0])
147 |     data[i] = data[i].map(lambda x: float(x))
148 | 
149 | 
150 | ## check data features 
151 | # remove wrong field
152 | p = re.compile(r'(\d+\-\d+ \w+)')
153 | data['user_industry']= data['user_industry'].astype(str).map(lambda x: p.sub('nan',x))
154 | data['user_industry'] = data['user_industry'].map(lambda x: np.nan if x=='nan' else x)
155 | 
156 | #  remove wrong field
157 | #删除这列中错误的字段'Wellness and Fitness'
158 | # data.index[data['user_company_size']=='Wellness and Fitness'].tolist()
159 | data['user_company_size'].where(cond=data['user_company_size']!='Wellness and Fitness',other=np.nan,inplace=True)
160 | 
161 | # remove wrong field
162 | data['user_job_title'] = data['user_job_title'].astype(str).map(lambda x: 'nan' if re.match(r'\W+',x)!=None else x)
163 | data['user_job_title'] = data['user_job_title'].map(lambda x: np.nan if x=='nan' else x)
164 | 
165 | # add product id
166 | data = pd.merge(left=data, right=df[['product_id','product_name']], how='left', on='product_name')
167 | 
168 | # save new data
169 | data.to_csv(config.user_data_path, index=False)      
170 | 
171 | 
172 | 
173 | 
174 | # ---------------------------------------------------------------------------------
175 | # review data 
176 | def get_review_data():
177 |     """
178 |     It is used to read data and build a specific data frame, includes
179 |     review data items and product data items.
180 |     
181 |     Return
182 |     -------
183 |     review_data_item: type-dataframe, columns: review_id, product_name, user_name, text,
184 |         like ['product_1', 'user_name_1', 4.5, '....'], ....
185 |     product_data_item: type-dataframe, columns: product_id, product_name, review_count, rate
186 |         like ['product_1', 12], ....
187 |     
188 |     """
189 |     
190 |     df = pd.read_csv(config.user_data_path)
191 |     
192 |     # review data
193 |     review_data_item = df[['product_name', 'user_name', 'review_title']]
194 |     review_data_item = review_data_item.rename(columns={'review_title':'review'}).dropna(subset=['review'])
195 |     for col in ['software_pros', 'software_cons', 'software_overall_experience', 
196 |                 'software_comments', 'software_recommendations']:
197 |         df1 = df[['product_name', 'user_name', col]]
198 |         df1 = df1.rename(columns={col:'review'}).dropna(subset=['review'])
199 |         review_data_item = pd.concat([review_data_item, df1])
200 |         
201 |     # product data 
202 |     product_basic = review_data_item.groupby('product_name').count()['user_name'].reset_index(name="review_count")
203 |     product_rate = df.groupby('product_name').mean()[['rating_overall', 'rating_ease_of_use', 
204 |                              'rating_customer_support', 'rating_features_functionality', 
205 |                              'rating_value_for_money', 'rating_likelihood_to_recommend']].reset_index()
206 |     product_data_item = pd.merge(left=product_basic,right=product_rate,on='product_name')
207 |     
208 |     # save review data
209 |     ReviewData = pd.merge(left = review_data_item, right = product_data_item, how='inner', on='product_name')
210 |     ReviewData = pd.merge(left = ReviewData, right = df[['product_name','product_id']].drop_duplicates(), how='left', on='product_name')
211 |     ReviewData.to_csv(config.review_data_path, index_label='review_id')
212 |     
213 |     # save product data
214 |     df2 = pd.read_csv(config.product_data_path)
215 |     ProductData = pd.merge(left = product_basic, right = df2[['product_name','product_id']].drop_duplicates(), how='outer', on='product_name')
216 |     ProductData = ProductData.fillna(0)
217 |     ProductData.to_csv(config.business_data_path, index=False)
218 |     
219 | get_review_data() 
220 | 
221 | 
222 | 
223 | 
224 | 
225 | 
226 | 
227 | 
228 | 
229 | 
230 | 
231 | 
232 | 
233 | 
234 | 
235 | 
236 | 
237 | 
238 | 


--------------------------------------------------------------------------------