├── Jupyter notebook
    ├── CNKI爬虫（改进版）.ipynb
    ├── co-citation_network.ipynb
    ├── sentiment_analysis_by_plotly.ipynb
    └── strategy_analysis_by_plotly(关键词).ipynb
├── README.md
├── Results
    ├── co-word_analysis.png
    ├── co_citation_analysis.png
    ├── scatter_exp.png
    ├── scatter_log.png
    ├── sentimental_analysis（区块链）.png
    ├── sentimental_analysis（小王子）.png
    └── 区块链词云.png
├── dependence
    ├── mask.png
    ├── simkai.ttf
    ├── stopwords.txt
    ├── 区块链技术发展现状与展望_袁勇.pdf
    ├── 区块链技术发展现状与展望_袁勇.txt
    ├── 小王子.txt
    └── 知网数据.xls
└── main
    ├── CNKI.py
    ├── CNKI2.py
    ├── __pycache__
        ├── draw_word_cloud.cpython-37.pyc
        ├── jieba_analysis.cpython-37.pyc
        └── network.cpython-37.pyc
    ├── co-citation_network.py
    ├── co-word_network.py
    ├── cooperation_network.py
    ├── draw_word_cloud.py
    ├── jieba_analysis.py
    ├── keywords_by_jieba.py
    ├── keywords_by_snownlp.py
    ├── keywords_by_textrank4zh.py
    ├── network.py
    ├── pdf-to-txt.py
    ├── sentiment_analysis.py
    ├── sentiment_analysis2.py
    ├── strategy_analysis.py
    ├── strategy_analysis_uniform.py
    └── word_cloud.py


/Jupyter notebook/CNKI爬虫（改进版）.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import requests\n",
 10 |     "from bs4 import BeautifulSoup\n",
 11 |     "import pandas as pd"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 2,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "def getHTMLText(url):\n",
 21 |     "    try:\n",
 22 |     "        headers = {'user-agent':'Mozilla/5.0'}\n",
 23 |     "        r = requests.get(url,timeout=30,headers=headers)\n",
 24 |     "        r.raise_for_status()\n",
 25 |     "        r.encoding = r.apparent_encoding\n",
 26 |     "        return r.text\n",
 27 |     "    except:\n",
 28 |     "        return \"产生异常\""
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": 3,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "def getPageurl(list,pageNum):\n",
 38 |     "    first_url = \"http://search.cnki.com.cn/Search.aspx?q=%e5%8c%ba%e5%9d%97%e9%93%be&rank=relevant&cluster=all&val=&p=\"\n",
 39 |     "    for i in range(pageNum):\n",
 40 |     "        i = i*15\n",
 41 |     "        soup = BeautifulSoup(getHTMLText(first_url+str(i)),'html.parser')\n",
 42 |     "        for div in soup.find_all('div',class_=\"wz_tab\"):\n",
 43 |     "            for a in div.find_all('a',target='_blank'):\n",
 44 |     "                if 'http://search.cnki.net' not in a.get('href'):\n",
 45 |     "                    list.append(a.get('href'))"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 4,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "def getOnePage(list,pagetext):\n",
 55 |     "    for oneurl in list:\n",
 56 |     "        try:\n",
 57 |     "            onepage={}\n",
 58 |     "            tempL=[]\n",
 59 |     "            soup = BeautifulSoup(getHTMLText(oneurl),'html.parser')\n",
 60 |     "            onepage['题目']=soup.head.title.string#题目\n",
 61 |     "            onepage['关键词'] = soup.head.find_all('meta')[3].get('content')#关键词\n",
 62 |     "            au = []\n",
 63 |     "            for div in soup.find_all('div',style=\"text-align:center; width:740px; height:30px;\"):#作者\n",
 64 |     "                for a in div.find_all('a',target=\"_blank\"):\n",
 65 |     "                    au.append(a.string)\n",
 66 |     "            onepage['作者'] = au\n",
 67 |     "            for div in soup.find_all('div',style='float:left;'):\n",
 68 |     "                for b in div.find_all('b'):#机构\n",
 69 |     "                    onepage['机构']=b.string.strip()\n",
 70 |     "                for font in div.find_all('font',color='#0080ff'):#年份\n",
 71 |     "                    onepage['年份']=font.string.strip()\n",
 72 |     "            for div in soup.find_all('div',id=\"div_Ref\"):#相似文献、引用文献等\n",
 73 |     "                ref=[]\n",
 74 |     "                for td in div.find_all('td',rowspan=\"2\",align=\"left\",valign=\"bottom\",class_=\"b14\"):\n",
 75 |     "                    getType = td.string[1:5]\n",
 76 |     "                for a in div.find_all('a',target=\"_blank\"):\n",
 77 |     "                    ref.append(a.string)\n",
 78 |     "                onepage[getType] = ref\n",
 79 |     "            for table in soup.find_all('table',cellspacing=\"0\",cellpadding=\"0\",width=\"100%\",style=\"border:1px solid #7498d6;\"):\n",
 80 |     "                temp=[]\n",
 81 |     "                for a in table.find_all('a',target=\"_blank\"):#相关机构和相关作者\n",
 82 |     "                    temp.append(a.string)\n",
 83 |     "                tempL.append(temp)\n",
 84 |     "            if len(tempL) > 0:\n",
 85 |     "                onepage['相关机构'] = tempL[len(tempL)-2]\n",
 86 |     "                onepage['相关作者'] = tempL[len(tempL)-1]\n",
 87 |     "            pagetext.append(onepage)\n",
 88 |     "        except:\n",
 89 |     "            pass\n",
 90 |     "        continue"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 5,
 96 |    "metadata": {},
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "ls = []\n",
100 |     "pagetext=[]\n",
101 |     "getPageurl(ls,20)#设置爬取页数\n",
102 |     "getOnePage(ls,pagetext)"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 6,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "frame = pd.DataFrame(pagetext)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": 7,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "writer = pd.ExcelWriter('E:/1.xlsx')"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 8,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "frame.to_excel(writer,'Sheet1')"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 9,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "writer.save()"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": []
147 |   }
148 |  ],
149 |  "metadata": {
150 |   "kernelspec": {
151 |    "display_name": "Python 3",
152 |    "language": "python",
153 |    "name": "python3"
154 |   },
155 |   "language_info": {
156 |    "codemirror_mode": {
157 |     "name": "ipython",
158 |     "version": 3
159 |    },
160 |    "file_extension": ".py",
161 |    "mimetype": "text/x-python",
162 |    "name": "python",
163 |    "nbconvert_exporter": "python",
164 |    "pygments_lexer": "ipython3",
165 |    "version": "3.7.1"
166 |   }
167 |  },
168 |  "nbformat": 4,
169 |  "nbformat_minor": 2
170 | }
171 | 


--------------------------------------------------------------------------------
/Jupyter notebook/sentiment_analysis_by_plotly.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 2,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "data": {
 10 |       "text/html": [
 11 |        "<script type=\"text/javascript\">window.PlotlyConfig = {MathJaxConfig: 'local'};</script><script type=\"text/javascript\">if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}</script><script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window._Plotly) {require(['plotly'],function(plotly) {window._Plotly=plotly;});}</script>"
 12 |       ],
 13 |       "text/vnd.plotly.v1+html": [
 14 |        "<script type=\"text/javascript\">window.PlotlyConfig = {MathJaxConfig: 'local'};</script><script type=\"text/javascript\">if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}</script><script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window._Plotly) {require(['plotly'],function(plotly) {window._Plotly=plotly;});}</script>"
 15 |       ]
 16 |      },
 17 |      "metadata": {},
 18 |      "output_type": "display_data"
 19 |     },
 20 |     {
 21 |      "name": "stderr",
 22 |      "output_type": "stream",
 23 |      "text": [
 24 |       "D:\\conda\\lib\\site-packages\\plotly\\graph_objs\\_deprecations.py:39: DeprecationWarning:\n",
 25 |       "\n",
 26 |       "plotly.graph_objs.Data is deprecated.\n",
 27 |       "Please replace it with a list or tuple of instances of the following types\n",
 28 |       "  - plotly.graph_objs.Scatter\n",
 29 |       "  - plotly.graph_objs.Bar\n",
 30 |       "  - plotly.graph_objs.Area\n",
 31 |       "  - plotly.graph_objs.Histogram\n",
 32 |       "  - etc.\n",
 33 |       "\n",
 34 |       "\n"
 35 |      ]
 36 |     },
 37 |     {
 38 |      "data": {
 39 |       "application/vnd.plotly.v1+json": {
 40 |        "config": {
 41 |         "linkText": "Export to plot.ly",
 42 |         "plotlyServerURL": "https://plot.ly",
 43 |         "showLink": false
 44 |        },
 45 |        "data": [
 46 |         {
 47 |          "type": "bar",
 48 |          "uid": "16cf3f06-06fd-4d95-be02-b834386aca35",
 49 |          "x": [
 50 |           0,
 51 |           0.02,
 52 |           0.04,
 53 |           0.06,
 54 |           0.08,
 55 |           0.1,
 56 |           0.12,
 57 |           0.14,
 58 |           0.16,
 59 |           0.18,
 60 |           0.2,
 61 |           0.22,
 62 |           0.24,
 63 |           0.26,
 64 |           0.28,
 65 |           0.3,
 66 |           0.32,
 67 |           0.34,
 68 |           0.36,
 69 |           0.38,
 70 |           0.4,
 71 |           0.42,
 72 |           0.44,
 73 |           0.46,
 74 |           0.48,
 75 |           0.5,
 76 |           0.52,
 77 |           0.54,
 78 |           0.56,
 79 |           0.58,
 80 |           0.6,
 81 |           0.62,
 82 |           0.64,
 83 |           0.66,
 84 |           0.68,
 85 |           0.7,
 86 |           0.72,
 87 |           0.74,
 88 |           0.76,
 89 |           0.78,
 90 |           0.8,
 91 |           0.82,
 92 |           0.84,
 93 |           0.86,
 94 |           0.88,
 95 |           0.9,
 96 |           0.92,
 97 |           0.94,
 98 |           0.96,
 99 |           0.98
100 |          ],
101 |          "y": [
102 |           46,
103 |           11,
104 |           11,
105 |           4,
106 |           3,
107 |           6,
108 |           3,
109 |           5,
110 |           3,
111 |           5,
112 |           5,
113 |           3,
114 |           4,
115 |           2,
116 |           3,
117 |           5,
118 |           6,
119 |           3,
120 |           2,
121 |           0,
122 |           9,
123 |           1,
124 |           2,
125 |           2,
126 |           2,
127 |           111,
128 |           3,
129 |           1,
130 |           4,
131 |           1,
132 |           3,
133 |           1,
134 |           1,
135 |           0,
136 |           2,
137 |           3,
138 |           4,
139 |           3,
140 |           4,
141 |           8,
142 |           4,
143 |           4,
144 |           10,
145 |           4,
146 |           5,
147 |           10,
148 |           7,
149 |           8,
150 |           19,
151 |           115
152 |          ]
153 |         }
154 |        ],
155 |        "layout": {}
156 |       },
157 |       "text/html": [
158 |        "<div id=\"9346a92a-945f-4d72-b565-dd5ae160488f\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";\n",
159 |        "if (document.getElementById(\"9346a92a-945f-4d72-b565-dd5ae160488f\")) {\n",
160 |        "    Plotly.newPlot(\"9346a92a-945f-4d72-b565-dd5ae160488f\", [{\"x\": [0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24, 0.26, 0.28, 0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7, 0.72, 0.74, 0.76, 0.78, 0.8, 0.82, 0.84, 0.86, 0.88, 0.9, 0.92, 0.94, 0.96, 0.98], \"y\": [46, 11, 11, 4, 3, 6, 3, 5, 3, 5, 5, 3, 4, 2, 3, 5, 6, 3, 2, 0, 9, 1, 2, 2, 2, 111, 3, 1, 4, 1, 3, 1, 1, 0, 2, 3, 4, 3, 4, 8, 4, 4, 10, 4, 5, 10, 7, 8, 19, 115], \"type\": \"bar\", \"uid\": \"7e50964e-a422-4939-8aa4-7a175d715600\"}], {}, {\"showLink\": false, \"linkText\": \"Export to plot.ly\", \"plotlyServerURL\": \"https://plot.ly\"}); \n",
161 |        "}\n",
162 |        "});</script><script type=\"text/javascript\">window.addEventListener(\"resize\", function(){if (document.getElementById(\"9346a92a-945f-4d72-b565-dd5ae160488f\")) {window._Plotly.Plots.resize(document.getElementById(\"9346a92a-945f-4d72-b565-dd5ae160488f\"));};})</script>"
163 |       ],
164 |       "text/vnd.plotly.v1+html": [
165 |        "<div id=\"9346a92a-945f-4d72-b565-dd5ae160488f\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";\n",
166 |        "if (document.getElementById(\"9346a92a-945f-4d72-b565-dd5ae160488f\")) {\n",
167 |        "    Plotly.newPlot(\"9346a92a-945f-4d72-b565-dd5ae160488f\", [{\"x\": [0.0, 0.02, 0.04, 0.06, 0.08, 0.1, 0.12, 0.14, 0.16, 0.18, 0.2, 0.22, 0.24, 0.26, 0.28, 0.3, 0.32, 0.34, 0.36, 0.38, 0.4, 0.42, 0.44, 0.46, 0.48, 0.5, 0.52, 0.54, 0.56, 0.58, 0.6, 0.62, 0.64, 0.66, 0.68, 0.7, 0.72, 0.74, 0.76, 0.78, 0.8, 0.82, 0.84, 0.86, 0.88, 0.9, 0.92, 0.94, 0.96, 0.98], \"y\": [46, 11, 11, 4, 3, 6, 3, 5, 3, 5, 5, 3, 4, 2, 3, 5, 6, 3, 2, 0, 9, 1, 2, 2, 2, 111, 3, 1, 4, 1, 3, 1, 1, 0, 2, 3, 4, 3, 4, 8, 4, 4, 10, 4, 5, 10, 7, 8, 19, 115], \"type\": \"bar\", \"uid\": \"7e50964e-a422-4939-8aa4-7a175d715600\"}], {}, {\"showLink\": false, \"linkText\": \"Export to plot.ly\", \"plotlyServerURL\": \"https://plot.ly\"}); \n",
168 |        "}\n",
169 |        "});</script><script type=\"text/javascript\">window.addEventListener(\"resize\", function(){if (document.getElementById(\"9346a92a-945f-4d72-b565-dd5ae160488f\")) {window._Plotly.Plots.resize(document.getElementById(\"9346a92a-945f-4d72-b565-dd5ae160488f\"));};})</script>"
170 |       ]
171 |      },
172 |      "metadata": {},
173 |      "output_type": "display_data"
174 |     }
175 |    ],
176 |    "source": [
177 |     "#coding:utf-8\n",
178 |     "#author: moyuweiqing\n",
179 |     "#情感分析\n",
180 |     "\n",
181 |     "from snownlp import SnowNLP\n",
182 |     "import plotly.offline as py\n",
183 |     "import plotly.graph_objs as go\n",
184 |     "import numpy as np\n",
185 |     "import math\n",
186 |     "\n",
187 |     "py.init_notebook_mode(connected=True)#离线模式使用plotly\n",
188 |     "\n",
189 |     "text = open(r'C:\\Users\\Yoga\\Desktop\\区块链技术发展现状与展望_袁勇.txt').read()\n",
190 |     "s1 = text.replace('\\n', '').replace(' ', '').replace('.', '。')#去除换行\n",
191 |     "# print(s1)\n",
192 |     "sn1 = SnowNLP(s1)\n",
193 |     "sentimentslist = []\n",
194 |     "for i in sn1.sentences:\n",
195 |     "    j = SnowNLP(i)\n",
196 |     "    # print(i)\n",
197 |     "    # print(j.sentiments)\n",
198 |     "    sentimentslist.append(j.sentiments)\n",
199 |     "\n",
200 |     "dic = {}\n",
201 |     "for i in np.arange(0, 1, 0.02):\n",
202 |     "    index = round(i, 2)\n",
203 |     "    dic[index] = 0\n",
204 |     "# print(dic)\n",
205 |     "\n",
206 |     "for i in sentimentslist:\n",
207 |     "    temp = round(math.floor(i/0.02)*0.02, 2)\n",
208 |     "    dic[temp] = dic[temp] + 1\n",
209 |     "\n",
210 |     "trace = go.Bar(x = list(dic.keys()), y = list(dic.values()))\n",
211 |     "data = go.Data([trace])\n",
212 |     "py.iplot(data)"
213 |    ]
214 |   }
215 |  ],
216 |  "metadata": {
217 |   "kernelspec": {
218 |    "display_name": "Python 3",
219 |    "language": "python",
220 |    "name": "python3"
221 |   },
222 |   "language_info": {
223 |    "codemirror_mode": {
224 |     "name": "ipython",
225 |     "version": 3
226 |    },
227 |    "file_extension": ".py",
228 |    "mimetype": "text/x-python",
229 |    "name": "python",
230 |    "nbconvert_exporter": "python",
231 |    "pygments_lexer": "ipython3",
232 |    "version": "3.7.1"
233 |   }
234 |  },
235 |  "nbformat": 4,
236 |  "nbformat_minor": 2
237 | }
238 | 


--------------------------------------------------------------------------------
/Jupyter notebook/strategy_analysis_by_plotly(关键词).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "data": {
 10 |       "text/html": [
 11 |        "<script type=\"text/javascript\">window.PlotlyConfig = {MathJaxConfig: 'local'};</script><script type=\"text/javascript\">if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}</script><script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window._Plotly) {require(['plotly'],function(plotly) {window._Plotly=plotly;});}</script>"
 12 |       ],
 13 |       "text/vnd.plotly.v1+html": [
 14 |        "<script type=\"text/javascript\">window.PlotlyConfig = {MathJaxConfig: 'local'};</script><script type=\"text/javascript\">if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}</script><script>requirejs.config({paths: { 'plotly': ['https://cdn.plot.ly/plotly-latest.min']},});if(!window._Plotly) {require(['plotly'],function(plotly) {window._Plotly=plotly;});}</script>"
 15 |       ]
 16 |      },
 17 |      "metadata": {},
 18 |      "output_type": "display_data"
 19 |     },
 20 |     {
 21 |      "name": "stderr",
 22 |      "output_type": "stream",
 23 |      "text": [
 24 |       "Building prefix dict from the default dictionary ...\n",
 25 |       "Loading model from cache C:\\Users\\Yoga\\AppData\\Local\\Temp\\jieba.cache\n",
 26 |       "Loading model cost 2.208 seconds.\n",
 27 |       "Prefix dict has been built succesfully.\n",
 28 |       "D:\\conda\\lib\\site-packages\\plotly\\graph_objs\\_deprecations.py:39: DeprecationWarning:\n",
 29 |       "\n",
 30 |       "plotly.graph_objs.Data is deprecated.\n",
 31 |       "Please replace it with a list or tuple of instances of the following types\n",
 32 |       "  - plotly.graph_objs.Scatter\n",
 33 |       "  - plotly.graph_objs.Bar\n",
 34 |       "  - plotly.graph_objs.Area\n",
 35 |       "  - plotly.graph_objs.Histogram\n",
 36 |       "  - etc.\n",
 37 |       "\n",
 38 |       "\n"
 39 |      ]
 40 |     },
 41 |     {
 42 |      "data": {
 43 |       "application/vnd.plotly.v1+json": {
 44 |        "config": {
 45 |         "linkText": "Export to plot.ly",
 46 |         "plotlyServerURL": "https://plot.ly",
 47 |         "showLink": false
 48 |        },
 49 |        "data": [
 50 |         {
 51 |          "mode": "markers+text",
 52 |          "text": [
 53 |           "技术",
 54 |           "研究",
 55 |           "应用",
 56 |           "发展",
 57 |           "金融",
 58 |           "分析",
 59 |           "领域",
 60 |           "—",
 61 |           "创新",
 62 |           "设计",
 63 |           "实现",
 64 |           "系统",
 65 |           "产业",
 66 |           "问题",
 67 |           "我国",
 68 |           "展望",
 69 |           "商业银行",
 70 |           "挑战",
 71 |           "综述",
 72 |           "模式"
 73 |          ],
 74 |          "textposition": "top center",
 75 |          "type": "scatter",
 76 |          "uid": "e19ddfd8-8bcf-4b59-8a39-cb120804171b",
 77 |          "x": [
 78 |           4.700480365792417,
 79 |           4.543294782270004,
 80 |           4.3694478524670215,
 81 |           3.5263605246161616,
 82 |           3.332204510175204,
 83 |           2.8903717578961645,
 84 |           2.8903717578961645,
 85 |           2.772588722239781,
 86 |           2.772588722239781,
 87 |           2.70805020110221,
 88 |           2.6390573296152584,
 89 |           2.5649493574615367,
 90 |           2.5649493574615367,
 91 |           2.4849066497880004,
 92 |           2.302585092994046,
 93 |           2.302585092994046,
 94 |           2.302585092994046,
 95 |           2.302585092994046,
 96 |           2.1972245773362196,
 97 |           2.1972245773362196
 98 |          ],
 99 |          "y": [
100 |           6.529418838262226,
101 |           4.3694478524670215,
102 |           5.198497031265826,
103 |           5.087596335232384,
104 |           4.912654885736052,
105 |           2.70805020110221,
106 |           4.204692619390966,
107 |           4.143134726391533,
108 |           4.143134726391533,
109 |           3.1354942159291497,
110 |           0,
111 |           3.6888794541139363,
112 |           3.9318256327243257,
113 |           3.2188758248682006,
114 |           4.30406509320417,
115 |           0,
116 |           4.02535169073515,
117 |           2.1972245773362196,
118 |           1.3862943611198906,
119 |           2.9444389791664403
120 |          ]
121 |         }
122 |        ],
123 |        "layout": {}
124 |       },
125 |       "text/html": [
126 |        "<div id=\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";\n",
127 |        "if (document.getElementById(\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\")) {\n",
128 |        "    Plotly.newPlot(\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\", [{\"mode\": \"markers+text\", \"text\": [\"\\u6280\\u672f\", \"\\u7814\\u7a76\", \"\\u5e94\\u7528\", \"\\u53d1\\u5c55\", \"\\u91d1\\u878d\", \"\\u5206\\u6790\", \"\\u9886\\u57df\", \"\\u2014\", \"\\u521b\\u65b0\", \"\\u8bbe\\u8ba1\", \"\\u5b9e\\u73b0\", \"\\u7cfb\\u7edf\", \"\\u4ea7\\u4e1a\", \"\\u95ee\\u9898\", \"\\u6211\\u56fd\", \"\\u5c55\\u671b\", \"\\u5546\\u4e1a\\u94f6\\u884c\", \"\\u6311\\u6218\", \"\\u7efc\\u8ff0\", \"\\u6a21\\u5f0f\"], \"textposition\": \"top center\", \"x\": [4.700480365792417, 4.543294782270004, 4.3694478524670215, 3.5263605246161616, 3.332204510175204, 2.8903717578961645, 2.8903717578961645, 2.772588722239781, 2.772588722239781, 2.70805020110221, 2.6390573296152584, 2.5649493574615367, 2.5649493574615367, 2.4849066497880004, 2.302585092994046, 2.302585092994046, 2.302585092994046, 2.302585092994046, 2.1972245773362196, 2.1972245773362196], \"y\": [6.529418838262226, 4.3694478524670215, 5.198497031265826, 5.087596335232384, 4.912654885736052, 2.70805020110221, 4.204692619390966, 4.143134726391533, 4.143134726391533, 3.1354942159291497, 0, 3.6888794541139363, 3.9318256327243257, 3.2188758248682006, 4.30406509320417, 0, 4.02535169073515, 2.1972245773362196, 1.3862943611198906, 2.9444389791664403], \"type\": \"scatter\", \"uid\": \"a4d09e0c-8620-4a17-b256-2c29547443e9\"}], {}, {\"showLink\": false, \"linkText\": \"Export to plot.ly\", \"plotlyServerURL\": \"https://plot.ly\"}); \n",
129 |        "}\n",
130 |        "});</script><script type=\"text/javascript\">window.addEventListener(\"resize\", function(){if (document.getElementById(\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\")) {window._Plotly.Plots.resize(document.getElementById(\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\"));};})</script>"
131 |       ],
132 |       "text/vnd.plotly.v1+html": [
133 |        "<div id=\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\" style=\"height: 525px; width: 100%;\" class=\"plotly-graph-div\"></div><script type=\"text/javascript\">require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {};window.PLOTLYENV.BASE_URL=\"https://plot.ly\";\n",
134 |        "if (document.getElementById(\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\")) {\n",
135 |        "    Plotly.newPlot(\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\", [{\"mode\": \"markers+text\", \"text\": [\"\\u6280\\u672f\", \"\\u7814\\u7a76\", \"\\u5e94\\u7528\", \"\\u53d1\\u5c55\", \"\\u91d1\\u878d\", \"\\u5206\\u6790\", \"\\u9886\\u57df\", \"\\u2014\", \"\\u521b\\u65b0\", \"\\u8bbe\\u8ba1\", \"\\u5b9e\\u73b0\", \"\\u7cfb\\u7edf\", \"\\u4ea7\\u4e1a\", \"\\u95ee\\u9898\", \"\\u6211\\u56fd\", \"\\u5c55\\u671b\", \"\\u5546\\u4e1a\\u94f6\\u884c\", \"\\u6311\\u6218\", \"\\u7efc\\u8ff0\", \"\\u6a21\\u5f0f\"], \"textposition\": \"top center\", \"x\": [4.700480365792417, 4.543294782270004, 4.3694478524670215, 3.5263605246161616, 3.332204510175204, 2.8903717578961645, 2.8903717578961645, 2.772588722239781, 2.772588722239781, 2.70805020110221, 2.6390573296152584, 2.5649493574615367, 2.5649493574615367, 2.4849066497880004, 2.302585092994046, 2.302585092994046, 2.302585092994046, 2.302585092994046, 2.1972245773362196, 2.1972245773362196], \"y\": [6.529418838262226, 4.3694478524670215, 5.198497031265826, 5.087596335232384, 4.912654885736052, 2.70805020110221, 4.204692619390966, 4.143134726391533, 4.143134726391533, 3.1354942159291497, 0, 3.6888794541139363, 3.9318256327243257, 3.2188758248682006, 4.30406509320417, 0, 4.02535169073515, 2.1972245773362196, 1.3862943611198906, 2.9444389791664403], \"type\": \"scatter\", \"uid\": \"a4d09e0c-8620-4a17-b256-2c29547443e9\"}], {}, {\"showLink\": false, \"linkText\": \"Export to plot.ly\", \"plotlyServerURL\": \"https://plot.ly\"}); \n",
136 |        "}\n",
137 |        "});</script><script type=\"text/javascript\">window.addEventListener(\"resize\", function(){if (document.getElementById(\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\")) {window._Plotly.Plots.resize(document.getElementById(\"a1730e02-f6a1-45e5-99f7-e7e587caa31b\"));};})</script>"
138 |       ]
139 |      },
140 |      "metadata": {},
141 |      "output_type": "display_data"
142 |     }
143 |    ],
144 |    "source": [
145 |     "#coding:utf-8\n",
146 |     "#author: moyuweiqing\n",
147 |     "#战略分析，计算密度和向心度，向心度算法自己写，建立二维坐标轴\n",
148 |     "\n",
149 |     "import pandas as pd\n",
150 |     "import jieba\n",
151 |     "import plotly.offline as py\n",
152 |     "import plotly.graph_objs as go\n",
153 |     "import math\n",
154 |     "\n",
155 |     "py.init_notebook_mode(connected=True)#离线模式使用plotly\n",
156 |     "\n",
157 |     "xls = pd.ExcelFile(r'C:\\Users\\Yoga\\Desktop\\srp资料\\知网-区块链(2).xls')\n",
158 |     "readf = pd.read_excel(xls, 'Sheet1')['标题']\n",
159 |     "\n",
160 |     "all_word = []   #记录所有分词\n",
161 |     "dic = {}    #记录分词的出现数量\n",
162 |     "\n",
163 |     "#统计所有存在的分词\n",
164 |     "for row in range(0, len(readf)):\n",
165 |     "\ttemp = jieba.cut(readf[row])\n",
166 |     "\tfor i in temp:\n",
167 |     "\t\tif i in all_word:\n",
168 |     "\t\t\tcontinue\n",
169 |     "\t\telse:\n",
170 |     "\t\t\tall_word.append(i)\n",
171 |     "\n",
172 |     "#统计分词的出现数量\n",
173 |     "for i in all_word:\n",
174 |     "    dic[i] = 0\n",
175 |     "\n",
176 |     "for row in range(0, len(readf)):\n",
177 |     "    temp = jieba.cut(readf[row])\n",
178 |     "    for i in temp:\n",
179 |     "        dic[i] = dic[i] + 1\n",
180 |     "\n",
181 |     "#去除无关词\n",
182 |     "f = open(r'D:\\JetBrains\\PyCharm 2018.3.4\\CNKI-analysis\\venv\\Include\\dependence\\stopwords.txt', encoding = \"utf-8\")\n",
183 |     "temp_dic = dic.copy()\n",
184 |     "f = f.read()\n",
185 |     "for i in temp_dic:\n",
186 |     "    if i in f:\n",
187 |     "        dic.pop(i)\n",
188 |     "\n",
189 |     "#对分词进行排序，并挑选出出现次数最多的前20个\n",
190 |     "dic_sorted = dict(sorted(dic.items(), key = lambda x: x[1], reverse = True))\n",
191 |     "dic_20 = {}# 20个出现次数最多的词语\n",
192 |     "for i in range(0, 20):\n",
193 |     "    dic_20[list(dic_sorted.keys())[i]] = list(dic_sorted.values())[i]\n",
194 |     "\n",
195 |     "#建立一个空白的向心度模型\n",
196 |     "dic_heart = {}\n",
197 |     "for i in range(0, 20):\n",
198 |     "    dic_heart[list(dic_20.keys())[i]] = 0\n",
199 |     "\n",
200 |     "#计算向心度\n",
201 |     "for key in dic_heart.keys():\n",
202 |     "    for row in range(0, len(readf)):\n",
203 |     "        temp = jieba.cut(readf[row])\n",
204 |     "        if key in temp:\n",
205 |     "            dic_heart[key] = dic_heart[key] + len(list(temp))\n",
206 |     "\n",
207 |     "exp_densit = []# 密度的自然对数\n",
208 |     "exp_heart = [] # 向心度的自然对数\n",
209 |     "\n",
210 |     "#计算向心度\n",
211 |     "for i in dic_20.values():\n",
212 |     "    exp_densit.append(math.log(i))\n",
213 |     "for i in dic_heart.values():\n",
214 |     "    if i != 0:\n",
215 |     "        exp_heart.append(math.log(i))\n",
216 |     "    else:\n",
217 |     "        exp_heart.append(0)\n",
218 |     "\n",
219 |     "trace = go.Scatter(x = exp_densit, y = exp_heart, text = list(dic_20.keys()), textposition = \"top center\", mode = 'markers+text')\n",
220 |     "data = go.Data([trace])\n",
221 |     "py.iplot(data)\n",
222 |     "#解决乱码\n",
223 |     "# plt.rcParams['font.sans-serif'] =['Microsoft YaHei']\n",
224 |     "# plt.rcParams['axes.unicode_minus'] = False\n",
225 |     "#\n",
226 |     "# plt.title(u'密度和向心度散点图')\n",
227 |     "#\n",
228 |     "# plt.xlabel('密度的自然对数')\n",
229 |     "# plt.ylabel('向心度的自然对数')\n",
230 |     "\n",
231 |     "# plt.scatter(dic2.values(), dic3.values(), s=20, c=\"#ff1212\", marker='o')\n",
232 |     "# # plt.scatter(exp_densit, exp_heart, s=20, c=\"#ff1212\", marker='o')\n",
233 |     "# for i in range(0, 20):\n",
234 |     "#     plt.annotate(list(dic_20.keys())[i], xy = (exp_densit[i], exp_heart[i]))\n",
235 |     "# plt.show()\n",
236 |     "# plt.savefig(\"scatter_exp1.png\")"
237 |    ]
238 |   }
239 |  ],
240 |  "metadata": {
241 |   "kernelspec": {
242 |    "display_name": "Python 3",
243 |    "language": "python",
244 |    "name": "python3"
245 |   },
246 |   "language_info": {
247 |    "codemirror_mode": {
248 |     "name": "ipython",
249 |     "version": 3
250 |    },
251 |    "file_extension": ".py",
252 |    "mimetype": "text/x-python",
253 |    "name": "python",
254 |    "nbconvert_exporter": "python",
255 |    "pygments_lexer": "ipython3",
256 |    "version": "3.7.1"
257 |   }
258 |  },
259 |  "nbformat": 4,
260 |  "nbformat_minor": 2
261 | }
262 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CNKI-analysis   
 2 | ###项目简介：   
 3 | 使用python，从知网上爬取相关的t数据，并进行数据分析，涉及到pycharm和jupyter notebook   
 4 |    
 5 | ### 研究过程：   
 6 | 从知网上抓取以“区块链”为主题的文献，获取文献题名、主要责任者、发表杂志、关键词、文章分类号、引用文献和被引文献等数据；对低价值数据进行清洗；数据处理；对数据结果进行可视化呈现   
 7 |    
 8 | ### 技术栈：   
 9 | 数据抓取：python   
10 | 数据处理：python，主要涉及到jieba、networkx库   
11 | 可视化：matplotlib、plotly、pyecharts   
12 |    
13 | ### 存储说明：
14 | dependence存储的是依赖文件   
15 | main主要的分析部分   
16 | Results存储结果图   
17 | Jupyter notebook里面存放的是.ipynb文件，需要在Jupyter notebook下运行，主要是因为plotly库依赖Jupyter notebook环境   
18 |    
19 | ### 文件说明：   
20 | CNKI.py是我参考的爬虫文件   
21 | CNKI2.py是最开始用来爬取数据的爬虫文件   
22 | CNKI爬虫（改进版）是我一个师弟做的，用来分析的数据主要从这里爬取，爬取的数据存储在了知网数据.xls文件中   
23 | pdf-to-txt.py实现了从pdf到txt文件的转换   
24 | network.py封装了一部分构建网络的函数   
25 | co-citation_network.py是共被引网络分析   
26 | cooperation-network.py是作者合作网络分析   
27 | co-work_network.py是共词网络分析   
28 | keywords系列的py文件，是用不同的库进行关键词的提取，效果不同   
29 | sentiment_analysis.py是对区块链文章的情感分析   
30 | sentiment_analysis2.py是对《小王子》的情感分析   
31 | jieba_analysis.py封装了部分分词的操作函数   
32 | strategy_analysis.py战略分析，调用jieba_analysis.py构建散点图，对关键词的密度和向心度进行分析   
33 | strategy_analysis_uniform.py不调用jieba_analysis.py，直接进行分析   
34 | draw_word_cloud.py实现词云   
35 | word_cloud.py对关键词进行词云制作   
36 | Jupyter notebook里面存放的主要是要依赖Jupyter notebook开发环境的库的分析   
37 | 


--------------------------------------------------------------------------------
/Results/co-word_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/co-word_analysis.png


--------------------------------------------------------------------------------
/Results/co_citation_analysis.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/co_citation_analysis.png


--------------------------------------------------------------------------------
/Results/scatter_exp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/scatter_exp.png


--------------------------------------------------------------------------------
/Results/scatter_log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/scatter_log.png


--------------------------------------------------------------------------------
/Results/sentimental_analysis（区块链）.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/sentimental_analysis（区块链）.png


--------------------------------------------------------------------------------
/Results/sentimental_analysis（小王子）.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/sentimental_analysis（小王子）.png


--------------------------------------------------------------------------------
/Results/区块链词云.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/区块链词云.png


--------------------------------------------------------------------------------
/dependence/mask.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/mask.png


--------------------------------------------------------------------------------
/dependence/simkai.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/simkai.ttf


--------------------------------------------------------------------------------
/dependence/stopwords.txt:
--------------------------------------------------------------------------------
  1 | ﻿$
  2 | 0
  3 | 1
  4 | 2
  5 | 3
  6 | 4
  7 | 5
  8 | 6
  9 | 7
 10 | 8
 11 | 9
 12 | ?
 13 | _
 14 | “
 15 | ”
 16 | 、
 17 | 。
 18 | .:《
 19 | 》
 20 | 【
 21 | 】的区块链与和
 22 |  
 23 | 一
 24 | 一些
 25 | 一何
 26 | 一切
 27 | 一则
 28 | 一方面
 29 | 一旦
 30 | 一来
 31 | 一样
 32 | 一般
 33 | 一转眼
 34 | 万一
 35 | 上
 36 | 上下
 37 | 下
 38 | 不
 39 | 不仅
 40 | 不但
 41 | 不光
 42 | 不单
 43 | 不只
 44 | 不外乎
 45 | 不如
 46 | 不妨
 47 | 不尽
 48 | 不尽然
 49 | 不得
 50 | 不怕
 51 | 不惟
 52 | 不成
 53 | 不拘
 54 | 不料
 55 | 不是
 56 | 不比
 57 | 不然
 58 | 不特
 59 | 不独
 60 | 不管
 61 | 不至于
 62 | 不若
 63 | 不论
 64 | 不过
 65 | 不问
 66 | 与
 67 | 与其
 68 | 与其说
 69 | 与否
 70 | 与此同时
 71 | 且
 72 | 且不说
 73 | 且说
 74 | 两者
 75 | 个
 76 | 个别
 77 | 临
 78 | 为
 79 | 为了
 80 | 为什么
 81 | 为何
 82 | 为止
 83 | 为此
 84 | 为着
 85 | 乃
 86 | 乃至
 87 | 乃至于
 88 | 么
 89 | 之
 90 | 之一
 91 | 之所以
 92 | 之类
 93 | 乌乎
 94 | 乎
 95 | 乘
 96 | 也
 97 | 也好
 98 | 也罢
 99 | 了
100 | 二来
101 | 于
102 | 于是
103 | 于是乎
104 | 云云
105 | 云尔
106 | 些
107 | 亦
108 | 人
109 | 人们
110 | 人家
111 | 什么
112 | 什么样
113 | 今
114 | 介于
115 | 仍
116 | 仍旧
117 | 从
118 | 从此
119 | 从而
120 | 他
121 | 他人
122 | 他们
123 | 以
124 | 以上
125 | 以为
126 | 以便
127 | 以免
128 | 以及
129 | 以故
130 | 以期
131 | 以来
132 | 以至
133 | 以至于
134 | 以致
135 | 们
136 | 任
137 | 任何
138 | 任凭
139 | 似的
140 | 但
141 | 但凡
142 | 但是
143 | 何
144 | 何以
145 | 何况
146 | 何处
147 | 何时
148 | 余外
149 | 作为
150 | 你
151 | 你们
152 | 使
153 | 使得
154 | 例如
155 | 依
156 | 依据
157 | 依照
158 | 便于
159 | 俺
160 | 俺们
161 | 倘
162 | 倘使
163 | 倘或
164 | 倘然
165 | 倘若
166 | 借
167 | 假使
168 | 假如
169 | 假若
170 | 傥然
171 | 像
172 | 儿
173 | 先不先
174 | 光是
175 | 全体
176 | 全部
177 | 兮
178 | 关于
179 | 其
180 | 其一
181 | 其中
182 | 其二
183 | 其他
184 | 其余
185 | 其它
186 | 其次
187 | 具体地说
188 | 具体说来
189 | 兼之
190 | 内
191 | 再
192 | 再其次
193 | 再则
194 | 再有
195 | 再者
196 | 再者说
197 | 再说
198 | 冒
199 | 冲
200 | 况且
201 | 几
202 | 几时
203 | 凡
204 | 凡是
205 | 凭
206 | 凭借
207 | 出于
208 | 出来
209 | 分别
210 | 则
211 | 则甚
212 | 别
213 | 别人
214 | 别处
215 | 别是
216 | 别的
217 | 别管
218 | 别说
219 | 到
220 | 前后
221 | 前此
222 | 前者
223 | 加之
224 | 加以
225 | 即
226 | 即令
227 | 即使
228 | 即便
229 | 即如
230 | 即或
231 | 即若
232 | 却
233 | 去
234 | 又
235 | 又及
236 | 及
237 | 及其
238 | 及至
239 | 反之
240 | 反而
241 | 反过来
242 | 反过来说
243 | 受到
244 | 另
245 | 另一方面
246 | 另外
247 | 另悉
248 | 只
249 | 只当
250 | 只怕
251 | 只是
252 | 只有
253 | 只消
254 | 只要
255 | 只限
256 | 叫
257 | 叮咚
258 | 可
259 | 可以
260 | 可是
261 | 可见
262 | 各
263 | 各个
264 | 各位
265 | 各种
266 | 各自
267 | 同
268 | 同时
269 | 后
270 | 后者
271 | 向
272 | 向使
273 | 向着
274 | 吓
275 | 吗
276 | 否则
277 | 吧
278 | 吧哒
279 | 吱
280 | 呀
281 | 呃
282 | 呕
283 | 呗
284 | 呜
285 | 呜呼
286 | 呢
287 | 呵
288 | 呵呵
289 | 呸
290 | 呼哧
291 | 咋
292 | 和
293 | 咚
294 | 咦
295 | 咧
296 | 咱
297 | 咱们
298 | 咳
299 | 哇
300 | 哈
301 | 哈哈
302 | 哉
303 | 哎
304 | 哎呀
305 | 哎哟
306 | 哗
307 | 哟
308 | 哦
309 | 哩
310 | 哪
311 | 哪个
312 | 哪些
313 | 哪儿
314 | 哪天
315 | 哪年
316 | 哪怕
317 | 哪样
318 | 哪边
319 | 哪里
320 | 哼
321 | 哼唷
322 | 唉
323 | 唯有
324 | 啊
325 | 啐
326 | 啥
327 | 啦
328 | 啪达
329 | 啷当
330 | 喂
331 | 喏
332 | 喔唷
333 | 喽
334 | 嗡
335 | 嗡嗡
336 | 嗬
337 | 嗯
338 | 嗳
339 | 嘎
340 | 嘎登
341 | 嘘
342 | 嘛
343 | 嘻
344 | 嘿
345 | 嘿嘿
346 | 因
347 | 因为
348 | 因了
349 | 因此
350 | 因着
351 | 因而
352 | 固然
353 | 在
354 | 在下
355 | 在于
356 | 地
357 | 基于
358 | 处在
359 | 多
360 | 多么
361 | 多少
362 | 大
363 | 大家
364 | 她
365 | 她们
366 | 好
367 | 如
368 | 如上
369 | 如上所述
370 | 如下
371 | 如何
372 | 如其
373 | 如同
374 | 如是
375 | 如果
376 | 如此
377 | 如若
378 | 始而
379 | 孰料
380 | 孰知
381 | 宁
382 | 宁可
383 | 宁愿
384 | 宁肯
385 | 它
386 | 它们
387 | 对
388 | 对于
389 | 对待
390 | 对方
391 | 对比
392 | 将
393 | 小
394 | 尔
395 | 尔后
396 | 尔尔
397 | 尚且
398 | 就
399 | 就是
400 | 就是了
401 | 就是说
402 | 就算
403 | 就要
404 | 尽
405 | 尽管
406 | 尽管如此
407 | 岂但
408 | 己
409 | 已
410 | 已矣
411 | 巴
412 | 巴巴
413 | 并
414 | 并且
415 | 并非
416 | 庶乎
417 | 庶几
418 | 开外
419 | 开始
420 | 归
421 | 归齐
422 | 当
423 | 当地
424 | 当然
425 | 当着
426 | 彼
427 | 彼时
428 | 彼此
429 | 往
430 | 待
431 | 很
432 | 得
433 | 得了
434 | 怎
435 | 怎么
436 | 怎么办
437 | 怎么样
438 | 怎奈
439 | 怎样
440 | 总之
441 | 总的来看
442 | 总的来说
443 | 总的说来
444 | 总而言之
445 | 恰恰相反
446 | 您
447 | 惟其
448 | 慢说
449 | 我
450 | 我们
451 | 或
452 | 或则
453 | 或是
454 | 或曰
455 | 或者
456 | 截至
457 | 所
458 | 所以
459 | 所在
460 | 所幸
461 | 所有
462 | 才
463 | 才能
464 | 打
465 | 打从
466 | 把
467 | 抑或
468 | 拿
469 | 按
470 | 按照
471 | 换句话说
472 | 换言之
473 | 据
474 | 据此
475 | 接着
476 | 故
477 | 故此
478 | 故而
479 | 旁人
480 | 无
481 | 无宁
482 | 无论
483 | 既
484 | 既往
485 | 既是
486 | 既然
487 | 时候
488 | 是
489 | 是以
490 | 是的
491 | 曾
492 | 替
493 | 替代
494 | 最
495 | 有
496 | 有些
497 | 有关
498 | 有及
499 | 有时
500 | 有的
501 | 望
502 | 朝
503 | 朝着
504 | 本
505 | 本人
506 | 本地
507 | 本着
508 | 本身
509 | 来
510 | 来着
511 | 来自
512 | 来说
513 | 极了
514 | 果然
515 | 果真
516 | 某
517 | 某个
518 | 某些
519 | 某某
520 | 根据
521 | 欤
522 | 正值
523 | 正如
524 | 正巧
525 | 正是
526 | 此
527 | 此地
528 | 此处
529 | 此外
530 | 此时
531 | 此次
532 | 此间
533 | 毋宁
534 | 每
535 | 每当
536 | 比
537 | 比及
538 | 比如
539 | 比方
540 | 没奈何
541 | 沿
542 | 沿着
543 | 漫说
544 | 焉
545 | 然则
546 | 然后
547 | 然而
548 | 照
549 | 照着
550 | 犹且
551 | 犹自
552 | 甚且
553 | 甚么
554 | 甚或
555 | 甚而
556 | 甚至
557 | 甚至于
558 | 用
559 | 用来
560 | 由
561 | 由于
562 | 由是
563 | 由此
564 | 由此可见
565 | 的
566 | 的确
567 | 的话
568 | 直到
569 | 相对而言
570 | 省得
571 | 看
572 | 眨眼
573 | 着
574 | 着呢
575 | 矣
576 | 矣乎
577 | 矣哉
578 | 离
579 | 竟而
580 | 第
581 | 等
582 | 等到
583 | 等等
584 | 简言之
585 | 管
586 | 类如
587 | 紧接着
588 | 纵
589 | 纵令
590 | 纵使
591 | 纵然
592 | 经
593 | 经过
594 | 结果
595 | 给
596 | 继之
597 | 继后
598 | 继而
599 | 综上所述
600 | 罢了
601 | 者
602 | 而
603 | 而且
604 | 而况
605 | 而后
606 | 而外
607 | 而已
608 | 而是
609 | 而言
610 | 能
611 | 能否
612 | 腾
613 | 自
614 | 自个儿
615 | 自从
616 | 自各儿
617 | 自后
618 | 自家
619 | 自己
620 | 自打
621 | 自身
622 | 至
623 | 至于
624 | 至今
625 | 至若
626 | 致
627 | 般的
628 | 若
629 | 若夫
630 | 若是
631 | 若果 
632 | 若非
633 | 莫不然
634 | 莫如
635 | 莫若
636 | 虽
637 | 虽则
638 | 虽然
639 | 虽说
640 | 被
641 | 要
642 | 要不
643 | 要不是
644 | 要不然
645 | 要么
646 | 要是
647 | 譬喻
648 | 譬如
649 | 让
650 | 许多
651 | 论
652 | 设使
653 | 设或
654 | 设若
655 | 诚如
656 | 诚然
657 | 该
658 | 说来
659 | 诸
660 | 诸位
661 | 诸如
662 | 谁
663 | 谁人
664 | 谁料
665 | 谁知
666 | 贼死
667 | 赖以
668 | 赶
669 | 起
670 | 起见
671 | 趁
672 | 趁着
673 | 越是
674 | 距
675 | 跟
676 | 较
677 | 较之
678 | 边
679 | 过
680 | 还
681 | 还是
682 | 还有
683 | 还要
684 | 这
685 | 这一来
686 | 这个
687 | 这么
688 | 这么些
689 | 这么样
690 | 这么点儿
691 | 这些
692 | 这会儿
693 | 这儿
694 | 这就是说
695 | 这时
696 | 这样
697 | 这次
698 | 这般
699 | 这边
700 | 这里
701 | 进而
702 | 连
703 | 连同
704 | 逐步
705 | 通过
706 | 遵循
707 | 遵照
708 | 那
709 | 那个
710 | 那么
711 | 那么些
712 | 那么样
713 | 那些
714 | 那会儿
715 | 那儿
716 | 那时
717 | 那样
718 | 那般
719 | 那边
720 | 那里
721 | 都
722 | 鄙人
723 | 鉴于
724 | 针对
725 | 阿
726 | 除
727 | 除了
728 | 除外
729 | 除开
730 | 除此之外
731 | 除非
732 | 随
733 | 随后
734 | 随时
735 | 随着
736 | 难道说
737 | 非但
738 | 非徒
739 | 非特
740 | 非独
741 | 靠
742 | 顺
743 | 顺着
744 | 首先
745 | 没有一个！
746 | ，
747 | ：
748 | ；
749 | ？
750 | 大学论文博士硕士浙江中国北京期年首都大连 nan


--------------------------------------------------------------------------------
/dependence/区块链技术发展现状与展望_袁勇.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/区块链技术发展现状与展望_袁勇.pdf


--------------------------------------------------------------------------------
/dependence/区块链技术发展现状与展望_袁勇.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/区块链技术发展现状与展望_袁勇.txt


--------------------------------------------------------------------------------
/dependence/小王子.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/小王子.txt


--------------------------------------------------------------------------------
/dependence/知网数据.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/知网数据.xls


--------------------------------------------------------------------------------
/main/CNKI.py:
--------------------------------------------------------------------------------
 1 | #初始的爬虫案例，主要参照于这个来做
 2 | #coding:utf-8
 3 | 
 4 | import requests
 5 | from bs4 import BeautifulSoup as bs
 6 | import time
 7 | import xlwt
 8 | import openpyxl
 9 | import re
10 | 
11 | 
12 | def pagenext():
13 |     base_url = 'http://search.cnki.com.cn/search.aspx?q=%E6%96%B0%E9%97%BB%E4%BC%A0%E6%92%AD&rank=relevant&cluster=Type&val=I141&p='
14 |     L = range(0, 840)  # 最尾巴的数不计入
15 |     All_Page = []
16 |     for i in L[::10]:
17 |         next_url = base_url + str(i)
18 |         # print(next_url)
19 |         print("第 ", i / 10 + 1, " 页的数据")
20 |         page_text = spider(next_url)
21 |         time.sleep(10)
22 |         for page in page_text:
23 |             All_Page.append(page)
24 |     print(All_Page)
25 |     write_excel('xlsx论文筛选.xlsx', 'info', All_Page)
26 | 
27 | 
28 | def datespider(date_url):
29 |     # 因为跳转的链接类型不一样，所以我们要判断这两种链接是哪一种并且选择不一样的解析find方法
30 |     response_try = requests.get(date_url, {
31 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'})
32 |     # print(response_try.text)
33 |     response_tree = bs(response_try.text, 'html.parser')
34 |     # 根据两个不同的链接返回不一样的值
35 |     if re.match(r'http://www.cnki.com.cn/Article/[0-9a-zA-Z\_]+', date_url):
36 |         res_date = response_tree.find("font", {"color": "#0080ff"})
37 |         if res_date == None:
38 |             response_date = None
39 |         else:
40 |             response_date = res_date.get_text().replace('\r', '').replace('\n', '')
41 |     else:
42 |         response_date = response_tree.find("title").get_text()[-8:]
43 |     return response_date
44 | 
45 | 
46 | def write_excel(path, sheet_name, text_info):
47 |     index = len(text_info)
48 |     workbook = openpyxl.Workbook()
49 |     sheet = workbook.active
50 |     sheet.title = sheet_name
51 |     for i in range(0, index):
52 |         for j in range(len(text_info[i])):
53 |             sheet.cell(row=i + 1, column=j + 1, value=str(text_info[i][j]))
54 |     workbook.save(path)
55 |     print("xlsx格式表格写入数据成功！")
56 | 
57 | 
58 | def spider(url):
59 |     response = requests.get(url, {
60 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'})
61 |     res = response.content
62 |     html = str(res, 'utf-8')
63 |     html_tree = bs(html, 'lxml')
64 |     # 找打h3标签下的内容
65 |     html_text = html_tree.find_all("h3")
66 |     All_text = []
67 |     # 隔一个才是文章的标题
68 |     for text in html_text[1:-2:]:
69 |         one_text = []
70 |         text_title = text.get_text().replace('\xa0', '').replace('\n', '')  # 得到论文的标题
71 |         # print(text.get_text())
72 |         text_url = text.find('a')['href']  # 选取了当前文章的链接
73 |         # 用正则表达式匹配我们需要的链接
74 |         if re.match(r"""http://youxian.cnki.com.cn/yxdetail.aspx\?filename=[0-9a-zA-Z]+&dbname=[a-zA-Z]+""",
75 |                     text_url) or re.match(r'http://www.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url):
76 |             # print(text.find('a')['href'])
77 |             text_date = datespider(text_url)
78 |             one_text.append(text.get_text().replace('\xa0', '').replace('\n', ''))  # text.get_text是得到文章的标题
79 |             if text_date == None:
80 |                 one_text.append(None)
81 |             else:
82 |                 if int(text_date[:4]) >= 2014:
83 |                     one_text.append(text_date.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', ''))
84 |                 else:
85 |                     continue
86 |             All_text.append(one_text)
87 |     # print(text.find('a')['href'])
88 | 
89 |     # print(All_text)
90 |     return All_text
91 | 
92 | 
93 | # write_excel(All_text)
94 | 
95 | 
96 | if __name__ == '__main__':
97 |     pagenext()


--------------------------------------------------------------------------------
/main/CNKI2.py:
--------------------------------------------------------------------------------
  1 | #coding:utf-8
  2 | #author: moyuweiqing
  3 | #主爬虫，数据用这个来爬取的
  4 | 
  5 | #爬虫须知：
  6 | #  1、运行前请配置包 requests、bs4、xlutils、my_fake_useragent
  7 | #  2、Excel 文件请先在同一级目录新建好，代码里的名字：知网-区块链.xls（不能是xlsx后缀）
  8 | #  3、目前测试 只能爬取18页的数据，到19页就会失败，好像有个上限，没想到它是怎么识别的
  9 | 
 10 | 
 11 | import requests #爬取IP端口和
 12 | from bs4 import BeautifulSoup as bs #bs4解析库，用来解析网页
 13 | import time
 14 | import openpyxl #对Excel的操作
 15 | import re   #对字符串的操作
 16 | import xlrd #xls文件的读
 17 | import xlwt #xls文件的写
 18 | from xlutils.copy import copy#修改（追加写入）
 19 | from my_fake_useragent import UserAgent #这个库用来做反爬虫的
 20 | #这库用来随机生成user_agent 在这个爬虫中好像没必要 一样会循环重定向
 21 | 
 22 | def pagenext():
 23 |     #最开始的链接 最后面 'p=' 添加你要的页数 就能去其他页
 24 |     base_url = 'http://search.cnki.com.cn/Search.aspx?q=%e5%8c%ba%e5%9d%97%e9%93%be&rank=relevant&cluster=all&val=&p='
 25 |     L = range(0, 450) #修改这里可以改变获取的数量 不要太多 不然跑很久    4500就是300页了
 26 |     # All_Page = []
 27 |     for i in L[::15]: #15条是一页
 28 |         All_Page = []
 29 |         next_url = base_url + str(i)#配置下一页的url，每15个数据一页
 30 |         print(next_url)
 31 |         print(i / 15 + 1, " 页的数据")
 32 |         page_text = spider(next_url)      #跑第*页的爬虫 获取那一页的数据
 33 |         time.sleep(10)        #休息一会 防被网站 ban
 34 |         write_excel('xlsx论文筛选.xls',i / 15 + 1, page_text)  #写进Excel
 35 | 
 36 | #进入了文章的具体ulr
 37 | def datespider(date_url):
 38 |     #设置一下 UserAgent 突破反扒
 39 |     response_try = requests.get(date_url, UserAgent().random())
 40 |     # 用BeautifulSoup框架转化
 41 |     response_tree = bs(response_try.text, 'html.parser')
 42 |     if(response_tree==None):
 43 |         return []
 44 |     else:
 45 |         # 在对应位置 匹配需要的信息
 46 |         res_date = response_tree.find("font", {"color": "#0080ff"})
 47 |         res_name = response_tree.find("div", {"style": "text-align:center; width:740px; height:30px;"})
 48 |         res_msg = response_tree.find("div", {"style": "text-align:left;"})
 49 | 
 50 |         #时间
 51 |         if res_date == None:
 52 |             response_date = None
 53 |         else:
 54 |             response_date = res_date.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '')
 55 |         #作者
 56 |         if res_name == None:
 57 |             response_name = None
 58 |         else:
 59 |             response_name = res_name.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '')
 60 |         #其他信息
 61 |         if res_msg == None:
 62 |             res_msg = None
 63 |         else:
 64 |             # 去除不想要的东西
 65 |             response_msg = res_msg.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t','')\
 66 |                 .replace('】', '').replace('学位授予单位：', '').replace('学位级别：', '').replace('作者单位：', '').replace('学位授予年份：','').replace('分类号：', '')
 67 |             #用“【”作为分割界限，将response_msg字符串 划分为 response_point列表
 68 |             response_point = response_msg.split("【")
 69 |         #插入列表 并返回
 70 |         response_All = []
 71 |         response_All.append(response_date)
 72 |         response_All.append(response_name)
 73 |         #列表拼接
 74 |         #列表拼接
 75 |         for item in range(1,len(response_point)):
 76 |             response_All.append(response_point[item])
 77 | 
 78 |         return response_All
 79 | 
 80 | #写进表格里面去
 81 | def write_excel(path, page, text_info):
 82 | 
 83 |     index = len(text_info)
 84 |     # workbook = openpyxl.Workbook()
 85 |     workbook = xlrd.open_workbook(path)#打开
 86 |     sheets = workbook.sheet_names()
 87 |     sheet = workbook.sheet_by_name(sheets[0])  # 获取工作簿中所有表格中的的第一个表格
 88 |     rows_old = sheet.nrows  # 获取表格中已存在的数据的行数
 89 |     new_workbook = copy(workbook)  # 将xlrd对象拷贝转化为xlwt对象
 90 |     new_worksheet = new_workbook.get_sheet(0)  # 获取转化后工作簿中的第一个表格
 91 |     # sheet.title = sheet_name
 92 |     for i in range(0, index):
 93 |         for j in range(len(text_info[i])):
 94 |             new_worksheet.write(i + rows_old,j,str(text_info[i][j]))
 95 |     new_workbook.save(path)
 96 | 
 97 |     print(page," 页写入数据成功！")
 98 | 
 99 | def spider(url):
100 |     response = requests.get(url, {'User-Agent':UserAgent().random()})#用来突破反爬虫
101 |     res = response.content
102 |     html = str(res, 'utf-8')#用来获取html页面
103 |     html_tree = bs(html, 'lxml')
104 |     # 找class = wz_content标签下的内容
105 |     html_text = html_tree.find_all("div", class_="wz_content")
106 |     All_text = []
107 |     for text in html_text:
108 |         one_text = []
109 |         text_url = text.find('a')['href']  # 选取了当前文章的链接
110 |         text_title = text.find('h3') #标题
111 |         text_cout = text.find("span", class_="count")
112 |         #舍弃http://youxian.cnki链接 打不开的 没数据 可能需要登陆才有数据 之后再调试吧  出现概率1/20
113 |         if re.match(r'http://www.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url) or re.match(r'http://cdmd.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url):
114 |             # 调用函数 进去各个文章的具体网站 找其他信息
115 |             text_all = datespider(text_url)
116 |             one_text.append(text_title.get_text().replace('\xa0', '').replace('\n', ''))  # 得到文章的标题
117 |             one_text.append(text_cout.get_text().replace('\xa0', '').replace('\n', '').replace('下载次数', '').replace('被引次数', '').replace('（', '').replace('）', ''))  # 把操作次数 放进列表
118 |             for item in text_all:#将datespider函数返回的信息，文章的 作者、单位、学位 、分类号，插入列表
119 |                 one_text.append(item.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '').replace('年', ''))
120 |             one_text.append(text_url)  # 把文章的链接 放进列表
121 | 
122 |             All_text.append(one_text)
123 |     return All_text
124 | 
125 | if __name__ == '__main__':
126 |     pagenext()


--------------------------------------------------------------------------------
/main/__pycache__/draw_word_cloud.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/main/__pycache__/draw_word_cloud.cpython-37.pyc


--------------------------------------------------------------------------------
/main/__pycache__/jieba_analysis.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/main/__pycache__/jieba_analysis.cpython-37.pyc


--------------------------------------------------------------------------------
/main/__pycache__/network.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/main/__pycache__/network.cpython-37.pyc


--------------------------------------------------------------------------------
/main/co-citation_network.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #共引文献网络分析，和共词网络分析差不多
 4 | 
 5 | import os
 6 | import networkx as nx#复杂网络分析库
 7 | import network
 8 | import pandas as pd
 9 | import numpy as np
10 | import matplotlib.pyplot as plt
11 | 
12 | path = os.path.abspath('..')
13 | xls = pd.ExcelFile(path+'\dependence\知网数据.xls')#读取xls表格
14 | readf = pd.read_excel(xls,'Sheet1')#读取第一个表
15 | frame = readf[readf['共引文献'].notnull()]#如果关键词那一列非空，读取所有数据
16 | 
17 | keywords = network.seperate(frame, '共引文献', ' ', ';')#关键词列表，里面记录了所有的关键词，没有重复
18 | # for keyword in frame['共引文献']:#分隔关键词，并加入到列表中，去重
19 | #     if ',' in keyword:
20 | #         temp = keyword.split(',')
21 | #         for x in temp:
22 | #             if x not in keywords:
23 | #                 keywords.append(x)
24 | #     elif ';' in keyword:
25 | #         temp = keyword.split(';')
26 | #         for x in temp:
27 | #             if x not in keywords:
28 | #                 keywords.append(x)
29 | #     else:
30 | #         if keyword not in keywords:
31 | #             keywords.append(keyword)
32 | 
33 | df = pd.DataFrame(index=frame['序号'],columns=keywords)    #建立以标题为行，关键词为列的DataFrame矩阵
34 | df.index.name='序号'
35 | df.columns.name='共引文献'
36 | #
37 | # #将这一篇文献所拥有的关键词在矩阵中标记为1
38 | # for row in frame['序号']:
39 | #     for keyword in df.columns:
40 | #         if keyword in frame.loc[row]['共引文献']:
41 | #             df.loc[row][keyword] = 1
42 | # df = df.fillna(0)#填充空值
43 | 
44 | df = network.fill(frame, '序号', '共引文献', df)
45 | 
46 | #df为存在矩阵，dataframe类型
47 | #data为关联度，矩阵类型
48 | #df2位关联度矩阵，dataframe类型
49 | 
50 | data = df.values.T.dot(df.values)#建立关键词之间的相关性，边的长度为相关性，在这里是将两个df点乘，df.values是按行读取值
51 | df2 = pd.DataFrame(data = data,index=keywords,columns=keywords)#建立关键词之间的相关性矩阵，以关联度作为值传入
52 | 
53 | #设置阈值
54 | value = lambda x : x * 30 if x > 0 else 0
55 | df2 = df2.applymap(value)
56 | 
57 | net = nx.Graph(df2)#创建无向图，以关键词为节点，相关性为边
58 | 
59 | dele, net = network.remove(keywords, net)
60 | 
61 | de=dict(net.degree())#建立字典，关键字为索引，度（关联情况）为值
62 | pos = nx.spring_layout(net)#四种建图模式，spectral,shell,circular,spring，spring是可以看的了
63 | 
64 | array = np.zeros(len(keywords))#建立以度为值的一维矩阵
65 | arg = np.argsort(-np.array(array))
66 | labels = {}#记录关键词
67 | for index in range(0, len(keywords)):
68 |     labels[keywords[arg[index]]] = keywords[arg[index]]
69 | 
70 | de2 = [de[v]*10 for v in sorted(de.keys(), reverse=False)]#应该是节点的大小，尺寸调整合适
71 | 
72 | plt.figure(figsize=(50, 50))
73 | nx.draw_networkx_labels(net,pos,labels, font_size=40,font_color='black',font_family ='YouYuan')#写标记
74 | nx.draw_networkx(net, pos, node_size=de2, with_labels = False, node_color='#A52A2A', linewidths=None, width=2.0, edge_color ='#858585')


--------------------------------------------------------------------------------
/main/co-word_network.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #共词网络可视化
 4 | 
 5 | import os
 6 | import networkx as nx#复杂网络分析库
 7 | import network
 8 | import pandas as pd
 9 | import numpy as np
10 | import matplotlib.pyplot as plt
11 | 
12 | path = os.path.abspath('..')
13 | xls = pd.ExcelFile(path+'\dependence\知网数据.xls')#读取xls表格
14 | readf = pd.read_excel(xls,'Sheet1')#读取第一个表
15 | frame = readf[readf['关键词'].notnull()]#如果关键词那一列非空，读取所有数据
16 | 
17 | keywords = network.seperate(frame, '关键词', ' ', ';')#关键词列表，里面记录了所有的关键词，没有重复
18 | # for keyword in frame['关键词']:#分隔关键词，并加入到列表中，去重
19 | #     if ' ' in keyword:
20 | #         temp = keyword.split(' ')
21 | #         for x in temp:
22 | #             if x not in keywords:
23 | #                 keywords.append(x)
24 | #     elif ';' in keyword:
25 | #         temp = keyword.split(';')
26 | #         for x in temp:
27 | #             if x not in keywords:
28 | #                 keywords.append(x)
29 | #     else:
30 | #         if keyword not in keywords:
31 | #             keywords.append(keyword)
32 | 
33 | #建立以标题为行，关键词为列的DataFrame矩阵
34 | df = pd.DataFrame(index=frame['序号'],columns=keywords)
35 | df.index.name='序号'
36 | df.columns.name='关键词'
37 | 
38 | # for row in frame['序号']:#将这一篇文献所拥有的关键词在矩阵中标记为1
39 | #     for keyword in df.columns:
40 | #         if keyword in frame.loc[row]['关键词']:
41 | #             #print(keyword)
42 | #             df.loc[row][keyword] = 1
43 | #
44 | # #df为存在矩阵，dataframe类型
45 | # #data为关联度，矩阵类型
46 | # #df2位关联度矩阵，dataframe类型
47 | #
48 | # df = df.fillna(0)#填充空值
49 | df = network.fill(frame, '序号', '关键词', df)
50 | 
51 | data = df.values.T.dot(df.values)#建立关键词之间的相关性，边的长度为相关性，在这里是将两个df点乘，df.values是按行读取值
52 | 
53 | df2 = pd.DataFrame(data = data,index=keywords,columns=keywords)#建立关键词之间的相关性矩阵
54 | 
55 | #设置阈值
56 | valve = lambda x : x if x > 32 else 0
57 | df2 = df2.applymap(valve)
58 | 
59 | net = nx.Graph(df2)#创建无向图，以关键词为节点，相关性为边
60 | 
61 | # def check(x,net):
62 | #     for i in range(0,keywords.index(x)):
63 | #         if nx.has_path(net,x,keywords[i]):
64 | #             return True
65 | #     for j in range(keywords.index(x)+1,len(keywords)):
66 | #         if nx.has_path(net,x,keywords[j]):
67 | #             return True
68 | #     return False
69 | #
70 | # #去除无连接节点
71 | # dele=[]
72 | # for i in range(len(keywords)):
73 | #     if not check(keywords[i],net):
74 | #         if keywords[i] not in dele:
75 | #             dele.append(keywords[i])
76 | # net.remove_nodes_from(dele)
77 | dele, net = network.remove(keywords, net)
78 | 
79 | de=dict(net.degree())#建立字典，关键字为索引，度（关联情况）为值
80 | pos = nx.spring_layout(net)#四种建图模式，spectral,shell,circular,spring，spring是可以看的了
81 | keywords = [i for i in keywords if i not in dele]#有边的关键词
82 | 
83 | array = np.zeros(len(keywords))#建立以度为值的一维矩阵
84 | arg = np.argsort(-np.array(array))
85 | labels = {}#记录关键词
86 | for index in range(0, len(keywords)):
87 |     labels[keywords[arg[index]]] = keywords[arg[index]]
88 | 
89 | de2 = [de[v]*60 for v in sorted(de.keys(), reverse=False)]#应该是节点的大小，尺寸调整合适
90 | 
91 | plt.figure(figsize=(50, 50))
92 | nx.draw_networkx_labels(net,pos,labels, font_size=40,font_color='black',font_family ='YouYuan')
93 | nx.draw_networkx(net, pos, node_size=de2, with_labels = False, node_color='#A52A2A', linewidths=None, width=2.0, edge_color ='#858585')


--------------------------------------------------------------------------------
/main/cooperation_network.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #合作网络可视化
 4 | 
 5 | import os
 6 | import networkx as nx
 7 | import pandas as pd
 8 | import numpy as np
 9 | import matplotlib.pyplot as plt  #导入所需要的库
10 | 
11 | path = os.path.abspath('..')
12 | xlsx = pd.ExcelFile(path+'\dependence\知网数据.xls') #读取数据文件
13 | readf = pd.read_excel(xlsx,'Sheet1')
14 | 
15 | #数据进行预处理，将读入的作者以及相关作者信息转换为列表形式
16 | for i in range(len(readf['作者'])):
17 |     if readf['作者'][i] != '[]':
18 |         readf['作者'][i] = eval(readf['作者'][i])
19 |     else:
20 |         readf['作者'][i] = np.nan
21 | for j in range(len(readf['相关作者'])):
22 |     if readf['相关作者'][j] is not np.nan:
23 |         readf['相关作者'][j] = eval(readf['相关作者'][j])
24 | 
25 | #数据预处理，将作者为空的数据去除
26 | frame = readf[readf['作者'].notnull()]
27 | frame.index = frame['题目']
28 | 
29 | #获取作者以及相关作者，将其整合到一个列表中
30 | all_authors = []
31 | for authors in frame['作者']:
32 |     for author in authors:
33 |         if author not in all_authors:
34 |             all_authors.append(author)
35 | for r_authors in frame['相关作者']:
36 |     if r_authors is not np.nan:
37 |         for r_author in r_authors:
38 |             if r_author not in all_authors:
39 |                 all_authors.append(r_author)
40 | 
41 | #构建出现矩阵
42 | df = pd.DataFrame(index=frame['题目'],columns=all_authors)
43 | df.index.name='题目'
44 | df.columns.name='作者'
45 | for title in frame['题目']:
46 |     for i in frame.loc[title]['作者']:
47 |         df.loc[title,i] = 1
48 |     if frame.loc[title]['相关作者'] is not np.nan:
49 |         for j in frame.loc[title]['相关作者']:
50 |             df.loc[title,j] = 1
51 | df=df.fillna(0)
52 | 
53 | #将出现矩阵转换为共现矩阵
54 | data = df.values.T.dot(df.values)
55 | df2 = pd.DataFrame(data = data,index=all_authors,columns=all_authors)
56 | 
57 | #设置阀门，排除关联度小的点
58 | valve = lambda x : x if x > 32 else 0
59 | df2 = df2.applymap(valve)
60 | 
61 | #构建共现网络
62 | net = nx.Graph(df2)
63 | 
64 | #过滤关联度为0的节点
65 | def check(x,net):
66 |     for i in range(0,all_authors.index(x)):
67 |         if nx.has_path(net,x,all_authors[i]):
68 |             return True
69 |     for j in range(all_authors.index(x)+1,len(all_authors)):
70 |         if nx.has_path(net,x,all_authors[j]):
71 |             return True
72 |     return False
73 | dele=[]
74 | for i in range(len(all_authors)):
75 |     if not check(all_authors[i],net):
76 |         if all_authors[i] not in dele:
77 |             dele.append(all_authors[i])
78 | net.remove_nodes_from(dele)
79 | 
80 | #设置每个节点的大小比例为它们度的大小比例，并且显示每个节点的标签
81 | de=dict(net.degree())
82 | pos = nx.spring_layout(net)
83 | all_authors = [i for i in all_authors if i not in dele]
84 | array = np.zeros(len(all_authors))
85 | j = 0
86 | for i in de.keys():
87 |     array[j] = de[i]
88 |     j+=1
89 | arg = np.argsort(-np.array(array))
90 | labels = {}
91 | for index in range(len(all_authors)):
92 |     labels[all_authors[arg[index]]] = all_authors[arg[index]]
93 | de2 = [de[v]*20 for v in sorted(de.keys(), reverse=False)]
94 | 
95 | #对网路进行可视化
96 | plt.figure(figsize=(50, 50))
97 | nx.draw_networkx_labels(net,pos,labels, font_size=40,font_color='black',font_family ='YouYuan')
98 | nx.draw_networkx(net, pos, node_size=de2, with_labels = False, node_color='#A52A2A', linewidths=None, width=1.0, edge_color ='#858585')
99 | 


--------------------------------------------------------------------------------
/main/draw_word_cloud.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #使用wordcloud库画词云
 4 | 
 5 | import os
 6 | import jieba
 7 | import imageio as ima	#读入图片文件
 8 | from wordcloud import WordCloud
 9 | 
10 | def drawWordCloud(words, title, savepath='./results'): #定义一个词云绘制函数，通过词频绘制词云图并写出到特定目录
11 |    path = os.path.abspath('..')
12 |    if not os.path.exists(savepath):
13 |       os.mkdir(savepath)
14 |    wc = WordCloud(font_path=path+'\dependence\simkai.ttf', background_color='white', max_words=2000, width=1920, height=1080, margin=5, mask=ima.imread(path+'\dependence\mask.png'))#使用原先准备好的一张照片作为背景图
15 |    wc.generate_from_frequencies(words)
16 |    wc.to_file(os.path.join(savepath, title+'.png'))
17 | 
18 | def statistics(texts, stopwords):  #使用jieba库来进行分词，并统计词语出现次数
19 |    words_dict = {}
20 |    for text in texts:
21 |       temp = jieba.cut(text)
22 |       for t in temp:
23 |          if t in stopwords:
24 |             continue
25 |          if t in words_dict.keys():
26 |             words_dict[t] += 1
27 |          else:
28 |             words_dict[t] = 1
29 |    return words_dict


--------------------------------------------------------------------------------
/main/jieba_analysis.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #使用jieba库进行分词、统计词频、去除无关词
 4 | 
 5 | import jieba
 6 | 
 7 | #统计所有存在的分词
 8 | def calculateAllWords(readfile):
 9 |     all_word = []   #记录所有分词
10 |     for row in range(0, len(readfile)):
11 |         temp = jieba.cut(readfile[row])
12 |         for i in temp:
13 |             if i in all_word:
14 |                 continue
15 |             else:
16 |                 all_word.append(i)
17 |     #all_word.pop()
18 |     return all_word
19 | 
20 | #统计分词的出现数量
21 | def calculateNumOfEachWord(readfile, all_word):
22 |     dic = {}  # 记录分词的出现数量
23 |     for i in all_word:
24 |         dic[i] = 0
25 |     for row in range(0, len(readfile)):
26 |         temp = jieba.cut(readfile[row])
27 |         for i in temp:
28 |             dic[i] = dic[i] + 1
29 |     return dic
30 | 
31 | # 去除无关词
32 | def removeIrreleventWords(stopwords, dic):
33 |     temp_dic = dic.copy()
34 |     for i in temp_dic:
35 |         if i in stopwords:
36 |             dic.pop(i)
37 |     return dic
38 | 
39 | #对关键词进行排序
40 | def sortKeyWords(keyword,num):
41 |     dic_sorted = dict(sorted(keyword.items(), key=lambda x: x[1], reverse=True))
42 |     dic_num = {}
43 |     for i in range(0, num):
44 |         dic_num[list(dic_sorted.keys())[i]] = list(dic_sorted.values())[i]
45 |     return dic_num


--------------------------------------------------------------------------------
/main/keywords_by_jieba.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #使用jieba库来进行关键词的提取
 4 | 
 5 | import os
 6 | import jieba
 7 | import jieba.analyse
 8 | 
 9 | path = os.path.abspath('..')
10 | text = open(path + '\dependence\区块链技术发展现状与展望_袁勇.txt')
11 | text = text.read()
12 | s1 = text.replace('\n', '').replace(' ', '')#去除换行
13 | 
14 | fenci_text = jieba.cut(s1)
15 | stopwords = {}.fromkeys([ line.rstrip() for line in open(path + '\dependence\stopwords.txt', encoding = "utf-8") ])
16 | final = ""
17 | for word in fenci_text:
18 |     if word not in stopwords:
19 |         if (word != "。" and word != "，") :
20 |             final = final + " " + word
21 | 
22 | keywords = jieba.analyse.extract_tags(final, topK = 20, withWeight = True, allowPOS = ())
23 | print(keywords)


--------------------------------------------------------------------------------
/main/keywords_by_snownlp.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #使用SnowNlp来提取关键词
 4 | 
 5 | import os
 6 | from snownlp import SnowNLP
 7 | 
 8 | path = os.path.abspath('..')
 9 | text = open(path + '\dependence\区块链技术发展现状与展望_袁勇.txt').read().replace('\n', '').replace(' ', '')
10 | 
11 | analysis_result = SnowNLP(text)
12 | stopwords = {}.fromkeys([ line.rstrip() for line in open(path + '\dependence\stopwords.txt', encoding = "utf-8") ])
13 | final = ""
14 | for word in analysis_result.keywords(20):
15 |     if word not in stopwords:
16 |         if (word != "。" and word != "，") :
17 |             final = final + " " + word
18 | 
19 | print(final)


--------------------------------------------------------------------------------
/main/keywords_by_textrank4zh.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #使用textrank4zh来进行关键词的提取
 4 | 
 5 | import os
 6 | from textrank4zh import TextRank4Keyword
 7 | 
 8 | path = os.path.abspath('..')
 9 | text = open(path+'\dependence\区块链技术发展现状与展望_袁勇.txt').read().replace('\n', '').replace(' ', '')
10 | 
11 | tr4w = TextRank4Keyword()
12 | tr4w.analyze(text, lower=True)
13 | key_words = tr4w.get_keywords(20)
14 | # print(key_words)
15 | word_list = list(key_word.word for key_word in key_words)
16 | 
17 | stopwords = {}.fromkeys([ line.rstrip() for line in open(path+'\dependence\stopwords.txt', encoding = "utf-8") ])
18 | final = ""
19 | for word in word_list:
20 |     if word not in stopwords:
21 |         if (word != "。" and word != "，") :
22 |             final = final + " " + word
23 | 
24 | print(final)


--------------------------------------------------------------------------------
/main/network.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #网络
 4 | 
 5 | import networkx as nx       #复杂网络分析库
 6 | import pandas as pd
 7 | import numpy as np
 8 | import matplotlib.pyplot as plt
 9 | 
10 | 
11 | # 分隔关键词，并加入到列表中，去重，设置两个分隔符
12 | # 参数说明：（frame：pd.read_excel()之后的对象，name：需要进行分析的那一列的名称，sp1、sp2为分隔符）
13 | def seperate(frame, name, sp1, sp2):
14 |     list = []
15 |     for word in frame[name]:
16 |         if sp1 in word:
17 |             temp = word.split(sp1)
18 |             for x in temp:
19 |                 if x not in list:
20 |                     list.append(x)
21 |         elif sp2 in word:
22 |             temp = word.split(sp2)
23 |             for x in temp:
24 |                 if x not in list:
25 |                     list.append(x)
26 |         else:
27 |             if word not in list:
28 |                 list.append(word)
29 |     return list
30 | 
31 | #填充值，存在这个关键词的dataframe的位置设置为1，其余的用0来填充
32 | #参数说明：（frame：需要用来遍历的那个excel表格，index：用来进行遍历的frame的索引名字，name：用来进行遍历的frame的值，dataframe：用来写入的信息）
33 | def fill(frame, index, name, dataframe):
34 |     for row in frame[index]:  # 将这一篇文献所拥有的关键词在矩阵中标记为1
35 |         for keyword in dataframe.columns:
36 |             if keyword in frame.loc[row][name]:
37 |                 dataframe.loc[row][keyword] = 1
38 |     df = dataframe.fillna(0)  # 填充空值
39 |     return df
40 | 
41 | #检查是否有没有连接的节点
42 | #参数说明：（list:需要进行检查的列表，x：节点，net：网络）
43 | def check(list, x, net):
44 |     for i in range(0,list.index(x)):
45 |         if nx.has_path(net,x,list[i]):
46 |             return True
47 |     for j in range(list.index(x)+1,len(list)):
48 |         if nx.has_path(net,x,list[j]):
49 |             return True
50 |     return False
51 | 
52 | #去除没有连接的节点
53 | #参数说明：（list：需要进行检查的列表，net：网络）
54 | def remove(list, net):
55 |     dele = []
56 |     for i in range(len(list)):
57 |         if not check(list, list[i], net):
58 |             if list[i] not in dele:
59 |                 dele.append(list[i])
60 |     net.remove_nodes_from(dele)
61 |     return dele, net


--------------------------------------------------------------------------------
/main/pdf-to-txt.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #pdf转换成txt
 4 | 
 5 | import time,os.path,requests,re
 6 | time1=time.time()
 7 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 8 | from pdfminer.converter import PDFPageAggregator
 9 | from pdfminer.layout import LAParams,LTTextBoxHorizontal
10 | from pdfminer.pdfpage import PDFTextExtractionNotAllowed,PDFPage
11 | from pdfminer.pdfparser import PDFParser
12 | from pdfminer.pdfdocument import PDFDocument
13 | 
14 | 
15 | class CPdf2TxtManager():
16 |     def changePdfToText(self, filePath):
17 |         # 以二进制读模式打开
18 |         file = open(path, 'rb')
19 |         #用文件对象来创建一个pdf文档分析器
20 |         praser = PDFParser(file)
21 |         # 创建一个PDF文档对象存储文档结构,提供密码初始化，没有就不用传该参数
22 |         doc = PDFDocument(praser, password='')
23 |         ##检查文件是否允许文本提取
24 |         if not doc.is_extractable:
25 |             raise PDFTextExtractionNotAllowed
26 | 
27 |         # 创建PDf 资源管理器 来管理共享资源，#caching = False不缓存
28 |         rsrcmgr = PDFResourceManager(caching = False)
29 |         # 创建一个PDF设备对象
30 |         laparams = LAParams()
31 |         # 创建一个PDF页面聚合对象
32 |         device = PDFPageAggregator(rsrcmgr, laparams=laparams)
33 |         # 创建一个PDF解析器对象
34 |         interpreter = PDFPageInterpreter(rsrcmgr, device)
35 |         # 获得文档的目录（纲要）,文档没有纲要会报错
36 |         #PDF文档没有目录时会报：raise PDFNoOutlines  pdfminer.pdfdocument.PDFNoOutlines
37 |         # print(doc.get_outlines())
38 | 
39 |         # 获取page列表
40 |         print(PDFPage.get_pages(doc))
41 |         # 循环遍历列表，每次处理一个page的内容
42 |         for page in PDFPage.create_pages(doc):
43 |             interpreter.process_page(page)
44 |             # 接受该页面的LTPage对象
45 |             layout = device.get_result()
46 |             # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象
47 |             # 一般包括LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等等
48 |             for x in layout:
49 |                 if hasattr(x, "get_text"):
50 |                     fileNames = os.path.splitext(filePath)
51 |                     with open(fileNames[0] + '.txt','a+') as f:
52 |                         results = x.get_text()
53 |                         print(results)
54 |                         f.write(results.encode('gbk','ignore').decode('gbk') + '\n')
55 |                 # 如果x是水平文本对象的话
56 |                 # if (isinstance(x, LTTextBoxHorizontal)):
57 |                 #     text = re.sub(replace, '', x.get_text())
58 |                 #     if len(text) != 0:
59 |                 #         print(text)
60 | 
61 | if __name__ == '__main__':
62 |     path = os.path.abspath('..')
63 |     pdf2TxtManager = CPdf2TxtManager()
64 |     pdf2TxtManager.changePdfToText(path+'\dependence\区块链技术发展现状与展望_袁勇.pdf')
65 |     time2 = time.time()
66 |     print('ok,解析pdf结束!')
67 |     print('总共耗时：' + str(time2 - time1) + 's')


--------------------------------------------------------------------------------
/main/sentiment_analysis.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #情感分析-by-snownlp&matplotlib
 4 | 
 5 | import os
 6 | from snownlp import SnowNLP
 7 | import matplotlib.pyplot as plt
 8 | import numpy as np
 9 | import math
10 | 
11 | path = os.path.abspath('..')
12 | text = open(path + '\dependence\区块链技术发展现状与展望_袁勇.txt')
13 | text = text.read()
14 | s1 = text.replace('\n', '').replace(' ', '').replace('.', '。')#去除换行
15 | 
16 | #建立情感分析
17 | sn1 = SnowNLP(s1)
18 | sentimentslist = []
19 | for i in sn1.sentences:
20 |     j = SnowNLP(i)
21 |     sentimentslist.append(j.sentiments)
22 | 
23 | #可视化处理，使用matplotlib
24 | dic = {}
25 | for i in np.arange(0, 1, 0.02):
26 |     index = round(i, 2)
27 |     dic[index] = 0
28 | for i in sentimentslist:
29 |     temp = round(math.floor(i/0.02)*0.02, 2)
30 |     dic[temp] = dic[temp] + 1
31 | plt.hist(sentimentslist,bins=np.arange(0,1,0.02))
32 | plt.savefig(path+'\Results\sentimental_analysis（区块链）.png')


--------------------------------------------------------------------------------
/main/sentiment_analysis2.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #情感分析-by-snownlp&matplotlib
 4 | 
 5 | import os
 6 | from snownlp import SnowNLP
 7 | import matplotlib.pyplot as plt
 8 | import numpy as np
 9 | import math
10 | 
11 | path = os.path.abspath('..')
12 | text = open(path + '\dependence\小王子.txt')
13 | text = text.read()
14 | s1 = text.replace('\n', '').replace(' ', '').replace('.', '。')#去除换行
15 | 
16 | #建立情感分析
17 | sn1 = SnowNLP(s1)
18 | sentimentslist = []
19 | for i in sn1.sentences:
20 |     j = SnowNLP(i)
21 |     sentimentslist.append(j.sentiments)
22 | 
23 | #可视化处理，使用matplotlib
24 | dic = {}
25 | for i in np.arange(0, 1, 0.02):
26 |     index = round(i, 2)
27 |     dic[index] = 0
28 | for i in sentimentslist:
29 |     temp = round(math.floor(i/0.02)*0.02, 2)
30 |     dic[temp] = dic[temp] + 1
31 | plt.hist(sentimentslist,bins=np.arange(0,1,0.02))
32 | plt.savefig(path+'\Results\sentimental_analysis（小王子）.png')


--------------------------------------------------------------------------------
/main/strategy_analysis.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #战略分析，计算密度和向心度，向心度算法自己写，建立二维坐标轴
 4 | 
 5 | import os
 6 | import pandas as pd
 7 | import jieba_analysis
 8 | import jieba
 9 | import matplotlib.pyplot as plt
10 | import matplotlib.font_manager as fm    #字体管理，防止乱码
11 | import math
12 | 
13 | path = os.path.abspath('..')
14 | xls = pd.ExcelFile(path + '\dependence\知网数据.xls')
15 | readf = pd.read_excel(xls, 'Sheet1')['题目'].astype(str)
16 | 
17 | #统计所有存在的分词
18 | all_word = []   #记录所有分词
19 | all_word = jieba_analysis.calculateAllWords(readf)
20 | 
21 | #统计分词的出现数量
22 | dic_raw = {}    #记录分词的出现数量
23 | dic_raw = jieba_analysis.calculateNumOfEachWord(readf, all_word)
24 | 
25 | #去除无关词
26 | stf = open(path+'\dependence\stopwords.txt', encoding = "utf-8").read()
27 | dic = jieba_analysis.removeIrreleventWords(stf, dic_raw)
28 | 
29 | #对分词进行排序，并挑选出出现次数最多的前20个
30 | dic_20 = {}     #记录前20个关键词
31 | dic_20 = jieba_analysis.sortKeyWords(dic, 20)
32 | 
33 | #建立一个空白的向心度模型
34 | dic_heart = {}
35 | for i in range(0, 20):
36 |     dic_heart[list(dic_20.keys())[i]] = 0
37 | 
38 | #计算向心度
39 | for key in dic_heart.keys():
40 |     for row in range(0, len(readf)):
41 |         temp = jieba.cut(readf[row])
42 |         if key in temp:
43 |             dic_heart[key] = dic_heart[key] + len(list(temp))
44 | 
45 | log_densit = []# 密度的自然对数
46 | log_heart = [] # 向心度的自然对数
47 | 
48 | #计算向心度
49 | for i in dic_20.values():
50 |     log_densit.append(math.log(i))
51 | for i in dic_heart.values():
52 |     if i != 0:
53 |         log_heart.append(math.log(i))
54 |     else:
55 |         log_heart.append(0)
56 | 
57 | #解决乱码
58 | plt.rcParams['font.sans-serif'] =['Microsoft YaHei']
59 | plt.rcParams['axes.unicode_minus'] = False
60 | 
61 | plt.title(u'密度和向心度散点图')
62 | plt.xlabel('密度的自然对数')
63 | plt.ylabel('向心度的自然对数')
64 | 
65 | plt.scatter(log_densit, log_heart, s=20, c="#ff1212", marker='o')
66 | for i in range(0, 20):
67 |     plt.annotate(list(dic_20.keys())[i], xy = (log_densit[i], log_heart[i]))
68 | plt.savefig(path + "\Results\scatter_log.png")


--------------------------------------------------------------------------------
/main/strategy_analysis_uniform.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import jieba
 3 | import matplotlib.pyplot as plt
 4 | import matplotlib.font_manager as fm    #字体管理，防止乱码
 5 | import math
 6 | 
 7 | xls = pd.ExcelFile(r'C:\Users\Yoga\Desktop\srp资料\知网-区块链(2).xls')
 8 | readf = pd.read_excel(xls, 'Sheet1')['标题']
 9 | 
10 | all_word = []   #记录所有分词
11 | dic = {}    #记录分词的出现数量
12 | 
13 | #统计所有存在的分词
14 | for row in range(0, len(readf)):
15 |    temp = jieba.cut(readf[row])
16 |    for i in temp:
17 |       if i in all_word:
18 |          continue
19 |       else:
20 |          all_word.append(i)
21 | 
22 | #统计分词的出现数量
23 | for i in all_word:
24 |     dic[i] = 0
25 | 
26 | for row in range(0, len(readf)):
27 |     temp = jieba.cut(readf[row])
28 |     for i in temp:
29 |         dic[i] = dic[i] + 1
30 | 
31 | #去除无关词
32 | f = open(r'D:\JetBrains\PyCharm 2018.3.4\CNKI-analysis\venv\Include\dependence\stopwords.txt', encoding = "utf-8")
33 | temp_dic = dic.copy()
34 | f = f.read()
35 | for i in temp_dic:
36 |     if i in f:
37 |         dic.pop(i)
38 | 
39 | #对分词进行排序，并挑选出出现次数最多的前20个
40 | dic_sorted = dict(sorted(dic.items(), key = lambda x: x[1], reverse = True))
41 | dic_20 = {}# 20个出现次数最多的词语
42 | for i in range(0, 20):
43 |     dic_20[list(dic_sorted.keys())[i]] = list(dic_sorted.values())[i]
44 | print(dic_20)
45 | 
46 | #建立一个空白的向心度模型
47 | dic_heart = {}
48 | for i in range(0, 20):
49 |     dic_heart[list(dic_20.keys())[i]] = 0
50 | 
51 | #计算向心度
52 | for key in dic_heart.keys():
53 |     for row in range(0, len(readf)):
54 |         temp = jieba.cut(readf[row])
55 |         if key in temp:
56 |             dic_heart[key] = dic_heart[key] + len(list(temp))
57 | 
58 | exp_densit = []# 密度的自然对数
59 | exp_heart = [] # 向心度的自然对数
60 | 
61 | #计算向心度
62 | for i in dic_20.values():
63 |     exp_densit.append(math.log(i))
64 | for i in dic_heart.values():
65 |     if i != 0:
66 |         exp_heart.append(math.log(i))
67 |     else:
68 |         exp_heart.append(0)
69 | 
70 | #解决乱码
71 | plt.rcParams['font.sans-serif'] =['Microsoft YaHei']
72 | plt.rcParams['axes.unicode_minus'] = False
73 | 
74 | plt.title(u'密度和向心度散点图')
75 | plt.xlabel('密度的自然对数')
76 | plt.ylabel('向心度的自然对数')
77 | 
78 | plt.scatter(exp_densit, exp_heart, s=20, c="#ff1212", marker='o')
79 | for i in range(0, 20):
80 |     plt.annotate(list(dic_20.keys())[i], xy = (exp_densit[i], exp_heart[i]))
81 | plt.show()
82 | # plt.savefig("scatter_exp.png")
83 | 


--------------------------------------------------------------------------------
/main/word_cloud.py:
--------------------------------------------------------------------------------
 1 | #coding:utf-8
 2 | #author: moyuweiqing
 3 | #词云
 4 | 
 5 | import os
 6 | import pandas as pd
 7 | import draw_word_cloud
 8 | 
 9 | if __name__ == '__main__':
10 |    content = []
11 |    path = os.path.abspath('..')
12 |    xls = pd.ExcelFile(path+'\dependence\知网数据.xls') #读取数据文件
13 |    readf = pd.read_excel(xls, 'Sheet1')  # 读取第一个表
14 |    frame = readf[readf['题目'].notnull()]  # 如果关键词那一列非空，读取所有数据
15 |    for keyword in frame['题目']:  # 分隔关键词，并加入到列表中，去重
16 |       content.append(keyword)
17 |    stopwords = open(path+'\dependence\stopwords.txt', 'r', encoding='utf-8').read().split('\n')[:-1]
18 |    words_dict = draw_word_cloud.statistics(content, stopwords)
19 |    draw_word_cloud.drawWordCloud(words_dict, '区块链词云', savepath=path + '\Results')
20 | 


--------------------------------------------------------------------------------