├── Jupyter notebook ├── CNKI爬虫(改进版).ipynb ├── co-citation_network.ipynb ├── sentiment_analysis_by_plotly.ipynb └── strategy_analysis_by_plotly(关键词).ipynb ├── README.md ├── Results ├── co-word_analysis.png ├── co_citation_analysis.png ├── scatter_exp.png ├── scatter_log.png ├── sentimental_analysis(区块链).png ├── sentimental_analysis(小王子).png └── 区块链词云.png ├── dependence ├── mask.png ├── simkai.ttf ├── stopwords.txt ├── 区块链技术发展现状与展望_袁勇.pdf ├── 区块链技术发展现状与展望_袁勇.txt ├── 小王子.txt └── 知网数据.xls └── main ├── CNKI.py ├── CNKI2.py ├── __pycache__ ├── draw_word_cloud.cpython-37.pyc ├── jieba_analysis.cpython-37.pyc └── network.cpython-37.pyc ├── co-citation_network.py ├── co-word_network.py ├── cooperation_network.py ├── draw_word_cloud.py ├── jieba_analysis.py ├── keywords_by_jieba.py ├── keywords_by_snownlp.py ├── keywords_by_textrank4zh.py ├── network.py ├── pdf-to-txt.py ├── sentiment_analysis.py ├── sentiment_analysis2.py ├── strategy_analysis.py ├── strategy_analysis_uniform.py └── word_cloud.py /Jupyter notebook/CNKI爬虫(改进版).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import requests\n", 10 | "from bs4 import BeautifulSoup\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "def getHTMLText(url):\n", 21 | " try:\n", 22 | " headers = {'user-agent':'Mozilla/5.0'}\n", 23 | " r = requests.get(url,timeout=30,headers=headers)\n", 24 | " r.raise_for_status()\n", 25 | " r.encoding = r.apparent_encoding\n", 26 | " return r.text\n", 27 | " except:\n", 28 | " return \"产生异常\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def getPageurl(list,pageNum):\n", 38 | " first_url = \"http://search.cnki.com.cn/Search.aspx?q=%e5%8c%ba%e5%9d%97%e9%93%be&rank=relevant&cluster=all&val=&p=\"\n", 39 | " for i in range(pageNum):\n", 40 | " i = i*15\n", 41 | " soup = BeautifulSoup(getHTMLText(first_url+str(i)),'html.parser')\n", 42 | " for div in soup.find_all('div',class_=\"wz_tab\"):\n", 43 | " for a in div.find_all('a',target='_blank'):\n", 44 | " if 'http://search.cnki.net' not in a.get('href'):\n", 45 | " list.append(a.get('href'))" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "def getOnePage(list,pagetext):\n", 55 | " for oneurl in list:\n", 56 | " try:\n", 57 | " onepage={}\n", 58 | " tempL=[]\n", 59 | " soup = BeautifulSoup(getHTMLText(oneurl),'html.parser')\n", 60 | " onepage['题目']=soup.head.title.string#题目\n", 61 | " onepage['关键词'] = soup.head.find_all('meta')[3].get('content')#关键词\n", 62 | " au = []\n", 63 | " for div in soup.find_all('div',style=\"text-align:center; width:740px; height:30px;\"):#作者\n", 64 | " for a in div.find_all('a',target=\"_blank\"):\n", 65 | " au.append(a.string)\n", 66 | " onepage['作者'] = au\n", 67 | " for div in soup.find_all('div',style='float:left;'):\n", 68 | " for b in div.find_all('b'):#机构\n", 69 | " onepage['机构']=b.string.strip()\n", 70 | " for font in div.find_all('font',color='#0080ff'):#年份\n", 71 | " onepage['年份']=font.string.strip()\n", 72 | " for div in soup.find_all('div',id=\"div_Ref\"):#相似文献、引用文献等\n", 73 | " ref=[]\n", 74 | " for td in 
div.find_all('td',rowspan=\"2\",align=\"left\",valign=\"bottom\",class_=\"b14\"):\n", 75 | " getType = td.string[1:5]\n", 76 | " for a in div.find_all('a',target=\"_blank\"):\n", 77 | " ref.append(a.string)\n", 78 | " onepage[getType] = ref\n", 79 | " for table in soup.find_all('table',cellspacing=\"0\",cellpadding=\"0\",width=\"100%\",style=\"border:1px solid #7498d6;\"):\n", 80 | " temp=[]\n", 81 | " for a in table.find_all('a',target=\"_blank\"):#相关机构和相关作者\n", 82 | " temp.append(a.string)\n", 83 | " tempL.append(temp)\n", 84 | " if len(tempL) > 0:\n", 85 | " onepage['相关机构'] = tempL[len(tempL)-2]\n", 86 | " onepage['相关作者'] = tempL[len(tempL)-1]\n", 87 | " pagetext.append(onepage)\n", 88 | " except:\n", 89 | " pass\n", 90 | " continue" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "ls = []\n", 100 | "pagetext=[]\n", 101 | "getPageurl(ls,20)#设置爬取页数\n", 102 | "getOnePage(ls,pagetext)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "frame = pd.DataFrame(pagetext)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "writer = pd.ExcelWriter('E:/1.xlsx')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 8, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "frame.to_excel(writer,'Sheet1')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 9, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "writer.save()" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.7.1" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 2 170 | } 171 | -------------------------------------------------------------------------------- /Jupyter notebook/sentiment_analysis_by_plotly.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "" 12 | ], 13 | "text/vnd.plotly.v1+html": [ 14 | "" 15 | ] 16 | }, 17 | "metadata": {}, 18 | "output_type": "display_data" 19 | }, 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "D:\\conda\\lib\\site-packages\\plotly\\graph_objs\\_deprecations.py:39: DeprecationWarning:\n", 25 | "\n", 26 | "plotly.graph_objs.Data is deprecated.\n", 27 | "Please replace it with a list or tuple of instances of the following types\n", 28 | " - plotly.graph_objs.Scatter\n", 29 | " - plotly.graph_objs.Bar\n", 30 | " - plotly.graph_objs.Area\n", 31 | " - plotly.graph_objs.Histogram\n", 32 | " - etc.\n", 33 | "\n", 34 | "\n" 35 | ] 36 | }, 37 | { 38 | "data": { 39 | "application/vnd.plotly.v1+json": { 40 | "config": { 41 | "linkText": "Export to plot.ly", 42 | "plotlyServerURL": 
"https://plot.ly", 43 | "showLink": false 44 | }, 45 | "data": [ 46 | { 47 | "type": "bar", 48 | "uid": "16cf3f06-06fd-4d95-be02-b834386aca35", 49 | "x": [ 50 | 0, 51 | 0.02, 52 | 0.04, 53 | 0.06, 54 | 0.08, 55 | 0.1, 56 | 0.12, 57 | 0.14, 58 | 0.16, 59 | 0.18, 60 | 0.2, 61 | 0.22, 62 | 0.24, 63 | 0.26, 64 | 0.28, 65 | 0.3, 66 | 0.32, 67 | 0.34, 68 | 0.36, 69 | 0.38, 70 | 0.4, 71 | 0.42, 72 | 0.44, 73 | 0.46, 74 | 0.48, 75 | 0.5, 76 | 0.52, 77 | 0.54, 78 | 0.56, 79 | 0.58, 80 | 0.6, 81 | 0.62, 82 | 0.64, 83 | 0.66, 84 | 0.68, 85 | 0.7, 86 | 0.72, 87 | 0.74, 88 | 0.76, 89 | 0.78, 90 | 0.8, 91 | 0.82, 92 | 0.84, 93 | 0.86, 94 | 0.88, 95 | 0.9, 96 | 0.92, 97 | 0.94, 98 | 0.96, 99 | 0.98 100 | ], 101 | "y": [ 102 | 46, 103 | 11, 104 | 11, 105 | 4, 106 | 3, 107 | 6, 108 | 3, 109 | 5, 110 | 3, 111 | 5, 112 | 5, 113 | 3, 114 | 4, 115 | 2, 116 | 3, 117 | 5, 118 | 6, 119 | 3, 120 | 2, 121 | 0, 122 | 9, 123 | 1, 124 | 2, 125 | 2, 126 | 2, 127 | 111, 128 | 3, 129 | 1, 130 | 4, 131 | 1, 132 | 3, 133 | 1, 134 | 1, 135 | 0, 136 | 2, 137 | 3, 138 | 4, 139 | 3, 140 | 4, 141 | 8, 142 | 4, 143 | 4, 144 | 10, 145 | 4, 146 | 5, 147 | 10, 148 | 7, 149 | 8, 150 | 19, 151 | 115 152 | ] 153 | } 154 | ], 155 | "layout": {} 156 | }, 157 | "text/html": [ 158 | "
" 163 | ], 164 | "text/vnd.plotly.v1+html": [ 165 | "
" 170 | ] 171 | }, 172 | "metadata": {}, 173 | "output_type": "display_data" 174 | } 175 | ], 176 | "source": [ 177 | "#coding:utf-8\n", 178 | "#author: moyuweiqing\n", 179 | "#情感分析\n", 180 | "\n", 181 | "from snownlp import SnowNLP\n", 182 | "import plotly.offline as py\n", 183 | "import plotly.graph_objs as go\n", 184 | "import numpy as np\n", 185 | "import math\n", 186 | "\n", 187 | "py.init_notebook_mode(connected=True)#离线模式使用plotly\n", 188 | "\n", 189 | "text = open(r'C:\\Users\\Yoga\\Desktop\\区块链技术发展现状与展望_袁勇.txt').read()\n", 190 | "s1 = text.replace('\\n', '').replace(' ', '').replace('.', '。')#去除换行\n", 191 | "# print(s1)\n", 192 | "sn1 = SnowNLP(s1)\n", 193 | "sentimentslist = []\n", 194 | "for i in sn1.sentences:\n", 195 | " j = SnowNLP(i)\n", 196 | " # print(i)\n", 197 | " # print(j.sentiments)\n", 198 | " sentimentslist.append(j.sentiments)\n", 199 | "\n", 200 | "dic = {}\n", 201 | "for i in np.arange(0, 1, 0.02):\n", 202 | " index = round(i, 2)\n", 203 | " dic[index] = 0\n", 204 | "# print(dic)\n", 205 | "\n", 206 | "for i in sentimentslist:\n", 207 | " temp = round(math.floor(i/0.02)*0.02, 2)\n", 208 | " dic[temp] = dic[temp] + 1\n", 209 | "\n", 210 | "trace = go.Bar(x = list(dic.keys()), y = list(dic.values()))\n", 211 | "data = go.Data([trace])\n", 212 | "py.iplot(data)" 213 | ] 214 | } 215 | ], 216 | "metadata": { 217 | "kernelspec": { 218 | "display_name": "Python 3", 219 | "language": "python", 220 | "name": "python3" 221 | }, 222 | "language_info": { 223 | "codemirror_mode": { 224 | "name": "ipython", 225 | "version": 3 226 | }, 227 | "file_extension": ".py", 228 | "mimetype": "text/x-python", 229 | "name": "python", 230 | "nbconvert_exporter": "python", 231 | "pygments_lexer": "ipython3", 232 | "version": "3.7.1" 233 | } 234 | }, 235 | "nbformat": 4, 236 | "nbformat_minor": 2 237 | } 238 | -------------------------------------------------------------------------------- /Jupyter notebook/strategy_analysis_by_plotly(关键词).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "" 12 | ], 13 | "text/vnd.plotly.v1+html": [ 14 | "" 15 | ] 16 | }, 17 | "metadata": {}, 18 | "output_type": "display_data" 19 | }, 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "Building prefix dict from the default dictionary ...\n", 25 | "Loading model from cache C:\\Users\\Yoga\\AppData\\Local\\Temp\\jieba.cache\n", 26 | "Loading model cost 2.208 seconds.\n", 27 | "Prefix dict has been built succesfully.\n", 28 | "D:\\conda\\lib\\site-packages\\plotly\\graph_objs\\_deprecations.py:39: DeprecationWarning:\n", 29 | "\n", 30 | "plotly.graph_objs.Data is deprecated.\n", 31 | "Please replace it with a list or tuple of instances of the following types\n", 32 | " - plotly.graph_objs.Scatter\n", 33 | " - plotly.graph_objs.Bar\n", 34 | " - plotly.graph_objs.Area\n", 35 | " - plotly.graph_objs.Histogram\n", 36 | " - etc.\n", 37 | "\n", 38 | "\n" 39 | ] 40 | }, 41 | { 42 | "data": { 43 | "application/vnd.plotly.v1+json": { 44 | "config": { 45 | "linkText": "Export to plot.ly", 46 | "plotlyServerURL": "https://plot.ly", 47 | "showLink": false 48 | }, 49 | "data": [ 50 | { 51 | "mode": "markers+text", 52 | "text": [ 53 | "技术", 54 | "研究", 55 | "应用", 56 | "发展", 57 | "金融", 58 | "分析", 59 | "领域", 60 | "—", 61 | "创新", 62 | "设计", 63 | "实现", 64 | "系统", 65 | "产业", 66 | "问题", 67 | "我国", 68 | "展望", 
69 | "商业银行", 70 | "挑战", 71 | "综述", 72 | "模式" 73 | ], 74 | "textposition": "top center", 75 | "type": "scatter", 76 | "uid": "e19ddfd8-8bcf-4b59-8a39-cb120804171b", 77 | "x": [ 78 | 4.700480365792417, 79 | 4.543294782270004, 80 | 4.3694478524670215, 81 | 3.5263605246161616, 82 | 3.332204510175204, 83 | 2.8903717578961645, 84 | 2.8903717578961645, 85 | 2.772588722239781, 86 | 2.772588722239781, 87 | 2.70805020110221, 88 | 2.6390573296152584, 89 | 2.5649493574615367, 90 | 2.5649493574615367, 91 | 2.4849066497880004, 92 | 2.302585092994046, 93 | 2.302585092994046, 94 | 2.302585092994046, 95 | 2.302585092994046, 96 | 2.1972245773362196, 97 | 2.1972245773362196 98 | ], 99 | "y": [ 100 | 6.529418838262226, 101 | 4.3694478524670215, 102 | 5.198497031265826, 103 | 5.087596335232384, 104 | 4.912654885736052, 105 | 2.70805020110221, 106 | 4.204692619390966, 107 | 4.143134726391533, 108 | 4.143134726391533, 109 | 3.1354942159291497, 110 | 0, 111 | 3.6888794541139363, 112 | 3.9318256327243257, 113 | 3.2188758248682006, 114 | 4.30406509320417, 115 | 0, 116 | 4.02535169073515, 117 | 2.1972245773362196, 118 | 1.3862943611198906, 119 | 2.9444389791664403 120 | ] 121 | } 122 | ], 123 | "layout": {} 124 | }, 125 | "text/html": [ 126 | "
" 131 | ], 132 | "text/vnd.plotly.v1+html": [ 133 | "
" 138 | ] 139 | }, 140 | "metadata": {}, 141 | "output_type": "display_data" 142 | } 143 | ], 144 | "source": [ 145 | "#coding:utf-8\n", 146 | "#author: moyuweiqing\n", 147 | "#战略分析,计算密度和向心度,向心度算法自己写,建立二维坐标轴\n", 148 | "\n", 149 | "import pandas as pd\n", 150 | "import jieba\n", 151 | "import plotly.offline as py\n", 152 | "import plotly.graph_objs as go\n", 153 | "import math\n", 154 | "\n", 155 | "py.init_notebook_mode(connected=True)#离线模式使用plotly\n", 156 | "\n", 157 | "xls = pd.ExcelFile(r'C:\\Users\\Yoga\\Desktop\\srp资料\\知网-区块链(2).xls')\n", 158 | "readf = pd.read_excel(xls, 'Sheet1')['标题']\n", 159 | "\n", 160 | "all_word = [] #记录所有分词\n", 161 | "dic = {} #记录分词的出现数量\n", 162 | "\n", 163 | "#统计所有存在的分词\n", 164 | "for row in range(0, len(readf)):\n", 165 | "\ttemp = jieba.cut(readf[row])\n", 166 | "\tfor i in temp:\n", 167 | "\t\tif i in all_word:\n", 168 | "\t\t\tcontinue\n", 169 | "\t\telse:\n", 170 | "\t\t\tall_word.append(i)\n", 171 | "\n", 172 | "#统计分词的出现数量\n", 173 | "for i in all_word:\n", 174 | " dic[i] = 0\n", 175 | "\n", 176 | "for row in range(0, len(readf)):\n", 177 | " temp = jieba.cut(readf[row])\n", 178 | " for i in temp:\n", 179 | " dic[i] = dic[i] + 1\n", 180 | "\n", 181 | "#去除无关词\n", 182 | "f = open(r'D:\\JetBrains\\PyCharm 2018.3.4\\CNKI-analysis\\venv\\Include\\dependence\\stopwords.txt', encoding = \"utf-8\")\n", 183 | "temp_dic = dic.copy()\n", 184 | "f = f.read()\n", 185 | "for i in temp_dic:\n", 186 | " if i in f:\n", 187 | " dic.pop(i)\n", 188 | "\n", 189 | "#对分词进行排序,并挑选出出现次数最多的前20个\n", 190 | "dic_sorted = dict(sorted(dic.items(), key = lambda x: x[1], reverse = True))\n", 191 | "dic_20 = {}# 20个出现次数最多的词语\n", 192 | "for i in range(0, 20):\n", 193 | " dic_20[list(dic_sorted.keys())[i]] = list(dic_sorted.values())[i]\n", 194 | "\n", 195 | "#建立一个空白的向心度模型\n", 196 | "dic_heart = {}\n", 197 | "for i in range(0, 20):\n", 198 | " dic_heart[list(dic_20.keys())[i]] = 0\n", 199 | "\n", 200 | "#计算向心度\n", 201 | "for key in dic_heart.keys():\n", 202 | " for row in range(0, len(readf)):\n", 203 | " temp = jieba.cut(readf[row])\n", 204 | " if key in temp:\n", 205 | " dic_heart[key] = dic_heart[key] + len(list(temp))\n", 206 | "\n", 207 | "exp_densit = []# 密度的自然对数\n", 208 | "exp_heart = [] # 向心度的自然对数\n", 209 | "\n", 210 | "#计算向心度\n", 211 | "for i in dic_20.values():\n", 212 | " exp_densit.append(math.log(i))\n", 213 | "for i in dic_heart.values():\n", 214 | " if i != 0:\n", 215 | " exp_heart.append(math.log(i))\n", 216 | " else:\n", 217 | " exp_heart.append(0)\n", 218 | "\n", 219 | "trace = go.Scatter(x = exp_densit, y = exp_heart, text = list(dic_20.keys()), textposition = \"top center\", mode = 'markers+text')\n", 220 | "data = go.Data([trace])\n", 221 | "py.iplot(data)\n", 222 | "#解决乱码\n", 223 | "# plt.rcParams['font.sans-serif'] =['Microsoft YaHei']\n", 224 | "# plt.rcParams['axes.unicode_minus'] = False\n", 225 | "#\n", 226 | "# plt.title(u'密度和向心度散点图')\n", 227 | "#\n", 228 | "# plt.xlabel('密度的自然对数')\n", 229 | "# plt.ylabel('向心度的自然对数')\n", 230 | "\n", 231 | "# plt.scatter(dic2.values(), dic3.values(), s=20, c=\"#ff1212\", marker='o')\n", 232 | "# # plt.scatter(exp_densit, exp_heart, s=20, c=\"#ff1212\", marker='o')\n", 233 | "# for i in range(0, 20):\n", 234 | "# plt.annotate(list(dic_20.keys())[i], xy = (exp_densit[i], exp_heart[i]))\n", 235 | "# plt.show()\n", 236 | "# plt.savefig(\"scatter_exp1.png\")" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Python 3", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | 
"language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.7.1" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 2 261 | } 262 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CNKI-analysis 2 | ###项目简介: 3 | 使用python,从知网上爬取相关的t数据,并进行数据分析,涉及到pycharm和jupyter notebook 4 | 5 | ### 研究过程: 6 | 从知网上抓取以“区块链”为主题的文献,获取文献题名、主要责任者、发表杂志、关键词、文章分类号、引用文献和被引文献等数据;对低价值数据进行清洗;数据处理;对数据结果进行可视化呈现 7 | 8 | ### 技术栈: 9 | 数据抓取:python 10 | 数据处理:python,主要涉及到jieba、networkx库 11 | 可视化:matplotlib、plotly、pyecharts 12 | 13 | ### 存储说明: 14 | dependence存储的是依赖文件 15 | main主要的分析部分 16 | Results存储结果图 17 | Jupyter notebook里面存放的是.ipynb文件,需要在Jupyter notebook下运行,主要是因为plotly库依赖Jupyter notebook环境 18 | 19 | ### 文件说明: 20 | CNKI.py是我参考的爬虫文件 21 | CNKI2.py是最开始用来爬取数据的爬虫文件 22 | CNKI爬虫(改进版)是我一个师弟做的,用来分析的数据主要从这里爬取,爬取的数据存储在了知网数据.xls文件中 23 | pdf-to-txt.py实现了从pdf到txt文件的转换 24 | network.py封装了一部分构建网络的函数 25 | co-citation_network.py是共被引网络分析 26 | cooperation-network.py是作者合作网络分析 27 | co-work_network.py是共词网络分析 28 | keywords系列的py文件,是用不同的库进行关键词的提取,效果不同 29 | sentiment_analysis.py是对区块链文章的情感分析 30 | sentiment_analysis2.py是对《小王子》的情感分析 31 | jieba_analysis.py封装了部分分词的操作函数 32 | strategy_analysis.py战略分析,调用jieba_analysis.py构建散点图,对关键词的密度和向心度进行分析 33 | strategy_analysis_uniform.py不调用jieba_analysis.py,直接进行分析 34 | draw_word_cloud.py实现词云 35 | word_cloud.py对关键词进行词云制作 36 | Jupyter notebook里面存放的主要是要依赖Jupyter notebook开发环境的库的分析 37 | -------------------------------------------------------------------------------- /Results/co-word_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/co-word_analysis.png -------------------------------------------------------------------------------- /Results/co_citation_analysis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/co_citation_analysis.png -------------------------------------------------------------------------------- /Results/scatter_exp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/scatter_exp.png -------------------------------------------------------------------------------- /Results/scatter_log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/scatter_log.png -------------------------------------------------------------------------------- /Results/sentimental_analysis(区块链).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/sentimental_analysis(区块链).png -------------------------------------------------------------------------------- /Results/sentimental_analysis(小王子).png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/sentimental_analysis(小王子).png -------------------------------------------------------------------------------- /Results/区块链词云.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/Results/区块链词云.png -------------------------------------------------------------------------------- /dependence/mask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/mask.png -------------------------------------------------------------------------------- /dependence/simkai.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/simkai.ttf -------------------------------------------------------------------------------- /dependence/stopwords.txt: -------------------------------------------------------------------------------- 1 | $ 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | ? 13 | _ 14 | “ 15 | ” 16 | 、 17 | 。 18 | .:《 19 | 》 20 | 【 21 | 】的区块链与和 22 | 23 | 一 24 | 一些 25 | 一何 26 | 一切 27 | 一则 28 | 一方面 29 | 一旦 30 | 一来 31 | 一样 32 | 一般 33 | 一转眼 34 | 万一 35 | 上 36 | 上下 37 | 下 38 | 不 39 | 不仅 40 | 不但 41 | 不光 42 | 不单 43 | 不只 44 | 不外乎 45 | 不如 46 | 不妨 47 | 不尽 48 | 不尽然 49 | 不得 50 | 不怕 51 | 不惟 52 | 不成 53 | 不拘 54 | 不料 55 | 不是 56 | 不比 57 | 不然 58 | 不特 59 | 不独 60 | 不管 61 | 不至于 62 | 不若 63 | 不论 64 | 不过 65 | 不问 66 | 与 67 | 与其 68 | 与其说 69 | 与否 70 | 与此同时 71 | 且 72 | 且不说 73 | 且说 74 | 两者 75 | 个 76 | 个别 77 | 临 78 | 为 79 | 为了 80 | 为什么 81 | 为何 82 | 为止 83 | 为此 84 | 为着 85 | 乃 86 | 乃至 87 | 乃至于 88 | 么 89 | 之 90 | 之一 91 | 之所以 92 | 之类 93 | 乌乎 94 | 乎 95 | 乘 96 | 也 97 | 也好 98 | 也罢 99 | 了 100 | 二来 101 | 于 102 | 于是 103 | 于是乎 104 | 云云 105 | 云尔 106 | 些 107 | 亦 108 | 人 109 | 人们 110 | 人家 111 | 什么 112 | 什么样 113 | 今 114 | 介于 115 | 仍 116 | 仍旧 117 | 从 118 | 从此 119 | 从而 120 | 他 121 | 他人 122 | 他们 123 | 以 124 | 以上 125 | 以为 126 | 以便 127 | 以免 128 | 以及 129 | 以故 130 | 以期 131 | 以来 132 | 以至 133 | 以至于 134 | 以致 135 | 们 136 | 任 137 | 任何 138 | 任凭 139 | 似的 140 | 但 141 | 但凡 142 | 但是 143 | 何 144 | 何以 145 | 何况 146 | 何处 147 | 何时 148 | 余外 149 | 作为 150 | 你 151 | 你们 152 | 使 153 | 使得 154 | 例如 155 | 依 156 | 依据 157 | 依照 158 | 便于 159 | 俺 160 | 俺们 161 | 倘 162 | 倘使 163 | 倘或 164 | 倘然 165 | 倘若 166 | 借 167 | 假使 168 | 假如 169 | 假若 170 | 傥然 171 | 像 172 | 儿 173 | 先不先 174 | 光是 175 | 全体 176 | 全部 177 | 兮 178 | 关于 179 | 其 180 | 其一 181 | 其中 182 | 其二 183 | 其他 184 | 其余 185 | 其它 186 | 其次 187 | 具体地说 188 | 具体说来 189 | 兼之 190 | 内 191 | 再 192 | 再其次 193 | 再则 194 | 再有 195 | 再者 196 | 再者说 197 | 再说 198 | 冒 199 | 冲 200 | 况且 201 | 几 202 | 几时 203 | 凡 204 | 凡是 205 | 凭 206 | 凭借 207 | 出于 208 | 出来 209 | 分别 210 | 则 211 | 则甚 212 | 别 213 | 别人 214 | 别处 215 | 别是 216 | 别的 217 | 别管 218 | 别说 219 | 到 220 | 前后 221 | 前此 222 | 前者 223 | 加之 224 | 加以 225 | 即 226 | 即令 227 | 即使 228 | 即便 229 | 即如 230 | 即或 231 | 即若 232 | 却 233 | 去 234 | 又 235 | 又及 236 | 及 237 | 及其 238 | 及至 239 | 反之 240 | 反而 241 | 反过来 242 | 反过来说 243 | 受到 244 | 另 245 | 另一方面 246 | 另外 247 | 另悉 248 | 只 249 | 只当 250 | 只怕 251 | 只是 252 | 只有 253 | 只消 254 | 只要 255 | 只限 256 | 叫 257 | 叮咚 258 | 可 259 | 可以 260 | 可是 261 | 可见 262 | 各 263 | 各个 264 | 各位 265 | 各种 266 | 各自 267 | 同 268 | 同时 269 | 后 270 | 后者 271 | 向 272 | 向使 273 | 向着 274 | 吓 275 | 
吗 276 | 否则 277 | 吧 278 | 吧哒 279 | 吱 280 | 呀 281 | 呃 282 | 呕 283 | 呗 284 | 呜 285 | 呜呼 286 | 呢 287 | 呵 288 | 呵呵 289 | 呸 290 | 呼哧 291 | 咋 292 | 和 293 | 咚 294 | 咦 295 | 咧 296 | 咱 297 | 咱们 298 | 咳 299 | 哇 300 | 哈 301 | 哈哈 302 | 哉 303 | 哎 304 | 哎呀 305 | 哎哟 306 | 哗 307 | 哟 308 | 哦 309 | 哩 310 | 哪 311 | 哪个 312 | 哪些 313 | 哪儿 314 | 哪天 315 | 哪年 316 | 哪怕 317 | 哪样 318 | 哪边 319 | 哪里 320 | 哼 321 | 哼唷 322 | 唉 323 | 唯有 324 | 啊 325 | 啐 326 | 啥 327 | 啦 328 | 啪达 329 | 啷当 330 | 喂 331 | 喏 332 | 喔唷 333 | 喽 334 | 嗡 335 | 嗡嗡 336 | 嗬 337 | 嗯 338 | 嗳 339 | 嘎 340 | 嘎登 341 | 嘘 342 | 嘛 343 | 嘻 344 | 嘿 345 | 嘿嘿 346 | 因 347 | 因为 348 | 因了 349 | 因此 350 | 因着 351 | 因而 352 | 固然 353 | 在 354 | 在下 355 | 在于 356 | 地 357 | 基于 358 | 处在 359 | 多 360 | 多么 361 | 多少 362 | 大 363 | 大家 364 | 她 365 | 她们 366 | 好 367 | 如 368 | 如上 369 | 如上所述 370 | 如下 371 | 如何 372 | 如其 373 | 如同 374 | 如是 375 | 如果 376 | 如此 377 | 如若 378 | 始而 379 | 孰料 380 | 孰知 381 | 宁 382 | 宁可 383 | 宁愿 384 | 宁肯 385 | 它 386 | 它们 387 | 对 388 | 对于 389 | 对待 390 | 对方 391 | 对比 392 | 将 393 | 小 394 | 尔 395 | 尔后 396 | 尔尔 397 | 尚且 398 | 就 399 | 就是 400 | 就是了 401 | 就是说 402 | 就算 403 | 就要 404 | 尽 405 | 尽管 406 | 尽管如此 407 | 岂但 408 | 己 409 | 已 410 | 已矣 411 | 巴 412 | 巴巴 413 | 并 414 | 并且 415 | 并非 416 | 庶乎 417 | 庶几 418 | 开外 419 | 开始 420 | 归 421 | 归齐 422 | 当 423 | 当地 424 | 当然 425 | 当着 426 | 彼 427 | 彼时 428 | 彼此 429 | 往 430 | 待 431 | 很 432 | 得 433 | 得了 434 | 怎 435 | 怎么 436 | 怎么办 437 | 怎么样 438 | 怎奈 439 | 怎样 440 | 总之 441 | 总的来看 442 | 总的来说 443 | 总的说来 444 | 总而言之 445 | 恰恰相反 446 | 您 447 | 惟其 448 | 慢说 449 | 我 450 | 我们 451 | 或 452 | 或则 453 | 或是 454 | 或曰 455 | 或者 456 | 截至 457 | 所 458 | 所以 459 | 所在 460 | 所幸 461 | 所有 462 | 才 463 | 才能 464 | 打 465 | 打从 466 | 把 467 | 抑或 468 | 拿 469 | 按 470 | 按照 471 | 换句话说 472 | 换言之 473 | 据 474 | 据此 475 | 接着 476 | 故 477 | 故此 478 | 故而 479 | 旁人 480 | 无 481 | 无宁 482 | 无论 483 | 既 484 | 既往 485 | 既是 486 | 既然 487 | 时候 488 | 是 489 | 是以 490 | 是的 491 | 曾 492 | 替 493 | 替代 494 | 最 495 | 有 496 | 有些 497 | 有关 498 | 有及 499 | 有时 500 | 有的 501 | 望 502 | 朝 503 | 朝着 504 | 本 505 | 本人 506 | 本地 507 | 本着 508 | 本身 509 | 来 510 | 来着 511 | 来自 512 | 来说 513 | 极了 514 | 果然 515 | 果真 516 | 某 517 | 某个 518 | 某些 519 | 某某 520 | 根据 521 | 欤 522 | 正值 523 | 正如 524 | 正巧 525 | 正是 526 | 此 527 | 此地 528 | 此处 529 | 此外 530 | 此时 531 | 此次 532 | 此间 533 | 毋宁 534 | 每 535 | 每当 536 | 比 537 | 比及 538 | 比如 539 | 比方 540 | 没奈何 541 | 沿 542 | 沿着 543 | 漫说 544 | 焉 545 | 然则 546 | 然后 547 | 然而 548 | 照 549 | 照着 550 | 犹且 551 | 犹自 552 | 甚且 553 | 甚么 554 | 甚或 555 | 甚而 556 | 甚至 557 | 甚至于 558 | 用 559 | 用来 560 | 由 561 | 由于 562 | 由是 563 | 由此 564 | 由此可见 565 | 的 566 | 的确 567 | 的话 568 | 直到 569 | 相对而言 570 | 省得 571 | 看 572 | 眨眼 573 | 着 574 | 着呢 575 | 矣 576 | 矣乎 577 | 矣哉 578 | 离 579 | 竟而 580 | 第 581 | 等 582 | 等到 583 | 等等 584 | 简言之 585 | 管 586 | 类如 587 | 紧接着 588 | 纵 589 | 纵令 590 | 纵使 591 | 纵然 592 | 经 593 | 经过 594 | 结果 595 | 给 596 | 继之 597 | 继后 598 | 继而 599 | 综上所述 600 | 罢了 601 | 者 602 | 而 603 | 而且 604 | 而况 605 | 而后 606 | 而外 607 | 而已 608 | 而是 609 | 而言 610 | 能 611 | 能否 612 | 腾 613 | 自 614 | 自个儿 615 | 自从 616 | 自各儿 617 | 自后 618 | 自家 619 | 自己 620 | 自打 621 | 自身 622 | 至 623 | 至于 624 | 至今 625 | 至若 626 | 致 627 | 般的 628 | 若 629 | 若夫 630 | 若是 631 | 若果 632 | 若非 633 | 莫不然 634 | 莫如 635 | 莫若 636 | 虽 637 | 虽则 638 | 虽然 639 | 虽说 640 | 被 641 | 要 642 | 要不 643 | 要不是 644 | 要不然 645 | 要么 646 | 要是 647 | 譬喻 648 | 譬如 649 | 让 650 | 许多 651 | 论 652 | 设使 653 | 设或 654 | 设若 655 | 诚如 656 | 诚然 657 | 该 658 | 说来 659 | 诸 660 | 诸位 661 | 诸如 662 | 谁 663 | 谁人 664 | 谁料 665 | 谁知 666 | 贼死 667 | 赖以 668 | 赶 669 | 起 670 | 起见 671 | 趁 672 | 趁着 673 | 越是 674 | 距 675 | 跟 676 | 较 677 | 较之 678 | 边 679 | 过 680 | 还 681 | 还是 
682 | 还有 683 | 还要 684 | 这 685 | 这一来 686 | 这个 687 | 这么 688 | 这么些 689 | 这么样 690 | 这么点儿 691 | 这些 692 | 这会儿 693 | 这儿 694 | 这就是说 695 | 这时 696 | 这样 697 | 这次 698 | 这般 699 | 这边 700 | 这里 701 | 进而 702 | 连 703 | 连同 704 | 逐步 705 | 通过 706 | 遵循 707 | 遵照 708 | 那 709 | 那个 710 | 那么 711 | 那么些 712 | 那么样 713 | 那些 714 | 那会儿 715 | 那儿 716 | 那时 717 | 那样 718 | 那般 719 | 那边 720 | 那里 721 | 都 722 | 鄙人 723 | 鉴于 724 | 针对 725 | 阿 726 | 除 727 | 除了 728 | 除外 729 | 除开 730 | 除此之外 731 | 除非 732 | 随 733 | 随后 734 | 随时 735 | 随着 736 | 难道说 737 | 非但 738 | 非徒 739 | 非特 740 | 非独 741 | 靠 742 | 顺 743 | 顺着 744 | 首先 745 | 没有一个! 746 | , 747 | : 748 | ; 749 | ? 750 | 大学论文博士硕士浙江中国北京期年首都大连 nan -------------------------------------------------------------------------------- /dependence/区块链技术发展现状与展望_袁勇.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/区块链技术发展现状与展望_袁勇.pdf -------------------------------------------------------------------------------- /dependence/区块链技术发展现状与展望_袁勇.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/区块链技术发展现状与展望_袁勇.txt -------------------------------------------------------------------------------- /dependence/小王子.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/小王子.txt -------------------------------------------------------------------------------- /dependence/知网数据.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/dependence/知网数据.xls -------------------------------------------------------------------------------- /main/CNKI.py: -------------------------------------------------------------------------------- 1 | #初始的爬虫案例,主要参照于这个来做 2 | #coding:utf-8 3 | 4 | import requests 5 | from bs4 import BeautifulSoup as bs 6 | import time 7 | import xlwt 8 | import openpyxl 9 | import re 10 | 11 | 12 | def pagenext(): 13 | base_url = 'http://search.cnki.com.cn/search.aspx?q=%E6%96%B0%E9%97%BB%E4%BC%A0%E6%92%AD&rank=relevant&cluster=Type&val=I141&p=' 14 | L = range(0, 840) # 最尾巴的数不计入 15 | All_Page = [] 16 | for i in L[::10]: 17 | next_url = base_url + str(i) 18 | # print(next_url) 19 | print("第 ", i / 10 + 1, " 页的数据") 20 | page_text = spider(next_url) 21 | time.sleep(10) 22 | for page in page_text: 23 | All_Page.append(page) 24 | print(All_Page) 25 | write_excel('xlsx论文筛选.xlsx', 'info', All_Page) 26 | 27 | 28 | def datespider(date_url): 29 | # 因为跳转的链接类型不一样,所以我们要判断这两种链接是哪一种并且选择不一样的解析find方法 30 | response_try = requests.get(date_url, { 31 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}) 32 | # print(response_try.text) 33 | response_tree = bs(response_try.text, 'html.parser') 34 | # 根据两个不同的链接返回不一样的值 35 | if re.match(r'http://www.cnki.com.cn/Article/[0-9a-zA-Z\_]+', date_url): 36 | res_date = response_tree.find("font", {"color": "#0080ff"}) 37 | if res_date == None: 38 | response_date = None 39 | else: 40 | response_date = res_date.get_text().replace('\r', '').replace('\n', '') 41 | else: 42 | response_date = response_tree.find("title").get_text()[-8:] 43 | return response_date 44 
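# Note on the requests calls in this script: requests.get() takes query parameters as its
# second positional argument (params), so a dict passed positionally is encoded into the
# URL rather than sent as request headers. To actually send a custom User-Agent, pass it
# by keyword, as getHTMLText() in the CNKI爬虫(改进版) notebook does, for example:
#     r = requests.get(date_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)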
| 45 | 46 | def write_excel(path, sheet_name, text_info): 47 | index = len(text_info) 48 | workbook = openpyxl.Workbook() 49 | sheet = workbook.active 50 | sheet.title = sheet_name 51 | for i in range(0, index): 52 | for j in range(len(text_info[i])): 53 | sheet.cell(row=i + 1, column=j + 1, value=str(text_info[i][j])) 54 | workbook.save(path) 55 | print("xlsx格式表格写入数据成功!") 56 | 57 | 58 | def spider(url): 59 | response = requests.get(url, { 60 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}) 61 | res = response.content 62 | html = str(res, 'utf-8') 63 | html_tree = bs(html, 'lxml') 64 | # 找打h3标签下的内容 65 | html_text = html_tree.find_all("h3") 66 | All_text = [] 67 | # 隔一个才是文章的标题 68 | for text in html_text[1:-2:]: 69 | one_text = [] 70 | text_title = text.get_text().replace('\xa0', '').replace('\n', '') # 得到论文的标题 71 | # print(text.get_text()) 72 | text_url = text.find('a')['href'] # 选取了当前文章的链接 73 | # 用正则表达式匹配我们需要的链接 74 | if re.match(r"""http://youxian.cnki.com.cn/yxdetail.aspx\?filename=[0-9a-zA-Z]+&dbname=[a-zA-Z]+""", 75 | text_url) or re.match(r'http://www.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url): 76 | # print(text.find('a')['href']) 77 | text_date = datespider(text_url) 78 | one_text.append(text.get_text().replace('\xa0', '').replace('\n', '')) # text.get_text是得到文章的标题 79 | if text_date == None: 80 | one_text.append(None) 81 | else: 82 | if int(text_date[:4]) >= 2014: 83 | one_text.append(text_date.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '')) 84 | else: 85 | continue 86 | All_text.append(one_text) 87 | # print(text.find('a')['href']) 88 | 89 | # print(All_text) 90 | return All_text 91 | 92 | 93 | # write_excel(All_text) 94 | 95 | 96 | if __name__ == '__main__': 97 | pagenext() -------------------------------------------------------------------------------- /main/CNKI2.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #主爬虫,数据用这个来爬取的 4 | 5 | #爬虫须知: 6 | # 1、运行前请配置包 requests、bs4、xlutils、my_fake_useragent 7 | # 2、Excel 文件请先在同一级目录新建好,代码里的名字:知网-区块链.xls(不能是xlsx后缀) 8 | # 3、目前测试 只能爬取18页的数据,到19页就会失败,好像有个上限,没想到它是怎么识别的 9 | 10 | 11 | import requests #爬取IP端口和 12 | from bs4 import BeautifulSoup as bs #bs4解析库,用来解析网页 13 | import time 14 | import openpyxl #对Excel的操作 15 | import re #对字符串的操作 16 | import xlrd #xls文件的读 17 | import xlwt #xls文件的写 18 | from xlutils.copy import copy#修改(追加写入) 19 | from my_fake_useragent import UserAgent #这个库用来做反爬虫的 20 | #这库用来随机生成user_agent 在这个爬虫中好像没必要 一样会循环重定向 21 | 22 | def pagenext(): 23 | #最开始的链接 最后面 'p=' 添加你要的页数 就能去其他页 24 | base_url = 'http://search.cnki.com.cn/Search.aspx?q=%e5%8c%ba%e5%9d%97%e9%93%be&rank=relevant&cluster=all&val=&p=' 25 | L = range(0, 450) #修改这里可以改变获取的数量 不要太多 不然跑很久 4500就是300页了 26 | # All_Page = [] 27 | for i in L[::15]: #15条是一页 28 | All_Page = [] 29 | next_url = base_url + str(i)#配置下一页的url,每15个数据一页 30 | print(next_url) 31 | print(i / 15 + 1, " 页的数据") 32 | page_text = spider(next_url) #跑第*页的爬虫 获取那一页的数据 33 | time.sleep(10) #休息一会 防被网站 ban 34 | write_excel('xlsx论文筛选.xls',i / 15 + 1, page_text) #写进Excel 35 | 36 | #进入了文章的具体ulr 37 | def datespider(date_url): 38 | #设置一下 UserAgent 突破反扒 39 | response_try = requests.get(date_url, UserAgent().random()) 40 | # 用BeautifulSoup框架转化 41 | response_tree = bs(response_try.text, 'html.parser') 42 | if(response_tree==None): 43 | return [] 44 | else: 45 | # 在对应位置 匹配需要的信息 46 | res_date = response_tree.find("font", {"color": 
"#0080ff"}) 47 | res_name = response_tree.find("div", {"style": "text-align:center; width:740px; height:30px;"}) 48 | res_msg = response_tree.find("div", {"style": "text-align:left;"}) 49 | 50 | #时间 51 | if res_date == None: 52 | response_date = None 53 | else: 54 | response_date = res_date.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '') 55 | #作者 56 | if res_name == None: 57 | response_name = None 58 | else: 59 | response_name = res_name.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '') 60 | #其他信息 61 | if res_msg == None: 62 | res_msg = None 63 | else: 64 | # 去除不想要的东西 65 | response_msg = res_msg.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t','')\ 66 | .replace('】', '').replace('学位授予单位:', '').replace('学位级别:', '').replace('作者单位:', '').replace('学位授予年份:','').replace('分类号:', '') 67 | #用“【”作为分割界限,将response_msg字符串 划分为 response_point列表 68 | response_point = response_msg.split("【") 69 | #插入列表 并返回 70 | response_All = [] 71 | response_All.append(response_date) 72 | response_All.append(response_name) 73 | #列表拼接 74 | #列表拼接 75 | for item in range(1,len(response_point)): 76 | response_All.append(response_point[item]) 77 | 78 | return response_All 79 | 80 | #写进表格里面去 81 | def write_excel(path, page, text_info): 82 | 83 | index = len(text_info) 84 | # workbook = openpyxl.Workbook() 85 | workbook = xlrd.open_workbook(path)#打开 86 | sheets = workbook.sheet_names() 87 | sheet = workbook.sheet_by_name(sheets[0]) # 获取工作簿中所有表格中的的第一个表格 88 | rows_old = sheet.nrows # 获取表格中已存在的数据的行数 89 | new_workbook = copy(workbook) # 将xlrd对象拷贝转化为xlwt对象 90 | new_worksheet = new_workbook.get_sheet(0) # 获取转化后工作簿中的第一个表格 91 | # sheet.title = sheet_name 92 | for i in range(0, index): 93 | for j in range(len(text_info[i])): 94 | new_worksheet.write(i + rows_old,j,str(text_info[i][j])) 95 | new_workbook.save(path) 96 | 97 | print(page," 页写入数据成功!") 98 | 99 | def spider(url): 100 | response = requests.get(url, {'User-Agent':UserAgent().random()})#用来突破反爬虫 101 | res = response.content 102 | html = str(res, 'utf-8')#用来获取html页面 103 | html_tree = bs(html, 'lxml') 104 | # 找class = wz_content标签下的内容 105 | html_text = html_tree.find_all("div", class_="wz_content") 106 | All_text = [] 107 | for text in html_text: 108 | one_text = [] 109 | text_url = text.find('a')['href'] # 选取了当前文章的链接 110 | text_title = text.find('h3') #标题 111 | text_cout = text.find("span", class_="count") 112 | #舍弃http://youxian.cnki链接 打不开的 没数据 可能需要登陆才有数据 之后再调试吧 出现概率1/20 113 | if re.match(r'http://www.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url) or re.match(r'http://cdmd.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url): 114 | # 调用函数 进去各个文章的具体网站 找其他信息 115 | text_all = datespider(text_url) 116 | one_text.append(text_title.get_text().replace('\xa0', '').replace('\n', '')) # 得到文章的标题 117 | one_text.append(text_cout.get_text().replace('\xa0', '').replace('\n', '').replace('下载次数', '').replace('被引次数', '').replace('(', '').replace(')', '')) # 把操作次数 放进列表 118 | for item in text_all:#将datespider函数返回的信息,文章的 作者、单位、学位 、分类号,插入列表 119 | one_text.append(item.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '').replace('年', '')) 120 | one_text.append(text_url) # 把文章的链接 放进列表 121 | 122 | All_text.append(one_text) 123 | return All_text 124 | 125 | if __name__ == '__main__': 126 | pagenext() -------------------------------------------------------------------------------- /main/__pycache__/draw_word_cloud.cpython-37.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/main/__pycache__/draw_word_cloud.cpython-37.pyc -------------------------------------------------------------------------------- /main/__pycache__/jieba_analysis.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/main/__pycache__/jieba_analysis.cpython-37.pyc -------------------------------------------------------------------------------- /main/__pycache__/network.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/moyuweiqing/CNKI-analysis/6f6516bf84d89d32a45e2175eb7f66b0629b304b/main/__pycache__/network.cpython-37.pyc -------------------------------------------------------------------------------- /main/co-citation_network.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #共引文献网络分析,和共词网络分析差不多 4 | 5 | import os 6 | import networkx as nx#复杂网络分析库 7 | import network 8 | import pandas as pd 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | path = os.path.abspath('..') 13 | xls = pd.ExcelFile(path+'\dependence\知网数据.xls')#读取xls表格 14 | readf = pd.read_excel(xls,'Sheet1')#读取第一个表 15 | frame = readf[readf['共引文献'].notnull()]#如果关键词那一列非空,读取所有数据 16 | 17 | keywords = network.seperate(frame, '共引文献', ' ', ';')#关键词列表,里面记录了所有的关键词,没有重复 18 | # for keyword in frame['共引文献']:#分隔关键词,并加入到列表中,去重 19 | # if ',' in keyword: 20 | # temp = keyword.split(',') 21 | # for x in temp: 22 | # if x not in keywords: 23 | # keywords.append(x) 24 | # elif ';' in keyword: 25 | # temp = keyword.split(';') 26 | # for x in temp: 27 | # if x not in keywords: 28 | # keywords.append(x) 29 | # else: 30 | # if keyword not in keywords: 31 | # keywords.append(keyword) 32 | 33 | df = pd.DataFrame(index=frame['序号'],columns=keywords) #建立以标题为行,关键词为列的DataFrame矩阵 34 | df.index.name='序号' 35 | df.columns.name='共引文献' 36 | # 37 | # #将这一篇文献所拥有的关键词在矩阵中标记为1 38 | # for row in frame['序号']: 39 | # for keyword in df.columns: 40 | # if keyword in frame.loc[row]['共引文献']: 41 | # df.loc[row][keyword] = 1 42 | # df = df.fillna(0)#填充空值 43 | 44 | df = network.fill(frame, '序号', '共引文献', df) 45 | 46 | #df为存在矩阵,dataframe类型 47 | #data为关联度,矩阵类型 48 | #df2位关联度矩阵,dataframe类型 49 | 50 | data = df.values.T.dot(df.values)#建立关键词之间的相关性,边的长度为相关性,在这里是将两个df点乘,df.values是按行读取值 51 | df2 = pd.DataFrame(data = data,index=keywords,columns=keywords)#建立关键词之间的相关性矩阵,以关联度作为值传入 52 | 53 | #设置阈值 54 | value = lambda x : x * 30 if x > 0 else 0 55 | df2 = df2.applymap(value) 56 | 57 | net = nx.Graph(df2)#创建无向图,以关键词为节点,相关性为边 58 | 59 | dele, net = network.remove(keywords, net) 60 | 61 | de=dict(net.degree())#建立字典,关键字为索引,度(关联情况)为值 62 | pos = nx.spring_layout(net)#四种建图模式,spectral,shell,circular,spring,spring是可以看的了 63 | 64 | array = np.zeros(len(keywords))#建立以度为值的一维矩阵 65 | arg = np.argsort(-np.array(array)) 66 | labels = {}#记录关键词 67 | for index in range(0, len(keywords)): 68 | labels[keywords[arg[index]]] = keywords[arg[index]] 69 | 70 | de2 = [de[v]*10 for v in sorted(de.keys(), reverse=False)]#应该是节点的大小,尺寸调整合适 71 | 72 | plt.figure(figsize=(50, 50)) 73 | nx.draw_networkx_labels(net,pos,labels, font_size=40,font_color='black',font_family ='YouYuan')#写标记 74 | nx.draw_networkx(net, pos, node_size=de2, 
with_labels = False, node_color='#A52A2A', linewidths=None, width=2.0, edge_color ='#858585') -------------------------------------------------------------------------------- /main/co-word_network.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #共词网络可视化 4 | 5 | import os 6 | import networkx as nx#复杂网络分析库 7 | import network 8 | import pandas as pd 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | path = os.path.abspath('..') 13 | xls = pd.ExcelFile(path+'\dependence\知网数据.xls')#读取xls表格 14 | readf = pd.read_excel(xls,'Sheet1')#读取第一个表 15 | frame = readf[readf['关键词'].notnull()]#如果关键词那一列非空,读取所有数据 16 | 17 | keywords = network.seperate(frame, '关键词', ' ', ';')#关键词列表,里面记录了所有的关键词,没有重复 18 | # for keyword in frame['关键词']:#分隔关键词,并加入到列表中,去重 19 | # if ' ' in keyword: 20 | # temp = keyword.split(' ') 21 | # for x in temp: 22 | # if x not in keywords: 23 | # keywords.append(x) 24 | # elif ';' in keyword: 25 | # temp = keyword.split(';') 26 | # for x in temp: 27 | # if x not in keywords: 28 | # keywords.append(x) 29 | # else: 30 | # if keyword not in keywords: 31 | # keywords.append(keyword) 32 | 33 | #建立以标题为行,关键词为列的DataFrame矩阵 34 | df = pd.DataFrame(index=frame['序号'],columns=keywords) 35 | df.index.name='序号' 36 | df.columns.name='关键词' 37 | 38 | # for row in frame['序号']:#将这一篇文献所拥有的关键词在矩阵中标记为1 39 | # for keyword in df.columns: 40 | # if keyword in frame.loc[row]['关键词']: 41 | # #print(keyword) 42 | # df.loc[row][keyword] = 1 43 | # 44 | # #df为存在矩阵,dataframe类型 45 | # #data为关联度,矩阵类型 46 | # #df2位关联度矩阵,dataframe类型 47 | # 48 | # df = df.fillna(0)#填充空值 49 | df = network.fill(frame, '序号', '关键词', df) 50 | 51 | data = df.values.T.dot(df.values)#建立关键词之间的相关性,边的长度为相关性,在这里是将两个df点乘,df.values是按行读取值 52 | 53 | df2 = pd.DataFrame(data = data,index=keywords,columns=keywords)#建立关键词之间的相关性矩阵 54 | 55 | #设置阈值 56 | valve = lambda x : x if x > 32 else 0 57 | df2 = df2.applymap(valve) 58 | 59 | net = nx.Graph(df2)#创建无向图,以关键词为节点,相关性为边 60 | 61 | # def check(x,net): 62 | # for i in range(0,keywords.index(x)): 63 | # if nx.has_path(net,x,keywords[i]): 64 | # return True 65 | # for j in range(keywords.index(x)+1,len(keywords)): 66 | # if nx.has_path(net,x,keywords[j]): 67 | # return True 68 | # return False 69 | # 70 | # #去除无连接节点 71 | # dele=[] 72 | # for i in range(len(keywords)): 73 | # if not check(keywords[i],net): 74 | # if keywords[i] not in dele: 75 | # dele.append(keywords[i]) 76 | # net.remove_nodes_from(dele) 77 | dele, net = network.remove(keywords, net) 78 | 79 | de=dict(net.degree())#建立字典,关键字为索引,度(关联情况)为值 80 | pos = nx.spring_layout(net)#四种建图模式,spectral,shell,circular,spring,spring是可以看的了 81 | keywords = [i for i in keywords if i not in dele]#有边的关键词 82 | 83 | array = np.zeros(len(keywords))#建立以度为值的一维矩阵 84 | arg = np.argsort(-np.array(array)) 85 | labels = {}#记录关键词 86 | for index in range(0, len(keywords)): 87 | labels[keywords[arg[index]]] = keywords[arg[index]] 88 | 89 | de2 = [de[v]*60 for v in sorted(de.keys(), reverse=False)]#应该是节点的大小,尺寸调整合适 90 | 91 | plt.figure(figsize=(50, 50)) 92 | nx.draw_networkx_labels(net,pos,labels, font_size=40,font_color='black',font_family ='YouYuan') 93 | nx.draw_networkx(net, pos, node_size=de2, with_labels = False, node_color='#A52A2A', linewidths=None, width=2.0, edge_color ='#858585') -------------------------------------------------------------------------------- /main/cooperation_network.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 
2 | #author: moyuweiqing 3 | #合作网络可视化 4 | 5 | import os 6 | import networkx as nx 7 | import pandas as pd 8 | import numpy as np 9 | import matplotlib.pyplot as plt #导入所需要的库 10 | 11 | path = os.path.abspath('..') 12 | xlsx = pd.ExcelFile(path+'\dependence\知网数据.xls') #读取数据文件 13 | readf = pd.read_excel(xlsx,'Sheet1') 14 | 15 | #数据进行预处理,将读入的作者以及相关作者信息转换为列表形式 16 | for i in range(len(readf['作者'])): 17 | if readf['作者'][i] != '[]': 18 | readf['作者'][i] = eval(readf['作者'][i]) 19 | else: 20 | readf['作者'][i] = np.nan 21 | for j in range(len(readf['相关作者'])): 22 | if readf['相关作者'][j] is not np.nan: 23 | readf['相关作者'][j] = eval(readf['相关作者'][j]) 24 | 25 | #数据预处理,将作者为空的数据去除 26 | frame = readf[readf['作者'].notnull()] 27 | frame.index = frame['题目'] 28 | 29 | #获取作者以及相关作者,将其整合到一个列表中 30 | all_authors = [] 31 | for authors in frame['作者']: 32 | for author in authors: 33 | if author not in all_authors: 34 | all_authors.append(author) 35 | for r_authors in frame['相关作者']: 36 | if r_authors is not np.nan: 37 | for r_author in r_authors: 38 | if r_author not in all_authors: 39 | all_authors.append(r_author) 40 | 41 | #构建出现矩阵 42 | df = pd.DataFrame(index=frame['题目'],columns=all_authors) 43 | df.index.name='题目' 44 | df.columns.name='作者' 45 | for title in frame['题目']: 46 | for i in frame.loc[title]['作者']: 47 | df.loc[title,i] = 1 48 | if frame.loc[title]['相关作者'] is not np.nan: 49 | for j in frame.loc[title]['相关作者']: 50 | df.loc[title,j] = 1 51 | df=df.fillna(0) 52 | 53 | #将出现矩阵转换为共现矩阵 54 | data = df.values.T.dot(df.values) 55 | df2 = pd.DataFrame(data = data,index=all_authors,columns=all_authors) 56 | 57 | #设置阀门,排除关联度小的点 58 | valve = lambda x : x if x > 32 else 0 59 | df2 = df2.applymap(valve) 60 | 61 | #构建共现网络 62 | net = nx.Graph(df2) 63 | 64 | #过滤关联度为0的节点 65 | def check(x,net): 66 | for i in range(0,all_authors.index(x)): 67 | if nx.has_path(net,x,all_authors[i]): 68 | return True 69 | for j in range(all_authors.index(x)+1,len(all_authors)): 70 | if nx.has_path(net,x,all_authors[j]): 71 | return True 72 | return False 73 | dele=[] 74 | for i in range(len(all_authors)): 75 | if not check(all_authors[i],net): 76 | if all_authors[i] not in dele: 77 | dele.append(all_authors[i]) 78 | net.remove_nodes_from(dele) 79 | 80 | #设置每个节点的大小比例为它们度的大小比例,并且显示每个节点的标签 81 | de=dict(net.degree()) 82 | pos = nx.spring_layout(net) 83 | all_authors = [i for i in all_authors if i not in dele] 84 | array = np.zeros(len(all_authors)) 85 | j = 0 86 | for i in de.keys(): 87 | array[j] = de[i] 88 | j+=1 89 | arg = np.argsort(-np.array(array)) 90 | labels = {} 91 | for index in range(len(all_authors)): 92 | labels[all_authors[arg[index]]] = all_authors[arg[index]] 93 | de2 = [de[v]*20 for v in sorted(de.keys(), reverse=False)] 94 | 95 | #对网路进行可视化 96 | plt.figure(figsize=(50, 50)) 97 | nx.draw_networkx_labels(net,pos,labels, font_size=40,font_color='black',font_family ='YouYuan') 98 | nx.draw_networkx(net, pos, node_size=de2, with_labels = False, node_color='#A52A2A', linewidths=None, width=1.0, edge_color ='#858585') 99 | -------------------------------------------------------------------------------- /main/draw_word_cloud.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #使用wordcloud库画词云 4 | 5 | import os 6 | import jieba 7 | import imageio as ima #读入图片文件 8 | from wordcloud import WordCloud 9 | 10 | def drawWordCloud(words, title, savepath='./results'): #定义一个词云绘制函数,通过词频绘制词云图并写出到特定目录 11 | path = os.path.abspath('..') 12 | if not os.path.exists(savepath): 13 | 
os.mkdir(savepath) 14 | wc = WordCloud(font_path=path+'\dependence\simkai.ttf', background_color='white', max_words=2000, width=1920, height=1080, margin=5, mask=ima.imread(path+'\dependence\mask.png'))#使用原先准备好的一张照片作为背景图 15 | wc.generate_from_frequencies(words) 16 | wc.to_file(os.path.join(savepath, title+'.png')) 17 | 18 | def statistics(texts, stopwords): #使用jieba库来进行分词,并统计词语出现次数 19 | words_dict = {} 20 | for text in texts: 21 | temp = jieba.cut(text) 22 | for t in temp: 23 | if t in stopwords: 24 | continue 25 | if t in words_dict.keys(): 26 | words_dict[t] += 1 27 | else: 28 | words_dict[t] = 1 29 | return words_dict -------------------------------------------------------------------------------- /main/jieba_analysis.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #使用jieba库进行分词、统计词频、去除无关词 4 | 5 | import jieba 6 | 7 | #统计所有存在的分词 8 | def calculateAllWords(readfile): 9 | all_word = [] #记录所有分词 10 | for row in range(0, len(readfile)): 11 | temp = jieba.cut(readfile[row]) 12 | for i in temp: 13 | if i in all_word: 14 | continue 15 | else: 16 | all_word.append(i) 17 | #all_word.pop() 18 | return all_word 19 | 20 | #统计分词的出现数量 21 | def calculateNumOfEachWord(readfile, all_word): 22 | dic = {} # 记录分词的出现数量 23 | for i in all_word: 24 | dic[i] = 0 25 | for row in range(0, len(readfile)): 26 | temp = jieba.cut(readfile[row]) 27 | for i in temp: 28 | dic[i] = dic[i] + 1 29 | return dic 30 | 31 | # 去除无关词 32 | def removeIrreleventWords(stopwords, dic): 33 | temp_dic = dic.copy() 34 | for i in temp_dic: 35 | if i in stopwords: 36 | dic.pop(i) 37 | return dic 38 | 39 | #对关键词进行排序 40 | def sortKeyWords(keyword,num): 41 | dic_sorted = dict(sorted(keyword.items(), key=lambda x: x[1], reverse=True)) 42 | dic_num = {} 43 | for i in range(0, num): 44 | dic_num[list(dic_sorted.keys())[i]] = list(dic_sorted.values())[i] 45 | return dic_num -------------------------------------------------------------------------------- /main/keywords_by_jieba.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #使用jieba库来进行关键词的提取 4 | 5 | import os 6 | import jieba 7 | import jieba.analyse 8 | 9 | path = os.path.abspath('..') 10 | text = open(path + '\dependence\区块链技术发展现状与展望_袁勇.txt') 11 | text = text.read() 12 | s1 = text.replace('\n', '').replace(' ', '')#去除换行 13 | 14 | fenci_text = jieba.cut(s1) 15 | stopwords = {}.fromkeys([ line.rstrip() for line in open(path + '\dependence\stopwords.txt', encoding = "utf-8") ]) 16 | final = "" 17 | for word in fenci_text: 18 | if word not in stopwords: 19 | if (word != "。" and word != ",") : 20 | final = final + " " + word 21 | 22 | keywords = jieba.analyse.extract_tags(final, topK = 20, withWeight = True, allowPOS = ()) 23 | print(keywords) -------------------------------------------------------------------------------- /main/keywords_by_snownlp.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #使用SnowNlp来提取关键词 4 | 5 | import os 6 | from snownlp import SnowNLP 7 | 8 | path = os.path.abspath('..') 9 | text = open(path + '\dependence\区块链技术发展现状与展望_袁勇.txt').read().replace('\n', '').replace(' ', '') 10 | 11 | analysis_result = SnowNLP(text) 12 | stopwords = {}.fromkeys([ line.rstrip() for line in open(path + '\dependence\stopwords.txt', encoding = "utf-8") ]) 13 | final = "" 14 | for word in analysis_result.keywords(20): 15 | if word not in stopwords: 16 | 
if (word != "。" and word != ",") : 17 | final = final + " " + word 18 | 19 | print(final) -------------------------------------------------------------------------------- /main/keywords_by_textrank4zh.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #使用textrank4zh来进行关键词的提取 4 | 5 | import os 6 | from textrank4zh import TextRank4Keyword 7 | 8 | path = os.path.abspath('..') 9 | text = open(path+'\dependence\区块链技术发展现状与展望_袁勇.txt').read().replace('\n', '').replace(' ', '') 10 | 11 | tr4w = TextRank4Keyword() 12 | tr4w.analyze(text, lower=True) 13 | key_words = tr4w.get_keywords(20) 14 | # print(key_words) 15 | word_list = list(key_word.word for key_word in key_words) 16 | 17 | stopwords = {}.fromkeys([ line.rstrip() for line in open(path+'\dependence\stopwords.txt', encoding = "utf-8") ]) 18 | final = "" 19 | for word in word_list: 20 | if word not in stopwords: 21 | if (word != "。" and word != ",") : 22 | final = final + " " + word 23 | 24 | print(final) -------------------------------------------------------------------------------- /main/network.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #网络 4 | 5 | import networkx as nx #复杂网络分析库 6 | import pandas as pd 7 | import numpy as np 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | # 分隔关键词,并加入到列表中,去重,设置两个分隔符 12 | # 参数说明:(frame:pd.read_excel()之后的对象,name:需要进行分析的那一列的名称,sp1、sp2为分隔符) 13 | def seperate(frame, name, sp1, sp2): 14 | list = [] 15 | for word in frame[name]: 16 | if sp1 in word: 17 | temp = word.split(sp1) 18 | for x in temp: 19 | if x not in list: 20 | list.append(x) 21 | elif sp2 in word: 22 | temp = word.split(sp2) 23 | for x in temp: 24 | if x not in list: 25 | list.append(x) 26 | else: 27 | if word not in list: 28 | list.append(word) 29 | return list 30 | 31 | #填充值,存在这个关键词的dataframe的位置设置为1,其余的用0来填充 32 | #参数说明:(frame:需要用来遍历的那个excel表格,index:用来进行遍历的frame的索引名字,name:用来进行遍历的frame的值,dataframe:用来写入的信息) 33 | def fill(frame, index, name, dataframe): 34 | for row in frame[index]: # 将这一篇文献所拥有的关键词在矩阵中标记为1 35 | for keyword in dataframe.columns: 36 | if keyword in frame.loc[row][name]: 37 | dataframe.loc[row][keyword] = 1 38 | df = dataframe.fillna(0) # 填充空值 39 | return df 40 | 41 | #检查是否有没有连接的节点 42 | #参数说明:(list:需要进行检查的列表,x:节点,net:网络) 43 | def check(list, x, net): 44 | for i in range(0,list.index(x)): 45 | if nx.has_path(net,x,list[i]): 46 | return True 47 | for j in range(list.index(x)+1,len(list)): 48 | if nx.has_path(net,x,list[j]): 49 | return True 50 | return False 51 | 52 | #去除没有连接的节点 53 | #参数说明:(list:需要进行检查的列表,net:网络) 54 | def remove(list, net): 55 | dele = [] 56 | for i in range(len(list)): 57 | if not check(list, list[i], net): 58 | if list[i] not in dele: 59 | dele.append(list[i]) 60 | net.remove_nodes_from(dele) 61 | return dele, net -------------------------------------------------------------------------------- /main/pdf-to-txt.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #pdf转换成txt 4 | 5 | import time,os.path,requests,re 6 | time1=time.time() 7 | from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter 8 | from pdfminer.converter import PDFPageAggregator 9 | from pdfminer.layout import LAParams,LTTextBoxHorizontal 10 | from pdfminer.pdfpage import PDFTextExtractionNotAllowed,PDFPage 11 | from pdfminer.pdfparser import PDFParser 12 | from pdfminer.pdfdocument import 
PDFDocument 13 | 14 | 15 | class CPdf2TxtManager(): 16 | def changePdfToText(self, filePath): 17 | # 以二进制读模式打开 18 | file = open(path, 'rb') 19 | #用文件对象来创建一个pdf文档分析器 20 | praser = PDFParser(file) 21 | # 创建一个PDF文档对象存储文档结构,提供密码初始化,没有就不用传该参数 22 | doc = PDFDocument(praser, password='') 23 | ##检查文件是否允许文本提取 24 | if not doc.is_extractable: 25 | raise PDFTextExtractionNotAllowed 26 | 27 | # 创建PDf 资源管理器 来管理共享资源,#caching = False不缓存 28 | rsrcmgr = PDFResourceManager(caching = False) 29 | # 创建一个PDF设备对象 30 | laparams = LAParams() 31 | # 创建一个PDF页面聚合对象 32 | device = PDFPageAggregator(rsrcmgr, laparams=laparams) 33 | # 创建一个PDF解析器对象 34 | interpreter = PDFPageInterpreter(rsrcmgr, device) 35 | # 获得文档的目录(纲要),文档没有纲要会报错 36 | #PDF文档没有目录时会报:raise PDFNoOutlines pdfminer.pdfdocument.PDFNoOutlines 37 | # print(doc.get_outlines()) 38 | 39 | # 获取page列表 40 | print(PDFPage.get_pages(doc)) 41 | # 循环遍历列表,每次处理一个page的内容 42 | for page in PDFPage.create_pages(doc): 43 | interpreter.process_page(page) 44 | # 接受该页面的LTPage对象 45 | layout = device.get_result() 46 | # 这里layout是一个LTPage对象 里面存放着 这个page解析出的各种对象 47 | # 一般包括LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal 等等 48 | for x in layout: 49 | if hasattr(x, "get_text"): 50 | fileNames = os.path.splitext(filePath) 51 | with open(fileNames[0] + '.txt','a+') as f: 52 | results = x.get_text() 53 | print(results) 54 | f.write(results.encode('gbk','ignore').decode('gbk') + '\n') 55 | # 如果x是水平文本对象的话 56 | # if (isinstance(x, LTTextBoxHorizontal)): 57 | # text = re.sub(replace, '', x.get_text()) 58 | # if len(text) != 0: 59 | # print(text) 60 | 61 | if __name__ == '__main__': 62 | path = os.path.abspath('..') 63 | pdf2TxtManager = CPdf2TxtManager() 64 | pdf2TxtManager.changePdfToText(path+'\dependence\区块链技术发展现状与展望_袁勇.pdf') 65 | time2 = time.time() 66 | print('ok,解析pdf结束!') 67 | print('总共耗时:' + str(time2 - time1) + 's') -------------------------------------------------------------------------------- /main/sentiment_analysis.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #情感分析-by-snownlp&matplotlib 4 | 5 | import os 6 | from snownlp import SnowNLP 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import math 10 | 11 | path = os.path.abspath('..') 12 | text = open(path + '\dependence\区块链技术发展现状与展望_袁勇.txt') 13 | text = text.read() 14 | s1 = text.replace('\n', '').replace(' ', '').replace('.', '。')#去除换行 15 | 16 | #建立情感分析 17 | sn1 = SnowNLP(s1) 18 | sentimentslist = [] 19 | for i in sn1.sentences: 20 | j = SnowNLP(i) 21 | sentimentslist.append(j.sentiments) 22 | 23 | #可视化处理,使用matplotlib 24 | dic = {} 25 | for i in np.arange(0, 1, 0.02): 26 | index = round(i, 2) 27 | dic[index] = 0 28 | for i in sentimentslist: 29 | temp = round(math.floor(i/0.02)*0.02, 2) 30 | dic[temp] = dic[temp] + 1 31 | plt.hist(sentimentslist,bins=np.arange(0,1,0.02)) 32 | plt.savefig(path+'\Results\sentimental_analysis(区块链).png') -------------------------------------------------------------------------------- /main/sentiment_analysis2.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | #author: moyuweiqing 3 | #情感分析-by-snownlp&matplotlib 4 | 5 | import os 6 | from snownlp import SnowNLP 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import math 10 | 11 | path = os.path.abspath('..') 12 | text = open(path + '\dependence\小王子.txt') 13 | text = text.read() 14 | s1 = text.replace('\n', '').replace(' ', '').replace('.', '。')#去除换行 15 | 16 | #建立情感分析 17 | sn1 = 
-------------------------------------------------------------------------------- /main/sentiment_analysis2.py: --------------------------------------------------------------------------------
#coding:utf-8
#author: moyuweiqing
#Sentiment analysis with snownlp & matplotlib

import os
from snownlp import SnowNLP
import matplotlib.pyplot as plt
import numpy as np
import math

path = os.path.abspath('..')
text = open(path + '\dependence\小王子.txt')
text = text.read()
s1 = text.replace('\n', '').replace(' ', '').replace('.', '。')  # strip newlines and spaces, normalise full stops

# sentence-level sentiment scores
sn1 = SnowNLP(s1)
sentimentslist = []
for i in sn1.sentences:
    j = SnowNLP(i)
    sentimentslist.append(j.sentiments)

# visualisation with matplotlib: count how many sentences fall into each 0.02-wide bin
# (the dictionary mirrors the counts that plt.hist computes and is not used further)
dic = {}
for i in np.arange(0, 1, 0.02):
    index = round(i, 2)
    dic[index] = 0
for i in sentimentslist:
    temp = round(math.floor(i/0.02)*0.02, 2)
    dic[temp] = dic[temp] + 1
plt.hist(sentimentslist, bins=np.arange(0, 1, 0.02))
plt.savefig(path+'\Results\sentimental_analysis(小王子).png')
-------------------------------------------------------------------------------- /main/strategy_analysis.py: --------------------------------------------------------------------------------
#coding:utf-8
#author: moyuweiqing
#Strategy analysis: compute density and centrality (the centrality routine is hand-written) and plot them on a 2-D scatter

import os
import pandas as pd
import jieba_analysis
import jieba
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm  # font management, avoids garbled Chinese labels
import math

path = os.path.abspath('..')
xls = pd.ExcelFile(path + '\dependence\知网数据.xls')
readf = pd.read_excel(xls, 'Sheet1')['题目'].astype(str)

# collect every distinct token
all_word = []  # all tokens
all_word = jieba_analysis.calculateAllWords(readf)

# count how often each token occurs
dic_raw = {}  # occurrence count per token
dic_raw = jieba_analysis.calculateNumOfEachWord(readf, all_word)

# drop stop words
stf = open(path+'\dependence\stopwords.txt', encoding="utf-8").read()
dic = jieba_analysis.removeIrreleventWords(stf, dic_raw)

# sort the tokens and keep the 20 most frequent ones
dic_20 = {}  # the top-20 keywords
dic_20 = jieba_analysis.sortKeyWords(dic, 20)

# start from an empty centrality table
dic_heart = {}
for i in range(0, 20):
    dic_heart[list(dic_20.keys())[i]] = 0

# compute centrality: for every title that contains the keyword, add the title's token count
for key in dic_heart.keys():
    for row in range(0, len(readf)):
        temp = list(jieba.cut(readf[row]))  # materialise the generator so the whole title is counted
        if key in temp:
            dic_heart[key] = dic_heart[key] + len(temp)

log_densit = []  # natural log of density
log_heart = []   # natural log of centrality

# take natural logs of density and centrality
for i in dic_20.values():
    log_densit.append(math.log(i))
for i in dic_heart.values():
    if i != 0:
        log_heart.append(math.log(i))
    else:
        log_heart.append(0)

# make Chinese labels render correctly
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

plt.title(u'密度和向心度散点图')
plt.xlabel('密度的自然对数')
plt.ylabel('向心度的自然对数')

plt.scatter(log_densit, log_heart, s=20, c="#ff1212", marker='o')
for i in range(0, 20):
    plt.annotate(list(dic_20.keys())[i], xy=(log_densit[i], log_heart[i]))
plt.savefig(path + "\Results\scatter_log.png")
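The centrality loop above adds, for every keyword, the full token count of each title that contains it. As a tiny self-contained illustration (the titles below are made up, not rows from 知网数据.xls), this is what it computes for a single keyword:

# Illustrative sketch (not part of the repository): the centrality of one keyword is the
# summed token count of every title whose jieba tokenisation contains that keyword.
import jieba

titles = ['区块链技术发展现状', '区块链与比特币', '人工智能综述']  # assumed toy titles
key = '区块链'

centrality = 0
for title in titles:
    tokens = list(jieba.cut(title))
    if key in tokens:
        centrality += len(tokens)
print(key, centrality)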
-------------------------------------------------------------------------------- /main/strategy_analysis_uniform.py: --------------------------------------------------------------------------------
import pandas as pd
import jieba
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm  # font management, avoids garbled Chinese labels
import math

xls = pd.ExcelFile(r'C:\Users\Yoga\Desktop\srp资料\知网-区块链(2).xls')
readf = pd.read_excel(xls, 'Sheet1')['标题']

all_word = []  # all tokens
dic = {}       # occurrence count per token

# collect every distinct token
for row in range(0, len(readf)):
    temp = jieba.cut(readf[row])
    for i in temp:
        if i in all_word:
            continue
        else:
            all_word.append(i)

# count how often each token occurs
for i in all_word:
    dic[i] = 0

for row in range(0, len(readf)):
    temp = jieba.cut(readf[row])
    for i in temp:
        dic[i] = dic[i] + 1

# drop stop words
f = open(r'D:\JetBrains\PyCharm 2018.3.4\CNKI-analysis\venv\Include\dependence\stopwords.txt', encoding="utf-8")
temp_dic = dic.copy()
f = f.read()
for i in temp_dic:
    if i in f:
        dic.pop(i)

# sort the tokens and keep the 20 most frequent ones
dic_sorted = dict(sorted(dic.items(), key=lambda x: x[1], reverse=True))
dic_20 = {}  # the 20 most frequent words
for i in range(0, 20):
    dic_20[list(dic_sorted.keys())[i]] = list(dic_sorted.values())[i]
print(dic_20)

# start from an empty centrality table
dic_heart = {}
for i in range(0, 20):
    dic_heart[list(dic_20.keys())[i]] = 0

# compute centrality: for every title that contains the keyword, add the title's token count
for key in dic_heart.keys():
    for row in range(0, len(readf)):
        temp = list(jieba.cut(readf[row]))  # materialise the generator so the whole title is counted
        if key in temp:
            dic_heart[key] = dic_heart[key] + len(temp)

exp_densit = []  # natural log of density
exp_heart = []   # natural log of centrality

# take natural logs of density and centrality
for i in dic_20.values():
    exp_densit.append(math.log(i))
for i in dic_heart.values():
    if i != 0:
        exp_heart.append(math.log(i))
    else:
        exp_heart.append(0)

# make Chinese labels render correctly
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
plt.rcParams['axes.unicode_minus'] = False

plt.title(u'密度和向心度散点图')
plt.xlabel('密度的自然对数')
plt.ylabel('向心度的自然对数')

plt.scatter(exp_densit, exp_heart, s=20, c="#ff1212", marker='o')
for i in range(0, 20):
    plt.annotate(list(dic_20.keys())[i], xy=(exp_densit[i], exp_heart[i]))
plt.show()
# plt.savefig("scatter_exp.png")
-------------------------------------------------------------------------------- /main/word_cloud.py: --------------------------------------------------------------------------------
#coding:utf-8
#author: moyuweiqing
#Word cloud

import os
import pandas as pd
import draw_word_cloud

if __name__ == '__main__':
    content = []
    path = os.path.abspath('..')
    xls = pd.ExcelFile(path+'\dependence\知网数据.xls')  # load the data file
    readf = pd.read_excel(xls, 'Sheet1')  # read the first sheet
    frame = readf[readf['题目'].notnull()]  # keep only the rows whose title column is not empty
    for keyword in frame['题目']:  # collect the titles into a list
        content.append(keyword)
    stopwords = open(path+'\dependence\stopwords.txt', 'r', encoding='utf-8').read().split('\n')[:-1]
    words_dict = draw_word_cloud.statistics(content, stopwords)
    draw_word_cloud.drawWordCloud(words_dict, '区块链词云', savepath=path + '\Results')
--------------------------------------------------------------------------------
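word_cloud.py delegates the counting and drawing to draw_word_cloud.py, which is not reproduced above. The sketch below is only a hedged stand-in for that module: the function names match the calls in word_cloud.py, but the bodies, the use of the wordcloud package, and the font path are assumptions, not the repository's implementation.

# Illustrative stand-in (not the repository's draw_word_cloud module).
import jieba
from wordcloud import WordCloud

def statistics(content, stopwords):
    # frequency dictionary over all jieba tokens that are not stop words
    freq = {}
    for line in content:
        for word in jieba.cut(line):
            if word.strip() and word not in stopwords:
                freq[word] = freq.get(word, 0) + 1
    return freq

def drawWordCloud(words_dict, name, savepath='.'):
    # render and save the cloud; simkai.ttf from the dependence folder is assumed to be the font
    wc = WordCloud(font_path='../dependence/simkai.ttf', background_color='white',
                   width=800, height=600)
    wc.generate_from_frequencies(words_dict)
    wc.to_file(savepath + '/' + name + '.png')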