├── 2020年泰迪杯挑战赛C题赛题描述.pdf ├── README.md ├── code ├── 第一问 │ ├── .ipynb_checkpoints │ │ ├── data_process-checkpoint.ipynb │ │ └── 测试代码-checkpoint.ipynb │ ├── SVC模型.pkl │ ├── __pycache__ │ │ └── data_process1.cpython-37.pyc │ ├── data_process.ipynb │ ├── data_process1.py │ ├── fj2.csv │ ├── model1.py │ ├── newdic1.txt │ ├── stopword1.txt │ ├── word_cloud1.py │ ├── word_fre1-7.csv │ ├── 作品测试结果.zip │ ├── 作品测试结果 │ │ ├── 作品测试结果.zip │ │ ├── 测试代码.ipynb │ │ ├── 附件2(测试数据).xlsx │ │ └── 附件2(测试结果).xlsx │ └── 结果2.xlsx ├── 第三问 │ ├── .ipynb_checkpoints │ │ ├── 主题模型-checkpoint.ipynb │ │ ├── 开头结尾及解决否-checkpoint.ipynb │ │ └── 连接词模型-checkpoint.ipynb │ ├── Word Dict │ │ ├── readme.txt │ │ ├── unParallelWord.txt │ │ └── unSingleWord.txt │ ├── ~$开头结尾.xlsx │ ├── 及时性 │ │ ├── 回复时间.xlsx │ │ └── 回复速度计算.ipynb │ ├── 可解释性 │ │ ├── .ipynb_checkpoints │ │ │ └── 连接词模型-checkpoint.ipynb │ │ ├── 增加可解释性词.csv │ │ ├── 增加连接词.csv │ │ ├── 自定义可解释性.txt │ │ ├── 连接词模型.ipynb │ │ ├── 连接词词典.csv │ │ └── 附件4.xlsx │ ├── 完整性 │ │ ├── 开头结尾.xlsx │ │ ├── 开头结尾及解决否.ipynb │ │ └── 开头结尾及解决否.xlsx │ ├── 相关性 │ │ ├── .ipynb_checkpoints │ │ │ └── 相似度-checkpoint.ipynb │ │ ├── ~$主题2.xlsx │ │ ├── 主题2.xlsx │ │ ├── 主题模型.ipynb │ │ ├── 相似度.ipynb │ │ └── 相似度.xlsx │ ├── 计分 │ │ ├── .ipynb_checkpoints │ │ │ ├── Untitled-checkpoint.ipynb │ │ │ └── 计分-checkpoint.ipynb │ │ ├── Untitled.ipynb │ │ ├── ~$总得分(最终版).xlsx │ │ ├── ~$统计10.xlsx │ │ ├── 准备计分.xlsx │ │ ├── 回复时间.xlsx │ │ ├── 总得分(最终版).xlsx │ │ ├── 正在计分.xlsx │ │ ├── 统计10.xlsx │ │ └── 计分.ipynb │ └── 附件4.xlsx └── 第二问 │ ├── 热点分析.ipynb │ ├── 热点问题留言明细表 .xls │ └── 热点问题表 .xls ├── data ├── 测试数据.xlsx ├── 附件1.xlsx ├── 附件2.xlsx ├── 附件3.xlsx └── 附件4.xlsx └── paper.pdf /2020年泰迪杯挑战赛C题赛题描述.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/2020年泰迪杯挑战赛C题赛题描述.pdf -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | 近年来,随着微信、微博、市长信箱、阳光热线等网络问政平台逐步成为政府了解民意、汇聚民智、凝聚民气的重要渠道,各类社情民意相关的文本数据量不断攀升,给以往主要依靠人工来进行留言划分和热点整理的相关部门的工作带来了极大挑战。同时,随着大数据技术的发展,建立基于自然语言处理技术的智慧政务系统已经是社会治理创新发展的新趋势,对提升政府的管理水平和施政效率具有极大的推动作用。 2 | 3 | 本文针对“智慧政务”中的居民投诉建议文本评论数据,基于向量空间模型算法提取了文本关键词,并采用多种机器学习分类模型进行测试,最终得到线性支持向量分类(LinearSVC)算法相对较优的结果,F1-Score评价指标达0.86。 4 | 5 | 在挖掘热点问题的前期处理上,使用了余弦相似度计算整理出文本相似的同类主题并加以筛选,通过在SPSS中建立基于因子分析法的热度评价指标模型,给出得分前五的主题样本作为Top5热点问题,并分析比较了相关类问题的热度在各个指标上的具体表现。 6 | 7 | 为建立留言的答复意见的评价体系,我们定义了相关性、完整性、可解释性和及时性四个指标。答复意见和留言详情相关性的计算是基于LDA主题模型的中文编辑距离得到的;另外,答复意见的可解释性使用了哈工大中文篇章关系的关联词表以及自定义的可解释性词典来判别。通过将这四项指标的得分相加得到某条答复意见的综合评分,分数越高,该答复的质量就越高,从而为决策者提供一个较为清晰完善的参考意见。 8 | -------------------------------------------------------------------------------- /code/第一问/.ipynb_checkpoints/data_process-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %load data_process1.py\n", 10 | "import pandas as pd\n", 11 | "import re\n", 12 | "import jieba.analyse\n", 13 | "\n", 14 | "def data_process(file='附件2(测试数据).xlsx'):\n", 15 | " data = pd.read_excel(file, index_col=0,encoding = 'GB18030')\n", 16 | " \n", 17 | " \n", 18 | " jieba.load_userdict('newdic1.txt')\n", 19 | " data_cut = data['留言详情'].apply(lambda x: jieba.lcut(x))\n", 20 | "\n", 21 | " stopWords = pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha', header=None)\n", 22 | " stopWords = [' ', '\\n', '\\t', '\\r\\n', '\\u3000', '"', '–'] + list(stopWords.iloc[:, 0])\n", 23 | " data_after_stop = data_cut.apply(lambda x: [i for i in x if i not in stopWords])\n", 24 | " #labels = data_new.loc[data_after_stop.index, '一级标签']\n", 25 | " adata = data_after_stop.apply(lambda x: ' '.join(x))\n", 26 | " data_after_stop = data_after_stop.to_frame()\n", 27 | " \n", 28 | "\t#提取关键词\n", 29 | " key=[]\n", 30 | " for 
i in adata:\n", 31 | " keywords=jieba.analyse.extract_tags(i,topK=20)\n", 32 | " key.append(keywords)\n", 33 | " data_after_stop['key']=key\n", 34 | " \n", 35 | "\t#去除城市乡镇以外的字母和0\n", 36 | " key=[]\n", 37 | " pattern = re.compile('[0-9]+')\n", 38 | " for x in data_after_stop['key']:\n", 39 | " temp=[]\n", 40 | " for i in x:\n", 41 | " match = pattern.findall(i)\n", 42 | " if match:\n", 43 | " pass\n", 44 | " else:\n", 45 | " temp.append(i)\n", 46 | " key.append(temp)\n", 47 | " data_after_stop['key']=key\n", 48 | " #data_after_stop['labels']=labels\n", 49 | " return adata, data_after_stop\n", 50 | "#adata, data_after_stop, labels = data_process()\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 11, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stderr", 61 | "output_type": "stream", 62 | "text": [ 63 | "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:13: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 64 | " del sys.path[0]\n" 65 | ] 66 | }, 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "留言编号\n", 71 | "1 A1区 苑 小区 A1区 火炬 路 小区 物业 A市 程明 物业管理 有限公司 未经 小区业...\n", 72 | "23 五一 大道 202 李四 LIST PUB 十点钟 营业时间 钟 结束 大声 播放 迪厅 嗨...\n", 73 | "26 月 地铁 号线 桐梓 坡 地铁站 交叉口 设备 发出 怪异 低 频率 噪音 扰民 日夜不停 ...\n", 74 | "39 地铁 号线 号线 通车 正线 剩下 号线 修 启动 号线 2022 年 号线 通车 剩下 号...\n", 75 | "40 投诉 A2区 福满 新城 二期 噪音 扰民 82 天 拨打 投诉 奢望 市政府 职能部门 通...\n", 76 | " ... 
\n", 77 | "6056990 医生 实行 基药 乡镇 卫生院 底下 各村 驻点 撤 卫生院 高级 职工 一视同仁 保留 不...\n", 78 | "6665290 C3县 居民 家中 独女 老师 农民 农村 居住 读 中专 时 户口 转到 C市 区 05 ...\n", 79 | "16704000 步步高 E12 市店 无视 劳动法 法定 节假日 上班 14 不发 少发 加班 工资 平时 ...\n", 80 | "33681565 西地 省格尚 置业 有限公司 2004 年 滨江 花园 一期 土地出让 合同 办理 土地 使...\n", 81 | "38654898 抽空 留言 原 F市 财经 中等专业 学校 98 届 学生 学校 合并 十三 中学 找回 学...\n", 82 | "Name: 留言详情, Length: 2801, dtype: object" 83 | ] 84 | }, 85 | "execution_count": 11, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 14, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.7.1" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 2 128 | } 129 | -------------------------------------------------------------------------------- /code/第一问/.ipynb_checkpoints/测试代码-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 55, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "E:\\MY_COMPETE\\2020挑战杯\\C题数据&代码\\附件最新\\附件\\附件\\第一问\\data_process1.py:26: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid 
this warning by specifying engine='python'.\n", 13 | " stopWords = pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha', header=None)\n", 14 | "D:\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 15 | " FutureWarning)\n", 16 | "D:\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n", 17 | " \"this warning.\", FutureWarning)\n" 18 | ] 19 | }, 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,\n", 24 | " intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n", 25 | " multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n", 26 | " verbose=0)" 27 | ] 28 | }, 29 | "execution_count": 55, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "# %load model1.py\n", 36 | "import seaborn as sns\n", 37 | "import pandas as pd\n", 38 | "import matplotlib.pyplot as plt\n", 39 | "\n", 40 | "from data_process1 import data_process\n", 41 | "from sklearn.naive_bayes import GaussianNB, MultinomialNB\n", 42 | "from sklearn.linear_model import LogisticRegression\n", 43 | "from sklearn.ensemble import RandomForestClassifier\n", 44 | "from sklearn.svm import LinearSVC\n", 45 | "from sklearn.model_selection import cross_val_score, train_test_split\n", 46 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer\n", 47 | "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", 48 | "'''\n", 49 | "#朴素贝叶斯\n", 50 | "adata, data_after_stop, labels = data_process()\n", 51 | "data_tr, data_te, labels_tr, labels_te = train_test_split(adata, labels, test_size=0.2)\n", 52 | "\n", 53 | "countVectorizer = 
CountVectorizer()\n", 54 | "data_tr = countVectorizer.fit_transform(data_tr)\n", 55 | "X_tr = TfidfTransformer().fit_transform(data_tr.toarray()).toarray()\n", 56 | "\n", 57 | "data_te = CountVectorizer(vocabulary=countVectorizer.vocabulary_).fit_transform(data_te)\n", 58 | "X_te = TfidfTransformer().fit_transform(data_te.toarray()).toarray()\n", 59 | "'''\n", 60 | "\n", 61 | "\n", 62 | "#--------------------------------------------------\n", 63 | "\n", 64 | "\n", 65 | "#其他四种模型\n", 66 | "adata, data_after_stop, labels = data_process()\n", 67 | "adata_key = data_after_stop['key']\n", 68 | "adata_set = adata_key.apply(lambda x: ' '.join(x))\n", 69 | "\n", 70 | "tfidfVectorizer = TfidfVectorizer(norm='l2', ngram_range=(1, 2))\n", 71 | "features = tfidfVectorizer.fit_transform(adata_set)\n", 72 | "\n", 73 | "\n", 74 | "\n", 75 | "models = [\n", 76 | " RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),\n", 77 | " LinearSVC(),\n", 78 | " MultinomialNB(),\n", 79 | " LogisticRegression(random_state=0),\n", 80 | "]\n", 81 | "\n", 82 | "#5折交叉验证\n", 83 | "CV = 5\n", 84 | "cv_df = pd.DataFrame(index=range(CV * len(models)))\n", 85 | "entries = []\n", 86 | "for model in models:\n", 87 | " \n", 88 | " model_name = model.__class__.__name__\n", 89 | " accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)\n", 90 | " for fold_idx, accuracy in enumerate(accuracies):\n", 91 | " entries.append((model_name, fold_idx, accuracy))\n", 92 | "cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])\n", 93 | "\n", 94 | "\n", 95 | "#-------------------------------------\n", 96 | "#线性SVC模型调用\n", 97 | "model = LinearSVC()\n", 98 | "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, data_after_stop.index, \n", 99 | " test_size=0.3, stratify=labels, random_state=0)\n", 100 | "model.fit(X_train, y_train)\n", 101 | "#y_pred = model.predict(X_test)\n", 102 | "\n", 103 | "\n", 104 | 
"#print('accuracy %s' % accuracy_score(y_pred, y_test))\n", 105 | "#print(classification_report(y_test, y_pred,target_names=labelss))\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 56, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "(3469,)" 117 | ] 118 | }, 119 | "execution_count": 56, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "data_after_stop['key'].shape" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 57, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "name": "stdout", 135 | "output_type": "stream", 136 | "text": [ 137 | "X_test为: (1041, 73876)\n", 138 | "X_train为: (2428, 73876)\n", 139 | "features为: (3469, 73876)\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "print('X_test为:',X_test.shape)\n", 145 | "print('X_train为:',X_train.shape)\n", 146 | "print('features为:',features.shape)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 58, 152 | "metadata": {}, 153 | "outputs": [ 154 | { 155 | "data": { 156 | "text/plain": [ 157 | "['SVC模型.pkl']" 158 | ] 159 | }, 160 | "execution_count": 58, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "#引入sklearn中自带的保存模型\n", 167 | "from sklearn.externals import joblib\n", 168 | "#保存模型\n", 169 | "joblib.dump(model, 'SVC模型.pkl')" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 59, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "#预处理测试数据\n", 179 | "import pandas as pd\n", 180 | "import re\n", 181 | "import jieba.analyse\n", 182 | "\n", 183 | "def process(file='附件2(测试数据).xlsx'):\n", 184 | " data = pd.read_excel(file, index_col=0,encoding = 'GB18030')\n", 185 | " \n", 186 | " \n", 187 | " jieba.load_userdict('newdic1.txt')\n", 188 | " data_cut = data['留言详情'].apply(lambda x: jieba.lcut(x))\n", 189 | "\n", 190 | " stopWords = 
pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha', header=None)\n", 191 | " stopWords = [' ', '\\n', '\\t', '\\r\\n', '\\u3000', '"', '–'] + list(stopWords.iloc[:, 0])\n", 192 | " after_stop = data_cut.apply(lambda x: [i for i in x if i not in stopWords])\n", 193 | " #labels = data_new.loc[data_after_stop.index, '一级标签']\n", 194 | " adata = after_stop.apply(lambda x: ' '.join(x))\n", 195 | " after_stop = after_stop.to_frame()\n", 196 | " \n", 197 | "\t#提取关键词\n", 198 | " key=[]\n", 199 | " for i in adata:\n", 200 | " keywords=jieba.analyse.extract_tags(i,topK=20)\n", 201 | " key.append(keywords)\n", 202 | " after_stop['key']=key\n", 203 | " \n", 204 | "\t#去除城市乡镇以外的字母和0\n", 205 | " key=[]\n", 206 | " pattern = re.compile('[0-9]+')\n", 207 | " for x in after_stop['key']:\n", 208 | " temp=[]\n", 209 | " for i in x:\n", 210 | " match = pattern.findall(i)\n", 211 | " if match:\n", 212 | " pass\n", 213 | " else:\n", 214 | " temp.append(i)\n", 215 | " key.append(temp)\n", 216 | " after_stop['key']=key\n", 217 | " #data_after_stop['labels']=labels\n", 218 | " return after_stop\n", 219 | "\n", 220 | "\n" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 61, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "name": "stderr", 230 | "output_type": "stream", 231 | "text": [ 232 | "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:13: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 233 | " del sys.path[0]\n" 234 | ] 235 | } 236 | ], 237 | "source": [ 238 | "after_stop = process()\n", 239 | "key = after_stop['key']\n", 240 | "join_key = key.apply(lambda x: ' '.join(x))\n", 241 | "\n", 242 | "\n", 243 | "my_features = TfidfVectorizer(vocabulary=tfidfVectorizer.vocabulary_).fit_transform(join_key)" 244 | ] 245 | }, 246 | { 247 | "cell_type": 
"code", 248 | "execution_count": 62, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "name": "stdout", 253 | "output_type": "stream", 254 | "text": [ 255 | "(2801, 73876)\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "print(my_features.shape)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 63, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/html": [ 271 | "
\n", 272 | "\n", 285 | "\n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | "
留言用户留言主题留言时间留言详情一级分类
留言编号
1U0001196投诉A市A1区苑物业违规收停车费2019/12/30 17:06:14\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的领导:A1区苑小区位于A1区火炬路,小...商贸旅游
23U0002738A4区五一大道一酒吧噪音严重影响居民休息2020/1/6 13:15:56\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t位于五一大道202的“李四的LIST-PUB...环境保护
26U0003729A市地铁6号线桐梓坡地铁站交叉口低频率噪音严重扰民2020/1/6 11:29:11\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t近2个月来,地铁6号线桐梓坡地铁站交叉口有设...环境保护
39U0007638A市地铁8号线工作现在就要开始准备了2020/1/3 22:26:19\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t今年地铁3号线和5号线通车后,正线就只剩下6...城乡建设
40U0005855A2区福满新城二期施工噪音扰民谁能管2020/1/3 20:08:05\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t今天是投诉A2区福满新城二期噪音扰民问题第8...环境保护
..................
6056990U0004035为何在B市乡镇卫生院的高级职工所在的点不能一视同仁还保留?2012/12/15 18:39:45\\n \\n   我想请问一下领导,自从乡村医生实行基药开始,是不是所有的乡镇...卫生计生
6665290U0008214C3县居民这样的情况要不要罚款?2011/12/13 14:05:17\\n \\n   你好,我是C3县居民,在家中是独女,父亲是老师,母亲是农民,...卫生计生
16704000U0007689请求市长查处步步高E12市店长期拖欠员工加班工资的违法行为2011/11/16 1:35:27\\n \\n 尊敬的王书记:  您好!步步高E12市店无视劳动法有关规定,国家...劳动和社会保障
33681565U0006759B市滨江花园二期容积率达到4.11之高2018/12/24 19:19:26\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t西地省格尚置业有限公司于2004年取得滨江花...城乡建设
38654898U000262F市十三中学弄丢了我的学籍档案,我咋办?2012/9/7 12:40:52\\n \\n 王局长:  您好!希望您在百忙之中抽空看看我的留言。我是原F市财...教育文体
\n", 395 | "

2801 rows × 5 columns

\n", 396 | "
" 397 | ], 398 | "text/plain": [ 399 | " 留言用户 留言主题 留言时间 \\\n", 400 | "留言编号 \n", 401 | "1 U0001196 投诉A市A1区苑物业违规收停车费 2019/12/30 17:06:14 \n", 402 | "23 U0002738 A4区五一大道一酒吧噪音严重影响居民休息 2020/1/6 13:15:56 \n", 403 | "26 U0003729 A市地铁6号线桐梓坡地铁站交叉口低频率噪音严重扰民 2020/1/6 11:29:11 \n", 404 | "39 U0007638 A市地铁8号线工作现在就要开始准备了 2020/1/3 22:26:19 \n", 405 | "40 U0005855 A2区福满新城二期施工噪音扰民谁能管 2020/1/3 20:08:05 \n", 406 | "... ... ... ... \n", 407 | "6056990 U0004035 为何在B市乡镇卫生院的高级职工所在的点不能一视同仁还保留? 2012/12/15 18:39:45 \n", 408 | "6665290 U0008214 C3县居民这样的情况要不要罚款? 2011/12/13 14:05:17 \n", 409 | "16704000 U0007689 请求市长查处步步高E12市店长期拖欠员工加班工资的违法行为 2011/11/16 1:35:27 \n", 410 | "33681565 U0006759 B市滨江花园二期容积率达到4.11之高 2018/12/24 19:19:26 \n", 411 | "38654898 U000262 F市十三中学弄丢了我的学籍档案,我咋办? 2012/9/7 12:40:52 \n", 412 | "\n", 413 | " 留言详情 一级分类 \n", 414 | "留言编号 \n", 415 | "1 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的领导:A1区苑小区位于A1区火炬路,小... 商贸旅游 \n", 416 | "23 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t位于五一大道202的“李四的LIST-PUB... 环境保护 \n", 417 | "26 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t近2个月来,地铁6号线桐梓坡地铁站交叉口有设... 环境保护 \n", 418 | "39 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t今年地铁3号线和5号线通车后,正线就只剩下6... 城乡建设 \n", 419 | "40 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t今天是投诉A2区福满新城二期噪音扰民问题第8... 环境保护 \n", 420 | "... ... ... \n", 421 | "6056990 \\n \\n   我想请问一下领导,自从乡村医生实行基药开始,是不是所有的乡镇... 卫生计生 \n", 422 | "6665290 \\n \\n   你好,我是C3县居民,在家中是独女,父亲是老师,母亲是农民,... 卫生计生 \n", 423 | "16704000 \\n \\n 尊敬的王书记:  您好!步步高E12市店无视劳动法有关规定,国家... 劳动和社会保障 \n", 424 | "33681565 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t西地省格尚置业有限公司于2004年取得滨江花... 城乡建设 \n", 425 | "38654898 \\n \\n 王局长:  您好!希望您在百忙之中抽空看看我的留言。我是原F市财... 
教育文体 \n", 426 | "\n", 427 | "[2801 rows x 5 columns]" 428 | ] 429 | }, 430 | "execution_count": 63, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "my_model = joblib.load('SVC模型.pkl')\n", 437 | "first_labels = []\n", 438 | "#得到预测的目标值\n", 439 | "to_list = my_model.predict(my_features)\n", 440 | "for i in to_list:\n", 441 | " first_labels.append(i)\n", 442 | "\n", 443 | "#目标值写进测试数据文件\n", 444 | "data1 = pd.read_excel('附件2(测试数据).xlsx', index_col=0,encoding = 'GB18030')\n", 445 | "data1 = data1.drop(['一级分类'], axis = 1)\n", 446 | "data1['一级分类'] = first_labels\n", 447 | "data1" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 71, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "#把目标值写进测试结果文件\n", 457 | "data2 = pd.read_excel('附件2(测试结果).xlsx', index_col=0,encoding = 'GB18030')\n", 458 | "data2 = data2.drop(['一级分类'], axis = 1)\n", 459 | "data_new = pd.merge(data1, data2, on='留言编号')\n", 460 | "data_temp1 = data_new.drop(['留言用户'], axis = 1)\n", 461 | "data_temp2 = data_temp1.drop(['留言主题'], axis = 1)\n", 462 | "data_temp3 = data_temp2.drop(['留言时间'], axis = 1)\n", 463 | "data_final = data_temp3.drop(['留言详情'], axis = 1)\n", 464 | "data_final.to_excel('结果(待重命名).xlsx',encoding = 'GB18030')" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [] 480 | } 481 | ], 482 | "metadata": { 483 | "kernelspec": { 484 | "display_name": "Python 3", 485 | "language": "python", 486 | "name": "python3" 487 | }, 488 | "language_info": { 489 | "codemirror_mode": { 490 | "name": "ipython", 491 | "version": 3 492 | }, 493 | "file_extension": ".py", 494 | "mimetype": "text/x-python", 495 | "name": "python", 496 | "nbconvert_exporter": "python", 497 | "pygments_lexer": "ipython3", 
498 | "version": "3.7.1" 499 | } 500 | }, 501 | "nbformat": 4, 502 | "nbformat_minor": 2 503 | } 504 | -------------------------------------------------------------------------------- /code/第一问/SVC模型.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/SVC模型.pkl -------------------------------------------------------------------------------- /code/第一问/__pycache__/data_process1.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/__pycache__/data_process1.cpython-37.pyc -------------------------------------------------------------------------------- /code/第一问/data_process.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# %load data_process1.py\n", 10 | "import pandas as pd\n", 11 | "import re\n", 12 | "import jieba.analyse\n", 13 | "\n", 14 | "def data_process(file='附件2(测试数据).xlsx'):\n", 15 | " data = pd.read_excel(file, index_col=0,encoding = 'GB18030')\n", 16 | " \n", 17 | " \n", 18 | " jieba.load_userdict('newdic1.txt')\n", 19 | " data_cut = data['留言详情'].apply(lambda x: jieba.lcut(x))\n", 20 | "\n", 21 | " stopWords = pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha', header=None)\n", 22 | " stopWords = [' ', '\\n', '\\t', '\\r\\n', '\\u3000', '"', '–'] + list(stopWords.iloc[:, 0])\n", 23 | " data_after_stop = data_cut.apply(lambda x: [i for i in x if i not in stopWords])\n", 24 | " #labels = data_new.loc[data_after_stop.index, '一级标签']\n", 25 | " adata = data_after_stop.apply(lambda x: ' '.join(x))\n", 26 | " data_after_stop = data_after_stop.to_frame()\n", 27 | " \n", 28 | "\t#提取关键词\n", 
29 | " key=[]\n", 30 | " for i in adata:\n", 31 | " keywords=jieba.analyse.extract_tags(i,topK=20)\n", 32 | " key.append(keywords)\n", 33 | " data_after_stop['key']=key\n", 34 | " \n", 35 | "\t#去除城市乡镇以外的字母和0\n", 36 | " key=[]\n", 37 | " pattern = re.compile('[0-9]+')\n", 38 | " for x in data_after_stop['key']:\n", 39 | " temp=[]\n", 40 | " for i in x:\n", 41 | " match = pattern.findall(i)\n", 42 | " if match:\n", 43 | " pass\n", 44 | " else:\n", 45 | " temp.append(i)\n", 46 | " key.append(temp)\n", 47 | " data_after_stop['key']=key\n", 48 | " #data_after_stop['labels']=labels\n", 49 | " return adata, data_after_stop\n", 50 | "#adata, data_after_stop, labels = data_process()\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 11, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stderr", 61 | "output_type": "stream", 62 | "text": [ 63 | "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:13: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 64 | " del sys.path[0]\n" 65 | ] 66 | }, 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "留言编号\n", 71 | "1 A1区 苑 小区 A1区 火炬 路 小区 物业 A市 程明 物业管理 有限公司 未经 小区业...\n", 72 | "23 五一 大道 202 李四 LIST PUB 十点钟 营业时间 钟 结束 大声 播放 迪厅 嗨...\n", 73 | "26 月 地铁 号线 桐梓 坡 地铁站 交叉口 设备 发出 怪异 低 频率 噪音 扰民 日夜不停 ...\n", 74 | "39 地铁 号线 号线 通车 正线 剩下 号线 修 启动 号线 2022 年 号线 通车 剩下 号...\n", 75 | "40 投诉 A2区 福满 新城 二期 噪音 扰民 82 天 拨打 投诉 奢望 市政府 职能部门 通...\n", 76 | " ... 
import re

import jieba.analyse
import pandas as pd


def data_process(file='fj2.csv', n=500, random_state=None):
    """Load the labelled messages, draw a balanced sample and tokenize it.

    Args:
        file: Header-less, comma-separated, GB18030-encoded CSV whose first
            column is used as the index and whose remaining five columns are
            user, subject, time, message body and first-level label.
        n: Maximum number of rows sampled from each of the seven first-level
            categories.  A category holding fewer than ``n`` rows contributes
            all of its rows instead of raising.
        random_state: Forwarded to ``DataFrame.sample``; pass an int to make
            the sample reproducible.  ``None`` keeps the original
            non-deterministic behaviour.

    Returns:
        A tuple ``(adata, data_after_stop, labels)``:

        * ``adata`` -- Series of space-joined tokens per message, after
          stopword removal.
        * ``data_after_stop`` -- DataFrame with the token lists (column
          ``'留言详情'``), the digit-free top-20 keywords (``'key'``) and the
          category of each message (``'labels'``).
        * ``labels`` -- Series of first-level categories aligned with
          ``adata``.

    Side effects:
        Reads ``newdic1.txt`` and ``stopword1.txt`` from the current working
        directory and registers the former as a jieba user dictionary.
    """
    data = pd.read_csv(file, header=None, index_col=0, sep=',', encoding='GB18030')
    data.columns = ['留言用户', '留言主题', '留言时间', '留言详情', '一级标签']

    # Balanced sampling: the same number of rows from every category, in the
    # same category order as before so the concatenated frame is unchanged.
    categories = ['城乡建设', '环境保护', '交通运输', '教育文体',
                  '劳动和社会保障', '商贸旅游', '卫生计生']
    samples = []
    for category in categories:
        group = data[data['一级标签'] == category]
        samples.append(group.sample(min(n, len(group)), random_state=random_state))
    data_new = pd.concat(samples, axis=0)

    data_dup = data_new['留言详情'].drop_duplicates()
    # Strips every lowercase 'x' character (presumably the 'xxx'
    # anonymisation placeholders in the raw messages -- TODO confirm).
    data_qumin = data_dup.apply(lambda x: re.sub('x', '', x))

    jieba.load_userdict('newdic1.txt')
    data_cut = data_qumin.apply(jieba.lcut)

    # 'hahaha' is a multi-character separator, which pandas treats as a regex
    # only the python engine supports; naming the engine explicitly avoids
    # the ParserWarning about falling back from the C engine.  Presumably the
    # separator never occurs in the file, so each line is one stopword.
    stop_frame = pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha',
                             header=None, engine='python')
    # A set gives O(1) membership tests in the per-token filter below.
    stop_words = {' ', '\n', '\t', '\r\n', '\u3000', '"', '–'} | set(stop_frame.iloc[:, 0])
    data_after_stop = data_cut.apply(
        lambda words: [word for word in words if word not in stop_words])
    labels = data_new.loc[data_after_stop.index, '一级标签']
    adata = data_after_stop.apply(' '.join)
    data_after_stop = data_after_stop.to_frame()

    # Extract the top-20 keywords of every message and drop any keyword that
    # contains a digit.
    digits = re.compile('[0-9]+')
    data_after_stop['key'] = [
        [word for word in jieba.analyse.extract_tags(text, topK=20)
         if not digits.search(word)]
        for text in adata
    ]
    data_after_stop['labels'] = labels
    return adata, data_after_stop, labels
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import LinearSVC

from data_process1 import data_process

# --- Gaussian naive Bayes baseline on TF-IDF weighted word counts ---------
adata, data_after_stop, labels = data_process()
data_tr, data_te, labels_tr, labels_te = train_test_split(adata, labels, test_size=0.2)

countVectorizer = CountVectorizer()
tfidfTransformer = TfidfTransformer()
data_tr = countVectorizer.fit_transform(data_tr)
# GaussianNB needs a dense array, hence the final toarray().
X_tr = tfidfTransformer.fit_transform(data_tr).toarray()

# Vectorize the test split with the vocabulary AND the IDF weights learned on
# the training split; refitting TF-IDF on the test data would score the model
# on features weighted differently from the ones it was trained on.
data_te = countVectorizer.transform(data_te)
X_te = tfidfTransformer.transform(data_te).toarray()

model1 = GaussianNB()
model1.fit(X_tr, labels_tr)
print(model1.score(X_te, labels_te))

# --- Four further models on uni-/bi-gram TF-IDF of the extracted keywords --
adata_key = data_after_stop['key']
adata_set = adata_key.apply(lambda x: ' '.join(x))

tfidf = TfidfVectorizer(norm='l2', ngram_range=(1, 2))
features = tfidf.fit_transform(adata_set)
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5-fold cross-validation: one (model, fold, accuracy) row per fold.
CV = 5
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

# Box plot of the per-fold accuracies, with the individual folds overlaid.
sns.boxplot(x='model_name', y='accuracy', data=cv_df)
sns.stripplot(x='model_name', y='accuracy', data=cv_df,
              size=8, jitter=True, edgecolor="gray", linewidth=2)
plt.show()

# --- Linear SVC on a stratified 70/30 split --------------------------------
model = LinearSVC()
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, data_after_stop.index,
    test_size=0.3, stratify=labels, random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Confusion matrix.  scikit-learn orders classes by sorted label value unless
# told otherwise, which does not match the hand-written order below; passing
# labels=labelss makes the matrix rows/columns (and the report rows) line up
# with the names used for the tick labels and target_names.
labelss = ['城乡建设','环境保护','交通运输','教育文体','劳动和社会保障','商贸旅游','卫生计生']
conf_mat = confusion_matrix(y_test, y_pred, labels=labelss)
# Select the CJK-capable font before any Chinese text is added to the figure.
plt.rcParams['font.sans-serif']='SimHei'
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=labelss, yticklabels=labelss)
plt.ylabel('实际结果',fontsize=18)
plt.xlabel('预测结果',fontsize=18)
plt.show()

print('accuracy %s' % accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=labelss, target_names=labelss))
# NOTE: these source lines come from a flattened repository dump and also carry
# the delimiters of the neighbouring files; they are preserved here verbatim.
# https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/stopword1.txt
# --------------------------------------------------------------------------------
# /code/第一问/word_cloud1.py:
# --------------------------------------------------------------------------------
"""Build one keyword word cloud per first-level complaint category.

Running (or importing) this module calls ``data_process()`` once, splits the
returned frame by its ``labels`` column, counts how often every extracted
keyword (column ``key``) occurs inside each category and fits a ``WordCloud``
on those counts.

For backward compatibility the numbered module-level names of the original
script are still published: ``data_key1..7`` (per-category rows),
``word_fre1..7`` (keyword -> count dicts) and ``wc1..7`` (fitted clouds).
Index ``i`` always corresponds to ``CATEGORIES[i - 1]``.
"""
from collections import Counter
from itertools import chain

from data_process1 import data_process
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# First-level labels, in the order that defines the numeric suffixes 1..7.
CATEGORIES = (
    '城乡建设',
    '环境保护',
    '交通运输',
    '教育文体',
    '劳动和社会保障',
    '商贸旅游',
    '卫生计生',
)

# Font handed to WordCloud for rendering; this is a Windows-specific location.
FONT_PATH = r'C:\Windows\Fonts\simhei.ttf'


def _keyword_frequencies(keyword_lists):
    """Return a plain ``{keyword: count}`` dict over all the keyword lists.

    Keys appear in first-seen order, exactly as the former hand-written
    ``if j not in d: d[j] = 1 else: d[j] += 1`` loop produced them.
    """
    return dict(Counter(chain.from_iterable(keyword_lists)))


def _build_cloud(frequencies):
    """Create a ``WordCloud`` with the shared settings and fit it on *frequencies*."""
    cloud = WordCloud(
        scale=16,
        background_color='white',
        font_path=FONT_PATH,
        max_words=100,
        max_font_size=60,
    )
    cloud.fit_words(frequencies)
    return cloud


def show_cloud(cloud):
    """Display a fitted word cloud without axes (optional, never called here)."""
    plt.imshow(cloud)
    plt.axis("off")
    plt.show()


adata, data_after_stop, labels = data_process()

# One entry per category, all three lists aligned with CATEGORIES.
_category_rows = [
    data_after_stop[data_after_stop['labels'] == category]
    for category in CATEGORIES
]
_category_frequencies = [
    _keyword_frequencies(rows['key']) for rows in _category_rows
]
_category_clouds = [
    _build_cloud(frequencies) for frequencies in _category_frequencies
]

# Re-publish the numbered names the original script exposed.
(data_key1, data_key2, data_key3, data_key4,
 data_key5, data_key6, data_key7) = _category_rows
(word_fre1, word_fre2, word_fre3, word_fre4,
 word_fre5, word_fre6, word_fre7) = _category_frequencies
wc1, wc2, wc3, wc4, wc5, wc6, wc7 = _category_clouds

# To draw a cloud interactively: show_cloud(wc1)

# --------------------------------------------------------------------------------
# /code/第一问/word_fre1-7.csv:
# --------------------------------------------------------------------------------
https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/word_fre1-7.csv -------------------------------------------------------------------------------- /code/第一问/作品测试结果.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/作品测试结果.zip -------------------------------------------------------------------------------- /code/第一问/作品测试结果/作品测试结果.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/作品测试结果/作品测试结果.zip -------------------------------------------------------------------------------- /code/第一问/作品测试结果/测试代码.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 90, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "E:\\MY_COMPETE\\2020挑战杯\\C题数据&代码\\附件最新\\附件\\附件\\第一问\\data_process1.py:26: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 13 | " stopWords = pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha', header=None)\n", 14 | "D:\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n", 15 | " FutureWarning)\n", 16 | "D:\\Anaconda3\\lib\\site-packages\\sklearn\\linear_model\\logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. 
Specify the multi_class option to silence this warning.\n", 17 | " \"this warning.\", FutureWarning)\n" 18 | ] 19 | }, 20 | { 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "accuracy 0.8662175168431184\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "# %load model1.py\n", 30 | "import seaborn as sns\n", 31 | "import pandas as pd\n", 32 | "import matplotlib.pyplot as plt\n", 33 | "\n", 34 | "from data_process1 import data_process\n", 35 | "from sklearn.naive_bayes import GaussianNB, MultinomialNB\n", 36 | "from sklearn.linear_model import LogisticRegression\n", 37 | "from sklearn.ensemble import RandomForestClassifier\n", 38 | "from sklearn.svm import LinearSVC\n", 39 | "from sklearn.model_selection import cross_val_score, train_test_split\n", 40 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer\n", 41 | "from sklearn.metrics import classification_report, accuracy_score, confusion_matrix\n", 42 | "'''\n", 43 | "#朴素贝叶斯\n", 44 | "adata, data_after_stop, labels = data_process()\n", 45 | "data_tr, data_te, labels_tr, labels_te = train_test_split(adata, labels, test_size=0.2)\n", 46 | "\n", 47 | "countVectorizer = CountVectorizer()\n", 48 | "data_tr = countVectorizer.fit_transform(data_tr)\n", 49 | "X_tr = TfidfTransformer().fit_transform(data_tr.toarray()).toarray()\n", 50 | "\n", 51 | "data_te = CountVectorizer(vocabulary=countVectorizer.vocabulary_).fit_transform(data_te)\n", 52 | "X_te = TfidfTransformer().fit_transform(data_te.toarray()).toarray()\n", 53 | "'''\n", 54 | "\n", 55 | "\n", 56 | "#--------------------------------------------------\n", 57 | "\n", 58 | "\n", 59 | "#其他四种模型\n", 60 | "adata, data_after_stop, labels = data_process()\n", 61 | "adata_key = data_after_stop['key']\n", 62 | "adata_set = adata_key.apply(lambda x: ' '.join(x))\n", 63 | "\n", 64 | "tfidfVectorizer = TfidfVectorizer(norm='l2', ngram_range=(1, 2))\n", 65 | "features = tfidfVectorizer.fit_transform(adata_set)\n", 
66 | "\n", 67 | "\n", 68 | "\n", 69 | "models = [\n", 70 | " RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),\n", 71 | " LinearSVC(),\n", 72 | " MultinomialNB(),\n", 73 | " LogisticRegression(random_state=0),\n", 74 | "]\n", 75 | "\n", 76 | "#5折交叉验证\n", 77 | "CV = 5\n", 78 | "cv_df = pd.DataFrame(index=range(CV * len(models)))\n", 79 | "entries = []\n", 80 | "for model in models:\n", 81 | " \n", 82 | " model_name = model.__class__.__name__\n", 83 | " accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)\n", 84 | " for fold_idx, accuracy in enumerate(accuracies):\n", 85 | " entries.append((model_name, fold_idx, accuracy))\n", 86 | "cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])\n", 87 | "\n", 88 | "\n", 89 | "#-------------------------------------\n", 90 | "#线性SVC模型调用\n", 91 | "model = LinearSVC(random_state=0,dual =False)\n", 92 | "X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, data_after_stop.index, \n", 93 | " test_size=0.3, stratify=labels, random_state=0)\n", 94 | "model.fit(X_train, y_train)\n", 95 | "y_pred = model.predict(X_test)\n", 96 | "\n", 97 | "#labelss = ['城乡建设','环境保护','交通运输','教育文体','劳动和社会保障','商贸旅游','卫生计生']\n", 98 | "print('accuracy %s' % accuracy_score(y_pred, y_test))\n", 99 | "#print(classification_report(y_test, y_pred,target_names=labelss))\n" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "data_after_stop['key'].shape" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "print('X_test为:',X_test.shape)\n", 118 | "print('X_train为:',X_train.shape)\n", 119 | "print('features为:',features.shape)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 58, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 
128 | "data": { 129 | "text/plain": [ 130 | "['SVC模型.pkl']" 131 | ] 132 | }, 133 | "execution_count": 58, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "#引入sklearn中自带的保存模型\n", 140 | "from sklearn.externals import joblib\n", 141 | "#保存模型\n", 142 | "joblib.dump(model, 'SVC模型.pkl')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 59, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "#预处理测试数据\n", 152 | "import pandas as pd\n", 153 | "import re\n", 154 | "import jieba.analyse\n", 155 | "\n", 156 | "def process(file='附件2(测试数据).xlsx'):\n", 157 | " data = pd.read_excel(file, index_col=0,encoding = 'GB18030')\n", 158 | " \n", 159 | " \n", 160 | " jieba.load_userdict('newdic1.txt')\n", 161 | " data_cut = data['留言详情'].apply(lambda x: jieba.lcut(x))\n", 162 | "\n", 163 | " stopWords = pd.read_csv('stopword1.txt', encoding='GB18030', sep='hahaha', header=None)\n", 164 | " stopWords = [' ', '\\n', '\\t', '\\r\\n', '\\u3000', '"', '–'] + list(stopWords.iloc[:, 0])\n", 165 | " after_stop = data_cut.apply(lambda x: [i for i in x if i not in stopWords])\n", 166 | " #labels = data_new.loc[data_after_stop.index, '一级标签']\n", 167 | " adata = after_stop.apply(lambda x: ' '.join(x))\n", 168 | " after_stop = after_stop.to_frame()\n", 169 | " \n", 170 | "\t#提取关键词\n", 171 | " key=[]\n", 172 | " for i in adata:\n", 173 | " keywords=jieba.analyse.extract_tags(i,topK=20)\n", 174 | " key.append(keywords)\n", 175 | " after_stop['key']=key\n", 176 | " \n", 177 | "\t#去除城市乡镇以外的字母和0\n", 178 | " key=[]\n", 179 | " pattern = re.compile('[0-9]+')\n", 180 | " for x in after_stop['key']:\n", 181 | " temp=[]\n", 182 | " for i in x:\n", 183 | " match = pattern.findall(i)\n", 184 | " if match:\n", 185 | " pass\n", 186 | " else:\n", 187 | " temp.append(i)\n", 188 | " key.append(temp)\n", 189 | " after_stop['key']=key\n", 190 | " #data_after_stop['labels']=labels\n", 191 | " return after_stop\n", 192 | "\n", 
193 | "\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 61, 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stderr", 203 | "output_type": "stream", 204 | "text": [ 205 | "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:13: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 206 | " del sys.path[0]\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "after_stop = process()\n", 212 | "key = after_stop['key']\n", 213 | "join_key = key.apply(lambda x: ' '.join(x))\n", 214 | "\n", 215 | "\n", 216 | "my_features = TfidfVectorizer(vocabulary=tfidfVectorizer.vocabulary_).fit_transform(join_key)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 62, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "(2801, 73876)\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "print(my_features.shape)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 63, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/html": [ 244 | "
\n", 245 | "\n", 258 | "\n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | "
留言用户留言主题留言时间留言详情一级分类
留言编号
1U0001196投诉A市A1区苑物业违规收停车费2019/12/30 17:06:14\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的领导:A1区苑小区位于A1区火炬路,小...商贸旅游
23U0002738A4区五一大道一酒吧噪音严重影响居民休息2020/1/6 13:15:56\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t位于五一大道202的“李四的LIST-PUB...环境保护
26U0003729A市地铁6号线桐梓坡地铁站交叉口低频率噪音严重扰民2020/1/6 11:29:11\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t近2个月来,地铁6号线桐梓坡地铁站交叉口有设...环境保护
39U0007638A市地铁8号线工作现在就要开始准备了2020/1/3 22:26:19\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t今年地铁3号线和5号线通车后,正线就只剩下6...城乡建设
40U0005855A2区福满新城二期施工噪音扰民谁能管2020/1/3 20:08:05\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t今天是投诉A2区福满新城二期噪音扰民问题第8...环境保护
..................
6056990U0004035为何在B市乡镇卫生院的高级职工所在的点不能一视同仁还保留?2012/12/15 18:39:45\\n \\n   我想请问一下领导,自从乡村医生实行基药开始,是不是所有的乡镇...卫生计生
6665290U0008214C3县居民这样的情况要不要罚款?2011/12/13 14:05:17\\n \\n   你好,我是C3县居民,在家中是独女,父亲是老师,母亲是农民,...卫生计生
16704000U0007689请求市长查处步步高E12市店长期拖欠员工加班工资的违法行为2011/11/16 1:35:27\\n \\n 尊敬的王书记:  您好!步步高E12市店无视劳动法有关规定,国家...劳动和社会保障
33681565U0006759B市滨江花园二期容积率达到4.11之高2018/12/24 19:19:26\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t西地省格尚置业有限公司于2004年取得滨江花...城乡建设
38654898U000262F市十三中学弄丢了我的学籍档案,我咋办?2012/9/7 12:40:52\\n \\n 王局长:  您好!希望您在百忙之中抽空看看我的留言。我是原F市财...教育文体
\n", 368 | "

2801 rows × 5 columns

\n", 369 | "
" 370 | ], 371 | "text/plain": [ 372 | " 留言用户 留言主题 留言时间 \\\n", 373 | "留言编号 \n", 374 | "1 U0001196 投诉A市A1区苑物业违规收停车费 2019/12/30 17:06:14 \n", 375 | "23 U0002738 A4区五一大道一酒吧噪音严重影响居民休息 2020/1/6 13:15:56 \n", 376 | "26 U0003729 A市地铁6号线桐梓坡地铁站交叉口低频率噪音严重扰民 2020/1/6 11:29:11 \n", 377 | "39 U0007638 A市地铁8号线工作现在就要开始准备了 2020/1/3 22:26:19 \n", 378 | "40 U0005855 A2区福满新城二期施工噪音扰民谁能管 2020/1/3 20:08:05 \n", 379 | "... ... ... ... \n", 380 | "6056990 U0004035 为何在B市乡镇卫生院的高级职工所在的点不能一视同仁还保留? 2012/12/15 18:39:45 \n", 381 | "6665290 U0008214 C3县居民这样的情况要不要罚款? 2011/12/13 14:05:17 \n", 382 | "16704000 U0007689 请求市长查处步步高E12市店长期拖欠员工加班工资的违法行为 2011/11/16 1:35:27 \n", 383 | "33681565 U0006759 B市滨江花园二期容积率达到4.11之高 2018/12/24 19:19:26 \n", 384 | "38654898 U000262 F市十三中学弄丢了我的学籍档案,我咋办? 2012/9/7 12:40:52 \n", 385 | "\n", 386 | " 留言详情 一级分类 \n", 387 | "留言编号 \n", 388 | "1 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的领导:A1区苑小区位于A1区火炬路,小... 商贸旅游 \n", 389 | "23 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t位于五一大道202的“李四的LIST-PUB... 环境保护 \n", 390 | "26 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t近2个月来,地铁6号线桐梓坡地铁站交叉口有设... 环境保护 \n", 391 | "39 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t今年地铁3号线和5号线通车后,正线就只剩下6... 城乡建设 \n", 392 | "40 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t今天是投诉A2区福满新城二期噪音扰民问题第8... 环境保护 \n", 393 | "... ... ... \n", 394 | "6056990 \\n \\n   我想请问一下领导,自从乡村医生实行基药开始,是不是所有的乡镇... 卫生计生 \n", 395 | "6665290 \\n \\n   你好,我是C3县居民,在家中是独女,父亲是老师,母亲是农民,... 卫生计生 \n", 396 | "16704000 \\n \\n 尊敬的王书记:  您好!步步高E12市店无视劳动法有关规定,国家... 劳动和社会保障 \n", 397 | "33681565 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t西地省格尚置业有限公司于2004年取得滨江花... 城乡建设 \n", 398 | "38654898 \\n \\n 王局长:  您好!希望您在百忙之中抽空看看我的留言。我是原F市财... 
教育文体 \n", 399 | "\n", 400 | "[2801 rows x 5 columns]" 401 | ] 402 | }, 403 | "execution_count": 63, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "my_model = joblib.load('SVC模型.pkl')\n", 410 | "first_labels = []\n", 411 | "#得到预测的目标值\n", 412 | "to_list = my_model.predict(my_features)\n", 413 | "for i in to_list:\n", 414 | " first_labels.append(i)\n", 415 | "\n", 416 | "#目标值写进测试数据文件\n", 417 | "data1 = pd.read_excel('附件2(测试数据).xlsx', index_col=0,encoding = 'GB18030')\n", 418 | "data1 = data1.drop(['一级分类'], axis = 1)\n", 419 | "data1['一级分类'] = first_labels\n", 420 | "data1" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 71, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "#把目标值写进测试结果文件\n", 430 | "data2 = pd.read_excel('附件2(测试结果).xlsx', index_col=0,encoding = 'GB18030')\n", 431 | "data2 = data2.drop(['一级分类'], axis = 1)\n", 432 | "data_new = pd.merge(data1, data2, on='留言编号')\n", 433 | "data_temp1 = data_new.drop(['留言用户'], axis = 1)\n", 434 | "data_temp2 = data_temp1.drop(['留言主题'], axis = 1)\n", 435 | "data_temp3 = data_temp2.drop(['留言时间'], axis = 1)\n", 436 | "data_final = data_temp3.drop(['留言详情'], axis = 1)\n", 437 | "data_final.to_excel('结果(待重命名).xlsx',encoding = 'GB18030')" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [] 453 | } 454 | ], 455 | "metadata": { 456 | "kernelspec": { 457 | "display_name": "Python 3", 458 | "language": "python", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 
471 | "version": "3.7.1" 472 | } 473 | }, 474 | "nbformat": 4, 475 | "nbformat_minor": 2 476 | } 477 | -------------------------------------------------------------------------------- /code/第一问/作品测试结果/附件2(测试数据).xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/作品测试结果/附件2(测试数据).xlsx -------------------------------------------------------------------------------- /code/第一问/作品测试结果/附件2(测试结果).xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/作品测试结果/附件2(测试结果).xlsx -------------------------------------------------------------------------------- /code/第一问/结果2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第一问/结果2.xlsx -------------------------------------------------------------------------------- /code/第三问/.ipynb_checkpoints/主题模型-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import re\n", 11 | "import jieba\n", 12 | "\n", 13 | "def data_process(file='附件4.xlsx'):\n", 14 | " data = pd.read_excel(file, index_col=0, sep = ',', encoding = 'GB18030')\n", 15 | " \n", 16 | "\n", 17 | " #data_dup = data['答复意见'].drop_duplicates()\n", 18 | " #去数字字母\n", 19 | " data_qu123a = data['答复意见'].apply(lambda x: re.sub('[a-zA-Z0-9’!\"#$%&\\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\\\]^_`{|}~\\s]+', \"\", x))\n", 20 | " #jieba.load_userdict('newdic1.txt')\n", 21 | " data_cut = data_qu123a.apply(lambda x: jieba.lcut(x))\n", 22 | "\n", 23 | " #设置不存在于停用词表的分隔符,避免 
,被默认为分隔符而报错\n", 24 | " stopWords = pd.read_csv('stopword.txt', encoding='GB18030', sep='hahaha', header=None)\n", 25 | " #列表使用+拼接\n", 26 | " stopWords = [' ', '\\n', '\\t', '\\r\\n', '\\u3000', '"', '–','现将','在','的','您好','你好','您','网友','留言','已收悉',':',\n", 27 | " '反映','的问题','平台','同志','现','领导'] + list(stopWords.iloc[:, 0])\n", 28 | " \n", 29 | " #分词\n", 30 | " data_after_stop = data_cut.apply(lambda x: [i for i in x if i not in stopWords])\n", 31 | " \n", 32 | " #用空格分割列表中的词语\n", 33 | " #adata = data_after_stop.apply(lambda x: ' '.join(x))\n", 34 | " return data_after_stop" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 4, 40 | "metadata": { 41 | "scrolled": true 42 | }, 43 | "outputs": [ 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "Building prefix dict from the default dictionary ...\n", 49 | "Loading model from cache C:\\Users\\HP\\AppData\\Local\\Temp\\jieba.cache\n", 50 | "Loading model cost 1.079 seconds.\n", 51 | "Prefix dict has been built successfully.\n", 52 | "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:16: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 53 | " app.launch_new_instance()\n", 54 | "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: FutureWarning: The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`, and argument 'header' will change its default value from False to True: please pass an explicit value to suppress this warning.\n", 55 | " \n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "import pandas as pd\n", 61 | "from gensim.corpora import Dictionary\n", 62 | "from gensim.models import LdaModel\n", 63 | "\n", 64 | "data_after_stop = data_process()\n", 65 | "data_after_stop.to_csv('分词后.csv')\n", 66 | "\n" 67 | ] 68 | }, 69 | { 70 | "cell_type": 
"code", 71 | "execution_count": 5, 72 | "metadata": {}, 73 | "outputs": [ 74 | { 75 | "data": { 76 | "text/plain": [ 77 | "留言编号\n", 78 | "2549 [问政, 西地省, 栏目, 胡华衡, 书记, 区景蓉, 花苑, 物业管理, 调查核实, 情况...\n", 79 | "2554 [区潇楚, 南路, 洋湖, 段, 修好, 区洋湖, 街道, 高度重视, 组织, 精干, 力量...\n", 80 | "2555 [市民, 请, 加快, 提高, 民营, 幼儿园, 教师, 待遇, 来信, 收悉, 回复, 改...\n", 81 | "2557 [问政, 西地省, 收悉, 市住, 建局, 交由, 市, 房屋交易, 管理中心, 办理, 相...\n", 82 | "2574 [收悉, 具体内容, 答复, 来信, 建议, 白竹坡, 路口, 更名, 马坡岭, 小学, 原...\n", 83 | " ... \n", 84 | "181267 [收悉, 已转, 区委, 区, 人民政府, 调查]\n", 85 | "181603 [收悉, 转市, 交通运输, 局, 调查]\n", 86 | "184423 [获悉, 对县, 文盛, 小学, 特色, 班, 质疑, 网帖, 我局, 高度重视, 责成, ...\n", 87 | "185799 [西地省, 问政, 西地省, 栏目组, 网民, 贵, 栏目, 咨询, 中央, 转移, 支付,...\n", 88 | "185986 [我厅, 高度重视, 相关, 部门, 调查, 研究, 回复, 我厅, 县城, 朱良桥, 公路...\n", 89 | "Name: 答复意见, Length: 2816, dtype: object" 90 | ] 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "data_after_stop" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "data_com = pd.read_csv('分词后.csv', header=None,encoding = 'utf-8')\n", 108 | "data_com.columns = ['留言编号','comments']" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 7, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "topic = []\n", 118 | "for i in range(0, len(data_after_stop)):\n", 119 | " mid = data_com['comments'][i:i+1].str.split(', ')\n", 120 | " dictionary = Dictionary(mid) # 生成词典\n", 121 | " bow = [dictionary.doc2bow(j) for j in mid] # 将文档转成数值型预料库\n", 122 | "\n", 123 | " data_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=2) # 构建LDA主题模型\n", 124 | " topic.append(data_model.print_topic(0)+'\\n'+data_model.print_topic(1)) # 打印主题\n", 125 | "\n", 126 | "\n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 12, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "from 
pandas.core.frame import DataFrame\n", 136 | "data_topic = DataFrame(topic)\n", 137 | "data_topic.columns = ['主题']\n", 138 | "data_new = pd.concat([data_com, data_topic], axis=1) #横向拼接\n", 139 | "data_new.to_excel('主题2.xlsx',encoding = 'GB18030')" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 13, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "data_new = pd.read_excel('主题2.xlsx',encoding = 'GB18030')" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 14, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/html": [ 159 | "
\n", 160 | "\n", 173 | "\n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | "
Unnamed: 0留言编号comments主题
002549['问政', '西地省', '栏目', '胡华衡', '书记', '区景蓉', '花苑', ...0.032*\"'业委会'\" + 0.029*\"'业主'\" + 0.023*\"'业主大会'\" ...
112554['区潇楚', '南路', '洋湖', '段', '修好', '区洋湖', '街道', '高...0.054*\"'施工'\" + 0.028*\"'道路'\" + 0.026*\"'项目'\" + 0...
222555['市民', '请', '加快', '提高', '民营', '幼儿园', '教师', '待遇...0.043*\"'民办'\" + 0.042*\"'幼儿园'\" + 0.040*\"'教师'\" + ...
332557['问政', '西地省', '收悉', '市住', '建局', '交由', '市', '房屋...0.053*\"'购房'\" + 0.035*\"'市'\" + 0.033*\"'首次'\" + 0....
442574['收悉', '具体内容', '答复', '来信', '建议', '白竹坡', '路口', ...0.048*\"'来信'\" + 0.047*\"'马坡岭'\" + 0.045*\"'小学'\" + ...
...............
28112811181267['收悉', '已转', '区委', '区', '人民政府', '调查']0.174*\"'已转'\" + 0.168*\"['收悉'\" + 0.167*\"'区委'\" + ...
28122812181603['收悉', '转市', '交通运输', '局', '调查']0.218*\"'交通运输'\" + 0.201*\"'局'\" + 0.196*\"['收悉'\" +...
28132813184423['获悉', '对县', '文盛', '小学', '特色', '班', '质疑', '网帖'...0.039*\"'学生'\" + 0.028*\"'社团活动'\" + 0.023*\"'社团'\" +...
28142814185799['西地省', '问政', '西地省', '栏目组', '网民', '贵', '栏目', '...0.070*\"'资金'\" + 0.047*\"'支付'\" + 0.042*\"'燃油税'\" + ...
28152815185986['我厅', '高度重视', '相关', '部门', '调查', '研究', '回复', '...0.044*\"'交通运输'\" + 0.034*\"'项目'\" + 0.029*\"'建设'\" +...
\n", 263 | "

2816 rows × 4 columns

\n", 264 | "
" 265 | ], 266 | "text/plain": [ 267 | " Unnamed: 0 留言编号 comments \\\n", 268 | "0 0 2549 ['问政', '西地省', '栏目', '胡华衡', '书记', '区景蓉', '花苑', ... \n", 269 | "1 1 2554 ['区潇楚', '南路', '洋湖', '段', '修好', '区洋湖', '街道', '高... \n", 270 | "2 2 2555 ['市民', '请', '加快', '提高', '民营', '幼儿园', '教师', '待遇... \n", 271 | "3 3 2557 ['问政', '西地省', '收悉', '市住', '建局', '交由', '市', '房屋... \n", 272 | "4 4 2574 ['收悉', '具体内容', '答复', '来信', '建议', '白竹坡', '路口', ... \n", 273 | "... ... ... ... \n", 274 | "2811 2811 181267 ['收悉', '已转', '区委', '区', '人民政府', '调查'] \n", 275 | "2812 2812 181603 ['收悉', '转市', '交通运输', '局', '调查'] \n", 276 | "2813 2813 184423 ['获悉', '对县', '文盛', '小学', '特色', '班', '质疑', '网帖'... \n", 277 | "2814 2814 185799 ['西地省', '问政', '西地省', '栏目组', '网民', '贵', '栏目', '... \n", 278 | "2815 2815 185986 ['我厅', '高度重视', '相关', '部门', '调查', '研究', '回复', '... \n", 279 | "\n", 280 | " 主题 \n", 281 | "0 0.032*\"'业委会'\" + 0.029*\"'业主'\" + 0.023*\"'业主大会'\" ... \n", 282 | "1 0.054*\"'施工'\" + 0.028*\"'道路'\" + 0.026*\"'项目'\" + 0... \n", 283 | "2 0.043*\"'民办'\" + 0.042*\"'幼儿园'\" + 0.040*\"'教师'\" + ... \n", 284 | "3 0.053*\"'购房'\" + 0.035*\"'市'\" + 0.033*\"'首次'\" + 0.... \n", 285 | "4 0.048*\"'来信'\" + 0.047*\"'马坡岭'\" + 0.045*\"'小学'\" + ... \n", 286 | "... ... \n", 287 | "2811 0.174*\"'已转'\" + 0.168*\"['收悉'\" + 0.167*\"'区委'\" + ... \n", 288 | "2812 0.218*\"'交通运输'\" + 0.201*\"'局'\" + 0.196*\"['收悉'\" +... \n", 289 | "2813 0.039*\"'学生'\" + 0.028*\"'社团活动'\" + 0.023*\"'社团'\" +... \n", 290 | "2814 0.070*\"'资金'\" + 0.047*\"'支付'\" + 0.042*\"'燃油税'\" + ... \n", 291 | "2815 0.044*\"'交通运输'\" + 0.034*\"'项目'\" + 0.029*\"'建设'\" +... 
\n", 292 | "\n", 293 | "[2816 rows x 4 columns]" 294 | ] 295 | }, 296 | "execution_count": 14, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "data_new" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "'''\n", 312 | "temp = []\n", 313 | "for i in range(0, 2750):\n", 314 | " mid = data_com['comments'][i:i+1].str.split(' ')\n", 315 | " temp.append(mid)\n", 316 | "\n", 317 | "temp_split = DataFrame(temp)\n", 318 | "\n", 319 | "print(temp_split)\n", 320 | " #print(mid.dtype)\n", 321 | " #print('-------------------------------')\n", 322 | " #dictionary = Dictionary(mid)\n", 323 | "\n", 324 | "#mid0 = data_com['comments'][0:1].str.split(' ')\n", 325 | "print(mid0.dtype)\n", 326 | "#dictionary = Dictionary(mid0)\n", 327 | "#bow = [dictionary.doc2bow(j) for j in mid0]\n", 328 | "#data_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3) # 构建LDA主题模型\n", 329 | "#data_model.print_topic(1) # 打印主题\n", 330 | "'''" 331 | ] 332 | } 333 | ], 334 | "metadata": { 335 | "kernelspec": { 336 | "display_name": "Python 3", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": "3.7.1" 351 | } 352 | }, 353 | "nbformat": 4, 354 | "nbformat_minor": 2 355 | } 356 | -------------------------------------------------------------------------------- /code/第三问/.ipynb_checkpoints/开头结尾及解决否-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | 
"import pandas as pd\n", 11 | "import os\n", 12 | "import re" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 16, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "
\n", 24 | "\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
留言编号留言用户留言主题留言时间留言详情答复意见答复时间
02549A00045581A2区景蓉华苑物业管理有问题2019/4/25 9:32:09\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t2019年4月以来,位于A市A2区桂花坪街道...现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...2019/5/10 14:56:53
12554A00023583A3区潇楚南路洋湖段怎么还没修好?2019/4/24 16:03:40\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t潇楚南路从2018年开始修,到现在都快一年了...网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...2019/5/9 9:49:10
22555A00031618请加快提高A市民营幼儿园老师的待遇2019/4/24 15:40:04\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t地处省会A市民营幼儿园众多,小孩是祖国的未来...市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...2019/5/9 9:49:14
32557A000110735在A市买公寓能享受人才新政购房补贴吗?2019/4/24 15:07:30\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的书记:您好!我研究生毕业后根据人才新政...网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...2019/5/9 9:49:42
42574A0009233关于A市公交站点名称变更的建议2019/4/23 17:03:19\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t建议将“白竹坡路口”更名为“马坡岭小学”,原...网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...2019/5/9 9:51:30
........................
2811181267UU008766汽车北站进站口附近居民强烈反对建设I市平康肾病医院!2018/12/12 15:20:46\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t我们是I市汽车北站进站口的周围居民。在这里的...您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。2019/1/8 16:54:53
2812181603UU008194强烈反对I市9路公交车改线路2018/6/12 8:51:03\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t强烈反对I市9路公交车改线路获悉从7月1日起...“UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。2018/7/4 16:55:53
2813184423UU0082115对G7县文盛小学特色班的一点质疑2018/10/11 20:02:52\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\tG7县文盛小学引入特色班,每个学生必须参加,...“UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...2018/10/24 9:22:07
2814185799UU008785燃油税费改革政策的咨询2012/9/4 23:14:44\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t贺厅长:      您  好!自燃油税费改革...西地省平台《问政西地省》栏目组:   网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,...2013/1/6 15:41:02
2815185986UU008363强烈呼吁宁朱公路拓宽提质改造2011/10/3 21:52:37\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t  A8县朱良桥乡可以说是A8县最破烂的乡了...“UU008363”:   您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和...2012/2/28 10:19:55
\n", 163 | "

2816 rows × 7 columns

\n", 164 | "
" 165 | ], 166 | "text/plain": [ 167 | " 留言编号 留言用户 留言主题 留言时间 \\\n", 168 | "0 2549 A00045581 A2区景蓉华苑物业管理有问题 2019/4/25 9:32:09 \n", 169 | "1 2554 A00023583 A3区潇楚南路洋湖段怎么还没修好? 2019/4/24 16:03:40 \n", 170 | "2 2555 A00031618 请加快提高A市民营幼儿园老师的待遇 2019/4/24 15:40:04 \n", 171 | "3 2557 A000110735 在A市买公寓能享受人才新政购房补贴吗? 2019/4/24 15:07:30 \n", 172 | "4 2574 A0009233 关于A市公交站点名称变更的建议 2019/4/23 17:03:19 \n", 173 | "... ... ... ... ... \n", 174 | "2811 181267 UU008766 汽车北站进站口附近居民强烈反对建设I市平康肾病医院! 2018/12/12 15:20:46 \n", 175 | "2812 181603 UU008194 强烈反对I市9路公交车改线路 2018/6/12 8:51:03 \n", 176 | "2813 184423 UU0082115 对G7县文盛小学特色班的一点质疑 2018/10/11 20:02:52 \n", 177 | "2814 185799 UU008785 燃油税费改革政策的咨询 2012/9/4 23:14:44 \n", 178 | "2815 185986 UU008363 强烈呼吁宁朱公路拓宽提质改造 2011/10/3 21:52:37 \n", 179 | "\n", 180 | " 留言详情 \\\n", 181 | "0 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t2019年4月以来,位于A市A2区桂花坪街道... \n", 182 | "1 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t潇楚南路从2018年开始修,到现在都快一年了... \n", 183 | "2 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t地处省会A市民营幼儿园众多,小孩是祖国的未来... \n", 184 | "3 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的书记:您好!我研究生毕业后根据人才新政... \n", 185 | "4 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t建议将“白竹坡路口”更名为“马坡岭小学”,原... \n", 186 | "... ... \n", 187 | "2811 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t我们是I市汽车北站进站口的周围居民。在这里的... \n", 188 | "2812 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t强烈反对I市9路公交车改线路获悉从7月1日起... \n", 189 | "2813 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\tG7县文盛小学引入特色班,每个学生必须参加,... \n", 190 | "2814 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t贺厅长:      您  好!自燃油税费改革... \n", 191 | "2815 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t  A8县朱良桥乡可以说是A8县最破烂的乡了... \n", 192 | "\n", 193 | " 答复意见 答复时间 \n", 194 | "0 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核... 2019/5/10 14:56:53 \n", 195 | "1 网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋... 2019/5/9 9:49:10 \n", 196 | "2 市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善... 2019/5/9 9:49:14 \n", 197 | "3 网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反... 
2019/5/9 9:49:42 \n", 198 | "4 网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡... 2019/5/9 9:51:30 \n", 199 | "... ... ... \n", 200 | "2811 您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。 2019/1/8 16:54:53 \n", 201 | "2812 “UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。 2018/7/4 16:55:53 \n", 202 | "2813 “UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重... 2018/10/24 9:22:07 \n", 203 | "2814 西地省平台《问政西地省》栏目组:   网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,... 2013/1/6 15:41:02 \n", 204 | "2815 “UU008363”:   您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和... 2012/2/28 10:19:55 \n", 205 | "\n", 206 | "[2816 rows x 7 columns]" 207 | ] 208 | }, 209 | "execution_count": 16, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "#os.chdir('C:/Users/64672/Desktop/C题全部数据')\n", 216 | "df=pd.read_excel('附件4.xlsx')\n", 217 | "data_dup = df.drop_duplicates()\n", 218 | "data_dup" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 17, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/html": [ 229 | "
\n", 230 | "\n", 243 | "\n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | "
留言编号答复意见
02549现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...
12554网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...
22555市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...
32557网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...
42574网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...
.........
2811181267您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。
2812181603“UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。
2813184423“UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...
2814185799西地省平台《问政西地省》栏目组:   网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,...
2815185986“UU008363”:   您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和...
\n", 309 | "

2816 rows × 2 columns

\n", 310 | "
" 311 | ], 312 | "text/plain": [ 313 | " 留言编号 答复意见\n", 314 | "0 2549 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...\n", 315 | "1 2554 网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...\n", 316 | "2 2555 市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...\n", 317 | "3 2557 网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...\n", 318 | "4 2574 网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...\n", 319 | "... ... ...\n", 320 | "2811 181267 您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。\n", 321 | "2812 181603 “UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。\n", 322 | "2813 184423 “UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...\n", 323 | "2814 185799 西地省平台《问政西地省》栏目组:   网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,...\n", 324 | "2815 185986 “UU008363”:   您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和...\n", 325 | "\n", 326 | "[2816 rows x 2 columns]" 327 | ] 328 | }, 329 | "execution_count": 17, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "answer=pd.DataFrame(df.loc[:,['留言编号','答复意见']])\n", 336 | "answer" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 18, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/html": [ 347 | "
\n", 348 | "\n", 361 | "\n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | "
留言编号答复意见
02549现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...
12554网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...
22555市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...
32557网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...
42574网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...
.........
2811181267您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。
2812181603“UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。
2813184423“UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...
2814185799西地省平台《问政西地省》栏目组:网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,现函复...
2815185986“UU008363”:您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和研究。...
\n", 427 | "

2816 rows × 2 columns

\n", 428 | "
" 429 | ], 430 | "text/plain": [ 431 | " 留言编号 答复意见\n", 432 | "0 2549 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...\n", 433 | "1 2554 网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...\n", 434 | "2 2555 市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...\n", 435 | "3 2557 网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...\n", 436 | "4 2574 网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...\n", 437 | "... ... ...\n", 438 | "2811 181267 您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。\n", 439 | "2812 181603 “UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。\n", 440 | "2813 184423 “UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...\n", 441 | "2814 185799 西地省平台《问政西地省》栏目组:网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,现函复...\n", 442 | "2815 185986 “UU008363”:您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和研究。...\n", 443 | "\n", 444 | "[2816 rows x 2 columns]" 445 | ] 446 | }, 447 | "execution_count": 18, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | } 451 | ], 452 | "source": [ 453 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('\\n','',str(x)))\n", 454 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('\\t','',str(x)))\n", 455 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('\\r','',str(x)))\n", 456 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('\\s','',str(x)))\n", 457 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('http[A-Za-z:/.]+','',str(x)))\n", 458 | "answer" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "按标点符号切割句子" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 19, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "text/plain": [ 476 | "0 [现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查...\n", 477 | "1 [网友“A00023583”, 您好, 针对您反映A3区潇楚南路洋湖段怎么还没修好的问题, ...\n", 478 | "2 [市民同志, 你好, 您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉, 现回复如下,...\n", 479 | "3 [网友“A000110735”, 您好, 您在平台《问政西地省》上的留言已收悉, 市住建局及...\n", 480 | "4 [网友“A0009233”, 您好, 您的留言已收悉, 现将具体内容答复如下, 关于来信人建...\n", 481 | 
" ... \n", 482 | "2811 [您的留言已收悉, 关于您反映的问题, 已转I1区委, 区人民政府调查处理, ]\n", 483 | "2812 [“UU008194”您的留言已收悉, 关于您反映的问题, 已转市交通运输局调查处理, ]\n", 484 | "2813 [“UU0082115”您好, 获悉关于“对G7县文盛小学特色班的质疑”的网帖后, 我局领导...\n", 485 | "2814 [西地省平台《问政西地省》栏目组, 网民在贵栏目留言, 咨询中央转移支付我省燃油税资金情况,...\n", 486 | "2815 [“UU008363”, 您好, 您的留言, 我厅领导高度重视, 要求相关部门进行了认真的调...\n", 487 | "Name: 答复意见, Length: 2816, dtype: object" 488 | ] 489 | }, 490 | "execution_count": 19, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "pattern = r',|\\.|/|;|\\'|`|\\[|\\]|<|>|\\?|:|\"|\\{|\\}|\\~|!|@|#|\\$|%|\\^|&|\\(|\\)|-|=|\\_|\\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)'\n", 497 | "cut = answer['答复意见'].apply(lambda x: re.split(pattern, x))\n", 498 | "cut" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 20, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "0 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...\n", 510 | "1 网友“A00023583” 您好\n", 511 | "2 市民同志 你好\n", 512 | "3 网友“A000110735” 您好\n", 513 | "4 网友“A0009233” 您好\n", 514 | " ... 
\n", 515 | "2811 您的留言已收悉 关于您反映的问题\n", 516 | "2812 “UU008194”您的留言已收悉 关于您反映的问题\n", 517 | "2813 “UU0082115”您好 获悉关于“对G7县文盛小学特色班的质疑”的网帖后\n", 518 | "2814 西地省平台《问政西地省》栏目组 网民在贵栏目留言\n", 519 | "2815 “UU008363” 您好\n", 520 | "Name: first, Length: 2816, dtype: object" 521 | ] 522 | }, 523 | "execution_count": 20, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "answer['first'] = cut.apply(lambda x: x[0:2])\n", 530 | "answer['first']=answer['first'].apply(lambda x: ' '.join(x))\n", 531 | "answer['first']" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 62, 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "data": { 541 | "text/plain": [ 542 | "0 区住房和城乡建设局已要求业委会依法依规召开业主大会 根据业主大会的表决结果再执行相应的程序 ...\n", 543 | "1 预计今年8月底将完工通车 感谢您对我们工作的关心 监督与支持 2019年4月29日\n", 544 | "2 工资待遇 社会保障和职称评聘等方面继续推进 谢谢您对我市学前教育的关注和支持 \n", 545 | "3 建议可拨打市房屋交易管理中心咨询电话0000 00000000详询 特此回复 2019年4月30日\n", 546 | "4 市民均已熟知 因此不宜变更 感谢来信人对我市公共交通的支持与关心 2019年5月5日\n", 547 | " ... 
\n", 548 | "2811 关于您反映的问题 已转I1区委 区人民政府调查处理 \n", 549 | "2812 “UU008194”您的留言已收悉 关于您反映的问题 已转市交通运输局调查处理 \n", 550 | "2813 符合国家 地方 学校三级课程管理要求 2018年10月15日\n", 551 | "2814 承办责任处室 西地省交通运输厅财务处 \n", 552 | "2815 承办责任单位 省交通运输厅规划办 \n", 553 | "Name: end, Length: 2816, dtype: object" 554 | ] 555 | }, 556 | "execution_count": 62, 557 | "metadata": {}, 558 | "output_type": "execute_result" 559 | } 560 | ], 561 | "source": [ 562 | "answer['end'] = cut.apply(lambda x : x[-4:])\n", 563 | "answer['end']=answer['end'].apply(lambda x: ' '.join(x))\n", 564 | "answer['end']" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 63, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "def first(x):\n", 574 | " try:\n", 575 | " if '你好' in x or '您好'in x or '你好' in x or '您好'in x or '你们好'in x or '您们好'in x or '收悉'in x:\n", 576 | " a='完整'\n", 577 | " else:\n", 578 | " a='不完整'\n", 579 | " except:\n", 580 | " a = '不完整'\n", 581 | " return a" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 64, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "def end(x):\n", 591 | " try:\n", 592 | " if '年' in x and '月' in x and '日' in x and ('感谢' in x or '谢谢' in x or '理解' in x or '我们非常乐意听取您的意见和建议' in x):\n", 593 | " a='完整'\n", 594 | " else:\n", 595 | " a='不完整'\n", 596 | " except:\n", 597 | " a = '不完整'\n", 598 | " return a" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 72, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "def complete(x):\n", 608 | " try:\n", 609 | " if '已转' in x or '已收悉' in x or '交办' in x or '转交' in x or '待核实后给您答复' in x or '办复' in x or '敬请关注后续回复' in x:\n", 610 | " a = '未解决'\n", 611 | " else:\n", 612 | " a = '已解决'\n", 613 | " except:\n", 614 | " a = '已解决'\n", 615 | " return a\n" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 73, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "answer['f_first']=answer['first'].apply(lambda 
x: first(x) )" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 74, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "answer['f_end']=answer['end'].apply(lambda x: end(x) )" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 75, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "answer['f_complete']=answer['end'].apply(lambda x: complete(x) )" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 76, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "answer.to_excel('开头结尾.xlsx',index=False)" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 77, 657 | "metadata": {}, 658 | "outputs": [ 659 | { 660 | "ename": "SyntaxError", 661 | "evalue": "invalid syntax (, line 1)", 662 | "output_type": "error", 663 | "traceback": [ 664 | "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m answer[]\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n" 665 | ] 666 | } 667 | ], 668 | "source": [] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [] 676 | } 677 | ], 678 | "metadata": { 679 | "kernelspec": { 680 | "display_name": "Python 3", 681 | "language": "python", 682 | "name": "python3" 683 | }, 684 | "language_info": { 685 | "codemirror_mode": { 686 | "name": "ipython", 687 | "version": 3 688 | }, 689 | "file_extension": ".py", 690 | "mimetype": "text/x-python", 691 | "name": "python", 692 | "nbconvert_exporter": "python", 693 | "pygments_lexer": "ipython3", 694 | "version": "3.7.1" 695 | } 696 | }, 697 | "nbformat": 4, 698 | "nbformat_minor": 2 699 | } 700 | -------------------------------------------------------------------------------- /code/第三问/Word Dict/readme.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/Word Dict/readme.txt -------------------------------------------------------------------------------- /code/第三问/Word Dict/unParallelWord.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/Word Dict/unParallelWord.txt -------------------------------------------------------------------------------- /code/第三问/Word Dict/unSingleWord.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/Word Dict/unSingleWord.txt -------------------------------------------------------------------------------- /code/第三问/~$开头结尾.xlsx: -------------------------------------------------------------------------------- 1 | HP HP -------------------------------------------------------------------------------- /code/第三问/及时性/回复时间.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/及时性/回复时间.xlsx -------------------------------------------------------------------------------- /code/第三问/及时性/回复速度计算.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import os" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "os.chdir('C:/Users/64672/Desktop/C题全部数据')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 62, 26 | "metadata": {}, 27 | "outputs": 
[], 28 | "source": [ 29 | "df=pd.read_excel('附件4.xlsx')" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 63, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/html": [ 40 | "
\n", 41 | "\n", 54 | "\n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | "
留言编号留言用户留言主题留言时间留言详情答复意见答复时间
02549A00045581A2区景蓉华苑物业管理有问题2019/4/25 9:32:09\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t2019年4月以来,位于A市A2区桂花坪街道...现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...2019/5/10 14:56:53
12554A00023583A3区潇楚南路洋湖段怎么还没修好?2019/4/24 16:03:40\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t潇楚南路从2018年开始修,到现在都快一年了...网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...2019/5/9 9:49:10
22555A00031618请加快提高A市民营幼儿园老师的待遇2019/4/24 15:40:04\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t地处省会A市民营幼儿园众多,小孩是祖国的未来...市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...2019/5/9 9:49:14
32557A000110735在A市买公寓能享受人才新政购房补贴吗?2019/4/24 15:07:30\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的书记:您好!我研究生毕业后根据人才新政...网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...2019/5/9 9:49:42
42574A0009233关于A市公交站点名称变更的建议2019/4/23 17:03:19\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t建议将“白竹坡路口”更名为“马坡岭小学”,原...网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...2019/5/9 9:51:30
\n", 120 | "
" 121 | ], 122 | "text/plain": [ 123 | " 留言编号 留言用户 留言主题 留言时间 \\\n", 124 | "0 2549 A00045581 A2区景蓉华苑物业管理有问题 2019/4/25 9:32:09 \n", 125 | "1 2554 A00023583 A3区潇楚南路洋湖段怎么还没修好? 2019/4/24 16:03:40 \n", 126 | "2 2555 A00031618 请加快提高A市民营幼儿园老师的待遇 2019/4/24 15:40:04 \n", 127 | "3 2557 A000110735 在A市买公寓能享受人才新政购房补贴吗? 2019/4/24 15:07:30 \n", 128 | "4 2574 A0009233 关于A市公交站点名称变更的建议 2019/4/23 17:03:19 \n", 129 | "\n", 130 | " 留言详情 \\\n", 131 | "0 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t2019年4月以来,位于A市A2区桂花坪街道... \n", 132 | "1 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t潇楚南路从2018年开始修,到现在都快一年了... \n", 133 | "2 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t地处省会A市民营幼儿园众多,小孩是祖国的未来... \n", 134 | "3 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的书记:您好!我研究生毕业后根据人才新政... \n", 135 | "4 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t建议将“白竹坡路口”更名为“马坡岭小学”,原... \n", 136 | "\n", 137 | " 答复意见 答复时间 \n", 138 | "0 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核... 2019/5/10 14:56:53 \n", 139 | "1 网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋... 2019/5/9 9:49:10 \n", 140 | "2 市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善... 2019/5/9 9:49:14 \n", 141 | "3 网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反... 2019/5/9 9:49:42 \n", 142 | "4 网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡... 
2019/5/9 9:51:30 " 143 | ] 144 | }, 145 | "execution_count": 63, 146 | "metadata": {}, 147 | "output_type": "execute_result" 148 | } 149 | ], 150 | "source": [ 151 | "df.head()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 64, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "df['留言时间']=pd.to_datetime(df['留言时间'])\n", 161 | "df['答复时间']=pd.to_datetime(df['答复时间'])" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 65, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "df['回复速度']=df['答复时间']-df['留言时间']" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 66, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "df['回复速度']=(df['回复速度']/np.timedelta64(1,'D')).astype(int)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 36, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "0.0 0.0\n", 191 | "0.2 3.0\n", 192 | "0.4 8.0\n", 193 | "0.6 14.0\n", 194 | "0.8 26.0\n", 195 | "1.0 1160.0\n", 196 | "Name: 回复速度, dtype: float64" 197 | ] 198 | }, 199 | "execution_count": 36, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "df['回复速度'].quantile([0,0.2,0.4,0.6,0.8,1])" 206 | ] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.7.4" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 2 230 | } 231 | -------------------------------------------------------------------------------- /code/第三问/可解释性/增加可解释性词.csv: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/可解释性/增加可解释性词.csv -------------------------------------------------------------------------------- /code/第三问/可解释性/增加连接词.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/可解释性/增加连接词.csv -------------------------------------------------------------------------------- /code/第三问/可解释性/自定义可解释性.txt: -------------------------------------------------------------------------------- 1 | 依 2 | 按 3 | 根据 4 | 按照 5 | 调查 6 | 核实 7 | 经查 8 | 据查 9 | 针对 10 | 依据 11 | 依法 12 | 规定 13 | 流程 14 | 依规 15 | 方案 16 | 法律 17 | 规划 18 | 规定 19 | 审查 20 | 协商 21 | 部署 22 | 协议 23 | 政策 24 | 条件 25 | 符合 26 | 审批 27 | 认定 28 | 论证 29 | 研究 30 | 通知 31 | 要求 32 | 核查 33 | 询问 34 | 查看 35 | 鉴定 36 | 记录 37 | 通过 38 | 了解 39 | 调阅 40 | 批准 41 | 核定 42 | 公示 43 | 属实 44 | 巡查 45 | 指示 46 | 查明 47 | 依照 48 | 查处 49 | 合规 50 | 合法 51 | 为了 52 | 确保 53 | 满足 54 | 查询 55 | -------------------------------------------------------------------------------- /code/第三问/可解释性/连接词词典.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/可解释性/连接词词典.csv -------------------------------------------------------------------------------- /code/第三问/可解释性/附件4.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/可解释性/附件4.xlsx -------------------------------------------------------------------------------- /code/第三问/完整性/开头结尾.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/完整性/开头结尾.xlsx -------------------------------------------------------------------------------- /code/第三问/完整性/开头结尾及解决否.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 15, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import os\n", 12 | "import re" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 16, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/html": [ 23 | "
\n", 24 | "\n", 37 | "\n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | "
留言编号留言用户留言主题留言时间留言详情答复意见答复时间
02549A00045581A2区景蓉华苑物业管理有问题2019/4/25 9:32:09\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t2019年4月以来,位于A市A2区桂花坪街道...现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...2019/5/10 14:56:53
12554A00023583A3区潇楚南路洋湖段怎么还没修好?2019/4/24 16:03:40\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t潇楚南路从2018年开始修,到现在都快一年了...网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...2019/5/9 9:49:10
22555A00031618请加快提高A市民营幼儿园老师的待遇2019/4/24 15:40:04\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t地处省会A市民营幼儿园众多,小孩是祖国的未来...市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...2019/5/9 9:49:14
32557A000110735在A市买公寓能享受人才新政购房补贴吗?2019/4/24 15:07:30\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的书记:您好!我研究生毕业后根据人才新政...网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...2019/5/9 9:49:42
42574A0009233关于A市公交站点名称变更的建议2019/4/23 17:03:19\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t建议将“白竹坡路口”更名为“马坡岭小学”,原...网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...2019/5/9 9:51:30
........................
2811181267UU008766汽车北站进站口附近居民强烈反对建设I市平康肾病医院!2018/12/12 15:20:46\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t我们是I市汽车北站进站口的周围居民。在这里的...您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。2019/1/8 16:54:53
2812181603UU008194强烈反对I市9路公交车改线路2018/6/12 8:51:03\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t强烈反对I市9路公交车改线路获悉从7月1日起...“UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。2018/7/4 16:55:53
2813184423UU0082115对G7县文盛小学特色班的一点质疑2018/10/11 20:02:52\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\tG7县文盛小学引入特色班,每个学生必须参加,...“UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...2018/10/24 9:22:07
2814185799UU008785燃油税费改革政策的咨询2012/9/4 23:14:44\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t贺厅长:      您  好!自燃油税费改革...西地省平台《问政西地省》栏目组:   网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,...2013/1/6 15:41:02
2815185986UU008363强烈呼吁宁朱公路拓宽提质改造2011/10/3 21:52:37\\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t  A8县朱良桥乡可以说是A8县最破烂的乡了...“UU008363”:   您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和...2012/2/28 10:19:55
\n", 163 | "

2816 rows × 7 columns

\n", 164 | "
" 165 | ], 166 | "text/plain": [ 167 | " 留言编号 留言用户 留言主题 留言时间 \\\n", 168 | "0 2549 A00045581 A2区景蓉华苑物业管理有问题 2019/4/25 9:32:09 \n", 169 | "1 2554 A00023583 A3区潇楚南路洋湖段怎么还没修好? 2019/4/24 16:03:40 \n", 170 | "2 2555 A00031618 请加快提高A市民营幼儿园老师的待遇 2019/4/24 15:40:04 \n", 171 | "3 2557 A000110735 在A市买公寓能享受人才新政购房补贴吗? 2019/4/24 15:07:30 \n", 172 | "4 2574 A0009233 关于A市公交站点名称变更的建议 2019/4/23 17:03:19 \n", 173 | "... ... ... ... ... \n", 174 | "2811 181267 UU008766 汽车北站进站口附近居民强烈反对建设I市平康肾病医院! 2018/12/12 15:20:46 \n", 175 | "2812 181603 UU008194 强烈反对I市9路公交车改线路 2018/6/12 8:51:03 \n", 176 | "2813 184423 UU0082115 对G7县文盛小学特色班的一点质疑 2018/10/11 20:02:52 \n", 177 | "2814 185799 UU008785 燃油税费改革政策的咨询 2012/9/4 23:14:44 \n", 178 | "2815 185986 UU008363 强烈呼吁宁朱公路拓宽提质改造 2011/10/3 21:52:37 \n", 179 | "\n", 180 | " 留言详情 \\\n", 181 | "0 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t2019年4月以来,位于A市A2区桂花坪街道... \n", 182 | "1 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t潇楚南路从2018年开始修,到现在都快一年了... \n", 183 | "2 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t地处省会A市民营幼儿园众多,小孩是祖国的未来... \n", 184 | "3 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t尊敬的书记:您好!我研究生毕业后根据人才新政... \n", 185 | "4 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t建议将“白竹坡路口”更名为“马坡岭小学”,原... \n", 186 | "... ... \n", 187 | "2811 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t我们是I市汽车北站进站口的周围居民。在这里的... \n", 188 | "2812 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t强烈反对I市9路公交车改线路获悉从7月1日起... \n", 189 | "2813 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\tG7县文盛小学引入特色班,每个学生必须参加,... \n", 190 | "2814 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t贺厅长:      您  好!自燃油税费改革... \n", 191 | "2815 \\n\\t\\t\\t\\t\\t\\n\\t\\t\\t\\t\\t  A8县朱良桥乡可以说是A8县最破烂的乡了... \n", 192 | "\n", 193 | " 答复意见 答复时间 \n", 194 | "0 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核... 2019/5/10 14:56:53 \n", 195 | "1 网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋... 2019/5/9 9:49:10 \n", 196 | "2 市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善... 2019/5/9 9:49:14 \n", 197 | "3 网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反... 
2019/5/9 9:49:42 \n", 198 | "4 网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡... 2019/5/9 9:51:30 \n", 199 | "... ... ... \n", 200 | "2811 您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。 2019/1/8 16:54:53 \n", 201 | "2812 “UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。 2018/7/4 16:55:53 \n", 202 | "2813 “UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重... 2018/10/24 9:22:07 \n", 203 | "2814 西地省平台《问政西地省》栏目组:   网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,... 2013/1/6 15:41:02 \n", 204 | "2815 “UU008363”:   您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和... 2012/2/28 10:19:55 \n", 205 | "\n", 206 | "[2816 rows x 7 columns]" 207 | ] 208 | }, 209 | "execution_count": 16, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "#os.chdir('C:/Users/64672/Desktop/C题全部数据')\n", 216 | "df=pd.read_excel('附件4.xlsx')\n", 217 | "data_dup = df.drop_duplicates()\n", 218 | "data_dup" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 17, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/html": [ 229 | "
\n", 230 | "\n", 243 | "\n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | "
留言编号答复意见
02549现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...
12554网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...
22555市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...
32557网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...
42574网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...
.........
2811181267您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。
2812181603“UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。
2813184423“UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...
2814185799西地省平台《问政西地省》栏目组:   网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,...
2815185986“UU008363”:   您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和...
\n", 309 | "

2816 rows × 2 columns

\n", 310 | "
" 311 | ], 312 | "text/plain": [ 313 | " 留言编号 答复意见\n", 314 | "0 2549 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...\n", 315 | "1 2554 网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...\n", 316 | "2 2555 市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...\n", 317 | "3 2557 网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...\n", 318 | "4 2574 网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...\n", 319 | "... ... ...\n", 320 | "2811 181267 您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。\n", 321 | "2812 181603 “UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。\n", 322 | "2813 184423 “UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...\n", 323 | "2814 185799 西地省平台《问政西地省》栏目组:   网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,...\n", 324 | "2815 185986 “UU008363”:   您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和...\n", 325 | "\n", 326 | "[2816 rows x 2 columns]" 327 | ] 328 | }, 329 | "execution_count": 17, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "answer=pd.DataFrame(df.loc[:,['留言编号','答复意见']])\n", 336 | "answer" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 18, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/html": [ 347 | "
\n", 348 | "\n", 361 | "\n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | "
留言编号答复意见
02549现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...
12554网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...
22555市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...
32557网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...
42574网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...
.........
2811181267您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。
2812181603“UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。
2813184423“UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...
2814185799西地省平台《问政西地省》栏目组:网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,现函复...
2815185986“UU008363”:您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和研究。...
\n", 427 | "

2816 rows × 2 columns

\n", 428 | "
" 429 | ], 430 | "text/plain": [ 431 | " 留言编号 答复意见\n", 432 | "0 2549 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...\n", 433 | "1 2554 网友“A00023583”:您好!针对您反映A3区潇楚南路洋湖段怎么还没修好的问题,A3区洋...\n", 434 | "2 2555 市民同志:你好!您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉。现回复如下:为了改善...\n", 435 | "3 2557 网友“A000110735”:您好!您在平台《问政西地省》上的留言已收悉,市住建局及时将您反...\n", 436 | "4 2574 网友“A0009233”,您好,您的留言已收悉,现将具体内容答复如下:关于来信人建议“白竹坡...\n", 437 | "... ... ...\n", 438 | "2811 181267 您的留言已收悉。关于您反映的问题,已转I1区委、区人民政府调查处理。\n", 439 | "2812 181603 “UU008194”您的留言已收悉。关于您反映的问题,已转市交通运输局调查处理。\n", 440 | "2813 184423 “UU0082115”您好!获悉关于“对G7县文盛小学特色班的质疑”的网帖后,我局领导高度重...\n", 441 | "2814 185799 西地省平台《问政西地省》栏目组:网民在贵栏目留言,咨询中央转移支付我省燃油税资金情况,现函复...\n", 442 | "2815 185986 “UU008363”:您好!您的留言,我厅领导高度重视,要求相关部门进行了认真的调查和研究。...\n", 443 | "\n", 444 | "[2816 rows x 2 columns]" 445 | ] 446 | }, 447 | "execution_count": 18, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | } 451 | ], 452 | "source": [ 453 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('\\n','',str(x)))\n", 454 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('\\t','',str(x)))\n", 455 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('\\r','',str(x)))\n", 456 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('\\s','',str(x)))\n", 457 | "answer['答复意见'] = answer['答复意见'].apply(lambda x : re.sub('http[A-Za-z:/.]+','',str(x)))\n", 458 | "answer" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "按标点符号切割句子" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 19, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "text/plain": [ 476 | "0 [现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查...\n", 477 | "1 [网友“A00023583”, 您好, 针对您反映A3区潇楚南路洋湖段怎么还没修好的问题, ...\n", 478 | "2 [市民同志, 你好, 您反映的“请加快提高民营幼儿园教师的待遇”的来信已收悉, 现回复如下,...\n", 479 | "3 [网友“A000110735”, 您好, 您在平台《问政西地省》上的留言已收悉, 市住建局及...\n", 480 | "4 [网友“A0009233”, 您好, 您的留言已收悉, 现将具体内容答复如下, 关于来信人建...\n", 481 | 
" ... \n", 482 | "2811 [您的留言已收悉, 关于您反映的问题, 已转I1区委, 区人民政府调查处理, ]\n", 483 | "2812 [“UU008194”您的留言已收悉, 关于您反映的问题, 已转市交通运输局调查处理, ]\n", 484 | "2813 [“UU0082115”您好, 获悉关于“对G7县文盛小学特色班的质疑”的网帖后, 我局领导...\n", 485 | "2814 [西地省平台《问政西地省》栏目组, 网民在贵栏目留言, 咨询中央转移支付我省燃油税资金情况,...\n", 486 | "2815 [“UU008363”, 您好, 您的留言, 我厅领导高度重视, 要求相关部门进行了认真的调...\n", 487 | "Name: 答复意见, Length: 2816, dtype: object" 488 | ] 489 | }, 490 | "execution_count": 19, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "pattern = r',|\\.|/|;|\\'|`|\\[|\\]|<|>|\\?|:|\"|\\{|\\}|\\~|!|@|#|\\$|%|\\^|&|\\(|\\)|-|=|\\_|\\+|,|。|、|;|‘|’|【|】|·|!| |…|(|)'\n", 497 | "cut = answer['答复意见'].apply(lambda x: re.split(pattern, x))\n", 498 | "cut" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 20, 504 | "metadata": {}, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "0 现将网友在平台《问政西地省》栏目向胡华衡书记留言反映“A2区景蓉花苑物业管理有问题”的调查核...\n", 510 | "1 网友“A00023583” 您好\n", 511 | "2 市民同志 你好\n", 512 | "3 网友“A000110735” 您好\n", 513 | "4 网友“A0009233” 您好\n", 514 | " ... 
\n", 515 | "2811 您的留言已收悉 关于您反映的问题\n", 516 | "2812 “UU008194”您的留言已收悉 关于您反映的问题\n", 517 | "2813 “UU0082115”您好 获悉关于“对G7县文盛小学特色班的质疑”的网帖后\n", 518 | "2814 西地省平台《问政西地省》栏目组 网民在贵栏目留言\n", 519 | "2815 “UU008363” 您好\n", 520 | "Name: first, Length: 2816, dtype: object" 521 | ] 522 | }, 523 | "execution_count": 20, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "answer['first'] = cut.apply(lambda x: x[0:2])\n", 530 | "answer['first']=answer['first'].apply(lambda x: ' '.join(x))\n", 531 | "answer['first']" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 62, 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "data": { 541 | "text/plain": [ 542 | "0 区住房和城乡建设局已要求业委会依法依规召开业主大会 根据业主大会的表决结果再执行相应的程序 ...\n", 543 | "1 预计今年8月底将完工通车 感谢您对我们工作的关心 监督与支持 2019年4月29日\n", 544 | "2 工资待遇 社会保障和职称评聘等方面继续推进 谢谢您对我市学前教育的关注和支持 \n", 545 | "3 建议可拨打市房屋交易管理中心咨询电话0000 00000000详询 特此回复 2019年4月30日\n", 546 | "4 市民均已熟知 因此不宜变更 感谢来信人对我市公共交通的支持与关心 2019年5月5日\n", 547 | " ... 
\n", 548 | "2811 关于您反映的问题 已转I1区委 区人民政府调查处理 \n", 549 | "2812 “UU008194”您的留言已收悉 关于您反映的问题 已转市交通运输局调查处理 \n", 550 | "2813 符合国家 地方 学校三级课程管理要求 2018年10月15日\n", 551 | "2814 承办责任处室 西地省交通运输厅财务处 \n", 552 | "2815 承办责任单位 省交通运输厅规划办 \n", 553 | "Name: end, Length: 2816, dtype: object" 554 | ] 555 | }, 556 | "execution_count": 62, 557 | "metadata": {}, 558 | "output_type": "execute_result" 559 | } 560 | ], 561 | "source": [ 562 | "answer['end'] = cut.apply(lambda x : x[-4:])\n", 563 | "answer['end']=answer['end'].apply(lambda x: ' '.join(x))\n", 564 | "answer['end']" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": 63, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "def first(x):\n", 574 | " try:\n", 575 | " if '你好' in x or '您好'in x or '你好' in x or '您好'in x or '你们好'in x or '您们好'in x or '收悉'in x:\n", 576 | " a='完整'\n", 577 | " else:\n", 578 | " a='不完整'\n", 579 | " except:\n", 580 | " a = '不完整'\n", 581 | " return a" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 64, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "def end(x):\n", 591 | " try:\n", 592 | " if '年' in x and '月' in x and '日' in x and ('感谢' in x or '谢谢' in x or '理解' in x or '我们非常乐意听取您的意见和建议' in x):\n", 593 | " a='完整'\n", 594 | " else:\n", 595 | " a='不完整'\n", 596 | " except:\n", 597 | " a = '不完整'\n", 598 | " return a" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 72, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "def complete(x):\n", 608 | " try:\n", 609 | " if '已转' in x or '已收悉' in x or '交办' in x or '转交' in x or '待核实后给您答复' in x or '办复' in x or '敬请关注后续回复' in x:\n", 610 | " a = '未解决'\n", 611 | " else:\n", 612 | " a = '已解决'\n", 613 | " except:\n", 614 | " a = '已解决'\n", 615 | " return a\n" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": 73, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "answer['f_first']=answer['first'].apply(lambda 
x: first(x) )" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 74, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "answer['f_end']=answer['end'].apply(lambda x: end(x) )" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 75, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "answer['f_complete']=answer['end'].apply(lambda x: complete(x) )" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 78, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "answer.to_excel('开头结尾及解决否.xlsx',index=False)" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 77, 657 | "metadata": {}, 658 | "outputs": [ 659 | { 660 | "ename": "SyntaxError", 661 | "evalue": "invalid syntax (, line 1)", 662 | "output_type": "error", 663 | "traceback": [ 664 | "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m answer[]\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n" 665 | ] 666 | } 667 | ], 668 | "source": [] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [] 676 | } 677 | ], 678 | "metadata": { 679 | "kernelspec": { 680 | "display_name": "Python 3", 681 | "language": "python", 682 | "name": "python3" 683 | }, 684 | "language_info": { 685 | "codemirror_mode": { 686 | "name": "ipython", 687 | "version": 3 688 | }, 689 | "file_extension": ".py", 690 | "mimetype": "text/x-python", 691 | "name": "python", 692 | "nbconvert_exporter": "python", 693 | "pygments_lexer": "ipython3", 694 | "version": "3.7.1" 695 | } 696 | }, 697 | "nbformat": 4, 698 | "nbformat_minor": 2 699 | } 700 | -------------------------------------------------------------------------------- /code/第三问/完整性/开头结尾及解决否.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/完整性/开头结尾及解决否.xlsx -------------------------------------------------------------------------------- /code/第三问/相关性/.ipynb_checkpoints/相似度-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 247, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import os\n", 12 | "import re" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 248, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "os.chdir('C:/Users/64672/Desktop/C题全部数据')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 253, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "df=pd.read_excel('主题2.xlsx',encoding='gbk')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 254, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "df1=df.loc[:,['留言编号','主题']]" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 259, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "df1['主题']=df1['主题'].apply(lambda x: re.sub('[^\\u4E00-\\u9FD5]+',',',x))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 262, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "df1['主题']=df1['主题'].apply(lambda x: x[1:-1])" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 263, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0 业主,业委会,停车,意见,辖区,街道,业主大会,建设局,情况,坪,业委会,业主大会,业主,情...\n", 69 | "1 施工,道路,项目,排水,渠道,换填,坪塘,大道,管线,土方,施工,排水,项目,道路,集镇,土...\n", 70 | "2 幼儿园,教师,待遇,民办,提高,依法,学前教育,监管,保障,推进,民办,待遇,教师,幼儿园,...\n", 71 | "3 购房,首次,市,补贴,含,情况,万元,住房,房屋交易,购买,购房,市,含,补贴,首次,商品,...\n", 72 | "4 市民,来信,小学,马坡岭,取消,支持,关心,保留,感谢,不宜,马坡岭,小学,来信,市民,原马...\n", 73 | " ... 
\n", 74 | "2746 收悉,已转,区,区委,人民政府,调查,调查,人民政府,区委,区,已转,收悉\n", 75 | "2747 收悉,调查,转市,交通运输,局,局,交通运输,转市,调查,收悉\n", 76 | "2748 学生,社团,社团活动,学校,小学,家长,课程,文盛,调查,有利于,学生,学校,社团活动,社团...\n", 77 | "2749 资金,转移,支付,情况,燃油税,交通运输,中央,年,养护,亿元,资金,支付,燃油税,转移,年...\n", 78 | "2750 交通运输,项目,单位,规划,建设,前期工作,公路,厅,主管部门,督促,交通运输,项目,厅,公...\n", 79 | "Name: 主题, Length: 2751, dtype: object" 80 | ] 81 | }, 82 | "execution_count": 263, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "df1['主题']" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 264, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "df2=pd.read_excel('附件4.xlsx')" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 265, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "import jieba" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 269, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "df2['留言主题']=df2['留言主题'].apply(jieba.lcut)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 272, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "df1['回复主题']=df1['主题'].apply(lambda x: x.split(','))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 281, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "df1['留言主题']=df2['留言主题']" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 284, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "del df1['主题']" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 295, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "with open('C:/Users/64672/Desktop/Python/stopwords.txt','r',encoding='gbk') as f:\n", 159 | " stop=f.read()\n", 160 | "stop=stop.split()\n", 161 | 
"df1['留言主题']=df1['留言主题'].apply(lambda x:[i for i in x if i not in stop])" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 285, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "#计算中文编辑距离 str1和str2 是经过分词的\n", 171 | "def edit_similar(str1,str2):\n", 172 | " len_str1=len(str1)\n", 173 | " len_str2=len(str2)\n", 174 | " taglist=np.zeros((len_str1+1,len_str2+1))\n", 175 | " for a in range(len_str1):\n", 176 | " taglist[a][0]=a\n", 177 | " for a in range(len_str2):\n", 178 | " taglist[0][a] = a\n", 179 | " for i in range(1,len_str1+1):\n", 180 | " for j in range(1,len_str2+1):\n", 181 | " if(str1[i - 1] == str2[j - 1]):\n", 182 | " temp = 0\n", 183 | " else:\n", 184 | " temp = 1\n", 185 | " taglist[i][j] = min(taglist[i - 1][j - 1] + temp, taglist[i][j - 1] + 1, taglist[i - 1][j] + 1)\n", 186 | " return 1-taglist[len_str1][len_str2] / max(len_str1, len_str2)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 286, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "#计算余弦相似度\n", 196 | "def cos_dist(str1,str2):\n", 197 | " key_word = list(set(str1 + str2))\n", 198 | " # 给定形状和类型的用0填充的矩阵存储向量\n", 199 | " word_vector1 = np.zeros(len(key_word))\n", 200 | " word_vector2 = np.zeros(len(key_word))\n", 201 | "\n", 202 | " # 计算词频\n", 203 | " # 依次确定向量的每个位置的值\n", 204 | " for i in range(len(key_word)):\n", 205 | " # 遍历key_word中每个词在句子中的出现次数\n", 206 | " for j in range(len(str1)):\n", 207 | " if key_word[i] == str1[j]:\n", 208 | " word_vector1[i] += 1\n", 209 | " for k in range(len(str2)):\n", 210 | " if key_word[i] == str2[k]:\n", 211 | " word_vector2[i] += 1\n", 212 | "\n", 213 | "\n", 214 | " dist1=float(np.dot(word_vector1,word_vector2)/(np.linalg.norm(word_vector1)*np.linalg.norm(word_vector2)))\n", 215 | " return dist1" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 287, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | 
"#余弦相似度赋予权值0.7,编辑距离赋予权重0.3\n", 225 | "def compare(str1,str2):\n", 226 | " cos_result=cos_dist(str1,str2)\n", 227 | " edit_result=edit_similar(str1,str2)\n", 228 | " return cos_result*0.7 + edit_result*0.3" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 307, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "f_similar=[]\n", 238 | "for i in range(len(df1)):\n", 239 | " a=compare(df1['留言主题'][i],df1['回复主题'][i])\n", 240 | " f_similar.append(a)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 313, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "df1['相似度']=f_similar" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 314, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "df1.to_excel('相似度.xlsx',index=False)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "Python 3", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.7.1" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /code/第三问/相关性/~$主题2.xlsx: -------------------------------------------------------------------------------- 1 | HP HP -------------------------------------------------------------------------------- /code/第三问/相关性/主题2.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/相关性/主题2.xlsx -------------------------------------------------------------------------------- /code/第三问/相关性/主题模型.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import re\n", 11 | "import jieba\n", 12 | "\n", 13 | "def data_process(file='附件4.xlsx'):\n", 14 | " data = pd.read_excel(file, index_col=0, sep = ',', encoding = 'GB18030')\n", 15 | " \n", 16 | "\n", 17 | " #data_dup = data['答复意见'].drop_duplicates()\n", 18 | " #去数字字母\n", 19 | " data_qu123a = data['答复意见'].apply(lambda x: re.sub('[a-zA-Z0-9’!\"#$%&\\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\\\]^_`{|}~\\s]+', \"\", x))\n", 20 | " #jieba.load_userdict('newdic1.txt')\n", 21 | " data_cut = data_qu123a.apply(lambda x: jieba.lcut(x))\n", 22 | "\n", 23 | " #设置不存在于停用词表的分隔符,避免 ,被默认为分隔符而报错\n", 24 | " stopWords = pd.read_csv('stopword.txt', encoding='GB18030', sep='hahaha', header=None)\n", 25 | " #列表使用+拼接\n", 26 | " stopWords = [' ', '\\n', '\\t', '\\r\\n', '\\u3000', '"', '–','现将','在','的','您好','你好','您','网友','留言','已收悉',':',\n", 27 | " '反映','的问题','平台','同志','现','领导'] + list(stopWords.iloc[:, 0])\n", 28 | " \n", 29 | " #分词\n", 30 | " data_after_stop = data_cut.apply(lambda x: [i for i in x if i not in stopWords])\n", 31 | " \n", 32 | " #用空格分割列表中的词语\n", 33 | " #adata = data_after_stop.apply(lambda x: ' '.join(x))\n", 34 | " return data_after_stop" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 19, 40 | "metadata": { 41 | "scrolled": true 42 | }, 43 | "outputs": [ 44 | { 45 | "name": "stderr", 46 | "output_type": "stream", 47 | "text": [ 48 | "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:16: 
ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.\n", 49 | " app.launch_new_instance()\n", 50 | "D:\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:6: FutureWarning: The signature of `Series.to_csv` was aligned to that of `DataFrame.to_csv`, and argument 'header' will change its default value from False to True: please pass an explicit value to suppress this warning.\n", 51 | " \n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "import pandas as pd\n", 57 | "from gensim.corpora import Dictionary\n", 58 | "from gensim.models import LdaModel\n", 59 | "\n", 60 | "data_after_stop = data_process()\n", 61 | "data_after_stop.to_csv('分词后.csv')\n", 62 | "\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 20, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "留言编号\n", 74 | "2549 [问政, 西地省, 栏目, 胡华衡, 书记, 区景蓉, 花苑, 物业管理, 调查核实, 情况...\n", 75 | "2554 [区潇楚, 南路, 洋湖, 段, 修好, 区洋湖, 街道, 高度重视, 组织, 精干, 力量...\n", 76 | "2555 [市民, 请, 加快, 提高, 民营, 幼儿园, 教师, 待遇, 来信, 收悉, 回复, 改...\n", 77 | "2557 [问政, 西地省, 收悉, 市住, 建局, 交由, 市, 房屋交易, 管理中心, 办理, 相...\n", 78 | "2574 [收悉, 具体内容, 答复, 来信, 建议, 白竹坡, 路口, 更名, 马坡岭, 小学, 原...\n", 79 | " ... 
\n", 80 | "181267 [收悉, 已转, 区委, 区, 人民政府, 调查]\n", 81 | "181603 [收悉, 转市, 交通运输, 局, 调查]\n", 82 | "184423 [获悉, 对县, 文盛, 小学, 特色, 班, 质疑, 网帖, 我局, 高度重视, 责成, ...\n", 83 | "185799 [西地省, 问政, 西地省, 栏目组, 网民, 贵, 栏目, 咨询, 中央, 转移, 支付,...\n", 84 | "185986 [我厅, 高度重视, 相关, 部门, 调查, 研究, 回复, 我厅, 县城, 朱良桥, 公路...\n", 85 | "Name: 答复意见, Length: 2816, dtype: object" 86 | ] 87 | }, 88 | "execution_count": 20, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "data_after_stop" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 21, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "data_com = pd.read_csv('分词后.csv', header=None,encoding = 'utf-8')\n", 104 | "data_com.columns = ['留言编号','comments']" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 22, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "topic = []\n", 114 | "for i in range(0, len(data_after_stop)):\n", 115 | " mid = data_com['comments'][i:i+1].str.split(', ')\n", 116 | " dictionary = Dictionary(mid) # 生成词典\n", 117 | " bow = [dictionary.doc2bow(j) for j in mid] # 将文档转成数值型预料库\n", 118 | "\n", 119 | " data_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=2) # 构建LDA主题模型\n", 120 | " topic.append(data_model.print_topic(0)+'\\n'+data_model.print_topic(1)) # 打印主题\n", 121 | "\n", 122 | "\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 23, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "from pandas.core.frame import DataFrame\n", 132 | "data_topic = DataFrame(topic)\n", 133 | "data_topic.columns = ['主题']\n", 134 | "data_new = pd.concat([data_com, data_topic], axis=1) #横向拼接\n", 135 | "data_new.to_excel('主题2.xlsx',encoding = 'GB18030')" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 13, 148 | 
"metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "data_new = pd.read_excel('主题2.xlsx',encoding = 'GB18030')" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 14, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "data": { 161 | "text/html": [ 162 | "
\n", 163 | "\n", 176 | "\n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | "
Unnamed: 0留言编号comments主题
002549['问政', '西地省', '栏目', '胡华衡', '书记', '区景蓉', '花苑', ...0.032*\"'业委会'\" + 0.029*\"'业主'\" + 0.023*\"'业主大会'\" ...
112554['区潇楚', '南路', '洋湖', '段', '修好', '区洋湖', '街道', '高...0.054*\"'施工'\" + 0.028*\"'道路'\" + 0.026*\"'项目'\" + 0...
222555['市民', '请', '加快', '提高', '民营', '幼儿园', '教师', '待遇...0.043*\"'民办'\" + 0.042*\"'幼儿园'\" + 0.040*\"'教师'\" + ...
332557['问政', '西地省', '收悉', '市住', '建局', '交由', '市', '房屋...0.053*\"'购房'\" + 0.035*\"'市'\" + 0.033*\"'首次'\" + 0....
442574['收悉', '具体内容', '答复', '来信', '建议', '白竹坡', '路口', ...0.048*\"'来信'\" + 0.047*\"'马坡岭'\" + 0.045*\"'小学'\" + ...
...............
28112811181267['收悉', '已转', '区委', '区', '人民政府', '调查']0.174*\"'已转'\" + 0.168*\"['收悉'\" + 0.167*\"'区委'\" + ...
28122812181603['收悉', '转市', '交通运输', '局', '调查']0.218*\"'交通运输'\" + 0.201*\"'局'\" + 0.196*\"['收悉'\" +...
28132813184423['获悉', '对县', '文盛', '小学', '特色', '班', '质疑', '网帖'...0.039*\"'学生'\" + 0.028*\"'社团活动'\" + 0.023*\"'社团'\" +...
28142814185799['西地省', '问政', '西地省', '栏目组', '网民', '贵', '栏目', '...0.070*\"'资金'\" + 0.047*\"'支付'\" + 0.042*\"'燃油税'\" + ...
28152815185986['我厅', '高度重视', '相关', '部门', '调查', '研究', '回复', '...0.044*\"'交通运输'\" + 0.034*\"'项目'\" + 0.029*\"'建设'\" +...
\n", 266 | "

2816 rows × 4 columns

\n", 267 | "
" 268 | ], 269 | "text/plain": [ 270 | " Unnamed: 0 留言编号 comments \\\n", 271 | "0 0 2549 ['问政', '西地省', '栏目', '胡华衡', '书记', '区景蓉', '花苑', ... \n", 272 | "1 1 2554 ['区潇楚', '南路', '洋湖', '段', '修好', '区洋湖', '街道', '高... \n", 273 | "2 2 2555 ['市民', '请', '加快', '提高', '民营', '幼儿园', '教师', '待遇... \n", 274 | "3 3 2557 ['问政', '西地省', '收悉', '市住', '建局', '交由', '市', '房屋... \n", 275 | "4 4 2574 ['收悉', '具体内容', '答复', '来信', '建议', '白竹坡', '路口', ... \n", 276 | "... ... ... ... \n", 277 | "2811 2811 181267 ['收悉', '已转', '区委', '区', '人民政府', '调查'] \n", 278 | "2812 2812 181603 ['收悉', '转市', '交通运输', '局', '调查'] \n", 279 | "2813 2813 184423 ['获悉', '对县', '文盛', '小学', '特色', '班', '质疑', '网帖'... \n", 280 | "2814 2814 185799 ['西地省', '问政', '西地省', '栏目组', '网民', '贵', '栏目', '... \n", 281 | "2815 2815 185986 ['我厅', '高度重视', '相关', '部门', '调查', '研究', '回复', '... \n", 282 | "\n", 283 | " 主题 \n", 284 | "0 0.032*\"'业委会'\" + 0.029*\"'业主'\" + 0.023*\"'业主大会'\" ... \n", 285 | "1 0.054*\"'施工'\" + 0.028*\"'道路'\" + 0.026*\"'项目'\" + 0... \n", 286 | "2 0.043*\"'民办'\" + 0.042*\"'幼儿园'\" + 0.040*\"'教师'\" + ... \n", 287 | "3 0.053*\"'购房'\" + 0.035*\"'市'\" + 0.033*\"'首次'\" + 0.... \n", 288 | "4 0.048*\"'来信'\" + 0.047*\"'马坡岭'\" + 0.045*\"'小学'\" + ... \n", 289 | "... ... \n", 290 | "2811 0.174*\"'已转'\" + 0.168*\"['收悉'\" + 0.167*\"'区委'\" + ... \n", 291 | "2812 0.218*\"'交通运输'\" + 0.201*\"'局'\" + 0.196*\"['收悉'\" +... \n", 292 | "2813 0.039*\"'学生'\" + 0.028*\"'社团活动'\" + 0.023*\"'社团'\" +... \n", 293 | "2814 0.070*\"'资金'\" + 0.047*\"'支付'\" + 0.042*\"'燃油税'\" + ... \n", 294 | "2815 0.044*\"'交通运输'\" + 0.034*\"'项目'\" + 0.029*\"'建设'\" +... 
\n", 295 | "\n", 296 | "[2816 rows x 4 columns]" 297 | ] 298 | }, 299 | "execution_count": 14, 300 | "metadata": {}, 301 | "output_type": "execute_result" 302 | } 303 | ], 304 | "source": [ 305 | "data_new" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "'''\n", 315 | "temp = []\n", 316 | "for i in range(0, 2750):\n", 317 | " mid = data_com['comments'][i:i+1].str.split(' ')\n", 318 | " temp.append(mid)\n", 319 | "\n", 320 | "temp_split = DataFrame(temp)\n", 321 | "\n", 322 | "print(temp_split)\n", 323 | " #print(mid.dtype)\n", 324 | " #print('-------------------------------')\n", 325 | " #dictionary = Dictionary(mid)\n", 326 | "\n", 327 | "#mid0 = data_com['comments'][0:1].str.split(' ')\n", 328 | "print(mid0.dtype)\n", 329 | "#dictionary = Dictionary(mid0)\n", 330 | "#bow = [dictionary.doc2bow(j) for j in mid0]\n", 331 | "#data_model = LdaModel(corpus=bow, id2word=dictionary, num_topics=3) # 构建LDA主题模型\n", 332 | "#data_model.print_topic(1) # 打印主题\n", 333 | "'''" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 15, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/html": [ 344 | "
\n", 345 | "\n", 358 | "\n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | "
keyAB
0K0A0B0
1K1A1B1
2K2A2B2
3K3A3B3
\n", 394 | "
" 395 | ], 396 | "text/plain": [ 397 | " key A B\n", 398 | "0 K0 A0 B0\n", 399 | "1 K1 A1 B1\n", 400 | "2 K2 A2 B2\n", 401 | "3 K3 A3 B3" 402 | ] 403 | }, 404 | "execution_count": 15, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "import pandas as pd\n", 411 | "\n", 412 | "left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],\n", 413 | " 'A': ['A0', 'A1', 'A2', 'A3'],\n", 414 | " 'B': ['B0', 'B1', 'B2', 'B3']})\n", 415 | "right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],\n", 416 | " 'C': ['C0', 'C1', 'C2', 'C3'],\n", 417 | " 'D': ['D0', 'D1', 'D2', 'D3']})\n", 418 | "left\n", 419 | "\n" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 16, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "data": { 429 | "text/html": [ 430 | "
\n", 431 | "\n", 444 | "\n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | "
keyCD
0K0C0D0
1K1C1D1
2K2C2D2
3K3C3D3
\n", 480 | "
" 481 | ], 482 | "text/plain": [ 483 | " key C D\n", 484 | "0 K0 C0 D0\n", 485 | "1 K1 C1 D1\n", 486 | "2 K2 C2 D2\n", 487 | "3 K3 C3 D3" 488 | ] 489 | }, 490 | "execution_count": 16, 491 | "metadata": {}, 492 | "output_type": "execute_result" 493 | } 494 | ], 495 | "source": [ 496 | "right" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 17, 502 | "metadata": {}, 503 | "outputs": [ 504 | { 505 | "data": { 506 | "text/html": [ 507 | "
\n", 508 | "\n", 521 | "\n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | "
keyABCD
0K0A0B0C0D0
1K1A1B1C1D1
2K2A2B2C2D2
3K3A3B3C3D3
\n", 567 | "
" 568 | ], 569 | "text/plain": [ 570 | " key A B C D\n", 571 | "0 K0 A0 B0 C0 D0\n", 572 | "1 K1 A1 B1 C1 D1\n", 573 | "2 K2 A2 B2 C2 D2\n", 574 | "3 K3 A3 B3 C3 D3" 575 | ] 576 | }, 577 | "execution_count": 17, 578 | "metadata": {}, 579 | "output_type": "execute_result" 580 | } 581 | ], 582 | "source": [ 583 | "pd.merge(left, right, on='key')" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": null, 589 | "metadata": {}, 590 | "outputs": [], 591 | "source": [] 592 | } 593 | ], 594 | "metadata": { 595 | "kernelspec": { 596 | "display_name": "Python 3", 597 | "language": "python", 598 | "name": "python3" 599 | }, 600 | "language_info": { 601 | "codemirror_mode": { 602 | "name": "ipython", 603 | "version": 3 604 | }, 605 | "file_extension": ".py", 606 | "mimetype": "text/x-python", 607 | "name": "python", 608 | "nbconvert_exporter": "python", 609 | "pygments_lexer": "ipython3", 610 | "version": "3.7.1" 611 | } 612 | }, 613 | "nbformat": 4, 614 | "nbformat_minor": 2 615 | } 616 | -------------------------------------------------------------------------------- /code/第三问/相关性/相似度.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 24, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import os\n", 12 | "import re" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 25, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "#os.chdir('C:/Users/64672/Desktop/C题全部数据')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 26, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "df=pd.read_excel('主题2.xlsx',encoding='gbk')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 27, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "df1=df.loc[:,['留言编号','主题']]" 40 | ] 41 | }, 42 | { 43 | 
"cell_type": "code", 44 | "execution_count": 28, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "df1['主题']=df1['主题'].apply(lambda x: re.sub('[^\\u4E00-\\u9FD5]+',',',x))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 29, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "df1['主题']=df1['主题'].apply(lambda x: x[1:-1])" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 30, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0 业委会,业主大会,业主,意见,停车,情况,住房,坪,书记,工作,业委会,业主,业主大会,情况...\n", 69 | "1 施工,项目,排水,道路,大道,土方,坪塘,渠道,管线,换填,施工,道路,排水,项目,集镇,换...\n", 70 | "2 教师,幼儿园,民办,待遇,学前教育,依法,提高,监管,教职工,推进,待遇,民办,幼儿园,教师...\n", 71 | "3 购房,市,补贴,首次,含,房屋交易,情况,万元,住房,商品,购房,市,含,首次,补贴,回复,...\n", 72 | "4 马坡岭,来信,小学,站名,周边,收悉,市民,具体内容,变更,不宜,市民,小学,来信,马坡岭,...\n", 73 | " ... \n", 74 | "2811 收悉,人民政府,已转,区,调查,区委,区委,调查,区,已转,人民政府,收悉\n", 75 | "2812 局,交通运输,收悉,转市,调查,调查,转市,收悉,交通运输,局\n", 76 | "2813 学生,社团,学校,社团活动,小学,课程,文盛,有利于,家长,调查,学生,社团活动,学校,社团...\n", 77 | "2814 资金,支付,转移,燃油税,年,情况,市州,中央,养护,亿元,资金,支付,情况,燃油税,转移,...\n", 78 | "2815 交通运输,项目,建设,前期工作,公路,单位,厅,规划,发展,工作进度,交通运输,项目,规划,...\n", 79 | "Name: 主题, Length: 2816, dtype: object" 80 | ] 81 | }, 82 | "execution_count": 30, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "df1['主题']" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 31, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "df2=pd.read_excel('附件4.xlsx')" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 32, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "import jieba" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 33, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | 
def edit_similar(str1, str2):
    """Return a normalized edit-distance similarity between two token sequences.

    Both arguments are sequences of already-segmented tokens (for example the
    output of ``jieba.lcut``). The Levenshtein distance is computed at token
    granularity and mapped to a similarity with
    ``1 - distance / max(len(str1), len(str2))``.

    Args:
        str1: First token sequence.
        str2: Second token sequence.

    Returns:
        A float in ``[0, 1]``; 1.0 means the sequences are identical. Two
        empty sequences are treated as identical and score 1.0.
    """
    len_str1 = len(str1)
    len_str2 = len(str2)
    longest = max(len_str1, len_str2)
    if longest == 0:
        # Nothing to compare: avoid 0/0 and report "identical".
        return 1.0

    taglist = np.zeros((len_str1 + 1, len_str2 + 1))
    # Border cells hold the cost of deleting/inserting a whole prefix. The
    # ranges must include the final index, otherwise the last border cell
    # stays 0 and the distance is underestimated.
    taglist[:, 0] = np.arange(len_str1 + 1)
    taglist[0, :] = np.arange(len_str2 + 1)

    for i in range(1, len_str1 + 1):
        for j in range(1, len_str2 + 1):
            temp = 0 if str1[i - 1] == str2[j - 1] else 1
            taglist[i][j] = min(
                taglist[i - 1][j - 1] + temp,  # substitute (or match)
                taglist[i][j - 1] + 1,         # insert
                taglist[i - 1][j] + 1,         # delete
            )
    return float(1 - taglist[len_str1][len_str2] / longest)


def cos_dist(str1, str2):
    """Return the cosine similarity of the term-frequency vectors of two token sequences.

    Args:
        str1: First sequence of hashable tokens.
        str2: Second sequence of hashable tokens.

    Returns:
        A float in ``[0, 1]``. When either sequence is empty the vectors have
        zero norm and 0.0 is returned instead of NaN.
    """
    from collections import Counter

    freq1 = Counter(str1)
    freq2 = Counter(str2)
    # Shared vocabulary; Counter yields 0 for tokens missing on one side.
    key_word = list(freq1.keys() | freq2.keys())
    word_vector1 = np.array([freq1[w] for w in key_word], dtype=float)
    word_vector2 = np.array([freq2[w] for w in key_word], dtype=float)

    denom = np.linalg.norm(word_vector1) * np.linalg.norm(word_vector2)
    if denom == 0:
        # At least one side has no tokens (e.g. everything was a stop word).
        return 0.0
    return float(np.dot(word_vector1, word_vector2) / denom)


def compare(str1, str2, cos_weight=0.7, edit_weight=0.3):
    """Blend cosine similarity and edit-distance similarity into one score.

    Args:
        str1: First token sequence.
        str2: Second token sequence.
        cos_weight: Weight of the cosine similarity (default 0.7).
        edit_weight: Weight of the edit-distance similarity (default 0.3).

    Returns:
        ``cos_dist(str1, str2) * cos_weight + edit_similar(str1, str2) * edit_weight``.
    """
    cos_result = cos_dist(str1, str2)
    edit_result = edit_similar(str1, str2)
    return cos_result * cos_weight + edit_result * edit_weight
"f_similar=[]\n", 253 | "for i in range(len(df1)):\n", 254 | " a=compare(df1['留言主题'][i],df1['回复主题'][i])\n", 255 | " f_similar.append(a)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 42, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "df1['相似度']=f_similar" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 43, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "df1.to_excel('相似度.xlsx',index=False)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [] 289 | } 290 | ], 291 | "metadata": { 292 | "kernelspec": { 293 | "display_name": "Python 3", 294 | "language": "python", 295 | "name": "python3" 296 | }, 297 | "language_info": { 298 | "codemirror_mode": { 299 | "name": "ipython", 300 | "version": 3 301 | }, 302 | "file_extension": ".py", 303 | "mimetype": "text/x-python", 304 | "name": "python", 305 | "nbconvert_exporter": "python", 306 | "pygments_lexer": "ipython3", 307 | "version": "3.7.1" 308 | } 309 | }, 310 | "nbformat": 4, 311 | "nbformat_minor": 2 312 | } 313 | -------------------------------------------------------------------------------- /code/第三问/相关性/相似度.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/相关性/相似度.xlsx -------------------------------------------------------------------------------- /code/第三问/计分/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | 
def _contains(x, keyword):
    """Return True when *keyword* is found in *x*; False for values that do not support ``in``."""
    try:
        return keyword in x
    except TypeError:
        # Blank spreadsheet cells arrive as NaN/None, which are not containers.
        return False


def first_end(x):
    """Score an opening/closing label: 0.3 when it contains '完整', otherwise 0.

    NOTE(review): this is a substring test, so a label such as '不完整' would
    also score 0.3 -- confirm against the actual label values.
    """
    return 0.3 if _contains(x, '完整') else 0


def complete(x):
    """Score a resolution label: 0.4 when it contains '已解决', otherwise 0."""
    return 0.4 if _contains(x, '已解决') else 0


def conj(x):
    """Return 1 when *x* has more than two elements/characters, otherwise 0.

    Values without a length (NaN/None from blank cells) score 0.
    """
    try:
        return 1 if len(x) > 2 else 0
    except TypeError:
        return 0
https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/计分/准备计分.xlsx -------------------------------------------------------------------------------- /code/第三问/计分/回复时间.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/计分/回复时间.xlsx -------------------------------------------------------------------------------- /code/第三问/计分/总得分(最终版).xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/计分/总得分(最终版).xlsx -------------------------------------------------------------------------------- /code/第三问/计分/正在计分.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/计分/正在计分.xlsx -------------------------------------------------------------------------------- /code/第三问/计分/统计10.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/计分/统计10.xlsx -------------------------------------------------------------------------------- /code/第三问/附件4.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第三问/附件4.xlsx -------------------------------------------------------------------------------- /code/第二问/热点分析.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as 
np\n", 10 | "import pandas as pd\n", 11 | "import jieba\n", 12 | "import os\n", 13 | "import re" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "os.chdir('C:/Users/64672/Desktop')" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "预处理" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "data2 = pd.read_excel('地点或人物/伊景园.xlsx')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "data2['留言详情'] = data2['留言详情'].apply(lambda x : re.sub('\\n','',str(x)))\n", 48 | "data2['留言详情'] = data2['留言详情'].apply(lambda x : re.sub('\\t','',str(x)))\n", 49 | "data2['留言详情'] = data2['留言详情'].apply(lambda x : re.sub('\\r','',str(x)))\n", 50 | "data2['留言详情'] = data2['留言详情'].apply(lambda x : re.sub('\\s','',str(x)))\n", 51 | "data2['留言详情'] = data2['留言详情'].apply(lambda x : re.sub('http[A-Za-z:/.]+','',str(x)))" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "分词 (留言主题)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "with open('Python/stopwords.txt','r',encoding='gbk') as f:\n", 68 | " stop=f.read()\n", 69 | "stop=stop.split()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "data2['topic']=data2['留言主题'].apply(jieba.lcut)\n", 79 | "data2_cut=data2['topic'].apply(lambda x:[i for i in x if i not in stop])\n", 80 | "data2_cut=data2_cut*2 #给留言主题比较大的权重" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "关键词" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | 
"outputs": [], 95 | "source": [ 96 | "import jieba.analyse\n", 97 | "key=[]\n", 98 | "for i in data2['留言详情']:\n", 99 | " keywords=jieba.analyse.extract_tags(i,topK=20)\n", 100 | " key.append(keywords)\n", 101 | "data2['key']=key" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "data2['key']=data2['key'].apply(lambda x:[i for i in x if i not in stop])" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "合并关键词" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "for i in range(len(data2['key'])):\n", 127 | " data2['key'][i].extend(data2_cut[i])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "计算相似度" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "all_doc_list=data2['key'].tolist()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "from gensim import corpora,models,similarities\n", 153 | "dictionary = corpora.Dictionary(all_doc_list)\n", 154 | "corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]\n", 155 | "tfidf = models.TfidfModel(corpus)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "count=[]\n", 165 | "id=[]\n", 166 | "index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=len(dictionary.keys()))\n", 167 | "for i in range(len(data2['留言主题'])):\n", 168 | " sim = index[tfidf[corpus[i]]]\n", 169 | " a=sorted(enumerate(sim), key=lambda item: -item[1])\n", 170 | " a=pd.DataFrame(a)\n", 171 | " count.append((a[1]>0.1).sum())\n", 172 | " 
def tostr(x, sep=','):
    """Serialize an iterable of ids into a single string usable as a grouping key.

    The items are joined with *sep* so that different id lists can never
    produce the same key; plain concatenation maps both ``[1, 23]`` and
    ``[12, 3]`` to ``"123"``.

    Args:
        x: Iterable of ids (any objects convertible with ``str``).
        sep: Separator placed between items (default ``','``). Pass ``''`` to
            get plain concatenation.

    Returns:
        The joined string; an empty iterable gives ``''``.
    """
    return sep.join(str(i) for i in x)
data2.iloc[i,12]==data2.iloc[i-1,12]:\n", 256 | " hot_id.append(j)\n", 257 | " else:\n", 258 | " j=j+1\n", 259 | " hot_id.append(j)\n", 260 | "data2_hot['hot_id'] = hot_id" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "data2_hot" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "data2_hot.to_excel('伊景园.xlsx',index=False)" 279 | ] 280 | } 281 | ], 282 | "metadata": { 283 | "kernelspec": { 284 | "display_name": "Python 3", 285 | "language": "python", 286 | "name": "python3" 287 | }, 288 | "language_info": { 289 | "codemirror_mode": { 290 | "name": "ipython", 291 | "version": 3 292 | }, 293 | "file_extension": ".py", 294 | "mimetype": "text/x-python", 295 | "name": "python", 296 | "nbconvert_exporter": "python", 297 | "pygments_lexer": "ipython3", 298 | "version": "3.7.4" 299 | } 300 | }, 301 | "nbformat": 4, 302 | "nbformat_minor": 2 303 | } 304 | -------------------------------------------------------------------------------- /code/第二问/热点问题留言明细表 .xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第二问/热点问题留言明细表 .xls -------------------------------------------------------------------------------- /code/第二问/热点问题表 .xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/code/第二问/热点问题表 .xls -------------------------------------------------------------------------------- /data/测试数据.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/data/测试数据.xlsx 
-------------------------------------------------------------------------------- /data/附件1.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/data/附件1.xlsx -------------------------------------------------------------------------------- /data/附件2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/data/附件2.xlsx -------------------------------------------------------------------------------- /data/附件3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/data/附件3.xlsx -------------------------------------------------------------------------------- /data/附件4.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/data/附件4.xlsx -------------------------------------------------------------------------------- /paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/15625103741/Text-Mining/17ed6eacddd8e2a41463e4c397a95b587dae7ee9/paper.pdf --------------------------------------------------------------------------------