├── 1.环球时报-数据清洗和预处理+描述性分析.ipynb ├── 2.环球时报-基础词频的LDA+基于TFIDF的LDA+基于标题加权TFIDF的LDA.ipynb ├── 3.环球时报-看主题和微博量.ipynb ├── 4.五个分类微博-数据清洗和预处理.ipynb ├── 5.五个分类微博-SVM+TFIDF SVM+加权TFIDF SVM.ipynb ├── README.md ├── 数据集1-环球时报.zip └── 数据集2-五个分类.zip /1.环球时报-数据清洗和预处理+描述性分析.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:11: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead\n", 13 | " # This is added back by InteractiveShellApp.init_path()\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "#coding: utf-8\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import re\n", 22 | "import jieba\n", 23 | "import os\n", 24 | "import time\n", 25 | "import jieba \n", 26 | "import numpy as np\n", 27 | "from progressbar import *\n", 28 | "start1 =time.clock()" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def obatin_news_from_csv(path_csv):\n", 38 | " old_text=pd.read_csv(path_csv)\n", 39 | " drop_index_na=[]\n", 40 | " for i in range(old_text.shape[0]):\n", 41 | " j=old_text['微博正文'][i]\n", 42 | " if (\"抱歉\" in j or len(j)<40):\n", 43 | " drop_index_na.append(i)\n", 44 | " text=old_text.drop(drop_index_na)\n", 45 | " text=text.reset_index(drop=True)\n", 46 | " return text\n", 47 | "\n", 48 | "import re\n", 49 | "def find_title(text):\n", 50 | " text[\"标题\"]=[-99 for i in range(text.shape[0])]\n", 51 | " text[\"微博正文(去掉标题)\"]=[-99 for i in range(text.shape[0])]\n", 52 | " notitle_index=[]\n", 53 | " title_p=re.compile(r\".*\\【.*\\】\")\n", 54 | " progress = ProgressBar()\n", 55 | " for i in 
progress(range(text.shape[0])):\n", 56 | " title_=re.match(title_p,text[\"微博正文\"][i])\n", 57 | " if title_:\n", 58 | " text[\"标题\"][i]=title_[0]\n", 59 | " text[\"微博正文(去掉标题)\"][i]=re.sub(r\".*\\【.*\\】\",\"\",text[\"微博正文\"][i])\n", 60 | " else:\n", 61 | " notitle_index.append(i)\n", 62 | " time.sleep(0.1)\n", 63 | " text=text.drop(notitle_index)\n", 64 | " text=text.reset_index(drop=True)\n", 65 | " return text" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stderr", 75 | "output_type": "stream", 76 | "text": [ 77 | "Building prefix dict from the default dictionary ...\n", 78 | "Loading model from cache C:\\Users\\Public\\Documents\\Wondershare\\CreatorTemp\\jieba.cache\n", 79 | "Loading model cost 0.965 seconds.\n", 80 | "Prefix dict has been built successfully.\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "\n", 86 | "jieba.load_userdict(r\".\\自定义词典.txt\")\n", 87 | "stopwords = [line.strip() for line in open(r\".\\停用词库.txt\",encoding='UTF-8').readlines()]\n", 88 | "\n", 89 | "def clean_text_content(text_content,cut_all=False):\n", 90 | " text_content = re.sub(r\"。\", \".\", text_content)\n", 91 | " text_content = re.sub(r\",\", \",\", text_content)\n", 92 | " text_content = re.sub(r\"“\", \"'\", text_content)\n", 93 | " text_content = re.sub(r\"”\", \"'\", text_content)\n", 94 | " text_content = re.sub(r\"…\", \".\", text_content)\n", 95 | " text_content = re.sub(r\"@\", \"@\", text_content)\n", 96 | " text_content = re.sub(r\" \", \" \", text_content)\n", 97 | " text_content = re.sub(r\"!\", \"!\", text_content)\n", 98 | " text_content = re.sub(r\"?\", \"?\", text_content)\n", 99 | " text_content = re.sub(r\":\", \":\", text_content)\n", 100 | " text_content = re.sub(r\")\", \")\", text_content)\n", 101 | " text_content = re.sub(r\"(\", \"(\", text_content)\n", 102 | " text_content = re.sub(r\"(\\d+年)*(\\d+月)*(\\d+[日])\", \"\", text_content) #日期\n", 103 | " text_content = 
re.sub(r\"\\d+[年月日天号人名时例名省市区县院]\", \"\", text_content) \n", 104 | " text_content = re.sub(r\"[第]*[零一二三四五六七八九百千万]+[年月日天号人名时例名省市区县院名例周月年]*\", \"\", text_content) \n", 105 | " text_content = re.sub(r\"[0-2]?[0-9]:[0-6][0-9]\", \"\", text_content) #时间\n", 106 | " text_content = re.sub(r\"^[-+]?[0-9]+(\\.)?[0-9]*$\", \"\", text_content) #数字\n", 107 | " text_content = re.sub(r\"@\\S*\\:+\\s*\", \" \", text_content) #@小央视频\n", 108 | " text_content = re.sub(r\"\\[.+\\]\", \"\", text_content) #[组图共2张]和[加油]\n", 109 | " text_content = re.sub(r\"\\(.*?\\)\", \"\", text_content) #(环球网)\n", 110 | " \n", 111 | " text_content = re.sub(r\"\\W+\\w*(视频)\\s+\", \"\", text_content) #小央视频的秒拍视频\n", 112 | " text_content = re.sub(r\"#\", \"\", text_content) ##\n", 113 | " text_content = re.sub(r\"(http|https)(://t.cn/)[a-zA-Z0-9]+\", \"\", text_content) #网址(微博上的连接都是http://t.cn/.....形式)\n", 114 | " text_content = re.sub(r\"转发理由:\", \"\", text_content) \n", 115 | " text_content = re.sub(r\"转发内容:\", \"\", text_content) \n", 116 | " text_content = re.sub(r\"原始用户:.*\", \"\", text_content) \n", 117 | " \n", 118 | " word_list=jieba.lcut(text_content,cut_all=cut_all)\n", 119 | " word_list_len=len(word_list)\n", 120 | " i=0\n", 121 | " while i\n", 175 | "\n", 188 | "\n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 
246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | "
微博id微博正文原始图片url被转发微博原始图片url是否为原创微博微博视频url发布位置发布时间发布工具点赞数转发数评论数标题微博正文(去掉标题)
0Inng7t7Qo【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响...True2019-12-31 23:50微博 weibo.com50633134【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响着人们的睡眠时间的主要原因,7...
1Inn80A0nV【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担...Truehttp://miaopai.video.weibocdn.com/000mhB9Blx07...2019-12-31 23:30微博 weibo.com3682479【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担心被监控抓拍,竟用孩子的画遮挡号牌。孩子制止其母无果...
2InmZSs4Wv【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆...True2019-12-31 23:10微博 weibo.com3176919【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆的暖心瞬间:10岁男孩2月增重12斤捐造干救父;夫...
3InmRLvKiu【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 陕西西安一敬老院,...True2019-12-31 22:50微博 weibo.com12733664【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】陕西西安一敬老院,85岁的陈爷爷迎娶82岁的沈奶奶。沈奶奶爱在大厅里弹钢琴,陈爷爷常坐在后...
4InmJEgLDC【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《...True2019-12-31 22:30微博 weibo.com286771222【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《今日》电台采访时,被问到如果有机会跟特朗普面对面...
\n", 296 | "" 297 | ], 298 | "text/plain": [ 299 | " 微博id 微博正文 原始图片url \\\n", 300 | "0 Inng7t7Qo 【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响... 无 \n", 301 | "1 Inn80A0nV 【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担... 无 \n", 302 | "2 InmZSs4Wv 【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆... 无 \n", 303 | "3 InmRLvKiu 【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 陕西西安一敬老院,... 无 \n", 304 | "4 InmJEgLDC 【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《... 无 \n", 305 | "\n", 306 | " 被转发微博原始图片url 是否为原创微博 微博视频url \\\n", 307 | "0 无 True 无 \n", 308 | "1 无 True http://miaopai.video.weibocdn.com/000mhB9Blx07... \n", 309 | "2 无 True 无 \n", 310 | "3 无 True 无 \n", 311 | "4 无 True 无 \n", 312 | "\n", 313 | " 发布位置 发布时间 发布工具 点赞数 转发数 评论数 \\\n", 314 | "0 无 2019-12-31 23:50 微博 weibo.com 506 331 34 \n", 315 | "1 无 2019-12-31 23:30 微博 weibo.com 368 24 79 \n", 316 | "2 无 2019-12-31 23:10 微博 weibo.com 317 69 19 \n", 317 | "3 无 2019-12-31 22:50 微博 weibo.com 1273 36 64 \n", 318 | "4 无 2019-12-31 22:30 微博 weibo.com 2867 71 222 \n", 319 | "\n", 320 | " 标题 \\\n", 321 | "0 【如何获得稳定的8小时睡眠?】 \n", 322 | "1 【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】 \n", 323 | "2 【#致敬2019那些温暖瞬间#,愿2020更美好】 \n", 324 | "3 【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 \n", 325 | "4 【瑞典环保少女:不想和特朗普说话,不想浪费时间】 \n", 326 | "\n", 327 | " 微博正文(去掉标题) \n", 328 | "0 如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响着人们的睡眠时间的主要原因,7... \n", 329 | "1 近日,四川自贡市一女子违停在路边,由于担心被监控抓拍,竟用孩子的画遮挡号牌。孩子制止其母无果... \n", 330 | "2 回顾2019年,发生在齐鲁大地上,值得记忆的暖心瞬间:10岁男孩2月增重12斤捐造干救父;夫... \n", 331 | "3 陕西西安一敬老院,85岁的陈爷爷迎娶82岁的沈奶奶。沈奶奶爱在大厅里弹钢琴,陈爷爷常坐在后... \n", 332 | "4 12月30日,瑞典环保少女通贝里接受BBC《今日》电台采访时,被问到如果有机会跟特朗普面对面... 
" 333 | ] 334 | }, 335 | "execution_count": 4, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "old_text=obatin_news_from_csv(\"./1974576991.csv\")\n", 342 | "text_column=[\"微博正文\",\"发布时间\"]\n", 343 | "\n", 344 | "#把2020年的都去掉\n", 345 | "len_old_text=old_text.shape[0]\n", 346 | "drop_index=[]\n", 347 | "old_pattern=re.compile(r\"2020(\\-)[0-9]+(\\-)[0-9]+\")\n", 348 | "for i in range(0,len_old_text):\n", 349 | " if old_pattern.match(old_text[\"发布时间\"][i]):\n", 350 | " drop_index.append(i)\n", 351 | "text=old_text.drop(drop_index)\n", 352 | "text=text.reset_index(drop=True)\n", 353 | "text.head(2)\n", 354 | "\n", 355 | "text=find_title(text)\n", 356 | "text_shape_old=text.shape[0]\n", 357 | "print(text_shape_old)\n", 358 | "text.head()" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 5, 364 | "metadata": {}, 365 | "outputs": [ 366 | { 367 | "name": "stderr", 368 | "output_type": "stream", 369 | "text": [ 370 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead\n", 371 | " \"\"\"Entry point for launching an IPython kernel.\n", 372 | "N/A% (0 of 17682) | | Elapsed Time: 0:00:00 ETA: --:--:--C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:14: SettingWithCopyWarning: \n", 373 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 374 | "\n", 375 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 376 | " \n", 377 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:17: SettingWithCopyWarning: \n", 378 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 379 | "\n", 380 | "See the caveats in the documentation: 
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 381 | "100% (17682 of 17682) |##################| Elapsed Time: 0:49:02 Time: 0:49:02\n", 382 | "N/A% (0 of 17682) | | Elapsed Time: 0:00:00 ETA: --:--:--C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:31: SettingWithCopyWarning: \n", 383 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 384 | "\n", 385 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 386 | "100% (17682 of 17682) |##################| Elapsed Time: 0:06:26 Time: 0:06:26\n" 387 | ] 388 | }, 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "Running time: 3328.6935436999997 Seconds\n" 394 | ] 395 | }, 396 | { 397 | "name": "stderr", 398 | "output_type": "stream", 399 | "text": [ 400 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:33: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead\n" 401 | ] 402 | } 403 | ], 404 | "source": [ 405 | "start =time.clock()\n", 406 | "\n", 407 | "len_text=text.shape[0]\n", 408 | "\n", 409 | "text[\"微博正文(无标题切词后)\"]=[-99 for i in range(len_text)]\n", 410 | "text[\"标题(切词后)\"]=[-99 for i in range(len_text)]\n", 411 | "\n", 412 | "drop_na=[]\n", 413 | "\n", 414 | "progress = ProgressBar()\n", 415 | "\n", 416 | "for u in progress(range(0,len_text)):\n", 417 | " cleaned_content=clean_text_content(text[\"微博正文\"][u])\n", 418 | " text[\"微博正文(无标题切词后)\"][u]=\" \".join(cleaned_content)\n", 419 | "\n", 420 | " cleaned_title=clean_text_content(text[\"标题\"][u],True)\n", 421 | " text[\"标题(切词后)\"][u]=\" \".join(cleaned_title)\n", 422 | "\n", 423 | " if pd.isnull(text.loc[u]).any():\n", 424 | " drop_na.append(u)\n", 425 | " print(u) \n", 426 | " time.sleep(0.1)\n", 427 | "\n", 428 | "text=text.drop(drop_na)\n", 429 
| "text=text.reset_index(drop=True) \n", 430 | "\n", 431 | "text[\"微博正文(有标题切词后)\"]=[-99 for i in range(len_text)]\n", 432 | "progress = ProgressBar()\n", 433 | "\n", 434 | "for u in progress(range(0,len_text)):\n", 435 | " text[\"微博正文(有标题切词后)\"][u]=text[\"标题(切词后)\"][u]+\" \"+text[\"微博正文(无标题切词后)\"][u]\n", 436 | "\n", 437 | "end = time.clock()\n", 438 | "\n", 439 | "print('Running time: %s Seconds'%(end-start))" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 6, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "text_shape_old 17682\n", 452 | "text_shape_new 17682\n" 453 | ] 454 | }, 455 | { 456 | "data": { 457 | "text/html": [ 458 | "
\n", 459 | "\n", 472 | "\n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | "
微博id微博正文原始图片url被转发微博原始图片url是否为原创微博微博视频url发布位置发布时间发布工具点赞数转发数评论数标题微博正文(去掉标题)微博正文(无标题切词后)标题(切词后)微博正文(有标题切词后)
0Inng7t7Qo【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响...True2019-12-31 23:50微博 weibo.com50633134【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响着人们的睡眠时间的主要原因,7...获得 稳定 小时 睡眠 信息技术 发展 笔记本电脑 手机 游戏 影响 人们 睡眠 主要 原因...获得 稳定 小时 睡眠获得 稳定 小时 睡眠 获得 稳定 小时 睡眠 信息技术 发展 笔记本电脑 手机 游戏 影响...
1Inn80A0nV【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担...Truehttp://miaopai.video.weibocdn.com/000mhB9Blx07...2019-12-31 23:30微博 weibo.com3682479【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担心被监控抓拍,竟用孩子的画遮挡号牌。孩子制止其母无果...妈妈 不太 省心 抓拍 孩子 号牌 自贡市 违停 路边 担心 监控 抓拍 孩子 遮挡 号牌 ...妈妈 省心 抓拍 孩子 号牌妈妈 省心 抓拍 孩子 号牌 妈妈 不太 省心 抓拍 孩子 号牌 自贡市 违停 路边 担心 ...
2InmZSs4Wv【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆...True2019-12-31 23:10微博 weibo.com3176919【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆的暖心瞬间:10岁男孩2月增重12斤捐造干救父;夫...致敬 温暖 瞬间 美好 回顾 齐鲁大地 值得 记忆 瞬间 男孩 增重 捐造 干救父 夫妻 雾...致敬 温暖 瞬间 美好致敬 温暖 瞬间 美好 致敬 温暖 瞬间 美好 回顾 齐鲁大地 值得 记忆 瞬间 男孩 增重...
3InmRLvKiu【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 陕西西安一敬老院,...True2019-12-31 22:50微博 weibo.com12733664【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】陕西西安一敬老院,85岁的陈爷爷迎娶82岁的沈奶奶。沈奶奶爱在大厅里弹钢琴,陈爷爷常坐在后...浪漫 爷爷 敬老院 奶奶 上海 搭讪 音乐会 庆祝 陕西 西安 敬老院 爷爷 迎娶 奶奶 奶...浪漫 爷爷 敬老 敬老院 奶奶 上海 搭讪 音乐 音乐会 会庆 庆祝浪漫 爷爷 敬老 敬老院 奶奶 上海 搭讪 音乐 音乐会 会庆 庆祝 浪漫 爷爷 敬老院 奶...
4InmJEgLDC【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《...True2019-12-31 22:30微博 weibo.com286771222【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《今日》电台采访时,被问到如果有机会跟特朗普面对面...瑞典 环保 少女 不想 特朗普 说话 不想 浪费时间 瑞典 环保 少女 贝里 接受 今日 电...瑞典 环保 少女 不想 特朗普 说话 不想 浪费 浪费时间 费时 费时间瑞典 环保 少女 不想 特朗普 说话 不想 浪费 浪费时间 费时 费时间 瑞典 环保 少女 ...
\n", 598 | "
" 599 | ], 600 | "text/plain": [ 601 | " 微博id 微博正文 原始图片url \\\n", 602 | "0 Inng7t7Qo 【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响... 无 \n", 603 | "1 Inn80A0nV 【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担... 无 \n", 604 | "2 InmZSs4Wv 【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆... 无 \n", 605 | "3 InmRLvKiu 【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 陕西西安一敬老院,... 无 \n", 606 | "4 InmJEgLDC 【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《... 无 \n", 607 | "\n", 608 | " 被转发微博原始图片url 是否为原创微博 微博视频url \\\n", 609 | "0 无 True 无 \n", 610 | "1 无 True http://miaopai.video.weibocdn.com/000mhB9Blx07... \n", 611 | "2 无 True 无 \n", 612 | "3 无 True 无 \n", 613 | "4 无 True 无 \n", 614 | "\n", 615 | " 发布位置 发布时间 发布工具 点赞数 转发数 评论数 \\\n", 616 | "0 无 2019-12-31 23:50 微博 weibo.com 506 331 34 \n", 617 | "1 无 2019-12-31 23:30 微博 weibo.com 368 24 79 \n", 618 | "2 无 2019-12-31 23:10 微博 weibo.com 317 69 19 \n", 619 | "3 无 2019-12-31 22:50 微博 weibo.com 1273 36 64 \n", 620 | "4 无 2019-12-31 22:30 微博 weibo.com 2867 71 222 \n", 621 | "\n", 622 | " 标题 \\\n", 623 | "0 【如何获得稳定的8小时睡眠?】 \n", 624 | "1 【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】 \n", 625 | "2 【#致敬2019那些温暖瞬间#,愿2020更美好】 \n", 626 | "3 【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 \n", 627 | "4 【瑞典环保少女:不想和特朗普说话,不想浪费时间】 \n", 628 | "\n", 629 | " 微博正文(去掉标题) \\\n", 630 | "0 如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响着人们的睡眠时间的主要原因,7... \n", 631 | "1 近日,四川自贡市一女子违停在路边,由于担心被监控抓拍,竟用孩子的画遮挡号牌。孩子制止其母无果... \n", 632 | "2 回顾2019年,发生在齐鲁大地上,值得记忆的暖心瞬间:10岁男孩2月增重12斤捐造干救父;夫... \n", 633 | "3 陕西西安一敬老院,85岁的陈爷爷迎娶82岁的沈奶奶。沈奶奶爱在大厅里弹钢琴,陈爷爷常坐在后... \n", 634 | "4 12月30日,瑞典环保少女通贝里接受BBC《今日》电台采访时,被问到如果有机会跟特朗普面对面... \n", 635 | "\n", 636 | " 微博正文(无标题切词后) \\\n", 637 | "0 获得 稳定 小时 睡眠 信息技术 发展 笔记本电脑 手机 游戏 影响 人们 睡眠 主要 原因... \n", 638 | "1 妈妈 不太 省心 抓拍 孩子 号牌 自贡市 违停 路边 担心 监控 抓拍 孩子 遮挡 号牌 ... \n", 639 | "2 致敬 温暖 瞬间 美好 回顾 齐鲁大地 值得 记忆 瞬间 男孩 增重 捐造 干救父 夫妻 雾... \n", 640 | "3 浪漫 爷爷 敬老院 奶奶 上海 搭讪 音乐会 庆祝 陕西 西安 敬老院 爷爷 迎娶 奶奶 奶... \n", 641 | "4 瑞典 环保 少女 不想 特朗普 说话 不想 浪费时间 瑞典 环保 少女 贝里 接受 今日 电... 
\n", 642 | "\n", 643 | " 标题(切词后) \\\n", 644 | "0 获得 稳定 小时 睡眠 \n", 645 | "1 妈妈 省心 抓拍 孩子 号牌 \n", 646 | "2 致敬 温暖 瞬间 美好 \n", 647 | "3 浪漫 爷爷 敬老 敬老院 奶奶 上海 搭讪 音乐 音乐会 会庆 庆祝 \n", 648 | "4 瑞典 环保 少女 不想 特朗普 说话 不想 浪费 浪费时间 费时 费时间 \n", 649 | "\n", 650 | " 微博正文(有标题切词后) \n", 651 | "0 获得 稳定 小时 睡眠 获得 稳定 小时 睡眠 信息技术 发展 笔记本电脑 手机 游戏 影响... \n", 652 | "1 妈妈 省心 抓拍 孩子 号牌 妈妈 不太 省心 抓拍 孩子 号牌 自贡市 违停 路边 担心 ... \n", 653 | "2 致敬 温暖 瞬间 美好 致敬 温暖 瞬间 美好 回顾 齐鲁大地 值得 记忆 瞬间 男孩 增重... \n", 654 | "3 浪漫 爷爷 敬老 敬老院 奶奶 上海 搭讪 音乐 音乐会 会庆 庆祝 浪漫 爷爷 敬老院 奶... \n", 655 | "4 瑞典 环保 少女 不想 特朗普 说话 不想 浪费 浪费时间 费时 费时间 瑞典 环保 少女 ... " 656 | ] 657 | }, 658 | "execution_count": 6, 659 | "metadata": {}, 660 | "output_type": "execute_result" 661 | } 662 | ], 663 | "source": [ 664 | "print(\"text_shape_old\",text_shape_old)\n", 665 | "print(\"text_shape_new\",text.shape[0])\n", 666 | "text.head()" 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "execution_count": 14, 672 | "metadata": {}, 673 | "outputs": [ 674 | { 675 | "data": { 676 | "text/html": [ 677 | "
\n", 678 | "\n", 691 | "\n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | "
发布时间微博正文标题微博正文(去掉标题)微博正文(无标题切词后)标题(切词后)微博正文(有标题切词后)
02019-12-31 23:50【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响...【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响着人们的睡眠时间的主要原因,7...获得 稳定 小时 睡眠 信息技术 发展 笔记本电脑 手机 游戏 影响 人们 睡眠 主要 原因...获得 稳定 小时 睡眠获得 稳定 小时 睡眠 获得 稳定 小时 睡眠 信息技术 发展 笔记本电脑 手机 游戏 影响...
12019-12-31 23:30【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担...【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担心被监控抓拍,竟用孩子的画遮挡号牌。孩子制止其母无果...妈妈 不太 省心 抓拍 孩子 号牌 自贡市 违停 路边 担心 监控 抓拍 孩子 遮挡 号牌 ...妈妈 省心 抓拍 孩子 号牌妈妈 省心 抓拍 孩子 号牌 妈妈 不太 省心 抓拍 孩子 号牌 自贡市 违停 路边 担心 ...
22019-12-31 23:10【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆...【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆的暖心瞬间:10岁男孩2月增重12斤捐造干救父;夫...致敬 温暖 瞬间 美好 回顾 齐鲁大地 值得 记忆 瞬间 男孩 增重 捐造 干救父 夫妻 雾...致敬 温暖 瞬间 美好致敬 温暖 瞬间 美好 致敬 温暖 瞬间 美好 回顾 齐鲁大地 值得 记忆 瞬间 男孩 增重...
32019-12-31 22:50【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 陕西西安一敬老院,...【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】陕西西安一敬老院,85岁的陈爷爷迎娶82岁的沈奶奶。沈奶奶爱在大厅里弹钢琴,陈爷爷常坐在后...浪漫 爷爷 敬老院 奶奶 上海 搭讪 音乐会 庆祝 陕西 西安 敬老院 爷爷 迎娶 奶奶 奶...浪漫 爷爷 敬老 敬老院 奶奶 上海 搭讪 音乐 音乐会 会庆 庆祝浪漫 爷爷 敬老 敬老院 奶奶 上海 搭讪 音乐 音乐会 会庆 庆祝 浪漫 爷爷 敬老院 奶...
42019-12-31 22:30【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《...【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《今日》电台采访时,被问到如果有机会跟特朗普面对面...瑞典 环保 少女 不想 特朗普 说话 不想 浪费时间 瑞典 环保 少女 贝里 接受 今日 电...瑞典 环保 少女 不想 特朗普 说话 不想 浪费 浪费时间 费时 费时间瑞典 环保 少女 不想 特朗普 说话 不想 浪费 浪费时间 费时 费时间 瑞典 环保 少女 ...
\n", 757 | "
" 758 | ], 759 | "text/plain": [ 760 | " 发布时间 微博正文 \\\n", 761 | "0 2019-12-31 23:50 【如何获得稳定的8小时睡眠?】如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响... \n", 762 | "1 2019-12-31 23:30 【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】近日,四川自贡市一女子违停在路边,由于担... \n", 763 | "2 2019-12-31 23:10 【#致敬2019那些温暖瞬间#,愿2020更美好】回顾2019年,发生在齐鲁大地上,值得记忆... \n", 764 | "3 2019-12-31 22:50 【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 陕西西安一敬老院,... \n", 765 | "4 2019-12-31 22:30 【瑞典环保少女:不想和特朗普说话,不想浪费时间】12月30日,瑞典环保少女通贝里接受BBC《... \n", 766 | "\n", 767 | " 标题 \\\n", 768 | "0 【如何获得稳定的8小时睡眠?】 \n", 769 | "1 【这妈妈不太省心!怕被抓拍#女子拿孩子的画遮号牌#】 \n", 770 | "2 【#致敬2019那些温暖瞬间#,愿2020更美好】 \n", 771 | "3 【浪漫!#85岁爷爷敬老院娶82岁奶奶#:上海话“搭讪”,开音乐会庆祝】 \n", 772 | "4 【瑞典环保少女:不想和特朗普说话,不想浪费时间】 \n", 773 | "\n", 774 | " 微博正文(去掉标题) \\\n", 775 | "0 如今随着新的信息技术的发展,比如笔记本电脑,手机,游戏成了影响着人们的睡眠时间的主要原因,7... \n", 776 | "1 近日,四川自贡市一女子违停在路边,由于担心被监控抓拍,竟用孩子的画遮挡号牌。孩子制止其母无果... \n", 777 | "2 回顾2019年,发生在齐鲁大地上,值得记忆的暖心瞬间:10岁男孩2月增重12斤捐造干救父;夫... \n", 778 | "3 陕西西安一敬老院,85岁的陈爷爷迎娶82岁的沈奶奶。沈奶奶爱在大厅里弹钢琴,陈爷爷常坐在后... \n", 779 | "4 12月30日,瑞典环保少女通贝里接受BBC《今日》电台采访时,被问到如果有机会跟特朗普面对面... \n", 780 | "\n", 781 | " 微博正文(无标题切词后) \\\n", 782 | "0 获得 稳定 小时 睡眠 信息技术 发展 笔记本电脑 手机 游戏 影响 人们 睡眠 主要 原因... \n", 783 | "1 妈妈 不太 省心 抓拍 孩子 号牌 自贡市 违停 路边 担心 监控 抓拍 孩子 遮挡 号牌 ... \n", 784 | "2 致敬 温暖 瞬间 美好 回顾 齐鲁大地 值得 记忆 瞬间 男孩 增重 捐造 干救父 夫妻 雾... \n", 785 | "3 浪漫 爷爷 敬老院 奶奶 上海 搭讪 音乐会 庆祝 陕西 西安 敬老院 爷爷 迎娶 奶奶 奶... \n", 786 | "4 瑞典 环保 少女 不想 特朗普 说话 不想 浪费时间 瑞典 环保 少女 贝里 接受 今日 电... \n", 787 | "\n", 788 | " 标题(切词后) \\\n", 789 | "0 获得 稳定 小时 睡眠 \n", 790 | "1 妈妈 省心 抓拍 孩子 号牌 \n", 791 | "2 致敬 温暖 瞬间 美好 \n", 792 | "3 浪漫 爷爷 敬老 敬老院 奶奶 上海 搭讪 音乐 音乐会 会庆 庆祝 \n", 793 | "4 瑞典 环保 少女 不想 特朗普 说话 不想 浪费 浪费时间 费时 费时间 \n", 794 | "\n", 795 | " 微博正文(有标题切词后) \n", 796 | "0 获得 稳定 小时 睡眠 获得 稳定 小时 睡眠 信息技术 发展 笔记本电脑 手机 游戏 影响... \n", 797 | "1 妈妈 省心 抓拍 孩子 号牌 妈妈 不太 省心 抓拍 孩子 号牌 自贡市 违停 路边 担心 ... \n", 798 | "2 致敬 温暖 瞬间 美好 致敬 温暖 瞬间 美好 回顾 齐鲁大地 值得 记忆 瞬间 男孩 增重... \n", 799 | "3 浪漫 爷爷 敬老 敬老院 奶奶 上海 搭讪 音乐 音乐会 会庆 庆祝 浪漫 爷爷 敬老院 奶... \n", 800 | "4 瑞典 环保 少女 不想 特朗普 说话 不想 浪费 浪费时间 费时 费时间 瑞典 环保 少女 ... 
" 801 | ] 802 | }, 803 | "execution_count": 14, 804 | "metadata": {}, 805 | "output_type": "execute_result" 806 | } 807 | ], 808 | "source": [ 809 | "text_new=text[[\"发布时间\",\"微博正文\",\"标题\",\"微博正文(去掉标题)\",\"微博正文(无标题切词后)\",\"标题(切词后)\",\"微博正文(有标题切词后)\"]]\n", 810 | "text_new.head()" 811 | ] 812 | }, 813 | { 814 | "cell_type": "code", 815 | "execution_count": 15, 816 | "metadata": {}, 817 | "outputs": [], 818 | "source": [ 819 | "outputpath='./huanqiu_news_with_title_2.csv'\n", 820 | "text_new.to_csv(outputpath,sep=',',index=False,header=True)" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": null, 826 | "metadata": {}, 827 | "outputs": [], 828 | "source": [ 829 | "temp_text=text[\"微博正文(有标题切词后)\"].split(\" \")\n", 830 | "all_words=[str(i) for k in temp_text for i in k]\n", 831 | "all_words_space=\" \".join(('%s' %id for id in all_words))\n", 832 | "#all_words_space[:200]\n", 833 | "\n", 834 | "from wordcloud import WordCloud\n", 835 | "import matplotlib.pyplot as plt\n", 836 | "import matplotlib\n", 837 | "from PIL import Image\n", 838 | "from wordcloud import WordCloud, ImageColorGenerator\n", 839 | "wordcloud1 = WordCloud(\n", 840 | " font_path=\"C:/Windows/Fonts/simfang.ttf\",\n", 841 | " background_color=\"white\",\n", 842 | " width=800,\n", 843 | " height=660,\n", 844 | " max_font_size=200).generate(all_words_space)\n", 845 | "plt.imshow(wordcloud1.recolor(), interpolation=\"bilinear\")\n", 846 | "plt.axis(\"off\")\n", 847 | "wordcloud1.to_file('./wordcloud1.jpg')\n", 848 | "plt.show()" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": null, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "import jieba.analyse\n", 858 | "tag=jieba.analyse.extract_tags(all_words_space, topK=100, withWeight=True, allowPOS=())\n", 859 | "wc_tfidf ={}\n", 860 | "for i in tag:\n", 861 | " wc_tfidf[i[0]]=i[1]\n", 862 | " \n", 863 | "wordcloud2 = WordCloud(\n", 864 | " 
font_path=\"C:/Windows/Fonts/simfang.ttf\",\n", 865 | " background_color=\"white\",\n", 866 | " width=800,\n", 867 | " height=660,\n", 868 | " max_font_size=200).generate_from_frequencies(wc_tfidf)\n", 869 | "plt.imshow(wordcloud2.recolor(), interpolation=\"bilinear\")\n", 870 | "plt.axis(\"off\")\n", 871 | "wordcloud2.to_file('./result2.jpg')\n", 872 | "plt.show()" 873 | ] 874 | }, 875 | { 876 | "cell_type": "code", 877 | "execution_count": null, 878 | "metadata": {}, 879 | "outputs": [], 880 | "source": [ 881 | "import re\n", 882 | "time_all={}\n", 883 | "for i in range(1,13):\n", 884 | " month_text=0\n", 885 | " if len(str(i))==1:\n", 886 | " mon=\"0\"+str(i)\n", 887 | " else:\n", 888 | " mon=str(i)\n", 889 | " month_re='2019-'+mon\n", 890 | " \n", 891 | " time_list=text[\"发布时间\"]\n", 892 | " for j in time_list:\n", 893 | " if month_re in j:\n", 894 | " month_text=month_text+1\n", 895 | " time_all[mon]=month_text\n", 896 | "print(time_all)\n", 897 | "\n", 898 | "import matplotlib.pyplot as plt\n", 899 | "import numpy as np\n", 900 | "from scipy.interpolate import interp1d\n", 901 | "month_re_list=[]\n", 902 | "for i in range(1,13):\n", 903 | " if len(str(i))==1:\n", 904 | " mon=\"0\"+str(i)\n", 905 | " else:\n", 906 | " mon=str(i)\n", 907 | " month_re='2019-'+mon\n", 908 | " month_re_list.append(month_re)\n", 909 | "#print(\"month_re_list\",month_re_list)\n", 910 | "new_ticks =month_re_list\n", 911 | "x = [int(i[0]) for i in time_all.items()]\n", 912 | "y = [int(i[1]) for i in time_all.items()]\n", 913 | "print(\"x\",x)\n", 914 | "print(\"y\",y)\n", 915 | "print(\"month_re_list\",month_re_list)\n", 916 | "print(\"sum(y)\",sum(y))\n", 917 | "\n", 918 | "plt.figure(figsize=(10,8),dpi=50)\n", 919 | "plt.ylim(0,2000)\n", 920 | "plt.xticks(rotation = 90)\n", 921 | "plt.bar(month_re_list ,y,width=0.8)\n", 922 | "for m,n in zip(x ,y):\n", 923 | " plt.text(m-0.7, n+1, '%.0f' % n, ha='right', va= 'bottom',fontsize=11)" 924 | ] 925 | } 926 | ], 927 | "metadata": { 928 | 
"kernelspec": { 929 | "display_name": "Python 3", 930 | "language": "python", 931 | "name": "python3" 932 | }, 933 | "language_info": { 934 | "codemirror_mode": { 935 | "name": "ipython", 936 | "version": 3 937 | }, 938 | "file_extension": ".py", 939 | "mimetype": "text/x-python", 940 | "name": "python", 941 | "nbconvert_exporter": "python", 942 | "pygments_lexer": "ipython3", 943 | "version": "3.7.3" 944 | } 945 | }, 946 | "nbformat": 4, 947 | "nbformat_minor": 2 948 | } 949 | -------------------------------------------------------------------------------- /3.环球时报-看主题和微博量.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#coding: utf-8\n", 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "import re\n", 13 | "import jieba\n", 14 | "import os\n", 15 | "import time\n", 16 | "import random\n", 17 | "import jieba \n", 18 | "import sklearn\n", 19 | "from sklearn.naive_bayes import MultinomialNB \n", 20 | "import numpy as np\n", 21 | "import pylab as pl\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "import time\n", 24 | "import gensim\n", 25 | "import jieba.analyse\n", 26 | "import time" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "from gensim.models import CoherenceModel\n", 36 | "from gensim import models\n", 37 | "def tw_lda_get_tfidf(text,gamma=1.0):\n", 38 | " #1.用正文+标题作为dictionary,过滤极端值\n", 39 | " content_and_title=[i.split(\" \") for i in text[\"微博正文(无标题切词后)\"]]\n", 40 | " dictionary = gensim.corpora.Dictionary(content_and_title)\n", 41 | " dictionary.filter_extremes(no_below=15,no_above=0.8, keep_n=100000)\n", 42 | " \n", 43 | " #2.计算标题的tfidf\n", 44 | " processed_docs_title=[i.split(\" \") for i in text[\"标题(切词后)\"]]\n", 45 | " bow_corpus_title = [dictionary.doc2bow(doc) 
for doc in processed_docs_title]\n", 46 | " tfidf_title = models.TfidfModel(bow_corpus_title,normalize=False)\n", 47 | " corpus_tfidf_title = tfidf_title[bow_corpus_title]\n", 48 | "# print(\"corpus_tfidf_title\",corpus_tfidf_title)\n", 49 | " \n", 50 | " #3.计算正文+标题的tfidf\n", 51 | " processed_docs_content=[i.split(\" \") for i in text[\"微博正文(无标题切词后)\"]]\n", 52 | " bow_corpus_content = [dictionary.doc2bow(doc) for doc in processed_docs_content]\n", 53 | " tfidf_content = models.TfidfModel(bow_corpus_content,normalize=False)\n", 54 | " corpus_tfidf_content = tfidf_content[bow_corpus_content]\n", 55 | "# print(\"corpus_tfidf_content\",corpus_tfidf_content)\n", 56 | " \n", 57 | " #4.把标题和正文tfidf结合在一起\n", 58 | " new_tfidf=[]\n", 59 | " for i in range(len(corpus_tfidf_content)):#corpus_tfidf2[i]\n", 60 | " dict_2={one:two for one,two in corpus_tfidf_content[i]}\n", 61 | " dict_1={one:two for one,two in corpus_tfidf_title[i]}\n", 62 | " for j in dict_1.keys():\n", 63 | " if j in dict_2.keys():\n", 64 | " dict_2[j]=(1-gamma)*dict_2[j]+gamma*dict_1[j]\n", 65 | " new_tfidf_part=[(one,two) for one,two in sorted(dict_2.items(), key=lambda d: d[0],reverse=False)]\n", 66 | " new_tfidf.append(new_tfidf_part)\n", 67 | " \n", 68 | " #5.返回tfidf\n", 69 | " return new_tfidf,dictionary\n", 70 | "\n", 71 | "\n", 72 | "# lda的模型\n", 73 | "from gensim.models import CoherenceModel\n", 74 | "from gensim import models\n", 75 | "\n", 76 | "def lda_and_coherence_score(processed_docs,num_topics,methods,corpus_tfidf_tw_lda=False,dictionary_tw_lda=False):\n", 77 | " \n", 78 | " if methods==\"tw_lda\":\n", 79 | " dictionary=dictionary_tw_lda\n", 80 | " corpus=corpus_tfidf_tw_lda\n", 81 | " else:\n", 82 | " dictionary = gensim.corpora.Dictionary(processed_docs)\n", 83 | " dictionary.filter_extremes(no_below=15, no_above=0.8, keep_n=100000)\n", 84 | " bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]\n", 85 | " if methods==\"lda_normal\":\n", 86 | " corpus=bow_corpus\n", 87 | " elif 
methods==\"lda_tfidf\":\n", 88 | " tfidf = models.TfidfModel(bow_corpus,normalize=False)\n", 89 | " corpus_tfidf = tfidf[bow_corpus]\n", 90 | " corpus=corpus_tfidf\n", 91 | " model = gensim.models.LdaMulticore(corpus, num_topics=num_topics, id2word=dictionary,\n", 92 | " passes=20,iterations=5000\n", 93 | " #,eval_every=1\n", 94 | " )\n", 95 | " coherence = CoherenceModel(model=model, texts=processed_docs,dictionary=dictionary, coherence='c_v')\n", 96 | " score = coherence.get_coherence()\n", 97 | " logper=model.log_perplexity(corpus)\n", 98 | " return model,score,logper\n", 99 | "\n", 100 | "\n", 101 | "def cs_bar_5(c,num_topics,methods,corpus_tfidf_tw_lda=False,dictionary_tw_lda=False):\n", 102 | " cs_list=[]\n", 103 | " logper_list=[]\n", 104 | " if methods==\"tw_lda\":\n", 105 | " for i in range(5):\n", 106 | " lda,cs,logper=lda_and_coherence_score(c,num_topics,methods,corpus_tfidf_tw_lda=corpus_tfidf_tw_lda,dictionary_tw_lda=dictionary_tw_lda)\n", 107 | " cs_list.append(cs)\n", 108 | " logper_list.append(logper)\n", 109 | " else:\n", 110 | " for i in range(5):\n", 111 | " lda,cs,logper=lda_and_coherence_score(c,num_topics,methods)\n", 112 | " cs_list.append(cs)\n", 113 | " logper_list.append(logper)\n", 114 | " return lda,cs_list,logper_list " 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": { 121 | "scrolled": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "text=pd.read_csv(\"./huanqiu_news_with_title_2.csv\")\n", 126 | "drop_na=[]\n", 127 | "for i in range(text.shape[0]):\n", 128 | " if pd.isnull(text.loc[i]).any():\n", 129 | " drop_na.append(i)\n", 130 | "\n", 131 | "text=text.drop(drop_na)\n", 132 | "text=text.reset_index(drop=True)\n", 133 | "\n", 134 | "text.head()" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "c=[i.split(\" \") for i in text[\"微博正文(有标题切词后)\"]]" 144 | ] 145 | }, 146 | { 
147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "tn=50 \n", 153 | "ga=0.5\n", 154 | "\n", 155 | "a,b=tw_lda_get_tfidf(text,gamma=ga)\n", 156 | "lda3,cs3,logper3=cs_bar_5(c,\n", 157 | " tn,\n", 158 | " \"tw_lda\",\n", 159 | " corpus_tfidf_tw_lda=a,\n", 160 | " dictionary_tw_lda=b)\n", 161 | "text_topics3=lda3.get_document_topics(a)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "import pyLDAvis\n", 171 | "import pyLDAvis.gensim\n", 172 | "\n", 173 | "\n", 174 | "pyLDAvis.enable_notebook()\n", 175 | "vis = pyLDAvis.gensim.prepare(lda3, a, b)\n", 176 | "\n", 177 | "end_lda = time.clock()\n", 178 | "\n", 179 | "#pyLDAvis.show(vis)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "pyLDAvis.show(vis)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "lda3.print_topics(-1,20)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "text_topics_dict3={}\n", 207 | "for i in range(tn):\n", 208 | " text_topics_dict3[i]=[]\n", 209 | " \n", 210 | "text_topics_no_dict3={}\n", 211 | "for i in range(tn):\n", 212 | " text_topics_no_dict3[i]=[]\n", 213 | "\n", 214 | "for i in range(len(text_topics3)):\n", 215 | " dict_text_topics3=dict(text_topics3[i])\n", 216 | " #print(dict_text_topics3)\n", 217 | " for key,value in dict_text_topics3.items():\n", 218 | " if(value == max(dict_text_topics3.values()) and str(max(dict_text_topics3.values()))!=\"0.02\"):\n", 219 | " text_topics_dict3[key].append(value)\n", 220 | " text_topics_no_dict3[key].append(i)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | 
"execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "lda3.print_topics(-1,20)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "lda3.show_topic(1,20) " 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": { 245 | "scrolled": true 246 | }, 247 | "outputs": [], 248 | "source": [ 249 | "\n", 250 | "topic_id=1\n", 251 | "print(lda3.show_topic(1,20))\n", 252 | "import re\n", 253 | "time_3={}\n", 254 | "for i in range(1,13):\n", 255 | " month_text=0\n", 256 | " if len(str(i))==1:\n", 257 | " mon=\"0\"+str(i)\n", 258 | " else:\n", 259 | " mon=str(i)\n", 260 | " month_re='2019-'+mon\n", 261 | " time_list=text[\"发布时间\"][text_topics_no_dict3[topic_id]]\n", 262 | " for j in time_list:\n", 263 | " if month_re in j:\n", 264 | " month_text=month_text+1\n", 265 | " time_3[mon]=month_text\n", 266 | "print(time_3)\n", 267 | "\n", 268 | "import matplotlib.pyplot as plt\n", 269 | "import numpy as np\n", 270 | "from scipy.interpolate import interp1d\n", 271 | "\n", 272 | "plt.figure(figsize=(13,5),dpi=50)\n", 273 | "\n", 274 | "month_re_list=[]\n", 275 | "for i in range(1,13):\n", 276 | " if len(str(i))==1:\n", 277 | " mon=\"0\"+str(i)\n", 278 | " else:\n", 279 | " mon=str(i)\n", 280 | " month_re='2019-'+mon\n", 281 | " month_re_list.append(month_re)\n", 282 | "month_re_list.append('2020-01')\n", 283 | "#print(\"month_re_list\",month_re_list)\n", 284 | "new_ticks =month_re_list\n", 285 | "\n", 286 | "plt.xticks(range(1,14),new_ticks)\n", 287 | "\n", 288 | "plt.ylim(0, 250)\n", 289 | "\n", 290 | "x = [int(i[0]) for i in time_3.items()]\n", 291 | "x.append(13)\n", 292 | "\n", 293 | "y = [int(i[1]) for i in time_3.items()]\n", 294 | "y.append(0)\n", 295 | "#np.linspace(0, 10, num=11, endpoint=True)\n", 296 | "\n", 297 | "f2 = interp1d(x, y, kind='zero')\n", 298 | "\n", 299 | "xnew = np.linspace(1, 13, 
num=1001, endpoint=False)\n", 300 | "#plt.plot(x, y, 'o')\n", 301 | "plt.plot(xnew, f2(xnew), '-',color='blue')\n", 302 | "plt.legend(['Topic1', 'zero'], loc='best')\n", 303 | "\n", 304 | "x1=[int(i[0]) for i in time_3.items()]\n", 305 | "y1=[int(i[1]) for i in time_3.items()]\n", 306 | "\n", 307 | "for m,n in zip(x1,y1):\n", 308 | " plt.text(m+0.5, n+0.15, '%.0f' % n, ha='right', va= 'bottom',fontsize=11)\n", 309 | "\n", 310 | "plt.show()\n" 311 | ] 312 | } 313 | ], 314 | "metadata": { 315 | "kernelspec": { 316 | "display_name": "Python 3", 317 | "language": "python", 318 | "name": "python3" 319 | }, 320 | "language_info": { 321 | "codemirror_mode": { 322 | "name": "ipython", 323 | "version": 3 324 | }, 325 | "file_extension": ".py", 326 | "mimetype": "text/x-python", 327 | "name": "python", 328 | "nbconvert_exporter": "python", 329 | "pygments_lexer": "ipython3", 330 | "version": "3.7.3" 331 | } 332 | }, 333 | "nbformat": 4, 334 | "nbformat_minor": 2 335 | } 336 | -------------------------------------------------------------------------------- /4.五个分类微博-数据清洗和预处理.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:11: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead\n", 13 | " # This is added back by InteractiveShellApp.init_path()\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "#coding: utf-8\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import re\n", 22 | "import jieba\n", 23 | "import os\n", 24 | "import time\n", 25 | "import jieba \n", 26 | "import numpy as np\n", 27 | "from progressbar import *\n", 28 | "start1 =time.clock()" 29 | ] 30 | 
}, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def obatin_news_from_csv(path_csv):\n", 38 | " old_text=pd.read_csv(path_csv)\n", 39 | " drop_index_na=[]\n", 40 | " for i in range(old_text.shape[0]):\n", 41 | " j=old_text['微博正文'][i]\n", 42 | " if (\"抱歉\" in j or len(j)<40):\n", 43 | " drop_index_na.append(i)\n", 44 | " text=old_text.drop(drop_index_na)\n", 45 | " text=text.reset_index(drop=True)\n", 46 | " return text\n", 47 | "\n", 48 | "import re\n", 49 | "def find_title(text):\n", 50 | " text[\"标题\"]=[-99 for i in range(text.shape[0])]\n", 51 | " title_p=re.compile(r\".*\\【.*\\】\")\n", 52 | " progress = ProgressBar()\n", 53 | " for i in progress(range(len(text[\"微博正文\"]))):\n", 54 | " title_=re.match(title_p,text[\"微博正文\"][i])\n", 55 | " if title_:\n", 56 | " text[\"标题\"][i]=title_[0]\n", 57 | " text[\"微博正文\"][i]=re.sub(\".*\\【.*\\】\",\"\",text[\"微博正文\"][i])\n", 58 | " time.sleep(0.1)\n", 59 | " notitle_index=[]\n", 60 | " for i in range(text.shape[0]):\n", 61 | " if text[\"标题\"][i]==-99:\n", 62 | " notitle_index.append(i)\n", 63 | " text=text.drop(notitle_index)\n", 64 | " text=text.reset_index(drop=True)\n", 65 | " return text" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stderr", 75 | "output_type": "stream", 76 | "text": [ 77 | "Building prefix dict from the default dictionary ...\n", 78 | "Loading model from cache C:\\Users\\Public\\Documents\\Wondershare\\CreatorTemp\\jieba.cache\n", 79 | "Loading model cost 1.929 seconds.\n", 80 | "Prefix dict has been built successfully.\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "#载入词典和停用词\n", 86 | "jieba.load_userdict(r\".\\环球时报\\自定义词典.txt\")\n", 87 | "stopwords = [line.strip() for line in open(r\".\\环球时报\\停用词库.txt\",encoding='UTF-8').readlines()]\n", 88 | "def clean_text_content(text_content,cut_all=False):\n", 89 | " text_content = re.sub(r\"。\", 
\".\", text_content)\n", 90 | " text_content = re.sub(r\",\", \",\", text_content)\n", 91 | " text_content = re.sub(r\"“\", \"'\", text_content)\n", 92 | " text_content = re.sub(r\"”\", \"'\", text_content)\n", 93 | " text_content = re.sub(r\"…\", \".\", text_content)\n", 94 | " text_content = re.sub(r\"@\", \"@\", text_content)\n", 95 | " text_content = re.sub(r\" \", \" \", text_content)\n", 96 | " text_content = re.sub(r\"!\", \"!\", text_content)\n", 97 | " text_content = re.sub(r\"?\", \"?\", text_content)\n", 98 | " text_content = re.sub(r\":\", \":\", text_content)\n", 99 | " text_content = re.sub(r\")\", \")\", text_content)\n", 100 | " text_content = re.sub(r\"(\", \"(\", text_content)\n", 101 | " text_content = re.sub(r\"(\\d+年)*(\\d+月)*(\\d+[日])\", \"\", text_content) #日期\n", 102 | " text_content = re.sub(r\"\\d+[年月日天号人名时例名省市区县院]\", \"\", text_content) \n", 103 | " text_content = re.sub(r\"[第]*[零一二三四五六七八九百千万]+[年月日天号人名时例名省市区县院名例周月年]*\", \"\", text_content) \n", 104 | " text_content = re.sub(r\"[0-2]?[0-9]:[0-6][0-9]\", \"\", text_content) #时间\n", 105 | " text_content = re.sub(r\"^[-+]?[0-9]+(\\.)?[0-9]*$\", \"\", text_content) #数字\n", 106 | " text_content = re.sub(r\"@\\S*\\s\", \" \", text_content) #@小央视频\n", 107 | " text_content = re.sub(r\"\\[.+\\]\", \"\", text_content) #[组图共2张]和[加油]\n", 108 | " text_content = re.sub(r\"\\(.*\\)\", \"\", text_content) #(环球网)\n", 109 | " \n", 110 | " text_content = re.sub(r\"\\W+\\w*(视频)\\s+\", \"\", text_content) #小央视频的秒拍视频\n", 111 | " text_content = re.sub(r\"#\", \"\", text_content) ##\n", 112 | " text_content = re.sub(r\"(http|https)(://t.cn/)[a-zA-Z0-9]+\", \"\", text_content) #网址(微博上的连接都是http://t.cn/.....形式)\n", 113 | " text_content = re.sub(r\"转发理由:\", \"\", text_content) \n", 114 | " text_content = re.sub(r\"转发内容:\", \"\", text_content) \n", 115 | " text_content = re.sub(r\"原始用户:.*\", \"\", text_content) \n", 116 | " \n", 117 | " word_list=jieba.lcut(text_content,cut_all=cut_all)\n", 118 | " 
word_list_len=len(word_list)\n", 119 | " i=0\n", 120 | " while i\n", 236 | "\n", 249 | "\n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | "
微博正文标题分类
0春分已过,清明将至。农谚讲,清明前后,种瓜点豆。当前全国由南向北全面进入春耕生产大忙季节。...【农业农村部:当前春耕备耕热潮掀起进展顺利】1
1大学生村官制度是我国向农村输血的重要途径。半月谈记者在河北、山东、陕西等地调研发现,各地围绕...【大学生村官的困惑:困在农村 还是奋斗在农村?】1
2从昔日深山的羊肠小道变身今天网友盛赞的“最美赛道”,广西马山县古零镇古零村弄拉屯一条不到4公...【一个山旮旯的振兴“路”】1
3今年82岁的盖钧镒还时常卷起裤腿下到田间地头。身为中国工程院院士、大豆遗传育种学家,他更是将...#中国种业十大杰出人物#【盖钧镒:一生为了“豆满仓”】1
4又是一年春播时,广大农民和各级农业部门又开始了繁忙的春耕备耕工作。在乡村振兴政策的引导下,今...【今年春耕有啥新特点】1
\n", 291 | "" 292 | ], 293 | "text/plain": [ 294 | " 微博正文 \\\n", 295 | "0 春分已过,清明将至。农谚讲,清明前后,种瓜点豆。当前全国由南向北全面进入春耕生产大忙季节。... \n", 296 | "1 大学生村官制度是我国向农村输血的重要途径。半月谈记者在河北、山东、陕西等地调研发现,各地围绕... \n", 297 | "2 从昔日深山的羊肠小道变身今天网友盛赞的“最美赛道”,广西马山县古零镇古零村弄拉屯一条不到4公... \n", 298 | "3 今年82岁的盖钧镒还时常卷起裤腿下到田间地头。身为中国工程院院士、大豆遗传育种学家,他更是将... \n", 299 | "4 又是一年春播时,广大农民和各级农业部门又开始了繁忙的春耕备耕工作。在乡村振兴政策的引导下,今... \n", 300 | "\n", 301 | " 标题 分类 \n", 302 | "0 【农业农村部:当前春耕备耕热潮掀起进展顺利】 1 \n", 303 | "1 【大学生村官的困惑:困在农村 还是奋斗在农村?】 1 \n", 304 | "2 【一个山旮旯的振兴“路”】 1 \n", 305 | "3 #中国种业十大杰出人物#【盖钧镒:一生为了“豆满仓”】 1 \n", 306 | "4 【今年春耕有啥新特点】 1 " 307 | ] 308 | }, 309 | "execution_count": 5, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "text=text_one_news_list2[0]\n", 316 | "for i in range(1,5):\n", 317 | " text=text.append(text_one_news_list2[i])\n", 318 | "text=text.reset_index(drop=True)\n", 319 | "print(text.shape[0])\n", 320 | "text.head(5)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 6, 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "name": "stderr", 330 | "output_type": "stream", 331 | "text": [ 332 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:3: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead\n", 333 | " This is separate from the ipykernel package so we can avoid doing imports until\n", 334 | "N/A% (0 of 4993) | | Elapsed Time: 0:00:00 ETA: --:--:--C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:16: SettingWithCopyWarning: \n", 335 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 336 | "\n", 337 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 338 | " app.launch_new_instance()\n", 339 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:19: 
SettingWithCopyWarning: \n", 340 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 341 | "\n", 342 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 343 | "100% (4993 of 4993) |####################| Elapsed Time: 0:14:53 Time: 0:14:53\n", 344 | "N/A% (0 of 4993) | | Elapsed Time: 0:00:00 ETA: --:--:--C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:35: SettingWithCopyWarning: \n", 345 | "A value is trying to be set on a copy of a slice from a DataFrame\n", 346 | "\n", 347 | "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n", 348 | "100% (4993 of 4993) |####################| Elapsed Time: 0:02:43 Time: 0:02:43\n" 349 | ] 350 | }, 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "Running time: 1057.8040643 Seconds\n" 356 | ] 357 | }, 358 | { 359 | "name": "stderr", 360 | "output_type": "stream", 361 | "text": [ 362 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:38: DeprecationWarning: time.clock has been deprecated in Python 3.3 and will be removed from Python 3.8: use time.perf_counter or time.process_time instead\n" 363 | ] 364 | }, 365 | { 366 | "data": { 367 | "text/html": [ 368 | "
\n", 369 | "\n", 382 | "\n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | "
微博正文标题分类微博正文(无标题切词后)标题(切词后)微博正文(有标题切词后)
0春分已过,清明将至。农谚讲,清明前后,种瓜点豆。当前全国由南向北全面进入春耕生产大忙季节。...【农业农村部:当前春耕备耕热潮掀起进展顺利】1春分 已过 清明 农谚 清明 种瓜 点豆 当前 全国 由南向北 全面 春耕生产 大忙 季节 ...农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 春分 已过 清明 农谚 清明 ...
1大学生村官制度是我国向农村输血的重要途径。半月谈记者在河北、山东、陕西等地调研发现,各地围绕...【大学生村官的困惑:困在农村 还是奋斗在农村?】1大学生 村官 制度 农村 输血 重要途径 半月谈 记者 河北 山东 陕西 调研 各地 围绕 ...大学 大学生 学生 村官 困惑 农村 奋斗 农村大学 大学生 学生 村官 困惑 农村 奋斗 农村 大学生 村官 制度 农村 输血 重要途径 ...
2从昔日深山的羊肠小道变身今天网友盛赞的“最美赛道”,广西马山县古零镇古零村弄拉屯一条不到4公...【一个山旮旯的振兴“路”】1昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不到 公里 公...旮旯 振兴旮旯 振兴 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不...
3今年82岁的盖钧镒还时常卷起裤腿下到田间地头。身为中国工程院院士、大豆遗传育种学家,他更是将...#中国种业十大杰出人物#【盖钧镒:一生为了“豆满仓”】1今年 盖钧 时常 卷起 裤腿 田间 地头 身为 中国工程院 院士 大豆 遗传 育种学 更是 ...中国 种业 十大 杰出 杰出人物 人物 生为 满仓中国 种业 十大 杰出 杰出人物 人物 生为 满仓 今年 盖钧 时常 卷起 裤腿 田间 地头...
4又是一年春播时,广大农民和各级农业部门又开始了繁忙的春耕备耕工作。在乡村振兴政策的引导下,今...【今年春耕有啥新特点】1春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴 政策 引导 今年 春耕...今年 春耕 新特 特点今年 春耕 新特 特点 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴...
\n", 442 | "
" 443 | ], 444 | "text/plain": [ 445 | " 微博正文 \\\n", 446 | "0 春分已过,清明将至。农谚讲,清明前后,种瓜点豆。当前全国由南向北全面进入春耕生产大忙季节。... \n", 447 | "1 大学生村官制度是我国向农村输血的重要途径。半月谈记者在河北、山东、陕西等地调研发现,各地围绕... \n", 448 | "2 从昔日深山的羊肠小道变身今天网友盛赞的“最美赛道”,广西马山县古零镇古零村弄拉屯一条不到4公... \n", 449 | "3 今年82岁的盖钧镒还时常卷起裤腿下到田间地头。身为中国工程院院士、大豆遗传育种学家,他更是将... \n", 450 | "4 又是一年春播时,广大农民和各级农业部门又开始了繁忙的春耕备耕工作。在乡村振兴政策的引导下,今... \n", 451 | "\n", 452 | " 标题 分类 \\\n", 453 | "0 【农业农村部:当前春耕备耕热潮掀起进展顺利】 1 \n", 454 | "1 【大学生村官的困惑:困在农村 还是奋斗在农村?】 1 \n", 455 | "2 【一个山旮旯的振兴“路”】 1 \n", 456 | "3 #中国种业十大杰出人物#【盖钧镒:一生为了“豆满仓”】 1 \n", 457 | "4 【今年春耕有啥新特点】 1 \n", 458 | "\n", 459 | " 微博正文(无标题切词后) \\\n", 460 | "0 春分 已过 清明 农谚 清明 种瓜 点豆 当前 全国 由南向北 全面 春耕生产 大忙 季节 ... \n", 461 | "1 大学生 村官 制度 农村 输血 重要途径 半月谈 记者 河北 山东 陕西 调研 各地 围绕 ... \n", 462 | "2 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不到 公里 公... \n", 463 | "3 今年 盖钧 时常 卷起 裤腿 田间 地头 身为 中国工程院 院士 大豆 遗传 育种学 更是 ... \n", 464 | "4 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴 政策 引导 今年 春耕... \n", 465 | "\n", 466 | " 标题(切词后) \\\n", 467 | "0 农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 \n", 468 | "1 大学 大学生 学生 村官 困惑 农村 奋斗 农村 \n", 469 | "2 旮旯 振兴 \n", 470 | "3 中国 种业 十大 杰出 杰出人物 人物 生为 满仓 \n", 471 | "4 今年 春耕 新特 特点 \n", 472 | "\n", 473 | " 微博正文(有标题切词后) \n", 474 | "0 农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 春分 已过 清明 农谚 清明 ... \n", 475 | "1 大学 大学生 学生 村官 困惑 农村 奋斗 农村 大学生 村官 制度 农村 输血 重要途径 ... \n", 476 | "2 旮旯 振兴 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不... \n", 477 | "3 中国 种业 十大 杰出 杰出人物 人物 生为 满仓 今年 盖钧 时常 卷起 裤腿 田间 地头... \n", 478 | "4 今年 春耕 新特 特点 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴... 
" 479 | ] 480 | }, 481 | "execution_count": 6, 482 | "metadata": {}, 483 | "output_type": "execute_result" 484 | } 485 | ], 486 | "source": [ 487 | "\n", 488 | "\n", 489 | "start =time.clock()\n", 490 | "\n", 491 | "len_text=text.shape[0]\n", 492 | "\n", 493 | "text[\"微博正文(无标题切词后)\"]=[-99 for i in range(len_text)]\n", 494 | "text[\"标题(切词后)\"]=[-99 for i in range(len_text)]\n", 495 | "\n", 496 | "drop_na=[]\n", 497 | "\n", 498 | "progress = ProgressBar()\n", 499 | "\n", 500 | "for u in progress(range(0,len_text)):\n", 501 | " cleaned_content=clean_text_content(text[\"微博正文\"][u])\n", 502 | " text[\"微博正文(无标题切词后)\"][u]=\" \".join(cleaned_content)\n", 503 | "\n", 504 | " cleaned_title=clean_text_content(text[\"标题\"][u],True)\n", 505 | " text[\"标题(切词后)\"][u]=\" \".join(cleaned_title)\n", 506 | "\n", 507 | " if pd.isnull(text.loc[u]).any():\n", 508 | " drop_na.append(u)\n", 509 | " print(u) \n", 510 | " \n", 511 | " \n", 512 | " time.sleep(0.1)\n", 513 | "\n", 514 | "text=text.drop(drop_na)\n", 515 | "text=text.reset_index(drop=True) \n", 516 | "\n", 517 | "text[\"微博正文(有标题切词后)\"]=[-99 for i in range(len_text)]\n", 518 | "progress = ProgressBar()\n", 519 | "\n", 520 | "for u in progress(range(0,len_text)):\n", 521 | " text[\"微博正文(有标题切词后)\"][u]=text[\"标题(切词后)\"][u]+\" \"+text[\"微博正文(无标题切词后)\"][u]\n", 522 | "\n", 523 | "\n", 524 | "end = time.clock()\n", 525 | "\n", 526 | "print('Running time: %s Seconds'%(end-start))\n", 527 | "text.head()" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 7, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "outputpath='./test_news_with_title_2.csv'\n", 537 | "text.to_csv(outputpath,sep=',',index=False,header=True)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 8, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "data": { 547 | "text/html": [ 548 | "
\n", 549 | "\n", 562 | "\n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | "
微博正文标题分类微博正文(无标题切词后)标题(切词后)微博正文(有标题切词后)
0春分已过,清明将至。农谚讲,清明前后,种瓜点豆。当前全国由南向北全面进入春耕生产大忙季节。...【农业农村部:当前春耕备耕热潮掀起进展顺利】1春分 已过 清明 农谚 清明 种瓜 点豆 当前 全国 由南向北 全面 春耕生产 大忙 季节 ...农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 春分 已过 清明 农谚 清明 ...
1大学生村官制度是我国向农村输血的重要途径。半月谈记者在河北、山东、陕西等地调研发现,各地围绕...【大学生村官的困惑:困在农村 还是奋斗在农村?】1大学生 村官 制度 农村 输血 重要途径 半月谈 记者 河北 山东 陕西 调研 各地 围绕 ...大学 大学生 学生 村官 困惑 农村 奋斗 农村大学 大学生 学生 村官 困惑 农村 奋斗 农村 大学生 村官 制度 农村 输血 重要途径 ...
2从昔日深山的羊肠小道变身今天网友盛赞的“最美赛道”,广西马山县古零镇古零村弄拉屯一条不到4公...【一个山旮旯的振兴“路”】1昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不到 公里 公...旮旯 振兴旮旯 振兴 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不...
3今年82岁的盖钧镒还时常卷起裤腿下到田间地头。身为中国工程院院士、大豆遗传育种学家,他更是将...#中国种业十大杰出人物#【盖钧镒:一生为了“豆满仓”】1今年 盖钧 时常 卷起 裤腿 田间 地头 身为 中国工程院 院士 大豆 遗传 育种学 更是 ...中国 种业 十大 杰出 杰出人物 人物 生为 满仓中国 种业 十大 杰出 杰出人物 人物 生为 满仓 今年 盖钧 时常 卷起 裤腿 田间 地头...
4又是一年春播时,广大农民和各级农业部门又开始了繁忙的春耕备耕工作。在乡村振兴政策的引导下,今...【今年春耕有啥新特点】1春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴 政策 引导 今年 春耕...今年 春耕 新特 特点今年 春耕 新特 特点 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴...
\n", 622 | "
" 623 | ], 624 | "text/plain": [ 625 | " 微博正文 \\\n", 626 | "0 春分已过,清明将至。农谚讲,清明前后,种瓜点豆。当前全国由南向北全面进入春耕生产大忙季节。... \n", 627 | "1 大学生村官制度是我国向农村输血的重要途径。半月谈记者在河北、山东、陕西等地调研发现,各地围绕... \n", 628 | "2 从昔日深山的羊肠小道变身今天网友盛赞的“最美赛道”,广西马山县古零镇古零村弄拉屯一条不到4公... \n", 629 | "3 今年82岁的盖钧镒还时常卷起裤腿下到田间地头。身为中国工程院院士、大豆遗传育种学家,他更是将... \n", 630 | "4 又是一年春播时,广大农民和各级农业部门又开始了繁忙的春耕备耕工作。在乡村振兴政策的引导下,今... \n", 631 | "\n", 632 | " 标题 分类 \\\n", 633 | "0 【农业农村部:当前春耕备耕热潮掀起进展顺利】 1 \n", 634 | "1 【大学生村官的困惑:困在农村 还是奋斗在农村?】 1 \n", 635 | "2 【一个山旮旯的振兴“路”】 1 \n", 636 | "3 #中国种业十大杰出人物#【盖钧镒:一生为了“豆满仓”】 1 \n", 637 | "4 【今年春耕有啥新特点】 1 \n", 638 | "\n", 639 | " 微博正文(无标题切词后) \\\n", 640 | "0 春分 已过 清明 农谚 清明 种瓜 点豆 当前 全国 由南向北 全面 春耕生产 大忙 季节 ... \n", 641 | "1 大学生 村官 制度 农村 输血 重要途径 半月谈 记者 河北 山东 陕西 调研 各地 围绕 ... \n", 642 | "2 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不到 公里 公... \n", 643 | "3 今年 盖钧 时常 卷起 裤腿 田间 地头 身为 中国工程院 院士 大豆 遗传 育种学 更是 ... \n", 644 | "4 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴 政策 引导 今年 春耕... \n", 645 | "\n", 646 | " 标题(切词后) \\\n", 647 | "0 农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 \n", 648 | "1 大学 大学生 学生 村官 困惑 农村 奋斗 农村 \n", 649 | "2 旮旯 振兴 \n", 650 | "3 中国 种业 十大 杰出 杰出人物 人物 生为 满仓 \n", 651 | "4 今年 春耕 新特 特点 \n", 652 | "\n", 653 | " 微博正文(有标题切词后) \n", 654 | "0 农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 春分 已过 清明 农谚 清明 ... \n", 655 | "1 大学 大学生 学生 村官 困惑 农村 奋斗 农村 大学生 村官 制度 农村 输血 重要途径 ... \n", 656 | "2 旮旯 振兴 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不... \n", 657 | "3 中国 种业 十大 杰出 杰出人物 人物 生为 满仓 今年 盖钧 时常 卷起 裤腿 田间 地头... \n", 658 | "4 今年 春耕 新特 特点 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴... 
" 659 | ] 660 | }, 661 | "execution_count": 8, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "a=pd.read_csv('./test_news_with_title_2.csv')\n", 668 | "a.head()" 669 | ] 670 | } 671 | ], 672 | "metadata": { 673 | "kernelspec": { 674 | "display_name": "Python 3", 675 | "language": "python", 676 | "name": "python3" 677 | }, 678 | "language_info": { 679 | "codemirror_mode": { 680 | "name": "ipython", 681 | "version": 3 682 | }, 683 | "file_extension": ".py", 684 | "mimetype": "text/x-python", 685 | "name": "python", 686 | "nbconvert_exporter": "python", 687 | "pygments_lexer": "ipython3", 688 | "version": "3.7.3" 689 | } 690 | }, 691 | "nbformat": 4, 692 | "nbformat_minor": 2 693 | } 694 | -------------------------------------------------------------------------------- /5.五个分类微博-SVM+TFIDF SVM+加权TFIDF SVM.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 30, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import scipy\n", 10 | "from sklearn import svm\n", 11 | "from sklearn import metrics\n", 12 | "import gensim\n", 13 | "from gensim import models\n", 14 | "from gensim.models import CoherenceModel\n", 15 | "import numpy as np\n", 16 | "import pandas as pd\n", 17 | "import sklearn\n", 18 | "from sklearn.model_selection import train_test_split\n", 19 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,\\\n", 20 | " TfidfVectorizer\n", 21 | "\n", 22 | "def train_predict_evaluate_model(classifier, train_features, train_labels,\n", 23 | " test_features, test_labels):\n", 24 | " \"\"\"\n", 25 | " 训练、预测、评估 模型\n", 26 | " :param classifier: 模型\n", 27 | " :param train_features: 训练集特征\n", 28 | " :param train_labels: 训练集label\n", 29 | " :param test_features: 测试集特征\n", 30 | " :param test_labels: 测试集label\n", 31 | " :return: 预测结果\n", 32 | " \"\"\"\n", 33 | " 
classifier.fit(train_features, train_labels)\n", 34 | " predictions = classifier.predict(test_features)\n", 35 | " get_metrics(true_labels=test_labels, predicted_labels=predictions)\n", 36 | " return predictions\n", 37 | "\n", 38 | "def get_metrics(true_labels, predicted_labels):\n", 39 | " \"\"\"\n", 40 | " 分别计算预测结果的准确率、精确率、召回率、F1值,直接打印出这些结果\n", 41 | " :param true_labels: 真实label\n", 42 | " :param predicted_labels: 预测结果\n", 43 | " :return:\n", 44 | " \"\"\"\n", 45 | " print(\"accuracy:\", np.round(metrics.accuracy_score(true_labels,\n", 46 | " predicted_labels), 5))\n", 47 | " print(\"precision:\", np.round(metrics.precision_score(\n", 48 | " true_labels, predicted_labels, average='weighted'), 5))\n", 49 | " print(\"recall:\", np.round(metrics.recall_score(\n", 50 | " true_labels, predicted_labels, average='weighted'), 5))\n", 51 | " print(\"f1 score:\", np.round(metrics.f1_score(\n", 52 | " true_labels, predicted_labels, average='weighted'), 5))\n", 53 | "\n", 54 | "def sparse2dense(corpus):\n", 55 | " data = []\n", 56 | " rows = []\n", 57 | " cols = []\n", 58 | " line_count = 0\n", 59 | " for line in corpus: # lsi_corpus_total 是之前由gensim生成的lsi向量\n", 60 | " for elem in line:\n", 61 | " rows.append(line_count)\n", 62 | " cols.append(elem[0])\n", 63 | " data.append(elem[1])\n", 64 | " line_count += 1\n", 65 | " lsi_sparse_matrix = scipy.sparse.csr_matrix((data,(rows,cols))) # 稀疏向量\n", 66 | " corpus2matrix = lsi_sparse_matrix.toarray() # 密集向量\n", 67 | " return corpus2matrix" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/html": [ 78 | "
\n", 79 | "\n", 92 | "\n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | "
微博正文标题分类微博正文(无标题切词后)标题(切词后)微博正文(有标题切词后)
0春分已过,清明将至。农谚讲,清明前后,种瓜点豆。当前全国由南向北全面进入春耕生产大忙季节。...【农业农村部:当前春耕备耕热潮掀起进展顺利】1春分 已过 清明 农谚 清明 种瓜 点豆 当前 全国 由南向北 全面 春耕生产 大忙 季节 ...农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 春分 已过 清明 农谚 清明 ...
1大学生村官制度是我国向农村输血的重要途径。半月谈记者在河北、山东、陕西等地调研发现,各地围绕...【大学生村官的困惑:困在农村 还是奋斗在农村?】1大学生 村官 制度 农村 输血 重要途径 半月谈 记者 河北 山东 陕西 调研 各地 围绕 ...大学 大学生 学生 村官 困惑 农村 奋斗 农村大学 大学生 学生 村官 困惑 农村 奋斗 农村 大学生 村官 制度 农村 输血 重要途径 ...
2从昔日深山的羊肠小道变身今天网友盛赞的“最美赛道”,广西马山县古零镇古零村弄拉屯一条不到4公...【一个山旮旯的振兴“路”】1昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不到 公里 公...旮旯 振兴旮旯 振兴 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不...
3今年82岁的盖钧镒还时常卷起裤腿下到田间地头。身为中国工程院院士、大豆遗传育种学家,他更是将...#中国种业十大杰出人物#【盖钧镒:一生为了“豆满仓”】1今年 盖钧 时常 卷起 裤腿 田间 地头 身为 中国工程院 院士 大豆 遗传 育种学 更是 ...中国 种业 十大 杰出 杰出人物 人物 生为 满仓中国 种业 十大 杰出 杰出人物 人物 生为 满仓 今年 盖钧 时常 卷起 裤腿 田间 地头...
4又是一年春播时,广大农民和各级农业部门又开始了繁忙的春耕备耕工作。在乡村振兴政策的引导下,今...【今年春耕有啥新特点】1春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴 政策 引导 今年 春耕...今年 春耕 新特 特点今年 春耕 新特 特点 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴...
\n", 152 | "
" 153 | ], 154 | "text/plain": [ 155 | " 微博正文 \\\n", 156 | "0 春分已过,清明将至。农谚讲,清明前后,种瓜点豆。当前全国由南向北全面进入春耕生产大忙季节。... \n", 157 | "1 大学生村官制度是我国向农村输血的重要途径。半月谈记者在河北、山东、陕西等地调研发现,各地围绕... \n", 158 | "2 从昔日深山的羊肠小道变身今天网友盛赞的“最美赛道”,广西马山县古零镇古零村弄拉屯一条不到4公... \n", 159 | "3 今年82岁的盖钧镒还时常卷起裤腿下到田间地头。身为中国工程院院士、大豆遗传育种学家,他更是将... \n", 160 | "4 又是一年春播时,广大农民和各级农业部门又开始了繁忙的春耕备耕工作。在乡村振兴政策的引导下,今... \n", 161 | "\n", 162 | " 标题 分类 \\\n", 163 | "0 【农业农村部:当前春耕备耕热潮掀起进展顺利】 1 \n", 164 | "1 【大学生村官的困惑:困在农村 还是奋斗在农村?】 1 \n", 165 | "2 【一个山旮旯的振兴“路”】 1 \n", 166 | "3 #中国种业十大杰出人物#【盖钧镒:一生为了“豆满仓”】 1 \n", 167 | "4 【今年春耕有啥新特点】 1 \n", 168 | "\n", 169 | " 微博正文(无标题切词后) \\\n", 170 | "0 春分 已过 清明 农谚 清明 种瓜 点豆 当前 全国 由南向北 全面 春耕生产 大忙 季节 ... \n", 171 | "1 大学生 村官 制度 农村 输血 重要途径 半月谈 记者 河北 山东 陕西 调研 各地 围绕 ... \n", 172 | "2 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不到 公里 公... \n", 173 | "3 今年 盖钧 时常 卷起 裤腿 田间 地头 身为 中国工程院 院士 大豆 遗传 育种学 更是 ... \n", 174 | "4 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴 政策 引导 今年 春耕... \n", 175 | "\n", 176 | " 标题(切词后) \\\n", 177 | "0 农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 \n", 178 | "1 大学 大学生 学生 村官 困惑 农村 奋斗 农村 \n", 179 | "2 旮旯 振兴 \n", 180 | "3 中国 种业 十大 杰出 杰出人物 人物 生为 满仓 \n", 181 | "4 今年 春耕 新特 特点 \n", 182 | "\n", 183 | " 微博正文(有标题切词后) \n", 184 | "0 农业 农村 农村部 当前 春耕 备耕 热潮 掀起 进展 顺利 春分 已过 清明 农谚 清明 ... \n", 185 | "1 大学 大学生 学生 村官 困惑 农村 奋斗 农村 大学生 村官 制度 农村 输血 重要途径 ... \n", 186 | "2 旮旯 振兴 昔日 深山 羊肠小道 网友 盛赞 最美 赛道 广西 马山县 古镇 古村 屯条 不... \n", 187 | "3 中国 种业 十大 杰出 杰出人物 人物 生为 满仓 今年 盖钧 时常 卷起 裤腿 田间 地头... \n", 188 | "4 今年 春耕 新特 特点 春播 广大 农民 各级 农业部门 繁忙 春耕 备耕 工作 乡村 振兴... 
" 189 | ] 190 | }, 191 | "execution_count": 4, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "import numpy as np\n", 198 | "import pandas as pd\n", 199 | "\n", 200 | "text=pd.read_csv(\"./test_news_with_title_2.csv\")\n", 201 | "drop_na=[]\n", 202 | "for i in range(text.shape[0]):\n", 203 | " if pd.isnull(text.loc[i]).any():\n", 204 | " drop_na.append(i)\n", 205 | "\n", 206 | "text=text.drop(drop_na)\n", 207 | "text=text.reset_index(drop=True)\n", 208 | "\n", 209 | "text.head()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 7, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "1012\n", 222 | "微博正文 6日,香港中学文凭考试举行,有考生疑太紧张在考试中连放了2个小时屁,坐在放屁考生后座的同学发...\n", 223 | "标题 #高考加油#【香港#考生考试连放2小时屁# ,后座:大脑一片空白,考试严重受影响】\n", 224 | "分类 2\n", 225 | "微博正文(无标题切词后) 香港 中学 文凭 考试 举行 考生 太紧张 考试 中连放 小时 坐在 放屁 考生 后座 同学...\n", 226 | "标题(切词后) 高考 加油 香港 考生 考试 连放 小时 后座 大脑 脑片 空白 考试 严重 影响\n", 227 | "微博正文(有标题切词后) 高考 加油 香港 考生 考试 连放 小时 后座 大脑 脑片 空白 考试 严重 影响 香港 中...\n", 228 | "Name: 1012, dtype: object\n", 229 | "1198\n", 230 | "微博正文 广东深圳,香港中文大学深圳校区的大四学生吕懿惟,曾因食堂不合胃口,在大一时开网店卖简餐,获得...\n", 231 | "标题 【超厉害的小姐姐!食堂不合胃口,#女学霸卖简餐获创业奖#:博士毕业后还想创业】\n", 232 | "分类 2\n", 233 | "微博正文(无标题切词后) 广东 深圳 香港中文大学 深圳 校区 大学生 吕懿 食堂 合胃口 大开 网店 简餐 获得 学...\n", 234 | "标题(切词后) 厉害 小姐 小姐姐 姐姐 食堂 不合 合胃口 胃口 创业 博士 毕业 创业\n", 235 | "微博正文(有标题切词后) 厉害 小姐 小姐姐 姐姐 食堂 不合 合胃口 胃口 创业 博士 毕业 创业 广东 深圳 香港...\n", 236 | "Name: 1198, dtype: object\n", 237 | "1214\n", 238 | "微博正文 《经济学人》杂志根据全球各大城市生活成本,发布2019年全球“最贵”十大城市榜单,新加坡、香...\n", 239 | "标题 #美好,就在身边# 【经济学人发布2019#全球生活成本最贵城市排名#,香港首登榜首,三城并...\n", 240 | "分类 2\n", 241 | "微博正文(无标题切词后) 经济学 杂志 全球 大城市 生活 成本 全球 十大 榜单 新加坡 香港 巴黎城 并列 香港 ...\n", 242 | "标题(切词后) 美好 身边 经济 经济学 济学 学人 全球 生活 成本 最贵 排名 香港 榜首 并列\n", 243 | "微博正文(有标题切词后) 美好 身边 经济 经济学 济学 学人 全球 生活 成本 最贵 排名 香港 榜首 并列 经济学...\n", 244 | "Name: 1214, dtype: object\n", 245 | "1336\n", 246 | "微博正文 
本周教育圈发生了什么?政府工作报告中教育内容引关注;教育部部长陈宝生:教育质量是被尊敬出来的...\n", 247 | "标题 #教育有料#【本周教育圈发生了什么】\n", 248 | "分类 2\n", 249 | "微博正文(无标题切词后) 本周 教育 政府 工作 报告 教育 内容 关注 教育部 部长 陈宝生 教育 质量 尊敬 北京...\n", 250 | "标题(切词后) 教育 有料 本周 教育\n", 251 | "微博正文(有标题切词后) 教育 有料 本周 教育 本周 教育 政府 工作 报告 教育 内容 关注 教育部 部长 陈宝生...\n", 252 | "Name: 1336, dtype: object\n", 253 | "1987\n", 254 | "微博正文 香港的家庭主妇Paige有两个儿子,她每天6点半左右起床,为2个儿子准备营养丰富的花式早餐,...\n", 255 | "标题 【文艺#老妈狂做2年Ins风营养早餐#,儿子拒绝夸赞只想快点吃光,网友:直男好气哦】\n", 256 | "分类 2\n", 257 | "微博正文(无标题切词后) 香港 家庭主妇 两个 儿子 每天 半左右 起床 儿子 准备 营养 丰富 花式 早餐 营养 颜...\n", 258 | "标题(切词后) 文艺 老妈 营养 早餐 儿子 拒绝 夸赞 快点 吃光 网友 好气\n", 259 | "微博正文(有标题切词后) 文艺 老妈 营养 早餐 儿子 拒绝 夸赞 快点 吃光 网友 好气 香港 家庭主妇 两个 儿子...\n", 260 | "Name: 1987, dtype: object\n", 261 | "2276\n", 262 | "微博正文 据香港特区政府新闻网30日报道,国际货币基金组织(IMF)30日公布评估报告,再次肯定香港作...\n", 263 | "标题 【港府:#IMF肯定香港国际金融中心地位#】\n", 264 | "分类 3\n", 265 | "微博正文(无标题切词后) 香港特区政府 新闻网 国际货币基金组织 公布 评估 报告 肯定 香港 全球 金融中心 区内 ...\n", 266 | "标题(切词后) 港府 肯定 香港 国际 国际金融 金融 金融中心 中心 心地 地位\n", 267 | "微博正文(有标题切词后) 港府 肯定 香港 国际 国际金融 金融 金融中心 中心 心地 地位 香港特区政府 新闻网 国...\n", 268 | "Name: 2276, dtype: object\n", 269 | "2645\n", 270 | "微博正文 王建宙坦言,遗憾的事有两个,一个是中国移动没有成功在A股上市;二是中国移动没有做更多的国际并...\n", 271 | "标题 【中国移动原董事长王建宙:遗憾中国移动没在A股上市】\n", 272 | "分类 3\n", 273 | "微博正文(无标题切词后) 王建宙 坦言 遗憾 两个 中国移动 成功 上市 中国移动 国际 并购 当时 选择 美国 中国...\n", 274 | "标题(切词后) 中国 中国移动 移动 董事 董事长 王建宙 遗憾 中国 中国移动 移动 上市\n", 275 | "微博正文(有标题切词后) 中国 中国移动 移动 董事 董事长 王建宙 遗憾 中国 中国移动 移动 上市 王建宙 坦言 ...\n", 276 | "Name: 2645, dtype: object\n", 277 | "2895\n", 278 | "微博正文 香港电视广播有限公司(TVB)集团行政总裁李宝安16日发内部信称,由于需要重组部分业务流程以...\n", 279 | "标题 【#TVB将裁员350人#,重组业务】\n", 280 | "分类 3\n", 281 | "微博正文(无标题切词后) 香港电视广播有限公司 集团 行政 总裁 宝安 内部 信称 需要 重组 业务流程 增加 成本 ...\n", 282 | "标题(切词后) 裁员 重组 业务\n", 283 | "微博正文(有标题切词后) 裁员 重组 业务 香港电视广播有限公司 集团 行政 总裁 宝安 内部 信称 需要 重组 业务...\n", 284 | "Name: 2895, dtype: object\n", 285 | "3097\n", 286 | "微博正文 炫目的灯光,劲爆的音乐,2019年保诚国际自行车赛将香港自行车馆打造成一个派对舞台,在香港推...\n", 287 | "标题 【自行车馆变身派对舞台——保诚国际赛在港推广自行车运动】\n", 288 | "分类 4\n", 289 | "微博正文(无标题切词后) 炫目 灯光 音乐 保诚 国际 自行车赛 香港 自行车 
造成 派对 舞台 香港 推广 自行车 ...\n", 290 | "标题(切词后) 自行 自行车 行车 变身 派对 舞台 国际 推广 自行 自行车 行车 车运 运动\n", 291 | "微博正文(有标题切词后) 自行 自行车 行车 变身 派对 舞台 国际 推广 自行 自行车 行车 车运 运动 炫目 灯光...\n", 292 | "Name: 3097, dtype: object\n", 293 | "3169\n", 294 | "微博正文 安踏体育用品有限公司26日在香港召开发布会公布2018年全年业绩报告,宣布公司在2018年收...\n", 295 | "标题 【安踏发布历史最佳年报】\n", 296 | "分类 4\n", 297 | "微博正文(无标题切词后) 安踏 体育用品 有限公司 香港 召开 发布会 公布 业绩 报告 宣布 收益 亿元 同比 增长...\n", 298 | "标题(切词后) 安踏 历史 最佳 年报\n", 299 | "微博正文(有标题切词后) 安踏 历史 最佳 年报 安踏 体育用品 有限公司 香港 召开 发布会 公布 业绩 报告 宣布...\n", 300 | "Name: 3169, dtype: object\n", 301 | "3295\n", 302 | "微博正文 2019年渣打香港马拉松赛17日落幕,肯尼亚选手巴·基普图姆打破赛会纪录,夺得男子组冠军。女...\n", 303 | "标题 【渣打香港马拉松赛肯尼亚选手基普图姆破赛会纪录夺冠】\n", 304 | "分类 4\n", 305 | "微博正文(无标题切词后) 渣打 香港 马拉松赛 落幕 肯尼亚 选手 基普 图姆 打破 赛会 纪录 夺得 男子组 冠军 ...\n", 306 | "标题(切词后) 渣打 香港 马拉 马拉松 马拉松赛 肯尼 肯尼亚 尼亚 选手 基普 赛会 纪录 夺冠\n", 307 | "微博正文(有标题切词后) 渣打 香港 马拉 马拉松 马拉松赛 肯尼 肯尼亚 尼亚 选手 基普 赛会 纪录 夺冠 渣打 ...\n", 308 | "Name: 3295, dtype: object\n", 309 | "3779\n", 310 | "微博正文 这个周末,记者在广州燕子岗体育场看了一场香港足球超级联赛,主场作战的RF富力6:0狂扫梦想F...\n", 311 | "标题 【由一张A4纸想到的】\n", 312 | "分类 4\n", 313 | "微博正文(无标题切词后) 周末 记者 广州 燕子 体育场 香港 足球 超级 联赛 主场 作战 富力 狂扫 梦想 登上 ...\n", 314 | "标题(切词后) 想到\n", 315 | "微博正文(有标题切词后) 想到 周末 记者 广州 燕子 体育场 香港 足球 超级 联赛 主场 作战 富力 狂扫 梦想 ...\n", 316 | "Name: 3779, dtype: object\n", 317 | "3905\n", 318 | "微博正文 国际马联三星级场地障碍赛——中国马术巡回赛香港赛马会杯广州总决赛6日在广州黄村体育训练中心落...\n", 319 | "标题 【法国骑手登顶新年首场国际马联星级赛】\n", 320 | "分类 4\n", 321 | "微博正文(无标题切词后) 国际 马联 星级 场地 障碍赛 中国 马术 巡回赛 香港 赛马会 广州 总决赛 广州 黄村 ...\n", 322 | "标题(切词后) 法国 骑手 登顶 新年 首场 国际 星级\n", 323 | "微博正文(有标题切词后) 法国 骑手 登顶 新年 首场 国际 星级 国际 马联 星级 场地 障碍赛 中国 马术 巡回赛...\n", 324 | "Name: 3905, dtype: object\n", 325 | "4371\n", 326 | "微博正文 2019年英国泰晤士高等教育亚洲大学排名2日揭晓,清华大学排名第一,这是中国大陆高校首次名列...\n", 327 | "标题 【2019年泰晤士亚洲大学排行榜出炉 清华首次登顶】\n", 328 | "分类 5\n", 329 | "微博正文(无标题切词后) 英国 泰晤士 高等教育 亚洲 大学排名 揭晓 清华大学 排名 这是 中国 大陆 高校 首次 ...\n", 330 | "标题(切词后) 泰晤士 亚洲 大学 排行 排行榜 出炉 清华 首次 次登 登顶\n", 331 | "微博正文(有标题切词后) 泰晤士 亚洲 大学 排行 排行榜 出炉 清华 首次 次登 登顶 英国 泰晤士 高等教育 亚洲...\n", 332 | "Name: 4371, dtype: object\n", 333 | 
# Print the index and full record of every row whose title+body token string
# mentions Hong Kong ("香港").
tokens_with_title = text["微博正文(有标题切词后)"]
for row_id in range(len(text)):
    if "香港" not in tokens_with_title[row_id]:
        continue
    print(row_id)
    print(text.loc[row_id])

# Number of rows left after cleaning (the recorded run displayed 4983).
text.shape[0]

# 1.BOW+SVM
def bow_(processed_docs):
    """Convert tokenised documents into gensim bag-of-words vectors.

    Args:
        processed_docs: list of documents, each a list of token strings.

    Returns:
        A list with one ``dictionary.doc2bow`` vector per document, in input
        order, using a vocabulary fitted on ``processed_docs`` itself.
    """
    dictionary = gensim.corpora.Dictionary(processed_docs)
    # Prune the vocabulary: drop tokens found in fewer than 15 documents or in
    # more than 80% of them, then keep at most 100000 tokens.
    dictionary.filter_extremes(no_below=15, no_above=0.8, keep_n=100000)
    return [dictionary.doc2bow(doc) for doc in processed_docs]


# The title+body column stores space-separated tokens; split back into lists.
text_splited = [doc.split(" ") for doc in text["微博正文(有标题切词后)"]]
bow_corpus = bow_(text_splited)

# sparse2dense is defined elsewhere in the notebook; presumably it expands the
# sparse vectors into a dense document-term matrix -- TODO confirm.
bow_matrix = sparse2dense(bow_corpus)

train_matrix, test_matrix, y_train, y_test = train_test_split(
    bow_matrix, text["分类"], random_state=2, test_size=0.2
)

# gamma is pinned to "auto", the default in force when this cell was recorded.
# The library warned that the default becomes "scale" in scikit-learn 0.22,
# which would silently change the scores below on newer installs.
clf = svm.SVC(gamma="auto")
clf.fit(train_matrix, y_train)
predictions = clf.predict(test_matrix)
# Recorded run: accuracy 0.65396, precision 0.82983, recall 0.65396, f1 0.6674.
get_metrics(true_labels=y_test, predicted_labels=predictions)

# 2.TFIDF+SVM
def tfidf_(processed_docs):
    """Convert tokenised documents into un-normalised gensim tf-idf vectors.

    Args:
        processed_docs: list of documents, each a list of token strings.

    Returns:
        The gensim corpus obtained by applying a ``TfidfModel`` fitted on the
        documents' bag-of-words vectors to those same vectors.
    """
    dictionary = gensim.corpora.Dictionary(processed_docs)
    # Same vocabulary pruning as the bag-of-words baseline: at least 15
    # documents, at most 80% of documents, at most 100000 tokens.
    dictionary.filter_extremes(no_below=15, no_above=0.8, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    # Author's note (translated): this was changed on purpose to
    # normalize=False, so vectors are not scaled to unit length.
    tfidf = models.TfidfModel(bow_corpus, normalize=False)
    return tfidf[bow_corpus]


# The title+body column stores space-separated tokens; split back into lists.
text_splited = [doc.split(" ") for doc in text["微博正文(有标题切词后)"]]
tfidf_corpus = tfidf_(text_splited)

# sparse2dense is defined elsewhere in the notebook; presumably it expands the
# sparse vectors into a dense document-term matrix -- TODO confirm.
tfidf_matrix = sparse2dense(tfidf_corpus)

train_matrix, test_matrix, y_train, y_test = train_test_split(
    tfidf_matrix, text["分类"], random_state=2, test_size=0.2
)

# gamma is pinned to "auto", the default in force when this cell was recorded.
# The library warned that the default becomes "scale" in scikit-learn 0.22,
# which would silently change the scores below on newer installs.
clf = svm.SVC(gamma="auto")
clf.fit(train_matrix, y_train)
predictions = clf.predict(test_matrix)
# Recorded run: accuracy 0.85256, precision 0.85891, recall 0.85256, f1 0.85381.
get_metrics(true_labels=y_test, predicted_labels=predictions)
# 3. W-TFIDF+SVM
def tw_lda_get_tfidf(text, gamma):
    """Build title-weighted tf-idf vectors for every document.

    For each token that occurs in both a document's title vector and its
    title+body vector, the weight becomes
    ``(1 - gamma) * content_weight + gamma * title_weight``; all other
    weights are left as they are. With ``gamma == 0`` the result is the plain
    title+body tf-idf.

    Args:
        text: DataFrame with the space-separated token columns
            "微博正文(有标题切词后)" (title + body) and "标题(切词后)" (title).
        gamma: weight given to the title tf-idf in the blend.

    Returns:
        A ``(new_tfidf, dictionary)`` tuple: ``new_tfidf`` is a list with one
        entry per document, each a list of ``(token_id, weight)`` pairs sorted
        by token id; ``dictionary`` is the filtered gensim Dictionary.
    """
    # 1. Vocabulary comes from title + body; extremes are filtered out
    #    (fewer than 15 documents, more than 80% of documents, cap 100000).
    content_and_title = [doc.split(" ") for doc in text["微博正文(有标题切词后)"]]
    dictionary = gensim.corpora.Dictionary(content_and_title)
    dictionary.filter_extremes(no_below=15, no_above=0.8, keep_n=100000)

    # 2. tf-idf of the titles alone. The model is fitted on the title corpus,
    #    so its idf reflects document frequencies among titles only.
    processed_docs_title = [doc.split(" ") for doc in text["标题(切词后)"]]
    bow_corpus_title = [dictionary.doc2bow(doc) for doc in processed_docs_title]
    tfidf_title = models.TfidfModel(bow_corpus_title, normalize=False)
    corpus_tfidf_title = tfidf_title[bow_corpus_title]

    # 3. tf-idf of title + body. The token lists from step 1 are reused rather
    #    than splitting the same column a second time.
    bow_corpus_content = [dictionary.doc2bow(doc) for doc in content_and_title]
    tfidf_content = models.TfidfModel(bow_corpus_content, normalize=False)
    corpus_tfidf_content = tfidf_content[bow_corpus_content]

    # 4. Blend the title weights into the title+body weights, per document.
    new_tfidf = []
    for content_vec, title_vec in zip(corpus_tfidf_content, corpus_tfidf_title):
        weights = dict(content_vec)
        for token_id, title_weight in dict(title_vec).items():
            # Title tokens missing from the title+body vector are ignored.
            if token_id in weights:
                weights[token_id] = (1 - gamma) * weights[token_id] + gamma * title_weight
        # Token ids are unique, so sorting the pairs orders them by token id.
        new_tfidf.append(sorted(weights.items()))

    # 5. Return the blended vectors together with the vocabulary.
    return new_tfidf, dictionary


def wtfidf_svm(text, gamma):
    """Train an SVM on title-weighted tf-idf features and report its scores.

    Args:
        text: DataFrame accepted by ``tw_lda_get_tfidf`` that also has the
            label column "分类".
        gamma: title weight forwarded to ``tw_lda_get_tfidf``. This is not the
            SVM kernel coefficient, which is fixed below.

    Returns:
        None. Scores are reported through ``get_metrics`` (defined elsewhere;
        the recorded output shows accuracy, precision, recall and f1): first
        for the training split, then a "++++++++++" line, then for the
        held-out 20% split.
    """
    weighted_tfidf, _dictionary = tw_lda_get_tfidf(text, gamma=gamma)

    # sparse2dense is defined elsewhere in the notebook; presumably it expands
    # the sparse vectors into a dense document-term matrix -- TODO confirm.
    wtfidf_matrix = sparse2dense(weighted_tfidf)

    train_matrix, test_matrix, y_train, y_test = train_test_split(
        wtfidf_matrix, text["分类"], random_state=2, test_size=0.2
    )

    # Kernel gamma is pinned to "auto", the default in force when the results
    # were recorded; the library warned it becomes "scale" in 0.22.
    clf = svm.SVC(gamma="auto")
    clf.fit(train_matrix, y_train)

    # Scores on the data the model was fitted on.
    predictions = clf.predict(train_matrix)
    get_metrics(true_labels=y_train, predicted_labels=predictions)

    print("++++++++++")

    # Scores on the held-out split.
    predictions = clf.predict(test_matrix)
    get_metrics(true_labels=y_test, predicted_labels=predictions)
Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 574 | " \"avoid this warning.\", FutureWarning)\n" 575 | ] 576 | }, 577 | { 578 | "name": "stdout", 579 | "output_type": "stream", 580 | "text": [ 581 | "accuracy: 0.9719\n", 582 | "precision: 0.97238\n", 583 | "recall: 0.9719\n", 584 | "f1 score: 0.97195\n", 585 | "++++++++++\n", 586 | "accuracy: 0.85256\n", 587 | "precision: 0.85891\n", 588 | "recall: 0.85256\n", 589 | "f1 score: 0.85381\n", 590 | "--------------\n", 591 | " \n", 592 | "0.1\n" 593 | ] 594 | }, 595 | { 596 | "name": "stderr", 597 | "output_type": "stream", 598 | "text": [ 599 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 600 | " \"avoid this warning.\", FutureWarning)\n" 601 | ] 602 | }, 603 | { 604 | "name": "stdout", 605 | "output_type": "stream", 606 | "text": [ 607 | "accuracy: 0.9714\n", 608 | "precision: 0.97185\n", 609 | "recall: 0.9714\n", 610 | "f1 score: 0.97144\n", 611 | "++++++++++\n", 612 | "accuracy: 0.85557\n", 613 | "precision: 0.86118\n", 614 | "recall: 0.85557\n", 615 | "f1 score: 0.85668\n", 616 | "--------------\n", 617 | " \n", 618 | "0.2\n" 619 | ] 620 | }, 621 | { 622 | "name": "stderr", 623 | "output_type": "stream", 624 | "text": [ 625 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. 
Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 626 | " \"avoid this warning.\", FutureWarning)\n" 627 | ] 628 | }, 629 | { 630 | "name": "stdout", 631 | "output_type": "stream", 632 | "text": [ 633 | "accuracy: 0.9709\n", 634 | "precision: 0.97133\n", 635 | "recall: 0.9709\n", 636 | "f1 score: 0.97094\n", 637 | "++++++++++\n", 638 | "accuracy: 0.85356\n", 639 | "precision: 0.85887\n", 640 | "recall: 0.85356\n", 641 | "f1 score: 0.85464\n", 642 | "--------------\n", 643 | " \n", 644 | "0.3\n" 645 | ] 646 | }, 647 | { 648 | "name": "stderr", 649 | "output_type": "stream", 650 | "text": [ 651 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 652 | " \"avoid this warning.\", FutureWarning)\n" 653 | ] 654 | }, 655 | { 656 | "name": "stdout", 657 | "output_type": "stream", 658 | "text": [ 659 | "accuracy: 0.97065\n", 660 | "precision: 0.97114\n", 661 | "recall: 0.97065\n", 662 | "f1 score: 0.9707\n", 663 | "++++++++++\n", 664 | "accuracy: 0.85356\n", 665 | "precision: 0.85913\n", 666 | "recall: 0.85356\n", 667 | "f1 score: 0.85481\n", 668 | "--------------\n", 669 | " \n", 670 | "0.4\n" 671 | ] 672 | }, 673 | { 674 | "name": "stderr", 675 | "output_type": "stream", 676 | "text": [ 677 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. 
Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 678 | " \"avoid this warning.\", FutureWarning)\n" 679 | ] 680 | }, 681 | { 682 | "name": "stdout", 683 | "output_type": "stream", 684 | "text": [ 685 | "accuracy: 0.9704\n", 686 | "precision: 0.97085\n", 687 | "recall: 0.9704\n", 688 | "f1 score: 0.97044\n", 689 | "++++++++++\n", 690 | "accuracy: 0.85456\n", 691 | "precision: 0.86079\n", 692 | "recall: 0.85456\n", 693 | "f1 score: 0.85589\n", 694 | "--------------\n", 695 | " \n", 696 | "0.5\n" 697 | ] 698 | }, 699 | { 700 | "name": "stderr", 701 | "output_type": "stream", 702 | "text": [ 703 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 704 | " \"avoid this warning.\", FutureWarning)\n" 705 | ] 706 | }, 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "accuracy: 0.96939\n", 712 | "precision: 0.96989\n", 713 | "recall: 0.96939\n", 714 | "f1 score: 0.96945\n", 715 | "++++++++++\n", 716 | "accuracy: 0.85557\n", 717 | "precision: 0.86319\n", 718 | "recall: 0.85557\n", 719 | "f1 score: 0.85706\n", 720 | "--------------\n", 721 | " \n", 722 | "0.6\n" 723 | ] 724 | }, 725 | { 726 | "name": "stderr", 727 | "output_type": "stream", 728 | "text": [ 729 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. 
Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 730 | " \"avoid this warning.\", FutureWarning)\n" 731 | ] 732 | }, 733 | { 734 | "name": "stdout", 735 | "output_type": "stream", 736 | "text": [ 737 | "accuracy: 0.96864\n", 738 | "precision: 0.9691\n", 739 | "recall: 0.96864\n", 740 | "f1 score: 0.96869\n", 741 | "++++++++++\n", 742 | "accuracy: 0.85858\n", 743 | "precision: 0.86559\n", 744 | "recall: 0.85858\n", 745 | "f1 score: 0.85994\n", 746 | "--------------\n", 747 | " \n", 748 | "0.7\n" 749 | ] 750 | }, 751 | { 752 | "name": "stderr", 753 | "output_type": "stream", 754 | "text": [ 755 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 756 | " \"avoid this warning.\", FutureWarning)\n" 757 | ] 758 | }, 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "accuracy: 0.96713\n", 764 | "precision: 0.96767\n", 765 | "recall: 0.96713\n", 766 | "f1 score: 0.9672\n", 767 | "++++++++++\n", 768 | "accuracy: 0.85757\n", 769 | "precision: 0.86437\n", 770 | "recall: 0.85757\n", 771 | "f1 score: 0.85884\n", 772 | "--------------\n", 773 | " \n", 774 | "0.8\n" 775 | ] 776 | }, 777 | { 778 | "name": "stderr", 779 | "output_type": "stream", 780 | "text": [ 781 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. 
Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 782 | " \"avoid this warning.\", FutureWarning)\n" 783 | ] 784 | }, 785 | { 786 | "name": "stdout", 787 | "output_type": "stream", 788 | "text": [ 789 | "accuracy: 0.96513\n", 790 | "precision: 0.96572\n", 791 | "recall: 0.96513\n", 792 | "f1 score: 0.96519\n", 793 | "++++++++++\n", 794 | "accuracy: 0.85557\n", 795 | "precision: 0.86325\n", 796 | "recall: 0.85557\n", 797 | "f1 score: 0.85705\n", 798 | "--------------\n", 799 | " \n", 800 | "0.9\n" 801 | ] 802 | }, 803 | { 804 | "name": "stderr", 805 | "output_type": "stream", 806 | "text": [ 807 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n", 808 | " \"avoid this warning.\", FutureWarning)\n" 809 | ] 810 | }, 811 | { 812 | "name": "stdout", 813 | "output_type": "stream", 814 | "text": [ 815 | "accuracy: 0.96237\n", 816 | "precision: 0.96306\n", 817 | "recall: 0.96237\n", 818 | "f1 score: 0.96247\n", 819 | "++++++++++\n", 820 | "accuracy: 0.84654\n", 821 | "precision: 0.85562\n", 822 | "recall: 0.84654\n", 823 | "f1 score: 0.84846\n", 824 | "--------------\n", 825 | " \n", 826 | "1.0\n" 827 | ] 828 | }, 829 | { 830 | "name": "stderr", 831 | "output_type": "stream", 832 | "text": [ 833 | "C:\\Users\\dell-pc\\Anaconda3\\lib\\site-packages\\sklearn\\svm\\base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. 
# Sweep the title weight from 0 to 1 in steps of 0.1 and report the SVM scores
# for each setting.
# Recorded accuracies (train / test) from the original run:
#   0.0: 0.9719  / 0.85256    0.1: 0.9714  / 0.85557    0.2: 0.9709  / 0.85356
#   0.3: 0.97065 / 0.85356    0.4: 0.9704  / 0.85456    0.5: 0.96939 / 0.85557
#   0.6: 0.96864 / 0.85858    0.7: 0.96713 / 0.85757    0.8: 0.96513 / 0.85557
#   0.9: 0.96237 / 0.84654    1.0: 0.96036 / 0.84152
for ga in (0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0):
    print(ga)
    wtfidf_svm(text, ga)
    # Separator line followed by a line holding a single space.
    print("--------------", " ", sep="\n")
"iVBORw0KGgoAAAANSUhEUgAAAr0AAAG7CAYAAADOlS1YAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAAPYQAAD2EBqD+naQAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzs3Xl8nlWd///XJ2nSJmmTtKWlC5R9ayuKLFpkVVBgRBlwhhnBYYSfqIx8FXS+U8QBBdlcwA0RxEFFQZwBK4sUvgqiFWUVhi4gYmmhGy1tk7ZJmjY5vz/uO2mSJu2dNOvV1/PxyOPOde5zzn3uXmnyzsm5zhUpJSRJkqQsKxroAUiSJEl9zdArSZKkzDP0SpIkKfMMvZIkSco8Q68kSZIyz9ArSZKkzDP0SpIkKfMMvZIkSco8Q68kSZIyz9ArSZKkzBuw0BsRF0TEwohoiIhnIuLobdQtiYjLIuKVfP3nI+KkHelTkiRJO48BCb0RcSbwDeAq4BDg98CDETGliyZfBj4OXAhMBb4H/CIiDtmBPiVJkrSTiJRS/79oxBPAsymlT7YpWwDMSild0kn9pcBVKaUb25TNAtanlM7uSZ+SJEnaeQzr7xeMiFLgUODaDk89DBzZRbPhQEOHsnrgqB3ok4gYnu+7rTHA6q7aSJIkacCNApambsze9nvoBXYBioEVHcpXABO6aPMQcHFE/A54BXgP8MF8Pz3tE+AS4PKCRy5JkqTBYjdgSaGVByL0tuiYzKOTshafBr4PvJiv8wpwG/DRHegT4Brg+jbHo4DXX3vtNSorK7fRTJIkSQOhtraW3XffHWBdd9oNROhdBTSx9QzseLaeqQUgpbQSOC0iRgBjgaXkljIs7Gmf+X43AhtbjiMCgMrKSkOvJElShvT77g0ppUbgGeDEDk+dCDy+nbYNKaUl5ML6GcAvd7RPSZIkZd9ALW+4Hrg9Ip4G/gicD0whtxUZEfFjYEnLrgsR8Q5gMvBc/vGL5AL7VwrtU5IkSTuvAQm9KaW7ImIscBkwEZgLnJJSWpSvMgVobtNkBLm9evcG1gO/Aj6SUlrbjT4lSZK0kxqQfXoHq4ioBGpqampc0ytJGlJSSmzevJmmpqaBHoq0Q4qLixk2bFjrtVYd1dbWUlVVBVCVUqottN+B3L1BkiT1gsbGRpYtW0ZdXd1AD0XqFeXl5UycOJHS0tJe69PQK0nSENbc3MzChQspLi5m0qRJlJaWdjlDJg12KSUaGxtZuXIlCxcuZL/99qOoqHf2XTD0SpI0hDU2NtLc3Mzuu+9OeXn5QA9H2mFlZWWUlJSwaNEiGhsbGTFiRK/02+9blkmSpN7XW7Nh0mDQF1/P/g+RJElS5hl6JUmSlHmGXkmSJGWeoVeSJAGwrKaex19ZxbKa+j59nQsvvJBzzz2Xl19+md13352//vWvXHfddVx55ZWce+65XH311ey3337ccsstzJgxg7q6Ou677z5GjRrFf//3fzNr1iy+9rWvsffee7Nx40YA7r33XmbOnMnhhx/OrFmzeP311wH49re/zZQpU7jzzjv51re+xU033QTApk2bmDlzJocddhizZs3i7rvv5rOf/SynnHLKdse/dOlSvv71r/PDH/6Qz372swwfPpzGxkZeeuklpk6dyuWXX976Gh/60Ie46aabeOaZZ6iqqmLRoi33zJoxYwYf/ehHaWxs7PK1XnnlFY444gj+/d//nVmzZvGTn/yEo48+mjvvvBOARYsW8aMf/Yi99967W+fgjjvu4L/+67/4zne+w6GHHsqll17arfZDUkrJj/wHUAmkmpqaJEnSUFBfX5/mz5+f6uvrd6ifnz25KO018/60x3/cn/aaeX/62ZOLemmEW/vNb36TrrjiipRSSsccc0xqampK//M//5Ouuuqq1NzcnBYuXJiOPfbYlFJKc+fOTXPmzEkppbTHHnu06+epp55KDzzwQOvxo48+ms4555x
2ddr2lVJKX/3qV9O3vvWtLut/+9vfTm+++Wbr8bPPPpueffbZdnU+/OEPpw0bNrQef/e7302LFuX+va688srW/lNK6Rvf+Ebr54ceemi68847U0q587bPPvuk2267rbN/onbOOeec9Oijj7Yeb9y4MV1zzTXt6kyZMmW7/bT485//nL74xS+2Hq9fvz5deeWVBbfvD9v6uq6pqUlAAipTN3KeW5ZJkpRBp357DivXbSyoblNzYuX6LXWbE/zH3S/wtYf+QnFR4Xv+jhs1nPsuPGq79Q488EBuu+02ADZu3MjixYupq6tj5syZW+0xPG3aNJqbmzvt57DDDmPz5s0Fjw/g3HPP5e1vfzsXXnhhp89fcMEF7Y5ramq2qrNo0aKWyTIAPvnJT7Z+fvzxx3PDDTe09l9cXNz63LHHHssf//hH/umf/oknnniCY445pltjb1FaWsrnPve5dmXd2Zt54cKF7epXVFTwhS98oUdjGUoMvZIkZdDKdRtZXtuwY32sLyw0d9ekSZNYvnw5mzZtYu3atbz88stA19tUdVZ+88038/GPf5xhw7oXZcaMGcP69etZs2ZNl31uzyc+8QmOP/54vvCFL3Dqqae2C5BHHHEEzzzzDCklXn31Vfbbb7/W5w4++GB++MMfArng2d0lCQANDQ38/Oc/51/+5V+63bbFe9/7Xr7+9a9TU1PDZz/7WSZNmtTu+T//+c88/PDDHHDAATz++ON8+tOfZvLkyQDcfffdrFy5kvHjx7NgwQIuvvhiysrKuOeee7j33nvZc889OfTQQ3nttdf4wQ9+wJ/+9CdKSkp44oknePrpp6moqGD+/PlcdNFFTJw4scfvoScMvZIkZdC4UcMLrttxpre1j5HDuz3TW6iI4LnnnuPMM8/k5ZdfZtSoUdttU1dXx6xZs9i0aRPf/OY3CwqonSkrK2u9ZfPixYuZNWsWa9as4d57723t89prr2Xz5s0sXLgQgDlz5jBs2DBmzpzJ2WefzZFHHsl1113H5ZdfzvXXX8/xxx8PQElJCfvvvz8LFizg6aef5h/+4R9aX7e4uJiSkhIaGhq6vQ/tnDlzWLt2LQ8++CAzZszo0ftuUVFRwWOPPcYdd9zB3/3d33HUUUfx1a9+lREjRlBbW8vFF1/Mb37zG4qKinj99deZPXs25513Hs899xyPPPIIN954I5ALx5///Oe54YYbOP300xkzZgxXXHEF//iP/8ipp57K1KlTKSkpoaamhltvvZXvf//7AMybN4/Pfe5z/PSnP92h99Fdhl5JkjKokGUGbd311GI+f89cmlKiOIKrT5/OmYdP6aPRwcSJE3nkkUc47bTTuPHGG/nABz6w3Tbl5eWcdtppALzxxhs9fu26ujqqq6sBmDJlSmufLUEYYObMmQD89re/BeC4445r18fee+/NzTffzF//+lfOOOMMfvrTnzJ9+nQgt8Tht7/9LSUlJZSVlbVrd+ihh/Lkk0+2W/ZQiKOOOorjjjuOE044gbvvvrtbbTtTXFzMRz7yEc466yy+8pWvcO6553LHHXcwZ84cDj744NZQ/qlPfaq1zZ133skJJ5zQenzIIYdw1llntev3kEMOYerUqcCWf7OWwD5r1iwgd+vslpnj/uTuDZIkiTMPn8Kcmcdz58feyZyZx/dp4IXcut7nn3+eAw88kF/84hetgbFQPZ3lXbFiBRMmTKCioqLHfc6bN6/183333ZcvfelLPPTQQ61lxx9/PI888gilpaVbtZ0xYwa33HILBx98cA9GDyNHjtwqaHbX/PnzW9ckFxUVMXPmTF577TUANm/e3OWSkebm5q2e6xjeO5uxb25uZty4cZx22mmcdtppnH766XzlK1/ZoffQE4ZeSZIEwMSqMmbsM5aJVWXbr7yDDjzwQCZMmEBZWRmTJ09mr7326lb77q7lbXHzzTdz8cUXF9zn6NGjGT16dLuy6667rnWrNMhtYfaWt7yl9fiwww7j17/+Ne94xzu
26m/GjBnMnj272yF/e+Nse2Hd9rz44ov8/Oc/bz3evHkz48ePB+Cd73wnTz75ZLv+7rrrLgDOOOMM5syZ01r+0ksvcfTRR2/39d75znfypz/9qfWiw+bmZu6///6Cx9tbXN4gSZL63UEHHUR9fW4/4A996EOtF4Nt3LiRe+65h0WLFvGHP/yBd73rXQD84he/YM2aNfz+97/vNGjde++9PP7448ybN49Zs2Zx2GGHsdtuu3HPPffw6quvcuedd7Jo0SImTJjAeeedR0NDA7Nnz2bBggW8/PLL7S44a+utb33rVmURwbve9S7OPvtsKisrgdzFYS2Ki4s5++yzOfDAA1vLnn/+eWbPns3pp5/ORz7yEd58802eeuopXnnlFT784Q93OisMuWA5f/58Zs+ezYwZMxg+vP266UWLFvHAAw+wePFivvSlL3H66ae3C+CdaVmC8Pjjj/O2t72NlStXcvXVVwMwfvx4Pv/5z3PRRRexzz770NTUxBlnnAHkwuvChQu57bbbqK6u5pVXXmmdsX3kkUd4+OGHeeGFF5g1axbHH388VVVVAIwbN47rrruOCy+8sHXpwznnnLPNMfaF6M5vBlkXEZVATU1NTesXsSRJg1lDQwMLFy5kr732YsSIEQM9HKlXbOvrura2tiVQV6WUagvt05leSZKkATRnzhxWrVrV6XMnn3zyVrO7hWi5aKyjUaNG8Z73vKfb/WWBoVeSJGkAHXVU93baKETLjhTawgvZJEmSlHmGXkmSMsBrdJQlffH1bOiVJGkIKykpAdrfWEEa6lq+nlu+vnuDa3olSRrCiouLqa6ubr1DWXl5eev2X9JQk1Kirq6ON954g+rq6m7fuW5bDL2SJA1xEyZMAHbs1rzSYFJdXd36dd1bDL2SJA1xEcHEiRMZP348mzZtGujhSDukpKSkV2d4Wxh6JUnKiOLi4j4JC1IWeCGbJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwz9EqSJCnzDL2SJEnKPEOvJEmSMs/QK0mSpMwbsNAbERdExMKIaIiIZyLi6O3U/0xEvBQR9RHxWkTcEBEj2jw/LCK+nO+zPiL+FhGXRYTBXpIkaSc3bCBeNCLOBL4BXAD8Afg48GBETE0pLe6k/lnAtcC5wOPA/sAP809flH/8D+ATwDnAPOAw4DagBvhmX70XSZIkDX4DEnqBi4EfpJRuzR9/JiLeB3wSuKST+jOAP6SU7sgfvxoRdwJHdKjzy5TSA23q/DO58CtJkqSdWL//6T8iSoFDgYc7PPUwcGQXzeYAh0bEEfk+9gZOAR7oUOc9EbF/vs5bgaOAX21jLMMjorLlAxjVg7ckSZKkQW4gZnp3AYqBFR3KVwATOmuQUvpZRIwD5kREkBv3TSmla9tUuw6oAl6MiKb8a1yaUrpzG2O5BLi8Z29DkiRJQ8VAXuSVOhxHJ2W5JyKOAy4ltwb47cDpwPsj4j/bVDsTOBv4cL7OOcDnIuKcbYzhGnJBueVjt26/C0mSJA16AzHTuwpoYutZ3fFsPfvb4krg9jZrgF+IiArgloi4KqXUDHwVuDal9LM2dfYgN5v7o846TSltBDa2HOcmkSVJkpQ1/T7Tm1JqBJ4BTuzw1InkdmboTDnQ3KGsidzscGynjluWSZIk7eQGaveG64HbI+Jp4I/A+cAU4HsAEfFjYElKqWUnh/uAiyPiz8ATwL7kZn/vTSk1talzaUQsJrd
l2SHkdon4r/55S5IkSRqsBiT0ppTuioixwGXARGAucEpKaVG+yhTaz9p+mdx63y8Dk4GV5ENumzoXkgvC3yW3VGIpcDNwRd+9E0mSJA0FkVKn147tlPLbltXU1NRQWVk50MORJElSB7W1tVRVVQFUpZRqC23neldJkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRlnqFXkiRJmWfolSRJUuYZeiVJkpR5hl5JkiRl3oCF3oi4ICIWRkRDRDwTEUdvp/5nIuKliKiPiNci4oaIGNGhzuSI+ElEvBkRdRHxXEQc2rfvRJIkSYPdsIF40Yg4E/gGcAHwB+DjwIMRMTWltLiT+mcB1wLnAo8D+wM/zD99Ub7O6HxfjwInA28A+wBr+/K9SJIGzrKaehau2sBeu1QwsapsoIcjaRAbkNALXAz8IKV0a/74MxHxPuCTwCWd1J8B/CGldEf++NWIuBM4ok2d/wBeSyl9tE3Zq707bEnSYLC5qZnvPPpXvvnrl0lAAOcdtRenHTKZ0RWljC4voaykmIgY6KFKGiR6HHojohIYnVJaFBGRUkoFtisFDiU3c9vWw8CRXTSbA5wdEUeklJ6MiL2BU4AftanzAeChiPhv4FhgCfDdlNL3tzGW4cDwNkWjCnkPkqT+07CpiReXr2PukhrmLa1l/tIa5i+rZVPTlh87Cbh1zkJunbOwtax0WBHVZSWMLi+lujz3OLqihOryXCiuLsuX50NydXkp1WUlDCv2chcpi3oUeiPiA+Rma/9Ibmb2lIhYmVJ6soDmuwDFwIoO5SuACZ01SCn9LCLGAXMi92v7MOCmlFLb4Lw3uZni64Gryc0CfysiNqaUftzFWC4BLi9gzJKkflDbsIn5S2uZu6SG+Utrmbe0lr+uXE9Tc0HzKu00bm7mjXUbeWPdxm61GzViWGtQbgnIbYNzZ+Ujhw9zVlka5Ho60zs1pXRcRHwQIKX0QER8HCgk9Lbo+B0sOinLPRFxHHApuTXATwD7At+MiGUppSvz1YqAp1NKn88f/zkippELwl2F3mvIheQWo4DXu/EeJEk9tHLdRuYtzc3ezltaw9wltSxeXbfddhGwe3UZi9fUty8HPvC2STRubmZNXSNr6zaxpq6RNXWbaNzcXPC41jVsZl3DZhavLvy9lBQHVWUtM8YdQ/GWmeSOj6XDtj2r7Jplqff0NPR2FgwrCmy7Cmhi61nd8Ww9+9viSuD2NmuAX4iICuCWiLgqpdQMLAPmd2i3ADijq4GklDYCrVMA/pYuSb0vpcTra+rbBNzcTG4hM7DDioL9dh3F9EmVTJtUybTJVRw0sZKRw4dx11OL+fw9c2lKieIIrj59OmcePqXTfuobm/IBuH0Yrsk/ti1veayp30RhC/dgU1Ni1fqNrFrfvVnlitLiXAiuyAXkqvxyjNHlJSxctYH7/3cZCSgK+OKp0/iXI/fsVv+Stuhp6B0XEWPJz8xGxLHAfoU0TCk1RsQ
zwInAL9o8dSLwyy6alQMdf01vIveLfUtS/QNwQIc6+wOLChmXJGnHNTUnFq5az9wlte1Cbk39pu22HVFSxEETK5k+qSoXcCdVsf+EkQwfVtxp/TMPn8Ix+4/j1VV17LlL+TZnQstKiykrLWNSdeGzpc3NidqGTW1CcSNrNmwJxmvrc4G5pXxtPkDXb2oq+DU2NDaxobGeJWvrt1mvOcFl987jaw+/xO5jyplUXcbk6jJ2G13W+vnk0WWMrSh1AkfqQhR4/Vn7RhEjgeuAd5MLpHOAT6eUVhXY/kzgduAT5NYFnw98DJiWvzDux8CSlNIl+fpfJLeG+Hy2LG+4CXgmpXRmvs7h5LYzuxz4Obk1vd8Hzk8p/bTAcVUCNTU1NVRWVhbSRJJ2Whs3N/GX5etbw+3cpTW8uGxdQaGvcsQwpk2qYvrkytbHvXYZSXHR0A9sDZuaqKnPzya3CcMtwTk3k9xS3hKgN/Vo3XJHw4cVtQbgSVW5x5bjydVlTKgaQYkX6mmIq62tpaqqCqAqpVRbaLsehd7eEBEXAP8XmAjMBS5KKf0u/9xvgVdTSv+aPx5Gbk3vR4DJwErgPuDSlNLaNn2+n9w63f2AhcD129q9oZMxGXolqRPrN25mwbJa5i1pCbi1vLxiHZsLCGrjRw1n+uSW2dtcyN1tdJkzkm2klKht2ExNPhy/snI9n/3581td6DKhcgQr12/scUAuCti1ckS72eFJ1WXs1iYYVwwfqN1MpcL0a+iNiPcCz6WU3uh240HM0CtJsHpD45bZ2/wuCgvf3FDQ+tYpY8pbZ2+n5kPu+FEjtt9QW+lqzXJTc2JFbQNL1tazdG09r6/JLY9YsiZ3vGRtPXWNhS+x6KiqrKTd7HDbcDy5uoxdRrqEQgOrv0Pv/cDFKaW/dLvxIGbolbQzSSmxrKah3e4J85fWsLSmYbttiwL2HT+S6flwO31y7rFyREk/jHznsaymvqA1y22llFhbtykXhDuE4ZagvGp9Y4/HVNqyhKKTQLzb6DJ2rRyx3V0ppB3R36H3Eyml73Uom5ZSmtftzgYRQ6+krGnZ8mqPMeVs3Nzcuva2ZQ/c1Ru2H35KhxVx0IRRTM1fYDZ9chUHThjFiJLOLzDT4Newqak1AC9pM1PcEoyX1zQUtHSlMxGw66gR7cJwbtZ4BJOry5k8uoyRbZZQuC2buqu/Q+9e5O6edkfLndgi4oyU0t3d7mwQMfT2Lr+RSf1jU1Nz60VTqzfkLo5avaGR3/1lJQ/P72onyM6NHD6sdVlCywVm+4wb6cVPO5mm5sQb6xraBeGWz1uC8oYdWEJROWIYk0eXA/DistrWbdmuOf0tXW47J7Xo79A7D1hPbsuyOnLbhu2TUhrSX6mG3t5z11OLueSeF2hOfiOTuqO5OVFTv4nVdY2s2dDYJsTmLnB6c/2WUNvyuK5hc49ea2xFKdPyF5i1bBM2ZUw5RRnYQUF9K6Xc1+nrbZdOrKlnac2WcNyTJRRFAX+Y+W4nSrRN/R16L0op3dChzJnendjausbWdYFPv7qm09mld+0zlv0njGKPMeXsMbaCKWPL2W10WZd7cEpDXUqJ9Rs352Zg6xpZvWFjLrxuaOwk1G7Z87UXdq7q0ofevhsnv2UC0yZVsWvlcC9IUp9p2NTUGohbZodfX9s+JHf2tX7iQbty3YcOZkxFaf8PWkNCf4feg1JKCzqU7ZNSeqXbnQ0iht7tSymxonYjc5dsuXXovKW1291YvSsRMKmqjCljytljbDlTxpazx5iK1s+9KEZ9qbtLcOobmzoPq60hdlOHENvIpqa+S7BVZSWMqcjdvSv3WJp7rCilCLhm9ovtdlwojmDOzOOdRdOg8PqaOo75yqOdBt/KEcP4zAn785EZe7i0Rlvp9316I6IMOCZ/+HhKaV2POhpEDL3tNTcnFq2ua72qe17+4pc3C7jwpbeMLi9hytiK/OxweT4c50Lx+FHOUqnnfvKnRVz2y7k0p9wvXx8
+YgoHTqzsZPZ1S5jtzp22uquitJjRFaXtw2t5KWNHthyXtAu11WUlDNtOGOjObXqlgdD2azSAkuKgsc0vivuMq+A/3z+V4w4YP3CD1KDT3zO9BwNXAy8Cm8ndIe3qlNKz3e5sENmZQ++mpmZeXrG+zW1Da1iwbB3rN25/rWB5aTFTJ2658GXa5EqeW7yWy345r/WH7Zf/fjrHHTCORW/WsfjNOhat3pD7fHUdi96sK+gWpR2NKCliyphypoyp6DBTXM5uo8vdMmcnsampmbV1m6hpvSVsbu1ryyb/a+vb3Ca29fNGGjZ3vLN57ykdVsTYDjOvY8pLOg21YypKqS4v6bOdEHqy5ZXUn9p+jRZH8JWHXuJ/nnm9XZ3jDxjHF94/lX3GjRygUWow6e/Q+7mU0tc6lG21jdlQs7OE3rrGzSxYto75LTO4y2r4y/L1NDZtPwSMqShl2qTK/NXdVUyfVMmeYys6vfClOz9sa+o2dQjCWz5fXttQ0Kb4bRUFTKwqY4+xLTPEFW1missZ5bKJQacpfwFXyy1ba+rbB9XWMFu/acutW+s2FfSL2Y4oLopOZ1rHlOcf25S3fJSVFPtXCGkHPP/aWq64fz7PLFrTWjasKDjnyD35P+/Zj6oyv4fvzPo79J6WUprVoeyDKaVfdruzQSSLobftBWYtSxQWrtpQ0IUyk6vLttq6aELliH7/Yd6wqYnX1+RmhNuF4tV1vL66vqCw3tGYitLWALzHmPLcEor85+NcNtGqJ9vONTcn1m3cvHVQ3dAy67ol2Ladea3t4Q4E3VFSHFSXlzJyeDELV9W1ey6Az59yIHuPG9ku1FaOGObXgzQAUkrc+/xSrn3wRZa1uWHKmIpSLj5xf/75iCkUu9PITqm/Q+8XgOtTSnX542HARSmlr3a7s0FkKIfelBLLaxuYt6S22xeYRcDeu1Tklia0LFGYVMnoIXDlbMvtOHNhODc7vGh1fgnFmxt6FKTKSopzyybyITi3bCK3rnjy6LJ2F1VkZS/ilBJNzYmmlGhuhqaU+O+nX+PK++e3rnk991178bbdq9uE2U2srW9sDbFr24TYvtx9AHKzr9VlJVSV52ZZq8tKqC7PLRMYXV5CVXmjdOrtAAAgAElEQVTu4q7R5aVUlZW0roEtL90yA+t6V2loqG9s4nuPvcLNv3uFhk1bJjkOnDCKy94/lSP33WUAR6eB0N+hdzfgFmAjuTW9FcCnUkp/63Zng8hQCb3NzYlX39yQD7dbAm5Bd1YqLmL/CSOZNjG39nbapEoOnFBJRZu742TJ2rrGNkG4fSheXrv9W612VFwUTKoewR5jKtjU1MyTC1eTyM0Svm/6rkyfVEVTPjQ2t4bItoEy0ZzotLwp0UlZrn6n5fnHpmbalTWnNuF1q7pt2+fHkVK3l4/0lgioHNFFUM2H2NxH7rnqslKqK0oYWTqsV/aSdb2rNHQsXVvPtQ++yL3PL21X/r5pu3LpKVOZMrZ8gEam/jYQuzcEcABQBLyYUuq7q0L6yWAMvY2bm3n5jXXMW1qbv21obgeFQu6EU1Fa3Lr2tmWZwn7jR3mBV17DpiZey19I1xqK84H4tTV1fbrVVBaNGj6M6op8MG0XVNvOwpa2m52tLCvxz5OSuuXpV1fzpfvm88KSmtay0uIizjt6L/7t+H3b3eJY2dTvobddJxFjU0pv7nBHA6y/Q2/HP43nLjDLz9528wKzsRWlrQF3Wj7gdnWBmbavqTmxrKY+v9NEXbvlE4vfrGNdH188VYiI3L6rRUVBcQTFRZEryx+3LS8q6qxuUNyhvOVxc3MzT726pv3rAZ86fl+mjC3fEmjz4baqrMS9NCX1m+bmxN3Pvs5XHnqJles2tpaPGzWcf3/fAXzo7bv58y/D+nt5wz8C56WU3pc/Ph4oTSk91O3OBpH+DL3/NWchVz4wv/XPyuNGlrJqQ2NBf2aeXF3Wbu3ttAG6wGxnlVLipeXrOPlbv293vooCrj39YMa
NGt4mRNImeLYJodF5EO2qXVFsXd7X59s1r5IGu/UbN/PdR//Krb9f2G6C6C2Tq7j81KkctueYARyd+kp/h95/A37WdnbXLcsKt6ymnhnXPFLAeGCfcSNbZ26nTapi6sShcYHZzmBnCIWueZU0FCx+s46rf7WA2fOWtys/9a2TmHnygUyu9vtXlvR36N1qe7KIuCildEO3OxtE+iv0Pv7KKj78/Se2Kt9rlwresdeY/D64VRw0cRTlpa5NGswMhZI0eDz+yiquuG8+Ly7fcpPYESVFnH/MPnzi2L39mZoRA7Fl2a9TSn/KHx8MnJ9S+lS3OxtE+nOm98hrH9nqT+N/mPlug5MkSTugqTnxs6cW8/WH/9JuV6OJVSOYefKBfOCtk1wOOMT1d+gtB74KnAg0A88Bn04preh2Z4NIf67p3Rn+NC5J0kCpqd/Et3/zMj98/FU2t9k8/O1Tqrn81Gm8dffqARyddsSA7t6QFQOxe4N/Gpckqe+8snI9Vz2wgEdefKNd+Rlv343/e9IB7Fo5YoBGpp4akNAbEbsARwPzU0ov9bijQWIw7tMrSZJ23GN/WcmV98/nr2+sby0rLy3m347fl/OO2osRJcUDODp1R09Db8Eba0bE4oi4ss3xwcBjwBTgXyPizG6MV5Ikqd8cu/84Hvz00Vx+6lQqR+QuaKtrbOKrD73ECdc/xoMvLMO/fmdbwTO9EXFzSunjbY5/B1yUUnomf/yxlNL3+2aY/cOZXkmSsm/NhkZu+PVf+MmfFtFmuS/v2GsMl506lWmTqgZucNquPp/pBf7Y8klEHAMsbQm8ee4DIkmSBr3RFaVc8cHpPPjpYzhq311ay59YuJr3f3sOl9zzAqvWb9xGDxqKuhN6qwAit8/HF4GrOjx/ei+NSZIkqc8dMGEUt593BN//l8PYc2w5ACnBnU8u5viv/pbv/+5vNG5u3k4vGiq6E3ofioi7gSeB2SmlFwAiYp+I+ClQ0hcDlCRJ6isRwYlTd+Whi47hkpMPZOTw3B+u123czFW/WsD7vvE7frNghet9M2CHtyyLiNKUUuP2aw5+rumVJGnntnLdRr7+8Evc9fRr7W4idfR+u3DZ+6ey366jBm5wAtynt1cYeiVJEsDcJTVccd98nnx1dWtZcVHwkXfuwWdO2I/q8tIBHN3ObcBDb0R8IKV0b690NkAMvZIkqUVKiQdeWMY1v3qRJWvrW8ury0u46IT9OesdUxhW3J2VouoNfR56I+Jstr0G+KyU0vsKfeHByNArSZI6atjUxPd/9ze++9tXqN/U1Fq+3/iRXHbqVI7eb9wAjm7n0x+h9y5gAdBZgwD+PqX01kJfeDAy9EqSpK4sr2ngutkv8os/L2lXfsJB47n076ay1y4VAzSynUt/hN6/Tyn9YhvPn5FSurvQFx6MDL2SJGl7nl28hivum89zr61tLSspDv71yD258D37UTnCDa360oCv6c0CQ68kSSpEc3Pil88v4doHX2RF7ZYbWYytKOVz7zuAfzxsd4qLYgBHmF39EnojojqltHb7NYcmQ68kSeqODRs3873HXuGW3/2NjW1uZDF1YiWXnTqVPcaWs3DVBvbapYKJVWUDONLs6I/lDfcDv0opfbdnQxz8DL2SJKknXltdx7UPvsgDLyzr9PmigGtOfwtnHj6ln0eWPT0Nvd3ZZ+PFlsAbEedHxH0R8cmIcK8OSZK0U9t9TDk3nvV27jr/nUybtPXEWXOCmfe8wAtLMvsH80GvOzO9p6eU7mlz/K8ppR/21cAGgjO9kiRpRzU1J657cAG3/H5hp88fvudoTpo+kfdN25XdRpf38+iGvv6Y6d3c4Xh124OI2KcbfUmSJGVScVHw0aP2oqvr2J56dQ1X3j+fo657lFO/PYcbH/0rf1u5vn8HuRPqzkzvU8ALbYqmAItbngYOTCnN6N3h9S9neiVJUm+566nFfP6euTSlRFHAcfuPZ9HqDbyyckOn9fffdSQnTZvASdM
nctDEUUS4+0Nn+uNCtmuA2duoclJK6ZJCX3gwMvRKkqTetKymnldX1bHnLuWtuzf89Y11PPjCcmbPW868pZ1ntj3GlnPStAm8b/oE3rZbNUVuf9aqP0LvsJRSxyUObZ8fnlLa2NXzQ4GhV5Ik9afFb9bx0LzlPDh3Gc8u7vwitwmVI3jftF05afpEDt9zNMOKd+49BAb85hRZ2MPX0CtJkgbKitoGHpq3nNlzl/PEwtU0NW+d0cZUlPLeqbvyvukTeNc+u1A6bOcLwIMh9H4gpXRvr3Q2QAy9kiRpMFi9oZFfz1/B7HnLmfPyKhqbmreqM2r4MN5z0HhOmj6RY/cfR1lp8QCMtP/1aeiNiMOBk7dVBTgspXRqoS88GBl6JUnSYFPbsIlHX3yD2XOX89uXVlK/qWmrOmUlxRx3wDhOmj6Bdx84nlEjSgZgpP2jr0PvnsDH8UI2SZKkAVPf2MRjf1nJQ/OW8+sFK1jXsPXlVqXFRbxr37GcNH0CJ06dwJiK0gEYad/pjwvZTkkp/Wobz5+aUrqv0BcejAy9kiRpqGjc3Mzjr6xi9tzlPDx/Bas3NG5VpyjgHXuN5eS3TOC9UycwoWrEAIy0dw34mt4sMPRKkqShaHNTM08vWsPsubkL4ZbXNnRa7+1Tqjlp+gROmjaRKWOH5t3gDL29wNArSZKGuubmxPOvr2V2fieIRW/WdVpv6sRKTp4+gZOmT2C/XUf18yh7rl9Db0RMAL4GvJ5SmhkRfwc8llIa0vfQM/RKkqQsSSnx4vJ1PDh3OQ/NXc5LK9Z1Wm+fcRWcNH0CJ0+fyLRJlYP6bnD9HXqvBX4MHJBS+kXk/mUuSCnd2O3OBhFDryRJyrK/rVzfOgP8v6/XdFpnt9Fl+dshT+DtU0YPurvB9XfoPSel9KOI+GBK6Zf5sn8z9EqSJA0NS9bW81B+DfBTi1bTWSQcN2p47m5w0ybyjr3HUDII7gbX36H3kymlm1puSBERRcB3UkoXdLuzQcTQK0mSdkYr123k4fm5APzHV95kcyd3g6suL+GEg3bl5OkTeNe+uzCipJhlNfUsXLWBvXapYGJVWb+Mtb9D7xHA/wdUASuBdwL/kVL6Tbc7G0QMvZIkaWe3tq6R3yx4gwfnLud3L6+kcfPWd4MbOXwY+4yr4H9fryGR2xrtmtPfwpmHT+nz8fX77g35gHgUUAT8KaW0qkcdDSKGXkmSpC02bNzMoy/l7gb36ItvsKFx67vBtSgK+MPMd/f5jG9PQ++wnr5g/kVab1YREbumlFb0tD9JkiQNLhXDh/H+gyfx/oMn0bCpiTkvr2L2vOU8+MKyrQJwc4JXV9X12zKH7ioo9EbEXsBHtlNtvwLqSJIkaQgaUVLMCVN35YSpu/J/3rMvx371t+0ufiuOYM9dBu8NLwq9BG8JUA08lv+oAv7U5ngV8L99MUBJkiQNLlPGVHDt6W+hOL+fb1HA1adPH7SzvFDgTG9KqTEi/jOltAEgIg5KKT3cpspjEfHxPhmhJEmSBp0zD5/CMfuP49VVdey5S/mgDrzQjTW9LYE3b+9Oqkze8eFIkiRpqJhYVTbow26Lnl7INjsiHgOeBhqBqcBdvTYqSZIkqRf1KPSmlB6JiOeAGfk+vp1SWtqrI5MkSZJ6yY5sWbY6Ih7Jf17fe0OSJEmSelePQm9EjAa+S25tb1NEvA78W0ppZW8OTpIkSeoNPZ3pvQC4KKW0HCAiqoFzget7a2CSJElSbyl0n96O5rcEXoCU0lrgb93pICIuiIiFEdEQEc9ExNHbqf+ZiHgpIuoj4rWIuCEiRnRR95KISBHxje6MSZIkSdnU09A7pZOy8YU2jogzgW8AVwGHAL8HHoyIzvolIs4CrgW+BBwEnAecCVzTSd3DgfPxZhmSJEnK62no/XNE/DwiPh0R/xYRtwELu9H+YuAHKaVbU0oLUkqfAV4DPtlF/RnAH1JKd6SUXs3fGONO4LC2lSJiJPBT4GPAmu6+KUm
SJGVTwaE3It7T8nlK6XfAp4EGoBm4PP9YSD+lwKHAwx2eehg4sotmc4BDI+KIfB97A6cAD3SodyPwQErp1wWOZXhEVLZ8AKMKaSdJkqShpTsXsv2/iHgDqG1TFvnHi4ExwNgC+tkFKAZWdChfAUzorEFK6WcRMQ6YExGRH/dNKaVrWwcS8U/kwvRhnfXRhUvIBXZJkiRlWHdC74HkAmUpkIBlwOMppfXQupa2O1KH4+ikjHzfxwGXkts14glgX+CbEbEspXRlROwOfBN4b0qpoRtjuIb2O06MAl7vRntJkiQNAQWH3pTSX4C/tBznZ17fnV9qsBu5JQhPFdDVKqCJrWd1x7P17G+LK4HbU0q35o9fiIgK4JaIuIrcDO944JncRDCQm00+JiI+BQxPKTV18p42AhvbvKcChi9JkqShplv79OaXFhxM7sKyI4E9gJXAn4A3CukjpdQYEc8AJwK/aPPUicAvu2hWztZrhpvIzQ4H8BvgLR2evw14Ebius8ArSZKknUfBoTd/y+G9gKeBPwI3A6+2qXIs8HiB3V0P3B4RLX2dT24btO/lX+vHwJKU0iX5+vcBF0fEn9myvOFK4N58oF0HzO0w3g3AmymlduWSJEna+XRnpvcP5PbJbdt2n/znAexfaEcppbsiYixwGTCRXGA9JaW0KF9lCu1ndr9Mbr3vl4HJ5GaX7yO3zleSJEnapkip02vHdkr5bctqampqqKysHOjhSJIkqYPa2lqqqqoAqlJKtdur36KnN6eQJEmShgxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/RKkiQp8wy9kiRJyrwBC70RcUFELIyIhoh4JiKO3k79z0TESxFRHxGvRcQNETGizfOXRMRTEbEuIt6IiFkRcUDfvxNJkiQNdgMSeiPiTOAbwFXAIcDvgQcjYkoX9c8CrgW+BBwEnAecCVzTptqxwI3AO4ETgWHAwxFR0UdvQ5IkSUNEpJT6/0UjngCeTSl9sk3ZAmBWSumSTup/BzgopfSeNmVfB45IKXU6QxwR44A3gGNTSr8rcFyVQE1NTQ2VlZXdek+SJEnqe7W1tVRVVQFUpZRqC23X7zO9EVEKHAo83OGph4Eju2g2Bzg0Io7I97E3cArwwDZeqir/uHobYxkeEZUtH8CoAt6CJEmShphhA/CauwDFwIoO5SuACZ01SCn9LD9zOycigty4b0opXdtZ/Xyd64E5KaW52xjLJcDl3Ry/JEmShpiB3L2h47qK6KQs90TEccClwAXA24HTgfdHxH920fd3gIOBf97OGK4hNyPc8rFbIQOXJEnS0DIQM72rgCa2ntUdz9azvy2uBG5PKd2aP34hf4HaLRFxVUqpuaViRHwb+ABwTErp9W0NJKW0EdjYpm233ogkSZKGhn6f6U0pNQLPkNthoa0Tgce7aFYONHcoayI3OxyQW9KQv+DtdODdKaWFvTZoSZIkDWkDMdM
LufW2t0fE08AfgfOBKcD3ACLix8CSNjs53AdcHBF/Bp4A9iU3+3tvSqkpX+dG4MPAB4F1EdEyk1yTUqrvh/ckSZKkQWpAQm9K6a6IGAtcBkwE5gKnpJQW5atMof3M7pfJrff9MjAZWEkuCF/apk7L9me/7fByHwV+2IvDlyRJ0hAzIPv0Dlbu0ytJkjS4DZl9eiVJkqT+ZuiVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGWeoVeSJEmZZ+iVJElS5hl6JUmSlHmGXkmSJGXegIXeiLggIhZGRENEPBMRR2+n/mci4qWIqI+I1yLihogYsSN9SpIkaecwIKE3Is4EvgFcBRwC/B54MCKmdFH/LOBa4EvAQcB5wJnANT3tU5IkSTuPSCn1/4tGPAE8m1L6ZJuyBcCslNIlndT/DnBQSuk9bcq+DhyRUjq6J312Ma5KoKampobKysoevjtJkiT1ldraWqqqqgCqUkq1hbYb1ndD6lxElAKHkpu5beth4Mgums0Bzo6II1JKT0bE3sApwI92oE8iYjgwvE3RKMj9Y0qSJGnw6WlO6/fQC+wCFAMrOpSvACZ01iCl9LOIGAfMiYggN+6bUkotIbfbfeZdAlzesXD33Xff3nuQJEnSwBoFDN6Z3jY6rquITspyT0QcB1wKXAA8AewLfDMilqWUruxJn3nXANd3KBsDrN7myHvPKOB1YDdgXT+9pnqX53Bo8/wNfZ7Doc9zOLQN1PkbBSztToOBCL2rgCa2noEdz9YztS2uBG5PKd2aP34hIiqAWyLiqh72SUppI7CxQ3G/rW3ITVoDsK47a1I0eHgOhzbP39DnORz6PIdD2wCev26/Vr/v3pBSagSeAU7s8NSJwONdNCsHmjuUNZGbyY0e9ilJkqSdxEAtb7geuD0ingb+CJwPTAG+BxARPwaWtNl14T7g4oj4M1uWN1wJ3JtSaiqkT0mSJO28BiT0ppTuioixwGXARGAucEpKaVG+yhTaz+x+mdza3C8Dk4GV5ILwpd3oczDaSG7v4Y5LLDR0eA6HNs/f0Oc5HPo8h0PbkDl/A7JPryRJktSfBuw2xJIkSVJ/MfRKkiQp8wy9kiRJyjxDryRJkjLP0CtJkqTMM/T2sYi4ICIWRkRDRDwTEUdvp/4ZETE/IjbmH/++v8aqrXXn/EXExyLi9xGxJv/x64g4oj/Hq6119/9gm3b/FBEpImb19Ri1bT34PlodETdGxLJ8mwURcUp/jVdb68E5/ExEvBQR9RHxWkTcEBEj+mu82iIijomI+yJiaf574mkFtDk2f54bIuJvEfGJ/hjr9hh6+1BEnAl8A7gKOAT4PfBgREzpov4M4C7gduCt+cefR8Q7+mfEaqu75w84DrgTOB6YASwGHo6IyX0/WnWmB+ewpd0ewNfy9TWAevB9tBT4f8CewIeAA4CPAUv6Y7zaWg/O4VnAteT2fj0IOA84E7imXwasjiqA54FPFVI5IvYCfkXuPB8CXA18KyLO6LM
RFsh9evtQRDwBPJtS+mSbsgXArDZ3m2tb/y6gMqV0cpuy2cCalNI/98eYtUV3z18n7YuBNcCnUko/7ruRqis9OYf58/YYcBtwNFCdUtruzIb6Rg++j34C+HfgwJTSpv4bqbrSg3P4HeCglNJ72pR9HTgipVTQX2rUNyIiAX+fUuryL2ARcR3wgZTSQW3Kvge8NaU0ox+G2SVnevtIfrbhUODhDk89DBzZRbMZndR/aBv11Ud6eP46KgdKgNW9ODQVaAfO4WXAypTSD/pqbCpMD8/hB8jdiv7GiFgREXMj4vP5X2bUz3p4DucAh7YsD4uIvYFTgAf6apzqVV1lmcMiomQAxtNqQG5DvJPYBSgGVnQoXwFM6KLNhG7WV9/pyfnr6Fpyf1L9dS+OS4Xr9jmMiHeR+1Pq2/p2aCpQT/4f7g28G/gpuaC0H3AjuZ93V/TNMLUN3T6HKaWfRcQ4YE5EBLlzd1NK6do+Hal6S1dZZhi5r4dl/T6iPENv3+u4fiQ6KduR+upbPTofEfF/gX8GjkspNfTFwFSwgs5hRIwCfgJ8LKW0qj8GpoJ15/9hEfAGcH5KqQl4JiImkVvyYOgdOAWfw4g4DrgUuAB4AtgX+GZELEspXdmXg1Sv6ex8d1berwy9fWcV0MTWv8mOZ+vfgFos72Z99Z2enD8AIuJzwOeBE1JK/9s3w1MBunsO9yF38dN9ucklIL8ELCI2AweklF7pk5GqKz35f7gM2JQPvC0WABMiojSl1Nj7w9Q29OQcXgncnlK6NX/8QkRUALdExFUppea+Gap6SVdZZjPwZv8PZwvX9PaR/DfWZ4ATOzx1IvB4F83+2En9926jvvpID88fEfHvwH8CJ6WUnu67EWp7enAOXwTeQm5pQ8vHvcCj/397dxciVRnHcfz7J5GKhYqKsNUVIUIiiiiQDLwMyqtaIoOgsAJDKASxixAMNq0blxJCuqnsJlOJqJvosndcohehm14IKYmK0F2Ndst/F89xHWzbcLaZc/bM9wMPO3vOmXOeh4eZ/c0zz3O2eny0Z5XVnLp8HX4AXBMRnX/frgWOGXj7r8s+vBg4N9j+RRktjH8erob5tywzUfvi0sy09KhQbrEyDWyk3HZlHJgCVlb79wG7Oo5fS/kk9ASwuvo5A6ypuy2DWLrov23AH8Ao5VPumTJUd1sGtZxvH87x/JcpK8xrb8ugli5ehyuASWAPJeyup4woPll3Wwa1dNGHO4ATwAZgFSVAfQ3sr7stg1iAIc4OBCSwpXo8Uu3fBezrOH4VcBLYXfX3xqr/R+tui9Mbeigz90fE5ZTV4MuAI8Cdmfl9dcgIHZ9mM/PDiNgAjFG+3vkGuDczP+lvzQXn33+U+WdLgYPnnOopypu4+qyLPlTDdPE+ejQibqcEqy8oi0mfA57ta8U1q4vX4RglXI0Bw8DPwFuUeb7qv1so33idsbv6+QrwIKVPZ++5nJnfVf8MZhzYDPwIPJaZh/pS23l4n15JkiS1nnN6JUmS1HqGXkmSJLWeoVeSJEmtZ+iVJElS6xl6JUmS1HqGXkmSJLWeoVeSJEmtZ+iVJElS6xl6JanhImJFRIxHxKaIeC8ittVdJ0labAy9ktR8LwJPZ+Ze4A7gyprrI0mLjqFXkhosItYBP2TmLwCZOQW8Xm+tJGnxWVJ3BSRJ81oHHOnckJmHI+ICYCvwLTANrAV2ZubxiHgH+Bw4DlwNfEkZIb4fODjPvtPAFuCr6vEaYHtmzvS4jZLUc4ZeSWq2YWBiju3Lgcsy8wBARAxTwutrwEeUYPsmMJGZmyNiGXDjf+z7E5jMzEPVOW8FbgY+7mH7JKkvDL2S1Gy/AZcCRMRS4FHgJuAFYE9EPAxMAquBTzufl5mnI+JE9Xty9j1/zn2Z+X5E/BoRj1BGgkeAC3vYNknqG0OvJDXbYeAugMycjog3gLspYXgvcF9mTkXERQu9UETcBmwCHqqudd1CzylJTeFCNklqtreBGyJiqGN
bAuuBd6uFbbMi4qoFXOse4EBmTv+P55SkRnCkV5IaLDNnqikMz0TEZ5SpDhVMINsAAACLSURBVM9TRoB3RsQDlEVnVwDXA8co0x9ORcQpYHlErKy2/T7PvpPAS8DjEXFJdfklwCjwKvBTXxosST0SmVl3HSRJkqSecnqDJEmSWs/QK0mSpNYz9EqSJKn1DL2SJElqPUOvJEmSWs/QK0mSpNYz9EqSJKn1DL2SJElqPUOvJEmSWs/QK0mSpNb7G8IjxO4dJUxCAAAAAElFTkSuQmCC\n", 881 | "text/plain": [ 882 | "
" 883 | ] 884 | }, 885 | "metadata": { 886 | "needs_background": "light" 887 | }, 888 | "output_type": "display_data" 889 | } 890 | ], 891 | "source": [ 892 | "import matplotlib.pyplot as plt\n", 893 | "from matplotlib import font_manager\n", 894 | "my_font = font_manager.FontProperties(fname='C:/Windows/Fonts/simfang.ttf')\n", 895 | "\n", 896 | "\n", 897 | "f1_wtfidf={0.0:0.85381,0.1:0.85668,0.2:0.85464,0.3:0.85481,0.4:0.85589,0.5:0.85706,0.6:0.85994,0.7:0.85884,0.8:0.85705,0.9:0.84846,1.0:0.84365}\n", 898 | "f1_wtfidf_values=[i for i in f1_wtfidf.values()]\n", 899 | "f1_wtfidf_keys=[i for i in f1_wtfidf.keys()]\n", 900 | "\n", 901 | "f1_bow=[0.6674 for i in range(len(f1_wtfidf_keys))]\n", 902 | "f1_tfidf=[0.7998 for i in range(len(f1_wtfidf_keys))]\n", 903 | "\n", 904 | "plt.figure(figsize=(8,5),dpi=100)\n", 905 | "plt.plot(f1_wtfidf_keys,f1_wtfidf_values,label=\"WTFIDF+SVM_F1_Score\",marker=\".\",linewidth=2)\n", 906 | "# plt.plot(f1_wtfidf_keys,f1_bow,label=\"BOW+SVM_F1_Score\",linewidth=2)\n", 907 | "# plt.plot(f1_wtfidf_keys,f1_tfidf,label=\"TFIDF+SVM_F1_Score\",linewidth=2)\n", 908 | "\n", 909 | "\n", 910 | "plt.ylim(0.8,0.9)\n", 911 | "plt.xlabel('Gamma', fontproperties=my_font)\n", 912 | "plt.ylabel('ModelF1Score', fontproperties=my_font)\n", 913 | "plt.legend(prop=my_font)" 914 | ] 915 | }, 916 | { 917 | "cell_type": "code", 918 | "execution_count": null, 919 | "metadata": {}, 920 | "outputs": [], 921 | "source": [] 922 | } 923 | ], 924 | "metadata": { 925 | "kernelspec": { 926 | "display_name": "Python 3", 927 | "language": "python", 928 | "name": "python3" 929 | }, 930 | "language_info": { 931 | "codemirror_mode": { 932 | "name": "ipython", 933 | "version": 3 934 | }, 935 | "file_extension": ".py", 936 | "mimetype": "text/x-python", 937 | "name": "python", 938 | "nbconvert_exporter": "python", 939 | "pygments_lexer": "ipython3", 940 | "version": "3.7.3" 941 | } 942 | }, 943 | "nbformat": 4, 944 | "nbformat_minor": 2 945 | } 946 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP-TopicAnalysis 2 | This project proposes the Title-Weighted TF-IDF based on VSM and combines it with the LDA model and SVM model, using labelled texts obtained by crawlers on Sina Weibo (the biggest social media platform in China, similar to Twitter), and conducts three experiments: topic clustering, topic classification, and topic analysis. 3 | 4 | - [NLP-TopicAnalysis](#nlp-topicanalysis) 5 | * [Background](#background) 6 | * [Dataset](#dataset) 7 | + [Raw data](#raw-data) 8 | + [Processed data](#processed-data) 9 | * [Topic clustering - dataset 1](#topic-clustering---dataset-1) 10 | + [1. Decide topic numbers - based on xLDA](#1-decide-topic-numbers---based-on-xlda) 11 | + [2. Decide value of parameter gamma](#2-decide-value-of-parameter-gamma) 12 | + [3.Topic clustering simple analysis - take TW-LDA as example](#3topic-clustering-simple-analysis---take-tw-lda-as-example) 13 | * [Topic classification - dataset 2](#topic-classification---dataset-2) 14 | + [SVM modeling](#svm-modeling) 15 | * [Topic analysis - dataset 1 and 2](#topic-analysis---dataset-1-and-2) 16 | + [Combine LDA and SVM modeling](#combine-lda-and-svm-modeling) 17 | * [Conclusion and future work](#conclusion-and-future-work) 18 | + [Conclusion](#conclusion) 19 | + [Future work](#future-work) 20 | 21 | 22 | 23 | ## Background 24 | 25 | This repository includes source code, scripts, and data for the project "Topic Analysis of Weibo News Based on Title-Weighted LDA Model". Completed in May 2020. The uploaded code can reproduce the results, and the further standardization of the code is still in progress. 
26 | 27 | 这里存放着一些所做的项目的相关资料,名字是“基于标题加权的LDA模型的微博新闻主题分析”。完成于2020年5月。目前上传的代码可以复现结果,对于代码的进一步规范化还在进行中。 28 | 29 | プロジェクトの課題は「タイトル加重LDAモデルに基づくWeiboニューストピック分析」。2020年5月で完成しました。アップロードされたコードは結果を再現でき、コードのさらなる標準化はまだ進行中です。 30 | 31 | 32 | ## Dataset 33 | ### Raw data 34 | Two datasets are used and the raw data is crawled from Sina Weibo, like this: 35 | ![image](https://user-images.githubusercontent.com/58460943/135987422-750adb9a-9563-463a-8ad6-5ca21b5c0c93.png) 36 | 37 | **Dataset 1** is all the crawled text content of the Sina Weibo official account "Global Times", posted from 2019-01-01 00:00 to 2019-12-31 23:59. This is mainly for topic clustering. 38 | 39 | **Dataset 2** is the crawled text content of the Sina Weibo official accounts "China Agriculture News", "Sina Education", "Sina Finance", "Xinhua Sports", "Xinhua International" from 2019-12-31 (including this day) forward 1000 texts, with each text labelled with the official account type (e.g. label "Agriculture" for account "China Agriculture News", label "Education" for account "Sina Education", etc.) 40 | 41 | ### Processed data 42 | After processing the data, the dataset content looks like this: 43 | 44 | ![image](https://user-images.githubusercontent.com/58460943/135989374-655afd1f-cdaa-4a37-b845-32d89251822d.png) 45 | 46 | 47 | Here is a simple descriptive analysis of dataset 1, for example: 48 | 49 | **1. Number of texts posted per month** 50 | 51 | ![image](https://user-images.githubusercontent.com/58460943/135989238-bd22ddb2-b977-4d16-9f82-e1982ebf7d51.png) 52 | 53 | It can be seen that the number of Weibo posts posted every month in 2019 is around 1200-1500, with the least number of posts in February (maybe due to the Spring Festival and corona virus). 54 | 55 | **2. 
Word cloud using frequency(left one) and tf-idf(right one)** 56 | 57 | ![image](https://user-images.githubusercontent.com/58460943/135989689-554cdffc-cd37-4dc9-b780-2ae04dcf7ee5.png) 58 | ![image](https://user-images.githubusercontent.com/58460943/135989717-f2d7f8c3-c0f9-471e-adb0-1bc8d7aff3e3.png) 59 | 60 | Both show key words in the 2019 news, such as "China", "United States", "Hong Kong" and so on. 61 | 62 | However, in contrast, these two word clouds **reflect the nature and advantages of TF-IDF to correct the weight of high-frequency words**: "China" appears in most news texts, so judged by word frequency alone it occupies a very important position. However, the weight calculated by TF-IDF pays more attention to other words that may be non-high-frequency but important, so the weight of "China" is reduced. At the same time, the weights of some commonly used words such as "have" and "appeared" are also smaller than in the word frequency word cloud. It can be seen intuitively that TF-IDF has better feature words and weights than the word frequency method. 63 | 64 | 65 | ## Topic clustering - dataset 1 66 | ### 1. Decide topic numbers - based on xLDA 67 | The basic model is the standard LDA model (xLDA) using word frequency. 68 | 69 | Delete extreme feature words, get feature words, set the number of topics(from 10 to 150), model using xLDA 55 times per situation, calculate the average of coherence score C_v and perplexity score in the 5 times after Gibbs sampling converges. 70 | 71 | ![image](https://user-images.githubusercontent.com/58460943/135995221-56bd84d2-de9d-401b-9ab9-3f3cf69a0e5e.png) 72 | 73 | Combining the values, **choose 50 as the best number of topics for following experiments.** 74 | 75 | ### 2. Decide value of parameter gamma 76 | 77 | The advanced model is LDA topic modeling based on TF-IDF, and TW-LDA topic modeling based on title weighted TF-IDF. 
78 | 79 | Delete extreme feature words, get feature words, set the value of γ (0.0,0.1,0.2,...,1.0), model using xLDA 55 times per situation, calculate the average of coherence score C_v in the 5 times after Gibbs sampling converges. 80 | 81 | ![image](https://user-images.githubusercontent.com/58460943/135992571-7cbfd57c-f844-4231-9446-8ead7e3ba5ca.png) 82 | 83 | Therefore, **choose 0.5 as the value of parameter γ.** 84 | 85 | Besides, the C_v of the LDA model based on TF-IDF is greater than that of the standard LDA model based on word frequency, and regardless of the γ of the TW-LDA model, the C_v of the TW-LDA model is greater than that of the LDA model based on TF-IDF. 86 | 87 | ### 3.Topic clustering simple analysis - take TW-LDA as example 88 | 89 | **The results are highly interpretable and can be consistent with real life situations or timelines.** 90 | 91 | Below are the top 20 words of the top 10 topics: 92 | 93 | ![image](https://user-images.githubusercontent.com/58460943/135995508-1a795e22-4c1f-4cfd-8e98-68081dd5ba98.png) 94 | 95 | Below is the visualization of all the 50 topics: 96 | 97 | ![image](https://user-images.githubusercontent.com/58460943/135995607-6dfab480-8afe-45e1-a133-483f03010590.png) 98 | 99 | 100 | Below are the feature words and weights of the same topic in different models: 101 | 102 | ![image](https://user-images.githubusercontent.com/58460943/135993798-6b439f36-a74b-421d-9182-73c868736dce.png) 103 | 104 | Below is the number of texts posted per month for the "National Day military parade" topic: 105 | 106 | ![image](https://user-images.githubusercontent.com/58460943/135994324-c9807bc4-cb40-4ee6-b2f3-a39c6e11766b.png) 107 | 108 | Below are some examples of texts clustered to the "HK Riots" topic: 109 | 110 | ![image](https://user-images.githubusercontent.com/58460943/135994472-7207b64d-6a0c-44d4-9141-aab34c8daf82.png) 111 | 112 | Below is the number of texts posted per month for the "HK Riots" topic: 113 | 114 | 
![image](https://user-images.githubusercontent.com/58460943/135994584-83b8fd0e-e309-4fa7-88f4-5c647c3504f9.png) 115 | 116 | 117 | ## Topic classification - dataset 2 118 | ### SVM modeling 119 | 120 | ![image](https://user-images.githubusercontent.com/58460943/135998364-87d9e8d5-2e7e-43f2-aa84-ae9b3aa51a4c.png) 121 | 122 | The F1 Score reaches two local maxima at γ=0.1 and γ=0.6, and as γ goes from 0.8 to 1.0, the F1 Score of the model drops rapidly. Considering only the F1 Score as the evaluation standard, **γ is chosen as 0.6**. In fact, when γ=0.6, the model not only reaches the maximum value of F1 Score, but also of accuracy, precision, and recall. 123 | 124 | ![image](https://user-images.githubusercontent.com/58460943/135998753-8c3f5ded-ae6b-4cac-829e-c612ce3aa970.png) 125 | 126 | 127 | **The indexes of SVM model based on word frequency are lower than the SVM model based on TF-IDF, and the performance of the SVM model based on title weighted TF-IDF is higher than that of the SVM model based on word frequency.** 128 | 129 | When γ=0.6, the accuracy, precision, recall, and F1 score of the model are the highest among all models. But under different parameters γ, its performance gradually declines. When γ values are 0.9 and 1.0, it may not be as good as the classification effect of the SVM model based on TF-IDF. 130 | 131 | The preliminary analysis may be due to the fact that when γ=0.9, the weighted TF-IDF value used by the feature words in a certain title is 10% of its TF-IDF value in all texts plus 90% of its TF-IDF value in all titles. When γ=1.0, the weighted TF-IDF value used by the feature word is 100% of its TF-IDF value in all titles. 132 | 133 | In the case of the data set used in this article, for a title feature word that appears in the text, although its weighted TF-IDF value may be more representative of its importance in the title, it may be more important in the text at the same time. 
There has been too much reduction, which makes the weighted TF-IDF value deviate from the real situation. 134 | 135 | **This result also proves the importance of weighting the weights of the title and the text used here, which enables the model to take into account the important role of the title and the feature words in the text.** 136 | 137 | 138 | ## Topic analysis - dataset 1 and 2 139 | ### Combine LDA and SVM modeling 140 | 141 | Combining the Title-Weighted LDA model in the previous section with the Title-Weighted TF-IDF SVM classification model in this section, we can classify a text according to events and channels. For example, use following text as input: 142 | 143 | ![image](https://user-images.githubusercontent.com/58460943/136000934-8fe470e5-da29-4787-aa93-94be72200e33.png) 144 |
*("\[Hong Kong Government: #IMF Confirm Hong Kong International Financial Center Position#] According to a report on the Hong Kong Special Administrative Region Government News Network on the 30th, the International Monetary Fund (IMF) released an assessment report on the 30th, reaffirming Hong Kong as a global financial center and an intra-regional trade hub. , The status of one of the most open economies in the world. The organization welcomes the Hong Kong Special Administrative Region Government’s recent fiscal stimulus measures to revitalize the economy, and supports the Hong Kong Special Administrative Region Government to adopt a three-pronged approach to control property market risks and increase the affordability of home ownership. : The IMF affirms Hong Kong’s status as an international financial center (with photos).")* 145 | 146 | Extract the title and text of this text content, use the established dictionary to calculate the TF-IDF value based on the title weight, and enter it into the title weighted LDA model that has been established in the previous section. The result shows this text has a probability of 0.38997 that may fall in the topic with **"Hong Kong"** in the top20 feature words, and a probability of 0.32516 may fall in the topic with **"Economy"** or **"Financial"** in the top20 feature words, and this has been established. 147 | 148 | Among the 50 topics, the probability that it belongs to other topics is less than 0.18. Then input this text into the SVM model based on the title weighted TF-IDF that has been established, and it can be calculated to have a probability of 0.81514 of belonging to the **"financial"** classification. 149 | 150 | ## Conclusion and future work 151 | ### Conclusion 152 | 1. Introduce Title-Weighted TF-IDF model to make full use of the different importance of text in different locations. 153 | 154 | 3. 
Apply the Title-Weighted TF-IDF model to LDA to generate word-topic and document-topic matrices that are closer to the real situation, improve the quality of topic clustering, and perform visualization to improve interpretability. 155 | 156 | 5. Apply the Title-Weighted TF-IDF model to SVM to improve the classification quality of texts, and combine the LDA model with the SVM model. Through unsupervised and supervised aspects, the Weibo content can be analyzed from two aspects: potential topics and existing topic labels. 157 | 158 | ### Future work 159 | 1. Further improve the feature word extraction and weighting. 160 | 161 | 2. Combine external semantic knowledge with other aspect knowledge for experimental analysis. 162 | -------------------------------------------------------------------------------- /数据集1-环球时报.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lpn-98/NLP-TopicAnalysis/a3aac45311f33d7b78b58d660fbcf35e41f1d73b/数据集1-环球时报.zip -------------------------------------------------------------------------------- /数据集2-五个分类.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Lpn-98/NLP-TopicAnalysis/a3aac45311f33d7b78b58d660fbcf35e41f1d73b/数据集2-五个分类.zip --------------------------------------------------------------------------------