├── LICENSE ├── README.md ├── fansMap ├── .ipynb_checkpoints │ ├── acquaireData-checkpoint.ipynb │ ├── analyseSite-checkpoint.ipynb │ └── geo_example-checkpoint.py ├── acquaireData.ipynb ├── analyseSite.ipynb ├── china_gdp_from_1980.html ├── geo_example.py └── render.html ├── repostWeibo ├── acquaireData.ipynb ├── analyseData.ipynb └── analyseTxt.ipynb └── singleWeibo ├── acquaireRepost.ipynb ├── analyseLinks-html.ipynb ├── analyseLinks.ipynb └── render.html /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 leezeeyee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sinaWeibo crawler 2 | 本测试项目为爬虫爬取新浪微博并分析数据的实现代码 3 | 使用平台为jupyter notebook 4 | 编程语言为python 5 | 6 | 7 | 每个文件夹中的文件配合使用,实现功能如下: 8 | ## repostWeibo 9 | 博主转发影响力分析 10 | acquaireData:从微博爬取需要的数据——某博主所有转发的微博相关数据 11 | analyseData:分析转发量等数值型数据 12 | analyseTxt:分析转发博文文本,绘制词云图 13 | 总结文章:https://blog.csdn.net/cascara/article/details/104090441 14 | ## singleWeibo 15 | 博文传播图绘制 16 | acquaireRepost:获取需要的数据——某条微博的全部转发情况 17 | analyseLinks:生成传播图像 18 | 总结文章:https://blog.csdn.net/cascara/article/details/104135868 19 | ## fansMap 20 | 绘制粉丝分布热力图 21 | acquaireData:从微博爬取需要的数据——某博主的所有粉丝(微博限流,实际只能爬取10页) 22 | analyseSite:生成地区图像 23 | 总结文章:https://blog.csdn.net/cascara/article/details/104172602 24 | -------------------------------------------------------------------------------- /fansMap/.ipynb_checkpoints/acquaireData-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "@[TOC]" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 库" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import csv\n", 25 | "import requests\n", 26 | "import xlwt\n", 27 | "import re\n", 28 | "import json\n", 29 | "import time" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### 配置" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "jupyter": { 44 | "source_hidden": true 45 | } 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "#根据个人浏览器信息进行修改\n", 50 | "headers = {\n", 51 | " 'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36'\n", 52 | " ,\n", 53 | " 'Cookie': '_T_WM=67706607048; WEIBOCN_FROM=1110006030; ALF=1582777481; SCF=AqQddu0eGCw6Wh1xPsTyigWBFJH-P0ACsyLUFzNakys5tF6kBCjVpv4O6BDEGM4gShv5JHfuyjMoLBKfT5-Xwsc.; SUB=_2A25zK8jDDeRhGeNP41UT9yjIyj6IHXVQ1-iLrDV6PUJbktAKLUHSkW1NTk4PgJoxaitdQXaQL6znAIMdvJJs4-5l; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5q.Hx0pIs7PKpACzdnFYSZ5JpX5K-hUgL.Fo-p1hMES0qXeKz2dJLoIpUeBc8EdFH8SC-4BbHFSFH81F-RSF-4Sntt; SUHB=0qjEKc2Va_YMLH; SSOLoginState=1580185747; MLOGIN=1; XSRF-TOKEN=607e98; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304135671786192_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D2304135671786192_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011'\n", 54 | " #'ALF=1581501545; _T_WM=67706607048; H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2Fapi%2Fcomments%2Fshow%3Fid%3DIr5j4iRXW%26page%3D3; XSRF-TOKEN=11216a; WEIBOCN_FROM=1110006030; MLOGIN=1; SSOLoginState=1580006602; SCF=AqQddu0eGCw6Wh1xPsTyigWBFJH-P0ACsyLUFzNakys5zFt06rZeA1gEI0iP7HfWxZntbpMr8WTWhrxEdSVGB58.; SUB=_2A25zKIyaDeRhGeNP41UT9yjIyj6IHXVQ0hTSrDV6PUJbktAKLRL-kW1NTk4PgHLYgtoeuxFzuGDIDcybzoEoXvq9; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5q.Hx0pIs7PKpACzdnFYSZ5JpX5KzhUgL.Fo-p1hMES0qXeKz2dJLoIpUeBc8EdFH8SC-4BbHFSFH81F-RSF-4Sntt; SUHB=0IIlrfWMMkVsTI; M_WEIBOCN_PARAMS=uicode%3D20000174'\n", 55 | "}" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#文件保存地址\n", 65 | "addrRoot='C:/Users/cascara/Desktop/seedcup/csv/blog/'" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "#是否获取转发者具体个人信息\n", 75 | "getConcreteInfoList=True#False#True#\n", 76 | "isLogin=True#False#True\n", 77 | "\n", 78 | "#是否登入采集个人信息\n", 79 | "\n", 80 | "#无信息打印字符\n", 81 | "infoNoExistStr='未知'\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 
88 | "outputs": [], 89 | "source": [ 90 | "#是否处理微博文本内容\n", 91 | "processText = True" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### 构造表格,采集数据内容(修改这里获取想要的信息)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "raw", 103 | "metadata": {}, 104 | "source": [ 105 | "博主的信息单独收集:转发的:转发reposts_count、评论comments_count、点赞数量attitudes_count、粉丝数量followers_count\n", 106 | "\n", 107 | " 原始的retweeted_status:转发reposts_count、评论comments_count、点赞数量attitudes_count\n", 108 | " 原始用户的user:用户名screen_name、id、粉丝数量followers_count" 109 | ] 110 | }, 111 | { 112 | "cell_type": "raw", 113 | "metadata": {}, 114 | "source": [ 115 | "获取个人具体信息范围、排列" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "metadata": { 122 | "jupyter": { 123 | "source_hidden": true 124 | } 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "#获取个人具体信息范围、排列\n", 129 | "infoRangeDict={\n", 130 | "'性别':True,\n", 131 | "'所在地':True,\n", 132 | " \n", 133 | "'生日':False,\n", 134 | "'家乡':False,\n", 135 | "'公司':True,\n", 136 | "'大学':True,\n", 137 | " \n", 138 | "'昵称':False,\n", 139 | "'简介':False,\n", 140 | "'注册时间':False,\n", 141 | "'阳光信用':False,\n", 142 | " \n", 143 | " #若无信息显示\n", 144 | "'infoNoExist':'未知'\n", 145 | "}\n", 146 | "\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "raw", 151 | "metadata": {}, 152 | "source": [ 153 | "获取博文信息范围、排列" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "#获取博文信息范围、排列\n", 163 | "blogRangeDict={\n", 164 | "'visible': False,#{type: 0, list_id: 0}\n", 165 | " \n", 166 | "'created_at': True,#\"20分钟前\"\n", 167 | " \n", 168 | "'id': False,#\"4466073829119710\"\n", 169 | "'idstr': False,#\"4466073829119710\"\n", 170 | "'mid': False,#\"4466073829119710\"\n", 171 | "'can_edit': False,#false\n", 172 | "'show_additional_indication': False,#0\n", 173 | " \n", 174 | "'text': True,#\"【情况通报】2019年12月31日,武汉市卫健部门发布关于肺炎疫情的情况通报。\n", 175 | " 
\n", 176 | "'textLength': False,#452\n", 177 | "'source': False,#\"360安全浏览器\"\n", 178 | "'favorited': False,#false\n", 179 | "'pic_types': False,#\"\"\n", 180 | "'is_paid': False,#false\n", 181 | "'mblog_vip_type': False,#0\n", 182 | "'user': False,#{id: 2418542712, screen_name: \"平安武汉\",…}\n", 183 | " \n", 184 | "'reposts_count': True,#1035\n", 185 | "'comments_count': True,#1886\n", 186 | "'attitudes_count': True,#7508\n", 187 | " \n", 188 | "'pending_approval_count': False,#0\n", 189 | "'isLongText': False,#true\n", 190 | "'reward_exhibition_type':False,# 0\n", 191 | "'hide_flag': False,#0\n", 192 | "'mblogtype': False,#0\n", 193 | "'more_info_type': False,#0\n", 194 | "'cardid': False,#\"star_11247_common\"\n", 195 | "'content_auth': False,#0\n", 196 | "'pic_num': False,#0\n", 197 | " \n", 198 | "#若无相关信息,则显示:\n", 199 | "'infoNoExist':'未知'\n", 200 | "}" 201 | ] 202 | }, 203 | { 204 | "cell_type": "raw", 205 | "metadata": {}, 206 | "source": [ 207 | "获取博主信息范围、排列" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 8, 213 | "metadata": { 214 | "jupyter": { 215 | "source_hidden": true 216 | } 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "#获取博主信息范围、排列\n", 221 | "userRangeDict={\n", 222 | "'id':True,# 1323527941\n", 223 | "'screen_name': True,#\"Vista看天下\"\n", 224 | " \n", 225 | "'profile_image_url': False,#\"https://tva2.sinaimg.cn/crop.0.0.180.180.180/4ee36f05jw1e8qgp5bmzyj2050050aa8.jpg?KID=imgbed,tva&Expires=1580290462&ssig=xPIoKDRR56\"\n", 226 | "'profile_url':False,# \"https://m.weibo.cn/u/1323527941?uid=1323527941&luicode=10000011&lfid=1076031323527941\"\n", 227 | "'statuses_count': False,#微博数 77256\n", 228 | "'verified': False,#true\n", 229 | "'verified_type':False,# 3\n", 230 | "'verified_type_ext': False,#0\n", 231 | "'verified_reason': False,#\"《Vista看天下》官方微博\"\n", 232 | "'close_blue_v': False,#false\n", 233 | " \n", 234 | "'description': True,#\"一个有趣的蓝V\"\n", 235 | "'gender': True,# \"m\"\n", 236 | " \n", 237 | "'mbtype': 
False,#12\n", 238 | "'urank': False,#48\n", 239 | "'mbrank': False,#6\n", 240 | "'follow_me':False,# false\n", 241 | "'following':False,# false\n", 242 | " \n", 243 | "'followers_count': True,#19657897\n", 244 | "'follow_count': True,#1809\n", 245 | " \n", 246 | "'cover_image_phone': False,#\"https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg\"\n", 247 | "'avatar_hd': False,#\"https://ww2.sinaimg.cn/orj480/4ee36f05jw1e8qgp5bmzyj2050050aa8.jpg\"\n", 248 | "'like': False,#false\n", 249 | "'like_me': False,#false\n", 250 | "'badge': False,#{enterprise: 1, gongyi_level: 1, bind_taobao: 1, dzwbqlx_2016: 1, follow_whitelist_video: 1,…}\n", 251 | " \n", 252 | "#若无信息显示\n", 253 | "'infoNoExist':'未知'\n", 254 | "}\n" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "### 文件命名" 262 | ] 263 | }, 264 | { 265 | "cell_type": "raw", 266 | "metadata": { 267 | "jupyter": { 268 | "source_hidden": true 269 | } 270 | }, 271 | "source": [ 272 | "使用示例:\n", 273 | "tweeter='王'\n", 274 | "fp = open(addrFile(tweeter),'w+',newline='',encoding='utf-16')\n", 275 | "fp.close()\n", 276 | "\n", 277 | "使用库函数:\n", 278 | "os" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 9, 284 | "metadata": { 285 | "jupyter": { 286 | "source_hidden": true 287 | } 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "def addrFile(tweeter,suffix):\n", 292 | " path=addrRoot+str(tweeter)+'/'\n", 293 | " if os.path.exists(path) is False:\n", 294 | " os.makedirs(path)\n", 295 | " address=path+tweeter+suffix+'.csv'\n", 296 | " return address " 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "### 生成信息标题" 304 | ] 305 | }, 306 | { 307 | "cell_type": "raw", 308 | "metadata": { 309 | "jupyter": { 310 | "source_hidden": true 311 | } 312 | }, 313 | "source": [ 314 | "将字典Dict中为True的条目生成标题,加前缀prefix\n", 315 | "\n", 316 | "使用实例:\n", 317 | 
"print(getInfoTitle(blogRangeDict,'原文'))\n", 318 | "打印结果:\n", 319 | "['原文created_at', '原文text', '原文reposts_count', '原文comments_count', '原文attitudes_count']" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 10, 325 | "metadata": { 326 | "jupyter": { 327 | "source_hidden": true 328 | } 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "def getInfoTitle(Dict,prefix):\n", 333 | " titleList=[]\n", 334 | " for item in Dict:\n", 335 | " if(Dict.get(item) is True):\n", 336 | " titleList.append(prefix+item)\n", 337 | " return (titleList)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": {}, 343 | "source": [ 344 | "## 工具类,用来去除爬取的正文中一些不需要的链接、标签等" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 11, 350 | "metadata": { 351 | "jupyter": { 352 | "source_hidden": true 353 | } 354 | }, 355 | "outputs": [], 356 | "source": [ 357 | "#工具类,用来去除爬取的正文中一些不需要的链接、标签等\n", 358 | "class Tool:\n", 359 | " deleteImg = re.compile('')\n", 360 | " newLine =re.compile('|
||
')\n", 361 | " deleteAite = re.compile('//.*?:')\n", 362 | " deleteAddr = re.compile('.*?|')\n", 364 | " deleteWord = re.compile('回复@|回覆@|回覆|回复')\n", 365 | " \n", 366 | " @classmethod\n", 367 | " def replace(cls,x):\n", 368 | " x = re.sub(cls.deleteWord,'',x)\n", 369 | " x = re.sub(cls.deleteImg,'',x)\n", 370 | " x = re.sub(cls.deleteAite,'',x)\n", 371 | " x = re.sub(cls.deleteAddr, '', x)\n", 372 | " x = re.sub(cls.newLine,'',x)\n", 373 | " x = re.sub(cls.deleteTag,'',x)\n", 374 | " return x.strip()" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "### 构造微博内容的url" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 12, 387 | "metadata": { 388 | "jupyter": { 389 | "source_hidden": true 390 | } 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "###某微博账户的全部微博内容\n", 395 | "def contentURL(id,pages):\n", 396 | " i=0\n", 397 | " urls=[]\n", 398 | " for page in pages:\n", 399 | " if page is not 0:\n", 400 | " urls+=['https://m.weibo.cn/api/container/getIndex?containerid=230413'+str(id)+'_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page='+str(page)]\n", 401 | "\n", 402 | " return urls" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 13, 408 | "metadata": { 409 | "jupyter": { 410 | "source_hidden": true 411 | } 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "#将字典类型的信息格式传递为需要的信息列表\n", 416 | "def getInfoList(infoDict,rangeDict):\n", 417 | " infoList=[]\n", 418 | " for item in rangeDict:\n", 419 | " if rangeDict.get(item) is True:\n", 420 | " content=infoDict.get(item)\n", 421 | " if content is not None:\n", 422 | " #处理微博文本内容 \n", 423 | " if item =='text':\n", 424 | " if processText is True:\n", 425 | " content=Tool.replace(content)\n", 426 | " infoList.append(content) \n", 427 | " else:\n", 428 | " infoList.append(rangeDict['infoNoExist'])\n", 429 | " return infoList" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "metadata": {}, 435 | "source": 
[ 436 | "### 观测对每个转发微博的影响" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 14, 442 | "metadata": { 443 | "jupyter": { 444 | "source_hidden": true 445 | } 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "###在已有的一系列urls中进行操作\n", 450 | "###筛选出微博转发内容进行操作\n", 451 | "def reRatio(urls,csvWriter):\n", 452 | " notEnd= True\n", 453 | " \n", 454 | " retweetBlogTitle=getInfoTitle(blogRangeDict,'转发')#转发博文信息标题\n", 455 | " retweetUserTitle=getInfoTitle(userRangeDict,'转发')#转发博主信息标题\n", 456 | " \n", 457 | " originBlogTitle=getInfoTitle(blogRangeDict,'原文')#原文博文信息标题\n", 458 | " originUserTitle=getInfoTitle(userRangeDict,'原文')#原文博主信息标题\n", 459 | " infoTitle=getInfoTitle(infoRangeDict,'')#原文博主个人主页信息标题\n", 460 | " \n", 461 | " #写表格的标题\n", 462 | " if getConcreteInfoList is True: \n", 463 | " csvWriter.writerow(retweetBlogTitle+retweetUserTitle+originBlogTitle+originUserTitle+infoTitle) \n", 464 | " else:\n", 465 | " csvWriter.writerow(retweetBlogTitle+retweetUserTitle+originBlogTitle+originUserTitle)\n", 466 | " \n", 467 | " for url in urls: \n", 468 | " \n", 469 | " response = requests.get(url,headers=headers)\n", 470 | " resjson = json.loads(response.text) \n", 471 | " cards=resjson['data']['cards'] \n", 472 | " \n", 473 | " #print(cards)\n", 474 | " \n", 475 | " #结束最后\n", 476 | " if(len(cards)==1):\n", 477 | " notEnd=False\n", 478 | " break\n", 479 | " #遍历一个页面的所有微博 \n", 480 | " for card in cards:\n", 481 | " try:\n", 482 | " #转发博文与博主信息\n", 483 | " retweetBlogInfoDict=card['mblog'] \n", 484 | " retweetUserInfoDict=retweetBlogInfoDict['user'] \n", 485 | " \n", 486 | " #筛选出转发的微博\n", 487 | " try: \n", 488 | " originBlogInfoDict=retweetBlogInfoDict['retweeted_status']\n", 489 | " \n", 490 | " \n", 491 | " if originBlogInfoDict is not None: \n", 492 | " \n", 493 | " #转发博文原文与博主信息\n", 494 | " originUserInfoDict=originBlogInfoDict['user']\n", 495 | " retweetUserID=retweetUserInfoDict['id']\n", 496 | " originUserID=originUserInfoDict['id']\n", 497 | " 
###不是转发自己的微博,则选中进行处理\n", 498 | " if(retweetUserID!=originUserID):\n", 499 | " infoList=[] \n", 500 | " \n", 501 | " #转发博文数据\n", 502 | " retweetBlogInfoList=getInfoList(retweetBlogInfoDict,blogRangeDict) \n", 503 | " infoList+=retweetBlogInfoList \n", 504 | " #转发博主数据\n", 505 | " ##默认已知\n", 506 | " retweetUserInfoList=getInfoList(retweetUserInfoDict,userRangeDict) \n", 507 | " infoList+=retweetUserInfoList \n", 508 | " #原文博文数据\n", 509 | " originBlogInfoList=getInfoList(originBlogInfoDict,blogRangeDict) \n", 510 | " infoList+=originBlogInfoList\n", 511 | " #原文博主数据\n", 512 | " originUserInfoList=getInfoList(originUserInfoDict,userRangeDict) \n", 513 | " infoList+=originUserInfoList \n", 514 | " \n", 515 | " #originUserID为原文账号的ID \n", 516 | " #可在此对id进行信息采集 \n", 517 | " \n", 518 | " if getConcreteInfoList is True:\n", 519 | " infoDict=getInfo(isLogin,originUserID)\n", 520 | " otherInfoList=getInfoList(infoDict,infoRangeDict) \n", 521 | " infoList+=otherInfoList \n", 522 | " #print(infoList)\n", 523 | " #保存数据至csv\n", 524 | " csvWriter.writerow(infoList) \n", 525 | " \n", 526 | " #不断获取该博主对的影响力\n", 527 | " #break\n", 528 | " except:\n", 529 | " pass\n", 530 | " except:\n", 531 | " pass\n", 532 | " #延时,防止反爬\n", 533 | " time.sleep(3)\n", 534 | " \n", 535 | " return notEnd" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": {}, 541 | "source": [ 542 | "### 获取个人主页中信息" 543 | ] 544 | }, 545 | { 546 | "cell_type": "raw", 547 | "metadata": { 548 | "jupyter": { 549 | "source_hidden": true 550 | } 551 | }, 552 | "source": [ 553 | "使用示例:\n", 554 | "response = requests.get(url)\n", 555 | "txt=response.text\n", 556 | "print(drillInfo(txt))\n", 557 | "\n", 558 | "结果如下:\n", 559 | "{'昵称': '甘肃华熙文化',\n", 560 | " '简介': '【马丛珊.禅绣艺术,世界纹绣大师学院甘肃分院】服务生命之美;践行匠心为本,艺心创造,慈心发扬校训,微信mashan5374,☎13109439909',\n", 561 | " '性别': '女',\n", 562 | " '所在地': '甘肃 兰州'}" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 15, 568 | "metadata": { 569 | "jupyter": { 570 | 
"source_hidden": true 571 | } 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "def drillInfo(txt):\n", 576 | " keyInfo={}\n", 577 | " \n", 578 | " try: \n", 579 | " resjson = json.loads(txt) \n", 580 | " infodata = resjson.get('data')\n", 581 | " cards = infodata.get('cards')\n", 582 | " for l in range(0,len(cards)):\n", 583 | " temp = cards[l]\n", 584 | " card_group = temp.get('card_group') \n", 585 | " #判断获取信息类型 \n", 586 | " for card in card_group: \n", 587 | " #将信息传入字典\n", 588 | " name=card.get('item_name')\n", 589 | " if name is not None:\n", 590 | " content=card.get('item_content')\n", 591 | " keyInfo[name]=content \n", 592 | " except:\n", 593 | " pass\n", 594 | " return keyInfo" 595 | ] 596 | }, 597 | { 598 | "cell_type": "markdown", 599 | "metadata": {}, 600 | "source": [ 601 | "### 构建通过id访问个人主页的url" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": 16, 607 | "metadata": { 608 | "jupyter": { 609 | "source_hidden": true 610 | } 611 | }, 612 | "outputs": [], 613 | "source": [ 614 | "def infoUrl(id):\n", 615 | " url = \"https://m.weibo.cn/api/container/getIndex?containerid=230283\"+str(id)+\"_-_INFO\" \n", 616 | " return url" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "## 爬取某id博主的个人信息" 624 | ] 625 | }, 626 | { 627 | "cell_type": "raw", 628 | "metadata": { 629 | "jupyter": { 630 | "source_hidden": true 631 | } 632 | }, 633 | "source": [ 634 | "为防止反复爬取,将原文整体保存为文件,格式为 信息卡片长度(2 or 5)+id+博主id\n", 635 | "不登录2含有性别、所在地\n", 636 | "登录5含有性别、所在地、星座、大学、公司等完整信息\n", 637 | "若存在所需文件,则从文件读取信息,否则爬取,同时保存文件\n", 638 | "\n", 639 | "若爬取未成功,返回-1\n", 640 | "\n", 641 | "使用库函数:\n", 642 | "os" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 17, 648 | "metadata": { 649 | "jupyter": { 650 | "source_hidden": true 651 | } 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "def getInfo(state,id):\n", 656 | " \n", 657 | " 
address=addrRoot+'info/'+str(state)+'id'+str(id)+'.txt'\n", 658 | " path=addrRoot+'info/'\n", 659 | " if os.path.exists(path) is False:\n", 660 | " os.makedirs(path)\n", 661 | " try:\n", 662 | " #已有文件\n", 663 | " if(os.path.exists(address)==True):\n", 664 | " fp = open(address,'r',encoding='utf-16')\n", 665 | " txt=fp.read()\n", 666 | " info=drillInfo(txt)\n", 667 | " fp.close()\n", 668 | " else: \n", 669 | " fp = open(address,'w+',encoding='utf-16')\n", 670 | " url=infoUrl(id)\n", 671 | " if state is True:\n", 672 | " response = requests.get(url,headers=headers)\n", 673 | " else:\n", 674 | " response = requests.get(url)\n", 675 | " txt=response.text\n", 676 | " fp.write(response.text) \n", 677 | " info=drillInfo(txt)\n", 678 | " fp.close()\n", 679 | " except:\n", 680 | " info=-1 \n", 681 | " \n", 682 | " return info" 683 | ] 684 | }, 685 | { 686 | "cell_type": "raw", 687 | "metadata": {}, 688 | "source": [ 689 | "获取特定个人信息" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 18, 695 | "metadata": { 696 | "jupyter": { 697 | "source_hidden": true 698 | } 699 | }, 700 | "outputs": [], 701 | "source": [ 702 | "def getExatInfo(item,state,id):\n", 703 | " info=getInfo(state,id)\n", 704 | " content=info.get(item)\n", 705 | " if content is not None:\n", 706 | " return content\n", 707 | " else:\n", 708 | " return infoNoExistStr " 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": 19, 714 | "metadata": {}, 715 | "outputs": [], 716 | "source": [ 717 | "### 构造热门界面访问" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 20, 723 | "metadata": { 724 | "jupyter": { 725 | "source_hidden": true 726 | } 727 | }, 728 | "outputs": [], 729 | "source": [ 730 | "def downloadData(id):\n", 731 | " tweeter=getExatInfo('昵称',2,int(id))\n", 732 | " batch=0\n", 733 | " while(1):\n", 734 | "\n", 735 | " fileAddr=addrFile(tweeter,'batch'+str(batch))\n", 736 | " if os.path.exists(fileAddr) is True:\n", 737 | " 
print(fileAddr+'已存在,跳过采集') \n", 738 | " else:\n", 739 | " print('文件将写入:'+fileAddr)\n", 740 | " fp = open(fileAddr,'w+',newline='',encoding='utf-16')\n", 741 | " writer=csv.writer(fp)\n", 742 | " if reRatio(contentURL(id,range(20*batch,20*(batch+1))),writer) is False:\n", 743 | " fp.close()\n", 744 | " break\n", 745 | "\n", 746 | " fp.close()\n", 747 | " print('第'+str(batch)+'批数据已记录完毕')\n", 748 | " batch+=1" 749 | ] 750 | }, 751 | { 752 | "cell_type": "raw", 753 | "metadata": { 754 | "jupyter": { 755 | "source_hidden": true 756 | } 757 | }, 758 | "source": [ 759 | "\n", 760 | "#陈赫\n", 761 | "id=1574684061\n", 762 | "#MorningGlory_肖战资源博\n", 763 | "id=5735501478\n", 764 | "\n", 765 | "#靳东\n", 766 | "id=1093897112\n", 767 | "#李健\n", 768 | "id=1744395855\n", 769 | "\n", 770 | "#干部\n", 771 | "id=6472269230\n", 772 | "\n", 773 | "#陶勇\n", 774 | "id=5899876484\n", 775 | "\n", 776 | "#姚晨\n", 777 | "id=1266321801\n", 778 | "\n", 779 | "#鞠婧祎\n", 780 | "id=3669102477\n", 781 | "\n", 782 | "#韩红\n", 783 | "#id=1922542315\n", 784 | "\n", 785 | "\n", 786 | "#穿帮君\n", 787 | "id=5671786192\n", 788 | "\n", 789 | "#汉堡爸爸\n", 790 | "id=2784421224\n", 791 | "\n", 792 | "#蔡徐坤\n", 793 | "\n", 794 | "id=1776448504\n", 795 | "\n", 796 | "\n", 797 | "#林书豪\n", 798 | "id=2106855375\n", 799 | "\n", 800 | "#干部\n", 801 | "id=6472269230\n", 802 | "\n", 803 | "#任嘉伦\n", 804 | "id=3800468188\n", 805 | "\n", 806 | "#肖战\n", 807 | "id=1792951112\n", 808 | "\n", 809 | "\n", 810 | "#迪丽热巴\n", 811 | "id=1669879400\n", 812 | "\n", 813 | "\n", 814 | "#科比\n", 815 | "id=3264072325" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 23, 821 | "metadata": {}, 822 | "outputs": [ 823 | { 824 | "name": "stdin", 825 | "output_type": "stream", 826 | "text": [ 827 | "博主id: 1663414103\n" 828 | ] 829 | }, 830 | { 831 | "name": "stdout", 832 | "output_type": "stream", 833 | "text": [ 834 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch0.csv\n", 835 | "第0批数据已记录完毕\n", 836 | 
"文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch1.csv\n", 837 | "第1批数据已记录完毕\n", 838 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch2.csv\n", 839 | "第2批数据已记录完毕\n", 840 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch3.csv\n", 841 | "第3批数据已记录完毕\n", 842 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch4.csv\n", 843 | "第4批数据已记录完毕\n", 844 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch5.csv\n", 845 | "第5批数据已记录完毕\n", 846 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch6.csv\n", 847 | "第6批数据已记录完毕\n", 848 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch7.csv\n", 849 | "第7批数据已记录完毕\n", 850 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch8.csv\n", 851 | "第8批数据已记录完毕\n", 852 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch9.csv\n", 853 | "第9批数据已记录完毕\n", 854 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch10.csv\n", 855 | "第10批数据已记录完毕\n", 856 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch11.csv\n", 857 | "第11批数据已记录完毕\n", 858 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch12.csv\n", 859 | "第12批数据已记录完毕\n", 860 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch13.csv\n", 861 | "第13批数据已记录完毕\n", 862 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch14.csv\n", 863 | "第14批数据已记录完毕\n", 864 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch15.csv\n", 865 | "第15批数据已记录完毕\n", 866 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch16.csv\n", 867 | "第16批数据已记录完毕\n", 868 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch17.csv\n", 869 | "第17批数据已记录完毕\n", 870 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch18.csv\n", 871 | "第18批数据已记录完毕\n", 872 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch19.csv\n", 873 | 
"第19批数据已记录完毕\n", 874 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch20.csv\n", 875 | "第20批数据已记录完毕\n", 876 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch21.csv\n", 877 | "第21批数据已记录完毕\n", 878 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch22.csv\n" 879 | ] 880 | } 881 | ], 882 | "source": [ 883 | "id=input('博主id:')\n", 884 | "\n", 885 | "downloadData(id)" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [] 894 | } 895 | ], 896 | "metadata": { 897 | "kernelspec": { 898 | "display_name": "Python 3", 899 | "language": "python", 900 | "name": "python3" 901 | }, 902 | "language_info": { 903 | "codemirror_mode": { 904 | "name": "ipython", 905 | "version": 3 906 | }, 907 | "file_extension": ".py", 908 | "mimetype": "text/x-python", 909 | "name": "python", 910 | "nbconvert_exporter": "python", 911 | "pygments_lexer": "ipython3", 912 | "version": "3.7.4" 913 | } 914 | }, 915 | "nbformat": 4, 916 | "nbformat_minor": 4 917 | } 918 | -------------------------------------------------------------------------------- /fansMap/.ipynb_checkpoints/analyseSite-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import csv" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "def addrPath(tweeter):\n", 20 | " path=addrRoot+str(tweeter)+'/可视化分析结果/'\n", 21 | " if os.path.exists(path) is False:\n", 22 | " os.makedirs(path)\n", 23 | " return path" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "def addrFile(tweeter,suffix):\n", 33 | " 
path=addrRoot+str(tweeter)+'/'\n", 34 | " if os.path.exists(path) is False:\n", 35 | " os.makedirs(path)\n", 36 | " address=path+tweeter+suffix+'.csv'\n", 37 | " return address " 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "jupyter": { 45 | "source_hidden": true 46 | } 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import random\n", 51 | "\n", 52 | "#for item in dataDict:\n", 53 | " #print(dataDict.get(item)[-1-sort[-1]])\n", 54 | " #print(item)\n", 55 | "def randomText(no):\n", 56 | " \n", 57 | " item=random.choice(list(dataDict))\n", 58 | " return (item+':'+str(dataDict.get(item)[no]))\n", 59 | "\n", 60 | "def exactText(no,item):\n", 61 | " text=(item+':'+str(dataDict.get(item)[no]))\n", 62 | " print(text)\n", 63 | " return text\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "metadata": { 70 | "jupyter": { 71 | "source_hidden": true 72 | } 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "#文件保存地址,改为你存放csv文件的完整地址\n", 77 | "addrRoot='C:/Users/cascara/Desktop/seedcup/csv/blog/'\n", 78 | "\n", 79 | "#是否调试\n", 80 | "isDebug=False\n", 81 | "\n", 82 | "#100万+显示\n", 83 | "infinity=1000000" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdin", 93 | "output_type": "stream", 94 | "text": [ 95 | "输入博主姓名: 华中科技大学\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "tweeter=input('输入博主姓名:')#'陈赫'" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 7, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch0.csv\n", 113 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch1.csv\n", 114 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch2.csv\n", 115 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch3.csv\n", 116 | 
"C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch4.csv\n", 117 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch5.csv\n", 118 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch6.csv\n", 119 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch7.csv\n", 120 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch8.csv\n", 121 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch9.csv\n", 122 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch10.csv\n", 123 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch11.csv\n", 124 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch12.csv\n", 125 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch13.csv\n", 126 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch14.csv\n", 127 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch15.csv\n", 128 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch16.csv\n", 129 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch17.csv\n", 130 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch18.csv\n", 131 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch19.csv\n", 132 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch20.csv\n", 133 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch21.csv\n", 134 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch22.csv\n", 135 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch23.csv\n", 136 | "不存在C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch23.csv\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "#开始\n", 142 | "startCollect=True\n", 143 | "\n", 144 | "\n", 145 | "batch=0\n", 146 | "dataDict={}\n", 147 | "Title=[]\n", 148 | "while(1):\n", 149 | " titleEixst=True\n", 150 | " address=addrFile(tweeter,'batch'+str(batch)) \n", 151 | " print(address)\n", 152 | " \n", 153 | " if os.path.exists(address) is True:\n", 154 
| " fp = open(address,'r',newline='',encoding='utf-16') \n", 155 | " reader=csv.reader(fp)\n", 156 | " for line in reader:\n", 157 | " if titleEixst is True:\n", 158 | " #将标题制成各个\n", 159 | " if startCollect is True:\n", 160 | " for item in line:\n", 161 | " #print(item[-1])\n", 162 | " if isDebug is True:\n", 163 | " print(item)\n", 164 | "\n", 165 | " \n", 166 | " dataDict[item]=[]\n", 167 | " startCollect=False\n", 168 | " Title=line \n", 169 | " if isDebug is True:\n", 170 | " print(line)\n", 171 | " titleEixst=False\n", 172 | " else:\n", 173 | " for no in range(len(line)): \n", 174 | " if(Title[no].find('count')>=0):\n", 175 | " if line[no].find('0万') > 0:\n", 176 | " dataDict.get(Title[no]).append(infinity)\n", 177 | " else:\n", 178 | " dataDict.get(Title[no]).append(eval(line[no]))\n", 179 | " \n", 180 | " #dataDict.get(Title[no]).append((line[no]))\n", 181 | " \n", 182 | " else:\n", 183 | " dataDict.get(Title[no]).append((line[no]))\n", 184 | " if isDebug is True:\n", 185 | " print(Title[no])\n", 186 | " print(line[no]) \n", 187 | " \n", 188 | " \n", 189 | " else:\n", 190 | " print('不存在'+address)\n", 191 | " break\n", 192 | " fp.close()\n", 193 | " batch+=1\n", 194 | "#print(dataDict)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "['北京 东城区', '北京', '北京', '湖北 武汉', '北京', '北京', '北京', '北京', '北京', '北京 朝阳区', '北京', '北京', '湖北', '湖北', '湖北 武汉', '北京', '四川', '湖北 武汉', '湖北 武汉', '陕西 西安', '北京', '湖北 武汉', '北京 朝阳区', '广东', '北京', '湖北', '湖北 武汉', '北京', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '湖北 武汉', '河南', '湖北 武汉', '北京 东城区', '北京', '天津', '其他', '广东 深圳', '湖北 武汉', '北京 朝阳区', '北京', '四川', '湖北 武汉', '湖北 武汉', '北京 西城区', '湖北 襄阳', '湖北', '北京', '北京', '北京', '湖北', '北京', '广东', '北京', '湖北 武汉', '湖北 武汉', '北京', '北京', '湖北 武汉', '其他', '北京', '北京', '北京', '北京 朝阳区', '北京', '广东 广州', '福建 福州', '湖北 武汉', '湖南 岳阳', '湖北 武汉', '其他', '湖北 武汉', '北京 朝阳区', '湖北 武汉', '北京', 
'北京', '北京', '湖北', '海外 美国', '湖北', '浙江 杭州', '北京', '湖北', '湖北 武汉', '湖北 武汉', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '北京', '重庆 沙坪坝区', '湖北 武汉', '湖北 武汉', '浙江 温州', '其他', '其他', '湖北 黄冈', '云南', '湖北 武汉', '湖南 长沙', '其他', '湖北 武汉', '湖北 武汉', '北京', '重庆', '福建', '湖北 武汉', '湖北 武汉', '湖北 武汉', '北京', '湖北 武汉', '湖北', '湖北 武汉', '其他', '湖北', '湖北 武汉', '湖北 武汉', '海外 其他', '湖北', '北京', '湖北 武汉', '北京', '北京', '海外', '北京', '湖北', '湖北 荆州', '北京', '北京', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '未知', '安徽 池州', '湖北 武汉', '其他', '湖北 武汉', '北京', '湖南', '湖北 武汉', '湖北', '湖北 武汉', '湖北', '湖北 宜昌', '安徽 池州', '湖北 武汉', '湖北 武汉', '湖北 武汉', '湖北', '北京', '湖北 武汉', '湖北', '香港', '安徽 池州', '湖南 长沙', '海外', '安徽 池州', '湖北 武汉', '北京', '湖北', '其他', '湖北 武汉', '福建 福州', '福建 福州', '湖北 武汉', '福建 福州', '重庆', '陕西', '湖北 武汉', '北京', '湖北 武汉', '重庆', '湖北 武汉', '山东 青岛', '北京', '北京 西城区', '湖北 武汉', '天津', '山东', '天津', '重庆 沙坪坝区', '湖北', '湖北', '湖北', '北京', '浙江', '四川 成都', '广东', '湖北 武汉', '未知', '福建 福州', '浙江 宁波', '北京', '湖北 武汉', '北京', '福建 福州', '山西 太原', '福建 福州', '江苏 苏州', '湖北 武汉', '北京', '湖北', '北京', '浙江', '浙江 宁波', '其他', '湖南', '北京', '湖北 武汉', '湖南', '北京', '湖北', '湖北', '北京', '其他', '河南 洛阳', '湖北', '未知', '北京', '湖南', '海外 英国', '河南 焦作', '湖北 武汉', '湖北 武汉', '江苏 南京', '湖北', '湖北', '北京', '北京', '江苏 南京', '湖北 武汉', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '其他', '重庆', '湖北 武汉', '其他', '湖北 武汉', '海外', '湖北 武汉', '其他', '海外 英国', '湖北', '海外 沙特阿拉伯', '海外', '北京', '湖北 武汉', '湖南', '湖北 武汉', '湖北', '湖北 武汉', '其他', '湖北', '湖北 武汉', '其他', '湖北 武汉', '未知', '广东 广州', '湖北 武汉', '云南 昆明', '湖北 武汉', '浙江 杭州', '海外', '湖北', '河南 郑州', '安徽 池州', '未知', '上海 徐汇区', '福建 福州', '海外 英国', '其他', '北京', '其他', '其他', '福建 福州', '湖北 宜昌', '上海', '其他', '湖北 武汉', '江西', '江苏', '北京', '北京', '湖南', '北京', '其他', '重庆', '湖北 武汉', '其他', '福建 福州', '其他', '陕西', '其他', '辽宁', '其他', '广东', '其他', '上海', '陕西 西安', '湖北 武汉', '湖北', '其他', '湖北 武汉', '湖北 武汉', '江苏 苏州', '福建 福州', '湖北 武汉', '陕西 西安', '北京', '福建 福州', '湖北 武汉', '江西 九江', '山东 济宁', '湖北 武汉', '北京', '湖北 武汉', '广东', '四川 成都', '福建 福州', '福建 福州', '湖南', '上海 徐汇区', '海外 菲律宾', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '其他', '湖北 武汉', '湖北', '安徽 池州', '湖北', '四川 泸州', '湖北', '湖北 武汉', '湖南 岳阳', '重庆', '湖北 
武汉', '湖北 武汉', '湖北 武汉', '湖北 武汉', '青海 西宁', '未知', '江苏 苏州', '北京', '其他']\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "print(dataDict['所在地'])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### 读入数据" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 10, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "originData={}\n", 228 | "retweetData={}\n", 229 | "\n", 230 | "labels=[0 for i in range(4)]\n", 231 | "cnLabels={}\n", 232 | "\n", 233 | "labels[1]='reposts_count'\n", 234 | "labels[2]='comments_count'\n", 235 | "labels[3]='attitudes_count'\n", 236 | "labels[0]='follow_count'\n", 237 | "\n", 238 | "cnLabels[labels[1]]='转发量'\n", 239 | "cnLabels[labels[2]]='评论量'\n", 240 | "cnLabels[labels[3]]='点赞量'\n", 241 | "cnLabels[labels[0]]='粉丝量'\n", 242 | "\n", 243 | "\n", 244 | "for i in range(4):\n", 245 | " #print(choice+label)\n", 246 | " try:\n", 247 | " #originData[cnLabels[i]]=np.asarray(dataDict.get('原文'+labels[i])[::-1])\n", 248 | " retweetData[labels[i]]=np.asarray(dataDict.get('转发'+labels[i])[::-1])\n", 249 | " except:\n", 250 | " pass\n", 251 | " try:\n", 252 | " originData[labels[i]]=np.asarray(dataDict.get('原文'+labels[i])[::-1])\n", 253 | " #retweetData[cnLabels[i]]=np.asarray(dataDict.get('转发'+labels[i])[::-1])\n", 254 | " except:\n", 255 | " pass\n", 256 | "\n" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 11, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "txt=''\n", 266 | "for text in (dataDict['转发text']):\n", 267 | " txt +=text\n" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 12, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "choice='转发'#'原文'#\n", 277 | "\n", 278 | "txt=''\n", 279 | "for text in (dataDict[choice+'text']):\n", 280 | " txt +=text" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 13, 286 | "metadata": {}, 287 | 
"outputs": [ 288 | { 289 | "name": "stderr", 290 | "output_type": "stream", 291 | "text": [ 292 | "Building prefix dict from the default dictionary ...\n", 293 | "Loading model from cache C:\\Users\\cascara\\AppData\\Local\\Temp\\jieba.cache\n", 294 | "Loading model cost 0.932 seconds.\n", 295 | "Prefix dict has been built successfully.\n" 296 | ] 297 | }, 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "" 302 | ] 303 | }, 304 | "execution_count": 13, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "import jieba\n", 311 | "import numpy as np\n", 312 | "import wordcloud\n", 313 | "from PIL import Image\n", 314 | "import os\n", 315 | "#fc = open(\"threekingdoms.txt\", \"r\", encoding='utf-8')\n", 316 | "#fc=fc.read()\n", 317 | "\n", 318 | "savePath=addrPath(tweeter)+'词云/'\n", 319 | "\n", 320 | "if os.path.exists(savePath) is False:\n", 321 | " os.mkdir(savePath)\n", 322 | "\n", 323 | "\n", 324 | "fc=txt\n", 325 | "ct=0\n", 326 | "words=jieba.lcut(fc)\n", 327 | "'''for word in words:\n", 328 | " ct+=1\n", 329 | " print(word)\n", 330 | " if ct==3:\n", 331 | " break'''\n", 332 | "\n", 333 | "article=' '.join(words)\n", 334 | "\n", 335 | "\n", 336 | "mask = np.array(Image.open(\"huakegatemask.jpg\"))\n", 337 | "\n", 338 | "\n", 339 | "c = wordcloud.WordCloud( \\\n", 340 | " mask=mask,\n", 341 | " width = 1000, height = 700,\\\n", 342 | " background_color = \"white\",\n", 343 | " font_path = \"msyh.ttc\" \n", 344 | " )\n", 345 | "#c=wordcloud.WordCloud(font_path = \"msyh.ttc\" )\n", 346 | "c.generate(article)\n", 347 | "#print(c)\n", 348 | "c.to_file(savePath+choice+'词云图.png')\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 34, 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "name": "stderr", 358 | "output_type": "stream", 359 | "text": [ 360 | "c:\\users\\cascara\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\ipykernel_launcher.py:36: 
MatplotlibDeprecationWarning: \n", 361 | "The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. Use 'density' instead.\n" 362 | ] 363 | }, 364 | { 365 | "data": { 366 | "image/png": "\n", 367 | "text/plain": [ 368 | "
" 369 | ] 370 | }, 371 | "metadata": {}, 372 | "output_type": "display_data" 373 | } 374 | ], 375 | "source": [ 376 | "import random\n", 377 | "import math\n", 378 | "import matplotlib.pyplot as plt\n", 379 | "import seaborn as sns\n", 380 | "import numpy as np\n", 381 | "\n", 382 | "%matplotlib inline\n", 383 | "sns.set_style('darkgrid')\n", 384 | "plt.rcParams['figure.figsize'] = (12, 8)\n", 385 | "\n", 386 | "\n", 387 | "def AceeptReject(split_val):\n", 388 | " global c\n", 389 | " global power\n", 390 | " while True:\n", 391 | " x = random.uniform(0, 1)\n", 392 | " y = random.uniform(0, 1)\n", 393 | " if y*c <= math.pow(x - split_val, power):\n", 394 | " return x\n", 395 | "\n", 396 | "power = 4\n", 397 | "t = 0.4 \n", 398 | "sum_ = (math.pow(1-t, power + 1) - math.pow(-t, power + 1)) / (power + 1) #求积分\n", 399 | "x = np.linspace(0, 1, 100)\n", 400 | "#常数值c\n", 401 | "c = 0.6**4/sum_\n", 402 | "cc = [c for xi in x]\n", 403 | "plt.plot(x, cc, '--',label='c*f(x)')\n", 404 | "#目标概率密度函数的值f(x)\n", 405 | "y = [math.pow(xi - t, power)/sum_ for xi in x]\n", 406 | "plt.plot(x, y,label='f(x)')\n", 407 | "#采样10000个点\n", 408 | "samples = []\n", 409 | "for i in range(10000):\n", 410 | " samples.append(AceeptReject(t))\n", 411 | "plt.hist(samples, bins=50, normed=True,label='sampling')\n", 412 | "plt.legend()\n", 413 | "plt.show()" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [] 422 | } 423 | ], 424 | "metadata": { 425 | "kernelspec": { 426 | "display_name": "Python 3", 427 | "language": "python", 428 | "name": "python3" 429 | }, 430 | "language_info": { 431 | "codemirror_mode": { 432 | "name": "ipython", 433 | "version": 3 434 | }, 435 | "file_extension": ".py", 436 | "mimetype": "text/x-python", 437 | "name": "python", 438 | "nbconvert_exporter": "python", 439 | "pygments_lexer": "ipython3", 440 | "version": "3.7.4" 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 
4 445 | } 446 | -------------------------------------------------------------------------------- /fansMap/.ipynb_checkpoints/geo_example-checkpoint.py: -------------------------------------------------------------------------------- 1 | from pyecharts import options as opts 2 | from pyecharts.charts import Geo, Page 3 | from pyecharts.faker import Collector, Faker 4 | from pyecharts.globals import ChartType, SymbolType 5 | 6 | C = Collector() 7 | 8 | 9 | @C.funcs 10 | def geo_base() -> Geo: 11 | c = ( 12 | Geo() 13 | .add_schema(maptype="china") 14 | .add("geo", [list(z) for z in zip(Faker.provinces, Faker.values())]) 15 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 16 | .set_global_opts( 17 | visualmap_opts=opts.VisualMapOpts(), 18 | title_opts=opts.TitleOpts(title="Geo-基本示例"), 19 | ) 20 | ) 21 | return c 22 | 23 | 24 | @C.funcs 25 | def geo_visualmap_piecewise() -> Geo: 26 | c = ( 27 | Geo() 28 | .add_schema(maptype="china") 29 | .add("geo", [list(z) for z in zip(Faker.provinces, Faker.values())]) 30 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 31 | .set_global_opts( 32 | visualmap_opts=opts.VisualMapOpts(is_piecewise=True), 33 | title_opts=opts.TitleOpts(title="Geo-VisualMap(分段型)"), 34 | ) 35 | ) 36 | return c 37 | 38 | 39 | @C.funcs 40 | def geo_effectscatter() -> Geo: 41 | c = ( 42 | Geo() 43 | .add_schema(maptype="china") 44 | .add( 45 | "geo", 46 | [list(z) for z in zip(Faker.provinces, Faker.values())], 47 | type_=ChartType.EFFECT_SCATTER, 48 | ) 49 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 50 | .set_global_opts(title_opts=opts.TitleOpts(title="Geo-EffectScatter")) 51 | ) 52 | return c 53 | 54 | 55 | @C.funcs 56 | def geo_heatmap() -> Geo: 57 | c = ( 58 | Geo() 59 | .add_schema(maptype="china") 60 | .add( 61 | "geo", 62 | [list(z) for z in zip(Faker.provinces, Faker.values())], 63 | type_=ChartType.HEATMAP, 64 | ) 65 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 66 | .set_global_opts( 67 | 
visualmap_opts=opts.VisualMapOpts(), 68 | title_opts=opts.TitleOpts(title="Geo-HeatMap"), 69 | ) 70 | ) 71 | return c 72 | 73 | 74 | @C.funcs 75 | def geo_guangdong() -> Geo: 76 | c = ( 77 | Geo() 78 | .add_schema(maptype="广东") 79 | .add( 80 | "geo", 81 | [list(z) for z in zip(Faker.guangdong_city, Faker.values())], 82 | type_=ChartType.HEATMAP, 83 | ) 84 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 85 | .set_global_opts( 86 | visualmap_opts=opts.VisualMapOpts(), 87 | title_opts=opts.TitleOpts(title="Geo-广东地图"), 88 | ) 89 | ) 90 | return c 91 | 92 | 93 | @C.funcs 94 | def geo_lines() -> Geo: 95 | c = ( 96 | Geo() 97 | .add_schema(maptype="china") 98 | .add( 99 | "", 100 | [("广州", 55), ("北京", 66), ("杭州", 77), ("重庆", 88)], 101 | type_=ChartType.EFFECT_SCATTER, 102 | color="white", 103 | ) 104 | .add( 105 | "geo", 106 | [("广州", "上海"), ("广州", "北京"), ("广州", "杭州"), ("广州", "重庆")], 107 | type_=ChartType.LINES, 108 | effect_opts=opts.EffectOpts( 109 | symbol=SymbolType.ARROW, symbol_size=6, color="blue" 110 | ), 111 | linestyle_opts=opts.LineStyleOpts(curve=0.2), 112 | ) 113 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 114 | .set_global_opts(title_opts=opts.TitleOpts(title="Geo-Lines")) 115 | ) 116 | return c 117 | 118 | 119 | @C.funcs 120 | def geo_lines_background() -> Geo: 121 | c = ( 122 | Geo() 123 | .add_schema( 124 | maptype="china", 125 | itemstyle_opts=opts.ItemStyleOpts(color="#323c48", border_color="#111"), 126 | ) 127 | .add( 128 | "", 129 | [("广州", 55), ("北京", 66), ("杭州", 77), ("重庆", 88)], 130 | type_=ChartType.EFFECT_SCATTER, 131 | color="white", 132 | ) 133 | .add( 134 | "geo", 135 | [("广州", "上海"), ("广州", "北京"), ("广州", "杭州"), ("广州", "重庆")], 136 | type_=ChartType.LINES, 137 | effect_opts=opts.EffectOpts( 138 | symbol=SymbolType.ARROW, symbol_size=6, color="blue" 139 | ), 140 | linestyle_opts=opts.LineStyleOpts(curve=0.2), 141 | ) 142 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 143 | 
.set_global_opts(title_opts=opts.TitleOpts(title="Geo-Lines-background")) 144 | ) 145 | return c 146 | 147 | 148 | Page().add(*[fn() for fn, _ in C.charts]).render() 149 | -------------------------------------------------------------------------------- /fansMap/acquaireData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "@[TOC]" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 库" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import csv\n", 25 | "import requests\n", 26 | "import xlwt\n", 27 | "import re\n", 28 | "import json\n", 29 | "import time" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### 配置" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 3, 42 | "metadata": { 43 | "jupyter": { 44 | "source_hidden": true 45 | } 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "#根据个人浏览器信息进行修改\n", 50 | "headers = {\n", 51 | " 'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36'\n", 52 | " ,\n", 53 | " 'Cookie': '_T_WM=67706607048; WEIBOCN_FROM=1110006030; ALF=1582777481; SCF=AqQddu0eGCw6Wh1xPsTyigWBFJH-P0ACsyLUFzNakys5tF6kBCjVpv4O6BDEGM4gShv5JHfuyjMoLBKfT5-Xwsc.; SUB=_2A25zK8jDDeRhGeNP41UT9yjIyj6IHXVQ1-iLrDV6PUJbktAKLUHSkW1NTk4PgJoxaitdQXaQL6znAIMdvJJs4-5l; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5q.Hx0pIs7PKpACzdnFYSZ5JpX5K-hUgL.Fo-p1hMES0qXeKz2dJLoIpUeBc8EdFH8SC-4BbHFSFH81F-RSF-4Sntt; SUHB=0qjEKc2Va_YMLH; SSOLoginState=1580185747; MLOGIN=1; XSRF-TOKEN=607e98; 
M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304135671786192_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D2304135671786192_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011'\n", 54 | " #'ALF=1581501545; _T_WM=67706607048; H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2Fapi%2Fcomments%2Fshow%3Fid%3DIr5j4iRXW%26page%3D3; XSRF-TOKEN=11216a; WEIBOCN_FROM=1110006030; MLOGIN=1; SSOLoginState=1580006602; SCF=AqQddu0eGCw6Wh1xPsTyigWBFJH-P0ACsyLUFzNakys5zFt06rZeA1gEI0iP7HfWxZntbpMr8WTWhrxEdSVGB58.; SUB=_2A25zKIyaDeRhGeNP41UT9yjIyj6IHXVQ0hTSrDV6PUJbktAKLRL-kW1NTk4PgHLYgtoeuxFzuGDIDcybzoEoXvq9; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5q.Hx0pIs7PKpACzdnFYSZ5JpX5KzhUgL.Fo-p1hMES0qXeKz2dJLoIpUeBc8EdFH8SC-4BbHFSFH81F-RSF-4Sntt; SUHB=0IIlrfWMMkVsTI; M_WEIBOCN_PARAMS=uicode%3D20000174'\n", 55 | "}" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#文件保存地址\n", 65 | "addrRoot='C:/Users/cascara/Desktop/seedcup/csv/blog/fans/'" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 5, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "#是否获取转发者具体个人信息\n", 75 | "getConcreteInfoList=True#False#True#\n", 76 | "isLogin=False#True#True\n", 77 | "\n", 78 | "#是否登入采集个人信息\n", 79 | "\n", 80 | "#无信息打印字符\n", 81 | "infoNoExistStr='未知'\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "#是否处理微博文本内容\n", 91 | "processText = True" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### 构造表格,采集数据内容(修改这里获取想要的信息)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "raw", 103 | "metadata": {}, 104 | "source": [ 105 | "博主的信息单独收集:转发的:转发reposts_count、评论comments_count、点赞数量attitudes_count、粉丝数量followers_count\n", 106 | "\n", 107 | " 原始的retweeted_status:转发reposts_count、评论comments_count、点赞数量attitudes_count\n", 108 | " 原始用户的user:用户名screen_name、id、粉丝数量followers_count" 109 | ] 110 | }, 
111 | { 112 | "cell_type": "raw", 113 | "metadata": {}, 114 | "source": [ 115 | "获取个人具体信息范围、排列" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 7, 121 | "metadata": { 122 | "jupyter": { 123 | "source_hidden": true 124 | } 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "#获取个人具体信息范围、排列\n", 129 | "infoRangeDict={\n", 130 | "'性别':True,\n", 131 | "'所在地':True,\n", 132 | " \n", 133 | "'生日':False,\n", 134 | "'家乡':False,\n", 135 | "'公司':True,\n", 136 | "'大学':True,\n", 137 | " \n", 138 | "'昵称':False,\n", 139 | "'简介':False,\n", 140 | "'注册时间':False,\n", 141 | "'阳光信用':False,\n", 142 | " \n", 143 | " #若无信息显示\n", 144 | "'infoNoExist':'未知'\n", 145 | "}\n", 146 | "\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "raw", 151 | "metadata": {}, 152 | "source": [ 153 | "获取博主信息范围、排列" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 8, 159 | "metadata": { 160 | "jupyter": { 161 | "source_hidden": true 162 | } 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "#获取博主信息范围、排列\n", 167 | "userRangeDict={\n", 168 | "'id':True,# 1323527941\n", 169 | "'screen_name': True,#\"Vista看天下\"\n", 170 | " \n", 171 | "'profile_image_url': False,#\"https://tva2.sinaimg.cn/crop.0.0.180.180.180/4ee36f05jw1e8qgp5bmzyj2050050aa8.jpg?KID=imgbed,tva&Expires=1580290462&ssig=xPIoKDRR56\"\n", 172 | "'profile_url':False,# \"https://m.weibo.cn/u/1323527941?uid=1323527941&luicode=10000011&lfid=1076031323527941\"\n", 173 | "'statuses_count': False,#微博数 77256\n", 174 | "'verified': False,#true\n", 175 | "'verified_type':False,# 3\n", 176 | "'verified_type_ext': False,#0\n", 177 | "'verified_reason': False,#\"《Vista看天下》官方微博\"\n", 178 | "'close_blue_v': False,#false\n", 179 | " \n", 180 | "'description': True,#\"一个有趣的蓝V\"\n", 181 | "'gender': True,# \"m\"\n", 182 | " \n", 183 | "'mbtype': False,#12\n", 184 | "'urank': False,#48\n", 185 | "'mbrank': False,#6\n", 186 | "'follow_me':False,# false\n", 187 | "'following':False,# false\n", 188 | " \n", 189 | 
"'followers_count': True,#19657897\n", 190 | "'follow_count': True,#1809\n", 191 | " \n", 192 | "'cover_image_phone': False,#\"https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg\"\n", 193 | "'avatar_hd': False,#\"https://ww2.sinaimg.cn/orj480/4ee36f05jw1e8qgp5bmzyj2050050aa8.jpg\"\n", 194 | "'like': False,#false\n", 195 | "'like_me': False,#false\n", 196 | "'badge': False,#{enterprise: 1, gongyi_level: 1, bind_taobao: 1, dzwbqlx_2016: 1, follow_whitelist_video: 1,…}\n", 197 | " \n", 198 | "#若无信息显示\n", 199 | "'infoNoExist':'未知'\n", 200 | "}\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### 文件命名" 208 | ] 209 | }, 210 | { 211 | "cell_type": "raw", 212 | "metadata": {}, 213 | "source": [ 214 | "使用示例:\n", 215 | "tweeter='王'\n", 216 | "fp = open(addrFile(tweeter),'w+',newline='',encoding='utf-16')\n", 217 | "fp.close()\n", 218 | "\n", 219 | "使用库函数:\n", 220 | "os" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 9, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "def addrFile(tweeter,suffix):\n", 230 | " path=addrRoot+str(tweeter)+'/'\n", 231 | " if os.path.exists(path) is False:\n", 232 | " os.makedirs(path)\n", 233 | " address=path+tweeter+suffix+'.csv'\n", 234 | " return address " 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### 生成信息标题" 242 | ] 243 | }, 244 | { 245 | "cell_type": "raw", 246 | "metadata": { 247 | "jupyter": { 248 | "source_hidden": true 249 | } 250 | }, 251 | "source": [ 252 | "将字典Dict中为True的条目生成标题,加前缀prefix\n", 253 | "\n", 254 | "使用实例:\n", 255 | "print(getInfoTitle(blogRangeDict,'原文'))\n", 256 | "打印结果:\n", 257 | "['原文created_at', '原文text', '原文reposts_count', '原文comments_count', '原文attitudes_count']" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 10, 263 | "metadata": { 264 | "jupyter": { 265 | "source_hidden": true 266 | } 267 | 
}, 268 | "outputs": [], 269 | "source": [ 270 | "def getInfoTitle(Dict,prefix):\n", 271 | " titleList=[]\n", 272 | " for item in Dict:\n", 273 | " if(Dict.get(item) is True):\n", 274 | " titleList.append(prefix+item)\n", 275 | " return (titleList)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "## 工具类,用来去除爬取的正文中一些不需要的链接、标签等" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 11, 288 | "metadata": { 289 | "jupyter": { 290 | "source_hidden": true 291 | } 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "#工具类,用来去除爬取的正文中一些不需要的链接、标签等\n", 296 | "class Tool:\n", 297 | " deleteImg = re.compile('')\n", 298 | " newLine =re.compile('|
||
')\n", 299 | " deleteAite = re.compile('//.*?:')\n", 300 | " deleteAddr = re.compile('.*?
|')\n", 302 | " deleteWord = re.compile('回复@|回覆@|回覆|回复')\n", 303 | " \n", 304 | " @classmethod\n", 305 | " def replace(cls,x):\n", 306 | " x = re.sub(cls.deleteWord,'',x)\n", 307 | " x = re.sub(cls.deleteImg,'',x)\n", 308 | " x = re.sub(cls.deleteAite,'',x)\n", 309 | " x = re.sub(cls.deleteAddr, '', x)\n", 310 | " x = re.sub(cls.newLine,'',x)\n", 311 | " x = re.sub(cls.deleteTag,'',x)\n", 312 | " return x.strip()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "### 构造粉丝界面的url" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 12, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "###某微博账户的全部微博内容\n", 329 | "def contentURL(id,pages):\n", 330 | " i=0\n", 331 | " urls=[]\n", 332 | " for page in pages:\n", 333 | " if page not in [0,1]:\n", 334 | " urls+=['https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_'+str(id)+'&page='+str(page)]\n", 335 | "\n", 336 | " return urls" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 13, 342 | "metadata": { 343 | "jupyter": { 344 | "source_hidden": true 345 | } 346 | }, 347 | "outputs": [], 348 | "source": [ 349 | "#将字典类型的信息格式传递为需要的信息列表\n", 350 | "def getInfoList(infoDict,rangeDict):\n", 351 | " infoList=[]\n", 352 | " for item in rangeDict:\n", 353 | " if rangeDict.get(item) is True:\n", 354 | " content=infoDict.get(item)\n", 355 | " if content is not None:\n", 356 | " #处理微博文本内容 \n", 357 | " if item =='text':\n", 358 | " if processText is True:\n", 359 | " content=Tool.replace(content)\n", 360 | " infoList.append(content) \n", 361 | " else:\n", 362 | " infoList.append(rangeDict['infoNoExist'])\n", 363 | " return infoList" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "### 统计所有的粉丝数" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": 25, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | 
"###在已有的一系列urls中进行操作\n", 380 | "###筛选出微博转发内容进行操作\n", 381 | "def reRatio(urls,csvWriter):\n", 382 | " notEnd= True\n", 383 | " \n", 384 | " fansUserTitle=getInfoTitle(userRangeDict,'')#粉丝信息标题\n", 385 | " infoTitle=getInfoTitle(infoRangeDict,'')#原文博主个人主页信息标题\n", 386 | " \n", 387 | " #写表格的标题\n", 388 | " if getConcreteInfoList is True: \n", 389 | " csvWriter.writerow(fansUserTitle+infoTitle) \n", 390 | " else:\n", 391 | " csvWriter.writerow(fansUserTitle)\n", 392 | " \n", 393 | " for url in urls: \n", 394 | " \n", 395 | " response = requests.get(url,headers=headers)\n", 396 | " resjson = json.loads(response.text) \n", 397 | " \n", 398 | " if resjson['ok'] ==0:\n", 399 | " print(url)\n", 400 | " notEnd=False\n", 401 | " break \n", 402 | " \n", 403 | " cards=resjson['data']['cards'] \n", 404 | " \n", 405 | " if(len(cards)==1):\n", 406 | " try: \n", 407 | " cards=cards[0]['card_group']\n", 408 | " except:\n", 409 | " print(url)\n", 410 | " print(cards)\n", 411 | " notEnd=False\n", 412 | " break \n", 413 | " \n", 414 | " #遍历一个页面的所有微博 \n", 415 | " for card in cards:\n", 416 | " try:\n", 417 | " #fans个人信息\n", 418 | " \n", 419 | " fansUserInfoDict=card['user'] \n", 420 | " infoList=[] \n", 421 | "\n", 422 | "\n", 423 | " #原文博主数据\n", 424 | " fansUserInfoList=getInfoList(fansUserInfoDict,userRangeDict) \n", 425 | " infoList+=fansUserInfoList \n", 426 | "\n", 427 | "\n", 428 | " fansUserID=fansUserInfoDict['id']\n", 429 | "\n", 430 | " #fansUserID为粉丝账号的ID \n", 431 | " #可在此对id进行信息采集 \n", 432 | "\n", 433 | " if getConcreteInfoList is True:\n", 434 | " infoDict=getInfo(isLogin,fansUserID)\n", 435 | " otherInfoList=getInfoList(infoDict,infoRangeDict) \n", 436 | " infoList+=otherInfoList \n", 437 | " #print(infoList)\n", 438 | " #保存数据至csv\n", 439 | " csvWriter.writerow(infoList) \n", 440 | "\n", 441 | " #不断获取该博主对的影响力\n", 442 | " #break\n", 443 | " except:\n", 444 | " pass\n", 445 | "\n", 446 | " #延时,防止反爬\n", 447 | " time.sleep(3)\n", 448 | " \n", 449 | " return notEnd" 450 | ] 451 | 
}, 452 | { 453 | "cell_type": "markdown", 454 | "metadata": {}, 455 | "source": [ 456 | "### 获取个人主页中信息" 457 | ] 458 | }, 459 | { 460 | "cell_type": "raw", 461 | "metadata": { 462 | "jupyter": { 463 | "source_hidden": true 464 | } 465 | }, 466 | "source": [ 467 | "使用示例:\n", 468 | "response = requests.get(url)\n", 469 | "txt=response.text\n", 470 | "print(drillInfo(txt))\n", 471 | "\n", 472 | "结果如下:\n", 473 | "{'昵称': '甘肃华熙文化',\n", 474 | " '简介': '【马丛珊.禅绣艺术,世界纹绣大师学院甘肃分院】服务生命之美;践行匠心为本,艺心创造,慈心发扬校训,微信mashan5374,☎13109439909',\n", 475 | " '性别': '女',\n", 476 | " '所在地': '甘肃 兰州'}" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 15, 482 | "metadata": { 483 | "jupyter": { 484 | "source_hidden": true 485 | } 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "def drillInfo(txt):\n", 490 | " keyInfo={}\n", 491 | " \n", 492 | " try: \n", 493 | " resjson = json.loads(txt) \n", 494 | " infodata = resjson.get('data')\n", 495 | " cards = infodata.get('cards')\n", 496 | " for l in range(0,len(cards)):\n", 497 | " temp = cards[l]\n", 498 | " card_group = temp.get('card_group') \n", 499 | " #判断获取信息类型 \n", 500 | " for card in card_group: \n", 501 | " #将信息传入字典\n", 502 | " name=card.get('item_name')\n", 503 | " if name is not None:\n", 504 | " content=card.get('item_content')\n", 505 | " keyInfo[name]=content \n", 506 | " except:\n", 507 | " pass\n", 508 | " return keyInfo" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "### 构建通过id访问个人主页的url" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 16, 521 | "metadata": { 522 | "jupyter": { 523 | "source_hidden": true 524 | } 525 | }, 526 | "outputs": [], 527 | "source": [ 528 | "def infoUrl(id):\n", 529 | " url = \"https://m.weibo.cn/api/container/getIndex?containerid=230283\"+str(id)+\"_-_INFO\" \n", 530 | " return url" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "## 
爬取某id博主的个人信息" 538 | ] 539 | }, 540 | { 541 | "cell_type": "raw", 542 | "metadata": { 543 | "jupyter": { 544 | "source_hidden": true 545 | } 546 | }, 547 | "source": [ 548 | "为防止反复爬取,将原文整体保存为文件,格式为 信息卡片长度(2 or 5)+id+博主id\n", 549 | "不登录2含有性别、所在地\n", 550 | "登录5含有性别、所在地、星座、大学、公司等完整信息\n", 551 | "若存在所需文件,则从文件读取信息,否则爬取,同时保存文件\n", 552 | "\n", 553 | "若爬取未成功,返回-1\n", 554 | "\n", 555 | "使用库函数:\n", 556 | "os" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": 17, 562 | "metadata": { 563 | "jupyter": { 564 | "source_hidden": true 565 | } 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "def getInfo(state,id):\n", 570 | " \n", 571 | " address=addrRoot+'info/'+str(state)+'id'+str(id)+'.txt'\n", 572 | " path=addrRoot+'info/'\n", 573 | " if os.path.exists(path) is False:\n", 574 | " os.makedirs(path)\n", 575 | " try:\n", 576 | " #已有文件\n", 577 | " if(os.path.exists(address)==True):\n", 578 | " fp = open(address,'r',encoding='utf-16')\n", 579 | " txt=fp.read()\n", 580 | " info=drillInfo(txt)\n", 581 | " fp.close()\n", 582 | " else: \n", 583 | " fp = open(address,'w+',encoding='utf-16')\n", 584 | " url=infoUrl(id)\n", 585 | " if state is True:\n", 586 | " response = requests.get(url,headers=headers)\n", 587 | " else:\n", 588 | " response = requests.get(url)\n", 589 | " txt=response.text\n", 590 | " fp.write(response.text) \n", 591 | " info=drillInfo(txt)\n", 592 | " fp.close()\n", 593 | " except:\n", 594 | " info=-1 \n", 595 | " \n", 596 | " return info" 597 | ] 598 | }, 599 | { 600 | "cell_type": "raw", 601 | "metadata": {}, 602 | "source": [ 603 | "获取特定个人信息" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": 18, 609 | "metadata": { 610 | "jupyter": { 611 | "source_hidden": true 612 | } 613 | }, 614 | "outputs": [], 615 | "source": [ 616 | "def getExatInfo(item,state,id):\n", 617 | " info=getInfo(state,id)\n", 618 | " content=info.get(item)\n", 619 | " if content is not None:\n", 620 | " return content\n", 621 | " else:\n", 622 
| " return infoNoExistStr " 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 19, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "### 构造热门界面访问" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 20, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "def downloadData(id):\n", 641 | " tweeter=getExatInfo('昵称',2,int(id))\n", 642 | " batch=0\n", 643 | " while(1):\n", 644 | "\n", 645 | " fileAddr=addrFile(tweeter,'batch'+str(batch))\n", 646 | " if os.path.exists(fileAddr) is True:\n", 647 | " print(fileAddr+'已存在,跳过采集') \n", 648 | " else:\n", 649 | " print('文件将写入:'+fileAddr)\n", 650 | " fp = open(fileAddr,'w+',newline='',encoding='utf-16')\n", 651 | " writer=csv.writer(fp)\n", 652 | " if reRatio(contentURL(id,range(20*batch,20*(batch+1))),writer) is False:\n", 653 | " fp.close()\n", 654 | " break\n", 655 | "\n", 656 | " fp.close()\n", 657 | " print('第'+str(batch)+'批数据已记录完毕')\n", 658 | " batch+=1" 659 | ] 660 | }, 661 | { 662 | "cell_type": "raw", 663 | "metadata": {}, 664 | "source": [ 665 | "\n", 666 | "#陈赫\n", 667 | "id=1574684061\n", 668 | "#MorningGlory_肖战资源博\n", 669 | "id=5735501478\n", 670 | "\n", 671 | "#靳东\n", 672 | "id=1093897112\n", 673 | "#李健\n", 674 | "id=1744395855\n", 675 | "\n", 676 | "#干部\n", 677 | "id=6472269230\n", 678 | "\n", 679 | "#陶勇\n", 680 | "id=5899876484\n", 681 | "\n", 682 | "#姚晨\n", 683 | "id=1266321801\n", 684 | "\n", 685 | "#鞠婧祎\n", 686 | "id=3669102477\n", 687 | "\n", 688 | "#韩红\n", 689 | "#id=1922542315\n", 690 | "\n", 691 | "\n", 692 | "#穿帮君\n", 693 | "id=5671786192\n", 694 | "\n", 695 | "#汉堡爸爸\n", 696 | "id=2784421224\n", 697 | "\n", 698 | "#蔡徐坤\n", 699 | "\n", 700 | "id=1776448504\n", 701 | "\n", 702 | "\n", 703 | "#林书豪\n", 704 | "id=2106855375\n", 705 | "\n", 706 | "#干部\n", 707 | "id=6472269230\n", 708 | "\n", 709 | "#任嘉伦\n", 710 | "id=3800468188\n", 711 | "\n", 712 | "#肖战\n", 713 | "id=1792951112\n", 714 | "\n", 715 | "\n", 716 | "#迪丽热巴\n", 717 | 
"id=1669879400\n", 718 | "\n", 719 | "\n", 720 | "#科比\n", 721 | "id=3264072325\n", 722 | "\n", 723 | "#雷军\n", 724 | "1749127163" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 27, 730 | "metadata": {}, 731 | "outputs": [ 732 | { 733 | "name": "stdin", 734 | "output_type": "stream", 735 | "text": [ 736 | "博主id: 1574684061\n" 737 | ] 738 | }, 739 | { 740 | "name": "stdout", 741 | "output_type": "stream", 742 | "text": [ 743 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/fans/陈赫/陈赫batch0.csv\n", 744 | "https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_1574684061&page=11\n" 745 | ] 746 | } 747 | ], 748 | "source": [ 749 | "id=input('博主id:')\n", 750 | "\n", 751 | "downloadData(id)" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [] 760 | } 761 | ], 762 | "metadata": { 763 | "kernelspec": { 764 | "display_name": "Python 3", 765 | "language": "python", 766 | "name": "python3" 767 | }, 768 | "language_info": { 769 | "codemirror_mode": { 770 | "name": "ipython", 771 | "version": 3 772 | }, 773 | "file_extension": ".py", 774 | "mimetype": "text/x-python", 775 | "name": "python", 776 | "nbconvert_exporter": "python", 777 | "pygments_lexer": "ipython3", 778 | "version": "3.7.4" 779 | } 780 | }, 781 | "nbformat": 4, 782 | "nbformat_minor": 4 783 | } 784 | -------------------------------------------------------------------------------- /fansMap/geo_example.py: -------------------------------------------------------------------------------- 1 | from pyecharts import options as opts 2 | from pyecharts.charts import Geo, Page 3 | from pyecharts.faker import Collector, Faker 4 | from pyecharts.globals import ChartType, SymbolType 5 | 6 | C = Collector() 7 | 8 | 9 | @C.funcs 10 | def geo_base() -> Geo: 11 | c = ( 12 | Geo() 13 | .add_schema(maptype="china") 14 | .add("geo", [list(z) for z in zip(Faker.provinces, Faker.values())]) 15 
| .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 16 | .set_global_opts( 17 | visualmap_opts=opts.VisualMapOpts(), 18 | title_opts=opts.TitleOpts(title="Geo-基本示例"), 19 | ) 20 | ) 21 | return c 22 | 23 | 24 | @C.funcs 25 | def geo_visualmap_piecewise() -> Geo: 26 | c = ( 27 | Geo() 28 | .add_schema(maptype="china") 29 | .add("geo", [list(z) for z in zip(Faker.provinces, Faker.values())]) 30 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 31 | .set_global_opts( 32 | visualmap_opts=opts.VisualMapOpts(is_piecewise=True), 33 | title_opts=opts.TitleOpts(title="Geo-VisualMap(分段型)"), 34 | ) 35 | ) 36 | return c 37 | 38 | 39 | @C.funcs 40 | def geo_effectscatter() -> Geo: 41 | c = ( 42 | Geo() 43 | .add_schema(maptype="china") 44 | .add( 45 | "geo", 46 | [list(z) for z in zip(Faker.provinces, Faker.values())], 47 | type_=ChartType.EFFECT_SCATTER, 48 | ) 49 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 50 | .set_global_opts(title_opts=opts.TitleOpts(title="Geo-EffectScatter")) 51 | ) 52 | return c 53 | 54 | 55 | @C.funcs 56 | def geo_heatmap() -> Geo: 57 | c = ( 58 | Geo() 59 | .add_schema(maptype="china") 60 | .add( 61 | "geo", 62 | [list(z) for z in zip(Faker.provinces, Faker.values())], 63 | type_=ChartType.HEATMAP, 64 | ) 65 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 66 | .set_global_opts( 67 | visualmap_opts=opts.VisualMapOpts(), 68 | title_opts=opts.TitleOpts(title="Geo-HeatMap"), 69 | ) 70 | ) 71 | return c 72 | 73 | 74 | @C.funcs 75 | def geo_guangdong() -> Geo: 76 | c = ( 77 | Geo() 78 | .add_schema(maptype="广东") 79 | .add( 80 | "geo", 81 | [list(z) for z in zip(Faker.guangdong_city, Faker.values())], 82 | type_=ChartType.HEATMAP, 83 | ) 84 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 85 | .set_global_opts( 86 | visualmap_opts=opts.VisualMapOpts(), 87 | title_opts=opts.TitleOpts(title="Geo-广东地图"), 88 | ) 89 | ) 90 | return c 91 | 92 | 93 | @C.funcs 94 | def geo_lines() -> Geo: 95 | c = ( 96 | 
Geo() 97 | .add_schema(maptype="china") 98 | .add( 99 | "", 100 | [("广州", 55), ("北京", 66), ("杭州", 77), ("重庆", 88)], 101 | type_=ChartType.EFFECT_SCATTER, 102 | color="white", 103 | ) 104 | .add( 105 | "geo", 106 | [("广州", "上海"), ("广州", "北京"), ("广州", "杭州"), ("广州", "重庆")], 107 | type_=ChartType.LINES, 108 | effect_opts=opts.EffectOpts( 109 | symbol=SymbolType.ARROW, symbol_size=6, color="blue" 110 | ), 111 | linestyle_opts=opts.LineStyleOpts(curve=0.2), 112 | ) 113 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 114 | .set_global_opts(title_opts=opts.TitleOpts(title="Geo-Lines")) 115 | ) 116 | return c 117 | 118 | 119 | @C.funcs 120 | def geo_lines_background() -> Geo: 121 | c = ( 122 | Geo() 123 | .add_schema( 124 | maptype="china", 125 | itemstyle_opts=opts.ItemStyleOpts(color="#323c48", border_color="#111"), 126 | ) 127 | .add( 128 | "", 129 | [("广州", 55), ("北京", 66), ("杭州", 77), ("重庆", 88)], 130 | type_=ChartType.EFFECT_SCATTER, 131 | color="white", 132 | ) 133 | .add( 134 | "geo", 135 | [("广州", "上海"), ("广州", "北京"), ("广州", "杭州"), ("广州", "重庆")], 136 | type_=ChartType.LINES, 137 | effect_opts=opts.EffectOpts( 138 | symbol=SymbolType.ARROW, symbol_size=6, color="blue" 139 | ), 140 | linestyle_opts=opts.LineStyleOpts(curve=0.2), 141 | ) 142 | .set_series_opts(label_opts=opts.LabelOpts(is_show=False)) 143 | .set_global_opts(title_opts=opts.TitleOpts(title="Geo-Lines-background")) 144 | ) 145 | return c 146 | 147 | 148 | Page().add(*[fn() for fn, _ in C.charts]).render() 149 | -------------------------------------------------------------------------------- /fansMap/render.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Awesome-pyecharts 6 | 7 | 8 | 9 | 10 | 11 |
12 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /repostWeibo/acquaireData.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "@[TOC]" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 库" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import csv\n", 25 | "import requests\n", 26 | "import xlwt\n", 27 | "import re\n", 28 | "import json\n", 29 | "import time" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### 配置" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "#根据个人浏览器信息进行修改\n", 46 | "headers = {\n", 47 | " 'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36'\n", 48 | " ,\n", 49 | " 'Cookie':'个人浏览器中设置'\n", 50 | " #类似于'xxxxxxxxxxx H5_wentry=H5; backURL=https%3A%2F%2Fm.weibo.cn%2Fapi%2Fcomments%2Fshow%3Fid%3DIr5j4iRXW%26page%3D3; XSRF-TOKEN=11216a; WEIBOCN_FROM=1110006030; MLOGIN=1; SSOLoginState=1580006602; SCF=AqQddu0eGCw6Wh1xPsTyigWBFJH-P0ACsyLUFzNakys5zFt06rZeA1gEI0iP7HfWxZntbpMr8WTWhrxEdSVGB58.; SUB=_2A25zKIyaDeRhGeNP41UT9yjIyj6IHXVQ0hTSrDV6PUJbktAKLRL-kW1NTk4PgHLYgtoeuxFzuGDIDcybzoEoXvq9; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5q.Hx0pIs7PKpACzdnFYSZ5JpX5KzhUgL.Fo-p1hMES0qXeKz2dJLoIpUeBc8EdFH8SC-4BbHFSFH81F-RSF-4Sntt; SUHB=0IIlrfWMMkVsTI; M_WEIBOCN_PARAMS=uicode%3D20000174'\n", 51 | "}" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#文件保存地址\n", 61 | 
"addrRoot='C:/Users/cascara/Desktop/seedcup/csv/blog/'" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "#是否获取转发者具体个人信息\n", 71 | "getConcreteInfoList=True#False#True#\n", 72 | "isLogin=True#False#True\n", 73 | "\n", 74 | "#是否登入采集个人信息\n", 75 | "\n", 76 | "#无信息打印字符\n", 77 | "infoNoExistStr='未知'\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "#是否处理微博文本内容\n", 87 | "processText = True" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "### 构造表格,采集数据内容(修改这里获取想要的信息)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "raw", 99 | "metadata": {}, 100 | "source": [ 101 | "博主的信息单独收集:转发的:转发reposts_count、评论comments_count、点赞数量attitudes_count、粉丝数量followers_count\n", 102 | "\n", 103 | " 原始的retweeted_status:转发reposts_count、评论comments_count、点赞数量attitudes_count\n", 104 | " 原始用户的user:用户名screen_name、id、粉丝数量followers_count" 105 | ] 106 | }, 107 | { 108 | "cell_type": "raw", 109 | "metadata": {}, 110 | "source": [ 111 | "获取个人具体信息范围、排列" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 6, 117 | "metadata": { 118 | "jupyter": { 119 | "source_hidden": true 120 | } 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "#获取个人具体信息范围、排列\n", 125 | "infoRangeDict={\n", 126 | "'性别':True,\n", 127 | "'所在地':True,\n", 128 | " \n", 129 | "'生日':False,\n", 130 | "'家乡':False,\n", 131 | "'公司':True,\n", 132 | "'大学':True,\n", 133 | " \n", 134 | "'昵称':False,\n", 135 | "'简介':False,\n", 136 | "'注册时间':False,\n", 137 | "'阳光信用':False,\n", 138 | " \n", 139 | " #若无信息显示\n", 140 | "'infoNoExist':'未知'\n", 141 | "}\n", 142 | "\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "raw", 147 | "metadata": {}, 148 | "source": [ 149 | "获取博文信息范围、排列" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 7, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 
158 | "#获取博文信息范围、排列\n", 159 | "blogRangeDict={\n", 160 | "'visible': False,#{type: 0, list_id: 0}\n", 161 | " \n", 162 | "'created_at': True,#\"20分钟前\"\n", 163 | " \n", 164 | "'id': False,#\"4466073829119710\"\n", 165 | "'idstr': False,#\"4466073829119710\"\n", 166 | "'mid': False,#\"4466073829119710\"\n", 167 | "'can_edit': False,#false\n", 168 | "'show_additional_indication': False,#0\n", 169 | " \n", 170 | "'text': True,#\"【情况通报】2019年12月31日,武汉市卫健部门发布关于肺炎疫情的情况通报。\n", 171 | " \n", 172 | "'textLength': False,#452\n", 173 | "'source': False,#\"360安全浏览器\"\n", 174 | "'favorited': False,#false\n", 175 | "'pic_types': False,#\"\"\n", 176 | "'is_paid': False,#false\n", 177 | "'mblog_vip_type': False,#0\n", 178 | "'user': False,#{id: 2418542712, screen_name: \"平安武汉\",…}\n", 179 | " \n", 180 | "'reposts_count': True,#1035\n", 181 | "'comments_count': True,#1886\n", 182 | "'attitudes_count': True,#7508\n", 183 | " \n", 184 | "'pending_approval_count': False,#0\n", 185 | "'isLongText': False,#true\n", 186 | "'reward_exhibition_type':False,# 0\n", 187 | "'hide_flag': False,#0\n", 188 | "'mblogtype': False,#0\n", 189 | "'more_info_type': False,#0\n", 190 | "'cardid': False,#\"star_11247_common\"\n", 191 | "'content_auth': False,#0\n", 192 | "'pic_num': False,#0\n", 193 | " \n", 194 | "#若无相关信息,则显示:\n", 195 | "'infoNoExist':'未知'\n", 196 | "}" 197 | ] 198 | }, 199 | { 200 | "cell_type": "raw", 201 | "metadata": {}, 202 | "source": [ 203 | "获取博主信息范围、排列" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 8, 209 | "metadata": { 210 | "jupyter": { 211 | "source_hidden": true 212 | } 213 | }, 214 | "outputs": [], 215 | "source": [ 216 | "#获取博主信息范围、排列\n", 217 | "userRangeDict={\n", 218 | "'id':True,# 1323527941\n", 219 | "'screen_name': True,#\"Vista看天下\"\n", 220 | " \n", 221 | "'profile_image_url': False,#\"https://tva2.sinaimg.cn/crop.0.0.180.180.180/4ee36f05jw1e8qgp5bmzyj2050050aa8.jpg?KID=imgbed,tva&Expires=1580290462&ssig=xPIoKDRR56\"\n", 222 | 
"'profile_url':False,# \"https://m.weibo.cn/u/1323527941?uid=1323527941&luicode=10000011&lfid=1076031323527941\"\n", 223 | "'statuses_count': False,#微博数 77256\n", 224 | "'verified': False,#true\n", 225 | "'verified_type':False,# 3\n", 226 | "'verified_type_ext': False,#0\n", 227 | "'verified_reason': False,#\"《Vista看天下》官方微博\"\n", 228 | "'close_blue_v': False,#false\n", 229 | " \n", 230 | "'description': True,#\"一个有趣的蓝V\"\n", 231 | "'gender': True,# \"m\"\n", 232 | " \n", 233 | "'mbtype': False,#12\n", 234 | "'urank': False,#48\n", 235 | "'mbrank': False,#6\n", 236 | "'follow_me':False,# false\n", 237 | "'following':False,# false\n", 238 | " \n", 239 | "'followers_count': True,#19657897\n", 240 | "'follow_count': True,#1809\n", 241 | " \n", 242 | "'cover_image_phone': False,#\"https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg\"\n", 243 | "'avatar_hd': False,#\"https://ww2.sinaimg.cn/orj480/4ee36f05jw1e8qgp5bmzyj2050050aa8.jpg\"\n", 244 | "'like': False,#false\n", 245 | "'like_me': False,#false\n", 246 | "'badge': False,#{enterprise: 1, gongyi_level: 1, bind_taobao: 1, dzwbqlx_2016: 1, follow_whitelist_video: 1,…}\n", 247 | " \n", 248 | "#若无信息显示\n", 249 | "'infoNoExist':'未知'\n", 250 | "}\n" 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "### 文件命名" 258 | ] 259 | }, 260 | { 261 | "cell_type": "raw", 262 | "metadata": { 263 | "jupyter": { 264 | "source_hidden": true 265 | } 266 | }, 267 | "source": [ 268 | "使用示例:\n", 269 | "tweeter='王'\n", 270 | "fp = open(addrFile(tweeter),'w+',newline='',encoding='utf-16')\n", 271 | "fp.close()\n", 272 | "\n", 273 | "使用库函数:\n", 274 | "os" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 9, 280 | "metadata": { 281 | "jupyter": { 282 | "source_hidden": true 283 | } 284 | }, 285 | "outputs": [], 286 | "source": [ 287 | "def addrFile(tweeter,suffix):\n", 288 | " path=addrRoot+str(tweeter)+'/'\n", 289 | " if 
os.path.exists(path) is False:\n", 290 | " os.makedirs(path)\n", 291 | " address=path+tweeter+suffix+'.csv'\n", 292 | " return address " 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "### 生成信息标题" 300 | ] 301 | }, 302 | { 303 | "cell_type": "raw", 304 | "metadata": { 305 | "jupyter": { 306 | "source_hidden": true 307 | } 308 | }, 309 | "source": [ 310 | "将字典Dict中为True的条目生成标题,加前缀prefix\n", 311 | "\n", 312 | "使用实例:\n", 313 | "print(getInfoTitle(blogRangeDict,'原文'))\n", 314 | "打印结果:\n", 315 | "['原文created_at', '原文text', '原文reposts_count', '原文comments_count', '原文attitudes_count']" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 10, 321 | "metadata": { 322 | "jupyter": { 323 | "source_hidden": true 324 | } 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "def getInfoTitle(Dict,prefix):\n", 329 | " titleList=[]\n", 330 | " for item in Dict:\n", 331 | " if(Dict.get(item) is True):\n", 332 | " titleList.append(prefix+item)\n", 333 | " return (titleList)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": {}, 339 | "source": [ 340 | "## 工具类,用来去除爬取的正文中一些不需要的链接、标签等" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 11, 346 | "metadata": { 347 | "jupyter": { 348 | "source_hidden": true 349 | } 350 | }, 351 | "outputs": [], 352 | "source": [ 353 | "#工具类,用来去除爬取的正文中一些不需要的链接、标签等\n", 354 | "class Tool:\n", 355 | " deleteImg = re.compile('')\n", 356 | " newLine =re.compile('|
||
')\n", 357 | " deleteAite = re.compile('//.*?:')\n", 358 | " deleteAddr = re.compile('.*?
|')\n", 360 | " deleteWord = re.compile('回复@|回覆@|回覆|回复')\n", 361 | " \n", 362 | " @classmethod\n", 363 | " def replace(cls,x):\n", 364 | " x = re.sub(cls.deleteWord,'',x)\n", 365 | " x = re.sub(cls.deleteImg,'',x)\n", 366 | " x = re.sub(cls.deleteAite,'',x)\n", 367 | " x = re.sub(cls.deleteAddr, '', x)\n", 368 | " x = re.sub(cls.newLine,'',x)\n", 369 | " x = re.sub(cls.deleteTag,'',x)\n", 370 | " return x.strip()" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "### 构造微博内容的url" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 12, 383 | "metadata": { 384 | "jupyter": { 385 | "source_hidden": true 386 | } 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "###某微博账户的全部微博内容\n", 391 | "def contentURL(id,pages):\n", 392 | " i=0\n", 393 | " urls=[]\n", 394 | " for page in pages:\n", 395 | " if page is not 0:\n", 396 | " urls+=['https://m.weibo.cn/api/container/getIndex?containerid=230413'+str(id)+'_-_WEIBO_SECOND_PROFILE_WEIBO&page_type=03&page='+str(page)]\n", 397 | "\n", 398 | " return urls" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 13, 404 | "metadata": { 405 | "jupyter": { 406 | "source_hidden": true 407 | } 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "#将字典类型的信息格式传递为需要的信息列表\n", 412 | "def getInfoList(infoDict,rangeDict):\n", 413 | " infoList=[]\n", 414 | " for item in rangeDict:\n", 415 | " if rangeDict.get(item) is True:\n", 416 | " content=infoDict.get(item)\n", 417 | " if content is not None:\n", 418 | " #处理微博文本内容 \n", 419 | " if item =='text':\n", 420 | " if processText is True:\n", 421 | " content=Tool.replace(content)\n", 422 | " infoList.append(content) \n", 423 | " else:\n", 424 | " infoList.append(rangeDict['infoNoExist'])\n", 425 | " return infoList" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "### 观测对每个转发微博的影响" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | 
"execution_count": 14, 438 | "metadata": { 439 | "jupyter": { 440 | "source_hidden": true 441 | } 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "###在已有的一系列urls中进行操作\n", 446 | "###筛选出微博转发内容进行操作\n", 447 | "def reRatio(urls,csvWriter):\n", 448 | " notEnd= True\n", 449 | " \n", 450 | " retweetBlogTitle=getInfoTitle(blogRangeDict,'转发')#转发博文信息标题\n", 451 | " retweetUserTitle=getInfoTitle(userRangeDict,'转发')#转发博主信息标题\n", 452 | " \n", 453 | " originBlogTitle=getInfoTitle(blogRangeDict,'原文')#原文博文信息标题\n", 454 | " originUserTitle=getInfoTitle(userRangeDict,'原文')#原文博主信息标题\n", 455 | " infoTitle=getInfoTitle(infoRangeDict,'')#原文博主个人主页信息标题\n", 456 | " \n", 457 | " #写表格的标题\n", 458 | " if getConcreteInfoList is True: \n", 459 | " csvWriter.writerow(retweetBlogTitle+retweetUserTitle+originBlogTitle+originUserTitle+infoTitle) \n", 460 | " else:\n", 461 | " csvWriter.writerow(retweetBlogTitle+retweetUserTitle+originBlogTitle+originUserTitle)\n", 462 | " \n", 463 | " for url in urls: \n", 464 | " \n", 465 | " response = requests.get(url,headers=headers)\n", 466 | " resjson = json.loads(response.text) \n", 467 | " cards=resjson['data']['cards'] \n", 468 | " \n", 469 | " #print(cards)\n", 470 | " \n", 471 | " #结束最后\n", 472 | " if(len(cards)==1):\n", 473 | " notEnd=False\n", 474 | " break\n", 475 | " #遍历一个页面的所有微博 \n", 476 | " for card in cards:\n", 477 | " try:\n", 478 | " #转发博文与博主信息\n", 479 | " retweetBlogInfoDict=card['mblog'] \n", 480 | " retweetUserInfoDict=retweetBlogInfoDict['user'] \n", 481 | " \n", 482 | " #筛选出转发的微博\n", 483 | " try: \n", 484 | " originBlogInfoDict=retweetBlogInfoDict['retweeted_status']\n", 485 | " \n", 486 | " \n", 487 | " if originBlogInfoDict is not None: \n", 488 | " \n", 489 | " #转发博文原文与博主信息\n", 490 | " originUserInfoDict=originBlogInfoDict['user']\n", 491 | " retweetUserID=retweetUserInfoDict['id']\n", 492 | " originUserID=originUserInfoDict['id']\n", 493 | " ###不是转发自己的微博,则选中进行处理\n", 494 | " if(retweetUserID!=originUserID):\n", 495 | " infoList=[] 
\n", 496 | " \n", 497 | " #转发博文数据\n", 498 | " retweetBlogInfoList=getInfoList(retweetBlogInfoDict,blogRangeDict) \n", 499 | " infoList+=retweetBlogInfoList \n", 500 | " #转发博主数据\n", 501 | " ##默认已知\n", 502 | " retweetUserInfoList=getInfoList(retweetUserInfoDict,userRangeDict) \n", 503 | " infoList+=retweetUserInfoList \n", 504 | " #原文博文数据\n", 505 | " originBlogInfoList=getInfoList(originBlogInfoDict,blogRangeDict) \n", 506 | " infoList+=originBlogInfoList\n", 507 | " #原文博主数据\n", 508 | " originUserInfoList=getInfoList(originUserInfoDict,userRangeDict) \n", 509 | " infoList+=originUserInfoList \n", 510 | " \n", 511 | " #originUserID为原文账号的ID \n", 512 | " #可在此对id进行信息采集 \n", 513 | " \n", 514 | " if getConcreteInfoList is True:\n", 515 | " infoDict=getInfo(isLogin,originUserID)\n", 516 | " otherInfoList=getInfoList(infoDict,infoRangeDict) \n", 517 | " infoList+=otherInfoList \n", 518 | " #print(infoList)\n", 519 | " #保存数据至csv\n", 520 | " csvWriter.writerow(infoList) \n", 521 | " \n", 522 | " #不断获取该博主对的影响力\n", 523 | " #break\n", 524 | " except:\n", 525 | " pass\n", 526 | " except:\n", 527 | " pass\n", 528 | " #延时,防止反爬\n", 529 | " time.sleep(3)\n", 530 | " \n", 531 | " return notEnd" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "metadata": {}, 537 | "source": [ 538 | "### 获取个人主页中信息" 539 | ] 540 | }, 541 | { 542 | "cell_type": "raw", 543 | "metadata": { 544 | "jupyter": { 545 | "source_hidden": true 546 | } 547 | }, 548 | "source": [ 549 | "使用示例:\n", 550 | "response = requests.get(url)\n", 551 | "txt=response.text\n", 552 | "print(drillInfo(txt))\n", 553 | "\n", 554 | "结果如下:\n", 555 | "{'昵称': '甘肃华熙文化',\n", 556 | " '简介': '【马丛珊.禅绣艺术,世界纹绣大师学院甘肃分院】服务生命之美;践行匠心为本,艺心创造,慈心发扬校训,微信mashan5374,☎13109439909',\n", 557 | " '性别': '女',\n", 558 | " '所在地': '甘肃 兰州'}" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": 15, 564 | "metadata": { 565 | "jupyter": { 566 | "source_hidden": true 567 | } 568 | }, 569 | "outputs": [], 570 | "source": [ 571 | "def 
drillInfo(txt):\n", 572 | " keyInfo={}\n", 573 | " \n", 574 | " try: \n", 575 | " resjson = json.loads(txt) \n", 576 | " infodata = resjson.get('data')\n", 577 | " cards = infodata.get('cards')\n", 578 | " for l in range(0,len(cards)):\n", 579 | " temp = cards[l]\n", 580 | " card_group = temp.get('card_group') \n", 581 | " #判断获取信息类型 \n", 582 | " for card in card_group: \n", 583 | " #将信息传入字典\n", 584 | " name=card.get('item_name')\n", 585 | " if name is not None:\n", 586 | " content=card.get('item_content')\n", 587 | " keyInfo[name]=content \n", 588 | " except:\n", 589 | " pass\n", 590 | " return keyInfo" 591 | ] 592 | }, 593 | { 594 | "cell_type": "markdown", 595 | "metadata": {}, 596 | "source": [ 597 | "### 构建通过id访问个人主页的url" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 16, 603 | "metadata": { 604 | "jupyter": { 605 | "source_hidden": true 606 | } 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "def infoUrl(id):\n", 611 | " url = \"https://m.weibo.cn/api/container/getIndex?containerid=230283\"+str(id)+\"_-_INFO\" \n", 612 | " return url" 613 | ] 614 | }, 615 | { 616 | "cell_type": "markdown", 617 | "metadata": {}, 618 | "source": [ 619 | "## 爬取某id博主的个人信息" 620 | ] 621 | }, 622 | { 623 | "cell_type": "raw", 624 | "metadata": { 625 | "jupyter": { 626 | "source_hidden": true 627 | } 628 | }, 629 | "source": [ 630 | "为防止反复爬取,将原文整体保存为文件,格式为 信息卡片长度(2 or 5)+id+博主id\n", 631 | "不登录2含有性别、所在地\n", 632 | "登录5含有性别、所在地、星座、大学、公司等完整信息\n", 633 | "若存在所需文件,则从文件读取信息,否则爬取,同时保存文件\n", 634 | "\n", 635 | "若爬取未成功,返回-1\n", 636 | "\n", 637 | "使用库函数:\n", 638 | "os" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 17, 644 | "metadata": { 645 | "jupyter": { 646 | "source_hidden": true 647 | } 648 | }, 649 | "outputs": [], 650 | "source": [ 651 | "def getInfo(state,id):\n", 652 | " \n", 653 | " address=addrRoot+'info/'+str(state)+'id'+str(id)+'.txt'\n", 654 | " path=addrRoot+'info/'\n", 655 | " if os.path.exists(path) is False:\n", 
656 | " os.makedirs(path)\n", 657 | " try:\n", 658 | " #已有文件\n", 659 | " if(os.path.exists(address)==True):\n", 660 | " fp = open(address,'r',encoding='utf-16')\n", 661 | " txt=fp.read()\n", 662 | " info=drillInfo(txt)\n", 663 | " fp.close()\n", 664 | " else: \n", 665 | " fp = open(address,'w+',encoding='utf-16')\n", 666 | " url=infoUrl(id)\n", 667 | " if state is True:\n", 668 | " response = requests.get(url,headers=headers)\n", 669 | " else:\n", 670 | " response = requests.get(url)\n", 671 | " txt=response.text\n", 672 | " fp.write(response.text) \n", 673 | " info=drillInfo(txt)\n", 674 | " fp.close()\n", 675 | " except:\n", 676 | " info=-1 \n", 677 | " \n", 678 | " return info" 679 | ] 680 | }, 681 | { 682 | "cell_type": "raw", 683 | "metadata": {}, 684 | "source": [ 685 | "获取特定个人信息" 686 | ] 687 | }, 688 | { 689 | "cell_type": "code", 690 | "execution_count": 18, 691 | "metadata": { 692 | "jupyter": { 693 | "source_hidden": true 694 | } 695 | }, 696 | "outputs": [], 697 | "source": [ 698 | "def getExatInfo(item,state,id):\n", 699 | " info=getInfo(state,id)\n", 700 | " content=info.get(item)\n", 701 | " if content is not None:\n", 702 | " return content\n", 703 | " else:\n", 704 | " return infoNoExistStr " 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 19, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [ 713 | "### 构造热门界面访问" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 20, 719 | "metadata": { 720 | "jupyter": { 721 | "source_hidden": true 722 | } 723 | }, 724 | "outputs": [], 725 | "source": [ 726 | "def downloadData(id):\n", 727 | " tweeter=getExatInfo('昵称',2,int(id))\n", 728 | " batch=0\n", 729 | " while(1):\n", 730 | "\n", 731 | " fileAddr=addrFile(tweeter,'batch'+str(batch))\n", 732 | " if os.path.exists(fileAddr) is True:\n", 733 | " print(fileAddr+'已存在,跳过采集') \n", 734 | " else:\n", 735 | " print('文件将写入:'+fileAddr)\n", 736 | " fp = open(fileAddr,'w+',newline='',encoding='utf-16')\n", 
737 | " writer=csv.writer(fp)\n", 738 | " if reRatio(contentURL(id,range(20*batch,20*(batch+1))),writer) is False:\n", 739 | " fp.close()\n", 740 | " break\n", 741 | "\n", 742 | " fp.close()\n", 743 | " print('第'+str(batch)+'批数据已记录完毕')\n", 744 | " batch+=1" 745 | ] 746 | }, 747 | { 748 | "cell_type": "raw", 749 | "metadata": { 750 | "jupyter": { 751 | "source_hidden": true 752 | } 753 | }, 754 | "source": [ 755 | "\n", 756 | "#陈赫\n", 757 | "id=1574684061\n", 758 | "#MorningGlory_肖战资源博\n", 759 | "id=5735501478\n", 760 | "\n", 761 | "#靳东\n", 762 | "id=1093897112\n", 763 | "#李健\n", 764 | "id=1744395855\n", 765 | "\n", 766 | "#干部\n", 767 | "id=6472269230\n", 768 | "\n", 769 | "#陶勇\n", 770 | "id=5899876484\n", 771 | "\n", 772 | "#姚晨\n", 773 | "id=1266321801\n", 774 | "\n", 775 | "#鞠婧祎\n", 776 | "id=3669102477\n", 777 | "\n", 778 | "#韩红\n", 779 | "#id=1922542315\n", 780 | "\n", 781 | "\n", 782 | "#穿帮君\n", 783 | "id=5671786192\n", 784 | "\n", 785 | "#汉堡爸爸\n", 786 | "id=2784421224\n", 787 | "\n", 788 | "#蔡徐坤\n", 789 | "\n", 790 | "id=1776448504\n", 791 | "\n", 792 | "\n", 793 | "#林书豪\n", 794 | "id=2106855375\n", 795 | "\n", 796 | "#干部\n", 797 | "id=6472269230\n", 798 | "\n", 799 | "#任嘉伦\n", 800 | "id=3800468188\n", 801 | "\n", 802 | "#肖战\n", 803 | "id=1792951112\n", 804 | "\n", 805 | "\n", 806 | "#迪丽热巴\n", 807 | "id=1669879400\n", 808 | "\n", 809 | "\n", 810 | "#科比\n", 811 | "id=3264072325" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": 23, 817 | "metadata": {}, 818 | "outputs": [ 819 | { 820 | "name": "stdin", 821 | "output_type": "stream", 822 | "text": [ 823 | "博主id: 1663414103\n" 824 | ] 825 | }, 826 | { 827 | "name": "stdout", 828 | "output_type": "stream", 829 | "text": [ 830 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch0.csv\n", 831 | "第0批数据已记录完毕\n", 832 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch1.csv\n", 833 | "第1批数据已记录完毕\n", 834 | 
"文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch2.csv\n", 835 | "第2批数据已记录完毕\n", 836 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch3.csv\n", 837 | "第3批数据已记录完毕\n", 838 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch4.csv\n", 839 | "第4批数据已记录完毕\n", 840 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch5.csv\n", 841 | "第5批数据已记录完毕\n", 842 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch6.csv\n", 843 | "第6批数据已记录完毕\n", 844 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch7.csv\n", 845 | "第7批数据已记录完毕\n", 846 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch8.csv\n", 847 | "第8批数据已记录完毕\n", 848 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch9.csv\n", 849 | "第9批数据已记录完毕\n", 850 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch10.csv\n", 851 | "第10批数据已记录完毕\n", 852 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch11.csv\n", 853 | "第11批数据已记录完毕\n", 854 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch12.csv\n", 855 | "第12批数据已记录完毕\n", 856 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch13.csv\n", 857 | "第13批数据已记录完毕\n", 858 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch14.csv\n", 859 | "第14批数据已记录完毕\n", 860 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch15.csv\n", 861 | "第15批数据已记录完毕\n", 862 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch16.csv\n", 863 | "第16批数据已记录完毕\n", 864 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch17.csv\n", 865 | "第17批数据已记录完毕\n", 866 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch18.csv\n", 867 | "第18批数据已记录完毕\n", 868 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch19.csv\n", 869 | "第19批数据已记录完毕\n", 870 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch20.csv\n", 871 | 
"第20批数据已记录完毕\n", 872 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch21.csv\n", 873 | "第21批数据已记录完毕\n", 874 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch22.csv\n" 875 | ] 876 | } 877 | ], 878 | "source": [ 879 | "id=input('博主id:')\n", 880 | "\n", 881 | "downloadData(id)" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": null, 887 | "metadata": {}, 888 | "outputs": [], 889 | "source": [] 890 | } 891 | ], 892 | "metadata": { 893 | "kernelspec": { 894 | "display_name": "Python 3", 895 | "language": "python", 896 | "name": "python3" 897 | }, 898 | "language_info": { 899 | "codemirror_mode": { 900 | "name": "ipython", 901 | "version": 3 902 | }, 903 | "file_extension": ".py", 904 | "mimetype": "text/x-python", 905 | "name": "python", 906 | "nbconvert_exporter": "python", 907 | "pygments_lexer": "ipython3", 908 | "version": "3.7.4" 909 | } 910 | }, 911 | "nbformat": 4, 912 | "nbformat_minor": 4 913 | } 914 | -------------------------------------------------------------------------------- /repostWeibo/analyseTxt.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import csv" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "def addrPath(tweeter):\n", 20 | " path=addrRoot+str(tweeter)+'/可视化分析结果/'\n", 21 | " if os.path.exists(path) is False:\n", 22 | " os.makedirs(path)\n", 23 | " return path" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "def addrFile(tweeter,suffix):\n", 33 | " path=addrRoot+str(tweeter)+'/'\n", 34 | " if os.path.exists(path) is False:\n", 35 | " os.makedirs(path)\n", 36 | " address=path+tweeter+suffix+'.csv'\n", 37 | 
" return address " 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 4, 43 | "metadata": { 44 | "jupyter": { 45 | "source_hidden": true 46 | } 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import random\n", 51 | "\n", 52 | "#for item in dataDict:\n", 53 | " #print(dataDict.get(item)[-1-sort[-1]])\n", 54 | " #print(item)\n", 55 | "def randomText(no):\n", 56 | " \n", 57 | " item=random.choice(list(dataDict))\n", 58 | " return (item+':'+str(dataDict.get(item)[no]))\n", 59 | "\n", 60 | "def exactText(no,item):\n", 61 | " text=(item+':'+str(dataDict.get(item)[no]))\n", 62 | " print(text)\n", 63 | " return text\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "metadata": { 70 | "jupyter": { 71 | "source_hidden": true 72 | } 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "#文件保存地址,改为你存放csv文件的完整地址\n", 77 | "addrRoot='C:/Users/cascara/Desktop/seedcup/csv/blog/'\n", 78 | "\n", 79 | "#是否调试\n", 80 | "isDebug=False\n", 81 | "\n", 82 | "#100万+显示\n", 83 | "infinity=1000000" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 6, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdin", 93 | "output_type": "stream", 94 | "text": [ 95 | "输入博主姓名: 华中科技大学\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "tweeter=input('输入博主姓名:')#'陈赫'" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 7, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch0.csv\n", 113 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch1.csv\n", 114 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch2.csv\n", 115 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch3.csv\n", 116 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch4.csv\n", 117 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch5.csv\n", 118 | 
"C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch6.csv\n", 119 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch7.csv\n", 120 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch8.csv\n", 121 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch9.csv\n", 122 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch10.csv\n", 123 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch11.csv\n", 124 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch12.csv\n", 125 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch13.csv\n", 126 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch14.csv\n", 127 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch15.csv\n", 128 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch16.csv\n", 129 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch17.csv\n", 130 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch18.csv\n", 131 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch19.csv\n", 132 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch20.csv\n", 133 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch21.csv\n", 134 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch22.csv\n", 135 | "C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch23.csv\n", 136 | "不存在C:/Users/cascara/Desktop/seedcup/csv/blog/华中科技大学/华中科技大学batch23.csv\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "#开始\n", 142 | "startCollect=True\n", 143 | "\n", 144 | "\n", 145 | "batch=0\n", 146 | "dataDict={}\n", 147 | "Title=[]\n", 148 | "while(1):\n", 149 | " titleEixst=True\n", 150 | " address=addrFile(tweeter,'batch'+str(batch)) \n", 151 | " print(address)\n", 152 | " \n", 153 | " if os.path.exists(address) is True:\n", 154 | " fp = open(address,'r',newline='',encoding='utf-16') \n", 155 | " reader=csv.reader(fp)\n", 156 | " for line in reader:\n", 157 | " if titleEixst is 
True:\n", 158 | " #将标题制成各个\n", 159 | " if startCollect is True:\n", 160 | " for item in line:\n", 161 | " #print(item[-1])\n", 162 | " if isDebug is True:\n", 163 | " print(item)\n", 164 | "\n", 165 | " \n", 166 | " dataDict[item]=[]\n", 167 | " startCollect=False\n", 168 | " Title=line \n", 169 | " if isDebug is True:\n", 170 | " print(line)\n", 171 | " titleEixst=False\n", 172 | " else:\n", 173 | " for no in range(len(line)): \n", 174 | " if(Title[no].find('count')>=0):\n", 175 | " if line[no].find('0万') > 0:\n", 176 | " dataDict.get(Title[no]).append(infinity)\n", 177 | " else:\n", 178 | " dataDict.get(Title[no]).append(eval(line[no]))\n", 179 | " \n", 180 | " #dataDict.get(Title[no]).append((line[no]))\n", 181 | " \n", 182 | " else:\n", 183 | " dataDict.get(Title[no]).append((line[no]))\n", 184 | " if isDebug is True:\n", 185 | " print(Title[no])\n", 186 | " print(line[no]) \n", 187 | " \n", 188 | " \n", 189 | " else:\n", 190 | " print('不存在'+address)\n", 191 | " break\n", 192 | " fp.close()\n", 193 | " batch+=1\n", 194 | "#print(dataDict)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "['北京 东城区', '北京', '北京', '湖北 武汉', '北京', '北京', '北京', '北京', '北京', '北京 朝阳区', '北京', '北京', '湖北', '湖北', '湖北 武汉', '北京', '四川', '湖北 武汉', '湖北 武汉', '陕西 西安', '北京', '湖北 武汉', '北京 朝阳区', '广东', '北京', '湖北', '湖北 武汉', '北京', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '湖北 武汉', '河南', '湖北 武汉', '北京 东城区', '北京', '天津', '其他', '广东 深圳', '湖北 武汉', '北京 朝阳区', '北京', '四川', '湖北 武汉', '湖北 武汉', '北京 西城区', '湖北 襄阳', '湖北', '北京', '北京', '北京', '湖北', '北京', '广东', '北京', '湖北 武汉', '湖北 武汉', '北京', '北京', '湖北 武汉', '其他', '北京', '北京', '北京', '北京 朝阳区', '北京', '广东 广州', '福建 福州', '湖北 武汉', '湖南 岳阳', '湖北 武汉', '其他', '湖北 武汉', '北京 朝阳区', '湖北 武汉', '北京', '北京', '北京', '湖北', '海外 美国', '湖北', '浙江 杭州', '北京', '湖北', '湖北 武汉', '湖北 武汉', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '北京', '重庆 沙坪坝区', '湖北 武汉', '湖北 武汉', '浙江 温州', '其他', 
'其他', '湖北 黄冈', '云南', '湖北 武汉', '湖南 长沙', '其他', '湖北 武汉', '湖北 武汉', '北京', '重庆', '福建', '湖北 武汉', '湖北 武汉', '湖北 武汉', '北京', '湖北 武汉', '湖北', '湖北 武汉', '其他', '湖北', '湖北 武汉', '湖北 武汉', '海外 其他', '湖北', '北京', '湖北 武汉', '北京', '北京', '海外', '北京', '湖北', '湖北 荆州', '北京', '北京', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '未知', '安徽 池州', '湖北 武汉', '其他', '湖北 武汉', '北京', '湖南', '湖北 武汉', '湖北', '湖北 武汉', '湖北', '湖北 宜昌', '安徽 池州', '湖北 武汉', '湖北 武汉', '湖北 武汉', '湖北', '北京', '湖北 武汉', '湖北', '香港', '安徽 池州', '湖南 长沙', '海外', '安徽 池州', '湖北 武汉', '北京', '湖北', '其他', '湖北 武汉', '福建 福州', '福建 福州', '湖北 武汉', '福建 福州', '重庆', '陕西', '湖北 武汉', '北京', '湖北 武汉', '重庆', '湖北 武汉', '山东 青岛', '北京', '北京 西城区', '湖北 武汉', '天津', '山东', '天津', '重庆 沙坪坝区', '湖北', '湖北', '湖北', '北京', '浙江', '四川 成都', '广东', '湖北 武汉', '未知', '福建 福州', '浙江 宁波', '北京', '湖北 武汉', '北京', '福建 福州', '山西 太原', '福建 福州', '江苏 苏州', '湖北 武汉', '北京', '湖北', '北京', '浙江', '浙江 宁波', '其他', '湖南', '北京', '湖北 武汉', '湖南', '北京', '湖北', '湖北', '北京', '其他', '河南 洛阳', '湖北', '未知', '北京', '湖南', '海外 英国', '河南 焦作', '湖北 武汉', '湖北 武汉', '江苏 南京', '湖北', '湖北', '北京', '北京', '江苏 南京', '湖北 武汉', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '其他', '重庆', '湖北 武汉', '其他', '湖北 武汉', '海外', '湖北 武汉', '其他', '海外 英国', '湖北', '海外 沙特阿拉伯', '海外', '北京', '湖北 武汉', '湖南', '湖北 武汉', '湖北', '湖北 武汉', '其他', '湖北', '湖北 武汉', '其他', '湖北 武汉', '未知', '广东 广州', '湖北 武汉', '云南 昆明', '湖北 武汉', '浙江 杭州', '海外', '湖北', '河南 郑州', '安徽 池州', '未知', '上海 徐汇区', '福建 福州', '海外 英国', '其他', '北京', '其他', '其他', '福建 福州', '湖北 宜昌', '上海', '其他', '湖北 武汉', '江西', '江苏', '北京', '北京', '湖南', '北京', '其他', '重庆', '湖北 武汉', '其他', '福建 福州', '其他', '陕西', '其他', '辽宁', '其他', '广东', '其他', '上海', '陕西 西安', '湖北 武汉', '湖北', '其他', '湖北 武汉', '湖北 武汉', '江苏 苏州', '福建 福州', '湖北 武汉', '陕西 西安', '北京', '福建 福州', '湖北 武汉', '江西 九江', '山东 济宁', '湖北 武汉', '北京', '湖北 武汉', '广东', '四川 成都', '福建 福州', '福建 福州', '湖南', '上海 徐汇区', '海外 菲律宾', '湖北 武汉', '北京', '湖北 武汉', '湖北 武汉', '其他', '湖北 武汉', '湖北', '安徽 池州', '湖北', '四川 泸州', '湖北', '湖北 武汉', '湖南 岳阳', '重庆', '湖北 武汉', '湖北 武汉', '湖北 武汉', '湖北 武汉', '青海 西宁', '未知', '江苏 苏州', '北京', '其他']\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "print(dataDict['所在地'])" 212 | ] 213 
# In the part of the notebook visible here, numpy is first imported by a later
# cell; import it locally so this cell does not depend on execution order.
import numpy as np

# Count columns of interest, mapped to their Chinese display names.
labels = ['follow_count', 'reposts_count', 'comments_count', 'attitudes_count']
cnLabels = {
    'reposts_count': '转发量',
    'comments_count': '评论量',
    'attitudes_count': '点赞量',
    'follow_count': '粉丝量',
}

# originData / retweetData map each label to its column in reversed row order.
# A column that is absent from dataDict is skipped explicitly instead of being
# hidden by a bare `except: pass`.
originData = {}
retweetData = {}
for label in labels:
    for prefix, target in (('转发', retweetData), ('原文', originData)):
        values = dataDict.get(prefix + label)
        if values is not None:
            target[label] = np.asarray(values[::-1])

# Concatenate the text column selected by `choice` ('转发' or '原文').
choice = '转发'
txt = ''.join(dataDict[choice + 'text'])
"Loading model from cache C:\\Users\\cascara\\AppData\\Local\\Temp\\jieba.cache\n", 294 | "Loading model cost 0.932 seconds.\n", 295 | "Prefix dict has been built successfully.\n" 296 | ] 297 | }, 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "" 302 | ] 303 | }, 304 | "execution_count": 13, 305 | "metadata": {}, 306 | "output_type": "execute_result" 307 | } 308 | ], 309 | "source": [ 310 | "import jieba\n", 311 | "import numpy as np\n", 312 | "import wordcloud\n", 313 | "from PIL import Image\n", 314 | "import os\n", 315 | "#fc = open(\"threekingdoms.txt\", \"r\", encoding='utf-8')\n", 316 | "#fc=fc.read()\n", 317 | "\n", 318 | "savePath=addrPath(tweeter)+'词云/'\n", 319 | "\n", 320 | "if os.path.exists(savePath) is False:\n", 321 | " os.mkdir(savePath)\n", 322 | "\n", 323 | "\n", 324 | "fc=txt\n", 325 | "ct=0\n", 326 | "words=jieba.lcut(fc)\n", 327 | "'''for word in words:\n", 328 | " ct+=1\n", 329 | " print(word)\n", 330 | " if ct==3:\n", 331 | " break'''\n", 332 | "\n", 333 | "article=' '.join(words)\n", 334 | "\n", 335 | "\n", 336 | "mask = np.array(Image.open(\"huakegatemask.jpg\"))\n", 337 | "\n", 338 | "\n", 339 | "c = wordcloud.WordCloud( \\\n", 340 | " mask=mask,\n", 341 | " width = 1000, height = 700,\\\n", 342 | " background_color = \"white\",\n", 343 | " font_path = \"msyh.ttc\" \n", 344 | " )\n", 345 | "#c=wordcloud.WordCloud(font_path = \"msyh.ttc\" )\n", 346 | "c.generate(article)\n", 347 | "#print(c)\n", 348 | "c.to_file(savePath+choice+'词云图.png')\n" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 34, 354 | "metadata": {}, 355 | "outputs": [ 356 | { 357 | "name": "stderr", 358 | "output_type": "stream", 359 | "text": [ 360 | "c:\\users\\cascara\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\ipykernel_launcher.py:36: MatplotlibDeprecationWarning: \n", 361 | "The 'normed' kwarg was deprecated in Matplotlib 2.1 and will be removed in 3.1. 
Use 'density' instead.\n" 362 | ] 363 | }, 364 | { 365 | "data": { 366 | "image/png": "\n", 367 | "text/plain": [ 368 | "
" 369 | ] 370 | }, 371 | "metadata": {}, 372 | "output_type": "display_data" 373 | } 374 | ], 375 | "source": [ 376 | "import random\n", 377 | "import math\n", 378 | "import matplotlib.pyplot as plt\n", 379 | "import seaborn as sns\n", 380 | "import numpy as np\n", 381 | "\n", 382 | "%matplotlib inline\n", 383 | "sns.set_style('darkgrid')\n", 384 | "plt.rcParams['figure.figsize'] = (12, 8)\n", 385 | "\n", 386 | "\n", 387 | "def AceeptReject(split_val):\n", 388 | " global c\n", 389 | " global power\n", 390 | " while True:\n", 391 | " x = random.uniform(0, 1)\n", 392 | " y = random.uniform(0, 1)\n", 393 | " if y*c <= math.pow(x - split_val, power):\n", 394 | " return x\n", 395 | "\n", 396 | "power = 4\n", 397 | "t = 0.4 \n", 398 | "sum_ = (math.pow(1-t, power + 1) - math.pow(-t, power + 1)) / (power + 1) #求积分\n", 399 | "x = np.linspace(0, 1, 100)\n", 400 | "#常数值c\n", 401 | "c = 0.6**4/sum_\n", 402 | "cc = [c for xi in x]\n", 403 | "plt.plot(x, cc, '--',label='c*f(x)')\n", 404 | "#目标概率密度函数的值f(x)\n", 405 | "y = [math.pow(xi - t, power)/sum_ for xi in x]\n", 406 | "plt.plot(x, y,label='f(x)')\n", 407 | "#采样10000个点\n", 408 | "samples = []\n", 409 | "for i in range(10000):\n", 410 | " samples.append(AceeptReject(t))\n", 411 | "plt.hist(samples, bins=50, normed=True,label='sampling')\n", 412 | "plt.legend()\n", 413 | "plt.show()" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [] 422 | } 423 | ], 424 | "metadata": { 425 | "kernelspec": { 426 | "display_name": "Python 3", 427 | "language": "python", 428 | "name": "python3" 429 | }, 430 | "language_info": { 431 | "codemirror_mode": { 432 | "name": "ipython", 433 | "version": 3 434 | }, 435 | "file_extension": ".py", 436 | "mimetype": "text/x-python", 437 | "name": "python", 438 | "nbconvert_exporter": "python", 439 | "pygments_lexer": "ipython3", 440 | "version": "3.7.4" 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 
4 445 | } 446 | -------------------------------------------------------------------------------- /singleWeibo/acquaireRepost.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "@[TOC]" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 库" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 28, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import csv\n", 25 | "import requests\n", 26 | "import xlwt\n", 27 | "import re\n", 28 | "import json\n", 29 | "import time" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "### 配置" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 29, 42 | "metadata": { 43 | "jupyter": { 44 | "source_hidden": true 45 | } 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "#根据个人浏览器信息进行修改\n", 50 | "headers = {\n", 51 | " 'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Mobile Safari/537.36'\n", 52 | " ,\n", 53 | " 'Cookie': '_T_WM=67706607048; WEIBOCN_FROM=1110006030; ALF=1582777481; SCF=AqQddu0eGCw6Wh1xPsTyigWBFJH-P0ACsyLUFzNakys5tF6kBCjVpv4O6BDEGM4gShv5JHfuyjMoLBKfT5-Xwsc.; SUB=_2A25zK8jDDeRhGeNP41UT9yjIyj6IHXVQ1-iLrDV6PUJbktAKLUHSkW1NTk4PgJoxaitdQXaQL6znAIMdvJJs4-5l; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5q.Hx0pIs7PKpACzdnFYSZ5JpX5K-hUgL.Fo-p1hMES0qXeKz2dJLoIpUeBc8EdFH8SC-4BbHFSFH81F-RSF-4Sntt; SUHB=0qjEKc2Va_YMLH; SSOLoginState=1580185747; MLOGIN=1; XSRF-TOKEN=607e98; M_WEIBOCN_PARAMS=luicode%3D10000011%26lfid%3D2304135671786192_-_WEIBO_SECOND_PROFILE_WEIBO%26fid%3D2304135671786192_-_WEIBO_SECOND_PROFILE_WEIBO%26uicode%3D10000011'\n", 54 | " #'ALF=1581501545; _T_WM=67706607048; H5_wentry=H5; 
backURL=https%3A%2F%2Fm.weibo.cn%2Fapi%2Fcomments%2Fshow%3Fid%3DIr5j4iRXW%26page%3D3; XSRF-TOKEN=11216a; WEIBOCN_FROM=1110006030; MLOGIN=1; SSOLoginState=1580006602; SCF=AqQddu0eGCw6Wh1xPsTyigWBFJH-P0ACsyLUFzNakys5zFt06rZeA1gEI0iP7HfWxZntbpMr8WTWhrxEdSVGB58.; SUB=_2A25zKIyaDeRhGeNP41UT9yjIyj6IHXVQ0hTSrDV6PUJbktAKLRL-kW1NTk4PgHLYgtoeuxFzuGDIDcybzoEoXvq9; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5q.Hx0pIs7PKpACzdnFYSZ5JpX5KzhUgL.Fo-p1hMES0qXeKz2dJLoIpUeBc8EdFH8SC-4BbHFSFH81F-RSF-4Sntt; SUHB=0IIlrfWMMkVsTI; M_WEIBOCN_PARAMS=uicode%3D20000174'\n", 55 | "}" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 30, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#文件保存地址\n", 65 | "addrRoot='C:/Users/cascara/Desktop/seedcup/csv/blog/single/'" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 31, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "#是否获取转发者具体个人信息\n", 75 | "getConcreteInfoList=False#True#\n", 76 | "isLogin=False#True\n", 77 | "\n", 78 | "#是否登入采集个人信息\n", 79 | "\n", 80 | "#无信息打印字符\n", 81 | "infoNoExistStr='未知'\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 32, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "#是否处理微博文本内容\n", 91 | "processText = False#True" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "### 构造表格,采集数据内容(修改这里获取想要的信息)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "raw", 103 | "metadata": { 104 | "jupyter": { 105 | "source_hidden": true 106 | } 107 | }, 108 | "source": [ 109 | "博主的信息单独收集:转发的:转发reposts_count、评论comments_count、点赞数量attitudes_count、粉丝数量followers_count\n", 110 | "\n", 111 | " 原始的retweeted_status:转发reposts_count、评论comments_count、点赞数量attitudes_count\n", 112 | " 原始用户的user:用户名screen_name、id、粉丝数量followers_count" 113 | ] 114 | }, 115 | { 116 | "cell_type": "raw", 117 | "metadata": {}, 118 | "source": [ 119 | "获取个人具体信息范围、排列" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | 
"execution_count": 33, 125 | "metadata": { 126 | "jupyter": { 127 | "source_hidden": true 128 | } 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "#获取个人具体信息范围、排列\n", 133 | "infoRangeDict={\n", 134 | "'性别':True,\n", 135 | "'所在地':True,\n", 136 | " \n", 137 | "'生日':False,\n", 138 | "'家乡':False,\n", 139 | "'公司':True,\n", 140 | "'大学':True,\n", 141 | " \n", 142 | "'昵称':False,\n", 143 | "'简介':False,\n", 144 | "'注册时间':False,\n", 145 | "'阳光信用':False,\n", 146 | " \n", 147 | " #若无信息显示\n", 148 | "'infoNoExist':'未知'\n", 149 | "}\n", 150 | "\n" 151 | ] 152 | }, 153 | { 154 | "cell_type": "raw", 155 | "metadata": {}, 156 | "source": [ 157 | "获取博文信息范围、排列" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 34, 163 | "metadata": { 164 | "jupyter": { 165 | "source_hidden": true 166 | } 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "#获取博文信息范围、排列\n", 171 | "blogRangeDict={\n", 172 | "'visible': False,#{type: 0, list_id: 0}\n", 173 | " \n", 174 | "'created_at': True,#\"20分钟前\"\n", 175 | " \n", 176 | "'id': False,#\"4466073829119710\"\n", 177 | "'idstr': False,#\"4466073829119710\"\n", 178 | "'mid': False,#\"4466073829119710\"\n", 179 | "'can_edit': False,#false\n", 180 | "'show_additional_indication': False,#0\n", 181 | " \n", 182 | "'text': True,#\"【情况通报】2019年12月31日,武汉市卫健部门发布关于肺炎疫情的情况通报。\n", 183 | " \n", 184 | "'textLength': False,#452\n", 185 | "'source': False,#\"360安全浏览器\"\n", 186 | "'favorited': False,#false\n", 187 | "'pic_types': False,#\"\"\n", 188 | "'is_paid': False,#false\n", 189 | "'mblog_vip_type': False,#0\n", 190 | "'user': False,#{id: 2418542712, screen_name: \"平安武汉\",…}\n", 191 | " \n", 192 | "'reposts_count': True,#1035\n", 193 | "'comments_count': True,#1886\n", 194 | "'attitudes_count': True,#7508\n", 195 | " \n", 196 | "'pending_approval_count': False,#0\n", 197 | "'isLongText': False,#true\n", 198 | "'reward_exhibition_type':False,# 0\n", 199 | "'hide_flag': False,#0\n", 200 | "'mblogtype': False,#0\n", 201 | 
"'more_info_type': False,#0\n", 202 | "'cardid': False,#\"star_11247_common\"\n", 203 | "'content_auth': False,#0\n", 204 | "'pic_num': False,#0\n", 205 | " \n", 206 | "#若无相关信息,则显示:\n", 207 | "'infoNoExist':'未知'\n", 208 | "}" 209 | ] 210 | }, 211 | { 212 | "cell_type": "raw", 213 | "metadata": {}, 214 | "source": [ 215 | "获取博主信息范围、排列" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 35, 221 | "metadata": { 222 | "jupyter": { 223 | "source_hidden": true 224 | } 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "#获取博主信息范围、排列\n", 229 | "userRangeDict={\n", 230 | "'id':True,# 1323527941\n", 231 | "'screen_name': True,#\"Vista看天下\"\n", 232 | " \n", 233 | "'profile_image_url': False,#\"https://tva2.sinaimg.cn/crop.0.0.180.180.180/4ee36f05jw1e8qgp5bmzyj2050050aa8.jpg?KID=imgbed,tva&Expires=1580290462&ssig=xPIoKDRR56\"\n", 234 | "'profile_url':False,# \"https://m.weibo.cn/u/1323527941?uid=1323527941&luicode=10000011&lfid=1076031323527941\"\n", 235 | "'statuses_count': False,#微博数 77256\n", 236 | "'verified': False,#true\n", 237 | "'verified_type':False,# 3\n", 238 | "'verified_type_ext': False,#0\n", 239 | "'verified_reason': False,#\"《Vista看天下》官方微博\"\n", 240 | "'close_blue_v': False,#false\n", 241 | " \n", 242 | "'description': True,#\"一个有趣的蓝V\"\n", 243 | "'gender': True,# \"m\"\n", 244 | " \n", 245 | "'mbtype': False,#12\n", 246 | "'urank': False,#48\n", 247 | "'mbrank': False,#6\n", 248 | "'follow_me':False,# false\n", 249 | "'following':False,# false\n", 250 | " \n", 251 | "'followers_count': True,#19657897\n", 252 | "'follow_count': True,#1809\n", 253 | " \n", 254 | "'cover_image_phone': False,#\"https://tva1.sinaimg.cn/crop.0.0.640.640.640/549d0121tw1egm1kjly3jj20hs0hsq4f.jpg\"\n", 255 | "'avatar_hd': False,#\"https://ww2.sinaimg.cn/orj480/4ee36f05jw1e8qgp5bmzyj2050050aa8.jpg\"\n", 256 | "'like': False,#false\n", 257 | "'like_me': False,#false\n", 258 | "'badge': False,#{enterprise: 1, gongyi_level: 1, bind_taobao: 1, dzwbqlx_2016: 1, 
def addrFile(tweeter, suffix):
    """Return the CSV path ``<addrRoot><tweeter>/<tweeter><suffix>.csv``.

    The per-tweeter directory is created if it does not exist. *tweeter* is
    converted with ``str()`` for both the directory and the file name, so a
    numeric id works as well as a name.
    """
    name = str(tweeter)
    folder = addrRoot + name + '/'
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(folder, exist_ok=True)
    return folder + name + suffix + '.csv'


def getInfoTitle(Dict, prefix):
    """Return ``prefix + key`` for every key of *Dict* whose value is exactly ``True``.

    The identity test is deliberate: the range dicts also carry a truthy
    ``'infoNoExist'`` placeholder string that must not become a column title.
    """
    return [prefix + item for item in Dict if Dict.get(item) is True]
class Tool:
    """Strip links, tags and reply markers from crawled post text."""

    # NOTE(review): every "<...>" run inside these literals was stripped from
    # the text this block was recovered from. The tag patterns below are
    # reconstructed from the surviving fragments -- confirm them against the
    # original notebook, especially the second alternative of deleteAddr.
    deleteImg = re.compile(r'<img.*?>')
    newLine = re.compile(r'<tr>|<div>|</tr>|</div>')
    deleteAite = re.compile(r'//.*?:')
    deleteAddr = re.compile(r"<a.*?>.*?</a>|<a href='https:")
    deleteTag = re.compile(r'<.*?>')
    deleteWord = re.compile('回复@|回覆@|回覆|回复')

    @classmethod
    def replace(cls, x):
        """Return *x* with every pattern above removed and outer whitespace stripped."""
        # Order matters: specific patterns run before the generic tag remover.
        for pattern in (cls.deleteWord, cls.deleteImg, cls.deleteAite,
                        cls.deleteAddr, cls.newLine, cls.deleteTag):
            x = pattern.sub('', x)
        return x.strip()


def repostURL(id, pages):
    """Return the repost-timeline API URLs of post *id* for each page in *pages*.

    Page 0 is skipped. (``id`` shadows the builtin; the name is kept so
    existing keyword callers keep working.)
    """
    base = 'https://m.weibo.cn/api/statuses/repostTimeline?id=' + str(id) + '&page='
    # Compare by value: `is not 0` tests object identity, not equality.
    return [base + str(page) for page in pages if page != 0]


def getInfoList(infoDict, rangeDict):
    """Return the values of *infoDict* for every key enabled in *rangeDict*.

    A key is enabled when its value in *rangeDict* is exactly ``True``. Missing
    or ``None`` values are replaced by ``rangeDict['infoNoExist']``. When the
    module-level ``processText`` flag is ``True``, the ``'text'`` field is
    cleaned with ``Tool.replace``. A non-dict *infoDict* (for example the
    ``-1`` failure value returned by ``getInfo``) is treated as empty.
    """
    if not isinstance(infoDict, dict):
        infoDict = {}
    infoList = []
    for item in rangeDict:
        if rangeDict.get(item) is not True:
            continue
        content = infoDict.get(item)
        if content is None:
            infoList.append(rangeDict['infoNoExist'])
            continue
        if item == 'text' and processText is True:
            content = Tool.replace(content)
        infoList.append(content)
    return infoList
"###在已有的一系列urls中进行操作\n", 447 | "###筛选出微博转发内容进行操作\n", 448 | "def reRatio(urls,csvWriter):\n", 449 | " \n", 450 | " notEnd= True\n", 451 | " \n", 452 | " retweetBlogTitle=getInfoTitle(blogRangeDict,'转发')#转发博文信息标题\n", 453 | " retweetUserTitle=getInfoTitle(userRangeDict,'转发')#转发博主信息标题\n", 454 | " \n", 455 | " \n", 456 | " infoTitle=getInfoTitle(infoRangeDict,'')#原文博主个人主页信息标题\n", 457 | " \n", 458 | " #写表格的标题\n", 459 | " if getConcreteInfoList is True: \n", 460 | " csvWriter.writerow(retweetBlogTitle+retweetUserTitle+infoTitle) \n", 461 | " else:\n", 462 | " csvWriter.writerow(retweetBlogTitle+retweetUserTitle)\n", 463 | " \n", 464 | " for url in urls: \n", 465 | " \n", 466 | " response = requests.get(url,headers=headers)\n", 467 | " resjson = json.loads(response.text) \n", 468 | " state=resjson['ok']\n", 469 | " #结束最后\n", 470 | " if(state==0):\n", 471 | " notEnd=False\n", 472 | " break\n", 473 | " \n", 474 | " cards=resjson['data']['data'] \n", 475 | " \n", 476 | " #print(cards)\n", 477 | " \n", 478 | " #结束最后\n", 479 | " if(len(cards)==1):\n", 480 | " notEnd=False\n", 481 | " break\n", 482 | " #遍历一个页面的所有微博 \n", 483 | " for card in cards:\n", 484 | " try:\n", 485 | " #转发博文与博主信息\n", 486 | " retweetBlogInfoDict=card \n", 487 | " retweetUserInfoDict=retweetBlogInfoDict['user'] \n", 488 | " \n", 489 | " #构造填入csv文件数据列表\n", 490 | " infoList=[] \n", 491 | "\n", 492 | " #转发博文数据\n", 493 | " retweetBlogInfoList=getInfoList(retweetBlogInfoDict,blogRangeDict) \n", 494 | " infoList+=retweetBlogInfoList \n", 495 | " \n", 496 | " #转发博主数据\n", 497 | " ##默认已知\n", 498 | " retweetUserInfoList=getInfoList(retweetUserInfoDict,userRangeDict) \n", 499 | " infoList+=retweetUserInfoList \n", 500 | " \n", 501 | " \n", 502 | "\n", 503 | " if getConcreteInfoList is True:\n", 504 | " infoDict=getInfo(isLogin,originUserID)\n", 505 | " otherInfoList=getInfoList(infoDict,infoRangeDict) \n", 506 | " infoList+=otherInfoList \n", 507 | " #print(infoList) \n", 508 | " #保存数据至csv\n", 509 | " 
csvWriter.writerow(infoList) \n", 510 | "\n", 511 | " #不断获取该博主对的影响力\n", 512 | " #break\n", 513 | " except:\n", 514 | " pass \n", 515 | " #延时,防止反爬\n", 516 | " time.sleep(3)\n", 517 | " \n", 518 | " return notEnd" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "### 获取个人主页中信息" 526 | ] 527 | }, 528 | { 529 | "cell_type": "raw", 530 | "metadata": { 531 | "jupyter": { 532 | "source_hidden": true 533 | } 534 | }, 535 | "source": [ 536 | "使用示例:\n", 537 | "response = requests.get(url)\n", 538 | "txt=response.text\n", 539 | "print(drillInfo(txt))\n", 540 | "\n", 541 | "结果如下:\n", 542 | "{'昵称': '甘肃华熙文化',\n", 543 | " '简介': '【马丛珊.禅绣艺术,世界纹绣大师学院甘肃分院】服务生命之美;践行匠心为本,艺心创造,慈心发扬校训,微信mashan5374,☎13109439909',\n", 544 | " '性别': '女',\n", 545 | " '所在地': '甘肃 兰州'}" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 42, 551 | "metadata": { 552 | "jupyter": { 553 | "source_hidden": true 554 | } 555 | }, 556 | "outputs": [], 557 | "source": [ 558 | "def drillInfo(txt):\n", 559 | " keyInfo={}\n", 560 | " \n", 561 | " try: \n", 562 | " resjson = json.loads(txt) \n", 563 | " infodata = resjson.get('data')\n", 564 | " cards = infodata.get('cards')\n", 565 | " for l in range(0,len(cards)):\n", 566 | " temp = cards[l]\n", 567 | " card_group = temp.get('card_group') \n", 568 | " #判断获取信息类型 \n", 569 | " for card in card_group: \n", 570 | " #将信息传入字典\n", 571 | " name=card.get('item_name')\n", 572 | " if name is not None:\n", 573 | " content=card.get('item_content')\n", 574 | " keyInfo[name]=content \n", 575 | " except:\n", 576 | " pass\n", 577 | " return keyInfo" 578 | ] 579 | }, 580 | { 581 | "cell_type": "markdown", 582 | "metadata": {}, 583 | "source": [ 584 | "### 构建通过id访问个人主页的url" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 43, 590 | "metadata": { 591 | "jupyter": { 592 | "source_hidden": true 593 | } 594 | }, 595 | "outputs": [], 596 | "source": [ 597 | "def infoUrl(id):\n", 598 | " url = 
\"https://m.weibo.cn/api/container/getIndex?containerid=230283\"+str(id)+\"_-_INFO\" \n", 599 | " return url" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "## 爬取某id博主的个人信息" 607 | ] 608 | }, 609 | { 610 | "cell_type": "raw", 611 | "metadata": { 612 | "jupyter": { 613 | "source_hidden": true 614 | } 615 | }, 616 | "source": [ 617 | "为防止反复爬取,将原文整体保存为文件,格式为 信息卡片长度(2 or 5)+id+博主id\n", 618 | "不登录2含有性别、所在地\n", 619 | "登录5含有性别、所在地、星座、大学、公司等完整信息\n", 620 | "若存在所需文件,则从文件读取信息,否则爬取,同时保存文件\n", 621 | "\n", 622 | "若爬取未成功,返回-1\n", 623 | "\n", 624 | "使用库函数:\n", 625 | "os" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 44, 631 | "metadata": { 632 | "jupyter": { 633 | "source_hidden": true 634 | } 635 | }, 636 | "outputs": [], 637 | "source": [ 638 | "def getInfo(state,id):\n", 639 | " \n", 640 | " address=addrRoot+'info/'+str(state)+'id'+str(id)+'.txt'\n", 641 | " path=addrRoot+'info/'\n", 642 | " if os.path.exists(path) is False:\n", 643 | " os.makedirs(path)\n", 644 | " try:\n", 645 | " #已有文件\n", 646 | " if(os.path.exists(address)==True):\n", 647 | " fp = open(address,'r',encoding='utf-16')\n", 648 | " txt=fp.read()\n", 649 | " info=drillInfo(txt)\n", 650 | " fp.close()\n", 651 | " else: \n", 652 | " fp = open(address,'w+',encoding='utf-16')\n", 653 | " url=infoUrl(id)\n", 654 | " if state is True:\n", 655 | " response = requests.get(url,headers=headers)\n", 656 | " else:\n", 657 | " response = requests.get(url)\n", 658 | " txt=response.text\n", 659 | " fp.write(response.text) \n", 660 | " info=drillInfo(txt)\n", 661 | " fp.close()\n", 662 | " except:\n", 663 | " info=-1 \n", 664 | " \n", 665 | " return info" 666 | ] 667 | }, 668 | { 669 | "cell_type": "raw", 670 | "metadata": { 671 | "jupyter": { 672 | "source_hidden": true 673 | } 674 | }, 675 | "source": [ 676 | "获取特定个人信息" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": 45, 682 | "metadata": { 683 | "jupyter": { 684 
def getExatInfo(item, state, id):
    """Return profile field *item* of user *id*, or ``infoNoExistStr`` if unavailable.

    ``getInfo`` returns ``-1`` (not a dict) when the lookup fails; that case
    yields the placeholder instead of raising ``AttributeError``.
    """
    info = getInfo(state, id)
    if not isinstance(info, dict):
        return infoNoExistStr
    content = info.get(item)
    return infoNoExistStr if content is None else content


def downloadData(id, tweeter='视频-郭杰瑞'):
    """Crawl all reposts of post *id* into per-batch CSV files.

    Each batch covers 20 consecutive page numbers and is written to
    ``addrFile(tweeter, 'batch<N>')``. Batches whose file already exists are
    skipped. Crawling stops when ``reRatio`` reports the end of the timeline.

    *tweeter* names the output folder and file prefix; its default is the
    value the notebook previously hard-coded.
    """
    batch = 0
    while True:
        pages = range(20 * batch, 20 * (batch + 1))
        fileAddr = addrFile(tweeter, 'batch' + str(batch))

        if os.path.exists(fileAddr):
            print(fileAddr + '已存在,跳过采集')
        else:
            print('文件将写入:' + fileAddr)
            # The context manager closes the file even if crawling raises.
            with open(fileAddr, 'w+', newline='', encoding='utf-16') as fp:
                hasMore = reRatio(repostURL(id, pages), csv.writer(fp))
            if hasMore is False:
                break
            print('第' + str(batch) + '批数据已记录完毕')
        batch += 1
"\n", 774 | "#穿帮君\n", 775 | "id=5671786192\n", 776 | "\n", 777 | "#汉堡爸爸\n", 778 | "id=2784421224\n", 779 | "\n", 780 | "#蔡徐坤\n", 781 | "\n", 782 | "id=1776448504\n", 783 | "\n", 784 | "\n", 785 | "#林书豪\n", 786 | "id=2106855375\n", 787 | "\n", 788 | "#干部\n", 789 | "id=6472269230\n", 790 | "\n", 791 | "#任嘉伦\n", 792 | "id=3800468188\n", 793 | "\n", 794 | "#肖战\n", 795 | "id=1792951112\n", 796 | "\n", 797 | "\n", 798 | "#迪丽热巴\n", 799 | "id=1669879400\n", 800 | "\n", 801 | "\n", 802 | "#科比\n", 803 | "id=3264072325" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": 51, 809 | "metadata": {}, 810 | "outputs": [ 811 | { 812 | "name": "stdin", 813 | "output_type": "stream", 814 | "text": [ 815 | "博文id: 4466832293743498\n" 816 | ] 817 | }, 818 | { 819 | "name": "stdout", 820 | "output_type": "stream", 821 | "text": [ 822 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch0.csv\n", 823 | "第0批数据已记录完毕\n", 824 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch1.csv\n", 825 | "第1批数据已记录完毕\n", 826 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch2.csv\n", 827 | "第2批数据已记录完毕\n", 828 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch3.csv\n", 829 | "第3批数据已记录完毕\n", 830 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch4.csv\n", 831 | "第4批数据已记录完毕\n", 832 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch5.csv\n", 833 | "第5批数据已记录完毕\n", 834 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch6.csv\n", 835 | "第6批数据已记录完毕\n", 836 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch7.csv\n", 837 | "第7批数据已记录完毕\n", 838 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch8.csv\n", 839 | "第8批数据已记录完毕\n", 840 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch9.csv\n", 841 | "第9批数据已记录完毕\n", 842 | 
"文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch10.csv\n", 843 | "第10批数据已记录完毕\n", 844 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch11.csv\n", 845 | "第11批数据已记录完毕\n", 846 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch12.csv\n", 847 | "第12批数据已记录完毕\n", 848 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch13.csv\n", 849 | "第13批数据已记录完毕\n", 850 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch14.csv\n", 851 | "第14批数据已记录完毕\n", 852 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch15.csv\n", 853 | "第15批数据已记录完毕\n", 854 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch16.csv\n", 855 | "第16批数据已记录完毕\n", 856 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch17.csv\n", 857 | "第17批数据已记录完毕\n", 858 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch18.csv\n", 859 | "第18批数据已记录完毕\n", 860 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch19.csv\n", 861 | "第19批数据已记录完毕\n", 862 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch20.csv\n", 863 | "第20批数据已记录完毕\n", 864 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch21.csv\n", 865 | "第21批数据已记录完毕\n", 866 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch22.csv\n", 867 | "第22批数据已记录完毕\n", 868 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch23.csv\n", 869 | "第23批数据已记录完毕\n", 870 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch24.csv\n", 871 | "第24批数据已记录完毕\n", 872 | "文件将写入:C:/Users/cascara/Desktop/seedcup/csv/blog/single/视频-郭杰瑞/视频-郭杰瑞batch25.csv\n" 873 | ] 874 | } 875 | ], 876 | "source": [ 877 | "id=input('博文id:')\n", 878 | "#4102228300324979\n", 879 | "#4466810701697847武汉肺炎\n", 880 | "#4465738137650546\n", 881 | "downloadData(id)" 882 | ] 883 | }, 
884 | { 885 | "cell_type": "code", 886 | "execution_count": 37, 887 | "metadata": {}, 888 | "outputs": [ 889 | { 890 | "ename": "NameError", 891 | "evalue": "name 'fp' is not defined", 892 | "output_type": "error", 893 | "traceback": [ 894 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 895 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 896 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mfp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 897 | "\u001b[1;31mNameError\u001b[0m: name 'fp' is not defined" 898 | ] 899 | } 900 | ], 901 | "source": [ 902 | "fp.close()" 903 | ] 904 | }, 905 | { 906 | "cell_type": "code", 907 | "execution_count": 48, 908 | "metadata": {}, 909 | "outputs": [ 910 | { 911 | "ename": "NameError", 912 | "evalue": "name 'resjson' is not defined", 913 | "output_type": "error", 914 | "traceback": [ 915 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 916 | "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", 917 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mresjson\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", 918 | "\u001b[1;31mNameError\u001b[0m: name 'resjson' is not defined" 919 | ] 920 | } 921 | ], 922 | "source": [ 923 | "print(resjson)" 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": null, 929 | "metadata": {}, 930 | "outputs": [], 931 | "source": [] 932 | } 933 | ], 934 | "metadata": { 935 | "kernelspec": { 936 | "display_name": "Python 3", 937 | "language": "python", 938 | "name": "python3" 939 | }, 940 | "language_info": { 941 | "codemirror_mode": { 
942 | "name": "ipython", 943 | "version": 3 944 | }, 945 | "file_extension": ".py", 946 | "mimetype": "text/x-python", 947 | "name": "python", 948 | "nbconvert_exporter": "python", 949 | "pygments_lexer": "ipython3", 950 | "version": "3.7.4" 951 | } 952 | }, 953 | "nbformat": 4, 954 | "nbformat_minor": 4 955 | } 956 | -------------------------------------------------------------------------------- /singleWeibo/analyseLinks-html.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import csv" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "def addrFile(tweeter,suffix):\n", 20 | " path=addrRoot+str(tweeter)+'/'\n", 21 | " if os.path.exists(path) is False:\n", 22 | " os.makedirs(path)\n", 23 | " address=path+tweeter+suffix\n", 24 | " return address " 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 4, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "#文件保存地址,改为你存放csv文件的完整地址\n", 34 | "addrRoot='C:/Users/cascara/Desktop/seedcup/csv/blog/single/'\n", 35 | "\n", 36 | "#是否调试\n", 37 | "isDebug=False\n", 38 | "\n", 39 | "#100万+显示\n", 40 | "infinity=1000000" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdin", 50 | "output_type": "stream", 51 | "text": [ 52 | "输入博主姓名: 武汉肺炎-搜狐新闻\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "tweeter=input('输入博文代号:')#武汉肺炎-搜狐新闻" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 6, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "C:/Users/cascara/Desktop/seedcup/csv/blog/single/武汉肺炎-搜狐新闻/武汉肺炎-搜狐新闻batch0.csv\n", 70 | 
"C:/Users/cascara/Desktop/seedcup/csv/blog/single/武汉肺炎-搜狐新闻/武汉肺炎-搜狐新闻batch1.csv\n", 71 | "C:/Users/cascara/Desktop/seedcup/csv/blog/single/武汉肺炎-搜狐新闻/武汉肺炎-搜狐新闻batch2.csv\n", 72 | "C:/Users/cascara/Desktop/seedcup/csv/blog/single/武汉肺炎-搜狐新闻/武汉肺炎-搜狐新闻batch3.csv\n", 73 | "不存在C:/Users/cascara/Desktop/seedcup/csv/blog/single/武汉肺炎-搜狐新闻/武汉肺炎-搜狐新闻batch3.csv\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "#开始\n", 79 | "startCollect=True\n", 80 | "\n", 81 | "\n", 82 | "batch=0\n", 83 | "dataDict={}\n", 84 | "Title=[]\n", 85 | "while(1):\n", 86 | " titleEixst=True\n", 87 | " address=addrFile(tweeter,'batch'+str(batch)+'.csv') \n", 88 | " print(address)\n", 89 | " \n", 90 | " if os.path.exists(address) is True:\n", 91 | " fp = open(address,'r',newline='',encoding='utf-16') \n", 92 | " reader=csv.reader(fp)\n", 93 | " for line in reader:\n", 94 | " if titleEixst is True:\n", 95 | " #将标题制成各个\n", 96 | " if startCollect is True:\n", 97 | " for item in line:\n", 98 | " #print(item[-1])\n", 99 | " if isDebug is True:\n", 100 | " print(item)\n", 101 | "\n", 102 | " \n", 103 | " dataDict[item]=[]\n", 104 | " startCollect=False\n", 105 | " Title=line \n", 106 | " if isDebug is True:\n", 107 | " print(line)\n", 108 | " titleEixst=False\n", 109 | " else:\n", 110 | " for no in range(len(line)): \n", 111 | " if(Title[no].find('count')>=0):\n", 112 | " if line[no].find('0万') > 0:\n", 113 | " dataDict.get(Title[no]).append(infinity)\n", 114 | " else:\n", 115 | " dataDict.get(Title[no]).append(eval(line[no]))\n", 116 | " \n", 117 | " #dataDict.get(Title[no]).append((line[no]))\n", 118 | " \n", 119 | " else:\n", 120 | " dataDict.get(Title[no]).append((line[no]))\n", 121 | " if isDebug is True:\n", 122 | " print(Title[no])\n", 123 | " print(line[no]) \n", 124 | " \n", 125 | " \n", 126 | " else:\n", 127 | " print('不存在'+address)\n", 128 | " break\n", 129 | " fp.close()\n", 130 | " batch+=1\n", 131 | "#print(dataDict)" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 
137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "['转发created_at', '转发text', '转发reposts_count', '转发comments_count', '转发attitudes_count', '转发id', '转发screen_name', '转发description', '转发gender', '转发followers_count', '转发follow_count']\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "print(Title)" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### 读入数据" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 8, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "originData={}\n", 165 | "retweetData={}\n", 166 | "\n", 167 | "labels=[0 for i in range(4)]\n", 168 | "cnLabels={}\n", 169 | "\n", 170 | "labels[1]='reposts_count'\n", 171 | "labels[2]='comments_count'\n", 172 | "labels[3]='attitudes_count'\n", 173 | "labels[0]='follow_count'\n", 174 | "\n", 175 | "cnLabels[labels[1]]='转发量'\n", 176 | "cnLabels[labels[2]]='评论量'\n", 177 | "cnLabels[labels[3]]='点赞量'\n", 178 | "cnLabels[labels[0]]='粉丝量'\n", 179 | "\n", 180 | "\n", 181 | "for i in range(4):\n", 182 | " #print(choice+label)\n", 183 | " try:\n", 184 | " #originData[cnLabels[i]]=np.asarray(dataDict.get('原文'+labels[i])[::-1])\n", 185 | " retweetData[labels[i]]=np.asarray(dataDict.get('转发'+labels[i])[::-1])\n", 186 | " except:\n", 187 | " pass\n", 188 | " try:\n", 189 | " originData[labels[i]]=np.asarray(dataDict.get('原文'+labels[i])[::-1])\n", 190 | " #retweetData[cnLabels[i]]=np.asarray(dataDict.get('转发'+labels[i])[::-1])\n", 191 | " except:\n", 192 | " pass\n", 193 | "\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 10, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "import re\n", 203 | "#工具类,用来去除爬取的正文中一些不需要的链接、标签等\n", 204 | "class Tool:\n", 205 | " deleteImg = re.compile('')\n", 206 | " newLine =re.compile('|
||
')\n", 207 | " \n", 208 | " deleteAite = re.compile('//.*?:')\n", 209 | " \n", 210 | " repostEN=re.compile('//@(.*?)
:')#英文字符冒号\n", 211 | " repostCN=re.compile('//@(.*?):')#中文字符冒号\n", 212 | " \n", 213 | " \n", 214 | " deleteAddr = re.compile('.*?|')\n", 216 | " deleteWord = re.compile('回复@|回覆@|回覆|回复')\n", 217 | " \n", 218 | " @classmethod\n", 219 | " def findSource(cls,x):\n", 220 | " sourceName=''\n", 221 | " xEN=xCN=''\n", 222 | " \n", 223 | " xEN = re.findall(cls.repostEN,x)\n", 224 | " xCN = re.findall(cls.repostCN,x)\n", 225 | " \n", 226 | " \n", 227 | " #如果其中一者存在,另一者不存在,即返回该者\n", 228 | " if(len(xCN)==0 and len(xEN)>0):\n", 229 | " sourceName=xEN[0]\n", 230 | " #print(xEN[0])\n", 231 | " elif(len(xEN)==0 and len(xCN)>0):\n", 232 | " sourceName=xCN[0]\n", 233 | " #print(xCN[0]) \n", 234 | " #若二者都存在,则返回第一位置字符串较小的\n", 235 | " elif(len(xEN)>0 and len(xCN)>0):\n", 236 | " #print(xCN[0]) if(len(xEN[0])>len(xCN[0])) else print(xEN[0])\n", 237 | " sourceName=xCN[0] if(len(xEN[0])>len(xCN[0])) else xEN[0]\n", 238 | "\n", 239 | " \n", 240 | " return sourceName" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "class Categories:\n", 250 | " \n", 251 | " def __init__(self):\n", 252 | " self.compose={}\n", 253 | " #pass\n", 254 | " def add(self,name):\n", 255 | " self.compose[name]={}\n", 256 | " category=self.compose[name]\n", 257 | " category['value']=0 #记录被转次数\n", 258 | " category['target']={}\n", 259 | " category['source']={}\n", 260 | " #category['isS']=0\n", 261 | " def nameExist(self,name):\n", 262 | " if self.compose.get(name) is None: \n", 263 | " return False\n", 264 | " else:\n", 265 | " return True\n", 266 | " \n", 267 | " def addTarget(self,sourceName,targetName):\n", 268 | " if self.nameExist(sourceName) is False:\n", 269 | " self.add(sourceName)\n", 270 | " if self.nameExist(targetName) is False:\n", 271 | " self.add(targetName)\n", 272 | " \n", 273 | " ##防止循环调用 \n", 274 | " if sourceName == targetName:\n", 275 | " #print(sourceName)\n", 276 | " return \n", 277 | " if 
self.compose[targetName]['source'].get(sourceName) is not None:\n", 278 | " #print(sourceName)\n", 279 | " self.compose[targetName]['source'].pop(sourceName)\n", 280 | " \n", 281 | " \n", 282 | " if self.compose[targetName]['source'].get(sourceName) is None:\n", 283 | " self.compose[targetName]['source'][sourceName]=1\n", 284 | " else:\n", 285 | " self.compose[targetName]['source'][sourceName]+=1 \n", 286 | " \n", 287 | " \n", 288 | " if self.compose[sourceName]['target'].get(sourceName) is None:\n", 289 | " self.compose[sourceName]['target'][targetName]=1\n", 290 | " else:\n", 291 | " self.compose[sourceName]['target'][targetName]+=1\n", 292 | " self.compose[sourceName]['value']+=1 \n", 293 | " \n", 294 | " def countAll(self,name):\n", 295 | " '''count =0\n", 296 | " if self.nameExist(name) is False:\n", 297 | " self.add(name) \n", 298 | " count= 0\n", 299 | " else:'''\n", 300 | " \n", 301 | " targets=self.compose[name]['target']\n", 302 | " if targets == {}:\n", 303 | " self.compose[name]['value']=0\n", 304 | " else:\n", 305 | " for targetName in targets:\n", 306 | " if self.compose[targetName]['target']=={}:\n", 307 | " self.compose[targetName]['value']=0\n", 308 | " #self.compose[name]['value']+=1\n", 309 | " else:\n", 310 | " self.countAll(targetName)\n", 311 | " self.compose[name]['value']+=self.compose[targetName]['value']\n", 312 | " #数据缺失补充 假定为转发原博主 \n", 313 | " def fillSource(self,tweeter):\n", 314 | " for item in self.compose:\n", 315 | " source=self.compose[item].get('source')\n", 316 | " if (len(source))!=1 and item !=tweeter:\n", 317 | " self.addTarget(tweeter,item)\n", 318 | " #print(self.compose[item])\n", 319 | " \n", 320 | " \n", 321 | " " 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 12, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "choice='转发'#'原文'#\n", 331 | "categories=Categories()\n", 332 | "txt=''\n", 333 | "\n", 334 | "\n", 335 | "\n", 336 | "for name,text in 
zip(dataDict[choice+'screen_name'],dataDict[choice+'text']):\n", 337 | " \n", 338 | " if categories.nameExist(name) is False:\n", 339 | " categories.add(name)\n", 340 | " \n", 341 | " \n", 342 | " sourceName=(Tool.findSource(text))\n", 343 | " if sourceName is not '': \n", 344 | " categories.addTarget(sourceName,name)\n", 345 | " \n", 346 | " else:\n", 347 | " categories.addTarget(tweeter,name)\n", 348 | "\n", 349 | "categories.countAll(tweeter)#zwysgs')#处理所有,统计所有转发来源量 \n", 350 | "categories.fillSource(tweeter) \n", 351 | " \n", 352 | " " 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 13, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "nodes=[]\n", 362 | "arcs=[]\n", 363 | "\n", 364 | "category=[]\n", 365 | "\n", 366 | "\n", 367 | "\n", 368 | "for i in categories.compose:\n", 369 | " value=categories.compose[i]['value']\n", 370 | " try:\n", 371 | " source=list(categories.compose[i]['source'])[0]\n", 372 | " except:\n", 373 | " source=tweeter\n", 374 | " #print(i)\n", 375 | " node={ \"name\":i,\n", 376 | " \"symbolSize\": 5,\n", 377 | " \"draggable\": \"False\",\n", 378 | " \"value\": value,\n", 379 | " \"category\": source\n", 380 | " }\n", 381 | " \n", 382 | " \n", 383 | " if value > 0:\n", 384 | " if i==tweeter:\n", 385 | " node[\"category\"]=i\n", 386 | " #change node\n", 387 | " if source !=tweeter:\n", 388 | " #print(i)\n", 389 | " node[\"category\"]=i\n", 390 | " symbolSize=value//10 \n", 391 | " if symbolSize>5:\n", 392 | " node['symbolSize']=symbolSize\n", 393 | " node['label']={\n", 394 | " \"normal\": {\n", 395 | " \"show\": \"True\"\n", 396 | " }\n", 397 | " }\n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " #add arcs\n", 402 | " targets=categories.compose[i]['target']\n", 403 | " if targets != {}:\n", 404 | " for target in targets:\n", 405 | " arcs.append({'source':i,'target':target})\n", 406 | " \n", 407 | " \n", 408 | " #归入category\n", 409 | " category.append({'name':i})\n", 410 | " 
nodes.append(node)\n", 411 | " \n", 412 | " \n", 413 | " #try:\n", 414 | " #categories.compose[i]['target'][i] \n", 415 | " \n", 416 | " #print(i)\n", 417 | " #print( categories.compose[i])\n", 418 | " #except:\n", 419 | " #pass\n", 420 | "#print(nodes)\n", 421 | "#print(arcs)\n", 422 | "#print(category)\n", 423 | "\n", 424 | "content='【独家|武汉红十字会:无管理费收入 全靠财政拨款;12名员工人均“工资福利”23万】资料显示,武汉市红十字会为财政拨款单位,几乎全部收入来自财政拨款。2018年武汉红会“人员经费”支出为329.67万元。其中,工资福利279.34万元,其官方公布当年在职人员12人。以此计算,2018年武汉红会人均“人员经费”27.47万元,人均“工资福利”23.28万元。另据其2019年预算数据计算,武汉红会人均领取“工资福利”有望达到27.98万元。而2018年武汉市城镇单位就业人员平均工资为73671元,城镇非私营单位就业人员平均工资为85885元。\\\n", 425 | "自武汉新冠肺炎疫情爆发以来,武汉红十字会便开始接受全国各地、各界捐赠物资。截至1月29日24:00,武汉红会6天内累计收到社会捐款4.9亿元。 “1月23号以来,11个工作人员已经全部取消年假,还另外招募了近50位志愿者,24小时轮流加班。” 武汉市红十字会在官网上讲述了自己繁忙的工作状态。但与此同时,对于捐款和物资的质疑也接连而至。\\\n", 426 | "1月25日,有网友称“第一批医疗物资300套防护服到达武汉时,武汉红会要求捐赠者支付6%-8%的服务费,不然就拒收”。武汉市红十字会于两天后辟谣称,其至今为止的所有工作流程中,均不存在任何收费现象。\\\n", 427 | "1月29日,网上又开始流传“山东寿光无偿援助武汉350吨蔬菜,却被经过武汉相关部门通过超市售卖后,款项流向武汉红会”。对此,武汉红十字会再次发布申明否认称,其从未接收任何单位、任何个人捐赠的“寿光蔬菜”,更没有参与该批蔬菜的分配、售卖,也没有收到过与此相关的任何现金捐赠。'\n", 428 | "mid='4467059339091135'\n", 429 | "tweeter='武汉肺炎-搜狐新闻'" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 14, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "testData=[]\n", 439 | "testData.append(nodes)\n", 440 | "testData.append(arcs)\n", 441 | "testData.append(category)\n", 442 | "testData.append(content)\n", 443 | "testData.append(mid)\n", 444 | "testData.append(tweeter)\n" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 15, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "import json\n", 454 | "\n", 455 | "testFile=addrFile(tweeter,'.json')\n", 456 | "with open(testFile,'w',encoding='utf-8') as file_obj:\n", 457 | " json.dump(testData,file_obj)\n", 458 | "file_obj.close()" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 17, 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "name": 
"stdout", 468 | "output_type": "stream", 469 | "text": [ 470 | "武汉肺炎-搜狐新闻\n", 471 | "武汉肺炎-搜狐新闻\n" 472 | ] 473 | }, 474 | { 475 | "data": { 476 | "text/plain": [ 477 | "'C:\\\\Users\\\\cascara\\\\learngit\\\\gitskills\\\\singleWeibo\\\\render.html'" 478 | ] 479 | }, 480 | "execution_count": 17, 481 | "metadata": {}, 482 | "output_type": "execute_result" 483 | } 484 | ], 485 | "source": [ 486 | "from pyecharts import options as opts\n", 487 | "from pyecharts.charts import Geo, Page, Bar ,Graph\n", 488 | "from pyecharts.faker import Collector, Faker\n", 489 | "from pyecharts.globals import CurrentConfig,NotebookType#ChartType, SymbolType\n", 490 | "CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_LAB\n", 491 | "\n", 492 | "C = Collector()\n", 493 | "\n", 494 | "@C.funcs\n", 495 | "def graph_weibo() -> Graph:\n", 496 | " with open(testFile, \"r\", encoding=\"utf-8\") as f:\n", 497 | " j = json.load(f)\n", 498 | " nodes, links, categories, cont, mid, userl = j\n", 499 | " c = (\n", 500 | " Graph()\n", 501 | " .add(\n", 502 | " \"肺炎\",\n", 503 | " nodes,\n", 504 | " links,\n", 505 | " categories,\n", 506 | " repulsion=50,\n", 507 | " linestyle_opts=opts.LineStyleOpts(curve=0.2),\n", 508 | " label_opts=opts.LabelOpts(is_show=False),#True),#\n", 509 | " )\n", 510 | " .set_global_opts(\n", 511 | " legend_opts=opts.LegendOpts(is_show=False),#True),#\n", 512 | " title_opts=opts.TitleOpts(title=\"武汉肺炎——搜狐新闻微博转发关系图\"),\n", 513 | " )\n", 514 | " )\n", 515 | " print(userl)\n", 516 | " return c\n", 517 | "\n", 518 | "Page().add(*[fn() for fn, _ in C.charts]).render()" 519 | ] 520 | } 521 | ], 522 | "metadata": { 523 | "kernelspec": { 524 | "display_name": "Python 3", 525 | "language": "python", 526 | "name": "python3" 527 | }, 528 | "language_info": { 529 | "codemirror_mode": { 530 | "name": "ipython", 531 | "version": 3 532 | }, 533 | "file_extension": ".py", 534 | "mimetype": "text/x-python", 535 | "name": "python", 536 | "nbconvert_exporter": "python", 537 | "pygments_lexer": 
"ipython3", 538 | "version": "3.7.4" 539 | } 540 | }, 541 | "nbformat": 4, 542 | "nbformat_minor": 4 543 | } 544 | --------------------------------------------------------------------------------