├── README.md ├── chromedriver.exe ├── draw.py ├── home.png ├── qq_excel.xlsx ├── qq_word.txt ├── qq空间excel.py ├── qq空间txt.py ├── 产生图片.py ├── 各种图.py ├── 每年发表说说总数柱状图.html └── 每年点赞和评论折现图.html /README.md: -------------------------------------------------------------------------------- 1 | ## python爬取网页版QQ空间,生成词云图、柱状图、折线图 2 | 3 | 最近python课程学完了,琢磨着用python点什么东西,经过一番搜索,盯上了QQ空间,拿走不谢,欢迎点赞收藏,记得github给个star! 4 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210523231805468.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) 5 | 6 | - 爬取的说说内容 7 | 8 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210523231936542.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) 9 | 10 | - 个性化说说内容词云图 11 | 12 | ![请添加图片描述](https://img-blog.csdnimg.cn/20210523232951939.jpg?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) 13 | 14 | - 每年发表说说总数柱状图、每年点赞和评论折线图 15 | 16 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210523231852789.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) 17 | 18 | - 7天好友动态柱状图、饼图 19 | 20 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210524000939728.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) 21 | 22 | - 按照你的谷歌浏览器下载指定版本的驱动 http://chromedriver.storage.googleapis.com/index.html 23 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210523232204272.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) 24 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210523232225804.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) 25 | - 驱动跟两个python脚本放入同目录,我的版本是90.0.4430的,查看你自己的版本,下载后把我的chromedriver.exe替换掉! 26 | 27 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210523233209797.png) 28 | 29 | - 这里用到了很多第三方包,鼠标放在报红的包名下,用Alt+Enter导包,如果失败则在控制台用下面的必杀技 30 | 31 | ``` 32 | pip install 包名 -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com 33 | ``` 34 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210524000853586.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) 35 | 36 | github源码地址:https://github.com/kuishou68/python 37 | 38 | 记得给个star喲! 39 | 40 | ![在这里插入图片描述](https://img-blog.csdnimg.cn/20210524000224763.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80NDAxOTM3MA==,size_16,color_FFFFFF,t_70) -------------------------------------------------------------------------------- /chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuishou68/python/4c0b411507a6ada7b33b4781378df7ec9845013c/chromedriver.exe -------------------------------------------------------------------------------- /draw.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import calendar 3 | import matplotlib.pyplot as plt 4 | 5 | #统计近7天的说说 6 | statistics=dict() 7 | #今天的日期 8 | today = datetime.datetime.now() 9 | #判断闰年 10 | if (calendar.isleap(today.year)): 11 | #闰年2月的天数 12 | Feb = 29 13 | else: 14 | Feb = 28 15 | #每月的天数 16 | month = (31, Feb, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31) 17 | #格式化日期 18 | s = '{}年{}月{}日' 19 | #初始化statistics 20 | def init_times(): 21 | #如果7天都在同一个月 22 | if today.day>=7: 23 | #同一个月中,分别初始化7天的日期 24 | for i in range(7): 25 | temp=s.format(str(today.year),str(today.month),str(today.day-6+i)) 26 | #初始化日期为0 27 | statistics[temp]=0 28 | #7天不在同一个月中,即夸月,要判断上个月是否是前一年的12月,和确定上个月的总天数 29 | else: 30 | mon=today.month 31 | year=today.year 32 | #确定前一个月有几天 33 | last=7-today.day 34 | #如果当月是1月 35 | if today.month==1: 36 | month=12 37 | #年份减一 38 | year=today.year-1 39 | #先确定前一个月的几天日期 40 | for i in range(last): 41 | temp=s.format(str(year),str(mon),str(month[mon-1]-last+1+i)) 42 | statistics[temp] = 0 43 | #当月的几天日期 44 | for i in range(today.day): 45 | temp = s.format(str(today.year), str(today.month), str(i+1)) 46 | statistics[temp] = 0 47 | 48 | def count(qq_time): 49 | #如果qq日期中有“昨”的字样 50 | if '昨' in qq_time: 51 | # 如果今天是当月的第一天,那么昨天就是前一个月的最后一天 52 | if today.day==1: 53 | # 如果当月是1月 54 | if today.month==1: 55 | qq_time=s.format(str(today.year-1),str(12),str(31)) 56 | else: 57 | qq_time = s.format(str(today.year), str(today.month-1), str(month[today.month-2])) 58 | # 如果今天不是当月的第一天,则年份和月份不用管 59 | else: 60 | qq_time = s.format(str(today.year), str(today.month), str(today.day-1)) 61 | # 如果qq日期中有“前”的字样 62 | elif '前' in qq_time: 63 | #如果今天是当月的第二天,那么前天就是前一个月的最后一天 64 | if today.day == 2: 65 | #如果当月是1月 66 | if today.month == 1: 67 | qq_time = s.format(str(today.year - 1), str(12), str(31)) 68 | #如果当月不是1月,那么年份不用管 69 | else: 70 | qq_time = s.format(str(today.year), str(today.month - 1), str(month[today.month - 2])) 71 | # 如果今天是当月的第一天,那么前天就是前一个月的倒数第二天 72 | elif today.day==1: 73 | # 如果当月是1月 74 | if today.month==1: 75 | qq_time=s.format(str(today.year-1),str(12),str(30)) 76 | # 如果当月不是1月,那么年份不用管 77 | else: 78 | qq_time = s.format(str(today.year), str(today.month-1), str(month[today.month-2]-1)) 79 | # 如果今天既不是当月的第一天或者第二天 80 | else: 81 | qq_time = s.format(str(today.year), str(today.month), str(today.day - 2)) 82 | # 如果qq日期中直接是时间的字样 83 | elif '天' not in qq_time and '日' not in qq_time: 84 | qq_time=s.format(str(today.year),str(today.month),str(today.day)) 85 | #统计近7天qq中说说的条数 86 | if qq_time in statistics: 87 | statistics[qq_time]=statistics.get(qq_time)+1 88 | #初始化times 89 | init_times() 90 | #从爬取的时间文件中读取说说的日期 91 | with open('qq_word.txt','r',encoding='utf-8') as lines: 92 | for line in lines: 93 | count(line.split('\n')[0]) 94 | 95 | #绘图 96 | plt.rcParams['font.family'] = 'SimHei' 97 | #x轴数据 98 | x_data = date= statistics.keys() 99 | #y轴数据 100 | y_data = statistics.values() 101 | #饼图的百分比 102 | percent=[] 103 | #饼图的数据 104 | for value in statistics.values(): 105 | percent.append(round(value/sum(statistics.values()),1)) 106 | #柱状图 107 | #两个子图1行2列展示 108 | fig,ax = plt.subplots(1,2,figsize=(16,12)) 109 | #柱状上面的数据 110 | for x,y in zip(x_data,y_data): 111 | ax[0].text(x,y,y,fontsize=14,horizontalalignment='center') 112 | #y轴在说说最多的数字上加1 113 | ax[0].set_ylim([0,max(statistics.values())+1]) 114 | #喂入数据 115 | ax[0].bar(x_data,y_data) 116 | #x轴label文字显示 117 | ax[0].tick_params(axis='x',rotation=20) 118 | #柱状图标题 119 | ax[0].set_title('近7天好友动态统计',fontsize=12) 120 | #为子图设置横轴标题 121 | ax[0].set_xlabel('日期') 122 | #为子图设置纵轴标题 123 | ax[0].set_ylabel('条') 124 | #饼图 125 | ax[1].pie(percent,labels=date,autopct='%1.1f%%') 126 | #绘画 127 | plt.show() -------------------------------------------------------------------------------- /home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuishou68/python/4c0b411507a6ada7b33b4781378df7ec9845013c/home.png -------------------------------------------------------------------------------- /qq_excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuishou68/python/4c0b411507a6ada7b33b4781378df7ec9845013c/qq_excel.xlsx -------------------------------------------------------------------------------- /qq_word.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kuishou68/python/4c0b411507a6ada7b33b4781378df7ec9845013c/qq_word.txt -------------------------------------------------------------------------------- /qq空间excel.py: -------------------------------------------------------------------------------- 1 | import time 2 | import urllib 3 | 4 | from selenium import webdriver 5 | from lxml import etree 6 | import xlwings as xw 7 | # 这里一定要设置编码格式,防止后面写入文件时报错 8 | 9 | friend = '1569339843' # 朋友的QQ号,朋友的空间要求允许你能访问 10 | user = '783533896' # 你的QQ号 11 | pw = '1323mkoNJI.@' # 你的QQ密码 12 | 13 | # 获取浏览器驱动 14 | chrome_driver = 'chromedriver.exe' 15 | driver = webdriver.Chrome(executable_path=chrome_driver) 16 | 17 | # 浏览器窗口最大化 18 | driver.maximize_window() 19 | 20 | def ParseCookiestr(cookie_str): 21 | cookielist = [] 22 | for item in cookie_str.split(';'): 23 | cookie = {} 24 | itemname=item.split('=')[0] 25 | iremvalue=item.split('=')[1] 26 | cookie['name']=itemname 27 | cookie['value']= urllib.parse.unquote(iremvalue) 28 | cookielist.append(cookie) 29 | return cookielist 30 | 31 | # 进入QQ空间 32 | def loginByPwd (driver, user, pw): 33 | # 浏览器地址定向为qq登陆页面 34 | driver.get("http://i.qq.com") 35 | #所以这里需要选中一下frame,否则找不到下面需要的网页元素 36 | driver.switch_to.frame("login_frame") 37 | time.sleep(3) 38 | # 自动点击账号登陆方式 39 | driver.find_element_by_id("switcher_plogin").click() 40 | time.sleep(3) 41 | # 账号输入框输入已知qq账号 42 | driver.find_element_by_id("u").send_keys(user) 43 | time.sleep(5) 44 | # 密码框输入已知密码 45 | driver.find_element_by_id("p").send_keys(pw) 46 | time.sleep(5) 47 | # 自动点击登陆按钮 48 | driver.find_element_by_id("login_button").click() 49 | time.sleep(5) 50 | # 让webdriver操纵当前页 51 | driver.switch_to.default_content() 52 | time.sleep(5) 53 | 54 | loginByPwd(driver, user, pw) 55 | 56 | # 跳到说说的url, friend你可以任意改成你想访问的空间 57 | driver.get("http://user.qzone.qq.com/" + friend + "/311") 58 | 59 | next_num = 0 # 初始“下一页”的id 60 | 61 | wb = xw.Book('qq_excel.xlsx') # 连接到当前工作目录中的现有文件 62 | sht = wb.sheets['Sheet1'] 63 | title = ['时间', '内容', '赞', '评论'] 64 | sht.range('A1').value = title 65 | indexD = 1 66 | 67 | # 开始爬取 68 | while True: 69 | # 下拉滚动条,使浏览器加载出动态加载的内容, 70 | # 我这里是从1开始到6结束 分5 次加载完每页数据 71 | for i in range(1, 6): 72 | height = 20000 * i # 每次滑动20000像素 73 | strWord = "window.scrollBy(0," + str(height) + ")" 74 | driver.execute_script(strWord) 75 | time.sleep(4) 76 | 77 | # 很多时候网页由多个或