├── README.md
├── chromedriver.exe
├── draw.py
├── home.png
├── qq_excel.xlsx
├── qq_word.txt
├── qq空间excel.py
├── qq空间txt.py
├── 产生图片.py
├── 各种图.py
├── 每年发表说说总数柱状图.html
└── 每年点赞和评论折现图.html
/README.md:
--------------------------------------------------------------------------------
1 | ## python爬取网页版QQ空间,生成词云图、柱状图、折线图
2 |
3 | 最近python课程学完了,琢磨着用python做点什么东西,经过一番搜索,盯上了QQ空间,拿走不谢,欢迎点赞收藏,记得github给个star!
4 | 
5 |
6 | - 爬取的说说内容
7 |
8 | 
9 |
10 | - 个性化说说内容词云图
11 |
12 | 
13 |
14 | - 每年发表说说总数柱状图、每年点赞和评论折线图
15 |
16 | 
17 |
18 | - 7天好友动态柱状图、饼图
19 |
20 | 
21 |
22 | - 按照你的谷歌浏览器下载指定版本的驱动 http://chromedriver.storage.googleapis.com/index.html
23 | 
24 | 
25 | - 驱动跟两个python脚本放入同目录,我的版本是90.0.4430的,查看你自己的版本,下载后把我的chromedriver.exe替换掉!
26 |
27 | 
28 |
29 | - 这里用到了很多第三方包,鼠标放在报红的包名下,用Alt+Enter导包,如果失败则在控制台用下面的必杀技
30 |
31 | ```
32 | pip install 包名 -i http://pypi.douban.com/simple/ --trusted-host pypi.douban.com
33 | ```
34 | 
35 |
36 | github源码地址:https://github.com/kuishou68/python
37 |
38 | 记得给个star喲!
39 |
40 | 
--------------------------------------------------------------------------------
/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kuishou68/python/4c0b411507a6ada7b33b4781378df7ec9845013c/chromedriver.exe
--------------------------------------------------------------------------------
/draw.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import calendar
3 | import matplotlib.pyplot as plt
4 |
# Per-day post counters for the last 7 days, keyed by formatted date string.
statistics = {}
# Reference moment that every relative date below is computed from.
today = datetime.datetime.now()
# February has 29 days in a leap year and 28 otherwise.
Feb = 29 if calendar.isleap(today.year) else 28
# Number of days in each month of the current year, January first.
month = (31, Feb) + (31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
# Template for the date keys: year, month, day.
s = '{}年{}月{}日'
19 | #初始化statistics
def init_times(now=None, stats=None, fmt=None):
    """Seed the counter dict with a zero entry for each of the last 7 days.

    Keys are inserted oldest first and end with the reference day itself, so
    iterating the dict yields the days in chronological order.

    Args:
        now: Date or datetime the 7-day window ends on. Defaults to the
            module-level ``today``.
        stats: Dict to populate. Defaults to the module-level ``statistics``.
        fmt: ``str.format`` template taking year, month and day. Defaults to
            the module-level ``s``.

    Returns:
        The populated dict.
    """
    now = today if now is None else now
    stats = statistics if stats is None else stats
    fmt = s if fmt is None else fmt
    # timedelta handles month and year rollover. The previous hand-written
    # branch shadowed the module-level ``month`` table with a local and used
    # the current month instead of the previous one.
    for days_back in range(6, -1, -1):
        day = now - datetime.timedelta(days=days_back)
        stats[fmt.format(day.year, day.month, day.day)] = 0
    return stats
47 |
def count(qq_time, now=None, stats=None, fmt=None):
    """Tally one post timestamp into the per-day counters.

    The timestamp text is mapped to a date key as follows:
      * contains '昨' -> the day before the reference day;
      * contains '前' -> two days before the reference day;
      * contains neither '天' nor '日' (a bare clock time) -> the reference day;
      * anything else is used as the key unchanged.
    The counter is incremented only when the key already exists, so dates
    outside the tracked window are ignored.

    Args:
        qq_time: Timestamp text of one post, without the trailing newline.
        now: Date or datetime that relative words are resolved against.
            Defaults to the module-level ``today``.
        stats: Dict of counters to update. Defaults to the module-level
            ``statistics``.
        fmt: ``str.format`` template taking year, month and day. Defaults to
            the module-level ``s``.
    """
    now = today if now is None else now
    stats = statistics if stats is None else stats
    fmt = s if fmt is None else fmt

    if '昨' in qq_time:
        days_back = 1
    elif '前' in qq_time:
        days_back = 2
    elif '天' not in qq_time and '日' not in qq_time:
        days_back = 0
    else:
        # Already an explicit date (or an unrecognised form): keep as is.
        days_back = None

    if days_back is not None:
        # timedelta takes care of month, year and leap-day boundaries.
        day = now - datetime.timedelta(days=days_back)
        qq_time = fmt.format(day.year, day.month, day.day)

    if qq_time in stats:
        stats[qq_time] += 1
# Seed the seven daily counters before tallying.
init_times()
# Tally every timestamp line read from the crawled text file.
with open('qq_word.txt', 'r', encoding='utf-8') as lines:
    for line in lines:
        count(line.split('\n')[0])

# Plotting. SimHei is selected so the Chinese labels render.
plt.rcParams['font.family'] = 'SimHei'
# x axis: the date keys; y axis: the number of posts on each date.
x_data = date = list(statistics.keys())
y_data = list(statistics.values())
total = sum(y_data)

# Two subplots side by side: bar chart on the left, pie chart on the right.
fig, ax = plt.subplots(1, 2, figsize=(16, 12))
# Print each count above its bar.
for x, y in zip(x_data, y_data):
    ax[0].text(x, y, y, fontsize=14, horizontalalignment='center')
# Leave one unit of headroom above the tallest bar.
ax[0].set_ylim([0, max(y_data) + 1])
ax[0].bar(x_data, y_data)
# Tilt the date labels.
ax[0].tick_params(axis='x', rotation=20)
ax[0].set_title('近7天好友动态统计', fontsize=12)
ax[0].set_xlabel('日期')
ax[0].set_ylabel('条')

if total:
    # Pass the raw counts: pie() computes the shares itself, which avoids the
    # error introduced by pre-rounding each fraction to one decimal.
    ax[1].pie(y_data, labels=date, autopct='%1.1f%%')
else:
    # No posts in the window: there are no shares to draw, and dividing by
    # the zero total would raise ZeroDivisionError.
    ax[1].text(0.5, 0.5, '近7天暂无动态', fontsize=14,
               horizontalalignment='center', verticalalignment='center')
    ax[1].axis('off')
# Display the figure.
plt.show()
--------------------------------------------------------------------------------
/home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kuishou68/python/4c0b411507a6ada7b33b4781378df7ec9845013c/home.png
--------------------------------------------------------------------------------
/qq_excel.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kuishou68/python/4c0b411507a6ada7b33b4781378df7ec9845013c/qq_excel.xlsx
--------------------------------------------------------------------------------
/qq_word.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kuishou68/python/4c0b411507a6ada7b33b4781378df7ec9845013c/qq_word.txt
--------------------------------------------------------------------------------
/qq空间excel.py:
--------------------------------------------------------------------------------
1 | import time
2 | import urllib
3 |
4 | from selenium import webdriver
5 | from lxml import etree
6 | import xlwings as xw
7 | # 这里一定要设置编码格式,防止后面写入文件时报错
8 |
import os

# Account settings. Each value can be overridden through an environment
# variable so that real credentials do not have to live in this file.
# NOTE: the literal fallbacks are kept only so existing usage keeps working;
# prefer the environment variables, and change any password that has been
# committed to version control.
# QQ number of the friend whose space is crawled; it must be visible to you.
friend = os.environ.get('QQ_FRIEND', '1569339843')
# Your own QQ number.
user = os.environ.get('QQ_USER', '783533896')
# Your QQ password.
pw = os.environ.get('QQ_PASSWORD', '1323mkoNJI.@')
12 |
# Create the browser driver from the ChromeDriver binary at this relative path.
chrome_driver = 'chromedriver.exe'
# NOTE(review): the `executable_path` keyword is deprecated in Selenium 4 in
# favour of a Service object -- confirm the installed Selenium accepts it.
driver = webdriver.Chrome(executable_path=chrome_driver)

# Maximize the browser window.
driver.maximize_window()
19 |
def ParseCookiestr(cookie_str):
    """Parse a ``name=value; name2=value2`` cookie string into cookie dicts.

    Args:
        cookie_str: Cookie pairs separated by ';'.

    Returns:
        A list of ``{'name': ..., 'value': ...}`` dicts in input order, with
        each value percent-decoded. Empty segments (for example after a
        trailing ';') are skipped; a segment without '=' yields an empty value.
    """
    # Imported here because a bare ``import urllib`` does not guarantee that
    # the ``urllib.parse`` submodule has been loaded.
    from urllib.parse import unquote

    cookielist = []
    for item in cookie_str.split(';'):
        # Split on the first '=' only, so values that themselves contain '='
        # (e.g. base64 padding) are kept whole.
        name, _, value = item.strip().partition('=')
        if not name:
            continue
        cookielist.append({'name': name, 'value': unquote(value)})
    return cookielist
30 |
31 | # 进入QQ空间
def loginByPwd(driver, user, pw):
    """Log in to QQ space with an account number and password.

    Opens the login page, switches into the login iframe, selects the
    account/password login mode, fills in both fields, submits the form and
    then switches back to the top-level document. Fixed pauses separate the
    steps so the page can react before the next action.

    Args:
        driver: Selenium WebDriver instance to drive.
        user: QQ account number typed into the account field.
        pw: Password typed into the password field.
    """
    # find_element(By.ID, ...) works on Selenium 3 and 4; the
    # find_element_by_id shortcut was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # Open the QQ login page.
    driver.get("http://i.qq.com")
    # The login form lives in an iframe; it must be selected first, otherwise
    # the elements below cannot be found.
    driver.switch_to.frame("login_frame")
    time.sleep(3)
    # Switch to the account/password login mode.
    driver.find_element(By.ID, "switcher_plogin").click()
    time.sleep(3)
    # Type the account number.
    driver.find_element(By.ID, "u").send_keys(user)
    time.sleep(5)
    # Type the password.
    driver.find_element(By.ID, "p").send_keys(pw)
    time.sleep(5)
    # Submit the form.
    driver.find_element(By.ID, "login_button").click()
    time.sleep(5)
    # Return control to the top-level page.
    driver.switch_to.default_content()
    time.sleep(5)
53 |
# Sign in first, then open the feed page of the target space; change
# `friend` to visit a different space.
loginByPwd(driver, user, pw)
driver.get("http://user.qzone.qq.com/{}/311".format(friend))

# Header row for the sheet.
title = ['时间', '内容', '赞', '评论']
# Attach to the existing workbook in the current working directory.
wb = xw.Book('qq_excel.xlsx')
sht = wb.sheets['Sheet1']
sht.range('A1').value = title

# Initial id of the "next page" control, and a counter starting at 1
# (presumably a row index for the sheet -- confirm against the crawl loop).
next_num, indexD = 0, 1
66 |
67 | # 开始爬取
68 | while True:
69 | # 下拉滚动条,使浏览器加载出动态加载的内容,
70 | # 我这里是从1开始到6结束 分5 次加载完每页数据
71 | for i in range(1, 6):
72 | height = 20000 * i # 每次滑动20000像素
73 | strWord = "window.scrollBy(0," + str(height) + ")"
74 | driver.execute_script(strWord)
75 | time.sleep(4)
76 |
77 | # 很多时候网页由多个或