0:
46 | job_statement = ''.join(
47 | [i.strip() for i in re.split(r'\n', re.sub('<[/]?\w+>', '', result[0].strip()))]) if \
48 | result[0] else ''
49 |
50 | self.job_info.append(job_statement)
51 | self.count = self.count + 1
52 | ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count, self.job_info, u're_51job招聘.xlsx')
53 | print '采集了{}条数据'.format(self.count)
54 | # Clear the list so it is ready for the next record
55 | self.job_info = []
56 | #
57 | #
58 | # if __name__ == '__main__':
59 | # x = SpiderRe2()
60 | # x.crawler_data()
61 |
--------------------------------------------------------------------------------
/51job爬虫/featch_51job/SpiderXpath.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | from lxml import etree
3 | from ExeclUtils import ExeclUtils
4 | from Spider import Spider
5 | import time
6 |
7 |
8 | class SpiderXpath(Spider):
9 |
10 | def __init__(self):
11 | super(SpiderXpath, self).__init__()
12 |
13 | def parse_job_list(self, text):
14 | try:
15 | f = etree.HTML(text)
16 | divs = f.xpath('//*[@id="resultList"]/div[@class="el"]')
17 | for div in divs:
18 | job_title = div.xpath('./p/span/a/@title')
19 | job_company = div.xpath('./span[1]/a/@title')
20 | job_address = div.xpath('./span[2]/text()')
21 | job_salary = div.xpath('./span[3]/text()')
22 | job_date = div.xpath('./span[4]/text()')
23 | job_href = div.xpath('./p/span/a/@href')
24 |
25 | job_title = job_title[0] if len(job_title) > 0 else ''
26 | job_company = job_company[0] if len(job_company) > 0 else ''
27 | job_address = job_address[0] if len(job_address) > 0 else ''
28 | job_salary = job_salary[0] if len(job_salary) > 0 else ''
29 | job_date = job_date[0] if len(job_date) > 0 else ''
30 | job_href = job_href[0] if len(job_href) > 0 else ''
31 |
32 | self.job_info.append(job_title)
33 | self.job_info.append(job_company)
34 | self.job_info.append(job_address)
35 | self.job_info.append(job_salary)
36 | self.job_info.append(job_date)
37 | self.job_info.append(job_href)
38 |
39 | self.request_job_detail(job_href)
40 | time.sleep(1)
41 | except Exception as e:
42 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message)
43 |
44 | def parse_job_detail(self, text):
45 | f = etree.HTML(text)
46 | try:
47 |             # Job description
48 | job_statements = f.xpath('//div[@class="bmsg job_msg inbox"]')
49 | job_statement = job_statements[0] if len(job_statements) > 0 else ''
50 | if job_statement != '':
51 | job_statement = job_statement.xpath('string(.)').strip().split('\n')[0]
52 | else:
53 | job_statement = '职位无明确描述'
54 | except Exception as e:
55 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message)
56 | job_statement = '职位无明确描述'
57 |
58 | self.job_info.append(job_statement)
59 | self.count = self.count + 1
60 | ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count, self.job_info, u'xpath_51job招聘.xlsx')
61 | print '采集了{}条数据'.format(self.count)
62 |         # Clear the list so it is ready for the next record
63 | self.job_info = []
64 | pass
65 |
66 | #
67 | # if __name__ == '__main__':
68 | # x = SpiderXpath2()
69 | # x.crawler_data()
70 |
--------------------------------------------------------------------------------
/51job爬虫/featch_51job/re_51job招聘.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/pythonchannel/spider_works/40f0e3b80a06b4f9109689ae7b1d104576d3655b/51job爬虫/featch_51job/re_51job招聘.xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # spider_works
2 | all spider source code
3 |
--------------------------------------------------------------------------------
/公众号爬虫/mp_spider1.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | import time
4 | import json
5 | import os
6 |
7 |
8 |
9 | class mp_spider(object):
10 |
11 | def __init__(self):
12 | self.offset = 10
13 |         self.base_url = 'hidden for privacy, handle it yourself'
14 |         self.headers = 'hidden for privacy, handle it yourself'
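        # NOTE: request_data() below calls self.base_url.format(self.offset) and passes
        # self.headers to requests.get(), so base_url is expected to be a format string with
        # an offset placeholder and headers a dict (mp_spider2.py shows real values).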
15 | def request_data(self):
16 | try:
17 | response = requests.get(self.base_url.format(self.offset), headers=self.headers)
18 | print(self.base_url.format(self.offset))
19 | if 200 == response.status_code:
20 | self.parse_data(response.text)
21 | except Exception as e:
22 | print(e)
23 | time.sleep(2)
24 | pass
25 |
26 | def parse_data(self, responseData):
27 |
28 | all_datas = json.loads(responseData)
29 |
30 |         if 0 == all_datas['ret']:
31 |
32 | summy_datas = all_datas['general_msg_list']
33 | datas = json.loads(summy_datas)['list']
34 | for data in datas:
35 | try:
36 | title = data['app_msg_ext_info']['title']
37 | title_child = data['app_msg_ext_info']['digest']
38 | article_url = data['app_msg_ext_info']['content_url']
39 | cover = data['app_msg_ext_info']['cover']
40 | print(title,title_child,article_url,cover)
41 | except Exception as e:
42 | print(e)
43 | continue
44 |
45 |
46 | print('----------------------------------------')
47 | time.sleep(3)
48 | self.offset = self.offset+10
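            # fetch the next page; note that this recursion will eventually hit Python's
            # recursion limit if the account has very many pages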
49 | self.request_data()
50 | else:
51 | print('抓取数据出错!')
52 |
53 |
54 |
55 | if __name__ == '__main__':
56 | d = mp_spider()
57 | d.request_data()
58 |
--------------------------------------------------------------------------------
/公众号爬虫/mp_spider2.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | import time
4 | import json
5 | import os
6 | import pdfkit
7 |
8 |
9 | class mp_spider(object):
10 |
11 | def __init__(self):
12 | self.config = pdfkit.configuration(wkhtmltopdf='C:/Program Files/wkhtmltopdf/bin/wkhtmltopdf.exe')
13 | self.offset = 0
14 | self.count = 0
15 | self.base_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzAwMjQwODIwNg==&f=json&offset={}&count=10&is_ok=1&scene=124&uin=MTIyOTkzMzgyMA%3D%3D&key=7cabb994f4d85a88ad37c1ec41ddde6234e76a1f1e69b178052bc99ccdf724f77700b28cea9e242cc98e517bd2537122fdc7a65a601e36f438b33e31e183f64dd9519beed36d892cc0a31855f1c649d6&pass_ticket=n6xnvQjzn4yfkjScc%2FSoVi4SkEgzf4z0airW6Ue14zIDNH98t%2Fr62k2KszUJ1qNv&wxtoken=&appmsg_token=960_mNI0W0CuVRuEpG7GsxB7f7pUUrO2CWW_iib4ww~~&x5=0&f=json'
16 | self.headers = {
17 | 'Host': 'mp.weixin.qq.com',
18 | 'Connection': 'keep-alive',
19 | 'Accept': '*/*',
20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.884.400 QQBrowser/9.0.2524.400',
21 | 'X-Requested-With': 'XMLHttpRequest',
22 | 'Referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5MTQ4NjA3Nw==&scene=124&uin=MjA2MDM3NTU%3D&key=2b903b9a7252346947b8c8bec6a8e97ea469a66c7c55196aec680d36fef8d99bdd51ba33c76a8d0e5655e5186714a09c18bdc873bdac2350ffd215c1d3cb331a3f67f0dcc00984035cbaacc19e1ef3e2&devicetype=Windows+10&version=62060344&lang=zh_CN&a8scene=7&pass_ticket=jAFRJRtWRdJcSXta5fiYsjBqfK6vqTIYWrULumuK5sc%3D&winzoom=1',
23 | 'Accept-Encoding': 'gzip, deflate',
24 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4',
25 | 'Cookie': 'wxuin=1229933820; devicetype=Windows10; version=6206021f; lang=zh_CN; pass_ticket=n6xnvQjzn4yfkjScc/SoVi4SkEgzf4z0airW6Ue14zIDNH98t/r62k2KszUJ1qNv; wap_sid2=CPyZvcoEElwzdm5YaDByenY3S2dzYlJtdXFDQVJYbmZKUERuM2I5elhMb3NxMVZqX3FCTDVYaFJ2Rkd2RktMdm9KajV3TWU5T3YyTTVfUG5zZ2llWko0cW5aMzBiY0FEQUFBfjCo9fLYBTgNQJVO'
26 | }
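        # NOTE: the uin/key/pass_ticket/appmsg_token values embedded in base_url and the
        # Cookie above are session-bound and expire; they normally have to be re-captured
        # (for example with a capturing proxy such as Fiddler or Charles) before each run.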
27 |
28 | def request_data(self):
29 | response = requests.get(self.base_url.format(self.offset), headers=self.headers)
30 | if 200 == response.status_code:
31 | self.parse_data(response.text)
32 |
33 | def parse_data(self, response_data):
34 |
35 | all_datas = json.loads(response_data)
36 |
37 | if 0 == all_datas['ret']:
38 | if 1 == all_datas['can_msg_continue']:
39 | summy_datas = all_datas['general_msg_list']
40 | datas = json.loads(summy_datas)['list']
41 | for data in datas:
42 | try:
43 | title = data['app_msg_ext_info']['title']
44 | title_child = data['app_msg_ext_info']['digest']
45 | article_url = data['app_msg_ext_info']['content_url']
46 | cover = data['app_msg_ext_info']['cover']
47 | copyright = data['app_msg_ext_info']['copyright_stat']
48 |                         copyright = '原创文章_' if copyright == 11 else '非原创文章_'  # '原创文章' = original article, '非原创文章' = non-original; the tag is prefixed to the PDF filename
49 | self.count = self.count + 1
50 | print('第【{}】篇文章'.format(self.count), copyright, title, title_child, article_url, cover)
51 | self.creat_pdf_file(article_url, '{}_{}'.format(copyright, title))
52 | except:
53 | continue
54 |
55 | time.sleep(3)
56 |                     self.offset = all_datas['next_offset']  # offset of the next page
57 | self.request_data()
58 | else:
59 | exit('数据抓取完毕!')
60 | else:
61 | exit('数据抓取出错:' + all_datas['errmsg'])
62 |
63 | def creat_pdf_file(self, url, title):
64 | try:
65 | file = 'D:/store/file2/{}.pdf'.format(title)
66 |             if not os.path.exists(file):  # skip files that already exist
67 | pdfkit.from_url(url, file)
68 |
69 | except Exception as e:
70 | print(e)
71 |
72 |
73 | if __name__ == '__main__':
74 | d = mp_spider()
75 | d.request_data()
76 |
77 |
--------------------------------------------------------------------------------
/公众号爬虫/mp_spider4.py:
--------------------------------------------------------------------------------
1 | import json
2 | from os import path
3 |
4 | import requests
5 | from scipy.misc import imread  # note: scipy.misc.imread was removed in SciPy 1.2+; imageio.imread is the usual replacement
6 | from wordcloud import WordCloud
7 |
8 | '''
9 | Extract the Douyin article's comments and build a word cloud from them
10 | '''
11 |
12 |
13 | class mp_spider(object):
14 |
15 | def __init__(self):
16 | self.offset = 0
17 | self.count = 0
18 | self.base_comment_url = 'https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz=MjM5NjA5NDczMg==&appmsgid=2652274724&idx=1&comment_id=303303606155886594&offset=0&limit=100&uin=MTIyOTkzMzgyMA%253D%253D&key=984e4c80c8bc7843fbc3177a66f8024c086af6b59a7ac97026e9f4db88fc49d0c26ce660040b865a3294ae651150d40227980433f1a5106b5a15261ad20d564aad1e8c6aa2dfda74fdd515af0bc77f1e&pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ%25252BBV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA%25252F&wxtoken=777&devicetype=Windows%26nbsp%3B10&clientversion=6206021f&appmsg_token=961_V5yXdClt1VInI19BnECwzmgi95G9e44nyElITVL5rKcbKbGDkLSLzLuTrUTO-TL3Zo_qNKEVSclPd8LG&x5=0&f=json'
19 | self.base_comment_header = {
20 | 'Host': 'mp.weixin.qq.com',
21 | 'Connection': 'keep-alive',
22 | 'Accept': '*/*',
23 | 'CSP': 'active',
24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400',
25 | 'X-Requested-With': 'XMLHttpRequest',
26 | 'Referer': 'https://mp.weixin.qq.com/s?__biz=MjM5NjA5NDczMg==&mid=2652274724&idx=1&sn=ad0bbb4461e20cdb5bb1563e6d20639d&chksm=bd0c56478a7bdf51db287ab8a6e054284f0a6aa9b475a3597e2f02c1a28a9ac0f085dab1820e&mpshare=1&scene=1&srcid=0603ZskndK5clppsBTw7kWWW&key=8799423f74e5608e8fddceb78f6442677bcc4589977665cb4aaf92376ab0b3acbf903998bd87428c0a2b8f8a724ce746d59882f43021889961664fd26aa68e05492d96213e1addea8cee62b98b6ebb76&ascene=1&uin=MTIyOTkzMzgyMA%3D%3D&devicetype=Windows+10&version=6206021f&lang=zh_CN&pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ%2BBV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA%2F&winzoom=1',
27 | 'Accept-Encoding': 'gzip, deflate',
28 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4',
29 | 'Cookie': 'rewardsn=; wxuin=1229933820; devicetype=Windows10; version=6206021f; lang=zh_CN; pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ+BV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA/; wap_sid2=CPyZvcoEElxMa0JKOS1tWHpPMFBlWFduNGRJbE9aUGFvNU9ja0poVXpKanpFSnVIQXpxbVUyVWNuZXlqQ2I3cDFvUmxlUGFIX2lFUDVGZ0dBTDBHRFFremh6Ml9vc0VEQUFBfjCikIrZBTgNQAE=; wxtokenkey=777'
30 | }
31 |
32 | def request_comment_data(self):
33 | response = requests.get(self.base_comment_url, headers=self.base_comment_header)
34 | if 200 == response.status_code:
35 | self.parse_comment_data(response.text)
36 |
37 | def parse_comment_data(self, response_data):
38 |
39 | all_datas = json.loads(response_data)
40 |
41 | if 0 == all_datas['base_resp']['ret']:
42 | all_comments = all_datas['elected_comment']
43 | with open('抖音毁掉.txt', 'a', encoding='utf-8') as f:
44 | for comments in all_comments:
45 | name = comments['nick_name']
46 | content = comments['content']
47 | print(name, content)
48 | try:
49 | f.write(content + "\n")
50 | except Exception as e:
51 | print(e)
52 | continue
53 |
54 | self.create_word_cloud('抖音毁掉')
55 | else:
56 | exit('数据抓取出错:' + all_datas['errmsg'])
57 |
58 |     def create_word_cloud(self, file_name):
59 |         d = path.dirname(__file__)  # __file__ is the current file
60 |
61 | text = open(path.join(d, '{}.txt'.format(file_name)), encoding='utf-8').read()
62 |         back_coloring = imread(path.join(d, 'douyin_bg.png'))  # background image used as the word-cloud mask
63 |
64 | wc = WordCloud(background_color="white",
65 |                        font_path=r'C:\Windows\Fonts\msyhl.ttc',  # Chinese-capable font so CJK words render
66 | max_words=5000,
67 | mask=back_coloring,
68 |                        # number of random states, i.e. how many colour schemes can be generated
69 | random_state=30)
70 | # generate word cloud
71 | wc.generate(text)
72 |
73 | # store to file
74 | wc.to_file(path.join(d, "alice.png"))
75 |
76 |
77 | if __name__ == '__main__':
78 | d = mp_spider()
79 | d.request_comment_data()
--------------------------------------------------------------------------------
/其它爬虫/book_py.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | """
4 | Scrape JD.com book reviews
5 |
6 | """
7 |
8 |
9 | import time
10 | from selenium import webdriver
11 | from lxml import etree
12 | import sys
13 | reload(sys)
14 | sys.setdefaultencoding( "utf-8" )
15 |
16 | class get_book(object):
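    # Note: the statements below sit directly in the class body, so they run once when the
    # class definition is executed (at import/run time); get_book() at the bottom merely
    # creates an instance after the scraping has already happened.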
17 |     # Launch the Firefox browser driver
18 | driver = webdriver.Firefox()
19 |
20 |     # Maximize the browser window
21 | driver.maximize_window()
22 |
23 |     # Navigate to the JD book page (review section)
24 | driver.get("https://item.jd.com/11993134.html#comment")
25 |
26 |     # Switch to the reviews tab
27 | driver.find_element_by_xpath('//*[@id="detail-tab-comm"]/a').click()
28 |
29 | while True:
30 |         # Scroll down in 2 passes (i = 1, 2) so each page of reviews finishes loading
31 | for i in range(1,3):
32 |             height = 20000*i  # scroll 20000 pixels each time
33 | strWord = "window.scrollBy(0,"+str(height)+")"
34 | driver.execute_script(strWord)
35 | time.sleep(4)
36 |
37 | selector = etree.HTML(driver.page_source)
38 | divs = selector.xpath('//*[@id="comment-0"]/div[1]/div/div')
39 |
40 |         # mode='a': append without truncating the file
41 | with open('python_book.txt','a') as f:
42 | for div in divs:
43 |                 jd_comment = div.xpath('./div[2]/div[1]/text()')
44 |                 jd_comment = jd_comment[0] if len(jd_comment) > 0 else ''
45 |                 f.write(jd_comment + '\n')
46 |
47 |         # On the last page the ui-pager-next element disappears, so stop there
48 | if driver.page_source.find('ui-pager-next') == -1:
49 | break
50 |
51 |         # Find the "next page" button element
52 | driver.find_element_by_class_name('ui-pager-next').click()
53 |
54 |         # The next iteration scrolls the page again first, so switch back to the parent frame
55 | driver.switch_to.parent_frame()
56 |
57 | if __name__=='__main__':
58 | get_book()
59 |
60 |
--------------------------------------------------------------------------------
/其它爬虫/download_video.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import datetime
3 | import os
4 | import threading
5 | import time
6 | from contextlib import closing
7 |
8 | import requests
9 | from lxml import etree
10 | from selenium import webdriver
11 | from selenium.webdriver.common.by import By
12 | from selenium.webdriver.support.ui import WebDriverWait
13 | from selenium.webdriver.support import expected_conditions as EC
14 |
15 |
16 | class VideoDown(object):
17 |
18 | def __init__(self):
19 | self.first_position = 0
20 | self.count = 0
21 | self.threads = []
22 | self.content = []
23 |
24 | def load_data(self):
25 |
26 | video_url = 'http://neihanshequ.com/video/'
27 |         driver = webdriver.Firefox()  # launch the Firefox driver
28 | driver.maximize_window()
29 |         driver.implicitly_wait(10)  # implicit wait, giving the browser time to respond
30 | driver.get(video_url)
31 |
32 | while True:
33 | try:
34 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, 'loadMore')))
35 | except Exception as e:
36 | print e.message
37 | break
38 |
39 | js = 'window.scrollTo(0,document.body.scrollHeight)'
40 | driver.execute_script(js)
41 | time.sleep(10)
42 |
43 | source = etree.HTML(driver.page_source)
44 | divs = source.xpath('//*[@id="detail-list"]/li')
45 |
46 | for div in divs:
47 | self.count = self.count + 1
48 | print '第{}条数据'.format(str(self.count))
49 | title = div.xpath('./div/div[2]/a/div/p/text()')
50 | v_url = div.xpath('.//*[@class="player-container"]/@data-src')
51 |                 title = title[0] if len(title) > 0 else '无介绍'
52 | v_url = v_url[0] if len(v_url) > 0 else ''
53 | self.do_thread(title, v_url)
54 |
55 | try:
56 | load_more = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, 'loadMore')))
57 | load_more.click()
58 | time.sleep(10)
59 | except Exception as e:
60 | print e.message
61 | break
62 |
63 | def do_thread(self, title, url):
64 | t = threading.Thread(target=self.down_video, args=(title, url))
65 | self.threads.append(t)
66 | t.start()
67 |
68 | for tt in self.threads:
69 | tt.join()
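        # NOTE: starting one thread per video but joining every thread right here means each
        # call blocks until the download finishes, so downloads effectively run one at a time.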
70 |
71 | def down_video(self, title, url):
72 | try:
73 | with closing(requests.get(url, stream=True)) as response:
74 | print url
75 | chunk_size = 1024
76 | content_size = int(response.headers['content-length'])
77 |
78 | video_path = u'D:/store/video00'
79 |                 # Create the download folder if it does not exist yet
80 | if not os.path.exists(video_path):
81 | os.makedirs(video_path)
82 |
83 | file_name = video_path + u'/{}.mp4'.format(self.count)
84 | if os.path.exists(file_name) and os.path.getsize(file_name) == content_size:
85 | print(u'跳过' + file_name)
86 | else:
87 | down = DownProgress(title, content_size)
88 | with open(file_name, "wb") as f:
89 | for data in response.iter_content(chunk_size=chunk_size):
90 | f.write(data)
91 |
92 | down.refresh_down(len(data))
93 | except Exception as e:
94 | print e.message
95 |
96 |
97 | class DownProgress(object):
98 | def __init__(self, file_name, file_size):
99 | self.file_name = file_name
100 | self.file_down = 0
101 | self.file_size = file_size
102 |
103 | def refresh_down(self, down):
104 | self.file_down = self.file_down + down
105 | progress = (self.file_down / float(self.file_size)) * 100.0
106 | status = u'下载完成' if self.file_down >= self.file_size else u'正在下载...'
107 | print u'文件名称:{},下载进度:{},下载状态:{}'.format(self.file_name, '%.2f' % progress, status)
108 |
109 |
110 | if __name__ == '__main__':
111 | startTime = datetime.datetime.now()
112 | down = VideoDown()
113 | down.load_data()
114 | endTime = datetime.datetime.now()
115 | print '下载花费时间{}秒'.format((endTime - startTime).seconds)
116 |
--------------------------------------------------------------------------------
/其它爬虫/fenng_py.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | """
4 | Scrape Feng Dahui's Weibo posts
5 |
6 | """
7 |
8 |
9 | import time
10 | from selenium import webdriver
11 | from lxml import etree
12 | import wordcloud as wcp  # presumably a local helper module exposing create_word_cloud; the PyPI wordcloud package has no such function
13 | import sys
14 |
15 |
16 | reload(sys)
17 | # The default character encoding has to be set here
18 | sys.setdefaultencoding( "utf-8" )
19 |
20 | def get_content(f_name):
21 |
22 |     # Your Weibo account credentials
23 | username = '******'
24 | psd = '******'
25 |
26 |     # Launch the Firefox driver
27 | driver = webdriver.Firefox()
28 |
29 |     # Maximize the browser window
30 | driver.maximize_window()
31 |
32 | driver.get('http://weibo.com/login.php')
33 | print('login............................')
34 |
35 |     # Fill in the username and password fields
36 | driver.find_element_by_id('loginname').send_keys(username)
37 | driver.find_element_by_class_name('password').find_element_by_name('password').send_keys(psd)
38 |
39 |     # Click the login button
40 | driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a/span').click()
41 |
42 |     # Give the login a moment to finish before navigating to another page
43 | time.sleep(3)
44 |
45 |     # After logging in, point the browser at Feng Dahui's Weibo feed page (URL looked up by hand, no trick involved)
46 | driver.get("https://weibo.com/p/1005051577826897/home?from=page_100505_profile&wvr=6&mod=data&is_all=1#place")
47 |
48 | while True:
49 |         # Scroll down in 5 passes (i = 1..5) so each page of posts finishes loading
50 | for i in range(1,6):
51 |             height = 20000*i  # scroll 20000 pixels each time
52 | strWord = "window.scrollBy(0,"+str(height)+")"
53 | driver.execute_script(strWord)
54 | time.sleep(4)
55 |
56 | selector = etree.HTML(driver.page_source)
57 | divs = selector.xpath('//*[@id="Pl_Official_MyProfileFeed__22"]/div/div/div[1]/div[4]')
58 |
59 |         # mode='a': append without truncating the file
60 | with open('{}.txt'.format(f_name),'a') as f:
61 | for div in divs:
62 | wb_content = div.xpath('./div[3]/text()')
63 | wb_time = div.xpath('./div[2]/a/text()')
64 | wb_content = wb_content[0] if len(wb_content) > 0 else ''
65 | wb_time = wb_time[0] if len(wb_time) > 0 else ''
66 |                 wb_content = wb_content.strip()  # strip leading/trailing whitespace
67 | wb_time = wb_time.strip()
68 | print wb_content, wb_time
69 | f.write(wb_content+'\n')
70 |
71 |         # On the last page the "page next S_txt1 S_line1" element disappears, so stop there
72 | if driver.page_source.find('page next S_txt1 S_line1') == -1:
73 | print '没有下一页了'
74 | break
75 |
76 |         # Find the "next page" button; locating it by xpath and by class name both failed,
77 |         # so a CSS selector is used: "page next S_txt1 S_line1" with '.' joining the classes in place of spaces
78 | submit = driver.find_element_by_css_selector('.page.next.S_txt1.S_line1')
79 | submit.click()
80 |
81 | if __name__ == '__main__':
82 | f_name = 'ddddd'
83 | get_content(f_name)
84 | wcp.create_word_cloud(f_name)
85 |
--------------------------------------------------------------------------------
/其它爬虫/qq_send_word.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | """
4 | Scrape QQ Shuoshuo (Qzone status) posts
5 |
6 | """
7 |
8 |
9 | import time
10 | from selenium import webdriver
11 | from lxml import etree
12 |
13 | # The encoding must be set here, otherwise writing to the file later raises an error
14 | import sys
15 | reload(sys)
16 | sys.setdefaultencoding( "utf-8" )
17 |
18 | friend = '563679994'  # friend's QQ number; their Qzone must allow you access
19 | user = '563679994'  # your QQ number
20 | pw = 'XXX'  # your QQ password
21 |
22 | # Launch the Firefox driver
23 | driver = webdriver.Firefox()
24 |
25 | # Maximize the browser window
26 | driver.maximize_window()
27 |
28 | # Open the QQ login page
29 | driver.get("http://i.qq.com")
30 |
31 | # The login form lives inside a frame; switch to it or the elements below cannot be found
32 | driver.switch_to.frame("login_frame")
33 |
34 | # Click the account/password login option
35 | driver.find_element_by_id("switcher_plogin").click()
36 |
37 | # Type the QQ number into the account field
38 | driver.find_element_by_id("u").send_keys(user)
39 |
40 | # Type the password into the password field
41 | driver.find_element_by_id("p").send_keys(pw)
42 |
43 | # Click the login button
44 | driver.find_element_by_id("login_button").click()
45 |
46 | # Switch webdriver back to the top-level page
47 | driver.switch_to.default_content()
48 |
49 | # Go to the Shuoshuo URL; change friend to any Qzone you are allowed to visit
50 | driver.get("http://user.qzone.qq.com/" + friend + "/311")
51 |
52 | driver.find_element_by_xpath('//*[@id="QM_Mood_Poster_Container"]/div/div[4]/div[4]/a[2]').click()
--------------------------------------------------------------------------------
/其它爬虫/qq_word.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 |
3 | """
4 | Scrape QQ Shuoshuo (Qzone status) posts
5 |
6 | """
7 |
8 |
9 | import time
10 | from selenium import webdriver
11 | from lxml import etree
12 |
13 | # The encoding must be set here, otherwise writing to the file later raises an error
14 | import sys
15 | reload(sys)
16 | sys.setdefaultencoding( "utf-8" )
17 |
18 | friend = 'XXX'  # friend's QQ number; their Qzone must allow you access
19 | user = 'XXX'  # your QQ number
20 | pw = 'XXXX'  # your QQ password
21 |
22 | # Launch the Firefox driver
23 | driver = webdriver.Firefox()
24 |
25 | # Maximize the browser window
26 | driver.maximize_window()
27 |
28 | # Open the QQ login page
29 | driver.get("http://i.qq.com")
30 |
31 | # The login form lives inside a frame; switch to it or the elements below cannot be found
32 | driver.switch_to.frame("login_frame")
33 |
34 | # Click the account/password login option
35 | driver.find_element_by_id("switcher_plogin").click()
36 |
37 | # Type the QQ number into the account field
38 | driver.find_element_by_id("u").send_keys(user)
39 |
40 | # Type the password into the password field
41 | driver.find_element_by_id("p").send_keys(pw)
42 |
43 | # Click the login button
44 | driver.find_element_by_id("login_button").click()
45 |
46 | # Switch webdriver back to the top-level page
47 | driver.switch_to.default_content()
48 |
49 | # Go to the Shuoshuo URL; change friend to any Qzone you are allowed to visit
50 | driver.get("http://user.qzone.qq.com/" + friend + "/311")
51 |
52 | next_num = 0  # initial id of the "next page" button
53 | while True:
54 |
55 |     # Scroll the page down so the dynamically loaded content is rendered;
56 |     # here i runs from 1 to 5, loading each page's data in 5 passes
57 | for i in range(1,6):
58 |         height = 20000*i  # scroll 20000 pixels each time
59 | strWord = "window.scrollBy(0,"+str(height)+")"
60 | driver.execute_script(strWord)
61 | time.sleep(4)
62 |
63 |     # A page is often made up of multiple frames or