├── 51job爬虫 └── featch_51job │ ├── ExeclUtils.py │ ├── Main.py │ ├── QQ截图20180318220606.png │ ├── README.md │ ├── Spider.py │ ├── SpiderBs4.py │ ├── SpiderRe.py │ ├── SpiderXpath.py │ └── re_51job招聘.xlsx ├── README.md ├── 公众号爬虫 ├── mp_spider1.py ├── mp_spider2.py └── mp_spider4.py ├── 其它爬虫 ├── book_py.py ├── download_video.py ├── fenng_py.py ├── qq_send_word.py ├── qq_word.py ├── weibo_py.py ├── weibo_py2.py ├── word_cloud_py.py ├── zsxq_5_pdf.py ├── 我主良缘爬虫.py ├── 爬取QQ.py └── 爬取冯大辉微博.py ├── 得到爬虫 └── dedao_App │ ├── ExeclUtils.py │ ├── QQ截图20180605204125.png │ ├── QQ截图20180605204150.png │ ├── __pycache__ │ ├── ExeclUtils.cpython-36.pyc │ └── __init__.cpython-36.pyc │ ├── dedaoSpider.py │ └── 逻辑思维音频.xlsx └── 拉勾爬虫 └── lagouSpider-master ├── QQ截图20180525142031.png ├── QQ截图20180525142111.png ├── lagouSpider ├── dbtools.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── lagou_crawl.py ├── readme.md └── scrapy.cfg /51job爬虫/featch_51job/ExeclUtils.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import xlwt 4 | 5 | import sys 6 | 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | 10 | ''' 11 | 这里是操作execl的工具类,以后也可以直接复用 12 | 方法调用SpiderUtils.create_excecl(...) 13 | 14 | ''' 15 | 16 | 17 | class ExeclUtils(object): 18 | 19 | @staticmethod 20 | def create_execl(sheet_name, row_titles): 21 | ''' 22 | 创建execl文件与sheet表,并创建他们的第一行标题 23 | :param sheet_name: execl中sheet_name文件的名称 24 | :param row_titles: execl文件的标题行 25 | :return: execl_file,execl_sheet 26 | ''' 27 | 28 | f = xlwt.Workbook() 29 | sheet_info = f.add_sheet(sheet_name, cell_overwrite_ok=True) 30 | for i in range(0, len(row_titles)): 31 | sheet_info.write(0, i, row_titles[i]) 32 | 33 | return f, sheet_info 34 | 35 | @staticmethod 36 | def write_execl(execl_file, execl_sheet, count, data, execl_name): 37 | ''' 38 | 把数据写入到execl中.这是一个静态方法 39 | 注意:这里所有的数据都不要写死,方便复用. 
40 | :param execl_file: 传入一个execl文件 41 | :param execl_sheet: 传入一个execl_sheet表 42 | :param count: execl文件的行数 43 | :param data: 要传入的一条数据 44 | :param execl_name: execl文件名 45 | :return: None 46 | ''' 47 | for j in range(len(data)): 48 | execl_sheet.write(count, j, data[j]) 49 | 50 | execl_file.save(execl_name) 51 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/Main.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from SpiderBs4 import SpiderBs4 4 | from SpiderRe import SpiderRe 5 | from SpiderXpath import SpiderXpath 6 | 7 | '''爬虫入口''' 8 | 9 | 10 | class Main(object): 11 | 12 | # 创建一个静态方法 13 | @staticmethod 14 | def select_type(): 15 | type = input('请输入你先选择的爬虫类型:\n1.xpath爬取数据\n2.正则爬取数据 \n3.bs4爬取数据 \n默认使用xpath提取数据\n你的输入是:') 16 | if type == 1: 17 | print '选择了xpath爬取数据\n\n' 18 | xpath = SpiderXpath() 19 | xpath.crawler_data() 20 | elif type == 2: 21 | print '选择了正则爬取数据\n\n' 22 | xpath = SpiderRe() 23 | xpath.crawler_data() 24 | elif type == 3: 25 | print '选择了bs4爬取数据\n\n' 26 | bs4 = SpiderBs4() 27 | bs4.crawler_data() 28 | else: 29 | print '选择了xpath爬取数据\n\n' 30 | xpath = SpiderXpath() 31 | xpath.crawler_data() 32 | 33 | 34 | if __name__ == '__main__': 35 | Main().select_type() 36 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/QQ截图20180318220606.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonchannel/spider_works/40f0e3b80a06b4f9109689ae7b1d104576d3655b/51job爬虫/featch_51job/QQ截图20180318220606.png -------------------------------------------------------------------------------- /51job爬虫/featch_51job/README.md: -------------------------------------------------------------------------------- 1 | ### 2 | # 本项目使用py2.7环境  3 |   4 | ### 分别用Xpath,bs4,正则三种方式获取51job关于Python的招聘信息 5 | 6 | 7 | 8 | `Spider`是三种爬取方式的基类,这里有请求招聘数据列表与工作详情的请求信息. 9 | 10 | 另外里面创建了解析列表与详情数据的抽象类,然后在子类里面对抽象类进行实现 11 | 12 | 13 | 14 | `ExeclUtils`这是一个操作Execl的工具类,评分有创建Execl的sheet表格与对表格进行写入数据. 
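A minimal usage sketch of `ExeclUtils` (Python 2.7, matching this project), based on the two methods shown above; the sheet name, column titles, file name and rows below are illustrative only, not taken from the spiders:

```python
# coding:utf-8
from ExeclUtils import ExeclUtils

# Create the workbook and sheet; create_execl writes the titles into row 0.
row_titles = [u'招聘标题', u'公司名称', u'待遇']
execl_file, sheet = ExeclUtils.create_execl(u'demo_sheet', row_titles)

# Write data rows one by one; `count` is the row index to write into,
# and write_execl saves the file after every row.
rows = [
    [u'Python开发', u'某公司A', u'10-15k'],
    [u'爬虫工程师', u'某公司B', u'12-20k'],
]
for count, row in enumerate(rows, 1):
    ExeclUtils.write_execl(execl_file, sheet, count, row, u'demo.xlsx')
```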
15 | 16 | 17 | 18 | ### 效果图 19 | 20 | ![效果图](https://github.com/pythonchannel/fetch_51job/blob/master/QQ%E6%88%AA%E5%9B%BE20180318220606.png) 21 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/Spider.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import time 3 | import abc 4 | import requests 5 | from ExeclUtils import ExeclUtils 6 | 7 | ''' 8 | 这是爬虫的抽象类, 9 | xpath,bs4,re 三种爬虫方式都继承这个类 10 | 因为所有的请求列表与详情是通用的,所以我这里把请求数据都放在基类中 11 | 然后调用爬取方式,爬取方式在子类中实现 12 | 13 | ''' 14 | 15 | 16 | class Spider(object): 17 | # 定义一个抽象类 18 | __metaclass__ = abc.ABCMeta 19 | 20 | def __init__(self): 21 | self.rows_title = [u'招聘标题', u'公司名称', u'公司地址', u'待遇', u'发布日期', u'招聘链接', u'招聘要求描述'] 22 | sheet_name = u'51job_Python招聘' 23 | return_execl = ExeclUtils.create_execl(sheet_name, self.rows_title) 24 | self.execl_f = return_execl[0] 25 | self.sheet_table = return_execl[1] 26 | self.job_info = [] # 存放每一条数据中的各元素, 27 | self.count = 0 # 数据插入从1开始的 28 | 29 | def crawler_data(self): 30 | ''' 31 | 开始爬取数据 32 | :return: 33 | ''' 34 | for i in range(1, 5): 35 | url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format( 36 | i) 37 | self.request_job_list(url) 38 | # 采集不要太快了,否则容易造成ip被封或者网络请求失败 39 | time.sleep(2) 40 | 41 | def request_job_list(self, page_url): 42 | ''' 43 | 获取工作列表 44 | :param page_url: 45 | :return: 46 | ''' 47 | try: 48 | headers = { 49 | 'Referer': 'http://www.51job.com/', 50 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4549.400 QQBrowser/9.7.12900.400' 51 | } 52 | response = requests.get(page_url, headers=headers) 53 | response.encoding = 'gbk' 54 | # 如果请求失败,则不能继续进行 55 | if response.status_code != 200: 56 | return 57 | self.parse_job_list(response.text) 58 | except Exception as e: 59 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 60 | 61 | @abc.abstractmethod 62 | def parse_job_list(self, text): 63 | ''' 64 | 解析工作列表的抽象类,具体实现在子类中 65 | :param text: 66 | :return: 67 | ''' 68 | pass 69 | 70 | def request_job_detail(self, job_href): 71 | ''' 72 | 获取工作详情 73 | :param job_href: 招聘工作的链接 74 | :return: 75 | ''' 76 | try: 77 | headers = { 78 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4549.400 QQBrowser/9.7.12900.400' 79 | } 80 | response = requests.get(job_href, headers=headers) 81 | response.encoding = 'gbk' 82 | # 如果请求失败,则不能继续进行 83 | if response.status_code != 200: 84 | return '' 85 | 86 | self.parse_job_detail(response.text) 87 | 88 | except Exception as e: 89 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 90 | 91 | @abc.abstractmethod 92 | def parse_job_detail(self, text): 93 | ''' 94 | 定义工作详情的抽象类 95 | :param text: 96 | :return: 97 | ''' 98 | pass 99 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/SpiderBs4.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from bs4 import BeautifulSoup 3 | from ExeclUtils import ExeclUtils 4 | from Spider import Spider 5 | import time 6 | 7 | 8 | class SpiderBs4(Spider): 9 | 10 | def __init__(self): 11 | super(SpiderBs4, self).__init__() 
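    # parse_job_list (below) pulls each listing row with
    # soup.select('div.dw_table > div.el')[1:] -- the [1:] skips the first
    # .el element, which appears to be the column-header row of the result
    # list -- then reads title/href from 'p.t1 span a', the company from
    # 'span.t2 a', and address/salary/date from 'span.t3'/'span.t4'/'span.t5'.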
12 | 13 | def parse_job_list(self, text): 14 | try: 15 | soup = BeautifulSoup(text, 'html.parser') 16 | results = soup.select('div.dw_table > div.el')[1:] 17 | for result in results: 18 | job_title = result.select('p.t1 span a') 19 | job_href = result.select('p.t1 span a') 20 | job_company = result.select('span.t2 a') 21 | job_address = result.select('span.t3') 22 | job_salary = result.select('span.t4') 23 | job_date = result.select('span.t5') 24 | 25 | job_title = job_title[0].attrs['title'] if len(job_title) > 0 else '' 26 | job_href = job_href[0].attrs['href'] if len(job_href) > 0 else '' 27 | job_company = job_company[0].attrs['title'] if len(job_company) > 0 else '' 28 | job_address = job_address[0].text if len(job_address) > 0 else '' 29 | job_salary = job_salary[0].text if len(job_salary) > 0 else '' 30 | job_date = job_date[0].text if len(job_date) > 0 else '' 31 | 32 | self.job_info.append(job_title) 33 | self.job_info.append(job_company) 34 | self.job_info.append(job_address) 35 | self.job_info.append(job_salary) 36 | self.job_info.append(job_date) 37 | self.job_info.append(job_href) 38 | 39 | self.request_job_detail(job_href) 40 | time.sleep(1) 41 | except Exception as e: 42 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 43 | 44 | def parse_job_detail(self, text): 45 | try: 46 | soup = BeautifulSoup(text, 'html.parser') 47 | try: 48 | # 工作描述 49 | job_statements = soup.select('div.job_msg') 50 | job_statement = job_statements[0].text.strip(' ').replace(' ', '').replace('\n', '') 51 | except Exception as e: 52 | print e.message 53 | job_statement = '职位无明确描述' 54 | 55 | self.job_info.append(job_statement) 56 | self.count = self.count + 1 57 | ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count, self.job_info, u'bs4_51job招聘.xlsx') 58 | print '采集了{}条数据'.format(self.count) 59 | # 清空集合,为再次存放数据做准备 60 | self.job_info = [] 61 | except Exception as e: 62 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 63 | 64 | # 65 | # 66 | # if __name__ == '__main__': 67 | # x = SpiderBs42() 68 | # x.crawler_data() 69 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/SpiderRe.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import re 3 | from ExeclUtils import ExeclUtils 4 | from Spider import Spider 5 | import time 6 | 7 | 8 | class SpiderRe(Spider): 9 | 10 | def __init__(self): 11 | super(SpiderRe, self).__init__() 12 | 13 | def parse_job_list(self, text): 14 | pattern = re.compile( 15 | '
.*?(.*?).*?class.*?"t4">(.*?).*?class.*?"t5">(.*?).*?
', 16 | re.S) 17 | jobs = re.findall(pattern, text) 18 | for job in jobs: 19 | try: 20 | # 获取职位名称、公司、地点等信息 21 | job_title = job[0] 22 | job_href = job[1] 23 | job_company = job[2] 24 | job_address = job[3] 25 | job_salary = job[4] 26 | job_date = job[5] 27 | 28 | self.job_info.append(job_title) 29 | self.job_info.append(job_company) 30 | self.job_info.append(job_address) 31 | self.job_info.append(job_salary) 32 | self.job_info.append(job_date) 33 | self.job_info.append(job_href) 34 | 35 | self.request_job_detail(job_href) 36 | time.sleep(1) 37 | 38 | except Exception as e: 39 | print e.message 40 | continue 41 | 42 | def parse_job_detail(self, text): 43 | result = re.findall(r'
(.*?) 0: 46 | job_statement = ''.join( 47 | [i.strip() for i in re.split(r'
', re.sub('<[/]?\w+>', '', result[0].strip()))]) if \ 48 | result[0] else '' 49 | 50 | self.job_info.append(job_statement) 51 | self.count = self.count + 1 52 | ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count, self.job_info, u're_51job招聘.xlsx') 53 | print '采集了{}条数据'.format(self.count) 54 | # 清空集合,为再次存放数据做准备 55 | self.job_info = [] 56 | # 57 | # 58 | # if __name__ == '__main__': 59 | # x = SpiderRe2() 60 | # x.crawler_data() 61 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/SpiderXpath.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from lxml import etree 3 | from ExeclUtils import ExeclUtils 4 | from Spider import Spider 5 | import time 6 | 7 | 8 | class SpiderXpath(Spider): 9 | 10 | def __init__(self): 11 | super(SpiderXpath, self).__init__() 12 | 13 | def parse_job_list(self, text): 14 | try: 15 | f = etree.HTML(text) 16 | divs = f.xpath('//*[@id="resultList"]/div[@class="el"]') 17 | for div in divs: 18 | job_title = div.xpath('./p/span/a/@title') 19 | job_company = div.xpath('./span[1]/a/@title') 20 | job_address = div.xpath('./span[2]/text()') 21 | job_salary = div.xpath('./span[3]/text()') 22 | job_date = div.xpath('./span[4]/text()') 23 | job_href = div.xpath('./p/span/a/@href') 24 | 25 | job_title = job_title[0] if len(job_title) > 0 else '' 26 | job_company = job_company[0] if len(job_company) > 0 else '' 27 | job_address = job_address[0] if len(job_address) > 0 else '' 28 | job_salary = job_salary[0] if len(job_salary) > 0 else '' 29 | job_date = job_date[0] if len(job_date) > 0 else '' 30 | job_href = job_href[0] if len(job_href) > 0 else '' 31 | 32 | self.job_info.append(job_title) 33 | self.job_info.append(job_company) 34 | self.job_info.append(job_address) 35 | self.job_info.append(job_salary) 36 | self.job_info.append(job_date) 37 | self.job_info.append(job_href) 38 | 39 | self.request_job_detail(job_href) 40 | time.sleep(1) 41 | except Exception as e: 42 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 43 | 44 | def parse_job_detail(self, text): 45 | f = etree.HTML(text) 46 | try: 47 | # 工作描述 48 | job_statements = f.xpath('//div[@class="bmsg job_msg inbox"]') 49 | job_statement = job_statements[0] if len(job_statements) > 0 else '' 50 | if job_statement != '': 51 | job_statement = job_statement.xpath('string(.)').strip().split('\n')[0] 52 | else: 53 | job_statement = '职位无明确描述' 54 | except Exception as e: 55 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 56 | job_statement = '职位无明确描述' 57 | 58 | self.job_info.append(job_statement) 59 | self.count = self.count + 1 60 | ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count, self.job_info, u'xpath_51job招聘.xlsx') 61 | print '采集了{}条数据'.format(self.count) 62 | # 清空集合,为再次存放数据做准备 63 | self.job_info = [] 64 | pass 65 | 66 | # 67 | # if __name__ == '__main__': 68 | # x = SpiderXpath2() 69 | # x.crawler_data() 70 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/re_51job招聘.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonchannel/spider_works/40f0e3b80a06b4f9109689ae7b1d104576d3655b/51job爬虫/featch_51job/re_51job招聘.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spider_works 2 | all sider source code 3 | 
-------------------------------------------------------------------------------- /公众号爬虫/mp_spider1.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import time 4 | import json 5 | import os 6 | 7 | 8 | 9 | class mp_spider(object): 10 | 11 | def __init__(self): 12 | self.offset = 10 13 | self.base_url = '为保护隐藏,你自己去处理' 14 | self.headers = '为保护隐藏,你自己去处理' 15 | def request_data(self): 16 | try: 17 | response = requests.get(self.base_url.format(self.offset), headers=self.headers) 18 | print(self.base_url.format(self.offset)) 19 | if 200 == response.status_code: 20 | self.parse_data(response.text) 21 | except Exception as e: 22 | print(e) 23 | time.sleep(2) 24 | pass 25 | 26 | def parse_data(self, responseData): 27 | 28 | all_datas = json.loads(responseData) 29 | 30 | if 0== all_datas['ret']: 31 | 32 | summy_datas = all_datas['general_msg_list'] 33 | datas = json.loads(summy_datas)['list'] 34 | for data in datas: 35 | try: 36 | title = data['app_msg_ext_info']['title'] 37 | title_child = data['app_msg_ext_info']['digest'] 38 | article_url = data['app_msg_ext_info']['content_url'] 39 | cover = data['app_msg_ext_info']['cover'] 40 | print(title,title_child,article_url,cover) 41 | except Exception as e: 42 | print(e) 43 | continue 44 | 45 | 46 | print('----------------------------------------') 47 | time.sleep(3) 48 | self.offset = self.offset+10 49 | self.request_data() 50 | else: 51 | print('抓取数据出错!') 52 | 53 | 54 | 55 | if __name__ == '__main__': 56 | d = mp_spider() 57 | d.request_data() 58 | -------------------------------------------------------------------------------- /公众号爬虫/mp_spider2.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import time 4 | import json 5 | import os 6 | import pdfkit 7 | 8 | 9 | class mp_spider(object): 10 | 11 | def __init__(self): 12 | self.config = pdfkit.configuration(wkhtmltopdf='C:/Program Files/wkhtmltopdf/bin/wkhtmltopdf.exe') 13 | self.offset = 0 14 | self.count = 0 15 | self.base_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzAwMjQwODIwNg==&f=json&offset={}&count=10&is_ok=1&scene=124&uin=MTIyOTkzMzgyMA%3D%3D&key=7cabb994f4d85a88ad37c1ec41ddde6234e76a1f1e69b178052bc99ccdf724f77700b28cea9e242cc98e517bd2537122fdc7a65a601e36f438b33e31e183f64dd9519beed36d892cc0a31855f1c649d6&pass_ticket=n6xnvQjzn4yfkjScc%2FSoVi4SkEgzf4z0airW6Ue14zIDNH98t%2Fr62k2KszUJ1qNv&wxtoken=&appmsg_token=960_mNI0W0CuVRuEpG7GsxB7f7pUUrO2CWW_iib4ww~~&x5=0&f=json' 16 | self.headers = { 17 | 'Host': 'mp.weixin.qq.com', 18 | 'Connection': 'keep-alive', 19 | 'Accept': '*/*', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.884.400 QQBrowser/9.0.2524.400', 21 | 'X-Requested-With': 'XMLHttpRequest', 22 | 'Referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5MTQ4NjA3Nw==&scene=124&uin=MjA2MDM3NTU%3D&key=2b903b9a7252346947b8c8bec6a8e97ea469a66c7c55196aec680d36fef8d99bdd51ba33c76a8d0e5655e5186714a09c18bdc873bdac2350ffd215c1d3cb331a3f67f0dcc00984035cbaacc19e1ef3e2&devicetype=Windows+10&version=62060344&lang=zh_CN&a8scene=7&pass_ticket=jAFRJRtWRdJcSXta5fiYsjBqfK6vqTIYWrULumuK5sc%3D&winzoom=1', 23 | 'Accept-Encoding': 'gzip, deflate', 24 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4', 25 | 'Cookie': 'wxuin=1229933820; devicetype=Windows10; version=6206021f; lang=zh_CN; 
pass_ticket=n6xnvQjzn4yfkjScc/SoVi4SkEgzf4z0airW6Ue14zIDNH98t/r62k2KszUJ1qNv; wap_sid2=CPyZvcoEElwzdm5YaDByenY3S2dzYlJtdXFDQVJYbmZKUERuM2I5elhMb3NxMVZqX3FCTDVYaFJ2Rkd2RktMdm9KajV3TWU5T3YyTTVfUG5zZ2llWko0cW5aMzBiY0FEQUFBfjCo9fLYBTgNQJVO' 26 | } 27 | 28 | def request_data(self): 29 | response = requests.get(self.base_url.format(self.offset), headers=self.headers) 30 | if 200 == response.status_code: 31 | self.parse_data(response.text) 32 | 33 | def parse_data(self, response_data): 34 | 35 | all_datas = json.loads(response_data) 36 | 37 | if 0 == all_datas['ret']: 38 | if 1 == all_datas['can_msg_continue']: 39 | summy_datas = all_datas['general_msg_list'] 40 | datas = json.loads(summy_datas)['list'] 41 | for data in datas: 42 | try: 43 | title = data['app_msg_ext_info']['title'] 44 | title_child = data['app_msg_ext_info']['digest'] 45 | article_url = data['app_msg_ext_info']['content_url'] 46 | cover = data['app_msg_ext_info']['cover'] 47 | copyright = data['app_msg_ext_info']['copyright_stat'] 48 | copyright = '原创文章_' if copyright == 11 else '非原创文章_' 49 | self.count = self.count + 1 50 | print('第【{}】篇文章'.format(self.count), copyright, title, title_child, article_url, cover) 51 | self.creat_pdf_file(article_url, '{}_{}'.format(copyright, title)) 52 | except: 53 | continue 54 | 55 | time.sleep(3) 56 | self.offset = all_datas['next_offset'] # 下一页的偏移量 57 | self.request_data() 58 | else: 59 | exit('数据抓取完毕!') 60 | else: 61 | exit('数据抓取出错:' + all_datas['errmsg']) 62 | 63 | def creat_pdf_file(self, url, title): 64 | try: 65 | file = 'D:/store/file2/{}.pdf'.format(title) 66 | if not os.path.exists(file): # 过滤掉重复文件 67 | pdfkit.from_url(url, file) 68 | 69 | except Exception as e: 70 | print(e) 71 | 72 | 73 | if __name__ == '__main__': 74 | d = mp_spider() 75 | d.request_data() 76 | 77 | -------------------------------------------------------------------------------- /公众号爬虫/mp_spider4.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os import path 3 | 4 | import requests 5 | from scipy.misc import imread 6 | from wordcloud import WordCloud 7 | 8 | ''' 9 | 择取抖音评论 10 | ''' 11 | 12 | 13 | class mp_spider(object): 14 | 15 | def __init__(self): 16 | self.offset = 0 17 | self.count = 0 18 | self.base_comment_url = 'https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz=MjM5NjA5NDczMg==&appmsgid=2652274724&idx=1&comment_id=303303606155886594&offset=0&limit=100&uin=MTIyOTkzMzgyMA%253D%253D&key=984e4c80c8bc7843fbc3177a66f8024c086af6b59a7ac97026e9f4db88fc49d0c26ce660040b865a3294ae651150d40227980433f1a5106b5a15261ad20d564aad1e8c6aa2dfda74fdd515af0bc77f1e&pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ%25252BBV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA%25252F&wxtoken=777&devicetype=Windows%26nbsp%3B10&clientversion=6206021f&appmsg_token=961_V5yXdClt1VInI19BnECwzmgi95G9e44nyElITVL5rKcbKbGDkLSLzLuTrUTO-TL3Zo_qNKEVSclPd8LG&x5=0&f=json' 19 | self.base_comment_header = { 20 | 'Host': 'mp.weixin.qq.com', 21 | 'Connection': 'keep-alive', 22 | 'Accept': '*/*', 23 | 'CSP': 'active', 24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400', 25 | 'X-Requested-With': 'XMLHttpRequest', 26 | 'Referer': 
'https://mp.weixin.qq.com/s?__biz=MjM5NjA5NDczMg==&mid=2652274724&idx=1&sn=ad0bbb4461e20cdb5bb1563e6d20639d&chksm=bd0c56478a7bdf51db287ab8a6e054284f0a6aa9b475a3597e2f02c1a28a9ac0f085dab1820e&mpshare=1&scene=1&srcid=0603ZskndK5clppsBTw7kWWW&key=8799423f74e5608e8fddceb78f6442677bcc4589977665cb4aaf92376ab0b3acbf903998bd87428c0a2b8f8a724ce746d59882f43021889961664fd26aa68e05492d96213e1addea8cee62b98b6ebb76&ascene=1&uin=MTIyOTkzMzgyMA%3D%3D&devicetype=Windows+10&version=6206021f&lang=zh_CN&pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ%2BBV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA%2F&winzoom=1', 27 | 'Accept-Encoding': 'gzip, deflate', 28 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4', 29 | 'Cookie': 'rewardsn=; wxuin=1229933820; devicetype=Windows10; version=6206021f; lang=zh_CN; pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ+BV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA/; wap_sid2=CPyZvcoEElxMa0JKOS1tWHpPMFBlWFduNGRJbE9aUGFvNU9ja0poVXpKanpFSnVIQXpxbVUyVWNuZXlqQ2I3cDFvUmxlUGFIX2lFUDVGZ0dBTDBHRFFremh6Ml9vc0VEQUFBfjCikIrZBTgNQAE=; wxtokenkey=777' 30 | } 31 | 32 | def request_comment_data(self): 33 | response = requests.get(self.base_comment_url, headers=self.base_comment_header) 34 | if 200 == response.status_code: 35 | self.parse_comment_data(response.text) 36 | 37 | def parse_comment_data(self, response_data): 38 | 39 | all_datas = json.loads(response_data) 40 | 41 | if 0 == all_datas['base_resp']['ret']: 42 | all_comments = all_datas['elected_comment'] 43 | with open('抖音毁掉.txt', 'a', encoding='utf-8') as f: 44 | for comments in all_comments: 45 | name = comments['nick_name'] 46 | content = comments['content'] 47 | print(name, content) 48 | try: 49 | f.write(content + "\n") 50 | except Exception as e: 51 | print(e) 52 | continue 53 | 54 | self.create_word_cloud('抖音毁掉') 55 | else: 56 | exit('数据抓取出错:' + all_datas['errmsg']) 57 | 58 | def create_word_cloud(self,file_name): 59 | d = path.dirname(__file__) # __file__ 为当前文件, 60 | 61 | text = open(path.join(d, '{}.txt'.format(file_name)), encoding='utf-8').read() 62 | back_coloring = imread(path.join(d, 'douyin_bg.png')) # 设置背景图片 63 | 64 | wc = WordCloud(background_color="white", 65 | font_path='C:\Windows\Fonts\msyhl.ttc', 66 | max_words=5000, 67 | mask=back_coloring, 68 | # 设置有多少种随机生成状态,即有多少种配色方案 69 | random_state=30) 70 | # generate word cloud 71 | wc.generate(text) 72 | 73 | # store to file 74 | wc.to_file(path.join(d, "alice.png")) 75 | 76 | 77 | if __name__ == '__main__': 78 | d = mp_spider() 79 | d.request_comment_data() -------------------------------------------------------------------------------- /其它爬虫/book_py.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 爬取京东图书评价 5 | 6 | """ 7 | 8 | 9 | import time 10 | from selenium import webdriver 11 | from lxml import etree 12 | import sys 13 | reload(sys) 14 | sys.setdefaultencoding( "utf-8" ) 15 | 16 | class get_book(object): 17 | #获取浏览器驱动 18 | driver = webdriver.Firefox() 19 | 20 | # 浏览器窗口最大化 21 | driver.maximize_window() 22 | 23 | # 浏览器地址定向为qq登陆页面 24 | driver.get("https://item.jd.com/11993134.html#comment") 25 | 26 | # 切换到评价的tab 27 | driver.find_element_by_xpath('//*[@id="detail-tab-comm"]/a').click() 28 | 29 | while True: 30 | # 下拉滚动条,从1开始到3结束 分2次加载完每页数据 31 | for i in range(1,3): 32 | height = 20000*i#每次滑动20000像素 33 | strWord = "window.scrollBy(0,"+str(height)+")" 34 | driver.execute_script(strWord) 35 | time.sleep(4) 36 | 37 | selector = etree.HTML(driver.page_source) 38 | divs = selector.xpath('//*[@id="comment-0"]/div[1]/div/div') 39 | 
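        # Each matched div below is one JD review entry; './div[2]/div[1]/text()'
        # holds the review text (falling back to '' when an entry has no text
        # node), and every review is appended as one line to python_book.txt.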
40 | # mode =a 不清空连续写入 41 | with open('python_book.txt','a') as f: 42 | for div in divs: 43 | jd_conmment = div.xpath('./div[2]/div[1]/text()') 44 | jd_conmment = jd_conmment[0] if len(jd_conmment)>0 else '' 45 | f.write(jd_conmment+'\n') 46 | 47 | #分析得知当为最后一页时,最后的ui-pager-next不见了 48 | if driver.page_source.find('ui-pager-next') == -1: 49 | break 50 | 51 | # 找到“下一页”的按钮元素 52 | driver.find_element_by_class_name('ui-pager-next').click() 53 | 54 | # 因为在下一个循环里首先还要把页面下拉,所以要跳到外层的frame上 55 | driver.switch_to.parent_frame() 56 | 57 | if __name__=='__main__': 58 | get_book() 59 | 60 | -------------------------------------------------------------------------------- /其它爬虫/download_video.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import datetime 3 | import os 4 | import threading 5 | import time 6 | from contextlib import closing 7 | 8 | import requests 9 | from lxml import etree 10 | from selenium import webdriver 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.support.ui import WebDriverWait 13 | from selenium.webdriver.support import expected_conditions as EC 14 | 15 | 16 | class VideoDown(object): 17 | 18 | def __init__(self): 19 | self.first_position = 0 20 | self.count = 0 21 | self.threads = [] 22 | self.content = [] 23 | 24 | def load_data(self): 25 | 26 | video_url = 'http://neihanshequ.com/video/' 27 | driver = webdriver.Firefox() # 获取浏览器驱动 28 | driver.maximize_window() 29 | driver.implicitly_wait(10) # 控制间隔时间等待浏览器反映 30 | driver.get(video_url) 31 | 32 | while True: 33 | try: 34 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, 'loadMore'))) 35 | except Exception as e: 36 | print e.message 37 | break 38 | 39 | js = 'window.scrollTo(0,document.body.scrollHeight)' 40 | driver.execute_script(js) 41 | time.sleep(10) 42 | 43 | source = etree.HTML(driver.page_source) 44 | divs = source.xpath('//*[@id="detail-list"]/li') 45 | 46 | for div in divs: 47 | self.count = self.count + 1 48 | print '第{}条数据'.format(str(self.count)) 49 | title = div.xpath('./div/div[2]/a/div/p/text()') 50 | v_url = div.xpath('.//*[@class="player-container"]/@data-src') 51 | title = title[0] if len(title) > 0 else '无介绍'.format(str(self.count)) 52 | v_url = v_url[0] if len(v_url) > 0 else '' 53 | self.do_thread(title, v_url) 54 | 55 | try: 56 | load_more = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, 'loadMore'))) 57 | load_more.click() 58 | time.sleep(10) 59 | except Exception as e: 60 | print e.message 61 | break 62 | 63 | def do_thread(self, title, url): 64 | t = threading.Thread(target=self.down_video, args=(title, url)) 65 | self.threads.append(t) 66 | t.start() 67 | 68 | for tt in self.threads: 69 | tt.join() 70 | 71 | def down_video(self, title, url): 72 | try: 73 | with closing(requests.get(url, stream=True)) as response: 74 | print url 75 | chunk_size = 1024 76 | content_size = int(response.headers['content-length']) 77 | 78 | video_path = u'D:/store/video00' 79 | # 判断文件夹是否存在。 80 | if not os.path.exists(video_path): 81 | os.makedirs(video_path) 82 | 83 | file_name = video_path + u'/{}.mp4'.format(self.count) 84 | if os.path.exists(file_name) and os.path.getsize(file_name) == content_size: 85 | print(u'跳过' + file_name) 86 | else: 87 | down = DownProgress(title, content_size) 88 | with open(file_name, "wb") as f: 89 | for data in response.iter_content(chunk_size=chunk_size): 90 | f.write(data) 91 | 92 | down.refresh_down(len(data)) 93 | except Exception as e: 94 | print 
e.message 95 | 96 | 97 | class DownProgress(object): 98 | def __init__(self, file_name, file_size): 99 | self.file_name = file_name 100 | self.file_down = 0 101 | self.file_size = file_size 102 | 103 | def refresh_down(self, down): 104 | self.file_down = self.file_down + down 105 | progress = (self.file_down / float(self.file_size)) * 100.0 106 | status = u'下载完成' if self.file_down >= self.file_size else u'正在下载...' 107 | print u'文件名称:{},下载进度:{},下载状态:{}'.format(self.file_name, '%.2f' % progress, status) 108 | 109 | 110 | if __name__ == '__main__': 111 | startTime = datetime.datetime.now() 112 | down = VideoDown() 113 | down.load_data() 114 | endTime = datetime.datetime.now() 115 | print '下载花费时间{}秒'.format((endTime - startTime).seconds) 116 | -------------------------------------------------------------------------------- /其它爬虫/fenng_py.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 爬取冯大辉老师的微博 5 | 6 | """ 7 | 8 | 9 | import time 10 | from selenium import webdriver 11 | from lxml import etree 12 | import wordcloud as wcp 13 | import sys 14 | 15 | 16 | reload(sys) 17 | #这里需要指定字符编码 18 | sys.setdefaultencoding( "utf-8" ) 19 | 20 | def get_content(f_name): 21 | 22 | #你的微博帐号 23 | username = '******' 24 | psd = '******' 25 | 26 | #获取浏览器驱动 27 | driver = webdriver.Firefox() 28 | 29 | # 浏览器窗口最大化 30 | driver.maximize_window() 31 | 32 | driver.get('http://weibo.com/login.php') 33 | print('login............................') 34 | 35 | #给登录框与密码赋值 36 | driver.find_element_by_id('loginname').send_keys(username) 37 | driver.find_element_by_class_name('password').find_element_by_name('password').send_keys(psd) 38 | 39 | #点击登录按钮 40 | driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a/span').click() 41 | 42 | # 这里因为登录,需要有一个延时,不能直接切换到新网页去 43 | time.sleep(3) 44 | 45 | # 登录成功后,再用浏览器地址定向到大辉老师的微博列表页,没有什么技巧,自己去找的 46 | driver.get("https://weibo.com/p/1005051577826897/home?from=page_100505_profile&wvr=6&mod=data&is_all=1#place") 47 | 48 | while True: 49 | # 下拉滚动条,从1开始到3结束 分2次加载完每页数据 50 | for i in range(1,6): 51 | height = 20000*i#每次滑动20000像素 52 | strWord = "window.scrollBy(0,"+str(height)+")" 53 | driver.execute_script(strWord) 54 | time.sleep(4) 55 | 56 | selector = etree.HTML(driver.page_source) 57 | divs = selector.xpath('//*[@id="Pl_Official_MyProfileFeed__22"]/div/div/div[1]/div[4]') 58 | 59 | # mode =a 不清空连续写入 60 | with open('{}.txt'.format(f_name),'a') as f: 61 | for div in divs: 62 | wb_content = div.xpath('./div[3]/text()') 63 | wb_time = div.xpath('./div[2]/a/text()') 64 | wb_content = wb_content[0] if len(wb_content) > 0 else '' 65 | wb_time = wb_time[0] if len(wb_time) > 0 else '' 66 | wb_content = wb_content.strip() # 去掉左右两边的空格 67 | wb_time = wb_time.strip() 68 | print wb_content, wb_time 69 | f.write(wb_content+'\n') 70 | 71 | #分析得知当为最后一页时,最后的page next S_txt1 S_line1不见了 72 | if driver.page_source.find('page next S_txt1 S_line1') == -1: 73 | print '没有下一页了' 74 | break 75 | 76 | # 找到“下一页”的按钮元素,原本想用xpath与classname,都失败了 77 | # 这里我是用css来定位的,page next S_txt1 S_line1 在空格之间加'.' 
来连接 78 | submit = driver.find_element_by_css_selector('.page.next.S_txt1.S_line1') 79 | submit.click() 80 | 81 | if __name__ == '__main__': 82 | f_name = 'ddddd' 83 | get_content(f_name) 84 | wcp.create_word_cloud(f_name) 85 | -------------------------------------------------------------------------------- /其它爬虫/qq_send_word.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 爬取QQ 说说内容 5 | 6 | """ 7 | 8 | 9 | import time 10 | from selenium import webdriver 11 | from lxml import etree 12 | 13 | #这里一定要设置编码格式,防止后面写入文件时报错 14 | import sys 15 | reload(sys) 16 | sys.setdefaultencoding( "utf-8" ) 17 | 18 | friend = '563679994' # 朋友的QQ号,朋友的空间要求允许你能访问 19 | user = '563679994' # 你的QQ号 20 | pw = 'XXX' # 你的QQ密码 21 | 22 | #获取浏览器驱动 23 | driver = webdriver.Firefox() 24 | 25 | # 浏览器窗口最大化 26 | driver.maximize_window() 27 | 28 | # 浏览器地址定向为qq登陆页面 29 | driver.get("http://i.qq.com") 30 | 31 | # 所以这里需要选中一下frame,否则找不到下面需要的网页元素 32 | driver.switch_to.frame("login_frame") 33 | 34 | # 自动点击账号登陆方式 35 | driver.find_element_by_id("switcher_plogin").click() 36 | 37 | # 账号输入框输入已知qq账号 38 | driver.find_element_by_id("u").send_keys(user) 39 | 40 | # 密码框输入已知密码 41 | driver.find_element_by_id("p").send_keys(pw) 42 | 43 | # 自动点击登陆按钮 44 | driver.find_element_by_id("login_button").click() 45 | 46 | # 让webdriver操纵当前页 47 | driver.switch_to.default_content() 48 | 49 | # 跳到说说的url, friend你可以任意改成你想访问的空间 50 | driver.get("http://user.qzone.qq.com/" + friend + "/311") 51 | 52 | driver.find_element_by_xpath('//*[@id="QM_Mood_Poster_Container"]/div/div[4]/div[4]/a[2]').click() -------------------------------------------------------------------------------- /其它爬虫/qq_word.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 爬取QQ 说说内容 5 | 6 | """ 7 | 8 | 9 | import time 10 | from selenium import webdriver 11 | from lxml import etree 12 | 13 | #这里一定要设置编码格式,防止后面写入文件时报错 14 | import sys 15 | reload(sys) 16 | sys.setdefaultencoding( "utf-8" ) 17 | 18 | friend = 'XXX' # 朋友的QQ号,朋友的空间要求允许你能访问 19 | user = 'XXX' # 你的QQ号 20 | pw = 'XXXX' # 你的QQ密码 21 | 22 | #获取浏览器驱动 23 | driver = webdriver.Firefox() 24 | 25 | # 浏览器窗口最大化 26 | driver.maximize_window() 27 | 28 | # 浏览器地址定向为qq登陆页面 29 | driver.get("http://i.qq.com") 30 | 31 | # 所以这里需要选中一下frame,否则找不到下面需要的网页元素 32 | driver.switch_to.frame("login_frame") 33 | 34 | # 自动点击账号登陆方式 35 | driver.find_element_by_id("switcher_plogin").click() 36 | 37 | # 账号输入框输入已知qq账号 38 | driver.find_element_by_id("u").send_keys(user) 39 | 40 | # 密码框输入已知密码 41 | driver.find_element_by_id("p").send_keys(pw) 42 | 43 | # 自动点击登陆按钮 44 | driver.find_element_by_id("login_button").click() 45 | 46 | # 让webdriver操纵当前页 47 | driver.switch_to.default_content() 48 | 49 | # 跳到说说的url, friend你可以任意改成你想访问的空间 50 | driver.get("http://user.qzone.qq.com/" + friend + "/311") 51 | 52 | next_num = 0 # 初始“下一页”的id 53 | while True: 54 | 55 | # 下拉滚动条,使浏览器加载出动态加载的内容, 56 | # 我这里是从1开始到6结束 分5 次加载完每页数据 57 | for i in range(1,6): 58 | height = 20000*i#每次滑动20000像素 59 | strWord = "window.scrollBy(0,"+str(height)+")" 60 | driver.execute_script(strWord) 61 | time.sleep(4) 62 | 63 | # 很多时候网页由多个或