├── 51job爬虫 └── featch_51job │ ├── ExeclUtils.py │ ├── Main.py │ ├── QQ截图20180318220606.png │ ├── README.md │ ├── Spider.py │ ├── SpiderBs4.py │ ├── SpiderRe.py │ ├── SpiderXpath.py │ └── re_51job招聘.xlsx ├── README.md ├── 公众号爬虫 ├── mp_spider1.py ├── mp_spider2.py └── mp_spider4.py ├── 其它爬虫 ├── book_py.py ├── download_video.py ├── fenng_py.py ├── qq_send_word.py ├── qq_word.py ├── weibo_py.py ├── weibo_py2.py ├── word_cloud_py.py ├── zsxq_5_pdf.py ├── 我主良缘爬虫.py ├── 爬取QQ.py └── 爬取冯大辉微博.py ├── 得到爬虫 └── dedao_App │ ├── ExeclUtils.py │ ├── QQ截图20180605204125.png │ ├── QQ截图20180605204150.png │ ├── __pycache__ │ ├── ExeclUtils.cpython-36.pyc │ └── __init__.cpython-36.pyc │ ├── dedaoSpider.py │ └── 逻辑思维音频.xlsx └── 拉勾爬虫 └── lagouSpider-master ├── QQ截图20180525142031.png ├── QQ截图20180525142111.png ├── lagouSpider ├── dbtools.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ └── lagou_crawl.py ├── readme.md └── scrapy.cfg /51job爬虫/featch_51job/ExeclUtils.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | import xlwt 4 | 5 | import sys 6 | 7 | reload(sys) 8 | sys.setdefaultencoding('utf-8') 9 | 10 | ''' 11 | 这里是操作execl的工具类,以后也可以直接复用 12 | 方法调用SpiderUtils.create_excecl(...) 13 | 14 | ''' 15 | 16 | 17 | class ExeclUtils(object): 18 | 19 | @staticmethod 20 | def create_execl(sheet_name, row_titles): 21 | ''' 22 | 创建execl文件与sheet表,并创建他们的第一行标题 23 | :param sheet_name: execl中sheet_name文件的名称 24 | :param row_titles: execl文件的标题行 25 | :return: execl_file,execl_sheet 26 | ''' 27 | 28 | f = xlwt.Workbook() 29 | sheet_info = f.add_sheet(sheet_name, cell_overwrite_ok=True) 30 | for i in range(0, len(row_titles)): 31 | sheet_info.write(0, i, row_titles[i]) 32 | 33 | return f, sheet_info 34 | 35 | @staticmethod 36 | def write_execl(execl_file, execl_sheet, count, data, execl_name): 37 | ''' 38 | 把数据写入到execl中.这是一个静态方法 39 | 注意:这里所有的数据都不要写死,方便复用. 
40 | :param execl_file: 传入一个execl文件 41 | :param execl_sheet: 传入一个execl_sheet表 42 | :param count: execl文件的行数 43 | :param data: 要传入的一条数据 44 | :param execl_name: execl文件名 45 | :return: None 46 | ''' 47 | for j in range(len(data)): 48 | execl_sheet.write(count, j, data[j]) 49 | 50 | execl_file.save(execl_name) 51 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/Main.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | 3 | from SpiderBs4 import SpiderBs4 4 | from SpiderRe import SpiderRe 5 | from SpiderXpath import SpiderXpath 6 | 7 | '''爬虫入口''' 8 | 9 | 10 | class Main(object): 11 | 12 | # 创建一个静态方法 13 | @staticmethod 14 | def select_type(): 15 | type = input('请输入你先选择的爬虫类型:\n1.xpath爬取数据\n2.正则爬取数据 \n3.bs4爬取数据 \n默认使用xpath提取数据\n你的输入是:') 16 | if type == 1: 17 | print '选择了xpath爬取数据\n\n' 18 | xpath = SpiderXpath() 19 | xpath.crawler_data() 20 | elif type == 2: 21 | print '选择了正则爬取数据\n\n' 22 | xpath = SpiderRe() 23 | xpath.crawler_data() 24 | elif type == 3: 25 | print '选择了bs4爬取数据\n\n' 26 | bs4 = SpiderBs4() 27 | bs4.crawler_data() 28 | else: 29 | print '选择了xpath爬取数据\n\n' 30 | xpath = SpiderXpath() 31 | xpath.crawler_data() 32 | 33 | 34 | if __name__ == '__main__': 35 | Main().select_type() 36 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/QQ截图20180318220606.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonchannel/spider_works/40f0e3b80a06b4f9109689ae7b1d104576d3655b/51job爬虫/featch_51job/QQ截图20180318220606.png -------------------------------------------------------------------------------- /51job爬虫/featch_51job/README.md: -------------------------------------------------------------------------------- 1 | ### 2 | # 本项目使用py2.7环境  3 |   4 | ### 分别用Xpath,bs4,正则三种方式获取51job关于Python的招聘信息 5 | 6 | 7 | 8 | `Spider`是三种爬取方式的基类,这里有请求招聘数据列表与工作详情的请求信息. 9 | 10 | 另外里面创建了解析列表与详情数据的抽象类,然后在子类里面对抽象类进行实现 11 | 12 | 13 | 14 | `ExeclUtils`这是一个操作Execl的工具类,评分有创建Execl的sheet表格与对表格进行写入数据. 
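A minimal usage sketch of `ExeclUtils` (Python 2.7, matching this project), based on the two methods shown above; the sheet name, column titles, file name and rows below are illustrative only, not taken from the spiders:

```python
# coding:utf-8
from ExeclUtils import ExeclUtils

# Create the workbook and sheet; create_execl writes the titles into row 0.
row_titles = [u'招聘标题', u'公司名称', u'待遇']
execl_file, sheet = ExeclUtils.create_execl(u'demo_sheet', row_titles)

# Write data rows one by one; `count` is the row index to write into,
# and write_execl saves the file after every row.
rows = [
    [u'Python开发', u'某公司A', u'10-15k'],
    [u'爬虫工程师', u'某公司B', u'12-20k'],
]
for count, row in enumerate(rows, 1):
    ExeclUtils.write_execl(execl_file, sheet, count, row, u'demo.xlsx')
```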
15 | 16 | 17 | 18 | ### 效果图 19 | 20 | ![效果图](https://github.com/pythonchannel/fetch_51job/blob/master/QQ%E6%88%AA%E5%9B%BE20180318220606.png) 21 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/Spider.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import time 3 | import abc 4 | import requests 5 | from ExeclUtils import ExeclUtils 6 | 7 | ''' 8 | 这是爬虫的抽象类, 9 | xpath,bs4,re 三种爬虫方式都继承这个类 10 | 因为所有的请求列表与详情是通用的,所以我这里把请求数据都放在基类中 11 | 然后调用爬取方式,爬取方式在子类中实现 12 | 13 | ''' 14 | 15 | 16 | class Spider(object): 17 | # 定义一个抽象类 18 | __metaclass__ = abc.ABCMeta 19 | 20 | def __init__(self): 21 | self.rows_title = [u'招聘标题', u'公司名称', u'公司地址', u'待遇', u'发布日期', u'招聘链接', u'招聘要求描述'] 22 | sheet_name = u'51job_Python招聘' 23 | return_execl = ExeclUtils.create_execl(sheet_name, self.rows_title) 24 | self.execl_f = return_execl[0] 25 | self.sheet_table = return_execl[1] 26 | self.job_info = [] # 存放每一条数据中的各元素, 27 | self.count = 0 # 数据插入从1开始的 28 | 29 | def crawler_data(self): 30 | ''' 31 | 开始爬取数据 32 | :return: 33 | ''' 34 | for i in range(1, 5): 35 | url = 'http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format( 36 | i) 37 | self.request_job_list(url) 38 | # 采集不要太快了,否则容易造成ip被封或者网络请求失败 39 | time.sleep(2) 40 | 41 | def request_job_list(self, page_url): 42 | ''' 43 | 获取工作列表 44 | :param page_url: 45 | :return: 46 | ''' 47 | try: 48 | headers = { 49 | 'Referer': 'http://www.51job.com/', 50 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4549.400 QQBrowser/9.7.12900.400' 51 | } 52 | response = requests.get(page_url, headers=headers) 53 | response.encoding = 'gbk' 54 | # 如果请求失败,则不能继续进行 55 | if response.status_code != 200: 56 | return 57 | self.parse_job_list(response.text) 58 | except Exception as e: 59 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 60 | 61 | @abc.abstractmethod 62 | def parse_job_list(self, text): 63 | ''' 64 | 解析工作列表的抽象类,具体实现在子类中 65 | :param text: 66 | :return: 67 | ''' 68 | pass 69 | 70 | def request_job_detail(self, job_href): 71 | ''' 72 | 获取工作详情 73 | :param job_href: 招聘工作的链接 74 | :return: 75 | ''' 76 | try: 77 | headers = { 78 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4549.400 QQBrowser/9.7.12900.400' 79 | } 80 | response = requests.get(job_href, headers=headers) 81 | response.encoding = 'gbk' 82 | # 如果请求失败,则不能继续进行 83 | if response.status_code != 200: 84 | return '' 85 | 86 | self.parse_job_detail(response.text) 87 | 88 | except Exception as e: 89 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 90 | 91 | @abc.abstractmethod 92 | def parse_job_detail(self, text): 93 | ''' 94 | 定义工作详情的抽象类 95 | :param text: 96 | :return: 97 | ''' 98 | pass 99 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/SpiderBs4.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from bs4 import BeautifulSoup 3 | from ExeclUtils import ExeclUtils 4 | from Spider import Spider 5 | import time 6 | 7 | 8 | class SpiderBs4(Spider): 9 | 10 | def __init__(self): 11 | super(SpiderBs4, self).__init__() 
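    # parse_job_list (below) pulls each listing row with
    # soup.select('div.dw_table > div.el')[1:] -- the [1:] skips the first
    # .el element, which appears to be the column-header row of the result
    # list -- then reads title/href from 'p.t1 span a', the company from
    # 'span.t2 a', and address/salary/date from 'span.t3'/'span.t4'/'span.t5'.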
12 | 13 | def parse_job_list(self, text): 14 | try: 15 | soup = BeautifulSoup(text, 'html.parser') 16 | results = soup.select('div.dw_table > div.el')[1:] 17 | for result in results: 18 | job_title = result.select('p.t1 span a') 19 | job_href = result.select('p.t1 span a') 20 | job_company = result.select('span.t2 a') 21 | job_address = result.select('span.t3') 22 | job_salary = result.select('span.t4') 23 | job_date = result.select('span.t5') 24 | 25 | job_title = job_title[0].attrs['title'] if len(job_title) > 0 else '' 26 | job_href = job_href[0].attrs['href'] if len(job_href) > 0 else '' 27 | job_company = job_company[0].attrs['title'] if len(job_company) > 0 else '' 28 | job_address = job_address[0].text if len(job_address) > 0 else '' 29 | job_salary = job_salary[0].text if len(job_salary) > 0 else '' 30 | job_date = job_date[0].text if len(job_date) > 0 else '' 31 | 32 | self.job_info.append(job_title) 33 | self.job_info.append(job_company) 34 | self.job_info.append(job_address) 35 | self.job_info.append(job_salary) 36 | self.job_info.append(job_date) 37 | self.job_info.append(job_href) 38 | 39 | self.request_job_detail(job_href) 40 | time.sleep(1) 41 | except Exception as e: 42 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 43 | 44 | def parse_job_detail(self, text): 45 | try: 46 | soup = BeautifulSoup(text, 'html.parser') 47 | try: 48 | # 工作描述 49 | job_statements = soup.select('div.job_msg') 50 | job_statement = job_statements[0].text.strip(' ').replace(' ', '').replace('\n', '') 51 | except Exception as e: 52 | print e.message 53 | job_statement = '职位无明确描述' 54 | 55 | self.job_info.append(job_statement) 56 | self.count = self.count + 1 57 | ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count, self.job_info, u'bs4_51job招聘.xlsx') 58 | print '采集了{}条数据'.format(self.count) 59 | # 清空集合,为再次存放数据做准备 60 | self.job_info = [] 61 | except Exception as e: 62 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 63 | 64 | # 65 | # 66 | # if __name__ == '__main__': 67 | # x = SpiderBs42() 68 | # x.crawler_data() 69 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/SpiderRe.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import re 3 | from ExeclUtils import ExeclUtils 4 | from Spider import Spider 5 | import time 6 | 7 | 8 | class SpiderRe(Spider): 9 | 10 | def __init__(self): 11 | super(SpiderRe, self).__init__() 12 | 13 | def parse_job_list(self, text): 14 | pattern = re.compile( 15 | '
.*?(.*?).*?class.*?"t4">(.*?).*?class.*?"t5">(.*?).*?
', 16 | re.S) 17 | jobs = re.findall(pattern, text) 18 | for job in jobs: 19 | try: 20 | # 获取职位名称、公司、地点等信息 21 | job_title = job[0] 22 | job_href = job[1] 23 | job_company = job[2] 24 | job_address = job[3] 25 | job_salary = job[4] 26 | job_date = job[5] 27 | 28 | self.job_info.append(job_title) 29 | self.job_info.append(job_company) 30 | self.job_info.append(job_address) 31 | self.job_info.append(job_salary) 32 | self.job_info.append(job_date) 33 | self.job_info.append(job_href) 34 | 35 | self.request_job_detail(job_href) 36 | time.sleep(1) 37 | 38 | except Exception as e: 39 | print e.message 40 | continue 41 | 42 | def parse_job_detail(self, text): 43 | result = re.findall(r'
(.*?) 0: 46 | job_statement = ''.join( 47 | [i.strip() for i in re.split(r'
', re.sub('<[/]?\w+>', '', result[0].strip()))]) if \ 48 | result[0] else '' 49 | 50 | self.job_info.append(job_statement) 51 | self.count = self.count + 1 52 | ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count, self.job_info, u're_51job招聘.xlsx') 53 | print '采集了{}条数据'.format(self.count) 54 | # 清空集合,为再次存放数据做准备 55 | self.job_info = [] 56 | # 57 | # 58 | # if __name__ == '__main__': 59 | # x = SpiderRe2() 60 | # x.crawler_data() 61 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/SpiderXpath.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | from lxml import etree 3 | from ExeclUtils import ExeclUtils 4 | from Spider import Spider 5 | import time 6 | 7 | 8 | class SpiderXpath(Spider): 9 | 10 | def __init__(self): 11 | super(SpiderXpath, self).__init__() 12 | 13 | def parse_job_list(self, text): 14 | try: 15 | f = etree.HTML(text) 16 | divs = f.xpath('//*[@id="resultList"]/div[@class="el"]') 17 | for div in divs: 18 | job_title = div.xpath('./p/span/a/@title') 19 | job_company = div.xpath('./span[1]/a/@title') 20 | job_address = div.xpath('./span[2]/text()') 21 | job_salary = div.xpath('./span[3]/text()') 22 | job_date = div.xpath('./span[4]/text()') 23 | job_href = div.xpath('./p/span/a/@href') 24 | 25 | job_title = job_title[0] if len(job_title) > 0 else '' 26 | job_company = job_company[0] if len(job_company) > 0 else '' 27 | job_address = job_address[0] if len(job_address) > 0 else '' 28 | job_salary = job_salary[0] if len(job_salary) > 0 else '' 29 | job_date = job_date[0] if len(job_date) > 0 else '' 30 | job_href = job_href[0] if len(job_href) > 0 else '' 31 | 32 | self.job_info.append(job_title) 33 | self.job_info.append(job_company) 34 | self.job_info.append(job_address) 35 | self.job_info.append(job_salary) 36 | self.job_info.append(job_date) 37 | self.job_info.append(job_href) 38 | 39 | self.request_job_detail(job_href) 40 | time.sleep(1) 41 | except Exception as e: 42 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 43 | 44 | def parse_job_detail(self, text): 45 | f = etree.HTML(text) 46 | try: 47 | # 工作描述 48 | job_statements = f.xpath('//div[@class="bmsg job_msg inbox"]') 49 | job_statement = job_statements[0] if len(job_statements) > 0 else '' 50 | if job_statement != '': 51 | job_statement = job_statement.xpath('string(.)').strip().split('\n')[0] 52 | else: 53 | job_statement = '职位无明确描述' 54 | except Exception as e: 55 | print '\n\n出现错误,错误信息是:{}\n\n'.format(e.message) 56 | job_statement = '职位无明确描述' 57 | 58 | self.job_info.append(job_statement) 59 | self.count = self.count + 1 60 | ExeclUtils.write_execl(self.execl_f, self.sheet_table, self.count, self.job_info, u'xpath_51job招聘.xlsx') 61 | print '采集了{}条数据'.format(self.count) 62 | # 清空集合,为再次存放数据做准备 63 | self.job_info = [] 64 | pass 65 | 66 | # 67 | # if __name__ == '__main__': 68 | # x = SpiderXpath2() 69 | # x.crawler_data() 70 | -------------------------------------------------------------------------------- /51job爬虫/featch_51job/re_51job招聘.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonchannel/spider_works/40f0e3b80a06b4f9109689ae7b1d104576d3655b/51job爬虫/featch_51job/re_51job招聘.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spider_works 2 | all sider source code 3 | 
-------------------------------------------------------------------------------- /公众号爬虫/mp_spider1.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import time 4 | import json 5 | import os 6 | 7 | 8 | 9 | class mp_spider(object): 10 | 11 | def __init__(self): 12 | self.offset = 10 13 | self.base_url = '为保护隐藏,你自己去处理' 14 | self.headers = '为保护隐藏,你自己去处理' 15 | def request_data(self): 16 | try: 17 | response = requests.get(self.base_url.format(self.offset), headers=self.headers) 18 | print(self.base_url.format(self.offset)) 19 | if 200 == response.status_code: 20 | self.parse_data(response.text) 21 | except Exception as e: 22 | print(e) 23 | time.sleep(2) 24 | pass 25 | 26 | def parse_data(self, responseData): 27 | 28 | all_datas = json.loads(responseData) 29 | 30 | if 0== all_datas['ret']: 31 | 32 | summy_datas = all_datas['general_msg_list'] 33 | datas = json.loads(summy_datas)['list'] 34 | for data in datas: 35 | try: 36 | title = data['app_msg_ext_info']['title'] 37 | title_child = data['app_msg_ext_info']['digest'] 38 | article_url = data['app_msg_ext_info']['content_url'] 39 | cover = data['app_msg_ext_info']['cover'] 40 | print(title,title_child,article_url,cover) 41 | except Exception as e: 42 | print(e) 43 | continue 44 | 45 | 46 | print('----------------------------------------') 47 | time.sleep(3) 48 | self.offset = self.offset+10 49 | self.request_data() 50 | else: 51 | print('抓取数据出错!') 52 | 53 | 54 | 55 | if __name__ == '__main__': 56 | d = mp_spider() 57 | d.request_data() 58 | -------------------------------------------------------------------------------- /公众号爬虫/mp_spider2.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | import time 4 | import json 5 | import os 6 | import pdfkit 7 | 8 | 9 | class mp_spider(object): 10 | 11 | def __init__(self): 12 | self.config = pdfkit.configuration(wkhtmltopdf='C:/Program Files/wkhtmltopdf/bin/wkhtmltopdf.exe') 13 | self.offset = 0 14 | self.count = 0 15 | self.base_url = 'https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzAwMjQwODIwNg==&f=json&offset={}&count=10&is_ok=1&scene=124&uin=MTIyOTkzMzgyMA%3D%3D&key=7cabb994f4d85a88ad37c1ec41ddde6234e76a1f1e69b178052bc99ccdf724f77700b28cea9e242cc98e517bd2537122fdc7a65a601e36f438b33e31e183f64dd9519beed36d892cc0a31855f1c649d6&pass_ticket=n6xnvQjzn4yfkjScc%2FSoVi4SkEgzf4z0airW6Ue14zIDNH98t%2Fr62k2KszUJ1qNv&wxtoken=&appmsg_token=960_mNI0W0CuVRuEpG7GsxB7f7pUUrO2CWW_iib4ww~~&x5=0&f=json' 16 | self.headers = { 17 | 'Host': 'mp.weixin.qq.com', 18 | 'Connection': 'keep-alive', 19 | 'Accept': '*/*', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.884.400 QQBrowser/9.0.2524.400', 21 | 'X-Requested-With': 'XMLHttpRequest', 22 | 'Referer': 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz=MjM5MTQ4NjA3Nw==&scene=124&uin=MjA2MDM3NTU%3D&key=2b903b9a7252346947b8c8bec6a8e97ea469a66c7c55196aec680d36fef8d99bdd51ba33c76a8d0e5655e5186714a09c18bdc873bdac2350ffd215c1d3cb331a3f67f0dcc00984035cbaacc19e1ef3e2&devicetype=Windows+10&version=62060344&lang=zh_CN&a8scene=7&pass_ticket=jAFRJRtWRdJcSXta5fiYsjBqfK6vqTIYWrULumuK5sc%3D&winzoom=1', 23 | 'Accept-Encoding': 'gzip, deflate', 24 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4', 25 | 'Cookie': 'wxuin=1229933820; devicetype=Windows10; version=6206021f; lang=zh_CN; 
pass_ticket=n6xnvQjzn4yfkjScc/SoVi4SkEgzf4z0airW6Ue14zIDNH98t/r62k2KszUJ1qNv; wap_sid2=CPyZvcoEElwzdm5YaDByenY3S2dzYlJtdXFDQVJYbmZKUERuM2I5elhMb3NxMVZqX3FCTDVYaFJ2Rkd2RktMdm9KajV3TWU5T3YyTTVfUG5zZ2llWko0cW5aMzBiY0FEQUFBfjCo9fLYBTgNQJVO' 26 | } 27 | 28 | def request_data(self): 29 | response = requests.get(self.base_url.format(self.offset), headers=self.headers) 30 | if 200 == response.status_code: 31 | self.parse_data(response.text) 32 | 33 | def parse_data(self, response_data): 34 | 35 | all_datas = json.loads(response_data) 36 | 37 | if 0 == all_datas['ret']: 38 | if 1 == all_datas['can_msg_continue']: 39 | summy_datas = all_datas['general_msg_list'] 40 | datas = json.loads(summy_datas)['list'] 41 | for data in datas: 42 | try: 43 | title = data['app_msg_ext_info']['title'] 44 | title_child = data['app_msg_ext_info']['digest'] 45 | article_url = data['app_msg_ext_info']['content_url'] 46 | cover = data['app_msg_ext_info']['cover'] 47 | copyright = data['app_msg_ext_info']['copyright_stat'] 48 | copyright = '原创文章_' if copyright == 11 else '非原创文章_' 49 | self.count = self.count + 1 50 | print('第【{}】篇文章'.format(self.count), copyright, title, title_child, article_url, cover) 51 | self.creat_pdf_file(article_url, '{}_{}'.format(copyright, title)) 52 | except: 53 | continue 54 | 55 | time.sleep(3) 56 | self.offset = all_datas['next_offset'] # 下一页的偏移量 57 | self.request_data() 58 | else: 59 | exit('数据抓取完毕!') 60 | else: 61 | exit('数据抓取出错:' + all_datas['errmsg']) 62 | 63 | def creat_pdf_file(self, url, title): 64 | try: 65 | file = 'D:/store/file2/{}.pdf'.format(title) 66 | if not os.path.exists(file): # 过滤掉重复文件 67 | pdfkit.from_url(url, file) 68 | 69 | except Exception as e: 70 | print(e) 71 | 72 | 73 | if __name__ == '__main__': 74 | d = mp_spider() 75 | d.request_data() 76 | 77 | -------------------------------------------------------------------------------- /公众号爬虫/mp_spider4.py: -------------------------------------------------------------------------------- 1 | import json 2 | from os import path 3 | 4 | import requests 5 | from scipy.misc import imread 6 | from wordcloud import WordCloud 7 | 8 | ''' 9 | 择取抖音评论 10 | ''' 11 | 12 | 13 | class mp_spider(object): 14 | 15 | def __init__(self): 16 | self.offset = 0 17 | self.count = 0 18 | self.base_comment_url = 'https://mp.weixin.qq.com/mp/appmsg_comment?action=getcomment&scene=0&__biz=MjM5NjA5NDczMg==&appmsgid=2652274724&idx=1&comment_id=303303606155886594&offset=0&limit=100&uin=MTIyOTkzMzgyMA%253D%253D&key=984e4c80c8bc7843fbc3177a66f8024c086af6b59a7ac97026e9f4db88fc49d0c26ce660040b865a3294ae651150d40227980433f1a5106b5a15261ad20d564aad1e8c6aa2dfda74fdd515af0bc77f1e&pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ%25252BBV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA%25252F&wxtoken=777&devicetype=Windows%26nbsp%3B10&clientversion=6206021f&appmsg_token=961_V5yXdClt1VInI19BnECwzmgi95G9e44nyElITVL5rKcbKbGDkLSLzLuTrUTO-TL3Zo_qNKEVSclPd8LG&x5=0&f=json' 19 | self.base_comment_header = { 20 | 'Host': 'mp.weixin.qq.com', 21 | 'Connection': 'keep-alive', 22 | 'Accept': '*/*', 23 | 'CSP': 'active', 24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI WindowsWechat QBCore/3.43.691.400 QQBrowser/9.0.2524.400', 25 | 'X-Requested-With': 'XMLHttpRequest', 26 | 'Referer': 
'https://mp.weixin.qq.com/s?__biz=MjM5NjA5NDczMg==&mid=2652274724&idx=1&sn=ad0bbb4461e20cdb5bb1563e6d20639d&chksm=bd0c56478a7bdf51db287ab8a6e054284f0a6aa9b475a3597e2f02c1a28a9ac0f085dab1820e&mpshare=1&scene=1&srcid=0603ZskndK5clppsBTw7kWWW&key=8799423f74e5608e8fddceb78f6442677bcc4589977665cb4aaf92376ab0b3acbf903998bd87428c0a2b8f8a724ce746d59882f43021889961664fd26aa68e05492d96213e1addea8cee62b98b6ebb76&ascene=1&uin=MTIyOTkzMzgyMA%3D%3D&devicetype=Windows+10&version=6206021f&lang=zh_CN&pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ%2BBV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA%2F&winzoom=1', 27 | 'Accept-Encoding': 'gzip, deflate', 28 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4', 29 | 'Cookie': 'rewardsn=; wxuin=1229933820; devicetype=Windows10; version=6206021f; lang=zh_CN; pass_ticket=xrtIeEFSb9ktVwLWcuMpduZ+BV6DrxwtLp5fn4E62xXSwYvNEvJQYumUDKuzbMA/; wap_sid2=CPyZvcoEElxMa0JKOS1tWHpPMFBlWFduNGRJbE9aUGFvNU9ja0poVXpKanpFSnVIQXpxbVUyVWNuZXlqQ2I3cDFvUmxlUGFIX2lFUDVGZ0dBTDBHRFFremh6Ml9vc0VEQUFBfjCikIrZBTgNQAE=; wxtokenkey=777' 30 | } 31 | 32 | def request_comment_data(self): 33 | response = requests.get(self.base_comment_url, headers=self.base_comment_header) 34 | if 200 == response.status_code: 35 | self.parse_comment_data(response.text) 36 | 37 | def parse_comment_data(self, response_data): 38 | 39 | all_datas = json.loads(response_data) 40 | 41 | if 0 == all_datas['base_resp']['ret']: 42 | all_comments = all_datas['elected_comment'] 43 | with open('抖音毁掉.txt', 'a', encoding='utf-8') as f: 44 | for comments in all_comments: 45 | name = comments['nick_name'] 46 | content = comments['content'] 47 | print(name, content) 48 | try: 49 | f.write(content + "\n") 50 | except Exception as e: 51 | print(e) 52 | continue 53 | 54 | self.create_word_cloud('抖音毁掉') 55 | else: 56 | exit('数据抓取出错:' + all_datas['errmsg']) 57 | 58 | def create_word_cloud(self,file_name): 59 | d = path.dirname(__file__) # __file__ 为当前文件, 60 | 61 | text = open(path.join(d, '{}.txt'.format(file_name)), encoding='utf-8').read() 62 | back_coloring = imread(path.join(d, 'douyin_bg.png')) # 设置背景图片 63 | 64 | wc = WordCloud(background_color="white", 65 | font_path='C:\Windows\Fonts\msyhl.ttc', 66 | max_words=5000, 67 | mask=back_coloring, 68 | # 设置有多少种随机生成状态,即有多少种配色方案 69 | random_state=30) 70 | # generate word cloud 71 | wc.generate(text) 72 | 73 | # store to file 74 | wc.to_file(path.join(d, "alice.png")) 75 | 76 | 77 | if __name__ == '__main__': 78 | d = mp_spider() 79 | d.request_comment_data() -------------------------------------------------------------------------------- /其它爬虫/book_py.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 爬取京东图书评价 5 | 6 | """ 7 | 8 | 9 | import time 10 | from selenium import webdriver 11 | from lxml import etree 12 | import sys 13 | reload(sys) 14 | sys.setdefaultencoding( "utf-8" ) 15 | 16 | class get_book(object): 17 | #获取浏览器驱动 18 | driver = webdriver.Firefox() 19 | 20 | # 浏览器窗口最大化 21 | driver.maximize_window() 22 | 23 | # 浏览器地址定向为qq登陆页面 24 | driver.get("https://item.jd.com/11993134.html#comment") 25 | 26 | # 切换到评价的tab 27 | driver.find_element_by_xpath('//*[@id="detail-tab-comm"]/a').click() 28 | 29 | while True: 30 | # 下拉滚动条,从1开始到3结束 分2次加载完每页数据 31 | for i in range(1,3): 32 | height = 20000*i#每次滑动20000像素 33 | strWord = "window.scrollBy(0,"+str(height)+")" 34 | driver.execute_script(strWord) 35 | time.sleep(4) 36 | 37 | selector = etree.HTML(driver.page_source) 38 | divs = selector.xpath('//*[@id="comment-0"]/div[1]/div/div') 39 | 
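        # Each matched div below is one JD review entry; './div[2]/div[1]/text()'
        # holds the review text (falling back to '' when an entry has no text
        # node), and every review is appended as one line to python_book.txt.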
40 | # mode =a 不清空连续写入 41 | with open('python_book.txt','a') as f: 42 | for div in divs: 43 | jd_conmment = div.xpath('./div[2]/div[1]/text()') 44 | jd_conmment = jd_conmment[0] if len(jd_conmment)>0 else '' 45 | f.write(jd_conmment+'\n') 46 | 47 | #分析得知当为最后一页时,最后的ui-pager-next不见了 48 | if driver.page_source.find('ui-pager-next') == -1: 49 | break 50 | 51 | # 找到“下一页”的按钮元素 52 | driver.find_element_by_class_name('ui-pager-next').click() 53 | 54 | # 因为在下一个循环里首先还要把页面下拉,所以要跳到外层的frame上 55 | driver.switch_to.parent_frame() 56 | 57 | if __name__=='__main__': 58 | get_book() 59 | 60 | -------------------------------------------------------------------------------- /其它爬虫/download_video.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import datetime 3 | import os 4 | import threading 5 | import time 6 | from contextlib import closing 7 | 8 | import requests 9 | from lxml import etree 10 | from selenium import webdriver 11 | from selenium.webdriver.common.by import By 12 | from selenium.webdriver.support.ui import WebDriverWait 13 | from selenium.webdriver.support import expected_conditions as EC 14 | 15 | 16 | class VideoDown(object): 17 | 18 | def __init__(self): 19 | self.first_position = 0 20 | self.count = 0 21 | self.threads = [] 22 | self.content = [] 23 | 24 | def load_data(self): 25 | 26 | video_url = 'http://neihanshequ.com/video/' 27 | driver = webdriver.Firefox() # 获取浏览器驱动 28 | driver.maximize_window() 29 | driver.implicitly_wait(10) # 控制间隔时间等待浏览器反映 30 | driver.get(video_url) 31 | 32 | while True: 33 | try: 34 | WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, 'loadMore'))) 35 | except Exception as e: 36 | print e.message 37 | break 38 | 39 | js = 'window.scrollTo(0,document.body.scrollHeight)' 40 | driver.execute_script(js) 41 | time.sleep(10) 42 | 43 | source = etree.HTML(driver.page_source) 44 | divs = source.xpath('//*[@id="detail-list"]/li') 45 | 46 | for div in divs: 47 | self.count = self.count + 1 48 | print '第{}条数据'.format(str(self.count)) 49 | title = div.xpath('./div/div[2]/a/div/p/text()') 50 | v_url = div.xpath('.//*[@class="player-container"]/@data-src') 51 | title = title[0] if len(title) > 0 else '无介绍'.format(str(self.count)) 52 | v_url = v_url[0] if len(v_url) > 0 else '' 53 | self.do_thread(title, v_url) 54 | 55 | try: 56 | load_more = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, 'loadMore'))) 57 | load_more.click() 58 | time.sleep(10) 59 | except Exception as e: 60 | print e.message 61 | break 62 | 63 | def do_thread(self, title, url): 64 | t = threading.Thread(target=self.down_video, args=(title, url)) 65 | self.threads.append(t) 66 | t.start() 67 | 68 | for tt in self.threads: 69 | tt.join() 70 | 71 | def down_video(self, title, url): 72 | try: 73 | with closing(requests.get(url, stream=True)) as response: 74 | print url 75 | chunk_size = 1024 76 | content_size = int(response.headers['content-length']) 77 | 78 | video_path = u'D:/store/video00' 79 | # 判断文件夹是否存在。 80 | if not os.path.exists(video_path): 81 | os.makedirs(video_path) 82 | 83 | file_name = video_path + u'/{}.mp4'.format(self.count) 84 | if os.path.exists(file_name) and os.path.getsize(file_name) == content_size: 85 | print(u'跳过' + file_name) 86 | else: 87 | down = DownProgress(title, content_size) 88 | with open(file_name, "wb") as f: 89 | for data in response.iter_content(chunk_size=chunk_size): 90 | f.write(data) 91 | 92 | down.refresh_down(len(data)) 93 | except Exception as e: 94 | print 
e.message 95 | 96 | 97 | class DownProgress(object): 98 | def __init__(self, file_name, file_size): 99 | self.file_name = file_name 100 | self.file_down = 0 101 | self.file_size = file_size 102 | 103 | def refresh_down(self, down): 104 | self.file_down = self.file_down + down 105 | progress = (self.file_down / float(self.file_size)) * 100.0 106 | status = u'下载完成' if self.file_down >= self.file_size else u'正在下载...' 107 | print u'文件名称:{},下载进度:{},下载状态:{}'.format(self.file_name, '%.2f' % progress, status) 108 | 109 | 110 | if __name__ == '__main__': 111 | startTime = datetime.datetime.now() 112 | down = VideoDown() 113 | down.load_data() 114 | endTime = datetime.datetime.now() 115 | print '下载花费时间{}秒'.format((endTime - startTime).seconds) 116 | -------------------------------------------------------------------------------- /其它爬虫/fenng_py.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 爬取冯大辉老师的微博 5 | 6 | """ 7 | 8 | 9 | import time 10 | from selenium import webdriver 11 | from lxml import etree 12 | import wordcloud as wcp 13 | import sys 14 | 15 | 16 | reload(sys) 17 | #这里需要指定字符编码 18 | sys.setdefaultencoding( "utf-8" ) 19 | 20 | def get_content(f_name): 21 | 22 | #你的微博帐号 23 | username = '******' 24 | psd = '******' 25 | 26 | #获取浏览器驱动 27 | driver = webdriver.Firefox() 28 | 29 | # 浏览器窗口最大化 30 | driver.maximize_window() 31 | 32 | driver.get('http://weibo.com/login.php') 33 | print('login............................') 34 | 35 | #给登录框与密码赋值 36 | driver.find_element_by_id('loginname').send_keys(username) 37 | driver.find_element_by_class_name('password').find_element_by_name('password').send_keys(psd) 38 | 39 | #点击登录按钮 40 | driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[6]/a/span').click() 41 | 42 | # 这里因为登录,需要有一个延时,不能直接切换到新网页去 43 | time.sleep(3) 44 | 45 | # 登录成功后,再用浏览器地址定向到大辉老师的微博列表页,没有什么技巧,自己去找的 46 | driver.get("https://weibo.com/p/1005051577826897/home?from=page_100505_profile&wvr=6&mod=data&is_all=1#place") 47 | 48 | while True: 49 | # 下拉滚动条,从1开始到3结束 分2次加载完每页数据 50 | for i in range(1,6): 51 | height = 20000*i#每次滑动20000像素 52 | strWord = "window.scrollBy(0,"+str(height)+")" 53 | driver.execute_script(strWord) 54 | time.sleep(4) 55 | 56 | selector = etree.HTML(driver.page_source) 57 | divs = selector.xpath('//*[@id="Pl_Official_MyProfileFeed__22"]/div/div/div[1]/div[4]') 58 | 59 | # mode =a 不清空连续写入 60 | with open('{}.txt'.format(f_name),'a') as f: 61 | for div in divs: 62 | wb_content = div.xpath('./div[3]/text()') 63 | wb_time = div.xpath('./div[2]/a/text()') 64 | wb_content = wb_content[0] if len(wb_content) > 0 else '' 65 | wb_time = wb_time[0] if len(wb_time) > 0 else '' 66 | wb_content = wb_content.strip() # 去掉左右两边的空格 67 | wb_time = wb_time.strip() 68 | print wb_content, wb_time 69 | f.write(wb_content+'\n') 70 | 71 | #分析得知当为最后一页时,最后的page next S_txt1 S_line1不见了 72 | if driver.page_source.find('page next S_txt1 S_line1') == -1: 73 | print '没有下一页了' 74 | break 75 | 76 | # 找到“下一页”的按钮元素,原本想用xpath与classname,都失败了 77 | # 这里我是用css来定位的,page next S_txt1 S_line1 在空格之间加'.' 
来连接 78 | submit = driver.find_element_by_css_selector('.page.next.S_txt1.S_line1') 79 | submit.click() 80 | 81 | if __name__ == '__main__': 82 | f_name = 'ddddd' 83 | get_content(f_name) 84 | wcp.create_word_cloud(f_name) 85 | -------------------------------------------------------------------------------- /其它爬虫/qq_send_word.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 爬取QQ 说说内容 5 | 6 | """ 7 | 8 | 9 | import time 10 | from selenium import webdriver 11 | from lxml import etree 12 | 13 | #这里一定要设置编码格式,防止后面写入文件时报错 14 | import sys 15 | reload(sys) 16 | sys.setdefaultencoding( "utf-8" ) 17 | 18 | friend = '563679994' # 朋友的QQ号,朋友的空间要求允许你能访问 19 | user = '563679994' # 你的QQ号 20 | pw = 'XXX' # 你的QQ密码 21 | 22 | #获取浏览器驱动 23 | driver = webdriver.Firefox() 24 | 25 | # 浏览器窗口最大化 26 | driver.maximize_window() 27 | 28 | # 浏览器地址定向为qq登陆页面 29 | driver.get("http://i.qq.com") 30 | 31 | # 所以这里需要选中一下frame,否则找不到下面需要的网页元素 32 | driver.switch_to.frame("login_frame") 33 | 34 | # 自动点击账号登陆方式 35 | driver.find_element_by_id("switcher_plogin").click() 36 | 37 | # 账号输入框输入已知qq账号 38 | driver.find_element_by_id("u").send_keys(user) 39 | 40 | # 密码框输入已知密码 41 | driver.find_element_by_id("p").send_keys(pw) 42 | 43 | # 自动点击登陆按钮 44 | driver.find_element_by_id("login_button").click() 45 | 46 | # 让webdriver操纵当前页 47 | driver.switch_to.default_content() 48 | 49 | # 跳到说说的url, friend你可以任意改成你想访问的空间 50 | driver.get("http://user.qzone.qq.com/" + friend + "/311") 51 | 52 | driver.find_element_by_xpath('//*[@id="QM_Mood_Poster_Container"]/div/div[4]/div[4]/a[2]').click() -------------------------------------------------------------------------------- /其它爬虫/qq_word.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | 3 | """ 4 | 爬取QQ 说说内容 5 | 6 | """ 7 | 8 | 9 | import time 10 | from selenium import webdriver 11 | from lxml import etree 12 | 13 | #这里一定要设置编码格式,防止后面写入文件时报错 14 | import sys 15 | reload(sys) 16 | sys.setdefaultencoding( "utf-8" ) 17 | 18 | friend = 'XXX' # 朋友的QQ号,朋友的空间要求允许你能访问 19 | user = 'XXX' # 你的QQ号 20 | pw = 'XXXX' # 你的QQ密码 21 | 22 | #获取浏览器驱动 23 | driver = webdriver.Firefox() 24 | 25 | # 浏览器窗口最大化 26 | driver.maximize_window() 27 | 28 | # 浏览器地址定向为qq登陆页面 29 | driver.get("http://i.qq.com") 30 | 31 | # 所以这里需要选中一下frame,否则找不到下面需要的网页元素 32 | driver.switch_to.frame("login_frame") 33 | 34 | # 自动点击账号登陆方式 35 | driver.find_element_by_id("switcher_plogin").click() 36 | 37 | # 账号输入框输入已知qq账号 38 | driver.find_element_by_id("u").send_keys(user) 39 | 40 | # 密码框输入已知密码 41 | driver.find_element_by_id("p").send_keys(pw) 42 | 43 | # 自动点击登陆按钮 44 | driver.find_element_by_id("login_button").click() 45 | 46 | # 让webdriver操纵当前页 47 | driver.switch_to.default_content() 48 | 49 | # 跳到说说的url, friend你可以任意改成你想访问的空间 50 | driver.get("http://user.qzone.qq.com/" + friend + "/311") 51 | 52 | next_num = 0 # 初始“下一页”的id 53 | while True: 54 | 55 | # 下拉滚动条,使浏览器加载出动态加载的内容, 56 | # 我这里是从1开始到6结束 分5 次加载完每页数据 57 | for i in range(1,6): 58 | height = 20000*i#每次滑动20000像素 59 | strWord = "window.scrollBy(0,"+str(height)+")" 60 | driver.execute_script(strWord) 61 | time.sleep(4) 62 | 63 | # 很多时候网页由多个或