.*?

'
18 | img_src_list = re.findall(ex, page_text, re.S)
19 | # print(img_src_list)
20 | for src in img_src_list:
21 | # Fetch the image data as bytes
22 | img_data = requests.get(url=src, headers=headers).content
23 | # Build the image file name from the URL
24 | img_name = src.split('/')[-1]
25 | imgPath = './imgLibs/' + img_name
26 | with open(imgPath, 'wb') as fp:
27 | fp.write(img_data)
28 | print(img_name, '下载成功')
29 |
--------------------------------------------------------------------------------
/基础篇/lxml&re/简历爬取.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | import os
4 |
5 | if __name__ == '__main__':
6 | # Spoof the User-Agent
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
9 | }
10 | url0 = 'https://sc.chinaz.com/jianli/free.html' # URL of the first page; free_1.html does not open directly
11 | url = 'https://sc.chinaz.com/jianli/free_%d.html'
12 | pageNum = 1
13 |
14 | download_list = []
15 | download_name_list = []
16 | # Create a folder for persistent storage
17 | if not os.path.exists('./CV_moban'):
18 | os.mkdir('./CV_moban')
19 | # Crawl page by page
20 | for pageNum in range(1, 3):
21 | if pageNum == 1:
22 | new_url = url0
23 | else:
24 | new_url = url % pageNum
25 | # Fetch the page and build an etree object
26 | page_text = requests.get(url=new_url, headers=headers).text
27 | tree = etree.HTML(page_text)
28 | # Collect the resume detail pages to visit
29 | CV_infor_list = tree.xpath('//div[@class="main_list jl_main"]/div')
30 | for cv in CV_infor_list:
31 | CV_src = cv.xpath('./a/@href')[0]
32 | CV_text = requests.get(url=CV_src, headers=headers).text
33 | ctree = etree.HTML(CV_text)
34 | # Extract the resume download link
35 | download_src = ctree.xpath('//div[@class="down_wrap"]/div[2]/ul/li/a/@href')[0]
36 | download_list.append(download_src)
37 | # Extract the resume name
38 | download_name = ctree.xpath('//div[@class="bgwhite"]/div//h1/text()')[0]
39 | download_name = download_name.encode('iso-8859-1').decode('utf-8') + '.rar'
40 | download_name_list.append(download_name)
41 |
42 | # Batch-download the resume templates
43 | # enumerate keeps each download link paired with its file name
44 | for i, cvv in enumerate(download_list):
45 | cv_content = requests.get(url=cvv, headers=headers).content
46 | cv_path = 'CV_moban/' + download_name_list[i]
47 | with open(cv_path, 'wb') as fp:
48 | fp.write(cv_content)
49 | print(download_name_list[i] + '下载完成!')
50 |
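The resume name above is re-encoded to iso-8859-1 bytes and decoded as utf-8 because requests guessed the wrong charset for the detail page. A hedged alternative (a sketch, not the script's method) is to declare the response encoding before reading .text:

    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}  # abbreviated UA string, for the sketch only
    resp = requests.get('https://sc.chinaz.com/jianli/free.html', headers=headers)
    resp.encoding = 'utf-8'   # declare the real charset up front
    page_text = resp.text     # decodes correctly, no encode/decode round-trip needed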
--------------------------------------------------------------------------------
/基础篇/request/01-Request.py:
--------------------------------------------------------------------------------
1 | # Basic use of the requests module
2 | import requests
3 |
4 | if __name__ == "__main__":
5 | # Target URL
6 | url = 'https://wz.sun0769.com/political/index/politicsNewest'
7 | # Send the request
8 | # get() returns a Response object
9 | response = requests.get(url=url)
10 | # Read the response body
11 | page_txt = response.text
12 | # Persist to disk
13 | with open('./sogou.html', 'w', encoding='utf-8') as fp:
14 | fp.write(page_txt)
15 | print('爬取数据结束!')
16 |
17 |
--------------------------------------------------------------------------------
/基础篇/request/02-(UA)网页采集器.py:
--------------------------------------------------------------------------------
1 | # UA detection (an anti-crawling measure): the site's server inspects the User-Agent of each request. If it identifies a browser, the request is treated as normal.
2 | # If it does not, the request is treated as abnormal and the server rejects it.
3 | # UA: User-Agent (the identity of the request carrier)
4 | # UA spoofing: make the crawler identify itself as a browser
5 | import requests
6 | if __name__ == '__main__':
7 | # Spoof the User-Agent
8 | headers = {
9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
10 | }
11 | url = 'https://www.sogou.com/web?'
12 | # Put the query-string parameters into a dict
13 | kw = input('enter a word:')
14 | param = {
15 | 'query': kw,
16 | }
17 | # The request to this url carries the parameters, and requests encodes them into the query string
18 | response = requests.get(url=url, params=param, headers=headers)
19 |
20 | page_text = response.text
21 | fileName = kw + '.html'
22 | with open(fileName, 'w', encoding='utf-8') as fp:
23 | fp.write(page_text)
24 | print(fileName, '保存成功!')
25 |
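The Scrapy middlewares later in this repo import USER_AGENTS and get_ua() from a fake_useragent module that is not included in this dump. A minimal stand-in (the UA strings below are examples, not the project's actual pool) might look like:

    import random

    # hypothetical stand-in for the project's fake_useragent helper
    USER_AGENTS = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
    ]

    def get_ua():
        # pick a random User-Agent for each outgoing request
        return random.choice(USER_AGENTS)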
--------------------------------------------------------------------------------
/基础篇/request/03-(POST)百度翻译.py:
--------------------------------------------------------------------------------
1 | # POST request (carrying form data)
2 | # The response body is JSON
3 | import requests
4 | import json
5 | if __name__ == '__main__':
6 | # Target URL
7 | post_url = 'https://fanyi.baidu.com/sug'
8 | # Spoof the User-Agent
9 | headers = {
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
11 | }
12 | # Build the POST form data
13 | word = input('enter a word:')
14 | data = {
15 | 'kw': word
16 | }
17 | # Send the request
18 | response = requests.post(url=post_url, data=data, headers=headers)
19 | # Read the response: .json() returns a Python object (call it only when the response really is JSON; see the sketch after this file)
20 | dic_obj = response.json()
21 | # Persist to disk
22 | filename = word + '.json'
23 | with open(filename, 'w', encoding='utf-8') as fp:
24 | json.dump(dic_obj, fp=fp, ensure_ascii=False)
25 |
26 | print('over!!')
27 |
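As the comment above notes, .json() should only be called when the server actually returned JSON. A small guard, offered as a sketch rather than part of the original script:

    import requests

    resp = requests.post('https://fanyi.baidu.com/sug', data={'kw': 'dog'})
    if 'application/json' in resp.headers.get('Content-Type', ''):
        print(resp.json())         # safe: the server declared JSON
    else:
        print(resp.text[:200])     # fall back to raw text for inspection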
--------------------------------------------------------------------------------
/基础篇/request/04-豆瓣电影爬取.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | if __name__ == '__main__':
4 | # Target URL
5 | url = 'https://movie.douban.com/j/chart/top_list'
6 | param = {
7 | 'type': '24',
8 | 'interval_id': '100:90',
9 | 'action': '',
10 | 'start': '1',
11 | 'limit': '20',
12 | }
13 | # Spoof the User-Agent
14 | headers = {
15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
16 | }
17 | response = requests.get(url=url, params=param, headers=headers)
18 | list_data = response.json()
19 | print(list_data)
20 | with open('./douban.json', 'w', encoding='utf-8') as fp:
21 | json.dump(list_data, fp=fp, ensure_ascii=False)
22 |
23 | print('Over!!')
24 |
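The start and limit parameters page through the ranking 20 movies at a time. A hedged sketch of walking a few pages of the same endpoint (the page count here is chosen arbitrarily):

    import requests

    url = 'https://movie.douban.com/j/chart/top_list'
    headers = {'User-Agent': 'Mozilla/5.0'}   # abbreviated for the sketch
    movies = []
    for start in range(0, 60, 20):            # three pages of 20
        param = {'type': '24', 'interval_id': '100:90', 'action': '',
                 'start': str(start), 'limit': '20'}
        movies.extend(requests.get(url, params=param, headers=headers).json())
    print(len(movies))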
--------------------------------------------------------------------------------
/基础篇/request/05-肯德基餐厅位置查询.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | if __name__ == '__main__':
4 | url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
5 | headers = {
6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
7 | }
8 |
9 | place = input('enter a place:')
10 | page = 1 # start from page 1
11 | fileName = place + 'KFC餐厅位置信息' + '.json'
12 | for i in range(0, 20): # generous upper bound so every page gets fetched
13 | param = {
14 | 'cname': '',
15 | 'pid': '',
16 | 'keyword': place, # place to search
17 | 'pageIndex': page, # page number
18 | 'pageSize': '10', # at most 10 results per page
19 | }
20 | response = requests.post(url=url, params=param, headers=headers)
21 | page_text = response.text
22 | # print(page_text)
23 | with open(fileName, 'a', encoding='utf-8') as fp:
24 | json.dump(page_text, fp=fp, ensure_ascii=False)
25 | fp.write('\n') # still inside the for loop: add a newline after each page
26 | page = page + 1 # advance the page counter (initialised to 1 above)
27 | print('over!!!')
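
Instead of a fixed loop bound, the paging could stop once a page comes back empty. A hedged sketch (the 'Table1' key is an assumption about the response layout, not verified here):

    import requests

    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
    headers = {'User-Agent': 'Mozilla/5.0'}   # abbreviated for the sketch
    page = 1
    while True:
        param = {'cname': '', 'pid': '', 'keyword': '上海',
                 'pageIndex': page, 'pageSize': '10'}
        stores = requests.post(url, params=param, headers=headers).json().get('Table1', [])
        if not stores:          # empty page: nothing left to fetch
            break
        print(f'page {page}: {len(stores)} stores')
        page += 1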
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/bossjob/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/bossjob/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/bossjob/bossjob/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/bossjob/fakeCookie.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | COOKIE_LIST = [
4 | 'wd_guid=544d13f9-f072-4fdc-9989-84452f1ecd52; historyState=state; _bl_uid=XtlO5cqLjv05qpj3t0d0nna8msI4; lastCity=101020100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1673095377,1673165470,1673257271,1673333037; boss_login_mode=sms; __fid=c58f56b0daac21ec5273e9b4b258f537; wt2=DY4IX_Pe18l5jPqD0AYgnA-G9UnTNtDaZ_zMhCpK7UovHjn5bKxYiZ6NtwTrfsFzsgpxFtIBCopvwd7HdvXTGrg~~; wbg=0; __zp_stoken__=887aefCE3dDAxC0wecFokLmdqeARKZz80V3cWbnglEDsONSs%2FVCMzL295aWdxVWw6Ry4PehcuLyROcX4mdTpZXyFXVEtiREADYGooaVQmYhwcSUtZVAQoNVpLLXZRQkdxBRc9G0QGUFhyNA0%3D; geek_zp_token=V1RN0kEOL031ZiVtRvyB4bLCuw6zrQxCo~; __l=l=%2Fwww.zhipin.com%2Fshanghai%2F&r=&g=&s=3&friend_source=0&s=3&friend_source=0; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1673349533; __c=1673333037; __a=68265253.1672926940.1673257271.1673333037.431.9.106.431'
5 | ]
6 |
7 |
8 | def cookie_dic():
9 | cookie_string = random.choice(COOKIE_LIST)
10 | cookie_dict = {}
11 | for kv in cookie_string.split(';'):
12 | # strip the space after ';' and split on the first '=' only, since values may contain '='
13 | k, v = kv.strip().split('=', 1)
14 | cookie_dict[k] = v
15 | return cookie_dict
16 |
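A hedged usage sketch (not code from the project): the dict returned by cookie_dic() can be attached to a plain requests call or to a Scrapy request; adjust the import path to wherever this module sits in your project:

    import requests
    import scrapy
    from fakeCookie import cookie_dic   # hypothetical import path

    # with requests
    resp = requests.get('https://www.zhipin.com', cookies=cookie_dic())

    # inside a Scrapy spider, per-request cookies work the same way
    req = scrapy.Request(url='https://www.zhipin.com', cookies=cookie_dic())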
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/bossjob/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class BossjobItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | pay = scrapy.Field() # salary
13 | job_name = scrapy.Field() # job title
14 | detail_url = scrapy.Field() # link to the job detail page
15 | company_name = scrapy.Field() # company name
16 | requirement = scrapy.Field() # requirements
17 | detail = scrapy.Field()
18 |
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/bossjob/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 | import pymysql
10 |
11 |
12 | class BossjobPipeline:
13 | def process_item(self, item, spider):
14 | print(item['detail'])
15 | return item
16 |
17 |
18 | class mysqlPipeLine(object):
19 | # database connection
20 | conn = None
21 | cursor = None
22 |
23 | def open_spider(self, spider):
24 | self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Spider',
25 | charset='utf8')
26 |
27 | def process_item(self, item, spider):
28 | self.cursor = self.conn.cursor()
29 |
30 | try:
31 | self.cursor.execute('insert into bossjob values(%s, %s, %s, %s, %s)', (
32 | item["company_name"], item["detail_url"], item["job_name"], item["pay"], item["requirement"]))
33 | self.conn.commit()
34 | print('成功插入', item['job_name'], '的工作信息到数据库中!')
35 | except Exception as e:
36 | print(e)
37 | self.conn.rollback()
38 |
39 | return item
40 |
41 | def close_spider(self, spider):
42 | if self.cursor:
43 | self.cursor.close()
44 | if self.conn:
45 | self.conn.close()
46 |
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/bossjob/requset.py:
--------------------------------------------------------------------------------
1 | from scrapy import Request
2 |
3 |
4 | class SeleniumRequest(Request):
5 | pass
6 |
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/bossjob/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/bossjob/spiders/boss.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import scrapy
4 | from lxml import etree
5 |
6 | from ..items import BossjobItem
7 |
8 |
9 | class BossSpider(scrapy.Spider):
10 | name = 'boss'
11 |
12 | def start_requests(self):
13 | for pageNum in range(51, 90):
14 | url = f'https://www.zhipin.com/wapi/zpgeek/mobile/search/joblist.json?page={pageNum}&city=101020100&query='
15 | yield scrapy.Request(url=url, callback=self.parse)
16 |
17 | def parse(self, response, **kwargs):
18 | res = json.loads(response.text)
19 | it = {'html': res['zpData']['html']}
20 | tree = etree.HTML(it['html'])
21 | li_list = tree.xpath('//li')
22 |
23 | for li in li_list:
24 | item = BossjobItem()
25 | job_name = li.xpath('./a/div[1]/span[1]/text()')[0]
26 | item['job_name'] = job_name
27 | detail_url = 'https://www.zhipin.com' + li.xpath('./a/@href')[0]
28 | item['detail_url'] = detail_url
29 | pay = li.xpath('a/div[1]/span[2]/text()')[0]
30 | item['pay'] = pay
31 | company_name = li.xpath('./a/div[2]/span[1]/text()')[0]
32 | item['company_name'] = company_name
33 | requirement = li.xpath('./a/div[3]//text()')
34 | req = ''
35 | for i in range(1, len(requirement)):
36 | req = req + requirement[i].strip() + ' '
37 | item['requirement'] = req
38 |
39 | yield item
40 |
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = bossjob.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = bossjob
12 |
--------------------------------------------------------------------------------
/基础篇/scrapy/bossjob/vimm_chrome_proxyauth_plugin.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/bossjob/vimm_chrome_proxyauth_plugin.zip
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/caipiao/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/caipiao/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/caipiao/caipiao/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/caipiao/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class CaipiaoItem(scrapy.Item):
10 | qihao = scrapy.Field()
11 | red_ball = scrapy.Field()
12 | blue_ball = scrapy.Field()
13 |
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/caipiao/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 | import random
6 | from time import sleep
7 |
8 | from scrapy import signals
9 |
10 | # useful for handling different item types with a single interface
11 | from itemadapter import is_item, ItemAdapter
12 | from scrapy.http import HtmlResponse
13 |
14 | from .fake_useragent import USER_AGENTS
15 |
16 |
17 | class CaipiaoDownloaderMiddleware:
18 | # Not all methods need to be defined. If a method is not defined,
19 | # scrapy acts as if the downloader middleware does not modify the
20 | # passed objects.
21 |
22 | def process_request(self, request, spider):
23 | # Spoof the User-Agent
24 | request.headers['User-Agent'] = random.choice(USER_AGENTS)
25 | return None
26 |
27 | def process_response(self, request, response, spider):
28 | bro = spider.bro
29 | bro.get(request.url)
30 | sleep(0.5)
31 | bro.find_element_by_xpath('//*[@id="link248"]/img').click()
32 | start = bro.find_element_by_id('from')
33 | start.clear()
34 | start.send_keys('16001')
35 | end = bro.find_element_by_id('to')
36 | end.clear()
37 | end.send_keys('23004')
38 | bro.find_element_by_id('link176').click()
39 | page_text = bro.page_source
40 | new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
41 |
42 | return new_response
43 |
44 | def process_exception(self, request, exception, spider):
45 |
46 | pass
47 |
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/caipiao/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 | import pymysql
10 |
11 | '''
12 | Storage options for the scraped data:
13 | 1. save to a csv file
14 | 2. save to a mysql database
15 | 3. save to a mongodb database (a hedged sketch follows this file)
16 | 4. plain file storage
17 | '''
18 |
19 |
20 | class CaipiaoPipeline:
21 |
22 | def open_spider(self, spider):
23 | print('开始存储!')
24 | self.f = open('./双色球.csv', mode='w', encoding='utf-8')
25 | self.f.write("期数,红球号码,蓝球号码\n")
26 |
27 | def close_spider(self, spider):
28 | print('存储完毕!')
29 | if self.f:
30 | self.f.close()
31 |
32 | def process_item(self, item, spider):
33 | # print(item)
34 | self.f.write(f"{item['qihao']},{' '.join(item['red_ball'])},{item['blue_ball']}\n")
35 | return item
36 |
37 |
38 | class mySQLPipeline:
39 |
40 | def open_spider(self, spider):
41 | print('开始存储!')
42 | self.conn = pymysql.Connect(
43 | host="localhost",
44 | port=3306,
45 | user="root",
46 | password="",
47 | database="spider"
48 | )
49 |
50 | def close_spider(self, spider):
51 | print('存储完毕!')
52 | if self.conn:
53 | self.conn.close()
54 |
55 | def process_item(self, item, spider):
56 | cur = self.conn.cursor()
57 | sql = "insert into caipiao values(%s, %s, %s)"
58 | try:
59 | cur.execute(sql, (item['qihao'], ' '.join(item['red_ball']), item['blue_ball']))
60 | self.conn.commit()
61 | except Exception as e:
62 | print(e)
63 | self.conn.rollback()
64 | return item
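
For storage option 3 listed above, a hedged MongoDB pipeline sketch (pymongo assumed to be installed; the database and collection names are made up, and the class would still need to be registered in ITEM_PIPELINES):

    import pymongo

    class MongoPipeline:
        def open_spider(self, spider):
            # connect once when the spider starts
            self.client = pymongo.MongoClient('mongodb://localhost:27017')
            self.col = self.client['spider']['caipiao']   # db / collection names are assumptions

        def process_item(self, item, spider):
            self.col.insert_one(dict(item))   # scrapy items convert cleanly to dicts
            return item

        def close_spider(self, spider):
            self.client.close()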
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/caipiao/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for caipiao project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'caipiao'
11 |
12 | SPIDER_MODULES = ['caipiao.spiders']
13 | NEWSPIDER_MODULE = 'caipiao.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'caipiao (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = False
21 | LOG_LEVEL = 'WARNING'
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
48 | # SPIDER_MIDDLEWARES = {
49 | # 'caipiao.middlewares.CaipiaoSpiderMiddleware': 543,
50 | # }
51 |
52 | # Enable or disable downloader middlewares
53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
54 | DOWNLOADER_MIDDLEWARES = {
55 | 'caipiao.middlewares.CaipiaoDownloaderMiddleware': 543,
56 | }
57 |
58 | # Enable or disable extensions
59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
66 | ITEM_PIPELINES = {
67 | 'caipiao.pipelines.CaipiaoPipeline': 300,
68 | }
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/caipiao/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/caipiao/spiders/seq.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from ..items import CaipiaoItem
3 | from selenium import webdriver
4 | from selenium.webdriver import ChromeOptions
5 |
6 | class SeqSpider(scrapy.Spider):
7 | name = 'seq'
8 | # allowed_domains = ['www.xxx.com']
9 | start_urls = ['https://datachart.500.com/ssq/']
10 |
11 | def __init__(self, **kwargs):
12 | # Configure Chrome so that selenium is less likely to be detected
13 | super().__init__(**kwargs)
14 | option = ChromeOptions()
15 | option.add_experimental_option('excludeSwitches', ['enable-automation'])
16 | option.add_experimental_option('excludeSwitches', ['enable-logging'])
17 | option.add_argument("--no-sandbox")
18 | option.add_argument("--disable-dev-shm-usage")
19 | option.add_argument("--window-size=1920,1080") # 建议设置窗口大小
20 | option.add_argument('--headless')
21 | option.add_argument('--disable-gpu')
22 | # option.add_argument('blink-settings=imagesEnabled=false')
23 | self.bro = webdriver.Chrome(executable_path=r'D:\爬虫\selenium\chromedriver.exe', options=option)
24 |
25 | def closed(self, spider):
26 | self.bro.quit()
27 |
28 | def parse(self, response):
29 | tr_list = response.xpath('//*[@id="tdata"]/tr')
30 | for tr in tr_list:
31 | item = CaipiaoItem()
32 | # Skip the rows that carry no draw data
33 | if tr.xpath('./@class').extract_first() == 'tdbck':
34 | continue
35 | qishu = tr.xpath('./td[1]/text()').extract_first().strip()
36 | # xpath works too: red_ball = tr.xpath('./td[@class="chartBall01"]/text()').extract()
37 | red_ball = tr.css(".chartBall01::text").extract()
38 | blue_ball = tr.css(".chartBall02::text").extract_first()
39 | item['qihao'] = qishu
40 | item['red_ball'] = red_ball
41 | item['blue_ball'] = blue_ball
42 |
43 | yield item
44 |
--------------------------------------------------------------------------------
/基础篇/scrapy/caipiao/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = caipiao.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = caipiao
12 |
--------------------------------------------------------------------------------
/基础篇/scrapy/imgsPro/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/imgsPro/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/imgsPro/imgsPro/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/imgsPro/imgsPro/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/imgsPro/imgsPro/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class ImgsproItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | img_name = scrapy.Field()
13 | img_src = scrapy.Field()
14 |
--------------------------------------------------------------------------------
/基础篇/scrapy/imgsPro/imgsPro/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | import scrapy
9 | from itemadapter import ItemAdapter
10 | from scrapy.pipelines.images import ImagesPipeline
11 |
12 |
13 | class ImgsproPipeline:
14 | def process_item(self, item, spider):
15 | print(item)
16 | return item
17 |
18 |
19 | class imgsPipeLine(ImagesPipeline):
20 |
21 | # Request the image data for each image URL
22 | def get_media_requests(self, item, info):
23 | yield scrapy.Request(item['img_src'])
24 |
25 | # File name (relative to IMAGES_STORE) under which each image is saved
26 | def file_path(self, request, response=None, info=None, *, item):
27 | imgName = item['img_name']
28 | return imgName
29 |
30 | def item_completed(self, results, item, info):
31 | return item # hand the item on to the next pipeline class
32 |
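The imgsPro settings file is not part of this dump. For an ImagesPipeline subclass like the one above to run, the pipeline has to be enabled and an IMAGES_STORE directory configured (Pillow installed as well). A hedged sketch of the relevant settings lines, with an assumed priority and folder name:

    # in imgsPro/settings.py (sketch)
    ITEM_PIPELINES = {
        'imgsPro.pipelines.imgsPipeLine': 300,
    }
    IMAGES_STORE = './imgs'   # folder where ImagesPipeline writes the downloaded files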
--------------------------------------------------------------------------------
/基础篇/scrapy/imgsPro/imgsPro/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/基础篇/scrapy/imgsPro/imgsPro/spiders/img.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from ..items import ImgsproItem
3 | import re
4 |
5 |
6 | class ImgSpider(scrapy.Spider):
7 | name = 'img'
8 | # allowed_domains = ['www.xxx.com']
9 | start_urls = ['https://sc.chinaz.com/tupian//']
10 | page_num = 2
11 |
12 | def parse(self, response):
13 |
14 | div_list = response.xpath('/html/body/div[3]/div[2]/div')
15 | for div in div_list:
16 | item = ImgsproItem()
17 | img_name = div.xpath('./img/@alt').extract()
18 | img_name = ''.join(img_name) + '.jpg'
19 | item['img_name'] = img_name
20 | img_src = div.xpath('./img/@data-original').extract()
21 | img_src = 'https:' + ''.join(img_src)
22 | # Drop the _s suffix: URLs with _s are thumbnails, without it they point to the full-size image
23 | s = re.sub('_s', '', img_src)
24 | item['img_src'] = s
25 |
26 | yield item
27 | # An alternative way to paginate
28 | if self.page_num <= 3:
29 | new_url = f'https://sc.chinaz.com/tupian/index_{self.page_num}.html'
30 | self.page_num += 1
31 |
32 | yield scrapy.Request(new_url, callback=self.parse)
--------------------------------------------------------------------------------
/基础篇/scrapy/imgsPro/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = imgsPro.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = imgsPro
12 |
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/paper/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/paper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/paper/paper/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/paper/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class PaperItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | pass
13 |
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/paper/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 |
6 | from scrapy import signals
7 |
8 | # useful for handling different item types with a single interface
9 | from itemadapter import is_item, ItemAdapter
10 | from .fake_useragent import get_ua
11 |
12 |
13 | class PaperDownloaderMiddleware:
14 |
15 | def process_request(self, request, spider):
16 | # Spoof the User-Agent
17 | headers = get_ua()
18 | request.headers['User-Agent'] = headers
19 | return None
20 |
21 | def process_response(self, request, response, spider):
22 | return response
23 |
24 | def process_exception(self, request, exception, spider):
25 | pass
26 |
27 |
28 | class CookieDownloaderMiddleware(object):
29 | def process_request(self, request, spider):
30 | cookie_dict = self.get_cookies()
31 | request.cookies = cookie_dict
32 |
33 | def get_cookies(self):
34 | # paste a logged-in cookie string here
35 | cookie_string = ''
36 | cookie_dict = {}
37 | for kv in cookie_string.split(';'):
38 | # split on the first '=' only, since cookie values may contain '='
39 | k, v = kv.strip().split('=', 1)
40 | cookie_dict[k] = v
41 | return cookie_dict
42 |
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/paper/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class PaperPipeline:
12 | def process_item(self, item, spider):
13 | return item
14 |
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/paper/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for paper project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'paper'
11 |
12 | SPIDER_MODULES = ['paper.spiders']
13 | NEWSPIDER_MODULE = 'paper.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'paper (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = False
21 |
22 | LOG_LEVEL = 'WARNING'
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'paper.middlewares.PaperSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | 'paper.middlewares.PaperDownloaderMiddleware': 543,
57 | }
58 |
59 | # Enable or disable extensions
60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'paper.pipelines.PaperPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/paper/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/paper/spiders/page.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 |
3 |
4 | class PageSpider(scrapy.Spider):
5 | name = 'page'
6 | # allowed_domains = ['www.xxx.com']
7 | start_urls = ['https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919']
8 |
9 | def start_requests(self):
10 | url = 'https://passport.17k.com/ck/user/login'
11 | username = ''
12 | password = ''
13 |
14 | # Send the login POST (form-encoded)
15 | yield scrapy.FormRequest(
16 | url=url,
17 | formdata={
18 | 'loginName': username,
19 | 'password': password
20 | },
21 | callback=self.parse
22 | )
23 |
24 | def parse(self, response, **kwargs):
25 | yield scrapy.Request(url=self.start_urls[0], callback=self.detail_parse)
26 |
27 | def detail_parse(self, response):
28 | print(response.json())
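
The same login-then-fetch flow can be reproduced outside Scrapy with a requests session, which keeps the login cookies automatically. A hedged sketch (credentials intentionally left blank; URLs and field names taken from the spider above):

    import requests

    session = requests.Session()   # the session stores the cookies set by the login response
    session.post('https://passport.17k.com/ck/user/login',
                 data={'loginName': '', 'password': ''})   # fill in real credentials
    shelf = session.get('https://user.17k.com/ck/author/shelf?page=1&appKey=2406394919')
    print(shelf.json())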
--------------------------------------------------------------------------------
/基础篇/scrapy/paper/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = paper.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = paper
12 |
--------------------------------------------------------------------------------
/基础篇/scrapy/sunPro/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/sunPro/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/sunPro/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sunPro.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sunPro
12 |
--------------------------------------------------------------------------------
/基础篇/scrapy/sunPro/sunPro/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/sunPro/sunPro/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/sunPro/sunPro/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class SunproItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | number = scrapy.Field()
13 | title = scrapy.Field()
14 | status = scrapy.Field()
15 | content = scrapy.Field()
16 | city = scrapy.Field()
17 | time = scrapy.Field()
18 |
19 | # class DetailItem(scrapy.Item):
20 | # # define the fields for your item here like:
21 | # # name = scrapy.Field()
22 | # id = scrapy.Field()
23 | # content = scrapy.Field()
--------------------------------------------------------------------------------
/基础篇/scrapy/sunPro/sunPro/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | import pymysql
9 | from itemadapter import ItemAdapter
10 |
11 |
12 | # class SunproPipeline:
13 | # def process_item(self, item, spider):
14 | # # how to tell which item type this is
15 | # # when writing to the database, how to keep the data consistent
16 | # if item.__class__.__name__ == 'DetailItem':
17 | # print(item['id'], item['content'])
18 | # else:
19 | # print(item['number'], item['title'])
20 | # return item
21 |
22 |
23 | class mysqlPipeLine(object):
24 | # database connection
25 | conn = None
26 | cursor = None
27 |
28 | def open_spider(self, spider):
29 | self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Bossjob', charset='utf8')
30 |
31 | def process_item(self, item, spider):
32 | self.cursor = self.conn.cursor()
33 |
34 | try:
35 | self.cursor.execute('insert into new values(%s, %s, %s, %s, %s, %s)',
36 | (item['number'], item['title'], item['content'], item['status'], item['city'], item['time']))
37 | self.conn.commit()
38 | print('成功插入编号为', item['number'], '的数据!')
39 | except Exception as e:
40 | print(e)
41 | print('error!')
42 | self.conn.rollback()
43 |
44 | return item
45 |
46 | def close_spider(self, spider):
47 | self.cursor.close()
48 | self.conn.close()
49 |
--------------------------------------------------------------------------------
/基础篇/scrapy/sunPro/sunPro/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for sunPro project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'sunPro'
11 |
12 | SPIDER_MODULES = ['sunPro.spiders']
13 | NEWSPIDER_MODULE = 'sunPro.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'sunPro (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = False
21 | LOG_LEVEL = 'ERROR'
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'sunPro.middlewares.SunproSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
54 | DOWNLOADER_MIDDLEWARES = {
55 | 'sunPro.middlewares.RandomuaDownloaderMiddleware': 543,
56 | 'sunPro.middlewares.CookieDownloaderMiddleware': 400,
57 | 'sunPro.middlewares.SunproDownloaderMiddleware': 300,
58 | }
59 |
60 | # Enable or disable extensions
61 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'sunPro.pipelines.mysqlPipeLine': 200,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
--------------------------------------------------------------------------------
/基础篇/scrapy/sunPro/sunPro/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/基础篇/scrapy/sunPro/sunPro/spiders/sun.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import scrapy
4 | from scrapy.linkextractors import LinkExtractor
5 | from scrapy.spiders import CrawlSpider, Rule
6 | from selenium import webdriver
7 | from selenium.webdriver import ChromeOptions
8 | from ..items import SunproItem
9 |
10 |
11 | class SunSpider(CrawlSpider):
12 | name = 'sun'
13 | # allowed_domains = ['www.xxx.com']
14 | start_urls = ['https://wz.sun0769.com/political/index/politicsNewest']
15 |
16 | # Create a browser instance
17 | def __init__(self, **kwargs):
18 | # Configure Chrome so that selenium is less likely to be detected
19 | super().__init__(**kwargs)
20 | option = ChromeOptions()
21 | option.add_experimental_option('excludeSwitches', ['enable-automation'])
22 | option.add_experimental_option('excludeSwitches', ['enable-logging'])
23 | option.add_argument("--no-sandbox")
24 | option.add_argument("--disable-dev-shm-usage")
25 | option.add_argument("--window-size=1920,1080") # 建议设置窗口大小
26 | option.add_argument('--headless')
27 | option.add_argument('--disable-gpu')
28 | option.add_argument('blink-settings=imagesEnabled=false')
29 | self.bro = webdriver.Chrome(executable_path=r'D:\爬虫\selenium\chromedriver.exe', options=option)
30 |
31 | def closed(self, spider):
32 | self.bro.quit()
33 |
34 | # Link extractor: pulls out links that match the given rule (allow=r'regular expression')
35 | link = LinkExtractor(allow=r'id=1&page=\d', restrict_xpaths='/html/body/div[2]/div[3]/div[3]/div/a')
36 | # link_detail = LinkExtractor(restrict_xpaths='/html/body/div[2]/div[3]/ul[2]/li/span[3]/a')
37 |
38 | rules = (
39 | # Rule: the pages behind the extracted links are parsed with the given callback
40 | # follow=True: keep applying the link extractor to the pages it reaches, so every listing page gets crawled
41 | Rule(link, callback='parse_item', follow=True),
42 | # Rule(link_detail, callback='parse_detail'),
43 | )
44 |
45 | # Parse the complaint number and title
46 | def parse_item(self, response):
47 | li_list = response.xpath('/html/body/div[2]/div[3]/ul[2]/li')
48 | for li in li_list:
49 | item = SunproItem()
50 | number = li.xpath('./span[1]/text()').extract_first()
51 | item['number'] = number
52 | status = li.xpath('./span[2]/text()').extract_first().strip()
53 | item['status'] = status
54 | title = li.xpath('./span[3]/a/text()').extract_first()
55 | item['title'] = title
56 | detail_url = 'https://wz.sun0769.com' + li.xpath('./span[3]/a/@href').extract_first()
57 |
58 | yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item})
59 |
60 | # Parse the complaint body
61 | def parse_detail(self, response):
62 | item = response.meta['item']
63 | content = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre//text()').extract()
64 | content = ''.join(content)
65 | item['content'] = content
66 | city = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[2]/text()').extract_first()
67 | c = re.sub(' 来自:', '', city)
68 | C = re.sub(' ', '', c)
69 | item['city'] = C
70 | time = response.xpath('/html/body/div[3]/div[2]/div[2]/div[1]/span[3]/text()').extract_first()
71 | item['time'] = time
72 | # print(item)
73 | yield item
74 |
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/wangyi/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/news.txt:
--------------------------------------------------------------------------------
1 | (1)加拿大将为乌克兰购买美制防空系统 俄方:荒谬:
2 |
3 | 来源:环球网【环球网报道 见习记者 李律杉】据路透社报道,美加两国元首在墨西哥城会晤后,加拿大总理特鲁多办公室周二(10日)发表声明称,加拿大将为乌克兰购买美国制造的“国家先进地对空导弹系统”(NASAMS)。报道披露,当天特鲁多和拜登正在墨西哥参加第十届北美领导人峰会,两人在支持乌克兰方面进行了单独会晤。在此期间,特鲁多告诉拜登,加拿大将为乌克兰购买美制地空导弹系统一事。“这是加拿大首次向乌克兰捐赠防空系统。”加拿大国防部长安妮塔·阿南德在推特上写道。她还表示,乌克兰防长列兹尼科夫10日早些时候在电话中告诉她,得到防空系统是乌克兰的首要任务。阿南德介绍称,NASAMS是一种中短程地面防空系统,可抵御无人机、导弹和飞机的攻击。对于加拿大这一援乌决定,俄罗斯驻加拿大大使奥列格·斯捷潘诺夫作出回应。据俄罗斯卫星通讯社报道,斯捷潘诺夫在得知此事表示,“特鲁多总理的内阁把钱花在(进一步)激化战争上,支持一个距离加拿大上千公里之外的非法政权,这看起来很荒谬。”“尤其荒谬的是,(这是)在加拿大目前国内还面临着各种问题的背景下(做出的决定)。”另外,根据加拿大总理办公室的声明,特鲁多和拜登还就加拿大皇家空军采购F-35战斗机一事展开讨论。据央视新闻报道,加拿大国防部长安妮塔·阿南德当地时间1月9日宣布,加拿大已经签署了购买F-35战机的最终合同,初期购买金额达190亿加元。据悉,这88架战机中的第一架将在2026年之前交付,而第一批F-35中队将在2029年之前投入使用。
4 |
5 | (35)台媒:57架次解放军军机进入台岛周边 "异常紧张":
6 |
7 | 来源:环球网【环球网报道】“解放军对台打击军演 57架次共机‘三面围台’ 我战机与地面飞弹紧盯”,中国人民解放军东部战区1月8日位台岛周边海空域组织诸军兵种联合战备警巡和实战化演练第二天,台湾中时新闻网以此为题渲染“气氛异常紧张”。台防务部门9日的说法宣称,自8日上午6时至9日上午6时止,“侦获”解放军军机57架次(其中28架次逾越“台海中线”)、军舰4艘次,持续在台湾海峡周边活动。8日夜,东部战区新闻发言人施毅陆军大校表示,当天中国人民解放军东部战区位台岛周边海空域组织诸军兵种联合战备警巡和实战化演练,重点演练对陆打击、对海突击等内容,旨在检验部队联合作战能力,坚决反击外部势力、“台独”分裂势力勾连挑衅行径。中时新闻网9日称,解放军军机“扰台”范围明显扩大且集中在8日夜间,台空军战机整夜不断紧急升空,地面导弹部队更是进入高度警戒。台军还声称,运用任务机、舰艇及岸基导弹系统“严密监控”与“应处”。中时新闻网还称,台各空军基地8日晚气氛异常紧张,从北到南甚至东部,各基地战机接连紧急起飞,架次比平常多,状况如去年大陆军演一般,不少住在基地周边的民众都感觉到一丝不寻常的气氛,直到解放军东部战区发文,才知道原因是大陆进行演练。此次演习距东部战区位台岛周边海空域演习还不到半个月,2022年12月25日,中国人民解放军东部战区位台岛周边海空域组织诸军兵种联合战备警巡和联合火力打击演练。这是针对当前美台升级勾连挑衅的坚决回应。此前的12月23日,美国总统拜登签署“2023财年国防授权法案”,其中一项内容是未来5年将对台提供总额100亿美元、每年最多20亿美元的“军事援助”。该法案还要求“加速处理台湾军购请求”,并建议邀请台湾参与2024年“环太平洋军演”。这些严重违反一个中国原则和中美三个联合公报规定的恶性条款,给台海和平稳定造成严重损害。
8 |
9 | (34)德媒:柏林正疯狂寻找向基辅承诺的40辆步兵战车:
10 |
11 | 来源:中国新闻网中新网1月9日电 据德国《明镜》周刊报道,德国正在“疯狂地”寻找给乌克兰承诺的40辆“黄鼠狼”步兵战车,柏林将不得不从自己的武装力量储备中取出所承诺战车的大部分。报道称,德国总理朔尔茨此前曾向基辅承诺了40辆“黄鼠狼”步兵战车,目前联邦政府正在疯狂地寻找承诺的步兵战车。“德国政府尚未准备好供应此类军备,这就是为什么德国国防军必须清空其仓库,但它储备状态其实已经很差了。”德国联邦议院议员亨宁·奥特说道。报道指出,当政府决定将“黄鼠狼”步兵战车交付给乌克兰,德国军方、政界人士和安全专家都开始怀疑柏林将从哪里获得承诺的设备。朔尔茨的话“没那么容易实现”。消息显示,德国国防企业莱茵金属(Rheinmetall)公司库存有近60辆有缺陷的“黄鼠狼”步兵战车,但将其升级会需要很长时间。据报道,德国总理朔尔茨与美国总统拜登5日通电话,就向基辅运送重型军事装备达成一致。随后德国宣布,拟向乌克兰供应40辆“黄鼠狼”步兵战车和1枚“爱国者”防空导弹。乌克兰局势升级以来,德国已向乌克兰提供价值22.5亿欧元的武器和军事装备。
12 |
13 |
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = wangyi.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = wangyi
12 |
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/wangyi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/wangyi/wangyi/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/wangyi/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class WangyiItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | title = scrapy.Field()
13 | content = scrapy.Field()
14 | number = scrapy.Field()
15 |
16 |
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/wangyi/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 | import random
6 |
7 | from scrapy import signals
8 |
9 | # useful for handling different item types with a single interface
10 | from itemadapter import is_item, ItemAdapter
11 |
12 | from .fake_useragent import USER_AGENTS
13 | from scrapy.http import HtmlResponse
14 | from time import sleep
15 |
16 |
17 | class WangyiDownloaderMiddleware:
18 | # Not all methods need to be defined. If a method is not defined,
19 | # scrapy acts as if the downloader middleware does not modify the
20 | # passed objects.
21 |
22 | def process_request(self, request, spider):
23 | # Spoof the User-Agent
24 | request.headers['User-Agent'] = random.choice(USER_AGENTS)
25 | return None
26 |
27 | def process_response(self, request, response, spider):
28 | # Pick out the specific responses that need to be replaced
29 | # the request is located by its url, and the response by its request
30 | # the dynamically loaded content is fetched with selenium
31 | bro = spider.bro
32 |
33 | if request.url in spider.models_url:
34 | # These are the responses for the five news sections
35 | # only these responses get tampered with
36 | # build a new response object carrying the dynamically loaded news and swap it in for the old one
37 | bro.get(request.url)
38 | sleep(0.5)
39 | bro.execute_script('window.scrollTo(0,10000)')
40 | page_text = bro.page_source
41 | # self.fp = open('./news.html', 'w', encoding='utf-8')
42 | # self.fp.write(page_text)
43 | # self.fp.close()
44 | new_response = HtmlResponse(url=request.url, body=page_text, encoding='utf-8', request=request)
45 |
46 | return new_response
47 | else:
48 | # 其他请求对应的响应对象
49 | return response
50 |
51 | def process_exception(self, request, exception, spider):
52 | pass
53 |
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/wangyi/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 |
10 |
11 | class WangyiPipeline(object):
12 | fp = None
13 |
14 | # Override the parent method: called exactly once when the spider starts
15 | def open_spider(self, spider):
16 | print('开始爬虫!')
17 | self.fp = open('./news.txt', 'w', encoding='utf-8')
18 |
19 | # Handles item objects
20 | # receives the items yielded by the spider file
21 | # called once for every item received
22 | def process_item(self, item, spider):
23 | title = item['title']
24 | content = item['content']
25 | number = item['number']
26 | print('正在下载第', number, '个新闻。。。')
27 | # Persist to disk
28 | self.fp.write('(' + str(number) + ')' + title + ':' + '\n' + content + '\n')
29 |
30 | return item # hand the item on to the next pipeline class
31 |
32 | # Override the parent method: called once when the spider closes
33 | def close_spider(self, spider):
34 | print('结束爬虫!')
35 | self.fp.close()
36 |
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/wangyi/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for wangyi project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'wangyi'
11 |
12 | SPIDER_MODULES = ['wangyi.spiders']
13 | NEWSPIDER_MODULE = 'wangyi.spiders'
14 |
15 | LOG_LEVEL = 'ERROR'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | #USER_AGENT = 'wangyi (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'wangyi.middlewares.WangyiSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
54 | DOWNLOADER_MIDDLEWARES = {
55 | 'wangyi.middlewares.WangyiDownloaderMiddleware': 543,
56 | }
57 |
58 | # Enable or disable extensions
59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
66 | ITEM_PIPELINES = {
67 | 'wangyi.pipelines.WangyiPipeline': 300,
68 | }
69 |
70 | # Enable and configure the AutoThrottle extension (disabled by default)
71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
72 | #AUTOTHROTTLE_ENABLED = True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY = 5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY = 60
77 | # The average number of requests Scrapy should be sending in parallel to
78 | # each remote server
79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
80 | # Enable showing throttling stats for every response received:
81 | #AUTOTHROTTLE_DEBUG = False
82 |
83 | # Enable and configure HTTP caching (disabled by default)
84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
85 | #HTTPCACHE_ENABLED = True
86 | #HTTPCACHE_EXPIRATION_SECS = 0
87 | #HTTPCACHE_DIR = 'httpcache'
88 | #HTTPCACHE_IGNORE_HTTP_CODES = []
89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
90 |
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/wangyi/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/基础篇/scrapy/wangyi/wangyi/spiders/news.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from selenium import webdriver
3 | from selenium.webdriver import ChromeOptions
4 | from ..items import WangyiItem
5 |
6 |
7 | class NewsSpider(scrapy.Spider):
8 | name = 'news'
9 | # allowed_domains = ['www.xxx.com']
10 | start_urls = ['https://news.163.com/']
11 | models_url = [] # 存放板块的详情页url
12 | number = 1
13 |
14 | # 实例化一个浏览器对象
15 | def __init__(self, **kwargs):
16 | # 实现让selenium规避被检测到的风险
17 | super().__init__(**kwargs)
18 | option = ChromeOptions()
19 |         # excludeSwitches 重复设置会相互覆盖,这里合并成一次传入
20 |         option.add_experimental_option('excludeSwitches', ['enable-automation', 'enable-logging'])
21 |         option.add_experimental_option('useAutomationExtension', False)
22 | option.add_argument("--no-sandbox")
23 | option.add_argument("--disable-dev-shm-usage")
24 | option.add_argument("--window-size=1920,1080") # 建议设置窗口大小
25 | option.add_argument('--headless')
26 | option.add_argument('--disable-gpu')
27 |         self.bro = webdriver.Chrome(executable_path=r'D:\爬虫\selenium\chromedriver.exe', options=option)
28 |
29 | def closed(self, spider):
30 | self.bro.quit()
31 |
32 | # 解析每一个板块对应的详情页url
33 | # 每一个板块对应新闻相关的内容都是动态加载出来的
34 | def detail_parse(self, response):
35 | div_list = response.xpath('//div[@class="ndi_main"]/div[@class="data_row news_article clearfix news_first"] | //div[@class="ndi_main"]/div[@class="data_row news_article clearfix "]')
36 | # print(div_list)
37 | for div in div_list:
38 | item = WangyiItem()
39 | title = div.xpath('./div/div/h3/a/text()').extract_first()
40 | item['title'] = title
41 | item['number'] = self.number
42 | self.number += 1
43 | content_url = div.xpath('./div/div/h3/a/@href').extract_first()
44 |
45 | yield scrapy.Request(url=content_url, callback=self.content_parse, meta={'item': item})
46 |
47 | # 解析新闻内容
48 | def content_parse(self, response):
49 | item = response.meta['item']
50 | content = response.xpath('//*[@id="content"]/div[2]//text()').extract()
51 | content = ''.join(content)
52 | item['content'] = content
53 | # print(item)
54 | yield item
55 |
56 | # 解析五大板块的详情页url
57 | def parse(self, response):
58 | li_list = response.xpath('//*[@id="index2016_wrap"]/div[3]/div[2]/div[2]/div[2]/div/ul/li')
59 | alist = [1, 2, 4, 5] # 存储各个领域的li标签编号
60 |
61 | for index in alist:
62 | model_url = li_list[index].xpath('./a/@href').extract_first()
63 | # print(model_url)
64 | self.models_url.append(model_url)
65 |
66 | # 依次对每个板块进行发起请求
67 | for url in self.models_url:
68 | yield scrapy.Request(url=url, callback=self.detail_parse)
69 |
--------------------------------------------------------------------------------
/基础篇/scrapy/xiaohua/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/xiaohua/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/xiaohua/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = xiaohua.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = xiaohua
12 |
--------------------------------------------------------------------------------
/基础篇/scrapy/xiaohua/xiaohua/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/xiaohua/xiaohua/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/xiaohua/xiaohua/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class XiaohuaItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | author = scrapy.Field()
13 | content = scrapy.Field()
14 |
--------------------------------------------------------------------------------
/基础篇/scrapy/xiaohua/xiaohua/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 | import pymysql
10 |
11 |
12 | class XiaohuaPipeline:
13 | fp = None
14 |
15 | # 重写父类
16 | def open_spider(self, spider):
17 | print('开始爬虫。。。')
18 | self.fp = open('./xiaohua.txt', 'w', encoding='utf-8')
19 |
20 | def process_item(self, item, spider):
21 | author = item['author']
22 | content = item['content']
23 |
24 | # 持久化存储
25 | self.fp.write(author + '-->' + '\n' + content + '\n')
26 |
27 | return item
28 |
29 | # 重写父类
30 | def close_spider(self, spider):
31 | print('结束爬虫!')
32 | self.fp.close()
33 |
34 |
35 | class mysqlPipeLine(object):
36 | # 数据库连接
37 | conn = None
38 | cursor = None
39 |
40 | def open_spider(self, spider):
41 |         self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='xiaohua', charset='utf8')
42 |
43 | def process_item(self, item, spider):
44 | self.cursor = self.conn.cursor()
45 |
46 | try:
47 |             self.cursor.execute('insert into xiaohua.xiaohua values(%s, %s)', (item["author"], item["content"]))
48 | self.conn.commit()
49 | except Exception as e:
50 | print(e)
51 | self.conn.rollback()
52 |
53 | return item
54 |
55 | def close_spider(self, spider):
56 | self.cursor.close()
57 | self.conn.close()
--------------------------------------------------------------------------------
/基础篇/scrapy/xiaohua/xiaohua/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for xiaohua project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'xiaohua'
11 |
12 | SPIDER_MODULES = ['xiaohua.spiders']
13 | NEWSPIDER_MODULE = 'xiaohua.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = False
21 |
22 | LOG_LEVEL = 'ERROR'
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'xiaohua.middlewares.XiaohuaSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'xiaohua.middlewares.XiaohuaDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'xiaohua.pipelines.XiaohuaPipeline': 300,
69 | 'xiaohua.pipelines.mysqlPipeLine': 301,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
--------------------------------------------------------------------------------
/基础篇/scrapy/xiaohua/xiaohua/spiders/Xiaohua.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from ..items import XiaohuaItem
3 |
4 |
5 | class XiaohuaSpider(scrapy.Spider):
6 | name = 'Xiaohua'
7 | # allowed_domains = ['www.xiaohua.com']
8 | start_urls = ['https://www.xiaohua.com/duanzi/']
9 |
10 | # 生成一个通用的url模板
11 | url = 'https://www.xiaohua.com/duanzi?page=%d'
12 | page_num = 2
13 |
14 | def parse(self, response):
15 | div_list = response.xpath('/html/body/div/div[8]/div[2]/div[2]/div[@class="one-cont"]')
16 | all_data = []
17 | for div in div_list:
18 | author = div.xpath('./div/div/a/i/text()')[0].extract()
19 | content = div.xpath('./p/a//text()').extract()
20 | # 将列表转化为字符串
21 | content = ''.join(content)
22 | item = XiaohuaItem()
23 | item['author'] = author
24 | item['content'] = content
25 | # 将item提交给管道
26 | yield item
27 |
28 | if self.page_num <= 3:
29 | new_url = format(self.url % self.page_num)
30 | self.page_num += 1
31 | # 手动请求发送;callback回调函数是专门用作数据解析
32 | yield scrapy.Request(url=new_url, callback=self.parse)
33 |
--------------------------------------------------------------------------------
/基础篇/scrapy/xiaohua/xiaohua/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/yiche/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = yiche.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = yiche
12 |
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/test.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 |
4 | fp = open('1.txt', 'r', encoding='utf-8').read()
5 | res = json.loads(fp)
6 | k = 0
7 | car = [] # 存储所有车辆的所有信息
8 | name_list = [] # 存储参数的名称
9 |
10 | while k < len(res['data']):
11 | if k >= 3: # 控制获取信息到目录的第几级
12 | break
13 | else:
14 | item_list = res["data"][k]["items"]
15 |
16 | value_list = []
17 | car_list = []
18 |
19 | for item in item_list:
20 | # 车辆颜色需要专门写
21 | if item['id'] == -30 or item['id'] == -31:
22 | break
23 | else:
24 | name_list.append(item['name'])
25 | value_list.append(item['paramValues'])
26 |
27 | for value in value_list:
28 | i = 0
29 | while i < len(value):
30 | va = value[i]['value']
31 | if va == '-':
32 | va = value[i]['subList'][0]['value']
33 | car_list.append(va)
34 | i += 1
35 | car.append(car_list)
36 | car_list = []
37 | k += 1
38 |
39 | # 规范汽车参数格式
40 | forN = len(car) # 参数的个数
41 | carN = len(car[1]) # 车辆的个数
42 | car = sum(car, []) # 整合汽车信息
43 | time = 0 # 循环次数
44 | name0 = []
45 | a = []
46 | b = []
47 |
48 |
49 | while time < carN:
50 | x = time
51 | k = 0
52 | for i in range(forN):
53 | if k == 0:
54 | name0.append(car[x])
55 | else:
56 | c = name_list[k] + ':' + car[x]
57 | a.append(c)
58 | x += carN
59 | k += 1
60 | time += 1
61 | b.append(a)
62 | a = []
63 | s = []
64 |
65 | for k in b:
66 | k = ' '.join(k)
67 | s.append(k)
68 |
69 | print(s, name0)
70 |
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/yiche/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/基础篇/scrapy/yiche/yiche/__init__.py
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/yiche/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class YicheItem(scrapy.Item):
10 | # define the fields for your item here like:
11 | # name = scrapy.Field()
12 | brand = scrapy.Field()
13 | car_name = scrapy.Field()
14 | car_num = scrapy.Field()
15 | car_detail = scrapy.Field()
16 | car_name1 = scrapy.Field()
17 |
18 |
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/yiche/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 |
6 | from scrapy import signals
7 |
8 | # useful for handling different item types with a single interface
9 | from itemadapter import is_item, ItemAdapter
10 |
11 | # class jsDownloaderMiddleware(object):
12 | #     def process_request(self, request, spider):
13 |
14 |
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/yiche/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | import time
9 |
10 | from itemadapter import ItemAdapter
11 |
12 |
13 | class YichePipeline:
14 | def process_item(self, item, spider):
15 | print(item)
16 | return item
17 |
18 |
19 | import pymysql
20 |
21 |
22 | class mysqlPipeLine(object):
23 | # 数据库连接
24 | conn = None
25 | cursor = None
26 |
27 | def open_spider(self, spider):
28 | self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='', db='Spider',
29 | charset='utf8')
30 | print('开始插入数据')
31 |
32 | def process_item(self, item, spider):
33 | self.cursor = self.conn.cursor()
34 |
35 | try:
36 | i = 0
37 | if item['car_detail'] == '参数配置暂未公开' or item['car_detail'] == '暂无在售车辆':
38 |                 self.cursor.execute('insert into cars values(%s, %s, %s, %s, %s)', (
39 |                     item["brand"], item['car_name'], item["car_name"] + item['car_name1'], item["car_num"], item['car_detail']))
40 | self.conn.commit()
41 | print(item['car_name'])
42 | else:
43 | for k in item['car_detail']:
44 | v = item['car_name1'][i]
45 | i += 1
46 |                     self.cursor.execute('insert into cars values(%s, %s, %s, %s, %s)', (
47 |                         item["brand"], item['car_name'], item["car_name"] + ' ' + v, item["car_num"], k))
48 | self.conn.commit()
49 | print(item['car_name'])
50 | except Exception as e:
51 | # print(item)
52 | print(e)
53 | self.conn.rollback()
54 |
55 | return item
56 |
57 | def close_spider(self, spider):
58 | print('结束插入数据')
59 | if self.cursor:
60 | self.cursor.close()
61 | if self.conn:
62 | self.conn.close()
63 |
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/yiche/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for yiche project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'yiche'
11 |
12 | SPIDER_MODULES = ['yiche.spiders']
13 | NEWSPIDER_MODULE = 'yiche.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | #USER_AGENT = 'yiche (+http://www.yourdomain.com)'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = False
21 | LOG_LEVEL = 'ERROR'
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | #CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | #DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | #CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | #COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | #TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | #DEFAULT_REQUEST_HEADERS = {
42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | # 'Accept-Language': 'en',
44 | #}
45 |
46 | # Enable or disable spider middlewares
47 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
48 | #SPIDER_MIDDLEWARES = {
49 | # 'yiche.middlewares.YicheSpiderMiddleware': 543,
50 | #}
51 |
52 | # Enable or disable downloader middlewares
53 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
54 | # DOWNLOADER_MIDDLEWARES = {
55 | # 'yiche.middlewares.FakeUADownloaderMiddleware': 543,
56 | # }
57 |
58 | # Enable or disable extensions
59 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
60 | #EXTENSIONS = {
61 | # 'scrapy.extensions.telnet.TelnetConsole': None,
62 | #}
63 |
64 | # Configure item pipelines
65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
66 | ITEM_PIPELINES = {
67 | 'yiche.pipelines.mysqlPipeLine': 300,
68 | # 'yiche.pipelines.YichePipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/yiche/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/基础篇/scrapy/yiche/yiche/test.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | def detail(fp):
5 | res = json.loads(fp)
6 | k = 0
7 | car = [] # 存储所有车辆的所有信息
8 | name_list = [] # 存储参数的名称
9 |
10 | while k < len(res['data']):
11 | if k >= 3: # 控制获取信息到目录的第几级
12 | break
13 | else:
14 | item_list = res["data"][k]["items"]
15 |
16 | value_list = []
17 | car_list = []
18 |
19 | for item in item_list:
20 | # 车辆颜色需要专门写
21 | if item['id'] == -30 or item['id'] == -31:
22 | break
23 | else:
24 | name_list.append(item['name'])
25 | value_list.append(item['paramValues'])
26 |
27 | for value in value_list:
28 | i = 0
29 | while i < len(value):
30 | va = value[i]['value']
31 | if va == '-':
32 | va = value[i]['subList'][0]['value']
33 | car_list.append(va)
34 | i += 1
35 | car.append(car_list)
36 | car_list = []
37 | k += 1
38 |
39 | # 规范汽车参数格式
40 | forN = len(car) # 参数的个数
41 | carN = len(car[1]) # 车辆的个数
42 | car = sum(car, []) # 整合汽车信息
43 | time = 0 # 循环次数
44 | a = []
45 | b = []
46 | name0 = []
47 |
48 | while time < carN:
49 | x = time
50 | k = 0
51 | for i in range(forN):
52 | if k == 0:
53 | name0.append(car[x])
54 | else:
55 | c = name_list[k] + ':' + car[x]
56 | a.append(c)
57 | x += carN
58 | k += 1
59 | time += 1
60 | b.append(a)
61 | a = []
62 |
63 | s = []
64 |
65 | for k in b:
66 | k = ' '.join(k)
67 | s.append(k)
68 | sk = {
69 | 'detail': s,
70 | 'name': name0
71 | }
72 | return sk
73 |
74 |
--------------------------------------------------------------------------------
/基础篇/高性能异步爬虫/flask_server.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 | import time
3 |
4 | app = Flask(__name__)
5 |
6 |
7 | @app.route('/dxs')
8 | def index_dxs():
9 | time.sleep(2)
10 | return 'Hello dxs!'
11 |
12 |
13 | @app.route('/dxy')
14 | def index_dxy():
15 | time.sleep(2)
16 | return 'Hello dxy!'
17 |
18 |
19 | @app.route('/date')
20 | def index_date():
21 | time.sleep(2)
22 | return 'dxs date dxy!'
23 |
24 |
25 | if __name__ == '__main__':
26 | app.run(threaded=True)
27 |
--------------------------------------------------------------------------------
/基础篇/高性能异步爬虫/meinv.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import aiofile
3 | import requests
4 | from lxml import html
5 | import os
6 | import aiohttp
7 |
8 | etree = html.etree
9 | cookies = {
10 | 'Hm_lvt_c8263f264e5db13b29b03baeb1840f60': '1676030483',
11 | 'Hm_lpvt_c8263f264e5db13b29b03baeb1840f60': '1676030939',
12 | }
13 | headers = {
14 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
15 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
16 | 'Cache-Control': 'max-age=0',
17 | 'Connection': 'keep-alive',
18 | # 'Cookie': 'Hm_lvt_c8263f264e5db13b29b03baeb1840f60=1676030483; Hm_lpvt_c8263f264e5db13b29b03baeb1840f60=1676030939',
19 | 'Referer': 'https://www.3gbizhi.com/tag/meinv/2.html',
20 | 'Sec-Fetch-Dest': 'document',
21 | 'Sec-Fetch-Mode': 'navigate',
22 | 'Sec-Fetch-Site': 'same-origin',
23 | 'Sec-Fetch-User': '?1',
24 | 'Upgrade-Insecure-Requests': '1',
25 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
26 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
27 | 'sec-ch-ua-mobile': '?0',
28 | 'sec-ch-ua-platform': '"Windows"',
29 | }
30 |
31 |
32 | def getUrl(page):
33 | all = []
34 | response = requests.get(f'https://desk.3gbizhi.com/deskMV/index_{page}.html', cookies=cookies, headers=headers)
35 | tree = etree.HTML(response.text)
36 | li_list = tree.xpath('/html/body/div[5]/ul/li')
37 | for li in li_list:
38 | photo = {
39 | '标题': li.xpath('./a/img/@title')[0],
40 | 'url': li.xpath('./a/@href')[0]
41 | }
42 | all.append(photo)
43 | return all
44 |
45 |
46 | def getpic(data):
47 |     response = requests.get(data['url'], headers=headers, cookies=cookies).text
48 | tree = etree.HTML(response)
49 | url = tree.xpath('//*[@id="showimg"]/a[4]/img/@src')[0]
50 | return url
51 |
52 |
53 | async def thread(url, name):
54 | async with aiohttp.ClientSession() as session:
55 | async with session.get(url, ssl=False, headers=headers, cookies=cookies) as resp:
56 | datas = await resp.read()
57 | async with aiofile.async_open(f'./picLibs/{name}.jpg', 'wb') as fp:
58 | await fp.write(datas)
59 | print(name + '爬取成功!')
60 |
61 |
62 | if __name__ == '__main__':
63 | if not os.path.exists('./picLibs'):
64 | os.mkdir('./picLibs')
65 | loop = asyncio.get_event_loop()
66 | for page in range(1, 24):
67 | print(page)
68 | all = getUrl(page)
69 | URL = []
70 | for data in all:
71 | url = getpic(data)
72 | name = data['标题']
73 | URL.append(thread(url, name))
74 | loop.run_until_complete(asyncio.wait(URL))
75 | loop.close()
76 |
77 |
--------------------------------------------------------------------------------
/基础篇/高性能异步爬虫/minxing.py:
--------------------------------------------------------------------------------
1 | import os
2 | import requests
3 | from lxml import html
4 |
5 | headers = {
6 | 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
7 | 'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
8 | 'cookie': '__yjs_duid=1_f064d94f3576b1069275a2e233974a2c1676030524524; PHPSESSID=1asobv9sgpl0sb0ian1dm9jcc7; sYQDUGqqzHsearch_history=%u7F8E%u5973',
9 | 'referer': 'https://www.syt5.com/mingxing/mnmx',
10 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
11 | 'sec-ch-ua-mobile': '?0',
12 | 'sec-ch-ua-platform': '"Windows"',
13 | 'sec-fetch-dest': 'document',
14 | 'sec-fetch-mode': 'navigate',
15 | 'sec-fetch-site': 'same-origin',
16 | 'sec-fetch-user': '?1',
17 | 'upgrade-insecure-requests': '1',
18 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
19 | }
20 | etree = html.etree
21 | url = 'https://www.syt5.com/mingxing/mnmx/index_%d.html'
22 |
23 |
24 | def rebuilt_Language(url, headers):
25 | response = requests.get(url=url, headers=headers)
26 | response.encoding = response.apparent_encoding
27 | return response
28 |
29 |
30 | def getDetailInfo(url):
31 | all = []
32 | for page in range(2, 20):
33 | new_url = format(url % page)
34 | resp = rebuilt_Language(new_url, headers)
35 | tree = etree.HTML(resp.text)
36 | div_list = tree.xpath('//*[@id="body"]/main/div[4]/div/div')
37 | for div in div_list:
38 | info = {
39 | '标题': div.xpath('./div[1]/a/@title')[0],
40 | '链接': div.xpath('./div[1]/a/@href')[0]
41 | }
42 | all.append(info)
43 | return all
44 |
45 |
46 | def getPhotoUrl(data):
47 | resp = rebuilt_Language(data['链接'], headers)
48 | tree = etree.HTML(resp.text)
49 | li_list = tree.xpath('//*[@id="showimages"]/div[3]/div[2]/div[2]/ul/li')
50 | url = []
51 | for li in li_list:
52 | s = li.xpath('./a/@href')[0]
53 | url.append(s)
54 | if not url:
55 | li_list = tree.xpath('//*[@id="showimages"]/div[3]/div[3]/div[2]/ul/li')
56 | for li in li_list:
57 | s = li.xpath('./a/@href')[0]
58 | url.append(s)
59 | info = {
60 | '标题': data['标题'],
61 | 'urls': url
62 | }
63 | return info
64 |
65 |
66 | def download(Name, url):
67 | resp = rebuilt_Language(url, headers)
68 | tree = etree.HTML(resp.text)
69 | src = tree.xpath('//*[@id="showpicsouutuIs2020"]/@src')[0]
70 | name = src.split('/')[-1]
71 |     data = requests.get(src, headers=headers).content
72 |     with open(f'./{Name}/{name}', 'wb') as fp:
73 | fp.write(data)
74 | print('over!')
75 |
76 |
77 | if __name__ == '__main__':
78 | total = getDetailInfo(url)
79 | for data in total:
80 | Info = getPhotoUrl(data)
81 | # print('正在采集'+ Info["标题"])
82 | # if not os.path.exists(f'./Piclib/{Info["标题"]}'):
83 | # os.mkdir(f'./Piclib/{Info["标题"]}')
84 | # for i in range(len(Info['urls'])):
85 | # download(Info['标题'],Info['urls'][i])
86 | print(Info)
87 |
--------------------------------------------------------------------------------
/基础篇/高性能异步爬虫/协程.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 |
3 |
4 | async def request(url):
5 | print('正在请求', url)
6 | print(url, '请求成功!')
7 | return url
8 |
9 | # async修饰的函数,调用成功后返回的一个协程对象
10 | c = request('www.baidu.com')
11 |
12 | # # 创建一个协程对象
13 | # loop = asyncio.get_event_loop()
14 | # # 将协程对象注册到loop之中,然后启动loop
15 | # loop.run_until_complete(c)
16 |
17 | # # task的使用
18 | # loop = asyncio.get_event_loop()
19 | # # 基于loop创建了一个task对象
20 | # task = loop.create_task(c)
21 | # print(task)
22 | # loop.run_until_complete(task)
23 | # print(task)
24 |
25 | # # future的使用
26 | # loop = asyncio.get_event_loop()
27 | # task = asyncio.ensure_future(c)
28 | # print(task)
29 | # loop.run_until_complete(task)
30 | # print(task)
31 |
32 |
33 | def callback_func(task):
34 | print(task.result())
35 |
36 |
37 | # 绑定回调
38 | loop = asyncio.get_event_loop()
39 | task = asyncio.ensure_future(c)
40 | # 将回调函数绑定到任务对象中
41 | task.add_done_callback(callback_func)
42 | loop.run_until_complete(task)
43 |
44 |
--------------------------------------------------------------------------------
/基础篇/高性能异步爬虫/多任务协程01.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import time
3 |
4 |
5 | async def request(url):
6 | print('正在下载', url)
7 | # 在异步协程中,如果出现了同步模块相关的代码,就无法实现异步
8 | # time.sleep(2)
9 | # 当在asyncio中遇到阻塞操作必须进行手动挂起
10 | await asyncio.sleep(2)
11 | print('下载完成', url)
12 |
13 |
14 | urls = [
15 | 'www.baidu.com',
16 | 'www.douban.com',
17 | 'www.shu.edu.cn'
18 | ]
19 | # 任务列表:存放多个任务对象
20 | tasks = []
21 | for url in urls:
22 | c = request(url)
23 | task = asyncio.ensure_future(c)
24 | tasks.append(task)
25 |
26 | start = time.time()
27 |
28 | loop = asyncio.get_event_loop()
29 | # 需要将任务列表封装到wait中
30 | loop.run_until_complete(asyncio.wait(tasks))
31 |
32 | end = time.time()
33 | print(end - start)
34 |
--------------------------------------------------------------------------------
/基础篇/高性能异步爬虫/多任务异步协程02.py:
--------------------------------------------------------------------------------
1 | # 使用aiohttp中的ClientSession
2 | import requests
3 | import asyncio
4 | import time
5 | import aiohttp
6 |
7 | urls = [
8 | 'http://127.0.0.1:5000/dxs',
9 | 'http://127.0.0.1:5000/dxy',
10 | 'http://127.0.0.1:5000/date'
11 | ]
12 |
13 |
14 | async def get_page(url):
15 | print('正在下载', url)
16 | # requests发起的请求时基于同步的,必须使用基于异步的网络请求模块进行指定url的请求发送
17 | # aiohttp:基于异步的网络请求
18 | # response = requests.get(url=url).text
19 | async with aiohttp.ClientSession() as session:
20 | # headers,params/data,proxy='http://ip:port'
21 | async with await session.get(url=url) as response:
22 | # text()返回字符串型的响应对象
23 | # read()返回的二进制响应对象
24 | # json()返回的json对象
25 | # 注意在获取响应数据操作之前一定要使用await进行手动挂起
26 | page_text = await response.text()
27 | print('下载完成', url)
28 | return page_text
29 |
30 |
31 | def callback(task):
32 | print(task.result())
33 |
34 |
35 | tasks = []
36 |
37 | for url in urls:
38 | c = get_page(url)
39 | task = asyncio.ensure_future(c)
40 | task.add_done_callback(callback)
41 | tasks.append(task)
42 |
43 | start = time.time()
44 | loop = asyncio.get_event_loop()
45 | loop.run_until_complete(asyncio.wait(tasks))
46 | end = time.time()
47 |
48 | print('总耗时', end - start)
49 |
--------------------------------------------------------------------------------
/基础篇/高性能异步爬虫/线程池的基本使用.py:
--------------------------------------------------------------------------------
1 | import time
2 | from multiprocessing.dummy import Pool # 导入线程池模块对应的类
3 |
4 | # # 使用单线程串行方式执行
5 | # def get_page(str):
6 | # print('正在下载: ', str)
7 | # time.sleep(2) # 模拟阻塞操作
8 | # print('下载成功: ', str)
9 | #
10 | #
11 | # name_list = ['aa', 'bb', 'cc', 'dd']
12 | # start_time = time.time()
13 | # for i in range(len(name_list)):
14 | # get_page(name_list[i])
15 | # end_time = time.time()
16 | # print('%d second' % (end_time - start_time))
17 |
18 |
19 | # 使用线程池的方式执行
20 | start_time = time.time()
21 |
22 |
23 | def get_page(str):
24 | print('正在下载:', str)
25 | time.sleep(2) # 模拟阻塞操作
26 | print('下载成功:', str)
27 |
28 |
29 | name_list = ['aa', 'bb', 'cc', 'dd']
30 | # 实例化一个线程池对象
31 | pool = Pool(4)
32 | # 将列表中的每一个元素传递给get_page处理,返回值就是get_page的返回值
33 | pool.map(get_page, name_list)
34 | end_time = time.time()
35 |
36 | pool.close()
37 | pool.join()
38 | print(end_time - start_time, 'second')
39 |
--------------------------------------------------------------------------------
/基础篇/高性能异步爬虫/线程池的应用.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 | from multiprocessing.dummy import Pool
4 | import time
5 | import os
6 |
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
9 | }
10 | # 原则: 线程池处理的是阻塞且耗时的操作
11 | url = 'https://www.pearvideo.com/category_1'
12 |
13 | page_text = requests.get(url=url, headers=headers).text
14 | tree = etree.HTML(page_text)
15 | video_src_list = tree.xpath('//*[@id="listvideoListUl"]/li')
16 |
17 | for li in video_src_list:
18 | video_src = 'https://www.pearvideo.com/' + li.xpath('./div[1]/a/@href')[0]
19 | # print(video_src)
20 | video_name = li.xpath('./div[1]/a/div[2]/text()')[0]
21 | # print(video_name)
22 |
23 |
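24 | # 示意性的收尾:按照上面的原则,线程池只负责“下载并写盘”这一步阻塞且耗时的操作。
25 | # 注意:上面解析到的 video_src 只是详情页链接,梨视频真实的 mp4 地址还需要再请求其接口并处理防盗链;
26 | # 这里假设 urls 已经是 {'name': 视频名称, 'url': 真实mp4地址} 形式的字典列表,仅作示意。
27 | urls = []
28 | 
29 | 
30 | def get_video_data(d):
31 |     # 下载视频的二进制数据并写入磁盘
32 |     data = requests.get(url=d['url'], headers=headers).content
33 |     with open('./' + d['name'] + '.mp4', 'wb') as fp:
34 |         fp.write(data)
35 |     print(d['name'], '下载完成')
36 | 
37 | 
38 | pool = Pool(4)
39 | pool.map(get_video_data, urls)
40 | pool.close()
41 | pool.join()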
--------------------------------------------------------------------------------
/自动化篇/playwright/使用本地浏览器创建debug模式/README.md:
--------------------------------------------------------------------------------
1 | ## Notes
2 | 
3 | With this approach, playwright talks to a locally installed browser over a ws (CDP) connection
4 | 
5 | It can get past most browser-detection checks, because this really is a genuine browser
6 | 
7 | There are two ways to use it:
8 | 
9 | 1. Open the browser manually before each run of the program
10 | 
11 | > 1. Find the Chrome shortcut on your desktop
12 | > 2. Open its Properties
13 | > 3. Append --remote-debugging-port=9999 to the end of the Target field (the port can be customised)
14 | > 4. ```
15 | > with sync_playwright() as p:
16 | > # 创建一个连接
17 | > browser = p.chromium.connect_over_cdp("http://localhost:9999")
18 | > content = browser.contexts[0]
19 | > page = content.new_page()
20 | > ```
21 | > 5. Drive the browser through the `page` object above
22 | 
23 | 2. Don't open the browser manually; have the program launch it itself
24 | > Just add the following code to your program
25 | >```
26 | >import subprocess
27 | ># 这个路径可以是Google浏览器的exe路径,也可以是快捷方式的路径
28 | >chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
29 | >debugging_port = "--remote-debugging-port=9999"
30 | >
31 | >command = f"{chrome_path} {debugging_port}"
32 | >subprocess.Popen(command, shell=True)
33 | >```
34 | >and then:
35 | > ```
36 | > with sync_playwright() as p:
37 | > # 创建一个连接
38 | > browser = p.chromium.connect_over_cdp("http://localhost:9999")
39 | > content = browser.contexts[0]
40 | > page = content.new_page()
41 | > ```
42 | > Drive the browser through the `page` object above
43 | >
44 | > __Note__:
45 | > This method cannot be used while a regular browser instance (i.e. one not launched as in method 1) is already open
--------------------------------------------------------------------------------
/自动化篇/playwright/使用本地浏览器创建debug模式/auto.py:
--------------------------------------------------------------------------------
1 | from playwright.sync_api import sync_playwright
2 |
3 | import subprocess
4 |
5 | # 这个路径可以是Google浏览器的exe路径,也可以是快捷方式的路径
6 | chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
7 | debugging_port = "--remote-debugging-port=9999"
8 |
9 | command = f"{chrome_path} {debugging_port}"
10 | subprocess.Popen(command, shell=True)
11 |
12 |
13 | # 拦截请求
14 | def intercept_xhr(route, request):
15 | route.continue_()
16 | response = route.fetch()
17 | json = response.json()
18 | print(json)
19 |
20 |
21 | with sync_playwright() as p:
22 | # 创建一个连接
23 | browser = p.chromium.connect_over_cdp("http://localhost:9999")
24 | content = browser.contexts[0]
25 | page = content.new_page()
26 |
27 | # 设置拦截规则
28 | page.route("**/api/sns/web/v1/homefeed", lambda route, request: intercept_xhr(route, request))
29 | page.goto('https://www.xiaohongshu.com/')
30 | page.wait_for_selector('.feeds-container')
31 |
32 | # 获取页面内容高度
33 | page_height = page.evaluate('() => document.body.scrollHeight')
34 |
35 | # 模拟鼠标滚动操作,向下滚动到底部
36 | while page.evaluate('() => window.scrollY + window.innerHeight') < page_height:
37 | page.mouse.wheel(0, 100) # 这里的参数可以根据需要进行调整
38 |
39 | page.wait_for_timeout(5000)
40 |
--------------------------------------------------------------------------------
/自动化篇/playwright/反检测浏览器/README.md:
--------------------------------------------------------------------------------
1 | ## How to get the js file?
2 |
3 | ```bash
4 | npx extract-stealth-evasions
5 | ```
6 | You need a local Node.js environment for the download to work; afterwards the file will show up in the project directory
7 | 
8 | This approach can bypass roughly 90% of browser-detection checks
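9 | 
10 | Once you have stealth.min.js, wiring it into playwright looks roughly like this (a minimal sketch that assumes the file sits in the project root; demo.py below is the complete version):
11 | 
12 | ```python
13 | from playwright.sync_api import sync_playwright
14 | 
15 | with sync_playwright() as p:
16 |     browser = p.chromium.launch(headless=False, channel="chrome")
17 |     context = browser.new_context()
18 |     # inject the stealth script before every page loads, masking common webdriver fingerprints
19 |     context.add_init_script(path='stealth.min.js')
20 |     page = context.new_page()
21 |     page.goto('https://bot.sannysoft.com/')
22 |     page.wait_for_timeout(5000)
23 |     browser.close()
24 | ```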
--------------------------------------------------------------------------------
/自动化篇/playwright/反检测浏览器/demo.py:
--------------------------------------------------------------------------------
1 | from playwright.sync_api import sync_playwright
2 |
3 | # stealth.min.js文件的存放路径
4 | STEALTH_PATH = 'stealth.min.js'
5 |
6 | with sync_playwright() as p:
7 | # 创建一个正常的浏览器窗口
8 | browser = p.chromium.launch(
9 | headless=False,
10 | chromium_sandbox=False,
11 | ignore_default_args=["--enable-automation"],
12 | channel="chrome",
13 | )
14 | ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
15 | content = browser.new_context(user_agent=ua)
16 | # 添加初始化脚本
17 | content.add_init_script(path=STEALTH_PATH)
18 | # 创建页面
19 | page = content.new_page()
20 | page.goto('https://bot.sannysoft.com/')
21 | # 查看效果,和浏览器一致
22 | page.wait_for_timeout(5000)
23 | # 关闭所有
24 | page.close()
25 | content.close()
26 | browser.close()
27 |
--------------------------------------------------------------------------------
/自动化篇/playwright/起点vip/10086.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/自动化篇/playwright/起点vip/10086.png
--------------------------------------------------------------------------------
/自动化篇/playwright/起点vip/README.md:
--------------------------------------------------------------------------------
1 | ### Notes
2 | 
3 | demo is the version I wrote by hand to scroll through and capture everything under a fixed element
4 | 
5 | demo2 uses playwright's built-in API to take the capture (which is a great deal more convenient)
6 | 
7 | When I wrote the first version I didn't know playwright already came with scrolling screenshots, so I hand-rolled the scrolling myself
8 | 
9 | > Note: qidian's navigation bar sticks to the top of the page while scrolling, so we simply remove that DOM element with a bit of js at the very start
10 | >
11 | > If you don't remove it, it will end up covering part of the novel text
12 | >
13 | playwright's built-in APIs really are remarkably powerful
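14 | 
15 | The demo2 approach boils down to roughly these two steps (a short sketch; demo2.py below is the complete script, including cookies and stealth.min.js):
16 | 
17 | ```python
18 | # drop the sticky navbar first so it cannot cover the chapter text in the screenshot
19 | page.evaluate('document.getElementById("navbar").remove();')
20 | # playwright scrolls and stitches the whole element by itself
21 | page.locator(".print").screenshot(path="screenshot.png", animations="disabled")
22 | ```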
--------------------------------------------------------------------------------
/自动化篇/playwright/起点vip/demo.py:
--------------------------------------------------------------------------------
1 | import playwright.sync_api
2 | from PIL import Image
3 |
4 |
5 | def run(syncPlayWright: playwright.sync_api.Playwright, url: str, savePath: str, cookies: list[dict]):
6 | browser = syncPlayWright.chromium.launch(
7 | headless=False,
8 | chromium_sandbox=False,
9 | channel="chrome",
10 | )
11 | ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
12 | content = browser.new_context(user_agent=ua)
13 | content.add_init_script(path=r'D://crawlProjects/stealth.min.js')
14 | content.add_cookies(cookies)
15 | page = content.new_page()
16 |
17 | page.goto(url)
18 | # 获取 main 标签的高度
19 | rectangle = page.wait_for_selector('main')
20 | box = rectangle.bounding_box()
21 | main_height = box['height'] + box['y']
22 | main_left = box['x']
23 | main_offset = box['y']
24 | main_width = box['width']
25 | # 初始化截图列表
26 | screenshots = []
27 | # 逐步滚动并截取屏幕截图
28 | scroll_offset = main_offset
29 | prev = 0
30 | scroll_height = 500
31 | while True:
32 | # 滚动屏幕
33 | page.evaluate(f'window.scrollTo({prev}, {scroll_offset})')
34 | # 截个图
35 | page.wait_for_timeout(100)
36 | screenshots.append(page.screenshot(
37 | clip={"x": main_left, "y": 0, "width": main_width, "height": scroll_height}
38 | ))
39 | # 记录上一次的终点
40 | prev = scroll_offset
41 | # 判断边界
42 | if prev < main_height <= prev + scroll_height:
43 | page.evaluate(f'window.scrollTo(0, {prev})')
44 | page.wait_for_timeout(100)
45 | screenshots.append(page.screenshot(
46 | clip={"x": main_left, "y": 0, "width": main_width, "height": main_height - prev}
47 | ))
48 | break
49 | scroll_offset += scroll_height
50 |
51 | # 将截图拼接在一起
52 | full_screenshot = Image.new('RGB', (round(main_width), round(box['height'])))
53 | y_offset = 0
54 | for index, screenshot in enumerate(screenshots):
55 | with open(savePath, 'wb') as f:
56 | f.write(screenshot)
57 | img = Image.open(savePath)
58 | full_screenshot.paste(img, (0, y_offset))
59 | y_offset += img.height
60 | # 保存完整截图
61 | full_screenshot.save(savePath)
62 | page.close()
63 |
64 |
65 | if __name__ == '__main__':
66 | cookies = []
67 | cookie_string = '_csrfToken=;fu=;_yep_uuid=;ywguid=;ywkey=;ywopenid='
68 | cookie_items = cookie_string.split(';')
69 | for item in cookie_items:
70 | name, value = item.split('=')
71 | cookies.append({'name': name, 'value': value, 'domain': '.qidian.com', 'path': '/'})
72 | with playwright.sync_api.sync_playwright() as p:
73 | run(p, 'https://www.qidian.com/chapter/1036094942/764016875/', '10086.png', cookies)
74 |
--------------------------------------------------------------------------------
/自动化篇/playwright/起点vip/demo2.py:
--------------------------------------------------------------------------------
1 | import playwright.sync_api
2 |
3 |
4 | def run(syncPlayWright: playwright.sync_api.Playwright, url: str, savePath: str, cookies: list[dict]):
5 | run_js = 'document.getElementById("navbar").remove();'
6 | browser = syncPlayWright.chromium.launch(
7 | headless=False,
8 | chromium_sandbox=False,
9 | channel="chrome",
10 | )
11 | ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
12 | content = browser.new_context(user_agent=ua)
13 | content.add_init_script(path=r'D://crawlProjects/stealth.min.js')
14 | content.add_cookies(cookies)
15 | page = content.new_page()
16 | page.goto(url)
17 | page.evaluate(run_js)
18 | page.locator(".print").screenshot(path="screenshot.png", animations='disabled')
19 | page.close()
20 |
21 |
22 | if __name__ == '__main__':
23 | cookies = []
24 | cookie_string = ''
25 | cookie_items = cookie_string.split(';')
26 | for item in cookie_items:
27 | name, value = item.split('=')
28 | cookies.append({'name': name, 'value': value, 'domain': '.qidian.com', 'path': '/'})
29 | with playwright.sync_api.sync_playwright() as p:
30 | run(p, 'https://www.qidian.com/chapter/1035571469/733045990/', '10086.png', cookies)
31 |
--------------------------------------------------------------------------------
/自动化篇/playwright/邮政编码/hello.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from playwright.sync_api import sync_playwright, Error
3 |
4 |
5 | def getCode(addr):
6 | # 用同步的方式打开一个浏览器
7 | with sync_playwright() as p:
8 | try:
9 | # 设置浏览器配置
10 | browser = p.chromium.launch(headless=True)
11 | # 打开一个新窗口
12 | page = browser.new_page()
13 | # 去往这个链接
14 | page.goto('https://www.youbianku.com/%E9%A6%96%E9%A1%B5')
15 | # 等待页面加载完毕
16 | page.wait_for_load_state('load')
17 | # 通过id选中input框
18 | search_input = page.query_selector("#mySearchInput")
19 | # 往input框输入数据
20 | search_input.type(addr)
21 | # 通过id选择按钮
22 | search_button = page.query_selector("#mySearchButton")
23 | # 按钮点击
24 | search_button.click()
25 | # 表格选择器
26 | table_selector = ".zipcode-datas"
27 | # 等待表格渲染完毕
28 | page.wait_for_selector(table_selector, timeout=5000)
29 | # 通过class选择表格
30 | table = page.query_selector(table_selector)
31 | # 根据表格的类选择不同的行
32 | if "top-space" in table.get_attribute("class"):
33 | postal_code_selector = "tr:nth-child(5) td a"
34 | else:
35 | postal_code_selector = "tr:nth-child(3) td a"
36 | # 获取邮政编码所在的行
37 | postal_code_element = table.query_selector(postal_code_selector)
38 | # 获取邮政编码
39 | return postal_code_element.inner_text()
40 | except Error as e:
41 | # 捕获异常,出现地址错误可能表格无法加载
42 | print(e)
43 | return '000000'
44 | finally:
45 | # 关闭浏览器
46 | browser.close()
47 |
48 |
49 | if __name__ == '__main__':
50 | # 定义一个地址存储器,每次先从里面查找,找不到再去请求
51 | storgeCode = {
52 | }
53 |     # 打开文件
54 | fileA = pd.read_excel('./file.xlsx')
55 | # 修改格式
56 | new_header = fileA.iloc[0]
57 | fileA.columns = new_header
58 | # 遍历文件
59 | for index, row in fileA.iterrows():
60 | address = row['通讯地址']
61 | # 判断地址是否为空
62 | if pd.notna(address) and address != '通讯地址':
63 | if storgeCode.get(address, None) is None:
64 | # 找不到,就去请求
65 | code = getCode(address)
66 | storgeCode[address] = code
67 | postal_code = storgeCode[address]
68 | fileA.at[index, '邮政编码'] = postal_code
69 | else:
70 | continue
71 | # 每次修改后打印一下
72 | print(fileA.iloc[index]['姓名'], fileA.iloc[index]['通讯地址'], fileA.iloc[index]['邮政编码'])
73 | # 保存修改
74 | fileA.to_excel('updated_file.xlsx', index=False)
75 |
--------------------------------------------------------------------------------
/自动化篇/selenium/12306模拟登录.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.keys import Keys
3 | from time import sleep
4 | from selenium.webdriver import ActionChains
5 | from selenium.webdriver import ChromeOptions
6 |
7 | # 实现让selenium规避被检测到的风险
8 | option = ChromeOptions()
9 | # excludeSwitches 重复设置会相互覆盖,这里合并成一次传入
10 | option.add_experimental_option('excludeSwitches', ['enable-logging', 'enable-automation'])
11 | option.add_argument("--no-sandbox")
12 | option.add_argument("--disable-dev-shm-usage")
13 | option.add_argument("--window-size=1920,1080") # 建议设置窗口大小
14 | # option.add_argument('--headless')
15 | option.add_argument('--disable-gpu')
16 |
17 | bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
18 |
19 |
20 |
21 |
22 | # 去除特征识别 防止服务器识别到的selenium的特征从而阻止后续的滑动验证
23 | bro.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
24 | "source": """
25 | Object.defineProperty(navigator, 'webdriver', {
26 | get: () => undefined
27 | })
28 | """
29 | })
30 |
31 | bro.get('https://kyfw.12306.cn/otn/resources/login.html')
32 | bro.maximize_window()
33 |
34 | # 标签定位
35 | user = bro.find_element_by_id('J-userName')
36 | pwd = bro.find_element_by_id('J-password')
37 |
38 | # 传入数据
39 | user.send_keys('')
40 | sleep(1)
41 | pwd.send_keys('')
42 | sleep(1)
43 |
44 | # 登录
45 | login = bro.find_element_by_id('J-login')
46 | login.click()
47 | sleep(2)
48 |
49 | slide = bro.find_element('id', 'nc_1_n1z')
50 |
51 | # 验证码
52 | action = ActionChains(bro)
53 | action.click_and_hold(slide)
54 | action.move_by_offset(300, 0).perform()
55 | sleep(2)
56 | # 点击确定
57 | ok = bro.find_element_by_class_name('btn')
58 | ok.click()
59 | sleep(2)
60 |
61 | ticket = bro.find_element_by_id('link_for_ticket')
62 | ticket.click()
63 | sleep(2)
64 |
65 | # 输入查询车站
66 | From = bro.find_element_by_id('fromStationText')
67 | From.click()
68 | From.send_keys('泸州')
69 | From.send_keys(Keys.ENTER)
70 | sleep(0.5)
71 |
72 | To = bro.find_element_by_id('toStationText')
73 | To.click()
74 | To.send_keys('乐山')
75 | To.send_keys(Keys.ENTER)
76 | sleep(0.5)
77 |
78 | # 找到出发站、到达站的隐藏HTML标签
79 | js = "document.getElementById('train_date').removeAttribute('readonly')" # 去除日期栏只读属性
80 | bro.execute_script(js)
81 |
82 | # 选择日期
83 | data = bro.find_element_by_id('train_date')
84 | data.clear()
85 | data.send_keys('2022-12-31')
86 | data.send_keys(Keys.ENTER)
87 | sleep(0.5)
88 |
89 | # 查询
90 | find = bro.find_element_by_id('query_ticket')
91 | find.click()
92 | sleep(2)
93 |
94 | # 关闭浏览器
95 | # sleep(5)
96 | # bro.quit()
97 |
--------------------------------------------------------------------------------
/自动化篇/selenium/Twisted-20.3.0-cp39-cp39-win_amd64.whl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/自动化篇/selenium/Twisted-20.3.0-cp39-cp39-win_amd64.whl
--------------------------------------------------------------------------------
/自动化篇/selenium/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/自动化篇/selenium/chromedriver.exe
--------------------------------------------------------------------------------
/自动化篇/selenium/selenium其他自动化操作.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from selenium import webdriver
4 | from time import sleep
5 |
6 | 
7 | 
8 | if __name__ == '__main__':
9 |     bro = webdriver.Chrome(executable_path='chromedriver.exe')
10 | bro.get("https://useragentstring.com/pages/useragentstring.php?name=Chrome")
11 |
12 | bro.quit()
13 |
14 | #
15 | # # 标签定位
16 | # search_input = bro.find_element_by_id('q')
17 | # # 标签交互
18 | # search_input.send_keys('iphone')
19 | # sleep(2)
20 | # # 执行一组js程序
21 | #
22 | # bro.execute_script('window.scrollTo(0,document.body.scrollHeight)')
23 | # sleep(2)
24 | #
25 | # # 点击搜索按钮
26 | # search = bro.find_element_by_class_name('btn-search')
27 | # search.click()
28 | #
29 | # # 切换页面
30 | # bro.get('https://www.baidu.com')
31 | # sleep(2)
32 | # # 回退
33 | # bro.back()
34 | # sleep(2)
35 | # # 前进
36 | # bro.forward()
37 | #
38 | # # 关闭浏览器
39 | # sleep(5)
40 | # bro.quit()
41 |
--------------------------------------------------------------------------------
/自动化篇/selenium/selenium模拟登录.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from time import sleep
3 | from selenium.webdriver import ActionChains
4 |
5 | bro = webdriver.Chrome(executable_path='chromedriver.exe')
6 |
7 | bro.get('https://qzone.qq.com')
8 |
9 | # 切换作用域
10 | bro.switch_to.frame('login_frame')
11 | # 标签定位与点击
12 | pwdLogin = bro.find_element_by_id('switcher_plogin')
13 | pwdLogin.click()
14 |
15 | # 输入账号密码
16 | zhanghao = bro.find_element_by_id('u')
17 | zhanghao.send_keys('')
18 | pwd = bro.find_element_by_id('p')
19 | pwd.send_keys('')
20 |
21 | login = bro.find_element_by_id('login_button')
22 | login.click()
23 |
24 | sleep(5)
25 | bro.quit()
26 |
--------------------------------------------------------------------------------
/自动化篇/selenium/动作链和iframe的处理.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from time import sleep
3 | from selenium.webdriver import ActionChains
4 |
5 | # 创建对象
6 | bro = webdriver.Chrome('chromedriver.exe')
7 | # 指定url
8 | bro.get('https://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
9 |
10 | # 想要定位的标签是存在于iframe之中,则必须通过如下操作再进行标签定位
11 | # div = bro.find_element_by_id('draggable') 错误的方法定位
12 | bro.switch_to.frame('iframeResult') # 切换到我们想要指定的iframe中
13 | div = bro.find_element_by_id('draggable')
14 |
15 | # 动作链
16 | action = ActionChains(bro)
17 | # 点击长按指定的标签
18 | action.click_and_hold(div)
19 | for i in range(5):
20 | # perform表示立即执行动作链操作
21 | action.move_by_offset(17, 0).perform()
22 | sleep(0.3)
23 | # 释放动作链
24 | action.release()
25 |
26 | # 退出浏览器
27 | sleep(5)
28 | bro.quit()
29 |
--------------------------------------------------------------------------------
/自动化篇/selenium/谷歌无头浏览器+反检测.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver import ChromeOptions
3 | from time import sleep
4 |
5 | # 实现让selenium规避被检测到的风险
6 | option = ChromeOptions()
7 | option.add_experimental_option('excludeSwitches', ['enable-automation'])
8 | # 实现无可视化界面的操作
9 | option.add_argument('--headless')
10 | option.add_argument('--disable-gpu')
11 |
12 | bro = webdriver.Chrome(executable_path='chromedriver.exe', options=option)
13 |
14 | # 无可视化界面(无头浏览器) phantomJs
15 | bro.get('https://www.baidu.com')
16 | print(bro.page_source)
17 |
18 | # 关闭浏览器
19 | sleep(5)
20 | bro.quit()
21 |
22 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/wasm/air/README.md:
--------------------------------------------------------------------------------
1 | # 某东航空
2 |
3 | 需要自己将滑块的html文档下载到本地,并在 acw_tc_3.py 中指定该网页的路径,要使用绝对路径
4 | 
5 | 之后按照 acw_tc_3.py 里的注释完成文件即可
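6 | 
7 | 下面给出一个配置示例(其中的路径和 URL 均为假设的演示值,对应 acw_tc_3.py 顶部的 FILEPATH、INTERRUPT_ROUTE、WS_URL 三个常量,请按自己的环境替换):
8 | 
9 | ```python
10 | # 以下均为假设的示例值,需替换为自己的实际路径和地址
11 | FILEPATH = 'file:///D:/crawlProjects/air/slider.html'  # 本地滑块页面的绝对路径(playwright 的 goto 一般需要带 file:// 协议)
12 | INTERRUPT_ROUTE = '**/checkcaptcha*'                    # 需要拦截的验证请求,按 playwright 的通配符格式填写
13 | WS_URL = 'http://localhost:9999'                        # debug 模式浏览器的地址,端口与文件中注释的 9999 对应
14 | ```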
--------------------------------------------------------------------------------
/进阶篇/js逆向/wasm/air/acw_tc_3.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import re
4 | from urllib.parse import urlparse, parse_qs
5 |
6 | import playwright.sync_api
7 | import requests
8 | from playwright.sync_api import sync_playwright
9 |
10 | # 存放滑块的页面
11 | FILEPATH = ''
12 |
13 | # 拦截验证的路由,自己写一下url, 格式参照playwright官网
14 | INTERRUPT_ROUTE = ''
15 |
16 | # 指定谷歌浏览器路径,以debug模式打开,如果已经打开了debug,下面四行代码可以注释掉
17 | # chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
18 | # debugging_port = "--remote-debugging-port=9999"
19 | #
20 | # command = f"{chrome_path} {debugging_port}"
21 | # subprocess.Popen(command, shell=True)
22 |
23 | # 创建的ws链接
24 | WS_URL = 'http://localhost:your_port'
25 |
26 | headers = {
27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
28 | }
29 |
30 |
31 | def replace_info(html: str):
32 | # 识别出requestInfo
33 | pattern = re.compile(r'requestInfo\s*=\s*\{.*?};', re.S)
34 | # 读取旧文件
35 | with open(FILEPATH, 'r', encoding='utf-8') as f:
36 | old_html = f.read()
37 | # 从新html中查找info, 如果有就做替换,没有就保留
38 |     matches = pattern.findall(html)
39 |     if matches:
40 |         new_html = pattern.sub(lambda _: matches[0], old_html)
41 | with open(FILEPATH, 'w', encoding='utf-8') as f:
42 | f.write(new_html)
43 |
44 | def get_226() -> dict:
45 | pattern = re.compile(r'\((.*)\)', re.S)
46 | result: dict = {}
47 |
48 | def intercept_xhr(route: playwright.sync_api.Route):
49 | params = parse_qs(urlparse(route.request.url).query)
50 | result['t'] = params['t'][0]
51 | resp = requests.get(url=route.request.url, headers=headers)
52 | data = json.loads(pattern.findall(resp.text)[0])
53 | # 我们获取到了数据是不是应该返还给result
54 | print(data)
55 | route.abort()
56 |
57 | with sync_playwright() as p:
58 | # 使用强化脚本来过验证
59 | browser = p.chromium.launch(
60 | # headless=False,
61 | )
62 | ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
63 | content = browser.new_context(user_agent=ua)
64 | content.add_init_script(path=r'D://crawlProjects/stealth.min.js')
65 | page = content.new_page()
66 |
67 | # # 创建一个ws链接
68 | # browser = p.chromium.connect_over_cdp(WS_URL)
69 | # # 使用浏览器的上下文创建页面
70 | # content = browser.contexts[0]
71 | # page = content.new_page()
72 |
73 | page.route(INTERRUPT_ROUTE, intercept_xhr)
74 | page.goto(FILEPATH)
75 |
76 | btn = page.locator('#nc_1_n1z')
77 | btn_position = btn.bounding_box()
78 | new_x = btn_position['x'] + random.randint(290, 310)
79 | new_y = btn_position['y']
80 | page.mouse.click(btn_position['x'], btn_position['y'])
81 |
82 | page.mouse.down()
83 | page.mouse.move(new_x, new_y)
84 | page.mouse.up()
85 |
86 | page.close()
87 | content.close()
88 | browser.close()
89 |
90 | return result
91 |
92 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/wasm/air/ddd.js:
--------------------------------------------------------------------------------
1 | // 由于某些不可抗力此文件分享到百度网盘领取
2 | // 链接:https://pan.baidu.com/s/1DMzG2h0kwnWepxfzhQWBqA?pwd=jrsg
3 | // 提取码:jrsg
4 | // --来自百度网盘超级会员V3的分享
--------------------------------------------------------------------------------
/进阶篇/js逆向/webPack/五矿/crwalBase.py:
--------------------------------------------------------------------------------
1 | import json
2 | from random import uniform
3 | from time import sleep
4 | from typing import Union, Generator, Literal
5 |
6 | from curl_cffi import requests
7 | from ddddocr import DdddOcr
8 | from execjs import compile
9 |
10 | Method = Literal['get', 'post', 'POST', 'GET']
11 |
12 |
13 | class Crawler:
14 | # 设置请求session
15 | session = requests.Session()
16 | # 请求方式
17 | methodProcessors = {
18 | 'get': requests.get,
19 | 'post': requests.post
20 | }
21 | sessionProcessors = {
22 | 'get': session.get,
23 | 'post': session.post
24 | }
25 | # 验证码识别
26 | ocr = DdddOcr()
27 |
28 | def ajax_requests(
29 | self, url: str,
30 | method: Method,
31 | params: dict = None,
32 | jsonData: dict = None,
33 | retryTimes: int = 10,
34 | timeOut: int = 20,
35 | headers: dict = None,
36 | isSession: bool = False,
37 | cookies: dict = None,
38 | ) -> requests.Response:
39 | methodProcessor = self.methodProcessors[method] if not isSession else self.sessionProcessors[method]
40 | for _ in range(retryTimes):
41 | try:
42 | response = methodProcessor(
43 | url=url,
44 | headers=headers,
45 | cookies=cookies,
46 | params=params,
47 | data=json.dumps(jsonData, ensure_ascii=False, separators=(',', ':')),
48 | json=jsonData,
49 | timeout=timeOut
50 | )
51 | return response
52 | except Exception as e:
53 | sleep(uniform(5, 10))
54 | print(
55 | f"错误链接: {url}",
56 | f"请求出现错误, 正在重试: {_}/{retryTimes}",
57 | f"错误信息为: {e}",
58 | sep='\n'
59 | )
60 | else:
61 |             raise RuntimeError(f'重试{retryTimes}次后仍然无法获取数据,可能是加密参数错误或者ip风控')
62 |
63 | def get_code(self, url: str, params: dict = None, jsonData: dict = None) -> str:
64 | imgBytes = self.ajax_requests(
65 | url=url,
66 | method='get',
67 | jsonData=jsonData,
68 | params=params
69 | ).content
70 | return self.ocr.classification(imgBytes)
71 |
72 | @staticmethod
73 | def open_js(path: str):
74 | return compile(open(path, 'r', encoding='utf-8').read())
75 |
76 | # 用于检查传入的键值是否正确
77 | @staticmethod
78 | def check_key(dic: dict, key: str) -> Union[str, int, list, dict]:
79 | if key not in dic:
80 | raise NameError(f'错误的初始化键值, key = {key}')
81 | return dic[key]
82 |
83 | # 在字典中搜索关键字,返回信息,可以搜索到字典中所有匹配的关键字
84 | @staticmethod
85 | def search_dict(items: dict, search_key: str) -> Generator:
86 | stack = [items]
87 | while stack:
88 | current_item = stack.pop()
89 | if isinstance(current_item, dict):
90 |                 for key, value in current_item.items():
91 | if search_key == key:
92 | yield value
93 | else:
94 | stack.append(value)
95 | elif isinstance(current_item, list):
96 | for value in current_item:
97 | stack.append(value)
98 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/webPack/五矿/demo.js:
--------------------------------------------------------------------------------
1 | const crypto = require('crypto');
2 |
3 | function w(hexString) {
4 | const buffer = Buffer.from(hexString, 'hex');
5 | return buffer.toString('base64');
6 | }
7 |
8 | function md5(data) {
9 | return crypto.createHash('md5').update(data).digest('hex');
10 | }
11 |
12 | function rsa(data, key) {
13 | const publicKey = `-----BEGIN PUBLIC KEY-----\n${key}\n-----END PUBLIC KEY-----`;
14 | const buffer = Buffer.from(data, 'utf-8');
15 | const publicKeyBuffer = Buffer.from(publicKey, 'utf-8');
16 | const encryptedData = crypto.publicEncrypt({
17 | key: publicKeyBuffer,
18 | padding: crypto.constants.RSA_PKCS1_PADDING
19 | }, buffer);
20 | return encryptedData.toString('hex')
21 | }
22 |
23 |
24 | function getParams(data, key) {
25 | let a = JSON.stringify({
26 | ...data,
27 | ...{
28 | sign: md5(JSON.stringify(data)),
29 | timeStamp: +new Date
30 | }
31 | })
32 | var r = '';
33 | n = a.match(/.{1,50}/g);
34 | n.forEach((function (A) {
35 | var t = rsa(A, key);
36 | r += t
37 | }))
38 | return w(r)
39 | }
40 |
41 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/webPack/五矿/encode.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import binascii
3 | import hashlib
4 | import json
5 | import time
6 | from Crypto.PublicKey import RSA
7 | from Crypto.Cipher import PKCS1_v1_5
8 |
9 |
10 | def w(hex_string):
11 | byte_data = binascii.unhexlify(hex_string) # 将十六进制字符串转换为字节数组
12 | base64_data = base64.b64encode(byte_data) # 将字节数组转换为Base64编码
13 | return base64_data.decode('utf-8')
14 |
15 |
16 | def md5(data):
17 | return hashlib.md5(data.encode('utf-8')).hexdigest()
18 |
19 |
20 | def rsa(plaintext, key):
21 | publicKey = f'-----BEGIN PUBLIC KEY-----\n{key}\n-----END PUBLIC KEY-----'
22 | public_key = RSA.import_key(publicKey)
23 | cipher_rsa = PKCS1_v1_5.new(public_key)
24 | return cipher_rsa.encrypt(plaintext.encode('utf-8')).hex()
25 |
26 |
27 | def getParams(data, key):
28 | a = json.dumps({
29 | **data,
30 | **{
31 | 'sign': md5(json.dumps(data, separators=(',', ':'), ensure_ascii=False)),
32 | 'timeStamp': int(time.time() * 1000)
33 | }
34 | }, ensure_ascii=False, separators=(',', ':'))
35 | n = [rsa(a[i:i + 50], key) for i in range(0, len(a), 50)]
36 | return w(''.join(n))
37 |
38 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/浏览器指纹检测/易九批/test.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 |
4 | import execjs
5 | from curl_cffi import requests
6 |
7 | URLS = [
8 | 'https://www.yijiupi.com/v54/ProductCategory/ListCategoryTree',
9 | 'https://www.yijiupi.com/v54/PurchaseChannel/List',
10 | 'https://www.yijiupi.com/v54/ProductCategory/ListProductCategory'
11 | ]
12 |
13 |
14 | def get_data(json_data, url, sepUrl):
15 | timestamp = str(int(time.time()))
16 | headers = {
17 | 'Content-Type': 'application/json',
18 | 'token': '',
19 | }
20 | # 问题的关键是把中文好好处理!!
21 | data = json.dumps(json_data, ensure_ascii=False, separators=(',', ':'))
22 | x_ = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()) \
23 | .call('setHeader', 'POST', sepUrl, data, timestamp)
24 | headers.update(x_)
25 | response = requests.post(url, headers=headers,
26 | data=data, impersonate='chrome110')
27 | print(response.json())
28 |
29 |
30 | if __name__ == '__main__':
31 | set1 = {
32 | 'json_data': {
33 | 'data': {
34 | 'zoneId': '4932265882383941446',
35 | },
36 | 'cityId': '701',
37 | 'userClassId': 1,
38 | 'userDisplayClass': 0,
39 | 'addressId': '',
40 | 'deviceType': 3,
41 | },
42 | 'url': URLS[0],
43 | 'sepUrl': '/v54/ProductCategory/ListCategoryTree'
44 | }
45 | # get_data(**set1)
46 | set2 = {
47 | 'json_data': {
48 | 'cityId': '701',
49 | 'userClassId': 1,
50 | 'userDisplayClass': 0,
51 | 'addressId': '',
52 | 'deviceType': 3,
53 | },
54 | 'url': URLS[1],
55 | 'sepUrl': '/v54/PurchaseChannel/List'
56 | }
57 | # get_data(**set2)
58 | set3 = {
59 | 'json_data': {
60 | 'data': {
61 | 'sonCategoryId': '',
62 | 'brandId': '',
63 | 'firstCategoryId': '',
64 | 'searchKey': '国台国酱',
65 | 'specialAreaId': '',
66 | 'categoryIds': [],
67 | 'brandIds': [],
68 | 'labelId': None,
69 | 'isAscending': '',
70 | 'searchModes': [
71 | 2,
72 | ],
73 | 'sort': 0,
74 | 'shopId': '',
75 | 'currentPage': 1,
76 | 'pageSize': 60,
77 | 'filterSpecialArea': False,
78 | 'searchSource': 1,
79 | 'warehouseIds': [],
80 | 'searchKeyNotCorrect': False,
81 | 'couponTemplateId': '',
82 | 'channelId': '',
83 | },
84 | 'cityId': '701',
85 | 'userClassId': 1,
86 | 'userDisplayClass': 0,
87 | 'addressId': '',
88 | 'deviceType': 3,
89 | },
90 | 'url': URLS[2],
91 | 'sepUrl': '/v54/ProductCategory/ListProductCategory'
92 | }
93 | get_data(**set3)
94 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/环境检测/BossJob/chaojiying.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding:utf-8
3 |
4 | from hashlib import md5
5 |
6 | import requests
7 |
8 |
9 | class Chaojiying_Client(object):
10 |
11 | def __init__(self, username, password, soft_id):
12 | self.username = username
13 | password = password.encode('utf8')
14 | self.password = md5(password).hexdigest()
15 | self.soft_id = soft_id
16 | self.base_params = {
17 | 'user': self.username,
18 | 'pass2': self.password,
19 | 'softid': self.soft_id,
20 | }
21 | self.headers = {
22 | 'Connection': 'Keep-Alive',
23 | 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
24 | }
25 |
26 | def PostPic(self, im, codetype):
27 | """
28 | im: 图片字节
29 | codetype: 题目类型 参考 http://www.chaojiying.com/price.html
30 | """
31 | params = {
32 | 'codetype': codetype,
33 | }
34 | params.update(self.base_params)
35 | files = {'userfile': ('ccc.jpg', im)}
36 | r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files,
37 | headers=self.headers)
38 | return r.json()
39 |
40 | def PostPic_base64(self, base64_str, codetype):
41 | """
42 | im: 图片字节
43 | codetype: 题目类型 参考 http://www.chaojiying.com/price.html
44 | """
45 | params = {
46 | 'codetype': codetype,
47 | 'file_base64': base64_str
48 | }
49 | params.update(self.base_params)
50 | r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, headers=self.headers)
51 | return r.json()
52 |
53 | def ReportError(self, im_id):
54 | """
55 | im_id:报错题目的图片ID
56 | """
57 | params = {
58 | 'id': im_id,
59 | }
60 | params.update(self.base_params)
61 | r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
62 | return r.json()
63 |
64 |
65 | if __name__ == '__main__':
66 | chaojiying = Chaojiying_Client('******', '******', '96001') # 用户中心>>软件ID 生成一个替换 96001
67 | im = open('a.jpg', 'rb').read() # 本地图片文件路径 来替换 a.jpg 有时WIN系统须要//
68 | print(chaojiying.PostPic(im, 1902)) # 1902 验证码类型
69 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/环境检测/RedBook/README.md:
--------------------------------------------------------------------------------
1 | # 使用
2 | ## 补环境的完整版
3 | > 见RedBook.py文件。使用前需要修改new/jssss.js文件的localstorage以及cookie
4 |
5 | ## 无需补环境的部分ios端api
6 | > 见demo.py文件,基本上不需要cookie,需要cookie的函数我列出来了
7 | > 其中只有一个接口需要逆向一个x-sign参数,这个参数很好逆向
--------------------------------------------------------------------------------
/进阶篇/js逆向/环境检测/pdd/demo.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 |
4 | anti_content = execjs.compile(open('hello.js', 'r', encoding='utf-8').read()).call('dt')
5 |
6 | headers = {
7 | 'Accept': 'application/json, text/javascript',
8 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
9 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
10 | }
11 |
12 | params = {
13 | 'tf_id': 'TFRQ0v00000Y_13394',
14 | 'page': '1',
15 | 'size': '100',
16 | 'anti_content': anti_content
17 | }
18 |
19 | response = requests.get('https://apiv2.pinduoduo.com/api/gindex/tf/query_tf_goods_info', params=params, headers=headers)
20 | print(response.text)
21 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/环境检测/猿人学2023第一题/test.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 |
4 | cookies = {
5 | }
6 |
7 | headers = {
8 | 'authority': 'match2023.yuanrenxue.cn',
9 | 'accept': 'application/json, text/javascript, */*; q=0.01',
10 | 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
11 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
12 | }
13 | value = 0
14 | for page in range(1, 6):
15 | token = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('solve', page)
16 | response = requests.post('https://match2023.yuanrenxue.cn/api/match2023/1', cookies=cookies, headers=headers,
17 | data=token)
18 | data = response.json()['data']
19 | for v in data:
20 | value += v['value']
21 | print(value)
22 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/环境检测/饿了么/hello.html:
--------------------------------------------------------------------------------
1 | <!DOCTYPE html>
2 | <html lang="en">
3 | <head>
4 |     <meta charset="UTF-8">
5 |     <title>Title</title>
6 | </head>
7 | <body>
8 | 
9 | </body>
10 | </html>
11 | 
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/fjs/demo.js:
--------------------------------------------------------------------------------
1 | const Crypto = require('crypto-js')
2 |
3 | var c = 'EB444973714E4A40876CE66BE45D5930'
4 | var b = 'B5A8904209931867'
5 | function decrypt(t) {
6 | var e = Crypto.enc.Utf8.parse(c)
7 | , n = Crypto.enc.Utf8.parse(b)
8 | , a = Crypto.AES.decrypt(t, e, {
9 | iv: n,
10 | mode: Crypto.mode.CBC,
11 | padding: Crypto.pad.Pkcs7
12 | });
13 | return a.toString(Crypto.enc.Utf8)
14 | }
15 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/fjs/fjs.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 | pageNum = 1
4 | # 控制请求的页数
5 | while pageNum < 2:
6 | # 准备js逆向出请求头和表单签名
7 | ts = int(execjs.compile(open('sign.js', 'r', encoding='utf-8').read()).call('ts'))
8 | json_data = {
9 | 'pageNo': pageNum,
10 | 'pageSize': 40,
11 | 'total': 5770,
12 | 'AREACODE': '',
13 | 'M_PROJECT_TYPE': '',
14 | 'KIND': 'GCJS',
15 | 'GGTYPE': '1',
16 | 'PROTYPE': '',
17 | 'timeType': '6',
18 | 'BeginTime': '2022-07-18 00:00:00',
19 | 'EndTime': '2023-01-18 23:59:59',
20 | 'createTime': [],
21 | 'ts': ts,
22 | }
23 | sign = str(execjs.compile(open('sign.js', 'r', encoding='utf-8').read()).call('sign', json_data))
24 | headers = {
25 | 'Accept': 'application/json, text/plain, */*',
26 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
27 | 'Connection': 'keep-alive',
28 | 'Content-Type': 'application/json;charset=UTF-8',
29 | 'Origin': 'https://ggzyfw.fujian.gov.cn',
30 | 'Referer': 'https://ggzyfw.fujian.gov.cn/business/list/',
31 | 'Sec-Fetch-Dest': 'empty',
32 | 'Sec-Fetch-Mode': 'cors',
33 | 'Sec-Fetch-Site': 'same-origin',
34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
35 | 'portal-sign': sign,
36 | 'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
37 | 'sec-ch-ua-mobile': '?0',
38 | 'sec-ch-ua-platform': '"Windows"',
39 | }
40 |
41 | # 发起请求
42 | response = requests.post('https://ggzyfw.fujian.gov.cn/FwPortalApi/Trade/TradeInfo', headers=headers, json=json_data).json()
43 | data = response['Data']
44 |
45 | # 解密文件
46 | ctx = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('decrypt', data)
47 | print(ctx)
48 | pageNum += 1
49 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/fjs/sign.js:
--------------------------------------------------------------------------------
1 | const Crypto = require('crypto-js')
2 |
3 | var d = "B3978D054A72A7002063637CCDF6B2E5"
4 |
5 | function sign(t) {
6 | for (var e in t)
7 | "" !== t[e] && void 0 !== t[e] || delete t[e];
8 | var n = d + l(t);
9 | return s(n)
10 | }
11 | function s(e) {
12 | return md5(e)
13 | }
14 |
15 | function l(t) {
16 | for (var e = Object.keys(t).sort(u), n = "", a = 0; a < e.length; a++)
17 | if (void 0 !== t[e[a]])
18 | if (t[e[a]] && t[e[a]]instanceof Object || t[e[a]]instanceof Array) {
19 | var i = JSON.stringify(t[e[a]]);
20 | n += e[a] + i
21 | } else
22 | n += e[a] + t[e[a]];
23 | return n
24 | }
25 |
26 | // 创建标准md5算法
27 | function md5(text){
28 | return Crypto.MD5(text).toString()
29 | }
30 | function u(t, e) {
31 | return t.toString().toUpperCase() > e.toString().toUpperCase() ? 1 : t.toString().toUpperCase() == e.toString().toUpperCase() ? 0 : -1
32 | }
33 |
34 | // 测试数据
35 | data = {
36 | 'pageNo': 1,
37 | 'pageSize': 20,
38 | 'total': 0,
39 | 'AREACODE': '',
40 | 'M_PROJECT_TYPE': '',
41 | 'KIND': 'GCJS',
42 | 'GGTYPE': '1',
43 | 'PROTYPE': '',
44 | 'timeType': '6',
45 | 'BeginTime': '2022-07-18 00:00:00',
46 | 'EndTime': '2023-01-18 23:59:59',
47 | 'createTime': [],
48 | 'ts': ts(),
49 | }
50 |
51 | // 生成时间戳
52 | function ts(){
53 | return (new Date).getTime()
54 | }
55 |
56 | console.log(ts())
57 | console.log(sign(data))
58 |
59 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/football/599_info.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 | import time
4 |
5 | headers = {
6 | 'authority': 'api.599.com',
7 | 'accept': 'application/json, text/plain, */*',
8 | 'accept-language': 'zh-CN,zh;q=0.9',
9 | 'cache-control': 'no-cache',
10 | 'origin': 'https://599.com',
11 | 'pragma': 'no-cache',
12 | 'referer': 'https://599.com/',
13 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
14 | 'sec-ch-ua-mobile': '?0',
15 | 'sec-ch-ua-platform': '"Windows"',
16 | 'sec-fetch-dest': 'empty',
17 | 'sec-fetch-mode': 'cors',
18 | 'sec-fetch-site': 'same-site',
19 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
20 | }
21 | ts = int(time.time() * 1000)
22 | pre_params = {
23 | "appType": "3",
24 | "channelNumber": "GF1001",
25 | "comId": "8",
26 | "lang": "zh",
27 | "platform": "pc",
28 | "st": ts,
29 | "timeZone": "8",
30 | "version": "671",
31 | "versionCode": "671"
32 | }
33 | sign = execjs.compile(open('js/sss.js', 'r', encoding='utf-8').read()).call('Z', '/footballapi/core/matchlist/v2/immediate', pre_params)
34 | params = {
35 | 'comId': '8',
36 | 'lang': 'zh',
37 | 'timeZone': '8',
38 | 'version': '671',
39 | 'versionCode': '671',
40 | 'channelNumber': 'GF1001',
41 | 'platform': 'pc',
42 | 'appType': '3',
43 | 'st': str(ts),
44 | 'sign': sign,
45 | }
46 | response = requests.get('https://api.599.com/footballapi/core/matchlist/v2/immediate', params=params, headers=headers)
47 |
48 | data = response.json()['data']
49 | ctx = execjs.compile(open('js/demo.js', 'r', encoding='utf-8').read()).call('decrypt', data)
50 | print(ctx)
51 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/football/js/sss.js:
--------------------------------------------------------------------------------
1 | const crypto = require('crypto-js')
2 |
3 | function md5(text){
4 | text = String(text)
5 | return crypto.MD5(text).toString()
6 | }
7 |
8 | var e = '/footballapi/core/matchlist/v2/immediate'
9 | var t = {
10 | "appType": "3",
11 | "channelNumber": "GF1001",
12 | "comId": "8",
13 | "lang": "zh",
14 | "platform": "pc",
15 | "st": 1678167676726,
16 | "timeZone": "8",
17 | "version": "671",
18 | "versionCode": "671"
19 | }
20 |
21 | function l() {
22 | return e
23 | }
24 | function Z(e, t) {
25 | var n = {}
26 | , o = e;
27 | for (var r in Object.keys(t).sort().map((function(e) {
28 | n[e] = t[e]
29 | }
30 | )),
31 | n)
32 | o = o + r + n[r];
33 | return o += md5("wjj"),
34 | md5(o).toLowerCase() + "99"
35 | }
36 |
37 | console.log(Z(e, t));
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/唯一艺术/demo.js:
--------------------------------------------------------------------------------
1 | const Crypto = require('crypto-js')
2 |
3 | var data = 'truiLeKm7AKyuie+33QCYVOB58uNUU9k+FEIeXVsr/ztKrMa9ytcHn11hxFo6XLAe2ye5nNmVQAAZ3zKiCcZZoPPcUBuypN/3xXg6+l98m38zldv8b2wlIVuy24U1PxbPFKGrQEbJTTwnoujMCcaeZfiOdyyjSMX24EXL8o244bbHdJm6UWRWxMux1ICO9tBg10IQxFo+j8/Cc3jAdGAlg=='
4 |
5 | window = {
6 | deciphering: function (t){
7 | {
8 | e = "4tBlCLWFZ3eD93CvDE2lpw==" || 32;
9 | var o = "ABCDEFGHJKMNPQRSTWXYZabcdefhijkmnprstwxyz2345678"
10 | , r = o.length;
11 | for (let t = 0; t < e; t++)
12 | o.charAt(Math.floor(Math.random() * r));
13 | return t
14 | }
15 | }
16 | }
17 |
18 | function encryptSelf(t, o) {
19 | var r = Crypto.enc.Base64.parse("4tBlCLWFZ3eD93CvDE2lpw==");
20 | let i = JSON.stringify({
21 | id: t.substr(0, t.length - 1),
22 | sum: o
23 | });
24 | var s = Crypto.enc.Utf8.parse(i);
25 | return Crypto.AES.encrypt(s, r, {
26 | mode: Crypto.mode.ECB,
27 | padding: Crypto.pad.Pkcs7
28 | }).toString()
29 | }
30 | function decrypt(t) {
31 | var e = Crypto.enc.Base64.parse("5opkytHOggKj5utjZOgszg==")
32 | var o = Crypto.AES.decrypt(t, e, {
33 | mode: Crypto.mode.ECB,
34 | padding: Crypto.pad.Pkcs7
35 | });
36 | return Crypto.enc.Utf8.stringify(o).toString()
37 | }
38 |
39 | function getSign(data){
40 | let dataresult = decrypt(data)
41 | , dataResultFun = dataresult.split(",")[0].substr(4)
42 | , dataResultId = dataresult.split(",")[1].split("=")[1]
43 | , sigresult = eval(dataResultFun);
44 | console.log(sigresult)
45 | return encryptSelf(dataResultId, sigresult)
46 | // return sigresult
47 | }
48 |
49 | function design(data){
50 | return encodeURIComponent(data)
51 | }
52 |
53 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/唯一艺术/test.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 |
4 |
5 | def getSign():
6 | url = 'https://api.theone.art/market/api/key/get'
7 | headers = {
8 | 'Accept': 'application/json, text/plain, */*',
9 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
10 | 'Connection': 'keep-alive',
11 | 'Origin': 'https://www.theone.art',
12 | 'Referer': 'https://www.theone.art/',
13 | 'Sec-Fetch-Dest': 'empty',
14 | 'Sec-Fetch-Mode': 'cors',
15 | 'Sec-Fetch-Site': 'same-site',
16 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
17 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
18 | 'sec-ch-ua-mobile': '?0',
19 | 'sec-ch-ua-platform': '"Windows"',
20 | }
21 | res = requests.get(url=url, headers=headers).json()
22 | data = str(res['data'])
23 | sign = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('getSign', data)
24 | return sign
25 |
26 |
27 | headers = {
28 | 'Accept': 'application/json, text/plain, */*',
29 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
30 | 'Connection': 'keep-alive',
31 | 'Content-Type': 'application/json;charset=UTF-8',
32 | 'Origin': 'https://www.theone.art',
33 | 'Referer': 'https://www.theone.art/',
34 | 'Sec-Fetch-Dest': 'empty',
35 | 'Sec-Fetch-Mode': 'cors',
36 | 'Sec-Fetch-Site': 'same-site',
37 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
38 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
39 | 'sec-ch-ua-mobile': '?0',
40 | 'sec-ch-ua-platform': '"Windows"',
41 | 'sig': '8hJWPRjfS7l%2Fj86OrejRjAZDLiwIzZfQcKKIuEWB3154u4wv3WeQIv2pV3nzAo3HnXEoW0t6Tmxp9nRUjnrGtA%3D%3D',
42 | }
43 |
44 | for pageNum in range(1, 20):
45 | sign = getSign()
46 | hsign = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('design', sign)
47 | json_data = {
48 | 'authorId': None,
49 | 'chainContract': None,
50 | 'commodityCategoryId': None,
51 | 'commodityCategoryIdList': [],
52 | 'commodityId': None,
53 | 'highPrice': '',
54 | 'lowPrice': '',
55 | 'pageCount': pageNum,
56 | 'pageSize': 20,
57 | 'seriesWorks': None,
58 | 'seriesWorksId': None,
59 | 'sort': {
60 | 'field': 2,
61 | 'upOrDown': 1,
62 | },
63 | 'statusSell': 1,
64 | 'topicId': None,
65 | 'typeMarket': 1,
66 | 'commodityTraitList': [],
67 | 'sig': sign,
68 | }
69 | response = requests.post('https://api.theone.art/market/api/saleRecord/list/v2', headers=headers, json=json_data)
70 | res = response.json()["data"]
71 | print(res)
72 |
73 |
74 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/娱乐指数/ylzs.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 |
4 | cookies = {
5 | 'mobile_iindex_uuid': '9f0ae384-2821-5797-8a76-87bb1cef4a1f',
6 | 'Hm_lvt_2873e2b0bdd5404c734992cd3ae7253f': '1674101222,1674103567',
7 | 'Hm_lpvt_2873e2b0bdd5404c734992cd3ae7253f': '1674103567',
8 | }
9 |
10 | headers = {
11 | 'authority': 'www.chinaindex.net',
12 | 'accept': 'application/json, text/plain, */*',
13 | 'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
14 | # 'cookie': 'mobile_iindex_uuid=9f0ae384-2821-5797-8a76-87bb1cef4a1f; Hm_lvt_2873e2b0bdd5404c734992cd3ae7253f=1674101222,1674103567; Hm_lpvt_2873e2b0bdd5404c734992cd3ae7253f=1674103567',
15 | 'funcid': 'undefined',
16 | 'incognitomode': '0',
17 | 'referer': 'https://www.chinaindex.net/ranklist/5/0',
18 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
19 | 'sec-ch-ua-mobile': '?0',
20 | 'sec-ch-ua-platform': '"Windows"',
21 | 'sec-fetch-dest': 'empty',
22 | 'sec-fetch-mode': 'cors',
23 | 'sec-fetch-site': 'same-origin',
24 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
25 | 'uuid': '9f0ae384-2821-5797-8a76-87bb1cef4a1f',
26 | }
27 |
28 | params = {
29 | 'keyWord': '李知恩',
30 | 'sign': 'b3776cdf6331ee0f6653d1de544291c3'
31 | }
32 |
33 | response = requests.get(
34 | 'https://www.chinaindex.net/iIndexMobileServer/mobile/comm/getSearchResult',
35 | params=params,
36 | cookies=cookies,
37 | headers=headers,
38 | )
39 |
40 | r = response.json()['data']
41 | lastFetchTime = response.json()['lastFetchTime']
42 |
43 | ctx = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('decrypt', r, lastFetchTime)
44 |
45 | print(ctx)
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/广东省公共资源交易/demo.js:
--------------------------------------------------------------------------------
1 | const crypto = require('crypto')
2 | const Py = "zxcvbnmlkjhgfdsaqwertyuiop0987654321QWERTYUIOPLKJHGFDSAZXCVBNM" , jq = Py + "-@#$%^&*+!";
3 | function Nonce(e) {
4 | return [...Array(e)].map(()=>Py[Vq(0, 61)]).join("")
5 | }
6 | function Vq(e, t) {
7 | switch (arguments.length) {
8 | case 1:
9 | return parseInt(Math.random() * e + 1, 10);
10 | case 2:
11 | return parseInt(Math.random() * (t - e + 1) + e, 10);
12 | default:
13 | return 0
14 | }
15 | }
16 | function lr(e=[]) {
17 | return e.map(t=>jq[t]).join("")
18 | }
19 | function Rg(e={}) {
20 | const {p: t, t: n, n: u, k: o} = e
21 | , r = zq(t);
22 | console.log(r)
23 | const hash = crypto.createHash('sha256')
24 | return hash.update(u + o + decodeURIComponent(r) + n).digest('hex')
25 | }
26 | function zq(e) {
27 | let t = "";
28 | return typeof e == "object" ? t = Object.keys(e).map(n=>`${n}=${e[n]}`).sort().join("&") : typeof e == "string" && (t = e.split("&").sort().join("&")),
29 | t
30 | }
31 | function hash256(datas){
32 | let c = lr([8, 28, 20, 42, 21, 53, 65, 6])
33 | a = Date.now()
34 | let l = Nonce(16)
35 | let Signature = Rg({
36 | p: JSON.stringify(datas).replace(/:/g, "=").replace(/["{}]/g, '').replace(/,/g, '&'),
37 | t: a,
38 | n: l,
39 | k: c
40 | })
41 | text = {
42 | App: lr([11, 11, 0, 21, 62, 25, 24, 19, 20, 15, 7]),
43 | Nonce: l,
44 | Signature: Signature,
45 | Timestamp: a,
46 | }
47 | return text
48 | }
49 |
50 | data = {
51 | 'type': "trading-type",
52 | "publishStartTime": "",
53 | "publishEndTime": "",
54 | "siteCode": "44",
55 | "secondType": "A",
56 | "projectType": "",
57 | "thirdType": "",
58 | "dateType": "",
59 | "total": 189352,
60 | "pageNo": 5,
61 | "pageSize": 10,
62 | "openConvert": false
63 | }
64 |
65 | console.log(hash256(data))
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/广东省公共资源交易/guang.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 |
4 | cookies = {
5 | '_horizon_uid': 'd6a5d5ea-b057-4431-8d41-982f8bf12b08',
6 | '_horizon_sid': 'e2c9e3b6-2ee8-49e6-a54a-0a15a39ee1b7',
7 | }
8 |
9 |
10 | def fun(page):
11 | json_data = {
12 | 'type': 'trading-type',
13 | 'publishStartTime': '',
14 | 'publishEndTime': '',
15 | 'siteCode': '44',
16 | 'secondType': 'A',
17 | 'projectType': '',
18 | 'thirdType': '',
19 | 'dateType': '',
20 | 'total': 189836,
21 | 'pageNo': page,
22 | 'pageSize': 10,
23 | 'openConvert': False,
24 | }
25 |
26 | data = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('hash256', json_data)
27 |
28 | headers = {
29 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
30 | 'X-Dgi-Req-App': data['App'],
31 | 'X-Dgi-Req-Nonce': data['Nonce'],
32 | 'X-Dgi-Req-Signature': data['Signature'],
33 | 'X-Dgi-Req-Timestamp': str(data['Timestamp']),
34 | }
35 |
36 | response = requests.post('https://ygp.gdzwfw.gov.cn/ggzy-portal/search/v1/items', cookies=cookies, headers=headers,
37 | json=json_data)
38 | print(response.json())
39 |
40 |
41 | if __name__ == '__main__':
42 | for page in range(1, 5):
43 | fun(page)
44 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/有道翻译/demo.js:
--------------------------------------------------------------------------------
1 | // const Crypto = require('crypto-js')
2 | //
3 | //
4 | // var data = 'Z21kD9ZK1ke6ugku2ccWuz4Ip5f4PLCoxWstZf_6UUyBoy8dpWc3NOXFRrnPMya7chcEL7e2Yz1xjFqcfdncOW4vOoJ66RTmRa8-dGZla_ExpWOUP0G1QJFtJ6Gj0ngir07R0ETWttaGO185v5rccLlZKqOCmJuChZSA-Dw9U6B2AOK4-RqYjAQEQ5vF7ph71eC5ZEvV6dm_xv0ywEOKi58R9xWx7fiJytxxlsz-oprAHdRXnI6kWszLLJJpr45DMBjoeArZfVssgWXzX_IlNUvTtj_1o95BpERVvV1FxGEeN-_TLgLaK9j7rjT4O-yPHpbuCk9q1BpLVSh3B4CPWCZPMIHwJiFtfQAC8_t-HWs45DWbW54DEny_doBItZ6v'
5 | // var key = 'ydsecret://query/key/B*RGygVywfNBwpmBaZg*WT7SIOUP2T0C9WHMZN39j^DAdaZhAnxvGcCY6VYFwnHl'
6 | // var iv = 'ydsecret://query/iv/C@lZe2YzHtZ2CYgaXKSVfsb7Y4QWHjITPPZ0nQp87fBeJ!Iv6v^6fvi2WN@bYpJ4'
7 | //
8 | // var ax = [8, 20, 157, 167, 60, 89, 206, 98, 85, 91, 1, 233, 47, 52, 232, 56]
9 | // var b = [210, 187, 27, 253, 232, 59, 56, 195, 68, 54, 99, 87, 183, 156, 174, 28]
10 |
11 | let data01 = '08149da73c59ce62555b01e92f34e838'//十六进制
12 |
13 | let newdata = Buffer.from(data01,'hex');//先把数据存在buf里面
14 |
15 | console.log("newdata ",newdata);
16 |
17 | console.log(newdata.toString("utf-8"));//使用toString函数就能转换成字符串
18 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/有道翻译/youdao.py:
--------------------------------------------------------------------------------
1 | import json
2 | from Crypto.Cipher import AES
3 | import base64
4 | import time
5 | from hashlib import md5
6 | import requests
7 |
8 |
9 | def sign():
10 | t = int(time.time() * 1000)
11 | n = f'client=fanyideskweb&mysticTime={t}&product=webfanyi&key=fsdsogkndfokasodnaso'
12 | obj = md5()
13 | obj.update(n.encode('utf-8'))
14 | sign = obj.hexdigest()
15 | return sign
16 |
17 |
18 | def decrypto(data):
19 | key = b'\x08\x14\x9d\xa7\x3c\x59\xce\x62\x55\x5b\x01\xe9\x2f\x34\xe8\x38'
20 | iv = b'\xd2\xbb\x1b\xfd\xe8\x3b\x38\xc3\x44\x36\x63\x57\xb7\x9c\xae\x1c'
21 | aes = AES.new(key, AES.MODE_CBC, iv)
22 | den_text = aes.decrypt(base64.urlsafe_b64decode(data))
23 | return str(den_text, 'utf-8').strip()
24 |
25 |
26 | def post(w, f, t):
27 | cookies = {
28 | 'OUTFOX_SEARCH_USER_ID': '123456789@192.168.60.5',
29 | }
30 | headers = {
31 | 'Accept': 'application/json, text/plain, */*',
32 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
33 | 'Connection': 'keep-alive',
34 | # 'Cookie': 'OUTFOX_SEARCH_USER_ID_NCOO=340028215.7799288; OUTFOX_SEARCH_USER_ID=-1551186736@49.52.96.107; P_INFO=18608219667|1670406132|1|youdaonote|00&99|null&null&null#shh&null#10#0|&0||18608219667',
35 | 'Origin': 'https://fanyi.youdao.com',
36 | 'Referer': 'https://fanyi.youdao.com/',
37 | 'Sec-Fetch-Dest': 'empty',
38 | 'Sec-Fetch-Mode': 'cors',
39 | 'Sec-Fetch-Site': 'same-site',
40 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
41 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
42 | 'sec-ch-ua-mobile': '?0',
43 | 'sec-ch-ua-platform': '"Windows"',
44 | }
45 | data = {
46 | 'i': w,
47 | 'from': f,
48 | 'to': t,
49 | 'dictResult': 'true',
50 | 'keyid': 'webfanyi',
51 | 'sign': sign(),
52 | 'client': 'fanyideskweb',
53 | 'product': 'webfanyi',
54 | 'appVersion': '1.0.0',
55 | 'vendor': 'web',
56 | 'pointParam': 'client,mysticTime,product',
57 | 'mysticTime': str(int(time.time() * 1000)),
58 | 'keyfrom': 'fanyi.web',
59 | }
60 | response = requests.post('https://dict.youdao.com/webtranslate', headers=headers, data=data, cookies=cookies)
61 | return response.text
62 |
63 |
64 | if __name__ == '__main__':
65 | while True:
66 | try:
67 | From = input('请输入开始语言(自动auto, 中文zh-CHS, 韩文ko, 英文en)\n')
68 | To = input('请输入翻译的语言(默认, 中文zh-CHS, 韩文ko, 英文en)\n')
69 | word = input('请输入单词:')
70 | enc = post(word, From, To)
71 | ctx = decrypto(enc)
72 | print(ctx)
73 | except:
74 | print('出现异常,请重新输入!')
75 | continue
76 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/烯牛数据/xiniu.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 |
4 |
5 | cookies = {
6 | 'btoken': '89091VUM5EXO41RJFVJ7G478EIJV2990',
7 | 'hy_data_2020_id': '185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb',
8 | 'hy_data_2020_js_sdk': '%7B%22distinct_id%22%3A%22185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb%22%2C%22site_id%22%3A211%2C%22user_company%22%3A105%2C%22props%22%3A%7B%7D%2C%22device_id%22%3A%22185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb%22%7D',
9 | 'sajssdk_2020_cross_new_user': '1',
10 | 'Hm_lvt_42317524c1662a500d12d3784dbea0f8': '1674013672',
11 | 'Hm_lpvt_42317524c1662a500d12d3784dbea0f8': '1674021425',
12 | }
13 |
14 | headers = {
15 | 'authority': 'www.xiniudata.com',
16 | 'accept': 'application/json',
17 | 'accept-language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
18 | 'content-type': 'application/json',
19 | # 'cookie': 'btoken=89091VUM5EXO41RJFVJ7G478EIJV2990; hy_data_2020_id=185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb; hy_data_2020_js_sdk=%7B%22distinct_id%22%3A%22185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb%22%2C%22site_id%22%3A211%2C%22user_company%22%3A105%2C%22props%22%3A%7B%7D%2C%22device_id%22%3A%22185c2fd82a1a09-073d27a69f05c6-26021151-1327104-185c2fd82a2dcb%22%7D; sajssdk_2020_cross_new_user=1; Hm_lvt_42317524c1662a500d12d3784dbea0f8=1674013672; Hm_lpvt_42317524c1662a500d12d3784dbea0f8=1674021425',
20 | 'origin': 'https://www.xiniudata.com',
21 | 'referer': 'https://www.xiniudata.com/industry/newest?from=data',
22 | 'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
23 | 'sec-ch-ua-mobile': '?0',
24 | 'sec-ch-ua-platform': '"Windows"',
25 | 'sec-fetch-dest': 'empty',
26 | 'sec-fetch-mode': 'cors',
27 | 'sec-fetch-site': 'same-origin',
28 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
29 | }
30 |
31 | payload = {
32 | "sort": 1,
33 | "start": 0,
34 | "limit": 20
35 | }
36 |
37 | pl = str(execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('hhy', payload))
38 | sig = str(execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('hy', payload))
39 |
40 | json_data = {
41 | 'payload': pl,
42 | 'sig': sig,
43 | 'v': 1,
44 | }
45 |
46 | response = requests.post(
47 | 'https://www.xiniudata.com/api2/service/x_service/person_industry_list/list_industries_by_sort',
48 | # https://www.xiniudata.com/api2/service/x_service/person_industry_list/list_industries_by_sort
49 | cookies=cookies,
50 | headers=headers,
51 | json=json_data,
52 | )
53 |
54 | res = response.json()['d']
55 |
56 | ctx = str(execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('dy', res))
57 |
58 | print(ctx)
59 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/网易云音乐/decrpyo.py:
--------------------------------------------------------------------------------
1 | import random
2 | from binascii import hexlify
3 | import base64
4 | from Crypto.Cipher import AES
5 |
6 | e = "010001"
7 | g = "0CoJUm6Qyw8W8jud"
8 | f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7"
9 | i3x = '{"csrf_token":"","cursor":"1672939386847","offset":"0","orderType":"1","pageNo":"3","pageSize":"20","rid":"R_SO_4_1835283134","threadId":"R_SO_4_1835283134"}'
10 |
11 |
12 | # 生成随机的16位字符传
13 | def RandomString(a):
14 | string = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
15 | randomStr = random.sample(string, a)
16 | return ''.join(randomStr)
17 |
18 |
19 | # AES加密算法
20 | def AESEncrypto(text, key):
21 | BS = 16
22 | pad = lambda s: s + (BS - len(s) % BS) * bytes([BS - len(s) % BS])
23 | c = key.encode("utf-8")
24 | d = "0102030405060708".encode("utf-8")
25 | e = text.encode("utf-8")
26 | aes = AES.new(c, AES.MODE_CBC, d)
27 | enc = base64.b64encode(aes.encrypt(pad(e))).decode("utf-8")
28 | return enc
29 |
30 |
31 | # RSA加密
32 | def RSAEncrypto(text):
33 | text = text[::-1] # 表示文本倒序
34 | result = pow(int(hexlify(text.encode('utf-8')), 16), int(e, 16), int(f, 16))
35 | return format(result, 'x').zfill(131)
36 |
37 |
38 | def d(text):
39 | i = RandomString(16)
40 | encText = AESEncrypto(text, g)
41 | encText = AESEncrypto(encText, i)
42 | encSecKey = RSAEncrypto(i)
43 | h = {
44 | "encText": encText,
45 | "encSecKey": encSecKey
46 | }
47 | return h
48 |
49 |
50 |
51 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/艺恩数据/yien.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 |
4 | headers = {
5 | 'Accept': 'text/plain, */*; q=0.01',
6 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
7 | 'Connection': 'keep-alive',
8 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
9 | 'Origin': 'https://www.endata.com.cn',
10 | 'Sec-Fetch-Dest': 'empty',
11 | 'Sec-Fetch-Mode': 'cors',
12 | 'Sec-Fetch-Site': 'same-origin',
13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
14 | 'X-Requested-With': 'XMLHttpRequest',
15 | 'sec-ch-ua': '"Not_A Brand";v="99", "Google Chrome";v="109", "Chromium";v="109"',
16 | 'sec-ch-ua-mobile': '?0',
17 | 'sec-ch-ua-platform': '"Windows"',
18 | }
19 |
20 | data = {
21 | 'year': '2023',
22 | 'MethodName': 'BoxOffice_GetYearInfoData',
23 | }
24 |
25 | response = requests.post('https://www.endata.com.cn/API/GetData.ashx', headers=headers, data=data)
26 |
27 | ctx = execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('decrypt', response.text)
28 | print(ctx)
29 |
--------------------------------------------------------------------------------
/进阶篇/js逆向/请求头请求体加密/行行查/hanghangcha.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import execjs
3 |
4 | cookies = {
5 | 'UM_distinctid': '185c4195bd7e6f-038d88d1a6e504-26021151-144000-185c4195bd8968',
6 | 'Hm_lvt_1521e0fb49013136e79181f2888214a7': '1674032275',
7 | 'Hm_lpvt_1521e0fb49013136e79181f2888214a7': '1674032275',
8 | 'JSESSIONID': 'F83DF5ABA6CAAEE674C850D3483CB550',
9 | '_ACCOUNT_': 'OTM0NmEzMDU1YmEzNGY4MDk3NjliZDI4NjUyNzhmNDElNDAlNDBtb2JpbGU6MTY3NTI0MzYxNzI2NjowYjBlNmMwYzJhZTFhYjFjNzFjZjIyYTQ5MDM1ZDA4Yg',
10 | }
11 |
12 | headers = {
13 | 'Accept': 'application/json, text/javascript, */*; q=0.01',
14 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
15 | 'Auth-Plus': '',
16 | 'Connection': 'keep-alive',
17 | # 'Cookie': 'UM_distinctid=185c4195bd7e6f-038d88d1a6e504-26021151-144000-185c4195bd8968; Hm_lvt_1521e0fb49013136e79181f2888214a7=1674032275; Hm_lpvt_1521e0fb49013136e79181f2888214a7=1674032275; JSESSIONID=F83DF5ABA6CAAEE674C850D3483CB550; _ACCOUNT_=OTM0NmEzMDU1YmEzNGY4MDk3NjliZDI4NjUyNzhmNDElNDAlNDBtb2JpbGU6MTY3NTI0MzYxNzI2NjowYjBlNmMwYzJhZTFhYjFjNzFjZjIyYTQ5MDM1ZDA4Yg',
18 | 'Origin': 'https://www.hanghangcha.com',
19 | 'Sec-Fetch-Dest': 'empty',
20 | 'Sec-Fetch-Mode': 'cors',
21 | 'Sec-Fetch-Site': 'same-site',
22 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
23 | 'X-Requested-With': 'XMLHttpRequest',
24 | 'clientInfo': 'web',
25 | 'clientVersion': '1.0.0',
26 | 'sec-ch-ua': '"Not?A_Brand";v="8", "Chromium";v="108", "Google Chrome";v="108"',
27 | 'sec-ch-ua-mobile': '?0',
28 | 'sec-ch-ua-platform': '"Windows"',
29 | }
30 |
31 | params = {
32 | 'filter': '{"title":null,"sortType":null,"limit":9,"skip":0,"userId":2636778}',
33 | }
34 |
35 | response = requests.get(
36 | 'https://api.hanghangcha.com/hhc/industry/articleWithTags',
37 | params=params,
38 | cookies=cookies,
39 | headers=headers,
40 | )
41 |
42 | data = response.json()['data']
43 |
44 | ctx = str(execjs.compile(open('demo.js', 'r', encoding='utf-8').read()).call('decrypt', data))
45 |
46 | print(ctx)
47 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/dandanzan/M3U8.py:
--------------------------------------------------------------------------------
1 | import time
2 | import m3u8, requests, os
3 | from urllib.parse import urljoin
4 | from Crypto.Cipher import AES
5 | from requests.exceptions import RequestException
6 |
7 |
8 | class M3U8:
9 | def __init__(self, url):
10 | self.decryptor = None
11 | self.url = url
12 | self.count = 0
13 | self.ts_urls = []
14 |
15 | def get_ts_url(self):
16 | son_url = self.url
17 | key_url = None
18 | r = m3u8.load(son_url).data
19 | self.ts_urls = [urljoin(son_url, ts['uri']) for ts in r['segments']]
20 | try:
21 |             if r['segments'][0]['key']['uri'].lower().startswith('http'):
22 | key_url = r['segments'][0]['key']['uri']
23 | else:
24 | key_url = urljoin(son_url, r['segments'][0]['key']['uri'])
25 | except:
26 | pass
27 | if key_url:
28 | key = requests.get(key_url).content
29 | self.decryptor = AES.new(key, AES.MODE_CBC, b'\x00' * 16)
30 | else:
31 | self.decryptor = None
32 |
33 | def download(self, url, path, index, retry_times=5):
34 | for i in range(retry_times):
35 | try:
36 | resp = requests.get(url, timeout=60)
37 | if resp.status_code == 200:
38 | with open(path + f'\\{index}.ts', 'wb') as f:
39 | if self.decryptor:
40 | f.write(self.decryptor.decrypt(resp.content))
41 | else:
42 | f.write(resp.content)
43 | self.count += 1
44 | if self.count % 100 == 0:
45 | print(f'已经下载{self.count}个分片了!')
46 | return True
47 | except RequestException as e:
48 | print(f"Download failed: {url}\n{e}\nretrying ({i + 1}/{retry_times})...")
49 | time.sleep(5)
50 | raise RequestException(f"Failed to download {url} after {retry_times} retries.")
51 |
52 | @staticmethod
53 | def merge(total, path):
54 | with open(path + '\\video.mp4', 'ab') as fp:
55 | for index in range(total):
56 | try:
57 | f = path + f'\\{index}.ts'
58 | content = open(f, 'rb').read()
59 | fp.write(content)
60 | os.remove(path + f'\\{index}.ts')
61 | except Exception as e:
62 | print(e)
63 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/dandanzan/main.py:
--------------------------------------------------------------------------------
1 | from drama import drama
2 | from movie import movie
3 | from variety import variety
4 | import os, subprocess
5 |
6 |
7 | def clear_screen():
8 | subprocess.call('cls', shell=True)
9 |
10 |
11 | def drama_fun():
12 | key = input('请输入电视剧名称: ')
13 | d = drama(pipe)
14 | d.search(key)
15 | if d.item:
16 | if input('请输入1进行下载, 其他任意键回到主页面: ') == '1':
17 | flag = input('输入选择下载的剧集序号: ')
18 | ji = int(input('输入开始集数: '))
19 | jj = int(input('输入结束集数: '))
20 | clear_screen()
21 | d.get_m3u8_url(flag, ji, jj)
22 | d.download_all(ji, jj)
23 | clear_screen()
24 | else:
25 | clear_screen()
26 |
27 |
28 | def movie_fun():
29 | key = input('请输入电影名称: ')
30 | m = movie(pipe)
31 | m.search(key)
32 | if m.item:
33 | if input('请输入1进行下载, 其他任意键回到主页面: ') == '1':
34 | f = input('输入选择下载的电影序号: ')
35 | clear_screen()
36 | m.get_m3u8(f)
37 | m.download_movie()
38 | clear_screen()
39 | else:
40 | clear_screen()
41 |
42 |
43 | def variety_fun():
44 | key = input('请输入综艺的名称: ')
45 | v = variety(pipe)
46 | v.search(key)
47 | if v.item:
48 | flag = input('输入选择查看的综艺序号: ')
49 | clear_screen()
50 | v.print_num(flag)
51 | if input('请输入1进行下载, 其他任意键回到主页面: ') == '1':
52 | num = input('输入选择下载的期数: ')
53 | v.get_m3u8_urls(flag, num)
54 | v.download(num)
55 | clear_screen()
56 | else:
57 | clear_screen()
58 |
59 |
60 | if __name__ == '__main__':
61 | if not os.path.exists('D:/m3u8视频'):
62 | os.mkdir('D:/m3u8视频')
63 |     pipe = input('请输入选择的下载通道(0-5)\n建议通道0, 如果出现程序闪退可考虑更换通道,或者打开VPN\n')
64 | while True:
65 | choice = input('请输入想要搜索的类型(1 表示电视剧, 2 表示电影, 3 表示综艺, 其他任意键退出): ')
66 | clear_screen()
67 | if choice == '1':
68 | drama_fun()
69 | elif choice == '2':
70 | movie_fun()
71 | elif choice == '3':
72 | variety_fun()
73 | else:
74 | break
75 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/dandanzan/movie.py:
--------------------------------------------------------------------------------
1 | import m3u8, re
2 | from lxml import etree
3 | from M3U8 import M3U8
4 | from drama import drama
5 | import prettytable as pt
6 | from urllib.parse import urljoin
7 | from concurrent.futures import ThreadPoolExecutor, wait
8 |
9 |
10 | class movie(drama):
11 | def search(self, key):
12 | params = {
13 | 'q': key,
14 | }
15 | response = self.session.get('https://dandanzan.net/so', params=params, headers=self.headers)
16 | tree = etree.HTML(response.text)
17 | li_list = tree.xpath('//div[@class="lists-content"]/ul/li')
18 | for li in li_list:
19 | try:
20 | a = li.xpath('./a/@href')[0]
21 | id = re.findall(self.r, a)[0]
22 | length = li.xpath('./a/div[1]/span/text()')[0].strip()
23 | name = li.xpath('./h2/a//text()')[0].strip()
24 | if not length.startswith('第'):
25 | s = {
26 | 'id': id,
27 | 'length': length,
28 | 'name': name
29 | }
30 | self.item.append(s)
31 | except:
32 | pass
33 | tb = pt.PrettyTable()
34 | tb.field_names = ['序号', '电影名称', '清晰度']
35 | tb.align = 'c'
36 | # 填充宽度
37 | tb.padding_width = 5
38 | count = 0
39 | for item in self.item:
40 | tb.add_row([count, item['name'], item['length']])
41 | count += 1
42 | print(tb)
43 |
44 | def get_m3u8(self, flag):
45 | id = self.item[int(flag)]['id']
46 | length = 'hd'
47 | self.dir_name = self.item[int(flag)]['name']
48 | url = f'https://dandanzan.net/fetch_plays/{id}/{length}'
49 | response = self.session.get(url)
50 | try:
51 | father_url = response.json()['video_plays'][self.pipe]['play_data']
52 | f_fata = m3u8.load(father_url).data
53 | son_url = urljoin(father_url, f_fata['playlists'][0]['uri'])
54 | self.m3u8_obj.append(M3U8(son_url))
55 | print('下载链接已获取')
56 | except Exception:
57 |             raise RuntimeError('出现错误,电影资源不存在')
58 |
59 | def download_movie(self):
60 | self.m3u8_obj[0].get_ts_url()
61 | total = len(self.m3u8_obj[0].ts_urls)
62 | self.create_dir(1)
63 | print(f'视频一共的分片是{total}个...')
64 | with ThreadPoolExecutor(max_workers=16) as executor:
65 | futures = []
66 | for j, data in enumerate(self.m3u8_obj[0].ts_urls):
67 | future = executor.submit(self.m3u8_obj[0].download, data, self.path, j)
68 | futures.append(future)
69 | wait(futures)
70 | executor.shutdown()
71 | self.m3u8_obj[0].merge(total, self.path)
72 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/dandanzan/variety.py:
--------------------------------------------------------------------------------
1 | import m3u8, re
2 | from movie import movie
3 | from lxml import etree
4 | import prettytable as pt
5 | from urllib.parse import urljoin
6 | from concurrent.futures import ThreadPoolExecutor, wait
7 | from M3U8 import M3U8
8 |
9 |
10 | class variety(movie):
11 | def search(self, key):
12 | params = {
13 | 'q': key,
14 | }
15 | response = self.session.get('https://dandanzan.net/so', params=params, headers=self.headers)
16 | tree = etree.HTML(response.text)
17 | li_list = tree.xpath('//div[@class="lists-content"]/ul/li')
18 | for li in li_list:
19 | try:
20 | a = li.xpath('./a/@href')[0]
21 | id = re.findall(self.r, a)[0]
22 | length = li.xpath('./a/div[1]/span/text()')[0].strip()
23 | name = li.xpath('./h2/a//text()')[0].strip()
24 | s = {
25 | 'url': 'https://dandanzan.net' + a,
26 | 'id': id,
27 | 'length': length,
28 | 'name': name
29 | }
30 | self.item.append(s)
31 | except:
32 | pass
33 | tb = pt.PrettyTable()
34 | tb.field_names = ['序号', '综艺名称', '最新一期']
35 | tb.align = 'c'
36 | # 填充宽度
37 | tb.padding_width = 5
38 | count = 0
39 | for item in self.item:
40 | tb.add_row([count, item['name'], item['length']])
41 | count += 1
42 | print(tb)
43 |
44 | def get_m3u8_urls(self, flag, num):
45 | id = self.item[int(flag)]['id']
46 | length = num
47 | self.dir_name = self.item[int(flag)]['name']
48 | url = f'https://dandanzan.net/fetch_plays/{id}/{length}'
49 | response = self.session.get(url)
50 | try:
51 | father_url = response.json()['video_plays'][self.pipe]['play_data']
52 | f_fata = m3u8.load(father_url).data
53 | son_url = urljoin(father_url, f_fata['playlists'][0]['uri'])
54 | self.m3u8_obj.append(M3U8(son_url))
55 | print('下载链接已获取')
56 | except Exception:
57 |             raise RuntimeError('出现错误,综艺资源不存在')
58 |
59 | def print_num(self, flag):
60 | url = self.item[int(flag)]['url']
61 | resp = self.session.get(url, headers=self.headers)
62 | tree = etree.HTML(resp.text)
63 | li_list = tree.xpath('//ul[@id="eps-ul"]/li')
64 | num = []
65 | for li in li_list:
66 | number = li.xpath('./@ep_slug')[0]
67 | num.append(number)
68 | table = pt.PrettyTable()
69 | table.field_names = ['期数1', '期数2', '期数3', '期数4', '期数5']
70 | # 计算需要填充的空值数量
71 |         num_padding = (5 - len(num) % 5) % 5
72 | # 填充空值
73 | num += [None] * num_padding
74 | for i in range(0, len(num), 5):
75 | table.add_row([*num[i:i + 5]])
76 | print(table)
77 |
78 | def download(self, num):
79 | self.m3u8_obj[0].get_ts_url()
80 | total = len(self.m3u8_obj[0].ts_urls)
81 | self.create_dir(num)
82 | print(f'视频一共的分片是{total}个...')
83 | with ThreadPoolExecutor(max_workers=16) as executor:
84 | futures = []
85 | for j, data in enumerate(self.m3u8_obj[0].ts_urls):
86 | future = executor.submit(self.m3u8_obj[0].download, data, self.path, j)
87 | futures.append(future)
88 | wait(futures)
89 | executor.shutdown()
90 | self.m3u8_obj[0].merge(total, self.path)
91 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/weibo全站爬取/base.py:
--------------------------------------------------------------------------------
1 | import json
2 | from random import uniform
3 | from time import sleep
4 | from typing import Union, Literal
5 |
6 | from curl_cffi import requests
7 |
8 | # 类型控制
9 | Accept = Literal['json', 'text', 'contents']
10 | Method = Literal['get', 'post', 'POST', 'GET']
11 |
12 |
13 | class Base:
14 | # 设置请求session
15 | session = requests.Session()
16 | # 请求头
17 | headers: dict = {}
18 | # 用户cookie
19 | cookies: dict = {}
20 | # 返回指定数据类型
21 | dataProcessors = {
22 | 'json': lambda resp: resp.json(),
23 | 'text': lambda resp: resp.text,
24 | 'contents': lambda resp: resp.content
25 | }
26 | # 请求方式
27 | methodProcessors = {
28 | 'get': requests.get,
29 | 'post': requests.post
30 | }
31 |
32 | def ajax_requests(
33 | self, url: str,
34 | method: Method,
35 | dataType: Accept,
36 | params: Union[dict, str, None],
37 | jsonData: Union[dict, None],
38 | retryTimes: int = 5,
39 | timeOut: int = 20
40 | ) -> Union[dict, str, bytes]:
41 | # 初始化请求发送器以及数据获取器
42 | dataProcessor = self.dataProcessors[dataType]
43 | methodProcessor = self.methodProcessors[method]
44 | for _ in range(retryTimes):
45 | try:
46 | response = methodProcessor(
47 | url=url,
48 | headers=self.headers,
49 | cookies=self.cookies,
50 | params=params,
51 | data=json.dumps(jsonData, ensure_ascii=False, separators=(',', ':')),
52 | timeout=timeOut
53 | )
54 | return dataProcessor(response)
55 | except json.decoder.JSONDecodeError:
56 | raise ValueError(f'无法被解析为json格式,错误链接为: {url}')
57 | except Exception as e:
58 | sleep(uniform(1, 5))
59 | print(
60 | f"错误链接: {url}",
61 | f"请求出现错误, 正在重试: {_}/{retryTimes}",
62 | f"错误信息为: {e}",
63 | sep='\n'
64 | )
65 | else:
66 |             raise RuntimeError(f'重试{retryTimes}次后仍然无法获取数据,可能是加密参数错误或者ip风控')
67 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/baidu.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import os
3 | from mix_media import get
4 |
5 |
6 | def func(text, lan):
7 | params = {
8 | 'lan': lan,
9 | 'text': text,
10 | 'spd': '3',
11 | 'source': 'web',
12 | }
13 |
14 | response = requests.get('https://fanyi.baidu.com/gettts', params=params)
15 | return response.content
16 |
17 |
18 | def baidu(filepath, lan):
19 | with open(filepath, 'r', encoding='utf-8') as file:
20 | list = file.readlines()
21 | name = os.path.basename(filepath)
22 | if not os.path.exists(f'./media/baidu/{name}'):
23 | os.makedirs(f'./media/baidu/{name}')
24 | flag = 1
25 | while flag <= len(lines):
26 | text = lines[flag - 1].replace('\n', '')
27 | if text is not None:
28 | print(text)
29 | resp = func(text, lan)
30 | with open(f'./media/baidu/{name}/{flag}.mp3', 'wb') as file:
31 | file.write(resp)
32 | flag += 1
33 | get(f'media/baidu/{name}')
34 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/main.py:
--------------------------------------------------------------------------------
1 | from baidu import baidu
2 | from youdao import youdao
3 |
4 |
5 | if __name__ == '__main__':
6 | while True:
7 | flag = input('请选择来源:(1.百度 2.有道 3.退出)\n')
8 | if flag == '3':
9 | break
10 | path = input('请输入文件路径:\n')
11 | lan = input('请输入文件语言:(zh, en, kr/kor[有道, 百度])\n')
12 | if flag == '1':
13 | baidu(path, lan)
14 | elif flag == '2':
15 | youdao(path, lan)
16 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/media/baidu/kr/all.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/baidu/kr/all.mp3
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/media/baidu/wenku.txt/all.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/baidu/wenku.txt/all.mp3
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/media/youdao/kr/all.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/youdao/kr/all.mp3
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/media/youdao/kr_exam/all.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/youdao/kr_exam/all.mp3
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/media/youdao/wenku.txt/all.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xishandong/crawlProject/0fabdefb7eb966c1f342b95c2b09f48bdad52f9f/进阶篇/基础综合/语音爬虫/media/youdao/wenku.txt/all.mp3
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/mix_media.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 |
5 | def get(dir_path):
6 | file_list = os.listdir(dir_path)
7 | file_list = sorted([i for i in file_list if re.match(r'\d+', i)], key=lambda i: int(re.match(r'(\d+)', i).group()))  # 按文件名开头的数字排序,保证合并顺序,并过滤掉不以数字开头的文件
8 | contents = []
9 | for cur_file in file_list:
10 | path = os.path.join(os.path.abspath(dir_path), cur_file)
11 | with open(path, 'rb') as fp:
12 | content = fp.read()
13 | contents.append(content)
14 | # os.remove(path)
15 | print(path)
16 | with open(f'{dir_path}/all.mp3', 'wb') as f:
17 | for c in contents:
18 | f.write(c)
19 |
20 |
21 | if __name__ == '__main__':
22 | path = input('请输入目录:\n')
23 | get(path)
24 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/text/en:
--------------------------------------------------------------------------------
1 | SHA-256 uses a series of mathematical operations,
2 | including bitwise operations (such as XOR, AND, and OR), rotations, and additions, to transform the input message and the internal state of the algorithm through 64 rounds.
3 | Each round has its own specific set of constants and functions.
4 | The message is first padded to a multiple of 512 bits (the block size) with a 1 bit,
5 | followed by as many 0 bits as necessary to reach the end of the block, and then a 64-bit integer representing the original length of the message is appended.
6 | The resulting message is then split into 512-bit blocks, which are processed in sequence.
7 | Each block is first divided into 16 32-bit words, which are then expanded into 64 32-bit words using a function that involves XOR, rotations, and additions.
8 | The expanded words are then processed through a series of 64 rounds, each of which involves several steps:
9 | The round constant is added to one of the words.
10 | Several words are passed through a set of logical functions (such as XOR, AND, and OR) and then added to another word.
11 | The words are rotated by a certain number of bits.
12 | The words are passed through another set of logical functions and added to another word.
13 | At the end of the 64 rounds, the resulting words are added to the initial hash values, and the resulting values are the final hash of the message.
14 | The specific constants and functions used in each round are carefully chosen to provide a high level of security and resistance to various attacks.
15 | The entire process of SHA-256 is designed to be computationally expensive and difficult to reverse, making it a strong cryptographic hash function.
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/text/kr:
--------------------------------------------------------------------------------
1 | 사람들은 모두 음식을 생각하면 즐거워질 것입니다 .
2 | “무엇을 , 어떻게 먹을까?”
3 | 아주 오랜 옛날부터 사람들은 이런 생각을 했습니다 . 그리고 음식과 음식을 먹는 방법은 나라마다 아주 다양합니다 .
4 | 한국 음식은 국물이 많기 때문에 숟가락과 젓가락을 사용하고 밥을 먹을 때도 그릇을 상 위에 놓고 숟가락으로 먹습니다 . 일본 사람들은 밥을 젓가락으로 먹기 때문에 밥그릇을 들고 먹어야 합니다 . 중국에는 튀긴 음식과 볶은 음식이 많아서 기름이 많고 뜨겁기 때문에 중국 젓가락은 한국 젓가락보다 더 깁니다 .
5 | 노르웨이는 겨울이 긴 나라입니다 . 그래서 사람들은 고기나 생선에 소금을 뿌려서 말린 후에 추운 경울에도 오랫동안 먹을 수 있는 음식을 만들었습니다 . 인도네시아는 날씨가 더워서 음식이 쉽게 상할 수 있기 때문에 볶은 음식이 많습니다 .
6 | 이렇게 나라마다 다른 식사 방법과 다양한
7 | 음식은 그 나라의 문화를 잘 보여 줍니다 . 다른
8 | 나라의 문화를 잘 알고 싶으면 그 나라의
9 | 음식을 드셔 보세요 .
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/text/kr_exam.txt:
--------------------------------------------------------------------------------
1 | 당신은 무슨 요리를 좋아합니까
2 | 나는 불고기를 좋아합니다
3 | 불고기 맛이 어때요
4 | 이 식당에서 무슨 음식이 제일 맛있어요
5 | 불고기 하나랑 비빔밥 하나 주세요
6 | 백화점과 지하철이 있습니다
7 | 우리 집은 아버지 어머니 누나 그리고 저, 모두 네 식구입니다
8 | 저는 한국어를 좋아합니다
9 | 고향에는 한국 친구가 있습니다
10 | 깨끗하다
11 | 더러 워
12 | 추 워
13 | 뜨 거 운
14 | 조용하다
15 | 시끄럽다
16 | 널찍하다
17 | 좁은
18 | 비 싼
19 | 값싼
20 | 우리가족은 다섯명입니다.
21 | 아버지, 어머니, 외할머니, 형님과 저입니다.
22 | 어머니와 아버지는 선생님이고 형은 의사입니다.
23 | 그리고 저는 대학생입니다.저의 고향은 중국 사천성입니다
24 | 우리 가족은 모두 불고기를 좋아한다.
25 | 그렇지만 형는 회를 더 좋아해요.
26 | 내가 가장 좋아하는 한국음식은 비빔밥이다
27 | 내가 나중에 한국을 여행할 수 있기를 바랍니다
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/text/wenku.txt:
--------------------------------------------------------------------------------
1 | 今天教给大家一种免费白嫖百度文库付费文档的方法
2 | 首先我们选择自己想要下载的文档
3 | 然后复制文档的链接
4 | 打开浏览器无痕模式
5 | 点击链接旁的小锁
6 | 将所有cookie禁用
7 | 右键检查
8 | 点击左上角的平板
9 | 切换模式再刷新页面
10 | 就可以看到完整的文档了
11 | 然后再点击平板
12 | 就可以复制所有文档
13 | 学会的朋友们记得一件三联哦
14 | 关注我
15 | 我是一个爱挖漏洞的up主
16 | 继续带你免费白嫖资源
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/text/zh.txt:
--------------------------------------------------------------------------------
1 | 实现hash256算法的过程:
2 | 第一、初始化
3 | 将32位整数赋值给8个变量,这8个变量是工作变量,以及定义32位小常数
4 | 第二、填充
5 | 将需要加密的文本内容填充到能被512整除余448,填充方式是在文本末尾添加1和若干个0
6 | 如果数据模512大于448,则需要再添加一组512位填充
7 | 第三、处理消息分组
8 | 将填充后的数据分为若干个512位的数据块,每个数据块称为一个消息分组。
9 | 对于每个消息分组,需要进行一次消息扩展操作和64次压缩函数操作
10 | a.消息扩展
11 | 将一个消息分组扩展为64个32位整数
12 | 对于每一块,将块分解为16个32-bit的big-endian的字,记为w[0], …, w[15]
13 | 也就是说,前16个字直接由消息的第i个块分解得到
14 | b.压缩函数
15 | (1) 将8个工作变量的值赋给8个临时变量
16 | (2) 对扩展消息进行64轮的处理
17 | (3) 在完成64轮的压缩函数操作之后,将新的哈希值与当前消息分组之前的哈希值相加,生成最终的哈希值。
18 | 第四、输出
19 |
20 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/text/zh_kr_exam:
--------------------------------------------------------------------------------
1 | 姓名 老师 姓 印度人 人 朋友 国家 公司职员 学生 大学生 警察 歌手 医生 护士 秘书 律师
2 | 谁 学校 课本 字典 杂志 钱包 橡皮 护照 钥匙 饼干 教室 银行 学生会馆 图书馆 洗手间
3 | 食堂 书店 邮政局 办公室 电脑 牛奶 啤酒 冰箱 百货店 地下铁 剧场 医院 公司
4 | 你喜欢吃什么菜
5 | 我喜欢烤肉
6 | 烤肉味道怎么样
7 | 这家店什么菜最好吃
8 | 请给我一份烤肉和一碗拌饭
9 | 有百货店和地下铁
10 | 我们家有爸爸妈妈姐姐还有我一共四口人
11 | 我喜欢韩国语
12 | 故乡有韩国朋友
--------------------------------------------------------------------------------
/进阶篇/基础综合/语音爬虫/youdao.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import os
3 | from mix_media import get
4 |
5 |
6 | def func(text, lan):
7 | cookies = {
8 | 'OUTFOX_SEARCH_USER_ID_NCOO': '1065325158.1443799',
9 | 'OUTFOX_SEARCH_USER_ID': '-527773617@180.168.188.248',
10 | }
11 |
12 | headers = {
13 | 'Accept': '*/*',
14 | 'Accept-Language': 'zh-CN,zh;q=0.9,zh-TW;q=0.8,en;q=0.7',
15 | 'Connection': 'keep-alive',
16 | # 'Cookie': 'OUTFOX_SEARCH_USER_ID_NCOO=1065325158.1443799; OUTFOX_SEARCH_USER_ID=-527773617@180.168.188.248',
17 | 'Range': 'bytes=0-',
18 | 'Referer': 'https://fanyi.youdao.com/',
19 | 'Sec-Fetch-Dest': 'audio',
20 | 'Sec-Fetch-Mode': 'no-cors',
21 | 'Sec-Fetch-Site': 'same-site',
22 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
23 | 'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
24 | 'sec-ch-ua-mobile': '?0',
25 | 'sec-ch-ua-platform': '"Windows"',
26 | }
27 |
28 | params = {
29 | 'audio': text,
30 | 'le': lan,
31 | }
32 |
33 | response = requests.get('https://dict.youdao.com/dictvoice', params=params, cookies=cookies, headers=headers)
34 | return response.content
35 |
36 |
37 | def youdao(filepath, lan):
38 | with open(filepath, 'r', encoding='utf-8') as file:
39 | lines = file.readlines()
40 | name = os.path.basename(filepath)
41 | if not os.path.exists(f'./media/youdao/{name}'):
42 | os.makedirs(f'./media/youdao/{name}')
43 | flag = 1
44 | while flag <= len(lines):
45 | text = lines[flag - 1].strip()
46 | if text is not None:
47 | print(text)
48 | resp = func(text, lan)
49 | with open(f'./media/youdao/{name}/{flag}.mp3', 'wb') as file:
50 | file.write(resp)
51 | flag += 1
52 | get(f'media/youdao/{name}')
53 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/验证码相关/古诗文网验证码识别.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import ddddocr
3 | from lxml import etree
4 |
5 | if __name__ == '__main__':
6 | headers = {
7 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
8 | }
9 | url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
10 | page_text = requests.get(url=url, headers=headers).text
11 | tree = etree.HTML(page_text)
12 | # 将验证码图片保存到了本地
13 | code_img_src = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0]
14 | code_data = requests.get(url=code_img_src, headers=headers).content
15 | with open('./code.jpg', 'wb') as fp:
16 | fp.write(code_data)
17 | # 解析验证码
18 | ocr = ddddocr.DdddOcr()
19 | with open('code.jpg', 'rb') as f:
20 | img_bytes = f.read()
21 | res = ocr.classification(img_bytes) # 解析到的验证码数据
22 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/验证码相关/模拟登录.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import ddddocr
3 | from lxml import etree
4 |
5 | # 获取验证码信息以及页面的隐藏信息,在这里是viewstate和viewstategenerator
6 | if __name__ == "__main__":
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
9 | }
10 | url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
11 | page_text = requests.get(url=url, headers=headers).text
12 | tree = etree.HTML(page_text)
13 | # 获取验证码图片链接
14 | code_img_src = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0]
15 | # 获取viewstate的值
16 | viewstate = tree.xpath("//input[@id='__VIEWSTATE']/@value")[0]
17 | # 获取viewstategenerator的值
18 | viewstategenerator = tree.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0]
19 |
20 | # 将验证码图片保存到本地
21 | # code_data = requests.get(url=code_img_src, headers=headers).content 不可以这样使用,因为一旦请求,原本网页的验证码就会发生改变了
22 | # 这里我们使用requests中的Session(),让后续请求共用同一个会话,验证码才不会刷新
23 | session = requests.Session()
24 | code_data = session.get(url=code_img_src, headers=headers).content
25 | with open('./code.jpg', 'wb') as fp:
26 | fp.write(code_data)
27 | # 解析验证码
28 | ocr = ddddocr.DdddOcr()
29 | with open('code.jpg', 'rb') as f:
30 | img_bytes = f.read()
31 | res = ocr.classification(img_bytes) # 解析到的验证码数据
32 |
33 | # 模拟登录发送post请求
34 | login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
35 | data = {
36 | '__VIEWSTATE': viewstate,
37 | '__VIEWSTATEGENERATOR': viewstategenerator,
38 | 'from': 'http://so.gushiwen.cn/user/collect.aspx',
39 | 'email': '@qq.com',
40 | 'pwd': '',
41 | 'code': res,
42 | 'denglu': '登录',
43 | }
44 | # 注意此处也应该用session不然验证码也会刷新
45 | login_page_text = session.post(url=login_url, data=data, headers=headers).text
46 |
47 | with open('gushiwen.html', 'w', encoding='utf-8') as fp:
48 | fp.write(login_page_text)
49 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/验证码相关/模拟登录之后的数据爬取.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import ddddocr
3 | from lxml import etree
4 |
5 | # 获取验证码信息以及页面的隐藏信息,在这里是viewstate和viewstategenerator
6 | if __name__ == "__main__":
7 | headers = {
8 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
9 | }
10 | url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
11 | page_text = requests.get(url=url, headers=headers).text
12 | tree = etree.HTML(page_text)
13 |
14 | # 获取验证码图片链接
15 | code_img_src = 'https://so.gushiwen.cn/' + tree.xpath('//*[@id="imgCode"]/@src')[0]
16 |
17 | # 获取viewstate的值
18 | viewstate = tree.xpath("//input[@id='__VIEWSTATE']/@value")[0]
19 |
20 | # 获取viewstategenerator的值
21 | viewstategenerator = tree.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0]
22 |
23 | # 将验证码图片保存到本地
24 | # code_data = requests.get(url=code_img_src, headers=headers).content 不可以这样使用,因为一旦请求,原本网页的验证码就会发生改变了
25 | # 这里我们使用requests中的Session(),让后续请求共用同一个会话,验证码才不会刷新
26 | session = requests.Session()
27 | code_data = session.get(url=code_img_src, headers=headers).content
28 | with open('./code.jpg', 'wb') as fp:
29 | fp.write(code_data)
30 |
31 | # 解析验证码
32 | ocr = ddddocr.DdddOcr()
33 | with open('code.jpg', 'rb') as f:
34 | img_bytes = f.read()
35 | res = ocr.classification(img_bytes) # 解析到的验证码数据
36 |
37 | # 模拟登录发送post请求
38 | login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
39 | data = {
40 | '__VIEWSTATE': viewstate,
41 | '__VIEWSTATEGENERATOR': viewstategenerator,
42 | 'from': 'http://so.gushiwen.cn/user/collect.aspx',
43 | 'email': '',
44 | 'pwd': '',
45 | 'code': res,
46 | 'denglu': '登录',
47 | }
48 | # 注意此处也应该用session不然验证码也会刷新
49 | session.post(url=login_url, data=data, headers=headers)
50 |
51 | detail_url = 'https://so.gushiwen.cn/user/collect.aspx?type=m&id=3760950&sort=t'
52 | detail_page_text = session.get(url=detail_url, headers=headers).text
53 | with open('infor.html', 'w', encoding='utf-8') as fp:
54 | fp.write(detail_page_text)
55 |
--------------------------------------------------------------------------------
/进阶篇/基础综合/验证码相关/验证码测试.py:
--------------------------------------------------------------------------------
1 | import ddddocr
2 |
3 | ocr = ddddocr.DdddOcr()
4 | # 简单的图片数字英文识别
5 | with open('1.png', 'rb') as f:
6 | img_bytes = f.read()
7 | res = ocr.classification(img_bytes)
8 |
9 | print(res)
--------------------------------------------------------------------------------
/进阶篇/爬虫轮子/README.md:
--------------------------------------------------------------------------------
1 | # 个人对requests库的二次封装
2 |
3 | > 对爬虫常见的发送请求、日志记录和响应校验进行了二次封装。
4 | >
5 | > 只需要让新的类继承CrawlBase即可,发送请求时使用do_request函数(用法草图见文末)
6 | >
7 | > 可以设置中间件以及发送前校验和发送后校验
8 | >
9 |
10 | #### 未来设想
11 |
12 | 1. 增加用户池
13 | 2. 结构优化
14 | 3. 把请求响应都封装
15 | 4. 去重自动入库
16 | 5. 等等
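
附:一个用法草图,仅示意继承方式;do_request 的参数名、中间件与校验函数的挂载方式在这里都是假设,以仓库内实际代码为准:

class DemoSpider(CrawlBase):  # 假设 CrawlBase 可直接继承
    def fetch_list(self, url: str):
        # 假设 do_request 接受 url、method 等常见参数并返回响应对象
        resp = self.do_request(url=url, method='GET')
        return resp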
--------------------------------------------------------------------------------
/验证码篇/滑块篇/阿里系226/226.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | import re
4 | from urllib.parse import urlparse, parse_qs
5 |
6 | import playwright.sync_api
7 | from playwright.sync_api import sync_playwright
8 |
9 | # 存放滑块的页面
10 | FILEPATH = ''
11 |
12 | # 拦截验证的路由,自己写一下url, 格式参照playwright官网
13 | INTERRUPT_ROUTE = ''
14 |
15 | # 指定谷歌浏览器路径并以debug模式启动;如果浏览器尚未以debug模式打开,请取消注释下面四行代码运行
16 | # chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
17 | # debugging_port = "--remote-debugging-port=9999"
18 | #
19 | # command = f"{chrome_path} {debugging_port}"
20 | # subprocess.Popen(command, shell=True)
21 |
22 | # 创建的ws链接
23 | WS_URL = 'http://localhost:your_port'
24 |
25 |
26 | pattern = re.compile(r'\((.*)\)', re.S)
27 |
28 | headers = {
29 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
30 | }
31 |
32 |
33 | def get_226() -> dict:
34 | result: dict = {}
35 |
36 | def intercept_xhr(route: playwright.sync_api.Route):
37 | params = parse_qs(urlparse(route.request.url).query)
38 | result['t'] = params['t'][0]
39 | # 这里不指定headers会出现意想不到的错误
40 | resp = route.fetch(headers=headers)
41 | data = json.loads(pattern.findall(resp.text())[0])
42 | # 把解析到的数据写入result,返还给get_226的调用方
43 | result['data'] = data
44 | route.fulfill(response=resp)
45 |
46 | with sync_playwright() as p:
47 | # 创建一个ws链接
48 | browser = p.chromium.connect_over_cdp(WS_URL)
49 | # 使用浏览器的上下文创建页面
50 | content = browser.contexts[0]
51 | page = content.new_page()
52 | # 设置拦截规则
53 | page.route(INTERRUPT_ROUTE, intercept_xhr)
54 | page.goto(FILEPATH)
55 | # 开始滑动,获取对应的东西,在滑动距离增加一些随机值
56 | btn = page.locator('#nc_1_n1z')
57 | btn_position = btn.bounding_box()
58 | new_x = btn_position['x'] + random.randint(290, 310)
59 | new_y = btn_position['y']
60 | page.mouse.click(btn_position['x'], btn_position['y'])
61 | # 滑动了
62 | page.mouse.down()
63 | page.mouse.move(new_x, new_y)
64 | page.mouse.up()
65 | # 等待一段时间以观察拖动效果
66 | page.wait_for_timeout(1000)
67 | # 关闭所有
68 | page.close()
69 | content.close()
70 | browser.close()
71 | # 返回结果
72 | return result
73 |
--------------------------------------------------------------------------------
/验证码篇/滑块篇/阿里系226/README.md:
--------------------------------------------------------------------------------
1 | ## 使用
2 |
3 | 使用前需要修改:滑块页面的存放路径(FILEPATH)、创建的ws地址(WS_URL)、要拦截的url(INTERRUPT_ROUTE),并自行把截获的数据返还出去
4 |
5 | 如果没有以debug模式打开浏览器并指定端口需要运行注释掉的代码
6 |
7 | 仅作学习分享,严禁非法使用
8 |
9 | 注:
10 | > 我们需要把滑块的html保存到本地,并且要做一些调整,可以删除部分除开滑块的逻辑
11 | >
12 | > 比如一些无用的样式,无用的dom元素,保留需要触发的逻辑即可。
13 | >
14 | > 这样可以大幅度提高效率,如果对效率没有要求也可以全网页保存,记得把一些js文件路径和css文件
15 | > 路径补全即可
16 | >
17 | > 对于高并发的情况,这个方法有待测试,因为是和本地浏览器以websocket方式连接,所以会比使用
18 | > 有头模式的反检测浏览器效率快一些,高并发情况可以自行进行测试
19 | >
20 | > 此方法适用于绝大部分人机校验,比还原算法节省99%工作量,大家可以根据自己的需求,自行选择
21 | > 绕过方式
22 |
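补充:INTERRUPT_ROUTE 使用 playwright 中 page.route 的匹配规则,支持 glob 通配符或正则。下面是一个假设的写法,具体接口路径以自己抓包看到的验证请求为准:

# 路径仅为假设示例,实际以抓包结果为准
INTERRUPT_ROUTE = '**/_____tmd_____/slide*'

# 如果按上面注释掉的代码以 9999 端口启动了 debug 浏览器,ws 地址可写成:
WS_URL = 'http://localhost:9999'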
--------------------------------------------------------------------------------
/验证码篇/滑块篇/飞瓜登录验证码定制阿里系滑块/README.md:
--------------------------------------------------------------------------------
1 | ## 使用
2 |
3 | 使用前需要修改:滑块页面的存放路径(FILEPATH)、创建的ws地址(WS_URL)、要拦截的url(INTERRUPT_ROUTE),并自行把截获的数据返还出去
4 |
5 | 如果没有以debug模式打开浏览器并指定端口需要运行注释掉的代码
6 |
7 | 仅作学习分享,严禁非法使用
8 |
9 | 注:
10 | > 我们需要把滑块的html保存到本地,并且要做一些调整,可以删除部分除开滑块的逻辑
11 | >
12 | > 比如一些无用的样式,无用的dom元素,保留需要触发的逻辑即可。
13 | >
14 | > 这样可以大幅度提高效率,如果对效率没有要求也可以全网页保存,记得把一些js文件路径和css文件
15 | > 路径补全即可
16 | >
17 | > 对于高并发的情况,这个方法有待测试,因为是和本地浏览器以websocket方式连接,所以会比使用
18 | > 有头模式的反检测浏览器效率快一些,高并发情况可以自行进行测试
19 | >
20 | > 此方法适用于绝大部分人机校验,比还原算法节省99%工作量,大家可以根据自己的需求,自行选择
21 | > 绕过方式
22 |
--------------------------------------------------------------------------------
/验证码篇/滑块篇/飞瓜登录验证码定制阿里系滑块/send.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from test import main
3 | phone = []
4 | for _ in phone:
5 | headers = {
6 | 'Accept': '*/*',
7 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
8 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
9 | 'Origin': 'https://dy.feigua.cn',
10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
11 | }
12 | token = main()
13 |
14 | data = {
15 | 'tel': _,
16 | 'sessionid': token['sessionid'][0],
17 | 'sig': token['sig'][0],
18 | 'token': token['token'][0],
19 | }
20 |
21 | response = requests.post('https://dy.feigua.cn/login/SendLoginMessageCode', headers=headers, data=data)
22 | print(response.text)
23 |
--------------------------------------------------------------------------------
/验证码篇/滑块篇/飞瓜登录验证码定制阿里系滑块/test.py:
--------------------------------------------------------------------------------
1 | import random
2 | from urllib.parse import parse_qs
3 |
4 | from playwright.sync_api import Playwright, sync_playwright
5 |
6 | # 存放滑块的页面
7 | FILEPATH = ''
8 |
9 | # 拦截验证的路由,自己写一下url, 格式参照playwright官网
10 | INTERRUPT_ROUTE = ''
11 |
12 | # 指定谷歌浏览器路径并以debug模式启动;如果浏览器尚未以debug模式打开,请取消注释下面四行代码运行
13 | # chrome_path = r'"C:\Program Files\Google\Chrome\Application\chrome.exe"'
14 | # debugging_port = "--remote-debugging-port=9999"
15 | #
16 | # command = f"{chrome_path} {debugging_port}"
17 | # subprocess.Popen(command, shell=True)
18 |
19 | # 创建的ws链接
20 | WS_URL = 'http://localhost:your_port'
21 |
22 |
23 | def run(playwright: Playwright) -> dict:
24 | result: dict = {}
25 |
26 | # 拦截发送验证码api,把参数截获
27 | def intercept_xhr(route):
28 | data = parse_qs(route.request.post_data)
29 | route.abort()
30 | # 将截获的参数写入result传出,send.py中会用到其中的sessionid、sig、token等字段
31 | result.update(data)
32 |
33 | browser = playwright.chromium.connect_over_cdp(WS_URL)
34 | content = browser.contexts[0]
35 |
36 | page = content.new_page()
37 | page.route(INTERRUPT_ROUTE, intercept_xhr)
38 | page.goto(FILEPATH)
39 | # 进行点击,进入滑块状态
40 | page.get_by_role("link", name="注册 / 登录").click()
41 | page.get_by_role("link", name="手机登录").click()
42 | page.get_by_text("验证码登录").click()
43 | page.get_by_role("textbox", name="请输入绑定手机号码").click()
44 | page.get_by_role("textbox", name="请输入绑定手机号码").fill("手机号")
45 | page.get_by_role("link", name="获取验证码").click()
46 | # 有可能出现两种id
47 | try:
48 | btn = page.locator('#nc_2_n1z')
49 | btn_position = btn.bounding_box(timeout=10000)
50 | except Exception:
51 | btn = page.locator('#nc_1_n1z')
52 | btn_position = btn.bounding_box()
53 | # 获取滑动位置
54 | new_x = btn_position['x'] + random.randint(390, 400)
55 | new_y = btn_position['y']
56 | page.mouse.click(btn_position['x'], btn_position['y'])
57 | # 滑动
58 | page.mouse.down()
59 | page.mouse.move(new_x, new_y)
60 | page.mouse.up()
61 | # 稍等一下
62 | page.wait_for_timeout(2000)
63 | # 关闭所有
64 | page.close()
65 | content.close()
66 | browser.close()
67 | return result
68 |
69 |
70 | def main():
71 | # 用于导出
72 | with sync_playwright() as playwright:
73 | a = run(playwright)
74 | return a
75 |
76 |
77 | if __name__ == '__main__':
78 | for _ in range(10):
79 | print(main())
80 |
--------------------------------------------------------------------------------