├── .gitignore
├── README.md
├── bilibili
│   └── bilibili_user.py
├── biqu
│   └── biqu.py
├── cninfo
│   ├── crawler.py
│   └── demo.js
├── dzdp_svg
│   └── dzdp_svg.py
├── jijin
│   └── TTJJ.py
├── lagou
│   ├── lg.js
│   └── lg.py
├── lianjia
│   └── lianjia.py
├── music163
│   ├── Music.js
│   └── Music.py
├── qcc
│   └── qcc.py
├── scrapeCenter
│   ├── spa1
│   │   └── crawl.py
│   ├── spa14
│   │   ├── Wasm.wasm
│   │   └── crawl.py
│   ├── spa15
│   │   ├── crawl.py
│   │   └── demo.js
│   ├── spa16
│   │   └── crawl.py
│   ├── spa2
│   │   └── crawl.py
│   ├── spa3
│   │   └── crawl.py
│   ├── spa5
│   │   └── crawl.py
│   ├── spa6
│   │   ├── crawl.py
│   │   └── demo.js
│   ├── spa7
│   │   └── crawl.py
│   ├── ssr1
│   │   └── crawl.py
│   ├── ssr2
│   │   └── crawl.py
│   ├── ssr3
│   │   └── crawl.py
│   └── ssr4
│       └── crawl.py
├── tweet
│   ├── GetToken.py
│   └── Tweet.py
├── weather
│   └── weather.py
├── weibo
│   ├── get_fans_info.py
│   ├── search.py
│   ├── search_all.py
│   ├── weibo_comment.py
│   └── 大V.txt
├── youdao
│   └── yd_tran.py
├── zhihu
│   └── public_func.py
└── ziru
    └── zr.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

.idea/
/weibo/exist.txt
.DS_Store

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# python-spider
Small Python crawler projects.
Contents:
1. [Biquge novel downloads](https://github.com/monkey-hjy/python-spider/tree/master/biqu)
2. [Tweet data scraping](https://github.com/monkey-hjy/python-spider/tree/master/tweet)
3. [China Weather Network data queries](https://github.com/monkey-hjy/python-spider/tree/master/weather)
4. [NetEase Cloud Music reverse-engineering crawler](https://github.com/monkey-hjy/python-spider/tree/master/music163)
5. [Tiantian Fund data scraping for specified funds](https://github.com/monkey-hjy/python-spider/tree/master/jijin)
6. [Weibo data scraping](https://github.com/monkey-hjy/python-spider/tree/master/weibo)
7. [Youdao Translate reverse engineering](https://github.com/monkey-hjy/python-spider/tree/master/youdao)
8. [Lianjia nationwide rental listings](https://github.com/monkey-hjy/python-spider/tree/master/lianjia)
9. [Qichacha login-free crawler](https://github.com/monkey-hjy/python-spider/tree/master/qcc)
10. [Dianping SVG text obfuscation](https://github.com/monkey-hjy/python-spider/tree/master/dzdp_svg)
11. [Bilibili user crawler](https://github.com/monkey-hjy/python-spider/tree/master/bilibili)
12. [Lagou login-free crawler](https://github.com/monkey-hjy/python-spider/blob/master/lagou)
13. [Ziroom rental font obfuscation](https://github.com/monkey-hjy/python-spider/tree/master/ziru)
14. [Zhihu Q&A scraping](https://github.com/monkey-hjy/python-spider/tree/master/zhihu_answer)
15. [CNINFO data service platform](https://github.com/monkey-hjy/python-spider/tree/master/cninfo)


- Articles are posted on CSDN from time to time. Profile: [https://blog.csdn.net/qq_42452095](https://blog.csdn.net/qq_42452095)
- Videos are posted on Bilibili from time to time. Profile: [https://space.bilibili.com/347405521/channel/detail?cid=181641](https://space.bilibili.com/347405521/channel/detail?cid=181641)
- If you run into problems with the code, discuss them in this project's [Issues](https://github.com/monkey-hjy/python-spider/issues)
- For paid custom crawler work, contact QQ 847703187 or WeChat 847703187

--------------------------------------------------------------------------------
/bilibili/bilibili_user.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# @File : bilibili_user.py
# @Author : Monkey
# @DATE : 2021/5/17 10:04
from gevent import monkey; monkey.patch_all()
import gevent.pool
import requests
import pymysql
import datetime


class BiliUser(object):
    """Bilibili user crawler"""

    def __init__(self):
        self.pool = gevent.pool.Pool(size=50)
        # 10 to the 7th power: ten million mids
        self.mid_list = list(range(1, pow(10, 7)))
        # self.mid_list = list(range(1, pow(10, 3)))
        self.conn = pymysql.Connect(host='localhost', user='root', password='root', port=3306, database='demo')
        self.cursor = self.conn.cursor()
        self.proxies = dict()
        self._headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}
        self.data = []
        self.ips = []
        self.set_proxies()

    def set_proxies(self):
        """Set the proxy"""
        ip = "plug in your own way of fetching a proxy IP here"
        self.proxies = {
            'http': 'http://{}'.format(ip),
            'https': 'http://{}'.format(ip),
        }

    def get_fans_count(self, mid):
        """Fetch follower/following counts"""
        url = 'https://api.bilibili.com/x/relation/stat?vmid={}&jsonp=jsonp'.format(mid)
        response = requests.get(url, headers=self._headers, proxies=self.proxies).json()
        follower = response['data']['follower']
        following = response['data']['following']
        return follower, following

    def get_user_info(self, mid):
        """Fetch a user's profile"""
        url = 'https://api.bilibili.com/x/space/acc/info?mid={}&jsonp=jsonp'.format(mid)
        err_count = 0
        while err_count < 5:
            try:
                response = requests.get(url, headers=self._headers, proxies=self.proxies, timeout=10).json()
                if response['code'] == 0:
                    nike_name = response['data']['name']
                    sex = response['data']['sex']
                    level = response['data']['level']
                    sign = response['data']['sign']
                    birthday = response['data']['birthday']
                    follower, following = self.get_fans_count(mid)
                    self.data.append([mid, nike_name, sex, level, sign, birthday, follower, following])
                    print('mid:{}\tdata:{}'.format(mid, len(self.data)))
                    if len(self.data) >= 100:
                        data, self.data = self.data, []
                        self.save_data(data)
                    break
                elif response['code'] == -412:
                    # -412: blocked by the anti-crawler; raise so the except branch rotates the proxy
                    raise Exception
                else:
                    print(datetime.datetime.now(), response, mid)
                    break
            except Exception as e:
                err_count += 1
                self.set_proxies()
                # print(err_count, self.proxies, e)

    def save_data(self, data):
        """Persist a batch of rows"""
        sql = "INSERT INTO bili (mid, nike_name, sex, level, sign, birthday, follower, following) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
        self.cursor.executemany(sql, data)
        self.conn.commit()
        print('{}\tsaved successfully --- {}'.format(datetime.datetime.now(), len(data)))

    def __del__(self):
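set_proxies above is intentionally a stub ("plug in your own way of fetching a proxy IP here"). One possible shape for that hook, assuming a hypothetical proxy-pool service that returns a plain ip:port string over HTTP (illustration only, not part of the repo):

import requests

PROXY_POOL_URL = 'http://127.0.0.1:5555/random'  # hypothetical proxy-pool endpoint

def fetch_proxy_ip() -> str:
    """Return one 'ip:port' string from the pool service."""
    return requests.get(PROXY_POOL_URL, timeout=5).text.strip()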
        self.conn.close()

    def run(self):
        """Entry point"""
        self.pool.map(self.get_user_info, self.mid_list)
        if self.data:
            self.save_data(self.data)


if __name__ == '__main__':
    t = BiliUser()
    t.run()

--------------------------------------------------------------------------------
/biqu/biqu.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Author: monkey-hjy
# @Date: 2021-02-24 17:12:52
# @Last Modified by: monkey-hjy
# @Last Modified time: 2021-02-24 17:16:23
import requests
from lxml import etree
import random
from datetime import datetime, time

# Random User-Agent pool
USER_AGENT = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]


class SpiderBook(object):

    def __init__(self):
        self.search_url = 'https://www.biqooge.com/modules/article/search.php'
        self._headers = {'user-agent': random.choice(USER_AGENT)}

    def search_book(self):
        book_name = self.book_name
        data = {
            'searchtype': 'articlename',
            'searchkey': book_name.encode('gbk'),
        }
        response = requests.post(self.search_url, headers=self._headers, data=data)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        name = html.xpath('//tr[@id="nr"]/td[1]/a/text()')
        book_url = html.xpath('//tr[@id="nr"]/td[1]/a/@href')
        author = html.xpath('//tr[@id="nr"]/td[3]/text()')
        for i in range(len(name)):
            print('No.{}: author - {}\ttitle - {}'.format(i, author[i], name[i]))
        need_id = int(input('Enter the number of the book you want: '))
        self.download_book(book_url[need_id])

    def download_book(self, book_url):
        response = requests.get(book_url, headers=self._headers)
        response.encoding = response.apparent_encoding
        html = etree.HTML(response.text)
        # "章节目录" is the "chapter list" heading on the book page
        zj_info = html.xpath('//dt[contains(text(), "章节目录")]/following-sibling::dd')
        for i in range(len(zj_info)):
            info = zj_info[i]
            zj_name = info.xpath('./a/text()')[0]
            zj_url = 'https://www.biqooge.com' + info.xpath('./a/@href')[0]
            zj_response = requests.get(zj_url, headers=self._headers)
            zj_response.encoding = zj_response.apparent_encoding
            zj_html = etree.HTML(zj_response.text)
            content = ''.join(zj_html.xpath('//div[@id="content"]/text()'))
            print('{}/{}\tname:{}\turl:{}'.format(i+1, len(zj_info), zj_name, zj_url))
            # file is named after the book being downloaded (was self.book, which is never set)
            with open('{}.txt'.format(self.book_name), 'a', encoding='utf8') as f:
                f.write(zj_name + '\n')
                f.write(content + '\n\n')

    def run(self):
        self.book_name = '完美世界'
        self.search_book()


if __name__ == '__main__':
    s = SpiderBook()
    s.run()
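A note on the searchkey field in search_book: biqooge.com is a GBK site, so the keyword must be percent-encoded as GBK bytes rather than UTF-8, which is exactly what passing the pre-encoded bytes to requests achieves. A standalone check of the difference:

from urllib.parse import quote

kw = '完美世界'
print(quote(kw))                  # UTF-8: %E5%AE%8C%E7%BE%8E%E4%B8%96%E7%95%8C
print(quote(kw, encoding='gbk'))  # GBK:   %CD%EA%C3%C0%CA%C0%BD%E7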
--------------------------------------------------------------------------------
/cninfo/crawler.py:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
# NAME: crawler.py
# Date: 2022/05/30 23:05
# Auth: HJY
import requests
import execjs

ctx = execjs.compile(open('./demo.js', encoding='utf-8').read())
url = 'https://webapi.cninfo.com.cn/api/sysapi/p_sysapi1007'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    'mcode': ctx.call('getResCode')
}
data = {
    'tdate': '2022-05-27',
    'market': 'SZE'
}
response = requests.post(url, json=data, headers=headers)
print(response.json())

--------------------------------------------------------------------------------
/cninfo/demo.js:
--------------------------------------------------------------------------------
function getResCode(){
    var time=Math.floor(new Date().getTime()/1000);
    return missjson(""+time);
}

function missjson(input) {
    var keyStr = "ABCDEFGHIJKLMNOP" + "QRSTUVWXYZabcdef" + "ghijklmnopqrstuv" + "wxyz0123456789+/" + "=";
    var output = "";
    var chr1, chr2, chr3 = "";
    var enc1, enc2, enc3, enc4 = "";
    var i = 0;
    do {
        chr1 = input.charCodeAt(i++);
        chr2 = input.charCodeAt(i++);
        chr3 = input.charCodeAt(i++);
        enc1 = chr1 >> 2;
        enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
        enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);
        enc4 = chr3 & 63;
        if (isNaN(chr2)) {
            enc3 = enc4 = 64;
        } else if (isNaN(chr3)) {
            enc4 = 64;
        }
        output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) + keyStr.charAt(enc3) + keyStr.charAt(enc4);
        chr1 = chr2 = chr3 = "";
        enc1 = enc2 = enc3 = enc4 = "";
    } while (i < input.length);
    return output;
}
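Since missjson above is a stock Base64 routine, the mcode header is simply the current Unix timestamp (in seconds) Base64-encoded. A Python equivalent that drops the execjs dependency (a sketch, functionally matching demo.js as shown):

import base64
import time

def get_res_code() -> str:
    """Mirror demo.js getResCode(): Base64 of the Unix-timestamp string."""
    return base64.b64encode(str(int(time.time())).encode()).decode()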
--------------------------------------------------------------------------------
/dzdp_svg/dzdp_svg.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# @File : dzdp_svg.py
# @Author : Monkey
# @DATE : 2021/5/13 4:54 PM
import re
import requests


class DZDP(object):
    """Dianping SVG text decoding"""

    def __init__(self):
        self.css_url = 'https://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/80da73cea991b1dac8e6c3eb8cfe7461.css'
        self.svg_url = 'https://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/20609a5f67dfd9a34fd762ac63e59960.svg'
        self.css_text = requests.get(self.css_url).text
        # each SVG row looks like <text x="..." y="NNN">glyphs</text>; map y -> glyph string
        self.svg_info = {int(info.split('">')[0]): info.split('">')[1] for info in re.findall(r'y="(.*?)</text>', requests.get(self.svg_url).text)}

    def get_txt(self, code):
        """Map a CSS class code to its plain-text character"""
        try:
            patt = '%s{background:(.*?);' % code
            index = re.findall(patt, self.css_text)[0].replace('px', '').replace('-', '').split(' ')
            index_x, index_y = int(index[0][:-2]), int(index[1][:-2])
            for key in self.svg_info:
                if key >= index_y:
                    return self.svg_info[key][index_x // 14]
        except:
            return code


if __name__ == '__main__':
    t = DZDP()
    print(t.get_txt(code='swnbb'))
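To make the lookup above concrete, here is the arithmetic on a made-up CSS rule (real class names and offsets vary per page); the // 14 step reflects the code's assumption that each glyph in the SVG is 14px wide:

# Hypothetical rule: .swnbb{background:-294.0px -2031.0px;}
raw = '-294.0px -2031.0px'
index = raw.replace('px', '').replace('-', '').split(' ')  # ['294.0', '2031.0']
index_x, index_y = int(index[0][:-2]), int(index[1][:-2])  # [:-2] drops '.0' -> 294, 2031
# glyph = character 294 // 14 = 21 (0-based) of the first <text> row whose y >= 2031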
--------------------------------------------------------------------------------
/jijin/TTJJ.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# @Author: monkey-hjy
# @Date: 2021-03-04 11:18:58
# @Last Modified by: monkey-hjy
# @Last Modified time: 2021-03-04 11:19:17
# Scrape fund data from Tiantian Fund (fund.eastmoney.com)

import requests
import time
import re
import json
import pandas as pd
import random

file_path = '基金查询.xlsx'  # output workbook ("fund query")
fund_codes = ['001606', '000924', '005962', '004997', '006751']
start_date = '2019-01-01'
end_date = '2021-10-30'
url = 'http://api.fund.eastmoney.com/f10/lsjz'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    'Referer': 'http://fundf10.eastmoney.com/',
}
result = dict()
result_fsrq = []
for fund_code in fund_codes:
    params = {
        "callback": f"jQuery183{''.join([str(random.randrange(0, 10)) for _ in range(17)])}_{int(time.time() * 1000)}",
        "fundCode": fund_code,
        "pageIndex": "1",
        "pageSize": "100000",
        "startDate": start_date,
        "endDate": end_date,
        "_": str(int(time.time() * 1000)),
    }
    # strip the JSONP wrapper jQuery183...(...) and parse the JSON payload inside
    response = json.loads(re.findall(r'\((.*)\)', requests.get(url, headers=headers, params=params).text, re.S)[0])
    # trade dates
    FSRQ = []
    # unit net asset values
    DWJZ = []
    fund_info = response['Data']['LSJZList']
    for i in range(len(fund_info)):
        # FSRQ.append(datetime.datetime.strptime(fund_info[i]['FSRQ'], '%Y-%m-%d'))
        FSRQ.append(fund_info[i]['FSRQ'])
        DWJZ.append(fund_info[i]['DWJZ'])
    result_fsrq = FSRQ if len(FSRQ) > len(result_fsrq) else result_fsrq
    result[fund_code] = DWJZ
max_len = 0
for key in result:
    max_len = len(result[key]) if len(result[key]) > max_len else max_len
for key in result:
    result[key] += [None] * (max_len - len(result[key]))
result = pd.DataFrame(result)
result.index = result_fsrq
result.to_excel(file_path, encoding='ANSI')  # note: recent pandas versions no longer accept encoding= here
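A note on the alignment step above: padding the shorter lists with None only lines up correctly when every fund starts on the same date and shares a trading calendar. A sketch of a date-keyed alternative (it assumes the loop is reworked to keep each fund's dates with its values, e.g. series[fund_code] = pd.Series(DWJZ, index=FSRQ)):

import pandas as pd

def align(series: dict) -> pd.DataFrame:
    """Outer-join per-fund Series on their date index, so gaps stay aligned."""
    return pd.concat(list(series.values()), axis=1).sort_index()

--------------------------------------------------------------------------------
/lagou/lg.py:
--------------------------------------------------------------------------------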
from gevent import monkey; monkey.patch_all()
import gevent.pool
import json
import random
import re

from lxml import etree
import execjs
import requests
from sns_spider.config.settings import USER_AGENTS  # external to this repo; any list of UA strings works
import pymongo


class LG(object):
    """Lagou JS reverse engineering"""

    def __init__(self):
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.mongo_col = self.client['demo']['lagou']
        self.js_file = open('lg.js', encoding='utf8').read()
        self._headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
            'referer': 'https://www.lagou.com/jobs/list_java/p-city_3?px=default',
        }
        self.token = ''
        self.proxies = dict()
        self.set_proxies()
        self.get_token()
        self.city_info = dict()

    def set_proxies(self):
        """Set the proxy"""
        ip = "plug in your proxy IP here"
        self.proxies = {
            'http': 'http://{}'.format(ip),
            'https': 'http://{}'.format(ip),
        }

    def get_response(self, url, params=None, data=None, method='GET'):
        while True:
            try:
                if method == 'GET':
                    response = requests.get(url, params=params, headers=self._headers, proxies=self.proxies)
                else:
                    response = requests.post(url, params=params, data=data, headers=self._headers, proxies=self.proxies)
                response.encoding = response.apparent_encoding
                return response
            except:
                self.set_proxies()
                self.get_token()

    def get_token(self):
        """Obtain a guest cookie"""
        url = 'https://www.lagou.com/gongsi/allCity.html'
        while True:
            headers = {'user-agent': random.choice(USER_AGENTS)}
            try:
                response = requests.get(url, headers=headers, allow_redirects=False, proxies=self.proxies, timeout=10)
                response.encoding = response.apparent_encoding
                user_trace_token = re.findall(r'user_trace_token=(.*?);', response.headers['Set-Cookie'])[0]
                x_http_token = re.findall(r'X_HTTP_TOKEN=(.*?);', response.headers['Set-Cookie'])[0]
                href = response.headers['Location']
                # cwd points at the author's local node install; adjust for your machine
                ctx = execjs.compile(self.js_file, cwd='/opt/homebrew/Cellar/node/16.3.0/bin/')
                self.token = ctx.call('window.gt.prototype.a',
                                      json.dumps({"href": href, "search": href.split('check.html')[1]}))
                self._headers['cookie'] = 'user_trace_token={};X_HTTP_TOKEN={};__lg_stoken__={}'.format(
                    user_trace_token, x_http_token, self.token)
                return
            except Exception as e:
                print('failed to get token\tproxies:{}\te:{}'.format(self.proxies, e))
                self.set_proxies()

    def get_city_info(self):
        """Fetch the city list"""
        url = 'https://www.lagou.com/jobs/allCity.html'
        html = etree.HTML(self.get_response(url).text)
        city_url = html.xpath('//ul[@class="city_list"]/li/a/@href')
        city_name = html.xpath('//ul[@class="city_list"]/li/a/text()')
        self.city_info = {city_name[i]: city_url[i] for i in range(len(city_url))}

    def get_job_info(self, input_item):
        """Fetch job listings"""
        url = 'https://www.lagou.com/jobs/positionAjax.json'
        params = {
            "px": "default",
            "city": input_item['city_name'],
            "district": input_item['district'],
            "needAddtionalResult": "false",
        }
        sid = ''
        page = 1
        while True:
            data = {
                "first": "true",
                "pn": page,
                "kd": input_item['keyword'],
                "sid": sid,
            }
            job_info = self.get_response(url, params=params, data=data, method='POST').json()
            if 'success' in job_info:
                sid = job_info['content']['showId']
                job_info = job_info['content']['positionResult']['result']
                if not job_info or page == 30:
                    break
                self.parse_info(job_info, input_item)
                print('{}\tpage:{}\tcount:{}'.format(input_item, page, len(job_info)))
                page += 1

    def parse_info(self, job_info, input_item):
        """Parse and store the results"""
        items = list()
        for info in job_info:
            item = {
                '_id': info['positionId'],
                'job_name': info['positionName'],
                'job_url': 'https://www.lagou.com/jobs/{}.html'.format(info['positionId']),
                'company_name': info['companyFullName'],
                'company_size': info['companySize'],
                'industry_field': info['industryField'],
                'finance_stage': info['financeStage'],
                'company_label': info['companyLabelList'],
                'skill_label': info['skillLables'],
                'position_label': info['positionLables'],
                'create_time': info['createTime'],
                'city': info['city'],
                'district': info['district'],
                'salary': info['salary'],
                'work_year': info['workYear'],
                'job_nature': info['jobNature'],
                'education': info['education'],
                'position_advantage': info['positionAdvantage'],
                'position_detail': info['positionDetail'],
                'position_address': info['positionAddress']
            }
            items.append(item)
        try:
            self.mongo_col.insert_many(items)
            # print('{}\tinserted {} rows in this batch'.format(input_item, len(items)))
        except:
            # batch failed (usually a duplicate _id): retry row by row, skipping duplicates
            for item in items:
                try:
                    self.mongo_col.insert_one(item)
                except:
                    pass

    def run(self):
        """Entry point"""
        self.get_city_info()
        # print(self.city_info)
        for city_name, city_url in self.city_info.items():
            # for city_name in ['郑州', '北京', '上海', '广州', '深圳']:
            city_url = self.city_info[city_name]
            if '-zhaopin' not in city_url:
                city_url = city_url.rstrip('/') + '-zhaopin/'
            response = self.get_response(url=city_url, method='GET')
            html = etree.HTML(response.text)
            district_name = html.xpath('//div[@data-type="district"]/a[position()>1]/text()')
            item = [{'city_name': city_name, 'district': name, 'keyword': 'python'} for name in district_name]
            print(item)
            pool = gevent.pool.Pool(size=1)
            pool.map(self.get_job_info, item)


if __name__ == '__main__':
    t = LG()
    t.run()
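parse_info above falls back to one-by-one insert_one calls when a batch contains an already-seen _id. pymongo can express the same "skip duplicates" intent in a single call; a sketch of that alternative (not the author's code):

from pymongo.errors import BulkWriteError

def save_items(col, items):
    """Insert a batch, silently skipping documents whose _id already exists."""
    try:
        # ordered=False continues past duplicate-key errors instead of aborting the batch
        col.insert_many(items, ordered=False)
    except BulkWriteError:
        pass  # remaining errors are the duplicate _ids we chose to ignore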
--------------------------------------------------------------------------------
/lianjia/lianjia.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Author: 玛卡巴卡
# Date: 2021/5/6 14:39

import requests
from lxml import etree
import pymysql


class Lianjia(object):
    """Scrape Lianjia rental listings"""

    def __init__(self):
        self.conn = pymysql.Connect(host='localhost', port=3306, user='root', password='root', database='demo')
        self.cursor = self.conn.cursor()
        self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}

    def __del__(self):
        self.conn.close()

    def get_response(self, url):
        """Issue a request"""
        response = requests.get(url, headers=self._headers)
        if response.status_code == 200:
            response.encoding = 'utf8'
            return response
        else:
            print('url:{}\tresponse:{}'.format(url, response))

    def get_city_url(self):
        """Collect the city links"""
        url = 'https://www.lianjia.com/city/'
        html = etree.HTML(self.get_response(url).text)
        city_url = html.xpath('//ul[@class="city_list_ul"]//a/@href')
        for url in city_url:
            self.get_district_url(city_url=url)

    def get_district_url(self, city_url):
        """Collect the district links"""
        html = etree.HTML(self.get_response(city_url + 'zufang/').text)
        district_url = html.xpath('//li[@class="filter__item--level2 "]/a/@href')
        for url in district_url:
            self.get_house_count(url=city_url[:-1] + url)

    def get_house_count(self, url):
        """Get the number of listings in the current district"""
        html = etree.HTML(self.get_response(url).text)
        count = int(html.xpath('//span[@class="content__title--hl"]/text()')[0])
        if count:
            if count >= 3000:
                # only the first 3000 listings are reachable, so split by sub-district filters
                filter_next_url = html.xpath('//li[@class="filter__item--level3 "]/a/@href')
                for filter_url in filter_next_url:
                    the_filter_url = '/'.join(url.split('/')[:3]) + filter_url
                    html = etree.HTML(self.get_response(the_filter_url).text)
                    count = min(int(html.xpath('//span[@class="content__title--hl"]/text()')[0]), 3000)
                    self.start(the_filter_url, count // 30 + 1)
            else:
                self.start(url, count // 30 + 1)
        else:
            print('{} has no listings'.format(url))

    def start(self, url, end_page):
        """Start scraping"""
        for page in range(1, end_page+1):
            self.get_page_info(url='{}pg{}/'.format(url, page))

    def get_page_info(self, url):
        """Scrape the listings on the current page"""
        print(url, end='\t')
        err_count = 0
        response = self.get_response(url)
        html = etree.HTML(response.text)
        house_element = html.xpath('//div[@class="content__list--item"]')
        for element in house_element:
            try:
                house_url = '/'.join(url.split('/')[:3]) + element.xpath('./a/@href')[0]
                house_code = element.xpath('./@data-house_code')[0]
                title = element.xpath('./a/@title')[0]
                des = ''.join(element.xpath('./div/p[2]//text()')).replace('\n', '').replace(' ', ' ')
                price = int(element.xpath('./div/span/em/text()')[0])
                # note: raw string interpolation is fragile (a quote in the title breaks the SQL);
                # cursor.execute(sql, params) would be safer
                sql = "INSERT INTO lianjia (id, url, title, des, price) values ('%s', '%s', '%s', '%s', %d);" % (house_code, house_url, title, des, price)
                self.cursor.execute(sql)
                self.conn.commit()
            except Exception as e:
                err_count += 1
        print('error ratio: {}/{}'.format(err_count, len(house_element)))

    def run(self):
        """Entry point"""
        self.get_city_url()


if __name__ == '__main__':
    t = Lianjia()
    t.run()

--------------------------------------------------------------------------------
/music163/Music.js:
--------------------------------------------------------------------------------
1 | /*
2 |  * @Author: monkey-hjy
3 |  * @Date: 2021-02-24 17:42:52
4 |  * @Last Modified by: monkey-hjy
5 |  * @Last Modified time: 2021-02-24 17:48:21
6 |  */
7 | var CryptoJS = CryptoJS || function(u, p) {
8 |     var d = {}
9 |       , l = d.lib = {}
10 |       , s = function() {}
11 |       , t = l.Base = {
12 |         extend: function(a) {
13 |             s.prototype = this;
14 |             var c = new s;
15 |             a && c.mixIn(a);
16 |             c.hasOwnProperty("init") || (c.init = function() {
17 |                 c.$super.init.apply(this, arguments)
18 |             }
19 |             );
20 |             c.init.prototype = c;
21 |             c.$super = this;
22 |             return c
23 |         },
24 |         create: function() {
25 |             var a = this.extend();
26 |             a.init.apply(a, arguments);
27 |             return a
28 |         },
29 |         init: function() {},
30 |         mixIn: function(a) {
31 |             for (var c in a)
32 |                 a.hasOwnProperty(c) && (this[c] = a[c]);
33 |             a.hasOwnProperty("toString") && (this.toString = a.toString)
34 |         },
35 |         clone: function() {
36 |             return this.init.prototype.extend(this)
37 |         }
38 |     }
39 |       , r = l.WordArray = t.extend({
40 |         init: function(a, c) {
41 |             a = this.words = a || [];
42 |             this.sigBytes = c != p ?
c : 4 * a.length 43 | }, 44 | toString: function(a) { 45 | return (a || v).stringify(this) 46 | }, 47 | concat: function(a) { 48 | var c = this.words 49 | , e = a.words 50 | , j = this.sigBytes; 51 | a = a.sigBytes; 52 | this.clamp(); 53 | if (j % 4) 54 | for (var k = 0; k < a; k++) 55 | c[j + k >>> 2] |= (e[k >>> 2] >>> 24 - 8 * (k % 4) & 255) << 24 - 8 * ((j + k) % 4); 56 | else if (65535 < e.length) 57 | for (k = 0; k < a; k += 4) 58 | c[j + k >>> 2] = e[k >>> 2]; 59 | else 60 | c.push.apply(c, e); 61 | this.sigBytes += a; 62 | return this 63 | }, 64 | clamp: function() { 65 | var a = this.words 66 | , c = this.sigBytes; 67 | a[c >>> 2] &= 4294967295 << 32 - 8 * (c % 4); 68 | a.length = u.ceil(c / 4) 69 | }, 70 | clone: function() { 71 | var a = t.clone.call(this); 72 | a.words = this.words.slice(0); 73 | return a 74 | }, 75 | random: function(a) { 76 | for (var c = [], e = 0; e < a; e += 4) 77 | c.push(4294967296 * u.random() | 0); 78 | return new r.init(c,a) 79 | } 80 | }) 81 | , w = d.enc = {} 82 | , v = w.Hex = { 83 | stringify: function(a) { 84 | var c = a.words; 85 | a = a.sigBytes; 86 | for (var e = [], j = 0; j < a; j++) { 87 | var k = c[j >>> 2] >>> 24 - 8 * (j % 4) & 255; 88 | e.push((k >>> 4).toString(16)); 89 | e.push((k & 15).toString(16)) 90 | } 91 | return e.join("") 92 | }, 93 | parse: function(a) { 94 | for (var c = a.length, e = [], j = 0; j < c; j += 2) 95 | e[j >>> 3] |= parseInt(a.substr(j, 2), 16) << 24 - 4 * (j % 8); 96 | return new r.init(e,c / 2) 97 | } 98 | } 99 | , b = w.Latin1 = { 100 | stringify: function(a) { 101 | var c = a.words; 102 | a = a.sigBytes; 103 | for (var e = [], j = 0; j < a; j++) 104 | e.push(String.fromCharCode(c[j >>> 2] >>> 24 - 8 * (j % 4) & 255)); 105 | return e.join("") 106 | }, 107 | parse: function(a) { 108 | for (var c = a.length, e = [], j = 0; j < c; j++) 109 | e[j >>> 2] |= (a.charCodeAt(j) & 255) << 24 - 8 * (j % 4); 110 | return new r.init(e,c) 111 | } 112 | } 113 | , x = w.Utf8 = { 114 | stringify: function(a) { 115 | try { 116 | return decodeURIComponent(escape(b.stringify(a))) 117 | } catch (c) { 118 | throw Error("Malformed UTF-8 data") 119 | } 120 | }, 121 | parse: function(a) { 122 | return b.parse(unescape(encodeURIComponent(a))) 123 | } 124 | } 125 | , q = l.BufferedBlockAlgorithm = t.extend({ 126 | reset: function() { 127 | this.i9b = new r.init; 128 | this.ty5D = 0 129 | }, 130 | vb6V: function(a) { 131 | "string" == typeof a && (a = x.parse(a)); 132 | this.i9b.concat(a); 133 | this.ty5D += a.sigBytes 134 | }, 135 | kY3x: function(a) { 136 | var c = this.i9b 137 | , e = c.words 138 | , j = c.sigBytes 139 | , k = this.blockSize 140 | , b = j / (4 * k) 141 | , b = a ? 
u.ceil(b) : u.max((b | 0) - this.JP1x, 0); 142 | a = b * k; 143 | j = u.min(4 * a, j); 144 | if (a) { 145 | for (var q = 0; q < a; q += k) 146 | this.qL5Q(e, q); 147 | q = e.splice(0, a); 148 | c.sigBytes -= j 149 | } 150 | return new r.init(q,j) 151 | }, 152 | clone: function() { 153 | var a = t.clone.call(this); 154 | a.i9b = this.i9b.clone(); 155 | return a 156 | }, 157 | JP1x: 0 158 | }); 159 | l.Hasher = q.extend({ 160 | cfg: t.extend(), 161 | init: function(a) { 162 | this.cfg = this.cfg.extend(a); 163 | this.reset() 164 | }, 165 | reset: function() { 166 | q.reset.call(this); 167 | this.lt3x() 168 | }, 169 | update: function(a) { 170 | this.vb6V(a); 171 | this.kY3x(); 172 | return this 173 | }, 174 | finalize: function(a) { 175 | a && this.vb6V(a); 176 | return this.mA4E() 177 | }, 178 | blockSize: 16, 179 | lS3x: function(a) { 180 | return function(b, e) { 181 | return (new a.init(e)).finalize(b) 182 | } 183 | }, 184 | vl6f: function(a) { 185 | return function(b, e) { 186 | return (new n.HMAC.init(a,e)).finalize(b) 187 | } 188 | } 189 | }); 190 | var n = d.algo = {}; 191 | return d 192 | }(Math); 193 | (function() { 194 | var u = CryptoJS 195 | , p = u.lib.WordArray; 196 | u.enc.Base64 = { 197 | stringify: function(d) { 198 | var l = d.words 199 | , p = d.sigBytes 200 | , t = this.bA0x; 201 | d.clamp(); 202 | d = []; 203 | for (var r = 0; r < p; r += 3) 204 | for (var w = (l[r >>> 2] >>> 24 - 8 * (r % 4) & 255) << 16 | (l[r + 1 >>> 2] >>> 24 - 8 * ((r + 1) % 4) & 255) << 8 | l[r + 2 >>> 2] >>> 24 - 8 * ((r + 2) % 4) & 255, v = 0; 4 > v && r + .75 * v < p; v++) 205 | d.push(t.charAt(w >>> 6 * (3 - v) & 63)); 206 | if (l = t.charAt(64)) 207 | for (; d.length % 4; ) 208 | d.push(l); 209 | return d.join("") 210 | }, 211 | parse: function(d) { 212 | var l = d.length 213 | , s = this.bA0x 214 | , t = s.charAt(64); 215 | t && (t = d.indexOf(t), 216 | -1 != t && (l = t)); 217 | for (var t = [], r = 0, w = 0; w < l; w++) 218 | if (w % 4) { 219 | var v = s.indexOf(d.charAt(w - 1)) << 2 * (w % 4) 220 | , b = s.indexOf(d.charAt(w)) >>> 6 - 2 * (w % 4); 221 | t[r >>> 2] |= (v | b) << 24 - 8 * (r % 4); 222 | r++ 223 | } 224 | return p.create(t, r) 225 | }, 226 | bA0x: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" 227 | } 228 | } 229 | )(); 230 | (function(u) { 231 | function p(b, n, a, c, e, j, k) { 232 | b = b + (n & a | ~n & c) + e + k; 233 | return (b << j | b >>> 32 - j) + n 234 | } 235 | function d(b, n, a, c, e, j, k) { 236 | b = b + (n & c | a & ~c) + e + k; 237 | return (b << j | b >>> 32 - j) + n 238 | } 239 | function l(b, n, a, c, e, j, k) { 240 | b = b + (n ^ a ^ c) + e + k; 241 | return (b << j | b >>> 32 - j) + n 242 | } 243 | function s(b, n, a, c, e, j, k) { 244 | b = b + (a ^ (n | ~c)) + e + k; 245 | return (b << j | b >>> 32 - j) + n 246 | } 247 | for (var t = CryptoJS, r = t.lib, w = r.WordArray, v = r.Hasher, r = t.algo, b = [], x = 0; 64 > x; x++) 248 | b[x] = 4294967296 * u.abs(u.sin(x + 1)) | 0; 249 | r = r.MD5 = v.extend({ 250 | lt3x: function() { 251 | this.cN1x = new w.init([1732584193, 4023233417, 2562383102, 271733878]) 252 | }, 253 | qL5Q: function(q, n) { 254 | for (var a = 0; 16 > a; a++) { 255 | var c = n + a 256 | , e = q[c]; 257 | q[c] = (e << 8 | e >>> 24) & 16711935 | (e << 24 | e >>> 8) & 4278255360 258 | } 259 | var a = this.cN1x.words 260 | , c = q[n + 0] 261 | , e = q[n + 1] 262 | , j = q[n + 2] 263 | , k = q[n + 3] 264 | , z = q[n + 4] 265 | , r = q[n + 5] 266 | , t = q[n + 6] 267 | , w = q[n + 7] 268 | , v = q[n + 8] 269 | , A = q[n 
+ 9] 270 | , B = q[n + 10] 271 | , C = q[n + 11] 272 | , u = q[n + 12] 273 | , D = q[n + 13] 274 | , E = q[n + 14] 275 | , x = q[n + 15] 276 | , f = a[0] 277 | , m = a[1] 278 | , g = a[2] 279 | , h = a[3] 280 | , f = p(f, m, g, h, c, 7, b[0]) 281 | , h = p(h, f, m, g, e, 12, b[1]) 282 | , g = p(g, h, f, m, j, 17, b[2]) 283 | , m = p(m, g, h, f, k, 22, b[3]) 284 | , f = p(f, m, g, h, z, 7, b[4]) 285 | , h = p(h, f, m, g, r, 12, b[5]) 286 | , g = p(g, h, f, m, t, 17, b[6]) 287 | , m = p(m, g, h, f, w, 22, b[7]) 288 | , f = p(f, m, g, h, v, 7, b[8]) 289 | , h = p(h, f, m, g, A, 12, b[9]) 290 | , g = p(g, h, f, m, B, 17, b[10]) 291 | , m = p(m, g, h, f, C, 22, b[11]) 292 | , f = p(f, m, g, h, u, 7, b[12]) 293 | , h = p(h, f, m, g, D, 12, b[13]) 294 | , g = p(g, h, f, m, E, 17, b[14]) 295 | , m = p(m, g, h, f, x, 22, b[15]) 296 | , f = d(f, m, g, h, e, 5, b[16]) 297 | , h = d(h, f, m, g, t, 9, b[17]) 298 | , g = d(g, h, f, m, C, 14, b[18]) 299 | , m = d(m, g, h, f, c, 20, b[19]) 300 | , f = d(f, m, g, h, r, 5, b[20]) 301 | , h = d(h, f, m, g, B, 9, b[21]) 302 | , g = d(g, h, f, m, x, 14, b[22]) 303 | , m = d(m, g, h, f, z, 20, b[23]) 304 | , f = d(f, m, g, h, A, 5, b[24]) 305 | , h = d(h, f, m, g, E, 9, b[25]) 306 | , g = d(g, h, f, m, k, 14, b[26]) 307 | , m = d(m, g, h, f, v, 20, b[27]) 308 | , f = d(f, m, g, h, D, 5, b[28]) 309 | , h = d(h, f, m, g, j, 9, b[29]) 310 | , g = d(g, h, f, m, w, 14, b[30]) 311 | , m = d(m, g, h, f, u, 20, b[31]) 312 | , f = l(f, m, g, h, r, 4, b[32]) 313 | , h = l(h, f, m, g, v, 11, b[33]) 314 | , g = l(g, h, f, m, C, 16, b[34]) 315 | , m = l(m, g, h, f, E, 23, b[35]) 316 | , f = l(f, m, g, h, e, 4, b[36]) 317 | , h = l(h, f, m, g, z, 11, b[37]) 318 | , g = l(g, h, f, m, w, 16, b[38]) 319 | , m = l(m, g, h, f, B, 23, b[39]) 320 | , f = l(f, m, g, h, D, 4, b[40]) 321 | , h = l(h, f, m, g, c, 11, b[41]) 322 | , g = l(g, h, f, m, k, 16, b[42]) 323 | , m = l(m, g, h, f, t, 23, b[43]) 324 | , f = l(f, m, g, h, A, 4, b[44]) 325 | , h = l(h, f, m, g, u, 11, b[45]) 326 | , g = l(g, h, f, m, x, 16, b[46]) 327 | , m = l(m, g, h, f, j, 23, b[47]) 328 | , f = s(f, m, g, h, c, 6, b[48]) 329 | , h = s(h, f, m, g, w, 10, b[49]) 330 | , g = s(g, h, f, m, E, 15, b[50]) 331 | , m = s(m, g, h, f, r, 21, b[51]) 332 | , f = s(f, m, g, h, u, 6, b[52]) 333 | , h = s(h, f, m, g, k, 10, b[53]) 334 | , g = s(g, h, f, m, B, 15, b[54]) 335 | , m = s(m, g, h, f, e, 21, b[55]) 336 | , f = s(f, m, g, h, v, 6, b[56]) 337 | , h = s(h, f, m, g, x, 10, b[57]) 338 | , g = s(g, h, f, m, t, 15, b[58]) 339 | , m = s(m, g, h, f, D, 21, b[59]) 340 | , f = s(f, m, g, h, z, 6, b[60]) 341 | , h = s(h, f, m, g, C, 10, b[61]) 342 | , g = s(g, h, f, m, j, 15, b[62]) 343 | , m = s(m, g, h, f, A, 21, b[63]); 344 | a[0] = a[0] + f | 0; 345 | a[1] = a[1] + m | 0; 346 | a[2] = a[2] + g | 0; 347 | a[3] = a[3] + h | 0 348 | }, 349 | mA4E: function() { 350 | var b = this.i9b 351 | , n = b.words 352 | , a = 8 * this.ty5D 353 | , c = 8 * b.sigBytes; 354 | n[c >>> 5] |= 128 << 24 - c % 32; 355 | var e = u.floor(a / 4294967296); 356 | n[(c + 64 >>> 9 << 4) + 15] = (e << 8 | e >>> 24) & 16711935 | (e << 24 | e >>> 8) & 4278255360; 357 | n[(c + 64 >>> 9 << 4) + 14] = (a << 8 | a >>> 24) & 16711935 | (a << 24 | a >>> 8) & 4278255360; 358 | b.sigBytes = 4 * (n.length + 1); 359 | this.kY3x(); 360 | b = this.cN1x; 361 | n = b.words; 362 | for (a = 0; 4 > a; a++) 363 | c = n[a], 364 | n[a] = (c << 8 | c >>> 24) & 16711935 | (c << 24 | c >>> 8) & 4278255360; 365 | return b 366 | }, 367 | clone: function() { 368 | var b = 
v.clone.call(this); 369 | b.cN1x = this.cN1x.clone(); 370 | return b 371 | } 372 | }); 373 | t.MD5 = v.lS3x(r); 374 | t.HmacMD5 = v.vl6f(r) 375 | } 376 | )(Math); 377 | (function() { 378 | var u = CryptoJS 379 | , p = u.lib 380 | , d = p.Base 381 | , l = p.WordArray 382 | , p = u.algo 383 | , s = p.EvpKDF = d.extend({ 384 | cfg: d.extend({ 385 | keySize: 4, 386 | hasher: p.MD5, 387 | iterations: 1 388 | }), 389 | init: function(d) { 390 | this.cfg = this.cfg.extend(d) 391 | }, 392 | compute: function(d, r) { 393 | for (var p = this.cfg, s = p.hasher.create(), b = l.create(), u = b.words, q = p.keySize, p = p.iterations; u.length < q; ) { 394 | n && s.update(n); 395 | var n = s.update(d).finalize(r); 396 | s.reset(); 397 | for (var a = 1; a < p; a++) 398 | n = s.finalize(n), 399 | s.reset(); 400 | b.concat(n) 401 | } 402 | b.sigBytes = 4 * q; 403 | return b 404 | } 405 | }); 406 | u.EvpKDF = function(d, l, p) { 407 | return s.create(p).compute(d, l) 408 | } 409 | } 410 | )(); 411 | CryptoJS.lib.Cipher || function(u) { 412 | var p = CryptoJS 413 | , d = p.lib 414 | , l = d.Base 415 | , s = d.WordArray 416 | , t = d.BufferedBlockAlgorithm 417 | , r = p.enc.Base64 418 | , w = p.algo.EvpKDF 419 | , v = d.Cipher = t.extend({ 420 | cfg: l.extend(), 421 | createEncryptor: function(e, a) { 422 | return this.create(this.JY1x, e, a) 423 | }, 424 | createDecryptor: function(e, a) { 425 | return this.create(this.bqV9M, e, a) 426 | }, 427 | init: function(e, a, b) { 428 | this.cfg = this.cfg.extend(b); 429 | this.Qq2x = e; 430 | this.L0x = a; 431 | this.reset() 432 | }, 433 | reset: function() { 434 | t.reset.call(this); 435 | this.lt3x() 436 | }, 437 | process: function(e) { 438 | this.vb6V(e); 439 | return this.kY3x() 440 | }, 441 | finalize: function(e) { 442 | e && this.vb6V(e); 443 | return this.mA4E() 444 | }, 445 | keySize: 4, 446 | ivSize: 4, 447 | JY1x: 1, 448 | bqV9M: 2, 449 | lS3x: function(e) { 450 | return { 451 | encrypt: function(b, k, d) { 452 | return ("string" == typeof k ? c : a).encrypt(e, b, k, d) 453 | }, 454 | decrypt: function(b, k, d) { 455 | return ("string" == typeof k ? c : a).decrypt(e, b, k, d) 456 | } 457 | } 458 | } 459 | }); 460 | d.StreamCipher = v.extend({ 461 | mA4E: function() { 462 | return this.kY3x(!0) 463 | }, 464 | blockSize: 1 465 | }); 466 | var b = p.mode = {} 467 | , x = function(e, a, b) { 468 | var c = this.tw5B; 469 | c ? 
this.tw5B = u : c = this.DB9s; 470 | for (var d = 0; d < b; d++) 471 | e[a + d] ^= c[d] 472 | } 473 | , q = (d.BlockCipherMode = l.extend({ 474 | createEncryptor: function(e, a) { 475 | return this.Encryptor.create(e, a) 476 | }, 477 | createDecryptor: function(e, a) { 478 | return this.Decryptor.create(e, a) 479 | }, 480 | init: function(e, a) { 481 | this.vw6q = e; 482 | this.tw5B = a 483 | } 484 | })).extend(); 485 | q.Encryptor = q.extend({ 486 | processBlock: function(e, a) { 487 | var b = this.vw6q 488 | , c = b.blockSize; 489 | x.call(this, e, a, c); 490 | b.encryptBlock(e, a); 491 | this.DB9s = e.slice(a, a + c) 492 | } 493 | }); 494 | q.Decryptor = q.extend({ 495 | processBlock: function(e, a) { 496 | var b = this.vw6q 497 | , c = b.blockSize 498 | , d = e.slice(a, a + c); 499 | b.decryptBlock(e, a); 500 | x.call(this, e, a, c); 501 | this.DB9s = d 502 | } 503 | }); 504 | b = b.CBC = q; 505 | q = (p.pad = {}).Pkcs7 = { 506 | pad: function(a, b) { 507 | for (var c = 4 * b, c = c - a.sigBytes % c, d = c << 24 | c << 16 | c << 8 | c, l = [], n = 0; n < c; n += 4) 508 | l.push(d); 509 | c = s.create(l, c); 510 | a.concat(c) 511 | }, 512 | unpad: function(a) { 513 | a.sigBytes -= a.words[a.sigBytes - 1 >>> 2] & 255 514 | } 515 | }; 516 | d.BlockCipher = v.extend({ 517 | cfg: v.cfg.extend({ 518 | mode: b, 519 | padding: q 520 | }), 521 | reset: function() { 522 | v.reset.call(this); 523 | var a = this.cfg 524 | , b = a.iv 525 | , a = a.mode; 526 | if (this.Qq2x == this.JY1x) 527 | var c = a.createEncryptor; 528 | else 529 | c = a.createDecryptor, 530 | this.JP1x = 1; 531 | this.eT2x = c.call(a, this, b && b.words) 532 | }, 533 | qL5Q: function(a, b) { 534 | this.eT2x.processBlock(a, b) 535 | }, 536 | mA4E: function() { 537 | var a = this.cfg.padding; 538 | if (this.Qq2x == this.JY1x) { 539 | a.pad(this.i9b, this.blockSize); 540 | var b = this.kY3x(!0) 541 | } else 542 | b = this.kY3x(!0), 543 | a.unpad(b); 544 | return b 545 | }, 546 | blockSize: 4 547 | }); 548 | var n = d.CipherParams = l.extend({ 549 | init: function(a) { 550 | this.mixIn(a) 551 | }, 552 | toString: function(a) { 553 | return (a || this.formatter).stringify(this) 554 | } 555 | }) 556 | , b = (p.format = {}).OpenSSL = { 557 | stringify: function(a) { 558 | var b = a.ciphertext; 559 | a = a.salt; 560 | return (a ? s.create([1398893684, 1701076831]).concat(a).concat(b) : b).toString(r) 561 | }, 562 | parse: function(a) { 563 | a = r.parse(a); 564 | var b = a.words; 565 | if (1398893684 == b[0] && 1701076831 == b[1]) { 566 | var c = s.create(b.slice(2, 4)); 567 | b.splice(0, 4); 568 | a.sigBytes -= 16 569 | } 570 | return n.create({ 571 | ciphertext: a, 572 | salt: c 573 | }) 574 | } 575 | } 576 | , a = d.SerializableCipher = l.extend({ 577 | cfg: l.extend({ 578 | format: b 579 | }), 580 | encrypt: function(a, b, c, d) { 581 | d = this.cfg.extend(d); 582 | var l = a.createEncryptor(c, d); 583 | b = l.finalize(b); 584 | l = l.cfg; 585 | return n.create({ 586 | ciphertext: b, 587 | key: c, 588 | iv: l.iv, 589 | algorithm: a, 590 | mode: l.mode, 591 | padding: l.padding, 592 | blockSize: a.blockSize, 593 | formatter: d.format 594 | }) 595 | }, 596 | decrypt: function(a, b, c, d) { 597 | d = this.cfg.extend(d); 598 | b = this.Hj0x(b, d.format); 599 | return a.createDecryptor(c, d).finalize(b.ciphertext) 600 | }, 601 | Hj0x: function(a, b) { 602 | return "string" == typeof a ? 
b.parse(a, this) : a 603 | } 604 | }) 605 | , p = (p.kdf = {}).OpenSSL = { 606 | execute: function(a, b, c, d) { 607 | d || (d = s.random(8)); 608 | a = w.create({ 609 | keySize: b + c 610 | }).compute(a, d); 611 | c = s.create(a.words.slice(b), 4 * c); 612 | a.sigBytes = 4 * b; 613 | return n.create({ 614 | key: a, 615 | iv: c, 616 | salt: d 617 | }) 618 | } 619 | } 620 | , c = d.PasswordBasedCipher = a.extend({ 621 | cfg: a.cfg.extend({ 622 | kdf: p 623 | }), 624 | encrypt: function(b, c, d, l) { 625 | l = this.cfg.extend(l); 626 | d = l.kdf.execute(d, b.keySize, b.ivSize); 627 | l.iv = d.iv; 628 | b = a.encrypt.call(this, b, c, d.key, l); 629 | b.mixIn(d); 630 | return b 631 | }, 632 | decrypt: function(b, c, d, l) { 633 | l = this.cfg.extend(l); 634 | c = this.Hj0x(c, l.format); 635 | d = l.kdf.execute(d, b.keySize, b.ivSize, c.salt); 636 | l.iv = d.iv; 637 | return a.decrypt.call(this, b, c, d.key, l) 638 | } 639 | }) 640 | }(); 641 | (function() { 642 | for (var u = CryptoJS, p = u.lib.BlockCipher, d = u.algo, l = [], s = [], t = [], r = [], w = [], v = [], b = [], x = [], q = [], n = [], a = [], c = 0; 256 > c; c++) 643 | a[c] = 128 > c ? c << 1 : c << 1 ^ 283; 644 | for (var e = 0, j = 0, c = 0; 256 > c; c++) { 645 | var k = j ^ j << 1 ^ j << 2 ^ j << 3 ^ j << 4 646 | , k = k >>> 8 ^ k & 255 ^ 99; 647 | l[e] = k; 648 | s[k] = e; 649 | var z = a[e] 650 | , F = a[z] 651 | , G = a[F] 652 | , y = 257 * a[k] ^ 16843008 * k; 653 | t[e] = y << 24 | y >>> 8; 654 | r[e] = y << 16 | y >>> 16; 655 | w[e] = y << 8 | y >>> 24; 656 | v[e] = y; 657 | y = 16843009 * G ^ 65537 * F ^ 257 * z ^ 16843008 * e; 658 | b[k] = y << 24 | y >>> 8; 659 | x[k] = y << 16 | y >>> 16; 660 | q[k] = y << 8 | y >>> 24; 661 | n[k] = y; 662 | e ? (e = z ^ a[a[a[G ^ z]]], 663 | j ^= a[a[j]]) : e = j = 1 664 | } 665 | var H = [0, 1, 2, 4, 8, 16, 32, 64, 128, 27, 54] 666 | , d = d.AES = p.extend({ 667 | lt3x: function() { 668 | for (var a = this.L0x, c = a.words, d = a.sigBytes / 4, a = 4 * ((this.beT6N = d + 6) + 1), e = this.bqT9K = [], j = 0; j < a; j++) 669 | if (j < d) 670 | e[j] = c[j]; 671 | else { 672 | var k = e[j - 1]; 673 | j % d ? 6 < d && 4 == j % d && (k = l[k >>> 24] << 24 | l[k >>> 16 & 255] << 16 | l[k >>> 8 & 255] << 8 | l[k & 255]) : (k = k << 8 | k >>> 24, 674 | k = l[k >>> 24] << 24 | l[k >>> 16 & 255] << 16 | l[k >>> 8 & 255] << 8 | l[k & 255], 675 | k ^= H[j / d | 0] << 24); 676 | e[j] = e[j - d] ^ k 677 | } 678 | c = this.bqS9J = []; 679 | for (d = 0; d < a; d++) 680 | j = a - d, 681 | k = d % 4 ? e[j] : e[j - 4], 682 | c[d] = 4 > d || 4 >= j ? 
k : b[l[k >>> 24]] ^ x[l[k >>> 16 & 255]] ^ q[l[k >>> 8 & 255]] ^ n[l[k & 255]] 683 | }, 684 | encryptBlock: function(a, b) { 685 | this.DA9r(a, b, this.bqT9K, t, r, w, v, l) 686 | }, 687 | decryptBlock: function(a, c) { 688 | var d = a[c + 1]; 689 | a[c + 1] = a[c + 3]; 690 | a[c + 3] = d; 691 | this.DA9r(a, c, this.bqS9J, b, x, q, n, s); 692 | d = a[c + 1]; 693 | a[c + 1] = a[c + 3]; 694 | a[c + 3] = d 695 | }, 696 | DA9r: function(a, b, c, d, e, j, l, f) { 697 | for (var m = this.beT6N, g = a[b] ^ c[0], h = a[b + 1] ^ c[1], k = a[b + 2] ^ c[2], n = a[b + 3] ^ c[3], p = 4, r = 1; r < m; r++) 698 | var q = d[g >>> 24] ^ e[h >>> 16 & 255] ^ j[k >>> 8 & 255] ^ l[n & 255] ^ c[p++] 699 | , s = d[h >>> 24] ^ e[k >>> 16 & 255] ^ j[n >>> 8 & 255] ^ l[g & 255] ^ c[p++] 700 | , t = d[k >>> 24] ^ e[n >>> 16 & 255] ^ j[g >>> 8 & 255] ^ l[h & 255] ^ c[p++] 701 | , n = d[n >>> 24] ^ e[g >>> 16 & 255] ^ j[h >>> 8 & 255] ^ l[k & 255] ^ c[p++] 702 | , g = q 703 | , h = s 704 | , k = t; 705 | q = (f[g >>> 24] << 24 | f[h >>> 16 & 255] << 16 | f[k >>> 8 & 255] << 8 | f[n & 255]) ^ c[p++]; 706 | s = (f[h >>> 24] << 24 | f[k >>> 16 & 255] << 16 | f[n >>> 8 & 255] << 8 | f[g & 255]) ^ c[p++]; 707 | t = (f[k >>> 24] << 24 | f[n >>> 16 & 255] << 16 | f[g >>> 8 & 255] << 8 | f[h & 255]) ^ c[p++]; 708 | n = (f[n >>> 24] << 24 | f[g >>> 16 & 255] << 16 | f[h >>> 8 & 255] << 8 | f[k & 255]) ^ c[p++]; 709 | a[b] = q; 710 | a[b + 1] = s; 711 | a[b + 2] = t; 712 | a[b + 3] = n 713 | }, 714 | keySize: 8 715 | }); 716 | u.AES = p.lS3x(d) 717 | } 718 | )(); 719 | function RSAKeyPair(a, b, c) { 720 | this.e = biFromHex(a), 721 | this.d = biFromHex(b), 722 | this.m = biFromHex(c), 723 | this.chunkSize = 2 * biHighIndex(this.m), 724 | this.radix = 16, 725 | this.barrett = new BarrettMu(this.m) 726 | } 727 | function twoDigit(a) { 728 | return (10 > a ? "0" : "") + String(a) 729 | } 730 | function encryptedString(a, b) { 731 | for (var f, g, h, i, j, k, l, c = new Array, d = b.length, e = 0; d > e; ) 732 | c[e] = b.charCodeAt(e), 733 | e++; 734 | for (; 0 != c.length % a.chunkSize; ) 735 | c[e++] = 0; 736 | for (f = c.length, 737 | g = "", 738 | e = 0; f > e; e += a.chunkSize) { 739 | for (j = new BigInt, 740 | h = 0, 741 | i = e; i < e + a.chunkSize; ++h) 742 | j.digits[h] = c[i++], 743 | j.digits[h] += c[i++] << 8; 744 | k = a.barrett.powMod(j, a.e), 745 | l = 16 == a.radix ? biToHex(k) : biToString(k, a.radix), 746 | g += l + " " 747 | } 748 | return g.substring(0, g.length - 1) 749 | } 750 | function decryptedString(a, b) { 751 | var e, f, g, h, c = b.split(" "), d = ""; 752 | for (e = 0; e < c.length; ++e) 753 | for (h = 16 == a.radix ? biFromHex(c[e]) : biFromString(c[e], a.radix), 754 | g = a.barrett.powMod(h, a.d), 755 | f = 0; f <= biHighIndex(g); ++f) 756 | d += String.fromCharCode(255 & g.digits[f], g.digits[f] >> 8); 757 | return 0 == d.charCodeAt(d.length - 1) && (d = d.substring(0, d.length - 1)), 758 | d 759 | } 760 | function setMaxDigits(a) { 761 | maxDigits = a, 762 | ZERO_ARRAY = new Array(maxDigits); 763 | for (var b = 0; b < ZERO_ARRAY.length; b++) 764 | ZERO_ARRAY[b] = 0; 765 | bigZero = new BigInt, 766 | bigOne = new BigInt, 767 | bigOne.digits[0] = 1 768 | } 769 | function BigInt(a) { 770 | this.digits = "boolean" == typeof a && 1 == a ? null : ZERO_ARRAY.slice(0), 771 | this.isNeg = !1 772 | } 773 | function biFromDecimal(a) { 774 | for (var d, e, f, b = "-" == a.charAt(0), c = b ? 
1 : 0; c < a.length && "0" == a.charAt(c); ) 775 | ++c; 776 | if (c == a.length) 777 | d = new BigInt; 778 | else { 779 | for (e = a.length - c, 780 | f = e % dpl10, 781 | 0 == f && (f = dpl10), 782 | d = biFromNumber(Number(a.substr(c, f))), 783 | c += f; c < a.length; ) 784 | d = biAdd(biMultiply(d, lr10), biFromNumber(Number(a.substr(c, dpl10)))), 785 | c += dpl10; 786 | d.isNeg = b 787 | } 788 | return d 789 | } 790 | function biCopy(a) { 791 | var b = new BigInt(!0); 792 | return b.digits = a.digits.slice(0), 793 | b.isNeg = a.isNeg, 794 | b 795 | } 796 | function biFromNumber(a) { 797 | var c, b = new BigInt; 798 | for (b.isNeg = 0 > a, 799 | a = Math.abs(a), 800 | c = 0; a > 0; ) 801 | b.digits[c++] = a & maxDigitVal, 802 | a >>= biRadixBits; 803 | return b 804 | } 805 | function reverseStr(a) { 806 | var c, b = ""; 807 | for (c = a.length - 1; c > -1; --c) 808 | b += a.charAt(c); 809 | return b 810 | } 811 | function biToString(a, b) { 812 | var d, e, c = new BigInt; 813 | for (c.digits[0] = b, 814 | d = biDivideModulo(a, c), 815 | e = hexatrigesimalToChar[d[1].digits[0]]; 1 == biCompare(d[0], bigZero); ) 816 | d = biDivideModulo(d[0], c), 817 | digit = d[1].digits[0], 818 | e += hexatrigesimalToChar[d[1].digits[0]]; 819 | return (a.isNeg ? "-" : "") + reverseStr(e) 820 | } 821 | function biToDecimal(a) { 822 | var c, d, b = new BigInt; 823 | for (b.digits[0] = 10, 824 | c = biDivideModulo(a, b), 825 | d = String(c[1].digits[0]); 1 == biCompare(c[0], bigZero); ) 826 | c = biDivideModulo(c[0], b), 827 | d += String(c[1].digits[0]); 828 | return (a.isNeg ? "-" : "") + reverseStr(d) 829 | } 830 | function digitToHex(a) { 831 | var b = 15 832 | , c = ""; 833 | for (i = 0; 4 > i; ++i) 834 | c += hexToChar[a & b], 835 | a >>>= 4; 836 | return reverseStr(c) 837 | } 838 | function biToHex(a) { 839 | var d, b = ""; 840 | for (biHighIndex(a), 841 | d = biHighIndex(a); d > -1; --d) 842 | b += digitToHex(a.digits[d]); 843 | return b 844 | } 845 | function charToHex(a) { 846 | var h, b = 48, c = b + 9, d = 97, e = d + 25, f = 65, g = 90; 847 | return h = a >= b && c >= a ? a - b : a >= f && g >= a ? 10 + a - f : a >= d && e >= a ? 10 + a - d : 0 848 | } 849 | function hexToDigit(a) { 850 | var d, b = 0, c = Math.min(a.length, 4); 851 | for (d = 0; c > d; ++d) 852 | b <<= 4, 853 | b |= charToHex(a.charCodeAt(d)); 854 | return b 855 | } 856 | function biFromHex(a) { 857 | var d, e, b = new BigInt, c = a.length; 858 | for (d = c, 859 | e = 0; d > 0; d -= 4, 860 | ++e) 861 | b.digits[e] = hexToDigit(a.substr(Math.max(d - 4, 0), Math.min(d, 4))); 862 | return b 863 | } 864 | function biFromString(a, b) { 865 | var g, h, i, j, c = "-" == a.charAt(0), d = c ? 1 : 0, e = new BigInt, f = new BigInt; 866 | for (f.digits[0] = 1, 867 | g = a.length - 1; g >= d; g--) 868 | h = a.charCodeAt(g), 869 | i = charToHex(h), 870 | j = biMultiplyDigit(f, i), 871 | e = biAdd(e, j), 872 | f = biMultiplyDigit(f, b); 873 | return e.isNeg = c, 874 | e 875 | } 876 | function biDump(a) { 877 | return (a.isNeg ? 
"-" : "") + a.digits.join(" ") 878 | } 879 | function biAdd(a, b) { 880 | var c, d, e, f; 881 | if (a.isNeg != b.isNeg) 882 | b.isNeg = !b.isNeg, 883 | c = biSubtract(a, b), 884 | b.isNeg = !b.isNeg; 885 | else { 886 | for (c = new BigInt, 887 | d = 0, 888 | f = 0; f < a.digits.length; ++f) 889 | e = a.digits[f] + b.digits[f] + d, 890 | c.digits[f] = 65535 & e, 891 | d = Number(e >= biRadix); 892 | c.isNeg = a.isNeg 893 | } 894 | return c 895 | } 896 | function biSubtract(a, b) { 897 | var c, d, e, f; 898 | if (a.isNeg != b.isNeg) 899 | b.isNeg = !b.isNeg, 900 | c = biAdd(a, b), 901 | b.isNeg = !b.isNeg; 902 | else { 903 | for (c = new BigInt, 904 | e = 0, 905 | f = 0; f < a.digits.length; ++f) 906 | d = a.digits[f] - b.digits[f] + e, 907 | c.digits[f] = 65535 & d, 908 | c.digits[f] < 0 && (c.digits[f] += biRadix), 909 | e = 0 - Number(0 > d); 910 | if (-1 == e) { 911 | for (e = 0, 912 | f = 0; f < a.digits.length; ++f) 913 | d = 0 - c.digits[f] + e, 914 | c.digits[f] = 65535 & d, 915 | c.digits[f] < 0 && (c.digits[f] += biRadix), 916 | e = 0 - Number(0 > d); 917 | c.isNeg = !a.isNeg 918 | } else 919 | c.isNeg = a.isNeg 920 | } 921 | return c 922 | } 923 | function biHighIndex(a) { 924 | for (var b = a.digits.length - 1; b > 0 && 0 == a.digits[b]; ) 925 | --b; 926 | return b 927 | } 928 | function biNumBits(a) { 929 | var e, b = biHighIndex(a), c = a.digits[b], d = (b + 1) * bitsPerDigit; 930 | for (e = d; e > d - bitsPerDigit && 0 == (32768 & c); --e) 931 | c <<= 1; 932 | return e 933 | } 934 | function biMultiply(a, b) { 935 | var d, h, i, k, c = new BigInt, e = biHighIndex(a), f = biHighIndex(b); 936 | for (k = 0; f >= k; ++k) { 937 | for (d = 0, 938 | i = k, 939 | j = 0; e >= j; ++j, 940 | ++i) 941 | h = c.digits[i] + a.digits[j] * b.digits[k] + d, 942 | c.digits[i] = h & maxDigitVal, 943 | d = h >>> biRadixBits; 944 | c.digits[k + e + 1] = d 945 | } 946 | return c.isNeg = a.isNeg != b.isNeg, 947 | c 948 | } 949 | function biMultiplyDigit(a, b) { 950 | var c, d, e, f; 951 | for (result = new BigInt, 952 | c = biHighIndex(a), 953 | d = 0, 954 | f = 0; c >= f; ++f) 955 | e = result.digits[f] + a.digits[f] * b + d, 956 | result.digits[f] = e & maxDigitVal, 957 | d = e >>> biRadixBits; 958 | return result.digits[1 + c] = d, 959 | result 960 | } 961 | function arrayCopy(a, b, c, d, e) { 962 | var g, h, f = Math.min(b + e, a.length); 963 | for (g = b, 964 | h = d; f > g; ++g, 965 | ++h) 966 | c[h] = a[g] 967 | } 968 | function biShiftLeft(a, b) { 969 | var e, f, g, h, c = Math.floor(b / bitsPerDigit), d = new BigInt; 970 | for (arrayCopy(a.digits, 0, d.digits, c, d.digits.length - c), 971 | e = b % bitsPerDigit, 972 | f = bitsPerDigit - e, 973 | g = d.digits.length - 1, 974 | h = g - 1; g > 0; --g, 975 | --h) 976 | d.digits[g] = d.digits[g] << e & maxDigitVal | (d.digits[h] & highBitMasks[e]) >>> f; 977 | return d.digits[0] = d.digits[g] << e & maxDigitVal, 978 | d.isNeg = a.isNeg, 979 | d 980 | } 981 | function biShiftRight(a, b) { 982 | var e, f, g, h, c = Math.floor(b / bitsPerDigit), d = new BigInt; 983 | for (arrayCopy(a.digits, c, d.digits, 0, a.digits.length - c), 984 | e = b % bitsPerDigit, 985 | f = bitsPerDigit - e, 986 | g = 0, 987 | h = g + 1; g < d.digits.length - 1; ++g, 988 | ++h) 989 | d.digits[g] = d.digits[g] >>> e | (d.digits[h] & lowBitMasks[e]) << f; 990 | return d.digits[d.digits.length - 1] >>>= e, 991 | d.isNeg = a.isNeg, 992 | d 993 | } 994 | function biMultiplyByRadixPower(a, b) { 995 | var c = new BigInt; 996 | return arrayCopy(a.digits, 0, c.digits, b, 
c.digits.length - b), 997 | c 998 | } 999 | function biDivideByRadixPower(a, b) { 1000 | var c = new BigInt; 1001 | return arrayCopy(a.digits, b, c.digits, 0, c.digits.length - b), 1002 | c 1003 | } 1004 | function biModuloByRadixPower(a, b) { 1005 | var c = new BigInt; 1006 | return arrayCopy(a.digits, 0, c.digits, 0, b), 1007 | c 1008 | } 1009 | function biCompare(a, b) { 1010 | if (a.isNeg != b.isNeg) 1011 | return 1 - 2 * Number(a.isNeg); 1012 | for (var c = a.digits.length - 1; c >= 0; --c) 1013 | if (a.digits[c] != b.digits[c]) 1014 | return a.isNeg ? 1 - 2 * Number(a.digits[c] > b.digits[c]) : 1 - 2 * Number(a.digits[c] < b.digits[c]); 1015 | return 0 1016 | } 1017 | function biDivideModulo(a, b) { 1018 | var f, g, h, i, j, k, l, m, n, o, p, q, r, s, c = biNumBits(a), d = biNumBits(b), e = b.isNeg; 1019 | if (d > c) 1020 | return a.isNeg ? (f = biCopy(bigOne), 1021 | f.isNeg = !b.isNeg, 1022 | a.isNeg = !1, 1023 | b.isNeg = !1, 1024 | g = biSubtract(b, a), 1025 | a.isNeg = !0, 1026 | b.isNeg = e) : (f = new BigInt, 1027 | g = biCopy(a)), 1028 | new Array(f,g); 1029 | for (f = new BigInt, 1030 | g = a, 1031 | h = Math.ceil(d / bitsPerDigit) - 1, 1032 | i = 0; b.digits[h] < biHalfRadix; ) 1033 | b = biShiftLeft(b, 1), 1034 | ++i, 1035 | ++d, 1036 | h = Math.ceil(d / bitsPerDigit) - 1; 1037 | for (g = biShiftLeft(g, i), 1038 | c += i, 1039 | j = Math.ceil(c / bitsPerDigit) - 1, 1040 | k = biMultiplyByRadixPower(b, j - h); -1 != biCompare(g, k); ) 1041 | ++f.digits[j - h], 1042 | g = biSubtract(g, k); 1043 | for (l = j; l > h; --l) { 1044 | for (m = l >= g.digits.length ? 0 : g.digits[l], 1045 | n = l - 1 >= g.digits.length ? 0 : g.digits[l - 1], 1046 | o = l - 2 >= g.digits.length ? 0 : g.digits[l - 2], 1047 | p = h >= b.digits.length ? 0 : b.digits[h], 1048 | q = h - 1 >= b.digits.length ? 0 : b.digits[h - 1], 1049 | f.digits[l - h - 1] = m == p ? maxDigitVal : Math.floor((m * biRadix + n) / p), 1050 | r = f.digits[l - h - 1] * (p * biRadix + q), 1051 | s = m * biRadixSquared + (n * biRadix + o); r > s; ) 1052 | --f.digits[l - h - 1], 1053 | r = f.digits[l - h - 1] * (p * biRadix | q), 1054 | s = m * biRadix * biRadix + (n * biRadix + o); 1055 | k = biMultiplyByRadixPower(b, l - h - 1), 1056 | g = biSubtract(g, biMultiplyDigit(k, f.digits[l - h - 1])), 1057 | g.isNeg && (g = biAdd(g, k), 1058 | --f.digits[l - h - 1]) 1059 | } 1060 | return g = biShiftRight(g, i), 1061 | f.isNeg = a.isNeg != e, 1062 | a.isNeg && (f = e ? 
biAdd(f, bigOne) : biSubtract(f, bigOne), 1063 | b = biShiftRight(b, i), 1064 | g = biSubtract(b, g)), 1065 | 0 == g.digits[0] && 0 == biHighIndex(g) && (g.isNeg = !1), 1066 | new Array(f,g) 1067 | } 1068 | function biDivide(a, b) { 1069 | return biDivideModulo(a, b)[0] 1070 | } 1071 | function biModulo(a, b) { 1072 | return biDivideModulo(a, b)[1] 1073 | } 1074 | function biMultiplyMod(a, b, c) { 1075 | return biModulo(biMultiply(a, b), c) 1076 | } 1077 | function biPow(a, b) { 1078 | for (var c = bigOne, d = a; ; ) { 1079 | if (0 != (1 & b) && (c = biMultiply(c, d)), 1080 | b >>= 1, 1081 | 0 == b) 1082 | break; 1083 | d = biMultiply(d, d) 1084 | } 1085 | return c 1086 | } 1087 | function biPowMod(a, b, c) { 1088 | for (var d = bigOne, e = a, f = b; ; ) { 1089 | if (0 != (1 & f.digits[0]) && (d = biMultiplyMod(d, e, c)), 1090 | f = biShiftRight(f, 1), 1091 | 0 == f.digits[0] && 0 == biHighIndex(f)) 1092 | break; 1093 | e = biMultiplyMod(e, e, c) 1094 | } 1095 | return d 1096 | } 1097 | function BarrettMu(a) { 1098 | this.modulus = biCopy(a), 1099 | this.k = biHighIndex(this.modulus) + 1; 1100 | var b = new BigInt; 1101 | b.digits[2 * this.k] = 1, 1102 | this.mu = biDivide(b, this.modulus), 1103 | this.bkplus1 = new BigInt, 1104 | this.bkplus1.digits[this.k + 1] = 1, 1105 | this.modulo = BarrettMu_modulo, 1106 | this.multiplyMod = BarrettMu_multiplyMod, 1107 | this.powMod = BarrettMu_powMod 1108 | } 1109 | function BarrettMu_modulo(a) { 1110 | var i, b = biDivideByRadixPower(a, this.k - 1), c = biMultiply(b, this.mu), d = biDivideByRadixPower(c, this.k + 1), e = biModuloByRadixPower(a, this.k + 1), f = biMultiply(d, this.modulus), g = biModuloByRadixPower(f, this.k + 1), h = biSubtract(e, g); 1111 | for (h.isNeg && (h = biAdd(h, this.bkplus1)), 1112 | i = biCompare(h, this.modulus) >= 0; i; ) 1113 | h = biSubtract(h, this.modulus), 1114 | i = biCompare(h, this.modulus) >= 0; 1115 | return h 1116 | } 1117 | function BarrettMu_multiplyMod(a, b) { 1118 | var c = biMultiply(a, b); 1119 | return this.modulo(c) 1120 | } 1121 | function BarrettMu_powMod(a, b) { 1122 | var d, e, c = new BigInt; 1123 | for (c.digits[0] = 1, 1124 | d = a, 1125 | e = b; ; ) { 1126 | if (0 != (1 & e.digits[0]) && (c = this.multiplyMod(c, d)), 1127 | e = biShiftRight(e, 1), 1128 | 0 == e.digits[0] && 0 == biHighIndex(e)) 1129 | break; 1130 | d = this.multiplyMod(d, d) 1131 | } 1132 | return c 1133 | } 1134 | var maxDigits, ZERO_ARRAY, bigZero, bigOne, dpl10, lr10, hexatrigesimalToChar, hexToChar, highBitMasks, lowBitMasks, biRadixBase = 2, biRadixBits = 16, bitsPerDigit = biRadixBits, biRadix = 65536, biHalfRadix = biRadix >>> 1, biRadixSquared = biRadix * biRadix, maxDigitVal = biRadix - 1, maxInteger = 9999999999999998; 1135 | setMaxDigits(20), 1136 | dpl10 = 15, 1137 | lr10 = biFromNumber(1e15), 1138 | hexatrigesimalToChar = new Array("0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"), 1139 | hexToChar = new Array("0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f"), 1140 | highBitMasks = new Array(0,32768,49152,57344,61440,63488,64512,65024,65280,65408,65472,65504,65520,65528,65532,65534,65535), 1141 | lowBitMasks = new Array(0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535); 1142 | 1143 | 1144 | 1145 | !function() { 1146 | function a(a) { 1147 | var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = ""; 1148 | for (d = 0; a > d; d += 1) 1149 | e = Math.random() * 
b.length, 1150 | e = Math.floor(e), 1151 | c += b.charAt(e); 1152 | return c 1153 | } 1154 | function b(a, b) { 1155 | var c = CryptoJS.enc.Utf8.parse(b) 1156 | , d = CryptoJS.enc.Utf8.parse("0102030405060708") 1157 | , e = CryptoJS.enc.Utf8.parse(a) 1158 | , f = CryptoJS.AES.encrypt(e, c, { 1159 | iv: d, 1160 | mode: CryptoJS.mode.CBC 1161 | }); 1162 | return f.toString() 1163 | } 1164 | function c(a, b, c) { 1165 | var d, e; 1166 | return setMaxDigits(131), 1167 | d = new RSAKeyPair(b,"",c), 1168 | e = encryptedString(d, a) 1169 | } 1170 | function d(d, e, f, g) { 1171 | var h = {} 1172 | , i = a(16); 1173 | return h.encText = b(d, g), 1174 | h.encText = b(h.encText, i), 1175 | h.encSecKey = c(i, e, f), 1176 | h 1177 | } 1178 | function e(a, b, d, e) { 1179 | var f = {}; 1180 | return f.encText = c(a + e, b, d), 1181 | f 1182 | } 1183 | asrsea = d, 1184 | ecnonasr = e 1185 | }(); 1186 | 1187 | // 这个函数是启动函数,接收一个歌曲ID。获取到对应的加密参数 1188 | function start(music_id) { 1189 | var i9b = { 1190 | "rid":"R_SO_4_" + music_id, 1191 | // 偏移量。可理解为初始下标 1192 | "offset": 0, 1193 | "total":"false", 1194 | // 每页的请求数量 1195 | "limit": 100, 1196 | "csrf_token":"" 1197 | }; 1198 | var bYf7Y = asrsea(JSON.stringify(i9b), "010001", "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7", "0CoJUm6Qyw8W8jud"); 1199 | return bYf7Y; 1200 | } 1201 | 1202 | -------------------------------------------------------------------------------- /music163/Music.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: monkey-hjy 3 | # @Date: 2021-02-24 17:42:40 4 | # @Last Modified by: monkey-hjy 5 | # @Last Modified time: 2021-02-25 10:49:45 6 | import requests 7 | import execjs 8 | import json 9 | 10 | 11 | class Music(object): 12 | """破解网易云音乐JS加密获取数据""" 13 | 14 | def __init__(self): 15 | self.get_comment_url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token=' 16 | 17 | @staticmethod 18 | def get_response(method=None, url=None, headers=None, data=None): 19 | """ 20 | 发起请求 21 | :params: method 请求类型:GET/POST 22 | :params: url 请求链接 23 | :params: headers 请求头 24 | :params: data post请求的表单 25 | """ 26 | if method is None: 27 | return '请求参数有误 -- method is None' 28 | if url is None: 29 | return '请求链接有误 --- url is None' 30 | if headers is None: 31 | headers = { 32 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)" 33 | "Chrome/88.0.4324.182 Safari/537.36", 34 | } 35 | if method == 'GET': 36 | response = requests.get(url=url, headers=headers) 37 | elif method == 'POST': 38 | response = requests.post(url=url, headers=headers, data=data) 39 | else: 40 | return '请求参数有误 -- method undefined' 41 | response.encoding = 'utf8' 42 | if response.status_code == 200: 43 | return response 44 | else: 45 | return '请求失败。状态码 %d' % response.status_code 46 | 47 | @staticmethod 48 | def get_token(music_id): 49 | """ 50 | 根据歌曲ID获取到对应的加密参数 51 | :param music_id: 需要抓取的歌曲ID 52 | """ 53 | js_file = open('Music.js', encoding='utf8').read() 54 | ctx = execjs.compile(js_file, cwd=r'C:\Users\Spider\AppData\Roaming\npm\node_modules') 55 | token = ctx.call('start', music_id) 56 | return { 57 | 'params': token['encText'], 58 | 'encSecKey': token['encSecKey'] 59 | } 60 | 61 | def get_comment(self, music_id): 62 | """ 63 | 获取评论数据 64 | 
:params music_id 歌曲id 65 | """ 66 | comment_response = self.get_response(method='POST', url=self.get_comment_url.format(music_id), 67 | data=self.get_token(music_id=music_id)).json() 68 | # 解析这个json串,即可获取到对应的数据 69 | print(json.dumps(comment_response)) 70 | 71 | def run(self): 72 | """启动函数""" 73 | test_music_id = 1366216050 74 | self.get_comment(music_id=test_music_id) 75 | 76 | 77 | if __name__ == '__main__': 78 | m = Music() 79 | m.run() 80 | -------------------------------------------------------------------------------- /qcc/qcc.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # @File : qcc.py 4 | # @Author : Monkey 5 | # @DATE : 2021/5/11 下午5:13 6 | 7 | import requests 8 | import re 9 | from lxml import etree 10 | 11 | 12 | class QCC(object): 13 | """企查查爬虫""" 14 | 15 | def __init__(self): 16 | self._headers = { 17 | 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36', 18 | } 19 | 20 | def get_cookie(self): 21 | """发起一次测试请求,获取到搜索的cookie""" 22 | url = 'https://www.qcc.com/web/search?key=测试' 23 | response = requests.get(url, headers=self._headers, allow_redirects=False) 24 | response.encoding = 'utf8' 25 | result = re.findall(r'div>您的请求ID是: \n(.*)', response.text)  # 匹配到行尾,捕获acw_tc的值 26 | if result: 27 | return result[0] 28 | 29 | def search(self, search_keyword): 30 | """搜索""" 31 | url = 'https://www.qcc.com/web/search?key={}'.format(search_keyword) 32 | headers = self._headers 33 | headers['cookie'] = 'acw_tc={}'.format(self.get_cookie()) 34 | response = requests.get(url, headers=headers) 35 | response.encoding = 'utf8' 36 | html = etree.HTML(response.text) 37 | com_url = html.xpath('//a[@class="title"]/@href') 38 | print('搜索到{}条结果。即将开始获取详细信息...'.format(len(com_url))) 39 | for url in com_url: 40 | self.get_com_info(url) 41 | 42 | def get_com_info(self, url): 43 | """获取公司的详细信息""" 44 | response = requests.get(url, headers=self._headers) 45 | html = etree.HTML(response.text) 46 | info_elements = html.xpath('//table[@class="ntable"]/tr') 47 | item = {'url': url} 48 | flag = True 49 | for element in info_elements: 50 | if not flag: 51 | break 52 | for index in range(0, len(element.xpath('./td')), 2): 53 | try: 54 | key = element.xpath('./td[{}]/text()'.format(index+1))[0].strip() 55 | if key == '公司介绍:' or key == '经营范围': 56 | flag = False 57 | if key == '法定代表人': 58 | item[key] = element.xpath('./td[{}]//h2/text()'.format(index+2))[0].strip() 59 | else: 60 | item[key] = element.xpath('./td[{}]//text()'.format(index+2))[0].strip() 61 | except: 62 | pass 63 | print(item) 64 | 65 | def run(self): 66 | """启动函数""" 67 | self.search(search_keyword='腾讯') 68 | 69 | 70 | if __name__ == '__main__': 71 | t = QCC() 72 | t.run() 73 | 74 | 75 | -------------------------------------------------------------------------------- /scrapeCenter/spa1/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/13 18:10 4 | # Auth: HJY 5 | 6 | """Ajax请求返回数据""" 7 | 8 | import requests 9 | from loguru import logger 10 | 11 | url = 'https://spa1.scrape.center/api/movie/?limit=100&offset=0' 12 | response = requests.get(url).json() 13 | for info in response['results']: 14 | logger.info(f'name: {info["name"]}, published_at: {info["published_at"]}, score: {info["score"]}') 15 |
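16 | 17 | # 补充示意:按 limit/offset 翻页抓取全部数据(假设数据取完后接口返回空的 results 列表,函数名与字段仅为示意,以实际返回为准) 18 | def crawl_all(limit=100): 19 | offset = 0 20 | while True: 21 | page = requests.get(f'https://spa1.scrape.center/api/movie/?limit={limit}&offset={offset}').json() 22 | if not page.get('results'): 23 | break 24 | for movie in page['results']: 25 | logger.info(f'name: {movie["name"]}, score: {movie["score"]}') 26 | offset += limit 27 | --------------------------------------------------------------------------------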
/scrapeCenter/spa14/Wasm.wasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/monkey-hjy/python-spider/c65088d79b14643600bbae2796142b7a2384bf5c/scrapeCenter/spa14/Wasm.wasm -------------------------------------------------------------------------------- /scrapeCenter/spa14/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/14 11:35 4 | # Auth: HJY 5 | 6 | """ 7 | wasm加密 8 | e = this.$wasm.asm.encrypt(offset, time); 9 | """ 10 | 11 | import requests 12 | import pywasm 13 | 14 | import time 15 | import os 16 | 17 | wasm_fun = pywasm.load('scrapeCenter/spa14/Wasm.wasm') 18 | res = wasm_fun.exec('encrypt', [0, int(time.time())]) 19 | 20 | url = 'https://spa14.scrape.center/api/movie/' 21 | headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36' 23 | } 24 | params = { 25 | 'limit': 100, 26 | 'offset': 0, 27 | 'sign': res 28 | } 29 | response = requests.get(url, headers=headers, params=params).json() 30 | print(len(response['results'])) 31 | -------------------------------------------------------------------------------- /scrapeCenter/spa15/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/14 11:53 4 | # Auth: HJY 5 | 6 | """ 7 | wasm加密 8 | this.$wasm.ccall("encrypt", "string", ["string", "string"], [this.$store.state.url.index, Math.round((new Date).getTime() / 1e3).toString()]); 9 | this.$wasm.ccall; 10 | """ 11 | 12 | -------------------------------------------------------------------------------- /scrapeCenter/spa15/demo.js: -------------------------------------------------------------------------------- 1 | t = {} 2 | t["_encrypt"] = function() { 3 | return (t["_encrypt"] = t["asm"]["encrypt"]).apply(null, arguments) 4 | } 5 | t["stackSave"] = function() { 6 | return (bt = t["stackSave"] = t["asm"]["stackSave"]).apply(null, arguments) 7 | } 8 | 9 | function ot(n) { 10 | t["onAbort"] && t["onAbort"](n), 11 | n += "", 12 | S(n), 13 | j = !0, 14 | 1, 15 | n = "abort(" + n + "). Build with -s ASSERTIONS=1 for more info."; 16 | var e = new WebAssembly.RuntimeError(n); 17 | throw c(e), 18 | e 19 | } 20 | 21 | function k(t, n) { 22 | t || ot("Assertion failed: " + n) 23 | } 24 | 25 | function I(n) { 26 | var e = t["_" + n]; 27 | return k(e, "Cannot call unknown function " + n + ", make sure it is exported"), 28 | e 29 | } 30 | 31 | function L(t, n, e, r, i) { 32 | var o = { 33 | string: function(t) { 34 | var n = 0; 35 | if (null !== t && void 0 !== t && 0 !== t) { 36 | var e = 1 + (t.length << 2); 37 | n = xt(e), 38 | N(t, n, e) 39 | } 40 | return n 41 | }, 42 | array: function(t) { 43 | var n = xt(t.length); 44 | return D(t, n), 45 | n 46 | } 47 | }; 48 | function a(t) { 49 | return "string" === n ? W(t) : "boolean" === n ? Boolean(t) : t 50 | } 51 | var c = I(t) 52 | , u = [] 53 | , s = 0; 54 | if (r) 55 | for (var f = 0; f < r.length; f++) { 56 | var l = o[e[f]]; 57 | console.log('l: ', l) 58 | l ? 
(0 === s && (s = bt()), 59 | u[f] = l(r[f])) : u[f] = r[f] 60 | } 61 | var h = c.apply(null, u); 62 | return h = a(h), 63 | 0 !== s && _t(s), 64 | h 65 | } 66 | L("encrypt", "string", ["string", "string"], ['/api/movie', 1655534908]); -------------------------------------------------------------------------------- /scrapeCenter/spa16/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/18 15:14 4 | # Auth: HJY 5 | 6 | """http2协议""" 7 | 8 | import httpx 9 | client = httpx.Client(http2=True) 10 | url = 'https://spa16.scrape.center/api/book/?limit=18&offset=0' 11 | headers = { 12 | 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36' 13 | } 14 | response = client.get(url, headers=headers) 15 | print(response.text) 16 | -------------------------------------------------------------------------------- /scrapeCenter/spa2/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/13 18:13 4 | # Auth: HJY 5 | 6 | """有token参数加密""" 7 | 8 | import requests 9 | 10 | from loguru import logger 11 | import hashlib 12 | import base64 13 | import time 14 | 15 | 16 | def get_token(offset): 17 | now_t = str(int(time.time())) 18 | res = hashlib.sha1(f'/api/movie,{offset},{now_t}'.encode('utf8')).hexdigest() 19 | res += f',{now_t}' 20 | res = base64.b64encode(res.encode('utf8')).decode() 21 | return res 22 | 23 | url = 'https://spa2.scrape.center/api/movie/' 24 | params = { 25 | 'limit': 100, 26 | 'offset': 0, 27 | 'token': get_token(0) 28 | } 29 | response = requests.get(url, params=params).json() 30 | for info in response['results']: 31 | logger.info(f'name: {info["name"]}, published_at: {info["published_at"]}, score: {info["score"]}') 32 | -------------------------------------------------------------------------------- /scrapeCenter/spa3/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/13 18:40 4 | # Auth: HJY 5 | 6 | """下滑页面获取新数据""" 7 | 8 | import requests 9 | 10 | url = 'https://spa3.scrape.center/api/movie/?limit=100&offset=0' 11 | response = requests.get(url).json() 12 | print(len(response['results'])) 13 | -------------------------------------------------------------------------------- /scrapeCenter/spa5/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/13 18:44 4 | # Auth: HJY 5 | 6 | """动态渲染""" 7 | 8 | import requests 9 | 10 | url = 'https://spa5.scrape.center/api/book/?limit=5000&offset=0' 11 | response = requests.get(url).json() 12 | print(len(response['results'])) 13 | -------------------------------------------------------------------------------- /scrapeCenter/spa6/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/14 10:48 4 | # Auth: HJY 5 | 6 | """js加密。有混淆""" 7 | 8 | import hashlib 9 | import requests 10 | import time 11 | import base64 12 | 13 | 14 | def get_token(): 15 | now_t = str(int(time.time())) 16 | _0x189cbb = ['/api/movie', now_t] 17 | _0xf7c3c7 = hashlib.sha1(','.join(_0x189cbb).encode('utf8')).hexdigest() 18 | _0x3c8435 = _0xf7c3c7 + ',' + now_t 19 | _0x104b5b = 
base64.b64encode(_0x3c8435.encode('utf8')).decode('utf8') 20 | return _0x104b5b 21 | 22 | url = 'https://spa6.scrape.center/api/movie/' 23 | headers = { 24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36' 25 | } 26 | params = { 27 | 'limit': 10, 28 | 'offset': 10, 29 | 'token': get_token(), 30 | } 31 | response = requests.get(url=url, headers=headers, params=params).json() 32 | print(len(response['results'])) 33 | -------------------------------------------------------------------------------- /scrapeCenter/spa6/demo.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | function _0x456254() { 4 | var _0x189cbb = f_3452(); 5 | for (var _0x5da681 = Math['round'](new Date()['getTime']() / 0x3e8)['toString'](), _0x2a83dd = arguments['length'], _0x31a891 = new Array(_0x2a83dd), _0x596a02 = 0x0; _0x596a02 < _0x2a83dd; _0x596a02++) 6 | _0x31a891[_0x596a02] = arguments[_0x596a02]; 7 | _0x31a891['push'](_0x5da681); 8 | console.log(_0x31a891); 9 | var _0xf7c3c7 = _0x189cbb['SHA1'](_0x31a891['join'](','))['toString'](_0x189cbb['enc']['Hex']) 10 | , _0x3c8435 = [_0xf7c3c7, _0x5da681]['join'](',') 11 | , _0x104b5b = _0x358b1f['encode'](_0x3c8435); 12 | return _0x104b5b; 13 | } 14 | 15 | 16 | _0x358b1f['encode'](_0x3c8435) 17 | 18 | _0x3c8435 = [_0xf7c3c7, _0x5da681]['join'](',') 19 | 20 | _0xf7c3c7 = sha1(_0x31a891['join'](','))['toString'](_0x189cbb['enc']['Hex']) 21 | _0x31a891 = ['/api/movie', time] 22 | _0x5da681 = time 23 | -------------------------------------------------------------------------------- /scrapeCenter/spa7/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/14 11:27 4 | # Auth: HJY 5 | 6 | """数据存储在js中""" 7 | 8 | import requests 9 | 10 | url = 'https://spa7.scrape.center/js/main.js' 11 | response = requests.get(url).text 12 | print(response) 13 | -------------------------------------------------------------------------------- /scrapeCenter/ssr1/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/13 17:37 4 | # Auth: HJY 5 | 6 | """静态网站。直接请求""" 7 | 8 | import requests 9 | from lxml import etree 10 | 11 | 12 | def parse_page(page): 13 | url = 'https://ssr1.scrape.center/page/{}'.format(page) 14 | headers = { 15 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36' 16 | } 17 | response = requests.get(url=url, headers=headers) 18 | response.encoding = response.apparent_encoding 19 | html = etree.HTML(response.text) 20 | info_element = html.xpath('//div[@class="el-col el-col-18 el-col-offset-3"]/div') 21 | for info in info_element: 22 | title = info.xpath('.//h2/text()')[0] 23 | types = ','.join(info.xpath('.//div[@class="categories"]//span/text()')) 24 | score = info.xpath('.//p[@class="score m-t-md m-b-n-sm"]/text()')[0].strip() 25 | item = {'标题': title, '类型': types, '评分': score} 26 | print(f'page: {page}, item: {item}') 27 | if info_element: 28 | parse_page(page + 1) 29 | 30 | 31 | parse_page(1) 32 | 33 | -------------------------------------------------------------------------------- /scrapeCenter/ssr2/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # 
Date: 2022/06/13 17:45 4 | # Auth: HJY 5 | 6 | """无证书。关闭证书验证即可""" 7 | 8 | import requests 9 | 10 | url = 'https://ssr2.scrape.center/' 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36' 13 | } 14 | response = requests.get(url, headers=headers, verify=False) 15 | print(response.text) 16 | -------------------------------------------------------------------------------- /scrapeCenter/ssr3/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: crawl.py 3 | # Date: 2022/06/13 17:46 4 | # Auth: HJY 5 | 6 | """加http验证""" 7 | 8 | import requests 9 | 10 | url = 'https://ssr3.scrape.center/' 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36', 13 | 'Authorization': 'Basic YWRtaW46YWRtaW4=' 14 | } 15 | response = requests.get(url, headers=headers) 16 | print(response.text) 17 | -------------------------------------------------------------------------------- /scrapeCenter/ssr4/crawl.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # NAME: scrapy.py 3 | # Date: 2022/06/13 17:49 4 | # Auth: HJY 5 | 6 | """做延时。异步加快速度""" 7 | 8 | import requests 9 | import asyncio 10 | import aiohttp 11 | from loguru import logger 12 | 13 | import time 14 | 15 | 16 | start_time = time.time() 17 | 18 | 19 | async def get(url): 20 | session = aiohttp.ClientSession() 21 | headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36', 23 | } 24 | response = await session.get(url, headers=headers, verify_ssl=False) 25 | await response.text() 26 | await session.close() 27 | return response 28 | 29 | 30 | async def start(page): 31 | url = f'https://ssr4.scrape.center/page/{page}' 32 | logger.info(f'get {url}') 33 | response = await get(url) 34 | logger.info(f'get {url} done, response.status={response.status}') 35 | 36 | 37 | tasks = [asyncio.ensure_future(start(page)) for page in range(1, 10)] 38 | loop = asyncio.get_event_loop() 39 | loop.run_until_complete(asyncio.wait(tasks)) 40 | end_time = time.time() 41 | logger.info(f'耗时: {end_time - start_time}') 42 | 43 | -------------------------------------------------------------------------------- /tweet/GetToken.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: monkey-hjy 3 | # @Date: 2021-02-24 17:20:13 4 | # @Last Modified by: monkey-hjy 5 | # @Last Modified time: 2021-02-24 17:20:32 6 | import requests 7 | 8 | 9 | class GetToken(object): 10 | """获取到游客token""" 11 | def __init__(self): 12 | self.get_token_url = 'https://api.twitter.com/1.1/guest/activate.json' 13 | self.get_token_headers = { 14 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36', 15 | 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', 16 | } 17 | 18 | def get_token(self, proxies_ip): 19 | proxies = { 20 | 'http': 'http://{}'.format(proxies_ip), 21 | 'https': 'http://{}'.format(proxies_ip), 22 | } 23 | err_count = 0 24 | while err_count < 5: 25 | try: 26 | response = requests.request(url=self.get_token_url, method="POST", 
headers=self.get_token_headers, 27 | timeout=15, proxies=proxies) 28 | response.close() 29 | return response.json().get('guest_token') 30 | except Exception as e: 31 | print(e) 32 | err_count += 1 33 | -------------------------------------------------------------------------------- /tweet/Tweet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: monkey-hjy 3 | # @Date: 2021-02-24 17:18:02 4 | # @Last Modified by: monkey-hjy 5 | # @Last Modified time: 2021-02-24 17:23:17 6 | from datetime import datetime 7 | import requests 8 | from GetToken import GetToken 9 | import random 10 | from prettytable import PrettyTable 11 | 12 | # 随机UA头 13 | USER_AGENT = [ 14 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 15 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 16 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 17 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 18 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 19 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 20 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 21 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 22 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 23 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 24 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 25 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 26 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 27 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 29 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 30 | ] 31 | 32 | 33 | class SearchTweet(GetToken): 34 | """ 35 | 根据关键词搜索推文或者用户 36 | 使用游客token进行抓取数据,没有次数限制 37 | 但是需要境外ip。。。 38 | """ 39 | 40 | def __init__(self): 41 | super().__init__() 42 | self.start = datetime.now() 43 | # 定义请求头。需要按照下面的代码去获取游客token 44 | self.headers = { 45 | 'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' 46 | '%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA', 47 | 'user-agent': random.choice(USER_AGENT), 48 | 'x-guest-token': self.get_token(proxies_ip='127.0.0.1:10809'), 49 | } 50 | # 获取数据的接口 51 | self.url = 'https://twitter.com/i/api/2/search/adaptive.json' 52 | 53 | def start_requests(self, search_key, search_type='tweet'): 54 | """ 55 | 开始搜索 56 | :param search_key: 搜索关键词 57 | :param search_type: 搜索类别。tweet/推文。 account/用户 58 | :return: 59 | """ 60 | params = { 61 | "q": search_key, 62 | "count": 20, 63 | } 64 | if search_type == 'account': 65 |
params['result_filter'] = 'user' 66 | response = requests.get(url=self.url, headers=self.headers, params=params, timeout=10) 67 | if response.status_code != 200: 68 | return f'{search_key} ERR === {response}' 69 | tweets = response.json().get('globalObjects').get('tweets') 70 | users = response.json().get('globalObjects').get('users') 71 | if not len(tweets) and not len(users): 72 | return f'{search_key}未抓到数据' 73 | p = PrettyTable() 74 | if search_type == 'tweet': 75 | tweet_id = [] 76 | create_time = [] 77 | full_text = [] 78 | user_name = [] 79 | screen_name = [] 80 | for key in tweets: 81 | tweet_id.append(key) 82 | create_time.append(tweets.get(key).get('created_at')) 83 | full_text.append(tweets.get(key).get('text')) 84 | user_id = tweets.get(key).get('user_id_str') 85 | user_name.append(users.get(user_id).get('name')) 86 | screen_name.append(users.get(user_id).get('screen_name')) 87 | p.add_column(fieldname='推文ID', column=tweet_id) 88 | p.add_column(fieldname='发文时间', column=create_time) 89 | p.add_column(fieldname='内容', column=full_text) 90 | p.add_column(fieldname='用户名', column=user_name) 91 | p.add_column(fieldname='账号', column=screen_name) 92 | else: 93 | user_name = [] 94 | screen_name = [] 95 | description = [] 96 | for key in users: 97 | user_name.append(users.get(key).get('name')) 98 | screen_name.append(users.get(key).get('screen_name')) 99 | description.append(users.get(key).get('description')) 100 | p.add_column(fieldname='用户名', column=user_name) 101 | p.add_column(fieldname='账号', column=screen_name) 102 | p.add_column(fieldname='简介', column=description) 103 | return p 104 | 105 | def run(self): 106 | search_key = ['葫芦娃', '奥特曼'] 107 | for key in search_key: 108 | result = self.start_requests(search_key=key, search_type='account') 109 | print(result) 110 | 111 | def __del__(self): 112 | end = datetime.now() 113 | print(f'开始:{self.start},结束:{end}\n用时:{end-self.start}') 114 | 115 | 116 | if __name__ == '__main__': 117 | t = SearchTweet() 118 | t.run() 119 | -------------------------------------------------------------------------------- /weather/weather.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: monkey-hjy 3 | # @Date: 2021-02-24 17:28:36 4 | # @Last Modified by: monkey-hjy 5 | # @Last Modified time: 2021-02-24 17:29:00 6 | # 中国天气网的接口。。。 7 | import requests 8 | from lxml import etree 9 | import pandas as pd 10 | from prettytable import PrettyTable 11 | import os 12 | 13 | 14 | def get_html(url): 15 | # 定义头文件 16 | headers = {'user-agent': 'Mozilla/5.0'} 17 | # 发起请求 18 | response = requests.get(url, headers=headers) 19 | # 修改编码 20 | response.encoding = 'utf8' 21 | # 处理成HTML格式 22 | html = etree.HTML(response.text) 23 | return html 24 | 25 | 26 | # 获取城市信息并保存到本地 27 | def get_cityinfo_write(html): 28 | print('获取城市信息') 29 | city_info = {} 30 | # 获取到城市信息 31 | province_url = html.xpath('//div[@class="lqcontentBoxheader"]//ul//li/a/@href') 32 | for i in range(len(province_url)): 33 | # 拼接出每个城市的URL,并获取到对应的HTML 34 | the_html = get_html('http://www.weather.com.cn' + province_url[i]) 35 | # 解析出城市名称 36 | city_name = the_html.xpath('//div[@class="conMidtab3"]//tr//td[position()<3]/a/text()') 37 | # 解析出城市链接 38 | city_url = the_html.xpath('//div[@class="conMidtab3"]//tr//td[position()<3]/a/@href') 39 | # 将城市信息存储到city_info中 40 | for j in range(len(city_name)): 41 | if j != 0 and city_name[j] == city_name[0]: 42 | break 43 | else: 44 | city_info[city_name[j]] = city_url[j] 45 | # 给数据设置列名 46 | data = 
pd.DataFrame(columns=['city_name', 'city_url']) 47 | # 填充数据 48 | data['city_name'] = city_info.keys() 49 | data['city_url'] = city_info.values() 50 | # 保存到本地 51 | data.to_csv(file_path, index=False, encoding='utf8') 52 | 53 | 54 | if __name__ == '__main__': 55 | # 实例化输出类 56 | p = PrettyTable() 57 | # 接口URL 58 | url = 'http://www.weather.com.cn/textFC/hb.shtml' 59 | # 调用获取HTML的方法 60 | html = get_html(url) 61 | file_path = '/home/monkey/File/中国天气网城市信息.csv' 62 | # 判断存放城市信息的数据文件是否存在。如果不存在,则调用get_cityinfo_write方法下载 63 | if not os.path.exists(file_path): 64 | get_cityinfo_write(html) 65 | # 读取城市信息 66 | data = pd.read_csv(file_path, encoding='utf8') 67 | # 获取到城市名称 68 | city_name = data['city_name'].tolist() 69 | # 获取到城市URL 70 | city_url = data['city_url'].tolist() 71 | # 让用户输入需要查询的城市 72 | name = input('请输入需要查询的城市名称:') 73 | # 如果名称输入正确,则进行查询 74 | if name in city_name: 75 | # 获取到当前城市天气信息的HTML 76 | city_html = get_html(city_url[city_name.index(name)]) 77 | # 解析出时间 78 | date = city_html.xpath('//ul[@class="t clearfix"]//li//h1/text()') 79 | # 解析出天气 80 | wea = city_html.xpath('//ul[@class="t clearfix"]//li/p[@class="wea"]/text()') 81 | # 解析出温度列表 82 | tem_list = ''.join(city_html.xpath('//ul[@class="t clearfix"]//li/p[@class="tem"]//text()')).split('\n') 83 | # 取出正确的数据 84 | tem = [tem_list[i] for i in range(len(tem_list)) if i % 2 != 0] 85 | # 解析出风量 86 | win = city_html.xpath('//ul[@class="t clearfix"]//li/p[@class="win"]/i/text()') 87 | print('{}的天气如下'.format(name)) 88 | # 把数据填充到表格中,美化输出 89 | p.add_column('日期', date) 90 | p.add_column('天气', wea) 91 | p.add_column('温度', tem) 92 | p.add_column('风量', win) 93 | print(p) 94 | else: 95 | print('输入的城市名称有误!') 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /weibo/get_fans_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: monkey-hjy 3 | # @Date: 2021-04-22 11:32:21 4 | # @Last Modified by: monkey-hjy 5 | # @Last Modified time: 2021-04-22 16:57:22 6 | from gevent import monkey; monkey.patch_all() 7 | import gevent.pool 8 | import json 9 | import requests 10 | import random 11 | import re 12 | import pymongo 13 | import datetime 14 | import redis 15 | 16 | 17 | class GetFansInfo(object): 18 | """获取某个账号粉丝的信息""" 19 | 20 | def __init__(self): 21 | self.mongo_conf = pymongo.MongoClient(host='127.0.0.1', port=27017) 22 | self.mongo_db = self.mongo_conf['data']['weibo'] 23 | self.redis_conf = redis.StrictRedis() 24 | # 参数1:用户ID。 25 | # 参数2:初始下标,下一页的下标会在本次请求返回 26 | self.get_fans_url = "https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{}&since_id={}" 27 | # 参数1:用户ID 28 | self.get_info_url = "https://weibo.com/p/100505{}/info?mod=pedit_more" 29 | self._headers = { 30 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36", 31 | } 32 | self.get_cookie() 33 | self.err_count = 0 34 | 35 | def __del__(self): 36 | self.redis_conf.close() 37 | self.mongo_conf.close() 38 | 39 | def get_response(self, url): 40 | """解析到对应URL的response""" 41 | err_count = 0 42 | while err_count < 5: 43 | try: 44 | response = requests.get(url, headers=self._headers) 45 | if response.status_code == 200: 46 | response.encoding = 'utf8' 47 | if 'Sina Visitor System' in response.text: 48 | raise Exception 49 | return response 50 | else: 51 | raise Exception 52 | except: 53 | err_count += 1 54 | self.get_cookie() 55 | return None 56 | 57 | def get_fans_info(self, user_info): 
58 | """获取粉丝的信息""" 59 | user_info = user_info['user'] 60 | response = self.get_response(url=self.get_info_url.format(user_info['id'])) 61 | if response is None: 62 | print('出错 === {}'.format(user_info)) 63 | return 64 | city = re.findall(r'所在地:.*?pt_detail\\">(.*?)<', response.text) 65 | city = city[0] if city else '其他' 66 | gender = re.findall(r'性别:.*?pt_detail\\">(.*?)<', response.text) 67 | gender = gender[0] if gender else '未知' 68 | reg_date = re.findall(r'注册时间:.*?pt_detail\\">(.*?)<', response.text) 69 | reg_date = reg_date[0].replace('\\n', '').replace('\\r', '').strip() if reg_date else '未知' 70 | item = { 71 | "the_fans_id": user_info['id'], 72 | "screen_name": user_info['screen_name'], 73 | "followers_count": user_info['followers_count'], 74 | "follow_count": user_info['follow_count'], 75 | "gender": gender, 76 | "city": city, 77 | "reg_date": reg_date 78 | } 79 | self.mongo_db.insert_one(item) 80 | 81 | def get_fans_id(self, user_id, since_id=0): 82 | """获取到某个用户的粉丝""" 83 | print(datetime.datetime.now(), user_id, since_id) 84 | if since_id >= 4999: 85 | return 86 | response = self.get_response(url=self.get_fans_url.format(user_id, since_id)) 87 | if response is None: 88 | print('哥们。这个用户解析好像有点问题....\t{} is None'.format(user_id)) 89 | return 90 | elif response.json()['ok'] == 0: 91 | print('哥们。这个用户解析好像有点问题....\t{}\t{}\t{}'.format(self.err_count, user_id, response.json())) 92 | if self.err_count < 10: 93 | self.err_count += 1 94 | self.get_fans_id(user_id, since_id) 95 | else: 96 | pip = self.redis_conf.pipeline() 97 | [pip.sadd('new_wb_user', info['user']['id']) for info in response.json()['data']['cards'][-1]['card_group']] 98 | pip.execute() 99 | try: 100 | next_since_id = response.json()['data']['cardlistInfo']['since_id'] 101 | if next_since_id: 102 | self.err_count = 0 103 | self.get_fans_id(user_id=user_id, since_id=next_since_id) 104 | except Exception as e: 105 | print(e, user_id, since_id, response.json()) 106 | 107 | @staticmethod 108 | def get_tid(): 109 | """获取TID参数""" 110 | url = 'https://passport.weibo.com/visitor/genvisitor?cb=gen_callback&fp={"os":"1","browser":"Chrome89,0,4389,128","fonts":"undefined","screenInfo":"1920*1080*24","plugins":"Portable Document Format::internal-pdf-viewer::Chrome PDF Plugin|::mhjfbmdgcfjbbpaeojofohoefgiehjai::Chrome PDF Viewer|::internal-nacl-plugin::Native Client"}' 111 | response = requests.get(url).text 112 | tid = re.findall(r'"tid":"(.*?)"', response)[0] 113 | return tid 114 | 115 | def get_cookie(self): 116 | """获取 SUB 和 SUBP """ 117 | tid = self.get_tid() 118 | while True: 119 | url = 'https://passport.weibo.com/visitor/visitor?a=incarnate&t={}&w=3&c=95&gc=&cb=cross_domain&from=weibo&_rand={}'.format( 120 | tid, random.random()) 121 | response = json.loads(re.findall(r'\((.*?)\)', requests.get(url).text)[0]) 122 | if response.get('retcode') == 20000000 and response.get('data').get('sub'): 123 | cookie = '' 124 | for key in response.get('data'): 125 | cookie += '{}={};'.format(key.upper(), response.get('data').get(key)) 126 | self._headers['cookie'] = cookie.rstrip(';') 127 | return response.get('data') 128 | else: 129 | tid = self.get_tid() 130 | 131 | def run(self): 132 | """启动函数""" 133 | user_ids = list(set([line.replace('\n', '') for line in open('大V.txt', encoding='utf8').readlines()])) 134 | exist = [line.replace('\n', '') for line in open('exist.txt', encoding='utf8').readlines()] 135 | # # # 1、高并发跑。会有IP封禁问题。自行选择。。。 136 | # pool = gevent.pool.Pool(50) 137 | # pool.map(self.get_fans_id, user_ids) 138 | 139 | # 
2、单线程跑。不会封禁IP。但是速度不是很快。 140 | for user_id in user_ids: 141 | if user_id in exist: 142 | continue 143 | self.get_fans_id(user_id) 144 | with open('exist.txt', encoding='utf8', mode='a') as f: 145 | f.write('{}\n'.format(user_id)) 146 | 147 | 148 | if __name__ == '__main__': 149 | t = GetFansInfo() 150 | t.run() 151 | """ 152 | 小时候 153 | 总是盼望着 154 | 盼望着有自己的零花钱 155 | 盼望着有一辆属于自己的自行车 156 | 盼望着玩到天黑不回家 157 | 盼望着妈妈不再唠叨我 158 | 159 | 长大了 160 | 总是想着 161 | 想着可以不用每天算计着花钱 162 | 想着可以真正的散散步 163 | 想着可以在家里休息一整天 164 | 想着可以每天陪着妈妈说话 165 | 166 | 听说 167 | 20岁的人 怀念童年 168 | 40岁的人 怀念青春 169 | 60岁的人 怀念壮年 170 | 只有那些孩子会缠着人问 171 | 妈妈 172 | 我什么时候长大呀 173 | ---- H 2021/4/26 上海 174 | ---- 结尾摘自 《儿时的夏日》 热评 175 | """ 176 | -------------------------------------------------------------------------------- /weibo/search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: 玛卡巴卡 3 | # Date: 2021/4/20 10:34 4 | import datetime 5 | import logging 6 | import re 7 | import time 8 | from multiprocessing.dummy import Pool as ThreadPool 9 | import requests 10 | import pandas as pd 11 | import random 12 | import os 13 | requests.packages.urllib3.disable_warnings() 14 | 15 | USER_AGENTS = [ 16 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36", 17 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 18 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 19 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 20 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 21 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 22 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 23 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 24 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 25 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 26 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 27 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 28 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 29 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 30 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 32 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 33 | ] 34 | 35 | 36 | class WeiBo(object): 37 | """ 38 | 按照固定的关键词搜索 39 | 采集得到的所有文章和评论信息 40 | """ 41 | 42 | def __init__(self): 43 | self.get_wb_url = 'https://m.weibo.cn/api/container/getIndex' 44 | self.comment_url = 'https://m.weibo.cn/comments/hotflow' 45 | self._headers = {'user-agent': ''} 46 | 
self.wb_info_list = dict() 47 | self.content_id = list() 48 | self.content = list() 49 | self.comment_id = list() 50 | self.comment = list() 51 | logging.basicConfig(level=logging.INFO, 52 | format='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s', 53 | datefmt='%Y-%m-%d %H:%M:%S', 54 | filename='f:/PDemo/spider_log/{}-{}.log'.format(__file__.split('/')[-1].split('.')[0], str(datetime.datetime.now()).split(" ")[0]), 55 | filemode='a') 56 | 57 | def get_response(self, url, params=None, cookie=None): 58 | """发起请求""" 59 | err_count = 0 60 | while err_count < 5: 61 | try: 62 | time.sleep(1) 63 | if cookie is not None: 64 | self._headers['cookie'] = cookie 65 | else: 66 | self._headers = {'user-agent': random.choice(USER_AGENTS)} 67 | response = requests.get(url, params=params, headers=self._headers) 68 | if response.status_code == 200: 69 | return response 70 | else: 71 | err_count += 1 72 | time.sleep(30) 73 | except: 74 | err_count += 1 75 | return None 76 | 77 | def get_wb_id(self, keyword, page): 78 | """获取微博ID""" 79 | wb_id_list = [] 80 | params = { 81 | 'containerid': '100103type=1&q={}'.format(keyword), 82 | 'page_type': 'searchall', 83 | 'page': page, 84 | } 85 | response = self.get_response(url=self.get_wb_url, params=params) 86 | if response is None: 87 | logging.error('- 关键词:{},页码:{}\t出错'.format(keyword, page)) 88 | return 89 | response = response.json()['data']['cards'] 90 | for info in response: 91 | try: 92 | try: 93 | self.wb_info_list[info['mblog']['id']] = info['mblog']['comments_count'] 94 | wb_id_list.append([info['mblog']['id'], info['mblog']['comments_count']]) 95 | except: 96 | self.wb_info_list[info['card_group'][0]['mblog']['id']] = info['card_group'][0]['mblog'][ 97 | 'comments_count'] 98 | wb_id_list.append([info['card_group'][0]['mblog']['id'], info['card_group'][0]['mblog']['comments_count']]) 99 | except Exception as e: 100 | pass 101 | logging.info('{}\t{}\t{}'.format(keyword, page, len(wb_id_list))) 102 | if wb_id_list: 103 | return True 104 | else: 105 | return False 106 | 107 | def get_wb_content(self, id): 108 | """获取微博原文""" 109 | url = 'https://m.weibo.cn/statuses/extend?id={}'.format(id) 110 | response = self.get_response(url=url) 111 | if response is None: 112 | return 113 | try: 114 | content = re.sub('<.*?>', '', response.json()['data']['longTextContent']) 115 | self.content_id.append(id) 116 | self.content.append(content) 117 | logging.info('- {}\t{}'.format(id, len(content))) 118 | except Exception as e: 119 | logging.error('- {}\t{}'.format(e, id)) 120 | 121 | def get_wb_comment(self, wb_id): 122 | """获取微博评论""" 123 | max_id = 0 124 | max_id_type = 0 125 | while True: 126 | time.sleep(2) 127 | params = { 128 | 'id': wb_id, 129 | 'mid': wb_id, 130 | 'max_id': max_id, 131 | 'max_id_type': max_id_type, 132 | } 133 | err_count = 0 134 | while err_count < 4: 135 | response = self.get_response(url=self.comment_url, params=params, cookie='用户登录m.weibo.cn的cookie') 136 | if response is None: 137 | logging.error('{}出错'.format(wb_id)) 138 | return 139 | try: 140 | response.json() 141 | except: 142 | logging.error('转JSON失败 --- {}'.format(response.text)) 143 | return None 144 | if response.json()['ok']: 145 | try: 146 | response = response.json()['data'] 147 | logging.info('- {}\t{}\t{}'.format(wb_id, max_id, len(response['data']))) 148 | for info in response['data']: 149 | self.comment_id.append(wb_id) 150 | self.comment.append(re.sub('<.*?>', '', info['text'])) 151 | # 获取到下一页的ID,当作下次的参数使用 152 | next_max_id = response['max_id'] 153 | max_id_type 
= response['max_id_type'] 154 | if next_max_id == 0: 155 | return 156 | logging.info('- 下一页{}'.format(next_max_id)) 157 | max_id = next_max_id 158 | time.sleep(1) 159 | break 160 | except Exception as e: 161 | err_count += 1 162 | time.sleep(5) 163 | logging.error('- {}\t{}\t{}'.format(wb_id, err_count, e)) 164 | if err_count == 4: 165 | time.sleep(30) 166 | return 167 | else: 168 | logging.error('- {}\t{}'.format(response.json(), params)) 169 | return 170 | 171 | def run(self): 172 | """启动函数""" 173 | keyword_list = ['在这里放需要搜索的关键词'] 174 | for keyword in keyword_list: 175 | self.__init__() 176 | logging.info('=== {} ==='.format(keyword)) 177 | flag = True 178 | page = 1 179 | while flag: 180 | the_page_wb_id = self.get_wb_id(keyword=keyword, page=page) 181 | if the_page_wb_id: 182 | page += 1 183 | else: 184 | break 185 | logging.info(len(self.wb_info_list)) 186 | pool = ThreadPool(20) 187 | pool.map(self.get_wb_content, list(self.wb_info_list.keys())) 188 | for key in self.wb_info_list.keys(): 189 | if self.wb_info_list[key]: 190 | self.get_wb_comment(wb_id=key) 191 | 192 | content_data = pd.DataFrame({ 193 | '微博ID': self.content_id, 194 | '微博正文': self.content 195 | }) 196 | 197 | comment_data = pd.DataFrame({ 198 | '微博ID': self.comment_id, 199 | '评论': self.comment 200 | }) 201 | 202 | """ 203 | 可以在此对数据进行持久化保存 204 | """ 205 | 206 | 207 | if __name__ == '__main__': 208 | t = WeiBo() 209 | t.run() 210 | -------------------------------------------------------------------------------- /weibo/search_all.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | 4 | import pandas as pd 5 | import requests 6 | import random 7 | import re 8 | import datetime 9 | 10 | from lxml import etree 11 | 12 | 13 | class GetFansInfo(object): 14 | """搜索微博""" 15 | 16 | def __init__(self): 17 | self._headers = { 18 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36", 19 | } 20 | self.wb_id = list() 21 | self.user_name = list() 22 | self.content = list() 23 | self.create_date = list() 24 | self.img_list = list() 25 | 26 | @staticmethod 27 | def get_tid(): 28 | """获取TID参数""" 29 | url = 'https://passport.weibo.com/visitor/genvisitor?cb=gen_callback&fp={"os":"1","browser":"Chrome89,0,4389,128","fonts":"undefined","screenInfo":"1920*1080*24","plugins":"Portable Document Format::internal-pdf-viewer::Chrome PDF Plugin|::mhjfbmdgcfjbbpaeojofohoefgiehjai::Chrome PDF Viewer|::internal-nacl-plugin::Native Client"}' 30 | response = requests.get(url).text 31 | tid = re.findall(r'"tid":"(.*?)"', response)[0] 32 | return tid 33 | 34 | def get_cookie(self): 35 | """获取 SUB 和 SUBP """ 36 | tid = self.get_tid() 37 | while True: 38 | url = 'https://passport.weibo.com/visitor/visitor?a=incarnate&t={}&w=3&c=95&gc=&cb=cross_domain&from=weibo&_rand={}'.format( 39 | tid, random.random()) 40 | response = json.loads(re.findall(r'\((.*?)\)', requests.get(url).text)[0]) 41 | if response.get('retcode') == 20000000 and response.get('data').get('sub'): 42 | cookie = '' 43 | for key in response.get('data'): 44 | cookie += '{}={};'.format(key.upper(), response.get('data').get(key)) 45 | self._headers['cookie'] = cookie.rstrip(';') 46 | return response.get('data') 47 | else: 48 | tid = self.get_tid() 49 | 50 | def search(self): 51 | start_date = datetime.datetime.strptime('2020-12-11', '%Y-%m-%d') 52 | end_date = datetime.datetime.now() - datetime.timedelta(days=1) 53 | while start_date <= end_date: 54 | 
timescope1 = '{}-{}'.format(str(start_date).split()[0], start_date.hour) 55 | start_date += datetime.timedelta(hours=6) 56 | timescope2 = '{}-{}'.format(str(start_date).split()[0], start_date.hour) 57 | timescope = 'custom:{}:{}'.format(timescope1, timescope2) 58 | url = 'https://s.weibo.com/weibo' 59 | params = { 60 | 'q': '华夏家博会', 61 | 'typeall': '1', 62 | 'suball': '1', 63 | 'timescope': timescope, 64 | 'Refer': 'g', 65 | 'page': '1', 66 | } 67 | response = requests.get(url, headers=self._headers, params=params) 68 | response.encoding = 'utf8' 69 | if '未找到“华夏家博会”相关结果' in response.text: 70 | print(timescope, '无数据') 71 | continue 72 | html = etree.HTML(response.content) 73 | wb_info = html.xpath('//div[@action-type="feed_list_item"]') 74 | wb_id = html.xpath('//div[@action-type="feed_list_item"]/@mid') 75 | print(timescope, len(wb_info)) 76 | for i in range(len(wb_info)): 77 | info = wb_info[i] 78 | user_name = info.xpath('.//a[@class="name"]/text()') 79 | content = ''.join(info.xpath('.//p[@class="txt"]//text()')) 80 | img_url = info.xpath('.//div[@node-type="feed_list_media_prev"]//img/@src') 81 | create_date = info.xpath('.//p[@class="from"]/a[1]/text()') 82 | if not user_name: 83 | continue 84 | self.wb_id.append(wb_id[i]) 85 | self.user_name.append(user_name[0].strip()) 86 | self.content.append(content) 87 | self.img_list.append(img_url) 88 | self.create_date.append(create_date[0].strip()) 89 | # item = { 90 | # 'ID': wb_id[i], 91 | # '用户名': user_name[0].strip(), 92 | # '内容': content, 93 | # '图片链接': img_url, 94 | # '时间': create_date[0].strip(), 95 | # } 96 | # print(item) 97 | time.sleep(3) 98 | data = pd.DataFrame({ 99 | 'ID': self.wb_id, 100 | '用户名': self.user_name, 101 | '内容': self.content, 102 | '图片链接': self.img_list, 103 | '时间': self.create_date, 104 | }) 105 | data.to_excel('微博.xlsx', index=False) 106 | 107 | def run(self): 108 | """启动函数""" 109 | self.get_cookie()  # 搜索接口依赖游客cookie,先获取再搜索 110 | self.search() 111 | 112 | if __name__ == '__main__': 113 | t = GetFansInfo() 114 | t.run() 115 | -------------------------------------------------------------------------------- /weibo/weibo_comment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Author: 玛卡巴卡 3 | # Date: 2021/4/19 17:10 4 | 5 | import requests 6 | import time 7 | 8 | 9 | class WBComment(object): 10 | """抓取微博全量评论。但是需要登录""" 11 | 12 | def __init__(self): 13 | self.comment_url = 'https://m.weibo.cn/comments/hotflow' 14 | self._headers = { 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36', 16 | 'cookie': '用户登录后的cookie', 17 | } 18 | 19 | def get_response(self, url, params=None): 20 | """发起请求""" 21 | response = requests.get(url=url, headers=self._headers, params=params) 22 | if response.status_code == 200: 23 | return response 24 | else: 25 | print('出错。返回的状态码是:{}'.format(response.status_code)) 26 | return None 27 | 28 | def start(self, wb_id): 29 | """启动函数,接受微博ID参数""" 30 | # 初始页码的ID。下一页的ID会存放在返回的数据中 31 | max_id = 0 32 | while True: 33 | params = { 34 | 'id': wb_id, 35 | 'mid': wb_id, 36 | 'max_id': max_id, 37 | 'max_id_type': 1, 38 | } 39 | response = self.get_response(url=self.comment_url, params=params) 40 | if response is None: 41 | print('{}出错'.format(wb_id)) 42 | return 43 | response = response.json()['data'] 44 | print(max_id, len(response['data']), response['data'][0]['text']) 45 | # 获取到下一页的ID,当作下次的参数使用 46 | max_id = response['max_id'] 47 | if max_id == 0:  # max_id为0说明评论已翻到最后一页 48 | return 49 | time.sleep(1)
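50 | 51 | 52 | # 使用示意:wb_id为占位符,需替换成真实的微博ID,并先在上方的headers里填好登录后的cookie 53 | if __name__ == '__main__': 54 | t = WBComment() 55 | t.start(wb_id='这里放微博ID') 56 |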
-------------------------------------------------------------------------------- /youdao/yd_tran.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: monkey-hjy 3 | # @Date: 2021-04-27 11:35:40 4 | # @Last Modified by: monkey-hjy 5 | # @Last Modified time: 2021-04-27 11:36:08 6 | import requests 7 | import hashlib 8 | import time 9 | import random 10 | 11 | 12 | class YDDict(object): 13 | """有道翻译""" 14 | 15 | @staticmethod 16 | def get_data(keyword): 17 | """获取到其余的加密参数""" 18 | md = hashlib.md5() 19 | t = str(int(time.time() * 1000)) 20 | i = t + str(random.randrange(10)) 21 | md.update('fanyideskweb{}{}Tbh5E8=q6U3EXe+&L[4c@'.format(keyword, i).encode('utf8')) 22 | sign = md.hexdigest() 23 | return t, i, sign 24 | 25 | def translate(self, keyword='你好', data_from='AUTO', data_to='AUTO'): 26 | """ 27 | 对keyword进行翻译 28 | params: data_from 文本语言 29 | params: data_to 翻译成的语言类型 30 | """ 31 | url = 'https://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule' 32 | headers = { 33 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36', 34 | 'Referer': 'https://fanyi.youdao.com/?keyfrom=fanyi-new.logo', 35 | 'Host': 'fanyi.youdao.com', 36 | 'Origin': 'https://fanyi.youdao.com', 37 | 'Cache-Control': 'no-cache', 38 | 'Connection': 'keep-alive', 39 | } 40 | t, i, sign = self.get_data(keyword) 41 | data = { 42 | "i": keyword, 43 | "from": data_from, 44 | "to": data_to, 45 | "smartresult": "dict", 46 | "client": "fanyideskweb", 47 | "salt": i, 48 | "sign": sign, 49 | "lts": t, 50 | # 这里bv是对UA加密得到的,所以也写成了定值 51 | "bv": "62c1eba97402d4ff4eb261254e974c27", 52 | "doctype": "json", 53 | "version": "2.1", 54 | "keyfrom": "fanyi.web", 55 | "action": "FY_BY_REALTlME", 56 | } 57 | response = requests.post(url, headers=headers, data=data) 58 | # json中包含结果,自己解析一下OK 59 | print(response.json()) 60 | 61 | 62 | if __name__ == '__main__': 63 | t = YDDict() 64 | t.translate(keyword='中国') 65 | -------------------------------------------------------------------------------- /zhihu/public_func.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # File: public_func.py 3 | # Date: 2024/1/5 11:03 4 | # Auth: HJY 5 | # Desc: 6 | import ctypes 7 | import os 8 | import random 9 | import time 10 | from datetime import datetime 11 | from hashlib import md5 12 | from urllib.parse import urlparse, parse_qs, urlencode 13 | 14 | import requests 15 | from requests import utils 16 | from loguru import logger 17 | 18 | # h:签名依赖的固定数组 19 | h = { 20 | "zb": [20, 223, 245, 7, 248, 2, 194, 209, 87, 6, 227, 253, 240, 128, 222, 91, 237, 9, 125, 157, 230, 93, 252, 21 | 205, 90, 79, 144, 199, 159, 197, 186, 167, 39, 37, 156, 198, 38, 42, 43, 168, 217, 153, 15, 103, 80, 189, 22 | 71, 191, 97, 84, 247, 95, 36, 69, 14, 35, 12, 171, 28, 114, 178, 148, 86, 182, 32, 83, 158, 109, 22, 255, 23 | 94, 238, 151, 85, 77, 124, 254, 18, 4, 26, 123, 176, 232, 193, 131, 172, 143, 142, 150, 30, 10, 146, 162, 24 | 62, 224, 218, 196, 229, 1, 192, 213, 27, 110, 56, 231, 180, 138, 107, 242, 187, 54, 120, 19, 44, 117, 25 | 228, 215, 203, 53, 239, 251, 127, 81, 11, 133, 96, 204, 132, 41, 115, 73, 55, 249, 147, 102, 48, 122, 26 | 145, 106, 118, 74, 190, 29, 16, 174, 5, 177, 129, 63, 113, 99, 31, 161, 76, 246, 34, 211, 13, 60, 68, 27 | 207, 160, 65, 111, 82, 165, 67, 169, 225, 57, 112, 244, 155, 51, 236, 200, 233, 58, 61, 47, 100, 137, 28 | 185,
           64, 17, 70, 234, 163, 219, 108, 170, 166, 59, 149, 52, 105, 24, 212, 78, 173, 45, 0, 116, 226, 119,
           136, 206, 135, 175, 195, 25, 92, 121, 208, 126, 139, 3, 75, 141, 21, 130, 98, 241, 40, 154, 66, 184, 49,
           181, 46, 243, 88, 101, 183, 8, 23, 72, 188, 104, 179, 210, 134, 250, 201, 164, 89, 216, 202, 220, 50,
           221, 152, 140, 33, 235, 214],
    "zk": [1170614578, 1024848638, 1413669199, -343334464, -766094290, -1373058082, -143119608, -297228157,
           1933479194, -971186181, -406453910, 460404854, -547427574, -1891326262, -1679095901, 2119585428,
           -2029270069, 2035090028, -1521520070, -5587175, -77751101, -2094365853, -1243052806, 1579901135,
           1321810770, 456816404, -1391643889, -229302305, 330002838, -788960546, 363569021, -1947871109],
    "zm": [120, 50, 98, 101, 99, 98, 119, 100, 103, 107, 99, 119, 97, 99, 110, 111]
}
# salt: character table the final signature string is built from
salt = '6fpLRqJO8M/c3jnYxFkUVC4ZIG12SiH=5v0mXDazWBTsuw7QetbKdoPyAl+hN9rgE'
# base_list: fixed array used by the second XOR/offset step
base_list = [48, 53, 57, 48, 53, 51, 102, 55, 100, 49, 53, 101, 48, 49, 100, 55]


class PublicFunc:

    def __init__(self, log_name='default') -> None:
        self.now_date = datetime.now().strftime('%Y%m%d')
        log_path = '/data/log' if os.path.exists('/data/log') else '/Users/monkey/Documents/log'
        logger.add(os.path.join(log_path, f'{log_name}_{self.now_date}.log'), encoding='utf-8',
                   enqueue=True, retention='10 days')
        self._headers = {
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
        }

    @staticmethod
    def parse_params(url):
        url = urlparse(url)
        params = {k: v[0] for k, v in parse_qs(url.query).items()}
        return params

    @staticmethod
    def get_proxies():
        return {
            'http': 'xxx',
            'https': 'xxx',
        }

    def get_response(self, url, params=None, data=None, headers=None, method='get', cookies=None):
        err_count = 0
        e_res = None
        while err_count < 5:
            proxies = self.get_proxies()
            try:
                headers = self._headers if headers is None else headers
                if method == 'get':
                    response = requests.get(url, params=params, timeout=15, headers=headers, proxies=proxies, cookies=cookies)
                elif method == 'post':
                    # proxies was missing here in the original; POST now goes through the proxy too
                    response = requests.post(url, data=data, timeout=15, headers=headers, proxies=proxies, cookies=cookies)
                else:
                    return None
                if response.status_code == 200:
                    response.encoding = 'utf8'
                    if '网络不给力,请稍后重试' in response.text and 'paging' not in response.text:
                        raise Exception('网络不给力,请稍后重试')
                    if '安全验证' in response.text and 'paging' not in response.text:
                        raise Exception('安全验证')
                    return response
                if '"code":4041,"name":"NotFoundError","message":"资源不存在"' in response.text:
                    return response
                raise Exception(response.status_code)
            except Exception as e:
                err_count += 1
                e_res = e
        # after 5 failed attempts the last exception is returned so the caller can log it
        return e_res

    @staticmethod
    def encrypt_md5(md5_str):
        """md5 digest as a hex string"""
        md5_obj = md5()
        md5_obj.update(md5_str.encode())
        return md5_obj.hexdigest()

    @staticmethod
    def str_to_unicode(translate_str):
        """Convert a str into a list of integer code points via ord()."""
        ord_list = list()
        for str_ in translate_str:
            ord_list.append(ord(str_))
        return ord_list

    @staticmethod
    def add_params_to_list(ord_list):
        """
        Pad ord_list into the full block to be encrypted:
        the first part is a random number (int(random() * 127));
        the second part is 0;
        the third part is ord_list itself;
        the three parts above form an array of length 34;
        the fourth part is [14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14],
        giving a final array of length 48.
        :param ord_list:
        :return:
        """
        params_list = list()
        random_num = int(random.random() * 127)  # random value, makes each signature different
        params_list.append(random_num)
        params_list.append(0)
        params_list.extend(ord_list)
        params_list.extend([14 for i in range(14)])
        return params_list

    @staticmethod
    def get_head_16(params_list):
        """
        Take the first 16 entries of params_list and XOR each of them
        with the fixed array base_list and the constant 42:
        base_list = [48, 53, 57, 48, 53, 51, 102, 55, 100, 49, 53, 101, 48, 49, 100, 55]
        :param params_list:
        :return:
        """
        head_16_list = [params_list[index] ^ base_list[index] ^ 42 for index in range(16)]
        return head_16_list

    def js_func_g_x(self, e, t):
        """
        Python port of the JS function __g.x
        :param e:
        :param t:
        :return:
        """
        n = list()
        r = len(e) // 16
        # process e in blocks of 16
        for i in range(0, r):
            a = [0 for i in range(16)]  # 16-element list
            o = e[16 * i: 16 * (i + 1)]
            for c in range(16):
                a[c] = o[c] ^ t[c]
            t = self.js_func_g_r(a)
            n.extend(t)
        return n

    def js_func_g_r(self, e):
        """
        Python port of the JS function __g.r
        :param e:
        :return:
        """
        t = [0 for i in range(16)]  # 16-element list
        n = [0 for j in range(36)]  # 36-element list
        n[0] = self.js_func_b(e, 0)
        n[1] = self.js_func_b(e, 4)
        n[2] = self.js_func_b(e, 8)
        n[3] = self.js_func_b(e, 12)
        for r in range(32):
            o = self.js_func_g(n[r + 1] ^ n[r + 2] ^ n[r + 3] ^ h.get('zk')[r])
            n[r + 4] = n[r] ^ o
        self.js_func_i(n[35], t, 0)
        self.js_func_i(n[34], t, 4)
        self.js_func_i(n[33], t, 8)
        self.js_func_i(n[32], t, 12)
        return t

    @staticmethod
    def js_func_b(e, t):
        """
        Python port of the JS function B (pack 4 bytes into a 32-bit int)
        :param e:
        :param t:
        :return:
        """
        return (255 & e[t]) << 24 | (255 & e[t + 1]) << 16 | (255 & e[t + 2]) << 8 | 255 & e[t + 3]

    def js_func_g(self, e):
        """
        Python port of the JS function G
        :param e:
        :return:
        """

        t = [0 for i in range(4)]  # 4-element list (the original comment said 16 by mistake)
        n = [0 for j in range(4)]  # 4-element list (the original comment said 36 by mistake)
        self.js_func_i(e, t, 0)  # js_func_i splits e into the 4 bytes of t
        n[0] = h.get('zb')[255 & t[0]]
        n[1] = h.get('zb')[255 & t[1]]
        n[2] = h.get('zb')[255 & t[2]]
        n[3] = h.get('zb')[255 & t[3]]
        r = self.js_func_b(n, 0)
        res = r ^ self.js_func_q(r, 2) ^ self.js_func_q(r, 10) ^ self.js_func_q(r, 18) ^ self.js_func_q(r, 24)
        return res

    def js_func_q(self, e, t):
        """
        Python port of the JS function Q
        :param e:
        :param t:
        :return:
        """
        res = (4294967295 & e) << t | self.unsigned_right_shift(e, 32 - t)
        return res

    def js_func_i(self, e, t, n):
        """
        Python port of the JS function i (split a 32-bit int into 4 bytes)
        :param e:
        :param t:
        :param n:
        :return:
        """
        t[n] = 255 & self.unsigned_right_shift(e, 24)
        t[n + 1] = 255 & self.unsigned_right_shift(e, 16)
        t[n + 2] = 255 & self.unsigned_right_shift(e, 8)
        t[n + 3] = 255 & e

    def unsigned_right_shift(self, n, i):
        # renamed from the original's "unsigned_right_shitf" typo
        # if the number is negative, reinterpret it as a 32-bit unsigned int first
        if n < 0:
            n = ctypes.c_uint32(n).value
        # a shift count is normally non-negative; to mirror JS semantics,
        # a negative count turns the right shift into a left shift
        if i < 0:
            return -self.int_overflow(n << abs(i))
        return self.int_overflow(n >> i)

    @staticmethod
    def int_overflow(val):
        maxint = 2147483647
        if not -maxint - 1 <= val <= maxint:
            val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
        return val

    @staticmethod
    def get_result_value_list(new_48_list):
        """Pack the 48 bytes into 16 numeric values (3 bytes each)."""
        # slice the list into chunks of 3 ([i:i+3]) and reverse them
        # (the original comment's "饭庄" was a typo for "翻转", i.e. reverse)
        result_value_list = list()
        split_list = [new_48_list[i:i + 3] for i in range(0, len(new_48_list), 3)]
        split_list.reverse()
        for i in range(len(split_list)):
            _temp_list = split_list[i]
            _temp_list.reverse()
            _val = i % 4
            if _val == 0:
                temp_value_1 = _temp_list[_val] ^ 58
                temp_value_2 = _temp_list[1] << 8
                temp_value_3 = _temp_list[2] << 16
            elif _val == 1:
                temp_value_1 = _temp_list[0]
                temp_value_2 = (_temp_list[_val] ^ 58) << 8
                temp_value_3 = _temp_list[2] << 16
            elif _val == 2:
                temp_value_1 = _temp_list[0]
                temp_value_2 = _temp_list[1] << 8
                temp_value_3 = (_temp_list[_val] ^ 58) << 16
            else:
                temp_value_1 = _temp_list[0]
                temp_value_2 = _temp_list[1] << 8
                temp_value_3 = _temp_list[2] << 16
            value = temp_value_1 | temp_value_2 | temp_value_3
            result_value_list.append(value)
        return result_value_list

    @staticmethod
    def make_zhihu_sign(result_value_list):
        """Build the signature string by indexing into the salt table."""
        sign_str = ''
        for _value in result_value_list:
            sign_str += salt[_value & 63]
            sign_str += salt[_value >> 6 & 63]
            sign_str += salt[_value >> 12 & 63]
            sign_str += salt[_value >> 18 & 63]
        return sign_str

    def test_case(self, url, d_c0):
        """Build the x-zse-96 signature for a url + d_c0 pair."""
        md5_str = '101_3_3.0+' + url + d_c0
        md5_res = self.encrypt_md5(md5_str)
        ord_list = self.str_to_unicode(md5_res)
        params_list = self.add_params_to_list(ord_list)
        head_16_list = self.get_head_16(params_list)
        end_32_list = params_list[16:]
        new_16_list = self.js_func_g_r(head_16_list)
        new_32_list = self.js_func_g_x(end_32_list, new_16_list)
        new_48_list = list()
        new_48_list.extend(new_16_list)
        new_48_list.extend(new_32_list)
        result_value_list = self.get_result_value_list(new_48_list)
        sign_str = self.make_zhihu_sign(result_value_list)
        return sign_str

    def get_cookie_d_c0(self, proxies=None):
        end_sign = self.test_case('/udid', '')
        headers = {
            'x-zse-93': '101_3_3.0',
            'x-api-version': '3.0.91',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
            'x-zse-96': '2.0_' + end_sign,
            'accept': '*/*',
        }
        d_c0 = None
        err_count = 0
        while err_count <= 10:
            try:
                first_res = requests.post('https://www.zhihu.com/udid', data={}, headers=headers, proxies=proxies,
                                          timeout=60)
                cookie_t = utils.dict_from_cookiejar(first_res.cookies)
                d_c0 = cookie_t.get('d_c0')
                return d_c0
            except Exception as e:
                err_count += 1
                time.sleep(random.randint(1, 10))
                logger.error(f'get_cookie_d_c0 err_count:{err_count}, proxies: {proxies}, e: {e}')
        return d_c0

    def _get_end_sign(self, md5_str):
        # md5_str = '101_3_3.0+' + url + d_c0
        md5_res = self.encrypt_md5(md5_str)
        ord_list = self.str_to_unicode(md5_res)
        params_list = self.add_params_to_list(ord_list)
        head_16_list = self.get_head_16(params_list)
        end_32_list = params_list[16:]
        new_16_list = self.js_func_g_r(head_16_list)
        new_32_list = self.js_func_g_x(end_32_list, new_16_list)
        new_48_list = list()
        new_48_list.extend(new_16_list)
        new_48_list.extend(new_32_list)
        result_value_list = self.get_result_value_list(new_48_list)
        sign_str = self.make_zhihu_sign(result_value_list)
        return sign_str

    @staticmethod
    def get_headers(d_c0, end_sign):
        headers = {
            "cookie": f"d_c0={d_c0};",
            'x-zse-93': '101_3_3.0',
            'x-api-version': '3.0.91',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
            'x-zse-96': '2.0_' + end_sign,
            'accept': '*/*',
            # 'referer': 'https://www.zhihu.com/search?q=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E7%B4%A2%E9%9A%86%E8%BA%AB%E4%B8%96%E6%8F%AD%E7%A7%98&type=zvideo&utm_content=search_hot',
            'accept-encoding': 'gzip, deflate',
            'accept-language': 'zh-CN,zh;q=0.9',
        }
        return headers

    def run(self, keyword):
        url = f'https://www.zhihu.com/api/v4/search_v3?gk_version=gz-gaokao&t=general&q={keyword}&correction=1&offset=0&limit=20&filter_fields=&lc_idx=0&show_all_topics=0&search_source=Filter&vertical=answer&time_interval=a_week'
        # the original called these through the module-level instance `t`,
        # which only worked because of the __main__ block below; use self instead
        url_params = self.parse_params(url)
        params = url_params
        offset = url_params.get('offset', 0)
        req_url = 'https://www.zhihu.com/api/v4/search_v3'
        retry_num = 0
        while True:
            d_c0 = self.get_cookie_d_c0()
            end_sign = self._get_end_sign(f'101_3_3.0+/api/v4/search_v3?{urlencode(params)}+{d_c0}')
            headers = self.get_headers(d_c0, end_sign)
            response = self.get_response(url=req_url, headers=headers, params=params)
            if isinstance(response, requests.Response):
                break
            retry_num += 1
            logger.error(
                f'search keyword retried {retry_num} times! keyword: {keyword}, offset: {offset}, e: {response}')
            if retry_num > 50:
                return
        response = response.json()
        return response


if __name__ == '__main__':
    t = PublicFunc()
    keyword = '海贼王'
    t.run(keyword)

--------------------------------------------------------------------------------
/ziru/zr.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from PIL import Image
import pytesseract
import re
import time
import os
import pymysql


class Ziru(object):

    def __init__(self):
        self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
        self.city_info = dict()
        # directory this file lives in; used as scratch space for the price images
        self.cwd = os.path.dirname(os.path.abspath(__file__))
        self.conn = pymysql.Connection(host='localhost', user='root', password='root', database='demo', port=3306)
        self.cursor = self.conn.cursor()

    def __del__(self):
        self.conn.close()

    def get_response(self, url):
        response = requests.get(url, headers=self._headers)
        if response.status_code == 200:
            response.encoding = response.apparent_encoding
            return response
        else:
            print(response.status_code)
            return None

    def get_city_info(self):
        response = self.get_response(url='https://www.ziroom.com/')
        if response is None:
            return
        html = etree.HTML(response.text)
        city_name = html.xpath('//a[@class="Z_city_option ani"]/text()')
        city_url = html.xpath('//a[@class="Z_city_option ani"]/@href')
        self.city_info = dict(zip(city_name, city_url))

    @staticmethod
    def image_identification(img_path):
        """OCR the sprite image that holds the price digits."""
        the_img = Image.open(img_path)
        result = pytesseract.image_to_string(the_img, config='--psm 7')
        os.remove(img_path)
        return list(result.strip())

    def get_zone_info(self, city_url):
        response = self.get_response(city_url + 'z/')
        if response is None:
            return
        html = etree.HTML(response.text)
        zone_url = html.xpath('//a[text()="区域"]/following-sibling::div/a/@href')
        zone_name = html.xpath('//a[text()="区域"]/following-sibling::div/a/text()')
        zone_info = dict(zip(zone_name, zone_url))
        for key in zone_info:
            print('Fetching data for {}'.format(key))
            self.get_room_info('https:{}'.format(zone_info[key]))

    def get_room_info(self, url):
        response = self.get_response(url)
        if response is None:
            print('Failed to fetch {}'.format(url))
            return
        print(url)
        html = etree.HTML(response.text)
        title = html.xpath('//h5[starts-with(@class, "title")]/a/text()')
        room_url = ['https:{}'.format(info) for info in html.xpath('//h5[starts-with(@class, "title")]/a/@href')]
        desc = html.xpath('//div[@class="desc"]/div[1]/text()')
        location = [info.strip() for info in html.xpath('//div[@class="location"]/text()')]
        room_price = list()
        room_element = html.xpath('//div[@class="Z_list"]/div[2]/div')
        for element in room_element:
            price = ''
            img_url = element.xpath('.//span[@class="num"]/@style')
            if not img_url:
                continue
            img_url = re.findall(r'url\((.*?)\)', img_url[0])[0]
            price_position = [float(re.findall(r'position: -(.*?)px', info)[0]) for info in element.xpath('.//span[@class="num"]/@style')]
            img_path = os.path.join(self.cwd, img_url.split('/')[-1])
            with open(img_path, 'wb') as f:
                f.write(self.get_response('https:{}'.format(img_url)).content)
            img_nums = self.image_identification(img_path)
            # the sprite offsets step by 20px, so offset / 20 indexes the OCR'd digit
            for position in price_position:
                price += img_nums[int(position / 20)]
            try:
                room_price.append(int(price))
            except ValueError:
                # the OCR occasionally misreads a digit; store NULL rather than a wrong price
                room_price.append(None)
        data = {
            '标题': title,
            '链接': room_url,
            '信息': desc,
            '地址': location,
            '价格': room_price,
        }
        self.save_data(data)
        next_url = html.xpath('//a[@class="next"]/@href')
        if next_url:
            self.get_room_info('https:{}'.format(next_url[0]))

    def save_data(self, item):
        data = list()
        for i in range(len(item['标题'])):
            info = list()
            for key in item.keys():
                info.append(item[key][i])
            data.append(info)
        sql = 'INSERT INTO ziru (title, url, info, location, price) VALUES (%s, %s, %s, %s, %s);'
        # print(data)
        self.cursor.executemany(sql, data)
        self.conn.commit()
        print('Saved {} rows'.format(len(data)))

    def run(self):
        self.get_zone_info('https://sh.ziroom.com/')


if __name__ == '__main__':
    s = Ziru()
    s.run()

--------------------------------------------------------------------------------
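
save_data in /ziru/zr.py assumes a `ziru` table already exists in the `demo` database. A minimal sketch of a compatible schema, to be run once before the crawler: only the column names are taken from the INSERT statement above; the types, lengths and surrogate key are assumptions.

import pymysql

# Hypothetical DDL: price is nullable because zr.py stores None when the OCR fails.
DDL = '''
CREATE TABLE IF NOT EXISTS ziru (
    id       INT AUTO_INCREMENT PRIMARY KEY,
    title    VARCHAR(255),
    url      VARCHAR(512),
    info     VARCHAR(255),
    location VARCHAR(255),
    price    INT NULL
)
'''

conn = pymysql.Connection(host='localhost', user='root', password='root', database='demo', port=3306)
with conn.cursor() as cursor:
    cursor.execute(DDL)
conn.commit()
conn.close()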