├── .gitignore
├── README.md
├── bilibili
    └── bilibili_user.py
├── biqu
    └── biqu.py
├── cninfo
    ├── crawler.py
    └── demo.js
├── dzdp_svg
    └── dzdp_svg.py
├── jijin
    └── TTJJ.py
├── lagou
    ├── lg.js
    └── lg.py
├── lianjia
    └── lianjia.py
├── music163
    ├── Music.js
    └── Music.py
├── qcc
    └── qcc.py
├── scrapeCenter
    ├── spa1
    │   └── crawl.py
    ├── spa14
    │   ├── Wasm.wasm
    │   └── crawl.py
    ├── spa15
    │   ├── crawl.py
    │   └── demo.js
    ├── spa16
    │   └── crawl.py
    ├── spa2
    │   └── crawl.py
    ├── spa3
    │   └── crawl.py
    ├── spa5
    │   └── crawl.py
    ├── spa6
    │   ├── crawl.py
    │   └── demo.js
    ├── spa7
    │   └── crawl.py
    ├── ssr1
    │   └── crawl.py
    ├── ssr2
    │   └── crawl.py
    ├── ssr3
    │   └── crawl.py
    └── ssr4
    │   └── crawl.py
├── tweet
    ├── GetToken.py
    └── Tweet.py
├── weather
    └── weather.py
├── weibo
    ├── get_fans_info.py
    ├── search.py
    ├── search_all.py
    ├── weibo_comment.py
    └── 大V.txt
├── youdao
    └── yd_tran.py
├── zhihu
    └── public_func.py
└── ziru
    └── zr.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | .idea/
132 | /weibo/exist.txt
133 | .DS_Store
134 | 
135 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # python-spider
 2 | python爬虫小项目
 3 | 目录如下：
 4 | 1. [笔趣阁小说下载](https://github.com/monkey-hjy/python-spider/tree/master/biqu)
 5 | 2. [Tweet数据抓取](https://github.com/monkey-hjy/python-spider/tree/master/tweet)
 6 | 3. [中国天气网数据查询](https://github.com/monkey-hjy/python-spider/tree/master/weather)
 7 | 4. [网易云音乐逆向爬虫](https://github.com/monkey-hjy/python-spider/tree/master/music163)
 8 | 5. [天天基金网指定基金数据抓取](https://github.com/monkey-hjy/python-spider/tree/master/jijin)
 9 | 6. [微博信息抓取](https://github.com/monkey-hjy/python-spider/tree/master/weibo)
10 | 7. [有道翻译逆向](https://github.com/monkey-hjy/python-spider/tree/master/youdao)
11 | 8. [链家全国租房信息抓取](https://github.com/monkey-hjy/python-spider/tree/master/lianjia)
12 | 9. [企查查免登陆爬虫](https://github.com/monkey-hjy/python-spider/tree/master/qcc)
13 | 10. [大众点评svg加密](https://github.com/monkey-hjy/python-spider/tree/master/dzdp_svg)
14 | 11. [B站用户爬虫](https://github.com/monkey-hjy/python-spider/tree/master/bilibili)
15 | 12. [拉钩免登录爬虫](https://github.com/monkey-hjy/python-spider/blob/master/lagou)
16 | 13. [自如租房字体加密](https://github.com/monkey-hjy/python-spider/tree/master/ziru)
17 | 14. [知乎问答抓取](https://github.com/monkey-hjy/python-spider/tree/master/zhihu_answer)
18 | 15. [深证信数据服务平台](https://github.com/monkey-hjy/python-spider/tree/master/cninfo)
19 | 
20 | 
21 | - CSDN不定期更新文章。个人主页 [https://blog.csdn.net/qq_42452095](https://blog.csdn.net/qq_42452095)
22 | - B站不定期更新视频。个人主页 [https://space.bilibili.com/347405521/channel/detail?cid=181641](https://space.bilibili.com/347405521/channel/detail?cid=181641)
23 | - 对代码有问题的话可以在本项目的 [Issues](https://github.com/monkey-hjy/python-spider/issues) 中沟通
24 | - 如果有代写需求。可以联系QQ847703187  微信：847703187
25 | 


--------------------------------------------------------------------------------
/bilibili/bilibili_user.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- encoding: utf-8 -*-
 3 | # @File    :   bilibili_user.py    
 4 | # @Author  :   Monkey
 5 | # @DATE    :   2021/5/17 10:04
 6 | from gevent import monkey; monkey.patch_all()
 7 | import gevent.pool
 8 | import requests
 9 | import pymysql
10 | import datetime
11 | 
12 | 
13 | class BiliUser(object):
14 |     """B站用户"""
15 | 
16 |     def __init__(self):
17 |         self.pool = gevent.pool.Pool(size=50)
18 |         # 10的7次幂。千万
19 |         self.mid_list = list(range(1, pow(10, 7)))
20 |         # self.mid_list = list(range(1, pow(10, 3)))
21 |         self.conn = pymysql.Connect(host='localhost', user='root', password='root', port=3306, database='demo')
22 |         self.cursor = self.conn.cursor()
23 |         self.proxies = dict()
24 |         self._headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}
25 |         self.data = []
26 |         self.ips = []
27 |         self.set_proxies()
28 | 
29 |     def set_proxies(self):
30 |         """设置代理"""
31 |         ip = "需要在这里填写上自己获取代理IP的方法"
32 |         self.proxies = {
33 |             'http': 'http://{}'.format(ip),
34 |             'https': 'http://{}'.format(ip),
35 |         }
36 | 
37 |     def get_fans_count(self, mid):
38 |         """获取粉丝数量"""
39 |         url = 'https://api.bilibili.com/x/relation/stat?vmid={}&jsonp=jsonp'.format(mid)
40 |         response = requests.get(url, headers=self._headers, proxies=self.proxies).json()
41 |         follower = response['data']['follower']
42 |         following = response['data']['following']
43 |         return follower, following
44 | 
45 |     def get_user_info(self, mid):
46 |         """获取用户信息"""
47 |         url = 'https://api.bilibili.com/x/space/acc/info?mid={}&jsonp=jsonp'.format(mid)
48 |         err_count = 0
49 |         while err_count < 5:
50 |             try:
51 |                 response = requests.get(url, headers=self._headers, proxies=self.proxies, timeout=10).json()
52 |                 if response['code'] == 0:
53 |                     nike_name = response['data']['name']
54 |                     sex = response['data']['sex']
55 |                     level = response['data']['level']
56 |                     sign = response['data']['sign']
57 |                     birthday = response['data']['birthday']
58 |                     follower, following = self.get_fans_count(mid)
59 |                     self.data.append([mid, nike_name, sex, level, sign, birthday, follower, following])
60 |                     print('mid:{}\tdata:{}'.format(mid, len(self.data)))
61 |                     if len(self.data) >= 100:
62 |                         data, self.data = self.data, []
63 |                         self.save_data(data)
64 |                     break
65 |                 elif response['code'] == -412:
66 |                     raise Exception
67 |                 else:
68 |                     print(datetime.datetime.now(), response, mid)
69 |                     break
70 |             except Exception as e:
71 |                 err_count += 1
72 |                 self.set_proxies()
73 |                 # print(err_count, self.proxies, e)
74 | 
75 |     def save_data(self, data):
76 |         """保存数据"""
77 |         sql = "INSERT INTO bili (mid, nike_name, sex, level, sign, birthday, follower, following) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"
78 |         self.cursor.executemany(sql, data)
79 |         self.conn.commit()
80 |         print('{}\t保存成功 --- {}'.format(datetime.datetime.now(), len(data)))
81 | 
82 |     def __del__(self):
83 |         self.conn.close()
84 | 
85 |     def run(self):
86 |         """启动函数"""
87 |         self.pool.map(self.get_user_info, self.mid_list)
88 |         if self.data:
89 |             self.save_data(self.data)
90 | 
91 | 
92 | if __name__ == '__main__':
93 |     t = BiliUser()
94 |     t.run()
95 | 


--------------------------------------------------------------------------------
/biqu/biqu.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: monkey-hjy
 3 | # @Date:   2021-02-24 17:12:52
 4 | # @Last Modified by:   monkey-hjy
 5 | # @Last Modified time: 2021-02-24 17:16:23
 6 | import requests
 7 | from lxml import etree
 8 | import random
 9 | from datetime import datetime, time
10 | 
11 | # 随机UA头
12 | USER_AGENT = [
13 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
14 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
15 |     "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
16 |     "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
17 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
18 |     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
19 |     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
20 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
21 |     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
22 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
23 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
24 |     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
25 |     "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
26 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
27 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
28 |     "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
29 | ]
30 | 
31 | 
32 | class SpiderBook(object):
33 | 
34 |     def __init__(self):
35 |         self.search_url = 'https://www.biqooge.com/modules/article/search.php'
36 |         self._headers = {'user-agent': random.choice(USER_AGENT)}
37 |     
38 |     def search_book(self):
39 |         book_name = self.book_name
40 |         data = {
41 |             'searchtype': 'articlename',
42 |             'searchkey': book_name.encode('gbk'),
43 |         }
44 |         response = requests.post(self.search_url, headers=self._headers, data=data)
45 |         response.encoding = response.apparent_encoding
46 |         html = etree.HTML(response.text)
47 |         name = html.xpath('//tr[@id="nr"]/td[1]/a/text()')
48 |         book_url = html.xpath('//tr[@id="nr"]/td[1]/a/@href')
49 |         author = html.xpath('//tr[@id="nr"]/td[3]/text()')
50 |         for i in range(len(name)):
51 |             print('编号{}信息：作者-{}\t书名-{}'.format(i, author[i], name[i]))
52 |         need_id = int(input('输入需要的书籍编号：'))
53 |         self.download_book(book_url[need_id])
54 |     
55 |     def download_book(self, book_url):
56 |         response = requests.get(book_url, headers=self._headers)
57 |         response.encoding = response.apparent_encoding
58 |         html = etree.HTML(response.text)
59 |         zj_info = html.xpath('//dt[contains(text(), "章节目录")]/following-sibling::dd')
60 |         for i in range(len(zj_info)):
61 |             info = zj_info[i]
62 |             zj_name = info.xpath('./a/text()')[0]
63 |             zj_url = 'https://www.biqooge.com' + info.xpath('./a/@href')[0]
64 |             zj_response = requests.get(zj_url, headers=self._headers)
65 |             zj_response.encoding = zj_response.apparent_encoding
66 |             zj_html = etree.HTML(zj_response.text)
67 |             content = ''.join(zj_html.xpath('//div[@id="content"]/text()'))
68 |             print('{}/{}\tname:{}\turl:{}'.format(i+1, len(zj_info), zj_name, zj_url))
69 |             with open('{}.txt'.format(self.book), 'a', encoding='utf8') as f:
70 |                 f.write(zj_name + '\n')
71 |                 f.write(content + '\n\n')
72 |     
73 |     def run(self):
74 |         self.book_name = '完美世界'
75 |         self.search_book()
76 | 
77 | 
78 | if __name__ == '__main__':
79 |     s = SpiderBook()
80 |     s.run()
81 | 


--------------------------------------------------------------------------------
/cninfo/crawler.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawler.py
 3 | # Date: 2022/05/30 23:05
 4 | # Auth: HJY
 5 | import requests
 6 | import execjs
 7 | 
 8 | ctx = execjs.compile(open('./demo.js', encoding='utf-8').read())
 9 | url = 'https://webapi.cninfo.com.cn/api/sysapi/p_sysapi1007'
10 | headers = {
11 |     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
12 |     'mcode': ctx.call('getResCode')
13 | }
14 | data = {
15 |     'tdate': '2022-05-27',
16 |     'market': 'SZE'
17 | }
18 | response = requests.post(url, json=data, headers=headers)
19 | print(response.json())
20 | 


--------------------------------------------------------------------------------
/cninfo/demo.js:
--------------------------------------------------------------------------------
 1 | function getResCode(){
 2 |     var time=Math.floor(new Date().getTime()/1000); 
 3 |     return missjson(""+time);
 4 | }
 5 | 
 6 | function missjson(input) {
 7 |     var keyStr = "ABCDEFGHIJKLMNOP" + "QRSTUVWXYZabcdef" + "ghijklmnopqrstuv" + "wxyz0123456789+/" + "=";
 8 |     var output = "";
 9 |     var chr1, chr2, chr3 = "";
10 |     var enc1, enc2, enc3, enc4 = "";
11 |     var i = 0;
12 |     do {
13 |         chr1 = input.charCodeAt(i++);
14 |         chr2 = input.charCodeAt(i++);
15 |         chr3 = input.charCodeAt(i++);
16 |         enc1 = chr1 >> 2;
17 |         enc2 = ((chr1 & 3) << 4) | (chr2 >> 4);
18 |         enc3 = ((chr2 & 15) << 2) | (chr3 >> 6);
19 |         enc4 = chr3 & 63;
20 |         if (isNaN(chr2)) {
21 |             enc3 = enc4 = 64;
22 |         } else if (isNaN(chr3)) {
23 |             enc4 = 64;
24 |         }
25 |         output = output + keyStr.charAt(enc1) + keyStr.charAt(enc2) + keyStr.charAt(enc3) + keyStr.charAt(enc4);
26 |         chr1 = chr2 = chr3 = "";
27 |         enc1 = enc2 = enc3 = enc4 = "";
28 |     } while (i < input.length);
29 |     return output;
30 | }
31 | 


--------------------------------------------------------------------------------
/dzdp_svg/dzdp_svg.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- encoding: utf-8 -*-
 3 | # @File    :   dzdp_svg.py    
 4 | # @Author  :   Monkey
 5 | # @DATE    :   2021/5/13 下午4:54
 6 | import re
 7 | import requests
 8 | 
 9 | 
10 | class DZDP(object):
11 |     """大众点评"""
12 | 
13 |     def __init__(self):
14 |         self.css_url = 'https://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/80da73cea991b1dac8e6c3eb8cfe7461.css'
15 |         self.svg_url = 'https://s3plus.meituan.net/v1/mss_0a06a471f9514fc79c981b5466f56b91/svgtextcss/20609a5f67dfd9a34fd762ac63e59960.svg'
16 |         self.css_text = requests.get(self.css_url).text
17 |         self.svg_info = {int(info.split('">')[0]): info.split('">')[1] for info in re.findall(r'y="(.*?)</text>', requests.get(self.svg_url).text)}
18 | 
19 |     def get_txt(self, code):
20 |         """获取到编码对应的文字"""
21 |         try:
22 |             patt = '%s{background:(.*?);' % code
23 |             index = re.findall(patt, self.css_text)[0].replace('px', '').replace('-', '').split(' ')
24 |             index_x, index_y = int(index[0][:-2]), int(index[1][:-2])
25 |             for key in self.svg_info:
26 |                 if key >= index_y:
27 |                     return self.svg_info[key][index_x // 14]
28 |         except:
29 |             return code
30 | 
31 | 
32 | if __name__ == '__main__':
33 |     t = DZDP()
34 |     print(t.get_txt(code='swnbb'))
35 | 
36 | 
37 | 
38 | 


--------------------------------------------------------------------------------
/jijin/TTJJ.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: monkey-hjy
 3 | # @Date:   2021-03-04 11:18:58
 4 | # @Last Modified by:   monkey-hjy
 5 | # @Last Modified time: 2021-03-04 11:19:17
 6 | # 天天基金网数据抓取
 7 | 
 8 | import requests
 9 | import time
10 | import re
11 | import json
12 | import pandas as pd
13 | import random
14 | 
15 | file_path = '基金查询.xlsx'
16 | fund_codes = ['001606', '000924', '005962', '004997', '006751']
17 | start_date = '2019-01-01'
18 | end_date = '2021-10-30'
19 | url = 'http://api.fund.eastmoney.com/f10/lsjz'
20 | headers = {
21 |     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
22 |     'Referer': 'http://fundf10.eastmoney.com/',
23 | }
24 | result = dict()
25 | result_fsrq = []
26 | for fund_code in fund_codes:
27 |     params = {
28 |         "callback": f"jQuery183{''.join([str(random.randrange(0, 10)) for _ in range(17)])}_{int(time.time() * 1000)}",
29 |         "fundCode": fund_code,
30 |         "pageIndex": "1",
31 |         "pageSize": "100000",
32 |         "startDate": start_date,
33 |         "endDate": end_date,
34 |         "_": str(int(time.time() * 1000)),
35 |     }
36 |     response = json.loads(re.findall(r'\((.*)\)', requests.get(url, headers=headers, params=params).text, re.S)[0])
37 |     # 日期
38 |     FSRQ = []
39 |     # 单位净值
40 |     DWJZ = []
41 |     fund_info = response['Data']['LSJZList']
42 |     for i in range(len(fund_info)):
43 |         # FSRQ.append(datetime.datetime.strptime(fund_info[i]['FSRQ'], '%Y-%m-%d'))
44 |         FSRQ.append(fund_info[i]['FSRQ'])
45 |         DWJZ.append(fund_info[i]['DWJZ'])
46 |     result_fsrq = FSRQ if len(FSRQ) > len(result_fsrq) else result_fsrq
47 |     result[fund_code] = DWJZ
48 | max_len = 0
49 | for key in result:
50 |     max_len = len(result[key]) if len(result[key]) > max_len else max_len
51 | for key in result:
52 |     result[key] += [None] * (max_len - len(result[key]))
53 | result = pd.DataFrame(result)
54 | result.index = result_fsrq
55 | result.to_excel(file_path, encoding='ANSI')
56 | 


--------------------------------------------------------------------------------
/lagou/lg.py:
--------------------------------------------------------------------------------
  1 | from gevent import monkey; monkey.patch_all()
  2 | import gevent.pool
  3 | import json
  4 | import random
  5 | import re
  6 | 
  7 | from lxml import etree
  8 | import execjs
  9 | import requests
 10 | from sns_spider.config.settings import USER_AGENTS
 11 | import pymongo
 12 | 
 13 | 
 14 | class LG(object):
 15 |     """拉钩 js逆向"""
 16 | 
 17 |     def __init__(self):
 18 |         self.client = pymongo.MongoClient(host='localhost', port=27017)
 19 |         self.mongo_col = self.client['demo']['lagou']
 20 |         self.js_file = open('lg.js', encoding='utf8').read()
 21 |         self._headers = {
 22 |             'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
 23 |             'referer': 'https://www.lagou.com/jobs/list_java/p-city_3?px=default',
 24 |         }
 25 |         self.token = ''
 26 |         self.proxies = dict()
 27 |         self.set_proxies()
 28 |         self.get_token()
 29 |         self.city_info = dict()
 30 | 
 31 |     def set_proxies(self):
 32 |         """设置代理"""
 33 |         ip = "获取到代理IP"
 34 |         self.proxies = {
 35 |             'http': 'http://{}'.format(ip),
 36 |             'https': 'http://{}'.format(ip),
 37 |         }
 38 | 
 39 |     def get_response(self, url, params=None, data=None, method='GET'):
 40 |         while True:
 41 |             try:
 42 |                 if method == 'GET':
 43 |                     response = requests.get(url, params=params, headers=self._headers, proxies=self.proxies)
 44 |                 else:
 45 |                     response = requests.post(url, params=params, data=data, headers=self._headers, proxies=self.proxies)
 46 |                 response.encoding = response.apparent_encoding
 47 |                 return response
 48 |             except:
 49 |                 self.set_proxies()
 50 |                 self.get_token()
 51 | 
 52 |     def get_token(self):
 53 |         """获取到游客cookie"""
 54 |         url = 'https://www.lagou.com/gongsi/allCity.html'
 55 |         while True:
 56 |             headers = {'user-agent': random.choice(USER_AGENTS)}
 57 |             try:
 58 |                 response = requests.get(url, headers=headers, allow_redirects=False, proxies=self.proxies, timeout=10)
 59 |                 response.encoding = response.apparent_encoding
 60 |                 user_trace_token = re.findall(r'user_trace_token=(.*?);', response.headers['Set-Cookie'])[0]
 61 |                 x_http_token = re.findall(r'X_HTTP_TOKEN=(.*?);', response.headers['Set-Cookie'])[0]
 62 |                 href = response.headers['Location']
 63 |                 ctx = execjs.compile(self.js_file, cwd='/opt/homebrew/Cellar/node/16.3.0/bin/')
 64 |                 self.token = ctx.call('window.gt.prototype.a',
 65 |                                       json.dumps({"href": href, "search": href.split('check.html')[1]}))
 66 |                 self._headers['cookie'] = 'user_trace_token={};X_HTTP_TOKEN={};__lg_stoken__={}'.format(
 67 |                     user_trace_token, x_http_token, self.token)
 68 |                 return
 69 |             except Exception as e:
 70 |                 print('获取token失败\tproxies:{}\te:{}'.format(self.proxies, e))
 71 |                 self.set_proxies()
 72 | 
 73 |     def get_city_info(self):
 74 |         """获取城市信息"""
 75 |         url = 'https://www.lagou.com/jobs/allCity.html'
 76 |         html = etree.HTML(self.get_response(url).text)
 77 |         city_url = html.xpath('//ul[@class="city_list"]/li/a/@href')
 78 |         city_name = html.xpath('//ul[@class="city_list"]/li/a/text()')
 79 |         self.city_info = {city_name[i]: city_url[i] for i in range(len(city_url))}
 80 | 
 81 |     def get_job_info(self, input_item):
 82 |         """获取职位信息"""
 83 |         url = 'https://www.lagou.com/jobs/positionAjax.json'
 84 |         params = {
 85 |             "px": "default",
 86 |             "city": input_item['city_name'],
 87 |             "district": input_item['district'],
 88 |             "needAddtionalResult": "false",
 89 |         }
 90 |         sid = ''
 91 |         page = 1
 92 |         while True:
 93 |             data = {
 94 |                 "first": "true",
 95 |                 "pn": page,
 96 |                 "kd": input_item['keyword'],
 97 |                 "sid": sid,
 98 |             }
 99 |             job_info = self.get_response(url, params=params, data=data, method='POST').json()
100 |             if 'success' in job_info:
101 |                 sid = job_info['content']['showId']
102 |                 job_info = job_info['content']['positionResult']['result']
103 |                 if not job_info or page == 30:
104 |                     break
105 |                 self.parse_info(job_info, input_item)
106 |                 print('{}\t页码：{}\t数据量：{}'.format(input_item, page, len(job_info)))
107 |             page += 1
108 | 
109 |     def parse_info(self, job_info, input_item):
110 |         """解析内容"""
111 |         items = list()
112 |         for info in job_info:
113 |             item = {
114 |                 '_id': info['positionId'],
115 |                 'job_name': info['positionName'],
116 |                 'job_url': 'https://www.lagou.com/jobs/{}.html'.format(info['positionId']),
117 |                 'company_name': info['companyFullName'],
118 |                 'company_size': info['companySize'],
119 |                 'industry_field': info['industryField'],
120 |                 'finance_stage': info['financeStage'],
121 |                 'company_label': info['companyLabelList'],
122 |                 'skill_label': info['skillLables'],
123 |                 'position_label': info['positionLables'],
124 |                 'create_time': info['createTime'],
125 |                 'city': info['city'],
126 |                 'district': info['district'],
127 |                 'salary': info['salary'],
128 |                 'work_year': info['workYear'],
129 |                 'job_nature': info['jobNature'],
130 |                 'education': info['education'],
131 |                 'position_advantage': info['positionAdvantage'],
132 |                 'position_detail': info['positionDetail'],
133 |                 'position_address': info['positionAddress']
134 |             }
135 |             items.append(item)
136 |         try:
137 |             self.mongo_col.insert_many(items)
138 |             # print('{}\t插入成功。本次插入{}条'.format(input_item, len(items)))
139 |         except:
140 |             for item in items:
141 |                 try:
142 |                     self.mongo_col.insert_one(item)
143 |                 except:
144 |                     pass
145 | 
146 |     def run(self):
147 |         """启动函数"""
148 |         self.get_city_info()
149 |         # print(self.city_info)
150 |         for city_name, city_url in self.city_info.items():
151 |         # for city_name in ['郑州', '北京', '上海', '广州', '深圳']:
152 |             city_url = self.city_info[city_name]
153 |             if '-zhaopin' not in city_url:
154 |                 city_url = city_url.rstrip('/') + '-zhaopin/'
155 |             response = self.get_response(url=city_url, method='GET')
156 |             html = etree.HTML(response.text)
157 |             district_name = html.xpath('//div[@data-type="district"]/a[position()>1]/text()')
158 |             item = [{'city_name': city_name, 'district': name, 'keyword': 'python'} for name in district_name]
159 |             print(item)
160 |             pool = gevent.pool.Pool(size=1)
161 |             pool.map(self.get_job_info, item)
162 | 
163 | 
164 | if __name__ == '__main__':
165 |     t = LG()
166 |     t.run()
167 | 
168 | 


--------------------------------------------------------------------------------
/lianjia/lianjia.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Author:   玛卡巴卡
 3 | # Date:     2021/5/6 14:39
 4 | 
 5 | import requests
 6 | from lxml import etree
 7 | import pymysql
 8 | 
 9 | 
10 | class Lianjia(object):
11 |     """抓取链家租房信息"""
12 | 
13 |     def __init__(self):
14 |         self.conn = pymysql.Connect(host='localhost', port=3306, user='root', password='root', database='demo')
15 |         self.cursor = self.conn.cursor()
16 |         self._headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'}
17 | 
18 |     def __del__(self):
19 |         self.conn.close()
20 | 
21 |     def get_response(self, url):
22 |         """发起请求"""
23 |         response = requests.get(url, headers=self._headers)
24 |         if response.status_code == 200:
25 |             response.encoding = 'utf8'
26 |             return response
27 |         else:
28 |             print('url:{}\tresponse:{}'.format(url, response))
29 | 
30 |     def get_city_url(self):
31 |         """获取到城市的链接"""
32 |         url = 'https://www.lianjia.com/city/'
33 |         html = etree.HTML(self.get_response(url).text)
34 |         city_url = html.xpath('//ul[@class="city_list_ul"]//a/@href')
35 |         for url in city_url:
36 |             self.get_district_url(city_url=url)
37 | 
38 |     def get_district_url(self, city_url):
39 |         """获取到区的链接"""
40 |         html = etree.HTML(self.get_response(city_url + 'zufang/').text)
41 |         district_url = html.xpath('//li[@class="filter__item--level2  "]/a/@href')
42 |         for url in district_url:
43 |             self.get_house_count(url=city_url[:-1] + url)
44 | 
45 |     def get_house_count(self, url):
46 |         """获取当前区的房子数量"""
47 |         html = etree.HTML(self.get_response(url).text)
48 |         count = int(html.xpath('//span[@class="content__title--hl"]/text()')[0])
49 |         if count:
50 |             if count >= 3000:
51 |                 filter_next_url = html.xpath('//li[@class="filter__item--level3  "]/a/@href')
52 |                 for filter_url in filter_next_url:
53 |                     the_filter_url = '/'.join(url.split('/')[:3]) + filter_url
54 |                     html = etree.HTML(self.get_response(the_filter_url).text)
55 |                     count = min(int(html.xpath('//span[@class="content__title--hl"]/text()')[0]), 3000)
56 |                     self.start(the_filter_url, count // 30 + 1)
57 |             else:
58 |                 self.start(url, count // 30 + 1)
59 |         else:
60 |             print('{} 无房源'.format(url))
61 | 
62 |     def start(self, url, end_page):
63 |         """开始抓取数据"""
64 |         for page in range(1, end_page+1):
65 |             self.get_page_info(url='{}pg{}/'.format(url, page))
66 | 
67 |     def get_page_info(self, url):
68 |         """获取当前页房源信息"""
69 |         print(url, end='\t')
70 |         err_count = 0
71 |         response = self.get_response(url)
72 |         html = etree.HTML(response.text)
73 |         house_element = html.xpath('//div[@class="content__list--item"]')
74 |         for element in house_element:
75 |             try:
76 |                 house_url = '/'.join(url.split('/')[:3]) + element.xpath('./a/@href')[0]
77 |                 house_code = element.xpath('./@data-house_code')[0]
78 |                 title = element.xpath('./a/@title')[0]
79 |                 des = ''.join(element.xpath('./div/p[2]//text()')).replace('\n', '').replace('    ', ' ')
80 |                 price = int(element.xpath('./div/span/em/text()')[0])
81 |                 sql = "INSERT INTO lianjia (id, url, title, des, price) values ('%s', '%s', '%s', '%s', %d);" % (house_code, house_url, title, des, price)
82 |                 self.cursor.execute(sql)
83 |                 self.conn.commit()
84 |             except Exception as e:
85 |                 err_count += 1
86 |         print('出错占比：{}/{}'.format(err_count, len(house_element)))
87 | 
88 |     def run(self):
89 |         """启动函数"""
90 |         self.get_city_url()
91 | 
92 | 
93 | if __name__ == '__main__':
94 |     t = Lianjia()
95 |     t.run()
96 | 


--------------------------------------------------------------------------------
/music163/Music.js:
--------------------------------------------------------------------------------
   1 | /*
   2 | * @Author: monkey-hjy
   3 | * @Date:   2021-02-24 17:42:52
   4 | * @Last Modified by:   monkey-hjy
   5 | * @Last Modified time: 2021-02-24 17:48:21
   6 | */
   7 | var CryptoJS = CryptoJS || function(u, p) {
   8 |     var d = {}
   9 |       , l = d.lib = {}
  10 |       , s = function() {}
  11 |       , t = l.Base = {
  12 |         extend: function(a) {
  13 |             s.prototype = this;
  14 |             var c = new s;
  15 |             a && c.mixIn(a);
  16 |             c.hasOwnProperty("init") || (c.init = function() {
  17 |                 c.$super.init.apply(this, arguments)
  18 |             }
  19 |             );
  20 |             c.init.prototype = c;
  21 |             c.$super = this;
  22 |             return c
  23 |         },
  24 |         create: function() {
  25 |             var a = this.extend();
  26 |             a.init.apply(a, arguments);
  27 |             return a
  28 |         },
  29 |         init: function() {},
  30 |         mixIn: function(a) {
  31 |             for (var c in a)
  32 |                 a.hasOwnProperty(c) && (this[c] = a[c]);
  33 |             a.hasOwnProperty("toString") && (this.toString = a.toString)
  34 |         },
  35 |         clone: function() {
  36 |             return this.init.prototype.extend(this)
  37 |         }
  38 |     }
  39 |       , r = l.WordArray = t.extend({
  40 |         init: function(a, c) {
  41 |             a = this.words = a || [];
  42 |             this.sigBytes = c != p ? c : 4 * a.length
  43 |         },
  44 |         toString: function(a) {
  45 |             return (a || v).stringify(this)
  46 |         },
  47 |         concat: function(a) {
  48 |             var c = this.words
  49 |               , e = a.words
  50 |               , j = this.sigBytes;
  51 |             a = a.sigBytes;
  52 |             this.clamp();
  53 |             if (j % 4)
  54 |                 for (var k = 0; k < a; k++)
  55 |                     c[j + k >>> 2] |= (e[k >>> 2] >>> 24 - 8 * (k % 4) & 255) << 24 - 8 * ((j + k) % 4);
  56 |             else if (65535 < e.length)
  57 |                 for (k = 0; k < a; k += 4)
  58 |                     c[j + k >>> 2] = e[k >>> 2];
  59 |             else
  60 |                 c.push.apply(c, e);
  61 |             this.sigBytes += a;
  62 |             return this
  63 |         },
  64 |         clamp: function() {
  65 |             var a = this.words
  66 |               , c = this.sigBytes;
  67 |             a[c >>> 2] &= 4294967295 << 32 - 8 * (c % 4);
  68 |             a.length = u.ceil(c / 4)
  69 |         },
  70 |         clone: function() {
  71 |             var a = t.clone.call(this);
  72 |             a.words = this.words.slice(0);
  73 |             return a
  74 |         },
  75 |         random: function(a) {
  76 |             for (var c = [], e = 0; e < a; e += 4)
  77 |                 c.push(4294967296 * u.random() | 0);
  78 |             return new r.init(c,a)
  79 |         }
  80 |     })
  81 |       , w = d.enc = {}
  82 |       , v = w.Hex = {
  83 |         stringify: function(a) {
  84 |             var c = a.words;
  85 |             a = a.sigBytes;
  86 |             for (var e = [], j = 0; j < a; j++) {
  87 |                 var k = c[j >>> 2] >>> 24 - 8 * (j % 4) & 255;
  88 |                 e.push((k >>> 4).toString(16));
  89 |                 e.push((k & 15).toString(16))
  90 |             }
  91 |             return e.join("")
  92 |         },
  93 |         parse: function(a) {
  94 |             for (var c = a.length, e = [], j = 0; j < c; j += 2)
  95 |                 e[j >>> 3] |= parseInt(a.substr(j, 2), 16) << 24 - 4 * (j % 8);
  96 |             return new r.init(e,c / 2)
  97 |         }
  98 |     }
  99 |       , b = w.Latin1 = {
 100 |         stringify: function(a) {
 101 |             var c = a.words;
 102 |             a = a.sigBytes;
 103 |             for (var e = [], j = 0; j < a; j++)
 104 |                 e.push(String.fromCharCode(c[j >>> 2] >>> 24 - 8 * (j % 4) & 255));
 105 |             return e.join("")
 106 |         },
 107 |         parse: function(a) {
 108 |             for (var c = a.length, e = [], j = 0; j < c; j++)
 109 |                 e[j >>> 2] |= (a.charCodeAt(j) & 255) << 24 - 8 * (j % 4);
 110 |             return new r.init(e,c)
 111 |         }
 112 |     }
 113 |       , x = w.Utf8 = {
 114 |         stringify: function(a) {
 115 |             try {
 116 |                 return decodeURIComponent(escape(b.stringify(a)))
 117 |             } catch (c) {
 118 |                 throw Error("Malformed UTF-8 data")
 119 |             }
 120 |         },
 121 |         parse: function(a) {
 122 |             return b.parse(unescape(encodeURIComponent(a)))
 123 |         }
 124 |     }
 125 |       , q = l.BufferedBlockAlgorithm = t.extend({
 126 |         reset: function() {
 127 |             this.i9b = new r.init;
 128 |             this.ty5D = 0
 129 |         },
 130 |         vb6V: function(a) {
 131 |             "string" == typeof a && (a = x.parse(a));
 132 |             this.i9b.concat(a);
 133 |             this.ty5D += a.sigBytes
 134 |         },
 135 |         kY3x: function(a) {
 136 |             var c = this.i9b
 137 |               , e = c.words
 138 |               , j = c.sigBytes
 139 |               , k = this.blockSize
 140 |               , b = j / (4 * k)
 141 |               , b = a ? u.ceil(b) : u.max((b | 0) - this.JP1x, 0);
 142 |             a = b * k;
 143 |             j = u.min(4 * a, j);
 144 |             if (a) {
 145 |                 for (var q = 0; q < a; q += k)
 146 |                     this.qL5Q(e, q);
 147 |                 q = e.splice(0, a);
 148 |                 c.sigBytes -= j
 149 |             }
 150 |             return new r.init(q,j)
 151 |         },
 152 |         clone: function() {
 153 |             var a = t.clone.call(this);
 154 |             a.i9b = this.i9b.clone();
 155 |             return a
 156 |         },
 157 |         JP1x: 0
 158 |     });
 159 |     l.Hasher = q.extend({
 160 |         cfg: t.extend(),
 161 |         init: function(a) {
 162 |             this.cfg = this.cfg.extend(a);
 163 |             this.reset()
 164 |         },
 165 |         reset: function() {
 166 |             q.reset.call(this);
 167 |             this.lt3x()
 168 |         },
 169 |         update: function(a) {
 170 |             this.vb6V(a);
 171 |             this.kY3x();
 172 |             return this
 173 |         },
 174 |         finalize: function(a) {
 175 |             a && this.vb6V(a);
 176 |             return this.mA4E()
 177 |         },
 178 |         blockSize: 16,
 179 |         lS3x: function(a) {
 180 |             return function(b, e) {
 181 |                 return (new a.init(e)).finalize(b)
 182 |             }
 183 |         },
 184 |         vl6f: function(a) {
 185 |             return function(b, e) {
 186 |                 return (new n.HMAC.init(a,e)).finalize(b)
 187 |             }
 188 |         }
 189 |     });
 190 |     var n = d.algo = {};
 191 |     return d
 192 | }(Math);
 193 | (function() {
 194 |     var u = CryptoJS
 195 |       , p = u.lib.WordArray;
 196 |     u.enc.Base64 = {
 197 |         stringify: function(d) {
 198 |             var l = d.words
 199 |               , p = d.sigBytes
 200 |               , t = this.bA0x;
 201 |             d.clamp();
 202 |             d = [];
 203 |             for (var r = 0; r < p; r += 3)
 204 |                 for (var w = (l[r >>> 2] >>> 24 - 8 * (r % 4) & 255) << 16 | (l[r + 1 >>> 2] >>> 24 - 8 * ((r + 1) % 4) & 255) << 8 | l[r + 2 >>> 2] >>> 24 - 8 * ((r + 2) % 4) & 255, v = 0; 4 > v && r + .75 * v < p; v++)
 205 |                     d.push(t.charAt(w >>> 6 * (3 - v) & 63));
 206 |             if (l = t.charAt(64))
 207 |                 for (; d.length % 4; )
 208 |                     d.push(l);
 209 |             return d.join("")
 210 |         },
 211 |         parse: function(d) {
 212 |             var l = d.length
 213 |               , s = this.bA0x
 214 |               , t = s.charAt(64);
 215 |             t && (t = d.indexOf(t),
 216 |             -1 != t && (l = t));
 217 |             for (var t = [], r = 0, w = 0; w < l; w++)
 218 |                 if (w % 4) {
 219 |                     var v = s.indexOf(d.charAt(w - 1)) << 2 * (w % 4)
 220 |                       , b = s.indexOf(d.charAt(w)) >>> 6 - 2 * (w % 4);
 221 |                     t[r >>> 2] |= (v | b) << 24 - 8 * (r % 4);
 222 |                     r++
 223 |                 }
 224 |             return p.create(t, r)
 225 |         },
 226 |         bA0x: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
 227 |     }
 228 | }
 229 | )();
 230 | (function(u) {
 231 |     function p(b, n, a, c, e, j, k) {
 232 |         b = b + (n & a | ~n & c) + e + k;
 233 |         return (b << j | b >>> 32 - j) + n
 234 |     }
 235 |     function d(b, n, a, c, e, j, k) {
 236 |         b = b + (n & c | a & ~c) + e + k;
 237 |         return (b << j | b >>> 32 - j) + n
 238 |     }
 239 |     function l(b, n, a, c, e, j, k) {
 240 |         b = b + (n ^ a ^ c) + e + k;
 241 |         return (b << j | b >>> 32 - j) + n
 242 |     }
 243 |     function s(b, n, a, c, e, j, k) {
 244 |         b = b + (a ^ (n | ~c)) + e + k;
 245 |         return (b << j | b >>> 32 - j) + n
 246 |     }
 247 |     for (var t = CryptoJS, r = t.lib, w = r.WordArray, v = r.Hasher, r = t.algo, b = [], x = 0; 64 > x; x++)
 248 |         b[x] = 4294967296 * u.abs(u.sin(x + 1)) | 0;
 249 |     r = r.MD5 = v.extend({
 250 |         lt3x: function() {
 251 |             this.cN1x = new w.init([1732584193, 4023233417, 2562383102, 271733878])
 252 |         },
 253 |         qL5Q: function(q, n) {
 254 |             for (var a = 0; 16 > a; a++) {
 255 |                 var c = n + a
 256 |                   , e = q[c];
 257 |                 q[c] = (e << 8 | e >>> 24) & 16711935 | (e << 24 | e >>> 8) & 4278255360
 258 |             }
 259 |             var a = this.cN1x.words
 260 |               , c = q[n + 0]
 261 |               , e = q[n + 1]
 262 |               , j = q[n + 2]
 263 |               , k = q[n + 3]
 264 |               , z = q[n + 4]
 265 |               , r = q[n + 5]
 266 |               , t = q[n + 6]
 267 |               , w = q[n + 7]
 268 |               , v = q[n + 8]
 269 |               , A = q[n + 9]
 270 |               , B = q[n + 10]
 271 |               , C = q[n + 11]
 272 |               , u = q[n + 12]
 273 |               , D = q[n + 13]
 274 |               , E = q[n + 14]
 275 |               , x = q[n + 15]
 276 |               , f = a[0]
 277 |               , m = a[1]
 278 |               , g = a[2]
 279 |               , h = a[3]
 280 |               , f = p(f, m, g, h, c, 7, b[0])
 281 |               , h = p(h, f, m, g, e, 12, b[1])
 282 |               , g = p(g, h, f, m, j, 17, b[2])
 283 |               , m = p(m, g, h, f, k, 22, b[3])
 284 |               , f = p(f, m, g, h, z, 7, b[4])
 285 |               , h = p(h, f, m, g, r, 12, b[5])
 286 |               , g = p(g, h, f, m, t, 17, b[6])
 287 |               , m = p(m, g, h, f, w, 22, b[7])
 288 |               , f = p(f, m, g, h, v, 7, b[8])
 289 |               , h = p(h, f, m, g, A, 12, b[9])
 290 |               , g = p(g, h, f, m, B, 17, b[10])
 291 |               , m = p(m, g, h, f, C, 22, b[11])
 292 |               , f = p(f, m, g, h, u, 7, b[12])
 293 |               , h = p(h, f, m, g, D, 12, b[13])
 294 |               , g = p(g, h, f, m, E, 17, b[14])
 295 |               , m = p(m, g, h, f, x, 22, b[15])
 296 |               , f = d(f, m, g, h, e, 5, b[16])
 297 |               , h = d(h, f, m, g, t, 9, b[17])
 298 |               , g = d(g, h, f, m, C, 14, b[18])
 299 |               , m = d(m, g, h, f, c, 20, b[19])
 300 |               , f = d(f, m, g, h, r, 5, b[20])
 301 |               , h = d(h, f, m, g, B, 9, b[21])
 302 |               , g = d(g, h, f, m, x, 14, b[22])
 303 |               , m = d(m, g, h, f, z, 20, b[23])
 304 |               , f = d(f, m, g, h, A, 5, b[24])
 305 |               , h = d(h, f, m, g, E, 9, b[25])
 306 |               , g = d(g, h, f, m, k, 14, b[26])
 307 |               , m = d(m, g, h, f, v, 20, b[27])
 308 |               , f = d(f, m, g, h, D, 5, b[28])
 309 |               , h = d(h, f, m, g, j, 9, b[29])
 310 |               , g = d(g, h, f, m, w, 14, b[30])
 311 |               , m = d(m, g, h, f, u, 20, b[31])
 312 |               , f = l(f, m, g, h, r, 4, b[32])
 313 |               , h = l(h, f, m, g, v, 11, b[33])
 314 |               , g = l(g, h, f, m, C, 16, b[34])
 315 |               , m = l(m, g, h, f, E, 23, b[35])
 316 |               , f = l(f, m, g, h, e, 4, b[36])
 317 |               , h = l(h, f, m, g, z, 11, b[37])
 318 |               , g = l(g, h, f, m, w, 16, b[38])
 319 |               , m = l(m, g, h, f, B, 23, b[39])
 320 |               , f = l(f, m, g, h, D, 4, b[40])
 321 |               , h = l(h, f, m, g, c, 11, b[41])
 322 |               , g = l(g, h, f, m, k, 16, b[42])
 323 |               , m = l(m, g, h, f, t, 23, b[43])
 324 |               , f = l(f, m, g, h, A, 4, b[44])
 325 |               , h = l(h, f, m, g, u, 11, b[45])
 326 |               , g = l(g, h, f, m, x, 16, b[46])
 327 |               , m = l(m, g, h, f, j, 23, b[47])
 328 |               , f = s(f, m, g, h, c, 6, b[48])
 329 |               , h = s(h, f, m, g, w, 10, b[49])
 330 |               , g = s(g, h, f, m, E, 15, b[50])
 331 |               , m = s(m, g, h, f, r, 21, b[51])
 332 |               , f = s(f, m, g, h, u, 6, b[52])
 333 |               , h = s(h, f, m, g, k, 10, b[53])
 334 |               , g = s(g, h, f, m, B, 15, b[54])
 335 |               , m = s(m, g, h, f, e, 21, b[55])
 336 |               , f = s(f, m, g, h, v, 6, b[56])
 337 |               , h = s(h, f, m, g, x, 10, b[57])
 338 |               , g = s(g, h, f, m, t, 15, b[58])
 339 |               , m = s(m, g, h, f, D, 21, b[59])
 340 |               , f = s(f, m, g, h, z, 6, b[60])
 341 |               , h = s(h, f, m, g, C, 10, b[61])
 342 |               , g = s(g, h, f, m, j, 15, b[62])
 343 |               , m = s(m, g, h, f, A, 21, b[63]);
 344 |             a[0] = a[0] + f | 0;
 345 |             a[1] = a[1] + m | 0;
 346 |             a[2] = a[2] + g | 0;
 347 |             a[3] = a[3] + h | 0
 348 |         },
 349 |         mA4E: function() {
 350 |             var b = this.i9b
 351 |               , n = b.words
 352 |               , a = 8 * this.ty5D
 353 |               , c = 8 * b.sigBytes;
 354 |             n[c >>> 5] |= 128 << 24 - c % 32;
 355 |             var e = u.floor(a / 4294967296);
 356 |             n[(c + 64 >>> 9 << 4) + 15] = (e << 8 | e >>> 24) & 16711935 | (e << 24 | e >>> 8) & 4278255360;
 357 |             n[(c + 64 >>> 9 << 4) + 14] = (a << 8 | a >>> 24) & 16711935 | (a << 24 | a >>> 8) & 4278255360;
 358 |             b.sigBytes = 4 * (n.length + 1);
 359 |             this.kY3x();
 360 |             b = this.cN1x;
 361 |             n = b.words;
 362 |             for (a = 0; 4 > a; a++)
 363 |                 c = n[a],
 364 |                 n[a] = (c << 8 | c >>> 24) & 16711935 | (c << 24 | c >>> 8) & 4278255360;
 365 |             return b
 366 |         },
 367 |         clone: function() {
 368 |             var b = v.clone.call(this);
 369 |             b.cN1x = this.cN1x.clone();
 370 |             return b
 371 |         }
 372 |     });
 373 |     t.MD5 = v.lS3x(r);
 374 |     t.HmacMD5 = v.vl6f(r)
 375 | }
 376 | )(Math);
 377 | (function() {
 378 |     var u = CryptoJS
 379 |       , p = u.lib
 380 |       , d = p.Base
 381 |       , l = p.WordArray
 382 |       , p = u.algo
 383 |       , s = p.EvpKDF = d.extend({
 384 |         cfg: d.extend({
 385 |             keySize: 4,
 386 |             hasher: p.MD5,
 387 |             iterations: 1
 388 |         }),
 389 |         init: function(d) {
 390 |             this.cfg = this.cfg.extend(d)
 391 |         },
 392 |         compute: function(d, r) {
 393 |             for (var p = this.cfg, s = p.hasher.create(), b = l.create(), u = b.words, q = p.keySize, p = p.iterations; u.length < q; ) {
 394 |                 n && s.update(n);
 395 |                 var n = s.update(d).finalize(r);
 396 |                 s.reset();
 397 |                 for (var a = 1; a < p; a++)
 398 |                     n = s.finalize(n),
 399 |                     s.reset();
 400 |                 b.concat(n)
 401 |             }
 402 |             b.sigBytes = 4 * q;
 403 |             return b
 404 |         }
 405 |     });
 406 |     u.EvpKDF = function(d, l, p) {
 407 |         return s.create(p).compute(d, l)
 408 |     }
 409 | }
 410 | )();
 411 | CryptoJS.lib.Cipher || function(u) {
 412 |     var p = CryptoJS
 413 |       , d = p.lib
 414 |       , l = d.Base
 415 |       , s = d.WordArray
 416 |       , t = d.BufferedBlockAlgorithm
 417 |       , r = p.enc.Base64
 418 |       , w = p.algo.EvpKDF
 419 |       , v = d.Cipher = t.extend({
 420 |         cfg: l.extend(),
 421 |         createEncryptor: function(e, a) {
 422 |             return this.create(this.JY1x, e, a)
 423 |         },
 424 |         createDecryptor: function(e, a) {
 425 |             return this.create(this.bqV9M, e, a)
 426 |         },
 427 |         init: function(e, a, b) {
 428 |             this.cfg = this.cfg.extend(b);
 429 |             this.Qq2x = e;
 430 |             this.L0x = a;
 431 |             this.reset()
 432 |         },
 433 |         reset: function() {
 434 |             t.reset.call(this);
 435 |             this.lt3x()
 436 |         },
 437 |         process: function(e) {
 438 |             this.vb6V(e);
 439 |             return this.kY3x()
 440 |         },
 441 |         finalize: function(e) {
 442 |             e && this.vb6V(e);
 443 |             return this.mA4E()
 444 |         },
 445 |         keySize: 4,
 446 |         ivSize: 4,
 447 |         JY1x: 1,
 448 |         bqV9M: 2,
 449 |         lS3x: function(e) {
 450 |             return {
 451 |                 encrypt: function(b, k, d) {
 452 |                     return ("string" == typeof k ? c : a).encrypt(e, b, k, d)
 453 |                 },
 454 |                 decrypt: function(b, k, d) {
 455 |                     return ("string" == typeof k ? c : a).decrypt(e, b, k, d)
 456 |                 }
 457 |             }
 458 |         }
 459 |     });
 460 |     d.StreamCipher = v.extend({
 461 |         mA4E: function() {
 462 |             return this.kY3x(!0)
 463 |         },
 464 |         blockSize: 1
 465 |     });
 466 |     var b = p.mode = {}
 467 |       , x = function(e, a, b) {
 468 |         var c = this.tw5B;
 469 |         c ? this.tw5B = u : c = this.DB9s;
 470 |         for (var d = 0; d < b; d++)
 471 |             e[a + d] ^= c[d]
 472 |     }
 473 |       , q = (d.BlockCipherMode = l.extend({
 474 |         createEncryptor: function(e, a) {
 475 |             return this.Encryptor.create(e, a)
 476 |         },
 477 |         createDecryptor: function(e, a) {
 478 |             return this.Decryptor.create(e, a)
 479 |         },
 480 |         init: function(e, a) {
 481 |             this.vw6q = e;
 482 |             this.tw5B = a
 483 |         }
 484 |     })).extend();
 485 |     q.Encryptor = q.extend({
 486 |         processBlock: function(e, a) {
 487 |             var b = this.vw6q
 488 |               , c = b.blockSize;
 489 |             x.call(this, e, a, c);
 490 |             b.encryptBlock(e, a);
 491 |             this.DB9s = e.slice(a, a + c)
 492 |         }
 493 |     });
 494 |     q.Decryptor = q.extend({
 495 |         processBlock: function(e, a) {
 496 |             var b = this.vw6q
 497 |               , c = b.blockSize
 498 |               , d = e.slice(a, a + c);
 499 |             b.decryptBlock(e, a);
 500 |             x.call(this, e, a, c);
 501 |             this.DB9s = d
 502 |         }
 503 |     });
 504 |     b = b.CBC = q;
 505 |     q = (p.pad = {}).Pkcs7 = {
 506 |         pad: function(a, b) {
 507 |             for (var c = 4 * b, c = c - a.sigBytes % c, d = c << 24 | c << 16 | c << 8 | c, l = [], n = 0; n < c; n += 4)
 508 |                 l.push(d);
 509 |             c = s.create(l, c);
 510 |             a.concat(c)
 511 |         },
 512 |         unpad: function(a) {
 513 |             a.sigBytes -= a.words[a.sigBytes - 1 >>> 2] & 255
 514 |         }
 515 |     };
 516 |     d.BlockCipher = v.extend({
 517 |         cfg: v.cfg.extend({
 518 |             mode: b,
 519 |             padding: q
 520 |         }),
 521 |         reset: function() {
 522 |             v.reset.call(this);
 523 |             var a = this.cfg
 524 |               , b = a.iv
 525 |               , a = a.mode;
 526 |             if (this.Qq2x == this.JY1x)
 527 |                 var c = a.createEncryptor;
 528 |             else
 529 |                 c = a.createDecryptor,
 530 |                 this.JP1x = 1;
 531 |             this.eT2x = c.call(a, this, b && b.words)
 532 |         },
 533 |         qL5Q: function(a, b) {
 534 |             this.eT2x.processBlock(a, b)
 535 |         },
 536 |         mA4E: function() {
 537 |             var a = this.cfg.padding;
 538 |             if (this.Qq2x == this.JY1x) {
 539 |                 a.pad(this.i9b, this.blockSize);
 540 |                 var b = this.kY3x(!0)
 541 |             } else
 542 |                 b = this.kY3x(!0),
 543 |                 a.unpad(b);
 544 |             return b
 545 |         },
 546 |         blockSize: 4
 547 |     });
 548 |     var n = d.CipherParams = l.extend({
 549 |         init: function(a) {
 550 |             this.mixIn(a)
 551 |         },
 552 |         toString: function(a) {
 553 |             return (a || this.formatter).stringify(this)
 554 |         }
 555 |     })
 556 |       , b = (p.format = {}).OpenSSL = {
 557 |         stringify: function(a) {
 558 |             var b = a.ciphertext;
 559 |             a = a.salt;
 560 |             return (a ? s.create([1398893684, 1701076831]).concat(a).concat(b) : b).toString(r)
 561 |         },
 562 |         parse: function(a) {
 563 |             a = r.parse(a);
 564 |             var b = a.words;
 565 |             if (1398893684 == b[0] && 1701076831 == b[1]) {
 566 |                 var c = s.create(b.slice(2, 4));
 567 |                 b.splice(0, 4);
 568 |                 a.sigBytes -= 16
 569 |             }
 570 |             return n.create({
 571 |                 ciphertext: a,
 572 |                 salt: c
 573 |             })
 574 |         }
 575 |     }
 576 |       , a = d.SerializableCipher = l.extend({
 577 |         cfg: l.extend({
 578 |             format: b
 579 |         }),
 580 |         encrypt: function(a, b, c, d) {
 581 |             d = this.cfg.extend(d);
 582 |             var l = a.createEncryptor(c, d);
 583 |             b = l.finalize(b);
 584 |             l = l.cfg;
 585 |             return n.create({
 586 |                 ciphertext: b,
 587 |                 key: c,
 588 |                 iv: l.iv,
 589 |                 algorithm: a,
 590 |                 mode: l.mode,
 591 |                 padding: l.padding,
 592 |                 blockSize: a.blockSize,
 593 |                 formatter: d.format
 594 |             })
 595 |         },
 596 |         decrypt: function(a, b, c, d) {
 597 |             d = this.cfg.extend(d);
 598 |             b = this.Hj0x(b, d.format);
 599 |             return a.createDecryptor(c, d).finalize(b.ciphertext)
 600 |         },
 601 |         Hj0x: function(a, b) {
 602 |             return "string" == typeof a ? b.parse(a, this) : a
 603 |         }
 604 |     })
 605 |       , p = (p.kdf = {}).OpenSSL = {
 606 |         execute: function(a, b, c, d) {
 607 |             d || (d = s.random(8));
 608 |             a = w.create({
 609 |                 keySize: b + c
 610 |             }).compute(a, d);
 611 |             c = s.create(a.words.slice(b), 4 * c);
 612 |             a.sigBytes = 4 * b;
 613 |             return n.create({
 614 |                 key: a,
 615 |                 iv: c,
 616 |                 salt: d
 617 |             })
 618 |         }
 619 |     }
 620 |       , c = d.PasswordBasedCipher = a.extend({
 621 |         cfg: a.cfg.extend({
 622 |             kdf: p
 623 |         }),
 624 |         encrypt: function(b, c, d, l) {
 625 |             l = this.cfg.extend(l);
 626 |             d = l.kdf.execute(d, b.keySize, b.ivSize);
 627 |             l.iv = d.iv;
 628 |             b = a.encrypt.call(this, b, c, d.key, l);
 629 |             b.mixIn(d);
 630 |             return b
 631 |         },
 632 |         decrypt: function(b, c, d, l) {
 633 |             l = this.cfg.extend(l);
 634 |             c = this.Hj0x(c, l.format);
 635 |             d = l.kdf.execute(d, b.keySize, b.ivSize, c.salt);
 636 |             l.iv = d.iv;
 637 |             return a.decrypt.call(this, b, c, d.key, l)
 638 |         }
 639 |     })
 640 | }();
 641 | (function() {
 642 |     for (var u = CryptoJS, p = u.lib.BlockCipher, d = u.algo, l = [], s = [], t = [], r = [], w = [], v = [], b = [], x = [], q = [], n = [], a = [], c = 0; 256 > c; c++)
 643 |         a[c] = 128 > c ? c << 1 : c << 1 ^ 283;
 644 |     for (var e = 0, j = 0, c = 0; 256 > c; c++) {
 645 |         var k = j ^ j << 1 ^ j << 2 ^ j << 3 ^ j << 4
 646 |           , k = k >>> 8 ^ k & 255 ^ 99;
 647 |         l[e] = k;
 648 |         s[k] = e;
 649 |         var z = a[e]
 650 |           , F = a[z]
 651 |           , G = a[F]
 652 |           , y = 257 * a[k] ^ 16843008 * k;
 653 |         t[e] = y << 24 | y >>> 8;
 654 |         r[e] = y << 16 | y >>> 16;
 655 |         w[e] = y << 8 | y >>> 24;
 656 |         v[e] = y;
 657 |         y = 16843009 * G ^ 65537 * F ^ 257 * z ^ 16843008 * e;
 658 |         b[k] = y << 24 | y >>> 8;
 659 |         x[k] = y << 16 | y >>> 16;
 660 |         q[k] = y << 8 | y >>> 24;
 661 |         n[k] = y;
 662 |         e ? (e = z ^ a[a[a[G ^ z]]],
 663 |         j ^= a[a[j]]) : e = j = 1
 664 |     }
 665 |     var H = [0, 1, 2, 4, 8, 16, 32, 64, 128, 27, 54]
 666 |       , d = d.AES = p.extend({
 667 |         lt3x: function() {
 668 |             for (var a = this.L0x, c = a.words, d = a.sigBytes / 4, a = 4 * ((this.beT6N = d + 6) + 1), e = this.bqT9K = [], j = 0; j < a; j++)
 669 |                 if (j < d)
 670 |                     e[j] = c[j];
 671 |                 else {
 672 |                     var k = e[j - 1];
 673 |                     j % d ? 6 < d && 4 == j % d && (k = l[k >>> 24] << 24 | l[k >>> 16 & 255] << 16 | l[k >>> 8 & 255] << 8 | l[k & 255]) : (k = k << 8 | k >>> 24,
 674 |                     k = l[k >>> 24] << 24 | l[k >>> 16 & 255] << 16 | l[k >>> 8 & 255] << 8 | l[k & 255],
 675 |                     k ^= H[j / d | 0] << 24);
 676 |                     e[j] = e[j - d] ^ k
 677 |                 }
 678 |             c = this.bqS9J = [];
 679 |             for (d = 0; d < a; d++)
 680 |                 j = a - d,
 681 |                 k = d % 4 ? e[j] : e[j - 4],
 682 |                 c[d] = 4 > d || 4 >= j ? k : b[l[k >>> 24]] ^ x[l[k >>> 16 & 255]] ^ q[l[k >>> 8 & 255]] ^ n[l[k & 255]]
 683 |         },
 684 |         encryptBlock: function(a, b) {
 685 |             this.DA9r(a, b, this.bqT9K, t, r, w, v, l)
 686 |         },
 687 |         decryptBlock: function(a, c) {
 688 |             var d = a[c + 1];
 689 |             a[c + 1] = a[c + 3];
 690 |             a[c + 3] = d;
 691 |             this.DA9r(a, c, this.bqS9J, b, x, q, n, s);
 692 |             d = a[c + 1];
 693 |             a[c + 1] = a[c + 3];
 694 |             a[c + 3] = d
 695 |         },
 696 |         DA9r: function(a, b, c, d, e, j, l, f) {
 697 |             for (var m = this.beT6N, g = a[b] ^ c[0], h = a[b + 1] ^ c[1], k = a[b + 2] ^ c[2], n = a[b + 3] ^ c[3], p = 4, r = 1; r < m; r++)
 698 |                 var q = d[g >>> 24] ^ e[h >>> 16 & 255] ^ j[k >>> 8 & 255] ^ l[n & 255] ^ c[p++]
 699 |                   , s = d[h >>> 24] ^ e[k >>> 16 & 255] ^ j[n >>> 8 & 255] ^ l[g & 255] ^ c[p++]
 700 |                   , t = d[k >>> 24] ^ e[n >>> 16 & 255] ^ j[g >>> 8 & 255] ^ l[h & 255] ^ c[p++]
 701 |                   , n = d[n >>> 24] ^ e[g >>> 16 & 255] ^ j[h >>> 8 & 255] ^ l[k & 255] ^ c[p++]
 702 |                   , g = q
 703 |                   , h = s
 704 |                   , k = t;
 705 |             q = (f[g >>> 24] << 24 | f[h >>> 16 & 255] << 16 | f[k >>> 8 & 255] << 8 | f[n & 255]) ^ c[p++];
 706 |             s = (f[h >>> 24] << 24 | f[k >>> 16 & 255] << 16 | f[n >>> 8 & 255] << 8 | f[g & 255]) ^ c[p++];
 707 |             t = (f[k >>> 24] << 24 | f[n >>> 16 & 255] << 16 | f[g >>> 8 & 255] << 8 | f[h & 255]) ^ c[p++];
 708 |             n = (f[n >>> 24] << 24 | f[g >>> 16 & 255] << 16 | f[h >>> 8 & 255] << 8 | f[k & 255]) ^ c[p++];
 709 |             a[b] = q;
 710 |             a[b + 1] = s;
 711 |             a[b + 2] = t;
 712 |             a[b + 3] = n
 713 |         },
 714 |         keySize: 8
 715 |     });
 716 |     u.AES = p.lS3x(d)
 717 | }
 718 | )();
 719 | function RSAKeyPair(a, b, c) {
 720 |     this.e = biFromHex(a),
 721 |     this.d = biFromHex(b),
 722 |     this.m = biFromHex(c),
 723 |     this.chunkSize = 2 * biHighIndex(this.m),
 724 |     this.radix = 16,
 725 |     this.barrett = new BarrettMu(this.m)
 726 | }
 727 | function twoDigit(a) {
 728 |     return (10 > a ? "0" : "") + String(a)
 729 | }
 730 | function encryptedString(a, b) {
 731 |     for (var f, g, h, i, j, k, l, c = new Array, d = b.length, e = 0; d > e; )
 732 |         c[e] = b.charCodeAt(e),
 733 |         e++;
 734 |     for (; 0 != c.length % a.chunkSize; )
 735 |         c[e++] = 0;
 736 |     for (f = c.length,
 737 |     g = "",
 738 |     e = 0; f > e; e += a.chunkSize) {
 739 |         for (j = new BigInt,
 740 |         h = 0,
 741 |         i = e; i < e + a.chunkSize; ++h)
 742 |             j.digits[h] = c[i++],
 743 |             j.digits[h] += c[i++] << 8;
 744 |         k = a.barrett.powMod(j, a.e),
 745 |         l = 16 == a.radix ? biToHex(k) : biToString(k, a.radix),
 746 |         g += l + " "
 747 |     }
 748 |     return g.substring(0, g.length - 1)
 749 | }
 750 | function decryptedString(a, b) {
 751 |     var e, f, g, h, c = b.split(" "), d = "";
 752 |     for (e = 0; e < c.length; ++e)
 753 |         for (h = 16 == a.radix ? biFromHex(c[e]) : biFromString(c[e], a.radix),
 754 |         g = a.barrett.powMod(h, a.d),
 755 |         f = 0; f <= biHighIndex(g); ++f)
 756 |             d += String.fromCharCode(255 & g.digits[f], g.digits[f] >> 8);
 757 |     return 0 == d.charCodeAt(d.length - 1) && (d = d.substring(0, d.length - 1)),
 758 |     d
 759 | }
 760 | function setMaxDigits(a) {
 761 |     maxDigits = a,
 762 |     ZERO_ARRAY = new Array(maxDigits);
 763 |     for (var b = 0; b < ZERO_ARRAY.length; b++)
 764 |         ZERO_ARRAY[b] = 0;
 765 |     bigZero = new BigInt,
 766 |     bigOne = new BigInt,
 767 |     bigOne.digits[0] = 1
 768 | }
 769 | function BigInt(a) {
 770 |     this.digits = "boolean" == typeof a && 1 == a ? null : ZERO_ARRAY.slice(0),
 771 |     this.isNeg = !1
 772 | }
 773 | function biFromDecimal(a) {
 774 |     for (var d, e, f, b = "-" == a.charAt(0), c = b ? 1 : 0; c < a.length && "0" == a.charAt(c); )
 775 |         ++c;
 776 |     if (c == a.length)
 777 |         d = new BigInt;
 778 |     else {
 779 |         for (e = a.length - c,
 780 |         f = e % dpl10,
 781 |         0 == f && (f = dpl10),
 782 |         d = biFromNumber(Number(a.substr(c, f))),
 783 |         c += f; c < a.length; )
 784 |             d = biAdd(biMultiply(d, lr10), biFromNumber(Number(a.substr(c, dpl10)))),
 785 |             c += dpl10;
 786 |         d.isNeg = b
 787 |     }
 788 |     return d
 789 | }
 790 | function biCopy(a) {
 791 |     var b = new BigInt(!0);
 792 |     return b.digits = a.digits.slice(0),
 793 |     b.isNeg = a.isNeg,
 794 |     b
 795 | }
 796 | function biFromNumber(a) {
 797 |     var c, b = new BigInt;
 798 |     for (b.isNeg = 0 > a,
 799 |     a = Math.abs(a),
 800 |     c = 0; a > 0; )
 801 |         b.digits[c++] = a & maxDigitVal,
 802 |         a >>= biRadixBits;
 803 |     return b
 804 | }
 805 | function reverseStr(a) {
 806 |     var c, b = "";
 807 |     for (c = a.length - 1; c > -1; --c)
 808 |         b += a.charAt(c);
 809 |     return b
 810 | }
 811 | function biToString(a, b) {
 812 |     var d, e, c = new BigInt;
 813 |     for (c.digits[0] = b,
 814 |     d = biDivideModulo(a, c),
 815 |     e = hexatrigesimalToChar[d[1].digits[0]]; 1 == biCompare(d[0], bigZero); )
 816 |         d = biDivideModulo(d[0], c),
 817 |         digit = d[1].digits[0],
 818 |         e += hexatrigesimalToChar[d[1].digits[0]];
 819 |     return (a.isNeg ? "-" : "") + reverseStr(e)
 820 | }
 821 | function biToDecimal(a) {
 822 |     var c, d, b = new BigInt;
 823 |     for (b.digits[0] = 10,
 824 |     c = biDivideModulo(a, b),
 825 |     d = String(c[1].digits[0]); 1 == biCompare(c[0], bigZero); )
 826 |         c = biDivideModulo(c[0], b),
 827 |         d += String(c[1].digits[0]);
 828 |     return (a.isNeg ? "-" : "") + reverseStr(d)
 829 | }
 830 | function digitToHex(a) {
 831 |     var b = 15
 832 |       , c = "";
 833 |     for (i = 0; 4 > i; ++i)
 834 |         c += hexToChar[a & b],
 835 |         a >>>= 4;
 836 |     return reverseStr(c)
 837 | }
 838 | function biToHex(a) {
 839 |     var d, b = "";
 840 |     for (biHighIndex(a),
 841 |     d = biHighIndex(a); d > -1; --d)
 842 |         b += digitToHex(a.digits[d]);
 843 |     return b
 844 | }
 845 | function charToHex(a) {
 846 |     var h, b = 48, c = b + 9, d = 97, e = d + 25, f = 65, g = 90;
 847 |     return h = a >= b && c >= a ? a - b : a >= f && g >= a ? 10 + a - f : a >= d && e >= a ? 10 + a - d : 0
 848 | }
 849 | function hexToDigit(a) {
 850 |     var d, b = 0, c = Math.min(a.length, 4);
 851 |     for (d = 0; c > d; ++d)
 852 |         b <<= 4,
 853 |         b |= charToHex(a.charCodeAt(d));
 854 |     return b
 855 | }
 856 | function biFromHex(a) {
 857 |     var d, e, b = new BigInt, c = a.length;
 858 |     for (d = c,
 859 |     e = 0; d > 0; d -= 4,
 860 |     ++e)
 861 |         b.digits[e] = hexToDigit(a.substr(Math.max(d - 4, 0), Math.min(d, 4)));
 862 |     return b
 863 | }
 864 | function biFromString(a, b) {
 865 |     var g, h, i, j, c = "-" == a.charAt(0), d = c ? 1 : 0, e = new BigInt, f = new BigInt;
 866 |     for (f.digits[0] = 1,
 867 |     g = a.length - 1; g >= d; g--)
 868 |         h = a.charCodeAt(g),
 869 |         i = charToHex(h),
 870 |         j = biMultiplyDigit(f, i),
 871 |         e = biAdd(e, j),
 872 |         f = biMultiplyDigit(f, b);
 873 |     return e.isNeg = c,
 874 |     e
 875 | }
 876 | function biDump(a) {
 877 |     return (a.isNeg ? "-" : "") + a.digits.join(" ")
 878 | }
 879 | function biAdd(a, b) {
 880 |     var c, d, e, f;
 881 |     if (a.isNeg != b.isNeg)
 882 |         b.isNeg = !b.isNeg,
 883 |         c = biSubtract(a, b),
 884 |         b.isNeg = !b.isNeg;
 885 |     else {
 886 |         for (c = new BigInt,
 887 |         d = 0,
 888 |         f = 0; f < a.digits.length; ++f)
 889 |             e = a.digits[f] + b.digits[f] + d,
 890 |             c.digits[f] = 65535 & e,
 891 |             d = Number(e >= biRadix);
 892 |         c.isNeg = a.isNeg
 893 |     }
 894 |     return c
 895 | }
 896 | function biSubtract(a, b) {
 897 |     var c, d, e, f;
 898 |     if (a.isNeg != b.isNeg)
 899 |         b.isNeg = !b.isNeg,
 900 |         c = biAdd(a, b),
 901 |         b.isNeg = !b.isNeg;
 902 |     else {
 903 |         for (c = new BigInt,
 904 |         e = 0,
 905 |         f = 0; f < a.digits.length; ++f)
 906 |             d = a.digits[f] - b.digits[f] + e,
 907 |             c.digits[f] = 65535 & d,
 908 |             c.digits[f] < 0 && (c.digits[f] += biRadix),
 909 |             e = 0 - Number(0 > d);
 910 |         if (-1 == e) {
 911 |             for (e = 0,
 912 |             f = 0; f < a.digits.length; ++f)
 913 |                 d = 0 - c.digits[f] + e,
 914 |                 c.digits[f] = 65535 & d,
 915 |                 c.digits[f] < 0 && (c.digits[f] += biRadix),
 916 |                 e = 0 - Number(0 > d);
 917 |             c.isNeg = !a.isNeg
 918 |         } else
 919 |             c.isNeg = a.isNeg
 920 |     }
 921 |     return c
 922 | }
 923 | function biHighIndex(a) {
 924 |     for (var b = a.digits.length - 1; b > 0 && 0 == a.digits[b]; )
 925 |         --b;
 926 |     return b
 927 | }
 928 | function biNumBits(a) {
 929 |     var e, b = biHighIndex(a), c = a.digits[b], d = (b + 1) * bitsPerDigit;
 930 |     for (e = d; e > d - bitsPerDigit && 0 == (32768 & c); --e)
 931 |         c <<= 1;
 932 |     return e
 933 | }
 934 | function biMultiply(a, b) {
 935 |     var d, h, i, k, c = new BigInt, e = biHighIndex(a), f = biHighIndex(b);
 936 |     for (k = 0; f >= k; ++k) {
 937 |         for (d = 0,
 938 |         i = k,
 939 |         j = 0; e >= j; ++j,
 940 |         ++i)
 941 |             h = c.digits[i] + a.digits[j] * b.digits[k] + d,
 942 |             c.digits[i] = h & maxDigitVal,
 943 |             d = h >>> biRadixBits;
 944 |         c.digits[k + e + 1] = d
 945 |     }
 946 |     return c.isNeg = a.isNeg != b.isNeg,
 947 |     c
 948 | }
 949 | function biMultiplyDigit(a, b) {
 950 |     var c, d, e, f;
 951 |     for (result = new BigInt,
 952 |     c = biHighIndex(a),
 953 |     d = 0,
 954 |     f = 0; c >= f; ++f)
 955 |         e = result.digits[f] + a.digits[f] * b + d,
 956 |         result.digits[f] = e & maxDigitVal,
 957 |         d = e >>> biRadixBits;
 958 |     return result.digits[1 + c] = d,
 959 |     result
 960 | }
 961 | function arrayCopy(a, b, c, d, e) {
 962 |     var g, h, f = Math.min(b + e, a.length);
 963 |     for (g = b,
 964 |     h = d; f > g; ++g,
 965 |     ++h)
 966 |         c[h] = a[g]
 967 | }
 968 | function biShiftLeft(a, b) {
 969 |     var e, f, g, h, c = Math.floor(b / bitsPerDigit), d = new BigInt;
 970 |     for (arrayCopy(a.digits, 0, d.digits, c, d.digits.length - c),
 971 |     e = b % bitsPerDigit,
 972 |     f = bitsPerDigit - e,
 973 |     g = d.digits.length - 1,
 974 |     h = g - 1; g > 0; --g,
 975 |     --h)
 976 |         d.digits[g] = d.digits[g] << e & maxDigitVal | (d.digits[h] & highBitMasks[e]) >>> f;
 977 |     return d.digits[0] = d.digits[g] << e & maxDigitVal,
 978 |     d.isNeg = a.isNeg,
 979 |     d
 980 | }
 981 | function biShiftRight(a, b) {
 982 |     var e, f, g, h, c = Math.floor(b / bitsPerDigit), d = new BigInt;
 983 |     for (arrayCopy(a.digits, c, d.digits, 0, a.digits.length - c),
 984 |     e = b % bitsPerDigit,
 985 |     f = bitsPerDigit - e,
 986 |     g = 0,
 987 |     h = g + 1; g < d.digits.length - 1; ++g,
 988 |     ++h)
 989 |         d.digits[g] = d.digits[g] >>> e | (d.digits[h] & lowBitMasks[e]) << f;
 990 |     return d.digits[d.digits.length - 1] >>>= e,
 991 |     d.isNeg = a.isNeg,
 992 |     d
 993 | }
 994 | function biMultiplyByRadixPower(a, b) {
 995 |     var c = new BigInt;
 996 |     return arrayCopy(a.digits, 0, c.digits, b, c.digits.length - b),
 997 |     c
 998 | }
 999 | function biDivideByRadixPower(a, b) {
1000 |     var c = new BigInt;
1001 |     return arrayCopy(a.digits, b, c.digits, 0, c.digits.length - b),
1002 |     c
1003 | }
1004 | function biModuloByRadixPower(a, b) {
1005 |     var c = new BigInt;
1006 |     return arrayCopy(a.digits, 0, c.digits, 0, b),
1007 |     c
1008 | }
1009 | function biCompare(a, b) {
1010 |     if (a.isNeg != b.isNeg)
1011 |         return 1 - 2 * Number(a.isNeg);
1012 |     for (var c = a.digits.length - 1; c >= 0; --c)
1013 |         if (a.digits[c] != b.digits[c])
1014 |             return a.isNeg ? 1 - 2 * Number(a.digits[c] > b.digits[c]) : 1 - 2 * Number(a.digits[c] < b.digits[c]);
1015 |     return 0
1016 | }
1017 | function biDivideModulo(a, b) {
1018 |     var f, g, h, i, j, k, l, m, n, o, p, q, r, s, c = biNumBits(a), d = biNumBits(b), e = b.isNeg;
1019 |     if (d > c)
1020 |         return a.isNeg ? (f = biCopy(bigOne),
1021 |         f.isNeg = !b.isNeg,
1022 |         a.isNeg = !1,
1023 |         b.isNeg = !1,
1024 |         g = biSubtract(b, a),
1025 |         a.isNeg = !0,
1026 |         b.isNeg = e) : (f = new BigInt,
1027 |         g = biCopy(a)),
1028 |         new Array(f,g);
1029 |     for (f = new BigInt,
1030 |     g = a,
1031 |     h = Math.ceil(d / bitsPerDigit) - 1,
1032 |     i = 0; b.digits[h] < biHalfRadix; )
1033 |         b = biShiftLeft(b, 1),
1034 |         ++i,
1035 |         ++d,
1036 |         h = Math.ceil(d / bitsPerDigit) - 1;
1037 |     for (g = biShiftLeft(g, i),
1038 |     c += i,
1039 |     j = Math.ceil(c / bitsPerDigit) - 1,
1040 |     k = biMultiplyByRadixPower(b, j - h); -1 != biCompare(g, k); )
1041 |         ++f.digits[j - h],
1042 |         g = biSubtract(g, k);
1043 |     for (l = j; l > h; --l) {
1044 |         for (m = l >= g.digits.length ? 0 : g.digits[l],
1045 |         n = l - 1 >= g.digits.length ? 0 : g.digits[l - 1],
1046 |         o = l - 2 >= g.digits.length ? 0 : g.digits[l - 2],
1047 |         p = h >= b.digits.length ? 0 : b.digits[h],
1048 |         q = h - 1 >= b.digits.length ? 0 : b.digits[h - 1],
1049 |         f.digits[l - h - 1] = m == p ? maxDigitVal : Math.floor((m * biRadix + n) / p),
1050 |         r = f.digits[l - h - 1] * (p * biRadix + q),
1051 |         s = m * biRadixSquared + (n * biRadix + o); r > s; )
1052 |             --f.digits[l - h - 1],
1053 |             r = f.digits[l - h - 1] * (p * biRadix | q),
1054 |             s = m * biRadix * biRadix + (n * biRadix + o);
1055 |         k = biMultiplyByRadixPower(b, l - h - 1),
1056 |         g = biSubtract(g, biMultiplyDigit(k, f.digits[l - h - 1])),
1057 |         g.isNeg && (g = biAdd(g, k),
1058 |         --f.digits[l - h - 1])
1059 |     }
1060 |     return g = biShiftRight(g, i),
1061 |     f.isNeg = a.isNeg != e,
1062 |     a.isNeg && (f = e ? biAdd(f, bigOne) : biSubtract(f, bigOne),
1063 |     b = biShiftRight(b, i),
1064 |     g = biSubtract(b, g)),
1065 |     0 == g.digits[0] && 0 == biHighIndex(g) && (g.isNeg = !1),
1066 |     new Array(f,g)
1067 | }
1068 | function biDivide(a, b) {
1069 |     return biDivideModulo(a, b)[0]
1070 | }
1071 | function biModulo(a, b) {
1072 |     return biDivideModulo(a, b)[1]
1073 | }
1074 | function biMultiplyMod(a, b, c) {
1075 |     return biModulo(biMultiply(a, b), c)
1076 | }
1077 | function biPow(a, b) {
1078 |     for (var c = bigOne, d = a; ; ) {
1079 |         if (0 != (1 & b) && (c = biMultiply(c, d)),
1080 |         b >>= 1,
1081 |         0 == b)
1082 |             break;
1083 |         d = biMultiply(d, d)
1084 |     }
1085 |     return c
1086 | }
1087 | function biPowMod(a, b, c) {
1088 |     for (var d = bigOne, e = a, f = b; ; ) {
1089 |         if (0 != (1 & f.digits[0]) && (d = biMultiplyMod(d, e, c)),
1090 |         f = biShiftRight(f, 1),
1091 |         0 == f.digits[0] && 0 == biHighIndex(f))
1092 |             break;
1093 |         e = biMultiplyMod(e, e, c)
1094 |     }
1095 |     return d
1096 | }
1097 | function BarrettMu(a) {
1098 |     this.modulus = biCopy(a),
1099 |     this.k = biHighIndex(this.modulus) + 1;
1100 |     var b = new BigInt;
1101 |     b.digits[2 * this.k] = 1,
1102 |     this.mu = biDivide(b, this.modulus),
1103 |     this.bkplus1 = new BigInt,
1104 |     this.bkplus1.digits[this.k + 1] = 1,
1105 |     this.modulo = BarrettMu_modulo,
1106 |     this.multiplyMod = BarrettMu_multiplyMod,
1107 |     this.powMod = BarrettMu_powMod
1108 | }
1109 | function BarrettMu_modulo(a) {
1110 |     var i, b = biDivideByRadixPower(a, this.k - 1), c = biMultiply(b, this.mu), d = biDivideByRadixPower(c, this.k + 1), e = biModuloByRadixPower(a, this.k + 1), f = biMultiply(d, this.modulus), g = biModuloByRadixPower(f, this.k + 1), h = biSubtract(e, g);
1111 |     for (h.isNeg && (h = biAdd(h, this.bkplus1)),
1112 |     i = biCompare(h, this.modulus) >= 0; i; )
1113 |         h = biSubtract(h, this.modulus),
1114 |         i = biCompare(h, this.modulus) >= 0;
1115 |     return h
1116 | }
1117 | function BarrettMu_multiplyMod(a, b) {
1118 |     var c = biMultiply(a, b);
1119 |     return this.modulo(c)
1120 | }
1121 | function BarrettMu_powMod(a, b) {
1122 |     var d, e, c = new BigInt;
1123 |     for (c.digits[0] = 1,
1124 |     d = a,
1125 |     e = b; ; ) {
1126 |         if (0 != (1 & e.digits[0]) && (c = this.multiplyMod(c, d)),
1127 |         e = biShiftRight(e, 1),
1128 |         0 == e.digits[0] && 0 == biHighIndex(e))
1129 |             break;
1130 |         d = this.multiplyMod(d, d)
1131 |     }
1132 |     return c
1133 | }
1134 | var maxDigits, ZERO_ARRAY, bigZero, bigOne, dpl10, lr10, hexatrigesimalToChar, hexToChar, highBitMasks, lowBitMasks, biRadixBase = 2, biRadixBits = 16, bitsPerDigit = biRadixBits, biRadix = 65536, biHalfRadix = biRadix >>> 1, biRadixSquared = biRadix * biRadix, maxDigitVal = biRadix - 1, maxInteger = 9999999999999998;
1135 | setMaxDigits(20),
1136 | dpl10 = 15,
1137 | lr10 = biFromNumber(1e15),
1138 | hexatrigesimalToChar = new Array("0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z"),
1139 | hexToChar = new Array("0","1","2","3","4","5","6","7","8","9","a","b","c","d","e","f"),
1140 | highBitMasks = new Array(0,32768,49152,57344,61440,63488,64512,65024,65280,65408,65472,65504,65520,65528,65532,65534,65535),
1141 | lowBitMasks = new Array(0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535);
1142 | 
1143 | 
1144 | 
1145 | !function() {
1146 |     function a(a) {
1147 |         var d, e, b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789", c = "";
1148 |         for (d = 0; a > d; d += 1)
1149 |             e = Math.random() * b.length,
1150 |             e = Math.floor(e),
1151 |             c += b.charAt(e);
1152 |         return c
1153 |     }
1154 |     function b(a, b) {
1155 |         var c = CryptoJS.enc.Utf8.parse(b)
1156 |           , d = CryptoJS.enc.Utf8.parse("0102030405060708")
1157 |           , e = CryptoJS.enc.Utf8.parse(a)
1158 |           , f = CryptoJS.AES.encrypt(e, c, {
1159 |             iv: d,
1160 |             mode: CryptoJS.mode.CBC
1161 |         });
1162 |         return f.toString()
1163 |     }
1164 |     function c(a, b, c) {
1165 |         var d, e;
1166 |         return setMaxDigits(131),
1167 |         d = new RSAKeyPair(b,"",c),
1168 |         e = encryptedString(d, a)
1169 |     }
1170 |     function d(d, e, f, g) {
1171 |         var h = {}
1172 |           , i = a(16);
1173 |         return h.encText = b(d, g),
1174 |         h.encText = b(h.encText, i),
1175 |         h.encSecKey = c(i, e, f),
1176 |         h
1177 |     }
1178 |     function e(a, b, d, e) {
1179 |         var f = {};
1180 |         return f.encText = c(a + e, b, d),
1181 |         f
1182 |     }
1183 |     asrsea = d,
1184 |     ecnonasr = e
1185 | }();
1186 | 
1187 | // 这个函数是启动函数，接收一个歌曲ID。获取到对应的加密参数
1188 | function start(music_id) {
1189 |     var i9b = {
1190 |         "rid":"R_SO_4_" + music_id,
1191 |         // 偏移量。可理解为初始下标
1192 |         "offset": 0,
1193 |         "total":"false",
1194 |         // 每页的请求数量
1195 |         "limit": 100,
1196 |         "csrf_token":""
1197 |     };
1198 |     var bYf7Y = asrsea(JSON.stringify(i9b), "010001", "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7", "0CoJUm6Qyw8W8jud");
1199 |     return bYf7Y;
1200 | }
1201 | 
1202 | 


--------------------------------------------------------------------------------
/music163/Music.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: monkey-hjy
 3 | # @Date:   2021-02-24 17:42:40
 4 | # @Last Modified by:   monkey-hjy
 5 | # @Last Modified time: 2021-02-25 10:49:45
 6 | import requests
 7 | import execjs
 8 | import json
 9 | 
10 | 
11 | class Music(object):
12 |     """破解网易云音乐JS加密获取数据"""
13 | 
14 |     def __init__(self):
15 |         self.get_comment_url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_{}?csrf_token='
16 | 
17 |     @staticmethod
18 |     def get_response(method=None, url=None, headers=None, data=None):
19 |         """
20 |         发起请求
21 |         :params: method 请求类型：GET/POST
22 |         :params: url 请求链接
23 |         :params: headers 请求头
24 |         :params: data post请求的表单
25 |         """
26 |         if method is None:
27 |             return '请求参数有误 -- method is None'
28 |         if url is None:
29 |             return '请求链接有误 --- url is None'
30 |         if headers is None:
31 |             headers = {
32 |                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
33 |                               "Chrome/88.0.4324.182 Safari/537.36",
34 |             }
35 |         if method == 'GET':
36 |             response = requests.get(url=url, headers=headers)
37 |         elif method == 'POST':
38 |             response = requests.post(url=url, headers=headers, data=data)
39 |         else:
40 |             return '请求参数有误 -- method undefined'
41 |         response.encoding = 'utf8'
42 |         if response.status_code == 200:
43 |             return response
44 |         else:
45 |             return '请求失败。状态码 %d' % response.status_code
46 | 
47 |     @staticmethod
48 |     def get_token(music_id):
49 |         """
50 |         根据歌曲ID获取到对应的加密参数
51 |         :param music_id: 需要抓取的歌曲ID
52 |         """
53 |         js_file = open('Music.js', encoding='utf8').read()
54 |         ctx = execjs.compile(js_file, cwd=r'C:\Users\Spider\AppData\Roaming\npm\node_modules')
55 |         token = ctx.call('start', music_id)
56 |         return {
57 |             'params': token['encText'],
58 |             'encSecKey': token['encSecKey']
59 |         }
60 | 
61 |     def get_comment(self, music_id):
62 |         """
63 |         获取评论数据
64 |         :params music_id 歌曲id
65 |         """
66 |         comment_response = self.get_response(method='POST', url=self.get_comment_url.format(music_id),
67 |                                              data=self.get_token(music_id=music_id)).json()
68 |         # 解析这个json串，即可获取到对应的数据
69 |         print(json.dumps(comment_response))
70 | 
71 |     def run(self):
72 |         """启动函数"""
73 |         test_music_id = 1366216050
74 |         self.get_comment(music_id=test_music_id)
75 | 
76 | 
77 | if __name__ == '__main__':
78 |     m = Music()
79 |     m.run()
80 | 


--------------------------------------------------------------------------------
/qcc/qcc.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- encoding: utf-8 -*-
 3 | # @File    :   qcc.py    
 4 | # @Author  :   Monkey
 5 | # @DATE    :   2021/5/11 下午5:13 
 6 | 
 7 | import requests
 8 | import re
 9 | from lxml import etree
10 | 
11 | 
12 | class QCC(object):
13 |     """企查查爬虫"""
14 | 
15 |     def __init__(self):
16 |         self._headers = {
17 |             'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
18 |         }
19 | 
20 |     def get_cookie(self):
21 |         """发起一次测试请求，获取到搜索的cookie"""
22 |         url = 'https://www.qcc.com/web/search?key=测试'
23 |         response = requests.get(url, headers=self._headers, allow_redirects=False)
24 |         response.encoding = 'utf8'
25 |         result = re.findall(r'div>您的请求ID是: <strong>\n(.*?)</strong></div>',  response.text)
26 |         if result:
27 |             return result[0]
28 | 
29 |     def search(self, search_keyword):
30 |         """搜索"""
31 |         url = 'https://www.qcc.com/web/search?key={}'.format(search_keyword)
32 |         headers = self._headers
33 |         headers['cookie'] = 'acw_tc={}'.format(self.get_cookie())
34 |         response = requests.get(url, headers=headers)
35 |         response.encoding = 'utf8'
36 |         html = etree.HTML(response.text)
37 |         com_url = html.xpath('//a[@class="title"]/@href')
38 |         print('搜索到{}条结果。即将开始获取详细信息...'.format(len(com_url)))
39 |         for url in com_url:
40 |             self.get_com_info(url)
41 | 
42 |     def get_com_info(self, url):
43 |         """获取公司的详细信息"""
44 |         response = requests.get(url, headers=self._headers)
45 |         html = etree.HTML(response.text)
46 |         info_elements = html.xpath('//table[@class="ntable"]/tr')
47 |         item = {'url': url}
48 |         flag = True
49 |         for element in info_elements:
50 |             if not flag:
51 |                 break
52 |             for index in range(0, len(element.xpath('./td')), 2):
53 |                 try:
54 |                     key = element.xpath('./td[{}]/text()'.format(index+1))[0].strip()
55 |                     if key == '公司介绍：' or key == '经营范围':
56 |                         flag = False
57 |                     if key == '法定代表人':
58 |                         item[key] = element.xpath('./td[{}]//h2/text()'.format(index+2))[0].strip()
59 |                     else:
60 |                         item[key] = element.xpath('./td[{}]//text()'.format(index+2))[0].strip()
61 |                 except:
62 |                     pass
63 |         print(item)
64 | 
65 |     def run(self):
66 |         """启动函数"""
67 |         self.search(search_keyword='腾讯')
68 | 
69 | 
70 | if __name__ == '__main__':
71 |     t = QCC()
72 |     t.run()
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa1/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/13 18:10
 4 | # Auth: HJY
 5 | 
 6 | """Ajax请求返回数据"""
 7 | 
 8 | import requests
 9 | from loguru import logger
10 | 
11 | url = 'https://spa1.scrape.center/api/movie/?limit=100&offset=0'
12 | response = requests.get(url).json()
13 | for info in response['results']:
14 |     logger.info(f'name: {info["name"]}, published_at: {info["published_at"]}, score: {info["score"]}')
15 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa14/Wasm.wasm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/monkey-hjy/python-spider/c65088d79b14643600bbae2796142b7a2384bf5c/scrapeCenter/spa14/Wasm.wasm


--------------------------------------------------------------------------------
/scrapeCenter/spa14/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/14 11:35
 4 | # Auth: HJY
 5 | 
 6 | """
 7 | wasm加密
 8 | e = this.$wasm.asm.encrypt(offset, time);
 9 | """
10 | 
11 | import requests
12 | import pywasm
13 | 
14 | import time
15 | import os
16 | 
17 | wasm_fun = pywasm.load('scrapeCenter/spa14/Wasm.wasm')
18 | res = wasm_fun.exec('encrypt', [0, int(time.time())])
19 | 
20 | url = 'https://spa14.scrape.center/api/movie/'
21 | headers = {
22 |     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36'
23 | }
24 | params = {
25 |     'limit': 100,
26 |     'offset': 0,
27 |     'sign': res
28 | }
29 | response = requests.get(url, headers=headers, params=params).json()
30 | print(len(response['results']))
31 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa15/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/14 11:53
 4 | # Auth: HJY
 5 | 
 6 | """
 7 | wasm加密
 8 | this.$wasm.ccall("encrypt", "string", ["string", "string"], [this.$store.state.url.index, Math.round((new Date).getTime() / 1e3).toString()]);
 9 | this.$wasm.ccall;
10 | """
11 | 
12 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa15/demo.js:
--------------------------------------------------------------------------------
 1 | t = {}
 2 | t["_encrypt"] = function() {
 3 |     return (t["_encrypt"] = t["asm"]["encrypt"]).apply(null, arguments)
 4 | }
 5 | t["stackSave"] = function() {
 6 |     return (bt = t["stackSave"] = t["asm"]["stackSave"]).apply(null, arguments)
 7 | }
 8 | 
 9 | function ot(n) {
10 |     t["onAbort"] && t["onAbort"](n),
11 |     n += "",
12 |     S(n),
13 |     j = !0,
14 |     1,
15 |     n = "abort(" + n + "). Build with -s ASSERTIONS=1 for more info.";
16 |     var e = new WebAssembly.RuntimeError(n);
17 |     throw c(e),
18 |     e
19 | }
20 | 
21 | function k(t, n) {
22 |     t || ot("Assertion failed: " + n)
23 | }
24 | 
25 | function I(n) {
26 |     var e = t["_" + n];
27 |     return k(e, "Cannot call unknown function " + n + ", make sure it is exported"),
28 |     e
29 | }
30 | 
31 | function L(t, n, e, r, i) {
32 |     var o = {
33 |         string: function(t) {
34 |             var n = 0;
35 |             if (null !== t && void 0 !== t && 0 !== t) {
36 |                 var e = 1 + (t.length << 2);
37 |                 n = xt(e),
38 |                 N(t, n, e)
39 |             }
40 |             return n
41 |         },
42 |         array: function(t) {
43 |             var n = xt(t.length);
44 |             return D(t, n),
45 |             n
46 |         }
47 |     };
48 |     function a(t) {
49 |         return "string" === n ? W(t) : "boolean" === n ? Boolean(t) : t
50 |     }
51 |     var c = I(t)
52 |       , u = []
53 |       , s = 0;
54 |     if (r)
55 |         for (var f = 0; f < r.length; f++) {
56 |             var l = o[e[f]];
57 |             console.log('l: ', l)
58 |             l ? (0 === s && (s = bt()),
59 |             u[f] = l(r[f])) : u[f] = r[f]
60 |         }
61 |     var h = c.apply(null, u);
62 |     return h = a(h),
63 |     0 !== s && _t(s),
64 |     h
65 | }
66 | L("encrypt", "string", ["string", "string"], ['/api/movie', 1655534908]);


--------------------------------------------------------------------------------
/scrapeCenter/spa16/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/18 15:14
 4 | # Auth: HJY
 5 | 
 6 | """http2协议"""
 7 | 
 8 | import httpx
 9 | client = httpx.Client(http2=True)
10 | url = 'https://spa16.scrape.center/api/book/?limit=18&offset=0'
11 | headers = {
12 |     'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
13 | }
14 | response = client.get(url, headers=headers)
15 | print(response.text)
16 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa2/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/13 18:13
 4 | # Auth: HJY
 5 | 
 6 | """有token参数加密"""
 7 | 
 8 | import requests
 9 | 
10 | from loguru import logger
11 | import hashlib
12 | import base64
13 | import time
14 | 
15 | 
16 | def get_token(offset):
17 |     now_t = str(int(time.time()))
18 |     res = hashlib.sha1(f'/api/movie,{offset},{now_t}'.encode('utf8')).hexdigest()
19 |     res += f',{now_t}'
20 |     res = base64.b64encode(res.encode('utf8')).decode()
21 |     return res
22 | 
23 | url = 'https://spa2.scrape.center/api/movie/'
24 | params = {
25 |     'limit': 100,
26 |     'offset': 0,
27 |     'token': get_token(0)
28 | }
29 | response = requests.get(url, params=params).json()
30 | for info in response['results']:
31 |     logger.info(f'name: {info["name"]}, published_at: {info["published_at"]}, score: {info["score"]}')
32 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa3/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/13 18:40
 4 | # Auth: HJY
 5 | 
 6 | """下滑页面获取新数据"""
 7 | 
 8 | import requests
 9 | 
10 | url = 'https://spa3.scrape.center/api/movie/?limit=100&offset=0'
11 | response = requests.get(url).json()
12 | print(len(response['results']))
13 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa5/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/13 18:44
 4 | # Auth: HJY
 5 | 
 6 | """动态渲染"""
 7 | 
 8 | import requests
 9 | 
10 | url = 'https://spa5.scrape.center/api/book/?limit=5000&offset=0'
11 | response = requests.get(url).json()
12 | print(len(response['results']))
13 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa6/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/14 10:48
 4 | # Auth: HJY
 5 | 
 6 | """js加密。有混淆"""
 7 | 
 8 | import hashlib
 9 | import requests
10 | import time
11 | import base64
12 | 
13 | 
14 | def get_token():
15 |     now_t = str(int(time.time()))
16 |     _0x189cbb = ['/api/movie', now_t]
17 |     _0xf7c3c7 = hashlib.sha1(','.join(_0x189cbb).encode('utf8')).hexdigest()
18 |     _0x3c8435 = _0xf7c3c7 + ',' + now_t
19 |     _0x104b5b = base64.b64encode(_0x3c8435.encode('utf8')).decode('utf8')
20 |     return _0x104b5b
21 | 
22 | url = 'https://spa6.scrape.center/api/movie/'
23 | headers = {
24 |     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.115 Safari/537.36'
25 | }
26 | params = {
27 |     'limit': 10,
28 |     'offset': 10,
29 |     'token': get_token(),
30 | }
31 | response = requests.get(url=url, headers=headers, params=params).json()
32 | print(len(response['results']))
33 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa6/demo.js:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | function _0x456254() {
 4 |     var _0x189cbb = f_3452();
 5 |     for (var _0x5da681 = Math['round'](new Date()['getTime']() / 0x3e8)['toString'](), _0x2a83dd = arguments['length'], _0x31a891 = new Array(_0x2a83dd), _0x596a02 = 0x0; _0x596a02 < _0x2a83dd; _0x596a02++)
 6 |         _0x31a891[_0x596a02] = arguments[_0x596a02];
 7 |     _0x31a891['push'](_0x5da681);
 8 |     console.log(_0x31a891);
 9 |     var _0xf7c3c7 = _0x189cbb['SHA1'](_0x31a891['join'](','))['toString'](_0x189cbb['enc']['Hex'])
10 |       , _0x3c8435 = [_0xf7c3c7, _0x5da681]['join'](',')
11 |       , _0x104b5b = _0x358b1f['encode'](_0x3c8435);
12 |     return _0x104b5b;
13 | }
14 | 
15 | 
16 | _0x358b1f['encode'](_0x3c8435)
17 | 
18 | _0x3c8435 = [_0xf7c3c7, _0x5da681]['join'](',')
19 | 
20 | _0xf7c3c7 = sha1(_0x31a891['join'](','))['toString'](_0x189cbb['enc']['Hex'])
21 |     _0x31a891 = ['/api/movie', time]
22 | _0x5da681 = time
23 | 


--------------------------------------------------------------------------------
/scrapeCenter/spa7/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/14 11:27
 4 | # Auth: HJY
 5 | 
 6 | """数据存储在js中"""
 7 | 
 8 | import requests
 9 | 
10 | url = 'https://spa7.scrape.center/js/main.js'
11 | response = requests.get(url).text
12 | print(response)
13 | 


--------------------------------------------------------------------------------
/scrapeCenter/ssr1/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/13 17:37
 4 | # Auth: HJY
 5 | 
 6 | """静态网站。直接请求"""
 7 | 
 8 | import requests
 9 | from lxml import etree
10 | 
11 | 
12 | def parse_page(page):
13 |     url = 'https://ssr1.scrape.center/page/{}'.format(page)
14 |     headers = {
15 |         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36'
16 |     }
17 |     response = requests.get(url=url, headers=headers)
18 |     response.encoding = response.apparent_encoding
19 |     html = etree.HTML(response.text)
20 |     info_element = html.xpath('//div[@class="el-col el-col-18 el-col-offset-3"]/div')
21 |     for info in info_element:
22 |         title = info.xpath('.//h2/text()')[0]
23 |         types = ','.join(info.xpath('.//div[@class="categories"]//span/text()'))
24 |         score = info.xpath('.//p[@class="score m-t-md m-b-n-sm"]/text()')[0].strip()
25 |         item = {'标题': title, '类型': types, '评分': score}
26 |         print(f'page: {page}, item: {item}')
27 |     if info_element:
28 |         parse_page(page + 1)
29 | 
30 | 
31 | parse_page(1)
32 | 
33 | 


--------------------------------------------------------------------------------
/scrapeCenter/ssr2/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/13 17:45
 4 | # Auth: HJY
 5 | 
 6 | """无证书。关闭证书验证即可"""
 7 | 
 8 | import requests
 9 | 
10 | url = 'https://ssr2.scrape.center/'
11 | headers = {
12 |     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36'
13 | }
14 | response = requests.get(url, headers=headers, verify=False)
15 | print(response.text)
16 | 


--------------------------------------------------------------------------------
/scrapeCenter/ssr3/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: crawl.py
 3 | # Date: 2022/06/13 17:46
 4 | # Auth: HJY
 5 | 
 6 | """加http验证"""
 7 | 
 8 | import requests
 9 | 
10 | url = 'https://ssr3.scrape.center/'
11 | headers = {
12 |     'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
13 |     'Authorization': 'Basic YWRtaW46YWRtaW4='
14 | }
15 | response = requests.get(url, headers=headers)
16 | print(response.text)
17 | 


--------------------------------------------------------------------------------
/scrapeCenter/ssr4/crawl.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # NAME: scrapy.py
 3 | # Date: 2022/06/13 17:49
 4 | # Auth: HJY
 5 | 
 6 | """做延时。异步加快速度"""
 7 | 
 8 | import requests
 9 | import asyncio
10 | import aiohttp
11 | from loguru import logger
12 | 
13 | import time
14 | 
15 | 
16 | start_time = time.time()
17 | 
18 | 
19 | async def get(url):
20 |     session = aiohttp.ClientSession()
21 |     headers = {
22 |         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36',
23 |     }
24 |     response = await session.get(url, headers=headers, verify_ssl=False)
25 |     await response.text()
26 |     await session.close()
27 |     return response
28 | 
29 | 
30 | async def start(page):
31 |     url = f'https://ssr4.scrape.center/page/{page}'
32 |     logger.info(f'get {url}')
33 |     response = await get(url)
34 |     logger.info(f'get {url} done, response.status={response.status}')
35 | 
36 | 
37 | tasks = [asyncio.ensure_future(start(page)) for page in range(1, 10)]
38 | loop = asyncio.get_event_loop()
39 | loop.run_until_complete(asyncio.wait(tasks))
40 | end_time = time.time()
41 | logger.info(f'耗时: {end_time - start_time}')
42 | 
43 | 


--------------------------------------------------------------------------------
/tweet/GetToken.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: monkey-hjy
 3 | # @Date:   2021-02-24 17:20:13
 4 | # @Last Modified by:   monkey-hjy
 5 | # @Last Modified time: 2021-02-24 17:20:32
 6 | import requests
 7 | 
 8 | 
 9 | class GetToken(object):
10 |     """获取到游客token"""
11 |     def __init__(self):
12 |         self.get_token_url = 'https://api.twitter.com/1.1/guest/activate.json'
13 |         self.get_token_headers = {
14 |             "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
15 |             'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
16 |         }
17 | 
18 |     def get_token(self, proxies_ip):
19 |         proxies = {
20 |             'http': 'http://{}'.format(proxies_ip),
21 |             'https': 'http://{}'.format(proxies_ip),
22 |         }
23 |         err_count = 0
24 |         while err_count < 5:
25 |             try:
26 |                 response = requests.request(url=self.get_token_url, method="POST", headers=self.get_token_headers,
27 |                                             timeout=15)
28 |                 response.close()
29 |                 return response.json().get('guest_token')
30 |             except Exception as e:
31 |                 print(e)
32 |                 err_count += 1
33 | 


--------------------------------------------------------------------------------
/tweet/Tweet.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Author: monkey-hjy
  3 | # @Date:   2021-02-24 17:18:02
  4 | # @Last Modified by:   monkey-hjy
  5 | # @Last Modified time: 2021-02-24 17:23:17
  6 | from datetime import datetime
  7 | import requests
  8 | from GetToken import GetToken
  9 | import random
 10 | from prettytable import PrettyTable
 11 | 
 12 | # 随机UA头
 13 | USER_AGENT = [
 14 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
 15 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
 16 |     "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
 17 |     "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
 18 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
 19 |     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
 20 |     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
 21 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
 22 |     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
 23 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
 24 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
 25 |     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
 26 |     "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
 27 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
 28 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
 29 |     "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
 30 | ]
 31 | 
 32 | 
 33 | class SearchTweet(GetToken):
 34 |     """
 35 |     根据关键词搜索推文或者用户
 36 |     使用游客token进行抓取数据，没有次数限制
 37 |     但是需要境外ip。。。
 38 |     """
 39 | 
 40 |     def __init__(self):
 41 |         super().__init__()
 42 |         self.start = datetime.now()
 43 |         # 定义请求头。需要按照下面的代码去获取游客token
 44 |         self.headers = {
 45 |             'authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs'
 46 |                              '%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA',
 47 |             'user-agent': random.choice(USER_AGENT),
 48 |             'x-guest-token': self.get_token(proxies_ip='127.0.0.1:10809'),
 49 |         }
 50 |         # 获取数据的接口
 51 |         self.url = 'https://twitter.com/i/api/2/search/adaptive.json'
 52 | 
 53 |     def start_requests(self, search_key, search_type='tweet'):
 54 |         """
 55 |         开始搜索
 56 |         :param search_key: 搜素关键词
 57 |         :param search_type: 搜索类别。tweet/推文。   account/用户
 58 |         :return:
 59 |         """
 60 |         params = {
 61 |             "q": search_key,
 62 |             "count": 20,
 63 |         }
 64 |         if search_type == 'account':
 65 |             params['result_filter'] = 'user'
 66 |         response = requests.get(url=self.url, headers=self.headers, params=params, timeout=10)
 67 |         if response.status_code != 200:
 68 |             return f'{search_key} ERR  ===  {response}'
 69 |         tweets = response.json().get('globalObjects').get('tweets')
 70 |         users = response.json().get('globalObjects').get('users')
 71 |         if not len(tweets) and not len(users):
 72 |             return f'{search_key}未抓到数据'
 73 |         p = PrettyTable()
 74 |         if search_type == 'tweet':
 75 |             tweet_id = []
 76 |             create_time = []
 77 |             full_text = []
 78 |             user_name = []
 79 |             screen_name = []
 80 |             for key in tweets:
 81 |                 tweet_id.append(key)
 82 |                 create_time.append(tweets.get(key).get('created_at'))
 83 |                 full_text.append(tweets.get(key).get('text'))
 84 |                 user_id = tweets.get(key).get('user_id_str')
 85 |                 user_name.append(users.get(user_id).get('name'))
 86 |                 screen_name.append(users.get(user_id).get('screen_name'))
 87 |             p.add_column(fieldname='推文ID', column=tweet_id)
 88 |             p.add_column(fieldname='发文时间', column=create_time)
 89 |             p.add_column(fieldname='内容', column=full_text)
 90 |             p.add_column(fieldname='用户名', column=user_name)
 91 |             p.add_column(fieldname='账号', column=screen_name)
 92 |         else:
 93 |             user_name = []
 94 |             screen_name = []
 95 |             description = []
 96 |             for key in users:
 97 |                 user_name.append(users.get(key).get('name'))
 98 |                 screen_name.append(users.get(key).get('screen_name'))
 99 |                 description.append(users.get(key).get('description'))
100 |             p.add_column(fieldname='用户名', column=user_name)
101 |             p.add_column(fieldname='账号', column=screen_name)
102 |             p.add_column(fieldname='简介', column=description)
103 |         return p
104 | 
105 |     def run(self):
106 |         search_key = ['葫芦娃', '奥特曼']
107 |         for key in search_key:
108 |             result = self.start_requests(search_key=key, search_type='account')
109 |             print(result)
110 | 
111 |     def __del__(self):
112 |         end = datetime.now()
113 |         print(f'开始：{self.start}，结束：{end}\n用时：{end-self.start}')
114 | 
115 | 
116 | if __name__ == '__main__':
117 |     t = SearchTweet()
118 |     t.run()
119 | 


--------------------------------------------------------------------------------
/weather/weather.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: monkey-hjy
 3 | # @Date:   2021-02-24 17:28:36
 4 | # @Last Modified by:   monkey-hjy
 5 | # @Last Modified time: 2021-02-24 17:29:00
 6 | # 中国天气网的接口。。。
 7 | import requests
 8 | from lxml import etree
 9 | import pandas as pd
10 | from prettytable import PrettyTable
11 | import os
12 | 
13 | 
14 | def get_html(url):
15 |     # 定义头文件
16 |     headers = {'user-agent': 'Mozilla/5.0'}
17 |     # 发起请求
18 |     response = requests.get(url, headers=headers)
19 |     # 修改编码
20 |     response.encoding = 'utf8'
21 |     # 处理成HTML格式
22 |     html = etree.HTML(response.text)
23 |     return html
24 | 
25 | 
26 | # 获取城市信息并保存到本地
27 | def get_cityinfo_write(html):
28 |     print('获取城市信息')
29 |     city_info = {}
30 |     # 获取到城市信息
31 |     province_url = html.xpath('//div[@class="lqcontentBoxheader"]//ul//li/a/@href')
32 |     for i in range(len(province_url)):
33 |         # 拼接出每个城市的URL，并获取到对应的HTML
34 |         the_html = get_html('http://www.weather.com.cn' + province_url[i])
35 |         # 解析出城市名称
36 |         city_name = the_html.xpath('//div[@class="conMidtab3"]//tr//td[position()<3]/a/text()')
37 |         # 解析出城市链接
38 |         city_url = the_html.xpath('//div[@class="conMidtab3"]//tr//td[position()<3]/a/@href')
39 |         # 将城市信息存储到city_info中
40 |         for j in range(len(city_name)):
41 |             if j != 0 and city_name[j] == city_name[0]:
42 |                 break
43 |             else:
44 |                 city_info[city_name[j]] = city_url[j]
45 |     # 给数据设置列名
46 |     data = pd.DataFrame(columns=['city_name', 'city_url'])
47 |     # 填充数据
48 |     data['city_name'] = city_info.keys()
49 |     data['city_url'] = city_info.values()
50 |     # 保存到本地
51 |     data.to_csv(file_path, index=False, encoding='utf8')
52 | 
53 | 
54 | if __name__ == '__main__':
55 |     # 实例化输出类
56 |     p = PrettyTable()
57 |     # 接口URL
58 |     url = 'http://www.weather.com.cn/textFC/hb.shtml'
59 |     # 调用获取HTML的方法
60 |     html = get_html(url)
61 |     file_path = '/home/monkey/File/中国天气网城市信息.csv'
62 |     # 判断存放城市信息的数据文件是否存在。如果不存在，则调用get_cityinfo_write方法下载
63 |     if not os.path.exists(file_path):
64 |         get_cityinfo_write(html)
65 |     # 读取城市信息
66 |     data = pd.read_csv(file_path, encoding='utf8')
67 |     # 获取到城市名称
68 |     city_name = data['city_name'].tolist()
69 |     # 获取到城市URL
70 |     city_url = data['city_url'].tolist()
71 |     # 让用户输入需要查询的城市
72 |     name = input('请输入需要查询的城市名称：')
73 |     # 如果名称输入正确，则进行查询
74 |     if name in city_name:
75 |         # 获取到当前城市天气信息的HTML
76 |         city_html = get_html(city_url[city_name.index(name)])
77 |         # 解析出时间
78 |         date = city_html.xpath('//ul[@class="t clearfix"]//li//h1/text()')
79 |         # 解析出天气
80 |         wea = city_html.xpath('//ul[@class="t clearfix"]//li/p[@class="wea"]/text()')
81 |         # 解析出温度列表
82 |         tem_list = ''.join(city_html.xpath('//ul[@class="t clearfix"]//li/p[@class="tem"]//text()')).split('\n')
83 |         # 取出正确的数据
84 |         tem = [tem_list[i] for i in range(len(tem_list)) if i % 2 != 0]
85 |         # 解析出风量
86 |         win = city_html.xpath('//ul[@class="t clearfix"]//li/p[@class="win"]/i/text()')
87 |         print('{}的天气如下'.format(name))
88 |         # 把数据填充到表格中，美化输出
89 |         p.add_column('日期', date)
90 |         p.add_column('天气', wea)
91 |         p.add_column('温度', tem)
92 |         p.add_column('风量', win)
93 |         print(p)
94 |     else:
95 |         print('输入的城市名称有误！')
96 | 
97 | 
98 | 
99 | 


--------------------------------------------------------------------------------
/weibo/get_fans_info.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # @Author: monkey-hjy
  3 | # @Date:   2021-04-22 11:32:21
  4 | # @Last Modified by:   monkey-hjy
  5 | # @Last Modified time: 2021-04-22 16:57:22
  6 | from gevent import monkey; monkey.patch_all()
  7 | import gevent.pool
  8 | import json
  9 | import requests
 10 | import random
 11 | import re
 12 | import pymongo
 13 | import datetime
 14 | import redis
 15 | 
 16 | 
 17 | class GetFansInfo(object):
 18 |     """获取某个账号粉丝的信息"""
 19 | 
 20 |     def __init__(self):
 21 |         self.mongo_conf = pymongo.MongoClient(host='127.0.0.1', port=27017)
 22 |         self.mongo_db = self.mongo_conf['data']['weibo']
 23 |         self.redis_conf = redis.StrictRedis()
 24 |         # 参数1：用户ID。
 25 |         # 参数2：初始下标，下一页的下标会在本次请求返回
 26 |         self.get_fans_url = "https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{}&since_id={}"
 27 |         # 参数1：用户ID
 28 |         self.get_info_url = "https://weibo.com/p/100505{}/info?mod=pedit_more"
 29 |         self._headers = {
 30 |             "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36",
 31 |         }
 32 |         self.get_cookie()
 33 |         self.err_count = 0
 34 | 
 35 |     def __del__(self):
 36 |         self.redis_conf.close()
 37 |         self.mongo_conf.close()
 38 | 
 39 |     def get_response(self, url):
 40 |         """解析到对应URL的response"""
 41 |         err_count = 0
 42 |         while err_count < 5:
 43 |             try:
 44 |                 response = requests.get(url, headers=self._headers)
 45 |                 if response.status_code == 200:
 46 |                     response.encoding = 'utf8'
 47 |                     if '<title>Sina Visitor System</title>' in response.text:
 48 |                         raise Exception
 49 |                     return response
 50 |                 else:
 51 |                     raise Exception
 52 |             except:
 53 |                 err_count += 1
 54 |                 self.get_cookie()
 55 |         return None
 56 | 
 57 |     def get_fans_info(self, user_info):
 58 |         """获取粉丝的信息"""
 59 |         user_info = user_info['user']
 60 |         response = self.get_response(url=self.get_info_url.format(user_info['id']))
 61 |         if response is None:
 62 |             print('出错 === {}'.format(user_info))
 63 |             return
 64 |         city = re.findall(r'所在地：.*?pt_detail\\">(.*?)<', response.text)
 65 |         city = city[0] if city else '其他'
 66 |         gender = re.findall(r'性别：.*?pt_detail\\">(.*?)<', response.text)
 67 |         gender = gender[0] if gender else '未知'
 68 |         reg_date = re.findall(r'注册时间：.*?pt_detail\\">(.*?)<', response.text)
 69 |         reg_date = reg_date[0].replace('\\n', '').replace('\\r', '').strip() if reg_date else '未知'
 70 |         item = {
 71 |             "the_fans_id": user_info['id'],
 72 |             "screen_name": user_info['screen_name'],
 73 |             "followers_count": user_info['followers_count'],
 74 |             "follow_count": user_info['follow_count'],
 75 |             "gender": gender,
 76 |             "city": city,
 77 |             "reg_date": reg_date
 78 |         }
 79 |         self.mongo_db.insert_one(item)
 80 | 
 81 |     def get_fans_id(self, user_id, since_id=0):
 82 |         """获取到某个用户的粉丝"""
 83 |         print(datetime.datetime.now(), user_id, since_id)
 84 |         if since_id >= 4999:
 85 |             return
 86 |         response = self.get_response(url=self.get_fans_url.format(user_id, since_id))
 87 |         if response is None:
 88 |             print('哥们。这个用户解析好像有点问题....\t{} is None'.format(user_id))
 89 |             return
 90 |         elif response.json()['ok'] == 0:
 91 |             print('哥们。这个用户解析好像有点问题....\t{}\t{}\t{}'.format(self.err_count, user_id, response.json()))
 92 |             if self.err_count < 10:
 93 |                 self.err_count += 1
 94 |                 self.get_fans_id(user_id, since_id)
 95 |         else:
 96 |             pip = self.redis_conf.pipeline()
 97 |             [pip.sadd('new_wb_user', info['user']['id']) for info in response.json()['data']['cards'][-1]['card_group']]
 98 |             pip.execute()
 99 |             try:
100 |                 next_since_id = response.json()['data']['cardlistInfo']['since_id']
101 |                 if next_since_id:
102 |                     self.err_count = 0
103 |                     self.get_fans_id(user_id=user_id, since_id=next_since_id)
104 |             except Exception as e:
105 |                 print(e, user_id, since_id, response.json())
106 | 
107 |     @staticmethod
108 |     def get_tid():
109 |         """获取TID参数"""
110 |         url = 'https://passport.weibo.com/visitor/genvisitor?cb=gen_callback&fp={"os":"1","browser":"Chrome89,0,4389,128","fonts":"undefined","screenInfo":"1920*1080*24","plugins":"Portable Document Format::internal-pdf-viewer::Chrome PDF Plugin|::mhjfbmdgcfjbbpaeojofohoefgiehjai::Chrome PDF Viewer|::internal-nacl-plugin::Native Client"}'
111 |         response = requests.get(url).text
112 |         tid = re.findall(r'"tid":"(.*?)"', response)[0]
113 |         return tid
114 | 
115 |     def get_cookie(self):
116 |         """获取 SUB 和 SUBP """
117 |         tid = self.get_tid()
118 |         while True:
119 |             url = 'https://passport.weibo.com/visitor/visitor?a=incarnate&t={}&w=3&c=95&gc=&cb=cross_domain&from=weibo&_rand={}'.format(
120 |                 tid, random.random())
121 |             response = json.loads(re.findall(r'\((.*?)\)', requests.get(url).text)[0])
122 |             if response.get('retcode') == 20000000 and response.get('data').get('sub'):
123 |                 cookie = ''
124 |                 for key in response.get('data'):
125 |                     cookie += '{}={};'.format(key.upper(), response.get('data').get(key))
126 |                 self._headers['cookie'] = cookie.rstrip(';')
127 |                 return response.get('data')
128 |             else:
129 |                 tid = self.get_tid()
130 | 
131 |     def run(self):
132 |         """启动函数"""
133 |         user_ids = list(set([line.replace('\n', '') for line in open('大V.txt', encoding='utf8').readlines()]))
134 |         exist = [line.replace('\n', '') for line in open('exist.txt', encoding='utf8').readlines()]
135 |         # # # 1、高并发跑。会有IP封禁问题。自行选择。。。
136 |         # pool = gevent.pool.Pool(50)
137 |         # pool.map(self.get_fans_id, user_ids)
138 | 
139 |         # 2、单线程跑。不会封禁IP。但是速度不是很快。
140 |         for user_id in user_ids:
141 |             if user_id in exist:
142 |                 continue
143 |             self.get_fans_id(user_id)
144 |             with open('exist.txt', encoding='utf8', mode='a') as f:
145 |                 f.write('{}\n'.format(user_id))
146 | 
147 | 
148 | if __name__ == '__main__':
149 |     t = GetFansInfo()
150 |     t.run()
151 | """
152 | 小时候
153 | 总是盼望着
154 | 盼望着有自己的零花钱
155 | 盼望着有一辆属于自己的自行车
156 | 盼望着玩到天黑不回家
157 | 盼望着妈妈不再唠叨我
158 | 
159 | 长大了
160 | 总是想着
161 | 想着可以不用每天算计着花钱
162 | 想着可以真正的散散步
163 | 想着可以在家里休息一整天
164 | 想着可以每天陪着妈妈说话
165 | 
166 | 听说
167 | 20岁的人 怀念童年
168 | 40岁的人 怀念青春
169 | 60岁的人 怀念壮年
170 | 只有那些孩子会缠着人问
171 | 妈妈
172 | 我什么时候长大呀
173 |     ---- H 2021/4/26 上海
174 |     ---- 结尾摘自 《儿时的夏日》 热评 
175 | """
176 | 


--------------------------------------------------------------------------------
/weibo/search.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # Author:   玛卡巴卡
  3 | # Date:     2021/4/20 10:34
  4 | import datetime
  5 | import logging
  6 | import re
  7 | import time
  8 | from multiprocessing.dummy import Pool as ThreadPool
  9 | import requests
 10 | import pandas as pd
 11 | import random
 12 | import os
 13 | requests.packages.urllib3.disable_warnings()
 14 | 
 15 | USER_AGENTS = [
 16 |     "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36",
 17 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
 18 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
 19 |     "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
 20 |     "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
 21 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
 22 |     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
 23 |     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
 24 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
 25 |     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
 26 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
 27 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
 28 |     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
 29 |     "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
 30 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
 31 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
 32 |     "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
 33 | ]
 34 | 
 35 | 
 36 | class WeiBo(object):
 37 |     """
 38 |     按照固定的关键词搜索
 39 |     采集得到的所有文章和评论信息
 40 |     """
 41 | 
 42 |     def __init__(self):
 43 |         self.get_wb_url = 'https://m.weibo.cn/api/container/getIndex'
 44 |         self.comment_url = 'https://m.weibo.cn/comments/hotflow'
 45 |         self._headers = {'user-agent': ''}
 46 |         self.wb_info_list = dict()
 47 |         self.content_id = list()
 48 |         self.content = list()
 49 |         self.comment_id = list()
 50 |         self.comment = list()
 51 |         logging.basicConfig(level=logging.INFO,
 52 |                             format='%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s %(message)s',
 53 |                             datefmt='%Y-%m-%d %H:%M:%S',
 54 |                             filename='f:/PDemo/spider_log/{}-{}.log'.format(__file__.split('/')[-1].split('.')[0], str(datetime.datetime.now()).split(" ")[0]),
 55 |                             filemode='a')
 56 | 
 57 |     def get_response(self, url, params=None, cookie=None):
 58 |         """发起请求"""
 59 |         err_count = 0
 60 |         while err_count < 5:
 61 |             try:
 62 |                 time.sleep(1)
 63 |                 if cookie is not None:
 64 |                     self._headers['cookie'] = cookie
 65 |                 else:
 66 |                     self._headers = {'user-agent': random.choice(USER_AGENTS)}
 67 |                 response = requests.get(url, params=params, headers=self._headers)
 68 |                 if response.status_code == 200:
 69 |                     return response
 70 |                 else:
 71 |                     err_count += 1
 72 |                     time.sleep(30)
 73 |             except:
 74 |                 err_count += 1
 75 |         return None
 76 | 
 77 |     def get_wb_id(self, keyword, page):
 78 |         """获取微博ID"""
 79 |         wb_id_list = []
 80 |         params = {
 81 |             'containerid': '100103type=1&q={}'.format(keyword),
 82 |             'page_type': 'searchall',
 83 |             'page': page,
 84 |         }
 85 |         response = self.get_response(url=self.get_wb_url, params=params)
 86 |         if response is None:
 87 |             logging.error('- 关键词：{}，页码：{}\t出错'.format(keyword, page))
 88 |             return
 89 |         response = response.json()['data']['cards']
 90 |         for info in response:
 91 |             try:
 92 |                 try:
 93 |                     self.wb_info_list[info['mblog']['id']] = info['mblog']['comments_count']
 94 |                     wb_id_list.append([info['mblog']['id'], info['mblog']['comments_count']])
 95 |                 except:
 96 |                     self.wb_info_list[info['card_group'][0]['mblog']['id']] = info['card_group'][0]['mblog'][
 97 |                         'comments_count']
 98 |                     wb_id_list.append([info['card_group'][0]['mblog']['id'], info['card_group'][0]['mblog']['comments_count']])
 99 |             except Exception as e:
100 |                 pass
101 |         logging.info('{}\t{}\t{}'.format(keyword, page, len(wb_id_list)))
102 |         if wb_id_list:
103 |             return True
104 |         else:
105 |             return False
106 | 
107 |     def get_wb_content(self, id):
108 |         """获取微博原文"""
109 |         url = 'https://m.weibo.cn/statuses/extend?id={}'.format(id)
110 |         response = self.get_response(url=url)
111 |         if response is None:
112 |             return
113 |         try:
114 |             content = re.sub('<.*?>', '', response.json()['data']['longTextContent'])
115 |             self.content_id.append(id)
116 |             self.content.append(content)
117 |             logging.info('- {}\t{}'.format(id, len(content)))
118 |         except Exception as e:
119 |             logging.error('- {}\t{}'.format(e, id))
120 | 
121 |     def get_wb_comment(self, wb_id):
122 |         """获取微博评论"""
123 |         max_id = 0
124 |         max_id_type = 0
125 |         while True:
126 |             time.sleep(2)
127 |             params = {
128 |                 'id': wb_id,
129 |                 'mid': wb_id,
130 |                 'max_id': max_id,
131 |                 'max_id_type': max_id_type,
132 |             }
133 |             err_count = 0
134 |             while err_count < 4:
135 |                 response = self.get_response(url=self.comment_url, params=params, cookie='用户登录m.weibo.cn的cookie')
136 |                 if response is None:
137 |                     logging.error('{}出错'.format(wb_id))
138 |                     return
139 |                 try:
140 |                     response.json()
141 |                 except:
142 |                     logging.error('转JSON失败 --- {}'.format(response.text))
143 |                     return None
144 |                 if response.json()['ok']:
145 |                     try:
146 |                         response = response.json()['data']
147 |                         logging.info('- {}\t{}\t{}'.format(wb_id, max_id, len(response['data'])))
148 |                         for info in response['data']:
149 |                             self.comment_id.append(wb_id)
150 |                             self.comment.append(re.sub('<.*?>', '', info['text']))
151 |                         # 获取到下一页的ID，当作下次的参数使用
152 |                         next_max_id = response['max_id']
153 |                         max_id_type = response['max_id_type']
154 |                         if next_max_id == 0:
155 |                             return
156 |                         logging.info('- 下一页{}'.format(next_max_id))
157 |                         max_id = next_max_id
158 |                         time.sleep(1)
159 |                         break
160 |                     except Exception as e:
161 |                         err_count += 1
162 |                         time.sleep(5)
163 |                         logging.error('- {}\t{}\t{}'.format(wb_id, err_count, e))
164 |                         if err_count == 4:
165 |                             time.sleep(30)
166 |                             return
167 |                 else:
168 |                     logging.error('- {}\t{}'.format(response.json(), params))
169 |                     return
170 | 
171 |     def run(self):
172 |         """启动函数"""
173 |         keyword_list = ['在这里放需要搜索的关键词']
174 |         for keyword in keyword_list:
175 |             self.__init__()
176 |             logging.info('=== {} ==='.format(keyword))
177 |             flag = True
178 |             page = 1
179 |             while flag:
180 |                 the_page_wb_id = self.get_wb_id(keyword=keyword, page=page)
181 |                 if the_page_wb_id:
182 |                     page += 1
183 |                 else:
184 |                     break
185 |             logging.info(len(self.wb_info_list))
186 |             pool = ThreadPool(20)
187 |             pool.map(self.get_wb_content, list(self.wb_info_list.keys()))
188 |             for key in self.wb_info_list.keys():
189 |                 if self.wb_info_list[key]:
190 |                     self.get_wb_comment(wb_id=key)
191 | 
192 |             content_data = pd.DataFrame({
193 |                 '微博ID': self.content_id,
194 |                 '微博正文': self.content
195 |             })
196 | 
197 |             comment_data = pd.DataFrame({
198 |                 '微博ID': self.comment_id,
199 |                 '评论': self.comment
200 |             })
201 | 
202 |             """
203 |             可以在此对数据进行持久化保存
204 |             """
205 | 
206 | 
207 | if __name__ == '__main__':
208 |     t = WeiBo()
209 |     t.run()
210 | 


--------------------------------------------------------------------------------
/weibo/search_all.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import time
  3 | 
  4 | import pandas as pd
  5 | import requests
  6 | import random
  7 | import re
  8 | import datetime
  9 | 
 10 | from lxml import etree
 11 | 
 12 | 
 13 | class GetFansInfo(object):
 14 |     """搜索微博"""
 15 | 
 16 |     def __init__(self):
 17 |         self._headers = {
 18 |             "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36",
 19 |         }
 20 |         self.wb_id = list()
 21 |         self.user_name = list()
 22 |         self.content = list()
 23 |         self.create_date = list()
 24 |         self.img_list = list()
 25 | 
 26 |     @staticmethod
 27 |     def get_tid():
 28 |         """获取TID参数"""
 29 |         url = 'https://passport.weibo.com/visitor/genvisitor?cb=gen_callback&fp={"os":"1","browser":"Chrome89,0,4389,128","fonts":"undefined","screenInfo":"1920*1080*24","plugins":"Portable Document Format::internal-pdf-viewer::Chrome PDF Plugin|::mhjfbmdgcfjbbpaeojofohoefgiehjai::Chrome PDF Viewer|::internal-nacl-plugin::Native Client"}'
 30 |         response = requests.get(url).text
 31 |         tid = re.findall(r'"tid":"(.*?)"', response)[0]
 32 |         return tid
 33 | 
 34 |     def get_cookie(self):
 35 |         """获取 SUB 和 SUBP """
 36 |         tid = self.get_tid()
 37 |         while True:
 38 |             url = 'https://passport.weibo.com/visitor/visitor?a=incarnate&t={}&w=3&c=95&gc=&cb=cross_domain&from=weibo&_rand={}'.format(
 39 |                 tid, random.random())
 40 |             response = json.loads(re.findall(r'\((.*?)\)', requests.get(url).text)[0])
 41 |             if response.get('retcode') == 20000000 and response.get('data').get('sub'):
 42 |                 cookie = ''
 43 |                 for key in response.get('data'):
 44 |                     cookie += '{}={};'.format(key.upper(), response.get('data').get(key))
 45 |                 self._headers['cookie'] = cookie.rstrip(';')
 46 |                 return response.get('data')
 47 |             else:
 48 |                 tid = self.get_tid()
 49 | 
 50 |     def search(self):
 51 |         start_date = datetime.datetime.strptime('2020-12-11', '%Y-%m-%d')
 52 |         end_date = datetime.datetime.now() - datetime.timedelta(days=1)
 53 |         while start_date <= end_date:
 54 |             timescope1 = '{}-{}'.format(str(start_date).split()[0], start_date.hour)
 55 |             start_date += datetime.timedelta(hours=6)
 56 |             timescope2 = '{}-{}'.format(str(start_date).split()[0], start_date.hour)
 57 |             timescope = 'custom:{}:{}'.format(timescope1, timescope2)
 58 |             url = 'https://s.weibo.com/weibo'
 59 |             params = {
 60 |                 'q': '华夏家博会',
 61 |                 'typeall': '1',
 62 |                 'suball': '1',
 63 |                 'timescope': timescope,
 64 |                 'Refer': 'g',
 65 |                 'page': '1',
 66 |             }
 67 |             response = requests.get(url, headers=self._headers, params=params)
 68 |             response.encoding = 'utf8'
 69 |             if '未找到“华夏家博会”相关结果' in response.text:
 70 |                 print(timescope, '无数据')
 71 |                 continue
 72 |             html = etree.HTML(response.content)
 73 |             wb_info = html.xpath('//div[@action-type="feed_list_item"]')
 74 |             wb_id = html.xpath('//div[@action-type="feed_list_item"]/@mid')
 75 |             print(timescope, len(wb_info))
 76 |             for i in range(len(wb_info)):
 77 |                 info = wb_info[i]
 78 |                 user_name = info.xpath('.//a[@class="name"]/text()')
 79 |                 content = ''.join(info.xpath('.//p[@class="txt"]//text()'))
 80 |                 img_url = info.xpath('.//div[@node-type="feed_list_media_prev"]//img/@src')
 81 |                 create_date = info.xpath('.//p[@class="from"]/a[1]/text()')
 82 |                 if not user_name:
 83 |                     continue
 84 |                 self.wb_id.append(wb_id[i])
 85 |                 self.user_name.append(user_name[0].strip())
 86 |                 self.content.append(content)
 87 |                 self.img_list.append(img_url)
 88 |                 self.create_date.append(create_date[0].strip())
 89 |                 # item = {
 90 |                 #     'ID': wb_id[i],
 91 |                 #     '用户名': user_name[0].strip(),
 92 |                 #     '内容': content,
 93 |                 #     '图片链接': img_url,
 94 |                 #     '时间': create_date[0].strip(),
 95 |                 # }
 96 |                 # print(item)
 97 |             time.sleep(3)
 98 |         data = pd.DataFrame({
 99 |             'ID': self.wb_id,
100 |             '用户名': self.user_name,
101 |             '内容': self.content,
102 |             '图片链接': self.img_list,
103 |             '时间': self.create_date,
104 |         })
105 |         data.to_excel('微博.xlsx', encoding='ANSI', index=False)
106 | 
107 |     def run(self):
108 |         """启动函数"""
109 |         self.search()
110 | 
111 | 
112 | if __name__ == '__main__':
113 |     t = GetFansInfo()
114 |     t.run()
115 | 


--------------------------------------------------------------------------------
/weibo/weibo_comment.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # Author:   玛卡巴卡
 3 | # Date:     2021/4/19 17:10
 4 | 
 5 | import requests
 6 | import time
 7 | 
 8 | 
 9 | class WBComment(object):
10 |     """抓取微博全量评论。但是需要登录"""
11 | 
12 |     def __init__(self):
13 |         self.comment_url = 'https://m.weibo.cn/comments/hotflow'
14 |         self._headers = {
15 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36',
16 |             'cookie': '用户登录后的cookie',
17 |         }
18 | 
19 |     def get_response(self, url, params=None):
20 |         """发起请求"""
21 |         response = requests.get(url=url, headers=self._headers, params=params)
22 |         if response.status_code == 200:
23 |             return response
24 |         else:
25 |             print('出错。返回的状态码是：{}'.format(response.status_code))
26 |             return None
27 | 
28 |     def start(self, wb_id):
29 |         """启动函数，接受微博ID参数"""
30 |         # 初始页码的ID。下一页的ID会存放在返回的数据中
31 |         max_id = 0
32 |         while True:
33 |             params = {
34 |                 'id': wb_id,
35 |                 'mid': wb_id,
36 |                 'max_id': max_id,
37 |                 'max_id_type': 1,
38 |             }
39 |             response = self.get_response(url=self.comment_url, params=params)
40 |             if response is None:
41 |                 print('{}出错'.format(weibo_id))
42 |                 return
43 |             response = response.json()['data']
44 |             print(max_id, len(response['data']), response['data'][0]['text'])
45 |             # 获取到下一页的ID，当作下次的参数使用
46 |             max_id = response['max_id']
47 |             time.sleep(1)
48 | 
49 | 


--------------------------------------------------------------------------------
/youdao/yd_tran.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | # @Author: monkey-hjy
 3 | # @Date:   2021-04-27 11:35:40
 4 | # @Last Modified by:   monkey-hjy
 5 | # @Last Modified time: 2021-04-27 11:36:08
 6 | import requests
 7 | import hashlib
 8 | import time
 9 | import random
10 | 
11 | 
12 | class YDDict(object):
13 |     """有道翻译"""
14 | 
15 |     @staticmethod
16 |     def get_data(keyword):
17 |         """获取到其余的加密参数"""
18 |         md = hashlib.md5()
19 |         t = str(int(time.time() * 1000))
20 |         i = t + str(random.randrange(10))
21 |         md.update('fanyideskweb{}{}Tbh5E8=q6U3EXe+&L[4c@'.format(keyword, i).encode('utf8'))
22 |         sign = md.hexdigest()
23 |         return t, i, sign
24 | 
25 |     def translate(self, keyword='你好', data_from='AUTO', data_to='AUTO'):
26 |         """
27 |         对keyword进行翻译
28 |         params: params_from 文本语言
29 |         params: params_to 翻译成的语言类型
30 |         """
31 |         url = 'https://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'
32 |         headers = {
33 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36',
34 |             'Referer': 'https://fanyi.youdao.com/?keyfrom=fanyi-new.logo',
35 |             'Host': 'fanyi.youdao.com',
36 |             'Origin': 'https://fanyi.youdao.com',
37 |             'Cache-Control': 'no-cache',
38 |             'Connection': 'keep-alive',
39 |         }
40 |         t, i, sign = self.get_data(keyword)
41 |         data = {
42 |             "i": keyword,
43 |             "from": data_from,
44 |             "to": data_to,
45 |             "smartresult": "dict",
46 |             "client": "fanyideskweb",
47 |             "salt": i,
48 |             "sign": sign,
49 |             "lts": t,
50 |             # 这里bv是对UA加密得到的，所以也写成了定值
51 |             "bv": "62c1eba97402d4ff4eb261254e974c27",
52 |             "doctype": "json",
53 |             "version": "2.1",
54 |             "keyfrom": "fanyi.web",
55 |             "action": "FY_BY_REALTlME",
56 |         }
57 |         response = requests.post(url, headers=headers, data=data)
58 |         # json中包含结果，自己解析一下OK
59 |         print(response.json())
60 | 
61 | 
62 | if __name__ == '__main__':
63 |     t = YDDict()
64 |     t.translate(keyword='中国')
65 | 


--------------------------------------------------------------------------------
/zhihu/public_func.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # File: public_func.py
  3 | # Date: 2024/1/5 11:03
  4 | # Auth: HJY
  5 | # Decs:
  6 | import ctypes
  7 | import os
  8 | import random
  9 | import time
 10 | from datetime import datetime
 11 | from hashlib import md5
 12 | from urllib.parse import urlparse, parse_qs, urlencode
 13 | 
 14 | import requests
 15 | from requests import utils
 16 | from loguru import logger
 17 | 
 18 | # h：签名以来的固定数组
 19 | h = {
 20 |     "zb": [20, 223, 245, 7, 248, 2, 194, 209, 87, 6, 227, 253, 240, 128, 222, 91, 237, 9, 125, 157, 230, 93, 252,
 21 |            205, 90, 79, 144, 199, 159, 197, 186, 167, 39, 37, 156, 198, 38, 42, 43, 168, 217, 153, 15, 103, 80, 189,
 22 |            71, 191, 97, 84, 247, 95, 36, 69, 14, 35, 12, 171, 28, 114, 178, 148, 86, 182, 32, 83, 158, 109, 22, 255,
 23 |            94, 238, 151, 85, 77, 124, 254, 18, 4, 26, 123, 176, 232, 193, 131, 172, 143, 142, 150, 30, 10, 146, 162,
 24 |            62, 224, 218, 196, 229, 1, 192, 213, 27, 110, 56, 231, 180, 138, 107, 242, 187, 54, 120, 19, 44, 117,
 25 |            228, 215, 203, 53, 239, 251, 127, 81, 11, 133, 96, 204, 132, 41, 115, 73, 55, 249, 147, 102, 48, 122,
 26 |            145, 106, 118, 74, 190, 29, 16, 174, 5, 177, 129, 63, 113, 99, 31, 161, 76, 246, 34, 211, 13, 60, 68,
 27 |            207, 160, 65, 111, 82, 165, 67, 169, 225, 57, 112, 244, 155, 51, 236, 200, 233, 58, 61, 47, 100, 137,
 28 |            185, 64, 17, 70, 234, 163, 219, 108, 170, 166, 59, 149, 52, 105, 24, 212, 78, 173, 45, 0, 116, 226, 119,
 29 |            136, 206, 135, 175, 195, 25, 92, 121, 208, 126, 139, 3, 75, 141, 21, 130, 98, 241, 40, 154, 66, 184, 49,
 30 |            181, 46, 243, 88, 101, 183, 8, 23, 72, 188, 104, 179, 210, 134, 250, 201, 164, 89, 216, 202, 220, 50,
 31 |            221, 152, 140, 33, 235, 214],
 32 |     "zk": [1170614578, 1024848638, 1413669199, -343334464, -766094290, -1373058082, -143119608, -297228157,
 33 |            1933479194, -971186181, -406453910, 460404854, -547427574, -1891326262, -1679095901, 2119585428,
 34 |            -2029270069, 2035090028, -1521520070, -5587175, -77751101, -2094365853, -1243052806, 1579901135,
 35 |            1321810770, 456816404, -1391643889, -229302305, 330002838, -788960546, 363569021, -1947871109],
 36 |     "zm": [120, 50, 98, 101, 99, 98, 119, 100, 103, 107, 99, 119, 97, 99, 110, 111]
 37 | }
 38 | # salt: 签名依赖的最终数据
 39 | salt = '6fpLRqJO8M/c3jnYxFkUVC4ZIG12SiH=5v0mXDazWBTsuw7QetbKdoPyAl+hN9rgE'
 40 | # base_list: 第二次偏移需要使用的固定数组
 41 | base_list = [48, 53, 57, 48, 53, 51, 102, 55, 100, 49, 53, 101, 48, 49, 100, 55]
 42 | 
 43 | 
 44 | class PublicFunc:
 45 | 
 46 |     def __init__(self, log_name='default') -> None:
 47 |         self.now_date = datetime.now().strftime('%Y%m%d')
 48 |         log_path = '/data/log' if os.path.exists('/data/log') else '/Users/monkey/Documents/log'
 49 |         logger.add(os.path.join(log_path, f'{log_name}_{self.now_date}.log'), encoding='utf-8',
 50 |                    enqueue=True, retention='10 days')
 51 |         self._headers = {
 52 |             'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
 53 |         }
 54 | 
 55 |     @staticmethod
 56 |     def parse_params(url):
 57 |         url = urlparse(url)
 58 |         params = {k: v[0] for k, v in parse_qs(url.query).items()}
 59 |         return params
 60 | 
 61 |     @staticmethod
 62 |     def get_proxies():
 63 |         return {
 64 |             'http': 'xxx',
 65 |             'https': 'xxx',
 66 |         }
 67 | 
 68 |     def get_response(self, url, params=None, data=None, headers=None, method='get', cookies=None):
 69 |         err_count = 0
 70 |         e_res = None
 71 |         while err_count < 5:
 72 |             proxies = self.get_proxies()
 73 |             proxy_name = proxies.get('http').split('@')[-1].split('.')[1]
 74 |             try:
 75 |                 headers = self._headers if headers is None else headers
 76 |                 if method == 'get':
 77 |                     response = requests.get(url, params=params, timeout=15, headers=headers, proxies=proxies, cookies=cookies)
 78 |                 elif method == 'post':
 79 |                     response = requests.post(url, data=data, timeout=15, headers=headers, cookies=cookies)
 80 |                 else:
 81 |                     return None
 82 |                 if response.status_code == 200:
 83 |                     response.encoding = 'utf8'
 84 |                     if '网络不给力，请稍后重试' in response.text and 'paging' not in response.text:
 85 |                         raise Exception('网络不给力，请稍后重试')
 86 |                     if '安全验证' in response.text and 'paging' not in response.text:
 87 |                         raise Exception('安全验证')
 88 |                     return response
 89 |                 if '"code":4041,"name":"NotFoundError","message":"资源不存在"' in response.text:
 90 |                     return response
 91 |                 raise Exception(response.status_code)
 92 |             except Exception as e:
 93 |                 err_count += 1
 94 |                 e_res = e
 95 |         return e_res
 96 | 
 97 |     @staticmethod
 98 |     def encrypt_md5(md5_str):
 99 |         """md5 加密"""
100 |         md5_obj = md5()
101 |         md5_obj.update(md5_str.encode())
102 |         return md5_obj.hexdigest()
103 | 
104 |     @staticmethod
105 |     def str_to_unicode(translate_str):
106 |         """将str 使用ord 转换成 整型列表"""
107 |         ord_list = list()
108 |         for str_ in translate_str:
109 |             ord_list.append(ord(str_))
110 |         return ord_list
111 | 
112 |     @staticmethod
113 |     def add_params_to_list(ord_list):
114 |         """
115 |         补全 ord_list 中数据
116 |         首先第一个部分是 随机数 * 127
117 |         第二部分是 0
118 |         第三部分是 ord_list
119 |         上面三部分构成长度为34的数组
120 |         第四部分是 [14,14,14,14,14,14,14,14,14,14,14,14,14,14]
121 |         最终构成长度为48为的数组
122 |         :param ord_list:
123 |         :return:
124 |         """
125 |         params_list = list()
126 |         random_num = int(random.random() * 127)  # 随机值 控制每次签名不同
127 |         params_list.append(random_num)
128 |         params_list.append(0)
129 |         params_list.extend(ord_list)
130 |         params_list.extend([14 for i in range(14)])
131 |         return params_list
132 | 
133 |     @staticmethod
134 |     def get_head_16(params_list):
135 |         """
136 |         获取 params_list 前16位
137 |         与数组base_list做异或操作：
138 |         base_list=[48,53,57,48,53,51,102,55,100,49,53,101,48,49,100,55]
139 |         :param params_list:
140 |         :return:
141 |         """
142 |         head_16_list = [params_list[index] ^ base_list[index] ^ 42 for index in range(16)]
143 |         return head_16_list
144 | 
145 |     def js_func_g_x(self, e, t):
146 |         """
147 |         还原js 函数 __g.x
148 |         :param e:
149 |         :param t:
150 |         :return:
151 |         """
152 |         n = list()
153 |         r = len(e) // 16
154 |         # 16步进
155 |         for i in range(0, r):
156 |             a = [0 for i in range(16)]  # 16位列表
157 |             o = e[16 * i: 16 * (i + 1)]
158 |             for c in range(16):
159 |                 a[c] = o[c] ^ t[c]
160 |             t = self.js_func_g_r(a)
161 |             n.extend(t)
162 |         return n
163 | 
164 |     def js_func_g_r(self, e):
165 |         """
166 |         还原js 函数 __g.r
167 |         :param e:
168 |         :return:
169 |         """
170 |         t = [0 for i in range(16)]  # 16位列表
171 |         n = [0 for j in range(36)]  # 36位列表
172 |         n[0] = self.js_func_b(e, 0)
173 |         n[1] = self.js_func_b(e, 4)
174 |         n[2] = self.js_func_b(e, 8)
175 |         n[3] = self.js_func_b(e, 12)
176 |         for r in range(32):
177 |             o = self.js_func_g(n[r + 1] ^ n[r + 2] ^ n[r + 3] ^ h.get('zk')[r])
178 |             n[r + 4] = n[r] ^ o
179 |         self.js_func_i(n[35], t, 0)
180 |         self.js_func_i(n[34], t, 4)
181 |         self.js_func_i(n[33], t, 8)
182 |         self.js_func_i(n[32], t, 12)
183 |         return t
184 | 
185 |     @staticmethod
186 |     def js_func_b(e, t):
187 |         """
188 |         还原js 函数B
189 |         :param e:
190 |         :param t:
191 |         :return:
192 |         """
193 |         return (255 & e[t]) << 24 | (255 & e[t + 1]) << 16 | (255 & e[t + 2]) << 8 | 255 & e[t + 3]
194 | 
195 |     def js_func_g(self, e):
196 |         """
197 |         还原js function G
198 |         :param e:
199 |         :return:
200 |         """
201 | 
202 |         t = [0 for i in range(4)]  # 16位列表
203 |         n = [0 for j in range(4)]  # 36位列表
204 |         self.js_func_i(e, t, 0)  # 调用 js_func_i 设定初始值
205 |         n[0] = h.get('zb')[255 & t[0]]
206 |         n[1] = h.get('zb')[255 & t[1]]
207 |         n[2] = h.get('zb')[255 & t[2]]
208 |         n[3] = h.get('zb')[255 & t[3]]
209 |         r = self.js_func_b(n, 0)
210 |         res = r ^ self.js_func_q(r, 2) ^ self.js_func_q(r, 10) ^ self.js_func_q(r, 18) ^ self.js_func_q(r, 24)
211 |         return res
212 | 
213 |     def js_func_q(self, e, t):
214 |         """
215 |         还原js function Q
216 |         :param e:
217 |         :param t:
218 |         :return:
219 |         """
220 |         res = (4294967295 & e) << t | self.unsigned_right_shitf(e, 32 - t)
221 |         return res
222 | 
223 |     def js_func_i(self, e, t, n):
224 |         """
225 |         还原 js func i
226 |         :param e:
227 |         :param t:
228 |         :param n:
229 |         :return:
230 |         """
231 |         t[n] = 255 & self.unsigned_right_shitf(e, 24)
232 |         t[n + 1] = 255 & self.unsigned_right_shitf(e, 16)
233 |         t[n + 2] = 255 & self.unsigned_right_shitf(e, 8)
234 |         t[n + 3] = 255 & e
235 | 
236 |     def unsigned_right_shitf(self, n, i):
237 |         # 数字小于0，则转为32位无符号uint
238 |         if n < 0:
239 |             n = ctypes.c_uint32(n).value
240 |         # 正常位移位数是为正数，但是为了兼容js之类的，负数就右移变成左移好了
241 |         if i < 0:
242 |             return -self.int_overflow(n << abs(i))
243 |         return self.int_overflow(n >> i)
244 | 
245 |     @staticmethod
246 |     def int_overflow(val):
247 |         maxint = 2147483647
248 |         if not -maxint - 1 <= val <= maxint:
249 |             val = (val + (maxint + 1)) % (2 * (maxint + 1)) - maxint - 1
250 |         return val
251 | 
252 |     @staticmethod
253 |     def get_result_value_list(new_48_list):
254 |         """转换数值列表"""
255 |         # 将列表[i:i+3]切片,并饭庄
256 |         result_value_list = list()
257 |         split_list = [new_48_list[i:i + 3] for i in range(0, len(new_48_list), 3)]
258 |         split_list.reverse()
259 |         for i in range(len(split_list)):
260 |             _temp_list = split_list[i]
261 |             _temp_list.reverse()
262 |             _val = i % 4
263 |             if _val == 0:
264 |                 temp_value_1 = _temp_list[_val] ^ 58
265 |                 temp_value_2 = _temp_list[1] << 8
266 |                 temp_value_3 = _temp_list[2] << 16
267 |             elif _val == 1:
268 |                 temp_value_1 = _temp_list[0]
269 |                 temp_value_2 = (_temp_list[_val] ^ 58) << 8
270 |                 temp_value_3 = _temp_list[2] << 16
271 |             elif _val == 2:
272 |                 temp_value_1 = _temp_list[0]
273 |                 temp_value_2 = _temp_list[1] << 8
274 |                 temp_value_3 = (_temp_list[_val] ^ 58) << 16
275 |             else:
276 |                 temp_value_1 = _temp_list[0]
277 |                 temp_value_2 = _temp_list[1] << 8
278 |                 temp_value_3 = _temp_list[2] << 16
279 |             value = temp_value_1 | temp_value_2 | temp_value_3
280 |             result_value_list.append(value)
281 |         return result_value_list
282 | 
283 |     @staticmethod
284 |     def make_zhihu_sign(result_value_list):
285 |         """通过salt 转换签名字符串"""
286 |         sign_str = ''
287 |         for _value in result_value_list:
288 |             sign_str += salt[_value & 63]
289 |             sign_str += salt[_value >> 6 & 63]
290 |             sign_str += salt[_value >> 12 & 63]
291 |             sign_str += salt[_value >> 18 & 63]
292 |         return sign_str
293 | 
294 |     def test_case(self, url, d_c0):
295 |         md5_str = '101_3_3.0+' + url + d_c0
296 |         md5_res = self.encrypt_md5(md5_str)
297 |         ord_list = self.str_to_unicode(md5_res)
298 |         params_list = self.add_params_to_list(ord_list)
299 |         head_16_list = self.get_head_16(params_list)
300 |         end_32_list = params_list[16:]
301 |         new_16_list = self.js_func_g_r(head_16_list)
302 |         new_32_list = self.js_func_g_x(end_32_list, new_16_list)
303 |         new_48_list = list()
304 |         new_48_list.extend(new_16_list)
305 |         new_48_list.extend(new_32_list)
306 |         result_value_list = self.get_result_value_list(new_48_list)
307 |         sign_str = self.make_zhihu_sign(result_value_list)
308 |         return sign_str
309 | 
310 |     def get_cookie_d_c0(self, proxies=None):
311 |         end_sign = self.test_case('/udid', '')
312 |         headers = {
313 |             'x-zse-93': '101_3_3.0',
314 |             'x-api-version': '3.0.91',
315 |             'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
316 |             'x-zse-96': '2.0_' + end_sign,
317 |             'accept': '*/*',
318 |         }
319 |         d_c0 = None
320 |         err_count = 0
321 |         while err_count <= 10:
322 |             try:
323 |                 first_res = requests.post('https://www.zhihu.com/udid', data={}, headers=headers, proxies=proxies,
324 |                                           timeout=60)
325 |                 cookie_t = utils.dict_from_cookiejar(first_res.cookies)
326 |                 d_c0 = cookie_t.get('d_c0')
327 |                 return d_c0
328 |             except Exception as e:
329 |                 err_count += 1
330 |                 time.sleep(random.randint(1, 10))
331 |                 logger.error(f'get_cookie_d_c0 err_count:{err_count}, proxies: {proxies}, e: {e}')
332 |         return d_c0
333 | 
334 |     def _get_end_sign(self, md5_str):
335 |         # md5_str = '101_3_3.0+'+url+d_c0
336 |         md5_res = self.encrypt_md5(md5_str)
337 |         ord_list = self.str_to_unicode(md5_res)
338 |         params_list = self.add_params_to_list(ord_list)
339 |         head_16_list = self.get_head_16(params_list)
340 |         end_32_list = params_list[16:]
341 |         new_16_list = self.js_func_g_r(head_16_list)
342 |         new_32_list = self.js_func_g_x(end_32_list, new_16_list)
343 |         new_48_list = list()
344 |         new_48_list.extend(new_16_list)
345 |         new_48_list.extend(new_32_list)
346 |         result_value_list = self.get_result_value_list(new_48_list)
347 |         sign_str = self.make_zhihu_sign(result_value_list)
348 |         return sign_str
349 | 
350 |     @staticmethod
351 |     def get_headers(d_c0, end_sign):
352 |         headers = {
353 |             "cookie": f"d_c0={d_c0};",
354 |             'x-zse-93': '101_3_3.0',
355 |             'x-api-version': '3.0.91',
356 |             'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36',
357 |             'x-zse-96': '2.0_' + end_sign,
358 |             'accept': '*/*',
359 |             # 'referer': 'https://www.zhihu.com/search?q=%E6%B5%B7%E8%B4%BC%E7%8E%8B%E7%B4%A2%E9%9A%86%E8%BA%AB%E4%B8%96%E6%8F%AD%E7%A7%98&type=zvideo&utm_content=search_hot',
360 |             'accept-encoding': 'gzip, deflate',
361 |             'accept-language': 'zh-CN,zh;q=0.9',
362 |         }
363 |         return headers
364 | 
365 |     def run(self, keyword):
366 |         url = f'https://www.zhihu.com/api/v4/search_v3?gk_version=gz-gaokao&t=general&q={keyword}&correction=1&offset=0&limit=20&filter_fields=&lc_idx=0&show_all_topics=0&search_source=Filter&vertical=answer&time_interval=a_week'
367 |         url_params = t.parse_params(url)
368 |         params = url_params
369 |         offset = url_params.get('offset', 0)
370 |         req_url = 'https://www.zhihu.com/api/v4/search_v3'
371 |         reply_num = 0
372 |         while True:
373 |             d_c0 = t.get_cookie_d_c0()
374 |             end_sign = t._get_end_sign(f'101_3_3.0+/api/v4/search_v3?{urlencode(params)}+{d_c0}')
375 |             headers = t.get_headers(d_c0, end_sign)
376 |             response = t.get_response(url=req_url, headers=headers, params=params)
377 |             if isinstance(response, requests.Response):
378 |                 break
379 |             reply_num += 1
380 |             logger.error(
381 |                 f'search keyword reply {reply_num} times! keyword: {keyword}, offset: {offset}, e: {response}')
382 |             if reply_num > 50:
383 |                 return
384 |         response = response.json()
385 |         return response
386 | 
387 | 
388 | if __name__ == '__main__':
389 |     t = PublicFunc()
390 |     keyword = '海贼王'
391 |     t.run(keyword)
392 | 
393 | 


--------------------------------------------------------------------------------
/ziru/zr.py:
--------------------------------------------------------------------------------
  1 | from pytesseract.pytesseract import image_to_string
  2 | import requests
  3 | from lxml import etree
  4 | from PIL import Image
  5 | import pytesseract
  6 | import re
  7 | import time
  8 | import os
  9 | import pymysql
 10 | 
 11 | 
 12 | class Ziru(object):
 13 | 
 14 |     def __init__(self):
 15 |         self._headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'}
 16 |         self.city_info =  dict()
 17 |         self.cwd = '/'.join(__file__.split('/')[:-1])
 18 |         self.conn = pymysql.Connection(host='localhost', user='root', password='root', database='demo', port=3306)
 19 |         self.cursor = self.conn.cursor()
 20 |     
 21 |     def __del__(self):
 22 |         self.conn.close()
 23 |     
 24 |     def get_response(self, url):
 25 |         response = requests.get(url, headers=self._headers)
 26 |         if response.status_code == 200:
 27 |             response.encoding = response.apparent_encoding
 28 |             return response
 29 |         else:
 30 |             print(response.status_code)
 31 |             return None
 32 |     
 33 |     def get_city_info(self):
 34 |         response = self.get_response(url='https://www.ziroom.com/')
 35 |         if response is None:
 36 |             return
 37 |         html = etree.HTML(response.text)
 38 |         city_name = html.xpath('//a[@class="Z_city_option ani"]/text()')
 39 |         city_url = html.xpath('//a[@class="Z_city_option ani"]/@href')
 40 |         self.city_info = dict(zip(city_name, city_url))
 41 |     
 42 |     @staticmethod
 43 |     def image_identification(img_path):
 44 |         the_img = Image.open(img_path)
 45 |         result = pytesseract.image_to_string(the_img, config='--psm 7')
 46 |         os.remove(img_path)
 47 |         return list(result.strip())
 48 |     
 49 |     def get_zone_info(self, city_url):
 50 |         response = self.get_response(city_url + 'z/')
 51 |         if response is None:
 52 |             return
 53 |         html = etree.HTML(response.text)
 54 |         zone_url = html.xpath('//a[text()="区域"]/following-sibling::div/a/@href')
 55 |         zone_name = html.xpath('//a[text()="区域"]/following-sibling::div/a/text()')
 56 |         zone_info = dict(zip(zone_name, zone_url))
 57 |         for key in zone_info:
 58 |             print('开始获取{}的数据'.format(key))
 59 |             self.get_room_info('https:{}'.format(zone_info[key]))
 60 |     
 61 |     def get_room_info(self, url):
 62 |         response = self.get_response(url)
 63 |         if response is None:
 64 |             print('{}获取失败'.format(url))
 65 |             return
 66 |         print(url)
 67 |         html = etree.HTML(response.text)
 68 |         title = html.xpath('//h5[starts-with(@class, "title")]/a/text()')
 69 |         room_url = ['https:{}'.format(info) for info in html.xpath('//h5[starts-with(@class, "title")]/a/@href')]
 70 |         desc = html.xpath('//div[@class="desc"]/div[1]/text()')
 71 |         location = [info.strip() for info in html.xpath('//div[@class="location"]/text()')]
 72 |         room_price = list()
 73 |         room_element = html.xpath('//div[@class="Z_list"]/div[2]/div')
 74 |         for element in room_element:
 75 |             price = ''
 76 |             img_url = element.xpath('.//span[@class="num"]/@style')
 77 |             if not img_url:
 78 |                 continue
 79 |             img_url = re.findall('url\((.*?)\)', img_url[0])[0]
 80 |             price_position = [float(re.findall('position: -(.*?)px', info)[0]) for info in element.xpath('.//span[@class="num"]/@style')]
 81 |             img_path = os.path.join(self.cwd, img_url.split('/')[-1])
 82 |             with open(img_path, 'wb') as f:
 83 |                 f.write(self.get_response('https:{}'.format(img_url)).content)
 84 |             img_nums = self.image_identification(img_path)
 85 |             for position in price_position:
 86 |                 price += img_nums[int(position / 20)]
 87 |             try:
 88 |                 room_price.append(int(price))
 89 |             except:
 90 |                 room_price.append(None)
 91 |         data = {
 92 |             '标题': title,
 93 |             '链接': room_url,
 94 |             '信息': desc,
 95 |             '地址': location,
 96 |             '价格': room_price,
 97 |         }
 98 |         self.save_data(data)
 99 |         next_url = html.xpath('//a[@class="next"]/@href')
100 |         if next_url:
101 |             self.get_room_info('https:{}'.format(next_url[0]))
102 |     
103 |     def save_data(self, item):
104 |         data = list()
105 |         for i in range(len(item['标题'])):
106 |             info = list()
107 |             for key in item.keys():
108 |                 info.append(item[key][i])
109 |             data.append(info)
110 |         sql = 'INSERT INTO ziru (title, url, info, location, price) VALUES (%s, %s, %s, %s, %s);'
111 |         # print(data)
112 |         self.cursor.executemany(sql, data)
113 |         self.conn.commit()
114 |         print('保存成功{}条'.format(len(data)))
115 |     
116 |     def run(self):
117 |         self.get_zone_info('https://sh.ziroom.com/')
118 | 
119 | 
120 | if __name__ == '__main__':
121 |     s = Ziru()
122 |     s.run()
123 | 
124 | 


--------------------------------------------------------------------------------