├── .gitignore
├── LICENSE
├── README.md
├── bilibili
│   ├── analysis.py
│   ├── assign_up.ini.tmp
│   ├── basicBilibili.py
│   ├── bsocket.py
│   ├── geetestE.py
│   ├── loginBilibili.py
│   └── upBilibili.py
├── blog
│   └── titleviews.py
├── brushclass
│   └── brushclass.py
├── buildmd
│   ├── activateArticle.py
│   ├── article.sql
│   ├── buildmd.py
│   ├── tbk.ini.tmp
│   └── tpwd.sql
├── ctrip
│   ├── hotelDetail.js
│   └── hotelDetail.py
├── dytt8
│   └── dytt8.py
├── eastmoney
│   └── eastmoney.py
├── exam
│   ├── shaoq.js
│   └── shaoq.py
├── mafengwo
│   ├── hotel.js
│   └── mafengwo.py
├── movie
│   └── douban.py
├── netease
│   ├── netease_music_base.py
│   ├── netease_music_db.py
│   └── table.sql
├── news
│   └── news.py
├── press
│   └── press.py
├── proxy
│   ├── getproxy.py
│   ├── ip66.py
│   └── table.sql
├── requirement.txt
├── util
│   ├── db.py
│   ├── util.ini.tmp
│   └── util.py
└── zimuzu
    ├── zimuzu.ini.tmp
    └── zimuzu.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
2 |
3 | # python cache
4 | __pycache__
5 |
6 | # jupyter
7 | .ipynb_checkpoints
8 |
9 | # test
10 | test*
11 | Untitled*
12 |
13 | # gatherproxy
14 | gatherproxy
15 |
16 | # log
17 | log
18 |
19 | # song_detail
20 | song_detail
21 |
22 | # ide
23 | .idea
24 | .vscode
25 |
26 | # data
27 | data
28 | yybzz
29 |
30 | .DS_Store
31 |
32 | *.csv
33 | *.txt
34 | *.ini
35 |
36 | # utils.agent
37 | utils/agent
38 |
39 | # history
40 | .history
41 |
42 | # tbk
43 | top
44 | picture*
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018-present gunjianpan(iofu728)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | 
4 | Spider Man
5 |
6 | [](https://github.com/iofu728/spider/blob/master/LICENSE)
7 | [](https://github.com/iofu728/spider/releases)
8 | [](https://github.com/iofu728/spider)
9 |
10 | Highly Available Proxy IP Pool, Highly Concurrent Request Builder, and Some Practical Applications
12 |
13 | ## Navigation
14 |
15 | | site | document | Last Modified time |
16 | | -------------------- | ----------------------------------------- | ------------------ |
17 | | some proxy sites, etc. | [Proxy pool](#proxy-pool)                 | 20-06-01           |
18 | | music.163.com | [Netease](#netease) | 18-10-21 |
19 | | - | [Press Test System](#press-test-system) | 18-11-10 |
20 | | news.baidu.com | [News](#news) | 19-01-25 |
21 | | note.youdao.com | [Youdao Note](#youdao-note) | 20-01-04 |
22 | | jianshu.com/csdn.net | [blog](#blog) | 20-01-04 |
23 | | elective.pku.edu.cn | [Brush Class](#brush-class) | 19-10-11 |
24 | | zimuzu.tv | [zimuzu](#zimuzu) | 19-04-13 |
25 | | bilibili.com | [Bilibili](#bilibili) | 20-06-06 |
26 | | exam.shaoq.com | [shaoq](#shaoq) | 19-03-21 |
27 | | data.eastmoney.com | [Eastmoney](#eastmoney) | 19-03-29 |
28 | | hotel.ctrip.com | [Ctrip Hotel Detail](#ctrip-hotel-detail) | 19-10-11 |
29 | | douban.com | [DouBan](#douban) | 19-05-07 |
30 | | 66ip.cn | [66ip](#66ip) | 19-05-07 |
31 |
32 | ## Keywords
33 |
34 | - Big data storage
35 | - High-concurrency requests
36 | - WebSocket support
37 | - Methods for font anti-crawling (font cheat)
38 | - Methods for compiling JavaScript
39 | - Some applications
40 |
41 | ## Quick Start
42 |
43 | `docker` support is on the way.
44 |
45 | ```bash
46 | $ git clone https://github.com/iofu728/spider.git
47 | $ cd spider
48 | $ pip install -r requirement.txt
49 |
50 | # load proxy pool
51 | $ python proxy/getproxy.py # to load proxy resources
52 | ```
53 |
54 | > To use the proxy pool:
55 |
56 | ```python
57 | ''' using proxy requests '''
58 | from proxy.getproxy import GetFreeProxy # to use proxy
59 | proxy_req = GetFreeProxy().proxy_req
60 | proxy_req(url:str, types:int, data=None, test_func=None, header=None)
61 |
62 | ''' using basic requests '''
63 | from util.util import basic_req
64 | basic_req(url: str, types: int, proxies=None, data=None, header=None, need_cookie: bool = False)
65 | ```
66 |
67 | ## Structure
68 |
69 | ```bash
70 | .
71 | ├── LICENSE
72 | ├── README.md
73 | ├── bilibili
74 | │ ├── analysis.py // data analysis
75 | │ ├── bilibili.py // bilibili basic
76 | │ └── bsocket.py // bilibili websocket
77 | ├── blog
78 | │ └── titleviews.py // Zhihu && CSDN && jianshu
79 | ├── brushclass
80 | │ └── brushclass.py // PKU elective
81 | ├── buildmd
82 | │ └── buildmd.py // Youdao Note
83 | ├── eastmoney
84 | │ └── eastmoney.py // font analysis
85 | ├── exam
86 | │ ├── shaoq.js // jsdom
87 | │ └── shaoq.py // compile js shaoq
88 | ├── log
89 | ├── netease
90 | │ ├── netease_music_base.py
91 | │ ├── netease_music_db.py // Netease Music
92 | │ └── table.sql
93 | ├── news
94 | │ └── news.py // Google && Baidu
95 | ├── press
96 | │ └── press.py // Press text
97 | ├── proxy
98 | │ ├── getproxy.py // Proxy pool
99 | │ └── table.sql
100 | ├── requirement.txt
101 | ├── util
102 | │   ├── db.py
103 | │   └── util.py
104 | └── zimuzu
105 | └── zimuzu.py // zimuzi
106 | ```
107 |
108 | ## Proxy pool
109 |
110 | > The proxy pool is the heart of this project.
111 |
112 | - Highly Available Proxy IP Pool
113 | - obtains data from free proxy websites such as `Gatherproxy`, `Goubanjia`, `xici`, etc.
114 | - analysis of the Goubanjia port data
115 | - quickly verifies IP availability
116 | - cooperates with Requests to assign proxy IPs automatically, with a retry mechanism and a mechanism that writes failures to the DB (see the sketch below)
117 | - two models for the proxy shell
118 | - model 1: load the gatherproxy list && update the proxy list file (needs access over the GFW; put your http://gatherproxy.com username and password in `proxy/data/passage`, one line for the username, one line for the password)
119 | - model 0: update the proxy pool db && test availability
120 | - one common proxy API
121 | - `from proxy.getproxy import GetFreeProxy`
122 | - `proxy_req = GetFreeProxy().proxy_req`
123 | - `proxy_req(url: str, types: int, data=None, test_func=None, header=None)`
124 | - also one common basic request API
125 | - `from util import basic_req`
126 | - `basic_req(url: str, types: int, proxies=None, data=None, header=None)`
127 | - if you want to crawl by using the proxy pool
128 | - because accessing the proxy website requires getting over the GFW, you may not be able to use `model 1` to download the proxy file.
129 | - download the proxy txt from 'http://gatherproxy.com'
130 | - cp download_file proxy/data/gatherproxy
131 | - python proxy/getproxy.py --model=0
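
A minimal sketch of the retry-and-reassign idea behind `proxy_req` (the pool handling and the helper `pick_proxy` are illustrative, not the exact code in `proxy/getproxy.py`):

```python
import random
import requests

def pick_proxy(pool: list) -> dict:
    """Pick a random ip:port from an in-memory pool (hypothetical helper)."""
    ip = random.choice(pool)
    return {"http": "http://" + ip, "https": "http://" + ip}

def proxy_get(url: str, pool: list, retries: int = 3, timeout: int = 10):
    """Request `url` through a random proxy, retrying with another proxy on failure."""
    for _ in range(retries):
        try:
            resp = requests.get(url, proxies=pick_proxy(pool), timeout=timeout)
            if resp.status_code == 200:
                return resp.text
        except requests.RequestException:
            continue  # bad proxy: try the next candidate
    return None  # caller can fall back to a direct request or record the failure in the DB
```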
132 |
133 | ## Netease
134 |
135 | > Netease Music song playlist crawl - [netease/netease_music_db.py](https://github.com/iofu728/spider/blob/master/netease/netease_music_db.py)
136 |
137 | - problem: `big data store`
138 | - classify -> playlist id -> song_detail
139 | - V1: write to a file, one-shot run, no proxy, no progress-recording mechanism
140 | - V1.5: a small pool of proxy IPs
141 | - V2: proxy IP pool, progress recording, write to MySQL
142 |
143 | - Optimize the write to DB with `LOAD DATA` / `REPLACE INTO` (a minimal sketch follows)
144 |
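A minimal sketch of the idempotent batch write with `REPLACE INTO` via pymysql (connection settings, table and column names are illustrative, not the actual schema in `netease/table.sql`):

```python
import pymysql

def save_songs(rows):
    """Batch-write crawled rows; REPLACE INTO keeps reruns idempotent."""
    conn = pymysql.connect(host="localhost", user="root", password="",
                           db="netease", charset="utf8mb4")
    sql = "REPLACE INTO song_detail (song_id, song_name, playlist_id) VALUES (%s, %s, %s)"
    try:
        with conn.cursor() as cur:
            cur.executemany(sql, rows)  # one batch round trip instead of row-by-row INSERTs
        conn.commit()
    finally:
        conn.close()
```
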
145 | - [Netease Music Spider for DB](https://wyydsb.xin/other/neteasedb.html)
146 | - [Netease Music Spider](https://wyydsb.xin/other/netease.html)
147 |
148 | ## Press Test System
149 |
150 | > Press Test System - [press/press.py](https://github.com/iofu728/spider/blob/master/press/press.py)
151 |
152 | - problem: `high concurrency requests`
153 | - uses the highly available proxy IP pool to pretend to be many different users
154 | - gives a web service uneven pressure (a minimal sketch follows)
155 | - To do: uniform pressure
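
A minimal sketch of the high-concurrency pressing idea with a thread pool (the target URL is a placeholder; the real `press/press.py` also rotates proxies from the pool):

```python
import concurrent.futures
import requests

def hit(url: str, proxies: dict = None) -> int:
    """Send one request and return the status code (-1 on failure)."""
    try:
        return requests.get(url, proxies=proxies, timeout=5).status_code
    except requests.RequestException:
        return -1

def press(url: str, total: int = 100, workers: int = 20) -> list:
    """Fire `total` requests with `workers` threads; plugging a different proxy
    into each call makes the traffic look like many independent users."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as ex:
        return list(ex.map(lambda _: hit(url), range(total)))
```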
156 |
157 | ## News
158 |
159 | > google & baidu info crawl - [news/news.py](https://github.com/iofu728/spider/blob/master/news/news.py)
160 |
161 | - get news from search engines through the proxy engine
162 | - one model: careful `DOM` analysis
163 | - the other model: rough analysis of `Chinese words` (both models are sketched below)
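
A rough sketch of the two models (the CSS selector and the Chinese-run heuristic are assumptions for illustration, not the exact rules in `news/news.py`):

```python
import re
from bs4 import BeautifulSoup

CN_RUN = re.compile(r"[\u4e00-\u9fa5]{6,}")  # runs of six or more Chinese characters

def careful_titles(html: str) -> list:
    """Careful model: walk the DOM and read the result anchors directly."""
    soup = BeautifulSoup(html, "html.parser")
    return [a.get_text(strip=True) for a in soup.select("h3 a")]

def rough_titles(html: str) -> list:
    """Rough model: strip the tags and keep long runs of Chinese text."""
    text = BeautifulSoup(html, "html.parser").get_text(" ")
    return CN_RUN.findall(text)
```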
164 |
165 | ## Youdao Note
166 |
167 | > Youdao Note documents crawl - [buildmd/buildmd.py](https://github.com/iofu728/spider/blob/master/buildmd/buildmd.py)
168 |
169 | - load data from `youdaoyun`
170 | - apply a series of rules to convert the data to `.md`
171 |
172 | ## blog
173 |
174 | > csdn && zhihu && jianshu view info crawl - [blog/titleviews.py](https://github.com/iofu728/spider/blob/master/blog/titleviews.py)
175 |
176 | ```bash
177 | $ python blog/titleviews.py --model=1 >> log 2>&1 # model = 1: load gather model or python blog/titleviews.py --model=1 >> proxy.log 2>&1
178 | $ python blog/titleviews.py --model=0 >> log 2>&1 # model = 0: update gather model
179 | ```
180 |
181 | ## Brush Class
182 |
183 | > PKU Class brush - [brushclass/brushclass.py](https://github.com/iofu728/spider/blob/master/brushclass/brushclass.py)
184 |
185 | - When your expected class has open places, it will send you an email (a minimal sketch of the notification follows).
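
A minimal sketch of the notification mail (SMTP host, account and course name are placeholders):

```python
import smtplib
from email.mime.text import MIMEText

def notify(to_addr: str, course: str):
    """Send a plain-text mail once a seat opens up."""
    msg = MIMEText("{} now has open places, go and elect it!".format(course))
    msg["Subject"] = "[elective] seat available"
    msg["From"] = "bot@example.com"
    msg["To"] = to_addr
    with smtplib.SMTP("smtp.example.com", 587) as server:
        server.starttls()
        server.login("bot@example.com", "password")
        server.send_message(msg)
```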
186 |
187 | ## zimuzu
188 |
189 | > ZiMuZu download list crawl - [zimuzu/zimuzu.py](https://github.com/iofu728/spider/blob/master/zimuzu/zimuzu.py)
190 |
191 | - When you want to download lots of episodes of a show, like Season 22 or Season 21,
192 | - clicking them one by one is very boring, so zimuzu.py is all you need.
193 | - The only thing you need to do is wait for the program to run,
194 | - and then copy the Thunder URLs in one go to download the episodes.
195 | - Now that the winter is coming, I think you need it to review ``.
196 |
197 | ## Bilibili
198 |
199 | > Get av data by http - [bilibili/bilibili.py](https://github.com/iofu728/spider/blob/master/bilibili/bilibili.py)
200 |
201 | - `homepage rank` -> check `tids` -> check the data every 2 min (while on the rank list + one more day; see the polling sketch below)
202 | - monitor every ranked av -> star num & basic data
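
A minimal polling sketch against the archive-stat endpoint also used in `bilibili/basicBilibili.py` (the 2-minute interval follows the description above; the JSON field names are assumptions, and the real scripts add proxies, CSV output and abnormal-value alerts):

```python
import time
import requests

STAT_URL = "http://api.bilibili.com/x/web-interface/archive/stat?aid={}"

def monitor(av_id: int, hours: float = 24, interval: int = 120) -> list:
    """Poll the archive stat every `interval` seconds and keep the raw rows."""
    rows = []
    for _ in range(int(hours * 3600 // interval)):
        data = requests.get(STAT_URL.format(av_id), timeout=10).json().get("data", {})
        rows.append((time.strftime("%m-%d %H:%M"), data.get("view"), data.get("favorite")))
        time.sleep(interval)
    return rows
```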
203 |
204 | > Get av data by websocket - [bilibili/bsocket.py](https://github.com/iofu728/spider/blob/master/bilibili/bsocket.py)
205 |
206 | - base on WebSocket
207 | - byte analysis
208 | - heartbeat
209 |
210 | > Get comment data by http - [bilibili/bilibili.py](https://github.com/iofu728/spider/blob/master/bilibili/bilibili.py)
211 |
212 | - load comment from `/x/v2/reply`
213 |
214 | - UnicodeEncodeError: 'ascii' codec can't encode characters in position 7-10: ordinal not in range(128)
215 |
216 | - read/write in `utf-8`
217 | - use `codecs.open(filename, 'r'/'w', encoding='utf-8')`, for example:
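
```python
import codecs

# append comment rows that contain non-ASCII characters without hitting the ascii codec
# (the file path is illustrative)
with codecs.open("bilibili/data/comment.csv", "a", encoding="utf-8") as f:
    f.write("06-06 12:00,很好的视频,某用户\n")
```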
218 |
219 | - some `bilibili` URLs return 404, like `http://api.bilibili.com/x/relation/stat?jsonp=jsonp&callback=__jp11&vmid=`
220 |
221 | basic_req automatically adds `Host` to the headers, but this URL cannot be requested with a `Host` header
222 |
223 | ## shaoq
224 |
225 | > Get text data by compiling javascript - [exam/shaoq.py](https://github.com/iofu728/spider/blob/master/exam/shaoq.py)
226 |
227 | - Idea
228 |
229 | 1. get cookie
230 | 2. request image
231 | 3. request again after 5.5 s
232 | 4. compile javascript code -> get css
233 | 5. analyze the css
234 |
235 | - Requirement
236 |
237 | ```sh
238 | pip3 install PyExecJS
239 | yarn add jsdom # or: npm install jsdom (PS: install locally, not globally)
240 | ```
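
A toy example of the PyExecJS compile/call flow used in step 4 (the real shaoq script also `require`s the locally installed jsdom; the function body here is a stand-in, not the site's decryption routine):

```python
import execjs

ctx = execjs.compile("""
function get_css(encrypted) {
    // stand-in for the page's real css-decryption routine
    return encrypted.split('').reverse().join('');
}
""")
print(ctx.call("get_css", "abc"))  # -> "cba"
```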
241 |
242 | - Can't get true html
243 |
244 | - Wait time must be 5.5s.
245 | - So you can use `threading` or `await asyncio.gather` to request the images concurrently (see the sketch after this list)
246 |
247 | - [Coroutines and Tasks](https://docs.python.org/3/library/asyncio-task.html)
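
A minimal `asyncio.gather` sketch for requesting the images concurrently and then waiting the required 5.5 s (URLs are placeholders):

```python
import asyncio
import aiohttp

async def fetch(session: aiohttp.ClientSession, url: str) -> bytes:
    async with session.get(url) as resp:
        return await resp.read()

async def warm_up(image_urls: list):
    """Fetch every image in parallel, then sleep 5.5 s before the real request."""
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(fetch(session, u) for u in image_urls))
    await asyncio.sleep(5.5)

# asyncio.run(warm_up(["https://example.com/img/1.png", "https://example.com/img/2.png"]))
```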
248 |
249 | - Error: Cannot find module 'jsdom'
250 |
251 | > jsdom must install in local not in global
252 |
253 | - [Cannot find module 'jsdom'](https://github.com/scala-js/scala-js/issues/2642)
254 |
255 | - remove subtree & edit subtree & re.findall
256 |
257 | ```py
258 | subtree.extract()
259 | subtree.string = new_string
260 | parent_tree.find_all(re.compile(pattern))  # pattern is a placeholder for the tag-name regex
261 | ```
262 |
263 | - [extract()](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#extract)
264 | - [NavigableString](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigablestring)
265 | - [A regular expression](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#a-regular-expression)
266 |
267 | ## Eastmoney
268 |
269 | > Get stock info by analysis font - [eastmoney/eastmoney.py](https://github.com/iofu728/spider/blob/master/eastmoney/eastmoney.py)
270 |
271 | - font analysis
272 |
273 | - Idea
274 |
275 | 1. get data from HTML -> json
276 | 2. get font map -> transform num
277 | 3. or download the font and analyze it (contrast with a base font)
278 |
279 | - error: unpack requires a buffer of 20 bytes
280 |
281 | - requests.text -> str
282 | - requests.content -> bytes (use `.content` for binary data like the font file)
283 |
284 | - [Struct.error: unpack requires a buffer of 16 bytes](https://stackoverflow.com/questions/51110525/struct-error-unpack-requires-a-buffer-of-16-bytes)
285 |
286 | - How to analyze the font
287 |
288 | - use fontTools
289 | - get `TTFont(font_path)["cmap"].getBestCmap()`
290 | - contrast with a base font (see the sketch below)
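
A minimal sketch of the cmap-based decoding (the mapping `base_glyph_to_digit` would be built once from a reference font; it is an assumption here, not the repo's exact logic):

```python
from fontTools.ttLib import TTFont

def decode_number(cipher_text: str, font_path: str, base_glyph_to_digit: dict) -> str:
    """Translate obfuscated digits using the downloaded font's best cmap."""
    cmap = TTFont(font_path)["cmap"].getBestCmap()  # {codepoint: glyph name}
    return "".join(
        str(base_glyph_to_digit.get(cmap.get(ord(ch)), ch)) for ch in cipher_text
    )
```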
291 |
292 | - configuration file
293 |
294 | - cfg = ConfigParser()
295 | - cfg.read(assign_path, 'utf-8')
296 | - [13.10 Reading configuration files](https://python3-cookbook.readthedocs.io/zh_CN/latest/c13/p10_read_configuration_files.html)
297 |
298 | ## Ctrip Hotel Detail
299 |
300 | > Get Ctrip Hotel True Detail - [ctrip/hotelDetail.py](https://github.com/iofu728/spider/blob/master/ctrip/hotelDetail.py)
301 |
302 | - int32
303 |
304 | ```python
305 | np.int32()
306 | ```
307 |
308 | - js charCodeAt() in py
309 |
310 | [How to implement the js charCodeAt() method in Python?](https://www.zhihu.com/question/57108214)
311 |
312 | ```python
313 | ord(string[index])
314 | ```
315 |
316 | - python cross-folder import (make the project root importable)
317 |
318 | ```python
319 | import os, sys
320 | sys.path.append(os.getcwd())
321 | ```
322 |
323 | - generate char list
324 |
325 | using ASCII
326 |
327 | ```python
328 | lower_char = [chr(i) for i in range(97,123)] # a-z
329 | upper_char = [chr(i) for i in range(65,91)] # A-Z
330 | ```
331 |
332 | - Can't get cookie in `document.cookie`
333 |
334 | The service sets `HttpOnly` in `Set-Cookie`
335 |
336 | - [Why doesn't document.cookie show all the cookie for the site?](https://stackoverflow.com/questions/1022112/why-doesnt-document-cookie-show-all-the-cookie-for-the-site)
337 | - [Secure and HttpOnly](https://en.wikipedia.org/wiki/HTTP_cookie#Secure_and_HttpOnly)
338 |
339 | > The Secure attribute is meant to keep cookie communication limited to encrypted transmission, directing browsers to use cookies only via secure/encrypted connections. However, if a web server sets a cookie with a secure attribute from a non-secure connection, the cookie can still be intercepted when it is sent to the user by **man-in-the-middle attacks**. Therefore, for maximum security, cookies with the Secure attribute should only be set over a secure connection.
340 | >
341 | > The HttpOnly attribute directs browsers not to expose cookies through channels other than HTTP (and HTTPS) requests. This means that the cookie cannot be accessed via client-side scripting languages (notably JavaScript), and therefore cannot be stolen easily via cross-site scripting (a pervasive attack technique).
342 |
343 | - ctrip cookie analysis
344 |
345 | | key | method | how | constant | login | finish |
346 | | ----------------------------- | ------ | --------------------------------------------------------------------------------------------------- | -------- | ----- | ------ |
347 | | `magicid` | set | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 1 |
348 | | `ASP.NET_SessionId` | set | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 1 |
349 | | `clientid` | set | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 1 |
350 | | `_abtest_userid` | set | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 1 |
351 | | `hoteluuid` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 |
352 | | `fcerror` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 |
353 | | `_zQdjfing` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 |
354 | | `OID_ForOnlineHotel` | js | `https://webresource.c-ctrip.com/ResHotelOnline/R8/search/js.merge/showhotelinformation.js` | 1 | 0 |
355 | | `_RSG` | req | `https://cdid.c-ctrip.com/chloro-device/v2/d` | 1 | 0 |
356 | | `_RDG` | req | `https://cdid.c-ctrip.com/chloro-device/v2/d` | 1 | 0 |
357 | | `_RGUID` | set | `https://cdid.c-ctrip.com/chloro-device/v2/d` | 1 | 0 |
358 | | `_ga` | js | for google analysis | 1 | 0 |
359 | | `_gid` | js | for google analysis | 1 | 0 |
360 | | `MKT_Pagesource` | js | `https://webresource.c-ctrip.com/ResUnionOnline/R3/float/floating_normal.min.js` | 1 | 0 |
361 | | `_HGUID` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 |
362 | | `HotelDomesticVisitedHotels1` | set | `https://hotels.ctrip.com/Domestic/tool/AjaxGetHotelAddtionalInfo.ashx` | 1 | 0 |
363 | | `_RF1` | req | `https://cdid.c-ctrip.com/chloro-device/v2/d` | 1 | 0 |
364 | | `appFloatCnt` | js | `https://webresource.c-ctrip.com/ResUnionOnline/R3/float/floating_normal.min.js?20190428` | 1 | 0 |
365 | | `gad_city` | set | `https://crm.ws.ctrip.com/Customer-Market-Proxy/AdCallProxyV2.aspx` | 1 | 0 |
366 | | `login_uid` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 |
367 | | `login_type` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 |
368 | | `cticket` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 |
369 | | `AHeadUserInfo` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 |
370 | | `ticket_ctrip` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 |
371 | | `DUID` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 |
372 | | `IsNonUser` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 |
373 | | `UUID` | req | `https://passport.ctrip.com/gateway/api/soa2/12770/setGuestData` | 1 | 1 |
374 | | `IsPersonalizedLogin` | js | `https://webresource.c-ctrip.com/ares2/basebiz/cusersdk/~0.0.8/default/login/1.0.0/loginsdk.min.js` | 1 | 1 |
375 | | `_bfi` | js | `https://webresource.c-ctrip.com/code/ubt/_bfa.min.js?v=20193_28.js` | 1 | 0 |
376 | | `_jzqco` | js | `https://webresource.c-ctrip.com/ResUnionOnline/R1/remarketing/js/mba_ctrip.js` | 1 | 0 |
377 | | `__zpspc` | js | `https://webresource.c-ctrip.com/ResUnionOnline/R1/remarketing/js/s.js` | 1 | 0 |
378 | | `_bfa` | js | `https://webresource.c-ctrip.com/code/ubt/_bfa.min.js?v=20193_28.js` | 1 | 0 |
379 | | `_bfs` | js | `https://webresource.c-ctrip.com/code/ubt/_bfa.min.js?v=20193_28.js` | 1 | 0 |
380 | | `utc` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 0 | 0 | 1 |
381 | | `htltmp` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 0 | 0 | 1 |
382 | | `htlstm` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 0 | 0 | 1 |
383 | | `arp_scroll_position` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 0 | 0 | 1 |
384 |
385 | - some obfuscated code in ctrip
386 |
387 | ```js
388 | function a31(a233, a23, a94) {
389 | var a120 = {
390 | KWcVI: "mMa",
391 | hqRkQ: function a272(a309, a20) {
392 | return a309 + a20;
393 | },
394 | WILPP: function a69(a242, a488) {
395 | return a242(a488);
396 | },
397 | ydraP: function a293(a338, a255) {
398 | return a338 == a255;
399 | },
400 | ceIER: ";expires=",
401 | mDTlQ: function a221(a234, a225) {
402 | return a234 + a225;
403 | },
404 | dnvrD: function a268(a61, a351) {
405 | return a61 + a351;
406 | },
407 | DIGJw: function a368(a62, a223) {
408 | return a62 == a223;
409 | },
410 | pIWEz: function a260(a256, a284) {
411 | return a256 + a284;
412 | },
413 | jXvnT: ";path=/",
414 | };
415 | if (a120["KWcVI"] !== a120["KWcVI"]) {
416 | var a67 = new Date();
417 | a67[a845("0x1a", "4Vqw")](
418 | a120[a845("0x1b", "RswF")](a67["getDate"](), a94)
419 | );
420 | document[a845("0x1c", "WjvM")] =
421 | a120[a845("0x1d", "3082")](a233, "=") +
422 | a120[a845("0x1e", "TDHu")](escape, a23) +
423 | (a120["ydraP"](a94, null)
424 | ? ""
425 | : a120["hqRkQ"](a120["ceIER"], a67[a845("0x1f", "IErH")]())) +
426 | a845("0x20", "eHIq");
427 | } else {
428 | var a148 = a921(this, function() {
429 | var a291 = function() {
430 | return "dev";
431 | },
432 | a366 = function() {
433 | return "window";
434 | };
435 | var a198 = function() {
436 | var a168 = new RegExp("\\w+ *\\(\\) *{\\w+ *[' | '].+[' | '];? *}");
437 | return !a168["test"](a291["toString"]());
438 | };
439 | var a354 = function() {
440 | var a29 = new RegExp("(\\[x|u](\\w){2,4})+");
441 | return a29["test"](a366["toString"]());
442 | };
443 | var a243 = function(a2) {
444 | var a315 = ~-0x1 >> (0x1 + (0xff % 0x0));
445 | if (a2["indexOf"]("i" === a315)) {
446 | a310(a2);
447 | }
448 | };
449 | var a310 = function(a213) {
450 | var a200 = ~-0x4 >> (0x1 + (0xff % 0x0));
451 | if (a213["indexOf"]((!![] + "")[0x3]) !== a200) {
452 | a243(a213);
453 | }
454 | };
455 | if (!a198()) {
456 | if (!a354()) {
457 | a243("indеxOf");
458 | } else {
459 | a243("indexOf");
460 | }
461 | } else {
462 | a243("indеxOf");
463 | }
464 | });
465 | // a148();
466 | var a169 = new Date();
467 | a169["setDate"](a169["getDate"]() + a94);
468 | document["cookie"] = a120["mDTlQ"](
469 | a120["dnvrD"](
470 | a120["dnvrD"](a120["dnvrD"](a233, "="), escape(a23)),
471 | a120["DIGJw"](a94, null)
472 | ? ""
473 | : a120["pIWEz"](a120["ceIER"], a169["toGMTString"]())
474 | ),
475 | a120["jXvnT"]
476 | );
477 | }
478 | }
479 | ```
480 |
481 | equal to
482 |
483 | ```js
484 | document["cookie"] =
485 | a233 +
486 | "=" +
487 | escape(a23) +
488 | (a94 == null ? "" : ";expires=" + a169["toGMTString"]()) +
489 | ";path=/";
490 | ```
491 |
492 | So it is only a function that sets a cookie and its expiration time.
493 |
494 | And you can treat `a31` as an entry point for locating the code that generates the cookie.
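
A Python rendering of the deobfuscated `a31` (for reading the JS only, not code from the repo; `urllib.parse.quote` only approximates JS `escape`):

```python
from urllib.parse import quote
from datetime import datetime, timedelta

def a31(name: str, value: str, days: int = None) -> str:
    """Build the string the page would assign to document.cookie."""
    expires = ""
    if days is not None:
        gmt = (datetime.utcnow() + timedelta(days=days)).strftime("%a, %d %b %Y %H:%M:%S GMT")
        expires = ";expires=" + gmt
    return "{}={}{};path=/".format(name, quote(value), expires)

# a31("_zQdjfing", "some-value", 1) -> '_zQdjfing=some-value;expires=...;path=/'
```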
495 |
496 | - Get current timezone offset
497 |
498 | ```python
499 | import datetime, tzlocal
500 | local_tz = tzlocal.get_localzone()
501 | timezone_offset = -int(local_tz.utcoffset(datetime.datetime.today()).total_seconds() / 60)
502 | ```
503 |
504 | - JSON.stringify(e)
505 |
506 | ```python
507 | import json
508 | json.dumps(e, separators=(',', ':'))
509 | ```
510 |
511 | - [JSON.stringify (Javascript) and json.dumps (Python) not equivalent on a list?](https://stackoverflow.com/questions/46227854/json-stringify-javascript-and-json-dumps-python-not-equivalent-on-a-list)
512 |
513 | - Element.getBoundingClientRect()
514 |
515 | return Element position
516 |
517 | - [Element.getBoundingClientRect()](https://developer.mozilla.org/en-US/docs/Web/API/Element/getBoundingClientRect)
518 | - [EventTarget.addEventListener()](https://developer.mozilla.org/en-US/docs/Web/API/EventTarget/addEventListener)
519 |
520 | ## DouBan
521 |
522 | - RuntimeError: dictionary changed size during iteration (when using pickle)
523 |
524 | - This can happen when the object you are pickling is modified while it is being pickled.
525 | - So make a copy of your params before pickling:
526 |
527 | ```python
528 | comment_loader = comment.copy()
529 | dump_bigger(comment_loader, '{}data.pkl'.format(data_dir))
530 | ```
531 |
532 | [How to avoid “RuntimeError: dictionary changed size during iteration” error?](https://stackoverflow.com/questions/11941817/how-to-avoid-runtimeerror-dictionary-changed-size-during-iteration-error)
533 | [pickling SimpleLazyObject fails just after accessing related object of wrapped model instance.](https://code.djangoproject.com/ticket/25426)
534 |
535 | - RecursionError: maximum recursion depth exceeded while pickling an object
536 |
537 | - the object's nesting depth exceeds the maximum recursion depth, so raise the limit:
538 |
539 | ```python
540 | import sys
541 | sys.setrecursionlimit(10000)
542 | ```
543 |
544 | ## 66ip
545 |
546 | > Q: @liu wong Why does a piece of JS code give a different result when executed in the browser than when executed in Python with execjs? http://www.66ip.cn/
547 |
548 | > A: eval differences usually come from the compilation environment, the DOM, the different string rules of Python vs. JS, the context, and so on.
549 | > A site like 66ip mainly plays on the different string rules of Python vs. JS plus the DOM, although it may also be unintentional (after all, crawler engineers do not only use Python).
550 | > On the first visit, 66ip returns a 521 response with an HTTP-only cookie stuffed into the header and a script stuffed into the body.
551 |
552 | ```js
553 | var x = "@...".replace(/@*$/, "").split("@"),
554 | y = "...",
555 | f = function(x, y) {
556 | return num;
557 | },
558 | z = f(
559 | y
560 | .match(/\w/g)
561 | .sort(function(x, y) {
562 | return f(x) - f(y);
563 | })
564 | .pop()
565 | );
566 | while (z++)
567 | try {
568 | eval(
569 | y.replace(/\b\w+\b/g, function(y) {
570 | return x[f(y, z) - 1] || "_" + y;
571 | })
572 | );
573 | break;
574 | } catch (_) {}
575 | ```
576 |
577 | > You can see that what gets eval'ed is the string y after a character substitution driven by the array x, so in principle the result should not depend on the compilation environment. Yet after changing eval to aa, the result compiled in Python differs from the one in node or Chrome.
578 | > The reason is that in Python the \b in the regex is escaped to \x08 (backspace), so the regex no longer matches, the substitution never happens, and the eval_script we get is actually garbled.
579 | > Use r'{}'.format(eval_script) here to keep the special symbols from being escaped.
580 | > What remains is to run the DOM substitution on the obtained eval_script.
581 | > Overall it is a pretty good entry-level JS reverse-engineering exercise: the amount of code is small and the logic is clear.
582 | > See [iofu728/spider](https://github.com/iofu728/spider/blob/master/proxy/ip66.py) for the code.
583 |
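A tiny illustration of the escaping pitfall described above (the `eval_script` value is a made-up stand-in, not the site's real script):

```python
# "\b" in an ordinary Python literal is the backspace character (\x08),
# so a regex like \b\w+\b taken from the 521 response must be kept un-escaped.
print(repr("\b"))   # '\x08'  -> the word-boundary marker is silently destroyed
print(repr(r"\b"))  # '\\b'   -> a raw string keeps the backslash and the 'b'

eval_script = r"while (z++) y.replace(/\b\w+\b/g, fn);"  # keep the downloaded script raw
```
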
584 | 
585 |
586 | ## OceanBall V2
587 |
588 | check param list:
589 |
590 | | param | Ctrip | Incognito | Node | !!import |
591 | | ------------ | ----- | --------- | ---- | -------- |
592 | | define | ✔ | x | x |
593 | | \_\_filename | x | x | x |
594 | | module | x | x | ✔ | x |
595 | | process | ✔ | x | ✔ |
596 | | \_\_dirname | ✔ | x | x |
597 | | global | x | x | ✔ | x |
598 | | INT_MAX | ✔ | x | x |
599 | | require | ✔ | x | ✔ | ✔ |
600 | | History | ✔ | x |
601 | | Location | ✔ | x |
602 | | Window | ✔ | x |
603 | | Document | ✔ | x |
604 | | window | ✔ | x |
605 | | navigator | ✔ | x |
606 | | history | ✔ | x |
607 |
608 | **----To be continued----**
609 |
--------------------------------------------------------------------------------
/bilibili/analysis.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-04-04 10:57:24
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2020-03-24 01:37:39
6 |
7 | import pandas as pd
8 | import numpy as np
9 | import time
10 | import os
11 | from util.util import time_stamp, echo, read_file
12 |
13 | root_dir = os.path.abspath('bilibili')
14 | data_dir = os.path.join(root_dir, 'data/')
15 | history_data_dir = os.path.join(data_dir, 'history_data/')
16 | history_dir = os.path.join(data_dir, 'history/')
17 |
18 |
19 | def analysis_csv():
20 | data_dir = 'bilibili/'
21 | df = pd.read_csv('%spublic.csv' % data_dir)
22 |
23 | '''one day'''
24 | df['fan'] = df['3'].fillna(0)
25 | df['time'] = df['1'].map(lambda x: x.split(None, 1)[1])
26 | df['fanadd'] = df['4'] - df['3']
27 | df['fanadd'] = df['fanadd'].map(lambda x: x if x > 0 else 0)
28 | df['fanadd_ratio'] = df['fanadd'] / df['3']
29 | df['fanadd_ratio'] = df['fanadd_ratio'].replace(
30 | [np.inf, -np.inf], np.nan).fillna(0)
31 | df['viewadd'] = (df['18'] - df['6']).fillna(0)
32 | df['viewadd'] = df['viewadd'].map(lambda x: x if x > 0 else 0)
33 | df['viewadd_ratio'] = (df['viewadd'] / df['6']).replace(
34 | [np.inf, -np.inf], np.nan).fillna(0)
35 | df['view_fan'] = (df['viewadd'] / df['3']).replace(
36 | [np.inf, -np.inf], np.nan).fillna(0)
37 | df['view_fan_20'] = df['view_fan'].map(lambda x: x if x < 20 else 0)
38 | df['view_fanadd'] = (df['viewadd'] / df['fanadd']).replace(
39 | [np.inf, -np.inf], np.nan).fillna(0)
40 |
41 | '''seven day'''
42 | df['seven'] = df['1'].map(lambda x: '1970-01-%d %s' % (int(time.strftime(
43 | "%w", time.strptime(x, "%Y-%m-%d %H:%M:%S"))) + 4, x.split(None, 1)[1]))
44 | need_columns = ['time', 'fan', 'fanadd', 'fanadd_ratio',
45 | 'viewadd', 'viewadd_ratio', 'view_fan', 'view_fan_20', 'view_fanadd', 'seven']
46 | result_df = pd.DataFrame(df, columns=need_columns)
47 | result_df.to_csv('%spublic_re.csv' % data_dir, index=False)
48 |
49 |
50 | def clean_csv(av_id: int):
51 | ''' clean csv '''
52 | csv_path = os.path.join(history_dir, '{}.csv'.format(av_id))
53 | output_path = os.path.join(history_data_dir, '{}_new.csv'.format(av_id))
54 | print(csv_path)
55 | csv = read_file(csv_path)
56 | last_time, last_view = csv[0].split(',')[:2]
57 | result = [csv[0]]
58 | last_time = time_stamp(last_time)
59 | last_view = int(last_view)
60 | empty_line = ','.join([' '] * (len(csv[0].split(',')) + 1))
61 | for line in csv[1:]:
62 | now_time, now_view = line.split(',')[:2]
63 | now_time = time_stamp(now_time)
64 | now_view = int(now_view)
65 | time_gap = now_time - last_time
66 |
67 | if now_view < last_view or now_view - last_view > 5000:
68 | continue
69 | if abs(time_gap) > 150:
70 | for ii in range(int((time_gap - 30) // 120)):
71 | result.append(empty_line)
72 | if abs(time_gap) > 90:
73 | result.append(line)
74 | last_view, last_time = now_view, now_time
75 | with open(output_path, 'w') as f:
76 | f.write('\n'.join(result))
77 |
--------------------------------------------------------------------------------
/bilibili/assign_up.ini.tmp:
--------------------------------------------------------------------------------
1 | [basic]
2 | bv_id = BV1GW411g7mc
3 | av_id = 21061574
4 | basic_av_p = -1
5 | author = 还有一天就放假了
6 | mid = 7792521
7 | rank_id = 119
8 | tid = 126
9 | view_abnormal = 1000
10 | history_check_list = 1,3,6
11 | ;split by ','
12 | special_info_email = 123456@163.com
13 | assign_email = 123456@163.com
14 |
15 | [assign]
16 | av_ids = 21061574,11624347
17 | bv_ids = BV1GW411g7mc
18 | ;split by ','
19 |
20 | [comment]
21 | keyword = 死全家|草泥马|.{0,4}\$\$_.{0,4}
22 | ;support re, use '|' split
23 | ignore_list = ^[2-3].*
24 | ignore_rpid = {"21061574":["242-2"],"21062574":["1242"],"21061577":["1284"]}
25 | ignore_start = 0.5
26 | ignore_end = 8.5
27 | email_limit = 5
28 |
29 | [login]
30 | username = 123
31 | password = 123
--------------------------------------------------------------------------------
/bilibili/basicBilibili.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-09-14 14:49:01
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2020-06-06 12:48:53
6 |
7 | import json
8 | import os
9 | import shutil
10 | import sys
11 | import urllib
12 | from configparser import ConfigParser
13 |
14 | sys.path.append(os.getcwd())
15 | from proxy.getproxy import GetFreeProxy
16 | from util.util import can_retry, get_accept, basic_req
17 |
18 |
19 | one_day = 86400
20 | root_dir = os.path.abspath("bilibili")
21 | data_dir = os.path.join(root_dir, "data/")
22 | assign_path = os.path.join(root_dir, "assign_up.ini")
23 | if not os.path.exists(assign_path):
24 | shutil.copy(assign_path + ".tmp", assign_path)
25 |
26 |
27 | class BasicBilibili(object):
28 | BILIBILI_URL = "https://www.bilibili.com"
29 | BASIC_AV_URL = "http://www.bilibili.com/video/av%d"
30 | BASIC_BV_URL = "http://www.bilibili.com/video/%s"
31 | ARCHIVE_STAT_URL = "http://api.bilibili.com/x/web-interface/archive/stat?aid=%d"
32 | VIEW_URL = "http://api.bilibili.com/x/web-interface/view?bvid=%s"
33 | RELATION_STAT_URL = (
34 | "http://api.bilibili.com/x/relation/stat?jsonp=jsonp&callback=__jp0&vmid=%d"
35 | )
36 | BASIC_RANKING_URL = "https://www.bilibili.com/ranking/all/%d/"
37 | SPACE_AVS_URL = (
38 | "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=1&ps=50&jsonp=jsonp"
39 | )
40 | REPLY_V2_URL = (
41 | "http://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=%d&type=1&oid=%d&sort=%d"
42 | )
43 | RANKING_URL = "https://api.bilibili.com/x/web-interface/ranking?rid=%d&day=%d&type=1&arc_type=%d&jsonp=jsonp&callback=__jp1"
44 | PLAYLIST_URL = "https://api.bilibili.com/x/player/pagelist?aid=%d&jsonp=jsonp"
45 | DM_URL = "https://api.bilibili.com/x/v1/dm/list.so?oid=%d"
46 | GET_KEY_URL = "http://passport.bilibili.com/login?act=getkey&r=%f"
47 | LOGIN_URL = "https://passport.bilibili.com/login"
48 | LOGIN_V2_URL = "https://passport.bilibili.com/web/login/v2"
49 | LOGIN_OAUTH_URL = "https://passport.bilibili.com/api/v2/oauth2/login"
50 | CAPTCHA_URL = "https://passport.bilibili.com/web/captcha/combine?plat=11"
51 | GET_KEY_URL = "https://passport.bilibili.com/api/oauth2/getKey"
52 | GETTYPE_URL = "https://api.geetest.com/gettype.php?gt=%s&callback=geetest_%d"
53 | M_BILIBILI_URL = "https://m.bilibili.com/video/%s"
54 | NO_RANK_CONSTANT = "No rank.....No Rank......No Rank....."
55 | JSON_KEYS = ["code", "message", "ttl", "data"]
56 | T_FORMAT = "%m-%d %H:%M"
57 |
58 | def __init__(self):
59 | super(BasicBilibili, self).__init__()
60 | self.proxy_req = GetFreeProxy().proxy_req
61 | self.del_map = {}
62 | self.rank_map = {}
63 | self.load_configure()
64 |
65 | def load_configure(self):
66 | """ load assign configure """
67 | cfg = ConfigParser()
68 | cfg.read(assign_path, "utf-8")
69 | self.assign_author = cfg.get("basic", "author")
70 | mid = cfg["basic"]["mid"]
71 | self.assign_mid = int(mid) if len(mid) else -1
72 | self.assign_rank_id = cfg.getint("basic", "rank_id")
73 | self.assign_tid = cfg.getint("basic", "tid")
74 | self.basic_bv_id = cfg.get("basic", "bv_id")
75 | self.view_abnormal = cfg.getint("basic", "view_abnormal")
76 | self.assign_ids = cfg.get("assign", "bv_ids").split(",")
77 | rank_map = {ii: {} for ii in self.assign_ids if ii not in self.del_map}
78 | self.rank_map = {**rank_map, **self.rank_map}
79 | self.keyword = cfg.get("comment", "keyword")
80 | self.ignore_rpid = json.loads(cfg.get("comment", "ignore_rpid"))
81 | self.ignore_list = cfg.get("comment", "ignore_list")
82 | self.ignore_start = cfg.getfloat("comment", "ignore_start")
83 | self.ignore_end = cfg.getfloat("comment", "ignore_end")
84 | self.email_limit = cfg.getint("comment", "email_limit")
85 | self.AV_URL = self.BASIC_BV_URL % self.basic_bv_id
86 | self.history_check_list = [
87 | int(ii) for ii in cfg.get("basic", "history_check_list").split(",")
88 | ]
89 | self.special_info_email = cfg.get("basic", "special_info_email").split(",")
90 | self.assign_rec = cfg.get("basic", "assign_email").split(",")
91 | self.username = urllib.parse.quote_plus(cfg.get("login", "username"))
92 | self.password = cfg.get("login", "password")
93 |
94 | def get_api_req(self, url: str, bv_id: str, types: int = 0, is_proxy: bool = True):
95 | r_req = self.proxy_req if is_proxy else basic_req
96 | if types == 0:
97 | req = r_req(url, 1, header=self.get_api_headers(bv_id))
98 | else:
99 | req = r_req(url, 3, header=self.get_api_headers(bv_id))
100 | req = self.decoder_jp(req)
101 | if req is None or list(req.keys()) != self.JSON_KEYS:
102 | if can_retry(url):
103 | return self.get_api_req(url, bv_id, types)
104 | else:
105 | return
106 | return req["data"]
107 |
108 | def get_api_headers(self, bv_id: str, types: int = 0) -> dict:
109 | if isinstance(bv_id, int):
110 | bv_id = "av{}".format(bv_id)
111 | if types == 0:
112 | return {"Accept": "*/*", "Referer": self.BASIC_BV_URL % bv_id}
113 | if types == 1:
114 | return {"Accept": get_accept("html"), "Host": self.BILIBILI_URL}
115 |
116 | def update_ini(self, bv_id: str, av_id: int):
117 | cfg = ConfigParser()
118 | cfg.read(assign_path, "utf-8")
119 | cfg.set("basic", "bv_id", bv_id)
120 | cfg.set("basic", "av_id", str(av_id))
121 | bv_ids = cfg.get("assign", "bv_ids")
122 | cfg.set("assign", "bv_ids", "{},{}".format(bv_ids, bv_id))
123 | cfg.write(open(assign_path, "w"))
124 |
125 | def decoder_jp(self, text: str) -> dict:
126 | star_begin = text.find("{")
127 | if star_begin == -1:
128 | return {}
129 | star_json = text[star_begin:-1]
130 | try:
131 | return json.loads(star_json)
132 | except:
133 | return {}
134 |
135 | def update_proxy_basic(self):
136 | self.proxy_req = GetFreeProxy().proxy_req
137 |
--------------------------------------------------------------------------------
/bilibili/bsocket.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-03-26 10:21:05
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2020-06-06 11:32:49
6 |
7 |
8 | import asyncio
9 | import codecs
10 | import json
11 | import logging
12 | import os
13 | import shutil
14 | import struct
15 | import sys
16 | import time
17 | from collections import namedtuple
18 | from configparser import ConfigParser
19 | from enum import IntEnum
20 | from ssl import _create_unverified_context
21 |
22 | import aiohttp
23 | import regex
24 |
25 | sys.path.append(os.getcwd())
26 | from proxy.getproxy import GetFreeProxy
27 | from util.util import basic_req, can_retry, echo, mkdir, time_stamp, time_str
28 |
29 | logger = logging.getLogger(__name__)
30 | proxy_req = GetFreeProxy().proxy_req
31 | data_dir = "bilibili/data/"
32 | websocket_dir = "%swebsocket/" % data_dir
33 | assign_path = "bilibili/assign_up.ini"
34 | one_day = 86400
35 |
36 | """
37 | * bilibili @websocket
38 | * www.bilibili.com/video/av{av_id}
39 | * wss://broadcast.chat.bilibili.com:7823/sub
40 | """
41 |
42 |
43 | class Operation(IntEnum):
44 | SEND_HEARTBEAT = 2
45 | ONLINE = 3
46 | COMMAND = 5
47 | AUTH = 7
48 | RECV = 8
49 | NESTED = 9
50 | DANMAKU = 1000
51 |
52 |
53 | class BWebsocketClient:
54 | """ bilibili websocket client """
55 |
56 | ROOM_INIT_URL = "https://www.bilibili.com/video/bv%s"
57 | WEBSOCKET_URL = "wss://broadcast.chat.bilibili.com:7823/sub"
58 | PLAYLIST_URL = "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp"
59 | HEARTBEAT_BODY = "[object Object]"
60 | JSON_KEYS = ["code", "message", "ttl", "data"]
61 |
62 | HEADER_STRUCT = struct.Struct(">I2H2IH")
63 | HeaderTuple = namedtuple(
64 | "HeaderTuple",
65 | ("total_len", "header_len", "proto_ver", "operation", "time", "zero"),
66 | )
67 | _COMMAND_HANDLERS = {
68 | "DM": lambda client, command: client._on_get_danmaku(
69 | command["info"][1], command["info"][0]
70 | )
71 | }
72 |
73 | def __init__(self, av_id: int, bv_id: str, types: int = 0, p: int = -1):
74 | """ init class """
75 | self._av_id = av_id
76 | self._bv_id = bv_id
77 | self._room_id = None
78 | self._count = 1
79 | self._types = types
80 | self._begin_time = int(time_stamp())
81 | self._loop = asyncio.get_event_loop()
82 | self._session = aiohttp.ClientSession(loop=self._loop)
83 | self._is_running = False
84 | self._websocket = None
85 | self._p = p if p > 0 else 1
86 | self._getroom_id()
87 |
88 | async def close(self):
89 | await self._session.close()
90 |
91 | def run(self):
92 | """ Create Thread """
93 | if self._is_running:
94 | raise RuntimeError("This client is already running")
95 | self._is_running = True
96 | return asyncio.ensure_future(self._message_loop(), loop=self._loop)
97 |
98 | def get_cid(self, bv_id: str):
99 | playlist_url = self.PLAYLIST_URL % bv_id
100 | headers = {"Accept": "*/*", "Referer": self.ROOM_INIT_URL % bv_id}
101 | req = basic_req(playlist_url, 1, header=headers)
102 | if req is None or list(req.keys()) != self.JSON_KEYS:
103 | return
104 | cid = [ii["cid"] for ii in req["data"]]
105 | return cid
106 |
107 | def _getroom_id(self, proxy: bool = True):
108 | """ get av room id """
109 | cid = self.get_cid(self._bv_id)
110 | assert (
111 | cid and len(cid) >= self._p
112 | ), "Actual Page len: {} <=> Need Pages Num: {}".format(len(cid), self._p)
113 | self._room_id = int(cid[self._p - 1])
114 | echo(3, "Room_id:", self._room_id)
115 |
116 | def parse_struct(self, data: dict, operation: int):
117 | """ parse struct """
118 | assert (
119 | int(time_stamp()) < self._begin_time + 7 * one_day
120 | ), "Excess Max RunTime!!!"
121 |
122 | if operation == 7:
123 | body = json.dumps(data).replace(" ", "").encode("utf-8")
124 | else:
125 | body = self.HEARTBEAT_BODY.encode("utf-8")
126 | header = self.HEADER_STRUCT.pack(
127 | self.HEADER_STRUCT.size + len(body),
128 | self.HEADER_STRUCT.size,
129 | 1,
130 | operation,
131 | self._count,
132 | 0,
133 | )
134 | self._count += 1
135 | return header + body
136 |
137 | async def _send_auth(self):
138 | """ send auth """
139 | auth_params = {
140 | "room_id": "video://%d/%d" % (self._av_id, self._room_id),
141 | "platform": "web",
142 | "accepts": [1000],
143 | }
144 | await self._websocket.send_bytes(self.parse_struct(auth_params, Operation.AUTH))
145 |
146 | async def _message_loop(self):
147 | """ loop sent message """
148 |
149 | if self._room_id is None:
150 | self._getroom_id()
151 |
152 | while True:
153 | heartbeat_con = None
154 | try:
155 | async with self._session.ws_connect(self.WEBSOCKET_URL) as websocket:
156 | self._websocket = websocket
157 | await self._send_auth()
158 | heartbeat_con = asyncio.ensure_future(
159 | self._heartbeat_loop(), loop=self._loop
160 | )
161 |
162 | async for message in websocket:
163 | if message.type == aiohttp.WSMsgType.BINARY:
164 | await self._handle_message(message.data, 0)
165 | else:
166 | logger.warning(
167 | "Unknown Message type = %s %s",
168 | message.type,
169 | message.data,
170 | )
171 |
172 | except asyncio.CancelledError:
173 | break
174 | except aiohttp.ClientConnectorError:
175 | logger.warning("Retrying */*/*/*/---")
176 | try:
177 | await asyncio.sleep(5)
178 | except asyncio.CancelledError:
179 | break
180 | finally:
181 | if heartbeat_con is not None:
182 | heartbeat_con.cancel()
183 | try:
184 | await heartbeat_con
185 | except asyncio.CancelledError:
186 | break
187 | self._websocket = None
188 |
189 | self._is_running = False
190 |
191 | async def _heartbeat_loop(self):
192 | """ heart beat every 30s """
193 | if self._types and int(time_stamp()) > self._begin_time + one_day:
194 | await self.close()
195 | for _ in range(int(one_day * 7 / 30)):
196 | try:
197 | await self._websocket.send_bytes(
198 | self.parse_struct({}, Operation.SEND_HEARTBEAT)
199 | )
200 | await asyncio.sleep(30)
201 | except (asyncio.CancelledError, aiohttp.ClientConnectorError):
202 | break
203 |
204 | async def _handle_message(self, message: str, offset: int = 0):
205 | """ handle message"""
206 | while offset < len(message):
207 | try:
208 | header = self.HeaderTuple(
209 | *self.HEADER_STRUCT.unpack_from(message, offset)
210 | )
211 | body = message[
212 | offset + self.HEADER_STRUCT.size : offset + header.total_len
213 | ]
214 | if (
215 | header.operation == Operation.ONLINE
216 | or header.operation == Operation.COMMAND
217 | ):
218 | body = json.loads(body.decode("utf-8"))
219 | if header.operation == Operation.ONLINE:
220 | await self._on_get_online(body)
221 | else:
222 | await self._handle_command(body)
223 | elif header.operation == Operation.RECV:
224 | print("Connect Build!!!")
225 | elif header.operation == Operation.NESTED:
226 | offset += self.HEADER_STRUCT.size
227 | continue
228 | elif header.operation == Operation.DANMAKU:
229 | body = json.loads(body.decode("utf-8"))
230 | print(body)
231 | print(">>>>DANMAKU tail socket>>>>")
232 | else:
233 | logger.warning(
234 | "Unknown operation = %d %s %s", header.operation, header, body
235 | )
236 | offset += header.total_len
237 | except:
238 | pass
239 |
240 | async def _handle_command(self, command):
241 | if isinstance(command, list):
242 | for one_command in command:
243 | await self._handle_command(one_command)
244 | return
245 |
246 | cmd = command["cmd"]
247 | if cmd in self._COMMAND_HANDLERS:
248 | handler = self._COMMAND_HANDLERS[cmd]
249 | if handler is not None:
250 | await handler(self, command)
251 | else:
252 | logger.warning("Unknown Command = %s %s", cmd, command)
253 |
254 | async def _on_get_online(self, online):
255 | """ get online num """
256 | pass
257 |
258 | async def _on_get_danmaku(self, content, user_name):
259 | """ get danmaku """
260 | pass
261 |
262 |
263 | class OneBWebsocketClient(BWebsocketClient):
264 | """ get one bilibili websocket client """
265 |
266 | async def _on_get_online(self, online):
267 | online = online["data"]["room"]["online"]
268 | with codecs.open(self.get_path("online"), "a", encoding="utf-8") as f:
269 | f.write(self.get_data([online]))
270 | print("Online:", online)
271 |
272 | async def _on_get_danmaku(self, content, user_name):
273 | with codecs.open(self.get_path("danmaku"), "a", encoding="utf-8") as f:
274 | f.write(self.get_data([content, user_name]))
275 | print(content, user_name)
276 |
277 | def get_data(self, origin_data: list) -> str:
278 | """ get data """
279 | return ",".join(str(ii) for ii in [time_str(), *origin_data]) + "\n"
280 |
281 | def get_path(self, types: str) -> str:
282 | """ get path """
283 | p_path = "_p%d" % self._p if self._p != -1 else ""
284 | return "%s%d_%s%s.csv" % (websocket_dir, self._av_id, types, p_path)
285 |
286 |
287 | async def async_main(av_id: int, bv_id: str, types: int, p: int):
288 | client = OneBWebsocketClient(av_id, bv_id, types, p=p)
289 | future = client.run()
290 | try:
291 | await future
292 | finally:
293 | await client.close()
294 |
295 |
296 | def BSocket(av_id: int, bv_id: str, types: int = 0, p: int = -1):
297 | """ build a loop websocket connect"""
298 | loop = asyncio.get_event_loop()
299 | try:
300 | loop.run_until_complete(async_main(av_id, bv_id, types, p))
301 | finally:
302 | loop.close()
303 |
304 |
305 | if __name__ == "__main__":
306 | mkdir(data_dir)
307 | mkdir(websocket_dir)
308 | if not os.path.exists(assign_path):
309 | shutil.copy(assign_path + ".tmp", assign_path)
310 |
311 | """ Test for San Diego demon """
312 | """ PS: the thread of BSocket have to be currentThread in its processing. """
313 | if len(sys.argv) == 4:
314 | bv_id = sys.argv[2]
315 | av_id = int(sys.argv[1])
316 | p = int(sys.argv[3])
317 | else:
318 | cfg = ConfigParser()
319 | cfg.read(assign_path, "utf-8")
320 | av_id = cfg.getint("basic", "av_id")
321 | bv_id = cfg.get("basic", "bv_id")
322 | p = cfg.getint("basic", "basic_av_p") if len(cfg["basic"]["basic_av_p"]) else -1
323 |
324 | BSocket(av_id, bv_id, p=p)
325 |
--------------------------------------------------------------------------------
/bilibili/geetestE.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-09-15 19:25:31
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2019-10-09 23:23:44
6 |
7 | import os
8 | import sys
9 | import time
10 |
11 | import numpy as np
12 |
13 | sys.path.append(os.getcwd())
14 | from util.util import echo, time_stamp
15 |
16 | g = '0123456789abcdefghijklmnopqrstuvwxyz'
17 | FV = 4503599627370496
18 | DV = 268435456
19 | DB = 28
20 | DM = DV - 1
21 | F1 = 24
22 | F2 = 4
23 |
24 |
25 | class S(object):
26 | def __init__(self):
27 | e = [(255 & int(65536 * np.random.random())) for _ in range(256)]
28 | S = [ii for ii in range(256)]
29 | n = 0
30 | for t in range(256):
31 | n = (n + S[t] + e[t % len(e)]) & 255
32 | S[t], S[n] = S[n], S[t]
33 | self.S = S
34 | self.i = 0
35 | self.j = 0
36 |
37 | def __call__(self):
38 | if not len(self.S):
39 | self.get_S()
40 | self.i = (self.i + 1) & 255
41 | self.j = (self.j + self.S[self.i]) & 255
42 | self.S[self.i], self.S[self.j] = self.S[self.j], self.S[self.i]
43 | return self.S[(self.S[self.i] + self.S[self.j]) & 255]
44 |
45 |
46 | class E(object):
47 | def __init__(self, e=None, t: int = 256):
48 | super(E, self).__init__()
49 | self.t = 0
50 | self.s = 0
51 | self.E = {}
52 | self.T = {}
53 | if e is not None:
54 | if type(e) == int and e == 1:
55 | self.one()
56 | else:
57 | self.prepare_E(e, t)
58 |
59 | def __call__(self, e: list, t: int):
60 | self.prepare_E(e, t)
61 |
62 | def prepare_E(self, e: list, t: int):
63 | n, r, o, i, a, s = int(np.log2(t)), 0, 0, len(e) - 1, False, 0
64 | p = {ii + 48: ii for ii in range(10)}
65 | p = {**p, **{ii + 55: ii for ii in range(10, 36)}}
66 | p = {**p, **{ii + 87: ii for ii in range(10, 36)}}
67 |
68 | while 0 <= i:
69 | if n == 8:
70 | c = 255 & e[i]
71 | else:
72 | idx = ord(e[i])
73 | c = p[idx] if idx in p else -1
74 | if c < 0:
75 | if '-' == e[i]:
76 | a = True
77 | i -= 1
78 | continue
79 | a = False
80 | if s == 0:
81 | self.E[self.t] = c
82 | self.t += 1
83 | elif s + n > DB:
84 | self.E[self.t - 1] = self.E[self.t
85 | - 1] | ((c & (1 << DB - 3) - 1) << s)
86 | self.E[self.t] = c >> DB - s
87 | self.t += 1
88 | else:
89 | self.E[self.t - 1] = self.E[self.t - 1] | (c << s)
90 | s += n
91 | if s >= DB:
92 | s -= DB
93 | i -= 1
94 | if n == 8 and (128 & e[0]):
95 | self.s = -1
96 | if s > 0:
97 | self.E[self.t - 1] = self.E[self.t
98 | - 1] | ((1 << DB - s) - 1 << s)
99 | self.clamp()
100 | if a:
101 | echo(0, 'a is True')
102 | E().subTo(self, self)
103 |
104 | def clamp(self):
105 | ee = self.s & DM
106 | while 0 < self.t and self.E[self.t - 1] == ee:
107 | self.t -= 1
108 |
109 | def get_T(self):
110 | self.T['m'] = self
111 | self.T['mp'] = self.invDigit()
112 | self.T['mpl'] = 32767 & self.T['mp']
113 | self.T['mph'] = self.T['mp'] >> 15
114 | self.T['um'] = (1 << DB - 15) - 1
115 | self.T['mt2'] = 2 * self.t
116 |
117 | def invDigit(self):
118 | if self.t < 1:
119 | return 0
120 | e = self.E[0]
121 | if (0 == (1 & e)):
122 | return 0
123 | t = 3 & e
124 | t = t * (2 - (15 & e) * t) & 15
125 | t = t * (2 - (255 & e) * t) & 255
126 | t = t * (2 - ((65535 & e) * t & 65535)) & 65535
127 | t = t * (2 - e * t % DV) % DV
128 | return DV - t if t > 0 else -t
129 |
130 | def modPowInt(self, e: int, t):
131 | if e >= 256 and not t.isEven():
132 | t.get_T()
133 | else:
134 | echo(0, 'e:', e, 'isEven:', self.isEven())
135 | return self.exp(e, t)
136 |
137 | def exp(self, e: int, t):
138 | n, r = E(), E()
139 | i = self.y(e) - 2
140 | o = t.convert(self)
141 | o.copyTo(n)
142 | while 0 <= i:
143 | t.sqrTo(n, r)
144 | if (e & 1 << i) > 0:
145 | t.mulTo(r, o, n)
146 | else:
147 | n, r = r, n
148 | i -= 1
149 | return t.revert(n)
150 |
151 | def convert(self, e):
152 | t = E()
153 | e.dlShiftTo(self.T['m'].t, t)
154 | t.divRemTo(self.T['m'], t)
155 | if e.s < 0 and t.compareTo(E()) > 0:
156 | self.T['m'].subTo(t, t)
157 | return t
158 |
159 | def revert(self, e):
160 | t = E()
161 | e.copyTo(t)
162 | self.reduceE(t)
163 | return t
164 |
165 | def divRemTo(self, e, n):
166 | if (e.t <= 0):
167 | return False
168 | if self.t < e.t:
169 | return False
170 | i, a, s = E(), self.s, e.s
171 | c = DB - self.y(e.E[e.t - 1])
172 | if c > 0:
173 | e.lShiftTo(c, i)
174 | self.lShiftTo(c, n)
175 | else:
176 | e.copyTo(i)
177 | self.copyTo(n)
178 | u = i.t
179 | uu = i.E[u - 1]
180 | if uu == 0:
181 | return False
182 | l = uu * (1 << F1) + (i.E[u - 2] >> F2 if u > 1 else 0)
183 | h = FV / l
184 | f = (1 << F1) / l
185 | d = 1 << F2
186 | p = n.t
187 | g = p - u
188 | v = E()
189 | i.dlShiftTo(g, v)
190 | if n.compareTo(v) >= 0:
191 | n.E[n.t] = 1
192 | n.t += 1
193 | n.subTo(v, n)
194 | E(1).dlShiftTo(u, v)
195 | v.subTo(i, i)
196 | while i.t < u:
197 | i.E[i.t] = 0
198 | i.t += 1
199 | g -= 1
200 | while g >= 0:
201 | p -= 1
202 | if n.E[p] == uu:
203 | m = DM
204 | else:
205 | m = int(n.E[p] * h + (n.E[p - 1] + d) * f)
206 |
207 | n.E[p] += i.am(0, m, n, g, 0, u)
208 | if n.E[p] < m:
209 | i.dlShiftTo(g, v)
210 | n.subTo(v, n)
211 | m -= 1
212 | while n.E[p] < m:
213 | n.subTo(v, n)
214 | m -= 1
215 | g -= 1
216 | n.t = u
217 | n.clamp()
218 | if c > 0:
219 | n.rShiftTo(c, n)
220 | if a < 0:
221 | E().subTo(n, n)
222 |
223 | def lShiftTo(self, e: int, t):
224 | r = e % DB
225 | o = DB - r
226 | i = (1 << o) - 1
227 | a = int(e / DB)
228 | s = self.s << r & DB
229 | for n in range(self.t - 1, -1, -1):
230 | t.E[n + a + 1] = self.E[n] >> o | s
231 | s = (self.E[n] & i) << r
232 | for n in range(a):
233 | t.E[n] = 0
234 | t.E[a] = s
235 | t.t = self.t + a + 1
236 | t.s = self.s
237 | t.clamp()
238 |
239 | def rShiftTo(self, e: int, t):
240 | t.s = self.s
241 | n = int(e / DB)
242 | if n >= self.t:
243 | t.t = 0
244 | else:
245 | r = e % DB
246 | o = DB - r
247 | i = (1 << r) - 1
248 | t.E[0] = self.E[n] >> r
249 | for a in range(n + 1, self.t):
250 | t.E[a - n - 1] = (t.E[a - n - 1]) | ((self.E[a] & i) << o)
251 | t.E[a - n] = self.E[a] >> r
252 | if r > 0:
253 | t.E[self.t - n - 1] = t.E[self.t - n - 1] | ((self.s & i) << o)
254 | t.t = self.t - n
255 | t.clamp()
256 |
257 | def copyTo(self, e):
258 | for t in range(self.t):
259 | e.E[t] = self.E[t]
260 | e.t = self.t
261 | e.s = self.s
262 |
263 | def dlShiftTo(self, e: int, t):
264 | for ii in range(self.t):
265 | t.E[ii + e] = self.E[ii]
266 | for ii in range(e):
267 | t.E[ii] = 0
268 | t.t = self.t + e
269 | t.s = self.s
270 |
271 | def y(self, e: int):
272 | def yy(e: int, n: int, k: int):
273 | t = e >> k
274 | if t:
275 | e = t
276 | n += k
277 | return e, n
278 | n = 1
279 | e, n = yy(e, n, 16)
280 | e, n = yy(e, n, 8)
281 | e, n = yy(e, n, 4)
282 | e, n = yy(e, n, 2)
283 | e, n = yy(e, n, 1)
284 | return n
285 |
286 | def compareTo(self, e):
287 | t = self.s - e.s
288 | if t != 0:
289 | return t
290 | t = self.t - e.t
291 | if t != 0:
292 | return t if self.s > 0 else -t
293 | for n in range(self.t - 1, -1, -1):
294 | if self.E[n] - e.E[n] != 0:
295 | return self.E[n] - e.E[n]
296 | return 0
297 |
298 | def subTo(self, e, t):
299 | n, r, o = 0, 0, np.min([e.t, self.t])
300 | while n < o:
301 | r += self.E[n] - e.E[n]
302 | t.E[n] = r & DM
303 | n += 1
304 | r = r >> DM
305 | if e.t < self.t:
306 | r -= e.s
307 | while n < self.t:
308 | r += self.E[n]
309 | t.E[n] = r & DM
310 | n += 1
311 | r = r >> DM
312 | r += self.s
313 | else:
314 | r += self.s
315 | while n < e.t:
316 | r -= e.E[n]
317 | t.E[n] = r & DM
318 | n += 1
319 | r = r >> DM
320 | r -= e.s
321 | t.s = -1 if r < 0 else 0
322 | if r < -1 or r > 0:
323 | t.E[n] = DV + r if r < -1 else r
324 | n += 1
325 | t.t = n
326 | t.clamp()
327 |
328 | def one(self):
329 | self.E[0] = 1
330 | self.t = 1
331 |
332 | def am(self, e: int, t: int, n, r: int, o: int, i: int):
333 | a = 16383 & t
334 | s = t >> 14
335 | i -= 1
336 | while 0 <= i:
337 | c = 16383 & self.E[e]
338 | u = self.E[e] >> 14
339 | e += 1
340 | uu = s * c + u * a
341 | c = a * c + ((16383 & uu) << 14) + n.E[r] + o
342 | o = (c >> 28) + (uu >> 14) + s * u
343 | n.E[r] = DM & c
344 | r += 1
345 | i -= 1
346 | return o
347 |
348 | def sqrTo(self, e, t):
349 | e.squareTo(t)
350 | self.reduceE(t)
351 |
352 | def squareTo(self, e):
353 | e.t = 2 * self.t
354 | for n in range(e.t):
355 | e.E[n] = 0
356 | for n in range(self.t - 1):
357 | r = self.am(n, self.E[n], e, 2 * n, 0, 1)
358 | e.E[n + self.t] += self.am(n + 1, 2 * self.E[n],
359 | e, 2 * n + 1, r, self.t - n - 1)
360 | if e.E[n + self.t] >= DV:
361 | e.E[n + self.t] -= DV
362 | e.E[n + self.t + 1] = 1
363 | if e.t > 0:
364 | e.E[e.t - 1] += self.am(n, self.E[n], e, 2 * n, 0, 1)
365 | e.s = 0
366 | e.clamp()
367 |
368 | def reduceE(self, e):
369 | while e.t <= self.T['mt2']:
370 | e.E[e.t] = 0
371 | e.t += 1
372 | for t in range(self.T['m'].t):
373 | n = 32767 & e.E[t]
374 | r = (n * self.T['mpl'] + (n * self.T['mph'] + (e.E[t] >> 15)
375 | * self.T['mpl'] & self.T['um']) << 15) & DM
376 | n = t + self.T['m'].t
377 | e.E[n] += self.T['m'].am(0, r, e, t, 0, self.T['m'].t)
378 | while e.E[n] >= DV:
379 | e.E[n] -= DV
380 | n += 1
381 | e.E[n] += 1
382 | e.clamp()
383 | e.drShiftTo(self.T['m'].t, e)
384 | if e.compareTo(self.T['m']) >= 0:
385 | e.subTo(self.T['m'], e)
386 |
387 | def drShiftTo(self, e: int, t):
388 | for n in range(e, self.t):
389 | t.E[n - e] = self.E[n]
390 | t.t = np.max([self.t - e, 0])
391 | t.s = self.s
392 |
393 | def mulTo(self, e, t, n):
394 | e.multiplyTo(t, n)
395 | self.reduceE(n)
396 |
397 | def multiplyTo(self, e, t):
398 | t.t = self.t + e.t
399 | for o in range(self.t):
400 | t.E[o] = 0
401 | for o in range(e.t):
402 | t.E[o + self.t] = self.am(0, e.E[o], t, o, 0, self.t)
403 | t.s = 0
404 | t.clamp()
405 | if self.s != e.s:
406 | E().subTo(t, t)
407 |
408 | def isEven(self):
409 | t = 1 & self.E[0] if self.t else self.s
410 | return t == 0
411 |
412 | def abs(self):
413 | return self if self.s > 0 else self
414 |
415 | def tostring(self, e: int):
416 | if self.s < 0:
417 | echo('0|warning', '.s < 0', self.s)
418 | return '-'
419 | t = int(np.log2(e))
420 | r, o, i, a = (1 << t) - 1, False, '', self.t
421 | s = DB - a * DB % t
422 | if a > 0:
423 | if s < DB:
424 | n = self.E[a] >> s
425 | if n > 0:
426 | o = True
427 | i = g[n]
428 | a -= 1
429 | while a >= 0:
430 | if s < t:
431 | n = (self.E[a] & (1 << s) - 1) << t - s
432 | a -= 1
433 | s += DB - t
434 | n = n | (self.E[a] >> s)
435 | else:
436 | s -= t
437 | n = self.E[a] >> s & r
438 | if s <= 0:
439 | s += DB
440 | a -= 1
441 | if n > 0:
442 | o = True
443 | if o:
444 | i += g[n]
445 | return i if o else '0'
446 |
447 |
448 | class O(object):
449 | def __init__(self):
450 | self.T = {}
451 |
452 | self.n = {
453 | 'A': 48,
454 | 'BUTTON': 1,
455 | 'CANVAS': 1,
456 | 'CPUClass': None,
457 | 'DIV': 71,
458 | 'HTMLLength': 158225,
459 | 'IMG': 5,
460 | 'INPUT': 4,
461 | 'LABEL': 1,
462 | 'LI': 21,
463 | 'LINK': 3,
464 | 'P': 10,
465 | 'SCRIPT': 14,
466 | 'SPAN': 9,
467 | 'STYLE': 18,
468 | 'UL': 4,
469 | 'browserLanguage': "zh-CN",
470 | 'browserLanguages': "zh-CN,zh",
471 | 'canvas2DFP': "5eb3d9a167292cc324a4a6b692171a49",
472 | 'canvas3DFP': "b2284dba7b1ccb5ef8fabc22c0065611",
473 | 'colorDepth': 24,
474 | 'cookieEnabled': 1,
475 | 'devicePixelRatio': 2,
476 | 'deviceorientation': False,
477 | 'doNotTrack': 0,
478 | 'documentMode': "CSS1Compat",
479 | 'flashEnabled': -1,
480 | 'hardwareConcurrency': 8,
481 | 'indexedDBEnabled': 1,
482 | 'innerHeight': 150,
483 | 'innerWidth': 1680,
484 | 'internalip': None,
485 | 'javaEnabled': 0,
486 | 'jsFonts': "AndaleMono,Arial,ArialBlack,ArialHebrew,ArialNarrow,ArialRoundedMTBold,ArialUnicodeMS,ComicSansMS,Courier,CourierNew,Geneva,Georgia,Helvetica,HelveticaNeue,Impact,LUCIDAGRANDE,MicrosoftSansSerif,Monaco,Palatino,Tahoma,Times,TimesNewRoman,TrebuchetMS,Verdana,Wingdings,Wingdings2,Wingdings3",
487 | 'localStorageEnabled': 1,
488 | 'maxTouchPoints': 0,
489 | 'mediaDevices': -1,
490 | 'netEnabled': 1,
491 | 'outerHeight': 987,
492 | 'outerWidth': 1680,
493 | 'performanceTiming': "-1,-1,16,2,122,0,274,0,209,137,6,6,32,3405,3405,3408,35543,35544,35547,-1",
494 | 'platform': "MacIntel",
495 | 'plugins': "internal-pdf-viewer,mhjfbmdgcfjbbpaeojofohoefgiehjai,internal-nacl-plugin",
496 | 'screenAvailHeight': 987,
497 | 'screenAvailLeft': 0,
498 | 'screenAvailTop': 23,
499 | 'screenAvailWidth': 1680,
500 | 'screenHeight': 1050,
501 | 'screenLeft': 0,
502 | 'screenTop': 23,
503 | 'screenWidth': 1680,
504 | 'sessionStorageEnabled': 1,
505 | 'systemLanguage': None,
506 | 'textLength': 93737,
507 | 'timestamp': int(time_stamp()),
508 | 'timezone': -8,
509 | 'touchEvent': False,
510 | 'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3818.0 Safari/537.36",
511 | }
512 | self.t = ['textLength', 'HTMLLength', 'documentMode', 'A', 'ARTICLE', 'ASIDE', 'AUDIO', 'BASE', 'BUTTON', 'CANVAS', 'CODE', 'IFRAME', 'IMG', 'INPUT', 'LABEL', 'LINK', 'NAV', 'OBJECT', 'OL', 'PICTURE', 'PRE', 'SECTION', 'SELECT', 'SOURCE', 'SPAN', 'STYLE', 'TABLE', 'TEXTAREA', 'VIDEO', 'screenLeft', 'screenTop', 'screenAvailLeft', 'screenAvailTop', 'innerWidth', 'innerHeight', 'outerWidth', 'outerHeight', 'browserLanguage', 'browserLanguages', 'systemLanguage', 'devicePixelRatio', 'colorDepth',
513 | 'userAgent', 'cookieEnabled', 'netEnabled', 'screenWidth', 'screenHeight', 'screenAvailWidth', 'screenAvailHeight', 'localStorageEnabled', 'sessionStorageEnabled', 'indexedDBEnabled', 'CPUClass', 'platform', 'doNotTrack', 'timezone', 'canvas2DFP', 'canvas3DFP', 'plugins', 'maxTouchPoints', 'flashEnabled', 'javaEnabled', 'hardwareConcurrency', 'jsFonts', 'timestamp', 'performanceTiming', 'internalip', 'mediaDevices', 'DIV', 'P', 'UL', 'LI', 'SCRIPT', 'deviceorientation', 'touchEvent']
514 |
515 | def get_performanceTiming(self):
516 | r = ['navigationStart', 'redirectStart', 'redirectEnd', 'fetchStart', 'domainLookupStart',
517 | 'domainLookupEnd', 'connectStart', 'connectEnd', 'requestStart', 'responseStart']
518 | o = ['responseEnd', 'unloadEventStart', 'unloadEventEnd', 'domLoading', 'domInteractive', 'domContentLoadedEventStart',
519 | 'domContentLoadedEventEnd', 'domComplete', 'loadEventStart', 'loadEventEnd', 'msFirstPaint']
520 | n = {
521 | 'connectEnd': 1568518372487,
522 | 'connectStart': 1568518372213,
523 | 'domComplete': 1568518408239,
524 | 'domContentLoadedEventEnd': 1568518376104,
525 | 'domContentLoadedEventStart': 1568518376101,
526 | 'domInteractive': 1568518376101,
527 | 'domLoading': 1568518372728,
528 | 'domainLookupEnd': 1568518372213,
529 | 'domainLookupStart': 1568518372091,
530 | 'fetchStart': 1568518372089,
531 | 'loadEventEnd': 1568518408243,
532 | 'loadEventStart': 1568518408240,
533 | 'navigationStart': 1568518372073,
534 | 'redirectEnd': 0,
535 | 'redirectStart': 0,
536 | 'requestStart': 1568518372487,
537 | 'responseEnd': 1568518372833,
538 | 'responseStart': 1568518372696,
539 | 'secureConnectionStart': 1568518372348,
540 | 'unloadEventEnd': 1568518372702,
541 | 'unloadEventStart': 1568518372702,
542 | }
543 | i = []
544 | for e in range(1, len(r)):
545 | a = n[r[e]]
546 | if a == 0:
547 | i.append(-1)
548 | else:
549 | for s in range(e - 1, -1, -1):
550 | c = n[r[s]]
551 | if c:
552 | i.append(a - c)
553 | break
554 | u = n[r[len(r) - 1]]
555 | for e in o:
556 | if e in n and n[e]:
557 | i.append(n[e] - u)
558 | else:
559 | i.append(-1)
560 | self.n['performanceTiming'] = ','.join([str(ii) for ii in i])
561 |
562 | def __call__(self):
563 | self.get_performanceTiming()
564 | self.r = [self.n[ii] if ii in self.n else -1 for ii in self.t]
565 | self.i = '!!'.join([str(ii) for ii in self.r]).replace(
566 | 'False', 'false').replace('True', 'true')
567 |
568 |
569 | class T(object):
570 | ''' AES '''
571 | def gjson_stringify(self, o: dict):
572 | o_str = str(o).replace("'", '"').replace(
573 | 'True', 'true').replace('False', 'false')
574 | return o_str
575 |
576 | def parse(self, aes_key: str):
577 | n = {}
578 | for r, p in enumerate(aes_key):
579 | if r >> 2 in n:
580 | n[r >> 2] = (n[r >> 2]) | ((255 & ord(p)) << 24 - r % 4 * 8)
581 | else:
582 | n[r >> 2] = ((255 & ord(p)) << 24 - r % 4 * 8)
583 | return {
584 | 'sigBytes': len(aes_key),
585 | 'words': list(n.values()),
586 | }
587 |
588 | def encrypt(self, e: str, t: str):
589 | t = self.parse(t)
590 | n = {}
591 | n['iv'] = self.parse('0000000000000000')
592 |
593 | def E_encrypt(self, u: dict, e: str, t: dict, n: dict):
594 | self.createEncryptor(t, n)
595 |
596 | def createEncryptor(self, e: dict, r: dict):
597 | pass
598 |
599 | def create(self):
600 | e = 1
601 |
--------------------------------------------------------------------------------
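The class `T` above ports CryptoJS's AES front end; its `parse` packs a key string into big-endian 32-bit words exactly like `CryptoJS.enc.Utf8.parse`. A minimal standalone sketch of that packing (the sample key is only for illustration):

```python
# standalone sketch of T.parse: pack a string into CryptoJS-style 32-bit words
def parse(aes_key: str) -> dict:
    words = {}
    for r, p in enumerate(aes_key):
        words[r >> 2] = words.get(r >> 2, 0) | ((255 & ord(p)) << (24 - r % 4 * 8))
    return {'sigBytes': len(aes_key), 'words': list(words.values())}

# 'abcd' packs into a single word 0x61626364, the same result CryptoJS gives
print(hex(parse('abcd')['words'][0]))  # 0x61626364
```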
/bilibili/loginBilibili.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-09-14 14:47:48
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2020-03-21 20:56:38
6 |
7 | import base64
8 | import json
9 | import os
10 | import sys
11 | import time
12 | import urllib
13 |
14 | import numpy as np
15 | import regex
16 | import rsa
17 |
18 | sys.path.append(os.getcwd())
19 | from util.util import can_retry, echo, encoder_cookie, send_email, time_stamp, get_accept, get_content_type
20 |
21 | from bilibili.basicBilibili import BasicBilibili
22 | from bilibili.geetestE import E, O, S
23 |
24 |
25 | proxy_req = 0
26 | one_day = 86400
27 | root_dir = os.path.abspath('bilibili')
28 | data_dir = os.path.join(root_dir, 'data/')
29 | PUBLIC = '00C1E3934D1614465B33053E7F48EE4EC87B14B95EF88947713D25EECBFF7E74C7977D02DC1D9451F79DD5D1C10C29ACB6A9B4D6FB7D0A0279B6719E1772565F09AF627715919221AEF91899CAE08C0D686D748B20A3603BE2318CA6BC2B59706592A9219D0BF05C9F65023A21D2330807252AE0066D59CEEFA5F2748EA80BAB81'
30 |
31 |
32 | class Login(BasicBilibili):
33 | ''' bilibili login module '''
34 |
35 | def __init__(self):
36 | super(Login, self).__init__()
37 | self.update_proxy(1)
38 | self.access_key = ''
39 | self.aes_key = ''
40 | self.T = E(list(PUBLIC), 16)
41 |
42 | def get_access_key(self):
43 | captcha, cookie = self.get_captcha()
44 | hash_salt, key, cookie = self.get_hash_salt(cookie)
45 | if captcha is None:
46 | return
47 | types, cookie = self.get_type(captcha['gt'], cookie)
48 | return {
49 | 'captcha': captcha,
50 | 'hash_salt': hash_salt,
51 | 'types': types,
52 | 'cookie': cookie
53 | }
54 |
55 | def get_aes_key(self):
56 | def wl():
57 | return hex(int(65536 * (1 + np.random.random())))[3:]
58 | return wl() + wl() + wl() + wl()
59 |
60 | def get_t(self, aes_key: str, t: int = 128):
61 |         n = np.zeros(t).astype(int)
62 | for ii, jj in enumerate(aes_key):
63 | n[ii + 112] = ord(jj)
64 | i = S()
65 | for ii in range(t - 2, 1, -1):
66 | n[ii] = i()
67 | n[1] = 2
68 | return n
69 |
70 | def doPublic(self):
71 | n = self.get_t(self.get_aes_key())
72 | self.N = E(n, 256)
73 | n = self.N.modPowInt(65537, self.T)
74 | r = n.tostring(16)
75 | add = '' if not (1 & len(r)) else '0'
76 | return '{}{}'.format(add, r)
77 |
78 | def get_hash_salt(self, cookie: dict = {}):
79 | url = self.GET_KEY_URL % np.random.random()
80 | headers = self.get_login_headers(2, cookie)
81 | hash_salt, cookies = proxy_req(url, 1, header=headers,
82 | need_cookie=True)
83 | if hash_salt is None or list(hash_salt.keys()) != ['hash', 'key']:
84 | if can_retry(url):
85 | return self.get_hash_salt()
86 | else:
87 | return None, {}
88 | return hash_salt['hash'], hash_salt['key'], cookies
89 |
90 | def get_captcha(self, cookie: dict = {}):
91 | url = self.CAPTCHA_URL
92 | headers = self.get_login_headers(0, cookie)
93 | captcha, cookies = proxy_req(url, 1, header=headers, need_cookie=True)
94 | if captcha is None or list(captcha.keys()) != ['code', 'data']:
95 | if can_retry(url):
96 | return self.get_captcha()
97 | else:
98 | return None, {}
99 | return captcha['data']['result'], cookies
100 |
101 | def get_access_key_req(self, hash_salt: str, key: str, challenge: str, validate: str, cookie: dict = {}):
102 | data = {
103 | 'captchaType': 11,
104 | 'username': self.username,
105 |             'password': self.encode_login_info(hash_salt, key),
106 | 'keep': True,
107 | 'key': key,
108 | 'goUrl': self.AV_URL,
109 | 'challenge': challenge,
110 | 'validate': validate,
111 | 'seccode': f'{validate}|jordan'
112 | }
113 | headers = self.get_login_headers(1, cookie)
114 |         return proxy_req(self.LOGIN_V2_URL, 12, data, header=headers)
115 |
116 | def get_type(self, gt: str, cookies: dict = {}) -> dict:
117 | url = self.GETTYPE_URL % (gt, int(time_stamp() * 1000))
118 | headers = self.get_login_headers(3, cookies)
119 | req, cookie = proxy_req(url, 3, header=headers, need_cookie=True)
120 | j_begin = req.find('{')
121 | if req == '' or j_begin == -1:
122 | if can_retry(self.GETTYPE_URL):
123 | return self.get_type(gt, cookies)
124 | else:
125 | return None, {}
126 | type_json = json.loads(req[j_begin:-1])
127 | return type_json['data'], cookie
128 |
129 | def encode_login_info(self, hash_salt: str, key: str):
130 | public_key = rsa.PublicKey.load_pkcs1_openssl_pem(key.encode())
131 |         concate = rsa.encrypt((hash_salt + self.password).encode('utf-8'), public_key)
132 |         s = base64.b64encode(concate)
133 | s = urllib.parse.quote_plus(s)
134 | return s
135 |
136 | def get_login_headers(self, mode: int = 0, cookie: dict = {}):
137 | headers = {
138 | 'Referer': self.LOGIN_URL,
139 | }
140 | if mode != 3:
141 | headers['Accept'] = get_accept('*') if mode == 2 else get_accept('xhr')
142 | if mode == 1:
143 | headers['Content-Type'] = get_content_type('')
144 | elif mode == 2:
145 | headers['X-Requested-With'] = 'XMLHttpRequest'
146 | if len(cookie):
147 | headers['Cookie'] = encoder_cookie(cookie)
148 | return headers
149 |
150 | def update_proxy(self, mode: int = 0):
151 | global proxy_req
152 | if not mode:
153 | self.update_proxy_basic()
154 | proxy_req = self.proxy_req
155 |
--------------------------------------------------------------------------------
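For reference, `encode_login_info` above follows the usual bilibili web-login password flow: RSA-encrypt `hash + password` with the PEM key returned by `get_hash_salt`, then base64- and URL-encode the ciphertext. A minimal sketch with the `rsa` package; the salt, key and password are placeholders, not real credentials:

```python
import base64
import urllib.parse

import rsa


def encode_password(hash_salt: str, password: str, pem_key: str) -> str:
    # RSA(hash + password) -> base64 -> URL-encode, mirroring Login.encode_login_info
    public_key = rsa.PublicKey.load_pkcs1_openssl_pem(pem_key.encode())
    cipher = rsa.encrypt((hash_salt + password).encode('utf-8'), public_key)
    return urllib.parse.quote_plus(base64.b64encode(cipher))
```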
/blog/titleviews.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-02-09 11:10:52
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2020-01-04 14:41:30
6 |
7 | import argparse
8 | import codecs
9 | import datetime
10 | import re
11 | import os
12 | import threading
13 |
14 | from bs4 import BeautifulSoup
15 | from proxy.getproxy import GetFreeProxy
16 | from util.db import Db
17 | from util.util import begin_time, end_time, changeCookie, basic_req, can_retry, changeHtmlTimeout, echo, mkdir, read_file, get_accept
18 |
19 | """
20 | * blog @http
21 | * www.zhihu.com/api/v4/creator/content_statistics
22 | * www.jianshu.com/u/
23 | * blog.csdn.net
24 | .data/
25 | ├── cookie // zhihu cookie
26 | ├── google // google analysis data
27 | ├── slug // blog title slug
28 | └── title // blog title list
29 | """
30 | proxy_req = GetFreeProxy().proxy_req
31 | data_dir = 'blog/data/'
32 |
33 |
34 | class TitleViews(object):
35 | ''' script of load my blog data -> analysis '''
36 | CSDN_URL = 'https://blog.csdn.net/iofu728'
37 | JIANSHU_URL = 'https://www.jianshu.com/u/2e0f69e4a4f0'
38 | ZHIHU_URL = 'https://www.zhihu.com/api/v4/creator/content_statistics/'
39 |
40 | def __init__(self):
41 | self.Db = Db("blog")
42 | self.local_views = {}
43 | self.title_map = {}
44 | self.title2slug = {}
45 | self.zhihu_views = {}
46 | self.zhihu_id = {}
47 | self.jianshu_views = {}
48 | self.jianshu_id = {}
49 | self.csdn_views = {}
50 | self.csdn_id = {}
51 |         self.exist_data, self.zhihu_id_map, self.csdn_id_map, self.jianshu_id_map = {}, {}, {}, {}
52 | self.getTitleMap()
53 | self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s'''
54 | self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s'''
55 | self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, `existed_spider`) VALUES %s'''
56 |
57 | def loadLocalView(self):
58 | ''' load local view '''
59 | test = read_file('{}google'.format(data_dir))[7:]
60 | for index in test:
61 | arr = index.split(',')
62 | slug = self.matchSlug(arr[0])
63 | if slug is None or slug not in self.title_map:
64 | continue
65 | print(slug + ' ' + str(arr[1]) + ' ' + arr[0])
66 | if slug in self.local_views:
67 | self.local_views[slug] += int(arr[1])
68 | else:
69 | self.local_views[slug] = int(arr[1])
70 |
71 | def getTitleMap(self):
72 | ''' get title map '''
73 | slug = read_file('{}slug'.format(data_dir))
74 | title = read_file('{}title'.format(data_dir))
75 | self.title_map = {tempslug.split(
76 | '"')[1]: title[num].split('"')[1] for num, tempslug in enumerate(slug)}
77 | title2slug = {
78 | self.title_map[index]: index for index in self.title_map.keys()}
79 | noemoji_title = {self.filter_emoji(
80 | self.title_map[index]).replace('\u200d', ''): index for index in self.title_map.keys()}
81 | self.title2slug = {**noemoji_title, **title2slug}
82 |
83 | def matchSlug(self, pattern: str):
84 | ''' match slug '''
85 | arr = re.search(r'\/([^\/]+).html', pattern)
86 | return None if arr is None else arr.group(1)
87 |
88 | def getZhihuView(self):
89 | cookie = ''.join(read_file('{}cookie'.format(data_dir)))
90 | changeCookie(cookie)
91 | url_basic = [
92 | self.ZHIHU_URL,
93 | 'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=',
94 | datetime.datetime.now().strftime("%Y-%m-%d"),
95 | '&page_no='
96 | ]
97 | url = ''.join(url_basic)
98 |
99 | json = self.get_request('{}{}'.format(url, 1), 1, lambda i: not i)
100 | if not json:
101 | return
102 | if not 'data' in json:
103 | if 'code' in json:
104 | echo('0|warning', json)
105 | return
106 | echo(3, 'zhihu', json)
107 | for index in json['data']:
108 | zhihu_title = index['title']
109 | zhihu_id = int(index['url_token'])
110 | zhihu_count = int(index['read_count'])
111 |
112 | if zhihu_title in self.title2slug:
113 | temp_slug = self.title2slug[zhihu_title]
114 | self.zhihu_id[temp_slug] = zhihu_id
115 | self.zhihu_views[temp_slug] = zhihu_count
116 | elif zhihu_id in self.zhihu_id_map:
117 | temp_slug = self.zhihu_id_map[zhihu_id]
118 | self.zhihu_id[temp_slug] = zhihu_id
119 | self.zhihu_views[temp_slug] = zhihu_count
120 | else:
121 | echo('0|debug', index['title'])
122 |
123 | for index in range(1, json['count'] // 10):
124 | echo(1, 'zhihu', index)
125 | json = self.get_request('{}{}'.format(url, 1 + index), 1, lambda i: not i)
126 | echo(2, 'zhihu', json)
127 | if not json:
128 | continue
129 | for index in json['data']:
130 | zhihu_title = index['title']
131 | zhihu_id = int(index['url_token'])
132 | zhihu_count = int(index['read_count'])
133 |
134 | if zhihu_title in self.title2slug:
135 | temp_slug = self.title2slug[zhihu_title]
136 | self.zhihu_id[temp_slug] = zhihu_id
137 | self.zhihu_views[temp_slug] = zhihu_count
138 | elif zhihu_id in self.zhihu_id_map:
139 | temp_slug = self.zhihu_id_map[zhihu_id]
140 | self.zhihu_id[temp_slug] = zhihu_id
141 | self.zhihu_views[temp_slug] = zhihu_count
142 | else:
143 | echo('0|debug', index['title'])
144 |
145 | def get_request(self, url: str, types: int, functs, header: dict = {}):
146 | if len(header):
147 | req = basic_req(url, types, header=header)
148 | else:
149 | req = basic_req(url, types)
150 |
151 | if functs(req):
152 | if can_retry(url):
153 | self.get_request(url, types, functs, header)
154 | return
155 | return req
156 |
157 | def getJianshuViews(self):
158 | ''' get jianshu views '''
159 | header = {'accept': get_accept('html')}
160 |
161 | for rounds in range(1, 4):
162 | url = self.JIANSHU_URL
163 | if rounds > 1:
164 | url += '?order_by=shared_at&page={}'.format(rounds)
165 | echo('1|debug', 'jianshu req url:', url)
166 | html = self.get_request(url, 0, lambda i: not i or not len(
167 | i.find_all('div', class_='content')), header)
168 | if html is None:
169 | echo(0, 'None')
170 | return
171 | for index in html.find_all('li', class_=["", 'have-img']):
172 | if len(index.find_all('i')) < 3:
173 | continue
174 | title = index.find_all('a', class_='title')[
175 | 0].text.replace('`', '')
176 | jianshu_id = int(index['data-note-id'])
177 | jianshu_count = int(index.find_all('a')[-2].text)
178 | if title in self.title2slug:
179 | temp_slug = self.title2slug[title]
180 | self.jianshu_id[temp_slug] = jianshu_id
181 | self.jianshu_views[temp_slug] = jianshu_count
182 | elif jianshu_id in self.jianshu_id_map:
183 | temp_slug = self.jianshu_id_map[jianshu_id]
184 | self.jianshu_id[temp_slug] = jianshu_id
185 | self.jianshu_views[temp_slug] = jianshu_count
186 | else:
187 | echo(1, title)
188 |
189 | def getCsdnViews(self):
190 | ''' get csdn views '''
191 |
192 | for index in range(1, 3):
193 | url = self.CSDN_URL
194 | if index > 1:
195 | url += '/article/list/{}?'.format(index)
196 | echo(1, 'csdn url', url)
197 |
198 | html = self.get_request(url, 0, lambda i: i is None or not i or not len(
199 | i.find_all('p', class_='content')))
200 | if html is None:
201 | echo(0, 'None')
202 | return
203 | for div_lists in html.find_all('div', class_='article-item-box csdn-tracking-statistics'):
204 | if 'style' in div_lists.attrs:
205 | continue
206 | csdn_id = int(div_lists['data-articleid'])
207 | title = div_lists.a.contents[2].replace(
208 | '\n', '').strip().replace('`', '')
209 | csdn_count = int(div_lists.find_all(
210 | 'span', class_='read-num')[0].span.text)
211 | if title in self.title2slug:
212 | temp_slug = self.title2slug[title]
213 | self.csdn_id[temp_slug] = csdn_id
214 | self.csdn_views[temp_slug] = csdn_count
215 | elif csdn_id in self.csdn_id_map:
216 | temp_slug = self.csdn_id_map[csdn_id]
217 | self.csdn_id[temp_slug] = csdn_id
218 | self.csdn_views[temp_slug] = csdn_count
219 | else:
220 | echo(1, title)
221 |
222 | def filter_emoji(self, desstr, restr=''):
223 | ''' filter emoji '''
224 | desstr = str(desstr)
225 | try:
226 | co = re.compile(u'[\U00010000-\U0010ffff]')
227 | except re.error:
228 | co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]')
229 | return co.sub(restr, desstr)
230 |
231 | def init_db(self):
232 | self.loadLocalView()
233 | self.getZhihuView()
234 | self.getJianshuViews()
235 | self.getCsdnViews()
236 | insert_list = []
237 | for index in self.title_map.keys():
238 | insert_list.append((index, self.local_views[index] if index in self.local_views else 0, self.zhihu_views[index] if index in self.zhihu_views else 0, self.csdn_views[index] if index in self.csdn_views else 0, self.jianshu_views[index]
239 | if index in self.jianshu_views else 0, self.zhihu_id[index] if index in self.zhihu_id else 0, self.csdn_id[index] if index in self.csdn_id else 0, self.jianshu_id[index] if index in self.jianshu_id else 0))
240 | # return insert_list
241 | results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1])
242 | if results:
243 | if len(insert_list):
244 | print('Insert ' + str(len(insert_list)) + ' Success!')
245 | else:
246 | pass
247 |
248 | def select_all(self):
249 | result = self.Db.select_db(
250 | "SELECT `id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at` from title_views where `is_deleted`=0")
251 | if result == False:
252 | print("SELECT Error!")
253 | else:
254 | self.exist_data = {index[1]: list(index) for index in result}
255 | self.zhihu_id_map = {index[6]: index[1]
256 | for index in result if index[6]}
257 | self.csdn_id_map = {index[7]: index[1]
258 | for index in result if index[7]}
259 | self.jianshu_id_map = {index[8]: index[1]
260 | for index in result if index[8]}
261 | for index in self.exist_data:
262 | self.exist_data[index][-1] = self.exist_data[index][-1].strftime(
263 | '%Y-%m-%d %H:%M:%S')
264 |
265 | def update_view(self):
266 | changeHtmlTimeout(10)
267 | wait_map = {}
268 | self.select_all()
269 | self.getZhihuView()
270 | self.getJianshuViews()
271 | self.getCsdnViews()
272 | for index in self.zhihu_views.keys():
273 | if self.zhihu_views[index] == self.exist_data[index][3] and self.zhihu_id[index] == self.exist_data[index][6]:
274 | continue
275 | wait_map[index] = self.exist_data[index]
276 | wait_map[index][3] = self.zhihu_views[index]
277 | wait_map[index][6] = self.zhihu_id[index]
278 | for index in self.csdn_views.keys():
279 | if self.csdn_views[index] == self.exist_data[index][4] and self.csdn_id[index] == self.exist_data[index][7]:
280 | continue
281 | if index not in wait_map:
282 | wait_map[index] = self.exist_data[index]
283 | wait_map[index][4] = self.csdn_views[index]
284 | wait_map[index][7] = self.csdn_id[index]
285 | for index in self.jianshu_views.keys():
286 | if self.jianshu_views[index] == self.exist_data[index][5] and self.jianshu_id[index] == self.exist_data[index][8]:
287 | continue
288 | wait_map[index] = self.exist_data[index]
289 | wait_map[index][5] = self.jianshu_views[index]
290 | wait_map[index][8] = self.jianshu_id[index]
291 | update_list = [tuple(index) for index in wait_map.values()]
292 | # return update_list:q
293 | if not len(update_list):
294 | return
295 | results = self.Db.update_db(self.update_sql % str(update_list)[1:-1])
296 | if results:
297 | if len(update_list):
298 | print('Update ' + str(len(update_list)) + ' Success!')
299 | else:
300 | pass
301 |
302 | def new_day(self):
303 | day_data = self.Db.select_db(
304 | "SELECT `today_views`, `existed_views` from page_views order by `id` desc limit 1")
305 | if not os.path.exists('../blog/log/basic'):
306 | print('File not exist!!!')
307 | return
308 | with codecs.open("../blog/log/basic", 'r', encoding='utf-8') as f:
309 | existed_spider = int(f.readlines()[1])
310 | today_date = datetime.datetime.now().strftime('%Y-%m-%d')
311 | new_day_list = [(today_date, day_data[0][0] +
312 | day_data[0][1], existed_spider)]
313 | results = self.Db.insert_db(self.new_day_sql % str(new_day_list)[1:-1])
314 | if results:
315 | if len(new_day_list):
316 |                 print('New day update ' + str(len(new_day_list)) + ' Success!')
317 | else:
318 | pass
319 |
320 | def load_csdn_img(self):
321 | ''' load csdn img '''
322 | mkdir(data_dir)
323 | urls = ['/article/list/2?', '']
324 | article_ids = []
325 | for url in urls:
326 | req = basic_req('{}{}'.format(self.CSDN_URL, url), 3)
327 | article_ids.extend(re.findall('data-articleid="(\w*?)"', req))
328 | echo(0, article_ids)
329 | article_thread = [threading.Thread(
330 | target=self.load_csdn_img_batch, args=(ii,)) for ii in article_ids]
331 | for work in article_thread:
332 | work.start()
333 | for work in article_thread:
334 | work.join()
335 |
336 | def load_csdn_img_batch(self, article_id: int):
337 | url = '{}/article/details/{}'.format(self.CSDN_URL, article_id)
338 | req = proxy_req(url, 3)
339 | if not 'iofu728' in req:
340 | if can_retry(url):
341 | self.load_csdn_img_batch(article_id)
342 | return
343 | img_lists = re.findall('"(https://cdn.nlark.com.*)" alt', req)
344 | img_thread = [threading.Thread(target=self.load_csdn_img_load, args=(
345 | jj, article_id, ii))for ii, jj in enumerate(img_lists)]
346 | echo(1, 'Article Need Load {} Img...'.format(len(img_lists)))
347 | for work in img_thread:
348 | work.start()
349 | for work in img_thread:
350 | work.join()
351 |
352 | def load_csdn_img_load(self, img_url: str, article_id: int, idx: int):
353 | img_dir = '{}{}/'.format(data_dir, article_id)
354 | img_path = '{}{}.png'.format(img_dir, idx)
355 | if os.path.exists(img_path):
356 | return
357 | req = proxy_req(img_url, 2)
358 | if type(req) == bool or req is None:
359 | if can_retry(img_url):
360 | self.load_csdn_img_load(img_url, article_id, idx)
361 | return
362 | mkdir(img_dir)
363 | with open(img_path, 'wb') as f:
364 | f.write(req.content)
365 |
366 |
367 | if __name__ == '__main__':
368 | if not os.path.exists(data_dir):
369 | os.makedirs(data_dir)
370 | parser = argparse.ArgumentParser(description='gunjianpan blog backup code')
371 | parser.add_argument('--model', type=int, default=1, metavar='N',
372 | help='model update or new day')
373 | model = parser.parse_args().model
374 | bb = TitleViews()
375 | if model == 1:
376 | bb.update_view()
377 | else:
378 | bb.new_day()
379 | bb.update_view()
380 |
--------------------------------------------------------------------------------
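`matchSlug` in titleviews.py only needs the last path segment of each exported Google Analytics URL, so the regex captures whatever sits between the final `/` and `.html`. A quick check with made-up URLs:

```python
import re


def match_slug(pattern: str):
    # same regex as TitleViews.matchSlug
    arr = re.search(r'\/([^\/]+).html', pattern)
    return None if arr is None else arr.group(1)


print(match_slug('https://example.com/spider/proxy-pool.html'))  # proxy-pool
print(match_slug('https://example.com/about/'))                  # None
```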
/brushclass/brushclass.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-02-25 21:13:45
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2019-10-11 02:03:24
6 |
7 | import argparse
8 | import time
9 | import random
10 | import os
11 | import sys
12 |
13 | sys.path.append(os.getcwd())
14 | from collections import Counter
15 | from proxy.getproxy import GetFreeProxy
16 | from util.util import begin_time, end_time, send_email, can_retry, echo, basic_req, get_accept, get_content_type
17 |
18 | proxy_req = GetFreeProxy().proxy_req
19 | data_path = 'brushclass/data/'
20 |
21 | """
22 | * brush @http
23 | * http://elective.pku.edu.cn
24 | * https://portal.w.pku.edu.cn/portal2017/bizcenter/score/retrScores.do
25 | .data/
26 | └── cookie // elective.pku.edu.cn cookie
27 | """
28 |
29 |
30 | class Brush(object):
31 | """
32 | brush class in http://elective.pku.edu.cn
33 | """
34 |
35 |     def __init__(self):
36 | self.failured_map = {}
37 | self.laster_timestamp = 0
38 |
39 | def have_places(self):
40 | """
41 | brush class
42 | """
43 | version = begin_time()
44 | have_places = False
45 |
46 | while not have_places:
47 | if self.have_places_once():
48 | send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
49 | send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
50 | send_email('大数据专题', '大数据专题 有名额啦 有名额啦')
51 | have_places = True
52 | time.sleep(random.randint(10, 20))
53 | end_time(version)
54 |
55 | def have_places_once(self):
56 | """
57 | have places
58 | """
59 | url = 'http://elective.pku.edu.cn/elective2008/edu/pku/stu/elective/controller/supplement/refreshLimit.do'
60 | if not os.path.exists('%scookie' % data_path):
61 | print('Brush Cookie not exist!!!')
62 | return
63 | with open('%scookie' % data_path, 'r') as f:
64 | cookie = f.readlines()
65 | headers = {
66 | 'X-Requested-With': 'XMLHttpRequest',
67 | 'Cookie': '',
68 | 'Content-Type': get_content_type(),
69 | 'Accept': get_accept('xhr'),
70 | "Origin": "http://elective.pku.edu.cn",
71 | "Referer": "http://elective.pku.edu.cn/elective2008/edu/pku/stu/elective/controller/supplement/SupplyCancel.do",
72 | }
73 | headers['Cookie'] = cookie[0][:-1]
74 |
75 | data = {
76 | "index": '10',
77 | "seq": 'yjkc20141100016542',
78 | }
79 |
80 | ca = proxy_req(url, 11, data, header=headers)
81 |
82 | if not ca:
83 | if round(time.time()) - self.laster_timestamp > 60:
84 | send_email("Cookie failure", "Cookie failure")
85 | return False
86 | print(ca['electedNum'])
87 | self.laster_timestamp = round(time.time())
88 | return int(ca['electedNum']) < 120
89 |
90 |
91 | def get_score(cookie: str):
92 | SCORE_URL = 'https://portal.w.pku.edu.cn/portal2017/bizcenter/score/retrScores.do'
93 | headers = {
94 | 'Accept': get_accept('xhr'),
95 | 'Host': 'portal.w.pku.edu.cn',
96 | 'Origin': 'https://portal.w.pku.edu.cn',
97 | 'Referer': 'https://portal.w.pku.edu.cn/portal2017/',
98 | 'Cookie': cookie,
99 |
100 | }
101 | req = basic_req(SCORE_URL, 11, header=headers)
102 | if req is None or list(req.keys()) != ['success', 'xslb', 'xh', 'xm', 'scoreLists']:
103 | if can_retry(SCORE_URL):
104 | return get_score(cookie)
105 | else:
106 | return
107 | return req
108 |
109 |
110 | def get_gpa(cookie: str):
111 | score = get_score(cookie)
112 | if score is None:
113 | return
114 | need_cj = ['A', 'B', 'C', 'D', 'F']
115 | name = score['xm']
116 | student_id = score['xh']
117 | score_list = score['scoreLists']
118 | score_list = [(int(ii['xf']), ii['cj'])
119 | for ii in score_list if ii['cj'][0] in need_cj]
120 | grade_list = [(ii, get_grade_point(jj)) for ii, jj in score_list]
121 | TG = sum([ii * jj for ii, jj in grade_list])
122 | TC = sum([ii for ii, _ in grade_list])
123 | level = [ii[0] for _, ii in score_list]
124 | level_count = Counter(level)
125 | gpa = TG / TC
126 | echo(1, f'{name}, Congratulations u get {TC} credits and {gpa:.3f} gpa in this university.')
127 | for ii in need_cj:
128 | if ii not in level_count:
129 | continue
130 | count = level_count[ii]
131 | echo(2, f'U have {count} class get {ii}.')
132 |
133 |
134 | def get_grade_point(score: str):
135 | score_map = {'A': 4, 'B': 3, 'C': 2, 'D': 1, 'F': 0}
136 | grade_point = score_map[score[0]]
137 | if len(score) == 2 and score[0] != 'F':
138 | flag = 1 if score[1] == '+' else -1
139 | grade_point += 0.3 * flag
140 | grade_point = min(4, grade_point)
141 | return grade_point
142 |
143 |
144 | if __name__ == '__main__':
145 | if not os.path.exists(data_path):
146 | os.makedirs(data_path)
147 | parser = argparse.ArgumentParser(description='pku student helper')
148 |     parser.add_argument('--mode', type=int, default=1, metavar='mode', help='0->brushclass, 1->get_gpa')
149 |     parser.add_argument('--cookie', type=str, default='', metavar='cookie', help='portal cookie')
150 | mode = parser.parse_args().mode
151 | if mode == 0:
152 | brush = Brush()
153 | brush.have_places()
154 | else:
155 | cookie = parser.parse_args().cookie.replace('\'', '').replace('"', '')
156 | get_gpa(cookie)
157 |
--------------------------------------------------------------------------------
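`get_gpa` above is a credit-weighted average: each letter maps to a grade point (A=4 … F=0), a trailing `+`/`-` shifts it by 0.3 with a 4.0 cap, and GPA = Σ(credit × point) / Σ(credit). A small worked example with invented scores:

```python
# credit-weighted GPA, mirroring get_grade_point / get_gpa above
def grade_point(score: str) -> float:
    base = {'A': 4, 'B': 3, 'C': 2, 'D': 1, 'F': 0}[score[0]]
    if len(score) == 2 and score[0] != 'F':
        base += 0.3 if score[1] == '+' else -0.3
    return min(4, base)


courses = [(3, 'A'), (2, 'B+'), (4, 'A-')]        # (credit, grade), hypothetical
tg = sum(c * grade_point(g) for c, g in courses)  # 3*4.0 + 2*3.3 + 4*3.7 = 33.4
tc = sum(c for c, _ in courses)                   # 9 credits
print(round(tg / tc, 3))                          # 3.711
```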
/buildmd/article.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE if not exists `article` (
2 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary keys',
3 | `article_id` varchar(50) NOT NULL DEFAULT '0' COMMENT 'article id',
4 | `title` varchar(500) NOT NULL DEFAULT '0' COMMENT 'article title',
5 | `q` varchar(500) NOT NULL DEFAULT '0' COMMENT 'article q',
6 | `is_deleted` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'is deleted',
7 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'create time',
8 | `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
9 | PRIMARY KEY (`id`)
10 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 comment='article table';
--------------------------------------------------------------------------------
/buildmd/tbk.ini.tmp:
--------------------------------------------------------------------------------
1 | [TBK]
2 | appkey = 123
3 | secret = 123
4 | user_id = 123
5 | site_id = 123
6 | adzone_id = 123
7 | uland_url = http://
8 | test_item_id = 123
9 | test_finger_id = 111
10 | apikey = 111
11 |
12 | [YNOTE]
13 | cookie = "123456"
14 | home_id = 123456
15 | unlogin_id = 123456
--------------------------------------------------------------------------------
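The `*.ini.tmp` files are templates: presumably they are copied to a real `.ini` beside them, filled with actual credentials, and read with the standard-library `configparser`. A sketch under that assumption (section and key names come from the template; the target path is assumed):

```python
import configparser

cfg = configparser.ConfigParser()
cfg.read('buildmd/tbk.ini')          # filled-in copy of tbk.ini.tmp (assumed path)
appkey = cfg['TBK']['appkey']
ynote_cookie = cfg['YNOTE']['cookie']
```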
/buildmd/tpwd.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE if not exists `article_tpwd` (
2 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary keys',
3 | `article_id` varchar(50) NOT NULL DEFAULT '0' COMMENT 'article id',
4 | `tpwd_id` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'tpwd item id',
5 | `item_id` varchar(30) NOT NULL DEFAULT '0' COMMENT 'goods item id',
6 | `tpwd` varchar(30) NOT NULL DEFAULT '0' COMMENT 'tpwd content',
7 | `domain` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'tpwd type @0->s.click, @1->item, @5->uland, @10->taoquan',
8 | `content` varchar(300) NOT NULL DEFAULT '_' COMMENT 'tpwd content',
9 | `url` varchar(1000) NOT NULL DEFAULT '_' COMMENT 'tpwd corresponding url',
10 | `commission_rate` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'commission rate',
11 | `commission_type` varchar(30) NOT NULL DEFAULT '' COMMENT 'commission type',
12 | `expire_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'expire time',
13 | `is_deleted` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'is deleted',
14 | `other1` varchar(300) NOT NULL DEFAULT '' COMMENT 'other1',
15 | `other2` varchar(300) NOT NULL DEFAULT '' COMMENT 'other2',
16 | `other3` varchar(300) NOT NULL DEFAULT '' COMMENT 'other3',
17 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'create time',
18 | `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
19 | PRIMARY KEY (`id`)
20 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 comment='table for article info in tbk';
21 |
--------------------------------------------------------------------------------
/ctrip/hotelDetail.js:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: gunjianpan
3 | * @Date: 2019-04-20 22:21:40
4 | * @Last Modified by: gunjianpan
5 | * @Last Modified time: 2019-04-21 13:18:38
6 | */
7 |
8 | const jsdom = require('jsdom');
9 | const {JSDOM} = jsdom;
10 |
11 | function genEleven(script, url, callback) {
12 | const dom = new JSDOM();
13 | window = dom.window;
14 | document = window.document;
15 | window.decodeURIComponent = decodeURIComponent;
16 | let href = url
17 | let userAgent = 'Chrome/73.0.3682.0'
18 | let geolocation = 0;
19 | document.createElement('div');
20 | var div = document.createElement('div');
21 | div.innerHTML = '333';
22 | window[callback] =
23 | function(e) {
24 | window.AAA = e();
25 | }
26 |
27 | eval(script);
28 | console.log(aaa);
29 | return aaa;
30 | }
31 | url = 'https://hotels.ctrip.com/hotel/4889292.html'
32 |
33 | script = 'let aaa = 1;'
34 | genEleven(script, url, 'CASNAuIDNBfCYLBKdi')
35 |
--------------------------------------------------------------------------------
/ctrip/hotelDetail.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-04-20 10:57:55
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2019-10-11 02:04:24
6 |
7 | import codecs
8 | import datetime
9 | import json
10 | import os
11 | import random
12 | import re
13 | import shutil
14 | import sys
15 | import threading
16 | import time
17 |
18 | import numpy as np
19 | import tzlocal
20 | from bs4 import BeautifulSoup
21 |
22 | import execjs
23 | import js2py
24 |
25 | sys.path.append(os.getcwd())
26 | from util.util import (basic_req, begin_time, changeHeaders, decoder_fuzz,
27 | echo, end_time, time_str, get_accept, get_content_type)
28 |
29 |
30 | data_dir = 'ctrip/data/'
31 | cookie_path = 'ctrip/cookie.txt'
32 | compress_path = 'ctrip/compress.js'
33 | one_day = 86400
34 |
35 |
36 | def decoder_confusion():
37 | ''' decoder confusion '''
38 | with open(f'{data_dir}fingerprint.js', 'r') as f:
39 | origin_js = [codecs.unicode_escape_decode(
40 | ii.strip())[0] for ii in f.readlines()]
41 | __0x3717e_begin = origin_js[1].index('[') + 1
42 | __0x3717e_end = origin_js[1].index(']')
43 | __0x3717e = origin_js[1][__0x3717e_begin:__0x3717e_end].split(',')
44 | __0x3717e = [ii.strip() for ii in __0x3717e]
45 | origin_str = '|||||'.join(origin_js)
46 | params = re.findall(r'var (.*?) =', origin_str)
47 | params_another = re.findall(r'function\((.*?)\)', origin_str)
48 | params_another = sum([ii.replace('|||||', '').split(',')
49 | for ii in params_another], [])
50 | params += params_another
51 |
52 | params = sorted(list(set([ii for ii in params if len(
53 | ii) > 6])), key=lambda ii: len(ii), reverse=True)
54 | for ii, jj in enumerate(__0x3717e):
55 | origin_str = origin_str.replace(f'__0x3717e[{ii}]', jj)
56 | for ii, jj in enumerate(params):
57 | origin_str = origin_str.replace(jj, f'a{ii}')
58 | with open(f'{data_dir}fingerprint_confusion.js', 'w') as f:
59 | f.write('\n'.join(origin_str.split('|||||')))
60 |
61 |
62 | def load_ocean():
63 | ''' load ocean '''
64 | with open(f'{data_dir}oceanball_origin.js', 'r') as f:
65 | origin_js = [ii.strip() for ii in f.readlines()]
66 | origin_str = '|||'.join(origin_js)
67 | params = [*re.findall(r'var ([a-zA-Z]*?) =', origin_str),
68 | re.findall(r'var ([a-zA-Z]*?);', origin_str)]
69 | params_another = re.findall(r'function\((.*?)\)', origin_str)
70 | params_another = sum([ii.replace('|||', '').split(',')
71 | for ii in params_another], [])
72 | params += params_another
73 | params += re.findall(r', ([a-zA-Z]*?)\)', origin_str)
74 | params += re.findall(r'\(([a-zA-Z]*?),', origin_str)
75 |
76 | params = sorted(list(set([ii for ii in params if len(
77 | ii) > 6])), key=lambda ii: len(ii), reverse=True)
78 | for ii, jj in enumerate(params):
79 | origin_str = origin_str.replace(jj, f'a{ii}')
80 | with open(f'{data_dir}oceanball_origin_decoder.js', 'w') as f:
81 | f.write(origin_str.replace('|||', '\n'))
82 |
83 |
84 | def load_ocean_v2():
85 | ''' load ocean ball v2 @2019.6.9 '''
86 | decoder_fuzz('(_\w{3,7}_\w{5})',
87 | '{}oceanballv2_july.js'.format(data_dir),
88 | replace_func=replace_params)
89 |
90 |
91 | def replace_params(origin_str: str, reg: str) -> str:
92 | ''' replace params '''
93 | params_re = re.findall(reg, origin_str)
94 | echo(1, "You're", re.findall('_(.*?)_', params_re[0])[0])
95 | params = {}
96 | for ii in params_re:
97 | if not ii in params:
98 | params[ii] = len(params)
99 | for ii in sorted(list(params.keys()), key=lambda i: -len(i)):
100 | origin_str = origin_str.replace(ii, f'a{params[ii]}')
101 | return origin_str
102 |
103 |
104 | def load_html_js():
105 | with open(f'{data_dir}html_js.js', 'r') as f:
106 | origin_js = [ii.strip() for ii in f.readlines()]
107 | origin_str = '|||'.join(origin_js)
108 |
109 | ''' long params name replace '''
110 | params = re.findall(r'_0x\w{6}?', origin_str)
111 | params += re.findall(r'_0x\w{5}?', origin_str)
112 | params += re.findall(r'_0x\w{4}?', origin_str)
113 | params = sorted(list(set(params)), key=lambda ii: len(ii), reverse=True)
114 |
115 | ''' __0x33920 '''
116 | __0x33920_begin = origin_js[35].index('[') + 1
117 | __0x33920_end = origin_js[35].index(']')
118 | __0x33920 = origin_js[35][__0x33920_begin:__0x33920_end].split(',')
119 | __0x33920 = [ii.strip() for ii in __0x33920]
120 | for ii, jj in enumerate(__0x33920):
121 | origin_str = origin_str.replace('__0x33920[{}]'.format(ii), jj)
122 |
123 | ''' _0x4f05 '''
124 | _0x4f05_dict = {2: "prototype", 3: "hashCode", 4: "length", 5: "pmqAv", 6: "charCodeAt", 11: "EcTAI", 12: "bTlKh", 13: "prototype", 14: "toString", 15: ";expires=", 16: ";path=/", 17: "getDate", 18: "xxxxt", 19: "xxxxt", 20: "ymGjh", 21: "DjPmX", 22: "cookie", 23: "cookie",
125 | 24: "split", 25: "length", 26: "webdriver", 27: "random", 28: "abs", 29: "userAgent", 30: "replace", 31: "abs", 32: "hashCode", 33: "substr", 34: "host", 35: "indexOf", 36: "m.ctrip", 45: "fcerror", 46: "_zQdjfing", 47: "_RGUID", 48: "replace", 49: "fromCharCode", 50: "QVALA"}
126 | _0x4f05_origin = {ii: hex(ii) for ii in _0x4f05_dict.keys()}
127 | _0x4f05_replace = {ii: re.findall(
128 | r'_0x4f05\("%s",.{7}\)' % jj, origin_str) for ii, jj in _0x4f05_origin.items()}
129 | print(_0x4f05_replace)
130 | for ii, jj in _0x4f05_replace.items():
131 | for kk in jj:
132 | origin_str = origin_str.replace(
133 | kk, '"{}"'.format(_0x4f05_dict[ii]))
134 |
135 | ''' _0x1bf9 '''
136 | _0x1bf9_dict = {1: "eit", 2: "NMs", 3: "FMx", 4: "utc", 5: "sign", 6: "sign", 22: "mMa", 23: ";path=/", 24: "KWcVI", 25: "KWcVI", 33: "setDate", 34: "getDate", 35: "cookie", 36: "dnvrD", 37: "dnvrD", 38: "dnvrD", 39: "ceIER", 40: "toGMTString", 41: "jXvnT", 42: "abs", 43: "hashCode", 47: "DkDiA", 48: "btRpY", 49: "sign", 50: "href", 51: "length", 52: "OZJLY", 53: "HWzfY", 54: "btRpY", 55: "ZQRZh", 56: "rSeVr", 57: "pow", 58: "pop", 59: "ZQRZh", 60: "KEEqN", 61: "xmTXV", 62: "abs", 63: "mytJr", 64: "btRpY", 65: "hashCode", 66: "abs", 67: "xbNid", 68: "evWhs", 69: "log",
137 | 70: "tStBb", 71: "toFixed", 72: "sign", 73: "wBNtc", 74: "abs", 75: "wyibM", 76: "bSvQq", 77: "dHSnF", 78: "random", 79: "getTimezoneOffset", 80: "BzPEC", 81: "dHSnF", 82: "WYJFv", 83: "WYJFv", 84: "split", 85: "length", 86: "QTDGI", 89: "BzPEC", 90: "AceIM", 91: "wOQ", 93: "TGIHa", 94: "join", 95: "join", 96: "join", 97: "HTF", 98: "ioW", 99: "HfzNS", 100: "MIA", 101: "FNbOm", 102: "HfzNS", 103: "OCGEJ", 104: "HfzNS", 105: "aYQhD", 107: "push", 108: "length", 109: "call", 110: "call", 111: "call", 112: "split", 113: "call", 114: "WYJFv", 115: "ZmtWg", 116: "zYC", 119: "join"}
138 | _0x1bf9_origin = {ii: hex(ii) for ii in _0x1bf9_dict.keys()}
139 | _0x1bf9_replace = {ii: re.findall(
140 | r'_0x1bf9\("%s",.{7}\)' % jj, origin_str) for ii, jj in _0x1bf9_origin.items()}
141 | print(_0x1bf9_replace)
142 | for ii, jj in _0x1bf9_replace.items():
143 | for kk in jj:
144 | origin_str = origin_str.replace(
145 | kk, '"{}"'.format(_0x1bf9_dict[ii]))
146 |
147 | for ii, jj in enumerate(params):
148 | origin_str = origin_str.replace(jj, 'a{}'.format(ii))
149 | with open('{}html_js_decoder.js'.format(data_dir), 'w') as f:
150 | f.write(origin_str.replace('|||', '\n'))
151 |
152 |
153 | HOTELS_URL = 'https://hotels.ctrip.com/'
154 | HOTEL_ROOMLIST_DETAIL_URL = '%sDomestic/tool/AjaxHote1RoomListForDetai1.aspx' % HOTELS_URL
155 | OCEANBALL_URL = '{}domestic/cas/oceanball'.format(HOTELS_URL)
156 | HOTEL_DETAIL_URL = '{}hotel/%d.html'.format(HOTELS_URL)
157 | AJAX_PROMOTION_URL = '{}Domestic/Tool/AjaxGetPromotionFilterList.aspx'.format(
158 | HOTELS_URL)
159 |
160 |
161 | class HotelDetail:
162 | ''' generate params for https://hotels.ctrip.com/Domestic/tool/AjaxHote1RoomListForDetai1.aspx '''
163 |
164 | def __init__(self):
165 | self.default_hotel_id = 4889292
166 | self.header = {
167 | 'Cookie': '',
168 | 'Accept': get_accept('html'),
169 | 'Content-Type': get_content_type(),
170 | }
171 |
172 | def generate_callback(self, e):
173 | ''' generate callback params e '''
174 | cl = [chr(ii) for ii in range(65, 123) if ii > 96 or ii < 91]
175 | o = ''.join(["CAS", *[cl[ii] for ii in np.random.randint(0, 51, e)]])
176 | return o
177 |
178 | def generate_eleven_v2(self, hotel_id: int):
179 | ################################################################
180 | #
181 | # [generate eleven] version 19.7.28(Test ✔️) write by gunjianpan
182 | #
183 | # 1. random generate 15 bit param `callback`;
184 | # 2. use callback request OCEANBALL -> get origin js;
185 | # 3. decoder params to union param;
186 | # 4. find where the code eval;
187 | # 'h=a3.pop(),i=a11(h);return a18(i.apply(h.o,g),ud,ud,0),'
188 | # 5. compare the env of chrome with node.
189 | # 'https://github.com/iofu728/spider/tree/develop#oceannballv2'
190 | # 5. you will get `爬虫前进的道路上还是会有各种各样想不到的事情会发生`
191 | # 6. final, return, and joint params;
192 | #
193 | ################################################################
194 |
195 | referer_url = HOTEL_DETAIL_URL % hotel_id
196 | self.header['Referer'] = referer_url
197 | callback = self.generate_callback(15)
198 | now_time = int(time.time() * 1000)
199 | url = f'{OCEANBALL_URL}?callback={callback}&_={now_time}'
200 | oj, cookie = basic_req(url, 3, need_cookie=True, header=self.header)
201 | print(cookie)
202 | oj = replace_params(oj, '(_\w{3,7}_\w{5,6})')
203 | oj = oj.replace('"this"', 'this').replace('\'', '"').replace('\n', '')
204 | ooj = oj
205 |
206 | ''' replace window '''
207 | oj = oj.replace('Window', 'window')
208 | oj = oj.replace('window', 'windows')
209 |
210 | ''' return answer '''
211 | echo(0, 'Num of a6[h][i]', oj.count('a19[0])}}return a18(a6[h][i]'))
212 | echo(0, 'Num 0f end', oj.count('});; })();'))
213 | oj = oj.replace('});; })();', '});;return aa;})();')
214 | ooj = ooj.replace('});; })();', '});;return aa;})();')
215 |
216 | ''' windows setting '''
217 | windows_str = 'function(){ var windows = {"navigator":{"userAgent":"Mozilla/5.0"}};aa=[];windows["' + \
218 | callback + \
219 | '"] = function(e) {temp = e();console.log(temp);return temp};'
220 | oj = oj.replace('function(){ ', windows_str)
221 |
222 | oj = "function aabb(){tt=" + oj + ";return tt;}"
223 |
224 | ''' replace param undefine replace'''
225 | oj = oj.replace('env.define;', 'windows.define;')
226 | oj = oj.replace('env.module;', 'windows.module;')
227 | oj = oj.replace('env.global;', 'windows.global;')
228 | oj = oj.replace('env.require;', 'windows.require;')
229 | oj = oj.replace('env.', '')
230 |
231 | ''' synchronous node & chrome v8 param'''
232 | oj = oj.replace(
233 | 'var a2=', 'require=undefined;module=undefined;global=undefined;var a2=')
234 | oj = oj.replace('process:process,', 'process:NO,')
235 | oj = oj.replace('process,', 'NO, ')
236 | oj = oj.replace(
237 | 'return a19[p];', 'var last = a19[p];if (last.k == 0 && last.o == 0 && last.r == 0 && last.v != 0) {last.v = TypeError();}return last;')
238 |
239 | oj = oj.replace('h=a3.pop(),i=a11(h);return a18(i.apply(h.o,g),ud,ud,0),',
240 | 'h=a3.pop(),i=a11(h);var test = h.k!="getOwnPropertyNames" ? i.apply(h.o,g) :[];if(h.o=="function tostring() { [python code] }"){test=23};if(g=="object window"){test=21};if(h.k=="keys"){test=["TEMPORARY", "PERSISTENT"];}aa=test;return a18(test, ud, ud, 0),')
241 |
242 | ''' eval script '''
243 | eleven = js2py.eval_js(oj + ';aabb()')
244 | echo(1, 'eleven', eleven)
245 | return eleven
246 |
247 | def generate_eleven(self, hotel_id: int):
248 | ################################################################
249 | #
250 | # [generate eleven] version 19.4.21(Test ✔️) write by gunjianpan
251 | #
252 | # 1. random generate 15 bit param `callback`;
253 | # 2. use callback request OCEANBALL -> get origin js;
254 | # 3. eval once -> (match array, and then chr() it) -> decoder js;
255 | # 4. replace document and windows(you also can use execjs & jsdom);
256 | # 5. warning you should replace `this` to some params,
257 | # Otherwise, you will get `老板给小三买了包, 却没有给你钱买房`
258 | # 6. final, return, and joint params;
259 | #
260 | ################################################################
261 |
262 | callback = self.generate_callback(15)
263 | now_time = int(time.time() * 1000)
264 | url = '{}?callback={}&_={}'.format(OCEANBALL_URL, callback, now_time)
265 | referer_url = HOTEL_DETAIL_URL % hotel_id
266 | changeHeaders(
267 | {'Referer': referer_url, 'if-modified-since': 'Thu, 01 Jan 1970 00:00:00 GMT'})
268 | oceanball_js, cookie = basic_req(url, 3, need_cookie=True)
269 | print(cookie)
270 |
271 | array = re.findall(r'\(\[(.*)\],', oceanball_js)[0].split(',')
272 | array = [int(ii) for ii in array]
273 | offset = int(re.findall(r'item-(\d*?)\)', oceanball_js)[0])
274 |
275 | ''' String.fromCharCode '''
276 | oe = ''.join([chr(ii - offset) for ii in array])
277 |
278 | ''' replace window[callback] callback function '''
279 | replace_str = re.findall(r'{}\(new.*\)\);'.format(callback), oe)[0]
280 | eleven_params = re.findall(
281 | r'{}\(new.*\+ (.*?) \+.*\)\);'.format(callback), oe)[0]
282 | replaced_str = 'return {};'.format(eleven_params)
283 | oe = oe.replace(replace_str, replaced_str)
284 | oe = oe.replace('\'', '"').replace('\r', '')
285 | oe = oe.replace(';!', 'let aaa = ', 1)
286 |
287 | replace = '''
288 | function(){let href="https://hotels.ctrip.com/hotel/%d.html";
289 | a={"documentElement": {"attributes":{}}};
290 | b={};
291 | function c(){};
292 | userAgent ="Chrome/73.0.3682.0";
293 | geolocation = 1;
294 | ''' % hotel_id
295 |
296 | ''' replace document & windown & navigator '''
297 | oe = oe.replace('document.body.innerHTML.length', '888').replace(
298 | 'document.body.innerHTML', '""')
299 | oe = oe.replace('document.createElement("div")', '{}')
300 | oe = oe.replace('window.HTMLSpanElement', 'c').replace(
301 | 'document.createElement("span")', 'new c')
302 | oe = oe.replace('window.location.href', 'href').replace(
303 | 'location.href', 'href')
304 | oe = oe.replace('navigator.', '')
305 | oe = oe.replace('new Image().', '').replace('new Image();', '')
306 | oe = oe.replace('document.all', '0').replace('document.referrer', '""')
307 | oe = oe.replace('this || ', '')
308 | oe = oe.replace('window["document"]', 'a')
309 |
310 | oe = oe.replace('document', 'a').replace('window', 'b')
311 | oe = oe.replace('function(){', replace, 1)
312 |
313 | ''' eval script '''
314 | eleven = js2py.eval_js(oe)
315 | echo(1, 'eleven', eleven)
316 |
317 | return eleven
318 |
319 | def generate_other_params(self, hotel_id: int = 4889292, city_id: int = 2,
320 | startDate: str = time_str(-1, '%Y-%m-%d'),
321 | depDate: str = time_str(int(time.time() + one_day), '%Y-%m-%d')):
322 | ''' generate other params '''
323 | params = {
324 | 'psid': None,
325 | 'MasterHotelID': hotel_id,
326 | 'hotel': hotel_id,
327 | 'EDM': 'F',
328 | 'roomId': None,
329 | 'IncludeRoom': None,
330 | 'city': city_id,
331 | 'showspothotel': 'T',
332 | 'supplier': None,
333 | 'IsDecoupleSpotHotelAndGroup': 'F',
334 | 'contrast': 0,
335 | 'brand': 776,
336 | 'startDate': startDate,
337 | 'depDate': depDate,
338 | 'IsFlash': 'F',
339 | 'RequestTravelMoney': 'F',
340 | 'hsids': None,
341 | 'IsJustConfirm': None,
342 | 'contyped': 0,
343 | 'priceInfo': -1,
344 | 'equip': None,
345 | 'filter': None,
346 | 'productcode': None,
347 | 'couponList': None,
348 | 'abForHuaZhu': None,
349 | 'defaultLoad': 'T',
350 | 'esfiltertag': None,
351 | 'estagid': None,
352 | 'Currency': None,
353 | 'Exchange': None,
354 | 'minRoomId': 0,
355 | 'maskDiscount': 0,
356 | 'TmFromList': 'F',
357 | 'th': 119,
358 | 'RoomGuestCount': '1,1,0',
359 | 'promotionf': None,
360 | 'allpoint': None,
361 | }
362 | return params
363 |
364 | def get_hotel_detail(self, hotel_id: int):
365 | ''' get hotel detail '''
366 | params = {
367 | **self.generate_other_params(hotel_id),
368 | 'eleven': self.generate_eleven_v2(hotel_id),
369 | 'callback': self.generate_callback(16),
370 | '_': int(time.time() * 1000)
371 | }
372 | params_list = ['{}={}'.format(
373 | ii, (jj if not jj is None else '')) for ii, jj in params.items()]
374 | url = '{}?{}'.format(HOTEL_ROOMLIST_DETAIL_URL, '&'.join(params_list))
375 | echo(2, 'XHR url', url)
376 | req, _ = basic_req(url, 1, need_cookie=True, header=self.header)
377 | return req
378 |
379 | def parse_detail(self, hotel_id: int = 4889292):
380 | ''' parse hotel detail '''
381 |
382 | version = begin_time()
383 | # self.user_action(hotel_id)
384 | # self.generate_cookie(hotel_id)
385 | # self.prepare_req()
386 | text = self.get_hotel_detail(hotel_id)
387 | html = BeautifulSoup(text['html'], 'html.parser')
388 | trs = html.findAll('tr')[2:]
389 | hotel_detail = []
390 |
391 | for tr in trs:
392 | room_name = re.findall('baseroomname="(.*?)"', str(tr))
393 | if not len(room_name):
394 | room_name = re.findall('l="nofollow">\n(.*?)\n', str(tr))
395 | room_name = room_name[0].strip() if len(
396 | room_name) else (hotel_detail[-1][0] if len(hotel_detail) else '')
397 | price = re.findall(r'(\d{4,5}?)', str(tr))
398 | if not len(price):
399 | continue
400 | sales_price_list = re.findall(r'促销优惠减(.*?)', str(tr))
401 | sales_price = sales_price_list[0] if len(sales_price_list) else ''
402 | price_type = re.findall('room_type_name">(.*?)', str(tr))[0]
403 | if 'em' in price_type:
404 | price_type = ','.join([*re.findall(
405 |                 '(.*?)', str(tr))])
423 |     def decoder_cookie(self, cookie: str) -> dict:
424 | return {ii.split('=', 1)[0]: ii.split('=', 1)[1] for ii in cookie.split('; ')}
425 |
426 | def encoder_cookie(self, cookie_dict: {}) -> str:
427 | return '; '.join(['{}={}'.format(ii, jj)for ii, jj in cookie_dict.items()])
428 |
429 | def get_timezone_offset(self):
430 | local_tz = tzlocal.get_localzone()
431 | return -int(local_tz.utcoffset(datetime.datetime.today()).total_seconds() / 60)
432 |
433 | def a312(self, a312_value):
434 | a323_list = [0, 36, 5, 5, 5, 5, 137, 137, 36, 171]
435 | a199 = 0 if a312_value > len(a323_list) - 1 else a323_list[a312_value]
436 | return '{}{}'.format('0' if a199 < 16 else '', str(hex(a199)).split('0x', 1)[1])
437 |
438 | def generate_v1(self, time_stamp: int = 0):
439 | a241, a166, a144 = self.get_timezone_offset(), int(time.time() * 1000), 10
440 | a166 += sum([np.int32((int('0x2ede', 16) + ii) * a241)
441 | for ii in range(6)])
442 | a166 = a166 if not time_stamp else time_stamp
443 | a33 = [int(ii) for ii in list(str(a166))]
444 | for ii in range(len(a33)):
445 | a33[ii] ^= a144
446 | a144 = a33[ii]
447 |
448 | a34 = [int(ii) for ii in list(str(a166))]
449 | a167 = [a34[len(a34) - ii - 1] for ii, _ in enumerate(a34)]
450 | a13 = [0x3, 0x1, 0x2, 0x6, 0xb, 0x5, 0xa, 0x4, 0x8, 0x0, 0x9, 0x7, 0xc]
451 | a217 = [self.a312(a167[ii if ii > len(a167) else a13[ii]])
452 | for ii in range(len(a167))]
453 | cookie = {'htltmp': ''.join(
454 | [hex(ii)[-1] for ii in a33]), 'utc': str(a166), 'htlstmp': ''.join(a217), 'MKT_Pagesource': 'PC'}
455 | return cookie
456 |
457 | def login_cookie(self):
458 | if not os.path.exists(cookie_path):
459 | shutil.copy(cookie_path + '.tmp', cookie_path)
460 | with open(cookie_path) as f:
461 | cookie = self.decoder_cookie(f.read().strip())
462 | return cookie
463 |
464 | def user_action(self, hotel_id: int = 4889292):
465 |
466 | url = '{}hotel/{}.html'.format(HOTELS_URL, hotel_id)
467 | text = basic_req(url, 3)
468 | page_id = int(re.findall(r'id="page_id" value="(\d*?)" />', text)[0])
469 | correlation_id = re.findall(r'relationId" value="(\d*?)"/>', text)[0]
470 |
471 | e = self.login_cookie()['_bfa'].split('.')
472 | common = [page_id, e[1] + '.' + e[2], int(e[6]), int(e[7]), correlation_id,
473 | "M:70,181023_hod_fxtj:B;", '', '2.6.9', "vq5tkk-ufpyck-qsxbg3", "", "", "", "", "", "online"]
474 | _queue = [{
475 | 'action': 'click',
476 | 'xpath': "HTML/BODY[@id='mainbody']/FORM[@id='aspnetForm']/DIV[3][@id='base_bd']/DIV[4]/DIV[@id='divDetailMain']/DIV[9][@id='id_room_select_box']/DIV[2]/DIV/DIV/A[@id='changeBtn'][@x='{}'][@y='{}'][@rx='{}'][@ry='{}']".format(random.randint(50, 80), random.randint(650, 750), random.randint(20, 40), random.randint(5, 20)),
477 | 'ts': int(time.time() * 1000),
478 | }]
479 | ee = [[2, "useraction"], common, _queue]
480 | eee = json.dumps(ee, separators=(',', ':'))
481 | print(eee)
482 | compress = execjs.compile(open(compress_path).read())
483 | eeee = compress.call('compress', eee)
484 | echo(2, eeee)
485 | cookie = {'uid': 'Yn17vOkRm2gW+jCNwT8jPg=='}
486 | header = {
487 | 'Referer': 'https://hotels.ctrip.com/hotel/4889292.html',
488 | 'Cookie': self.encoder_cookie(cookie)
489 | }
490 | url = 'https://s.c-ctrip.com/bf.gif?ac=a&d={}&jv=1.0.0'.format(eeee)
491 | req = basic_req(url, 2, header=header)
492 | echo(0, req.cookies.get_dict())
493 |
494 | def prepare_req(self, hotel_id: int = 4889292, city_id: int = 2,
495 | startDate: str = time_str(-1, '%Y-%m-%d'),
496 | depDate: str = time_str(int(time.time() + one_day), '%Y-%m-%d')):
497 | referer_url = HOTEL_DETAIL_URL % hotel_id
498 |
499 | changeHeaders({'Referer': referer_url})
500 | data = {'city': city_id, 'checkin': startDate,
501 | 'cjeckout': depDate, 'defalutVal': None}
502 | return basic_req(AJAX_PROMOTION_URL, 11, data=data)
503 |
504 |
505 | if __name__ == '__main__':
506 | if not os.path.exists(data_dir):
507 | os.makedirs(data_dir)
508 | ch = HotelDetail()
509 | ch.parse_detail()
510 |
--------------------------------------------------------------------------------
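`decoder_cookie` / `encoder_cookie` in hotelDetail.py are just the two directions of the `k=v; k=v` Cookie-header format. A tiny round-trip check (the cookie string is made up):

```python
def decoder_cookie(cookie: str) -> dict:
    return {ii.split('=', 1)[0]: ii.split('=', 1)[1] for ii in cookie.split('; ')}


def encoder_cookie(cookie_dict: dict) -> str:
    return '; '.join('{}={}'.format(k, v) for k, v in cookie_dict.items())


raw = 'MKT_Pagesource=PC; _bfa=1.abc.2.3'   # hypothetical cookie header
assert encoder_cookie(decoder_cookie(raw)) == raw
```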
/dytt8/dytt8.py:
--------------------------------------------------------------------------------
1 | '''
2 | @Author: gunjianpan
3 | @Date: 2019-04-20 15:04:03
4 | @Last Modified by: gunjianpan
5 | @Last Modified time: 2019-04-21 21:37:37
6 | '''
7 |
8 | import os
9 | import re
10 | import sys
11 | import threading
12 |
13 | sys.path.append(os.getcwd())
14 | from proxy.getproxy import GetFreeProxy
15 | from util.util import (begin_time, can_retry, echo, end_time,
16 | shuffle_batch_run_thread)
17 |
18 | proxy_req = GetFreeProxy().proxy_req
19 | HOMEPAGE_URL = 'https://www.dytt8.net'
20 | movie_list, movie_another, movie_again = [], [], []
21 |
22 |
23 | def load_index():
24 | ''' load index '''
25 | global movie_list
26 | version = begin_time()
27 | text = proxy_req(HOMEPAGE_URL, 3)
28 | if not len(text):
29 | if can_retry(HOMEPAGE_URL):
30 | load_index()
31 | return
32 | movie_list = re.findall('《(.*?)》', text)
33 | movie_more = re.findall('href="(.*?)">更多', text)
34 | for uri in movie_more:
35 | load_other(uri)
36 |
37 | threading_list = [threading.Thread(
38 | target=load_other, args=(ii,)) for ii in movie_another]
39 | shuffle_batch_run_thread(threading_list, 100)
40 | threading_list = [threading.Thread(
41 | target=load_other, args=(ii,)) for ii in movie_again]
42 | shuffle_batch_run_thread(threading_list, 100)
43 | # deduplicate the movie list
44 | movie_list = set(movie_list)
45 | # export the crawled movie list
46 | out_path = 'dytt8_result.txt'
47 | with open(out_path, 'w') as f:
48 | f.write('\n'.join(movie_list))
49 | url_num = len([*movie_more, *movie_another]) + 1
50 | movie_num = len(movie_list)
51 | echo(1, 'Requests num: {}\nMovie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
52 | url_num, movie_num, out_path, end_time(version, 0)))
53 |
54 |
55 | def load_other(uri):
56 | ''' load other '''
57 | global movie_list, movie_another, movie_again
58 | url = HOMEPAGE_URL + uri if not 'http' in uri else uri
59 | text = proxy_req(url, 3)
60 | temp_list = re.findall('《(.*?)》', text)
61 | echo(2, 'loading', url, 'movie num:', len(temp_list))
62 |
63 | if text == '' or not len(temp_list):
64 | if can_retry(url):
65 | load_other(uri)
66 | else:
67 | movie_again.append(url)
68 | return
69 | if 'index' in url and '共' in text:
70 | total_page = re.findall('共(.*?)页', text)[0]
71 | suffix_str = re.findall(r"value=\'(.*?)1.html\' selected", text)[0]
72 | more_movie = [url.replace('index.html', '{}{}.html'.format(
73 | suffix_str, ii)) for ii in range(2, int(total_page) + 1)]
74 | else:
75 | more_movie = []
76 | movie_list += temp_list
77 | movie_another += more_movie
78 |
79 |
80 | if __name__ == '__main__':
81 | load_index()
82 |
--------------------------------------------------------------------------------
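Both `load_index` and `load_other` above lean on the same retry idiom: an empty response asks `can_retry` (see `util/util.py`), which keeps a per-URL failure counter and allows a few retries (`time=3` by default) before giving up. A standalone sketch of that idiom; the wrapper name `fetch_with_retry` is not part of the repo.

```python
import os, sys

sys.path.append(os.getcwd())
from proxy.getproxy import GetFreeProxy
from util.util import can_retry

proxy_req = GetFreeProxy().proxy_req

def fetch_with_retry(url: str) -> str:
    text = proxy_req(url, 3)        # type 3 -> raw text
    if not text:
        if can_retry(url):          # bumps the per-url failure counter
            return fetch_with_retry(url)
        return ''                   # retry budget exhausted
    return text
```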
/eastmoney/eastmoney.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-03-29 10:35:27
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2019-03-29 12:38:54
6 |
7 | import codecs
8 | import json
9 | import os
10 | import pickle
11 | import requests
12 | import time
13 |
14 | from fontTools.ttLib import TTFont
15 |
16 | """
17 | * data.eastmoney.com/bbsj/201806/lrb.html
18 | .data/
19 | ├── base.pkl // base_unicode list
20 | ├── base.woff // base font file (autoload)
21 | ├── eastmony%Y-%m-%d_%H:%M:%S.csv // result .csv
22 | └── font.woff // last time font file
23 | """
24 | data_dir = 'eastmoney/data/'
25 | base_dir = '%sbase.' % data_dir
26 | base_pkl = '%spkl' % base_dir
27 | base_font = '%swoff' % base_dir
28 | url = 'http://data.eastmoney.com/bbsj/201806/lrb.html'
29 |
30 | header = {
31 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
32 | 'Accept-Encoding': '',
33 | 'Accept-Language': 'zh-CN,zh;q=0.9',
34 | 'Cache-Control': 'no-cache',
35 | 'Connection': 'keep-alive',
36 | 'Host': 'data.eastmoney.com',
37 | 'Pragma': 'no-cache',
38 | 'Upgrade-Insecure-Requests': '1',
39 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3747.0 Safari/537.36'
40 | }
41 |
42 |
43 | def analysis_font(font_url: str, mode=None) -> dict:
44 | ''' analysis font '''
45 | if (not os.path.exists(base_font) or not os.path.exists(base_pkl)) and not mode:
46 | print('base file not exist!!!')
47 | return
48 |
49 | suffix = font_url.split('.')[-1]
50 | font = requests.get(font_url, headers=header, timeout=30)
51 | font_name = '%sfont.%s' % (data_dir, suffix)
52 | with codecs.open(font_name, 'wb') as f:
53 | f.write(font.content)
54 | font_map = TTFont(font_name).getBestCmap()
55 | ''' prepare base '''
56 | if not mode is None:
57 | char_list = [hex(ii).upper().replace('0X', '') +
58 | ';' for ii in font_map.keys()]
59 | base_unicode = [
60 | int(mode[ii]) if ii in mode else '.' for ii in char_list]
61 | pickle.dump(base_unicode, codecs.open(base_pkl, 'wb'))
62 | with codecs.open(base_font, 'wb') as f:
63 | f.write(font.content)
64 | return {}
65 |
66 | base_unicode = pickle.load(open(base_pkl, 'rb'))
67 |
68 | base_map = TTFont(base_font).getBestCmap()
69 | font_dict = {jj: base_unicode[ii]
70 | for ii, jj in enumerate(base_map.values())}
71 | num_dict = {hex(ii).upper().replace('0X', '') + ';': str(font_dict[jj])
72 | for ii, jj in font_map.items()}
73 | return num_dict
74 |
75 |
76 | def load_eastmoney():
77 | ''' load detail from eastmoney '''
78 | if not os.path.exists(data_dir):
79 | os.makedirs(data_dir)
80 | req = requests.get(url, headers=header, timeout=30)
81 | origin_str = req.text
82 |
83 | ''' parse json '''
84 | begin_index = origin_str.index('defjson')
85 | end_index = origin_str.index(']}},\r\n')
86 | json_str = origin_str[begin_index + 9:end_index + 3]
87 | json_str = json_str.replace('data:', '"data":').replace(
88 | 'pages:', '"pages":').replace('font:', '"font":')
89 | json_req = json.loads(json_str)
90 | font_url = json_req['font']['WoffUrl']
91 |
92 | ''' prepare base '''
93 | if not os.path.exists(base_pkl) or not os.path.exists(base_font):
94 | print('Prepare base<<<<<<<')
95 | font_map = json_req['font']['FontMapping']
96 | font_map = {ii['code']: str(ii['value']) for ii in font_map}
97 | analysis_font(font_url, font_map)
98 |
99 | ''' load font '''
100 | font_map = analysis_font(font_url)
101 | origin_data = json.dumps(json_req['data'])
102 |
103 | ''' load data '''
104 | for ii, jj in font_map.items():
105 | origin_data = origin_data.replace(ii, jj)
106 | replace_data = json.loads(origin_data)
107 | need_info = ['scode', 'sname', 'parentnetprofit', 'sjltz', 'totaloperatereve', 'tystz', 'operateexp',
108 | 'saleexp', 'manageexp', 'financeexp', 'totaloperateexp', 'operateprofit', 'sumprofit', 'noticedate']
109 | data = [ii[jj] for ii in replace_data for jj in need_info]
110 | result_data = [','.join(data[ii * 14:(ii + 1) * 14])
111 | for ii in range(len(replace_data))]
112 |
113 | ''' store data '''
114 | now_time = time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime(time.time()))
115 | print(now_time, 'eastmoney data load Success!!!')
116 | with codecs.open('%seastmony%s.csv' % (data_dir, now_time), 'w', encoding='utf-8') as f:
117 | f.write('\n'.join(result_data))
118 |
119 |
120 | if __name__ == '__main__':
121 | if not os.path.exists(data_dir):
122 | os.makedirs(data_dir)
123 | load_eastmoney()
124 |
--------------------------------------------------------------------------------
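The font trick in `eastmoney.py` above decodes the obfuscated digits by chaining two maps: the digit list recorded once for the base font (`base.pkl`) and the cmap of each freshly downloaded font. A toy illustration of that chaining, with made-up codepoints and glyph names standing in for what `TTFont(...).getBestCmap()` returns:

```python
# digits recorded for the base font, in the order of its cmap values
base_unicode = [3, 7, 1]
base_cmap = {0xE001: 'g1', 0xE002: 'g2', 0xE003: 'g3'}   # base font: codepoint -> glyph
new_cmap = {0xE101: 'g2', 0xE102: 'g1', 0xE103: 'g3'}    # new font: codepoint -> glyph

# glyph name -> real digit, then new codepoint (as 'XXXX;') -> real digit
glyph_to_digit = {name: base_unicode[i] for i, name in enumerate(base_cmap.values())}
num_dict = {hex(cp).upper().replace('0X', '') + ';': str(glyph_to_digit[name])
            for cp, name in new_cmap.items()}
print(num_dict)  # {'E101;': '7', 'E102;': '3', 'E103;': '1'}
```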
/exam/shaoq.js:
--------------------------------------------------------------------------------
1 | const jsdom = require('jsdom');
2 | const {
3 | JSDOM
4 | } = jsdom;
5 |
6 | function get_css(html) {
7 | const dom = new JSDOM(html);
8 | window = dom.window;
9 | document = window.document;
10 | window.decodeURIComponent = decodeURIComponent;
11 |
12 | const script_element = document.querySelector('script');
13 | const script = script_element.innerHTML;
14 | eval(script);
15 | return window.document.querySelector('style').sheet.toString();
16 | }
--------------------------------------------------------------------------------
/exam/shaoq.py:
--------------------------------------------------------------------------------
1 | '''
2 | @Author: gunjianpan
3 | @Date: 2019-03-21 17:34:15
4 | @Last Modified by: gunjianpan
5 | @Last Modified time: 2019-04-18 19:33:44
6 | '''
7 |
8 | import execjs
9 | import requests
10 | import time
11 | import re
12 | import threading
13 |
14 | from bs4 import BeautifulSoup
15 |
16 | """
17 | * shaoq @http
18 | * shaoq.com:7777
19 | (standalone spider; does not use basic_req)
20 | """
21 |
22 |
23 | class Shaoq(object):
24 | """
25 | shao q exam
26 | """
27 |
28 | def __init__(self):
29 | self.test = 0
30 |
31 | def test_req(self):
32 | basic_url = 'http://shaoq.com:7777/'
33 | url = '%sexam' % basic_url
34 | headers = {
35 | 'pragma': 'no-cache',
36 | 'cache-control': 'no-cache',
37 | 'Host': 'shaoq.com:7777',
38 | 'Referer': 'http://shaoq.com:7777/exam',
39 | 'Cookie': '',
40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
41 | "Accept-Encoding": "",
42 | "Accept-Language": "zh-CN,zh;q=0.9",
43 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36",
44 | }
45 |
46 | '''get cookie'''
47 | first_req = requests.get(url, headers=headers, verify=False)
48 | cookies_map = first_req.cookies.get_dict()
49 | cookies_list = ['%s=%s' % (ii, jj)for ii, jj in cookies_map.items()]
50 | self.cookie = '; '.join(cookies_list)
51 | headers['Cookie'] = self.cookie
52 |
53 | ''' load img '''
54 | html = BeautifulSoup(first_req.text, 'html.parser')
55 | img_list = re.findall('
dict:
92 | return dict(re.findall(r'\.(.+)::before {content: "(.+)";}', css_result))
93 |
94 |
95 | if __name__ == '__main__':
96 | es = Shaoq()
97 | es.test_req()
98 |
--------------------------------------------------------------------------------
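The last visible step of `exam/shaoq.py` above is the CSS parser: the exam page hides the real characters behind `::before` rules, and the regex turns the generated stylesheet (cf. `exam/shaoq.js`) into a class-to-character table. The same regex on a hand-written stylesheet:

```python
import re

css_result = '.abc::before {content: "3";}\n.xyz::before {content: "7";}'
mapping = dict(re.findall(r'\.(.+)::before {content: "(.+)";}', css_result))
print(mapping)  # {'abc': '3', 'xyz': '7'}
```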
/mafengwo/hotel.js:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: gunjianpan
3 | * @Date: 2019-04-18 19:23:33
4 | * @Last Modified by: gunjianpan
5 | * @Last Modified time: 2019-04-22 10:31:47
6 | */
7 |
8 | const jsdom = require('jsdom');
9 | const {
10 | JSDOM
11 | } = jsdom;
12 |
13 | function analysis_js(html, salt, prepare_map) {
14 | const dom = new JSDOM(html);
15 | window = dom.window;
16 | document = window.document;
17 | window.decodeURIComponent = decodeURIComponent;
18 |
19 | const script_element = document.querySelector('script');
20 | console.log(script_element);
21 | const script = script_element.innerHTML;
22 | eval(script);
23 | return window['SparkMD5']['hash'](JSON['stringify'](prepare_map) + salt)['slice'](2, 12);
24 | }
--------------------------------------------------------------------------------
/mafengwo/mafengwo.py:
--------------------------------------------------------------------------------
1 | '''
2 | @Author: gunjianpan
3 | @Date: 2019-04-16 16:50:45
4 | @Last Modified by: gunjianpan
5 | @Last Modified time: 2019-04-20 01:33:26
6 | '''
7 | import codecs
8 | import execjs
9 | import numpy as np
10 | import os
11 | import re
12 | import time
13 | import threading
14 |
15 | from bs4 import BeautifulSoup
16 | from proxy.getproxy import GetFreeProxy
17 | from util.util import basic_req, echo, time_str, can_retry, begin_time, end_time, shuffle_batch_run_thread
18 |
19 | data_dir = 'mafengwo/data/'
20 | hotel_js_path = 'mafengwo/hotel.js'
21 | decoder_js_path = '{}decoder.js'.format(data_dir)
22 | origin_js_path = '{}origin.js'.format(data_dir)
23 | proxy_req = GetFreeProxy().proxy_req
24 |
25 |
26 | class Mafengwo:
27 | ''' some js confusion applications in mafengwo '''
28 |
29 | JD_URL = 'http://www.mafengwo.cn/jd/10186/gonglve.html'
30 | AJAX_ROUTER_URL = 'http://www.mafengwo.cn/ajax/router.php'
31 | MDD_URL = 'http://www.mafengwo.cn/mdd/'
32 |
33 | def __init__(self):
34 | self.spot_result = {}
35 | self.spot_pn = {}
36 | self.prepare_js()
37 |
38 | def decode_js_test(self):
39 | ''' decode js for test '''
40 | with open(decoder_js_path, 'r') as f:
41 | decoder_js = [codecs.unicode_escape_decode(
42 | ii.strip())[0] for ii in f.readlines()]
43 | __Ox2133f = [ii.strip()
44 | for ii in decoder_js[4][17:-2].replace('\"', '\'').split(',')]
45 | decoder_str = '|||'.join(decoder_js)
46 | params = re.findall(r'(\_0x\w{6,8}?)=|,|\)', decoder_str)
47 | params = sorted(list(set([ii for ii in params if len(
48 | ii) > 6])), key=lambda ii: len(ii), reverse=True)
49 | for ii, jj in enumerate(__Ox2133f):
50 | decoder_str = decoder_str.replace('__Ox2133f[{}]'.format(ii), jj)
51 | for ii, jj in enumerate(params):
52 | decoder_str = decoder_str.replace(jj, 'a{}'.format(ii))
53 | decoder_js = decoder_str.split('|||')
54 | with open(origin_js_path, 'w') as f:
55 | f.write('\n'.join(decoder_js))
56 | return decoder_js
57 |
58 | def prepare_js(self):
59 | ''' prepare js '''
60 | pre_text = basic_req(self.JD_URL, 3)
61 | INDEX_JS_URL = re.findall(
62 | r'src=.*index\.js.*" t', pre_text)[0].split('"')[1]
63 | origin_js = basic_req(INDEX_JS_URL, 3)
64 |
65 | ''' decoder js '''
66 | decode_js = codecs.unicode_escape_decode(origin_js)[0]
67 |
68 | ''' params replace '''
69 | replace_list_str = decode_js.split(';')[2]
70 | empty_index = replace_list_str.index(' ') + 1
71 | begin_index = replace_list_str.index('=[') + 2
72 | end_index = replace_list_str.index(']')
73 | replace_list = replace_list_str[begin_index:end_index].split(',')
74 | rp = replace_list_str[empty_index:begin_index - 2]
75 | for ii, jj in enumerate(replace_list):
76 | decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
77 | self.slat = replace_list[46].replace('"', '')
78 | echo(2, 'salt', self.slat)
79 |
80 | ''' load to local '''
81 | with open(decoder_js_path, 'w') as f:
82 | f.write(';\n'.join(decode_js.split(';')))
83 |
84 | ''' del function about ajax '''
85 | del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
86 | del_begin_index = decode_js.index(del_str[0])
87 |
88 | result_js = decode_js[:del_begin_index] + \
89 | decode_js[del_begin_index + len(del_str[0]):]
90 |
91 | result_js = decode_js[:del_begin_index] + \
92 | decode_js[del_begin_index + len(del_str[0]):]
93 | self.result_js = result_js
94 | self.js_compile = execjs.compile(open(hotel_js_path).read())
95 | echo(1, 'Load hotel index js success!!!')
96 |
97 | def js_compile_sn(self, prepare_map):
98 | ''' js compile sn '''
99 | wait_js = ''
100 | sn = self.js_compile.call(
101 | 'analysis_js', wait_js, self.slat, prepare_map)
102 | echo(2, '_sn', sn)
103 | return sn
104 |
105 | def load_sn(self, data: dict, now_time=0) -> dict:
106 | ''' load sn '''
107 |
108 | if not now_time:
109 | now_time = int(time.time() * 1000)
110 | prepare_map = {**data, '_ts': now_time}
111 |
112 | ''' _0xe7fex37 sorted & str num '''
113 | prepare_map = {ii: str(prepare_map[ii]) for ii in sorted(prepare_map)}
114 |
115 | ''' js compile sn '''
116 | sn = self.js_compile_sn(prepare_map)
117 |
118 | data = {
119 | **data,
120 | '_sn': sn,
121 | '_ts': now_time
122 | }
123 | return data
124 |
125 | def load_spot_once(self, pn=1, city_id=10186):
126 | ''' load spot once '''
127 | data = {
128 | 'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
129 | 'iMddid': city_id,
130 | 'iTagId': 0,
131 | 'iPage': pn,
132 | }
133 | data = self.load_sn(data)
134 | print(data)
135 | req = proxy_req(self.AJAX_ROUTER_URL, 11, data=data)
136 | if req is None or not 'data' in req or not 'list' in req['data']:
137 | if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)):
138 | self.load_spot_once(pn, city_id)
139 | return
140 | spot_list = req['data']['list']
141 | spot_pn = req['data']['page']
142 | spot_tmp = re.findall('.*?(.*?)
', spot_list)
143 | try:
144 | total_pn = int(re.findall('共(.*?)', spot_pn)[0])
145 | except Exception as e:
146 | total_pn = 1
147 | echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e)
148 |
149 | if city_id not in self.spot_result:
150 | self.spot_result[city_id] = spot_tmp
151 | else:
152 | self.spot_result[city_id] += spot_tmp
153 | self.spot_pn[city_id] = total_pn
154 |
155 | def load_spot(self, batch_size=50):
156 | ''' load spot '''
157 | version = begin_time()
158 | self.load_city_list()
159 | # self.city_list = [10186]
160 | city_threading = [threading.Thread(
161 | target=self.load_spot_once, args=(1, ii,))for ii in self.city_list]
162 | shuffle_batch_run_thread(city_threading, 150)
163 |
164 | spot_continue = []
165 | for ii, jj in self.spot_pn.items():
166 | spot_continue += [threading.Thread(
167 | target=self.load_spot_once, args=(pn, ii,)) for pn in range(2, jj + 1)]
168 |
169 | shuffle_batch_run_thread(spot_continue, 150)
170 | output = ['{},{}'.format(self.id2map[ii], ','.join(jj))
171 | for ii, jj in self.spot_result.items()]
172 | output_path = '{}spot.txt'.format(data_dir)
173 | with open(output_path, 'w') as f:
174 | f.write('\n'.join(output))
175 | city_num = len(self.city_list)
176 | spot_num = sum([len(ii) for ii in self.spot_result.values()])
177 | echo(1, 'City num: {}\nSpot num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
178 | city_num, spot_num, output_path, end_time(version, 0)))
179 |
180 | def load_city_list(self):
181 | ''' load city list '''
182 | text = basic_req(self.MDD_URL, 3)
183 | city_list = re.findall(
184 | '/travel-scenic-spot/mafengwo/(.*?).html" target="_blank">(.*?)(| 1 else seg.cut(
239 | index[3])[0] + seg.cut(index[3])[1]: int(index[1]) for index in city if index[3][-1:] == '州'}
240 | seg = pkuseg.pkuseg(model_name='../Model_retrieval/pkuseg')
241 | city_state1 = {seg.cut(index)[0] if len(seg.cut(index)[0]) > 1 else seg.cut(
242 | index)[0] + seg.cut(index)[1]: city_state[index] for index in city_state}
243 | city_area = {index[3][:-2]: int(index[1])
244 | for index in city if '地区' in index[3]}
245 | city_other = {index[3][:-1]: int(index[1])
246 | for index in city if index[3][-1:] == '市' or index[3][-1:] == '盟'}
247 | self.city_province = {**city_state1, **city_area, **city_other}
248 | self.city_province = {
249 | index: self.province_map[self.city_province[index]] for index in self.city_province}
250 | county = self.Db.select_db(
251 | 'select * from china_regions where level=3')
252 | county_area_pre = {index for index in county if index[3][-1] == '区'}
253 | county_area_two = {index[3][:-2]: int(index[1][:2]) for index in county_area_pre if len(
254 | index[3]) > 3 and (index[3][-2] == '矿' or index[3][-2] == '林')}
255 | # print('芒' in county_area_two, 'two')
256 | county_area_state = {seg.cut(index[3][:-2])[0]: int(index[1][:2])
257 | for index in county_area_pre if len(index[3]) > 2 and index[3][-2] == '族'}
258 | # print('芒' in county_area_state, 'state')
259 | county_area_other = {index[3][:-1]: int(index[1][:2]) for index in county_area_pre if len(
260 | index[3]) > 2 and index[3][-2] != '族' and index[3][-2] != '林' and index[3][-2] != '矿'}
261 | # print('芒' in county_area_other, 'other')
262 | county_county_pre = {index for index in county if index[3][-1] == '县'}
263 | county_county_two = {index[3]: int(
264 | index[1][:2]) for index in county_county_pre if len(index[3]) == 2}
265 | # print('芒' in county_county_two, 'two')
266 | seg = pkuseg.pkuseg()
267 | county_county_state = {seg.cut(index[3])[0] if len(seg.cut(index[3])[0]) > 1 else seg.cut(index[3])[0] + seg.cut(
268 | index[3])[1]: int(index[1][:2]) for index in county_county_pre if len(index[3]) > 2 and index[3][-3:-1] == '自治'}
269 | county_county_state = {
270 | index[:-2] if '族' in index and len(index) > 3 else index: county_county_state[index] for index in county_county_state}
271 | # print('芒' in county_county_state, 'state')
272 | county_county_other = {
273 | index[3][:-1]: int(index[1][:2]) for index in county_county_pre if index[3][-3:-1] != '自治' and len(index[3]) > 2}
274 | # print('芒' in county_county_other, 'other')
275 | county_city = {index[3][:-1] if len(index[3]) > 2 else index[3]: int(index[1][:2])
276 | for index in county if index[3][-1] == '市'}
277 | # print('芒' in county_city, 'city')
278 | county_domain = {index[3][:4]: int(
279 | index[1][:2]) for index in county if index[3][-1] == '域'}
280 | # print('芒' in county_domain, 'domain')
281 | county_other = {index[3]: int(
282 | index[1][:2]) for index in county if index[3][-1] == '盟' or index[3][-1] == '岛'}
283 | # print('芒' in county_other, 'other')
284 | county_province = {**county_area_two, **county_area_state, **county_area_other, **county_county_two,
285 | **county_county_state, **county_county_other, **county_city, **county_domain, **county_other}
286 | county_province = {
287 | index: self.province_map[county_province[index]] for index in county_province}
288 | self.city_province = {**self.city_province, **county_province}
289 | print({index for index in self.city_province if len(index) == 1})
290 |
291 | def test_province(self, maps, words):
292 | word_city = {}
293 | for index in maps:
294 | temp_num = words.count(index)
295 | province = maps[index]
296 | if temp_num:
297 | if province in word_city:
298 | word_city[province] += temp_num
299 | else:
300 | word_city[province] = temp_num
301 | print(sum(word_city.values()))
302 | return word_city
303 |
304 |
305 | class Get_baidu():
306 | """
307 | get info from baidu
308 | """
309 |
310 | def __init__(self):
311 | self.failuredmap = {}
312 | self.total_map = {}
313 | self.text_map = {}
314 | self.word = {}
315 | self.find_location = find_location()
316 |
317 | def get_summarization(self):
318 | """
319 | get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1
320 | """
321 |
322 | version = begin_time()
323 | threadings = []
324 | for index in range(75):
325 | work = threading.Thread(
326 | target=self.summarization_once, args=(index,))
327 | threadings.append(work)
328 |
329 | for work in threadings:
330 | # time.sleep(.5)
331 | work.start()
332 | for work in threadings:
333 | work.join()
334 | # self.text_map = self.total_map[0]
335 |
336 | # for index in list(range(1, len(self.total_map))):
337 | # for ids in self.total_map[index]:
338 | # if ids in self.text_map:
339 | # self.text_map[ids] += self.total_map[index][ids]
340 | # else:
341 | # self.text_map[ids] = self.total_map[index][ids]
342 | # print(sum(self.text_map))
343 | word = [self.word[k] for k in sorted(self.word.keys())]
344 | with codecs.open('test', 'w', encoding='utf-8') as f:
345 | f.write("\n".join(word))
346 | end_time(version)
347 |
348 | def summarization_once(self, index):
349 | """
350 | get html from news
351 | """
352 | print(index)
353 | texts = []
354 | url = 'https://www.baidu.com/s?ie=utf-8&mod=1&isbd=1&isid=919fab3c0002c9f1&wd=%E5%81%B7%E7%8B%97&oq=%E5%81%B7%E7%8B%97&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=919fab3c0002c9f1&rsv_t=7e30ggF%2BMa91oOURk1bMtN8af5unSwOR08TodNBB%2F%2B6B6RBEwUi8l8IAe28ACA%2B8b5I5&gpc=stf%3D1517038564%2C1548574564%7Cstftype%3D1&tfflag=1&bs=%E5%81%B7%E7%8B%97&rsv_sid=undefined&_ss=1&clist=28bc21fb856a58b7%09350102124f079888%0928bc21fb856a58b7%0928bc2159845c1cf3%0928bc2015823fa56b%0928a121fb84a7d1a6&hsug=&f4s=1&csor=2&_cr1=34767&pn=' + \
355 | str(index * 10)
356 | news_lists = proxy_req(url, 0)
357 | if not news_lists:
358 | if can_retry(url):
359 | self.summarization_once(index)
360 | return
361 | test = news_lists.find_all(
362 | 'div', class_=['c-row c-gap-top-small', 'c-span18 c-span-last'])
363 | word = self.cleantxt(news_lists.text)
364 | if not len(word):
365 | if can_retry(url):
366 | self.summarization_once(index)
367 | return
368 | temp_map = self.find_location.test_province(
369 | self.find_location.city_province, word)
370 | self.total_map[int(index)] = temp_map
371 | self.word[index] = word
372 |
373 | def cleantxt(self, raw):
374 | fil = re.compile(u'[^\u4e00-\u9fa5]+', re.UNICODE)
375 | return fil.sub(' ', raw)
376 |
377 |
378 | class Get_baidu_bjh():
379 | """
380 | get info from baidu bjh
381 | """
382 |
383 | def __init__(self):
384 | self.failuredmap = {}
385 | self.fail = []
386 | self.href_map = {}
387 | self.text_map = {}
388 | self.word = {}
389 | self.word_list = {}
390 |
391 | def get_href(self):
392 | """
393 | collect result hrefs from baidu news search
394 | """
395 |
396 | version = begin_time()
397 | threadings = []
398 | for index in range(71):
399 | work = threading.Thread(
400 | target=self.href_once, args=(index,))
401 | threadings.append(work)
402 |
403 | for work in threadings:
404 | # time.sleep(.5)
405 | work.start()
406 | for work in threadings:
407 | work.join()
408 | href_map = [self.href_map[k] for k in sorted(self.href_map.keys())]
409 | self.href_map = sum(href_map, [])
410 | with codecs.open('bjh_href_poison.txt', 'w', encoding='utf-8') as f:
411 | f.write("\n".join(self.href_map))
412 | end_time(version)
413 |
414 | def href_once(self, index):
415 | """
416 | get html from news
417 | """
418 | print(index)
419 | texts = []
420 | url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=毒狗肉&pn=' + \
421 | str(index * 10)
422 | news_lists = proxy_req(url, 0)
423 | if not news_lists:
424 | if can_retry(url):
425 | self.href_once(index)
426 | return
427 | test = news_lists.find_all('div', class_='result')
428 | if not len(test):
429 | if can_retry(url):
430 | self.href_once(index)
431 | return
432 | href_list = [index.a['href'] for index in test]
433 | self.href_map[int(index)] = href_list
434 |
435 | def cleantxt(self, raw):
436 | fil = re.compile(u'[^\u4e00-\u9fa5]+', re.UNICODE)
437 | return fil.sub(' ', raw)
438 |
439 | def get_detail(self):
440 | """
441 | fetch detail text for each href saved in bjh_href_poison.txt
442 | """
443 |
444 | version = begin_time()
445 | threadings = []
446 | with codecs.open('bjh_href_poison.txt', 'r', encoding='utf-8') as f:
447 | href_list = f.readlines()
448 | for index, url in enumerate(href_list):
449 | work = threading.Thread(
450 | target=self.detail_once, args=(index, url,))
451 | threadings.append(work)
452 |
453 | for work in threadings:
454 | # time.sleep(.5)
455 | work.start()
456 | for work in threadings:
457 | work.join()
458 | word_list = [self.word_list[k] for k in sorted(self.word_list.keys())]
459 | with codecs.open('bjh_detail_poison', 'w', encoding='utf-8') as f:
460 | f.write("\n".join(word_list))
461 | self.failuredmap = {}
462 | with codecs.open('bjh.log', 'w', encoding='utf-8') as f:
463 | f.write('\n'.join(self.fail))
464 | self.fail = []
465 | end_time(version)
466 |
467 | def detail_once(self, index, url):
468 | """
469 | get html from news
470 | """
471 | # print(index)
472 | news_lists = proxy_req(url, 0)
473 | if not news_lists:
474 | if can_retry(url):
475 | self.detail_once(index, url)
476 | return
477 | test = news_lists.find_all(
478 | 'div', class_=['article-content', 'mth-editor-content', 'con-news-art', 'Custom_UnionStyle'])
479 | if not len(test):
480 | test = self.cleantxt(news_lists.text)
481 | if not len(test):
482 | if can_retry(url):
483 | self.detail_once(index, url)
484 | return
485 | self.word_list[index] = test
486 | return
487 | word_list = ''.join([index.text for index in test]
488 | ).replace('\u3000', '').replace('\n', '')
489 | self.word_list[int(index)] = word_list
490 |
--------------------------------------------------------------------------------
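`find_location.test_province` above is the aggregation step: `maps` is a place-name-to-province dict and `words` is the cleaned article text, and occurrences are summed per province. The same logic rewritten standalone on toy data (the two province ids are arbitrary examples):

```python
def test_province(maps: dict, words: str) -> dict:
    word_city = {}
    for name, province in maps.items():
        n = words.count(name)
        if n:
            word_city[province] = word_city.get(province, 0) + n
    return word_city

maps = {'海淀': 11, '朝阳': 11, '武汉': 42}
print(test_province(maps, '海淀和朝阳都在北京, 武汉在湖北, 海淀再次出现'))  # {11: 3, 42: 1}
```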
/press/press.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2018-11-10 11:17:16
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2019-03-25 21:18:30
6 | import threading
7 | import time
8 |
9 | from proxy.getproxy import GetFreeProxy
10 | from util.db import Db
11 | from util.util import begin_time, end_time, basic_req
12 |
13 | proxy_req = GetFreeProxy().proxy_req
14 |
15 |
16 | class Press_test():
17 | """
18 | apply request pressure in a short time
19 | """
20 |
21 | def basic_press(self, url, times, types):
22 | """
23 | send a single request (no request body)
24 | """
25 | url = url + str(int(round(time.time() * 1000)))
26 | if types == 1:
27 | html = proxy_req(url, 1)
28 | else:
29 | html = basic_req(url, 1)
30 |
31 | if not html and times < 5:
32 | self.basic_press(url, times + 1, types)
33 |
34 | def press_threading(self, url, qps, types):
35 | """
36 | press url at constant qps
37 | """
38 | version = begin_time()
39 | threadings = []
40 | for index in range(qps):
41 | work = threading.Thread(
42 | target=self.basic_press, args=(url, 0, types))
43 | threadings.append(work)
44 | for work in threadings:
45 | work.start()
46 | for work in threadings:
47 | work.join()
48 | end_time(version)
49 |
50 | def one_press_attack(self, url, qps, types, total):
51 | """
52 | press url repeatedly for total rounds
53 | """
54 | for index in range(total):
55 | self.press_threading(url, qps, types)
56 | print('Over')
57 |
--------------------------------------------------------------------------------
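A usage sketch for `Press_test` above, assuming it is run from the repo root: each round starts `qps` threads at once via `press_threading`, and `one_press_attack` repeats that for `total` rounds; `types=1` routes requests through the proxy pool, anything else uses `basic_req` directly. The target URL is a placeholder, and the trailing `t=` matters because `basic_press` appends a millisecond timestamp as a cache buster.

```python
import os, sys

sys.path.append(os.getcwd())
from press.press import Press_test

press = Press_test()
press.one_press_attack('http://example.com/ping?t=', qps=50, types=1, total=10)
```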
/proxy/ip66.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2019-05-07 00:20:48
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2019-05-07 22:34:22
6 |
7 | import js2py
8 | import re
9 |
10 | from util.util import basic_req, echo
11 |
12 | """
13 | * 66ip @http
14 | js decoder
15 | """
16 |
17 | IP66_URL = 'http://www.66ip.cn/'
18 | PRE_URL = '{}favicon.ico'.format(IP66_URL)
19 |
20 | header = {
21 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
22 | 'Host': 'www.66ip.cn',
23 | 'Referer': 'http://www.66ip.cn/',
24 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3785.0 Safari/537.36'
25 | }
26 |
27 |
28 | def generate_cookie():
29 | ''' eval 66ip.cn test in 19.5.7 '''
30 | req = basic_req(IP66_URL, 2, header=header)
31 | basic_cookie = req.cookies.get_dict()
32 |
33 | ''' !important \b in py -> \x08 '''
34 | req_text = r'{}'.format(req.text)
35 |
36 | ''' get the script will be eval '''
37 | script_text = re.findall('', req_text)[0]
38 | script_text = script_text.replace(
39 | '{eval(', '{aaa=').replace(');break', ';break')
40 | script_eval = r'{}'.format(js2py.eval_js(script_text + 'aaa'))
41 | echo(0, script_eval)
42 |
43 | try:
44 | ''' replace document & window '''
45 | params = re.findall(
46 | r'(__jsl_clearance=.*?)\'\+\(function\(\){(.*?join\(\'\'\))}\)\(\)', script_eval)
47 | wait_eval = params[0][1].replace(
48 | "document.createElement('div')", "{}").replace("", '')
49 | wait_replace = re.findall(
50 | r'=(.{1,5}\.firstChild\.href;)', wait_eval)[0]
51 | wait_eval = wait_eval.replace(wait_replace, '"http://www.66ip.cn/";')
52 |
53 | ''' eval & encoder cookie '''
54 | other_param = js2py.eval_js(
55 | 'function ddd() {window={};' + wait_eval + '}ddd()')
56 | cookie = '{}; {}{}'.format(encoder_cookie(
57 | basic_cookie), params[0][0], other_param)
58 | echo(1, 'cookie', cookie)
59 |
60 | return cookie
61 | except:
62 | return generate_cookie()
63 |
64 |
65 | def encoder_cookie(cookie_dict: {}) -> str:
66 | return '; '.join(['{}={}'.format(ii, jj)for ii, jj in cookie_dict.items()])
67 |
68 |
69 | def req_ip66():
70 | ''' 66ip.cn js decoder '''
71 | header['Cookie'] = generate_cookie()
72 |
73 | req_text = basic_req(IP66_URL, 3, header=header)
74 | echo(2, req_text)
75 | return req_text
76 |
77 |
78 | if __name__ == "__main__":
79 | req_ip66()
80 |
--------------------------------------------------------------------------------
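The heart of `generate_cookie` in `proxy/ip66.py` above is the eval capture: the challenge script wants to `eval()` a generated expression, but rewriting `{eval(` into an assignment makes `js2py.eval_js` return the generated source instead of running it. The same rewrite applied to a harmless stand-in script:

```python
import js2py

# stand-in for the obfuscated challenge page script
script_text = 'var s="40+2";while(1){eval(s);break;};'
# the exact two replaces used in generate_cookie
script_text = script_text.replace('{eval(', '{aaa=').replace(');break', ';break')
print(js2py.eval_js(script_text + 'aaa'))  # prints the captured source: 40+2
```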
/proxy/table.sql:
--------------------------------------------------------------------------------
1 | /*
2 | * @Author: gunjianpan
3 | * @Date: 2018-10-19 15:01:18
4 | * @Last Modified by: gunjianpan
5 | * @Last Modified time: 2019-01-27 23:39:47
6 | */
7 | use netease;
8 | CREATE TABLE if not exists `ip_proxy` (
9 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary keys',
10 | `address` varchar(50) NOT NULL DEFAULT '0' COMMENT 'proxy address',
11 | `http_type` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'http type, 1: https, 0: http',
12 | `is_failured` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'failure time',
13 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'create time',
14 | `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time',
15 | PRIMARY KEY (`id`)
16 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 comment='table for ip proxy';
17 |
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | pymysql
4 | requests
5 | bs4
6 | apscheduler
7 | asyncio
8 | aiohttp
9 | PyExecJS
10 | fonttools
11 | regex
12 | rsa
13 | opencv-python
--------------------------------------------------------------------------------
/util/db.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2018-10-24 13:32:39
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2020-06-06 13:11:46
6 |
7 | import os
8 | import shutil
9 | import sys
10 | import threading
11 | from configparser import ConfigParser
12 |
13 | import pymysql
14 | import time
15 |
16 | sys.path.append(os.getcwd())
17 | from util.util import echo, read_file
18 |
19 | configure_path = "util/util.ini"
20 |
21 |
22 | class Db(object):
23 | """ db operation, without sql injection """
24 |
25 | def __init__(self, database: str, return_type: str = "list"):
26 | self.load_configure()
27 | self.database = database
28 | self.return_type = return_type
29 | self.lock = threading.Lock()
30 | self.reconnect()
31 |
32 | def load_configure(self):
33 | """ load configure """
34 | if not os.path.exists(configure_path):
35 | shutil.copy(configure_path + ".tmp", configure_path)
36 | cfg = ConfigParser()
37 | cfg.read(configure_path, "utf-8")
38 | self.mysql_host = cfg.get("mysql", "hostname")
39 | self.mysql_user = cfg.get("mysql", "username")
40 | self.mysql_pw = cfg.get("mysql", "passwd")
41 | self.mysql_char = cfg.get("mysql", "charset")
42 |
43 | def connect_db(self, database: str, return_type: str):
44 | """ connect database """
45 | cursorclass = (
46 | pymysql.cursors.DictCursor
47 | if return_type == "dict"
48 | else pymysql.cursors.Cursor
49 | )
50 | try:
51 | self.db = pymysql.connect(
52 | host=self.mysql_host,
53 | user=self.mysql_user,
54 | password=self.mysql_pw,
55 | db=database,
56 | charset=self.mysql_char,
57 | cursorclass=cursorclass,
58 | )
59 | self.cursor = self.db.cursor()
60 | except pymysql.OperationalError:
61 | echo(0, "Please change mysql info in util/util.ini!!!")
62 | self.db = False
63 | self.cursor = None
64 | except pymysql.InternalError:
65 | echo(2, "Try to create database in mysql.........")
66 | if self.create_db(database):
67 | self.connect_db(database, return_type)
68 | else:
69 | self.db = False
70 | self.cursor = None
71 | except:
72 | echo(0, "Other db error!!!")
73 | self.db = False
74 | self.cursor = None
75 |
76 | def reconnect(self):
77 | self.connect_db(self.database, self.return_type)
78 |
79 | def _reConn(self, num: int = 28800, stime: int = 3):
80 | _number = 0
81 | _status = True
82 | while _status and _number <= num:
83 | try:
84 | self.db.ping()
85 | _status = False
86 | except:
87 | self.reconnect()
88 | if self.db != False:
89 | _status = False
90 | break
91 | _number += 1
92 | time.sleep(stime)
93 |
94 | def create_db(self, database: str):
95 | """ crete database """
96 | db = pymysql.connect(
97 | host=self.mysql_host,
98 | user=self.mysql_user,
99 | password=self.mysql_pw,
100 | charset=self.mysql_char,
101 | )
102 | database_sql = "CREATE DATABASE if not exists {}".format(database)
103 | try:
104 | cursor = db.cursor()
105 | cursor.execute(database_sql)
106 | echo(2, "Create Database {} Success!!!".format(database))
107 | return True
108 | except:
109 | echo(0, "Create Database {} error".format(database))
110 | return False
111 |
112 | def create_table(self, sql_path: str):
113 | if not os.path.exists(sql_path):
114 | echo(0, "Create Table {} error, file not found".format(sql_path))
115 | return False
116 | create_table_sql = "\n".join(read_file(sql_path))
117 | try:
118 | cursor = self.db.cursor()
119 | cursor.execute(create_table_sql)
120 | echo(2, "Create Table from {} Success!!!".format(sql_path))
121 | return True
122 | except Exception as e:
123 | echo(0, "Create Table from {} error".format(sql_path), e)
124 | return False
125 |
126 | def select_db(self, sql: str):
127 | """ select sql @return False: Expection; list: Success """
128 | try:
129 | self._reConn()
130 | with self.db.cursor() as cursor:
131 | cursor.execute(sql)
132 | result = cursor.fetchall()
133 | self.db.commit()
134 | return result
135 | except Exception as e:
136 | echo(0, "execute sql {} error".format(sql), e)
137 | return False
138 |
139 | def select_one(self, sql: str):
140 | """ select one @return False: Expection; list: Success """
141 | try:
142 | self._reConn()
143 | with self.db.cursor() as cursor:
144 | cursor.execute(sql)
145 | result = cursor.fetchone()
146 | self.db.commit()
147 | return result
148 | except Exception as e:
149 | echo(0, "execute sql {} error".format(sql), e)
150 | return False
151 |
152 | def insert_db(self, sql: str):
153 | """ insert sql @return False: Expection; True: Success """
154 | self.lock.acquire()
155 | try:
156 | self._reConn()
157 | with self.db.cursor() as cursor:
158 | cursor.execute(sql)
159 | self.db.commit()
160 | self.lock.release()
161 | return True
162 | except Exception as e:
163 | self.lock.release()
164 | echo(0, "execute sql {} error".format(sql), e)
165 | self.db.rollback()
166 | return False
167 |
168 | def update_db(self, sql: str):
169 | """ update sql @return False: Expection; True: Success """
170 | self.lock.acquire()
171 | try:
172 | self._reConn()
173 | with self.db.cursor() as cursor:
174 | cursor.execute(sql)
175 | self.db.commit()
176 | self.lock.release()
177 | return True
178 | except Exception as e:
179 | self.lock.release()
180 | echo(0, "execute sql {} error".format(sql), e)
181 | self.db.rollback()
182 | return False
183 |
--------------------------------------------------------------------------------
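A minimal usage sketch for the `Db` wrapper above; the database name and SQL statements are examples (the `ip_proxy` columns come from `proxy/table.sql` above). `select_db` hands back the fetched rows or `False` on error, and `insert_db`/`update_db` return `True`/`False`.

```python
import os, sys

sys.path.append(os.getcwd())
from util.db import Db

db = Db('netease')                              # connects, creating the database if needed
db.create_table('proxy/table.sql')              # runs the CREATE TABLE script
ok = db.insert_db("INSERT INTO ip_proxy (address, http_type) VALUES ('127.0.0.1:1080', 1)")
rows = db.select_db('SELECT address, http_type FROM ip_proxy WHERE is_failured = 0')
print(ok, rows)
```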
/util/util.ini.tmp:
--------------------------------------------------------------------------------
1 | [mysql]
2 | hostname = localhost
3 | username = root
4 | passwd =
5 | charset = utf8mb4
6 | [email]
7 | rec_lists =
8 | send_lists =
9 | [ServerChan]
10 | SCKEY =
11 |
--------------------------------------------------------------------------------
/util/util.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # @Author: gunjianpan
3 | # @Date: 2018-10-19 15:33:46
4 | # @Last Modified by: gunjianpan
5 | # @Last Modified time: 2020-06-06 14:10:31
6 |
7 | from __future__ import (
8 | absolute_import,
9 | division,
10 | print_function,
11 | unicode_literals,
12 | with_statement,
13 | )
14 |
15 | import codecs
16 | import datetime
17 | import json
18 | import logging
19 | import os
20 | import pickle
21 | import platform
22 | import random
23 | import re
24 | import shutil
25 | import smtplib
26 | import threading
27 | import time
28 | import urllib
29 | from configparser import ConfigParser
30 | from email.mime.text import MIMEText
31 |
32 | import numpy as np
33 | import requests
34 | import urllib3
35 | from bs4 import BeautifulSoup
36 |
37 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
38 |
39 |
40 | def basic_req(
41 | url: str,
42 | types: int,
43 | proxies=None,
44 | data=None,
45 | header=None,
46 | need_cookie: bool = False,
47 | config: dict = {},
48 | ):
49 | """
50 | requests
51 | @types XY: X=0.->get; =1.->post;
52 | Y=0.->html; =1.->json; =2.->basic; =3.->text;
53 | """
54 | header = req_set(url, header)
55 | if "http" not in url:
56 | echo(
57 | "0|warning",
58 | "You should assign the type of [http]/[https] before the url str!!! The default is [http].",
59 | )
60 | if types not in [0, 1, 2, 3, 11, 12, 13]:
61 | echo("0|warning", types, " type is not supported!!!")
62 | return
63 |
64 | if types < 10:
65 | req_func = requests.get
66 | else:
67 | req_func = requests.post
68 | mode = types % 10
69 | if mode == 0:
70 | timeout = html_timeout
71 | else:
72 | timeout = json_timeout
73 | return get_basic(
74 | req_func, url, proxies, data, header, need_cookie, config, mode, timeout
75 | )
76 |
77 |
78 | def req_set(url: str, header):
79 | """ req headers set """
80 | global headers
81 | headers["Host"] = url.split("/")[2]
82 | index = random.randint(0, agent_len)
83 | headers["User-Agent"] = agent_lists[index]
84 | if not header is None and "User-Agent" not in header:
85 | header["User-Agent"] = agent_lists[index]
86 | return header
87 |
88 |
89 | def get_basic(
90 | req_func,
91 | url: str,
92 | proxies,
93 | data,
94 | header,
95 | need_cookie: bool,
96 | config: dict,
97 | mode: int = 0,
98 | timeouts: int = 5,
99 | ):
100 | """ basic get requests"""
101 | if header is None:
102 | header = headers
103 | allow_redirects = config.get("allow_redirects", True)
104 | timeout = config.get("timeout", timeouts)
105 | return_proxy = config.get("return_proxy", False)
106 | try:
107 | req = req_func(
108 | url,
109 | headers=header,
110 | verify=False,
111 | timeout=timeout,
112 | proxies=proxies,
113 | data=data,
114 | allow_redirects=allow_redirects,
115 | )
116 | if mode == 2:
117 | if return_proxy:
118 | return req, proxies
119 | return req
120 | elif mode == 0:
121 | if req.apparent_encoding == "utf-8" or "gbk" in req.apparent_encoding:
122 | req.encoding = req.apparent_encoding
123 | result = BeautifulSoup(req.text, "html.parser")
124 | elif mode == 1:
125 | result = req.json()
126 | elif mode == 3:
127 | result = req.text
128 | if need_cookie:
129 | return result, req.cookies.get_dict()
130 | if return_proxy:
131 | return result, proxies
132 | return result
133 | except:
134 | if mode == 3:
135 | result = ""
136 | elif mode == 0:
137 | result = BeautifulSoup("", "html.parser")
138 | else:
139 | result = None
140 | if need_cookie:
141 | return result, {}
142 | return result
143 |
144 |
145 | def changeCookie(cookie: str):
146 | """ change cookie """
147 | global headers
148 | headers["Cookie"] = cookie
149 |
150 |
151 | def changeHeaders(header: dict):
152 | """ change Headers """
153 | global headers
154 | headers = {**headers, **header}
155 |
156 |
157 | def changeHtmlTimeout(timeout: int):
158 | """ change html timeout """
159 | global html_timeout
160 | html_timeout = timeout
161 |
162 |
163 | def changeJsonTimeout(timeout: int):
164 | """ change json timeout """
165 | global json_timeout
166 | json_timeout = timeout
167 |
168 |
169 | def begin_time() -> int:
170 | """ multi-version time manage """
171 | global start
172 | start.append(time_stamp())
173 | return len(start) - 1
174 |
175 |
176 | def end_time_aver(version: int):
177 | time_spend = time_stamp() - start[version]
178 | spend_list.append(time_spend)
179 | echo(
180 | "2|info",
181 | "Last spend: {:.3f}s, Average spend: {:.3f}s.".format(
182 | time_spend, sum(spend_list) / len(spend_list)
183 | ),
184 | )
185 |
186 |
187 | def end_time(version: int, mode: int = 1):
188 | time_spend = time_stamp() - start[version]
189 | if not mode:
190 | return time_spend
191 | time_spend = get_time_str(time_spend)
192 | if mode == 2:
193 | echo("2|info", time_spend)
194 | return time_spend
195 |
196 |
197 | def empty():
198 | global spend_list
199 | spend_list = []
200 |
201 |
202 | def can_retry(url: str, time: int = 3) -> bool:
203 | """ judge can retry once """
204 | global failure_map
205 | if url not in failure_map:
206 | failure_map[url] = 0
207 | return True
208 | elif failure_map[url] < time:
209 | failure_map[url] += 1
210 | return True
211 | else:
212 | failure_map[url] = 0
213 | return False
214 |
215 |
216 | def send_server_chan(context: str, subject: str):
217 | if SCKEY == "":
218 | return
219 | url = BASIC_SCURL % SCKEY
220 | data = {"text": subject, "desp": context}
221 | req = basic_req(url, 11, data=data)
222 | if req and req.get("errmsg", "error") == "success":
223 | echo("2|warning", "Send sever chan success!!")
224 |
225 |
226 | def send_email(context: str, subject: str, add_rec=None, assign_rec=None) -> bool:
227 | """ send email """
228 | load_configure()
229 | send_server_chan(context, subject)
230 | email_rec = [ii for ii, jj in rec_lists if jj == "0"]
231 | email_cc = [ii for ii, jj in rec_lists if jj == "1"]
232 | if assign_rec is not None:
233 | email_rec = assign_rec
234 | send_email_once(email_rec, email_cc, context, subject)
235 | if not add_rec is None:
236 | send_email_once(add_rec, [], context, subject)
237 |
238 |
239 | def send_email_once(email_rec: list, email_cc: list, context: str, subject: str):
240 | send_index = random.randint(0, len(send_lists) - 1)
241 | mail_host = "smtp.163.com"
242 | mail_user, mail_pass = send_lists[send_index]
243 | sender = "{}@163.com".format(mail_user)
244 |
245 | sign = EMAIL_SIGN % time_str(time_format="%B %d")
246 | message = MIMEText("{}{}".format(context, sign), "plain", "utf-8")
247 | message["Subject"] = subject
248 | message["From"] = sender
249 | message["To"] = ", ".join(email_rec)
250 | message["Cc"] = ", ".join(email_cc)
251 |
252 | try:
253 | smtpObj = smtplib.SMTP_SSL(mail_host)
254 | smtpObj.connect(mail_host, 465)
255 | smtpObj.login(mail_user, mail_pass)
256 | smtpObj.sendmail(sender, email_rec + email_cc, message.as_string())
257 | smtpObj.quit()
258 | echo("1|warning", "Send email success!!")
259 | return True
260 | except smtplib.SMTPException as e:
261 | echo("0|warning", "Send email error", e)
262 | return False
263 |
264 |
265 | def dump_bigger(data, output_file: str):
266 | """ pickle.dump big file which size more than 4GB """
267 | max_bytes = 2 ** 31 - 1
268 | bytes_out = pickle.dumps(data, protocol=4)
269 | with open(output_file, "wb") as f_out:
270 | for idx in range(0, len(bytes_out), max_bytes):
271 | f_out.write(bytes_out[idx : idx + max_bytes])
272 |
273 |
274 | def load_bigger(input_file: str):
275 | """ pickle.load big file which size more than 4GB """
276 | max_bytes = 2 ** 31 - 1
277 | bytes_in = bytearray(0)
278 | input_size = os.path.getsize(input_file)
279 | with open(input_file, "rb") as f_in:
280 | for _ in range(0, input_size, max_bytes):
281 | bytes_in += f_in.read(max_bytes)
282 | return pickle.loads(bytes_in)
283 |
284 |
285 | def time_str(time_s: int = -1, time_format: str = "%Y-%m-%d %H:%M:%S"):
286 | """ time stamp -> time str """
287 | if time_s > 0:
288 | return time.strftime(time_format, time.localtime(time_s))
289 | return time.strftime(time_format, time.localtime(time_stamp()))
290 |
291 |
292 | def time_stamp(time_str: str = "", time_format: str = "%Y-%m-%d %H:%M:%S") -> float:
293 | """ time str -> time stamp """
294 | if not len(time_str):
295 | return time.time()
296 | return time.mktime(time.strptime(time_str, time_format))
297 |
298 |
299 | def echo(types, *args):
300 | """
301 | echo log -> stdout / log file
302 | @param: color: 0 -> red, 1 -> green, 2 -> yellow, 3 -> blue, 4 -> gray
303 | @param: log_type: info, warning, debug, error
304 | @param: is_service: bool
305 | """
306 | args = " ".join([str(ii) for ii in args])
307 | types = str(types)
308 | re_num = re.findall(r"\d", types)
309 | re_word = re.findall("[a-zA-Z]+", types)
310 | color = int(re_num[0]) if len(re_num) else 4
311 | log_type = re_word[0] if len(re_word) else "info"
312 |
313 | if is_service:
314 | log(log_type, args)
315 | return
316 | colors = {
317 | "red": "\033[91m",
318 | "green": "\033[92m",
319 | "yellow": "\033[93m",
320 | "blue": "\033[94m",
321 | "gray": "\033[90m",
322 | }
323 | if not color in list(range(len(colors.keys()))):
324 | color = 4
325 | if platform.system() == "Windows":
326 | print(args)
327 | else:
328 | print(list(colors.values())[color], args, "\033[0m")
329 |
330 |
331 | def shuffle_batch_run_thread(
332 | threading_list: list, batch_size: int = 24, is_await: bool = False
333 | ):
334 | """ shuffle batch run thread """
335 | thread_num = len(threading_list)
336 | np.random.shuffle(threading_list) # shuffle thread
337 | total_block = thread_num // batch_size + 1
338 | for block in range(total_block):
339 | for ii in threading_list[
340 | block * batch_size : min(thread_num, batch_size * (block + 1))
341 | ]:
342 | if threading.active_count() > batch_size:
343 | time.sleep(random.randint(2, 4) * (random.random() + 1))
344 | ii.start()
345 |
346 | if not is_await or block % 10 == 1:
347 | for ii in threading_list[
348 | block * batch_size : min(thread_num, batch_size * (block + 1))
349 | ]:
350 | ii.join()
351 | else:
352 | time.sleep(min(max(5, batch_size * 2 / 210), 10))
353 | echo(
354 | "1|info",
355 | time_str(),
356 | "{}/{}".format(total_block, block),
357 | "epochs finish.",
358 | "One Block {} Thread ".format(batch_size),
359 | )
360 |
361 |
362 | def mkdir(origin_dir: str):
363 | """ mkdir file dir"""
364 | if not os.path.exists(origin_dir):
365 | os.mkdir(origin_dir)
366 |
367 |
368 | def read_file(read_path: str, mode: int = 0):
369 | """ read file """
370 | if not os.path.exists(read_path):
371 | return [] if not mode else ""
372 | with open(read_path, "r", encoding="utf-8", newline="\n") as f:
373 | if not mode:
374 | data = [ii.strip() for ii in f.readlines()]
375 | elif mode == 1:
376 | data = f.read()
377 | elif mode == 2:
378 | data = list(f.readlines())
379 | return data
380 |
381 |
382 | def log(types: str, *log_args: list):
383 | """ log record @param: type: {'critical', 'error', 'warning', 'info', 'debug'} """
384 | mkdir(LOG_DIR)
385 | LOG_PATH = "{}{}.log".format(LOG_DIR, time_str(time_format="%Y%m%d"))
386 | logging.basicConfig(
387 | level=logging.DEBUG,
388 | filename=LOG_PATH,
389 | filemode="a",
390 | format="[%(asctime)s] [%(levelname)s] %(message)s",
391 | datefmt="%Y-%m-%d %H:%M:%S",
392 | )
393 | logging.getLogger("requests").setLevel(logging.WARNING)
394 | logging.getLogger("urllib3").setLevel(logging.WARNING)
395 | logging.getLogger("chardet").setLevel(logging.WARNING)
396 | log_str = " ".join([str(ii) for ii in log_args])
397 | if types == "critical":
398 | logging.critical(log_str)
399 | elif types == "error":
400 | logging.error(log_str)
401 | elif types == "warning":
402 | logging.warning(log_str)
403 | elif types == "info":
404 | logging.info(log_str)
405 | elif types == "debug":
406 | logging.debug(log_str)
407 | else:
408 | logging.info("{} {}".format(types, log_str))
409 |
410 |
411 | def decoder_url(url: str, do_decoder: bool = False) -> dict:
412 | if "?" not in url:
413 | return {}
414 | decoder_dict = {
415 | ii.split("=", 1)[0]: ii.split("=", 1)[1]
416 | for ii in url.split("?", 1)[1].split("&")
417 | if ii != ""
418 | }
419 | if do_decoder:
420 | decoder_dict = {
421 | key: urllib.parse.unquote(value) for key, value in decoder_dict.items()
422 | }
423 | return decoder_dict
424 |
425 |
426 | def encoder_url(url_dict: {}, origin_url: str) -> str:
427 | return "{}?{}".format(
428 | origin_url,
429 | "&".join(
430 | [
431 | "{}={}".format(ii, urllib.parse.quote(str(jj)))
432 | for ii, jj in url_dict.items()
433 | ]
434 | ),
435 | )
436 |
437 |
438 | def json_str(data: dict):
439 | """ equal to JSON.stringify in javascript """
440 | return json.dumps(data, separators=(",", ":"))
441 |
442 |
443 | def decoder_cookie(cookie: str) -> dict:
444 | return {ii.split("=", 1)[0]: ii.split("=", 1)[1] for ii in cookie.split("; ")}
445 |
446 |
447 | def encoder_cookie(cookie_dict: {}) -> str:
448 | return "; ".join(["{}={}".format(ii, jj) for ii, jj in cookie_dict.items()])
449 |
450 |
451 | def get_time_str(time_gap: int, is_gap: bool = True) -> str:
452 | if not is_gap:
453 | time_gap = int(time_gap // 60)
454 | day = int(time_gap // 1440)
455 | hour = int(time_gap / 60) % 24
456 | minute = int(time_gap % 60)
457 | result = ""
458 | if day:
459 | result += "{}Day ".format(day)
460 | if hour:
461 | result += "{:02d}h ".format(hour)
462 | if minute:
463 | if day and not hour:
464 | result += "{:02d}h ".format(hour)
465 | result += "{:02d}min".format(minute)
466 | return result.strip()
467 |
468 |
469 | def get_min_s(t: str) -> str:
470 | t = float(t)
471 | m = int(t // 60)
472 | s = int(t % 60)
473 | return "{:02d}:{:02d}".format(m, s)
474 |
475 |
476 | def replace_params(origin_str: str, reg: str) -> str:
477 | """ replace params """
478 | params_re = re.findall(reg, origin_str)
479 | params = {}
480 | for ii in params_re:
481 | if not ii in params:
482 | params[ii] = len(params)
483 | for ii in sorted(list(params.keys()), key=lambda i: -len(i)):
484 | origin_str = origin_str.replace(ii, f"a{params[ii]}")
485 | return origin_str
486 |
487 |
488 | def decoder_fuzz(reg: str, file_path: str, replace_func=replace_params):
489 | """ simple decoder of fuzz file """
490 | file_dir, file_name = os.path.split(file_path)
491 | origin_str = read_file(file_path, mode=1)
492 | origin_str = codecs.unicode_escape_decode(origin_str)[0]
493 | origin_str = replace_func(origin_str, reg)
494 | name1, name2 = file_name.split(".", 1)
495 | output_path = f"{file_dir}/{name1}_decoder.{name2}"
496 | echo(
497 | 1,
498 | "decoder fuzz file {} -> {}, total {} line.".format(
499 | file_name, output_path, origin_str.count("\n")
500 | ),
501 | )
502 | with open(output_path, "w") as f:
503 | f.write(origin_str)
504 |
505 |
506 | def get_accept(types: str) -> str:
507 | """ @param: types => html, json, xhr """
508 | if types == "html":
509 | return "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
510 | elif types == "json":
511 | return "application/json, text/javascript, */*; q=0.01"
512 | elif types == "xhr":
513 | return "application/json, text/plain, */*"
514 | return "*/*"
515 |
516 |
517 | def get_use_agent(types: str = "pc") -> str:
518 | """ @param: types => pc, mobile"""
519 | if types == "pc":
520 | return "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36"
521 | return "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1"
522 |
523 |
524 | def get_content_type(types: str = "utf8") -> str:
525 | return "application/x-www-form-urlencoded{}".format(
526 | ";charset=UTF-8" if types == "utf8" else ""
527 | )
528 |
529 |
530 | def change_pic_size(picture_path: str, resize: tuple = (600, 600)):
531 | import cv2
532 |
533 | if not os.path.exists(picture_path):
534 | echo(0, "picture not found in", picture_path)
535 | return
536 | pic = cv2.imread(picture_path)
537 | pic = cv2.resize(pic, resize)
538 | split_text = os.path.splitext(picture_path)
539 | output_path = "{}_resize{}".format(*split_text)
540 | cv2.imwrite(output_path, pic)
541 |
542 |
543 | def load_configure():
544 | """ load configure """
545 | global LAST_CONFIG, rec_lists, send_lists, SCKEY
546 | if time_stamp() - LAST_CONFIG < 300:
547 | return
548 | if not os.path.exists(configure_path):
549 | shutil.copy(configure_path + ".tmp", configure_path)
550 | cfg = ConfigParser()
551 | cfg.read(configure_path, "utf-8")
552 | rec_list = cfg.get("email", "rec_lists").split(",")
553 | send_list = cfg.get("email", "send_lists").split(",")
554 | rec_lists = [ii.split(":") for ii in rec_list]
555 | send_lists = [ii.split(":") for ii in send_list]
556 | SCKEY = cfg.get("ServerChan", "SCKEY")
557 |
558 |
559 | headers = {
560 | "Cookie": "",
561 | "Accept": get_accept("html"),
562 | "Content-Type": get_content_type(),
563 | "User-Agent": get_use_agent(),
564 | }
565 | data_dir = "util/data/"
566 | log_path = "service.log"
567 | LAST_CONFIG = -1
568 | rec_lists, send_lists, SCKEY = [], [], ""
569 | configure_path = "util/util.ini"
570 | BASIC_SCURL = "https://sc.ftqq.com/%s.send"
571 | mkdir(data_dir)
572 | agent_lists = [
573 | " ".join(index.split()[1:])[1:-1] for index in read_file("{}agent".format(data_dir))
574 | ]
575 | if not len(agent_lists):
576 | agent_lists = [headers["User-Agent"]]
577 |
578 | agent_len = len(agent_lists) - 1
579 | html_timeout = 5
580 | json_timeout = 4
581 | start = []
582 | spend_list = []
583 | failure_map = {}
584 | is_service = False
585 | LOG_DIR = "log/"
586 | EMAIL_SIGN = "\n\n\nBest wish!!\n%s\n\n————————————————————\n• Send from script designed by gunjianpan."
587 | load_configure()
588 |
--------------------------------------------------------------------------------
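A short illustration of how the two-digit `types` code of `basic_req` above is used: the tens digit selects the verb (GET below 10, POST from 11 up) and the units digit the return shape (0 soup, 1 json, 2 raw response, 3 text). The URLs are placeholders.

```python
import os, sys

sys.path.append(os.getcwd())
from util.util import basic_req, changeHeaders

soup = basic_req('http://example.com/', 0)                     # GET  -> BeautifulSoup
info = basic_req('http://example.com/api', 11, data={'q': 1})  # POST -> parsed JSON (or None)
resp = basic_req('http://example.com/', 2)                     # GET  -> requests.Response
changeHeaders({'Referer': 'http://example.com/'})              # merged into the shared headers
```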
/zimuzu/zimuzu.ini.tmp:
--------------------------------------------------------------------------------
1 | [basic]
2 | zimuzu_id:ooAnc4
3 | drama_name:Game_of_Thrones
--------------------------------------------------------------------------------
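The template above uses `:` as the key/value separator; that works as-is because `ConfigParser` accepts both `=` and `:` by default, which is what `zimuzu.py` below relies on. A quick check:

```python
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read_string('[basic]\nzimuzu_id:ooAnc4\ndrama_name:Game_of_Thrones\n')
print(cfg.get('basic', 'zimuzu_id'))  # ooAnc4
```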
/zimuzu/zimuzu.py:
--------------------------------------------------------------------------------
1 | '''
2 | @Author: gunjianpan
3 | @Date: 2019-02-28 09:47:06
4 | @Last Modified by: gunjianpan
5 | @Last Modified time: 2019-04-13 14:11:45
6 | '''
7 |
8 | import codecs
9 | import os
10 | import re
11 | import shutil
12 |
13 | from configparser import ConfigParser
14 | from proxy.getproxy import GetFreeProxy
15 | from util.util import begin_time, end_time, can_retry
16 |
17 | proxy_req = GetFreeProxy().proxy_req
18 |
19 | """
20 | * zimuzu @http
21 | * zmz005.com/XXXXXX
22 | """
23 |
24 | configure_path = 'zimuzu/zimuzu.ini'
25 | data_dir = 'zimuzu/data/'
26 |
27 |
28 | class zimuzu():
29 | ''' load download link from zimuzu '''
30 |
31 | def __init__(self):
32 | cfg = ConfigParser()
33 | cfg.read(configure_path, 'utf-8')
34 | self.zimuzu_id = cfg.get('basic', 'zimuzu_id')
35 | self.drama_name = cfg.get('basic', 'drama_name')
36 |
37 | def load_url(self):
38 | ''' load url from zimuzu '''
39 |
40 | url = 'http://zmz005.com/{}'.format(self.zimuzu_id)
41 | detail = proxy_req(url, 0)
42 | total = []
43 |
44 | if not detail:
45 | print('retry')
46 | if can_retry(url):
47 | self.load_url()
48 | return
49 | season_list = detail.find_all(
50 | 'div', class_='tab-content info-content')[1:]
51 | for season in season_list:
52 | quality_list = season.find_all('div', class_='tab-pane')
53 | url_body = quality_list[1] if 'APP' in quality_list[0]['id'] else quality_list[0]
54 | season_id = re.findall(r"\d+\.?\d*", url_body['id'])[0]
55 | total.append(season_id)
56 | if int(season_id) < 12:
57 | url_body = quality_list[1]
58 |
59 | url_list = url_body.find_all('ul', class_='down-links')
60 | url = [index.find_all('div', class_='copy-link')[1]['data-url']
61 | for index in url_list]
62 | total.append('\n'.join(url) + '\n')
63 | with codecs.open('{}{}'.format(data_dir, self.drama_name), 'w', encoding='utf-8') as f:
64 | f.write('\n'.join(total))
65 |
66 |
67 | if __name__ == '__main__':
68 | if not os.path.exists(data_dir):
69 | os.makedirs(data_dir)
70 | if not os.path.exists(configure_path):
71 | shutil.copy(configure_path + '.tmp', configure_path)
72 | zimuzu = zimuzu()
73 | zimuzu.load_url()
74 |
--------------------------------------------------------------------------------