├── .gitignore ├── LICENSE ├── README.md ├── bilibili ├── analysis.py ├── assign_up.ini.tmp ├── basicBilibili.py ├── bsocket.py ├── geetestE.py ├── loginBilibili.py └── upBilibili.py ├── blog └── titleviews.py ├── brushclass └── brushclass.py ├── buildmd ├── activateArticle.py ├── article.sql ├── buildmd.py ├── tbk.ini.tmp └── tpwd.sql ├── ctrip ├── hotelDetail.js └── hotelDetail.py ├── dytt8 └── dytt8.py ├── eastmoney └── eastmoney.py ├── exam ├── shaoq.js └── shaoq.py ├── mafengwo ├── hotel.js └── mafengwo.py ├── movie └── douban.py ├── netease ├── netease_music_base.py ├── netease_music_db.py └── table.sql ├── news └── news.py ├── press └── press.py ├── proxy ├── getproxy.py ├── ip66.py └── table.sql ├── requirement.txt ├── util ├── db.py ├── util.ini.tmp └── util.py └── zimuzu ├── zimuzu.ini.tmp └── zimuzu.py /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # python cache 4 | __pycache__ 5 | 6 | # jupyter 7 | .ipynb_checkpoints 8 | 9 | # test 10 | test* 11 | Untitled* 12 | 13 | # gatherproxy 14 | gatherproxy 15 | 16 | # log 17 | log 18 | 19 | # song_detail 20 | song_detail 21 | 22 | # ide 23 | .idea 24 | .vscode 25 | 26 | # data 27 | data 28 | yybzz 29 | 30 | .DS_Store 31 | 32 | *.csv 33 | *.txt 34 | *.ini 35 | 36 | # utils.agent 37 | utils/agent 38 | 39 | # history 40 | .history 41 | 42 | # tbk 43 | top 44 | picture* 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-present gunjianpan(iofu728) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | Spider logo
3 |
4 | Spider Man
5 |
6 | [![GitHub](https://img.shields.io/github/license/iofu728/spider.svg?style=popout-square)](https://github.com/iofu728/spider/blob/master/LICENSE)
7 | [![GitHub tag](https://img.shields.io/github/tag/iofu728/spider.svg?style=popout-square)](https://github.com/iofu728/spider/releases)
8 | [![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/iofu728/spider.svg?style=popout-square)](https://github.com/iofu728/spider)
9 |
10 | 高可用代理IP池 高并发生成器 一些实战经验
11 | Highly Available Proxy IP Pool, Highly Concurrent Request Builder, Some Application
12 | 13 | ## Navigation 14 | 15 | | site | document | Last Modified time | 16 | | -------------------- | ----------------------------------------- | ------------------ | 17 | | some proxy site,etc. | [Proxy pool](#proxy-pool) | 20-06-01 | 18 | | music.163.com | [Netease](#netease) | 18-10-21 | 19 | | - | [Press Test System](#press-test-system) | 18-11-10 | 20 | | news.baidu.com | [News](#news) | 19-01-25 | 21 | | note.youdao.com | [Youdao Note](#youdao-note) | 20-01-04 | 22 | | jianshu.com/csdn.net | [blog](#blog) | 20-01-04 | 23 | | elective.pku.edu.cn | [Brush Class](#brush-class) | 19-10-11 | 24 | | zimuzu.tv | [zimuzu](#zimuzu) | 19-04-13 | 25 | | bilibili.com | [Bilibili](#bilibili) | 20-06-06 | 26 | | exam.shaoq.com | [shaoq](#shaoq) | 19-03-21 | 27 | | data.eastmoney.com | [Eastmoney](#eastmoney) | 19-03-29 | 28 | | hotel.ctrip.com | [Ctrip Hotel Detail](#ctrip-hotel-detail) | 19-10-11 | 29 | | douban.com | [DouBan](#douban) | 19-05-07 | 30 | | 66ip.cn | [66ip](#66ip) | 19-05-07 | 31 | 32 | ## keyword 33 | 34 | - Big data store 35 | - High concurrency requests 36 | - Support WebSocket 37 | - method for font cheat 38 | - method for js compile 39 | - Some Application 40 | 41 | ## Quick Start 42 | 43 | `docker` is on the road. 44 | 45 | ```bash 46 | $ git clone https://github.com/iofu728/spider.git 47 | $ cd spider 48 | $ pip install -r requirement.txt 49 | 50 | # load proxy pool 51 | $ python proxy/getproxy.py # to load proxy resources 52 | ``` 53 | 54 | > To use proxy pool 55 | 56 | ```python 57 | ''' using proxy requests ''' 58 | from proxy.getproxy import GetFreeProxy # to use proxy 59 | proxy_req = GetFreeProxy().proxy_req 60 | proxy_req(url:str, types:int, data=None, test_func=None, header=None) 61 | 62 | ''' using basic requests ''' 63 | from util.util import basic_req 64 | basic_req(url: str, types: int, proxies=None, data=None, header=None, need_cookie: bool = False) 65 | ``` 66 | 67 | ## Structure 68 | 69 | ```bash 70 | . 71 | ├── LICENSE 72 | ├── README.md 73 | ├── bilibili 74 | │   ├── analysis.py // data analysis 75 | │   ├── bilibili.py // bilibili basic 76 | │   └── bsocket.py // bilibili websocket 77 | ├── blog 78 | │   └── titleviews.py // Zhihu && CSDN && jianshu 79 | ├── brushclass 80 | │   └── brushclass.py // PKU elective 81 | ├── buildmd 82 | │   └── buildmd.py // Youdao Note 83 | ├── eastmoney 84 | │   └── eastmoney.py // font analysis 85 | ├── exam 86 | │   ├── shaoq.js // jsdom 87 | │   └── shaoq.py // compile js shaoq 88 | ├── log 89 | ├── netease 90 | │   ├── netease_music_base.py 91 | │   ├── netease_music_db.py // Netease Music 92 | │   └── table.sql 93 | ├── news 94 | │   └── news.py // Google && Baidu 95 | ├── press 96 | │   └── press.py // Press text 97 | ├── proxy 98 | │   ├── getproxy.py // Proxy pool 99 | │   └── table.sql 100 | ├── requirement.txt 101 | ├── utils 102 | │   ├── db.py 103 | │   └── utils.py 104 | └── zimuzu 105 | └── zimuzu.py // zimuzi 106 | ``` 107 | 108 | ## Proxy pool 109 | 110 | > proxy pool is the heart of this project. 111 | 112 | - Highly Available Proxy IP Pool 113 | - By obtaining data from `Gatherproxy`, `Goubanjia`, `xici` etc. 
Free Proxy WebSite 114 | - Analysis of the Goubanjia port data 115 | - Quickly verify IP availability 116 | - Cooperate with Requests to automatically assign proxy Ip, with Retry mechanism, fail to write DB mechanism 117 | - two models for proxy shell 118 | - model 1: load gather proxy list && update proxy list file(need over the GFW, your personality passwd in http://gatherproxy.com to `proxy/data/passage` one line by username, one line by passwd) 119 | - model 0: update proxy pool db && test available 120 | - one common proxy api 121 | - `from proxy.getproxy import GetFreeProxy` 122 | - `proxy_req = GetFreeProxy().proxy_req` 123 | - `proxy_req(url: str, types: int, data=None, test_func=None, header=None)` 124 | - also one common basic req api 125 | - `from util import basic_req` 126 | - `basic_req(url: str, types: int, proxies=None, data=None, header=None)` 127 | - if you want spider by using proxy 128 | - because access proxy web need over the GFW, so maybe you can't use `model 1` to download proxy file. 129 | - download proxy txt from 'http://gatherproxy.com' 130 | - cp download_file proxy/data/gatherproxy 131 | - python proxy/getproxy.py --model==0 132 | 133 | ## Netease 134 | 135 | > Netease Music song playlist crawl - [netease/netease_music_db.py](https://github.com/iofu728/spider/blob/master/netease/netease_music_db.py) 136 | 137 | - problem: `big data store` 138 | - classify -> playlist id -> song_detail 139 | - V1 Write file, One run version, no proxy, no record progress mechanism 140 | - V1.5 Small amount of proxy IP 141 | - V2 Proxy IP pool, Record progress, Write to MySQL 142 | 143 | - Optimize the write to DB `Load data/ Replace INTO` 144 | 145 | - [Netease Music Spider for DB](https://wyydsb.xin/other/neteasedb.html) 146 | - [Netease Music Spider](https://wyydsb.xin/other/netease.html) 147 | 148 | ## Press Test System 149 | 150 | > Press Test System - [press/press.py](https://github.com/iofu728/spider/blob/master/press/press.py) 151 | 152 | - problem: `high concurrency requests` 153 | - By highly available proxy IP pool to pretend user. 154 | - Give some web service uneven pressure 155 | - To do: press uniform 156 | 157 | ## News 158 | 159 | > google & baidu info crawl- [news/news.py](https://github.com/iofu728/spider/blob/master/news/news.py) 160 | 161 | - get news from search engine by Proxy Engine 162 | - one model: careful analysis `DOM` 163 | - the other model: rough analysis `Chinese words` 164 | 165 | ## Youdao Note 166 | 167 | > Youdao Note documents crawl - [buildmd/buildmd.py](https://github.com/iofu728/spider/blob/master/buildmd/buildmd.py) 168 | 169 | - load data from `youdaoyun` 170 | - by series of rules to deal data to .md 171 | 172 | ## blog 173 | 174 | > csdn && zhihu && jianshu view info crawl - [blog/titleview.py](https://github.com/iofu728/spider/blob/master/blog/titleviews.py) 175 | 176 | ```bash 177 | $ python blog/titleviews.py --model=1 >> log 2>&1 # model = 1: load gather model or python blog/titleviews.py --model=1 >> proxy.log 2>&1 178 | $ python blog/titleviews.py --model=0 >> log 2>&1 # model = 0: update gather model 179 | ``` 180 | 181 | ## Brush Class 182 | 183 | > PKU Class brush - [brushclass/brushclass.py](https://github.com/iofu728/spider/blob/master/brushclass/brushclass.py) 184 | 185 | - when your expected class have places, It will send you some email. 
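A minimal sketch of that notification step, with assumed names (`notify` and its SMTP host/account parameters are illustrative) — the real brushclass script wires this up itself and keeps its mail settings in its own config:

```python
import smtplib
from email.mime.text import MIMEText

def notify(course: str, to_addr: str, smtp_host: str, user: str, password: str):
    """ illustrative only: mail yourself once the elective shows an empty place """
    msg = MIMEText("{} now has empty places, go elect it!".format(course))
    msg["Subject"] = "Brush Class: {}".format(course)
    msg["From"], msg["To"] = user, to_addr
    with smtplib.SMTP_SSL(smtp_host, 465) as server:  # 465 = implicit-TLS SMTP
        server.login(user, password)
        server.sendmail(user, [to_addr], msg.as_string())
```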
186 | 187 | ## zimuzu 188 | 189 | > ZiMuZu download list crawl - [zimuzu/zimuzu.py](https://github.com/iofu728/spider/blob/master/zimuzu/zimuzu.py) 190 | 191 | - when you want to download lots of show like Season 22, Season 21. 192 | - If click one by one, It is very boring, so zimuzu.py is all you need. 193 | - The thing you only need do is to wait for the program run. 194 | - And you copy the Thunder URL for one to download the movies. 195 | - Now The Winter will come, I think you need it to review ``. 196 | 197 | ## Bilibili 198 | 199 | > Get av data by http - [bilibili/bilibili.py](https://github.com/iofu728/spider/blob/master/bilibili/bilibili.py) 200 | 201 | - `homepage rank` -> check `tids` -> to check data every 2min(during on rank + one day) 202 | - monitor every rank av -> star num & basic data 203 | 204 | > Get av data by websocket - [bilibili/bsocket.py](https://github.com/iofu728/spider/blob/master/bilibili/bsocket.py) 205 | 206 | - base on WebSocket 207 | - byte analysis 208 | - heartbeat 209 | 210 | > Get comment data by http - [bilibili/bilibili.py](https://github.com/iofu728/spider/blob/master/bilibili/bilibili.py) 211 | 212 | - load comment from `/x/v2/reply` 213 | 214 | - UnicodeEncodeError: 'ascii' codec can't encode characters in position 7-10: ordinal not in range(128) 215 | 216 | - read/write in `utf-8` 217 | - with codecs.open(filename, 'r/w', encoding='utf-8') 218 | 219 | - `bilibili` some url return 404 like `http://api.bilibili.com/x/relation/stat?jsonp=jsonp&callback=__jp11&vmid=` 220 | 221 | basic_req auto add `host` to headers, but this URL can't request in ‘Host’ 222 | 223 | ## shaoq 224 | 225 | > Get text data by compiling javascript - [exam/shaoq.py](https://github.com/iofu728/spider/blob/master/exam/shaoq.py) 226 | 227 | - Idea 228 | 229 | 1. get cookie 230 | 2. request image 231 | 3. requests after 5.5s 232 | 4. compile javascript code -> get css 233 | 5. analysic css 234 | 235 | - Requirement 236 | 237 | ```sh 238 | pip3 install PyExecJS 239 | yarn install add jsdom # npm install jsdom PS: not global 240 | ``` 241 | 242 | - Can't get true html 243 | 244 | - Wait time must be 5.5s. 245 | - So you can use `threading` or `await asyncio.gather` to request image 246 | 247 | - [Coroutines and Tasks](https://docs.python.org/3/library/asyncio-task.html) 248 | 249 | - Error: Cannot find module 'jsdom' 250 | 251 | > jsdom must install in local not in global 252 | 253 | - [Cannot find module 'jsdom'](https://github.com/scala-js/scala-js/issues/2642) 254 | 255 | - remove subtree & edit subtree & re.findall 256 | 257 | ```py 258 | subtree.extract() 259 | subtree.string = new_string 260 | parent_tree.find_all(re.compile(''')) 261 | ``` 262 | 263 | - [extract()](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#extract) 264 | - [NavigableString](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#navigablestring) 265 | - [A regular expression](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#a-regular-expression) 266 | 267 | ## Eastmoney 268 | 269 | > Get stock info by analysis font - [eastmoney/eastmoney.py](https://github.com/iofu728/spider/blob/master/eastmoney/eastmoney.py) 270 | 271 | - font analysis 272 | 273 | - Idea 274 | 275 | 1. get data from HTML -> json 276 | 2. get font map -> transform num 277 | 3. 
or load font analysis font(contrast with base) 278 | 279 | - error: unpack requires a buffer of 20 bytes 280 | 281 | - requests.text -> str, 282 | - requests.content -> byte 283 | 284 | - [Struct.error: unpack requires a buffer of 16 bytes](https://stackoverflow.com/questions/51110525/struct-error-unpack-requires-a-buffer-of-16-bytes) 285 | 286 | - How to analysis font 287 | 288 | - use fonttools 289 | - get TTFont().getBestCamp() 290 | - contrast with base 291 | 292 | - configure file 293 | 294 | - cfg = ConfigParser() 295 | - cfg.read(assign_path, 'utf-8') 296 | - [13.10read configure file](https://python3-cookbook.readthedocs.io/zh_CN/latest/c13/p10_read_configuration_files.html) 297 | 298 | ## Ctrip Hotel Detail 299 | 300 | > Get Ctrip Hotel True Detail - [ctrip/hotelDetail.py](https://github.com/iofu728/spider/blob/master/ctrip/hotelDetail.py) 301 | 302 | - int32 303 | 304 | ```python 305 | np.int32() 306 | ``` 307 | 308 | - js charCodeAt() in py 309 | 310 | [python 中如何实现 js 里的 charCodeAt()方法?](https://www.zhihu.com/question/57108214) 311 | 312 | ```python 313 | ord(string[index]) 314 | ``` 315 | 316 | - python access file fold import 317 | 318 | ```python 319 | import sys 320 | sys.path.append(os.getcwd()) 321 | ``` 322 | 323 | - generate char list 324 | 325 | using ASCII 326 | 327 | ```python 328 | lower_char = [chr(i) for i in range(97,123)] # a-z 329 | upper_char = [chr(i) for i in range(65,91)] # A-Z 330 | ``` 331 | 332 | - Can't get cookie in `document.cookie` 333 | 334 | Service use `HttpOnly` in `Set-Cookie` 335 | 336 | - [Why doesn't document.cookie show all the cookie for the site?](https://stackoverflow.com/questions/1022112/why-doesnt-document-cookie-show-all-the-cookie-for-the-site) 337 | - [Secure and HttpOnly](https://en.wikipedia.org/wiki/HTTP_cookie#Secure_and_HttpOnly) 338 | 339 | > The Secure attribute is meant to keep cookie communication limited to encrypted transmission, directing browsers to use cookies only via secure/encrypted connections. However, if a web server sets a cookie with a secure attribute from a non-secure connection, the cookie can still be intercepted when it is sent to the user by **man-in-the-middle attacks**. Therefore, for maximum security, cookies with the Secure attribute should only be set over a secure connection. 340 | > 341 | > The HttpOnly attribute directs browsers not to expose cookies through channels other than HTTP (and HTTPS) requests. This means that the cookie cannot be accessed via client-side scripting languages (notably JavaScript), and therefore cannot be stolen easily via cross-site scripting (a pervasive attack technique). 
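Since an `HttpOnly` cookie never reaches `document.cookie`, the spider has to read it from the HTTP response itself — a minimal sketch with `requests` (the hotel URL is the same placeholder used above):

```python
import requests

resp = requests.get("https://hotels.ctrip.com/hotel/xxx.html",
                    headers={"User-Agent": "Mozilla/5.0"})
# HttpOnly only hides the cookie from client-side JS; the server still sends it over HTTP
print(resp.headers.get("Set-Cookie"))
print(requests.utils.dict_from_cookiejar(resp.cookies))
```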
342 | 343 | - ctrip cookie analysis 344 | 345 | | key | method | how | constant | login | finish | 346 | | ----------------------------- | ------ | --------------------------------------------------------------------------------------------------- | -------- | ----- | ------ | 347 | | `magicid` | set | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 1 | 348 | | `ASP.NET_SessionId` | set | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 1 | 349 | | `clientid` | set | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 1 | 350 | | `_abtest_userid` | set | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 1 | 351 | | `hoteluuid` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 352 | | `fcerror` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 353 | | `_zQdjfing` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 354 | | `OID_ForOnlineHotel` | js | `https://webresource.c-ctrip.com/ResHotelOnline/R8/search/js.merge/showhotelinformation.js` | 1 | 0 | 355 | | `_RSG` | req | `https://cdid.c-ctrip.com/chloro-device/v2/d` | 1 | 0 | 356 | | `_RDG` | req | `https://cdid.c-ctrip.com/chloro-device/v2/d` | 1 | 0 | 357 | | `_RGUID` | set | `https://cdid.c-ctrip.com/chloro-device/v2/d` | 1 | 0 | 358 | | `_ga` | js | for google analysis | 1 | 0 | 359 | | `_gid` | js | for google analysis | 1 | 0 | 360 | | `MKT_Pagesource` | js | `https://webresource.c-ctrip.com/ResUnionOnline/R3/float/floating_normal.min.js` | 1 | 0 | 361 | | `_HGUID` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 1 | 0 | 362 | | `HotelDomesticVisitedHotels1` | set | `https://hotels.ctrip.com/Domestic/tool/AjaxGetHotelAddtionalInfo.ashx` | 1 | 0 | 363 | | `_RF1` | req | `https://cdid.c-ctrip.com/chloro-device/v2/d` | 1 | 0 | 364 | | `appFloatCnt` | js | `https://webresource.c-ctrip.com/ResUnionOnline/R3/float/floating_normal.min.js?20190428` | 1 | 0 | 365 | | `gad_city` | set | `https://crm.ws.ctrip.com/Customer-Market-Proxy/AdCallProxyV2.aspx` | 1 | 0 | 366 | | `login_uid` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 | 367 | | `login_type` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 | 368 | | `cticket` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 | 369 | | `AHeadUserInfo` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 | 370 | | `ticket_ctrip` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 | 371 | | `DUID` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 | 372 | | `IsNonUser` | set | `https://accounts.ctrip.com/ssoproxy/ssoCrossSetCookie` | 1 | 1 | 373 | | `UUID` | req | `https://passport.ctrip.com/gateway/api/soa2/12770/setGuestData` | 1 | 1 | 374 | | `IsPersonalizedLogin` | js | `https://webresource.c-ctrip.com/ares2/basebiz/cusersdk/~0.0.8/default/login/1.0.0/loginsdk.min.js` | 1 | 1 | 375 | | `_bfi` | js | `https://webresource.c-ctrip.com/code/ubt/_bfa.min.js?v=20193_28.js` | 1 | 0 | 376 | | `_jzqco` | js | `https://webresource.c-ctrip.com/ResUnionOnline/R1/remarketing/js/mba_ctrip.js` | 1 | 0 | 377 | | `__zpspc` | js | `https://webresource.c-ctrip.com/ResUnionOnline/R1/remarketing/js/s.js` | 1 | 0 | 378 | | `_bfa` | js | `https://webresource.c-ctrip.com/code/ubt/_bfa.min.js?v=20193_28.js` | 1 | 0 | 379 | | `_bfs` | js | `https://webresource.c-ctrip.com/code/ubt/_bfa.min.js?v=20193_28.js` | 1 | 0 | 380 | | `utc` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 0 | 0 | 1 | 381 | | `htltmp` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 0 
| 0 | 1 | 382 | | `htlstm` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 0 | 0 | 1 | 383 | | `arp_scroll_position` | js | `https://hotels.ctrip.com/hotel/xxx.html` | 0 | 0 | 1 | 384 | 385 | - some fusion in ctrip 386 | 387 | ```js 388 | function a31(a233, a23, a94) { 389 | var a120 = { 390 | KWcVI: "mMa", 391 | hqRkQ: function a272(a309, a20) { 392 | return a309 + a20; 393 | }, 394 | WILPP: function a69(a242, a488) { 395 | return a242(a488); 396 | }, 397 | ydraP: function a293(a338, a255) { 398 | return a338 == a255; 399 | }, 400 | ceIER: ";expires=", 401 | mDTlQ: function a221(a234, a225) { 402 | return a234 + a225; 403 | }, 404 | dnvrD: function a268(a61, a351) { 405 | return a61 + a351; 406 | }, 407 | DIGJw: function a368(a62, a223) { 408 | return a62 == a223; 409 | }, 410 | pIWEz: function a260(a256, a284) { 411 | return a256 + a284; 412 | }, 413 | jXvnT: ";path=/", 414 | }; 415 | if (a120["KWcVI"] !== a120["KWcVI"]) { 416 | var a67 = new Date(); 417 | a67[a845("0x1a", "4Vqw")]( 418 | a120[a845("0x1b", "RswF")](a67["getDate"](), a94) 419 | ); 420 | document[a845("0x1c", "WjvM")] = 421 | a120[a845("0x1d", "3082")](a233, "=") + 422 | a120[a845("0x1e", "TDHu")](escape, a23) + 423 | (a120["ydraP"](a94, null) 424 | ? "" 425 | : a120["hqRkQ"](a120["ceIER"], a67[a845("0x1f", "IErH")]())) + 426 | a845("0x20", "eHIq"); 427 | } else { 428 | var a148 = a921(this, function() { 429 | var a291 = function() { 430 | return "dev"; 431 | }, 432 | a366 = function() { 433 | return "window"; 434 | }; 435 | var a198 = function() { 436 | var a168 = new RegExp("\\w+ *\\(\\) *{\\w+ *[' | '].+[' | '];? *}"); 437 | return !a168["test"](a291["toString"]()); 438 | }; 439 | var a354 = function() { 440 | var a29 = new RegExp("(\\[x|u](\\w){2,4})+"); 441 | return a29["test"](a366["toString"]()); 442 | }; 443 | var a243 = function(a2) { 444 | var a315 = ~-0x1 >> (0x1 + (0xff % 0x0)); 445 | if (a2["indexOf"]("i" === a315)) { 446 | a310(a2); 447 | } 448 | }; 449 | var a310 = function(a213) { 450 | var a200 = ~-0x4 >> (0x1 + (0xff % 0x0)); 451 | if (a213["indexOf"]((!![] + "")[0x3]) !== a200) { 452 | a243(a213); 453 | } 454 | }; 455 | if (!a198()) { 456 | if (!a354()) { 457 | a243("indеxOf"); 458 | } else { 459 | a243("indexOf"); 460 | } 461 | } else { 462 | a243("indеxOf"); 463 | } 464 | }); 465 | // a148(); 466 | var a169 = new Date(); 467 | a169["setDate"](a169["getDate"]() + a94); 468 | document["cookie"] = a120["mDTlQ"]( 469 | a120["dnvrD"]( 470 | a120["dnvrD"](a120["dnvrD"](a233, "="), escape(a23)), 471 | a120["DIGJw"](a94, null) 472 | ? "" 473 | : a120["pIWEz"](a120["ceIER"], a169["toGMTString"]()) 474 | ), 475 | a120["jXvnT"] 476 | ); 477 | } 478 | } 479 | ``` 480 | 481 | equal to 482 | 483 | ```js 484 | document["cookie"] = 485 | a233 + 486 | "=" + 487 | escape(a23) + 488 | (a94 == null ? "" : ";expires=" + a169["toGMTString"]()) + 489 | ";path=/"; 490 | ``` 491 | 492 | So, It is only a function to set cookie & expires. 493 | 494 | And you can think `a31` is a entry point to judge where code about compiler cookie. 
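For reference, the same de-obfuscated logic rewritten in Python (pure illustration of what `a31` computes, not part of the spider; `urllib.parse.quote` stands in for the JS `escape`, which is close but not byte-identical):

```python
from datetime import datetime, timedelta
from urllib.parse import quote

def build_cookie(name: str, value: str, days=None) -> str:
    """ name=escape(value);expires=<GMT string>;path=/  — what a31 boils down to """
    expires = "" if days is None else ";expires=" + \
        (datetime.utcnow() + timedelta(days=days)).strftime("%a, %d %b %Y %H:%M:%S GMT")
    return name + "=" + quote(value) + expires + ";path=/"
```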
495 | 496 | - Get current timezone offset 497 | 498 | ```python 499 | import datetime, tzlocal 500 | local_tz = tzlocal.get_localzone() 501 | timezone_offset = -int(local_tz.utcoffset(datetime.datetime.today()).total_seconds() / 60) 502 | ``` 503 | 504 | - JSON.stringfy(e) 505 | 506 | ```python 507 | import json 508 | json.dumps(e, separators=(',', ':')) 509 | ``` 510 | 511 | - [JSON.stringify (Javascript) and json.dumps (Python) not equivalent on a list?](https://stackoverflow.com/questions/46227854/json-stringify-javascript-and-json-dumps-python-not-equivalent-on-a-list) 512 | 513 | - Element​.get​Bounding​Client​Rect() 514 | 515 | return Element position 516 | 517 | - [Element​.get​Bounding​Client​Rect()](https://developer.mozilla.org/en-US/docs/Web/API/Element/getBoundingClientRect) 518 | - [​Event​Target​.add​Event​Listener()](https://developer.mozilla.org/en-US/docs/Web/API/EventTarget/addEventListener) 519 | 520 | ## DouBan 521 | 522 | - RuntimeError: dictionary changed size during iteration (when user pickle) 523 | 524 | - This situation maybe happen when your pickle params change in pickling. 525 | - so copy of your params before pickle 526 | 527 | ```python 528 | comment_loader = comment.copy() 529 | dump_bigger(comment_loader, '{}data.pkl'.format(data_dir)) 530 | ``` 531 | 532 | [How to avoid “RuntimeError: dictionary changed size during iteration” error?](https://stackoverflow.com/questions/11941817/how-to-avoid-runtimeerror-dictionary-changed-size-during-iteration-error) 533 | [pickling SimpleLazyObject fails just after accessing related object of wrapped model instance.](https://code.djangoproject.com/ticket/25426) 534 | 535 | - RecursionError: maximum recursion depth exceeded while pickling an object 536 | 537 | - object depth more than MAXIMUM stack depth 538 | 539 | ```python 540 | import sys 541 | sys.setrecursionlimit(10000) 542 | ``` 543 | 544 | ## 66ip 545 | 546 | > Q: @liu wong 一段 js 代码在浏览器上执行的结果和在 python 上用 execjs 执行的结果不一样,有啥原因呢? 
http://www.66ip.cn/ 547 | 548 | > A: 一般 eval 差异 主要是有编译环境,DOM,py 与 js 的字符规则,context 等有关 549 | > 像 66ip 这个网站,主要是从 py 与 js 的字符规则不同 + DOM 入手的,当然它也有可能是无意的(毕竟爬虫工程师用的不只是 py) 550 | > 首次访问 66ip 这个网站,会返回一个 521 的 response,header 里面塞了一个 HTTP-only 的 cookie,body 里面塞了一个 script 551 | 552 | ```js 553 | var x = "@...".replace(/@*$/, "").split("@"), 554 | y = "...", 555 | f = function(x, y) { 556 | return num; 557 | }, 558 | z = f( 559 | y 560 | .match(/\w/g) 561 | .sort(function(x, y) { 562 | return f(x) - f(y); 563 | }) 564 | .pop() 565 | ); 566 | while (z++) 567 | try { 568 | eval( 569 | y.replace(/\b\w+\b/g, function(y) { 570 | return x[f(y, z) - 1] || "_" + y; 571 | }) 572 | ); 573 | break; 574 | } catch (_) {} 575 | ``` 576 | 577 | > 可以看到 eval 的是 y 字符串用 x 数组做了一个字符替换之后的结果,所以按道理应该和编译环境没有关系,但把 eval 改成 aa 之后放在 py 和放在 node,chrome 中编译结果却不一样 578 | > 这是因为在 p 正则\b 会被转义为\x80,这就会导致正则匹配不到,就更不可能替换了,导致我们拿到的 eval_script 实际上是一串乱码 579 | > 这里用 r'{}'.format(eval_script) 来防止特殊符号被转义 580 | > 剩下的就是 对拿到的 eval_script 进行 dom 替换操作 581 | > 总的来说是一个挺不错的 js 逆向入门练手项目, 代码量不大,逻辑清晰 582 | > 具体代码参见[iofu728/spider](https://github.com/iofu728/spider/blob/master/proxy/ip66.py) 583 | 584 | ![image](https://cdn.nlark.com/yuque/0/2019/png/104214/1557240022438-bc891ec5-7bbc-412a-b4d4-f330608d21f0.png) 585 | 586 | ## OceanBall V2 587 | 588 | check param list: 589 | 590 | | param | Ctrip | Incognito | Node | !!import | 591 | | ------------ | ----- | --------- | ---- | -------- | 592 | | define | ✔ | x | x | 593 | | \_\_filename | x | x | x | 594 | | module | x | x | ✔ | x | 595 | | process | ✔ | x | ✔ | 596 | | \_\_dirname | ✔ | x | x | 597 | | global | x | x | ✔ | x | 598 | | INT_MAX | ✔ | x | x | 599 | | require | ✔ | x | ✔ | ✔ | 600 | | History | ✔ | x | 601 | | Location | ✔ | x | 602 | | Window | ✔ | x | 603 | | Document | ✔ | x | 604 | | window | ✔ | x | 605 | | navigator | ✔ | x | 606 | | history | ✔ | x | 607 | 608 | **----To be continued----** 609 | -------------------------------------------------------------------------------- /bilibili/analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-04-04 10:57:24 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2020-03-24 01:37:39 6 | 7 | import pandas as pd 8 | import numpy as np 9 | import time 10 | import os 11 | from util.util import time_stamp, echo, read_file 12 | 13 | root_dir = os.path.abspath('bilibili') 14 | data_dir = os.path.join(root_dir, 'data/') 15 | history_data_dir = os.path.join(data_dir, 'history_data/') 16 | history_dir = os.path.join(data_dir, 'history/') 17 | 18 | 19 | def analysis_csv(): 20 | data_dir = 'bilibili/' 21 | df = pd.read_csv('%spublic.csv' % data_dir) 22 | 23 | '''one day''' 24 | df['fan'] = df['3'].fillna(0) 25 | df['time'] = df['1'].map(lambda x: x.split(None, 1)[1]) 26 | df['fanadd'] = df['4'] - df['3'] 27 | df['fanadd'] = df['fanadd'].map(lambda x: x if x > 0 else 0) 28 | df['fanadd_ratio'] = df['fanadd'] / df['3'] 29 | df['fanadd_ratio'] = df['fanadd_ratio'].replace( 30 | [np.inf, -np.inf], np.nan).fillna(0) 31 | df['viewadd'] = (df['18'] - df['6']).fillna(0) 32 | df['viewadd'] = df['viewadd'].map(lambda x: x if x > 0 else 0) 33 | df['viewadd_ratio'] = (df['viewadd'] / df['6']).replace( 34 | [np.inf, -np.inf], np.nan).fillna(0) 35 | df['view_fan'] = (df['viewadd'] / df['3']).replace( 36 | [np.inf, -np.inf], np.nan).fillna(0) 37 | df['view_fan_20'] = df['view_fan'].map(lambda x: x if x < 20 else 0) 38 | df['view_fanadd'] = 
(df['viewadd'] / df['fanadd']).replace( 39 | [np.inf, -np.inf], np.nan).fillna(0) 40 | 41 | '''seven day''' 42 | df['seven'] = df['1'].map(lambda x: '1970-01-%d %s' % (int(time.strftime( 43 | "%w", time.strptime(x, "%Y-%m-%d %H:%M:%S"))) + 4, x.split(None, 1)[1])) 44 | need_columns = ['time', 'fan', 'fanadd', 'fanadd_ratio', 45 | 'viewadd', 'viewadd_ratio', 'view_fan', 'view_fan_20', 'view_fanadd', 'seven'] 46 | result_df = pd.DataFrame(df, columns=need_columns) 47 | result_df.to_csv('%spublic_re.csv' % data_dir, index=False) 48 | 49 | 50 | def clean_csv(av_id: int): 51 | ''' clean csv ''' 52 | csv_path = os.path.join(history_dir, '{}.csv'.format(av_id)) 53 | output_path = os.path.join(history_data_dir, '{}_new.csv'.format(av_id)) 54 | print(csv_path) 55 | csv = read_file(csv_path) 56 | last_time, last_view = csv[0].split(',')[:2] 57 | result = [csv[0]] 58 | last_time = time_stamp(last_time) 59 | last_view = int(last_view) 60 | empty_line = ','.join([' '] * (len(csv[0].split(',')) + 1)) 61 | for line in csv[1:]: 62 | now_time, now_view = line.split(',')[:2] 63 | now_time = time_stamp(now_time) 64 | now_view = int(now_view) 65 | time_gap = now_time - last_time 66 | 67 | if now_view < last_view or now_view - last_view > 5000: 68 | continue 69 | if abs(time_gap) > 150: 70 | for ii in range(int((time_gap - 30) // 120)): 71 | result.append(empty_line) 72 | if abs(time_gap) > 90: 73 | result.append(line) 74 | last_view, last_time = now_view, now_time 75 | with open(output_path, 'w') as f: 76 | f.write('\n'.join(result)) 77 | -------------------------------------------------------------------------------- /bilibili/assign_up.ini.tmp: -------------------------------------------------------------------------------- 1 | [basic] 2 | bv_id = BV1GW411g7mc 3 | av_id = 21061574 4 | basic_av_p = -1 5 | author = 还有一天就放假了 6 | mid = 7792521 7 | rank_id = 119 8 | tid = 126 9 | view_abnormal = 1000 10 | history_check_list = 1,3,6 11 | ;split by ',' 12 | special_info_email = 123456@163.com 13 | assign_email = 123456@163.com 14 | 15 | [assign] 16 | av_ids = 21061574,11624347 17 | bv_ids = BV1GW411g7mc 18 | ;split by ',' 19 | 20 | [comment] 21 | keyword = 死全家|草泥马|.{0,4}\$\$_.{0,4} 22 | ;support re, use '|' split 23 | ignore_list = ^[2-3].* 24 | ignore_rpid = {"21061574":["242-2"],"21062574":["1242"],"21061577":["1284"]} 25 | ignore_start = 0.5 26 | ignore_end = 8.5 27 | email_limit = 5 28 | 29 | [login] 30 | username = 123 31 | password = 123 -------------------------------------------------------------------------------- /bilibili/basicBilibili.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-09-14 14:49:01 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2020-06-06 12:48:53 6 | 7 | import json 8 | import os 9 | import shutil 10 | import sys 11 | import urllib 12 | from configparser import ConfigParser 13 | 14 | sys.path.append(os.getcwd()) 15 | from proxy.getproxy import GetFreeProxy 16 | from util.util import can_retry, get_accept, basic_req 17 | 18 | 19 | one_day = 86400 20 | root_dir = os.path.abspath("bilibili") 21 | data_dir = os.path.join(root_dir, "data/") 22 | assign_path = os.path.join(root_dir, "assign_up.ini") 23 | if not os.path.exists(assign_path): 24 | shutil.copy(assign_path + ".tmp", assign_path) 25 | 26 | 27 | class BasicBilibili(object): 28 | BILIBILI_URL = "https://www.bilibili.com" 29 | BASIC_AV_URL = "http://www.bilibili.com/video/av%d" 30 | BASIC_BV_URL = 
"http://www.bilibili.com/video/%s" 31 | ARCHIVE_STAT_URL = "http://api.bilibili.com/x/web-interface/archive/stat?aid=%d" 32 | VIEW_URL = "http://api.bilibili.com/x/web-interface/view?bvid=%s" 33 | RELATION_STAT_URL = ( 34 | "http://api.bilibili.com/x/relation/stat?jsonp=jsonp&callback=__jp0&vmid=%d" 35 | ) 36 | BASIC_RANKING_URL = "https://www.bilibili.com/ranking/all/%d/" 37 | SPACE_AVS_URL = ( 38 | "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=1&ps=50&jsonp=jsonp" 39 | ) 40 | REPLY_V2_URL = ( 41 | "http://api.bilibili.com/x/v2/reply?jsonp=jsonp&pn=%d&type=1&oid=%d&sort=%d" 42 | ) 43 | RANKING_URL = "https://api.bilibili.com/x/web-interface/ranking?rid=%d&day=%d&type=1&arc_type=%d&jsonp=jsonp&callback=__jp1" 44 | PLAYLIST_URL = "https://api.bilibili.com/x/player/pagelist?aid=%d&jsonp=jsonp" 45 | DM_URL = "https://api.bilibili.com/x/v1/dm/list.so?oid=%d" 46 | GET_KEY_URL = "http://passport.bilibili.com/login?act=getkey&r=%f" 47 | LOGIN_URL = "https://passport.bilibili.com/login" 48 | LOGIN_V2_URL = "https://passport.bilibili.com/web/login/v2" 49 | LOGIN_OAUTH_URL = "https://passport.bilibili.com/api/v2/oauth2/login" 50 | CAPTCHA_URL = "https://passport.bilibili.com/web/captcha/combine?plat=11" 51 | GET_KEY_URL = "https://passport.bilibili.com/api/oauth2/getKey" 52 | GETTYPE_URL = "https://api.geetest.com/gettype.php?gt=%s&callback=geetest_%d" 53 | M_BILIBILI_URL = "https://m.bilibili.com/video/%s" 54 | NO_RANK_CONSTANT = "No rank.....No Rank......No Rank....." 55 | JSON_KEYS = ["code", "message", "ttl", "data"] 56 | T_FORMAT = "%m-%d %H:%M" 57 | 58 | def __init__(self): 59 | super(BasicBilibili, self).__init__() 60 | self.proxy_req = GetFreeProxy().proxy_req 61 | self.del_map = {} 62 | self.rank_map = {} 63 | self.load_configure() 64 | 65 | def load_configure(self): 66 | """ load assign configure """ 67 | cfg = ConfigParser() 68 | cfg.read(assign_path, "utf-8") 69 | self.assign_author = cfg.get("basic", "author") 70 | mid = cfg["basic"]["mid"] 71 | self.assign_mid = int(mid) if len(mid) else -1 72 | self.assign_rank_id = cfg.getint("basic", "rank_id") 73 | self.assign_tid = cfg.getint("basic", "tid") 74 | self.basic_bv_id = cfg.get("basic", "bv_id") 75 | self.view_abnormal = cfg.getint("basic", "view_abnormal") 76 | self.assign_ids = cfg.get("assign", "bv_ids").split(",") 77 | rank_map = {ii: {} for ii in self.assign_ids if ii not in self.del_map} 78 | self.rank_map = {**rank_map, **self.rank_map} 79 | self.keyword = cfg.get("comment", "keyword") 80 | self.ignore_rpid = json.loads(cfg.get("comment", "ignore_rpid")) 81 | self.ignore_list = cfg.get("comment", "ignore_list") 82 | self.ignore_start = cfg.getfloat("comment", "ignore_start") 83 | self.ignore_end = cfg.getfloat("comment", "ignore_end") 84 | self.email_limit = cfg.getint("comment", "email_limit") 85 | self.AV_URL = self.BASIC_BV_URL % self.basic_bv_id 86 | self.history_check_list = [ 87 | int(ii) for ii in cfg.get("basic", "history_check_list").split(",") 88 | ] 89 | self.special_info_email = cfg.get("basic", "special_info_email").split(",") 90 | self.assign_rec = cfg.get("basic", "assign_email").split(",") 91 | self.username = urllib.parse.quote_plus(cfg.get("login", "username")) 92 | self.password = cfg.get("login", "password") 93 | 94 | def get_api_req(self, url: str, bv_id: str, types: int = 0, is_proxy: bool = True): 95 | r_req = self.proxy_req if is_proxy else basic_req 96 | if types == 0: 97 | req = r_req(url, 1, header=self.get_api_headers(bv_id)) 98 | else: 99 | req = r_req(url, 3, 
header=self.get_api_headers(bv_id)) 100 | req = self.decoder_jp(req) 101 | if req is None or list(req.keys()) != self.JSON_KEYS: 102 | if can_retry(url): 103 | return self.get_api_req(url, bv_id, types) 104 | else: 105 | return 106 | return req["data"] 107 | 108 | def get_api_headers(self, bv_id: str, types: int = 0) -> dict: 109 | if isinstance(bv_id, int): 110 | bv_id = "av{}".format(bv_id) 111 | if types == 0: 112 | return {"Accept": "*/*", "Referer": self.BASIC_BV_URL % bv_id} 113 | if types == 1: 114 | return {"Accept": get_accept("html"), "Host": self.BILIBILI_URL} 115 | 116 | def update_ini(self, bv_id: str, av_id: int): 117 | cfg = ConfigParser() 118 | cfg.read(assign_path, "utf-8") 119 | cfg.set("basic", "bv_id", bv_id) 120 | cfg.set("basic", "av_id", str(bv_id)) 121 | bv_ids = cfg.get("assign", "bv_ids") 122 | cfg.set("assign", "bv_ids", "{},{}".format(bv_ids, bv_id)) 123 | cfg.write(open(assign_path, "w")) 124 | 125 | def decoder_jp(self, text: str) -> dict: 126 | star_begin = text.find("{") 127 | if star_begin == -1: 128 | return {} 129 | star_json = text[star_begin:-1] 130 | try: 131 | return json.loads(star_json) 132 | except: 133 | return {} 134 | 135 | def update_proxy_basic(self): 136 | self.proxy_req = GetFreeProxy().proxy_req 137 | -------------------------------------------------------------------------------- /bilibili/bsocket.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-03-26 10:21:05 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2020-06-06 11:32:49 6 | 7 | 8 | import asyncio 9 | import codecs 10 | import json 11 | import logging 12 | import os 13 | import shutil 14 | import struct 15 | import sys 16 | import time 17 | from collections import namedtuple 18 | from configparser import ConfigParser 19 | from enum import IntEnum 20 | from ssl import _create_unverified_context 21 | 22 | import aiohttp 23 | import regex 24 | 25 | sys.path.append(os.getcwd()) 26 | from proxy.getproxy import GetFreeProxy 27 | from util.util import basic_req, can_retry, echo, mkdir, time_stamp, time_str 28 | 29 | logger = logging.getLogger(__name__) 30 | proxy_req = GetFreeProxy().proxy_req 31 | data_dir = "bilibili/data/" 32 | websocket_dir = "%swebsocket/" % data_dir 33 | assign_path = "bilibili/assign_up.ini" 34 | one_day = 86400 35 | 36 | """ 37 | * bilibili @websocket 38 | * www.bilibili.com/video/av{av_id} 39 | * wss://broadcast.chat.bilibili.com:7823/sub 40 | """ 41 | 42 | 43 | class Operation(IntEnum): 44 | SEND_HEARTBEAT = 2 45 | ONLINE = 3 46 | COMMAND = 5 47 | AUTH = 7 48 | RECV = 8 49 | NESTED = 9 50 | DANMAKU = 1000 51 | 52 | 53 | class BWebsocketClient: 54 | """ bilibili websocket client """ 55 | 56 | ROOM_INIT_URL = "https://www.bilibili.com/video/bv%s" 57 | WEBSOCKET_URL = "wss://broadcast.chat.bilibili.com:7823/sub" 58 | PLAYLIST_URL = "https://api.bilibili.com/x/player/pagelist?bvid=%s&jsonp=jsonp" 59 | HEARTBEAT_BODY = "[object Object]" 60 | JSON_KEYS = ["code", "message", "ttl", "data"] 61 | 62 | HEADER_STRUCT = struct.Struct(">I2H2IH") 63 | HeaderTuple = namedtuple( 64 | "HeaderTuple", 65 | ("total_len", "header_len", "proto_ver", "operation", "time", "zero"), 66 | ) 67 | _COMMAND_HANDLERS = { 68 | "DM": lambda client, command: client._on_get_danmaku( 69 | command["info"][1], command["info"][0] 70 | ) 71 | } 72 | 73 | def __init__(self, av_id: int, bv_id: str, types: int = 0, p: int = -1): 74 | """ init class """ 75 | self._av_id = av_id 76 | 
self._bv_id = bv_id 77 | self._room_id = None 78 | self._count = 1 79 | self._types = types 80 | self._begin_time = int(time_stamp()) 81 | self._loop = asyncio.get_event_loop() 82 | self._session = aiohttp.ClientSession(loop=self._loop) 83 | self._is_running = False 84 | self._websocket = None 85 | self._p = p if p > 0 else 1 86 | self._getroom_id() 87 | 88 | async def close(self): 89 | await self._session.close() 90 | 91 | def run(self): 92 | """ Create Thread """ 93 | if self._is_running: 94 | raise RuntimeError("This client is already running") 95 | self._is_running = True 96 | return asyncio.ensure_future(self._message_loop(), loop=self._loop) 97 | 98 | def get_cid(self, bv_id: str): 99 | playlist_url = self.PLAYLIST_URL % bv_id 100 | headers = {"Accept": "*/*", "Referer": self.ROOM_INIT_URL % bv_id} 101 | req = basic_req(playlist_url, 1, header=headers) 102 | if req is None or list(req.keys()) != self.JSON_KEYS: 103 | return 104 | cid = [ii["cid"] for ii in req["data"]] 105 | return cid 106 | 107 | def _getroom_id(self, proxy: bool = True): 108 | """ get av room id """ 109 | cid = self.get_cid(self._bv_id) 110 | assert ( 111 | cid and len(cid) >= self._p 112 | ), "Actual Page len: {} <=> Need Pages Num: {}".format(len(cid), self._p) 113 | self._room_id = int(cid[self._p - 1]) 114 | echo(3, "Room_id:", self._room_id) 115 | 116 | def parse_struct(self, data: dict, operation: int): 117 | """ parse struct """ 118 | assert ( 119 | int(time_stamp()) < self._begin_time + 7 * one_day 120 | ), "Excess Max RunTime!!!" 121 | 122 | if operation == 7: 123 | body = json.dumps(data).replace(" ", "").encode("utf-8") 124 | else: 125 | body = self.HEARTBEAT_BODY.encode("utf-8") 126 | header = self.HEADER_STRUCT.pack( 127 | self.HEADER_STRUCT.size + len(body), 128 | self.HEADER_STRUCT.size, 129 | 1, 130 | operation, 131 | self._count, 132 | 0, 133 | ) 134 | self._count += 1 135 | return header + body 136 | 137 | async def _send_auth(self): 138 | """ send auth """ 139 | auth_params = { 140 | "room_id": "video://%d/%d" % (self._av_id, self._room_id), 141 | "platform": "web", 142 | "accepts": [1000], 143 | } 144 | await self._websocket.send_bytes(self.parse_struct(auth_params, Operation.AUTH)) 145 | 146 | async def _message_loop(self): 147 | """ loop sent message """ 148 | 149 | if self._room_id is None: 150 | self._getroom_id() 151 | 152 | while True: 153 | heartbeat_con = None 154 | try: 155 | async with self._session.ws_connect(self.WEBSOCKET_URL) as websocket: 156 | self._websocket = websocket 157 | await self._send_auth() 158 | heartbeat_con = asyncio.ensure_future( 159 | self._heartbeat_loop(), loop=self._loop 160 | ) 161 | 162 | async for message in websocket: 163 | if message.type == aiohttp.WSMsgType.BINARY: 164 | await self._handle_message(message.data, 0) 165 | else: 166 | logger.warning( 167 | "Unknown Message type = %s %s", 168 | message.type, 169 | message.data, 170 | ) 171 | 172 | except asyncio.CancelledError: 173 | break 174 | except aiohttp.ClientConnectorError: 175 | logger.warning("Retrying */*/*/*/---") 176 | try: 177 | await asyncio.sleep(5) 178 | except asyncio.CancelledError: 179 | break 180 | finally: 181 | if heartbeat_con is not None: 182 | heartbeat_con.cancel() 183 | try: 184 | await heartbeat_con 185 | except asyncio.CancelledError: 186 | break 187 | self._websocket = None 188 | 189 | self._is_running = False 190 | 191 | async def _heartbeat_loop(self): 192 | """ heart beat every 30s """ 193 | if self._types and int(time_stamp()) > self._begin_time + one_day: 194 | 
self.close() 195 | for _ in range(int(one_day * 7 / 30)): 196 | try: 197 | await self._websocket.send_bytes( 198 | self.parse_struct({}, Operation.SEND_HEARTBEAT) 199 | ) 200 | await asyncio.sleep(30) 201 | except (asyncio.CancelledError, aiohttp.ClientConnectorError): 202 | break 203 | 204 | async def _handle_message(self, message: str, offset: int = 0): 205 | """ handle message""" 206 | while offset < len(message): 207 | try: 208 | header = self.HeaderTuple( 209 | *self.HEADER_STRUCT.unpack_from(message, offset) 210 | ) 211 | body = message[ 212 | offset + self.HEADER_STRUCT.size : offset + header.total_len 213 | ] 214 | if ( 215 | header.operation == Operation.ONLINE 216 | or header.operation == Operation.COMMAND 217 | ): 218 | body = json.loads(body.decode("utf-8")) 219 | if header.operation == Operation.ONLINE: 220 | await self._on_get_online(body) 221 | else: 222 | await self._handle_command(body) 223 | elif header.operation == Operation.RECV: 224 | print("Connect Build!!!") 225 | elif header.operation == Operation.NESTED: 226 | offset += self.HEADER_STRUCT.size 227 | continue 228 | elif header.operation == Operation.DANMAKU: 229 | body = json.loads(body.decode("utf-8")) 230 | print(body) 231 | print(">>>>DANMAKU tail socket>>>>") 232 | else: 233 | logger.warning( 234 | "Unknown operation = %d %s %s", header.operation, header, body 235 | ) 236 | offset += header.total_len 237 | except: 238 | pass 239 | 240 | async def _handle_command(self, command): 241 | if isinstance(command, list): 242 | for one_command in command: 243 | await self._handle_command(one_command) 244 | return 245 | 246 | cmd = command["cmd"] 247 | if cmd in self._COMMAND_HANDLERS: 248 | handler = self._COMMAND_HANDLERS[cmd] 249 | if handler is not None: 250 | await handler(self, command) 251 | else: 252 | logger.warning("Unknown Command = %s %s", cmd, command) 253 | 254 | async def _on_get_online(self, online): 255 | """ get online num """ 256 | pass 257 | 258 | async def _on_get_danmaku(self, content, user_name): 259 | """ get danmaku """ 260 | pass 261 | 262 | 263 | class OneBWebsocketClient(BWebsocketClient): 264 | """ get one bilibili websocket client """ 265 | 266 | async def _on_get_online(self, online): 267 | online = online["data"]["room"]["online"] 268 | with codecs.open(self.get_path("online"), "a", encoding="utf-8") as f: 269 | f.write(self.get_data([online])) 270 | print("Online:", online) 271 | 272 | async def _on_get_danmaku(self, content, user_name): 273 | with codecs.open(self.get_path("danmaku"), "a", encoding="utf-8") as f: 274 | f.write(self.get_data([content, user_name])) 275 | print(content, user_name) 276 | 277 | def get_data(self, origin_data: list) -> str: 278 | """ get data """ 279 | return ",".join(str(ii) for ii in [time_str(), *origin_data]) + "\n" 280 | 281 | def get_path(self, types: str) -> str: 282 | """ get path """ 283 | p_path = "_p%d" % self._p if self._p != -1 else "" 284 | return "%s%d_%s%s.csv" % (websocket_dir, self._av_id, types, p_path) 285 | 286 | 287 | async def async_main(av_id: int, bv_id: str, types: int, p: int): 288 | client = OneBWebsocketClient(av_id, bv_id, types, p=p) 289 | future = client.run() 290 | try: 291 | await future 292 | finally: 293 | await client.close() 294 | 295 | 296 | def BSocket(av_id: int, bv_id: str, types: int = 0, p: int = -1): 297 | """ build a loop websocket connect""" 298 | loop = asyncio.get_event_loop() 299 | try: 300 | loop.run_until_complete(async_main(av_id, bv_id, types, p)) 301 | finally: 302 | loop.close() 303 | 304 | 305 | if 
__name__ == "__main__": 306 | mkdir(data_dir) 307 | mkdir(websocket_dir) 308 | if not os.path.exists(assign_path): 309 | shutil.copy(assign_path + ".tmp", assign_path) 310 | 311 | """ Test for San Diego demon """ 312 | """ PS: the thread of BSocket have to be currentThread in its processing. """ 313 | if len(sys.argv) == 4: 314 | bv_id = sys.argv[2] 315 | av_id = int(sys.argv[1]) 316 | p = int(sys.argv[3]) 317 | else: 318 | cfg = ConfigParser() 319 | cfg.read(assign_path, "utf-8") 320 | av_id = cfg.getint("basic", "av_id") 321 | bv_id = cfg.getint("basic", "bv_id") 322 | p = cfg.getint("basic", "basic_av_p") if len(cfg["basic"]["basic_av_p"]) else -1 323 | 324 | BSocket(av_id, bv_id, p=p) 325 | -------------------------------------------------------------------------------- /bilibili/geetestE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-09-15 19:25:31 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2019-10-09 23:23:44 6 | 7 | import os 8 | import sys 9 | import time 10 | 11 | import numpy as np 12 | 13 | sys.path.append(os.getcwd()) 14 | from util.util import echo, time_stamp 15 | 16 | g = '0123456789abcdefghijklmnopqrstuvwxyz' 17 | FV = 4503599627370496 18 | DV = 268435456 19 | DB = 28 20 | DM = DV - 1 21 | F1 = 24 22 | F2 = 4 23 | 24 | 25 | class S(object): 26 | def __init__(self): 27 | e = [(255 & int(65536 * np.random.random())) for _ in range(256)] 28 | S = [ii for ii in range(256)] 29 | n = 0 30 | for t in range(256): 31 | n = (n + S[t] + e[t % len(e)]) & 255 32 | S[t], S[n] = S[n], S[t] 33 | self.S = S 34 | self.i = 0 35 | self.j = 0 36 | 37 | def __call__(self): 38 | if not len(self.S): 39 | self.get_S() 40 | self.i = (self.i + 1) & 255 41 | self.j = (self.j + self.S[self.i]) & 255 42 | self.S[self.i], self.S[self.j] = self.S[self.j], self.S[self.i] 43 | return self.S[(self.S[self.i] + self.S[self.j]) & 255] 44 | 45 | 46 | class E(object): 47 | def __init__(self, e=None, t: int = 256): 48 | super(E, self).__init__() 49 | self.t = 0 50 | self.s = 0 51 | self.E = {} 52 | self.T = {} 53 | if e is not None: 54 | if type(e) == int and e == 1: 55 | self.one() 56 | else: 57 | self.prepare_E(e, t) 58 | 59 | def __call__(self, e: list, t: int): 60 | self.prepare_E(e, t) 61 | 62 | def prepare_E(self, e: list, t: int): 63 | n, r, o, i, a, s = int(np.log2(t)), 0, 0, len(e) - 1, False, 0 64 | p = {ii + 48: ii for ii in range(10)} 65 | p = {**p, **{ii + 55: ii for ii in range(10, 36)}} 66 | p = {**p, **{ii + 87: ii for ii in range(10, 36)}} 67 | 68 | while 0 <= i: 69 | if n == 8: 70 | c = 255 & e[i] 71 | else: 72 | idx = ord(e[i]) 73 | c = p[idx] if idx in p else -1 74 | if c < 0: 75 | if '-' == e[i]: 76 | a = True 77 | i -= 1 78 | continue 79 | a = False 80 | if s == 0: 81 | self.E[self.t] = c 82 | self.t += 1 83 | elif s + n > DB: 84 | self.E[self.t - 1] = self.E[self.t 85 | - 1] | ((c & (1 << DB - 3) - 1) << s) 86 | self.E[self.t] = c >> DB - s 87 | self.t += 1 88 | else: 89 | self.E[self.t - 1] = self.E[self.t - 1] | (c << s) 90 | s += n 91 | if s >= DB: 92 | s -= DB 93 | i -= 1 94 | if n == 8 and (128 & e[0]): 95 | self.s = -1 96 | if s > 0: 97 | self.E[self.t - 1] = self.E[self.t 98 | - 1] | ((1 << DB - s) - 1 << s) 99 | self.clamp() 100 | if a: 101 | echo(0, 'a is True') 102 | E().subTo(self, self) 103 | 104 | def clamp(self): 105 | ee = self.s & DM 106 | while 0 < self.t and self.E[self.t - 1] == ee: 107 | self.t -= 1 108 | 109 | def get_T(self): 110 | 
self.T['m'] = self 111 | self.T['mp'] = self.invDigit() 112 | self.T['mpl'] = 32767 & self.T['mp'] 113 | self.T['mph'] = self.T['mp'] >> 15 114 | self.T['um'] = (1 << DB - 15) - 1 115 | self.T['mt2'] = 2 * self.t 116 | 117 | def invDigit(self): 118 | if self.t < 1: 119 | return 0 120 | e = self.E[0] 121 | if (0 == (1 & e)): 122 | return 0 123 | t = 3 & e 124 | t = t * (2 - (15 & e) * t) & 15 125 | t = t * (2 - (255 & e) * t) & 255 126 | t = t * (2 - ((65535 & e) * t & 65535)) & 65535 127 | t = t * (2 - e * t % DV) % DV 128 | return DV - t if t > 0 else -t 129 | 130 | def modPowInt(self, e: int, t): 131 | if e >= 256 and not t.isEven(): 132 | t.get_T() 133 | else: 134 | echo(0, 'e:', e, 'isEven:', self.isEven()) 135 | return self.exp(e, t) 136 | 137 | def exp(self, e: int, t): 138 | n, r = E(), E() 139 | i = self.y(e) - 2 140 | o = t.convert(self) 141 | o.copyTo(n) 142 | while 0 <= i: 143 | t.sqrTo(n, r) 144 | if (e & 1 << i) > 0: 145 | t.mulTo(r, o, n) 146 | else: 147 | n, r = r, n 148 | i -= 1 149 | return t.revert(n) 150 | 151 | def convert(self, e): 152 | t = E() 153 | e.dlShiftTo(self.T['m'].t, t) 154 | t.divRemTo(self.T['m'], t) 155 | if e.s < 0 and t.compareTo(E()) > 0: 156 | self.T['m'].subTo(t, t) 157 | return t 158 | 159 | def revert(self, e): 160 | t = E() 161 | e.copyTo(t) 162 | self.reduceE(t) 163 | return t 164 | 165 | def divRemTo(self, e, n): 166 | if (e.t <= 0): 167 | return False 168 | if self.t < e.t: 169 | return False 170 | i, a, s = E(), self.s, e.s 171 | c = DB - self.y(e.E[e.t - 1]) 172 | if c > 0: 173 | e.lShiftTo(c, i) 174 | self.lShiftTo(c, n) 175 | else: 176 | e.copyTo(i) 177 | self.copyTo(n) 178 | u = i.t 179 | uu = i.E[u - 1] 180 | if uu == 0: 181 | return False 182 | l = uu * (1 << F1) + (i.E[u - 2] >> F2 if u > 1 else 0) 183 | h = FV / l 184 | f = (1 << F1) / l 185 | d = 1 << F2 186 | p = n.t 187 | g = p - u 188 | v = E() 189 | i.dlShiftTo(g, v) 190 | if n.compareTo(v) >= 0: 191 | n.E[n.t] = 1 192 | n.t += 1 193 | n.subTo(v, n) 194 | E(1).dlShiftTo(u, v) 195 | v.subTo(i, i) 196 | while i.t < u: 197 | i.E[i.t] = 0 198 | i.t += 1 199 | g -= 1 200 | while g >= 0: 201 | p -= 1 202 | if n.E[p] == uu: 203 | m = DM 204 | else: 205 | m = int(n.E[p] * h + (n.E[p - 1] + d) * f) 206 | 207 | n.E[p] += i.am(0, m, n, g, 0, u) 208 | if n.E[p] < m: 209 | i.dlShiftTo(g, v) 210 | n.subTo(v, n) 211 | m -= 1 212 | while n.E[p] < m: 213 | n.subTo(v, n) 214 | m -= 1 215 | g -= 1 216 | n.t = u 217 | n.clamp() 218 | if c > 0: 219 | n.rShiftTo(c, n) 220 | if a < 0: 221 | E().subTo(n, n) 222 | 223 | def lShiftTo(self, e: int, t): 224 | r = e % DB 225 | o = DB - r 226 | i = (1 << o) - 1 227 | a = int(e / DB) 228 | s = self.s << r & DB 229 | for n in range(self.t - 1, -1, -1): 230 | t.E[n + a + 1] = self.E[n] >> o | s 231 | s = (self.E[n] & i) << r 232 | for n in range(a): 233 | t.E[n] = 0 234 | t.E[a] = s 235 | t.t = self.t + a + 1 236 | t.s = self.s 237 | t.clamp() 238 | 239 | def rShiftTo(self, e: int, t): 240 | t.s = self.s 241 | n = int(e / DB) 242 | if n >= self.t: 243 | t.t = 0 244 | else: 245 | r = e % DB 246 | o = DB - r 247 | i = (1 << r) - 1 248 | t.E[0] = self.E[n] >> r 249 | for a in range(n + 1, self.t): 250 | t.E[a - n - 1] = (t.E[a - n - 1]) | ((self.E[a] & i) << o) 251 | t.E[a - n] = self.E[a] >> r 252 | if r > 0: 253 | t.E[self.t - n - 1] = t.E[self.t - n - 1] | ((self.s & i) << o) 254 | t.t = self.t - n 255 | t.clamp() 256 | 257 | def copyTo(self, e): 258 | for t in range(self.t): 259 | e.E[t] = self.E[t] 260 | e.t = self.t 261 | e.s = self.s 262 | 263 | def 
dlShiftTo(self, e: int, t): 264 | for ii in range(self.t): 265 | t.E[ii + e] = self.E[ii] 266 | for ii in range(e): 267 | t.E[ii] = 0 268 | t.t = self.t + e 269 | t.s = self.s 270 | 271 | def y(self, e: int): 272 | def yy(e: int, n: int, k: int): 273 | t = e >> k 274 | if t: 275 | e = t 276 | n += k 277 | return e, n 278 | n = 1 279 | e, n = yy(e, n, 16) 280 | e, n = yy(e, n, 8) 281 | e, n = yy(e, n, 4) 282 | e, n = yy(e, n, 2) 283 | e, n = yy(e, n, 1) 284 | return n 285 | 286 | def compareTo(self, e): 287 | t = self.s - e.s 288 | if t != 0: 289 | return t 290 | t = self.t - e.t 291 | if t != 0: 292 | return t if self.s > 0 else -t 293 | for n in range(self.t - 1, -1, -1): 294 | if self.E[n] - e.E[n] != 0: 295 | return self.E[n] - e.E[n] 296 | return 0 297 | 298 | def subTo(self, e, t): 299 | n, r, o = 0, 0, np.min([e.t, self.t]) 300 | while n < o: 301 | r += self.E[n] - e.E[n] 302 | t.E[n] = r & DM 303 | n += 1 304 | r = r >> DM 305 | if e.t < self.t: 306 | r -= e.s 307 | while n < self.t: 308 | r += self.E[n] 309 | t.E[n] = r & DM 310 | n += 1 311 | r = r >> DM 312 | r += self.s 313 | else: 314 | r += self.s 315 | while n < e.t: 316 | r -= e.E[n] 317 | t.E[n] = r & DM 318 | n += 1 319 | r = r >> DM 320 | r -= e.s 321 | t.s = -1 if r < 0 else 0 322 | if r < -1 or r > 0: 323 | t.E[n] = DV + r if r < -1 else r 324 | n += 1 325 | t.t = n 326 | t.clamp() 327 | 328 | def one(self): 329 | self.E[0] = 1 330 | self.t = 1 331 | 332 | def am(self, e: int, t: int, n, r: int, o: int, i: int): 333 | a = 16383 & t 334 | s = t >> 14 335 | i -= 1 336 | while 0 <= i: 337 | c = 16383 & self.E[e] 338 | u = self.E[e] >> 14 339 | e += 1 340 | uu = s * c + u * a 341 | c = a * c + ((16383 & uu) << 14) + n.E[r] + o 342 | o = (c >> 28) + (uu >> 14) + s * u 343 | n.E[r] = DM & c 344 | r += 1 345 | i -= 1 346 | return o 347 | 348 | def sqrTo(self, e, t): 349 | e.squareTo(t) 350 | self.reduceE(t) 351 | 352 | def squareTo(self, e): 353 | e.t = 2 * self.t 354 | for n in range(e.t): 355 | e.E[n] = 0 356 | for n in range(self.t - 1): 357 | r = self.am(n, self.E[n], e, 2 * n, 0, 1) 358 | e.E[n + self.t] += self.am(n + 1, 2 * self.E[n], 359 | e, 2 * n + 1, r, self.t - n - 1) 360 | if e.E[n + self.t] >= DV: 361 | e.E[n + self.t] -= DV 362 | e.E[n + self.t + 1] = 1 363 | if e.t > 0: 364 | e.E[e.t - 1] += self.am(n, self.E[n], e, 2 * n, 0, 1) 365 | e.s = 0 366 | e.clamp() 367 | 368 | def reduceE(self, e): 369 | while e.t <= self.T['mt2']: 370 | e.E[e.t] = 0 371 | e.t += 1 372 | for t in range(self.T['m'].t): 373 | n = 32767 & e.E[t] 374 | r = (n * self.T['mpl'] + (n * self.T['mph'] + (e.E[t] >> 15) 375 | * self.T['mpl'] & self.T['um']) << 15) & DM 376 | n = t + self.T['m'].t 377 | e.E[n] += self.T['m'].am(0, r, e, t, 0, self.T['m'].t) 378 | while e.E[n] >= DV: 379 | e.E[n] -= DV 380 | n += 1 381 | e.E[n] += 1 382 | e.clamp() 383 | e.drShiftTo(self.T['m'].t, e) 384 | if e.compareTo(self.T['m']) >= 0: 385 | e.subTo(self.T['m'], e) 386 | 387 | def drShiftTo(self, e: int, t): 388 | for n in range(e, self.t): 389 | t.E[n - e] = self.E[n] 390 | t.t = np.max([self.t - e, 0]) 391 | t.s = self.s 392 | 393 | def mulTo(self, e, t, n): 394 | e.multiplyTo(t, n) 395 | self.reduceE(n) 396 | 397 | def multiplyTo(self, e, t): 398 | t.t = self.t + e.t 399 | for o in range(self.t): 400 | t.E[o] = 0 401 | for o in range(e.t): 402 | t.E[o + self.t] = self.am(0, e.E[o], t, o, 0, self.t) 403 | t.s = 0 404 | t.clamp() 405 | if self.s != e.s: 406 | E().subTo(t, t) 407 | 408 | def isEven(self): 409 | t = 1 & self.E[0] if self.t else self.s 410 | 
return t == 0 411 | 412 | def abs(self): 413 | return self if self.s > 0 else self 414 | 415 | def tostring(self, e: int): 416 | if self.s < 0: 417 | echo('0|warning', '.s < 0', self.s) 418 | return '-' 419 | t = int(np.log2(e)) 420 | r, o, i, a = (1 << t) - 1, False, '', self.t 421 | s = DB - a * DB % t 422 | if a > 0: 423 | if s < DB: 424 | n = self.E[a] >> s 425 | if n > 0: 426 | o = True 427 | i = g[n] 428 | a -= 1 429 | while a >= 0: 430 | if s < t: 431 | n = (self.E[a] & (1 << s) - 1) << t - s 432 | a -= 1 433 | s += DB - t 434 | n = n | (self.E[a] >> s) 435 | else: 436 | s -= t 437 | n = self.E[a] >> s & r 438 | if s <= 0: 439 | s += DB 440 | a -= 1 441 | if n > 0: 442 | o = True 443 | if o: 444 | i += g[n] 445 | return i if o else '0' 446 | 447 | 448 | class O(object): 449 | def __init__(self): 450 | self.T = {} 451 | 452 | self.n = { 453 | 'A': 48, 454 | 'BUTTON': 1, 455 | 'CANVAS': 1, 456 | 'CPUClass': None, 457 | 'DIV': 71, 458 | 'HTMLLength': 158225, 459 | 'IMG': 5, 460 | 'INPUT': 4, 461 | 'LABEL': 1, 462 | 'LI': 21, 463 | 'LINK': 3, 464 | 'P': 10, 465 | 'SCRIPT': 14, 466 | 'SPAN': 9, 467 | 'STYLE': 18, 468 | 'UL': 4, 469 | 'browserLanguage': "zh-CN", 470 | 'browserLanguages': "zh-CN,zh", 471 | 'canvas2DFP': "5eb3d9a167292cc324a4a6b692171a49", 472 | 'canvas3DFP': "b2284dba7b1ccb5ef8fabc22c0065611", 473 | 'colorDepth': 24, 474 | 'cookieEnabled': 1, 475 | 'devicePixelRatio': 2, 476 | 'deviceorientation': False, 477 | 'doNotTrack': 0, 478 | 'documentMode': "CSS1Compat", 479 | 'flashEnabled': -1, 480 | 'hardwareConcurrency': 8, 481 | 'indexedDBEnabled': 1, 482 | 'innerHeight': 150, 483 | 'innerWidth': 1680, 484 | 'internalip': None, 485 | 'javaEnabled': 0, 486 | 'jsFonts': "AndaleMono,Arial,ArialBlack,ArialHebrew,ArialNarrow,ArialRoundedMTBold,ArialUnicodeMS,ComicSansMS,Courier,CourierNew,Geneva,Georgia,Helvetica,HelveticaNeue,Impact,LUCIDAGRANDE,MicrosoftSansSerif,Monaco,Palatino,Tahoma,Times,TimesNewRoman,TrebuchetMS,Verdana,Wingdings,Wingdings2,Wingdings3", 487 | 'localStorageEnabled': 1, 488 | 'maxTouchPoints': 0, 489 | 'mediaDevices': -1, 490 | 'netEnabled': 1, 491 | 'outerHeight': 987, 492 | 'outerWidth': 1680, 493 | 'performanceTiming': "-1,-1,16,2,122,0,274,0,209,137,6,6,32,3405,3405,3408,35543,35544,35547,-1", 494 | 'platform': "MacIntel", 495 | 'plugins': "internal-pdf-viewer,mhjfbmdgcfjbbpaeojofohoefgiehjai,internal-nacl-plugin", 496 | 'screenAvailHeight': 987, 497 | 'screenAvailLeft': 0, 498 | 'screenAvailTop': 23, 499 | 'screenAvailWidth': 1680, 500 | 'screenHeight': 1050, 501 | 'screenLeft': 0, 502 | 'screenTop': 23, 503 | 'screenWidth': 1680, 504 | 'sessionStorageEnabled': 1, 505 | 'systemLanguage': None, 506 | 'textLength': 93737, 507 | 'timestamp': int(time_stamp()), 508 | 'timezone': -8, 509 | 'touchEvent': False, 510 | 'userAgent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3818.0 Safari/537.36", 511 | } 512 | self.t = ['textLength', 'HTMLLength', 'documentMode', 'A', 'ARTICLE', 'ASIDE', 'AUDIO', 'BASE', 'BUTTON', 'CANVAS', 'CODE', 'IFRAME', 'IMG', 'INPUT', 'LABEL', 'LINK', 'NAV', 'OBJECT', 'OL', 'PICTURE', 'PRE', 'SECTION', 'SELECT', 'SOURCE', 'SPAN', 'STYLE', 'TABLE', 'TEXTAREA', 'VIDEO', 'screenLeft', 'screenTop', 'screenAvailLeft', 'screenAvailTop', 'innerWidth', 'innerHeight', 'outerWidth', 'outerHeight', 'browserLanguage', 'browserLanguages', 'systemLanguage', 'devicePixelRatio', 'colorDepth', 513 | 'userAgent', 'cookieEnabled', 'netEnabled', 'screenWidth', 'screenHeight', 'screenAvailWidth', 
'screenAvailHeight', 'localStorageEnabled', 'sessionStorageEnabled', 'indexedDBEnabled', 'CPUClass', 'platform', 'doNotTrack', 'timezone', 'canvas2DFP', 'canvas3DFP', 'plugins', 'maxTouchPoints', 'flashEnabled', 'javaEnabled', 'hardwareConcurrency', 'jsFonts', 'timestamp', 'performanceTiming', 'internalip', 'mediaDevices', 'DIV', 'P', 'UL', 'LI', 'SCRIPT', 'deviceorientation', 'touchEvent'] 514 | 515 | def get_performanceTiming(self): 516 | r = ['navigationStart', 'redirectStart', 'redirectEnd', 'fetchStart', 'domainLookupStart', 517 | 'domainLookupEnd', 'connectStart', 'connectEnd', 'requestStart', 'responseStart'] 518 | o = ['responseEnd', 'unloadEventStart', 'unloadEventEnd', 'domLoading', 'domInteractive', 'domContentLoadedEventStart', 519 | 'domContentLoadedEventEnd', 'domComplete', 'loadEventStart', 'loadEventEnd', 'msFirstPaint'] 520 | n = { 521 | 'connectEnd': 1568518372487, 522 | 'connectStart': 1568518372213, 523 | 'domComplete': 1568518408239, 524 | 'domContentLoadedEventEnd': 1568518376104, 525 | 'domContentLoadedEventStart': 1568518376101, 526 | 'domInteractive': 1568518376101, 527 | 'domLoading': 1568518372728, 528 | 'domainLookupEnd': 1568518372213, 529 | 'domainLookupStart': 1568518372091, 530 | 'fetchStart': 1568518372089, 531 | 'loadEventEnd': 1568518408243, 532 | 'loadEventStart': 1568518408240, 533 | 'navigationStart': 1568518372073, 534 | 'redirectEnd': 0, 535 | 'redirectStart': 0, 536 | 'requestStart': 1568518372487, 537 | 'responseEnd': 1568518372833, 538 | 'responseStart': 1568518372696, 539 | 'secureConnectionStart': 1568518372348, 540 | 'unloadEventEnd': 1568518372702, 541 | 'unloadEventStart': 1568518372702, 542 | } 543 | i = [] 544 | for e in range(1, len(r)): 545 | a = n[r[e]] 546 | if a == 0: 547 | i.append(-1) 548 | else: 549 | for s in range(e - 1, -1, -1): 550 | c = n[r[s]] 551 | if c: 552 | i.append(a - c) 553 | break 554 | u = n[r[len(r) - 1]] 555 | for e in o: 556 | if e in n and n[e]: 557 | i.append(n[e] - u) 558 | else: 559 | i.append(-1) 560 | self.n['performanceTiming'] = ','.join([str(ii) for ii in i]) 561 | 562 | def __call__(self): 563 | self.get_performanceTiming() 564 | self.r = [self.n[ii] if ii in self.n else -1 for ii in self.t] 565 | self.i = '!!'.join([str(ii) for ii in self.r]).replace( 566 | 'False', 'false').replace('True', 'true') 567 | 568 | 569 | class T(object): 570 | ''' AES ''' 571 | def gjson_stringify(self, o: dict): 572 | o_str = str(o).replace("'", '"').replace( 573 | 'True', 'true').replace('False', 'false') 574 | return o_str 575 | 576 | def parse(self, aes_key: str): 577 | n = {} 578 | for r, p in enumerate(aes_key): 579 | if r >> 2 in n: 580 | n[r >> 2] = (n[r >> 2]) | ((255 & ord(p)) << 24 - r % 4 * 8) 581 | else: 582 | n[r >> 2] = ((255 & ord(p)) << 24 - r % 4 * 8) 583 | return { 584 | 'sigBytes': len(aes_key), 585 | 'words': list(n.values()), 586 | } 587 | 588 | def encrypt(self, e: str, t: str): 589 | t = self.parse(t) 590 | n = {} 591 | n['iv'] = self.parse('0000000000000000') 592 | 593 | def E_encrypt(self, u: dict, e: str, t: dict, n: dict): 594 | self.createEncryptor(t, n) 595 | 596 | def createEncryptor(self, e: dict, r: dict): 597 | pass 598 | 599 | def create(self): 600 | e = 1 601 | -------------------------------------------------------------------------------- /bilibili/loginBilibili.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-09-14 14:47:48 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified 
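T.parse in bilibili/geetestE.py above packs a key string into a CryptoJS-style {'sigBytes', 'words'} structure, big-endian, four bytes per 32-bit word. A standalone version of the same packing, useful for cross-checking (assumes single-byte ASCII keys, such as the 16-character zero IV used in encrypt):

```python
# Illustrative sketch (not from the repo): plain rewrite of T.parse() for cross-checking.
def parse_words(key: str) -> dict:
    words = []
    for i, ch in enumerate(key):
        if i % 4 == 0:
            words.append(0)
        words[-1] |= (ord(ch) & 0xFF) << (24 - (i % 4) * 8)
    return {'sigBytes': len(key), 'words': words}

assert parse_words('0000000000000000')['words'] == [0x30303030] * 4
```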
time: 2020-03-21 20:56:38 6 | 7 | import base64 8 | import json 9 | import os 10 | import sys 11 | import time 12 | import urllib 13 | 14 | import numpy as np 15 | import regex 16 | import rsa 17 | 18 | sys.path.append(os.getcwd()) 19 | from util.util import can_retry, echo, encoder_cookie, send_email, time_stamp, get_accept, get_content_type 20 | 21 | from bilibili.basicBilibili import BasicBilibili 22 | from bilibili.geetestE import E, O, S 23 | 24 | 25 | proxy_req = 0 26 | one_day = 86400 27 | root_dir = os.path.abspath('bilibili') 28 | data_dir = os.path.join(root_dir, 'data/') 29 | PUBLIC = '00C1E3934D1614465B33053E7F48EE4EC87B14B95EF88947713D25EECBFF7E74C7977D02DC1D9451F79DD5D1C10C29ACB6A9B4D6FB7D0A0279B6719E1772565F09AF627715919221AEF91899CAE08C0D686D748B20A3603BE2318CA6BC2B59706592A9219D0BF05C9F65023A21D2330807252AE0066D59CEEFA5F2748EA80BAB81' 30 | 31 | 32 | class Login(BasicBilibili): 33 | ''' bilibili login module ''' 34 | 35 | def __init__(self): 36 | super(Login, self).__init__() 37 | self.update_proxy(1) 38 | self.access_key = '' 39 | self.aes_key = '' 40 | self.T = E(list(PUBLIC), 16) 41 | 42 | def get_access_key(self): 43 | captcha, cookie = self.get_captcha() 44 | hash_salt, key, cookie = self.get_hash_salt(cookie) 45 | if captcha is None: 46 | return 47 | types, cookie = self.get_type(captcha['gt'], cookie) 48 | return { 49 | 'captcha': captcha, 50 | 'hash_salt': hash_salt, 51 | 'types': types, 52 | 'cookie': cookie 53 | } 54 | 55 | def get_aes_key(self): 56 | def wl(): 57 | return hex(int(65536 * (1 + np.random.random())))[3:] 58 | return wl() + wl() + wl() + wl() 59 | 60 | def get_t(self, aes_key: str, t: int = 128): 61 | n = np.zeros(t).astype(np.int) 62 | for ii, jj in enumerate(aes_key): 63 | n[ii + 112] = ord(jj) 64 | i = S() 65 | for ii in range(t - 2, 1, -1): 66 | n[ii] = i() 67 | n[1] = 2 68 | return n 69 | 70 | def doPublic(self): 71 | n = self.get_t(self.get_aes_key()) 72 | self.N = E(n, 256) 73 | n = self.N.modPowInt(65537, self.T) 74 | r = n.tostring(16) 75 | add = '' if not (1 & len(r)) else '0' 76 | return '{}{}'.format(add, r) 77 | 78 | def get_hash_salt(self, cookie: dict = {}): 79 | url = self.GET_KEY_URL % np.random.random() 80 | headers = self.get_login_headers(2, cookie) 81 | hash_salt, cookies = proxy_req(url, 1, header=headers, 82 | need_cookie=True) 83 | if hash_salt is None or list(hash_salt.keys()) != ['hash', 'key']: 84 | if can_retry(url): 85 | return self.get_hash_salt() 86 | else: 87 | return None, {} 88 | return hash_salt['hash'], hash_salt['key'], cookies 89 | 90 | def get_captcha(self, cookie: dict = {}): 91 | url = self.CAPTCHA_URL 92 | headers = self.get_login_headers(0, cookie) 93 | captcha, cookies = proxy_req(url, 1, header=headers, need_cookie=True) 94 | if captcha is None or list(captcha.keys()) != ['code', 'data']: 95 | if can_retry(url): 96 | return self.get_captcha() 97 | else: 98 | return None, {} 99 | return captcha['data']['result'], cookies 100 | 101 | def get_access_key_req(self, hash_salt: str, key: str, challenge: str, validate: str, cookie: dict = {}): 102 | data = { 103 | 'captchaType': 11, 104 | 'username': self.username, 105 | 'password': self.encoder_login_info(hash_salt, key), 106 | 'keep': True, 107 | 'key': key, 108 | 'goUrl': self.AV_URL, 109 | 'challenge': challenge, 110 | 'validate': validate, 111 | 'seccode': f'{validate}|jordan' 112 | } 113 | headers = self.get_login_headers(1, cookie) 114 | login = proxy_req(self.LOGIN_V2_URL, 12, header=headers) 115 | 116 | def get_type(self, gt: str, cookies: dict = {}) -> 
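doPublic() above is raw RSA done through the E port: the 128-entry block built by get_t() is treated as one big integer and raised to 65537 modulo the hex modulus PUBLIC. Expressed with Python's built-in pow, assuming E(list, 256) reads the block big-endian the way JSBN's fromString(s, 256) does:

```python
# Illustrative sketch (not from the repo): what doPublic() computes, with Python ints.
def do_public_int(block, public_hex: str) -> str:
    m = int.from_bytes(bytes(int(b) & 0xFF for b in block), 'big')
    c = pow(m, 65537, int(public_hex, 16))
    out = format(c, 'x')
    return ('0' + out) if len(out) % 2 else out   # same odd-length '0' prefix as doPublic()
```

Two small inconsistencies are also worth flagging: get_access_key_req() above calls self.encoder_login_info(), while the method defined further down in this file is encode_login_info(), and that method passes arguments to rsa.encrypt and base64.b64encode in the wrong order. A sketch of the presumably intended flow, using the actual signatures rsa.encrypt(message, pub_key) and base64.b64encode(data); whether bilibili's endpoint accepts exactly this payload is not verified here:

```python
# Illustrative sketch (not from the repo): likely intended body of encode_login_info().
import base64
import urllib.parse
import rsa

def encode_login_info(hash_salt: str, password: str, key_pem: str) -> str:
    public_key = rsa.PublicKey.load_pkcs1_openssl_pem(key_pem.encode())
    cipher = rsa.encrypt((hash_salt + password).encode('utf-8'), public_key)
    return urllib.parse.quote_plus(base64.b64encode(cipher).decode())
```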
dict: 117 | url = self.GETTYPE_URL % (gt, int(time_stamp() * 1000)) 118 | headers = self.get_login_headers(3, cookies) 119 | req, cookie = proxy_req(url, 3, header=headers, need_cookie=True) 120 | j_begin = req.find('{') 121 | if req == '' or j_begin == -1: 122 | if can_retry(self.GETTYPE_URL): 123 | return self.get_type(gt, cookies) 124 | else: 125 | return None, {} 126 | type_json = json.loads(req[j_begin:-1]) 127 | return type_json['data'], cookie 128 | 129 | def encode_login_info(self, hash_salt: str, key: str): 130 | public_key = rsa.PublicKey.load_pkcs1_openssl_pem(key.encode()) 131 | concate = rsa.encrypt(hash_salt + self.password).encode('utf-8') 132 | s = base64.b64encode(concate, public_key) 133 | s = urllib.parse.quote_plus(s) 134 | return s 135 | 136 | def get_login_headers(self, mode: int = 0, cookie: dict = {}): 137 | headers = { 138 | 'Referer': self.LOGIN_URL, 139 | } 140 | if mode != 3: 141 | headers['Accept'] = get_accept('*') if mode == 2 else get_accept('xhr') 142 | if mode == 1: 143 | headers['Content-Type'] = get_content_type('') 144 | elif mode == 2: 145 | headers['X-Requested-With'] = 'XMLHttpRequest' 146 | if len(cookie): 147 | headers['Cookie'] = encoder_cookie(cookie) 148 | return headers 149 | 150 | def update_proxy(self, mode: int = 0): 151 | global proxy_req 152 | if not mode: 153 | self.update_proxy_basic() 154 | proxy_req = self.proxy_req 155 | -------------------------------------------------------------------------------- /blog/titleviews.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-02-09 11:10:52 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2020-01-04 14:41:30 6 | 7 | import argparse 8 | import codecs 9 | import datetime 10 | import re 11 | import os 12 | import threading 13 | 14 | from bs4 import BeautifulSoup 15 | from proxy.getproxy import GetFreeProxy 16 | from util.db import Db 17 | from util.util import begin_time, end_time, changeCookie, basic_req, can_retry, changeHtmlTimeout, echo, mkdir, read_file, echo, get_accept 18 | 19 | """ 20 | * blog @http 21 | * www.zhihu.com/api/v4/creator/content_statistics 22 | * www.jianshu.com/u/ 23 | * blog.csdn.net 24 | .data/ 25 | ├── cookie // zhihu cookie 26 | ├── google // google analysis data 27 | ├── slug // blog title slug 28 | └── title // blog title list 29 | """ 30 | proxy_req = GetFreeProxy().proxy_req 31 | data_dir = 'blog/data/' 32 | 33 | 34 | class TitleViews(object): 35 | ''' script of load my blog data -> analysis ''' 36 | CSDN_URL = 'https://blog.csdn.net/iofu728' 37 | JIANSHU_URL = 'https://www.jianshu.com/u/2e0f69e4a4f0' 38 | ZHIHU_URL = 'https://www.zhihu.com/api/v4/creator/content_statistics/' 39 | 40 | def __init__(self): 41 | self.Db = Db("blog") 42 | self.local_views = {} 43 | self.title_map = {} 44 | self.title2slug = {} 45 | self.zhihu_views = {} 46 | self.zhihu_id = {} 47 | self.jianshu_views = {} 48 | self.jianshu_id = {} 49 | self.csdn_views = {} 50 | self.csdn_id = {} 51 | self.exist_data = {} 52 | self.getTitleMap() 53 | self.insert_sql = '''INSERT INTO title_views(`title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`) VALUES %s''' 54 | self.update_sql = '''REPLACE INTO title_views(`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at`) VALUES %s''' 55 | self.new_day_sql = '''INSERT INTO page_views(`date`, `existed_views`, 
`existed_spider`) VALUES %s''' 56 | 57 | def loadLocalView(self): 58 | ''' load local view ''' 59 | test = read_file('{}google'.format(data_dir))[7:] 60 | for index in test: 61 | arr = index.split(',') 62 | slug = self.matchSlug(arr[0]) 63 | if slug is None or slug not in self.title_map: 64 | continue 65 | print(slug + ' ' + str(arr[1]) + ' ' + arr[0]) 66 | if slug in self.local_views: 67 | self.local_views[slug] += int(arr[1]) 68 | else: 69 | self.local_views[slug] = int(arr[1]) 70 | 71 | def getTitleMap(self): 72 | ''' get title map ''' 73 | slug = read_file('{}slug'.format(data_dir)) 74 | title = read_file('{}title'.format(data_dir)) 75 | self.title_map = {tempslug.split( 76 | '"')[1]: title[num].split('"')[1] for num, tempslug in enumerate(slug)} 77 | title2slug = { 78 | self.title_map[index]: index for index in self.title_map.keys()} 79 | noemoji_title = {self.filter_emoji( 80 | self.title_map[index]).replace('\u200d', ''): index for index in self.title_map.keys()} 81 | self.title2slug = {**noemoji_title, **title2slug} 82 | 83 | def matchSlug(self, pattern: str): 84 | ''' match slug ''' 85 | arr = re.search(r'\/([^\/]+).html', pattern) 86 | return None if arr is None else arr.group(1) 87 | 88 | def getZhihuView(self): 89 | cookie = ''.join(read_file('{}cookie'.format(data_dir))) 90 | changeCookie(cookie) 91 | url_basic = [ 92 | self.ZHIHU_URL, 93 | 'articles?order_field=object_created&order_sort=descend&begin_date=2018-09-01&end_date=', 94 | datetime.datetime.now().strftime("%Y-%m-%d"), 95 | '&page_no=' 96 | ] 97 | url = ''.join(url_basic) 98 | 99 | json = self.get_request('{}{}'.format(url, 1), 1, lambda i: not i) 100 | if not json: 101 | return 102 | if not 'data' in json: 103 | if 'code' in json: 104 | echo('0|warning', json) 105 | return 106 | echo(3, 'zhihu', json) 107 | for index in json['data']: 108 | zhihu_title = index['title'] 109 | zhihu_id = int(index['url_token']) 110 | zhihu_count = int(index['read_count']) 111 | 112 | if zhihu_title in self.title2slug: 113 | temp_slug = self.title2slug[zhihu_title] 114 | self.zhihu_id[temp_slug] = zhihu_id 115 | self.zhihu_views[temp_slug] = zhihu_count 116 | elif zhihu_id in self.zhihu_id_map: 117 | temp_slug = self.zhihu_id_map[zhihu_id] 118 | self.zhihu_id[temp_slug] = zhihu_id 119 | self.zhihu_views[temp_slug] = zhihu_count 120 | else: 121 | echo('0|debug', index['title']) 122 | 123 | for index in range(1, json['count'] // 10): 124 | echo(1, 'zhihu', index) 125 | json = self.get_request('{}{}'.format(url, 1 + index), 1, lambda i: not i) 126 | echo(2, 'zhihu', json) 127 | if not json: 128 | continue 129 | for index in json['data']: 130 | zhihu_title = index['title'] 131 | zhihu_id = int(index['url_token']) 132 | zhihu_count = int(index['read_count']) 133 | 134 | if zhihu_title in self.title2slug: 135 | temp_slug = self.title2slug[zhihu_title] 136 | self.zhihu_id[temp_slug] = zhihu_id 137 | self.zhihu_views[temp_slug] = zhihu_count 138 | elif zhihu_id in self.zhihu_id_map: 139 | temp_slug = self.zhihu_id_map[zhihu_id] 140 | self.zhihu_id[temp_slug] = zhihu_id 141 | self.zhihu_views[temp_slug] = zhihu_count 142 | else: 143 | echo('0|debug', index['title']) 144 | 145 | def get_request(self, url: str, types: int, functs, header: dict = {}): 146 | if len(header): 147 | req = basic_req(url, types, header=header) 148 | else: 149 | req = basic_req(url, types) 150 | 151 | if functs(req): 152 | if can_retry(url): 153 | self.get_request(url, types, functs, header) 154 | return 155 | return req 156 | 157 | def getJianshuViews(self): 158 | ''' 
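getZhihuView() above repeats the same title-to-slug bookkeeping for the first page and again inside the pagination loop. A small helper would keep the two copies in sync; title2slug, zhihu_id_map, zhihu_id and zhihu_views are the attributes already used above, while the method name itself is only a suggestion:

```python
# Illustrative refactoring sketch (not from the repo): shared bookkeeping for one article.
def record_zhihu_article(self, article: dict):
    title, aid = article['title'], int(article['url_token'])
    count = int(article['read_count'])
    slug = self.title2slug.get(title) or self.zhihu_id_map.get(aid)
    if slug is None:
        echo('0|debug', title)
        return
    self.zhihu_id[slug] = aid
    self.zhihu_views[slug] = count
```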
get jianshu views ''' 159 | header = {'accept': get_accept('html')} 160 | 161 | for rounds in range(1, 4): 162 | url = self.JIANSHU_URL 163 | if rounds > 1: 164 | url += '?order_by=shared_at&page={}'.format(rounds) 165 | echo('1|debug', 'jianshu req url:', url) 166 | html = self.get_request(url, 0, lambda i: not i or not len( 167 | i.find_all('div', class_='content')), header) 168 | if html is None: 169 | echo(0, 'None') 170 | return 171 | for index in html.find_all('li', class_=["", 'have-img']): 172 | if len(index.find_all('i')) < 3: 173 | continue 174 | title = index.find_all('a', class_='title')[ 175 | 0].text.replace('`', '') 176 | jianshu_id = int(index['data-note-id']) 177 | jianshu_count = int(index.find_all('a')[-2].text) 178 | if title in self.title2slug: 179 | temp_slug = self.title2slug[title] 180 | self.jianshu_id[temp_slug] = jianshu_id 181 | self.jianshu_views[temp_slug] = jianshu_count 182 | elif jianshu_id in self.jianshu_id_map: 183 | temp_slug = self.jianshu_id_map[jianshu_id] 184 | self.jianshu_id[temp_slug] = jianshu_id 185 | self.jianshu_views[temp_slug] = jianshu_count 186 | else: 187 | echo(1, title) 188 | 189 | def getCsdnViews(self): 190 | ''' get csdn views ''' 191 | 192 | for index in range(1, 3): 193 | url = self.CSDN_URL 194 | if index > 1: 195 | url += '/article/list/{}?'.format(index) 196 | echo(1, 'csdn url', url) 197 | 198 | html = self.get_request(url, 0, lambda i: i is None or not i or not len( 199 | i.find_all('p', class_='content'))) 200 | if html is None: 201 | echo(0, 'None') 202 | return 203 | for div_lists in html.find_all('div', class_='article-item-box csdn-tracking-statistics'): 204 | if 'style' in div_lists.attrs: 205 | continue 206 | csdn_id = int(div_lists['data-articleid']) 207 | title = div_lists.a.contents[2].replace( 208 | '\n', '').strip().replace('`', '') 209 | csdn_count = int(div_lists.find_all( 210 | 'span', class_='read-num')[0].span.text) 211 | if title in self.title2slug: 212 | temp_slug = self.title2slug[title] 213 | self.csdn_id[temp_slug] = csdn_id 214 | self.csdn_views[temp_slug] = csdn_count 215 | elif csdn_id in self.csdn_id_map: 216 | temp_slug = self.csdn_id_map[csdn_id] 217 | self.csdn_id[temp_slug] = csdn_id 218 | self.csdn_views[temp_slug] = csdn_count 219 | else: 220 | echo(1, title) 221 | 222 | def filter_emoji(self, desstr, restr=''): 223 | ''' filter emoji ''' 224 | desstr = str(desstr) 225 | try: 226 | co = re.compile(u'[\U00010000-\U0010ffff]') 227 | except re.error: 228 | co = re.compile(u'[\uD800-\uDBFF][\uDC00-\uDFFF]') 229 | return co.sub(restr, desstr) 230 | 231 | def init_db(self): 232 | self.loadLocalView() 233 | self.getZhihuView() 234 | self.getJianshuViews() 235 | self.getCsdnViews() 236 | insert_list = [] 237 | for index in self.title_map.keys(): 238 | insert_list.append((index, self.local_views[index] if index in self.local_views else 0, self.zhihu_views[index] if index in self.zhihu_views else 0, self.csdn_views[index] if index in self.csdn_views else 0, self.jianshu_views[index] 239 | if index in self.jianshu_views else 0, self.zhihu_id[index] if index in self.zhihu_id else 0, self.csdn_id[index] if index in self.csdn_id else 0, self.jianshu_id[index] if index in self.jianshu_id else 0)) 240 | # return insert_list 241 | results = self.Db.insert_db(self.insert_sql % str(insert_list)[1:-1]) 242 | if results: 243 | if len(insert_list): 244 | print('Insert ' + str(len(insert_list)) + ' Success!') 245 | else: 246 | pass 247 | 248 | def select_all(self): 249 | result = self.Db.select_db( 250 | "SELECT 
`id`, `title_name`, `local_views`, `zhihu_views`, `csdn_views`, `jianshu_views`, `zhihu_id`, `csdn_id`, `jianshu_id`, `created_at` from title_views where `is_deleted`=0") 251 | if result == False: 252 | print("SELECT Error!") 253 | else: 254 | self.exist_data = {index[1]: list(index) for index in result} 255 | self.zhihu_id_map = {index[6]: index[1] 256 | for index in result if index[6]} 257 | self.csdn_id_map = {index[7]: index[1] 258 | for index in result if index[7]} 259 | self.jianshu_id_map = {index[8]: index[1] 260 | for index in result if index[8]} 261 | for index in self.exist_data: 262 | self.exist_data[index][-1] = self.exist_data[index][-1].strftime( 263 | '%Y-%m-%d %H:%M:%S') 264 | 265 | def update_view(self): 266 | changeHtmlTimeout(10) 267 | wait_map = {} 268 | self.select_all() 269 | self.getZhihuView() 270 | self.getJianshuViews() 271 | self.getCsdnViews() 272 | for index in self.zhihu_views.keys(): 273 | if self.zhihu_views[index] == self.exist_data[index][3] and self.zhihu_id[index] == self.exist_data[index][6]: 274 | continue 275 | wait_map[index] = self.exist_data[index] 276 | wait_map[index][3] = self.zhihu_views[index] 277 | wait_map[index][6] = self.zhihu_id[index] 278 | for index in self.csdn_views.keys(): 279 | if self.csdn_views[index] == self.exist_data[index][4] and self.csdn_id[index] == self.exist_data[index][7]: 280 | continue 281 | if index not in wait_map: 282 | wait_map[index] = self.exist_data[index] 283 | wait_map[index][4] = self.csdn_views[index] 284 | wait_map[index][7] = self.csdn_id[index] 285 | for index in self.jianshu_views.keys(): 286 | if self.jianshu_views[index] == self.exist_data[index][5] and self.jianshu_id[index] == self.exist_data[index][8]: 287 | continue 288 | wait_map[index] = self.exist_data[index] 289 | wait_map[index][5] = self.jianshu_views[index] 290 | wait_map[index][8] = self.jianshu_id[index] 291 | update_list = [tuple(index) for index in wait_map.values()] 292 | # return update_list:q 293 | if not len(update_list): 294 | return 295 | results = self.Db.update_db(self.update_sql % str(update_list)[1:-1]) 296 | if results: 297 | if len(update_list): 298 | print('Update ' + str(len(update_list)) + ' Success!') 299 | else: 300 | pass 301 | 302 | def new_day(self): 303 | day_data = self.Db.select_db( 304 | "SELECT `today_views`, `existed_views` from page_views order by `id` desc limit 1") 305 | if not os.path.exists('../blog/log/basic'): 306 | print('File not exist!!!') 307 | return 308 | with codecs.open("../blog/log/basic", 'r', encoding='utf-8') as f: 309 | existed_spider = int(f.readlines()[1]) 310 | today_date = datetime.datetime.now().strftime('%Y-%m-%d') 311 | new_day_list = [(today_date, day_data[0][0] + 312 | day_data[0][1], existed_spider)] 313 | results = self.Db.insert_db(self.new_day_sql % str(new_day_list)[1:-1]) 314 | if results: 315 | if len(new_day_list): 316 | print('New day update' + str(len(new_day_list)) + ' Success!') 317 | else: 318 | pass 319 | 320 | def load_csdn_img(self): 321 | ''' load csdn img ''' 322 | mkdir(data_dir) 323 | urls = ['/article/list/2?', ''] 324 | article_ids = [] 325 | for url in urls: 326 | req = basic_req('{}{}'.format(self.CSDN_URL, url), 3) 327 | article_ids.extend(re.findall('data-articleid="(\w*?)"', req)) 328 | echo(0, article_ids) 329 | article_thread = [threading.Thread( 330 | target=self.load_csdn_img_batch, args=(ii,)) for ii in article_ids] 331 | for work in article_thread: 332 | work.start() 333 | for work in article_thread: 334 | work.join() 335 | 336 | def 
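load_csdn_img() above spawns one thread per article and, inside each article, one thread per image. concurrent.futures gives the same parallel download with a bounded worker count; load_csdn_img_batch is the repo's own per-article loader, and the pool size of 8 is an arbitrary choice:

```python
# Illustrative sketch (not from the repo): bounded-concurrency variant of load_csdn_img().
from concurrent.futures import ThreadPoolExecutor

def load_csdn_img_pooled(self, article_ids, max_workers: int = 8):
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        list(pool.map(self.load_csdn_img_batch, article_ids))   # consume to surface errors
```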
load_csdn_img_batch(self, article_id: int): 337 | url = '{}/article/details/{}'.format(self.CSDN_URL, article_id) 338 | req = proxy_req(url, 3) 339 | if not 'iofu728' in req: 340 | if can_retry(url): 341 | self.load_csdn_img_batch(article_id) 342 | return 343 | img_lists = re.findall('"(https://cdn.nlark.com.*)" alt', req) 344 | img_thread = [threading.Thread(target=self.load_csdn_img_load, args=( 345 | jj, article_id, ii))for ii, jj in enumerate(img_lists)] 346 | echo(1, 'Article Need Load {} Img...'.format(len(img_lists))) 347 | for work in img_thread: 348 | work.start() 349 | for work in img_thread: 350 | work.join() 351 | 352 | def load_csdn_img_load(self, img_url: str, article_id: int, idx: int): 353 | img_dir = '{}{}/'.format(data_dir, article_id) 354 | img_path = '{}{}.png'.format(img_dir, idx) 355 | if os.path.exists(img_path): 356 | return 357 | req = proxy_req(img_url, 2) 358 | if type(req) == bool or req is None: 359 | if can_retry(img_url): 360 | self.load_csdn_img_load(img_url, article_id, idx) 361 | return 362 | mkdir(img_dir) 363 | with open(img_path, 'wb') as f: 364 | f.write(req.content) 365 | 366 | 367 | if __name__ == '__main__': 368 | if not os.path.exists(data_dir): 369 | os.makedirs(data_dir) 370 | parser = argparse.ArgumentParser(description='gunjianpan blog backup code') 371 | parser.add_argument('--model', type=int, default=1, metavar='N', 372 | help='model update or new day') 373 | model = parser.parse_args().model 374 | bb = TitleViews() 375 | if model == 1: 376 | bb.update_view() 377 | else: 378 | bb.new_day() 379 | bb.update_view() 380 | -------------------------------------------------------------------------------- /brushclass/brushclass.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-02-25 21:13:45 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2019-10-11 02:03:24 6 | 7 | import argparse 8 | import time 9 | import random 10 | import os 11 | import sys 12 | 13 | sys.path.append(os.getcwd()) 14 | from collections import Counter 15 | from proxy.getproxy import GetFreeProxy 16 | from util.util import begin_time, end_time, send_email, can_retry, echo, basic_req, get_accept, get_content_type 17 | 18 | proxy_req = GetFreeProxy().proxy_req 19 | data_path = 'brushclass/data/' 20 | 21 | """ 22 | * brush @http 23 | * http://elective.pku.edu.cn 24 | * https://portal.w.pku.edu.cn/portal2017/bizcenter/score/retrScores.do 25 | .data/ 26 | └── cookie // elective.pku.edu.cn cookie 27 | """ 28 | 29 | 30 | class Brush(object): 31 | """ 32 | brush class in http://elective.pku.edu.cn 33 | """ 34 | 35 | def __init__(self, ): 36 | self.failured_map = {} 37 | self.laster_timestamp = 0 38 | 39 | def have_places(self): 40 | """ 41 | brush class 42 | """ 43 | version = begin_time() 44 | have_places = False 45 | 46 | while not have_places: 47 | if self.have_places_once(): 48 | send_email('大数据专题', '大数据专题 有名额啦 有名额啦') 49 | send_email('大数据专题', '大数据专题 有名额啦 有名额啦') 50 | send_email('大数据专题', '大数据专题 有名额啦 有名额啦') 51 | have_places = True 52 | time.sleep(random.randint(10, 20)) 53 | end_time(version) 54 | 55 | def have_places_once(self): 56 | """ 57 | have places 58 | """ 59 | url = 'http://elective.pku.edu.cn/elective2008/edu/pku/stu/elective/controller/supplement/refreshLimit.do' 60 | if not os.path.exists('%scookie' % data_path): 61 | print('Brush Cookie not exist!!!') 62 | return 63 | with open('%scookie' % data_path, 'r') as f: 64 | cookie = f.readlines() 65 | headers = { 
66 | 'X-Requested-With': 'XMLHttpRequest', 67 | 'Cookie': '', 68 | 'Content-Type': get_content_type(), 69 | 'Accept': get_accept('xhr'), 70 | "Origin": "http://elective.pku.edu.cn", 71 | "Referer": "http://elective.pku.edu.cn/elective2008/edu/pku/stu/elective/controller/supplement/SupplyCancel.do", 72 | } 73 | headers['Cookie'] = cookie[0][:-1] 74 | 75 | data = { 76 | "index": '10', 77 | "seq": 'yjkc20141100016542', 78 | } 79 | 80 | ca = proxy_req(url, 11, data, header=headers) 81 | 82 | if not ca: 83 | if round(time.time()) - self.laster_timestamp > 60: 84 | send_email("Cookie failure", "Cookie failure") 85 | return False 86 | print(ca['electedNum']) 87 | self.laster_timestamp = round(time.time()) 88 | return int(ca['electedNum']) < 120 89 | 90 | 91 | def get_score(cookie: str): 92 | SCORE_URL = 'https://portal.w.pku.edu.cn/portal2017/bizcenter/score/retrScores.do' 93 | headers = { 94 | 'Accept': get_accept('xhr'), 95 | 'Host': 'portal.w.pku.edu.cn', 96 | 'Origin': 'https://portal.w.pku.edu.cn', 97 | 'Referer': 'https://portal.w.pku.edu.cn/portal2017/', 98 | 'Cookie': cookie, 99 | 100 | } 101 | req = basic_req(SCORE_URL, 11, header=headers) 102 | if req is None or list(req.keys()) != ['success', 'xslb', 'xh', 'xm', 'scoreLists']: 103 | if can_retry(SCORE_URL): 104 | return get_score(cookie) 105 | else: 106 | return 107 | return req 108 | 109 | 110 | def get_gpa(cookie: str): 111 | score = get_score(cookie) 112 | if score is None: 113 | return 114 | need_cj = ['A', 'B', 'C', 'D', 'F'] 115 | name = score['xm'] 116 | student_id = score['xh'] 117 | score_list = score['scoreLists'] 118 | score_list = [(int(ii['xf']), ii['cj']) 119 | for ii in score_list if ii['cj'][0] in need_cj] 120 | grade_list = [(ii, get_grade_point(jj)) for ii, jj in score_list] 121 | TG = sum([ii * jj for ii, jj in grade_list]) 122 | TC = sum([ii for ii, _ in grade_list]) 123 | level = [ii[0] for _, ii in score_list] 124 | level_count = Counter(level) 125 | gpa = TG / TC 126 | echo(1, f'{name}, Congratulations u get {TC} credits and {gpa:.3f} gpa in this university.') 127 | for ii in need_cj: 128 | if ii not in level_count: 129 | continue 130 | count = level_count[ii] 131 | echo(2, f'U have {count} class get {ii}.') 132 | 133 | 134 | def get_grade_point(score: str): 135 | score_map = {'A': 4, 'B': 3, 'C': 2, 'D': 1, 'F': 0} 136 | grade_point = score_map[score[0]] 137 | if len(score) == 2 and score[0] != 'F': 138 | flag = 1 if score[1] == '+' else -1 139 | grade_point += 0.3 * flag 140 | grade_point = min(4, grade_point) 141 | return grade_point 142 | 143 | 144 | if __name__ == '__main__': 145 | if not os.path.exists(data_path): 146 | os.makedirs(data_path) 147 | parser = argparse.ArgumentParser(description='pku student helper') 148 | parser.add_argument('--mode', type=int, default=1, metavar='mode',help='0->bruchclass,1->get_gpa') 149 | parser.add_argument('--cookie', type=str, default='', metavar='cookie',help='portal cookie') 150 | mode = parser.parse_args().mode 151 | if mode == 0: 152 | brush = Brush() 153 | brush.have_places() 154 | else: 155 | cookie = parser.parse_args().cookie.replace('\'', '').replace('"', '') 156 | get_gpa(cookie) 157 | -------------------------------------------------------------------------------- /buildmd/article.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE if not exists `article` ( 2 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary keys', 3 | `article_id` varchar(50) NOT NULL DEFAULT '0' COMMENT 
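Back in brushclass/brushclass.py, get_grade_point() above maps PKU-style letter grades to grade points: the base letter sets the integer part, a '+' or '-' shifts it by 0.3, and 'A+' is capped at 4.0. A quick check of those rules (rounded to dodge float noise):

```python
# Illustrative check (not from the repo) of get_grade_point() behaviour.
for grade, expected in [('A', 4.0), ('A+', 4.0), ('A-', 3.7), ('B+', 3.3), ('F', 0.0)]:
    assert round(get_grade_point(grade), 1) == expected
```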
'article id', 4 | `title` varchar(500) NOT NULL DEFAULT '0' COMMENT 'article title', 5 | `q` varchar(500) NOT NULL DEFAULT '0' COMMENT 'article q', 6 | `is_deleted` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'is deleted', 7 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'create time', 8 | `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time', 9 | PRIMARY KEY (`id`) 10 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 comment='article table'; -------------------------------------------------------------------------------- /buildmd/tbk.ini.tmp: -------------------------------------------------------------------------------- 1 | [TBK] 2 | appkey = 123 3 | secret = 123 4 | user_id = 123 5 | site_id = 123 6 | adzone_id = 123 7 | uland_url = http:// 8 | test_item_id = 123 9 | test_finger_id = 111 10 | apikey = 111 11 | 12 | [YNOTE] 13 | cookie = "123456" 14 | home_id = 123456 15 | unlogin_id = 123456 -------------------------------------------------------------------------------- /buildmd/tpwd.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE if not exists `article_tpwd` ( 2 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary keys', 3 | `article_id` varchar(50) NOT NULL DEFAULT '0' COMMENT 'article id', 4 | `tpwd_id` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'tpwd item id', 5 | `item_id` varchar(30) NOT NULL DEFAULT '0' COMMENT 'goods item id', 6 | `tpwd` varchar(30) NOT NULL DEFAULT '0' COMMENT 'tpwd content', 7 | `domain` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'tpwd type @0->s.click, @1->item, @5->uland, @10->taoquan', 8 | `content` varchar(300) NOT NULL DEFAULT '_' COMMENT 'tpwd content', 9 | `url` varchar(1000) NOT NULL DEFAULT '_' COMMENT 'tpwd corresponding url', 10 | `commission_rate` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'commission rate', 11 | `commission_type` varchar(30) NOT NULL DEFAULT '' COMMENT 'commission type', 12 | `expire_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'expire time', 13 | `is_deleted` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'is deleted', 14 | `other1` varchar(300) NOT NULL DEFAULT '' COMMENT 'other1', 15 | `other2` varchar(300) NOT NULL DEFAULT '' COMMENT 'other2', 16 | `other3` varchar(300) NOT NULL DEFAULT '' COMMENT 'other3', 17 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'create time', 18 | `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time', 19 | PRIMARY KEY (`id`) 20 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 comment='table for article info in tbk'; 21 | -------------------------------------------------------------------------------- /ctrip/hotelDetail.js: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: gunjianpan 3 | * @Date: 2019-04-20 22:21:40 4 | * @Last Modified by: gunjianpan 5 | * @Last Modified time: 2019-04-21 13:18:38 6 | */ 7 | 8 | const jsdom = require('jsdom'); 9 | const {JSDOM} = jsdom; 10 | 11 | function genEleven(script, url, callback) { 12 | const dom = new JSDOM(); 13 | window = dom.window; 14 | document = window.document; 15 | window.decodeURIComponent = decodeURIComponent; 16 | let href = url 17 | let userAgent = 'Chrome/73.0.3682.0' 18 | let geolocation = 0; 19 | document.createElement('div'); 20 | var div = document.createElement('div'); 21 | div.innerHTML = '333'; 22 | window[callback] = 23 | 
function(e) { 24 | window.AAA = e(); 25 | } 26 | 27 | eval(script); 28 | console.log(aaa); 29 | return aaa; 30 | } 31 | url = 'https://hotels.ctrip.com/hotel/4889292.html' 32 | 33 | script = 'let aaa = 1;' 34 | genEleven(script, url, 'CASNAuIDNBfCYLBKdi') 35 | -------------------------------------------------------------------------------- /ctrip/hotelDetail.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-04-20 10:57:55 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2019-10-11 02:04:24 6 | 7 | import codecs 8 | import datetime 9 | import json 10 | import os 11 | import random 12 | import re 13 | import shutil 14 | import sys 15 | import threading 16 | import time 17 | 18 | import numpy as np 19 | import tzlocal 20 | from bs4 import BeautifulSoup 21 | 22 | import execjs 23 | import js2py 24 | 25 | sys.path.append(os.getcwd()) 26 | from util.util import (basic_req, begin_time, changeHeaders, decoder_fuzz, 27 | echo, end_time, time_str, get_accept, get_content_type) 28 | 29 | 30 | data_dir = 'ctrip/data/' 31 | cookie_path = 'ctrip/cookie.txt' 32 | compress_path = 'ctrip/compress.js' 33 | one_day = 86400 34 | 35 | 36 | def decoder_confusion(): 37 | ''' decoder confusion ''' 38 | with open(f'{data_dir}fingerprint.js', 'r') as f: 39 | origin_js = [codecs.unicode_escape_decode( 40 | ii.strip())[0] for ii in f.readlines()] 41 | __0x3717e_begin = origin_js[1].index('[') + 1 42 | __0x3717e_end = origin_js[1].index(']') 43 | __0x3717e = origin_js[1][__0x3717e_begin:__0x3717e_end].split(',') 44 | __0x3717e = [ii.strip() for ii in __0x3717e] 45 | origin_str = '|||||'.join(origin_js) 46 | params = re.findall(r'var (.*?) =', origin_str) 47 | params_another = re.findall(r'function\((.*?)\)', origin_str) 48 | params_another = sum([ii.replace('|||||', '').split(',') 49 | for ii in params_another], []) 50 | params += params_another 51 | 52 | params = sorted(list(set([ii for ii in params if len( 53 | ii) > 6])), key=lambda ii: len(ii), reverse=True) 54 | for ii, jj in enumerate(__0x3717e): 55 | origin_str = origin_str.replace(f'__0x3717e[{ii}]', jj) 56 | for ii, jj in enumerate(params): 57 | origin_str = origin_str.replace(jj, f'a{ii}') 58 | with open(f'{data_dir}fingerprint_confusion.js', 'w') as f: 59 | f.write('\n'.join(origin_str.split('|||||'))) 60 | 61 | 62 | def load_ocean(): 63 | ''' load ocean ''' 64 | with open(f'{data_dir}oceanball_origin.js', 'r') as f: 65 | origin_js = [ii.strip() for ii in f.readlines()] 66 | origin_str = '|||'.join(origin_js) 67 | params = [*re.findall(r'var ([a-zA-Z]*?) 
=', origin_str), 68 | re.findall(r'var ([a-zA-Z]*?);', origin_str)] 69 | params_another = re.findall(r'function\((.*?)\)', origin_str) 70 | params_another = sum([ii.replace('|||', '').split(',') 71 | for ii in params_another], []) 72 | params += params_another 73 | params += re.findall(r', ([a-zA-Z]*?)\)', origin_str) 74 | params += re.findall(r'\(([a-zA-Z]*?),', origin_str) 75 | 76 | params = sorted(list(set([ii for ii in params if len( 77 | ii) > 6])), key=lambda ii: len(ii), reverse=True) 78 | for ii, jj in enumerate(params): 79 | origin_str = origin_str.replace(jj, f'a{ii}') 80 | with open(f'{data_dir}oceanball_origin_decoder.js', 'w') as f: 81 | f.write(origin_str.replace('|||', '\n')) 82 | 83 | 84 | def load_ocean_v2(): 85 | ''' load ocean ball v2 @2019.6.9 ''' 86 | decoder_fuzz('(_\w{3,7}_\w{5})', 87 | '{}oceanballv2_july.js'.format(data_dir), 88 | replace_func=replace_params) 89 | 90 | 91 | def replace_params(origin_str: str, reg: str) -> str: 92 | ''' replace params ''' 93 | params_re = re.findall(reg, origin_str) 94 | echo(1, "You're", re.findall('_(.*?)_', params_re[0])[0]) 95 | params = {} 96 | for ii in params_re: 97 | if not ii in params: 98 | params[ii] = len(params) 99 | for ii in sorted(list(params.keys()), key=lambda i: -len(i)): 100 | origin_str = origin_str.replace(ii, f'a{params[ii]}') 101 | return origin_str 102 | 103 | 104 | def load_html_js(): 105 | with open(f'{data_dir}html_js.js', 'r') as f: 106 | origin_js = [ii.strip() for ii in f.readlines()] 107 | origin_str = '|||'.join(origin_js) 108 | 109 | ''' long params name replace ''' 110 | params = re.findall(r'_0x\w{6}?', origin_str) 111 | params += re.findall(r'_0x\w{5}?', origin_str) 112 | params += re.findall(r'_0x\w{4}?', origin_str) 113 | params = sorted(list(set(params)), key=lambda ii: len(ii), reverse=True) 114 | 115 | ''' __0x33920 ''' 116 | __0x33920_begin = origin_js[35].index('[') + 1 117 | __0x33920_end = origin_js[35].index(']') 118 | __0x33920 = origin_js[35][__0x33920_begin:__0x33920_end].split(',') 119 | __0x33920 = [ii.strip() for ii in __0x33920] 120 | for ii, jj in enumerate(__0x33920): 121 | origin_str = origin_str.replace('__0x33920[{}]'.format(ii), jj) 122 | 123 | ''' _0x4f05 ''' 124 | _0x4f05_dict = {2: "prototype", 3: "hashCode", 4: "length", 5: "pmqAv", 6: "charCodeAt", 11: "EcTAI", 12: "bTlKh", 13: "prototype", 14: "toString", 15: ";expires=", 16: ";path=/", 17: "getDate", 18: "xxxxt", 19: "xxxxt", 20: "ymGjh", 21: "DjPmX", 22: "cookie", 23: "cookie", 125 | 24: "split", 25: "length", 26: "webdriver", 27: "random", 28: "abs", 29: "userAgent", 30: "replace", 31: "abs", 32: "hashCode", 33: "substr", 34: "host", 35: "indexOf", 36: "m.ctrip", 45: "fcerror", 46: "_zQdjfing", 47: "_RGUID", 48: "replace", 49: "fromCharCode", 50: "QVALA"} 126 | _0x4f05_origin = {ii: hex(ii) for ii in _0x4f05_dict.keys()} 127 | _0x4f05_replace = {ii: re.findall( 128 | r'_0x4f05\("%s",.{7}\)' % jj, origin_str) for ii, jj in _0x4f05_origin.items()} 129 | print(_0x4f05_replace) 130 | for ii, jj in _0x4f05_replace.items(): 131 | for kk in jj: 132 | origin_str = origin_str.replace( 133 | kk, '"{}"'.format(_0x4f05_dict[ii])) 134 | 135 | ''' _0x1bf9 ''' 136 | _0x1bf9_dict = {1: "eit", 2: "NMs", 3: "FMx", 4: "utc", 5: "sign", 6: "sign", 22: "mMa", 23: ";path=/", 24: "KWcVI", 25: "KWcVI", 33: "setDate", 34: "getDate", 35: "cookie", 36: "dnvrD", 37: "dnvrD", 38: "dnvrD", 39: "ceIER", 40: "toGMTString", 41: "jXvnT", 42: "abs", 43: "hashCode", 47: "DkDiA", 48: "btRpY", 49: "sign", 50: "href", 51: "length", 52: "OZJLY", 53: 
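The _0x4f05 replacement above inlines an obfuscator's string table by searching for every call site of each known index. The same idea can be written as a single pass with re.sub; the call shape ("0xNN" followed by a 7-character second argument) is taken from the regexes above and is an assumption about this particular obfuscated source, not a general rule:

```python
# Illustrative sketch (not from the repo): one-pass string-table inlining.
import re

def inline_string_table(js: str, fn_name: str, table: dict) -> str:
    pattern = re.compile(r'%s\("(0x[0-9a-fA-F]+)",.{7}\)' % re.escape(fn_name))
    def repl(m):
        key = int(m.group(1), 16)
        return '"{}"'.format(table[key]) if key in table else m.group(0)
    return pattern.sub(repl, js)
```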
"HWzfY", 54: "btRpY", 55: "ZQRZh", 56: "rSeVr", 57: "pow", 58: "pop", 59: "ZQRZh", 60: "KEEqN", 61: "xmTXV", 62: "abs", 63: "mytJr", 64: "btRpY", 65: "hashCode", 66: "abs", 67: "xbNid", 68: "evWhs", 69: "log", 137 | 70: "tStBb", 71: "toFixed", 72: "sign", 73: "wBNtc", 74: "abs", 75: "wyibM", 76: "bSvQq", 77: "dHSnF", 78: "random", 79: "getTimezoneOffset", 80: "BzPEC", 81: "dHSnF", 82: "WYJFv", 83: "WYJFv", 84: "split", 85: "length", 86: "QTDGI", 89: "BzPEC", 90: "AceIM", 91: "wOQ", 93: "TGIHa", 94: "join", 95: "join", 96: "join", 97: "HTF", 98: "ioW", 99: "HfzNS", 100: "MIA", 101: "FNbOm", 102: "HfzNS", 103: "OCGEJ", 104: "HfzNS", 105: "aYQhD", 107: "push", 108: "length", 109: "call", 110: "call", 111: "call", 112: "split", 113: "call", 114: "WYJFv", 115: "ZmtWg", 116: "zYC", 119: "join"} 138 | _0x1bf9_origin = {ii: hex(ii) for ii in _0x1bf9_dict.keys()} 139 | _0x1bf9_replace = {ii: re.findall( 140 | r'_0x1bf9\("%s",.{7}\)' % jj, origin_str) for ii, jj in _0x1bf9_origin.items()} 141 | print(_0x1bf9_replace) 142 | for ii, jj in _0x1bf9_replace.items(): 143 | for kk in jj: 144 | origin_str = origin_str.replace( 145 | kk, '"{}"'.format(_0x1bf9_dict[ii])) 146 | 147 | for ii, jj in enumerate(params): 148 | origin_str = origin_str.replace(jj, 'a{}'.format(ii)) 149 | with open('{}html_js_decoder.js'.format(data_dir), 'w') as f: 150 | f.write(origin_str.replace('|||', '\n')) 151 | 152 | 153 | HOTELS_URL = 'https://hotels.ctrip.com/' 154 | HOTEL_ROOMLIST_DETAIL_URL = '%sDomestic/tool/AjaxHote1RoomListForDetai1.aspx' % HOTELS_URL 155 | OCEANBALL_URL = '{}domestic/cas/oceanball'.format(HOTELS_URL) 156 | HOTEL_DETAIL_URL = '{}hotel/%d.html'.format(HOTELS_URL) 157 | AJAX_PROMOTION_URL = '{}Domestic/Tool/AjaxGetPromotionFilterList.aspx'.format( 158 | HOTELS_URL) 159 | 160 | 161 | class HotelDetail: 162 | ''' generate params for https://hotels.ctrip.com/Domestic/tool/AjaxHote1RoomListForDetai1.aspx ''' 163 | 164 | def __init__(self): 165 | self.default_hotel_id = 4889292 166 | self.header = { 167 | 'Cookie': '', 168 | 'Accept': get_accept('html'), 169 | 'Content-Type': get_content_type(), 170 | } 171 | 172 | def generate_callback(self, e): 173 | ''' generate callback params e ''' 174 | cl = [chr(ii) for ii in range(65, 123) if ii > 96 or ii < 91] 175 | o = ''.join(["CAS", *[cl[ii] for ii in np.random.randint(0, 51, e)]]) 176 | return o 177 | 178 | def generate_eleven_v2(self, hotel_id: int): 179 | ################################################################ 180 | # 181 | # [generate eleven] version 19.7.28(Test ✔️) write by gunjianpan 182 | # 183 | # 1. random generate 15 bit param `callback`; 184 | # 2. use callback request OCEANBALL -> get origin js; 185 | # 3. decoder params to union param; 186 | # 4. find where the code eval; 187 | # 'h=a3.pop(),i=a11(h);return a18(i.apply(h.o,g),ud,ud,0),' 188 | # 5. compare the env of chrome with node. 189 | # 'https://github.com/iofu728/spider/tree/develop#oceannballv2' 190 | # 5. you will get `爬虫前进的道路上还是会有各种各样想不到的事情会发生` 191 | # 6. 
final, return, and joint params; 192 | # 193 | ################################################################ 194 | 195 | referer_url = HOTEL_DETAIL_URL % hotel_id 196 | self.header['Referer'] = referer_url 197 | callback = self.generate_callback(15) 198 | now_time = int(time.time() * 1000) 199 | url = f'{OCEANBALL_URL}?callback={callback}&_={now_time}' 200 | oj, cookie = basic_req(url, 3, need_cookie=True, header=self.header) 201 | print(cookie) 202 | oj = replace_params(oj, '(_\w{3,7}_\w{5,6})') 203 | oj = oj.replace('"this"', 'this').replace('\'', '"').replace('\n', '') 204 | ooj = oj 205 | 206 | ''' replace window ''' 207 | oj = oj.replace('Window', 'window') 208 | oj = oj.replace('window', 'windows') 209 | 210 | ''' return answer ''' 211 | echo(0, 'Num of a6[h][i]', oj.count('a19[0])}}return a18(a6[h][i]')) 212 | echo(0, 'Num 0f end', oj.count('});; })();')) 213 | oj = oj.replace('});; })();', '});;return aa;})();') 214 | ooj = ooj.replace('});; })();', '});;return aa;})();') 215 | 216 | ''' windows setting ''' 217 | windows_str = 'function(){ var windows = {"navigator":{"userAgent":"Mozilla/5.0"}};aa=[];windows["' + \ 218 | callback + \ 219 | '"] = function(e) {temp = e();console.log(temp);return temp};' 220 | oj = oj.replace('function(){ ', windows_str) 221 | 222 | oj = "function aabb(){tt=" + oj + ";return tt;}" 223 | 224 | ''' replace param undefine replace''' 225 | oj = oj.replace('env.define;', 'windows.define;') 226 | oj = oj.replace('env.module;', 'windows.module;') 227 | oj = oj.replace('env.global;', 'windows.global;') 228 | oj = oj.replace('env.require;', 'windows.require;') 229 | oj = oj.replace('env.', '') 230 | 231 | ''' synchronous node & chrome v8 param''' 232 | oj = oj.replace( 233 | 'var a2=', 'require=undefined;module=undefined;global=undefined;var a2=') 234 | oj = oj.replace('process:process,', 'process:NO,') 235 | oj = oj.replace('process,', 'NO, ') 236 | oj = oj.replace( 237 | 'return a19[p];', 'var last = a19[p];if (last.k == 0 && last.o == 0 && last.r == 0 && last.v != 0) {last.v = TypeError();}return last;') 238 | 239 | oj = oj.replace('h=a3.pop(),i=a11(h);return a18(i.apply(h.o,g),ud,ud,0),', 240 | 'h=a3.pop(),i=a11(h);var test = h.k!="getOwnPropertyNames" ? i.apply(h.o,g) :[];if(h.o=="function tostring() { [python code] }"){test=23};if(g=="object window"){test=21};if(h.k=="keys"){test=["TEMPORARY", "PERSISTENT"];}aa=test;return a18(test, ud, ud, 0),') 241 | 242 | ''' eval script ''' 243 | eleven = js2py.eval_js(oj + ';aabb()') 244 | echo(1, 'eleven', eleven) 245 | return eleven 246 | 247 | def generate_eleven(self, hotel_id: int): 248 | ################################################################ 249 | # 250 | # [generate eleven] version 19.4.21(Test ✔️) write by gunjianpan 251 | # 252 | # 1. random generate 15 bit param `callback`; 253 | # 2. use callback request OCEANBALL -> get origin js; 254 | # 3. eval once -> (match array, and then chr() it) -> decoder js; 255 | # 4. replace document and windows(you also can use execjs & jsdom); 256 | # 5. warning you should replace `this` to some params, 257 | # Otherwise, you will get `老板给小三买了包, 却没有给你钱买房` 258 | # 6. 
final, return, and joint params; 259 | # 260 | ################################################################ 261 | 262 | callback = self.generate_callback(15) 263 | now_time = int(time.time() * 1000) 264 | url = '{}?callback={}&_={}'.format(OCEANBALL_URL, callback, now_time) 265 | referer_url = HOTEL_DETAIL_URL % hotel_id 266 | changeHeaders( 267 | {'Referer': referer_url, 'if-modified-since': 'Thu, 01 Jan 1970 00:00:00 GMT'}) 268 | oceanball_js, cookie = basic_req(url, 3, need_cookie=True) 269 | print(cookie) 270 | 271 | array = re.findall(r'\(\[(.*)\],', oceanball_js)[0].split(',') 272 | array = [int(ii) for ii in array] 273 | offset = int(re.findall(r'item-(\d*?)\)', oceanball_js)[0]) 274 | 275 | ''' String.fromCharCode ''' 276 | oe = ''.join([chr(ii - offset) for ii in array]) 277 | 278 | ''' replace window[callback] callback function ''' 279 | replace_str = re.findall(r'{}\(new.*\)\);'.format(callback), oe)[0] 280 | eleven_params = re.findall( 281 | r'{}\(new.*\+ (.*?) \+.*\)\);'.format(callback), oe)[0] 282 | replaced_str = 'return {};'.format(eleven_params) 283 | oe = oe.replace(replace_str, replaced_str) 284 | oe = oe.replace('\'', '"').replace('\r', '') 285 | oe = oe.replace(';!', 'let aaa = ', 1) 286 | 287 | replace = ''' 288 | function(){let href="https://hotels.ctrip.com/hotel/%d.html"; 289 | a={"documentElement": {"attributes":{}}}; 290 | b={}; 291 | function c(){}; 292 | userAgent ="Chrome/73.0.3682.0"; 293 | geolocation = 1; 294 | ''' % hotel_id 295 | 296 | ''' replace document & windown & navigator ''' 297 | oe = oe.replace('document.body.innerHTML.length', '888').replace( 298 | 'document.body.innerHTML', '""') 299 | oe = oe.replace('document.createElement("div")', '{}') 300 | oe = oe.replace('window.HTMLSpanElement', 'c').replace( 301 | 'document.createElement("span")', 'new c') 302 | oe = oe.replace('window.location.href', 'href').replace( 303 | 'location.href', 'href') 304 | oe = oe.replace('navigator.', '') 305 | oe = oe.replace('new Image().', '').replace('new Image();', '') 306 | oe = oe.replace('document.all', '0').replace('document.referrer', '""') 307 | oe = oe.replace('this || ', '') 308 | oe = oe.replace('window["document"]', 'a') 309 | 310 | oe = oe.replace('document', 'a').replace('window', 'b') 311 | oe = oe.replace('function(){', replace, 1) 312 | 313 | ''' eval script ''' 314 | eleven = js2py.eval_js(oe) 315 | echo(1, 'eleven', eleven) 316 | 317 | return eleven 318 | 319 | def generate_other_params(self, hotel_id: int = 4889292, city_id: int = 2, 320 | startDate: str = time_str(-1, '%Y-%m-%d'), 321 | depDate: str = time_str(int(time.time() + one_day), '%Y-%m-%d')): 322 | ''' generate other params ''' 323 | params = { 324 | 'psid': None, 325 | 'MasterHotelID': hotel_id, 326 | 'hotel': hotel_id, 327 | 'EDM': 'F', 328 | 'roomId': None, 329 | 'IncludeRoom': None, 330 | 'city': city_id, 331 | 'showspothotel': 'T', 332 | 'supplier': None, 333 | 'IsDecoupleSpotHotelAndGroup': 'F', 334 | 'contrast': 0, 335 | 'brand': 776, 336 | 'startDate': startDate, 337 | 'depDate': depDate, 338 | 'IsFlash': 'F', 339 | 'RequestTravelMoney': 'F', 340 | 'hsids': None, 341 | 'IsJustConfirm': None, 342 | 'contyped': 0, 343 | 'priceInfo': -1, 344 | 'equip': None, 345 | 'filter': None, 346 | 'productcode': None, 347 | 'couponList': None, 348 | 'abForHuaZhu': None, 349 | 'defaultLoad': 'T', 350 | 'esfiltertag': None, 351 | 'estagid': None, 352 | 'Currency': None, 353 | 'Exchange': None, 354 | 'minRoomId': 0, 355 | 'maskDiscount': 0, 356 | 'TmFromList': 'F', 357 | 'th': 119, 358 | 
'RoomGuestCount': '1,1,0', 359 | 'promotionf': None, 360 | 'allpoint': None, 361 | } 362 | return params 363 | 364 | def get_hotel_detail(self, hotel_id: int): 365 | ''' get hotel detail ''' 366 | params = { 367 | **self.generate_other_params(hotel_id), 368 | 'eleven': self.generate_eleven_v2(hotel_id), 369 | 'callback': self.generate_callback(16), 370 | '_': int(time.time() * 1000) 371 | } 372 | params_list = ['{}={}'.format( 373 | ii, (jj if not jj is None else '')) for ii, jj in params.items()] 374 | url = '{}?{}'.format(HOTEL_ROOMLIST_DETAIL_URL, '&'.join(params_list)) 375 | echo(2, 'XHR url', url) 376 | req, _ = basic_req(url, 1, need_cookie=True, header=self.header) 377 | return req 378 | 379 | def parse_detail(self, hotel_id: int = 4889292): 380 | ''' parse hotel detail ''' 381 | 382 | version = begin_time() 383 | # self.user_action(hotel_id) 384 | # self.generate_cookie(hotel_id) 385 | # self.prepare_req() 386 | text = self.get_hotel_detail(hotel_id) 387 | html = BeautifulSoup(text['html'], 'html.parser') 388 | trs = html.findAll('tr')[2:] 389 | hotel_detail = [] 390 | 391 | for tr in trs: 392 | room_name = re.findall('baseroomname="(.*?)"', str(tr)) 393 | if not len(room_name): 394 | room_name = re.findall('l="nofollow">\n(.*?)\n', str(tr)) 395 | room_name = room_name[0].strip() if len( 396 | room_name) else (hotel_detail[-1][0] if len(hotel_detail) else '') 397 | price = re.findall(r'(\d{4,5}?)', str(tr)) 398 | if not len(price): 399 | continue 400 | sales_price_list = re.findall(r'促销优惠减(.*?)', str(tr)) 401 | sales_price = sales_price_list[0] if len(sales_price_list) else '' 402 | price_type = re.findall('room_type_name">(.*?)', str(tr))[0] 403 | if 'em' in price_type: 404 | price_type = ','.join([*re.findall( 405 | '(.*?) dict: 424 | return {ii.split('=', 1)[0]: ii.split('=', 1)[1] for ii in cookie.split('; ')} 425 | 426 | def encoder_cookie(self, cookie_dict: {}) -> str: 427 | return '; '.join(['{}={}'.format(ii, jj)for ii, jj in cookie_dict.items()]) 428 | 429 | def get_timezone_offset(self): 430 | local_tz = tzlocal.get_localzone() 431 | return -int(local_tz.utcoffset(datetime.datetime.today()).total_seconds() / 60) 432 | 433 | def a312(self, a312_value): 434 | a323_list = [0, 36, 5, 5, 5, 5, 137, 137, 36, 171] 435 | a199 = 0 if a312_value > len(a323_list) - 1 else a323_list[a312_value] 436 | return '{}{}'.format('0' if a199 < 16 else '', str(hex(a199)).split('0x', 1)[1]) 437 | 438 | def generate_v1(self, time_stamp: int = 0): 439 | a241, a166, a144 = self.get_timezone_offset(), int(time.time() * 1000), 10 440 | a166 += sum([np.int32((int('0x2ede', 16) + ii) * a241) 441 | for ii in range(6)]) 442 | a166 = a166 if not time_stamp else time_stamp 443 | a33 = [int(ii) for ii in list(str(a166))] 444 | for ii in range(len(a33)): 445 | a33[ii] ^= a144 446 | a144 = a33[ii] 447 | 448 | a34 = [int(ii) for ii in list(str(a166))] 449 | a167 = [a34[len(a34) - ii - 1] for ii, _ in enumerate(a34)] 450 | a13 = [0x3, 0x1, 0x2, 0x6, 0xb, 0x5, 0xa, 0x4, 0x8, 0x0, 0x9, 0x7, 0xc] 451 | a217 = [self.a312(a167[ii if ii > len(a167) else a13[ii]]) 452 | for ii in range(len(a167))] 453 | cookie = {'htltmp': ''.join( 454 | [hex(ii)[-1] for ii in a33]), 'utc': str(a166), 'htlstmp': ''.join(a217), 'MKT_Pagesource': 'PC'} 455 | return cookie 456 | 457 | def login_cookie(self): 458 | if not os.path.exists(cookie_path): 459 | shutil.copy(cookie_path + '.tmp', cookie_path) 460 | with open(cookie_path) as f: 461 | cookie = self.decoder_cookie(f.read().strip()) 462 | return cookie 463 | 464 | def 
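get_hotel_detail() above assembles the XHR query string by hand so that None turns into an empty value. urllib.parse.urlencode can do the same; safe=',' keeps RoomGuestCount's commas unescaped like the hand-built string, though urlencode still percent-escapes other characters that the manual join leaves untouched:

```python
# Illustrative sketch (not from the repo): query-string assembly for the room-list XHR.
from urllib.parse import urlencode

def build_detail_url(base_url: str, params: dict) -> str:
    query = urlencode({k: '' if v is None else v for k, v in params.items()}, safe=',')
    return '{}?{}'.format(base_url, query)
```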
user_action(self, hotel_id: int = 4889292): 465 | 466 | url = '{}hotel/{}.html'.format(HOTELS_URL, hotel_id) 467 | text = basic_req(url, 3) 468 | page_id = int(re.findall(r'id="page_id" value="(\d*?)" />', text)[0]) 469 | correlation_id = re.findall(r'relationId" value="(\d*?)"/>', text)[0] 470 | 471 | e = self.login_cookie()['_bfa'].split('.') 472 | common = [page_id, e[1] + '.' + e[2], int(e[6]), int(e[7]), correlation_id, 473 | "M:70,181023_hod_fxtj:B;", '', '2.6.9', "vq5tkk-ufpyck-qsxbg3", "", "", "", "", "", "online"] 474 | _queue = [{ 475 | 'action': 'click', 476 | 'xpath': "HTML/BODY[@id='mainbody']/FORM[@id='aspnetForm']/DIV[3][@id='base_bd']/DIV[4]/DIV[@id='divDetailMain']/DIV[9][@id='id_room_select_box']/DIV[2]/DIV/DIV/A[@id='changeBtn'][@x='{}'][@y='{}'][@rx='{}'][@ry='{}']".format(random.randint(50, 80), random.randint(650, 750), random.randint(20, 40), random.randint(5, 20)), 477 | 'ts': int(time.time() * 1000), 478 | }] 479 | ee = [[2, "useraction"], common, _queue] 480 | eee = json.dumps(ee, separators=(',', ':')) 481 | print(eee) 482 | compress = execjs.compile(open(compress_path).read()) 483 | eeee = compress.call('compress', eee) 484 | echo(2, eeee) 485 | cookie = {'uid': 'Yn17vOkRm2gW+jCNwT8jPg=='} 486 | header = { 487 | 'Referer': 'https://hotels.ctrip.com/hotel/4889292.html', 488 | 'Cookie': self.encoder_cookie(cookie) 489 | } 490 | url = 'https://s.c-ctrip.com/bf.gif?ac=a&d={}&jv=1.0.0'.format(eeee) 491 | req = basic_req(url, 2, header=header) 492 | echo(0, req.cookies.get_dict()) 493 | 494 | def prepare_req(self, hotel_id: int = 4889292, city_id: int = 2, 495 | startDate: str = time_str(-1, '%Y-%m-%d'), 496 | depDate: str = time_str(int(time.time() + one_day), '%Y-%m-%d')): 497 | referer_url = HOTEL_DETAIL_URL % hotel_id 498 | 499 | changeHeaders({'Referer': referer_url}) 500 | data = {'city': city_id, 'checkin': startDate, 501 | 'cjeckout': depDate, 'defalutVal': None} 502 | return basic_req(AJAX_PROMOTION_URL, 11, data=data) 503 | 504 | 505 | if __name__ == '__main__': 506 | if not os.path.exists(data_dir): 507 | os.makedirs(data_dir) 508 | ch = HotelDetail() 509 | ch.parse_detail() 510 | -------------------------------------------------------------------------------- /dytt8/dytt8.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author: gunjianpan 3 | @Date: 2019-04-20 15:04:03 4 | @Last Modified by: gunjianpan 5 | @Last Modified time: 2019-04-21 21:37:37 6 | ''' 7 | 8 | import os 9 | import re 10 | import sys 11 | import threading 12 | 13 | sys.path.append(os.getcwd()) 14 | from proxy.getproxy import GetFreeProxy 15 | from util.util import (begin_time, can_retry, echo, end_time, 16 | shuffle_batch_run_thread) 17 | 18 | proxy_req = GetFreeProxy().proxy_req 19 | HOMEPAGE_URL = 'https://www.dytt8.net' 20 | movie_list, movie_another, movie_again = [], [], [] 21 | 22 | 23 | def load_index(): 24 | ''' load index ''' 25 | global movie_list 26 | version = begin_time() 27 | text = proxy_req(HOMEPAGE_URL, 3) 28 | if not len(text): 29 | if can_retry(HOMEPAGE_URL): 30 | load_index() 31 | return 32 | movie_list = re.findall('《(.*?)》', text) 33 | movie_more = re.findall('href="(.*?)">更多', text) 34 | for uri in movie_more: 35 | load_other(uri) 36 | 37 | threading_list = [threading.Thread( 38 | target=load_other, args=(ii,)) for ii in movie_another] 39 | shuffle_batch_run_thread(threading_list, 100) 40 | threading_list = [threading.Thread( 41 | target=load_other, args=(ii,)) for ii in movie_again] 42 | 
shuffle_batch_run_thread(threading_list, 100) 43 | # 对电影列表去重 44 | movie_list = set(movie_list) 45 | # 导出爬取的 电影列表 46 | out_path = 'dytt8_result.txt' 47 | with open(out_path, 'w') as f: 48 | f.write('\n'.join(movie_list)) 49 | url_num = len([*movie_more, *movie_another]) + 1 50 | movie_num = len(movie_list) 51 | echo(1, 'Requests num: {}\nMovie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format( 52 | url_num, movie_num, out_path, end_time(version, 0))) 53 | 54 | 55 | def load_other(uri): 56 | ''' load other ''' 57 | global movie_list, movie_another, movie_again 58 | url = HOMEPAGE_URL + uri if not 'http' in uri else uri 59 | text = proxy_req(url, 3) 60 | temp_list = re.findall('《(.*?)》', text) 61 | echo(2, 'loading', url, 'movie num:', len(temp_list)) 62 | 63 | if text == '' or not len(temp_list): 64 | if can_retry(url): 65 | load_other(uri) 66 | else: 67 | movie_again.append(url) 68 | return 69 | if 'index' in url and '共' in text: 70 | total_page = re.findall('共(.*?)页', text)[0] 71 | suffix_str = re.findall(r"value=\'(.*?)1.html\' selected", text)[0] 72 | more_movie = [url.replace('index.html', '{}{}.html'.format( 73 | suffix_str, ii)) for ii in range(2, int(total_page) + 1)] 74 | else: 75 | more_movie = [] 76 | movie_list += temp_list 77 | movie_another += more_movie 78 | 79 | 80 | if __name__ == '__main__': 81 | load_index() 82 | -------------------------------------------------------------------------------- /eastmoney/eastmoney.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-03-29 10:35:27 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2019-03-29 12:38:54 6 | 7 | import codecs 8 | import json 9 | import os 10 | import pickle 11 | import requests 12 | import time 13 | 14 | from fontTools.ttLib import TTFont 15 | 16 | """ 17 | * data.eastmoney.com/bbsj/201806/lrb.html 18 | .data/ 19 | ├── base.pkl // base_unicode list 20 | ├── base.woff // base font file (autoload) 21 | ├── eastmony%Y-%m-%d_%H:%M:%S.csv // result .csv 22 | └── font.woff // last time font file 23 | """ 24 | data_dir = 'eastmoney/data/' 25 | base_dir = '%sbase.' 
% data_dir 26 | base_pkl = '%spkl' % base_dir 27 | base_font = '%swoff' % base_dir 28 | url = 'http://data.eastmoney.com/bbsj/201806/lrb.html' 29 | 30 | header = { 31 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 32 | 'Accept-Encoding': '', 33 | 'Accept-Language': 'zh-CN,zh;q=0.9', 34 | 'Cache-Control': 'no-cache', 35 | 'Connection': 'keep-alive', 36 | 'Host': 'data.eastmoney.com', 37 | 'Pragma': 'no-cache', 38 | 'Upgrade-Insecure-Requests': '1', 39 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3747.0 Safari/537.36' 40 | } 41 | 42 | 43 | def analysis_font(font_url: str, mode=None) -> dict: 44 | ''' analysis font ''' 45 | if (not os.path.exists(base_font) or not os.path.exists(base_pkl)) and not mode: 46 | print('base file not exist!!!') 47 | return 48 | 49 | suffix = font_url.split('.')[-1] 50 | font = requests.get(font_url, headers=header, timeout=30) 51 | font_name = '%sfont.%s' % (data_dir, suffix) 52 | with codecs.open(font_name, 'wb') as f: 53 | f.write(font.content) 54 | font_map = TTFont(font_name).getBestCmap() 55 | ''' prepare base ''' 56 | if not mode is None: 57 | char_list = [hex(ii).upper().replace('0X', '&#x') + 58 | ';' for ii in font_map.keys()] 59 | base_unicode = [ 60 | int(mode[ii]) if ii in mode else '.' for ii in char_list] 61 | pickle.dump(base_unicode, codecs.open(base_pkl, 'wb')) 62 | with codecs.open(base_font, 'wb') as f: 63 | f.write(font.content) 64 | return {} 65 | 66 | base_unicode = pickle.load(open(base_pkl, 'rb')) 67 | 68 | base_map = TTFont(base_font).getBestCmap() 69 | font_dict = {jj: base_unicode[ii] 70 | for ii, jj in enumerate(base_map.values())} 71 | num_dict = {hex(ii).upper().replace('0X', '&#x') + ';': str(font_dict[jj]) 72 | for ii, jj in font_map.items()} 73 | return num_dict 74 | 75 | 76 | def load_eastmoney(): 77 | ''' load detail from eastmoney ''' 78 | if not os.path.exists(data_dir): 79 | os.makedirs(data_dir) 80 | req = requests.get(url, headers=header, timeout=30) 81 | origin_str = req.text 82 | 83 | ''' parse json ''' 84 | begin_index = origin_str.index('defjson') 85 | end_index = origin_str.index(']}},\r\n') 86 | json_str = origin_str[begin_index + 9:end_index + 3] 87 | json_str = json_str.replace('data:', '"data":').replace( 88 | 'pages:', '"pages":').replace('font:', '"font":') 89 | json_req = json.loads(json_str) 90 | font_url = json_req['font']['WoffUrl'] 91 | 92 | ''' prepare base ''' 93 | if not os.path.exists(base_pkl) or not os.path.exists(base_font): 94 | print('Prepare base<<<<<<<') 95 | font_map = json_req['font']['FontMapping'] 96 | font_map = {ii['code']: str(ii['value']) for ii in font_map} 97 | analysis_font(font_url, font_map) 98 | 99 | ''' load font ''' 100 | font_map = analysis_font(font_url) 101 | origin_data = json.dumps(json_req['data']) 102 | 103 | ''' load data ''' 104 | for ii, jj in font_map.items(): 105 | origin_data = origin_data.replace(ii, jj) 106 | replace_data = json.loads(origin_data) 107 | need_info = ['scode', 'sname', 'parentnetprofit', 'sjltz', 'totaloperatereve', 'tystz', 'operateexp', 108 | 'saleexp', 'manageexp', 'financeexp', 'totaloperateexp', 'operateprofit', 'sumprofit', 'noticedate'] 109 | data = [ii[jj] for ii in replace_data for jj in need_info] 110 | result_data = [','.join(data[ii * 14:(ii + 1) * 14]) 111 | for ii in range(len(replace_data))] 112 | 113 | ''' store data ''' 114 | now_time = time.strftime("%Y-%m-%d_%H:%M:%S", 
time.localtime(time.time())) 115 | print(now_time, 'eastmoney data load Success!!!') 116 | with codecs.open('%seastmony%s.csv' % (data_dir, now_time), 'w', encoding='utf-8') as f: 117 | f.write('\n'.join(result_data)) 118 | 119 | 120 | if __name__ == '__main__': 121 | if not os.path.exists(data_dir): 122 | os.makedirs(data_dir) 123 | load_eastmoney() 124 | -------------------------------------------------------------------------------- /exam/shaoq.js: -------------------------------------------------------------------------------- 1 | const jsdom = require('jsdom'); 2 | const { 3 | JSDOM 4 | } = jsdom; 5 | 6 | function get_css(html) { 7 | const dom = new JSDOM(html); 8 | window = dom.window; 9 | document = window.document; 10 | window.decodeURIComponent = decodeURIComponent; 11 | 12 | const script_element = document.querySelector('script'); 13 | const script = script_element.innerHTML; 14 | eval(script); 15 | return window.document.querySelector('style').sheet.toString(); 16 | } -------------------------------------------------------------------------------- /exam/shaoq.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author: gunjianpan 3 | @Date: 2019-03-21 17:34:15 4 | @Last Modified by: gunjianpan 5 | @Last Modified time: 2019-04-18 19:33:44 6 | ''' 7 | 8 | import execjs 9 | import requests 10 | import time 11 | import re 12 | import threading 13 | 14 | from bs4 import BeautifulSoup 15 | 16 | """ 17 | * shaoq @http 18 | * shaoq.com:7777 19 | (single spider not use basic_req) 20 | """ 21 | 22 | 23 | class Shaoq(object): 24 | """ 25 | shao q exam 26 | """ 27 | 28 | def __init__(self): 29 | self.test = 0 30 | 31 | def test_req(self): 32 | basic_url = 'http://shaoq.com:7777/' 33 | url = '%sexam' % basic_url 34 | headers = { 35 | 'pragma': 'no-cache', 36 | 'cache-control': 'no-cache', 37 | 'Host': 'shaoq.com:7777', 38 | 'Referer': 'http://shaoq.com:7777/exam', 39 | 'Cookie': '', 40 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 41 | "Accept-Encoding": "", 42 | "Accept-Language": "zh-CN,zh;q=0.9", 43 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36", 44 | } 45 | 46 | '''get cookie''' 47 | first_req = requests.get(url, headers=headers, verify=False) 48 | cookies_map = first_req.cookies.get_dict() 49 | cookies_list = ['%s=%s' % (ii, jj)for ii, jj in cookies_map.items()] 50 | self.cookie = ','.join(cookies_list) 51 | headers['Cookie'] = self.cookie 52 | 53 | ''' load img ''' 54 | html = BeautifulSoup(first_req.text, 'html.parser') 55 | img_list = re.findall(' dict: 92 | return dict(re.findall(r'\.(.+)::before {content: "(.+)";}', css_result)) 93 | 94 | 95 | if __name__ == '__main__': 96 | es = Shaoq() 97 | es.test_req() 98 | -------------------------------------------------------------------------------- /mafengwo/hotel.js: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: gunjianpan 3 | * @Date: 2019-04-18 19:23:33 4 | * @Last Modified by: gunjianpan 5 | * @Last Modified time: 2019-04-22 10:31:47 6 | */ 7 | 8 | const jsdom = require('jsdom'); 9 | const { 10 | JSDOM 11 | } = jsdom; 12 | 13 | function analysis_js(html, salt, prepare_map) { 14 | const dom = new JSDOM(html); 15 | window = dom.window; 16 | document = window.document; 17 | window.decodeURIComponent = decodeURIComponent; 18 | 19 | const script_element = 
document.querySelector('script'); 20 | console.log(script_element); 21 | const script = script_element.innerHTML; 22 | eval(script); 23 | return window['SparkMD5']['hash'](JSON['stringify'](prepare_map) + salt)['slice'](2, 12); 24 | } -------------------------------------------------------------------------------- /mafengwo/mafengwo.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author: gunjianpan 3 | @Date: 2019-04-16 16:50:45 4 | @Last Modified by: gunjianpan 5 | @Last Modified time: 2019-04-20 01:33:26 6 | ''' 7 | import codecs 8 | import execjs 9 | import numpy as np 10 | import os 11 | import re 12 | import time 13 | import threading 14 | 15 | from bs4 import BeautifulSoup 16 | from proxy.getproxy import GetFreeProxy 17 | from util.util import basic_req, echo, time_str, can_retry, begin_time, end_time, shuffle_batch_run_thread 18 | 19 | data_dir = 'mafengwo/data/' 20 | hotel_js_path = 'mafengwo/hotel.js' 21 | decoder_js_path = '{}decoder.js'.format(data_dir) 22 | origin_js_path = '{}origin.js'.format(data_dir) 23 | proxy_req = GetFreeProxy().proxy_req 24 | 25 | 26 | class Mafengwo: 27 | ''' some js confusion applications in mafengwo ''' 28 | 29 | JD_URL = 'http://www.mafengwo.cn/jd/10186/gonglve.html' 30 | AJAX_ROUTER_URL = 'http://www.mafengwo.cn/ajax/router.php' 31 | MDD_URL = 'http://www.mafengwo.cn/mdd/' 32 | 33 | def __init__(self): 34 | self.spot_result = {} 35 | self.spot_pn = {} 36 | self.prepare_js() 37 | 38 | def decode_js_test(self): 39 | ''' decode js for test ''' 40 | with open(decoder_js_path, 'r') as f: 41 | decoder_js = [codecs.unicode_escape_decode( 42 | ii.strip())[0] for ii in f.readlines()] 43 | __Ox2133f = [ii.strip() 44 | for ii in decoder_js[4][17:-2].replace('\"', '\'').split(',')] 45 | decoder_str = '|||'.join(decoder_js) 46 | params = re.findall(r'(\_0x\w{6,8}?)=|,|\)', decoder_str) 47 | params = sorted(list(set([ii for ii in params if len( 48 | ii) > 6])), key=lambda ii: len(ii), reverse=True) 49 | for ii, jj in enumerate(__Ox2133f): 50 | decoder_str = decoder_str.replace('__Ox2133f[{}]'.format(ii), jj) 51 | for ii, jj in enumerate(params): 52 | decoder_str = decoder_str.replace(jj, 'a{}'.format(ii)) 53 | decoder_js = decoder_str.split('|||') 54 | with open(origin_js_path, 'w') as f: 55 | f.write('\n'.join(decoder_js)) 56 | return decoder_js 57 | 58 | def prepare_js(self): 59 | ''' prepare js ''' 60 | pre_text = basic_req(self.JD_URL, 3) 61 | INDEX_JS_URL = re.findall( 62 | r'src=.*index\.js.*" t', pre_text)[0].split('"')[1] 63 | origin_js = basic_req(INDEX_JS_URL, 3) 64 | 65 | ''' decoder js ''' 66 | decode_js = codecs.unicode_escape_decode(origin_js)[0] 67 | 68 | ''' params replace ''' 69 | replace_list_str = decode_js.split(';')[2] 70 | empty_index = replace_list_str.index(' ') + 1 71 | begin_index = replace_list_str.index('=[') + 2 72 | end_index = replace_list_str.index(']') 73 | replace_list = replace_list_str[begin_index:end_index].split(',') 74 | rp = replace_list_str[empty_index:begin_index - 2] 75 | for ii, jj in enumerate(replace_list): 76 | decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj) 77 | self.slat = replace_list[46].replace('"', '') 78 | echo(2, 'salt', self.slat) 79 | 80 | ''' load to local ''' 81 | with open(decoder_js_path, 'w') as f: 82 | f.write(';\n'.join(decode_js.split(';'))) 83 | 84 | ''' del function about ajax ''' 85 | del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js) 86 | del_begin_index = decode_js.index(del_str[0]) 87 | 88 | result_js = 
decode_js[:del_begin_index] + \ 89 | decode_js[del_begin_index + len(del_str[0]):] 90 | 91 | result_js = decode_js[:del_begin_index] + \ 92 | decode_js[del_begin_index + len(del_str[0]):] 93 | self.result_js = result_js 94 | self.js_compile = execjs.compile(open(hotel_js_path).read()) 95 | echo(1, 'Load hotel index js success!!!') 96 | 97 | def js_compile_sn(self, prepare_map): 98 | ''' js compile sn ''' 99 | wait_js = '' 100 | sn = self.js_compile.call( 101 | 'analysis_js', wait_js, self.slat, prepare_map) 102 | echo(2, '_sn', sn) 103 | return sn 104 | 105 | def load_sn(self, data: dict, now_time=0) -> dict: 106 | ''' load sn ''' 107 | 108 | if not now_time: 109 | now_time = int(time.time() * 1000) 110 | prepare_map = {**data, '_ts': now_time} 111 | 112 | ''' _0xe7fex37 sorted & str num ''' 113 | prepare_map = {ii: str(prepare_map[ii]) for ii in sorted(prepare_map)} 114 | 115 | ''' js compile sn ''' 116 | sn = self.js_compile_sn(prepare_map) 117 | 118 | data = { 119 | **data, 120 | '_sn': sn, 121 | '_ts': now_time 122 | } 123 | return data 124 | 125 | def load_spot_once(self, pn=1, city_id=10186): 126 | ''' load spot once ''' 127 | data = { 128 | 'sAct': 'KMdd_StructWebAjax|GetPoisByTag', 129 | 'iMddid': city_id, 130 | 'iTagId': 0, 131 | 'iPage': pn, 132 | } 133 | data = self.load_sn(data) 134 | print(data) 135 | req = proxy_req(self.AJAX_ROUTER_URL, 11, data=data) 136 | if req is None or not 'data' in req or not 'list' in req['data']: 137 | if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)): 138 | self.load_spot_once(pn, city_id) 139 | return 140 | spot_list = req['data']['list'] 141 | spot_pn = req['data']['page'] 142 | spot_tmp = re.findall('
.*?(.*?)
', spot_list) 143 | try: 144 | total_pn = int(re.findall('共(.*?)', spot_pn)[0]) 145 | except Exception as e: 146 | total_pn = 1 147 | echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e) 148 | 149 | if city_id not in self.spot_result: 150 | self.spot_result[city_id] = spot_tmp 151 | else: 152 | self.spot_result[city_id] += spot_tmp 153 | self.spot_pn[city_id] = total_pn 154 | 155 | def load_spot(self, batch_size=50): 156 | ''' load spot ''' 157 | version = begin_time() 158 | self.load_city_list() 159 | # self.city_list = [10186] 160 | city_threading = [threading.Thread( 161 | target=self.load_spot_once, args=(1, ii,))for ii in self.city_list] 162 | shuffle_batch_run_thread(city_threading, 150) 163 | 164 | spot_continue = [] 165 | for ii, jj in self.spot_pn.items(): 166 | spot_continue += [threading.Thread( 167 | target=self.load_spot_once, args=(pn, ii,)) for pn in range(2, jj + 1)] 168 | 169 | shuffle_batch_run_thread(spot_continue, 150) 170 | output = ['{},{}'.format(self.id2map[ii], ','.join(jj)) 171 | for ii, jj in self.spot_result.items()] 172 | output_path = '{}spot.txt'.format(data_dir) 173 | with open(output_path, 'w') as f: 174 | f.write('\n'.join(output)) 175 | city_num = len(self.city_list) 176 | spot_num = sum([len(ii) for ii in self.spot_result.values()]) 177 | echo(1, 'City num: {}\nSpot num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format( 178 | city_num, spot_num, output_path, end_time(version, 0))) 179 | 180 | def load_city_list(self): 181 | ''' load city list ''' 182 | text = basic_req(self.MDD_URL, 3) 183 | city_list = re.findall( 184 | '/travel-scenic-spot/mafengwo/(.*?).html" target="_blank">(.*?)(| 1 else seg.cut( 239 | index[3])[0] + seg.cut(index[3])[1]: int(index[1]) for index in city if index[3][-1:] == '州'} 240 | seg = pkuseg.pkuseg(model_name='../Model_retrieval/pkuseg') 241 | city_state1 = {seg.cut(index)[0] if len(seg.cut(index)[0]) > 1 else seg.cut( 242 | index)[0] + seg.cut(index)[1]: city_state[index] for index in city_state} 243 | city_area = {index[3][:-2]: int(index[1]) 244 | for index in city if '地区' in index[3]} 245 | city_other = {index[3][:-1]: int(index[1]) 246 | for index in city if index[3][-1:] == '市' or index[3][-1:] == '盟'} 247 | self.city_province = {**city_state1, **city_area, **city_other} 248 | self.city_province = { 249 | index: self.province_map[self.city_province[index]] for index in self.city_province} 250 | county = self.Db.select_db( 251 | 'select * from china_regions where level=3') 252 | county_area_pre = {index for index in county if index[3][-1] == '区'} 253 | county_area_two = {index[3][:-2]: int(index[1][:2]) for index in county_area_pre if len( 254 | index[3]) > 3 and (index[3][-2] == '矿' or index[3][-2] == '林')} 255 | # print('芒' in county_area_two, 'two') 256 | county_area_state = {seg.cut(index[3][:-2])[0]: int(index[1][:2]) 257 | for index in county_area_pre if len(index[3]) > 2 and index[3][-2] == '族'} 258 | # print('芒' in county_area_state, 'state') 259 | county_area_other = {index[3][:-1]: int(index[1][:2]) for index in county_area_pre if len( 260 | index[3]) > 2 and index[3][-2] != '族' and index[3][-2] != '林' and index[3][-2] != '矿'} 261 | # print('芒' in county_area_other, 'other') 262 | county_county_pre = {index for index in county if index[3][-1] == '县'} 263 | county_county_two = {index[3]: int( 264 | index[1][:2]) for index in county_county_pre if len(index[3]) == 2} 265 | # print('芒' in county_county_two, 'two') 266 | seg = pkuseg.pkuseg() 267 | county_county_state = {seg.cut(index[3])[0] if 
len(seg.cut(index[3])[0]) > 1 else seg.cut(index[3])[0] + seg.cut( 268 | index[3])[1]: int(index[1][:2]) for index in county_county_pre if len(index[3]) > 2 and index[3][-3:-1] == '自治'} 269 | county_county_state = { 270 | index[:-2] if '族' in index and len(index) > 3 else index: county_county_state[index] for index in county_county_state} 271 | # print('芒' in county_county_state, 'state') 272 | county_county_other = { 273 | index[3][:-1]: int(index[1][:2]) for index in county_county_pre if index[3][-3:-1] != '自治' and len(index[3]) > 2} 274 | # print('芒' in county_county_other, 'other') 275 | county_city = {index[3][:-1] if len(index[3]) > 2 else index[3]: int(index[1][:2]) 276 | for index in county if index[3][-1] == '市'} 277 | # print('芒' in county_city, 'city') 278 | county_domain = {index[3][:4]: int( 279 | index[1][:2]) for index in county if index[3][-1] == '域'} 280 | # print('芒' in county_domain, 'domain') 281 | county_other = {index[3]: int( 282 | index[1][:2]) for index in county if index[3][-1] == '盟' or index[3][-1] == '岛'} 283 | # print('芒' in county_other, 'other') 284 | county_province = {**county_area_two, **county_area_state, **county_area_other, **county_county_two, 285 | **county_county_state, **county_county_other, **county_city, **county_domain, **county_other} 286 | county_province = { 287 | index: self.province_map[county_province[index]] for index in county_province} 288 | self.city_province = {**self.city_province, **county_province} 289 | print({index for index in self.city_province if len(index) == 1}) 290 | 291 | def test_province(self, maps, words): 292 | word_city = {} 293 | for index in maps: 294 | temp_num = words.count(index) 295 | province = maps[index] 296 | if temp_num: 297 | if province in word_city: 298 | word_city[province] += temp_num 299 | else: 300 | word_city[province] = temp_num 301 | print(sum(word_city.values())) 302 | return word_city 303 | 304 | 305 | class Get_baidu(): 306 | """ 307 | get info from baidu 308 | """ 309 | 310 | def __init__(self): 311 | self.failuredmap = {} 312 | self.total_map = {} 313 | self.text_map = {} 314 | self.word = {} 315 | self.find_location = find_location() 316 | 317 | def get_summarization(self): 318 | """ 319 | get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1 320 | """ 321 | 322 | version = begin_time() 323 | threadings = [] 324 | for index in range(75): 325 | work = threading.Thread( 326 | target=self.summarization_once, args=(index,)) 327 | threadings.append(work) 328 | 329 | for work in threadings: 330 | # time.sleep(.5) 331 | work.start() 332 | for work in threadings: 333 | work.join() 334 | # self.text_map = self.total_map[0] 335 | 336 | # for index in list(range(1, len(self.total_map))): 337 | # for ids in self.total_map[index]: 338 | # if ids in self.text_map: 339 | # self.text_map[ids] += self.total_map[index][ids] 340 | # else: 341 | # self.text_map[ids] = self.total_map[index][ids] 342 | # print(sum(self.text_map)) 343 | word = [self.word[k] for k in sorted(self.word.keys())] 344 | with codecs.open('test', 'w', encoding='utf-8') as f: 345 | f.write("\n".join(word)) 346 | end_time(version) 347 | 348 | def summarization_once(self, index): 349 | """ 350 | get html from news 351 | """ 352 | print(index) 353 | texts = [] 354 | url = 
'https://www.baidu.com/s?ie=utf-8&mod=1&isbd=1&isid=919fab3c0002c9f1&wd=%E5%81%B7%E7%8B%97&oq=%E5%81%B7%E7%8B%97&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=919fab3c0002c9f1&rsv_t=7e30ggF%2BMa91oOURk1bMtN8af5unSwOR08TodNBB%2F%2B6B6RBEwUi8l8IAe28ACA%2B8b5I5&gpc=stf%3D1517038564%2C1548574564%7Cstftype%3D1&tfflag=1&bs=%E5%81%B7%E7%8B%97&rsv_sid=undefined&_ss=1&clist=28bc21fb856a58b7%09350102124f079888%0928bc21fb856a58b7%0928bc2159845c1cf3%0928bc2015823fa56b%0928a121fb84a7d1a6&hsug=&f4s=1&csor=2&_cr1=34767&pn=' + \ 355 | str(index * 10) 356 | news_lists = proxy_req(url, 0) 357 | if not news_lists: 358 | if can_retry(url): 359 | self.summarization_once(index) 360 | return 361 | test = news_lists.find_all( 362 | 'div', class_=['c-row c-gap-top-small', 'c-span18 c-span-last']) 363 | word = self.cleantxt(news_lists.text) 364 | if not len(word): 365 | if can_retry(url): 366 | self.summarization_once(index) 367 | return 368 | temp_map = self.find_location.test_province( 369 | self.find_location.city_province, word) 370 | self.total_map[int(index)] = temp_map 371 | self.word[index] = word 372 | 373 | def cleantxt(self, raw): 374 | fil = re.compile(u'[^\u4e00-\u9fa5]+', re.UNICODE) 375 | return fil.sub(' ', raw) 376 | 377 | 378 | class Get_baidu_bjh(): 379 | """ 380 | get info from baidu bjh 381 | """ 382 | 383 | def __init__(self): 384 | self.failuredmap = {} 385 | self.fail = [] 386 | self.href_map = {} 387 | self.text_map = {} 388 | self.word = {} 389 | self.word_list = {} 390 | 391 | def get_href(self): 392 | """ 393 | get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1 394 | """ 395 | 396 | version = begin_time() 397 | threadings = [] 398 | for index in range(71): 399 | work = threading.Thread( 400 | target=self.href_once, args=(index,)) 401 | threadings.append(work) 402 | 403 | for work in threadings: 404 | # time.sleep(.5) 405 | work.start() 406 | for work in threadings: 407 | work.join() 408 | href_map = [self.href_map[k] for k in sorted(self.href_map.keys())] 409 | self.href_map = sum(href_map, []) 410 | with codecs.open('bjh_href_poison.txt', 'w', encoding='utf-8') as f: 411 | f.write("\n".join(self.href_map)) 412 | end_time(version) 413 | 414 | def href_once(self, index): 415 | """ 416 | get html from news 417 | """ 418 | print(index) 419 | texts = [] 420 | url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=毒狗肉&pn=' + \ 421 | str(index * 10) 422 | news_lists = proxy_req(url, 0) 423 | if not news_lists: 424 | if can_retry(url): 425 | self.href_once(index) 426 | return 427 | test = news_lists.find_all('div', class_='result') 428 | if not len(test): 429 | if can_retry(url): 430 | self.href_once(index) 431 | return 432 | href_list = [index.a['href'] for index in test] 433 | self.href_map[int(index)] = href_list 434 | 435 | def cleantxt(self, raw): 436 | fil = re.compile(u'[^\u4e00-\u9fa5]+', re.UNICODE) 437 | return fil.sub(' ', raw) 438 | 439 | def get_detail(self): 440 | """ 441 | get summarization from http://news.baidu.com/ns?word=%E6%AF%92%E7%8B%97%E8%82%89&tn=news&from=news&cl=2&rn=20&ct=1 442 | """ 443 | 444 | version = begin_time() 445 | threadings = [] 446 | with codecs.open('bjh_href_poison.txt', 'r', encoding='utf-8') as f: 447 | href_list = f.readlines() 448 | for index, url in enumerate(href_list): 449 | work = threading.Thread( 450 | target=self.detail_once, args=(index, url,)) 451 | threadings.append(work) 452 | 453 | for work in threadings: 454 | # time.sleep(.5) 455 | work.start() 456 | for work in 
threadings: 457 | work.join() 458 | word_list = [self.word_list[k] for k in sorted(self.word_list.keys())] 459 | with codecs.open('bjh_detail_poison', 'w', encoding='utf-8') as f: 460 | f.write("\n".join(word_list)) 461 | self.failuredmap = {} 462 | with codecs.open('bjh.log', 'w', encoding='utf-8') as f: 463 | f.write('\n'.join(self.fail)) 464 | self.fail = [] 465 | end_time(version) 466 | 467 | def detail_once(self, index, url): 468 | """ 469 | get html from news 470 | """ 471 | # print(index) 472 | news_lists = proxy_req(url, 0) 473 | if not news_lists: 474 | if can_retry(url): 475 | self.detail_once(index, url) 476 | return 477 | test = news_lists.find_all( 478 | 'div', class_=['article-content', 'mth-editor-content', 'con-news-art', 'Custom_UnionStyle']) 479 | if not len(test): 480 | test = self.cleantxt(news_lists.text) 481 | if not len(test): 482 | if can_retry(url): 483 | self.detail_once(index, url) 484 | return 485 | self.word_list[index] = test 486 | return 487 | word_list = ''.join([index.text for index in test] 488 | ).replace('\u3000', '').replace('\n', '') 489 | self.word_list[int(index)] = word_list 490 | -------------------------------------------------------------------------------- /press/press.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2018-11-10 11:17:16 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2019-03-25 21:18:30 6 | import threading 7 | import time 8 | 9 | from proxy.getproxy import GetFreeProxy 10 | from util.db import Db 11 | from util.util import begin_time, end_time, basic_req 12 | 13 | proxy_req = GetFreeProxy().proxy_req 14 | 15 | 16 | class Press_test(): 17 | """ 18 | give press in short time 19 | """ 20 | 21 | def basic_press(self, url, times, types): 22 | """ 23 | press have no data input 24 | """ 25 | url = url + str(int(round(time.time() * 1000))) 26 | if types == 1: 27 | html = proxy_req(url, 1) 28 | else: 29 | html = basic_req(url, 1) 30 | 31 | if html == False and times < 5: 32 | self.basic_press(url, times + 1, types) 33 | 34 | def press_threading(self, url, qps, types): 35 | """ 36 | press url at constant qps 37 | """ 38 | version = begin_time() 39 | threadings = [] 40 | for index in range(qps): 41 | work = threading.Thread( 42 | target=self.basic_press, args=(url, 0, types)) 43 | threadings.append(work) 44 | for work in threadings: 45 | work.start() 46 | for work in threadings: 47 | work.join() 48 | end_time(version) 49 | 50 | def one_press_attack(self, url, qps, types, total): 51 | """ 52 | press url from a long time 53 | """ 54 | for index in range(total): 55 | self.press_threading(url, qps, types) 56 | print('Over') 57 | -------------------------------------------------------------------------------- /proxy/ip66.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2019-05-07 00:20:48 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2019-05-07 22:34:22 6 | 7 | import js2py 8 | import re 9 | 10 | from util.util import basic_req, echo 11 | 12 | """ 13 | * 66ip @http 14 | js decoder 15 | """ 16 | 17 | IP66_URL = 'http://www.66ip.cn/' 18 | PRE_URL = '{}favicon.ico'.format(IP66_URL) 19 | 20 | header = { 21 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 22 | 'Host': 'www.66ip.cn', 23 | 'Referer': 'http://www.66ip.cn/', 24 | 
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3785.0 Safari/537.36' 25 | } 26 | 27 | 28 | def generate_cookie(): 29 | ''' eval 66ip.cn test in 19.5.7 ''' 30 | req = basic_req(IP66_URL, 2, header=header) 31 | basic_cookie = req.cookies.get_dict() 32 | 33 | ''' !important \b in py -> \x80 ''' 34 | req_text = r'{}'.format(req.text) 35 | 36 | ''' get the script will be eval ''' 37 | script_text = re.findall('', req_text)[0] 38 | script_text = script_text.replace( 39 | '{eval(', '{aaa=').replace(');break', ';break') 40 | script_eval = r'{}'.format(js2py.eval_js(script_text + 'aaa')) 41 | echo(0, script_eval) 42 | 43 | try: 44 | ''' replace document & window ''' 45 | params = re.findall( 46 | r'(__jsl_clearance=.*?)\'\+\(function\(\){(.*?join\(\'\'\))}\)\(\)', script_eval) 47 | wait_eval = params[0][1].replace( 48 | "document.createElement('div')", "{}").replace("", '') 49 | wait_replace = re.findall( 50 | r'=(.{1,5}\.firstChild\.href;)', wait_eval)[0] 51 | wait_eval = wait_eval.replace(wait_replace, '"http://www.66ip.cn/";') 52 | 53 | ''' eval & encoder cookie ''' 54 | other_param = js2py.eval_js( 55 | 'function ddd() {window={};' + wait_eval + '}ddd()') 56 | cookie = '{}; {}{}'.format(encoder_cookie( 57 | basic_cookie), params[0][0], other_param) 58 | echo(1, 'cookie', cookie) 59 | 60 | return cookie 61 | except: 62 | generate_cookie() 63 | 64 | 65 | def encoder_cookie(cookie_dict: {}) -> str: 66 | return '; '.join(['{}={}'.format(ii, jj)for ii, jj in cookie_dict.items()]) 67 | 68 | 69 | def req_ip66(): 70 | ''' 66ip.cn js decoder ''' 71 | header['Cookie'] = generate_cookie() 72 | 73 | req_text = basic_req(IP66_URL, 3, header=header) 74 | echo(2, req_text) 75 | return req_text 76 | 77 | 78 | if __name__ == "__main__": 79 | req_ip66() 80 | -------------------------------------------------------------------------------- /proxy/table.sql: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: gunjianpan 3 | * @Date: 2018-10-19 15:01:18 4 | * @Last Modified by: gunjianpan 5 | * @Last Modified time: 2019-01-27 23:39:47 6 | */ 7 | use netease; 8 | CREATE TABLE if not exists `ip_proxy` ( 9 | `id` int(10) unsigned NOT NULL AUTO_INCREMENT COMMENT 'auto-increment primary keys', 10 | `address` varchar(50) NOT NULL DEFAULT '0' COMMENT 'proxy address', 11 | `http_type` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'http type, 1: https, 0: http', 12 | `is_failured` int(10) unsigned NOT NULL DEFAULT 0 COMMENT 'failure time', 13 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'create time', 14 | `updated_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT 'update time', 15 | PRIMARY KEY (`id`) 16 | ) ENGINE=InnoDB AUTO_INCREMENT=1 DEFAULT CHARSET=utf8mb4 comment='table for ip proxy'; 17 | -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | pymysql 4 | requests 5 | bs4 6 | apscheduler 7 | pandas 8 | asyncio 9 | aiohttp 10 | apscheduler 11 | PyExecJS 12 | fonttools 13 | regex 14 | rsa 15 | opencv-python -------------------------------------------------------------------------------- /util/db.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2018-10-24 13:32:39 4 | # @Last Modified by: gunjianpan 5 
| # @Last Modified time: 2020-06-06 13:11:46 6 | 7 | import os 8 | import shutil 9 | import sys 10 | import threading 11 | from configparser import ConfigParser 12 | 13 | import pymysql 14 | import time 15 | 16 | sys.path.append(os.getcwd()) 17 | from util.util import echo, read_file 18 | 19 | configure_path = "util/util.ini" 20 | 21 | 22 | class Db(object): 23 | """ db operation, without sql injection """ 24 | 25 | def __init__(self, database: str, return_type: str = "list"): 26 | self.load_configure() 27 | self.database = database 28 | self.return_type = return_type 29 | self.lock = threading.Lock() 30 | self.reconnect() 31 | 32 | def load_configure(self): 33 | """ load configure """ 34 | if not os.path.exists(configure_path): 35 | shutil.copy(configure_path + ".tmp", configure_path) 36 | cfg = ConfigParser() 37 | cfg.read(configure_path, "utf-8") 38 | self.mysql_host = cfg.get("mysql", "hostname") 39 | self.mysql_user = cfg.get("mysql", "username") 40 | self.mysql_pw = cfg.get("mysql", "passwd") 41 | self.mysql_char = cfg.get("mysql", "charset") 42 | 43 | def connect_db(self, database: str, return_type: str): 44 | """ connect database """ 45 | cursorclass = ( 46 | pymysql.cursors.DictCursor 47 | if return_type == "dict" 48 | else pymysql.cursors.Cursor 49 | ) 50 | try: 51 | self.db = pymysql.connect( 52 | host=self.mysql_host, 53 | user=self.mysql_user, 54 | password=self.mysql_pw, 55 | db=database, 56 | charset=self.mysql_char, 57 | cursorclass=cursorclass, 58 | ) 59 | self.cursor = self.db.cursor() 60 | except pymysql.OperationalError: 61 | echo(0, "Please change mysql info in util/db.ini!!!") 62 | self.db = False 63 | self.cursor = None 64 | except pymysql.InternalError: 65 | echo(2, "Try to create database in mysql.........") 66 | if self.create_db(database): 67 | self.connect_db(database, return_type) 68 | else: 69 | self.db = False 70 | self.cursor = None 71 | except: 72 | echo(0, "Other db error!!!") 73 | self.db = False 74 | self.cursor = None 75 | 76 | def reconnect(self): 77 | self.connect_db(self.database, self.return_type) 78 | 79 | def _reConn(self, num: int = 28800, stime: int = 3): 80 | _number = 0 81 | _status = True 82 | while _status and _number <= num: 83 | try: 84 | self.conn.ping() 85 | _status = False 86 | except: 87 | self.reconnect() 88 | if self.db != False: 89 | _status = False 90 | break 91 | _number += 1 92 | time.sleep(stime) 93 | 94 | def create_db(self, database: str): 95 | """ crete database """ 96 | db = pymysql.connect( 97 | host=self.mysql_host, 98 | user=self.mysql_user, 99 | password=self.mysql_pw, 100 | charset=self.mysql_char, 101 | ) 102 | database_sql = "CREATE DATABASE if not exists {}".format(database) 103 | try: 104 | cursor = db.cursor() 105 | cursor.execute(database_sql) 106 | echo(2, "Create Database {} Success!!!".format(database)) 107 | return True 108 | except: 109 | echo(0, "Create Database {} error".format(database)) 110 | return False 111 | 112 | def create_table(self, sql_path: str): 113 | if not os.path.exists(sql_path): 114 | echo(0, "Create Table {} error, file not found".format(sql_path)) 115 | return False 116 | create_table_sql = "\n".join(read_file(sql_path)) 117 | try: 118 | cursor = self.db.cursor() 119 | cursor.execute(create_table_sql) 120 | echo(2, "Create Table from {} Success!!!".format(sql_path)) 121 | return True 122 | except Exception as e: 123 | echo(0, "Create Table from {} error".format(sql_path), e) 124 | return False 125 | 126 | def select_db(self, sql: str): 127 | """ select sql @return False: Expection; list: 
Success """ 128 | try: 129 | self._reConn() 130 | with self.db.cursor() as cursor: 131 | cursor.execute(sql) 132 | result = cursor.fetchall() 133 | self.db.commit() 134 | return result 135 | except Exception as e: 136 | echo(0, "execute sql {} error".format(sql), e) 137 | return False 138 | 139 | def select_one(self, sql: str): 140 | """ select one @return False: Expection; list: Success """ 141 | try: 142 | self._reConn() 143 | with self.db.cursor() as cursor: 144 | cursor.execute(sql) 145 | result = cursor.fetchone() 146 | self.db.commit() 147 | return result 148 | except Exception as e: 149 | echo(0, "execute sql {} error".format(sql), e) 150 | return False 151 | 152 | def insert_db(self, sql: str): 153 | """ insert sql @return False: Expection; True: Success """ 154 | self.lock.acquire() 155 | try: 156 | self._reConn() 157 | with self.db.cursor() as cursor: 158 | cursor.execute(sql) 159 | self.db.commit() 160 | self.lock.release() 161 | return True 162 | except Exception as e: 163 | self.lock.release() 164 | echo(0, "execute sql {} error".format(sql), e) 165 | self.db.rollback() 166 | return False 167 | 168 | def update_db(self, sql: str): 169 | """ update sql @return False: Expection; True: Success """ 170 | self.lock.acquire() 171 | try: 172 | self._reConn() 173 | with self.db.cursor() as cursor: 174 | cursor.execute(sql) 175 | self.db.commit() 176 | self.lock.release() 177 | return True 178 | except Exception as e: 179 | self.lock.release() 180 | echo(0, "execute sql {} error".format(sql), e) 181 | self.db.rollback() 182 | return False 183 | -------------------------------------------------------------------------------- /util/util.ini.tmp: -------------------------------------------------------------------------------- 1 | [mysql] 2 | hostname = localhost 3 | username = root 4 | passwd = 5 | charset = utf8mb4 6 | [email] 7 | rec_lists = 8 | send_lists = 9 | [ServerChan] 10 | SCKEY = 11 | -------------------------------------------------------------------------------- /util/util.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Author: gunjianpan 3 | # @Date: 2018-10-19 15:33:46 4 | # @Last Modified by: gunjianpan 5 | # @Last Modified time: 2020-06-06 14:10:31 6 | 7 | from __future__ import ( 8 | absolute_import, 9 | division, 10 | print_function, 11 | unicode_literals, 12 | with_statement, 13 | ) 14 | 15 | import codecs 16 | import datetime 17 | import json 18 | import logging 19 | import os 20 | import pickle 21 | import platform 22 | import random 23 | import re 24 | import shutil 25 | import smtplib 26 | import threading 27 | import time 28 | import urllib 29 | from configparser import ConfigParser 30 | from email.mime.text import MIMEText 31 | 32 | import numpy as np 33 | import requests 34 | import urllib3 35 | from bs4 import BeautifulSoup 36 | 37 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 38 | 39 | 40 | def basic_req( 41 | url: str, 42 | types: int, 43 | proxies=None, 44 | data=None, 45 | header=None, 46 | need_cookie: bool = False, 47 | config: dict = {}, 48 | ): 49 | """ 50 | requests 51 | @types XY: X=0.->get; =1.->post; 52 | Y=0.->html; =1.->json; =2.->basic; =3.->text; 53 | """ 54 | header = req_set(url, header) 55 | if "http" not in url: 56 | echo( 57 | "0|warning", 58 | "You should assign the type of [http]/[https] before the url str!!! 
The default is [http].", 59 | ) 60 | if types not in [0, 1, 2, 3, 11, 12, 13]: 61 | echo("0|warning", types, " type is not supported!!!") 62 | return 63 | 64 | if types < 10: 65 | req_func = requests.get 66 | else: 67 | req_func = requests.post 68 | mode = types % 10 69 | if mode == 0: 70 | timeout = html_timeout 71 | else: 72 | timeout = json_timeout 73 | return get_basic( 74 | req_func, url, proxies, data, header, need_cookie, config, mode, timeout 75 | ) 76 | 77 | 78 | def req_set(url: str, header): 79 | """ req headers set """ 80 | global headers 81 | headers["Host"] = url.split("/")[2] 82 | index = random.randint(0, agent_len) 83 | headers["User-Agent"] = agent_lists[index] 84 | if not header is None and "User-Agent" not in header: 85 | header["User-Agent"] = agent_lists[index] 86 | return header 87 | 88 | 89 | def get_basic( 90 | req_func, 91 | url: str, 92 | proxies, 93 | data, 94 | header, 95 | need_cookie: bool, 96 | config: dict, 97 | mode: int = 0, 98 | timeouts: int = 5, 99 | ): 100 | """ basic get requests""" 101 | if header is None: 102 | header = headers 103 | allow_redirects = config.get("allow_redirects", True) 104 | timeout = config.get("timeout", timeouts) 105 | return_proxy = config.get("return_proxy", False) 106 | try: 107 | req = req_func( 108 | url, 109 | headers=header, 110 | verify=False, 111 | timeout=timeout, 112 | proxies=proxies, 113 | data=data, 114 | allow_redirects=allow_redirects, 115 | ) 116 | if mode == 2: 117 | if return_proxy: 118 | return req, proxies 119 | return req 120 | elif mode == 0: 121 | if req.apparent_encoding == "utf-8" or "gbk" in req.apparent_encoding: 122 | req.encoding = req.apparent_encoding 123 | result = BeautifulSoup(req.text, "html.parser") 124 | elif mode == 1: 125 | result = req.json() 126 | elif mode == 3: 127 | result = req.text 128 | if need_cookie: 129 | return result, req.cookies.get_dict() 130 | if return_proxy: 131 | return result, proxies 132 | return result 133 | except: 134 | if mode == 3: 135 | result = "" 136 | elif mode == 0: 137 | result = BeautifulSoup("", "html.parser") 138 | else: 139 | result = None 140 | if need_cookie: 141 | return result, {} 142 | return result 143 | 144 | 145 | def changeCookie(cookie: str): 146 | """ change cookie """ 147 | global headers 148 | headers["Cookie"] = cookie 149 | 150 | 151 | def changeHeaders(header: dict): 152 | """ change Headers """ 153 | global headers 154 | headers = {**headers, **header} 155 | 156 | 157 | def changeHtmlTimeout(timeout: int): 158 | """ change html timeout """ 159 | global html_timeout 160 | html_timeout = timeout 161 | 162 | 163 | def changeJsonTimeout(timeout: int): 164 | """ change json timeout """ 165 | global json_timeout 166 | json_timeout = timeout 167 | 168 | 169 | def begin_time() -> int: 170 | """ multi-version time manage """ 171 | global start 172 | start.append(time_stamp()) 173 | return len(start) - 1 174 | 175 | 176 | def end_time_aver(version: int): 177 | time_spend = time_stamp() - start[version] 178 | spend_list.append(time_spend) 179 | echo( 180 | "2|info", 181 | "Last spend: {:.3f}s, Average spend: {:.3f}s.".format( 182 | time_spend, sum(spend_list) / len(spend_list) 183 | ), 184 | ) 185 | 186 | 187 | def end_time(version: int, mode: int = 1): 188 | time_spend = time_stamp() - start[version] 189 | if not mode: 190 | return time_spend 191 | time_spend = get_time_str(time_spend) 192 | if mode == 2: 193 | echo("2|info", time_spend) 194 | return time_spend 195 | 196 | 197 | def empty(): 198 | global spend_list 199 | spend_list = [] 200 | 201 
| 202 | def can_retry(url: str, time: int = 3) -> bool: 203 | """ judge can retry once """ 204 | global failure_map 205 | if url not in failure_map: 206 | failure_map[url] = 0 207 | return True 208 | elif failure_map[url] < time: 209 | failure_map[url] += 1 210 | return True 211 | else: 212 | failure_map[url] = 0 213 | return False 214 | 215 | 216 | def send_server_chan(context: str, subject: str): 217 | if SCKEY == "": 218 | return 219 | url = BASIC_SCURL % SCKEY 220 | data = {"text": subject, "desp": context} 221 | req = basic_req(url, 11, data=data) 222 | if req and req.get("errmsg", "error") == "success": 223 | echo("2|warning", "Send sever chan success!!") 224 | 225 | 226 | def send_email(context: str, subject: str, add_rec=None, assign_rec=None) -> bool: 227 | """ send email """ 228 | load_configure() 229 | send_server_chan(context, subject) 230 | email_rec = [ii for ii, jj in rec_lists if jj == "0"] 231 | email_cc = [ii for ii, jj in rec_lists if jj == "1"] 232 | if assign_rec is not None: 233 | email_rec = assign_rec 234 | send_email_once(email_rec, email_cc, context, subject) 235 | if not add_rec is None: 236 | send_email_once(add_rec, [], context, subject) 237 | 238 | 239 | def send_email_once(email_rec: list, email_cc: list, context: str, subject: str): 240 | send_index = random.randint(0, len(send_lists) - 1) 241 | mail_host = "smtp.163.com" 242 | mail_user, mail_pass = send_lists[send_index] 243 | sender = "{}@163.com".format(mail_user) 244 | 245 | sign = EMAIL_SIGN % time_str(time_format="%B %d") 246 | message = MIMEText("{}{}".format(context, sign), "plain", "utf-8") 247 | message["Subject"] = subject 248 | message["From"] = sender 249 | message["To"] = ", ".join(email_rec) 250 | message["Cc"] = ", ".join(email_cc) 251 | 252 | try: 253 | smtpObj = smtplib.SMTP_SSL(mail_host) 254 | smtpObj.connect(mail_host, 465) 255 | smtpObj.login(mail_user, mail_pass) 256 | smtpObj.sendmail(sender, email_rec + email_cc, message.as_string()) 257 | smtpObj.quit() 258 | echo("1|warning", "Send email success!!") 259 | return True 260 | except smtplib.SMTPException as e: 261 | echo("0|warning", "Send email error", e) 262 | return False 263 | 264 | 265 | def dump_bigger(data, output_file: str): 266 | """ pickle.dump big file which size more than 4GB """ 267 | max_bytes = 2 ** 31 - 1 268 | bytes_out = pickle.dumps(data, protocol=4) 269 | with open(output_file, "wb") as f_out: 270 | for idx in range(0, len(bytes_out), max_bytes): 271 | f_out.write(bytes_out[idx : idx + max_bytes]) 272 | 273 | 274 | def load_bigger(input_file: str): 275 | """ pickle.load big file which size more than 4GB """ 276 | max_bytes = 2 ** 31 - 1 277 | bytes_in = bytearray(0) 278 | input_size = os.path.getsize(input_file) 279 | with open(input_file, "rb") as f_in: 280 | for _ in range(0, input_size, max_bytes): 281 | bytes_in += f_in.read(max_bytes) 282 | return pickle.loads(bytes_in) 283 | 284 | 285 | def time_str(time_s: int = -1, time_format: str = "%Y-%m-%d %H:%M:%S"): 286 | """ time stamp -> time str """ 287 | if time_s > 0: 288 | return time.strftime(time_format, time.localtime(time_s)) 289 | return time.strftime(time_format, time.localtime(time_stamp())) 290 | 291 | 292 | def time_stamp(time_str: str = "", time_format: str = "%Y-%m-%d %H:%M:%S") -> float: 293 | """ time str -> time stamp """ 294 | if not len(time_str): 295 | return time.time() 296 | return time.mktime(time.strptime(time_str, time_format)) 297 | 298 | 299 | def echo(types, *args): 300 | """ 301 | echo log -> stdout / log file 302 | @param: color: 0 -> 
red, 1 -> green, 2 -> yellow, 3 -> blue, 4 -> gray 303 | @param: log_type: info, warning, debug, error 304 | @param: is_service: bool 305 | """ 306 | args = " ".join([str(ii) for ii in args]) 307 | types = str(types) 308 | re_num = re.findall("\d", types) 309 | re_word = re.findall("[a-zA-Z]+", types) 310 | color = int(re_num[0]) if len(re_num) else 4 311 | log_type = re_word[0] if len(re_word) else "info" 312 | 313 | if is_service: 314 | log(log_type, args) 315 | return 316 | colors = { 317 | "red": "\033[91m", 318 | "green": "\033[92m", 319 | "yellow": "\033[93m", 320 | "blue": "\033[94m", 321 | "gray": "\033[90m", 322 | } 323 | if not color in list(range(len(colors.keys()))): 324 | color = 4 325 | if platform.system() == "Windows": 326 | print(args) 327 | else: 328 | print(list(colors.values())[color], args, "\033[0m") 329 | 330 | 331 | def shuffle_batch_run_thread( 332 | threading_list: list, batch_size: int = 24, is_await: bool = False 333 | ): 334 | """ shuffle batch run thread """ 335 | thread_num = len(threading_list) 336 | np.random.shuffle(threading_list) # shuffle thread 337 | total_block = thread_num // batch_size + 1 338 | for block in range(total_block): 339 | for ii in threading_list[ 340 | block * batch_size : min(thread_num, batch_size * (block + 1)) 341 | ]: 342 | if threading.active_count() > batch_size: 343 | time.sleep(random.randint(2, 4) * (random.random() + 1)) 344 | ii.start() 345 | 346 | if not is_await or block % 10 == 1: 347 | for ii in threading_list[ 348 | block * batch_size : min(thread_num, batch_size * (block + 1)) 349 | ]: 350 | ii.join() 351 | else: 352 | time.sleep(min(max(5, batch_size * 2 / 210), 10)) 353 | echo( 354 | "1|info", 355 | time_str(), 356 | "{}/{}".format(total_block, block), 357 | "epochs finish.", 358 | "One Block {} Thread ".format(batch_size), 359 | ) 360 | 361 | 362 | def mkdir(origin_dir: str): 363 | """ mkdir file dir""" 364 | if not os.path.exists(origin_dir): 365 | os.mkdir(origin_dir) 366 | 367 | 368 | def read_file(read_path: str, mode: int = 0): 369 | """ read file """ 370 | if not os.path.exists(read_path): 371 | return [] if not mode else "" 372 | with open(read_path, "r", encoding="utf-8", newline="\n") as f: 373 | if not mode: 374 | data = [ii.strip() for ii in f.readlines()] 375 | elif mode == 1: 376 | data = f.read() 377 | elif mode == 2: 378 | data = list(f.readlines()) 379 | return data 380 | 381 | 382 | def log(types: str, *log_args: list): 383 | """ log record @param: type: {'critical', 'error', 'warning', 'info', 'debug'} """ 384 | mkdir(LOG_DIR) 385 | LOG_PATH = "{}{}.log".format(LOG_DIR, time_str(time_format="%Y%m%d")) 386 | logging.basicConfig( 387 | level=logging.DEBUG, 388 | filename=LOG_PATH, 389 | filemode="a", 390 | format="[%(asctime)s] [%(levelname)s] %(message)s", 391 | datefmt="%Y-%m-%d %H:%M:%S", 392 | ) 393 | logging.getLogger("requests").setLevel(logging.WARNING) 394 | logging.getLogger("urllib3").setLevel(logging.WARNING) 395 | logging.getLogger("chardet").setLevel(logging.WARNING) 396 | log_str = " ".join([str(ii) for ii in log_args]) 397 | if types == "critical": 398 | logging.critical(log_str) 399 | elif types == "error": 400 | logging.error(log_str) 401 | elif types == "warning": 402 | logging.warning(log_str) 403 | elif types == "info": 404 | logging.info(log_str) 405 | elif types == "debug": 406 | logging.debug(log_str) 407 | else: 408 | logging.info("{} {}".format(types, log_str)) 409 | 410 | 411 | def decoder_url(url: str, do_decoder: bool = False) -> dict: 412 | if "?" 
not in url: 413 | return {} 414 | decoder_dict = { 415 | ii.split("=", 1)[0]: ii.split("=", 1)[1] 416 | for ii in url.split("?", 1)[1].split("&") 417 | if ii != "" 418 | } 419 | if do_decoder: 420 | decoder_dict = { 421 | key: urllib.parse.unquote(value) for key, value in decoder_dict.items() 422 | } 423 | return decoder_dict 424 | 425 | 426 | def encoder_url(url_dict: {}, origin_url: str) -> str: 427 | return "{}?{}".format( 428 | origin_url, 429 | "&".join( 430 | [ 431 | "{}={}".format(ii, urllib.parse.quote(str(jj))) 432 | for ii, jj in url_dict.items() 433 | ] 434 | ), 435 | ) 436 | 437 | 438 | def json_str(data: dict): 439 | """ equal to JSON.stringify in javascript """ 440 | return json.dumps(data, separators=(",", ":")) 441 | 442 | 443 | def decoder_cookie(cookie: str) -> dict: 444 | return {ii.split("=", 1)[0]: ii.split("=", 1)[1] for ii in cookie.split("; ")} 445 | 446 | 447 | def encoder_cookie(cookie_dict: {}) -> str: 448 | return "; ".join(["{}={}".format(ii, jj) for ii, jj in cookie_dict.items()]) 449 | 450 | 451 | def get_time_str(time_gap: int, is_gap: bool = True) -> str: 452 | if not is_gap: 453 | time_gap = int(time_gap // 60) 454 | day = int(time_gap // 1440) 455 | hour = int(time_gap / 60) % 24 456 | minute = int(time_gap % 60) 457 | result = "" 458 | if day: 459 | result += "{}Day ".format(day) 460 | if hour: 461 | result += "{:02d}h ".format(hour) 462 | if minute: 463 | if day and not hour: 464 | result += "{:02d}h ".format(hour) 465 | result += "{:02d}min".format(minute) 466 | return result.strip() 467 | 468 | 469 | def get_min_s(t: str) -> str: 470 | t = float(t) 471 | m = int(t // 60) 472 | s = int(t % 60) 473 | return "{:02d}:{:02d}".format(m, s) 474 | 475 | 476 | def replace_params(origin_str: str, reg: str) -> str: 477 | """ replace params """ 478 | params_re = re.findall(reg, origin_str) 479 | params = {} 480 | for ii in params_re: 481 | if not ii in params: 482 | params[ii] = len(params) 483 | for ii in sorted(list(params.keys()), key=lambda i: -len(i)): 484 | origin_str = origin_str.replace(ii, f"a{params[ii]}") 485 | return origin_str 486 | 487 | 488 | def decoder_fuzz(reg: str, file_path: str, replace_func=replace_params): 489 | """ simple decoder of fuzz file """ 490 | file_dir, file_name = os.path.split(file_path) 491 | origin_str = read_file(file_path, mode=1) 492 | origin_str = codecs.unicode_escape_decode(origin_str)[0] 493 | origin_str = replace_func(origin_str, reg) 494 | name1, name2 = file_name.split(".", 1) 495 | output_path = f"{file_dir}/{name1}_decoder.{name2}" 496 | echo( 497 | 1, 498 | "decoder fuzz file {} -> {}, total {} line.".format( 499 | file_name, output_path, origin_str.count("\n") 500 | ), 501 | ) 502 | with open(output_path, "w") as f: 503 | f.write(origin_str) 504 | 505 | 506 | def get_accept(types: str) -> str: 507 | """ @param: types => html, json, xhr """ 508 | if types == "html": 509 | return "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" 510 | elif types == "json": 511 | return "application/json, text/javascript, */*; q=0.01" 512 | elif types == "xhr": 513 | return "application/json, text/plain, */*" 514 | return "*/*" 515 | 516 | 517 | def get_use_agent(types: str = "pc") -> str: 518 | """ @param: types => pc, mobile""" 519 | if types == "pc": 520 | return "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3682.0 Safari/537.36" 521 | return "Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) 
AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1" 522 | 523 | 524 | def get_content_type(types: str = "utf8") -> str: 525 | return "application/x-www-form-urlencoded{}".format( 526 | ";charset=UTF-8" if types == "utf8" else "" 527 | ) 528 | 529 | 530 | def change_pic_size(picture_path: str, resize: tuple = (600, 600)): 531 | import cv2 532 | 533 | if not os.path.exists(picture_path): 534 | echo(0, "picture not found in", picture_path) 535 | return 536 | pic = cv2.imread(picture_path) 537 | pic = cv2.resize(pic, resize) 538 | split_text = os.path.splitext(picture_path) 539 | output_path = "{}_resize{}".format(*split_text) 540 | cv2.imwrite(output_path, pic) 541 | 542 | 543 | def load_configure(): 544 | """ load configure """ 545 | global LAST_CONFIG, rec_lists, send_lists, SCKEY 546 | if time_stamp() - LAST_CONFIG < 300: 547 | return 548 | if not os.path.exists(configure_path): 549 | shutil.copy(configure_path + ".tmp", configure_path) 550 | cfg = ConfigParser() 551 | cfg.read(configure_path, "utf-8") 552 | rec_list = cfg.get("email", "rec_lists").split(",") 553 | send_list = cfg.get("email", "send_lists").split(",") 554 | rec_lists = [ii.split(":") for ii in rec_list] 555 | send_lists = [ii.split(":") for ii in send_list] 556 | SCKEY = cfg.get("ServerChan", "SCKEY") 557 | 558 | 559 | headers = { 560 | "Cookie": "", 561 | "Accept": get_accept("html"), 562 | "Content-Type": get_content_type(), 563 | "User-Agent": get_use_agent(), 564 | } 565 | data_dir = "util/data/" 566 | log_path = "service.log" 567 | LAST_CONFIG = -1 568 | rec_lists, send_lists, SCKEY = [], [], "" 569 | configure_path = "util/util.ini" 570 | BASIC_SCURL = "https://sc.ftqq.com/%s.send" 571 | mkdir(data_dir) 572 | agent_lists = [ 573 | " ".join(index.split()[1:])[1:-1] for index in read_file("{}agent".format(data_dir)) 574 | ] 575 | if not len(agent_lists): 576 | agent_lists = [headers["User-Agent"]] 577 | 578 | agent_len = len(agent_lists) - 1 579 | html_timeout = 5 580 | json_timeout = 4 581 | start = [] 582 | spend_list = [] 583 | failure_map = {} 584 | is_service = False 585 | LOG_DIR = "log/" 586 | EMAIL_SIGN = "\n\n\nBest wish!!\n%s\n\n————————————————————\n• Send from script designed by gunjianpan." 
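# Illustrative sketch: how the URL/cookie helpers defined above in this file
# compose. The literal values below are assumptions for the example only, not
# data taken from this repo.
#   decoder_url("http://example.com/s?wd=spider&pn=10")  -> {"wd": "spider", "pn": "10"}
#   encoder_url({"wd": "spider", "pn": 10}, "http://example.com/s")
#       -> "http://example.com/s?wd=spider&pn=10"
#   encoder_cookie({"a": "1", "b": "2"})                  -> "a=1; b=2"
#   json_str({"a": 1, "b": [2, 3]})                       -> '{"a":1,"b":[2,3]}'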
587 | load_configure() 588 | -------------------------------------------------------------------------------- /zimuzu/zimuzu.ini.tmp: -------------------------------------------------------------------------------- 1 | [basic] 2 | zimuzu_id:ooAnc4 3 | drama_name:Game_of_Thrones -------------------------------------------------------------------------------- /zimuzu/zimuzu.py: -------------------------------------------------------------------------------- 1 | ''' 2 | @Author: gunjianpan 3 | @Date: 2019-02-28 09:47:06 4 | @Last Modified by: gunjianpan 5 | @Last Modified time: 2019-04-13 14:11:45 6 | ''' 7 | 8 | import codecs 9 | import os 10 | import re 11 | import shutil 12 | 13 | from configparser import ConfigParser 14 | from proxy.getproxy import GetFreeProxy 15 | from util.util import begin_time, end_time, can_retry 16 | 17 | proxy_req = GetFreeProxy().proxy_req 18 | 19 | """ 20 | * zimuzu @http 21 | * zmz005.com/XXXXXX 22 | """ 23 | 24 | configure_path = 'zimuzu/zimuzu.ini' 25 | data_dir = 'zimuzu/data/' 26 | 27 | 28 | class zimuzu(): 29 | ''' load download link from zimuzu ''' 30 | 31 | def __init__(self): 32 | cfg = ConfigParser() 33 | cfg.read(configure_path, 'utf-8') 34 | self.zimuzu_id = cfg.get('basic', 'zimuzu_id') 35 | self.drama_name = cfg.get('basic', 'drama_name') 36 | 37 | def load_url(self): 38 | ''' load url form zimuzu ''' 39 | 40 | url = 'http://zmz005.com/{}'.format(self.zimuzu_id) 41 | detail = proxy_req(url, 0) 42 | total = [] 43 | 44 | if not detail: 45 | print('retry') 46 | if can_retry(url): 47 | self.load_url() 48 | return 49 | season_list = detail.find_all( 50 | 'div', class_='tab-content info-content')[1:] 51 | for season in season_list: 52 | quality_list = season.find_all('div', class_='tab-pane') 53 | url_body = quality_list[1] if 'APP' in quality_list[0]['id'] else quality_list[0] 54 | season_id = re.findall(r"\d+\.?\d*", url_body['id'])[0] 55 | total.append(season_id) 56 | if int(season_id) < 12: 57 | url_body = quality_list[1] 58 | 59 | url_list = url_body.find_all('ul', class_='down-links') 60 | url = [index.find_all('div', class_='copy-link')[1]['data-url'] 61 | for index in url_list] 62 | total.append('\n'.join(url) + '\n') 63 | with codecs.open('{}{}'.format(data_dir, self.drama_name), 'w', encoding='utf-8') as f: 64 | f.write('\n'.join(total)) 65 | 66 | 67 | if __name__ == '__main__': 68 | if not os.path.exists(data_dir): 69 | os.makedirs(data_dir) 70 | if not os.path.exists(configure_path): 71 | shutil.copy(configure_path + '.tmp', configure_path) 72 | zimuzu = zimuzu() 73 | zimuzu.load_url() 74 | --------------------------------------------------------------------------------
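A minimal usage sketch tying the parts above together (this is not a file from the repository): it assumes a reachable MySQL instance configured via util/util.ini, that the `ip_proxy` table from proxy/table.sql already exists in the `netease` database, and it only calls the Db methods defined in util/db.py; the proxy address is a made-up placeholder.

import os
import sys

sys.path.append(os.getcwd())          # same import convention the spiders use
from util.db import Db

db = Db("netease")                    # database referenced by proxy/table.sql
# db.create_table("proxy/table.sql")  # the repo's helper for creating `ip_proxy`;
#                                     # note the SQL file also holds a USE statement

ok = db.insert_db(
    "INSERT INTO ip_proxy (address, http_type) VALUES ('127.0.0.1:1080', 0)"
)
rows = db.select_db("SELECT address FROM ip_proxy WHERE is_failured < 3")
if ok and rows is not False:          # both methods return False on error
    print([address for address, in rows])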