├── requirements.txt ├── static ├── loading.gif └── css │ └── style.css ├── README.md ├── templates └── post.html ├── GetTxt.py ├── GetPpt.py └── GetAll.py /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | chardet 3 | bs4 4 | Pillow 5 | pdfkit 6 | flask 7 | imgkit 8 | img2pdf -------------------------------------------------------------------------------- /static/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ChangeWeDer/BaiduWenkuSpider_flaskWeb/HEAD/static/loading.gif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BaiduWenkuSpider_flaskWeb 2 | 以web server形式实现对百度文库文档以pdf形式原格式下载 3 | 如果觉得可以的话,可以点个**🌟**哦 4 | (**当前爬取方式可能已经不支持,仅提供flask开发参考**) 5 | 6 | ## 前言 7 | 首先,这是根据 8 | [https://github.com/M010K/BaiduWenkuSpider](https://github.com/M010K/BaiduWenkuSpider) 9 | 的项目进行一点修改得到的基于flask框架的python web项目, 10 | 可以对百度文库的文档转换为pdf格式进行下载 11 | 12 | **[博客地址](https://www.upstudy.top/index.php/archives/21/)** 13 | 14 | ## 如何使用? 15 | #### 一、下载项目zip包,或者直接用git获取 16 | 17 | **$ git clone https://github.com/ChangeWeDer/BaiduWenkuSpider_flaskWeb** 18 | 19 | 20 | #### 二、安装依赖 21 | 项目使用的依赖有 22 | 1. requests 23 | 2. chardet 24 | 3. bs4 25 | 4. Pillow 26 | 5. pdfkit 27 | 6. flask 28 | 7. imgkit 29 | 8. img2pdf 30 | 31 | cd到项目文件夹中使用命令,直接一键安装 32 | **pip install -r requirements.txt** 33 | 34 | #### 三、安装wkhtmltopdf工具 35 | [官网下载地址](https://wkhtmltopdf.org/downloads.html) 36 | 37 | 下载后按当前系统 38 | 配置环境变量即可 39 | 40 | **window:** 41 |  42 | 43 | **Centos:** 44 | 45 | [https://blog.csdn.net/LookingTomorrow/article/details/93513457](https://blog.csdn.net/LookingTomorrow/article/details/93513457) 46 | 47 | #### 四、直接运行GetAll.py文件,访问http://127.0.0.1:5000/post 即可(运行在服务器端则访问IP:5000/post) 48 | 49 |  50 | 51 | ps:ppt格式的文档不支持预览 52 | #### 五、Github源码下载地址 53 | [https://github.com/ChangeWeDer/BaiduWenkuSpider_flaskWeb](https://github.com/ChangeWeDer/BaiduWenkuSpider_flaskWeb) 54 | -------------------------------------------------------------------------------- /static/css/style.css: -------------------------------------------------------------------------------- 1 | .container { 2 | width: 500px; 3 | height: 50px; 4 | margin: 100px auto; 5 | } 6 | 7 | .parent { 8 | width: 100%; 9 | height: 42px; 10 | top: 4px; 11 | position: relative; 12 | } 13 | 14 | .parent>input:first-of-type { 15 | /*输入框高度设置为40px, border占据2px,总高度为42px*/ 16 | width: 380px; 17 | height: 40px; 18 | border: 1px solid #ccc; 19 | font-size: 16px; 20 | outline: none; 21 | } 22 | 23 | .parent>input:first-of-type:focus { 24 | border: 1px solid #317ef3; 25 | padding-left: 10px; 26 | } 27 | 28 | .parent>input:last-of-type { 29 | /*button按钮border并不占据外围大小,设置高度42px*/ 30 | width: 100px; 31 | height: 44px; 32 | position: absolute; 33 | background: #317ef3; 34 | border: 1px solid #317ef3; 35 | color: #fff; 36 | font-size: 16px; 37 | outline: none; 38 | } 39 | 40 | .a_demo_two { 41 | background-color:#317ef3; 42 | padding:10px; 43 | position:relative; 44 | font-family: 'Open Sans', sans-serif; 45 | font-size:12px; 46 | text-decoration:none; 47 | color:#fff; 48 | background-image: linear-gradient(bottom, rgb(100,170,30) 0%, rgb(129,212,51) 100%); 49 | box-shadow: inset 0px 1px 0px #b2f17f, 0px 6px 0px #3d6f0d; 50 | border-radius: 5px; 51 | } 52 | 53 | .a_demo_two:active { 54 | top:7px; 55 | background-image: linear-gradient(bottom, rgb(100,170,30) 100%, rgb(129,212,51) 0%); 56 | box-shadow: inset 0px 1px 0px #b2f17f, inset 0px -1px 0px #3d6f0d; 57 | color: #156785; 58 | text-shadow: 0px 1px 1px rgba(255,255,255,0.3); 59 | background: rgb(44,160,202); 60 | } 61 | 62 | .a_demo_two::before { 63 | background-color:#072239; 64 | content:""; 65 | display:block; 66 | position:absolute; 67 | width:100%; 68 | height:100%; 69 | padding-left:2px; 70 | padding-right:2px; 71 | padding-bottom:4px; 72 | left:-2px; 73 | top:5px; 74 | z-index:-1; 75 | border-radius: 6px; 76 | box-shadow: 0px 1px 0px #fff; 77 | } 78 | 79 | .a_demo_two:active::before { 80 | top:-2px; 81 | } 82 | 83 | 84 | -------------------------------------------------------------------------------- /templates/post.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |
5 | 6 | 7 |标签--文本内容 365 | """ 366 | p = t.get('p') 367 | ps = t.get('ps') 368 | s = t.get('s') 369 | z = [' ', "\n"] 370 | k, N = 10, 1349.19 / 1262.85 371 | # T = self.j 372 | U = self.O(ps) 373 | w, h, y, x, D= p.get('w'), p.get('h'), p.get('y'), p.get('x'), p.get('z') 374 | pattern=re.compile("[\s\t\0xa0]| [\0xa0\s\t]$") 375 | final = [] 376 | 377 | if U and ps and ((ps.get('_opacity') and ps.get('_opacity') == 1) or (ps.get('_alpha') and ps.get('_alpha') == 0)): 378 | return "" 379 | else: 380 | width = math.floor(w * k * N) 381 | height = math.floor(h * k * N) 382 | final.append("
') 390 | final.append(t.get('c') if t.get('c') else "") 391 | final.append(U and ps and str(self.isNumber(ps.get('_enter'))) and z[ps.get('_enter') if ps.get('_enter') else 1] or "") 392 | final.append("
") 393 | 394 | return "".join(final) 395 | 396 | 397 | def processStyleOfS(self, t, font, r, version): 398 | """ 399 | :param t: 文本的s属性 400 | :param font: font属性 401 | :param r:font属性 402 | :param version: 403 | :return:处理好的S属性字符串 404 | """ 405 | infoOfS = [] 406 | n = {"font-size": 1} 407 | p , u = 10, 1349.19 / 1262.85 408 | 409 | def fontfamily(o): 410 | n = font.get(o) or o if font else o 411 | if abs(version) > 5: 412 | infoOfS.append("font-family:'"+ n + "','" + o + "','" + (r.get('n') and r[n] or n) + "';") 413 | else: 414 | infoOfS.append("font-family:'" + o + "','" + n + "','" + (r.get(n) and r[n] or n) + "';") 415 | 416 | def bold(e): 417 | "false" == e or infoOfS.append("font-weight:600;") 418 | 419 | def letter(e): 420 | infoOfS.append("letter-spacing:" + str(eval(e) * p) + "px;") 421 | 422 | if t is not None: 423 | for attribute in t: 424 | if attribute == "font-family": 425 | fontfamily(t[attribute]) 426 | elif attribute == "bold": 427 | bold(t[attribute]) 428 | elif attribute == "letter-spacing": 429 | letter(t[attribute]) 430 | else: 431 | infoOfS.append(attribute + ":" + (str(math.floor(((t[attribute] if self.isNumber(t[attribute]) else eval(t[attribute])) * p * u))) + "px" if n.get(attribute) else t[attribute]) + ";") 432 | 433 | return "".join(infoOfS) 434 | 435 | 436 | def processStyleOfR(self, r, page): 437 | """ 438 | :param r: 文本的r属性 439 | :param page: 当前页面 440 | :return: 441 | """ 442 | l = " " + "reader-word-s" + str(page) + "-" 443 | return "".join([l + str(x) for x in r]) if isinstance(r, list) and len(r) != 0 else "" 444 | 445 | 446 | def processStyleOf_rotate(self, t, w, h, x, y, k, N): 447 | """ 448 | :param t: _rotate属性 449 | :param w: body中p.w 450 | :param h: body中p.h 451 | :param x: body中p.x 452 | :param y: body中p.y 453 | :param k: 倍数10 454 | :param N: 比例系数 455 | :return: 处理好的_rotate属性字符串 456 | """ 457 | p = [] 458 | s = k * N 459 | if t == 90: 460 | p.append("left:" + str(math.floor(x + (w - h) / 2) * s) + "px;" + "top:" + str(math.floor(y - (h - w) / 2) * s) + "px;" + "text-align: right;" + "height:" + str(math.floor(h + 7) * s) + "px;") 461 | elif t == 180: 462 | p.append("left:" + str(math.floor(x - w) * s) + "px;" + "top:" + str(math.floor(y - h) * s) + "px;") 463 | elif t == 270: 464 | p.append("left:" + str(math.floor(x + (h - w) / 2) * s) + "px;" + "top:" + str(math.floor(y - (w - h) / 2) * s) + "px;") 465 | 466 | return "-webkit-"+"transform:rotate("+str(t)+"deg);"+"".join(p) 467 | 468 | 469 | def processStyleOf_scaleX(self, t, width, height): 470 | """ 471 | :param t: _scaleX属性 472 | :param width: 计算好的页面width 473 | :param height:计算好的页面height 474 | :return: 处理好的_scaleX属性字符串 475 | """ 476 | return "-webkit-" + "transform: scaleX(" + str(t) + ");" + "-webkit-" + "transform-origin:left top;width:" + str(width + math.floor(width / 2)) + "px;height:" + str(height + math.floor(height / 2)) + "px;" 477 | 478 | 479 | def processStyleOfOpacity(self,t): 480 | """ 481 | :param t: opacity属性 482 | :return:处理好的opacity属性字符串 483 | """ 484 | t = (t or 0), 485 | return "opacity:" + str(t) + ";" 486 | 487 | 488 | def creatTagOfImage(self,t,url, *args): 489 | """ 490 | :param t: 图片的字典 491 | :param url:图片链接 492 | :param args: 493 | :return:图像标签 494 | """ 495 | u, l = t.get('p'), t.get('c') 496 | if u.get("opacity") and u.get('opacity') == 0: 497 | return "" 498 | else: 499 | if u.get("x1") or (u.get('rotate') != 0 and u.get('opacity') != 1): 500 | message = '' 502 | else: 503 | [s, h] = [str(x) for x in args] 504 | message = '