├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
└── pornhub_crawler
    ├── .idea
    │   ├── .gitignore
    │   ├── inspectionProfiles
    │   │   ├── Project_Default.xml
    │   │   └── profiles_settings.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   └── pornhub_crawler.iml
    ├── configure.txt
    ├── main.py
    ├── pornhub_video.sql
    ├── requirements.txt
    ├── tool
    │   ├── __init__.py
    │   ├── mysql.py
    │   └── video_download.py
    └── 安装库.txt
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 lzkgbld

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Usage

1. Set up the environment:

   pip install -r requirements.txt

   Or install the libraries by hand (use the latest versions unless pinned):
   1. requests
   2. PyMySQL
   3. youtube-dl
   4. lxml
   5. requests-toolbelt

2. Edit the configuration file
   Open configure.txt and fill in your settings; a sketch of the expected layout follows.
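
   For reference, a sketch of configure.txt's key==value layout (the keys must stay exactly as shipped, because main.py matches them literally; the values shown here are placeholders):

   ```text
   爬取类型(1:分类 2:收藏)==2
   存储文件夹==D:/Demo
   起始页==1
   结束页==1
   完整地址==32972131
   是否收费(1:免费 2:收费视频)==1
   数据库地址==127.0.0.1
   数据库账号==root
   数据库密码==<db password>
   数据库名称==ecchi_video
   账号==<premium account>
   密码==<premium password>
   ```

   In order, the keys mean: crawl type (1: category, 2: playlist/favorites), storage folder, start page, end page, full address or playlist ID, charge type (1: free, 2: paid), database host, database user, database password, database name, site account, site password.
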
3. Create the database (optional)
   This step is only needed to keep a second run from downloading duplicate videos; if you want it, build the structure described below.
   Fill in the database connection settings in configure.txt.
   Deduplication works by recording only the URL of each crawled video; storing titles through pymysql produced some odd errors, so URLs are used as the dedup key.
   There are two ways to build the database (see the SQL sketch after this list):
   1. Import the provided pornhub_video.sql file.
   2. Create the database yourself, fill its name into configure.txt, and make sure it contains a table named video; to use a different table name, edit tool/mysql.py in the source tree.
   3. If a self-created database makes the SQL fail, you will have to dig into the source and fix it yourself; no further hand-holding is offered!
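
   For option 2, the table the crawler expects is the one defined in pornhub_video.sql:

   ```sql
   DROP TABLE IF EXISTS `video`;
   CREATE TABLE `video` (
     `video_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
   ) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
   ```
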
   3.1 If you do not need deduplication, comment out the database statements marked in tool/video_download.py (the comments there point each one out), but keep the database fields in configure.txt filled in; do not leave them blank.

4. Run the crawler: go into the project directory containing main.py and run

   python main.py

## Downloading premium (paid) videos

1. Use a cookie-export extension to grab your cookie values; for Chrome, use cookies.txt (https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg).
   Only a Chrome extension is covered here; if you use another browser, find an equivalent plugin yourself. Copy the entire contents of the exported cookie file and paste them over the contents of cookies.txt in the source tree (see the format sketch below).
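
   For orientation: youtube-dl's --cookies option expects the Netscape cookie-file format, which the extension above produces. A rough, illustrative sketch (fields are tab-separated; the domain, name, and value here are placeholders):

   ```text
   # Netscape HTTP Cookie File
   .pornhubpremium.com	TRUE	/	TRUE	1700000000	<cookie-name>	<cookie-value>
   ```
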

2. Fill in the account and password in configure.txt, and make sure the account actually has access to paid videos!

3. In the same configuration file, explicitly set the charge type (1: free, 2: paid videos).

4. Run the crawler: go into the project directory containing main.py and run python main.py

## Notes
If you run into any problems, leave a message and they will be fixed where possible.
A web control panel is planned, so the crawler can be configured and scheduled directly from a page (development timeline undecided).

Downloads default to 1080p. To change the quality, edit the youtube-dl commands in tool/video_download.py: one sets the playlist/favorites quality, and two more set the quality for free and paid category downloads (the comments mark which is which). 720p and 480p also work, but make sure the site actually offers that quality! An example follows.
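
A minimal sketch of the change, shown for the playlist/favorites command and assuming the video actually offers a 720p stream:

```python
cmd = 'youtube-dl -f 720p ' + video_u + ' -o "' + video_file + '/%(title)s.%(ext)s"'
```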

The youtube-dl output template in these commands is quoted, so %(title)s.%(ext)s works as-is on both Windows and Linux; no shell escaping of the parentheses is needed.
--------------------------------------------------------------------------------
/pornhub_crawler/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
--------------------------------------------------------------------------------
/pornhub_crawler/configure.txt:
--------------------------------------------------------------------------------
爬取类型(1:分类 2:收藏)==2
存储文件夹==D:/Demo
起始页==1
结束页==1
完整地址==32972131
是否收费(1:免费 2:收费视频)==1
数据库地址==127.0.0.1
数据库账号==root
数据库密码==1cb1a931db23a8f3
数据库名称==ecchi_video
账号==fill in an account with premium access
密码==fill in the password for that premium account
--------------------------------------------------------------------------------
/pornhub_crawler/main.py:
--------------------------------------------------------------------------------
# Author: lzkgbld
# Date: 2020/9/20 12:56
# LastEditors: lzkgbld
# LastEditTime: 2020/9/20 12:56
from tool.video_download import por_favorites, por_type


if __name__ == "__main__":
    # Read the configuration file into a list, one entry per line
    txt_data = []
    for line in open('configure.txt', "r", encoding='UTF-8'):
        txt_data.append(line.strip('\n'))
    cl_video_type = None
    cl_path = None
    cl_num = None
    cl_end = None
    cl_url = None
    cl_type = None
    cl_host = None
    cl_user = None
    cl_pwd = None
    cl_name = None
    user_name = None
    user_pwd = None
    # Sort the values into their variables.
    # To crawl only a playlist, put its ID in the "完整地址" field; the whole playlist is fetched by default.
    # In playlist mode only the storage folder and the ID matter; the other fields are ignored.
    # Keys are compared exactly: the original used substring matching (`in`), which sent the
    # "账号"/"密码" lines into the "数据库账号"/"数据库密码" branches and left user_name/user_pwd unset.
    for t in txt_data:
        if "==" not in t:
            continue  # skip blank or malformed lines
        key, value = t.split("==", 1)
        if key == "爬取类型(1:分类 2:收藏)":
            cl_video_type = int(value)
        elif key == "存储文件夹":
            cl_path = value
        elif key == "起始页":
            cl_num = value
        elif key == "结束页":
            cl_end = value
        elif key == "完整地址":
            cl_url = value
        elif key == "是否收费(1:免费 2:收费视频)":
            cl_type = value
        elif key == "数据库地址":
            cl_host = value
        elif key == "数据库账号":
            cl_user = value
        elif key == "数据库密码":
            cl_pwd = value
        elif key == "数据库名称":
            cl_name = value
        elif key == "账号":
            user_name = value
        elif key == "密码":
            user_pwd = value
    if cl_video_type == 1:
        por_type(cl_num, cl_end, cl_url, cl_type, cl_path, cl_host, cl_user, cl_pwd, cl_name, user_name, user_pwd)
    elif cl_video_type == 2:
        por_favorites(cl_url, cl_path, cl_host, cl_user, cl_pwd, cl_name)

    # Sample values: D:/Demo/video (storage folder), 101749201 (playlist ID)
--------------------------------------------------------------------------------
/pornhub_crawler/pornhub_video.sql:
--------------------------------------------------------------------------------
/*
 Navicat Premium Data Transfer

 Source Server         : mysql
 Source Server Type    : MySQL
 Source Server Version : 50728
 Source Host           : localhost:3306
 Source Schema         : pornhub_video

 Target Server Type    : MySQL
 Target Server Version : 50728
 File Encoding         : 65001

 Date: 26/09/2020 17:43:02
*/

SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;

-- ----------------------------
-- Table structure for video
-- ----------------------------
DROP TABLE IF EXISTS `video`;
CREATE TABLE `video` (
  `video_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;

SET FOREIGN_KEY_CHECKS = 1;
--------------------------------------------------------------------------------
/pornhub_crawler/requirements.txt:
--------------------------------------------------------------------------------
PyMySQL==0.9.3
youtube-dl==2020.6.16.1
lxml==4.6.2
requests==2.23.0
requests-toolbelt==0.9.1
--------------------------------------------------------------------------------
/pornhub_crawler/tool/__init__.py:
--------------------------------------------------------------------------------
# Author: lzkgbld
# Date: 2020/9/20 12:56
# LastEditors: lzkgbld
# LastEditTime: 2020/9/20 12:56
--------------------------------------------------------------------------------
/pornhub_crawler/tool/mysql.py:
--------------------------------------------------------------------------------
# Author: lzkgbld
# Date: 2020/9/20 13:26
# LastEditors: lzkgbld
# LastEditTime: 2020/9/20 13:26
import pymysql
import time


class ManagementMysql(object):
    def __init__(self, host, user, password, database):
        self.host = host
        self.user = user
        self.password = password
        self.database = database

    # Check whether a video URL has already been recorded.
    # Returns "OK" if it is new, "NO" if it already exists.
    def check_video(self, video_url):
        try:
            conn = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                   database=self.database, charset="utf8")
            cursor = conn.cursor()
            # Parameterized query; the original concatenated the URL straight into the SQL string
            cursor.execute("SELECT video_url FROM video WHERE video_url = %s", (str(video_url),))
            # Fetch BEFORE closing; the original closed the cursor and connection first
            row = cursor.fetchone()
            cursor.close()
            conn.close()
            if row:
                return "NO"
            else:
                return "OK"
        except Exception as e:
            # Append the error to a log file
            with open("check_log.txt", "a+", encoding='utf-8') as f:
                f.writelines(str(e))
                f.writelines("\n")
                f.writelines("Failing video URL: " + str(video_url))
                f.writelines("\n")
                f.writelines("Time of error: " + str(time.strftime("%Y-%m-%d %H:%M:%S")))
                f.writelines("\n")

    # Record a video URL, skipping it if it is already present.
    def add_video(self, video_url):
        try:
            conn = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                   database=self.database, charset="utf8")
            cursor = conn.cursor()
            # Check whether this URL is already recorded
            cursor.execute("SELECT video_url FROM video WHERE video_url = %s", (str(video_url),))
            row = cursor.fetchone()
            if row:
                cursor.close()
                conn.close()
                return "video already exists"
            # Insert the new URL and commit; the original closed the connection
            # before this point and then tried to reuse it
            cursor.execute("INSERT INTO video(video_url) VALUES (%s)", (str(video_url),))
            conn.commit()
            cursor.close()
            conn.close()
        except Exception as e:
            # Append the error to a log file
            with open("add_log.txt", "a+", encoding='utf-8') as f:
                f.writelines(str(e))
                f.writelines("\n")
                f.writelines("URL that failed to insert: " + str(video_url))
                f.writelines("\n")
                f.writelines("Time of error: " + str(time.strftime("%Y-%m-%d %H:%M:%S")))
                f.writelines("\n")
--------------------------------------------------------------------------------
/pornhub_crawler/tool/video_download.py:
--------------------------------------------------------------------------------
# Author: lzkgbld
# Date: 2020/9/20 12:57
# LastEditors: lzkgbld
# LastEditTime: 2020/9/20 12:57
import os

import requests
from lxml import etree

from tool.mysql import ManagementMysql


def por_favorites(video_id, video_file, host, user, password, name):
    # MySQL helper for deduplication; comment this out if you do not want dedup
    mysql = ManagementMysql(host, user, password, name)
    # Site base URL, used to build full video links
    base_url = 'https://cn.pornhub.com'
    # Fetch the playlist page to read the video count
    num_url = 'https://cn.pornhub.com/playlist/' + video_id
    html_num = requests.get(num_url)
    html_num = etree.HTML(html_num.text)
    data_num = html_num.xpath('//*[@id="aboutPlaylist"]/div[1]/text()')
    video_num = data_num[1].strip()
    video_num = int(video_num.split('个')[0].split('-')[-1].strip())
    # The playlist is served in chunks of 50 items
    video_num = int(video_num / 50) + 1
    num = 1
    # Chunk offset; the original computed this but passed the loop counter instead
    offset = 0
    while num <= video_num:
        # Fetch one 50-item chunk of the playlist
        url = 'https://cn.pornhub.com/playlist/viewChunked?id=' + video_id + '&offset=' + str(offset) + '&itemsPerPage=50'
        html = requests.get(url)
        html = etree.HTML(html.text)
        data = html.xpath("//li/div/div/a/@href")
        for d in data:
            dd = d.split('&')[0]
            video_u = base_url + dd
            # Comment out this check if you do not need deduplication
            if mysql.check_video(video_u) == "OK":
                # Queue the download; the output template is quoted so it works on Linux shells too
                cmd = 'youtube-dl -f 1080p ' + video_u + ' -o "' + video_file + '/%(title)s.%(ext)s"'
                print(cmd)
                results = os.system(cmd)
                print("download status " + str(results))
                if results != 0:
                    # Requested quality unavailable: fall back to the best quality on offer
                    cmd1 = 'youtube-dl ' + video_u + ' -o "' + video_file + '/%(title)s.%(ext)s"'
                    print(cmd1)
                    results = os.system(cmd1)
                    print("quality download failed, falling back to the best available quality")
                    print("corrected download status " + str(results))
                # Record the video URL; comment this out if you do not need deduplication
                # (the original inserted the chunk URL here, so dedup never matched)
                mysql.add_video(video_u)
            else:
                print("duplicate video: " + video_u)
        offset += 50
        num += 1


def por_type(start_num, end_num, video_url, charge_type, video_file, host, user, password, name, user_name, user_pwd):
    start_num = int(start_num)
    end_num = int(end_num)
    # charge_type arrives as a string from configure.txt; the original never converted it,
    # so neither branch below could ever run
    charge_type = int(charge_type)
    # MySQL helper for deduplication; comment this out if you do not want dedup
    mysql = ManagementMysql(host, user, password, name)
    base_url = 'https://cn.pornhub.com'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.52 Safari/536.5'
    }
    # Free videos
    if charge_type == 1:
        while start_num <= end_num:
            page_url = video_url + '&page=' + str(start_num)
            html = requests.get(page_url, headers=headers).text
            html = etree.HTML(html)
            video_list = html.xpath('//*[@data-segment="straight"]/div/div[1]/a/@href')
            for vl in video_list:
                # Build the full video URL from the site base
                # (the original appended the href to the listing-page URL)
                url = base_url + vl
                # Comment out this check if you do not need deduplication
                if mysql.check_video(url) == "OK":
                    cmd = 'youtube-dl -f 1080p ' + url + ' -o "' + video_file + '/%(title)s.%(ext)s"'
                    results = os.system(cmd)
                    print("download status " + str(results))
                    if results != 0:
                        # Fall back to the best available quality
                        cmd1 = 'youtube-dl ' + url + ' -o "' + video_file + '/%(title)s.%(ext)s"'
                        results = os.system(cmd1)
                        print("quality download failed, falling back to the best available quality")
                        print("corrected download status " + str(results))
                    # Comment out this insert as well if you do not need deduplication
                    mysql.add_video(url)
                else:
                    print("duplicate video: " + url)

            start_num += 1
    # Paid (premium) videos
    elif charge_type == 2:
        base_url = 'https://cn.pornhubpremium.com'
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36",
            'Referer': 'https://cn.pornhubpremium.com/premium/login'
        }

        # Load the login page to pick up the session cookie and the CSRF token
        html_login = requests.get('https://cn.pornhubpremium.com/premium/login')
        h_cookie = requests.utils.dict_from_cookiejar(html_login.cookies)
        html = etree.HTML(html_login.text)
        token = html.xpath('//*[@id="token"]/@value')[0]
        redirect = {'from': 'pc_premium_login', 'segment': 'straight'}
        # username / password come from configure.txt
        data = {'username': user_name, 'password': user_pwd, 'token': token, 'redirect': redirect}
        r = requests.post("https://cn.pornhubpremium.com/front/authenticate", headers=headers, data=data,
                          cookies=h_cookie)
        # print(r.text)
        # Keep the authenticated session cookies for the listing requests
        cookies = requests.utils.dict_from_cookiejar(r.cookies)

        while start_num <= end_num:
            page_url = video_url + '&page=' + str(start_num)
            # The original passed the misspelled keyword cookice=, which requests does not accept
            html = requests.get(page_url, headers=headers, cookies=cookies).text
            html = etree.HTML(html)
            video_list = html.xpath('//*[@data-segment="straight"]/div/div[1]/a/@href')
            for vl in video_list:
                url = base_url + vl
                # Comment out this check if you do not need deduplication
                if mysql.check_video(url) == "OK":
                    # youtube-dl reads the exported login cookies from cookies.txt
                    cmd = 'youtube-dl -f 1080p ' + url + ' --cookies cookies.txt -o "' + video_file + '/%(title)s.%(ext)s"'
                    results = os.system(cmd)
                    print("download status " + str(results))
                    if results != 0:
                        cmd1 = 'youtube-dl ' + url + ' --cookies cookies.txt -o "' + video_file + '/%(title)s.%(ext)s"'
                        results = os.system(cmd1)
                        print("quality download failed, falling back to the best available quality")
                        print("corrected download status " + str(results))
                    # Comment out this insert as well if you do not need deduplication
                    mysql.add_video(url)
                else:
                    print("duplicate video: " + url)
            start_num += 1
--------------------------------------------------------------------------------
/pornhub_crawler/安装库.txt:
--------------------------------------------------------------------------------
requests==2.23.0
lxml==4.6.2
PyMySQL==0.9.3
youtube-dl==2020.6.16.1
requests-toolbelt==0.9.1
--------------------------------------------------------------------------------