├── .gitignore
├── requirements.txt
├── images
│   ├── crawl_result.png
│   └── download_xunlei.png
├── pornhub
│   ├── spiders
│   │   ├── __init__.py
│   │   ├── RelatedShow.py
│   │   ├── TopRated.py
│   │   └── spider.py
│   ├── items.py
│   ├── __init__.py
│   ├── pipelines.py
│   └── settings.py
├── scrapy.cfg
└── README.md

/.gitignore:
--------------------------------------------------------------------------------
.idea
data
/data/
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
scrapy
pillow
pysocks
lxml
js2py
clint
fire
--------------------------------------------------------------------------------
/images/crawl_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adultfree/pornhub/HEAD/images/crawl_result.png
--------------------------------------------------------------------------------
/images/download_xunlei.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adultfree/pornhub/HEAD/images/download_xunlei.png
--------------------------------------------------------------------------------
/pornhub/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = pornhub.settings

[deploy]
#url = http://localhost:6800/
project = pornhub
--------------------------------------------------------------------------------
/pornhub/spiders/RelatedShow.py:
--------------------------------------------------------------------------------
from pornhub.items import *
from pornhub.spiders.spider import Spider


class RelatedShowSpider(Spider):
    name = "related_show"
    # Paste the URLs of the videos you like here; their related videos are crawled
    # as well, up to the depth configured in settings.py.
    start_urls = ['https://www.pornhub.com/view_video.php?viewkey=ph5f5bf00b43be0']

    def parse(self, response):
        # Each start URL is itself a video detail page, so parse the response we
        # already have instead of requesting the same URL a second time.
        return self.parse_detail(response)
--------------------------------------------------------------------------------
/pornhub/spiders/TopRated.py:
--------------------------------------------------------------------------------
import scrapy

from pornhub.spiders.spider import Spider


class TopRatedSpider(Spider):
    name = "top_rated"
    start_urls = ['https://www.pornhub.com/video?o=tr']

    def parse(self, response):
        items = self.get_mainpage_items(response)
        for item in items:
            yield item
            url = 'https://www.pornhub.com/view_video.php?viewkey=%s' % item["key"]
            yield scrapy.Request(url, callback=self.parse_detail)
--------------------------------------------------------------------------------
/pornhub/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class WebmItem(scrapy.Item):
    url = scrapy.Field()
    filename = scrapy.Field()
    key = scrapy.Field()
    title = scrapy.Field()


class Mp4Item(scrapy.Item):
    url = scrapy.Field()
    filename = scrapy.Field()
    key = scrapy.Field()
    title = scrapy.Field()
    categories = scrapy.Field()
    uploader = scrapy.Field()
    pornstars = scrapy.Field()
    productions = scrapy.Field()
    tags = scrapy.Field()
--------------------------------------------------------------------------------
/pornhub/__init__.py:
--------------------------------------------------------------------------------
import datetime
import json
import logging
import os

from pornhub.settings import FILES_STORE, DOWNLOAD_MP4_VIDEO, DOWNLOAD_WEBM_VIDEO, DATA_FILE_STORE


def setup_logger(logger_name, log_file, level=logging.INFO):
    log_setup = logging.getLogger(logger_name)
    fileHandler = logging.FileHandler(log_file, mode='a')
    log_setup.setLevel(level)
    log_setup.addHandler(fileHandler)
    # Do not propagate records to the parent logger
    log_setup.propagate = False
    # Uncomment the two lines below to also echo the records to the console
    # streamHandler = logging.StreamHandler()
    # log_setup.addHandler(streamHandler)


current_datetime = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# When downloading is disabled, write the video links to log files in the working directory
if not DOWNLOAD_WEBM_VIDEO:
    setup_logger("webem_logger", "./%s-webem.log" % current_datetime)
if not DOWNLOAD_MP4_VIDEO:
    setup_logger("mp4_logger", "./%s-mp4.log" % current_datetime)

# Make sure the data directory exists so data.json can be read here and written on
# spider close, even when downloading is disabled and FilesPipeline never creates it.
os.makedirs(FILES_STORE, exist_ok=True)

data = {}
if os.path.exists(DATA_FILE_STORE):
    with open(DATA_FILE_STORE, 'r') as f:
        data = json.loads(f.read())
--------------------------------------------------------------------------------
/pornhub/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import logging
import os

import scrapy
from scrapy.pipelines.files import FilesPipeline

from . import settings
from .items import *


class pornhubFilesPipeline(FilesPipeline):
    webem_logger = logging.getLogger('webem_logger')
    mp4_logger = logging.getLogger('mp4_logger')

    def item_completed(self, results, item, info):
        # Nothing extra to do yet; kept as a hook for post-download handling
        items = super().item_completed(results, item, info)
        return items

    def get_media_requests(self, item, info):
        # Skip files that already exist on disk or that Xunlei (Thunder) is still downloading
        filename = os.path.join(settings.FILES_STORE, item['filename'])
        xldownload_filename = filename + ".xltd"
        if os.path.exists(filename) or os.path.exists(xldownload_filename):
            info.spider.logger.warning("Skipping already downloaded file: %s" % item['filename'])
            return
        if isinstance(item, WebmItem):
            if settings.DOWNLOAD_WEBM_VIDEO:
                yield scrapy.Request(item['url'], meta={'filename': item['filename']})
            else:
                self.webem_logger.info(item['url'])
        else:
            if settings.DOWNLOAD_MP4_VIDEO:
                yield scrapy.Request(item['url'], meta={'filename': item['filename']})
            else:
                self.mp4_logger.info(item['url'])

    def file_path(self, request, response=None, info=None):
        # file_path must return a relative path; Scrapy joins it onto the FILES_STORE root
        return request.meta['filename']
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Pornhub Video Crawler

## Introduction

As everyone knows, [Pornhub](http://pornhub.com/) is one of the favourite sites of homebodies everywhere: a late-night browse clears the mind and sends me straight off to sleep.

The site is already very polished, and its related-video recommendations in particular are exactly to my taste.

Because of the firewall, the site cannot be reached directly from mainland China. Only by tunnelling through an [overseas server](https://justhost.ru/?ref=66813) can I barely watch a few clips and preview GIFs.

The GIFs are irresistible every time, yet the videos stutter terribly. For a lonely soul late at night, that kind of browsing is drinking poison to quench a thirst.

After some back and forth, I decided to spend a little time and, borrowing from other people's code on GitHub, write a simple crawler.

## What it crawls

The crawler supports a configurable depth: after fetching a page, it keeps following the pages listed in that page's recommendation section.

It currently crawls every video on the [Top Rated](https://www.pornhub.com/video?o=tr) page, and can also download just a single video you are interested in.

## Usage

```shell script
# Make sure you are running Python 3
root@adultfree:~/pornhub# python3 --version
Python 3.5.2
# Install the dependencies
root@adultfree:~/pornhub# python3 -m pip install -r requirements.txt
......
# Crawl the top-rated videos (and their recommended videos)
root@adultfree:~/pornhub# scrapy crawl top_rated

# Crawl a single video you are interested in (and its recommended videos)
root@adultfree:~/pornhub# scrapy crawl related_show
```
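
If you would rather not edit `pornhub/settings.py` for a one-off run, Scrapy's `-s` option can override any built-in setting on the command line; the values below are only an illustration. Note that `DOWNLOAD_MP4_VIDEO` and `DOWNLOAD_WEBM_VIDEO` are plain module-level constants read straight from `settings.py`, so those two still need to be changed in the file itself.

```shell script
# Follow recommendations one level deeper and write the crawl log to a file, for this run only
root@adultfree:~/pornhub# scrapy crawl top_rated -s DEPTH_LIMIT=2 -s LOG_FILE=./top_rated.log
```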

## Notes

Mind the download switches in pornhub/settings.py:

```python
# When DOWNLOAD_MP4_VIDEO is True, the crawler tries to download the videos itself.
# Downloads are extremely slow from mainland China, so it is strongly recommended
# to only collect the URLs (leave it as False).
# When both options are False, the video links are written to a "date-time-type.log"
# file. Open the file, copy the URLs in bulk into Xunlei (Thunder) and download
# them there; it is much faster.
DOWNLOAD_MP4_VIDEO = False
DOWNLOAD_WEBM_VIDEO = False
```

Mind the crawl depth as well; by default only the videos on the current page are fetched.
* For related_show, only the videos specified in `RelatedShow.py` are downloaded (plus the preview clips on those pages).
* For top_rated, only the videos shown on the listing page are downloaded (plus the preview clips on each video page).

```python
# Depth of the related-video crawl; by default only the current page's videos are
# downloaded (related videos are not).
# Note: every extra level of depth increases the number of downloaded videos exponentially.
DEPTH_LIMIT = 1
```

Also, I only extract the link for the highest available quality. The sharper, the more real :-)

## Tips

Links fed to Xunlei download quickly at first, but after roughly ten minutes the speed gradually drops.
That is because the download URL carries a SessionId, and the Pornhub servers apparently attach an expiry time to it.
Each batch usually leaves me with two or three links that never finish; the only option is to delete the incomplete videos and crawl again.
On a re-crawl, the code checks whether a video already exists under the `data` directory and, if so, no longer records its download link. For this check to work, it is best to point Xunlei's download directory at `pornhub/data`.

The other approach is to crawl directly on the proxy server: upload the code there and set `DOWNLOAD_MP4_VIDEO` to `True`.
Download speed on the proxy server reaches about 30 MB/s, so the disk fills up very quickly.
There is currently no code that checks the remaining disk space, so keep an eye on the crawl depth yourself.
Once the data has been crawled onto the proxy server, you can install a simple Nginx static server, put the downloaded files under its DocumentRoot, and let Xunlei download straight from the proxy server. That way the links never expire and the download speed is about the same; a sketch follows below.
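
For the Nginx part, a minimal static-server configuration is all that is needed. The sketch below is only an illustration: it assumes the crawl output lives in /root/pornhub/data and that exposing a plain directory listing on port 8080 is acceptable; adjust the path, port and access control to your own setup.

```nginx
# /etc/nginx/conf.d/pornhub-data.conf -- minimal static file server (illustrative)
server {
    listen 8080;
    server_name _;

    # Point the document root at the crawler's download directory
    root /root/pornhub/data;

    # Allow browsing the downloaded files from a browser or download manager
    autoindex on;
}
```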

## Screenshots

#### Crawl result

![Crawl result](https://raw.githubusercontent.com/adultfree/pornhub/master/images/crawl_result.png)

#### Download status

![Download status](https://raw.githubusercontent.com/adultfree/pornhub/master/images/download_xunlei.png)

## Additional notes

If the crawler cannot fetch anything, the most likely reason is that you are not going through a circumvention proxy.
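
For reference, Scrapy's built-in HttpProxyMiddleware honours the standard proxy environment variables, so if you already run a local HTTP proxy (for example Privoxy forwarding to a ShadowSocks client; Scrapy cannot speak SOCKS5 on its own), exporting the variables before crawling is usually enough. The address and port below are illustrative only:

```shell script
# Route the crawler through a local HTTP proxy (address and port depend on your setup)
root@adultfree:~/pornhub# export http_proxy=http://127.0.0.1:8118
root@adultfree:~/pornhub# export https_proxy=http://127.0.0.1:8118
root@adultfree:~/pornhub# scrapy crawl top_rated
```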

I recently found that JustHost's Russian servers are very reliable; for China Telecom and China Unicom users in particular, the speed is excellent.
I set up my own ShadowSocks environment on one. Throughput peaks at 20 MB/s and usually sits around 3 MB/s.

**When visiting JustHost, use Chrome with Google Translate, otherwise the site is available in Russian only.**

The price is very friendly: I bought a full year outright for 1047 RUB, which at the exchange rate of the time came to 105.81 RMB.
Considering that treating friends to dinner the other day cost me 230, that really is cheap.
JustHost also runs a referral programme; if you sign up [through this link](https://justhost.ru/?ref=66813), I get a 5% kickback.
--------------------------------------------------------------------------------
/pornhub/spiders/spider.py:
--------------------------------------------------------------------------------
import json
import re

import js2py
import scrapy

from pornhub import data, DATA_FILE_STORE
from pornhub.items import *


class Spider(scrapy.Spider):

    def parse(self, response):
        raise NotImplementedError('{}.parse callback is not defined'.format(self.__class__.__name__))

    def parse_detail(self, response):
        item = Mp4Item()
        item['key'] = response.request.url.split("=")[-1]
        item['title'] = ''.join(response.xpath('//h1//text()').extract()).strip()
        item['categories'] = list(map(lambda x: x.strip(), response.xpath('//div[@class="categoriesWrapper"]/a/text()').extract()))
        item['uploader'] = response.xpath('//div[@class="video-info-row"]/div/span[@class="usernameBadgesWrapper"]/a/text()').extract_first()
        item['pornstars'] = list(map(lambda x: x.strip(), response.xpath('//div[@class="pornstarsWrapper"]/a/text()').extract()))
        item['productions'] = list(map(lambda x: x.strip(), response.xpath('//div[@class="productionWrapper"]/a/text()').extract()))
        item['tags'] = list(map(lambda x: x.strip(), response.xpath('//div[@class="tagsWrapper"]/a/text()').extract()))
        selectors = response.xpath('//div[@id="player"]/script[1]/text()')
        if len(selectors) > 0:
            item['url'] = self.exeJs(''.join(selectors.extract_first().split("playerObjList")[0]))
            item['filename'] = item['url'].split("?")[0].split("/")[-1]
            # Record the video metadata in the in-memory store (persisted to data.json on close)
            if item["key"] not in data:
                dict_item = dict(item)
                dict_item.pop("url")
                data[item["key"]] = dict_item
            yield item
            items = self.get_recommended_items(response)
            for item in items:
                yield item
                url = 'https://www.pornhub.com/view_video.php?viewkey=%s' % item["key"]
                yield scrapy.Request(url, callback=self.parse_detail)
        else:
            self.logger.info("Failed to extract video: %s", str(item))

    def get_mainpage_items(self, response):
        recommends = map(lambda x: x.split("=")[-1], response.xpath('//*[@class="phimage"]/a/@href').extract())
        titles = response.xpath('//*[@class="phimage"]/a/@title').extract()
        webems = response.xpath('//*[@class="phimage"]/a/img/@data-mediabook').extract()
        return self.get_webm_items(recommends, titles, webems)

    def get_recommended_items(self, response):
        recommends = map(lambda x: x.split("=")[-1], response.xpath("//div[@class='video-wrapper js-relatedRecommended js-relatedVideos relatedVideos']//div[@class='phimage']/a/@href").extract())
        titles = response.xpath("//div[@class='video-wrapper js-relatedRecommended js-relatedVideos relatedVideos']//div[@class='phimage']/a/@title").extract()
        webems = response.xpath("//div[@class='video-wrapper js-relatedRecommended js-relatedVideos relatedVideos']//div[@class='phimage']/a/img/@data-mediabook").extract()
        return self.get_webm_items(recommends, titles, webems)

    def get_webm_items(self, recommends, titles, webems):
        items = []
        for (key, title, url) in zip(recommends, titles, webems):
            if key == "javascript:void(0)":
                # Skip members-only entries
                continue
            item = WebmItem()
            item["url"] = url
            item["filename"] = url.split("?")[0].split("/")[-1]
            item["key"] = key
            item["title"] = title
            items.append(item)
        return items

    def closed(self, reason):
        # Persist the collected metadata when the spider shuts down
        with open(DATA_FILE_STORE, 'w') as f:
            f.write(json.dumps(data, indent=True))

    def exeJs(self, js):
        # The player script defines a flashvars_<id> object holding the video URLs;
        # evaluate it with js2py and pick the highest quality available.
        flashvars = re.findall(r'flashvars_\d+', js)[0]
        res = js2py.eval_js(js + flashvars)
        # For non-members the highest available quality is 1080p
        if res.quality_1080p:
            return res.quality_1080p
        elif res.quality_720p:
            return res.quality_720p
        elif res.quality_480p:
            return res.quality_480p
        elif res.quality_240p:
            return res.quality_240p
        else:
            self.logger.error('parse url error')
--------------------------------------------------------------------------------
/pornhub/settings.py:
--------------------------------------------------------------------------------
# Scrapy settings for pornhub project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
import datetime
import os

BOT_NAME = 'pornhub'

SPIDER_MODULES = ['pornhub.spiders']
NEWSPIDER_MODULE = 'pornhub.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'pornhub (+http://www.yourdomain.com)'

# When DOWNLOAD_MP4_VIDEO is True, the crawler tries to download the videos itself.
# Downloads are extremely slow from mainland China, so it is strongly recommended
# to only collect the URLs (leave it as False).
# When both options are False, the video links are written to a "date-time-type.log"
# file. Open the file, copy the URLs in bulk into Xunlei (Thunder); it is much faster.
DOWNLOAD_MP4_VIDEO = False
DOWNLOAD_WEBM_VIDEO = False

# Depth of the related-video crawl. By default only the videos on the current page
# are downloaded (no related videos), plus the preview clips of the videos
# recommended on that page.
# Note: every extra level of depth increases the number of downloaded videos
# exponentially, and each link is only valid for a limited time, so it is best
# not to go beyond a depth of 4.
DEPTH_LIMIT = 1

# Videos in the following categories/tags are not downloaded (not implemented yet)
CATEGORY_BLACK_LIST = [
    # 'Big Dick',
    # 'Cumshot',
]

TAGS_BLACK_LIST = [
]

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 8

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Keep the log level at INFO to avoid a flood of useless output
LOG_LEVEL = 'INFO'
# Uncomment to write the log to a file instead
# LOG_FILE = "./scrapy-%s.log" % datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'pornhub.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'pornhub.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'pornhub.pipelines.pornhubFilesPipeline': 1,
}

CURRENT_DIR = os.path.abspath(os.curdir)
FILES_STORE = os.path.join(CURRENT_DIR, "data")
DATA_FILE_STORE = os.path.join(CURRENT_DIR, "data", "data.json")

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
RETRY_TIMES = 10

# Download timeout: a download that takes longer than this is considered failed
DOWNLOAD_TIMEOUT = 10000
DOWNLOAD_MAXSIZE = 0
DOWNLOAD_WARNSIZE = 0
--------------------------------------------------------------------------------