├── 32$)MJ2AG$VS5Z]45`5SI)Y.png
├── README.md
├── TGZZ9ICFB$}3TWRKB~_UH57.png
└── pornhub
    ├── pornhub
    │   ├── __pycache__
    │   │   ├── __init__.cpython-36.pyc
    │   │   ├── items.cpython-36.pyc
    │   │   ├── middlewares.cpython-36.pyc
    │   │   ├── pipelines.cpython-36.pyc
    │   │   └── settings.cpython-36.pyc
    │   ├── items.py
    │   ├── middlewares.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       ├── __pycache__
    │       │   ├── __init__.cpython-36.pyc
    │       │   └── phb.cpython-36.pyc
    │       └── phb.py
    └── scrapy.cfg

/32$)MJ2AG$VS5Z]45`5SI)Y.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/32$)MJ2AG$VS5Z]45`5SI)Y.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# pornhub_spider

Crawls every video on Pornhub.

It collects roughly 80,000+ items per day, with an error rate below about 1% (caused by broken pages).

Crawling starts from the all-categories page:
https://www.pornhub.com/categories?o=al

The data on each detail page is easy to locate. The video URL is embedded in a mix of HTML and JSON and can be pulled out with `re`; pick the URL for the resolution you want. The 1080p video requires a login: with several sets of cookies you can read the 1080p URL straight off the page.

The video screenshots are lightly obfuscated inside the JSON data: a pattern ending in `...S{N}.jpg` tells you how many screenshot images exist. Take the number N, cut the image URL at that marker, and rebuild the individual URLs with a list comprehension.

----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------

Results are saved in MongoDB, with one collection per category tag holding that category's videos. Fields:

- tag: category tag
- duration: duration
- title: title
- link_url: detail-page URL
- count: view count
- video_tags: video tags
- percent: like percentage
- img_url: cover image
- video_screenshot_imgs: screenshot images
- video_url: video URL (1080/720/480/240)

----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------

IP: I only have a free VPN on a thin 10-60 KB/s pipe, and the request rate is modest (about one request per second). So far, after a certain volume (6,000-15,000 requests) the IP gets flagged as a crawler.

If a save fails, the item is written to error_save.txt so it can be re-crawled later.

If a page turns out to contain no video URL during the crawl, the page format is probably broken in some way; the page is logged to error_request.txt.

img_url, video_screenshot_imgs and video_url occasionally fail to parse, so they are wrapped in try...except to keep a single bad page from breaking the run.

----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------

For distributed crawling, uncomment the marked sections in settings.py and in the main spider phb.py.

----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------
To download a video, send the request with a User-Agent header and the link_url (as the Referer), otherwise you will get a 403.
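
A minimal download sketch (not part of the spider; it assumes the `requests` package is installed, and the `video_url` / `link_url` values are placeholders you would take from a saved item):

```python
import requests

video_url = '...'  # item['video_url'] from MongoDB
link_url = '...'   # item['link_url'], sent as the Referer

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': link_url,
}

# Stream to disk so large files do not sit in memory
resp = requests.get(video_url, headers=headers, stream=True, timeout=30)
resp.raise_for_status()
with open('video.mp4', 'wb') as f:
    for chunk in resp.iter_content(chunk_size=256 * 1024):
        f.write(chunk)
```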
--------------------------------------------------------------------------------
/TGZZ9ICFB$}3TWRKB~_UH57.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/TGZZ9ICFB$}3TWRKB~_UH57.png
--------------------------------------------------------------------------------
/pornhub/pornhub/spiders/phb.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
import re
from pornhub.items import PornhubItem
from scrapy_redis.spiders import RedisSpider


class PhbSpider(scrapy.Spider):
    name = 'phb'
    allowed_domains = ['pornhub.com']
    start_urls = ['https://www.pornhub.com/categories?o=al']

# Distributed variant: swap it in for the class above and uncomment the
# scrapy-redis block in settings.py
# class PhbSpider(RedisSpider):
#     name = 'phb'
#     allowed_domains = ['pornhub.com']
#     # start_urls = ['https://www.pornhub.com/categories?o=al']
#     redis_key = 'start_urls:phb'

    # All 104 category tags
    def parse(self, response):
        lis = response.xpath('//ul[@id="categoriesListSection"]/li')
        for li in lis:
            href = response.urljoin(li.xpath('./div/h5/a/@href').get())
            tag = li.xpath('./div/h5/a/strong/text()').get()

            yield scrapy.Request(url=href,
                                 callback=self.parse_video,
                                 meta={'item': tag})

    def parse_video(self, response):
        tag = response.meta.get('item')

        # The first entry is an ad, drop it
        lis = response.xpath('//ul[@id="videoCategory"]/li')[1:]
        for li in lis:
            duration = li.xpath('./div//var[@class="duration"]/text()').get()
            href = response.urljoin(li.xpath('.//span[@class="title"]/a/@href').get())

            yield scrapy.Request(url=href,
                                 callback=self.video_content,
                                 meta={'item': (tag, duration)})

        # Next page: check the href before joining, since response.urljoin(None)
        # raises on the last page
        next_href = response.xpath('//li[@class="page_next"]/a/@href').get()
        if next_href:
            yield scrapy.Request(url=response.urljoin(next_href),
                                 callback=self.parse_video,
                                 meta={'item': tag})

    def video_content(self, response):
        tag, duration = response.meta.get('item')

        item = PornhubItem()

        link_url = response.url

        try:
            title = response.xpath('//span[@class="inlineFree"]/text()').get()
        except Exception:
            title = None

        try:
            count = response.xpath('//span[@class="count"]/text()').get()
        except Exception:
            count = None

        try:
            video_tags = ','.join(response.xpath('//div[@class="categoriesWrapper"]/a//text()').getall())
        except Exception:
            video_tags = None

        try:
            percent = response.xpath('//span[@class="percent"]/text()').get()
        except Exception:
            percent = None

        try:
            img_url = response.xpath('//meta[@property="og:image"]/@content').get()
        except Exception:
            img_url = None

        # Screenshot URLs: S{N} inside urlPattern encodes how many screenshots
        # exist. Take N, cut the URL at 'S{', and rebuild every image URL with
        # a comprehension. Breaks on some pages, hence the try...except.
        try:
            video_screenshot_img = re.findall(r'"urlPattern":"(.*?)","thumbHeig', response.text)[0]
            num = int(re.findall(r'S{(\d+)}', video_screenshot_img)[0])
            start_video_img = video_screenshot_img.split('S{')[0]
            # End the range at num + 1 so the last screenshot is included
            video_screenshot_imgs = [start_video_img + 'S{}.jpg'.format(i) for i in range(1, num + 1)]
        except Exception:
            video_screenshot_imgs = None

        # Breaks on some pages.
        # 1080p only shows up on the page when logged in; carry cookies and
        # rewrite the middleware if you need it (see the sketch in middlewares.py).
        # Downloading the video itself needs request headers, otherwise 403.
        # if '"quality":"1080"' in response.text:
        #     video_url = re.findall('"quality":"1080","videoUrl":"(.*?)"},', response.text, re.S | re.I)[0]
        video_url = None  # stays None if no quality matches or parsing fails
        try:
            if '"quality":"720"' in response.text:
                video_url = re.findall(r'"quality":"720","videoUrl":"(.*?)"},', response.text, re.S | re.I)[0]
            elif '"quality":"480"' in response.text:
                video_url = re.findall(r'"quality":"480","videoUrl":"(.*?)"},', response.text, re.S | re.I)[0]
            elif '"quality":"240"' in response.text:
                video_url = re.findall(r'"quality":"240","videoUrl":"(.*?)"},', response.text, re.S | re.I)[0]
        except Exception:
            with open('error_request.txt', 'a') as f:
                f.write('{},{}\n'.format(title, link_url))

        item['tag'] = tag
        item['duration'] = duration
        item['title'] = title
        item['link_url'] = link_url
        item['count'] = count
        item['video_tags'] = video_tags
        item['percent'] = percent
        item['img_url'] = img_url
        item['video_screenshot_imgs'] = video_screenshot_imgs
        item['video_url'] = video_url

        yield item
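

# Seeding sketch for the distributed (RedisSpider) variant above; this is an
# illustration, not part of the original project. It assumes redis-py is
# installed and Redis runs on localhost:6379. Run it once before starting the
# redis-based spiders.
if __name__ == '__main__':
    import redis

    r = redis.StrictRedis(host='localhost', port=6379)
    # Push the start URL under the key the RedisSpider variant reads from
    r.lpush('start_urls:phb', 'https://www.pornhub.com/categories?o=al')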
--------------------------------------------------------------------------------
/pornhub/pornhub/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-


BOT_NAME = 'pornhub'
SPIDER_MODULES = ['pornhub.spiders']
NEWSPIDER_MODULE = 'pornhub.spiders'

ROBOTSTXT_OBEY = False
# Log level
LOG_LEVEL = 'WARNING'
# Shut the spider down after this many seconds
# CLOSESPIDER_TIMEOUT = 86400  # 24 h * 3600 s = 86400 s
# DOWNLOAD_DELAY = 2
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
DOWNLOADER_MIDDLEWARES = {
    'pornhub.middlewares.Pornhub_UA_Middleware': 543,
}
ITEM_PIPELINES = {
    'pornhub.pipelines.PornhubPipeline': 300,
}


MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017

# Use the Scrapy-Redis scheduler
# SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# # Deduplicate via a Redis set
# DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# # Persist the queue so a stopped crawl can resume
# SCHEDULER_PERSIST = True
# # Use the priority queue
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'

# REDIS_HOST = '<your redis host>'
# REDIS_PORT = 6379
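
# Optional throttling sketch, not part of the original settings: the README
# notes the IP gets flagged as a crawler after roughly 6,000-15,000 requests,
# so Scrapy's built-in AutoThrottle extension may help pace requests
# adaptively instead of a fixed DOWNLOAD_DELAY.
# AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_START_DELAY = 1
# AUTOTHROTTLE_MAX_DELAY = 10
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0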
--------------------------------------------------------------------------------
/pornhub/pornhub/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/pornhub/pornhub/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/pornhub/pornhub/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/pornhub/pornhub/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/pornhub/pornhub/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/pornhub/pornhub/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/pornhub/pornhub/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/pornhub/pornhub/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/pornhub/pornhub/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/pornhub/pornhub/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/pornhub/pornhub/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class PornhubItem(scrapy.Item):
    tag = scrapy.Field()                    # category tag
    duration = scrapy.Field()               # duration
    title = scrapy.Field()                  # title
    link_url = scrapy.Field()               # detail-page url
    count = scrapy.Field()                  # view count
    video_tags = scrapy.Field()             # video tags
    percent = scrapy.Field()                # like percentage
    img_url = scrapy.Field()                # cover image
    video_screenshot_imgs = scrapy.Field()  # screenshot images
    video_url = scrapy.Field()              # video url
--------------------------------------------------------------------------------
/pornhub/pornhub/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals
# Scrapy download timeouts raise Twisted's TimeoutError, not the builtin one
from twisted.internet.error import TimeoutError
import user_agent
import requests


class Pornhub_UA_Middleware(object):
    def process_request(self, request, spider):
        # The header name is 'User-Agent' (hyphen), not 'User_Agent'
        request.headers['User-Agent'] = user_agent.generate_user_agent()
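

# Hedged sketch, not part of the original project: the README says the 1080p
# URL only appears for logged-in sessions and suggests carrying cookies via a
# rewritten middleware. Something along these lines could work; COOKIES_LIST
# and its contents are hypothetical placeholders for real session cookies. To
# enable it, register the class in DOWNLOADER_MIDDLEWARES with a priority
# below 700 so it runs before Scrapy's own CookiesMiddleware.
import random

class Pornhub_Cookies_Middleware(object):

    # e.g. [{'<cookie name>': '<session value>'}, ...] from logged-in browsers
    COOKIES_LIST = []

    def process_request(self, request, spider):
        if self.COOKIES_LIST:
            # Rotate between the saved sessions; Scrapy's CookiesMiddleware
            # (which runs later) picks request.cookies up from here
            request.cookies = random.choice(self.COOKIES_LIST)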


# No overseas IP pool, so this runs on the VPN's fixed IP; if you have a pool,
# rotate IPs at random.
# Update: probably thanks to the thin pipe I only send about one request per
# second and the IP has not been banned, but switching IP every half a day is
# still safer.
class Pornhub_IP_Middleware(object):

    def __init__(self):
        self.ip_url = 'http://188.131.212.24:5010/get/'
        self.ip = ''
        self.ip_count = 0

    def process_request(self, request, spider):
        # Fetch a fresh proxy on the first request and then every 10 requests
        if self.ip_count == 0 or self.ip_count == 10:
            res = requests.get(self.ip_url).content.decode()
            if 'no proxy!' not in res:
                self.ip = res
            self.ip_count = 1
        # Only set the proxy if the pool actually returned one
        if self.ip:
            request.meta['proxy'] = 'http://' + self.ip
        self.ip_count += 1
        print('proxy ip >>>', self.ip)

    def process_exception(self, request, exception, spider):
        # Retry the request on download timeouts
        if isinstance(exception, TimeoutError):
            return request
--------------------------------------------------------------------------------
/pornhub/pornhub/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo
from pornhub.items import PornhubItem
from pornhub.settings import *


class PornhubPipeline(object):

    def __init__(self):
        # Connect with the host/port defined in settings.py
        self.client = pymongo.MongoClient(MONGODB_HOST, MONGODB_PORT)
        self.db = self.client[BOT_NAME]

        # Running counter of saved items
        self.count = 1

    def process_item(self, item, spider):
        if isinstance(item, PornhubItem):
            try:
                # One MongoDB collection per category tag
                self.db[item['tag']].insert_one(dict(item))
                print('saved', self.count, item['link_url'])
            except Exception:
                print('mongodb save failed', item['link_url'])
                with open('error_save.txt', 'a') as f:
                    f.write(str(item))
                    f.write('\n')

            self.count += 1
        return item

    def close_spider(self, spider):
        self.client.close()
--------------------------------------------------------------------------------
/pornhub/pornhub/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/pornhub/pornhub/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/pornhub/pornhub/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/pornhub/pornhub/spiders/__pycache__/phb.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/z1421012325/pornhub_spider/449d79f27f4afa36e94dd928f5e94bd41d2a52ef/pornhub/pornhub/spiders/__pycache__/phb.cpython-36.pyc
--------------------------------------------------------------------------------
/pornhub/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = pornhub.settings

[deploy]
#url = http://localhost:6800/
project = pornhub
--------------------------------------------------------------------------------