├── .vs ├── ProjectSettings.json ├── VSWorkspaceState.json ├── pornhubbot │ └── v15 │ │ └── .suo └── slnx.sqlite ├── LICENSE ├── PornHub ├── PornHub │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ ├── middlewares.cpython-36.pyc │ │ ├── pipelines.cpython-36.pyc │ │ ├── pornhub_type.cpython-36.pyc │ │ ├── settings.cpython-36.pyc │ │ └── user_agents.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── pornhub_type.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── pornHubSpider.cpython-36.pyc │ │ └── pornHubSpider.py │ └── user_agents.py ├── cataline.log ├── quickstart.py └── scrapy.cfg ├── README.md ├── README_zh.md ├── img ├── PornHubCode.png ├── WebHubCode.png ├── WebHubCode2.png ├── contribute.png ├── fukuan.png ├── gongzhonghao.png ├── mongodb.png ├── qrcode.jpg ├── running.png └── xiaomiquan.png └── requirements.txt /.vs/ProjectSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "CurrentProjectSetting": null 3 | } -------------------------------------------------------------------------------- /.vs/VSWorkspaceState.json: -------------------------------------------------------------------------------- 1 | { 2 | "ExpandedNodes": [ 3 | "", 4 | "\\PornHub", 5 | "\\PornHub\\PornHub", 6 | "\\PornHub\\PornHub\\spiders" 7 | ], 8 | "SelectedNode": "\\PornHub\\PornHub\\spiders\\pornHubSpider.py", 9 | "PreviewInSolutionExplorer": false 10 | } -------------------------------------------------------------------------------- /.vs/pornhubbot/v15/.suo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/.vs/pornhubbot/v15/.suo -------------------------------------------------------------------------------- /.vs/slnx.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/.vs/slnx.sqlite -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 xiyouMc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /PornHub/PornHub/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__init__.py -------------------------------------------------------------------------------- /PornHub/PornHub/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/__pycache__/middlewares.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/middlewares.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/__pycache__/pornhub_type.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/pornhub_type.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/__pycache__/user_agents.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/user_agents.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from scrapy import Item, Field 4 | 5 | 6 | class PornVideoItem(Item): 7 | video_title = Field() 8 | image_url = Field() 9 | video_duration = Field() 10 | quality_480p = Field() 11 | video_views = Field() 12 | video_rating = Field() 13 | link_url = Field() 14 | -------------------------------------------------------------------------------- /PornHub/PornHub/middlewares.py: -------------------------------------------------------------------------------- 1 | 
# encoding=utf-8 2 | import random 3 | from PornHub.user_agents import agents 4 | import json 5 | 6 | 7 | class UserAgentMiddleware(object): 8 | """ Attach a randomly chosen User-Agent to every request """ 9 | 10 | def process_request(self, request, spider): 11 | agent = random.choice(agents) 12 | request.headers["User-Agent"] = agent 13 | 14 | 15 | class CookiesMiddleware(object): 16 | """ Attach a freshly randomized cookie to every request """ 17 | cookie = { 18 | 'platform': 'pc', 19 | 'ss': '367701188698225489', 20 | 'bs': '%s', 21 | 'RNLBSERVERID': 'ded6699', 22 | 'FastPopSessionRequestNumber': '1', 23 | 'FPSRN': '1', 24 | 'performance_timing': 'home', 25 | 'RNKEY': '40859743*68067497:1190152786:3363277230:1' 26 | } 27 | 28 | def process_request(self, request, spider): 29 | bs = '' 30 | for i in range(32): 31 | bs += chr(random.randint(97, 122)) 32 | _cookie = json.dumps(self.cookie) % bs 33 | request.cookies = json.loads(_cookie) 34 | 35 | 36 | class RandomUserAgent(object): 37 | def __init__(self, agents): 38 | self.agents = agents 39 | 40 | @classmethod 41 | def from_crawler(cls, crawler): 42 | return cls(crawler.settings.getlist('USER_AGENTS')) 43 | 44 | def process_request(self, request, spider): 45 | request.headers.setdefault('User-Agent', random.choice(self.agents)) -------------------------------------------------------------------------------- /PornHub/PornHub/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | from pymongo import IndexModel, ASCENDING 10 | from PornHub.items import PornVideoItem 11 | from scrapy.pipelines.images import ImagesPipeline 12 | from scrapy import Request 13 | import json 14 | from scrapy.exceptions import DropItem 15 | 16 | class PornhubMongoDBPipeline(object): 17 | def __init__(self): 18 | client = pymongo.MongoClient("localhost", 27017) 19 | db = client["PornHub"] 20 | self.PhRes = db["PhRes"] 21 | idx = IndexModel([('link_url', ASCENDING)], unique=True) 22 | self.PhRes.create_indexes([idx]) 23 | # if your existing DB has duplicate records, refer to: 24 | # https://stackoverflow.com/questions/35707496/remove-duplicate-in-mongodb/35711737 25 | 26 | def process_item(self, item, spider): 27 | """ Check the item type and store it in MongoDB """ 28 | print('MongoDBItem', item) 29 | if isinstance(item, PornVideoItem): 30 | print('PornVideoItem True') 31 | try: 32 | self.PhRes.update_one({'link_url': item['link_url']}, {'$set': dict(item)}, upsert=True) 33 | except Exception: 34 | pass 35 | return item 36 | 37 | 38 | class ImageCachePipeline(ImagesPipeline): 39 | def get_media_requests(self, item, info): 40 | pics = item['pics']  # expects a JSON-encoded list of image URLs 41 | image_urls = json.loads(pics) 42 | for image_url in image_urls: 43 | yield Request(image_url) 44 | 45 | def item_completed(self, results, item, info): 46 | image_paths = [x['path'] for ok, x in results if ok] 47 | if not image_paths: 48 | print("Image download failed: %s" % image_paths) 49 | raise DropItem('Image download failed: %s' % image_paths) 50 | return item -------------------------------------------------------------------------------- /PornHub/PornHub/pornhub_type.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | """PornHub listing pages (categories) to crawl""" 3 | PH_TYPES = [ 4 | '', 5 | 'recommended', 6 | 'video?o=ht', # Hottest 7 | 'video?o=mv', # Most Viewed 8 | 'video?o=tr' # Top Rated 9 | ] -------------------------------------------------------------------------------- /PornHub/PornHub/settings.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for pornhub project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'PornHub' 13 | 14 | SPIDER_MODULES = ['PornHub.spiders'] 15 | NEWSPIDER_MODULE = 'PornHub.spiders' 16 | 17 | # Local directory and expiry (in days) for downloaded cover images 18 | IMAGES_STORE = '/Users/payu/Pictures/Meizi' 19 | IMAGES_EXPIRES = 90 20 | 21 | DOWNLOAD_DELAY = 1 # delay between requests, in seconds 22 | # LOG_LEVEL = 'INFO' # log level 23 | CONCURRENT_REQUESTS = 20 # Scrapy's default is 16 24 | # CONCURRENT_ITEMS = 1 25 | # CONCURRENT_REQUESTS_PER_IP = 1 26 | REDIRECT_ENABLED = False 27 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 28 | #USER_AGENT = 'pornhub (+http://www.yourdomain.com)' 29 | 30 | # Obey robots.txt rules 31 | ROBOTSTXT_OBEY = True 32 | 33 | DOWNLOADER_MIDDLEWARES = { 34 | # 'PornHub.middlewares.RandomUserAgent': 1, 35 | "PornHub.middlewares.UserAgentMiddleware": 401, 36 | "PornHub.middlewares.CookiesMiddleware": 402, 37 | } 38 | 39 | ITEM_PIPELINES = { 40 | "PornHub.pipelines.PornhubMongoDBPipeline": 403, 41 | "PornHub.pipelines.ImageCachePipeline": 500, 42 | } 43 | # Export a CSV copy of the scraped items (adjust the path to your own machine) 44 | FEED_URI = u'C:/Users/payu/Documents/pornhub.csv' 45 | FEED_FORMAT = 'csv' 46 | 47 | DEPTH_PRIORITY = 1 48 | SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue' 49 | SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue' 50 | -------------------------------------------------------------------------------- /PornHub/PornHub/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders.
5 | -------------------------------------------------------------------------------- /PornHub/PornHub/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/spiders/__pycache__/pornHubSpider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/spiders/__pycache__/pornHubSpider.cpython-36.pyc -------------------------------------------------------------------------------- /PornHub/PornHub/spiders/pornHubSpider.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import requests 3 | import logging 4 | from scrapy.spiders import CrawlSpider 5 | from scrapy.selector import Selector 6 | from PornHub.items import PornVideoItem 7 | from PornHub.pornhub_type import PH_TYPES 8 | from scrapy.http import Request 9 | import re 10 | import json 11 | import random 12 | 13 | 14 | class Spider(CrawlSpider): 15 | name = 'pornHubSpider' 16 | host = 'https://www.pornhub.com' 17 | start_urls = list(set(PH_TYPES)) 18 | # silence the requests library's own logging below WARNING 19 | logging.getLogger("requests").setLevel(logging.WARNING) 20 | logging.basicConfig( 21 | level=logging.DEBUG, 22 | format='%(asctime)s %(filename)s[line:%(lineno)d] ' 23 | '%(levelname)s %(message)s', 24 | datefmt='%a, %d %b %Y %H:%M:%S', 25 | filename='cataline.log', 26 | filemode='w') 27 | 28 | # test = True 29 | def start_requests(self): 30 | for ph_type in self.start_urls: 31 | yield Request(url='https://www.pornhub.com/%s' % ph_type, 32 | callback=self.parse_ph_key) 33 | 34 | def parse_ph_key(self, response): 35 | selector = Selector(response) 36 | logging.debug('request url:------>' + response.url) 37 | # logging.info(selector) 38 | divs = selector.xpath('//div[@class="phimage"]') 39 | for div in divs: 40 | # logging.debug('divs :------>' + div.extract()) 41 | 42 | viewkey = re.findall('viewkey=(.*?)"', div.extract()) 43 | if not viewkey: continue  # skip thumbnails that carry no viewkey 44 | yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0], 45 | callback=self.parse_ph_info) 46 | url_next = selector.xpath( 47 | '//a[@class="orangeButton" and text()="Next "]/@href').extract() 48 | logging.debug(url_next) 49 | if url_next: 50 | # if self.test: 51 | logging.debug(' next page:---------->' + self.host + url_next[0]) 52 | yield Request(url=self.host + url_next[0], callback=self.parse_ph_key) 53 | # self.test = False 54 | 55 | def parse_ph_info(self, response): 56 | phItem = PornVideoItem() 57 | selector = Selector(response) 58 | # logging.info(selector) 59 | _ph_info = re.findall('var flashvars =(.*?),\n', selector.extract()) 60 | logging.debug('flashvars JSON:') 61 | logging.debug(_ph_info) 62 | _ph_info_json = json.loads(_ph_info[0]) 63 | duration = _ph_info_json.get('video_duration') 64 | phItem['video_duration'] = duration 65 | title = _ph_info_json.get('video_title') 66 | phItem['video_title'] = title 67 | image_url = _ph_info_json.get('image_url') 68 | phItem['image_url'] = image_url 69 | link_url = _ph_info_json.get('link_url') 70 | phItem['link_url'] = link_url 71 | quality_480p = _ph_info_json.get('quality_480p') 72 | phItem['quality_480p'] = quality_480p 73 |
logging.info('duration:' + duration + ' title:' + title + ' image_url:' + image_url + ' link_url:' + link_url) 74 | yield phItem 75 | -------------------------------------------------------------------------------- /PornHub/PornHub/user_agents.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | agents = [ 4 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 5 | "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)", 6 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5", 7 | "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9", 8 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7", 9 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14", 10 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14", 11 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20", 12 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27", 13 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1", 14 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2", 15 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7", 16 | "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre", 17 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10", 18 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)", 19 | "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5", 20 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)", 21 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 22 | "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 23 | "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0", 24 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2", 25 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1", 26 | "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre", 27 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )", 28 | "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)", 29 | "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a", 30 | "Mozilla/2.02E (Win95; U)", 31 | "Mozilla/3.01Gold (Win95; I)", 32 | "Mozilla/4.8 [en] (Windows NT 5.1; U)", 33 | "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)", 34 | "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 35 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0", 36 | "Mozilla/5.0 (Linux; U; Android 
1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 37 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 38 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 39 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 40 | "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 41 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 42 | "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 43 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 44 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 45 | "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3", 46 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 47 | "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 48 | "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1", 49 | "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 50 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 51 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 52 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 53 | "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 54 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 55 | "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3", 56 | "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 57 | "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 58 | "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17", 59 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 60 | "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 
Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 61 | "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 62 | "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 63 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2", 64 | "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 65 | "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1", 66 | ] 67 | -------------------------------------------------------------------------------- /PornHub/cataline.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/cataline.log -------------------------------------------------------------------------------- /PornHub/quickstart.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute("scrapy crawl pornHubSpider".split()) 4 | -------------------------------------------------------------------------------- /PornHub/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = PornHub.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = PornHub 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![][py2x] [![GitHub forks][forks]][network] [![GitHub stars][stars]][stargazers] [![GitHub license][license]][lic_file] 3 | > Disclaimer: This project is intended for studying the Scrapy spider framework and the MongoDB database. It must not be used for commercial or other personal purposes; if it is used improperly, the individual user bears all responsibility. 4 | 5 | * The project crawls PornHub, the largest adult site in the world, and retrieves video titles, durations, mp4 links, cover URLs and direct PornHub URLs. 6 | * This project crawls PornHub.com quickly and has a simple structure. 7 | * This project can crawl up to 5 million PornHub videos per day, depending on your network. Because of my slow bandwidth, my own results are relatively slow. 8 | * The crawler runs 10 request threads at a time, which is how it reaches the speed mentioned above. If your network performs better, you can raise the concurrency and crawl more videos per day. For the specific configuration, see [Pre-boot configuration](#pre-boot-configuration). 9 | 10 | 11 | ## Environment, Architecture 12 | 13 | Language: Python2.7 14 | 15 | Environment: MacOS, 4G RAM 16 | 17 | Database: MongoDB 18 | 19 | * Mainly uses the Scrapy crawler framework. 20 | * A User-Agent and a cookie are drawn at random from the UA pool and Cookie pool and attached to each request. 21 | * start_requests launches five Requests based on the PornHub categories, so the five categories are crawled at the same time. 22 | * Supports paginated crawling; follow-up pages are added to the request queue. 23 | 24 | ## Instructions for use 25 | 26 | ### Pre-boot configuration 27 | 28 | * Install MongoDB and start it; no special configuration is needed 29 | * Install the Python dependencies: Scrapy, pymongo, requests, or run `pip install -r requirements.txt` 30 | * Modify the configuration as needed, e.g. the request interval and the number of concurrent requests (see the sketch below)
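The knobs referred to above live in `PornHub/PornHub/settings.py`; the snippet below simply restates the values this repository ships with, so adjust them to your own machine and bandwidth:

```python
# PornHub/PornHub/settings.py (values as shipped; tune to taste)
DOWNLOAD_DELAY = 1        # seconds to wait between requests
CONCURRENT_REQUESTS = 20  # parallel requests; Scrapy's default is 16
IMAGES_STORE = '/Users/payu/Pictures/Meizi'  # local cache for cover images
IMAGES_EXPIRES = 90       # days before a cached image expires
```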
31 | 32 | ### Start up 33 | 34 | * cd PornHub 35 | * python quickstart.py 36 | 37 | 38 | ## Run screenshots 39 | ![](https://github.com/xiyouMc/PornHubBot/blob/master/img/running.png?raw=true) 40 | ![](https://github.com/xiyouMc/PornHubBot/blob/master/img/mongodb.png?raw=true) 41 | 42 | ## Database description 43 | 44 | The MongoDB collection that holds the scraped data is PhRes. The following is a field description: 45 | 46 | #### PhRes collection: 47 | 48 | video_title: The title of the video 49 | link_url: Link back to the video on PornHub; used as the unique index 50 | image_url: URL of the video cover image 51 | video_duration: The length of the video, in seconds 52 | quality_480p: Download URL of the 480p mp4 53 | 54 | ## For Chinese 55 | 56 | * 关注微信公众号,学习Python开发 (Follow the WeChat official account to learn Python development) 57 | 58 | 图片名称 59 | 60 | 61 | 62 | [py2x]: https://img.shields.io/badge/python-2.x-brightgreen.svg 63 | [issues_img]: https://img.shields.io/github/issues/xiyouMc/WebHubBot.svg 64 | [issues]: https://github.com/xiyouMc/WebHubBot/issues 65 | 66 | [forks]: https://img.shields.io/github/forks/xiyouMc/WebHubBot.svg 67 | [network]: https://github.com/xiyouMc/WebHubBot/network 68 | 69 | [stars]: https://img.shields.io/github/stars/xiyouMc/WebHubBot.svg 70 | [stargazers]: https://github.com/xiyouMc/WebHubBot/stargazers 71 | 72 | [license]: https://img.shields.io/badge/license-MIT-blue.svg 73 | [lic_file]: https://raw.githubusercontent.com/xiyouMc/WebHubBot/master/LICENSE 74 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | ![][py2x] [![GitHub forks][forks]][network] [![GitHub stars][stars]][stargazers] [![GitHub license][license]][lic_file] 2 | > 免责声明:本项目旨在学习Scrapy爬虫框架和MongoDB数据库,不可使用于商业和个人其他意图。若使用不当,均由个人承担。 3 | 4 | 图片名称 5 | 6 | 7 | ## 简介 8 | 9 | * 项目主要是爬取全球最大成人网站PornHub的视频标题、时长、mp4链接、封面URL和具体的PornHub链接 10 | * 项目爬的是PornHub.com,结构简单,速度飞快 11 | * 爬取PornHub视频的速度可以达到500万/天以上。具体视个人网络情况,因为我是家庭网络,所以相对慢一点。 12 | * 10个线程同时请求,可达到如上速度。若个人网络环境更好,可启动更多线程来请求,具体配置方法见 [启动前配置] 13 | 14 | 15 | ## 环境、架构 16 | 17 | 开发语言: Python2.7 18 | 19 | 开发环境: MacOS系统、4G内存 20 | 21 | 数据库: MongoDB 22 | 23 | * 主要使用 scrapy 爬虫框架 24 | * 从Cookie池和UA池中随机抽取一个加入到Spider 25 | * start_requests 根据 PornHub 的分类,启动了5个Request,同时对五个分类进行爬取。 26 | * 并支持分页爬取数据,并加入到待爬队列。 27 | 28 | ## 使用说明 29 | 30 | ### 启动前配置 31 | 32 | * 安装MongoDB,并启动,不需要配置 33 | * 安装Python的依赖模块:Scrapy, pymongo, requests 或 `pip install -r requirements.txt` 34 | * 根据自己需要修改 Scrapy 中关于 间隔时间、启动Requests线程数等的配置 35 | 36 | ### 启动 37 | 38 | * python PornHub/quickstart.py 39 | 40 | ## 运行截图 41 | ![](https://github.com/xiyouMc/PornHubBot/blob/master/img/running.png?raw=true) 42 | ![](https://github.com/xiyouMc/PornHubBot/blob/master/img/mongodb.png?raw=true) 43 | 44 | ## 数据库说明 45 | 46 | 数据库中保存数据的表是 PhRes。以下是字段说明: 47 | 48 | #### PhRes 表: 49 | 50 | video_title:视频的标题,并作为唯一标识.
51 | link_url:视频调转到PornHub的链接 52 | image_url:视频的封面链接 53 | video_duration:视频的时长,以 s 为单位 54 | quality_480p: 视频480p的 mp4 下载地址 55 | 56 | 57 | [py2x]: https://img.shields.io/badge/python-2.x-brightgreen.svg 58 | [issues_img]: https://img.shields.io/github/issues/xiyouMc/WebHubBot.svg 59 | [issues]: https://github.com/xiyouMc/WebHubBot/issues 60 | 61 | [forks]: https://img.shields.io/github/forks/xiyouMc/WebHubBot.svg 62 | [network]: https://github.com/xiyouMc/WebHubBot/network 63 | 64 | [stars]: https://img.shields.io/github/stars/xiyouMc/WebHubBot.svg 65 | [stargazers]: https://github.com/xiyouMc/WebHubBot/stargazers 66 | 67 | [license]: https://img.shields.io/badge/license-MIT-blue.svg 68 | [lic_file]: https://raw.githubusercontent.com/xiyouMc/WebHubBot/master/LICENSE 69 | -------------------------------------------------------------------------------- /img/PornHubCode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/PornHubCode.png -------------------------------------------------------------------------------- /img/WebHubCode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/WebHubCode.png -------------------------------------------------------------------------------- /img/WebHubCode2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/WebHubCode2.png -------------------------------------------------------------------------------- /img/contribute.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/contribute.png -------------------------------------------------------------------------------- /img/fukuan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/fukuan.png -------------------------------------------------------------------------------- /img/gongzhonghao.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/gongzhonghao.png -------------------------------------------------------------------------------- /img/mongodb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/mongodb.png -------------------------------------------------------------------------------- /img/qrcode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/qrcode.jpg -------------------------------------------------------------------------------- /img/running.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/running.png -------------------------------------------------------------------------------- /img/xiaomiquan.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/xiaomiquan.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | asn1crypto==0.22.0 # via cryptography 2 | attrs==17.2.0 # via automat, service-identity 3 | Automat==0.6.0 # via twisted 4 | certifi==2017.4.17 # via requests 5 | cffi==1.10.0 # via cryptography 6 | chardet==3.0.4 # via requests 7 | constantly==15.1.0 # via twisted 8 | cryptography==1.9 # via pyopenssl 9 | cssselect==1.0.1 # via parsel, scrapy 10 | enum34==1.1.6 # via cryptography 11 | hyperlink==17.2.1 # via twisted 12 | idna==2.5 # via cryptography, requests 13 | incremental==17.5.0 # via twisted 14 | ipaddress==1.0.18 # via cryptography 15 | lxml==3.8.0 # via parsel, scrapy 16 | parsel==1.2.0 # via scrapy 17 | pyasn1-modules==0.0.9 # via service-identity 18 | pyasn1==0.2.3 # via pyasn1-modules, service-identity 19 | pycparser==2.17 # via cffi 20 | PyDispatcher==2.0.5 # via scrapy 21 | pymongo==3.4.0 22 | pyopenssl==17.0.0 # via scrapy, service-identity 23 | queuelib==1.4.2 # via scrapy 24 | requests==2.18.1 25 | Scrapy==1.4.0 26 | service-identity==17.0.0 # via scrapy 27 | six==1.10.0 # via automat, cryptography, parsel, pyopenssl, scrapy, w3lib 28 | Twisted==17.5.0 # via scrapy 29 | urllib3==1.21.1 # via requests 30 | w3lib==1.17.0 # via parsel, scrapy 31 | zope.interface==4.4.2 # via twisted 32 | --------------------------------------------------------------------------------
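Once the spider has run, the data described in the "Database description" section of README.md can be read back with pymongo. The sketch below is a hypothetical helper script, not a file in this repository; the database, collection and field names are taken from `PornHub/PornHub/pipelines.py` and `items.py`:

```python
# read_phres.py -- hypothetical helper for browsing the scraped data
import pymongo

# same connection and names as PornhubMongoDBPipeline in pipelines.py
client = pymongo.MongoClient("localhost", 27017)
phres = client["PornHub"]["PhRes"]

# print a few stored records; the fields are defined in items.py
for doc in phres.find().limit(5):
    print(doc.get("video_title"), doc.get("video_duration"), doc.get("quality_480p"))
```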