├── .vs
├── ProjectSettings.json
├── VSWorkspaceState.json
├── pornhubbot
│ └── v15
│ │ └── .suo
└── slnx.sqlite
├── LICENSE
├── PornHub
├── PornHub
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-36.pyc
│ │ ├── items.cpython-36.pyc
│ │ ├── middlewares.cpython-36.pyc
│ │ ├── pipelines.cpython-36.pyc
│ │ ├── pornhub_type.cpython-36.pyc
│ │ ├── settings.cpython-36.pyc
│ │ └── user_agents.cpython-36.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── pornhub_type.py
│ ├── settings.py
│ ├── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-36.pyc
│ │ │ └── pornHubSpider.cpython-36.pyc
│ │ └── pornHubSpider.py
│ └── user_agents.py
├── cataline.log
├── quickstart.py
└── scrapy.cfg
├── README.md
├── README_zh.md
├── img
├── PornHubCode.png
├── WebHubCode.png
├── WebHubCode2.png
├── contribute.png
├── fukuan.png
├── gongzhonghao.png
├── mongodb.png
├── qrcode.jpg
├── running.png
└── xiaomiquan.png
└── requirements.txt
/.vs/ProjectSettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "CurrentProjectSetting": null
3 | }
--------------------------------------------------------------------------------
/.vs/VSWorkspaceState.json:
--------------------------------------------------------------------------------
1 | {
2 | "ExpandedNodes": [
3 | "",
4 | "\\PornHub",
5 | "\\PornHub\\PornHub",
6 | "\\PornHub\\PornHub\\spiders"
7 | ],
8 | "SelectedNode": "\\PornHub\\PornHub\\spiders\\pornHubSpider.py",
9 | "PreviewInSolutionExplorer": false
10 | }
--------------------------------------------------------------------------------
/.vs/pornhubbot/v15/.suo:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/.vs/pornhubbot/v15/.suo
--------------------------------------------------------------------------------
/.vs/slnx.sqlite:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/.vs/slnx.sqlite
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2017 xiyouMc
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/PornHub/PornHub/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__init__.py
--------------------------------------------------------------------------------
/PornHub/PornHub/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/items.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/__pycache__/pornhub_type.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/pornhub_type.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/__pycache__/user_agents.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/__pycache__/user_agents.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from scrapy import Item, Field
4 |
5 |
class PornVideoItem(Item):
    """Container for one scraped PornHub video record.

    Filled by the spider from the embed page's ``flashvars`` JSON and
    persisted to MongoDB by PornhubMongoDBPipeline (which unique-indexes
    ``link_url``).
    """
    # Title of the video.
    video_title = Field()
    # URL of the video's cover/thumbnail image.
    image_url = Field()
    # Length of the video (in seconds, per the README).
    video_duration = Field()
    # Direct download URL of the 480p MP4 rendition.
    quality_480p = Field()
    # View count as reported by the site (not populated by the current spider).
    video_views = Field()
    # Rating as reported by the site (not populated by the current spider).
    video_rating = Field()
    # Canonical PornHub URL of the video; used as the unique key in MongoDB.
    link_url = Field()
--------------------------------------------------------------------------------
/PornHub/PornHub/middlewares.py:
--------------------------------------------------------------------------------
1 | # encoding=utf-8
2 | import random
3 | from PornHub.user_agents import agents
4 | import json
5 |
6 |
class UserAgentMiddleware(object):
    """Downloader middleware that rotates the User-Agent header.

    Overwrites the User-Agent of every outgoing request with a string
    drawn at random from the shared ``agents`` pool.
    """

    def process_request(self, request, spider):
        # One random pick per request keeps the crawler's fingerprint varied.
        request.headers["User-Agent"] = random.choice(agents)
14 |
class CookiesMiddleware(object):
    """Downloader middleware that attaches a fresh cookie set to each request.

    The site expects a per-session ``bs`` cookie; it is regenerated as a
    random 32-character lowercase string for every request so sessions look
    distinct.
    """

    # Template cookie values; 'bs' is a placeholder filled in per request.
    cookie = {
        'platform': 'pc',
        'ss': '367701188698225489',
        'bs': '%s',
        'RNLBSERVERID': 'ded6699',
        'FastPopSessionRequestNumber': '1',
        'FPSRN': '1',
        'performance_timing': 'home',
        'RNKEY': '40859743*68067497:1190152786:3363277230:1'
    }

    def process_request(self, request, spider):
        # 32 random lowercase ASCII letters for the per-session 'bs' cookie.
        bs = ''.join(chr(random.randint(97, 122)) for _ in range(32))
        # Build the cookie dict directly instead of the original
        # json.dumps(self.cookie) % bs round-trip, which would break if any
        # cookie value ever contained a stray '%' format specifier.
        cookies = dict(self.cookie)
        cookies['bs'] = bs
        request.cookies = cookies
35 |
class RandomUserAgent(object):
    """Alternative UA-rotation middleware driven by the USER_AGENTS setting."""

    def __init__(self, agents):
        # Pool of User-Agent strings to choose from.
        self.agents = agents

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy hook: construct the middleware from the crawler's settings.
        pool = crawler.settings.getlist('USER_AGENTS')
        return cls(pool)

    def process_request(self, request, spider):
        # setdefault: only fill in the header when none is present yet.
        chosen = random.choice(self.agents)
        request.headers.setdefault('User-Agent', chosen)
--------------------------------------------------------------------------------
/PornHub/PornHub/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
import json
import logging

import pymongo
from pymongo import IndexModel, ASCENDING
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

from PornHub.items import PornVideoItem
14 |
15 |
class PornhubMongoDBPipeline(object):
    """Pipeline that upserts PornVideoItem records into MongoDB.

    Writes to the ``PhRes`` collection of the local ``PornHub`` database and
    keeps a unique index on ``link_url`` so a re-crawled video updates its
    existing document instead of creating a duplicate.
    """

    def __init__(self):
        # NOTE(review): host/port are hard-coded; consider moving them to
        # settings.py if the deployment target ever changes.
        client = pymongo.MongoClient("localhost", 27017)  # was misspelled 'clinet'
        db = client["PornHub"]
        self.PhRes = db["PhRes"]
        idx = IndexModel([('link_url', ASCENDING)], unique=True)
        self.PhRes.create_indexes([idx])
        # If your existing DB has duplicate records, refer to:
        # https://stackoverflow.com/questions/35707496/remove-duplicate-in-mongodb/35711737

    def process_item(self, item, spider):
        """Upsert video items keyed by link_url; pass every item through."""
        if isinstance(item, PornVideoItem):
            try:
                self.PhRes.update_one(
                    {'link_url': item['link_url']},
                    {'$set': dict(item)},
                    upsert=True)
            except Exception:
                # Previously a bare `pass`: keep the best-effort semantics
                # (never abort the crawl on a DB hiccup) but leave a trace so
                # write failures are no longer invisible.
                logging.exception('MongoDB upsert failed for %s',
                                  item.get('link_url'))
        return item
36 |
37 |
class ImageCachePipeline(ImagesPipeline):
    """Downloads the images listed in an item's 'pics' field.

    NOTE(review): 'pics' is not a declared field of PornVideoItem (which
    exposes a single `image_url`); this pipeline looks carried over from
    another project — confirm which field the cover URLs actually live in.
    """

    def get_media_requests(self, item, info):
        # 'pics' is expected to hold a JSON-encoded list of image URLs.
        urls = json.loads(item['pics'])  # renamed: no longer shadows builtin `list`
        for image_url in urls:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        # Keep only the paths of successfully downloaded files.
        image_paths = [res['path'] for ok, res in results if ok]
        if not image_paths:
            # DropItem was used without being imported (NameError at runtime);
            # it is now imported at the top of the file.
            raise DropItem('图片未下载好 %s' % image_paths)
        # Scrapy expects pipelines to return the item for later stages;
        # the original fell off the end and returned None.
        return item
--------------------------------------------------------------------------------
/PornHub/PornHub/pornhub_type.py:
--------------------------------------------------------------------------------
#coding:utf-8
"""Catalogue of PornHub listing-page URL paths used as crawl entry points."""
PH_TYPES = [
    '',             # front page
    'recommended',  # recommended videos
    'video?o=ht',   # hot
    'video?o=mv',   # most viewed
    'video?o=tr'    # top rated
]
--------------------------------------------------------------------------------
/PornHub/PornHub/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for pornhub project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'PornHub'

SPIDER_MODULES = ['PornHub.spiders']
NEWSPIDER_MODULE = 'PornHub.spiders'

# Local directory where downloaded images are stored, and their expiry (days).
# NOTE(review): this is a macOS path while FEED_URI below is a Windows path —
# confirm which platform the project actually targets.
IMAGES_STORE='/Users/payu/Pictures/Meizi'
IMAGES_EXPIRES = 90

DOWNLOAD_DELAY = 1  # delay between requests, in seconds
# LOG_LEVEL = 'INFO'  # log level
CONCURRENT_REQUESTS = 20  # Scrapy's default is 16
# CONCURRENT_ITEMS = 1
# CONCURRENT_REQUESTS_PER_IP = 1
REDIRECT_ENABLED = False
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'pornhub (+http://www.yourdomain.com)'

# Obey robots.txt rules
# NOTE(review): with this enabled, the target site's robots.txt may disallow
# the listing pages this spider crawls — confirm this setting is intended.
ROBOTSTXT_OBEY = True

DOWNLOADER_MIDDLEWARES = {
    # 'PornHub.middlewares.RandomUserAgent': 1,
    "PornHub.middlewares.UserAgentMiddleware": 401,
    "PornHub.middlewares.CookiesMiddleware": 402,
}

ITEM_PIPELINES = {
    "PornHub.pipelines.PornhubMongoDBPipeline": 403,
    "PornHub.pipelines.ImageCachePipeline": 500,
}

# CSV feed export destination (see platform note on IMAGES_STORE above).
FEED_URI=u'C:/Users/payu/Documents/pornhub.csv'
FEED_FORMAT='CSV'

# Positive DEPTH_PRIORITY plus FIFO queues => breadth-first crawl order.
DEPTH_PRIORITY = 1
SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
--------------------------------------------------------------------------------
/PornHub/PornHub/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/PornHub/PornHub/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/spiders/__pycache__/pornHubSpider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/PornHub/spiders/__pycache__/pornHubSpider.cpython-36.pyc
--------------------------------------------------------------------------------
/PornHub/PornHub/spiders/pornHubSpider.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import requests
3 | import logging
4 | from scrapy.spiders import CrawlSpider
5 | from scrapy.selector import Selector
6 | from PornHub.items import PornVideoItem
7 | from PornHub.pornhub_type import PH_TYPES
8 | from scrapy.http import Request
9 | import re
10 | import json
11 | import random
12 |
13 |
class Spider(CrawlSpider):
    """Spider that walks PornHub listing pages and yields PornVideoItem.

    Crawl flow: start_requests fans out over the PH_TYPES listing paths,
    parse_ph_key extracts each video's `viewkey` (following "Next"
    pagination), and parse_ph_info parses the embed page's `flashvars`
    JSON into items.
    """
    name = 'pornHubSpider'
    host = 'https://www.pornhub.com'
    start_urls = list(set(PH_TYPES))
    # Silence the very chatty `requests` logger.
    logging.getLogger("requests").setLevel(logging.WARNING)
    # Configured at class-definition time so all module logging lands in
    # cataline.log (truncated on every run via filemode='w').
    logging.basicConfig(
        level=logging.DEBUG,
        format=
        '%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S',
        filename='cataline.log',
        filemode='w')

    def start_requests(self):
        """Issue one listing-page request per category in PH_TYPES."""
        for ph_type in self.start_urls:
            yield Request(url='https://www.pornhub.com/%s' % ph_type,
                          callback=self.parse_ph_key)

    def parse_ph_key(self, response):
        """Extract video viewkeys from a listing page and follow pagination."""
        selector = Selector(response)
        logging.debug('request url:------>' + response.url)
        divs = selector.xpath('//div[@class="phimage"]')
        for div in divs:
            viewkey = re.findall('viewkey=(.*?)"', div.extract())
            if not viewkey:
                # Thumbnail markup without a viewkey link: skip it instead of
                # crashing with IndexError as the original did.
                continue
            yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],
                          callback=self.parse_ph_info)
        url_next = selector.xpath(
            '//a[@class="orangeButton" and text()="Next "]/@href').extract()
        logging.debug(url_next)
        if url_next:
            logging.debug(' next page:---------->' + self.host + url_next[0])
            yield Request(url=self.host + url_next[0],
                          callback=self.parse_ph_key)

    def parse_ph_info(self, response):
        """Parse the embed page's `var flashvars = {...}` JSON into an item."""
        selector = Selector(response)
        _ph_info = re.findall('var flashvars =(.*?),\n', selector.extract())
        logging.debug('PH信息的JSON:')
        logging.debug(_ph_info)
        if not _ph_info:
            # Page layout changed or the embed was blocked; nothing to yield
            # (the original raised IndexError here).
            logging.warning('no flashvars found in %s', response.url)
            return
        _ph_info_json = json.loads(_ph_info[0])
        phItem = PornVideoItem()
        phItem['video_duration'] = _ph_info_json.get('video_duration')
        phItem['video_title'] = _ph_info_json.get('video_title')
        phItem['image_url'] = _ph_info_json.get('image_url')
        phItem['link_url'] = _ph_info_json.get('link_url')
        phItem['quality_480p'] = _ph_info_json.get('quality_480p')
        # %s-formatting instead of '+' concatenation: the original raised
        # TypeError whenever a field was missing (None) or non-string.
        logging.info('duration:%s title:%s image_url:%s link_url:%s',
                     phItem['video_duration'], phItem['video_title'],
                     phItem['image_url'], phItem['link_url'])
        yield phItem
--------------------------------------------------------------------------------
/PornHub/PornHub/user_agents.py:
--------------------------------------------------------------------------------
# encoding=utf-8

# Pool of User-Agent strings (desktop browsers plus many Android devices)
# used by the UA-rotation middlewares; one entry is picked at random per
# request. The list contains some duplicate entries (e.g. the Droid and
# Nexus One strings), which merely skews selection probability slightly.
agents = [
    "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Avant Browser/1.2.789rel1 (http://www.avantbrowser.com)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
    "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
    "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
    "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
    "Mozilla/2.02E (Win95; U)",
    "Mozilla/3.01Gold (Win95; I)",
    "Mozilla/4.8 [en] (Windows NT 5.1; U)",
    "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
    "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522 (KHTML, like Gecko) Safari/419.3",
    "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10 (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
    "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
    "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5 (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
]
--------------------------------------------------------------------------------
/PornHub/cataline.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/PornHub/cataline.log
--------------------------------------------------------------------------------
/PornHub/quickstart.py:
--------------------------------------------------------------------------------
from scrapy import cmdline

# Launch the spider exactly as the shell command `scrapy crawl pornHubSpider`
# would; the argv list below is identical to splitting that command string.
cmdline.execute(['scrapy', 'crawl', 'pornHubSpider'])
--------------------------------------------------------------------------------
/PornHub/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = PornHub.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = PornHub
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ![][py2x] [![GitHub forks][forks]][network] [![GitHub stars][stars]][stargazers] [![GitHub license][license]][lic_file]
3 | > Disclaimer: This project is intended for studying the Scrapy spider framework and the MongoDB database; it must not be used for commercial or other personal purposes. Any consequences of improper use are borne by the individual.
4 |
5 | * The project is mainly used for crawling PornHub, the largest adult site in the world. In doing so it retrieves video titles, duration, mp4 link, cover url and direct PornHub url.
6 | * This project crawls PornHub.com quickly, but with a simple structure.
7 | * This project can crawl up to 5 million PornHub videos per day, depending on your personal network. Because of my slow bandwidth my results are relatively slow.
8 | * The crawler requests 10 threads at a time, and because of this can achieve the speed mentioned above. If your network is more performant you can request more threads and crawl a larger amount of videos per day. For the specific configuration see [pre-boot configuration]
9 |
10 |
11 | ## Environment, Architecture
12 |
13 | Language: Python2.7
14 |
15 | Environment: MacOS, 4G RAM
16 |
17 | Database: MongoDB
18 |
19 | * Mainly uses the Scrapy crawler framework.
20 | * A cookie and a User-Agent are drawn at random from the cookie pool and UA pool and attached to each Spider request.
21 | * start_requests issues five Requests based on PornHub's categories, crawling the five categories at the same time.
22 | * Supports paginated crawling, adding follow-up pages to the crawl queue.
23 |
24 | ## Instructions for use
25 |
26 | ### Pre-boot configuration
27 |
28 | * Install MongoDB and start without configuration
29 | * Install Python dependent modules:Scrapy, pymongo, requests or `pip install -r requirements.txt`
30 | * Modify the configuration as needed, such as the interval time, the number of threads, etc.
31 |
32 | ### Start up
33 |
34 | * cd PornHub
35 | * python quickstart.py
36 |
37 |
38 | ## Run screenshots
39 | 
40 | 
41 |
42 | ## Database description
43 |
44 | The table in the database that holds the data is PhRes. The following is a field description:
45 |
46 | #### PhRes table:
47 |
48 | video_title: The title of the video.
49 | link_url: Direct URL of the video on PornHub, used as the unique key.
50 | image_url: Video cover link
51 | video_duration: The length of the video, in seconds
52 | quality_480p: Video 480p mp4 download address
53 |
54 | ## For Chinese
55 |
56 | * 关注微信公众号,学习Python开发
57 |
58 |
59 |
60 |
61 |
62 | [py2x]: https://img.shields.io/badge/python-2.x-brightgreen.svg
63 | [issues_img]: https://img.shields.io/github/issues/xiyouMc/WebHubBot.svg
64 | [issues]: https://github.com/xiyouMc/WebHubBot/issues
65 |
66 | [forks]: https://img.shields.io/github/forks/xiyouMc/WebHubBot.svg
67 | [network]: https://github.com/xiyouMc/WebHubBot/network
68 |
69 | [stars]: https://img.shields.io/github/stars/xiyouMc/WebHubBot.svg
70 | [stargazers]: https://github.com/xiyouMc/WebHubBot/stargazers
71 |
72 | [license]: https://img.shields.io/badge/license-MIT-blue.svg
73 | [lic_file]: https://raw.githubusercontent.com/xiyouMc/WebHubBot/master/LICENSE
74 |
--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
1 | ![][py2x] [![GitHub forks][forks]][network] [![GitHub stars][stars]][stargazers] [![GitHub license][license]][lic_file]
2 | > 免责声明:本项目旨在学习Scrapy爬虫框架和MongoDB数据库,不可使用于商业和个人其他意图。若使用不当,均由个人承担。
3 |
4 |
5 |
6 |
7 | ## 简介
8 |
9 | * 项目主要是爬取全球最大成人网站PornHub的视频标题、时长、mp4链接、封面URL和具体的PornHub链接
10 | * 项目爬的是PornHub.com,结构简单,速度飞快
11 | * 爬取PornHub视频的速度可以达到500万/天以上。具体视个人网络情况,因为我是家庭网络,所以相对慢一点。
12 | * 10个线程同时请求,可达到如上速度。若个人网络环境更好,可启动更多线程来请求,具体配置方法见 [启动前配置]
13 |
14 |
15 | ## 环境、架构
16 |
17 | 开发语言: Python2.7
18 |
19 | 开发环境: MacOS系统、4G内存
20 |
21 | 数据库: MongoDB
22 |
23 | * 主要使用 scrapy 爬虫框架
24 | * 从Cookie池和UA池中随机抽取一个加入到Spider
25 | * start_requests 根据 PornHub 的分类,启动了5个Request,同时对五个分类进行爬取。
26 | * 并支持分页爬取数据,并加入到待爬队列。
27 |
28 | ## 使用说明
29 |
30 | ### 启动前配置
31 |
32 | * 安装MongoDB,并启动,不需要配置
33 | * 安装Python的依赖模块:Scrapy, pymongo, requests 或 `pip install -r requirements.txt`
34 | * 根据自己需要修改 Scrapy 中关于 间隔时间、启动Requests线程数等得配置
35 |
36 | ### 启动
37 |
38 | * python PornHub/quickstart.py
39 |
40 | ## 运行截图
41 | 
42 | 
43 |
44 | ## 数据库说明
45 |
46 | 数据库中保存数据的表是 PhRes。以下是字段说明:
47 |
48 | #### PhRes 表:
49 |
50 | video_title:视频的标题,并作为唯一标识.
51 | link_url:视频调转到PornHub的链接
52 | image_url:视频的封面链接
53 | video_duration:视频的时长,以 s 为单位
54 | quality_480p: 视频480p的 mp4 下载地址
55 |
56 |
57 | [py2x]: https://img.shields.io/badge/python-2.x-brightgreen.svg
58 | [issues_img]: https://img.shields.io/github/issues/xiyouMc/WebHubBot.svg
59 | [issues]: https://github.com/xiyouMc/WebHubBot/issues
60 |
61 | [forks]: https://img.shields.io/github/forks/xiyouMc/WebHubBot.svg
62 | [network]: https://github.com/xiyouMc/WebHubBot/network
63 |
64 | [stars]: https://img.shields.io/github/stars/xiyouMc/WebHubBot.svg
65 | [stargazers]: https://github.com/xiyouMc/WebHubBot/stargazers
66 |
67 | [license]: https://img.shields.io/badge/license-MIT-blue.svg
68 | [lic_file]: https://raw.githubusercontent.com/xiyouMc/WebHubBot/master/LICENSE
69 |
--------------------------------------------------------------------------------
/img/PornHubCode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/PornHubCode.png
--------------------------------------------------------------------------------
/img/WebHubCode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/WebHubCode.png
--------------------------------------------------------------------------------
/img/WebHubCode2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/WebHubCode2.png
--------------------------------------------------------------------------------
/img/contribute.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/contribute.png
--------------------------------------------------------------------------------
/img/fukuan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/fukuan.png
--------------------------------------------------------------------------------
/img/gongzhonghao.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/gongzhonghao.png
--------------------------------------------------------------------------------
/img/mongodb.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/mongodb.png
--------------------------------------------------------------------------------
/img/qrcode.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/qrcode.jpg
--------------------------------------------------------------------------------
/img/running.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/running.png
--------------------------------------------------------------------------------
/img/xiaomiquan.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/levphon/pornhubbot/ed7d4f75b4f3f527cb38019aecbafe77f5f5690a/img/xiaomiquan.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | asn1crypto==0.22.0 # via cryptography
2 | attrs==17.2.0 # via automat, service-identity
3 | Automat==0.6.0 # via twisted
4 | certifi==2017.4.17 # via requests
5 | cffi==1.10.0 # via cryptography
6 | chardet==3.0.4 # via requests
7 | constantly==15.1.0 # via twisted
8 | cryptography==1.9 # via pyopenssl
9 | cssselect==1.0.1 # via parsel, scrapy
10 | enum34==1.1.6 # via cryptography
11 | hyperlink==17.2.1 # via twisted
12 | idna==2.5 # via cryptography, requests
13 | incremental==17.5.0 # via twisted
14 | ipaddress==1.0.18 # via cryptography
15 | lxml==3.8.0 # via parsel, scrapy
16 | parsel==1.2.0 # via scrapy
17 | pyasn1-modules==0.0.9 # via service-identity
18 | pyasn1==0.2.3 # via pyasn1-modules, service-identity
19 | pycparser==2.17 # via cffi
20 | PyDispatcher==2.0.5 # via scrapy
21 | pymongo==3.4.0
22 | pyopenssl==17.0.0 # via scrapy, service-identity
23 | queuelib==1.4.2 # via scrapy
24 | requests==2.18.1
25 | Scrapy==1.4.0
26 | service-identity==17.0.0 # via scrapy
27 | six==1.10.0 # via automat, cryptography, parsel, pyopenssl, scrapy, w3lib
28 | Twisted==17.5.0 # via scrapy
29 | urllib3==1.21.1 # via requests
30 | w3lib==1.17.0 # via parsel, scrapy
31 | zope.interface==4.4.2 # via twisted
32 |
--------------------------------------------------------------------------------