├── .gitignore
├── .gitmodules
├── chromedriver.exe
├── chromedriver_75.exe
├── client.py
├── crawl_videos.py
├── credentials.sample.yml
├── download_videos.py
├── download_xhamster.py
├── fapello_download.py
├── links.db
├── mega_list_sizes.py
├── readme.md
├── to_download.sample.yml
├── to_download_fapello.sample.yml
└── xhamster.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 | 
8 | # C extensions
9 | *.so
10 | 
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 | 
52 | # Translations
53 | *.mo
54 | *.pot
55 | 
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 | 
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 | 
65 | # Scrapy stuff:
66 | .scrapy
67 | 
68 | # Sphinx documentation
69 | docs/_build/
70 | 
71 | # PyBuilder
72 | target/
73 | 
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 | 
77 | # pyenv
78 | .python-version
79 | 
80 | # celery beat schedule file
81 | celerybeat-schedule
82 | 
83 | # SageMath parsed files
84 | *.sage.py
85 | 
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 | 
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 | 
99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 
108 | /credentials.json
109 | /videos/
110 | /credentials.yml
111 | /to_download.yml
112 | /to_download_fapello.yml
113 | /fapello/
114 | /mega_link.yaml
115 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "a_downloader"]
2 | 	path = a_downloader
3 | 	url = https://github.com/mariosemes/PornHub-downloader-python
--------------------------------------------------------------------------------
/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/racinmat/premium-downloader/ed849ee7fc17a05f5d65fbb16658f56a85a1c6b2/chromedriver.exe
--------------------------------------------------------------------------------
/chromedriver_75.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/racinmat/premium-downloader/ed849ee7fc17a05f5d65fbb16658f56a85a1c6b2/chromedriver_75.exe
--------------------------------------------------------------------------------
/client.py:
--------------------------------------------------------------------------------
1 | from time import sleep
2 | 
3 | 
4 | class Client(object):
5 | 
6 |     def __init__(self, username, password) -> None:
7 |         self.username = username
8 |         self.password = password
9 |         from splinter import Browser
10 |         self.browser = Browser('chrome')
11 | 
12 |     def login(self, url='https://www.pornhubpremium.com/premium/login', homepage='https://www.pornhubpremium.com/'):
13 |         browser = self.browser
14 |         browser.visit(url)
15 |         # age verification check
16 |         if len(browser.find_by_css('.ageDisclaimer.isVisibleMTubes')) > 0:
17 |             browser.find_by_css('#modalWrapMTubes > div > div > button').first.click()
18 |         browser.find_by_css('#cookieBannerWrapper > .cbPrimaryCTA').click()
19 |         browser.find_by_id('username').first.fill(self.username)
20 |         browser.find_by_id('password').first.fill(self.password)
21 |         # browser.find_by_id('remember_me').first.fill('on')
22 |         browser.check('remember_me')
23 |         sleep(0.1)  # Time in seconds
24 |         browser.find_by_id('submitLogin' if 'pornhubpremium' in homepage else 'submit').first.click()
25 |         tries = 0
26 |         while browser.url != homepage:
27 |             if tries > 10:
28 |                 raise RuntimeError('Could not reach the homepage')
29 |             sleep(1)  # Time in seconds
30 |             tries += 1
31 |         return browser
--------------------------------------------------------------------------------
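`Client` wraps a splinter Chrome session; `create_client` in `crawl_videos.py` below shows the intended use. A minimal standalone sketch (assumes a `credentials.yml` shaped like `credentials.sample.yml`):

```python
import yaml

from client import Client

with open('credentials.yml', encoding='utf-8') as fp:
    creds = yaml.safe_load(fp)

client = Client(creds['username'], creds['password'])
browser = client.login()  # premium site by default
# public site, as crawl_videos.main does for model pages:
# browser = client.login('https://www.pornhub.com/login', 'https://www.pornhub.com/')
```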
/crawl_videos.py:
--------------------------------------------------------------------------------
1 | import json
2 | from datetime import datetime
3 | 
4 | import yaml
5 | import sqlite3
6 | from io import StringIO
7 | 
8 | from splinter.driver.webdriver.chrome import WebDriver
9 | from client import Client
10 | import re
11 | 
12 | 
13 | def create_client():
14 |     with open('credentials.yml', mode='r', encoding='utf-8') as fp:
15 |         credentials = yaml.safe_load(fp)
16 |     username = credentials['username']
17 |     password = credentials['password']
18 |     client = Client(username, password)
19 |     browser = client.login()
20 |     return client, browser
21 | 
22 | 
23 | def create_ydl_client(base_dir='videos', use_youtube_dl=True):
24 |     with open('credentials.yml', mode='r', encoding='utf-8') as fp:
25 |         credentials = yaml.safe_load(fp)
26 |     username = credentials['username']
27 |     password = credentials['password']
28 |     ydl_opts = {
29 |         'format': 'best',
30 |         'outtmpl': f'{base_dir}/%(id)s-%(title)s.mp4',
31 |         'nooverwrites': True,
32 |         'no_warnings': False,
33 |         'ignoreerrors': True,
34 |         'nocheckcertificate': True,
35 |         'verbose': True,
36 |         'username': username,
37 |         'password': password,
38 |     }
39 |     if use_youtube_dl:
40 |         import youtube_dl
41 |         ydl = youtube_dl.YoutubeDL(ydl_opts)
42 |     else:
43 |         import yt_dlp
44 |         ydl = yt_dlp.YoutubeDL(ydl_opts)
45 |     return ydl
46 | 
47 | 
48 | def get_links_for_star_videos(browser, name, video_links):
49 |     pages_div = browser.find_by_css('body > div.wrapper > div > div.nf-wrapper > div.pagination3 > ul')
50 |     other_pages = [] if len(pages_div) == 0 else pages_div.first.find_by_css(
51 |         'li.page_number')  # sometimes the pagination is missing entirely
52 |     if len(other_pages) == 0:
53 |         pages_num = 1
54 |     else:
55 |         pages_num = int(other_pages.last.text)
56 |     video_counter_sel = 'body > div.wrapper > div > div:nth-child({}) > div.showingCounter.pornstarVideosCounter'
57 |     video_counter_sel1 = video_counter_sel.format(13)
58 |     video_counter_sel2 = video_counter_sel.format(12)
59 |     video_counter_sel3 = 'body > div.wrapper > div.container > div:nth-child(15) > div.showingCounter.pornstarVideosCounter'
60 |     if len(browser.find_by_css(video_counter_sel1)) == 0 and len(browser.find_by_css('#pornstarsVideoSection')) == 0:
61 |         # no videos
62 |         print(f'no private videos for pornstar {name}')
63 |         return video_links
64 |     elif len(browser.find_by_css(video_counter_sel1)) > 0:
65 |         videos_str = browser.find_by_css(video_counter_sel1).text
66 |     elif len(browser.find_by_css(video_counter_sel2)) > 0:
67 |         videos_str = browser.find_by_css(video_counter_sel2).text
68 |     else:
69 |         videos_str = browser.find_by_css(video_counter_sel3).text
70 |     total_videos_num = int(videos_str.split(' ')[-1])
71 |     for page in range(1, pages_num + 1):
72 |         browser.visit(f'https://www.pornhubpremium.com/pornstar/{name}?premium=1&page={page}')
73 |         videos_div = browser.find_by_css('#pornstarsVideoSection').first
74 |         videos_list = list(videos_div.find_by_css('li.videoblock'))
75 |         video_links += [i.find_by_css('div > div.thumbnail-info-wrapper.clearfix > span > a').first['href'] for i in
76 |                         videos_list]
77 |     print(f'loaded {len(video_links)} videos for pornstar {name} in total')
78 |     assert len(video_links) == total_videos_num
79 |     return video_links
80 | 
81 | 
82 | def get_links_for_star_profile(browser, name, video_links):
83 |     browser.visit(f'https://www.pornhubpremium.com/pornstar/{name}/videos/premium')
84 |     pages_div = browser.find_by_css(
85 |         '#profileContent > div.profileContentLeft > section > div > div.nf-wrapper > div.pagination3')
86 |     if len(pages_div) == 0:  # no pagination, so only one page
87 |         pages_num = 1
88 |     else:
89 |         pages_num = int(pages_div.first.find_by_css('li.page_number').last.text)
90 |     for page in range(1, pages_num + 1):
91 |         browser.visit(f'https://www.pornhubpremium.com/pornstar/{name}/videos/premium?page={page}')
92 |         videos_div = browser.find_by_css('#moreData').first
93 |         videos_list = list(videos_div.find_by_css('li.videoblock'))
94 |         video_links += [i.find_by_css('div > div.thumbnail-info-wrapper.clearfix > span > a').first['href'] for i in
95 |                         videos_list]
96 |     print(f'loaded {len(video_links)} videos for pornstar {name} in total')
97 |     return video_links
98 | 
99 | 
100 | def porn_star_all_premium_videos(browser: WebDriver, name):
101 |     # example of type 1, no pagination: https://www.pornhubpremium.com/pornstar/sasha-foxxx/videos/premium
102 |     # example of type 1, pagination: https://www.pornhubpremium.com/pornstar/asa-akira/videos/premium
103 |     # example of type 2, no pagination: https://www.pornhubpremium.com/pornstar/madison-scott?premium=1
104 |     # example of type 2, pagination: https://www.pornhubpremium.com/pornstar/sasha-grey?premium=1&page=2
105 |     # there are 2 types of porn star pages
106 |     browser.visit(f'https://www.pornhubpremium.com/pornstar/{name}?premium=1')
107 |     video_links = []
108 |     if browser.is_element_present_by_id('profileHome'):
109 |         video_links = get_links_for_star_profile(browser, name, video_links)
110 |     elif browser.is_element_present_by_id('pornstarVideos'):
111 |         video_links = get_links_for_star_videos(browser, name, video_links)
112 |     # there has been a redirect
113 |     elif browser.url == 'https://www.pornhubpremium.com/pornstars':
114 |         print(f'star {name} does not exist')
115 |         return []
116 |     else:
117 |         raise RuntimeError('error with profile, something unknown')
118 |     return video_links
119 | 
120 | 
121 | def channel_all_premium_videos(browser: WebDriver, name):
122 |     browser.visit(f'https://www.pornhubpremium.com/channels/{name}/videos?premium=1')
123 |     pages_list = browser.find_by_css('#channelsProfile > div.pagination3 > ul > li')
124 |     if browser.title == 'Page Not Found':
125 |         print(f'Channel {name} does not exist.')
126 |         return []
127 |     elif len(pages_list) in [0, 1]:  # no pagination, so only one page
128 |         pages_num = 1
129 |     else:
130 |         pages_num = int(browser.find_by_css('#channelsProfile > div.pagination3 > ul > li.page_number').last.text)
131 | 
132 |     video_links = []
133 |     for page in range(1, pages_num + 1):
134 |         browser.visit(f'https://www.pornhubpremium.com/channels/{name}/videos?premium=1&page={page}')
135 |         videos_div = browser.find_by_css('ul#showAllChanelVideos').first
136 |         videos_list = list(videos_div.find_by_css('li.videoblock'))
137 |         video_links += [i.find_by_css('div > div.thumbnail-info-wrapper.clearfix > span > a').first['href'] for i in
138 |                         videos_list]
139 |     print(f'loaded {len(video_links)} videos for channel {name} in total')
140 |     return video_links
141 | 
142 | 
143 | def models_all_public_videos(browser: WebDriver, name):
144 |     browser.visit(f'https://www.pornhub.com/model/{name}/videos')
145 |     sel1 = '#channelsProfile > div.pagination3 > ul > li'
146 |     sel2 = '#videosTab > div > div > div.nf-wrapper > div.pagination3.paginationGated > ul > li'
147 |     pages_list1 = browser.find_by_css(sel1)
148 |     pages_list2 = browser.find_by_css(sel2)
149 |     sel, pages_list = max((sel1, pages_list1), (sel2, pages_list2), key=lambda x: len(x[1]))
150 |     if browser.title == 'Page Not Found':
151 |         print(f'Model {name} does not exist.')
152 |         return []
153 |     elif len(pages_list) in [0, 1]:  # no pagination, so only one page
154 |         pages_num = 1
155 |     else:
156 |         pages_num = int(browser.find_by_css(f'{sel}.page_number').last.text)
157 | 
158 |     video_links = []
159 |     for page in range(1, pages_num + 1):
160 |         browser.visit(f'https://www.pornhub.com/model/{name}/videos?page={page}')
161 |         videos_div = browser.find_by_css('ul#mostRecentVideosSection').first
162 |         videos_list = list(videos_div.find_by_css('li.videoblock'))
163 |         video_links += [i.find_by_css('div > div.thumbnail-info-wrapper.clearfix > span > a').first['href'] for i in
164 |                         videos_list]
165 |     print(f'loaded {len(video_links)} videos for model {name} in total')
166 |     return video_links
167 | 
168 | 
169 | def get_porn_star_list():
170 |     with open('to_download.yml', 'r') as fp:
171 |         try:
172 |             return yaml.safe_load(fp)['stars']
173 |         except yaml.YAMLError as exc:
174 |             print(exc)
175 | 
176 | 
177 | def get_channel_list():
178 |     with open('to_download.yml', 'r') as fp:
179 |         try:
180 |             return yaml.safe_load(fp)['channels']
181 |         except yaml.YAMLError as exc:
182 |             print(exc)
183 | 
184 | 
185 | def get_model_list():
186 |     with open('to_download.yml', 'r') as fp:
187 |         try:
188 |             return yaml.safe_load(fp)['models']
189 |         except yaml.YAMLError as exc:
190 |             print(exc)
191 | 
192 | 
193 | def add_video_if_not_exists(conn, video_id, video, source_name):
194 |     if conn.execute(f'select exists(select 1 from videos where video_id = \'{video_id}\')').fetchone()[0]:
195 |         return
196 |     with conn:
197 |         conn.execute('INSERT INTO videos (video_id, video_url, star_name, added_timestamp) VALUES (?, ?, ?, ?)',
198 |                      (video_id, video, source_name, datetime.now().isoformat()))
199 | 
200 | 
201 | def main():
202 |     client, browser = create_client()
203 |     porn_stars = get_porn_star_list()
204 |     channels = get_channel_list()
205 |     models = get_model_list()
206 | 
207 |     conn = sqlite3.connect('links.db')
208 |     conn.execute(
209 |         "CREATE TABLE IF NOT EXISTS videos (video_id varchar NOT NULL, star_name varchar NOT NULL, "
210 |         "video_url varchar NOT NULL, downloaded integer NOT NULL DEFAULT 0, download_forbidden int default NULL, "
211 |         "added_timestamp varchar default null, downloaded_timestamp varchar default null);")
212 | 
213 |     # for star_name in porn_stars:
214 |     #     videos_list = porn_star_all_premium_videos(browser, star_name)
215 |     #     for video in videos_list:
216 |     #         video_id = re.search(r'viewkey=([\d\w]+)', video).group(1)
217 |     #         add_video_if_not_exists(conn, video_id, video, star_name)
218 |     # print('done stars\n')
219 |     #
220 |     # for channel in channels:
221 |     #     videos_list = channel_all_premium_videos(browser, channel)
222 |     #     for video in videos_list:
223 |     #         video_id = re.search(r'viewkey=([\d\w]+)', video).group(1)
224 |     #         add_video_if_not_exists(conn, video_id, video, channel)
225 |     # print('done channels\n')
226 | 
227 |     browser = client.login('https://www.pornhub.com/login', 'https://www.pornhub.com/')
228 |     for model in models:
229 |         videos_list = models_all_public_videos(browser, model)
230 |         for video in videos_list:
231 |             video_id = re.search(r'viewkey=([\d\w]+)', video).group(1)
232 |             add_video_if_not_exists(conn, video_id, video, model)
233 |     print('done models\n')
234 |     print('done everything\n')
235 | 
236 | 
237 | if __name__ == '__main__':
238 |     main()
239 |     # todo: add script to move videos to dir by star name
--------------------------------------------------------------------------------
/credentials.sample.yml:
--------------------------------------------------------------------------------
1 | username: ""
2 | password: ""
--------------------------------------------------------------------------------
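The crawler's `EXISTS` check and the downloaders' `UPDATE` statements interpolate `video_id` into the SQL with f-strings. That works for these IDs, but bound parameters are the safer sqlite3 idiom; a sketch of the same existence check (a hypothetical helper, not part of the repo):

```python
import sqlite3


def video_exists(conn: sqlite3.Connection, video_id: str) -> bool:
    # same query as in add_video_if_not_exists, but with a bound
    # parameter instead of string interpolation
    row = conn.execute('select exists(select 1 from videos where video_id = ?)',
                       (video_id,)).fetchone()
    return bool(row[0])
```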
/download_videos.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | from datetime import datetime
3 | from time import sleep
4 | from urllib import request
5 | from urllib.error import URLError
6 | 
7 | import requests
8 | import progressbar
9 | import os.path as osp
10 | 
11 | import youtube_dl
12 | from splinter.driver.webdriver.chrome import WebDriver
13 | 
14 | from a_downloader.functions import custom_dl_download, ph_url_check, alive_check, get_dl_location
15 | from crawl_videos import create_client, create_ydl_client
16 | 
17 | 
18 | def is_download_forbidden(browser, conn, video_id):
19 |     download_blocked_div = '.video-actions-tabs > .video-action-tab.download-tab > .verifyEmailWrapper'
20 |     download_blocked_message = 'The download feature of this video has been disabled by'
21 |     if len(browser.find_by_css(download_blocked_div)) > 0 and download_blocked_message in browser.find_by_css(
22 |             download_blocked_div).text:
23 |         print('video download is forbidden\n')
24 |         with conn:
25 |             conn.execute(f'UPDATE videos SET downloaded = 0, download_forbidden = 1 where video_id = "{video_id}"')
26 |         return True
27 |     return False
28 | 
29 | 
30 | def click_download_tab(browser, download_tab_button_sel):
31 |     download_tab_button_active_sel = '.tab-menu-item.active[data-tab="download-tab"]'
32 |     counter = 0
33 |     while not browser.is_element_present_by_css(download_tab_button_active_sel):
34 |         if counter > 10:
35 |             print('cannot click on download tab')
36 |             return False
37 |         sleep(0.1)  # Time in seconds
38 |         button = browser.find_by_css(download_tab_button_sel)
39 |         browser.find_by_text('The download feature of this video has been disabled ')
40 |         if len(button) == 0:
41 |             print('disabled video download, trying alternative')
42 |             return False
43 |         button.click()
44 |         print('clicking on it\n')
45 |         counter += 1
46 |     return True
47 | 
48 | 
49 | def download_using_youtube_dl(ydl, url, pre_callback) -> bool:
50 |     pre_callback(url)
51 |     ydl._download_retcode = 0  # because this is not set to 0 before each download, it is turned just from 0 to 1
52 |     # so the line above resets it to the default state
53 |     download_ret_code = ydl.download([url])
54 |     return download_ret_code == 0
55 | 
56 | 
57 | def set_downloaded(conn, file_name, video_id):
58 |     print(file_name, 'downloaded\n')
59 |     with conn:
60 |         conn.execute(
61 |             f'UPDATE videos SET downloaded = 1, downloaded_timestamp = "{datetime.now().isoformat()}" '
62 |             f'where video_id = "{video_id}"')
63 | 
64 | 
65 | def download_official(query):
66 |     client, browser = create_client()  # create_client returns a (client, browser) pair
67 |     conn, videos_info = list_videos(query)
68 |     pbar = prepare_pbar(videos_info)
69 |     for i, video_info in enumerate(videos_info):
70 |         pbar.update(i)
71 |         video_info = dict(video_info)
72 |         video_id = video_info['video_id']
73 |         video_url = video_info['video_url']
74 |         browser.visit(video_url)
75 | 
76 |         while browser.is_element_present_by_css('.recaptchaContent'):  # sometimes wild captcha appears
77 |             print("CAPTCHA NEEDED")
78 |             sleep(60)
79 | 
80 |         if browser.is_element_present_by_css('.removed'):
81 |             # video has been removed
82 |             print('video has been removed\n')
83 |             with conn:
84 |                 conn.execute(f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"')
85 |             continue
86 |         if not browser.is_element_visible_by_css(
87 |                 '.premiumIconTitleOnVideo') and not browser.is_element_present_by_css('#videoTitle'):
88 |             # video has been removed
89 |             print('video is somehow broken and not premium\n')
90 |             with conn:
91 |                 conn.execute(f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"')
92 |             continue
93 | 
94 |         video_title = browser.find_by_css('#videoTitle').text  # type: str
95 |         # strip characters that are not allowed in Windows file names
96 |         video_title = video_title.replace(':', '').replace('?', '').replace('*', '').replace('"', '').replace('/', '') \
97 |             .replace('\\', '')
98 |         browser.find_by_id('player').click()  # pausing video
99 |         browser.find_by_tag('body')._element.send_keys('M')  # muting video
100 | 
101 |         file_name = f'videos/{video_id}-{video_title}.mp4'
102 |         if osp.exists(file_name):
103 |             with conn:
104 |                 conn.execute(f'UPDATE videos SET downloaded = 1 where video_id = "{video_id}"')
105 |             continue
106 | 
107 |         if browser.is_element_present_by_css('.tab-menu-item.js-paidDownload[data-tab="download-tab"]'):
108 |             # download requires extra payment
109 |             print('video download is paid\n')
110 |             with conn:
111 |                 conn.execute(f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"')
112 |             continue
113 | 
114 |         download_tab_button_sel = '.tab-menu-item[data-tab="download-tab"]'
115 |         vr_tab_button_sel = '.tab-menu-item[data-tab="vr-tab"]'
116 |         if not browser.is_element_present_by_css(download_tab_button_sel) \
117 |                 and browser.is_element_present_by_css(vr_tab_button_sel):
118 |             # VR videos have no download tab
119 |             print('video is vr, no download\n')
120 |             with conn:
121 |                 conn.execute(f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"')
122 |             continue
123 | 
124 |         click_download_tab(browser, download_tab_button_sel)
125 |         if is_download_forbidden(browser, conn, video_id):
126 |             continue
127 | 
128 |         download_link = get_download_link(browser)
129 |         # must have headers here, otherwise it behaves as the api and does not serve the video
130 |         for _ in range(5):
131 |             try:
132 |                 request.urlretrieve(download_link, file_name)
133 |                 break
134 |             except URLError:
135 |                 print('connection failed, trying again\n')
136 | 
137 |         set_downloaded(conn, file_name, video_id)
138 |     return pbar
139 | 
140 | 
141 | def download_ydl(ydl, query, downloaded_callback, pre_callback):
142 |     conn, videos_info = list_videos(query)
143 |     pbar = prepare_pbar(videos_info)
144 |     for i, video_info in enumerate(videos_info):
145 |         pbar.update(i)
146 |         video_info = dict(video_info)
147 |         video_id = video_info['video_id']
148 |         video_url = video_info['video_url']
149 |         download_success = download_using_youtube_dl(ydl, video_url, pre_callback)
150 |         if download_success:
151 |             downloaded_callback(conn, video_url, video_id)
152 |         else:
153 |             print(f'failed to download the video {video_id}, {video_url}')
154 |     return pbar
155 | 
156 | 
157 | def prepare_pbar(videos_info):
158 |     widgets = [progressbar.Percentage(), ' ', progressbar.Counter(), ' ', progressbar.Bar(), ' ',
159 |                progressbar.FileTransferSpeed()]
160 |     pbar = progressbar.ProgressBar(widgets=widgets, max_value=len(videos_info)).start()
161 |     return pbar
162 | 
163 | 
164 | def list_videos(query):
165 |     conn = sqlite3.connect('links.db')
166 |     conn.row_factory = sqlite3.Row
167 |     videos_info = conn.execute(query).fetchall()
168 |     return conn, videos_info
169 | 
170 | 
171 | def get_download_link(browser):
172 |     sizes = [720, 480]
173 |     download_link = None
174 |     for size in sizes:
175 |         if len(browser.find_link_by_text(f' {size}p')) == 0:
176 |             # size not existing, trying another
177 |             continue
178 |         download_link = browser.find_link_by_text(f' {size}p').first['href']
179 |         break
180 |     if download_link is None:
181 |         raise RuntimeError('link for corresponding size not found')
182 |     return download_link
183 | 
184 | 
185 | def check_url(url):
186 |     ph_url_check(url)
187 |     alive_check(url)
188 | 
189 | 
190 | def main():
191 |     use_ydl = True
192 |     if use_ydl:
193 |         ydl = create_ydl_client()
194 |         pbar = download_ydl(ydl, 'select * from videos where downloaded = 0 and download_forbidden isnull',
195 |                             set_downloaded, check_url)
196 |     else:
197 |         pbar = download_official('select * from videos where downloaded = 0 and download_forbidden isnull')
198 | 
199 |     pbar.finish()
200 |     print('done')
201 | 
202 | 
203 | if __name__ == '__main__':
204 |     main()
--------------------------------------------------------------------------------
/download_xhamster.py:
--------------------------------------------------------------------------------
1 | # %%
2 | import re
3 | import sqlite3
4 | from datetime import datetime
5 | 
6 | from splinter import Browser
7 | from splinter.driver.webdriver.chrome import WebDriver
8 | 
9 | from a_downloader.functions import alive_check
10 | from crawl_videos import create_ydl_client
11 | from download_videos import download_ydl
12 | 
13 | 
14 | # %%
15 | def add_video_if_not_exists_xhamster(conn, video_id, video, source_name):
16 |     if conn.execute(f'select exists(select 1 from videos_xhamster where video_id = \'{video_id}\')').fetchone()[0]:
17 |         return
18 |     with conn:
19 |         conn.execute(
20 |             'INSERT INTO videos_xhamster (video_id, video_url, star_name, added_timestamp) VALUES (?, ?, ?, ?)',
21 |             (video_id, video, source_name, datetime.now().isoformat()))
22 | 
23 | 
24 | def list_videos_page(browser: WebDriver, url):
25 |     browser.visit(url)
26 |     pages_div = browser.find_by_css(
27 |         'body > div.main-wrap > main > div > article > div.pornstar-content > div.pornstar-content__main > div.index-videos.mixed-section > div.pager-section > div > ul > li')
28 |     # one is prev and one is next
29 |     pages_num = max(len(pages_div) - 2, 1)
30 |     video_links = []
31 |     for page in range(1, pages_num + 1):
32 |         browser.visit(f"{url}/{page}")
33 |         videos_div = browser.find_by_css(
34 |             'body > div.main-wrap > main > div > article > div.pornstar-content > div.pornstar-content__main > div.index-videos.mixed-section > div:nth-last-child(2) > div').first
35 |         videos_list = list(videos_div.find_by_css('div.thumb-list__item.video-thumb'))
36 |         video_links += [i.find_by_css('a').first['href'] for i in videos_list]
37 |     print(f'loaded {len(video_links)} videos from url {url} in total')
38 |     return video_links
39 | 
40 | 
41 | def list_videos_creator(conn, browser: WebDriver, creator):
42 |     url = f'https://xhamster.com/creators/{creator}'
43 |     videos_list = list_videos_page(browser, url)
44 |     videos_list += list_videos_page(browser, f"{url}/exclusive")
45 |     for video in videos_list:
46 |         video_id = re.search(r'/videos/([\d\w-]+)', video).group(1)
47 |         add_video_if_not_exists_xhamster(conn, video_id, video, creator)
48 | 
49 | 
50 | def crawl_videos():
51 |     conn = sqlite3.connect('links.db')
52 |     conn.execute(
53 |         "CREATE TABLE IF NOT EXISTS videos_xhamster (video_id varchar NOT NULL, star_name varchar NOT NULL, "
54 |         "video_url varchar NOT NULL, downloaded integer NOT NULL DEFAULT 0, "
55 |         "added_timestamp varchar default null, downloaded_timestamp varchar default null);")
56 |     browser = Browser('chrome')
57 |     browser.visit('https://xhamster.com')
58 |     browser.find_by_css(
59 |         'body > div.cookies-modal__wrapper > div.cookies-modal > div.cookies-modal-footer > div > button.xh-button.button.cmd-button-accept-all.green.large2.square > span').first.click()
60 |     list_videos_creator(conn, browser, 'angel')
61 |     print('done')
62 |     browser.quit()
63 | 
64 | 
65 | def check_url(url):
66 |     alive_check(url)
67 | 
68 | 
69 | def set_downloaded(conn, file_name, video_id):
70 |     print(file_name, 'downloaded\n')
71 |     with conn:
72 |         conn.execute(
73 |             f'UPDATE videos_xhamster SET downloaded = 1, downloaded_timestamp = "{datetime.now().isoformat()}" '
74 |             f'where video_id = "{video_id}"')
75 | 
76 | 
77 | def download_videos():
78 |     ydl = create_ydl_client('videos_xhamster', False)
79 |     pbar = download_ydl(ydl, 'select * from videos_xhamster where downloaded = 0', set_downloaded, check_url)
80 |     pbar.finish()
81 |     print('done')
82 | 
83 | 
84 | def main():
85 |     # crawl_videos()
86 |     download_videos()
87 | 
88 | 
89 | # %%
90 | if __name__ == '__main__':
91 |     main()
--------------------------------------------------------------------------------
/fapello_download.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path as osp
3 | import requests
4 | import yaml
5 | from lxml import html
6 | import progressbar
7 | 
8 | from requests import Session
9 | 
10 | hs = {
11 |     'User-Agent': 'Mozilla/5.0',
12 |     'X-Requested-With': 'XMLHttpRequest',
13 |     'Referer': 'https://permit.pcta.org/application/'
14 | }
15 | 
16 | 
17 | # from https://stackoverflow.com/a/58656261/5224881
18 | def download_file(s: Session, url, filename):
19 |     # create response object
20 |     r = s.get(url, stream=True)
21 |     headers = r.headers
22 |     size = int(headers["Content-Length"])
23 |     # download started
24 |     read = 0
25 |     with open(filename, 'wb') as f:
26 |         for chunk in r.iter_content(chunk_size=1024 * 1024):
27 |             read += len(chunk)
28 |             if chunk:
29 |                 f.write(chunk)
30 | 
31 |     if size >= 0 and read < size:
32 |         raise Exception(
33 |             "retrieval incomplete: got only %i out of %i bytes"
34 |             % (read, size))
35 | 
36 | 
37 | def download_creator(creator_name, assume_naming=True, skip_failed=True):
38 |     print(f'downloading {creator_name=}')
39 |     s = requests.Session()
40 |     s.headers.update(hs)
41 |     print('loading failed urls')
42 |     failed_txt = 'fapello_failed.txt'
43 |     if osp.exists(failed_txt):
44 |         with open(failed_txt, 'r', encoding='utf-8') as f:
45 |             failed_urls = set(map(lambda x: x.strip(), f.readlines()))
46 |     else:
47 |         failed_urls = set()
48 |     save_dir = 'fapello'
49 |     base_url = 'https://fapello.com/{}/{}/'
50 |     widgets = [progressbar.Percentage(), ' ', progressbar.Counter(), ' ', progressbar.Bar(), ' ',
51 |                progressbar.FileTransferSpeed()]
52 |     main_url = base_url[:-3].format(creator_name)
53 |     response = s.get(main_url, allow_redirects=False)  # requests follows redirects by default, which would hide the 302
54 |     if response.status_code == 302:
55 |         print(f'creator page {main_url=} does not exist')
56 |         return
57 |     tree = html.fromstring(response.content)
58 |     links = tree.xpath('//*[@id="content"]/div[1]/a')
59 |     if len(links) == 0:
60 |         print(f'creator page {main_url=} probably does not exist')
61 |         return
62 |     max_val = int(links[0].attrib['href'].split('/')[-2])
63 |     pbar = progressbar.ProgressBar(widgets=widgets, max_value=max_val).start()
64 |     for i in range(1, max_val + 1):
65 |         pbar.update(i)
66 |         i_url = base_url.format(creator_name, i)
67 | 
68 |         if assume_naming:
69 |             save_path = osp.join(save_dir, creator_name, f'{creator_name}_{i:04}.jpg')
70 |             if osp.exists(save_path):
71 |                 continue
72 | 
73 |         if skip_failed and i_url in failed_urls:
74 |             continue
75 |         response = s.get(i_url, allow_redirects=False)
76 |         if response.status_code != 200:
77 |             print('non-existing page, storing to the list')
78 |             with open(failed_txt, 'a+', encoding='utf-8') as f:
79 |                 f.write(i_url + '\n')
80 |             continue
81 | 
82 |         tree = html.fromstring(response.content)
83 |         img_path = tree.xpath('//*[@id="wrapper"]/div[2]/div/div/div/div[2]/a/img')
84 |         if len(img_path) == 0:
85 |             # not an image, try video
86 |             vid_path = tree.xpath('//*[@id="wrapper"]/div[2]/div/div/div/div[2]/video/source')
87 |             if len(vid_path) == 0:
88 |                 # not even a video
89 |                 print(f'skipping page {i_url=}, probably invalid')
90 |                 continue
91 |             else:
92 |                 target_url = vid_path[0].attrib['src']
93 |                 save_path = osp.join(save_dir, creator_name, osp.basename(target_url))
94 |         else:
95 |             target_url = img_path[0].attrib['src']
96 |             save_path = osp.join(save_dir, creator_name, osp.basename(target_url))
97 |         os.makedirs(osp.dirname(save_path), exist_ok=True)
98 |         if osp.exists(save_path):
99 |             continue
100 |         download_file(s, target_url, save_path)
101 |     pbar.finish()
102 | 
103 | 
104 | def get_creator_list():
105 |     with open('to_download_fapello.yml', 'r') as fp:
106 |         try:
107 |             return yaml.safe_load(fp)['stars']
108 |         except yaml.YAMLError as exc:
109 |             print(exc)
110 | 
111 | 
112 | def main():
113 |     creator_names = get_creator_list()
114 |     for creator_name in creator_names:
115 |         download_creator(creator_name)
116 | 
117 | 
118 | if __name__ == '__main__':
119 |     main()
--------------------------------------------------------------------------------
/links.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/racinmat/premium-downloader/ed849ee7fc17a05f5d65fbb16658f56a85a1c6b2/links.db
--------------------------------------------------------------------------------
/mega_list_sizes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import subprocess
4 | import traceback
5 | 
6 | import yaml
7 | 
8 | 
9 | # upload to server and then run `source activate py310`
10 | def examine_dir(a_dir, print_files, show_errors, max_depth, depth=0):
11 |     output_bytes = subprocess.check_output(f'mega-ls -lh "{a_dir}"', shell=True)
12 |     try:
13 |         output = output_bytes.decode('utf-8').rstrip()
14 |         parse_ls(a_dir, print_files, show_errors, max_depth, output, depth)
15 |     except Exception as e:
16 |         if show_errors:
17 |             traceback.print_exc()
18 |         hash_a_dir = str(hash(a_dir))
19 |         print(f'error in decoding {a_dir}, storing it as {hash_a_dir}')
20 |         with open(f'/volume1/shared_data/video/other/of_leaks/error_output_{hash_a_dir}.txt', 'wb') as f:
21 |             f.write(output_bytes)
22 | 
23 | 
24 | def parse_ls(a_dir, print_files, show_errors, max_depth, output, depth=0):
25 |     # print(output)
26 |     lines = output.split('\n')
27 |     header_line = lines[0]
28 |     rows_line = lines[1:]
29 |     headers = re.split(r'\s+', header_line)
30 |     rows = [re.split(r'\s+', r, maxsplit=len(headers)) for r in rows_line]
31 |     # print('rows:')
32 |     # print(rows)
33 |     for flags, vers, size, date1, date2, name in rows:
34 |         is_dir = flags == 'd---'
35 |         if is_dir:
36 |             inner_dir = a_dir + '/' + name
37 |             size_lines = subprocess.check_output(f'mega-du -h "{inner_dir}"', shell=True).decode('utf-8')
38 |             size_line = size_lines.split('\n')[1]
39 |             size = ' '.join(re.split(r'\s+', size_line)[-2:])
40 |             print(f'{" " * 4 * (depth+1)}{size}, {name}')
41 |             # print(f'{" " * 2 * (depth+1)}examining it')
42 |             if depth + 1 <= max_depth:
43 |                 examine_dir(inner_dir, print_files, show_errors, max_depth, depth+1)
44 |         else:
45 |             size = f'{size} {date1}'
46 |             date1 = date2
47 |             # print(f'{size=}, {date1=}, {date2}, {name=}')
48 |             date2, name = name.split(' ', maxsplit=1)
49 |             if print_files:
50 |                 print(f'{" " * 4 * (depth+1)}{size}, {name}')
51 | 
52 | 
53 | def main():
54 |     with open('mega_link.yaml', mode='r', encoding='utf-8') as fp:
55 |         main_link = yaml.safe_load(fp)
56 |     # main_dir = main_link
57 |     # os.system(f'mega-login {main_dir}')
58 |     print_files = False
59 |     show_errors = False
60 |     max_depth = 4
61 |     examine_dir('', print_files, show_errors, max_depth)
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     main()
66 |     # run as
67 |     # source activate py310
68 |     # cd ~/tmp/pycharm_project_395
69 |     # python mega_list_sizes.py
70 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output.txt 2>&1
71 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output_dirs_only.txt 2>&1
72 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output_small.txt 2>&1
73 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output_small_5.txt 2>&1
74 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output_small_4.txt 2>&1
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Premium downloader
2 | 
3 | This tool downloads premium Pornhub videos.
4 | 
5 | Running `python crawl_videos.py` gathers video links for the pornstars, channels and models listed in `to_download.yml` and stores them in `links.db`.
6 | 
7 | Running `python download_videos.py` reads the stored links and downloads the videos.
8 | 
9 | ```bash
10 | conda create -n crawling python=3.9
11 | conda activate crawling
12 | conda install sqlite certifi
13 | ```
14 | 
15 | Download the matching chromedriver from https://googlechromelabs.github.io/chrome-for-testing/#stable
16 | and save it as `chromedriver.exe`.
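17 | 
18 | ## Configuration
19 | 
20 | Copy `credentials.sample.yml` to `credentials.yml` and fill in your login. Copy `to_download.sample.yml` to `to_download.yml`;
21 | `crawl_videos.py` reads the `stars`, `channels` and `models` lists from it, so a fuller file looks like this (the names below are placeholders):
22 | 
23 | ```yaml
24 | stars:
25 |   - sasha-grey
26 | channels:
27 |   - some-channel-slug
28 | models:
29 |   - some-model-slug
30 | ```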
--------------------------------------------------------------------------------
/to_download.sample.yml:
--------------------------------------------------------------------------------
1 | stars:
2 |   - sasha-grey
3 |   - rae-lil-black
--------------------------------------------------------------------------------
/to_download_fapello.sample.yml:
--------------------------------------------------------------------------------
1 | stars:
2 |   - bonbibonkers
--------------------------------------------------------------------------------
/xhamster.py:
--------------------------------------------------------------------------------
1 | # https://gist.github.com/mopemope/891774
2 | # NOTE: standalone legacy Python 2 script (urllib2, psyco, xrange), kept from the gist above
3 | from os import path
4 | from werkzeug import secure_filename
5 | import eventlet
6 | from eventlet.green import urllib2
7 | from pyquery import PyQuery as pq
8 | from urlparse import urlparse
9 | import psyco
10 | 
11 | psyco.full()
12 | 
13 | search_urls = [
14 |     "http://xhamster.com/channels/new-asian-%s.html",
15 | ]
16 | 
17 | detail_urls = []
18 | 
19 | id_mode = True
20 | 
21 | save_path = "/home/ma2/Public/xhamster/"
22 | pool = eventlet.GreenPool(2)
23 | q = []
24 | 
25 | import re
26 | 
27 | base_url = "http://www.xhamster.com"
28 | download_re = re.compile("'file':\s*'([\w\d\.:/_\-\?=]*)'", re.M)
29 | download_re2 = re.compile("'srv':\s*'([\w\d.:/_]*)", re.M)
30 | 
31 | 
32 | def get_pagelist(url, page=1):
33 |     q = []
34 |     conn = urllib2.urlopen(url % page)
35 |     page = conn.read()
36 |     d = pq(page)
37 |     for anchor in d("a"):
38 |         href = pq(anchor).attr.href
39 |         if href.startswith("/movies"):
40 |             q.append(base_url + href)
41 |     return q
42 | 
43 | 
44 | def get_download_url(url):
45 |     conn = urllib2.urlopen(url)
46 |     page = conn.read()
47 |     file_match = download_re.search(page)
48 |     srv_match = download_re2.search(page)
49 |     if srv_match and file_match:
50 |         file_name = file_match.group(1)
51 |         srv = srv_match.group(1)
52 |         download_url = "%s/flv2/%s" % (srv, file_name)
53 |         file_name = path.basename(download_url)
54 |         return url, download_url, file_name
55 | 
56 | 
57 | def download_flv(url, down_url, file_name):
58 |     print "'%s' ---- Try Download ----" % url
59 | 
60 |     out_path = path.join(save_path, file_name)
61 |     if not file_name:
62 |         print "'%s' ** Not Found Link ** " % url
63 |         return
64 | 
65 |     partial = False
66 |     try:
67 |         conn = urllib2.urlopen(down_url)
68 |         length = conn.info()['Content-Length']
69 |         length = int(length)
70 |         if length < 1024 * 1024 * 150 or length > 1024 * 1024 * 700:
71 |             print "*** '%s' is small! Skip!!!'%s' ***" % (url, length)
72 |             return
73 | 
74 |         if path.exists(out_path):
75 |             size = path.getsize(out_path)
76 |             if size < length:
77 |                 r = "bytes=%s-" % size
78 |                 req = urllib2.Request(down_url, headers={"Range": r})
79 |                 conn = urllib2.urlopen(req)
'%s' ==" % (url, file_name) 84 | print 85 | "'%s' == File '%s' Size: %d/%d'" % (url, file_name, size, length) 86 | partial = True 87 | else: 88 | print 89 | "'%s' == Downloaded '%s' ==" % (url, file_name) 90 | return 91 | except: 92 | import traceback 93 | print 94 | traceback.format_exc() 95 | pool.spawn_n(download, url) 96 | return 97 | 98 | if partial: 99 | f = open(out_path, "rb+") 100 | f.seek(0, 2) 101 | else: 102 | f = open(out_path, "wb") 103 | 104 | print 105 | "'%s' == Start '%s' ==" % (url, file_name) 106 | while True: 107 | data = conn.read(1024 * 512) 108 | if not data: 109 | break 110 | f.write(data) 111 | # per = path.getsize(out_path) / float(length) * 100.0 112 | # print "'%s' == '%s' %d%% done. ==" % (url, file_name, per) 113 | print 114 | "'%s' == Finish '%s' ==" % (url, file_name) 115 | 116 | 117 | def download(url): 118 | if url.find("premium.xhamster.com") >= 0: 119 | return 120 | url, download_url, file_name = get_download_url(url) 121 | download_flv(url, download_url, file_name) 122 | 123 | 124 | def start(url, min_page=1, max_page=100): 125 | for i in xrange(min_page, max_page + 1): 126 | urls = get_pagelist(url, page=i) 127 | q.extend(urls) 128 | q.reverse() 129 | while q: 130 | url = q.pop() 131 | pool.spawn_n(download, url) 132 | 133 | 134 | if __name__ == '__main__': 135 | 136 | for url in search_urls: 137 | start(url=url) 138 | pool.waitall() --------------------------------------------------------------------------------