├── .gitignore
├── .gitmodules
├── chromedriver.exe
├── chromedriver_75.exe
├── client.py
├── crawl_videos.py
├── credentials.sample.yml
├── download_videos.py
├── download_xhamster.py
├── fapello_download.py
├── links.db
├── mega_list_sizes.py
├── readme.md
├── to_download.sample.yml
├── to_download_fapello.sample.yml
└── xhamster.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 | 
8 | # C extensions
9 | *.so
10 | 
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 | 
52 | # Translations
53 | *.mo
54 | *.pot
55 | 
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 | 
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 | 
65 | # Scrapy stuff:
66 | .scrapy
67 | 
68 | # Sphinx documentation
69 | docs/_build/
70 | 
71 | # PyBuilder
72 | target/
73 | 
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 | 
77 | # pyenv
78 | .python-version
79 | 
80 | # celery beat schedule file
81 | celerybeat-schedule
82 | 
83 | # SageMath parsed files
84 | *.sage.py
85 | 
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 | 
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 | 
99 | # Rope project settings
100 | .ropeproject
101 | 
102 | # mkdocs documentation
103 | /site
104 | 
105 | # mypy
106 | .mypy_cache/
107 | 
108 | /credentials.json
109 | /videos/
110 | /credentials.yml
111 | /to_download.yml
112 | /to_download_fapello.yml
113 | /fapello/
114 | /mega_link.yaml
115 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "a_downloader"]
2 | 	path = a_downloader
3 | 	url = https://github.com/mariosemes/PornHub-downloader-python
--------------------------------------------------------------------------------
/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/racinmat/premium-downloader/ed849ee7fc17a05f5d65fbb16658f56a85a1c6b2/chromedriver.exe
--------------------------------------------------------------------------------
/chromedriver_75.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/racinmat/premium-downloader/ed849ee7fc17a05f5d65fbb16658f56a85a1c6b2/chromedriver_75.exe
--------------------------------------------------------------------------------
/client.py:
--------------------------------------------------------------------------------
1 | from time import sleep
2 | 
3 | 
4 | class Client(object):
5 | 
6 |     def __init__(self, username, password) -> None:
7 |         self.username = username
8 |         self.password = password
9 |         from splinter import Browser
10 |         self.browser = Browser('chrome')
11 | 
12 |     def login(self, url='https://www.pornhubpremium.com/premium/login', homepage='https://www.pornhubpremium.com/'):
13 |         browser = self.browser
14 |         browser.visit(url)
15 |         # age verification check
16 |         if len(browser.find_by_css('.ageDisclaimer.isVisibleMTubes')) > 0:
17 |             browser.find_by_css('#modalWrapMTubes > div > div > button').first.click()
18 |         browser.find_by_css('#cookieBannerWrapper > .cbPrimaryCTA').click()
19 |         browser.find_by_id('username').first.fill(self.username)
20 |         browser.find_by_id('password').first.fill(self.password)
21 |         # browser.find_by_id('remember_me').first.fill('on')
22 |         browser.check('remember_me')
23 |         sleep(0.1)  # Time in seconds
24 |         browser.find_by_id('submitLogin' if 'pornhubpremium' in homepage else 'submit').first.click()
25 |         tries = 0
26 |         while browser.url != homepage:
27 |             if tries > 10:
28 |                 raise RuntimeError('Could not reach the homepage')
29 |             sleep(1)  # Time in seconds
30 |             tries += 1
31 |         return browser
--------------------------------------------------------------------------------
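`Client` wraps a splinter Chrome session; `create_client` in `crawl_videos.py` below shows the intended use. A minimal standalone sketch (assumes a `credentials.yml` shaped like `credentials.sample.yml`):

```python
import yaml

from client import Client

with open('credentials.yml', encoding='utf-8') as fp:
    creds = yaml.safe_load(fp)

client = Client(creds['username'], creds['password'])
browser = client.login()  # premium site by default
# public site, as crawl_videos.main does for model pages:
# browser = client.login('https://www.pornhub.com/login', 'https://www.pornhub.com/')
```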
/crawl_videos.py:
--------------------------------------------------------------------------------
1 | import json
2 | from datetime import datetime
3 | 
4 | import yaml
5 | import sqlite3
6 | from io import StringIO
7 | 
8 | from splinter.driver.webdriver.chrome import WebDriver
9 | from client import Client
10 | import re
11 | 
12 | 
13 | def create_client():
14 |     with open('credentials.yml', mode='r', encoding='utf-8') as fp:
15 |         credentials = yaml.safe_load(fp)
16 |     username = credentials['username']
17 |     password = credentials['password']
18 |     client = Client(username, password)
19 |     browser = client.login()
20 |     return client, browser
21 | 
22 | 
23 | def create_ydl_client(base_dir='videos', use_youtube_dl=True):
24 |     with open('credentials.yml', mode='r', encoding='utf-8') as fp:
25 |         credentials = yaml.safe_load(fp)
26 |     username = credentials['username']
27 |     password = credentials['password']
28 |     ydl_opts = {
29 |         'format': 'best',
30 |         'outtmpl': f'{base_dir}/%(id)s-%(title)s.mp4',
31 |         'nooverwrites': True,
32 |         'no_warnings': False,
33 |         'ignoreerrors': True,
34 |         'nocheckcertificate': True,
35 |         'verbose': True,
36 |         'username': username,
37 |         'password': password,
38 |     }
39 |     if use_youtube_dl:
40 |         import youtube_dl
41 |         ydl = youtube_dl.YoutubeDL(ydl_opts)
42 |     else:
43 |         import yt_dlp
44 |         ydl = yt_dlp.YoutubeDL(ydl_opts)
45 |     return ydl
46 | 
47 | 
48 | def get_links_for_star_videos(browser, name, video_links):
49 |     pages_div = browser.find_by_css('body > div.wrapper > div > div.nf-wrapper > div.pagination3 > ul')
50 |     other_pages = [] if len(pages_div) == 0 else pages_div.first.find_by_css(
51 |         'li.page_number')  # sometimes the pagination is missing entirely
52 |     if len(other_pages) == 0:
53 |         pages_num = 1
54 |     else:
55 |         pages_num = int(other_pages.last.text)
56 |     video_counter_sel = 'body > div.wrapper > div > div:nth-child({}) > div.showingCounter.pornstarVideosCounter'
57 |     video_counter_sel1 = video_counter_sel.format(13)
58 |     video_counter_sel2 = video_counter_sel.format(12)
59 |     video_counter_sel3 = 'body > div.wrapper > div.container > div:nth-child(15) > div.showingCounter.pornstarVideosCounter'
60 |     if len(browser.find_by_css(video_counter_sel1)) == 0 and len(browser.find_by_css('#pornstarsVideoSection')) == 0:
61 |         # no videos
62 |         print(f'no private videos for pornstar {name}')
63 |         return video_links
64 |     elif len(browser.find_by_css(video_counter_sel1)) > 0:
65 |         videos_str = browser.find_by_css(video_counter_sel1).text
66 |     elif len(browser.find_by_css(video_counter_sel2)) > 0:
67 |         videos_str = browser.find_by_css(video_counter_sel2).text
68 |     else:
69 |         videos_str = browser.find_by_css(video_counter_sel3).text
70 |     total_videos_num = int(videos_str.split(' ')[-1])
71 |     for page in range(1, pages_num + 1):
72 |         browser.visit(f'https://www.pornhubpremium.com/pornstar/{name}?premium=1&page={page}')
73 |         videos_div = browser.find_by_css('#pornstarsVideoSection').first
74 |         videos_list = list(videos_div.find_by_css('li.videoblock'))
75 |         video_links += [i.find_by_css('div > div.thumbnail-info-wrapper.clearfix > span > a').first['href'] for i in
76 |                         videos_list]
77 |     print(f'loaded {len(video_links)} videos for pornstar {name} in total')
78 |     assert len(video_links) == total_videos_num
79 |     return video_links
80 | 
81 | 
82 | def get_links_for_star_profile(browser, name, video_links):
83 |     browser.visit(f'https://www.pornhubpremium.com/pornstar/{name}/videos/premium')
84 |     pages_div = browser.find_by_css(
85 |         '#profileContent > div.profileContentLeft > section > div > div.nf-wrapper > div.pagination3')
86 |     if len(pages_div) == 0:  # no pagination, so only one page
87 |         pages_num = 1
88 |     else:
89 |         pages_num = int(pages_div.first.find_by_css('li.page_number').last.text)
90 |     for page in range(1, pages_num + 1):
91 |         browser.visit(f'https://www.pornhubpremium.com/pornstar/{name}/videos/premium?page={page}')
92 |         videos_div = browser.find_by_css('#moreData').first
93 |         videos_list = list(videos_div.find_by_css('li.videoblock'))
94 |         video_links += [i.find_by_css('div > div.thumbnail-info-wrapper.clearfix > span > a').first['href'] for i in
95 |                         videos_list]
96 |     print(f'loaded {len(video_links)} videos for pornstar {name} in total')
97 |     return video_links
98 | 
99 | 
100 | def porn_star_all_premium_videos(browser: WebDriver, name):
101 |     # example of type 1, no pagination: https://www.pornhubpremium.com/pornstar/sasha-foxxx/videos/premium
102 |     # example of type 1, pagination: https://www.pornhubpremium.com/pornstar/asa-akira/videos/premium
103 |     # example of type 2, no pagination: https://www.pornhubpremium.com/pornstar/madison-scott?premium=1
104 |     # example of type 2, pagination: https://www.pornhubpremium.com/pornstar/sasha-grey?premium=1&page=2
105 |     # there are 2 types of porn star pages
106 |     browser.visit(f'https://www.pornhubpremium.com/pornstar/{name}?premium=1')
107 |     video_links = []
108 |     if browser.is_element_present_by_id('profileHome'):
109 |         video_links = get_links_for_star_profile(browser, name, video_links)
110 |     elif browser.is_element_present_by_id('pornstarVideos'):
111 |         video_links = get_links_for_star_videos(browser, name, video_links)
112 |     # there has been a redirect
113 |     elif browser.url == 'https://www.pornhubpremium.com/pornstars':
114 |         print(f'star {name} does not exist')
115 |         return []
116 |     else:
117 |         raise RuntimeError('error with profile, something unknown')
118 |     return video_links
119 | 
120 | 
121 | def channel_all_premium_videos(browser: WebDriver, name):
122 |     browser.visit(f'https://www.pornhubpremium.com/channels/{name}/videos?premium=1')
123 |     pages_list = browser.find_by_css('#channelsProfile > div.pagination3 > ul > li')
124 |     if browser.title == 'Page Not Found':
125 |         print(f'Channel {name} does not exist.')
126 |         return []
127 |     elif len(pages_list) in [0, 1]:  # no pagination, so only one page
128 |         pages_num = 1
129 |     else:
130 |         pages_num = int(browser.find_by_css('#channelsProfile > div.pagination3 > ul > li.page_number').last.text)
131 | 
132 |     video_links = []
133 |     for page in range(1, pages_num + 1):
134 |         browser.visit(f'https://www.pornhubpremium.com/channels/{name}/videos?premium=1&page={page}')
135 |         videos_div = browser.find_by_css('ul#showAllChanelVideos').first
136 |         videos_list = list(videos_div.find_by_css('li.videoblock'))
137 |         video_links += [i.find_by_css('div > div.thumbnail-info-wrapper.clearfix > span > a').first['href'] for i in
138 |                         videos_list]
139 |     print(f'loaded {len(video_links)} videos for channel {name} in total')
140 |     return video_links
141 | 
142 | 
143 | def models_all_public_videos(browser: WebDriver, name):
144 |     browser.visit(f'https://www.pornhub.com/model/{name}/videos')
145 |     sel1 = '#channelsProfile > div.pagination3 > ul > li'
146 |     sel2 = '#videosTab > div > div > div.nf-wrapper > div.pagination3.paginationGated > ul > li'
147 |     pages_list1 = browser.find_by_css(sel1)
148 |     pages_list2 = browser.find_by_css(sel2)
149 |     sel, pages_list = max((sel1, pages_list1), (sel2, pages_list2), key=lambda x: len(x[1]))
150 |     if browser.title == 'Page Not Found':
151 |         print(f'Model {name} does not exist.')
152 |         return []
153 |     elif len(pages_list) in [0, 1]:  # no pagination, so only one page
154 |         pages_num = 1
155 |     else:
156 |         pages_num = int(browser.find_by_css(f'{sel}.page_number').last.text)
157 | 
158 |     video_links = []
159 |     for page in range(1, pages_num + 1):
160 |         browser.visit(f'https://www.pornhub.com/model/{name}/videos?page={page}')
161 |         videos_div = browser.find_by_css('ul#mostRecentVideosSection').first
162 |         videos_list = list(videos_div.find_by_css('li.videoblock'))
163 |         video_links += [i.find_by_css('div > div.thumbnail-info-wrapper.clearfix > span > a').first['href'] for i in
164 |                         videos_list]
165 |     print(f'loaded {len(video_links)} videos for model {name} in total')
166 |     return video_links
167 | 
168 | 
169 | def get_porn_star_list():
170 |     with open('to_download.yml', 'r') as fp:
171 |         try:
172 |             return yaml.safe_load(fp)['stars']
173 |         except yaml.YAMLError as exc:
174 |             print(exc)
175 | 
176 | 
177 | def get_channel_list():
178 |     with open('to_download.yml', 'r') as fp:
179 |         try:
180 |             return yaml.safe_load(fp)['channels']
181 |         except yaml.YAMLError as exc:
182 |             print(exc)
183 | 
184 | 
185 | def get_model_list():
186 |     with open('to_download.yml', 'r') as fp:
187 |         try:
188 |             return yaml.safe_load(fp)['models']
189 |         except yaml.YAMLError as exc:
190 |             print(exc)
191 | 
192 | 
193 | def add_video_if_not_exists(conn, video_id, video, source_name):
194 |     if conn.execute(f'select exists(select 1 from videos where video_id = \'{video_id}\')').fetchone()[0]:
195 |         return
196 |     with conn:
197 |         conn.execute('INSERT INTO videos (video_id, video_url, star_name, added_timestamp) VALUES (?, ?, ?, ?)',
198 |                      (video_id, video, source_name, datetime.now().isoformat()))
199 | 
200 | 
201 | def main():
202 |     client, browser = create_client()
203 |     porn_stars = get_porn_star_list()
204 |     channels = get_channel_list()
205 |     models = get_model_list()
206 | 
207 |     conn = sqlite3.connect('links.db')
208 |     conn.execute(
209 |         "CREATE TABLE IF NOT EXISTS videos (video_id varchar NOT NULL, star_name varchar NOT NULL, "
210 |         "video_url varchar NOT NULL, downloaded integer NOT NULL DEFAULT 0, download_forbidden int default NULL, "
211 |         "added_timestamp varchar default null, downloaded_timestamp varchar default null);")
212 | 
213 |     # for star_name in porn_stars:
214 |     #     videos_list = porn_star_all_premium_videos(browser, star_name)
215 |     #     for video in videos_list:
216 |     #         video_id = re.search(r'viewkey=([\d\w]+)', video).group(1)
217 |     #         add_video_if_not_exists(conn, video_id, video, star_name)
218 |     # print('done stars\n')
219 |     #
220 |     # for channel in channels:
221 |     #     videos_list = channel_all_premium_videos(browser, channel)
222 |     #     for video in videos_list:
223 |     #         video_id = re.search(r'viewkey=([\d\w]+)', video).group(1)
224 |     #         add_video_if_not_exists(conn, video_id, video, channel)
225 |     # print('done channels\n')
226 | 
227 |     browser = client.login('https://www.pornhub.com/login', 'https://www.pornhub.com/')
228 |     for model in models:
229 |         videos_list = models_all_public_videos(browser, model)
230 |         for video in videos_list:
231 |             video_id = re.search(r'viewkey=([\d\w]+)', video).group(1)
232 |             add_video_if_not_exists(conn, video_id, video, model)
233 |     print('done models\n')
234 |     print('done everything\n')
235 | 
236 | 
237 | if __name__ == '__main__':
238 |     main()
239 |     # todo: add script to move videos to dir by star name
--------------------------------------------------------------------------------
/credentials.sample.yml:
--------------------------------------------------------------------------------
1 | username: ""
2 | password: ""
--------------------------------------------------------------------------------
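The crawler's `EXISTS` check and the downloaders' `UPDATE` statements interpolate `video_id` into the SQL with f-strings. That works for these IDs, but bound parameters are the safer sqlite3 idiom; a sketch of the same existence check (a hypothetical helper, not part of the repo):

```python
import sqlite3


def video_exists(conn: sqlite3.Connection, video_id: str) -> bool:
    # same query as in add_video_if_not_exists, but with a bound
    # parameter instead of string interpolation
    row = conn.execute('select exists(select 1 from videos where video_id = ?)',
                       (video_id,)).fetchone()
    return bool(row[0])
```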
/download_videos.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | from datetime import datetime
3 | from time import sleep
4 | from urllib import request
5 | from urllib.error import URLError
6 | 
7 | import requests
8 | import progressbar
9 | import os.path as osp
10 | 
11 | import youtube_dl
12 | from splinter.driver.webdriver.chrome import WebDriver
13 | 
14 | from a_downloader.functions import custom_dl_download, ph_url_check, alive_check, get_dl_location
15 | from crawl_videos import create_client, create_ydl_client
16 | 
17 | 
18 | def is_download_forbidden(browser, conn, video_id):
19 |     download_blocked_div = '.video-actions-tabs > .video-action-tab.download-tab > .verifyEmailWrapper'
20 |     download_blocked_message = 'The download feature of this video has been disabled by'
21 |     if len(browser.find_by_css(download_blocked_div)) > 0 and download_blocked_message in browser.find_by_css(
22 |             download_blocked_div).text:
23 |         print('video download is forbidden\n')
24 |         with conn:
25 |             conn.execute(f'UPDATE videos SET downloaded = 0, download_forbidden = 1 where video_id = "{video_id}"')
26 |         return True
27 |     return False
28 | 
29 | 
30 | def click_download_tab(browser, download_tab_button_sel):
31 |     download_tab_button_active_sel = '.tab-menu-item.active[data-tab="download-tab"]'
32 |     counter = 0
33 |     while not browser.is_element_present_by_css(download_tab_button_active_sel):
34 |         if counter > 10:
35 |             print('cannot click on download tab')
36 |             return False
37 |         sleep(0.1)  # Time in seconds
38 |         button = browser.find_by_css(download_tab_button_sel)
39 |         browser.find_by_text('The download feature of this video has been disabled ')
40 |         if len(button) == 0:
41 |             print('disabled video download, trying alternative')
42 |             return False
43 |         button.click()
44 |         print('clicking on it\n')
45 |         counter += 1
46 |     return True
47 | 
48 | 
49 | def download_using_youtube_dl(ydl, url, pre_callback) -> bool:
50 |     pre_callback(url)
51 |     ydl._download_retcode = 0  # because this is not set to 0 before each download, it is turned just from 0 to 1
52 |     # so the line above resets it to the default state
53 |     download_ret_code = ydl.download([url])
54 |     return download_ret_code == 0
55 | 
56 | 
57 | def set_downloaded(conn, file_name, video_id):
58 |     print(file_name, 'downloaded\n')
59 |     with conn:
60 |         conn.execute(
61 |             f'UPDATE videos SET downloaded = 1, downloaded_timestamp = "{datetime.now().isoformat()}" '
62 |             f'where video_id = "{video_id}"')
63 | 
64 | 
65 | def download_official(query):
66 |     client, browser = create_client()  # create_client returns a (client, browser) pair
67 |     conn, videos_info = list_videos(query)
68 |     pbar = prepare_pbar(videos_info)
69 |     for i, video_info in enumerate(videos_info):
70 |         pbar.update(i)
71 |         video_info = dict(video_info)
72 |         video_id = video_info['video_id']
73 |         video_url = video_info['video_url']
74 |         browser.visit(video_url)
75 | 
76 |         while browser.is_element_present_by_css('.recaptchaContent'):  # sometimes wild captcha appears
77 |             print("CAPTCHA NEEDED")
78 |             sleep(60)
79 | 
80 |         if browser.is_element_present_by_css('.removed'):
81 |             # video has been removed
82 |             print('video has been removed\n')
83 |             with conn:
84 |                 conn.execute(f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"')
85 |             continue
86 |         if not browser.is_element_visible_by_css(
87 |                 '.premiumIconTitleOnVideo') and not browser.is_element_present_by_css('#videoTitle'):
88 |             # video has been removed
89 |             print('video is somehow broken and not premium\n')
90 |             with conn:
91 |                 conn.execute(f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"')
92 |             continue
93 | 
94 |         video_title = browser.find_by_css('#videoTitle').text  # type: str
95 |         # strip characters that are not allowed in Windows file names
96 |         video_title = video_title.replace(':', '').replace('?', '').replace('*', '').replace('"', '').replace('/', '') \
97 |             .replace('\\', '')
98 |         browser.find_by_id('player').click()  # pausing video
99 |         browser.find_by_tag('body')._element.send_keys('M')  # muting video
100 | 
101 |         file_name = f'videos/{video_id}-{video_title}.mp4'
102 |         if osp.exists(file_name):
103 |             with conn:
104 |                 conn.execute(f'UPDATE videos SET downloaded = 1 where video_id = "{video_id}"')
105 |             continue
106 | 
107 |         if browser.is_element_present_by_css('.tab-menu-item.js-paidDownload[data-tab="download-tab"]'):
108 |             # download requires extra payment
109 |             print('video download is paid\n')
110 |             with conn:
111 |                 conn.execute(f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"')
112 |             continue
113 | 
114 |         download_tab_button_sel = '.tab-menu-item[data-tab="download-tab"]'
115 |         vr_tab_button_sel = '.tab-menu-item[data-tab="vr-tab"]'
116 |         if not browser.is_element_present_by_css(download_tab_button_sel) \
117 |                 and browser.is_element_present_by_css(vr_tab_button_sel):
118 |             # VR videos have no download tab
119 |             print('video is vr, no download\n')
120 |             with conn:
121 |                 conn.execute(f'UPDATE videos SET download_forbidden = 1 where video_id = "{video_id}"')
122 |             continue
123 | 
124 |         click_download_tab(browser, download_tab_button_sel)
125 |         if is_download_forbidden(browser, conn, video_id):
126 |             continue
127 | 
128 |         download_link = get_download_link(browser)
129 |         # must have headers here, otherwise it behaves as the api and does not serve the video
130 |         for _ in range(5):
131 |             try:
132 |                 request.urlretrieve(download_link, file_name)
133 |                 break
134 |             except URLError:
135 |                 print('connection failed, trying again\n')
136 | 
137 |         set_downloaded(conn, file_name, video_id)
138 |     return pbar
139 | 
140 | 
141 | def download_ydl(ydl, query, downloaded_callback, pre_callback):
142 |     conn, videos_info = list_videos(query)
143 |     pbar = prepare_pbar(videos_info)
144 |     for i, video_info in enumerate(videos_info):
145 |         pbar.update(i)
146 |         video_info = dict(video_info)
147 |         video_id = video_info['video_id']
148 |         video_url = video_info['video_url']
149 |         download_success = download_using_youtube_dl(ydl, video_url, pre_callback)
150 |         if download_success:
151 |             downloaded_callback(conn, video_url, video_id)
152 |         else:
153 |             print(f'failed to download the video {video_id}, {video_url}')
154 |     return pbar
155 | 
156 | 
157 | def prepare_pbar(videos_info):
158 |     widgets = [progressbar.Percentage(), ' ', progressbar.Counter(), ' ', progressbar.Bar(), ' ',
159 |                progressbar.FileTransferSpeed()]
160 |     pbar = progressbar.ProgressBar(widgets=widgets, max_value=len(videos_info)).start()
161 |     return pbar
162 | 
163 | 
164 | def list_videos(query):
165 |     conn = sqlite3.connect('links.db')
166 |     conn.row_factory = sqlite3.Row
167 |     videos_info = conn.execute(query).fetchall()
168 |     return conn, videos_info
169 | 
170 | 
171 | def get_download_link(browser):
172 |     sizes = [720, 480]
173 |     download_link = None
174 |     for size in sizes:
175 |         if len(browser.find_link_by_text(f' {size}p')) == 0:
176 |             # size not existing, trying another
177 |             continue
178 |         download_link = browser.find_link_by_text(f' {size}p').first['href']
179 |         break
180 |     if download_link is None:
181 |         raise RuntimeError('link for corresponding size not found')
182 |     return download_link
183 | 
184 | 
185 | def check_url(url):
186 |     ph_url_check(url)
187 |     alive_check(url)
188 | 
189 | 
190 | def main():
191 |     use_ydl = True
192 |     if use_ydl:
193 |         ydl = create_ydl_client()
194 |         pbar = download_ydl(ydl, 'select * from videos where downloaded = 0 and download_forbidden isnull',
195 |                             set_downloaded, check_url)
196 |     else:
197 |         pbar = download_official('select * from videos where downloaded = 0 and download_forbidden isnull')
198 | 
199 |     pbar.finish()
200 |     print('done')
201 | 
202 | 
203 | if __name__ == '__main__':
204 |     main()
--------------------------------------------------------------------------------
/download_xhamster.py:
--------------------------------------------------------------------------------
1 | # %%
2 | import re
3 | import sqlite3
4 | from datetime import datetime
5 | 
6 | from splinter import Browser
7 | from splinter.driver.webdriver.chrome import WebDriver
8 | 
9 | from a_downloader.functions import alive_check
10 | from crawl_videos import create_ydl_client
11 | from download_videos import download_ydl
12 | 
13 | 
14 | # %%
15 | def add_video_if_not_exists_xhamster(conn, video_id, video, source_name):
16 |     if conn.execute(f'select exists(select 1 from videos_xhamster where video_id = \'{video_id}\')').fetchone()[0]:
17 |         return
18 |     with conn:
19 |         conn.execute(
20 |             'INSERT INTO videos_xhamster (video_id, video_url, star_name, added_timestamp) VALUES (?, ?, ?, ?)',
21 |             (video_id, video, source_name, datetime.now().isoformat()))
22 | 
23 | 
24 | def list_videos_page(browser: WebDriver, url):
25 |     browser.visit(url)
26 |     pages_div = browser.find_by_css(
27 |         'body > div.main-wrap > main > div > article > div.pornstar-content > div.pornstar-content__main > div.index-videos.mixed-section > div.pager-section > div > ul > li')
28 |     # one is prev and one is next
29 |     pages_num = max(len(pages_div) - 2, 1)
30 |     video_links = []
31 |     for page in range(1, pages_num + 1):
32 |         browser.visit(f"{url}/{page}")
33 |         videos_div = browser.find_by_css(
34 |             'body > div.main-wrap > main > div > article > div.pornstar-content > div.pornstar-content__main > div.index-videos.mixed-section > div:nth-last-child(2) > div').first
35 |         videos_list = list(videos_div.find_by_css('div.thumb-list__item.video-thumb'))
36 |         video_links += [i.find_by_css('a').first['href'] for i in videos_list]
37 |     print(f'loaded {len(video_links)} videos from url {url} in total')
38 |     return video_links
39 | 
40 | 
41 | def list_videos_creator(conn, browser: WebDriver, creator):
42 |     url = f'https://xhamster.com/creators/{creator}'
43 |     videos_list = list_videos_page(browser, url)
44 |     videos_list += list_videos_page(browser, f"{url}/exclusive")
45 |     for video in videos_list:
46 |         video_id = re.search(r'/videos/([\d\w-]+)', video).group(1)
47 |         add_video_if_not_exists_xhamster(conn, video_id, video, creator)
48 | 
49 | 
50 | def crawl_videos():
51 |     conn = sqlite3.connect('links.db')
52 |     conn.execute(
53 |         "CREATE TABLE IF NOT EXISTS videos_xhamster (video_id varchar NOT NULL, star_name varchar NOT NULL, "
54 |         "video_url varchar NOT NULL, downloaded integer NOT NULL DEFAULT 0, "
55 |         "added_timestamp varchar default null, downloaded_timestamp varchar default null);")
56 |     browser = Browser('chrome')
57 |     browser.visit('https://xhamster.com')
58 |     browser.find_by_css(
59 |         'body > div.cookies-modal__wrapper > div.cookies-modal > div.cookies-modal-footer > div > button.xh-button.button.cmd-button-accept-all.green.large2.square > span').first.click()
60 |     list_videos_creator(conn, browser, 'angel')
61 |     print('done')
62 |     browser.quit()
63 | 
64 | 
65 | def check_url(url):
66 |     alive_check(url)
67 | 
68 | 
69 | def set_downloaded(conn, file_name, video_id):
70 |     print(file_name, 'downloaded\n')
71 |     with conn:
72 |         conn.execute(
73 |             f'UPDATE videos_xhamster SET downloaded = 1, downloaded_timestamp = "{datetime.now().isoformat()}" '
74 |             f'where video_id = "{video_id}"')
75 | 
76 | 
77 | def download_videos():
78 |     ydl = create_ydl_client('videos_xhamster', False)
79 |     pbar = download_ydl(ydl, 'select * from videos_xhamster where downloaded = 0', set_downloaded, check_url)
80 |     pbar.finish()
81 |     print('done')
82 | 
83 | 
84 | def main():
85 |     # crawl_videos()
86 |     download_videos()
87 | 
88 | 
89 | # %%
90 | if __name__ == '__main__':
91 |     main()
--------------------------------------------------------------------------------
/fapello_download.py:
--------------------------------------------------------------------------------
1 | import os
2 | import os.path as osp
3 | import requests
4 | import yaml
5 | from lxml import html
6 | import progressbar
7 | 
8 | from requests import Session
9 | 
10 | hs = {
11 |     'User-Agent': 'Mozilla/5.0',
12 |     'X-Requested-With': 'XMLHttpRequest',
13 |     'Referer': 'https://permit.pcta.org/application/'
14 | }
15 | 
16 | 
17 | # from https://stackoverflow.com/a/58656261/5224881
18 | def download_file(s: Session, url, filename):
19 |     # create response object
20 |     r = s.get(url, stream=True)
21 |     headers = r.headers
22 |     size = int(headers["Content-Length"])
23 |     # download started
24 |     read = 0
25 |     with open(filename, 'wb') as f:
26 |         for chunk in r.iter_content(chunk_size=1024 * 1024):
27 |             read += len(chunk)
28 |             if chunk:
29 |                 f.write(chunk)
30 | 
31 |     if size >= 0 and read < size:
32 |         raise Exception(
33 |             "retrieval incomplete: got only %i out of %i bytes"
34 |             % (read, size))
35 | 
36 | 
37 | def download_creator(creator_name, assume_naming=True, skip_failed=True):
38 |     print(f'downloading {creator_name=}')
39 |     s = requests.Session()
40 |     s.headers.update(hs)
41 |     print('loading failed urls')
42 |     failed_txt = 'fapello_failed.txt'
43 |     if osp.exists(failed_txt):
44 |         with open(failed_txt, 'r', encoding='utf-8') as f:
45 |             failed_urls = set(map(lambda x: x.strip(), f.readlines()))
46 |     else:
47 |         failed_urls = set()
48 |     save_dir = 'fapello'
49 |     base_url = 'https://fapello.com/{}/{}/'
50 |     widgets = [progressbar.Percentage(), ' ', progressbar.Counter(), ' ', progressbar.Bar(), ' ',
51 |                progressbar.FileTransferSpeed()]
52 |     main_url = base_url[:-3].format(creator_name)
53 |     response = s.get(main_url, allow_redirects=False)  # requests follows redirects by default, which would hide the 302
54 |     if response.status_code == 302:
55 |         print(f'creator page {main_url=} does not exist')
56 |         return
57 |     tree = html.fromstring(response.content)
58 |     links = tree.xpath('//*[@id="content"]/div[1]/a')
59 |     if len(links) == 0:
60 |         print(f'creator page {main_url=} probably does not exist')
61 |         return
62 |     max_val = int(links[0].attrib['href'].split('/')[-2])
63 |     pbar = progressbar.ProgressBar(widgets=widgets, max_value=max_val).start()
64 |     for i in range(1, max_val + 1):
65 |         pbar.update(i)
66 |         i_url = base_url.format(creator_name, i)
67 | 
68 |         if assume_naming:
69 |             save_path = osp.join(save_dir, creator_name, f'{creator_name}_{i:04}.jpg')
70 |             if osp.exists(save_path):
71 |                 continue
72 | 
73 |         if skip_failed and i_url in failed_urls:
74 |             continue
75 |         response = s.get(i_url, allow_redirects=False)
76 |         if response.status_code != 200:
77 |             print('non-existing page, storing to the list')
78 |             with open(failed_txt, 'a+', encoding='utf-8') as f:
79 |                 f.write(i_url + '\n')
80 |             continue
81 | 
82 |         tree = html.fromstring(response.content)
83 |         img_path = tree.xpath('//*[@id="wrapper"]/div[2]/div/div/div/div[2]/a/img')
84 |         if len(img_path) == 0:
85 |             # not an image, try video
86 |             vid_path = tree.xpath('//*[@id="wrapper"]/div[2]/div/div/div/div[2]/video/source')
87 |             if len(vid_path) == 0:
88 |                 # not even a video
89 |                 print(f'skipping page {i_url=}, probably invalid')
90 |                 continue
91 |             else:
92 |                 target_url = vid_path[0].attrib['src']
93 |                 save_path = osp.join(save_dir, creator_name, osp.basename(target_url))
94 |         else:
95 |             target_url = img_path[0].attrib['src']
96 |             save_path = osp.join(save_dir, creator_name, osp.basename(target_url))
97 |         os.makedirs(osp.dirname(save_path), exist_ok=True)
98 |         if osp.exists(save_path):
99 |             continue
100 |         download_file(s, target_url, save_path)
101 |     pbar.finish()
102 | 
103 | 
104 | def get_creator_list():
105 |     with open('to_download_fapello.yml', 'r') as fp:
106 |         try:
107 |             return yaml.safe_load(fp)['stars']
108 |         except yaml.YAMLError as exc:
109 |             print(exc)
110 | 
111 | 
112 | def main():
113 |     creator_names = get_creator_list()
114 |     for creator_name in creator_names:
115 |         download_creator(creator_name)
116 | 
117 | 
118 | if __name__ == '__main__':
119 |     main()
--------------------------------------------------------------------------------
/links.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/racinmat/premium-downloader/ed849ee7fc17a05f5d65fbb16658f56a85a1c6b2/links.db
--------------------------------------------------------------------------------
/mega_list_sizes.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import subprocess
4 | import traceback
5 | 
6 | import yaml
7 | 
8 | 
9 | # upload to server and then run `source activate py310`
10 | def examine_dir(a_dir, print_files, show_errors, max_depth, depth=0):
11 |     output_bytes = subprocess.check_output(f'mega-ls -lh "{a_dir}"', shell=True)
12 |     try:
13 |         output = output_bytes.decode('utf-8').rstrip()
14 |         parse_ls(a_dir, print_files, show_errors, max_depth, output, depth)
15 |     except Exception as e:
16 |         if show_errors:
17 |             traceback.print_exc()
18 |         hash_a_dir = str(hash(a_dir))
19 |         print(f'error in decoding {a_dir}, storing it as {hash_a_dir}')
20 |         with open(f'/volume1/shared_data/video/other/of_leaks/error_output_{hash_a_dir}.txt', 'wb') as f:
21 |             f.write(output_bytes)
22 | 
23 | 
24 | def parse_ls(a_dir, print_files, show_errors, max_depth, output, depth=0):
25 |     # print(output)
26 |     lines = output.split('\n')
27 |     header_line = lines[0]
28 |     rows_line = lines[1:]
29 |     headers = re.split(r'\s+', header_line)
30 |     rows = [re.split(r'\s+', r, maxsplit=len(headers)) for r in rows_line]
31 |     # print('rows:')
32 |     # print(rows)
33 |     for flags, vers, size, date1, date2, name in rows:
34 |         is_dir = flags == 'd---'
35 |         if is_dir:
36 |             inner_dir = a_dir + '/' + name
37 |             size_lines = subprocess.check_output(f'mega-du -h "{inner_dir}"', shell=True).decode('utf-8')
38 |             size_line = size_lines.split('\n')[1]
39 |             size = ' '.join(re.split(r'\s+', size_line)[-2:])
40 |             print(f'{" " * 4 * (depth+1)}{size}, {name}')
41 |             # print(f'{" " * 2 * (depth+1)}examining it')
42 |             if depth + 1 <= max_depth:
43 |                 examine_dir(inner_dir, print_files, show_errors, max_depth, depth+1)
44 |         else:
45 |             size = f'{size} {date1}'
46 |             date1 = date2
47 |             # print(f'{size=}, {date1=}, {date2}, {name=}')
48 |             date2, name = name.split(' ', maxsplit=1)
49 |             if print_files:
50 |                 print(f'{" " * 4 * (depth+1)}{size}, {name}')
51 | 
52 | 
53 | def main():
54 |     with open('mega_link.yaml', mode='r', encoding='utf-8') as fp:
55 |         main_link = yaml.safe_load(fp)
56 |     # main_dir = main_link
57 |     # os.system(f'mega-login {main_dir}')
58 |     print_files = False
59 |     show_errors = False
60 |     max_depth = 4
61 |     examine_dir('', print_files, show_errors, max_depth)
62 | 
63 | 
64 | if __name__ == '__main__':
65 |     main()
66 |     # run as
67 |     # source activate py310
68 |     # cd ~/tmp/pycharm_project_395
69 |     # python mega_list_sizes.py
70 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output.txt 2>&1
71 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output_dirs_only.txt 2>&1
72 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output_small.txt 2>&1
73 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output_small_5.txt 2>&1
74 |     # python mega_list_sizes.py > /volume1/shared_data/video/other/of_leaks/total_output_small_4.txt 2>&1
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # Premium downloader
2 | 
3 | This tool downloads premium Pornhub videos.
4 | 
5 | Running `python crawl_videos.py` gathers video links for the pornstars, channels and models listed in `to_download.yml` and stores them in `links.db`.
6 | 
7 | Running `python download_videos.py` reads the stored links and downloads the videos.
8 | 
9 | ```bash
10 | conda create -n crawling python=3.9
11 | conda activate crawling
12 | conda install sqlite certifi
13 | ```
14 | 
15 | Download the matching chromedriver from https://googlechromelabs.github.io/chrome-for-testing/#stable
16 | and save it as `chromedriver.exe`.
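17 | 
18 | ## Configuration
19 | 
20 | Copy `credentials.sample.yml` to `credentials.yml` and fill in your login. Copy `to_download.sample.yml` to `to_download.yml`;
21 | `crawl_videos.py` reads the `stars`, `channels` and `models` lists from it, so a fuller file looks like this (the names below are placeholders):
22 | 
23 | ```yaml
24 | stars:
25 |   - sasha-grey
26 | channels:
27 |   - some-channel-slug
28 | models:
29 |   - some-model-slug
30 | ```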
--------------------------------------------------------------------------------
/to_download.sample.yml:
--------------------------------------------------------------------------------
1 | stars:
2 |   - sasha-grey
3 |   - rae-lil-black
--------------------------------------------------------------------------------
/to_download_fapello.sample.yml:
--------------------------------------------------------------------------------
1 | stars:
2 |   - bonbibonkers
--------------------------------------------------------------------------------
/xhamster.py:
--------------------------------------------------------------------------------
1 | # https://gist.github.com/mopemope/891774
2 | # NOTE: standalone legacy Python 2 script (urllib2, psyco, xrange), kept from the gist above
3 | from os import path
4 | from werkzeug import secure_filename
5 | import eventlet
6 | from eventlet.green import urllib2
7 | from pyquery import PyQuery as pq
8 | from urlparse import urlparse
9 | import psyco
10 | 
11 | psyco.full()
12 | 
13 | search_urls = [
14 |     "http://xhamster.com/channels/new-asian-%s.html",
15 | ]
16 | 
17 | detail_urls = []
18 | 
19 | id_mode = True
20 | 
21 | save_path = "/home/ma2/Public/xhamster/"
22 | pool = eventlet.GreenPool(2)
23 | q = []
24 | 
25 | import re
26 | 
27 | base_url = "http://www.xhamster.com"
28 | download_re = re.compile("'file':\s*'([\w\d\.:/_\-\?=]*)'", re.M)
29 | download_re2 = re.compile("'srv':\s*'([\w\d.:/_]*)", re.M)
30 | 
31 | 
32 | def get_pagelist(url, page=1):
33 |     q = []
34 |     conn = urllib2.urlopen(url % page)
35 |     page = conn.read()
36 |     d = pq(page)
37 |     for anchor in d("a"):
38 |         href = pq(anchor).attr.href
39 |         if href.startswith("/movies"):
40 |             q.append(base_url + href)
41 |     return q
42 | 
43 | 
44 | def get_download_url(url):
45 |     conn = urllib2.urlopen(url)
46 |     page = conn.read()
47 |     file_match = download_re.search(page)
48 |     srv_match = download_re2.search(page)
49 |     if srv_match and file_match:
50 |         file_name = file_match.group(1)
51 |         srv = srv_match.group(1)
52 |         download_url = "%s/flv2/%s" % (srv, file_name)
53 |         file_name = path.basename(download_url)
54 |         return url, download_url, file_name
55 | 
56 | 
57 | def download_flv(url, down_url, file_name):
58 |     print "'%s' ---- Try Download ----" % url
59 | 
60 |     out_path = path.join(save_path, file_name)
61 |     if not file_name:
62 |         print "'%s' ** Not Found Link ** " % url
63 |         return
64 | 
65 |     partial = False
66 |     try:
67 |         conn = urllib2.urlopen(down_url)
68 |         length = conn.info()['Content-Length']
69 |         length = int(length)
70 |         if length < 1024 * 1024 * 150 or length > 1024 * 1024 * 700:
71 |             print "*** '%s' is small! Skip!!!'%s' ***" % (url, length)
72 |             return
73 | 
74 |         if path.exists(out_path):
75 |             size = path.getsize(out_path)
76 |             if size < length:
77 |                 r = "bytes=%s-" % size
78 |                 req = urllib2.Request(down_url, headers={"Range": r})
79 |                 conn = urllib2.urlopen(req)
'%s' ==" % (url, file_name) 84 | print 85 | "'%s' == File '%s' Size: %d/%d'" % (url, file_name, size, length) 86 | partial = True 87 | else: 88 | print 89 | "'%s' == Downloaded '%s' ==" % (url, file_name) 90 | return 91 | except: 92 | import traceback 93 | print 94 | traceback.format_exc() 95 | pool.spawn_n(download, url) 96 | return 97 | 98 | if partial: 99 | f = open(out_path, "rb+") 100 | f.seek(0, 2) 101 | else: 102 | f = open(out_path, "wb") 103 | 104 | print 105 | "'%s' == Start '%s' ==" % (url, file_name) 106 | while True: 107 | data = conn.read(1024 * 512) 108 | if not data: 109 | break 110 | f.write(data) 111 | # per = path.getsize(out_path) / float(length) * 100.0 112 | # print "'%s' == '%s' %d%% done. ==" % (url, file_name, per) 113 | print 114 | "'%s' == Finish '%s' ==" % (url, file_name) 115 | 116 | 117 | def download(url): 118 | if url.find("premium.xhamster.com") >= 0: 119 | return 120 | url, download_url, file_name = get_download_url(url) 121 | download_flv(url, download_url, file_name) 122 | 123 | 124 | def start(url, min_page=1, max_page=100): 125 | for i in xrange(min_page, max_page + 1): 126 | urls = get_pagelist(url, page=i) 127 | q.extend(urls) 128 | q.reverse() 129 | while q: 130 | url = q.pop() 131 | pool.spawn_n(download, url) 132 | 133 | 134 | if __name__ == '__main__': 135 | 136 | for url in search_urls: 137 | start(url=url) 138 | pool.waitall() --------------------------------------------------------------------------------