├── README.md ├── p_im_dl_lt.py ├── p_pl_dl_common.py ├── p_pl_dl_main.py ├── p_pl_dl_ph.py ├── p_pl_dl_pornve.py ├── p_pl_dl_pt.py ├── p_pl_dl_sb.py ├── p_pl_dl_xh.py ├── p_pl_dl_xv.py ├── p_pl_dl_youporn.py └── requirements.txt /README.md: -------------------------------------------------------------------------------- 1 | # p_pl_dl - Porn Playlist Downloader 2 | 3 | A porn playlist downloader using `yt-dlp` and `BeautifulSoup`, along with some limited support for image albums. 4 | 5 | Currently supports: 6 | 7 | - lewdthots (albums only) 8 | - pornhub 9 | - porntrex 10 | - spankbang 11 | - xhamster 12 | - xvideos 13 | 14 | *** 15 | *** 16 | 17 | ## Overview 18 | 19 | #### Basic Usage 20 | 21 | Call `p_pl_dl_main.py` from a command prompt. Pass in a text file with URLs using `-i`. Optionally, provide cookies with `-c` and specify the download destination with `-d`. 22 | 23 | For cookies, you may pass in either a single text file or a folder path containing multiple cookie text files. 24 | 25 | Videos from each site will be downloaded to a `\sites\` subfolder within the download destination (or the current working directory if `-d` is not given). 26 | 27 | Using a single cookie text file: 28 | ``` 29 | python p_pl_dl_main.py -i "C:\MyFolder\urls.txt" -c "C:\MyCookieFolder\cookies.txt" -d "F:\DownloadDestination" 30 | ``` 31 | 32 | Using multiple cookie text files stored in a folder: 33 | 34 | ``` 35 | python p_pl_dl_main.py -i "C:\MyFolder\urls.txt" -c "C:\MyCookieFolder\" -d "F:\DownloadDestination" 36 | ``` 37 | 38 | You may also restrict downloads to a specific site using `-o`. This may be useful if your `urls.txt` has lots of playlists/videos across many sites, but you only want to scrape one of them. Pass in the full name of the site as given in the list of supported sites above. 39 | 40 | ``` 41 | python p_pl_dl_main.py -i "C:\MyFolder\urls.txt" -c "C:\MyCookieFolder\" -d "F:\DownloadDestination" -o "xhamster" 42 | python p_pl_dl_main.py -i "C:\MyFolder\urls.txt" -c "C:\MyCookieFolder\" -d "F:\DownloadDestination" -o "spankbang" 43 | ``` 44 | 45 | *** 46 | 47 | #### Input TXT w/ URLs 48 | 49 | The URL text file should have one URL per line. The URLs may be for individual videos or entire playlists. 50 | 51 | Example: 52 | 53 | ``` 54 | https://www.xvideos.com/video35247781/ 55 | https://www.xhamster.com/videos/busty-blonde-girl-get-fucked-with-nice-lingerie-14429903 56 | ``` 57 | 58 | #### Cookies 59 | 60 | Each cookie text file must have `# Netscape HTTP Cookie File` as its first line. If that line is not found, the file will not be recognized as a cookie file and will be ignored.
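For reference, here is a minimal sketch of the layout the parser expects. The Netscape format uses seven tab-separated fields per cookie (domain, include-subdomains flag, path, secure flag, expiry, name, value); the domain and values below are placeholders rather than real cookies:

```
# Netscape HTTP Cookie File
.example.com	TRUE	/	TRUE	1999999999	session_id	abc123
```

Only the last two fields of each line (cookie name and value) are actually read by `parseCookieFile` in `p_pl_dl_common.py`.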
61 | -------------------------------------------------------------------------------- /p_im_dl_lt.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import p_pl_dl_common as dl_common 4 | 5 | sExtractor = 'lewdthots' 6 | 7 | sTestUrl = r"https://lewdthots.com/meg-turney-lord-raiden-topless-onlyfans-set-leaked/" 8 | 9 | 10 | def run(sUrl, *args, **kwargs): 11 | print(f"Running {sExtractor} extractor for {sUrl}\r\n") 12 | 13 | html = dl_common.session.get(sUrl, headers=dl_common.dHeaders) 14 | 15 | soup = dl_common.BeautifulSoup(html.text, 'html.parser') 16 | eGallery = soup.find(attrs={"class": 'mace-gallery-teaser'}) # Get gallery element into soup 17 | lGallery = eval(eGallery.attrs['data-g1-gallery']) # Should eval to a list of dicts 18 | 19 | lImageUrls = [] 20 | for dImage in lGallery: 21 | sImageUrl = dImage['full'] 22 | sImageUrl = sImageUrl.replace("\\", "") 23 | lImageUrls += [sImageUrl] 24 | print(f"Found {len(lImageUrls)} images") 25 | 26 | sArchive = rf".\\sites\\{sExtractor}\\dl_hist_{sExtractor}.txt" 27 | 28 | # Parse out album name, then check whether this album has already been downloaded 29 | sAlbumName = sUrl.split("/")[-2] if sUrl[-1] == '/' else sUrl.split("/")[-1] 30 | sAlbumName = sAlbumName.replace("-", "_") # str.replace() returns a new string, so keep the result 31 | 32 | bRun = True 33 | try: 34 | with open(sArchive) as archive: 35 | if sAlbumName in archive.read(): 36 | print(f"Archive already has an entry for {sAlbumName}") 37 | print("Skipping...") 38 | bRun = False 39 | except: 40 | pass 41 | 42 | if bRun: 43 | # Create subdirectory for the album - there has to be a better (more Pythonic) way... 44 | lPathComponents = ['sites', sExtractor, sAlbumName] 45 | sPath = '' 46 | for idx, sPathComponent in enumerate(lPathComponents): 47 | sPath += sPathComponent 48 | try: 49 | os.mkdir(sPath) 50 | except Exception: 51 | pass 52 | sPath += '/' 53 | 54 | nImageNum = 1 55 | for sImageUrl in lImageUrls: 56 | sImageName = sImageUrl.split('/')[-1] 57 | print(f"Processing image {nImageNum:>03} : {sImageName}") 58 | nFileName = f"{nImageNum:>03}_{sImageName}" 59 | with open(os.path.join('sites', sExtractor, sAlbumName, nFileName), 'wb') as handler: 60 | response = dl_common.requests.get(sImageUrl, stream=True) 61 | handler.write(response.content) 62 | nImageNum += 1 63 | 64 | with open(sArchive, 'a') as archive: 65 | archive.write(sAlbumName + "\r\n") 66 | 67 | -------------------------------------------------------------------------------- /p_pl_dl_common.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import re 3 | import requests 4 | import yt_dlp.utils as ytdl_utils 5 | from bs4 import BeautifulSoup 6 | from time import sleep 7 | from requests.adapters import HTTPAdapter 8 | from requests.packages.urllib3.util.ssl_ import create_urllib3_context 9 | 10 | CIPHERS = 'DEFAULT:@SECLEVEL=2' 11 | 12 | dYdlOptions = {'continuedl' : True, 13 | 'nooverwrites' : True, 14 | 'ignoreerrors' : True, 15 | 'restrictfilenames' : True, 16 | 'writeinfojson' : True, 17 | 'writeannotations' : True, 18 | 'nopostoverwrites' : True, 19 | 'download_archive' : 'dl_hist_{}.txt', 20 | 'outtmpl' : None, 21 | 'retries' : 3, 22 | 'fragment_retries' : 3 23 | } 24 | 25 | dHeaders = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0"} 26 | 27 | session = requests.Session() 28 | dCookiesParsed = {} 29 | 30 | 31 | def randomizeHeader(): 32 | dHeaders['User-Agent'] = randomizeUserAgent() 33 | 34 | 35 | def 
randomizeUserAgent(): 36 | return ytdl_utils.random_user_agent() 37 | 38 | 39 | def parseCookieFile(sCookiesTxt): 40 | """ 41 | Parse a cookies text file and return a dictionary of key-value pairs 42 | compatible with requests. 43 | """ 44 | dCookies = {} 45 | with open(sCookiesTxt, 'r') as fp: 46 | for line in fp: 47 | # Need to keep "HTML Only" items for xhamster cookies 48 | if "#HttpOnly_xhamster.com" in line: 49 | line = line.replace("#HttpOnly_xhamster.com", ".xhamster.com") 50 | elif '#HttpOnly_.xhamster.com' in line: 51 | line = line.replace("#HttpOnly_.xhamster.com", ".xhamster.com") 52 | elif '#' in line or 'href' in line or len(line) == 1: 53 | continue 54 | 55 | if not re.match(r'^\#', line): 56 | lineFields = line.strip().split('\t') 57 | dCookies[lineFields[5]] = lineFields[6] 58 | 59 | global dCookiesParsed 60 | dCookiesParsed.update(dCookies) 61 | 62 | 63 | def parseCookies(sDirectory): 64 | """ 65 | Scans a directory for cookie text files. 66 | 67 | The cookie file must begin with: 68 | 69 | # Netscape HTTP Cookie File 70 | 71 | If that header line is not seen, the text file will be ignored. 72 | """ 73 | sRe = '# Netscape HTTP Cookie File' 74 | lTextFiles = glob.glob(rf"{sDirectory}\*.txt") 75 | 76 | for sTxt in lTextFiles: 77 | with open(sTxt, 'r') as fp: 78 | sFirstLine = fp.readline().rstrip() 79 | if sFirstLine == sRe: 80 | print(f"Parsing {sTxt} for cookies...") 81 | parseCookieFile(sTxt) 82 | else: 83 | print(f"Skipping {sTxt}...") 84 | sleep(0.250) 85 | sleep(1) 86 | 87 | 88 | def cookieHeaderStringGet(dCookies=None): 89 | if dCookies is None: 90 | dCookies = dCookiesParsed 91 | 92 | cookieString = '' 93 | for key, value in dCookies.items(): 94 | if cookieString != '': 95 | cookieString += '; ' 96 | cookieString += f"{key}={value}" 97 | 98 | return cookieString 99 | 100 | 101 | def addCipher(sPrefix): 102 | session.mount(sPrefix, CipherAdapter()) 103 | 104 | 105 | def runYtdl(): 106 | pass 107 | 108 | 109 | class Page: 110 | 111 | def __init__(self, url, headers=None): 112 | if headers is None: 113 | headers = dHeaders 114 | 115 | self.url = url 116 | self.content = session.get(url, headers=headers, cookies=dCookiesParsed) 117 | self.soup = BeautifulSoup(self.content.text, 'html.parser') 118 | self.videos = [] 119 | 120 | 121 | def _extract_video_urls(self, sFilter=None): 122 | """ 123 | Extract video URLs from a single playlist page. 
124 | """ 125 | for a in self.soup.find_all('a', href=True): 126 | href = a['href'] 127 | if 'http' not in href: 128 | continue 129 | if sFilter is not None and sFilter not in href: 130 | continue 131 | if href not in self.videos: 132 | self.videos.append(a['href']) 133 | 134 | 135 | def _html_to_text(self, sFileName=None): 136 | if sFileName is None: 137 | sFileName = "html_content.txt" 138 | text_file = open(sFileName, "w", encoding='utf-8') 139 | text_file.write(self.content.text) 140 | text_file.close() 141 | 142 | 143 | class CipherAdapter(HTTPAdapter): 144 | # Sourced from https://stackoverflow.com/questions/64967706/python-requests-https-code-403-without-but-code-200-when-using-burpsuite 145 | 146 | def init_poolmanager(self, *args, **kwargs): 147 | context = create_urllib3_context(ciphers=CIPHERS) 148 | kwargs['ssl_context'] = context 149 | return super(CipherAdapter, self).init_poolmanager(*args, **kwargs) 150 | 151 | def proxy_manager_for(self, *args, **kwargs): 152 | context = create_urllib3_context(ciphers=CIPHERS) 153 | kwargs['ssl_context'] = context 154 | return super(CipherAdapter, self).proxy_manager_for(*args, **kwargs) 155 | -------------------------------------------------------------------------------- /p_pl_dl_main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | import traceback 5 | from time import sleep 6 | 7 | import p_pl_dl_common as dl_common 8 | import p_pl_dl_ph as dl_ph 9 | import p_pl_dl_pt as dl_pt 10 | import p_pl_dl_pornve as dl_pornve 11 | import p_pl_dl_sb as dl_sb 12 | import p_pl_dl_xh as dl_xh 13 | import p_pl_dl_xv as dl_xv 14 | import p_im_dl_lt as dl_lt 15 | 16 | 17 | def main(argv): 18 | print() 19 | 20 | if argv.dest is not None: 21 | os.chdir(argv.dest) 22 | print(f"Working download directory: {os.getcwd()}") 23 | sleep(2) 24 | 25 | print() 26 | sSourceCookies = argv.cookies 27 | if sSourceCookies is not None: 28 | print(f"Cookies source: {sSourceCookies}") 29 | if ".txt" in sSourceCookies: # A single cookie text file was passed in 30 | dl_common.parseCookieFile(sSourceCookies) 31 | else: 32 | dl_common.parseCookies(sSourceCookies) 33 | else: 34 | print(f"No cookies provided!") 35 | sleep(0.5) 36 | 37 | print() 38 | sSourceUrls = argv.input 39 | print(f"Using the following input source: {sSourceUrls}") 40 | print() 41 | sleep(0.5) 42 | 43 | dSites = {'lewdthots' : False, 44 | 'pornhub' : False, 45 | 'porntrex' : False, 46 | 'pornve' : False, 47 | 'spankbang' : False, 48 | 'xhamster' : False, 49 | 'xvideos' : False, 50 | 'youporn' : False, 51 | } 52 | 53 | dExtractors = {'lewdthots' : dl_lt, 54 | 'pornhub' : dl_ph, 55 | 'porntrex' : dl_pt, 56 | 'pornve' : dl_pornve, 57 | 'spankbang' : dl_sb, 58 | 'xhamster' : dl_xh, 59 | 'xvideos' : dl_xv, 60 | } 61 | 62 | nVideoLimit = int(argv.limit) if argv.limit is not None else None 63 | print(f"Video limit per URL = {nVideoLimit}") 64 | 65 | # Get each URL into a dict 66 | dUrlDefs = {} 67 | with open(sSourceUrls) as fSourceUrls: 68 | sLines = fSourceUrls.readlines() 69 | for sLine in sLines: 70 | sUrl = sLine.strip() 71 | print(f"URL: {sUrl}") 72 | for sSite in dSites.keys(): 73 | if sSite in sLine: 74 | dSites[sSite] = True 75 | dUrlDefs[sUrl] = sSite 76 | print() 77 | print("Detected websites:") 78 | print(json.dumps(dSites, indent=4)) 79 | print() 80 | sleep(2) 81 | 82 | if argv.only is not None: 83 | argv.only = argv.only.lower() 84 | if argv.only in dSites.keys(): 85 | for key, value in dSites.items(): 86 | if argv.only == key: 87 | dSites[key] = True
88 | else: 89 | dSites[key] = False 90 | 91 | for sUrl, sSite in dUrlDefs.items(): 92 | if sSite in dExtractors.keys() and dSites[sSite]: 93 | try: 94 | dExtractors[sSite].run(sUrl, sCookieSource=None, nVideoLimit=nVideoLimit) # Cookies should already be parsed and available when going through main 95 | except: 96 | print("\r\n\r\n") 97 | traceback.print_exc() 98 | print("\r\n\r\n") 99 | continue 100 | else: 101 | print(f"No extractor available for {sSite} - {sUrl}") 102 | sleep(0.5) 103 | print() 104 | 105 | 106 | if __name__ == '__main__': 107 | argparser = argparse.ArgumentParser() 108 | argparser.add_argument('-i', '--input', help='Input TXT file with URLs to process', required=True) 109 | argparser.add_argument('-c', '--cookies', help='Input TXT file with cookies') 110 | argparser.add_argument('-d', '--dest', help='Download destination path') 111 | argparser.add_argument('-o', '--only', help='Only run a specific site') 112 | argparser.add_argument('-l', '--limit', help='Limit the number of videos') 113 | args = argparser.parse_args() 114 | main(args) 115 | -------------------------------------------------------------------------------- /p_pl_dl_ph.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from time import time 3 | import yt_dlp as youtube_dl 4 | 5 | import p_pl_dl_common as dl_common 6 | 7 | sExtractor = 'pornhub' 8 | 9 | 10 | def run(sUrl, sCookieSource=None, nVideoLimit=None, bDebug=False): 11 | print(f"Running {sExtractor} extractor for {sUrl}\r\n") 12 | 13 | if sCookieSource is not None: 14 | dl_common.parseCookieFile(sCookieSource) 15 | 16 | if dl_common.dCookiesParsed is None: 17 | print("WARNING :: No cookies were provided! Private videos/playlists will fail to download!\r\n") 18 | 19 | # Attempt initial connection 20 | html = dl_common.session.get(sUrl, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 21 | print(f"Initial connection status: {html.status_code}") 22 | if html.status_code == 403: 23 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! 
Private videos/playlists cannot be accessed without cookies!") 24 | elif html.status_code != 200: 25 | raise ConnectionError(f"Initial connection failed : Status {html.status_code}") 26 | print() 27 | 28 | if bDebug: 29 | # Save HTML content to a text file for debug 30 | text_file = open("html_content.txt", "w", encoding='utf-8') 31 | text_file.write(html.text) 32 | text_file.close() 33 | 34 | page = Page_Pornhub(sUrl) 35 | 36 | dYdlOptions = dict(dl_common.dYdlOptions) 37 | dYdlOptions['download_archive'] = rf".\\sites\\{sExtractor}\\{dYdlOptions['download_archive'].format(sExtractor)}" 38 | 39 | # Set options helpful for pornhub 40 | # dYdlOptions['retries'] = 10 41 | # dYdlOptions['fragment_retries'] = 10 42 | # dYdlOptions['keep_fragments'] = True 43 | # dYdlOptions['skip_unavailable_fragments'] = False 44 | # dYdlOptions['external_downloader_args'] = ["-m3u8_hold_counters", "3", "-max_reload", "3"] 45 | 46 | lFailedUrls = [] 47 | 48 | def ytdlLoop(lUrls, bLogFailures): 49 | nonlocal lFailedUrls 50 | 51 | for nIdx, sVideoUrl in enumerate(lUrls): 52 | print(f"Processing video {nIdx + 1} of {len(lUrls)} :: {sVideoUrl}") 53 | print() 54 | 55 | sVideoId = sVideoUrl.split('view_video.php?viewkey=')[-1] 56 | dYdlOptions['outtmpl'] = rf'.\\sites\\{sExtractor}\\{sVideoId}_%(title).125s.mp4' 57 | 58 | nStart = time() 59 | try: 60 | with youtube_dl.YoutubeDL(dYdlOptions) as ydl: 61 | ydl.download([sVideoUrl]) 62 | except: 63 | if bLogFailures: 64 | print(f"\r\nEncountered some error for URL = {sVideoUrl}") 65 | print(f"Adding it to the retry list...") 66 | lFailedUrls += [sVideoUrl] 67 | continue 68 | nStop = time() 69 | print(f"\r\nElapsed time for URL = {sVideoUrl}: {round((nStop - nStart) / 60, 2)} minutes\r\n") 70 | 71 | if nVideoLimit is not None and (nIdx + 1) >= nVideoLimit: 72 | print(f"Hit the specified maximum limit of {nVideoLimit}. Stopping...") 73 | break 74 | print() 75 | 76 | ytdlLoop(page.videos, bLogFailures=True) 77 | 78 | if lFailedUrls: 79 | print("Retrying URLs that failed...") 80 | for sUrl in lFailedUrls: 81 | print(sUrl) 82 | ytdlLoop(lFailedUrls, bLogFailures=False) 83 | 84 | 85 | class Page_Pornhub(dl_common.Page): 86 | 87 | def __init__(self, url): 88 | super().__init__(url) 89 | 90 | nPageStatus = self.content.status_code 91 | if nPageStatus != 200: 92 | if nPageStatus == 403: 93 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! Private videos/playlists cannot be accessed without cookies!") 94 | 95 | self.sUrlType = self._get_url_type() 96 | self._playlistId = self.url.split('.com/')[1].split('/')[0] if self.sUrlType == 'playlist' else None 97 | 98 | if self.sUrlType == 'video': 99 | self.videos.append(self.url) 100 | self._nVideos = 1 101 | elif self.sUrlType == 'playlist': 102 | print("Playlist detected. Getting videos...") 103 | self._sUrlBaseFormat = self.urlStandardize(self.url) 104 | self._extract_video_urls() 105 | self._nVideos = len(self.videos) 106 | print(f"Found {self._nVideos} video URLs in the playlist") 107 | 108 | 109 | def _get_url_type(self): 110 | # Video URLs are in the form of /.../view_video.php?viewkey=ph602a75a6151e9 111 | # Favorites are in the form of /.../videos/favorites?page=2 112 | # Playlists take the form https://www.pornhub.com/playlist/123465789 ... 
not sure how to handle pages for these yet 113 | if '/view_video.php' in self.url: 114 | sUrlType = 'video' 115 | elif 'videos/favorites' in self.url: 116 | sUrlType = 'playlist' 117 | elif '/playlist/' in self.url: 118 | raise ValueError("Regular pornhub playlists are unsupported...") 119 | else: 120 | raise ValueError(f"Unable to determine {sExtractor} URL type for {self.url}! Please submit a bug report!") 121 | return sUrlType 122 | 123 | 124 | def _extract_video_urls(self, sFilter=None): 125 | """ 126 | Extract video URLs from all playlist pages. 127 | """ 128 | lUrlVideos = [] 129 | nPage = 0 130 | while True: 131 | nPage += 1 132 | 133 | lPageVideos = self._extract_page_urls(nPage) 134 | if lPageVideos: 135 | lUrlVideos += lPageVideos 136 | print(f"Found {len(lPageVideos)} videos on page {nPage:02}...") 137 | else: 138 | print(f"No videos found on page {nPage}. Stopping...") 139 | break 140 | self.videos += lUrlVideos 141 | 142 | 143 | def _extract_page_urls(self, nPage, sFilter=None): 144 | """ 145 | Extract video URLs from a single page of the playlist. 146 | """ 147 | sUrlBase = "https://www.pornhub.com{}" 148 | 149 | for nAttempts in range(3): 150 | sUrlPage = self._sUrlBaseFormat.format(nPage) 151 | content = dl_common.session.get(sUrlPage, cookies=dl_common.dCookiesParsed) 152 | if "503 Service Temporarily Unavailable" in content.text: 153 | sleep(3) 154 | continue 155 | soup = dl_common.BeautifulSoup(content.text, 'html.parser') 156 | break 157 | 158 | lVideos = [] 159 | lTags = soup.find_all(attrs={"class": 'pcVideoListItem js-pop videoblock videoBox'}) 160 | for tag in lTags: 161 | if 'id' in tag.attrs.keys() and 'vfavouriteVideo' in tag.attrs['id']: 162 | for a in tag.find_all('a', href=True): 163 | href = a['href'] 164 | if 'view_video.php?' not in href: 165 | continue 166 | if '&pkey=' in href: 167 | continue 168 | if href not in self.videos: 169 | sUrlFull = sUrlBase.format(href) 170 | if sUrlFull not in lVideos: 171 | lVideos.append(sUrlFull) 172 | return lVideos 173 | 174 | 175 | def urlStandardize(self, sUrl): 176 | """ 177 | Make sure URL ends with '/' and tack on f-string brackets for iterating through pages. 178 | """ 179 | if sUrl.endswith('favorites'): 180 | sUrl += '?page={}' 181 | return sUrl 182 | -------------------------------------------------------------------------------- /p_pl_dl_pornve.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | import jsbeautifier 3 | import random 4 | import re 5 | import yt_dlp as youtube_dl 6 | 7 | import p_pl_dl_common as dl_common 8 | 9 | DEBUG = False 10 | 11 | sExtractor = 'pornve' 12 | sArchive = rf".\\sites\\{sExtractor}\\dl_hist_{sExtractor}.txt" 13 | 14 | 15 | def run(sUrl, sCookieSource=None, nVideoLimit=None, bDebug=False): 16 | print(f"Running {sExtractor} extractor for {sUrl}\r\n") 17 | 18 | if sCookieSource is not None: 19 | dl_common.parseCookieFile(sCookieSource) 20 | 21 | if dl_common.dCookiesParsed is None: 22 | print("WARNING :: No cookies were provided! Private videos/playlists will fail to download!\r\n") 23 | 24 | # Attempt initial connection 25 | dl_common.randomizeHeader() 26 | html = dl_common.session.get(sUrl, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 27 | print(f"Initial connection status: {html.status_code}") 28 | if html.status_code == 403: 29 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! 
Private videos/playlists cannot be accessed without cookies!") 30 | elif html.status_code != 200: 31 | raise ConnectionError(f"Initial connection failed : Status {html.status_code}") 32 | print() 33 | sleepRandom(1, 3) 34 | 35 | if bDebug: 36 | # Save HTML content to a text file for debug 37 | text_file = open("html_content.txt", "w", encoding='utf-8') 38 | text_file.write(html.text) 39 | text_file.close() 40 | 41 | page = Page_Pornve(sUrl) 42 | sleepRandom(3, 5) 43 | 44 | dYdlOptions = dict(dl_common.dYdlOptions) 45 | dYdlOptions['download_archive'] = None 46 | 47 | for nIdx, sVideoUrl in enumerate(page.videos): 48 | if page.sUrlType == 'playlist': 49 | print(f"Processing playlist video {nIdx + 1} of {page._nVideos} :: {sVideoUrl}") 50 | print() 51 | 52 | # Get the actual video stream info for a video link from a playlist 53 | if page.sUrlType == 'playlist': 54 | pageVideo = Page_Pornve(sVideoUrl) 55 | sVideoName = pageVideo._sVideoName 56 | sVideoStreamUrl = pageVideo.videos[0] 57 | sPageUrl = pageVideo.url 58 | else: 59 | sVideoName = page._sVideoName 60 | sVideoStreamUrl = page.videos[0] 61 | sPageUrl = page.url 62 | 63 | bRun = True 64 | try: 65 | with open(sArchive) as archive: 66 | if sPageUrl in archive.read(): 67 | print(f"Archive already has an entry for {sPageUrl}") 68 | print("Skipping...") 69 | bRun = False 70 | except: 71 | pass 72 | 73 | if bRun: 74 | dYdlOptions['outtmpl'] = rf'.\\sites\\{sExtractor}\\{sVideoName}.%(ext)s' 75 | 76 | with youtube_dl.YoutubeDL(dYdlOptions) as ydl: 77 | ydl.cache.remove() 78 | ret = ydl.download([sVideoStreamUrl]) 79 | 80 | # Need to do our own archiving since YTDL will treat everything with the name "index-v1-a1" because 81 | # of how the video is extracted in _extract_video_stream 82 | # YTDL ret 0 is good, 1 is bad 83 | if not ret: 84 | with open(sArchive, 'a') as archive: 85 | archive.write(sPageUrl + "\r\n") 86 | 87 | if nVideoLimit is not None and (nIdx + 1) >= nVideoLimit: 88 | print(f"Hit the specified maximum limit of {nVideoLimit}. Stopping...") 89 | break 90 | print() 91 | sleepRandom(3, 5) 92 | 93 | 94 | class Page_Pornve(dl_common.Page): 95 | 96 | def __init__(self, url): 97 | super().__init__(url) 98 | 99 | nPageStatus = self.content.status_code 100 | if nPageStatus != 200: 101 | if nPageStatus == 403: 102 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! Private videos/playlists cannot be accessed without cookies!") 103 | 104 | self.sUrlType = self._get_url_type() 105 | 106 | if self.sUrlType == 'video': 107 | sVideoStreamUrl = self._extract_video_stream() 108 | self.videos.append(sVideoStreamUrl) 109 | 110 | sVideoNameComponents = self.url.split('.html')[0].split('/')[-2:] 111 | self._sVideoName = '_'.join(reversed(sVideoNameComponents)) 112 | 113 | self._nVideos = 1 114 | elif self.sUrlType == 'playlist': 115 | print("Playlist detected. Getting videos...") 116 | 117 | lUrlComponents = self.url.split('/') 118 | self._playlistId = lUrlComponents[-2] if not lUrlComponents[-1] else lUrlComponents[-1] 119 | 120 | self._extract_video_urls() 121 | self._nVideos = len(self.videos) 122 | print(f"Found {self._nVideos} video URLs in the playlist\r\n") 123 | 124 | 125 | def _get_url_type(self): 126 | if '/playlist/' in self.url: 127 | sUrlType = 'playlist' 128 | else: 129 | sUrlType = 'video' 130 | return sUrlType 131 | 132 | 133 | def _extract_video_urls(self, sFilter=None): 134 | """ 135 | Extract video URLs from all playlist pages. 
136 | """ 137 | self._sUrlBaseFormat = f"https://pornve.com/?hide_search=1&op=search&playlist={self._playlistId}&sort_field=file_created&sort_order=down&page={{}}" 138 | 139 | lUrlVideos = [] 140 | nPage = 0 141 | while True: 142 | nPage += 1 143 | 144 | lPageVideos = self._extract_page_urls(nPage) 145 | if lPageVideos: 146 | lUrlVideos += lPageVideos 147 | print(f"Found {len(lPageVideos)} videos on page {nPage:02}...") 148 | else: 149 | print(f"No videos found on page {nPage}. Stopping...") 150 | break 151 | self.videos += lUrlVideos 152 | 153 | 154 | def _extract_page_urls(self, nPage, sFilter=None): 155 | """ 156 | Extract video URLs from a single page of the playlist. 157 | """ 158 | dl_common.randomizeHeader() 159 | for nAttempts in range(3): 160 | sUrlPage = self._sUrlBaseFormat.format(nPage) 161 | content = dl_common.session.get(sUrlPage, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 162 | if "503 Service Temporarily Unavailable" in content.text: 163 | if DEBUG: 164 | print("503 encountered! Sleeping...") 165 | sleepRandom() 166 | continue 167 | soup = dl_common.BeautifulSoup(content.text, 'html.parser') 168 | sleepRandom(1, 3) 169 | break 170 | 171 | lVideos = [] 172 | lProcessed = [] 173 | for a in soup.find_all('a', href=True): 174 | href = a['href'] 175 | if href in lProcessed: 176 | continue 177 | if f'?list={self._playlistId}' not in href: 178 | continue 179 | if sFilter is not None and sFilter not in href: 180 | continue 181 | if href not in self.videos: 182 | sCleanedUrl = self._clean_video_url(href) 183 | if sCleanedUrl is not None and sCleanedUrl not in lVideos: 184 | lVideos.append(sCleanedUrl) 185 | lProcessed += [href] 186 | return lVideos 187 | 188 | 189 | def _extract_video_stream(self): 190 | for nAttempts in range(3): 191 | content = dl_common.session.get(self.url, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 192 | if "503 Service Temporarily Unavailable" in content.text: 193 | if DEBUG: 194 | print("503 encountered! Sleeping...") 195 | sleepRandom() 196 | continue 197 | sleepRandom(1, 3) 198 | break 199 | 200 | sPackedCode = self._js_find_packed_code(content.text) 201 | sVideoStreamUrl = self._js_unpack_and_get_stream(sPackedCode) 202 | 203 | return sVideoStreamUrl 204 | 205 | 206 | def _clean_video_url(self, sUrlMasked, nAttempts=3): 207 | """ 208 | Unmask playlist videos. 209 | """ 210 | return sUrlMasked.split("?list=")[0] 211 | 212 | 213 | def _js_find_packed_code(self, htmlContent): 214 | lHtmlLines = htmlContent.split("\r\n") 215 | sPackedCode = None 216 | for row in lHtmlLines: 217 | if r"""eval(function(p,a,c,k,e,d)""" in row: 218 | sPackedCode = row 219 | if sPackedCode is None: 220 | raise ValueError("Did not find any packed JS code...") 221 | 222 | nIdxStart = len(sPackedCode) - len(sPackedCode.lstrip()) 223 | sPackedCode = sPackedCode[nIdxStart:] 224 | 225 | if sPackedCode[-1:] == '\n': 226 | sPackedCode = sPackedCode[:-1] 227 | 228 | return sPackedCode 229 | 230 | 231 | def _js_unpack_and_get_stream(self, packedData): 232 | """ 233 | Pass in obfuscated "eval(function(p,a,c,k,e,d)..." string 234 | """ 235 | url = None 236 | unpacked_data = jsbeautifier.beautify(packedData).split('"') 237 | for sData in unpacked_data: 238 | if ".m3u8" in sData: 239 | url = sData 240 | if url is None: 241 | raise ValueError("Could not find a video stream URL!") 242 | 243 | # unpacked_data_split = unpacked_data.split('>= nVideoLimit: 119 | print(f"Hit the specified maximum limit of {nVideoLimit}. 
Stopping...") 120 | break 121 | print() 122 | 123 | nTimeEnd = time() 124 | print(f"Run time: {round((nTimeEnd - nTimeStart) / 60, 2)} minutes") 125 | 126 | 127 | def urlBaseFormatGet(sUrl): 128 | """ 129 | Create the base f-string URL that be used to iteratively go through pages. 130 | 131 | Playlists and favorites use an AJAX format. They do not have simple page numbers. 132 | Search results use simple page numbers. 133 | """ 134 | sUrlBase = None 135 | if 'playlists' in sUrl: 136 | print("Using 'playlists' format...") 137 | nType = 10 138 | nPlaylistId = sUrl.split('/')[-2] 139 | sUrlBase = f'https://www.porntrex.com/my/playlists/{nPlaylistId}/?mode=async&function=get_block&block_id=list_videos_my_favourite_videos&fav_type={nType}&playlist_id={nPlaylistId}&sort_by=&from_my_fav_videos={{}}' 140 | elif 'favourites' in sUrl: 141 | print("Using 'favourites' format...") 142 | nType = 0 143 | nPlaylistId = 0 144 | sUrlBase = f'https://www.porntrex.com/my/favourites/videos/?mode=async&function=get_block&block_id=list_videos_my_favourite_videos&fav_type={nType}&playlist_id={nPlaylistId}&sort_by=&from_my_fav_videos={{}}' 145 | elif 'search' in sUrl: 146 | if not sUrl.endswith('/'): 147 | sUrl += '/' 148 | sUrl += '{}' 149 | sUrlBase = sUrl 150 | return sUrlBase 151 | 152 | 153 | class Video(dl_common.Page): 154 | 155 | def __init__(self, url): 156 | super().__init__(url) 157 | self.downloadUrl = None 158 | 159 | lUrlComponents = self.url.split('/') 160 | if lUrlComponents[-1] == '': 161 | lUrlComponents.pop(-1) 162 | self._lUrlComponents = lUrlComponents 163 | 164 | self.sVideoId = self._lUrlComponents[-2] 165 | self.sVideoName = self._lUrlComponents[-1] 166 | self.sFullName = '_'.join([self.sVideoId, self.sVideoName]) + '.mp4' 167 | 168 | self._extract_video_urls(sFilter='get_file') 169 | self._extract_video_largest() 170 | 171 | 172 | def _extract_video_largest(self): 173 | """ 174 | Get the file that has the largest file size, which should be the highest quality. 175 | """ 176 | nIdxBiggest = 0 177 | nBiggestSize = 0 178 | for index in range(len(self.videos)): 179 | nSize = int(dl_common.session.get(self.videos[index], cookies=dl_common.dCookiesParsed, stream=True).headers['Content-Length']) 180 | if nSize > nBiggestSize: 181 | nIdxBiggest = index 182 | nBiggestSize = nSize 183 | self.downloadUrl = self.videos[nIdxBiggest] 184 | -------------------------------------------------------------------------------- /p_pl_dl_sb.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from time import time 3 | import yt_dlp as youtube_dl 4 | import random 5 | 6 | import p_pl_dl_common as dl_common 7 | 8 | DEBUG = False 9 | 10 | sExtractor = 'spankbang' 11 | 12 | 13 | def run(sUrl, sCookieSource=None, nVideoLimit=None, bDebug=False): 14 | print(f"Running {sExtractor} extractor for {sUrl}\r\n") 15 | 16 | if sCookieSource is not None: 17 | dl_common.parseCookieFile(sCookieSource) 18 | 19 | if dl_common.dCookiesParsed is None: 20 | print("WARNING :: No cookies were provided! 
Private videos/playlists will fail to download!\r\n") 21 | 22 | # 20210619 :: Workaround for https://github.com/ppldl/p_pl_dl/issues/1 23 | # 20220710 :: Wrapping this in lazy try-except since I'm not sure this is needed anymore since I use ytdlp instead of ytdl 24 | try: 25 | dl_common.addCipher("https://spankbang.com") 26 | except: 27 | pass 28 | 29 | # Attempt initial connection 30 | dl_common.randomizeHeader() 31 | html = dl_common.session.get(sUrl, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 32 | print(f"Initial connection status: {html.status_code}") 33 | if html.status_code == 403: 34 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! Private videos/playlists cannot be accessed without cookies!") 35 | elif html.status_code != 200: 36 | raise ConnectionError(f"Initial connection failed : Status {html.status_code}") 37 | print() 38 | sleepRandom(1, 3) 39 | 40 | if bDebug: 41 | # Save HTML content to a text file for debug 42 | text_file = open("html_content.txt", "w", encoding='utf-8') 43 | text_file.write(html.text) 44 | text_file.close() 45 | 46 | page = Page_Spankbang(sUrl) 47 | sleepRandom(3, 5) 48 | 49 | dYdlOptions = dict(dl_common.dYdlOptions) 50 | dYdlOptions['download_archive'] = rf".\\sites\\{sExtractor}\\{dYdlOptions['download_archive'].format(sExtractor)}" 51 | # dYdlOptions['referer'] = 'https://spankbang.com' 52 | # dYdlOptions['user_agent'] = dl_common.dHeaders['User-Agent'] # Not needed - YTDL already has a UA randomizer 53 | 54 | # Store info on videos that have already been downloaded 55 | sArchive = rf".\\sites\\{sExtractor}\\dl_hist_{sExtractor}.txt" 56 | with open(sArchive) as file: 57 | lines = file.readlines() 58 | lVidHistory = [line.rstrip().split(' ')[1] for line in lines] 59 | print(lVidHistory) 60 | 61 | for nIdx, sVideoUrl in enumerate(page.videos): 62 | if page.sUrlType == 'playlist': 63 | print(f"Processing playlist video {nIdx + 1} of {page._nVideos} :: {sVideoUrl}") 64 | print() 65 | 66 | sVidId = sVideoUrl.split('/')[3] 67 | print(sVidId) 68 | if sVidId in lVidHistory: 69 | print(f"{sVidId} has already been downloaded. Moving on...") 70 | continue 71 | 72 | dYdlOptions['outtmpl'] = rf'.\\sites\\{sExtractor}\\%(title).125s.%(ext)s' 73 | with youtube_dl.YoutubeDL(dYdlOptions) as ydl: 74 | ydl.cache.remove() 75 | ydl.download([sVideoUrl]) 76 | 77 | if nVideoLimit is not None and (nIdx + 1) >= nVideoLimit: 78 | print(f"Hit the specified maximum limit of {nVideoLimit}. Stopping...") 79 | break 80 | print() 81 | sleepRandom() 82 | 83 | 84 | class Page_Spankbang(dl_common.Page): 85 | 86 | def __init__(self, url): 87 | super().__init__(url) 88 | 89 | nPageStatus = self.content.status_code 90 | if nPageStatus != 200: 91 | if nPageStatus == 403: 92 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! Private videos/playlists cannot be accessed without cookies!") 93 | 94 | self.sUrlType = self._get_url_type() 95 | self._playlistId = self.url.split('.com/')[1].split('/')[0] if self.sUrlType == 'playlist' else None 96 | self._sUrlBaseFormat = urlStandardize(self.url) 97 | 98 | if self.sUrlType == 'video': 99 | self.videos.append(self.url) 100 | self._nVideos = 1 101 | elif self.sUrlType == 'playlist': 102 | print("Playlist detected. 
Getting videos...") 103 | self._extract_video_urls() 104 | self._nVideos = len(self.videos) 105 | print(f"Found {self._nVideos} video URLs in the playlist") 106 | 107 | 108 | def _get_url_type(self): 109 | # Video URLs are in the form of spankbang.com/vwxyz/video/full-content-name 110 | # Playlists are in the form of spankbang.com/ijklm/playlist/name-of-playlist 111 | # Within a playlist, its videos are "masked" as spankbang.com/ijklm-abc123/playlist/name-of-playlist 112 | if '/video/' in self.url: 113 | sUrlType = 'video' 114 | elif '/playlist/' in self.url: 115 | if '-' in self.url: 116 | sUrlType = 'video_masked' 117 | else: 118 | sUrlType = 'playlist' 119 | else: 120 | raise ValueError(f"Unable to determine {sExtractor} URL type for {self.url}! Please submit a bug report!") 121 | return sUrlType 122 | 123 | 124 | def _extract_video_urls(self, sFilter=None): 125 | """ 126 | Extract video URLs from all playlist pages. 127 | """ 128 | lUrlVideos = [] 129 | nPage = 0 130 | timeStart = time() 131 | while True: 132 | nPage += 1 133 | 134 | lPageVideos = self._extract_page_urls(nPage) 135 | if lPageVideos: 136 | lUrlVideos += lPageVideos 137 | print(f"Found {len(lPageVideos)} videos on page {nPage:02}...") 138 | else: 139 | print(f"No videos found on page {nPage}. Stopping...") 140 | break 141 | timeStop = time() 142 | timeElapsed = round((timeStop - timeStart) / 60, 1) 143 | print(f"Time elapsed: {timeElapsed} minutes") 144 | self.videos += lUrlVideos 145 | 146 | 147 | def _extract_page_urls(self, nPage, sFilter=None): 148 | """ 149 | Extract video URLs from a single page of the playlist. 150 | """ 151 | dl_common.randomizeHeader() 152 | for nAttempts in range(3): 153 | sUrlPage = self._sUrlBaseFormat.format(nPage) 154 | content = dl_common.session.get(sUrlPage, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 155 | if "503 Service Temporarily Unavailable" in content.text: 156 | if DEBUG: 157 | print("503 encountered! Sleeping...") 158 | sleepRandom() 159 | continue 160 | soup = dl_common.BeautifulSoup(content.text, 'html.parser') 161 | sleepRandom(3, 5) 162 | break 163 | 164 | lVideos = [] 165 | lProcessed = [] 166 | for a in soup.find_all('a', href=True): 167 | href = a['href'] 168 | if href in lProcessed: 169 | continue 170 | if 'playlist' not in href: 171 | continue 172 | if '/lang/' in href: 173 | continue 174 | if f'{self._playlistId}-' not in href: 175 | continue 176 | if sFilter is not None and sFilter not in href: 177 | continue 178 | if href not in self.videos: 179 | sUnmaskedUrl = self._unmask_video_url(href) 180 | if sUnmaskedUrl is not None and sUnmaskedUrl not in lVideos: 181 | lVideos.append(sUnmaskedUrl) 182 | lProcessed += [href] 183 | return lVideos 184 | 185 | 186 | def _unmask_video_url(self, sUrlMasked, nAttempts=3): 187 | """ 188 | Unmask playlist videos. 
189 | """ 190 | sUrlFull = rf"https://spankbang.com{sUrlMasked}" 191 | if DEBUG: 192 | print(sUrlFull) 193 | 194 | # Load up the page using the masked URL from the playlist, then search its content for the real URL 195 | for nAttempt in range(nAttempts): 196 | content = dl_common.session.get(sUrlFull, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 197 | soup = dl_common.BeautifulSoup(content.text, 'html.parser') 198 | 199 | try: 200 | sCanonicalUrl = soup.find(attrs={'rel': 'canonical'}).attrs['href'] 201 | except: 202 | sCanonicalUrl = None 203 | 204 | if sCanonicalUrl is not None: 205 | break 206 | else: 207 | sleepRandom(1, 5) 208 | 209 | if sCanonicalUrl is None: 210 | print(f"Failed to unmask a URL for {sUrlMasked}") 211 | # sleepRandom(1, 3) 212 | return sCanonicalUrl 213 | 214 | 215 | def urlStandardize(sUrl): 216 | """ 217 | Make sure URL ends with '/' and tack on f-string brackets for iterating through pages. 218 | """ 219 | if sUrl[-1] != '/': 220 | sUrl += '/' 221 | sUrl += '{}' 222 | return sUrl 223 | 224 | 225 | def sleepRandom(nMin=5, nMax=10): 226 | """ 227 | Sleep for some random interval to help avoid tripping Cloudflare's anti-bot protection. 228 | """ 229 | nSleep = round(random.uniform(min(nMin, nMax), max(nMin, nMax)), 2) 230 | if DEBUG: 231 | print(nSleep) 232 | sleep(nSleep) 233 | -------------------------------------------------------------------------------- /p_pl_dl_xh.py: -------------------------------------------------------------------------------- 1 | import yt_dlp as youtube_dl 2 | 3 | import p_pl_dl_common as dl_common 4 | 5 | sExtractor = 'xhamster' 6 | 7 | # Something changed with xhamster where these headers are now required 8 | def _xhamsterHeaderGet(): 9 | dHeaders_xh = {'Host' : 'xhamster.com', 10 | 'User-Agent' : dl_common.randomizeUserAgent(), 11 | 'DNT' : '1', 12 | 'Connection' : 'keep-alive', 13 | 'Sec-Fetch-Dest' : 'document', 14 | 'Sec-Fetch-Mode' : 'navigate', 15 | 'Sec-Fetch-Site' : 'none', 16 | 'Sec-Fetch-User' : '?1', 17 | 'Cache-Control' : 'max-age=0', 18 | 'Cookie' : dl_common.cookieHeaderStringGet(), 19 | 'TE' : 'trailers' 20 | } 21 | return dHeaders_xh 22 | 23 | 24 | def run(sUrl, sCookieSource=None, nVideoLimit=None, bDebug=False): 25 | print(f"Running {sExtractor} extractor for {sUrl}\r\n") 26 | 27 | if sCookieSource is not None: 28 | dl_common.parseCookieFile(sCookieSource) 29 | 30 | if dl_common.dCookiesParsed is None: 31 | print("WARNING :: No cookies were provided! Private videos/playlists will fail to download!\r\n") 32 | 33 | if f'{sExtractor}.com/videos' in sUrl: 34 | sUrlType = 'video' 35 | elif f'{sExtractor}.com/my' in sUrl: 36 | sUrlType = 'playlist' 37 | else: 38 | raise ValueError(f"Unable to determine {sExtractor} URL type for {sUrl}! Please submit a bug report!") 39 | 40 | dXhamsterHeader = _xhamsterHeaderGet() 41 | 42 | # Attempt initial connection 43 | html = dl_common.session.get(sUrl, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 44 | print(f"Initial connection status: {html.status_code}") 45 | if html.status_code == 403: 46 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! 
Private videos/playlists cannot be accessed without cookies!") 47 | elif html.status_code != 200: 48 | raise ConnectionError(f"Initial connection failed : Status {html.status_code}") 49 | print() 50 | 51 | if bDebug: 52 | # Save HTML content to a text file for debug 53 | text_file = open("html_content.txt", "w", encoding='utf-8') 54 | text_file.write(html.text) 55 | text_file.close() 56 | 57 | lUrlVideos = [] 58 | if sUrlType == 'playlist': 59 | print("Playlist detected. Getting videos...") 60 | sUrlBaseFormat = urlStandardize(sUrl) 61 | nPage = 0 62 | while True: 63 | nPage += 1 64 | print(f"Attempting page {nPage:02}") 65 | sUrlPage = sUrlBaseFormat.format(f'{nPage:02}') 66 | page = dl_common.Page(sUrlPage, headers=dXhamsterHeader) 67 | nPageStatus = page.content.status_code 68 | if nPageStatus != 200: 69 | if nPageStatus == 403: 70 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! Private videos/playlists cannot be accessed without cookies!") 71 | elif nPageStatus == 404: 72 | print(f"Page {nPage} returned 404!") 73 | print(f"Assuming page {nPage - 1} was the last page of the playlist") 74 | break 75 | 76 | if "Page not found" in page.content.text: 77 | break 78 | 79 | page._extract_video_urls() 80 | if page.videos: 81 | lUrlVideos += page.videos 82 | else: 83 | break 84 | 85 | # Remove non-video URLs that may have been picked up 86 | lTemp = [] 87 | for sUrl in lUrlVideos: 88 | if 'com/videos/recommended' in sUrl: 89 | continue 90 | if 'com/videos' in sUrl: 91 | lTemp += [sUrl] 92 | lUrlVideos = lTemp 93 | 94 | nNumVideos = len(lUrlVideos) 95 | print(f"Found {nNumVideos} video URLs in the playlist") 96 | if bDebug: 97 | for sUrl in lUrlVideos: 98 | print(sUrl) 99 | 100 | elif sUrlType == 'video': 101 | lUrlVideos = [sUrl] 102 | 103 | dYdlOptions = dict(dl_common.dYdlOptions) 104 | dYdlOptions['download_archive'] = rf".\\sites\\{sExtractor}\\{dYdlOptions['download_archive'].format(sExtractor)}" 105 | 106 | for nIdx, sVideoUrl in enumerate(lUrlVideos): 107 | if sUrlType == 'playlist': 108 | print(f"Processing playlist video {nIdx + 1} of {nNumVideos} :: {sVideoUrl}") 109 | print() 110 | 111 | dYdlOptions['outtmpl'] = rf'.\\sites\\{sExtractor}\\%(title)s.%(ext)s' 112 | 113 | with youtube_dl.YoutubeDL(dYdlOptions) as ydl: 114 | ydl.download([sVideoUrl]) 115 | 116 | if nVideoLimit is not None and (nIdx + 1) >= nVideoLimit: 117 | print(f"Hit the specified maximum limit of {nVideoLimit}. Stopping...") 118 | break 119 | print() 120 | 121 | 122 | def urlStandardize(sUrl): 123 | """ 124 | Make sure URL ends with '/' and tack on f-string brackets for iterating through pages. 125 | """ 126 | if sUrl[-1] != '/': 127 | sUrl += '/' 128 | sUrl += '{}' 129 | return sUrl 130 | -------------------------------------------------------------------------------- /p_pl_dl_xv.py: -------------------------------------------------------------------------------- 1 | import yt_dlp as youtube_dl 2 | 3 | import p_pl_dl_common as dl_common 4 | 5 | sExtractor = 'xvideos' 6 | 7 | 8 | def run(sUrl, sCookieSource=None, nVideoLimit=None, bDebug=False): 9 | print(f"Running {sExtractor} extractor for {sUrl}\r\n") 10 | 11 | if sCookieSource is not None: 12 | dl_common.parseCookieFile(sCookieSource) 13 | 14 | if dl_common.dCookiesParsed is None: 15 | print("WARNING :: No cookies were provided! 
Private videos/playlists will fail to download!\r\n") 16 | 17 | # Attempt initial connection 18 | html = dl_common.session.get(sUrl, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 19 | print(f"Initial connection status: {html.status_code}") 20 | if html.status_code == 403: 21 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! Private videos/playlists cannot be accessed without cookies!") 22 | elif html.status_code != 200: 23 | raise ConnectionError(f"Initial connection failed : Status {html.status_code}") 24 | print() 25 | 26 | if bDebug: 27 | # Save HTML content to a text file for debug 28 | text_file = open("html_content.txt", "w", encoding='utf-8') 29 | text_file.write(html.text) 30 | text_file.close() 31 | 32 | page = Page_Xvideos(sUrl) 33 | nPageStatus = page.content.status_code 34 | if nPageStatus != 200: 35 | if nPageStatus == 403: 36 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! Private videos/playlists cannot be accessed without cookies!") 37 | 38 | dYdlOptions = dict(dl_common.dYdlOptions) 39 | dYdlOptions['download_archive'] = rf".\\sites\\{sExtractor}\\{dYdlOptions['download_archive'].format(sExtractor)}" 40 | 41 | print() 42 | for nIdx, sVideoUrl in enumerate(page.videos): 43 | if page.sUrlType == 'playlist': 44 | print(f"Processing playlist video {nIdx + 1} of {len(page.videos)} :: {sVideoUrl}") 45 | print() 46 | 47 | dYdlOptions['outtmpl'] = rf'.\\sites\\{sExtractor}\\%(title).125s.%(ext)s' 48 | 49 | with youtube_dl.YoutubeDL(dYdlOptions) as ydl: 50 | ydl.download([sVideoUrl]) 51 | 52 | if nVideoLimit is not None and (nIdx + 1) >= nVideoLimit: 53 | print(f"Hit the specified maximum limit of {nVideoLimit}. Stopping...") 54 | break 55 | print() 56 | 57 | 58 | class Page_Xvideos(dl_common.Page): 59 | 60 | def __init__(self, url): 61 | super().__init__(url) 62 | if f'{sExtractor}.com/video' in self.url: 63 | sUrlType = 'video' 64 | elif f'{sExtractor}.com/favorite' in self.url: 65 | sUrlType = 'playlist' 66 | else: 67 | raise ValueError(f"Unable to determine {sExtractor} URL type for {self.url}! Please submit a bug report!") 68 | self.sUrlType = sUrlType 69 | 70 | self._sUrlBaseFormat = urlStandardize(self.url) 71 | 72 | if self.sUrlType == 'video': 73 | self.videos.append(self.url) 74 | elif self.sUrlType == 'playlist': 75 | print("Playlist detected. Getting videos...") 76 | self._extract_video_urls() 77 | 78 | 79 | def _extract_video_urls(self, sFilter=None): 80 | """ 81 | Extract video URLs from all playlist pages. 82 | """ 83 | nNumPages = self._NumPagesGet() 84 | print(f"Found {nNumPages} pages in the playlist...") 85 | 86 | lUrlVideos = [] 87 | for nPage in range(0, nNumPages): 88 | lPageVideos = self._extract_page_urls(nPage) 89 | if lPageVideos: 90 | lUrlVideos += lPageVideos 91 | print(f"Found {len(lPageVideos)} on page {nPage + 1}") 92 | else: 93 | print(f"Failed to load page {nPage + 1}!") 94 | break 95 | 96 | # Remove non-video URLs that may have been picked up 97 | lTemp = [] 98 | for sUrl in lUrlVideos: 99 | if 'com/videos/recommended' in sUrl: 100 | continue 101 | if 'com/video' in sUrl: 102 | lTemp += [sUrl] 103 | else: 104 | raise ValueError(f"Not sure about this one: {sUrl}") 105 | lUrlVideos = lTemp 106 | 107 | nNumVideos = len(lUrlVideos) 108 | print(f"\r\nFound {nNumVideos} video URLs in the playlist") 109 | self.videos += lUrlVideos 110 | 111 | 112 | def _extract_page_urls(self, nPage, sFilter=None): 113 | """ 114 | Extract video URLs from a single playlist page. 
115 | """ 116 | sUrlPage = self._sUrlBaseFormat.format(nPage) 117 | content = dl_common.session.get(sUrlPage, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 118 | soup = dl_common.BeautifulSoup(content.text, 'html.parser') 119 | 120 | lVideos = [] 121 | for a in soup.find_all('a', href=True): 122 | href = a['href'] 123 | if '/video' != href[:6]: 124 | continue 125 | if '/videos-i-like' == href: 126 | continue 127 | 128 | if sFilter is not None and sFilter not in href: 129 | continue 130 | 131 | sVideoUrlFull = 'https://www.xvideos.com' + a['href'] 132 | sVideoUrlSplit = sVideoUrlFull.split('?pl=')[0] 133 | 134 | if sVideoUrlSplit not in lVideos: 135 | lVideos.append(sVideoUrlSplit) 136 | return lVideos 137 | 138 | 139 | def _NumPagesGet(self): 140 | """ 141 | Return the number of pages in the playlist. 142 | """ 143 | pagination_block = self.soup.find(attrs={"class": 'pagination'}) 144 | 145 | # Check for playlist with many pages first (i.e. pagination of pages) 146 | try: 147 | nPages = int(pagination_block.find(attrs={"class": "last-page"}).string) 148 | except: 149 | # Then for a multi-page playlist (no pagination of pages) 150 | try: 151 | nPages = len(pagination_block.find_all('li')) - 1 152 | # If no pagination, assume only one page 153 | except AttributeError: 154 | nPages = 1 155 | return nPages 156 | 157 | 158 | def urlStandardize(sUrl): 159 | """ 160 | Make sure URL ends with '/' and tack on f-string brackets for iterating through pages. 161 | """ 162 | if sUrl[-1] != '/': 163 | sUrl += '/' 164 | sUrl += '{}' 165 | return sUrl 166 | -------------------------------------------------------------------------------- /p_pl_dl_youporn.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | from time import time 3 | import yt_dlp as youtube_dl 4 | 5 | import p_pl_dl_common as dl_common 6 | 7 | sExtractor = 'youporn' 8 | 9 | 10 | def run(sUrl, sCookieSource=None, nVideoLimit=None, bDebug=False): 11 | print(f"Running {sExtractor} extractor for {sUrl}\r\n") 12 | 13 | if sCookieSource is not None: 14 | dl_common.parseCookieFile(sCookieSource) 15 | 16 | if dl_common.dCookiesParsed is None: 17 | print("WARNING :: No cookies were provided! Private videos/playlists will fail to download!\r\n") 18 | 19 | # Attempt initial connection 20 | html = dl_common.session.get(sUrl, headers=dl_common.dHeaders, cookies=dl_common.dCookiesParsed) 21 | print(f"Initial connection status: {html.status_code}") 22 | if html.status_code == 403: 23 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! 
Private videos/playlists cannot be accessed without cookies!") 24 | elif html.status_code != 200: 25 | raise ConnectionError(f"Initial connection failed : Status {html.status_code}") 26 | print() 27 | 28 | if bDebug: 29 | # Save HTML content to a text file for debug 30 | text_file = open("html_content.txt", "w", encoding='utf-8') 31 | text_file.write(html.text) 32 | text_file.close() 33 | 34 | page = Page_Youporn(sUrl) 35 | 36 | dYdlOptions = dict(dl_common.dYdlOptions) 37 | dYdlOptions['download_archive'] = rf".\\sites\\{sExtractor}\\{dYdlOptions['download_archive'].format(sExtractor)}" 38 | 39 | lFailedUrls = [] 40 | 41 | def ytdlLoop(lUrls, bLogFailures): 42 | nonlocal lFailedUrls 43 | 44 | for nIdx, sVideoUrl in enumerate(lUrls): 45 | print(f"Processing video {nIdx + 1} of {len(lUrls)} :: {sVideoUrl}") 46 | print() 47 | 48 | sVideoId = sVideoUrl.split('view_video.php?viewkey=')[-1] 49 | dYdlOptions['outtmpl'] = rf'.\\sites\\{sExtractor}\\{sVideoId}_%(title).125s.mp4' 50 | 51 | nStart = time() 52 | try: 53 | with youtube_dl.YoutubeDL(dYdlOptions) as ydl: 54 | ydl.download([sVideoUrl]) 55 | except: 56 | if bLogFailures: 57 | print(f"\r\nEncountered some error for URL = {sVideoUrl}") 58 | print(f"Adding it to the retry list...") 59 | lFailedUrls += [sVideoUrl] 60 | continue 61 | nStop = time() 62 | print(f"\r\nElapsed time for URL = {sVideoUrl}: {round((nStop - nStart) / 60, 2)} minutes\r\n") 63 | 64 | if nVideoLimit is not None and (nIdx + 1) >= nVideoLimit: 65 | print(f"Hit the specified maximum limit of {nVideoLimit}. Stopping...") 66 | break 67 | print() 68 | 69 | ytdlLoop(page.videos, bLogFailures=True) 70 | 71 | if lFailedUrls: 72 | print("Retrying URLs that failed...") 73 | for sUrl in lFailedUrls: 74 | print(sUrl) 75 | ytdlLoop(lFailedUrls, bLogFailures=False) 76 | 77 | 78 | class Page_Youporn(dl_common.Page): 79 | 80 | def __init__(self, url): 81 | super().__init__(url) 82 | 83 | nPageStatus = self.content.status_code 84 | if nPageStatus != 200: 85 | if nPageStatus == 403: 86 | raise ConnectionError(f"403 Forbidden! Please check if cookies are required! Private videos/playlists cannot be accessed without cookies!") 87 | 88 | self.sUrlType = self._get_url_type() 89 | self._playlistId = self.url.split('.com/')[1].split('/')[0] if self.sUrlType == 'playlist' else None 90 | 91 | if self.sUrlType == 'video': 92 | self.videos.append(self.url) 93 | self._nVideos = 1 94 | elif self.sUrlType == 'playlist': 95 | print("Playlist detected. Getting videos...") 96 | self._sUrlBaseFormat = self.urlStandardize(self.url) 97 | self._extract_video_urls() 98 | self._nVideos = len(self.videos) 99 | print(f"Found {self._nVideos} video URLs in the playlist") 100 | 101 | 102 | def _get_url_type(self): 103 | if 'youporn.com/watch/' in self.url: 104 | sUrlType = 'video' 105 | elif 'youporn.com/favorites/' in self.url or 'youporn.com/collections/' in self.url: 106 | sUrlType = 'playlist' 107 | else: 108 | raise ValueError(f"Unable to determine {sExtractor} URL type for {self.url}! Please submit a bug report!") 109 | return sUrlType 110 | 111 | 112 | def _extract_video_urls(self, sFilter=None): 113 | """ 114 | Extract video URLs from all playlist pages. 115 | """ 116 | lUrlVideos = [] 117 | nPage = 0 118 | while True: 119 | nPage += 1 120 | 121 | lPageVideos = self._extract_page_urls(nPage) 122 | if lPageVideos: 123 | lUrlVideos += lPageVideos 124 | print(f"Found {len(lPageVideos)} videos on page {nPage:02}...") 125 | else: 126 | print(f"No videos found on page {nPage}. 
Stopping...") 127 | break 128 | self.videos += lUrlVideos 129 | 130 | 131 | def _extract_page_urls(self, nPage, sFilter=None): 132 | """ 133 | Extract video URLs from a single page of the playlist. 134 | """ 135 | sUrlBase = "https://www.pornhub.com{}" 136 | 137 | for nAttempts in range(3): 138 | sUrlPage = self._sUrlBaseFormat.format(nPage) 139 | content = dl_common.session.get(sUrlPage, cookies=dl_common.dCookiesParsed) 140 | if "503 Service Temporarily Unavailable" in content.text: 141 | sleep(3) 142 | continue 143 | soup = dl_common.BeautifulSoup(content.text, 'html.parser') 144 | break 145 | 146 | lVideos = [] 147 | lTags = soup.find_all(attrs={"class": 'pcVideoListItem js-pop videoblock videoBox'}) 148 | for tag in lTags: 149 | if 'id' in tag.attrs.keys() and 'vfavouriteVideo' in tag.attrs['id']: 150 | for a in tag.find_all('a', href=True): 151 | href = a['href'] 152 | if 'view_video.php?' not in href: 153 | continue 154 | if '&pkey=' in href: 155 | continue 156 | if href not in self.videos: 157 | sUrlFull = sUrlBase.format(href) 158 | if sUrlFull not in lVideos: 159 | lVideos.append(sUrlFull) 160 | return lVideos 161 | 162 | 163 | def urlStandardize(self, sUrl): 164 | """ 165 | Make sure URL ends with '/' and tack on f-string brackets for iterating through pages. 166 | """ 167 | if sUrl.endswith('favorites'): 168 | sUrl += '?page={}' 169 | return sUrl 170 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | requests 3 | yt-dlp>=2022.10.4 4 | jsbeautifier --------------------------------------------------------------------------------