├── FUNDING.yml ├── imgs ├── card_crop.png └── how_to_download.gif ├── push_^q^.bat ├── .github └── stale.yml ├── src └── extractor │ ├── _4chan_downloader.py │ ├── pawoo_downloader.py │ ├── baraag_downloader.py │ ├── file_downloader.py │ ├── youporn_downloader.py │ ├── kakaotv_downloader.py │ ├── youku_downloader.py │ ├── m3u8_downloader.py │ ├── vimeo_downloader.py │ ├── talk_op_gg_downloader.py │ ├── navertv_downloader.py │ ├── asiansister_downloader.py │ ├── xnxx_downloader.py │ ├── coub_downloader.py │ ├── kissjav_downloader.py │ ├── avgle_downloader.py │ ├── vlive_downloader.py │ ├── yandere_downloader.py │ ├── hentaicosplay_downloader.py │ ├── asmhentai_downloader.py │ ├── fc2_downloader.py │ ├── v2ph_downloader.py │ ├── afreeca_downloader.py │ ├── wikiart_downloader.py │ ├── navercafe_downloader.py │ ├── tokyomotion_downloader.py │ ├── nhentai_com_downloader.py │ ├── pandoratv_downloader.py │ ├── novelpia_downloader.py │ ├── nozomi_downloader.py │ ├── flickr_downloader.py │ ├── rule34_xxx_downloader.py │ ├── likee_downloader.py │ ├── nhentai_downloader.py │ ├── nico_downloader.py │ ├── hanime_downloader.py │ ├── kakuyomu_downloader.py │ ├── webtoon_downloader.py │ ├── comicwalker_downloader.py │ ├── hameln_downloader.py │ ├── imgur_downloader.py │ ├── discord_emoji_downloader.py │ ├── bdsmlr_downloader.py │ ├── nijie_downloader.py │ ├── hf_downloader.py │ ├── luscious_downloader.py │ ├── xvideo_downloader.py │ ├── gelbooru_downloader.py │ ├── bcy_downloader.py │ ├── danbooru_downloader.py │ ├── soundcloud_downloader.py │ ├── tiktok_downloader.py │ ├── naver_downloader.py │ ├── twitch_downloader.py │ ├── mrm_downloader.py │ ├── kakaopage_downloader.py │ └── lhscan_downloader.py ├── .gitignore ├── translation └── tr_ko.hdl └── README.md /FUNDING.yml: -------------------------------------------------------------------------------- 1 | patreon: KurtBestor -------------------------------------------------------------------------------- /imgs/card_crop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coko8023/Hitomi-Downloader/master/imgs/card_crop.png -------------------------------------------------------------------------------- /imgs/how_to_download.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coko8023/Hitomi-Downloader/master/imgs/how_to_download.gif -------------------------------------------------------------------------------- /push_^q^.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | git add . 4 | git commit -m "^q^" 5 | git push 6 | 7 | echo Done! 8 | pause>nul -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Configuration for probot-stale - https://github.com/probot/stale 2 | 3 | # Number of days of inactivity before an Issue or Pull Request becomes stale 4 | daysUntilStale: 90 5 | # Number of days of inactivity before a stale Issue or Pull Request is closed 6 | daysUntilClose: 30 7 | # Issues or Pull Requests with these labels will never be considered stale 8 | exemptLabels: 9 | - help wanted 10 | - notice 11 | # Label to use when marking as stale 12 | staleLabel: stale 13 | # Comment to post when marking as stale. 
Set to `false` to disable 14 | markComment: > 15 | This issue has been automatically marked as stale because it has not had 16 | recent activity. It will be closed after 30 days if no further activity 17 | occurs, but feel free to re-open a closed issue if needed. -------------------------------------------------------------------------------- /src/extractor/_4chan_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Downloader, LazyUrl, clean_title, urljoin, get_ext 3 | from ratelimit import limits, sleep_and_retry 4 | 5 | 6 | class Image: 7 | def __init__(self, url, ref, n): 8 | self._url = url 9 | self.url = LazyUrl(ref, self.get, self) 10 | self.filename = '{:04}{}'.format(n, get_ext(url)) 11 | 12 | @sleep_and_retry 13 | @limits(2, 1) 14 | def get(self, _): 15 | return self._url 16 | 17 | 18 | 19 | @Downloader.register 20 | class Downloader_4chan(Downloader): 21 | type = '4chan' 22 | URLS = [r'regex:boards.(4chan|4channel).org'] 23 | MAX_CORE = 4 24 | display_name = '4chan' 25 | 26 | @classmethod 27 | def fix_url(cls, url): 28 | return url.split('#')[0] 29 | 30 | def read(self): 31 | soup = downloader.read_soup(self.url) 32 | for div in soup.findAll('div', class_='fileText'): 33 | href = urljoin(self.url, div.a['href']) 34 | img = Image(href, self.url, len(self.urls)) 35 | self.urls.append(img.url) 36 | 37 | board = self.url.split('/')[3] 38 | title = soup.find('span', class_='subject').text 39 | id_ = int(self.url.split('/thread/')[1].split('/')[0]) 40 | self.title = clean_title(f'[{board}] {title} ({id_})') 41 | -------------------------------------------------------------------------------- /src/extractor/pawoo_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Downloader, lazy, clean_title 4 | import ree as re 5 | from translator import tr_ 6 | from mastodon import get_imgs 7 | import json 8 | 9 | 10 | @Downloader.register 11 | class Downloader_pawoo(Downloader): 12 | type = 'pawoo' 13 | URLS = ['pawoo.net'] 14 | 15 | def init(self): 16 | self.url = 'https://pawoo.net/{}'.format(self.id_) 17 | self.referer = self.url 18 | 19 | @property 20 | def id_(self): 21 | return re.find('pawoo.net/([^/]+)', self.url.lower(), default=self.url) 22 | 23 | @lazy 24 | def soup(self): 25 | return downloader.read_soup(self.url) 26 | 27 | @property 28 | def name(self): 29 | name_raw = re.find(r'''['"]name['"] *: *['"](.+?)['"]''', str(self.soup), err='no name') 30 | name = json.loads('"{}"'.format(name_raw)) 31 | title = '{} (pawoo_{})'.format(name, self.id_) 32 | return clean_title(title) 33 | 34 | def read(self): 35 | self.title = tr_('읽는 중... 
{}').format(self.name) 36 | 37 | imgs = get_imgs('pawoo.net', self.id_, self.name, cw=self.cw) 38 | 39 | for img in imgs: 40 | self.urls.append(img.url) 41 | self.filenames[img.url] = img.filename 42 | 43 | self.title = self.name 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/extractor/baraag_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Soup, Downloader, lazy, clean_title 4 | import ree as re 5 | from translator import tr_ 6 | from mastodon import get_imgs 7 | 8 | 9 | 10 | def get_id(url): 11 | return re.find('baraag.net/([^/]+)', url.lower()) 12 | 13 | 14 | @Downloader.register 15 | class Downloader_baraag(Downloader): 16 | type = 'baraag' 17 | URLS = ['baraag.net'] 18 | display_name = 'baraag.net' 19 | 20 | def init(self): 21 | self.referer = self.url 22 | 23 | @classmethod 24 | def fix_url(cls, url): 25 | id_ = get_id(url) or url 26 | return 'https://baraag.net/{}'.format(id_) 27 | 28 | @lazy 29 | def id(self): 30 | return get_id(self.url) 31 | 32 | @lazy 33 | def soup(self): 34 | return Soup(downloader.read_html(self.url)) 35 | 36 | @property 37 | def name(self): 38 | title = self.soup.find('h1').text.strip().split('\n')[0].strip() 39 | title = u'{} (baraag_{})'.format(title, self.id) 40 | return clean_title(title) 41 | 42 | def read(self): 43 | self.title = tr_(u'읽는 중... {}').format(self.name) 44 | 45 | imgs = get_imgs('baraag.net', self.id, self.name, cw=self.cw) 46 | 47 | for img in imgs: 48 | self.urls.append(img.url) 49 | self.filenames[img.url] = img.filename 50 | 51 | self.title = self.name 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /src/extractor/file_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader, json, os 2 | from constants import try_n 3 | from utils import Downloader, query_url, clean_title, get_ext 4 | from timee import sleep 5 | from hashlib import md5 6 | 7 | 8 | @Downloader.register 9 | class Downloader_file(Downloader): 10 | type = 'file' 11 | single = True 12 | URLS = [] 13 | 14 | @classmethod 15 | def fix_url(cls, url): 16 | if '://' not in url: 17 | url = 'https://' + url.lstrip('/') 18 | return 'file_' + url 19 | 20 | def read(self): 21 | qs = query_url(self.url) 22 | for key in qs: 23 | if key.lower() in ('file', 'filename'): 24 | name = qs[key][(-1)] 25 | break 26 | else: 27 | name = self.url 28 | for esc in ['?', '#']: 29 | name = name.split(esc)[0] 30 | name = os.path.basename(name.strip('/')) 31 | 32 | try: 33 | ext = downloader.get_ext(self.url) 34 | except: 35 | ext = '' 36 | if not ext: 37 | ext = get_ext(name) 38 | 39 | name = os.path.splitext(name)[0] 40 | 41 | self.urls.append(self.url) 42 | 43 | id_ = md5(self.url.encode('utf8')).hexdigest()[:8] 44 | tail = ' ({}){}'.format(id_, ext) 45 | filename = clean_title(name, n=-len(tail)) + tail 46 | 47 | self.filenames[self.url] = filename 48 | 49 | self.title = filename 50 | -------------------------------------------------------------------------------- /src/extractor/youporn_downloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, unicode_literals 2 | import downloader 3 | import ree as re 4 | from io import BytesIO 5 | import os 6 | from constants import try_n 7 | from utils import Downloader, LazyUrl, get_ext, 
format_filename, clean_title 8 | import ytdl 9 | 10 | 11 | 12 | @Downloader.register 13 | class Downloader_youporn(Downloader): 14 | type = 'youporn' 15 | single = True 16 | URLS = ['youporn.com'] 17 | display_name = 'YouPorn' 18 | 19 | @classmethod 20 | def fix_url(cls, url): 21 | if 'youporn.com' not in url.lower(): 22 | url = 'https://www.youporn.com/watch/{}'.format(url) 23 | return url 24 | 25 | def read(self): 26 | video = Video(self.url, cw=self.cw) 27 | 28 | self.urls.append(video.url) 29 | self.setIcon(video.thumb) 30 | 31 | self.enableSegment() 32 | 33 | self.title = video.title 34 | 35 | 36 | class Video(object): 37 | @try_n(4) 38 | def __init__(self, url, cw=None): 39 | ydl = ytdl.YoutubeDL(cw=cw) 40 | info = ydl.extract_info(url) 41 | 42 | f = info['formats'][-1] 43 | url_video = f['url'] 44 | self.url = LazyUrl(url, lambda _: url_video, self) 45 | 46 | self.url_thumb = info['thumbnails'][0]['url'] 47 | self.thumb = BytesIO() 48 | downloader.download(self.url_thumb, buffer=self.thumb) 49 | self.title = info['title'] 50 | ext = get_ext(url_video) 51 | self.filename = format_filename(self.title, info['id'], ext) 52 | -------------------------------------------------------------------------------- /src/extractor/kakaotv_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | import ytdl 3 | from utils import Downloader, try_n, LazyUrl, get_ext, format_filename 4 | from io import BytesIO as IO 5 | from m3u8_tools import M3u8_stream 6 | 7 | 8 | @Downloader.register 9 | class Downloader_kakaotv(Downloader): 10 | type = 'kakaotv' 11 | URLS = ['tv.kakao'] 12 | single = True 13 | display_name = 'KakaoTV' 14 | 15 | @classmethod 16 | def fix_url(cls, url): 17 | return url.split('?')[0].strip('/') 18 | 19 | def read(self): 20 | video = Video(self.url, cw=self.cw) 21 | video.url()# 22 | 23 | self.urls.append(video.url) 24 | self.setIcon(video.thumb) 25 | 26 | self.enableSegment() 27 | 28 | self.title = video.title 29 | 30 | 31 | 32 | class Video(object): 33 | _url = None 34 | 35 | def __init__(self, url, cw=None): 36 | self.url = LazyUrl(url, self.get, self) 37 | self.cw = cw 38 | 39 | @try_n(2) 40 | def get(self, url): 41 | if self._url: 42 | return self._url 43 | 44 | ydl = ytdl.YoutubeDL(cw=self.cw) 45 | info = ydl.extract_info(url) 46 | fs = [f for f in info['formats'] if f['ext'] == 'mp4'] 47 | f = sorted(fs, key=lambda f: f['height'])[-1] 48 | self._url = f['url'] 49 | 50 | self.thumb_url = info['thumbnails'][0]['url'] 51 | self.thumb = IO() 52 | downloader.download(self.thumb_url, buffer=self.thumb) 53 | self.title = info['title'] 54 | ext = get_ext(self._url) 55 | self.filename = format_filename(self.title, info['id'], ext) 56 | return self._url 57 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | # etc 104 | #*.bat 105 | call cmd.bat 106 | -------------------------------------------------------------------------------- /src/extractor/youku_downloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, unicode_literals 2 | import downloader 3 | import ytdl 4 | from m3u8_tools import M3u8_stream 5 | from utils import LazyUrl, get_ext, Downloader, format_filename, clean_title 6 | from io import BytesIO 7 | 8 | 9 | @Downloader.register 10 | class Downloader_youku(Downloader): 11 | type = 'youku' 12 | single = True 13 | URLS = ['v.youku.com'] 14 | 15 | def read(self): 16 | video = Video(self.url, cw=self.cw) 17 | video.url()# get thumb 18 | 19 | self.urls.append(video.url) 20 | self.setIcon(video.thumb) 21 | 22 | self.title = video.title 23 | 24 | 25 | class Video(object): 26 | _url = None 27 | 28 | def __init__(self, url, cw=None): 29 | self.url = LazyUrl(url, self.get, self) 30 | self.cw = cw 31 | 32 | def get(self, url): 33 | if self._url: 34 | return self._url 35 | 36 | ydl = ytdl.YoutubeDL(cw=self.cw) 37 | info = ydl.extract_info(url) 38 | 39 | # get best video 40 | fs = info['formats'] 41 | fs = sorted(fs, key=lambda x: int(x['width']), reverse=True) 42 | f = fs[0] 43 | url_video = f['url'] 44 | 45 | # thumb 46 | self.thumb_url = info['thumbnails'][0]['url'] 47 | self.thumb = BytesIO() 48 | downloader.download(self.thumb_url, buffer=self.thumb) 49 | 50 | # m3u8 51 | print(f['protocol']) 52 | if 'm3u8' in f['protocol']: 53 | url_video = M3u8_stream(url_video, referer=url) 54 | 55 | # title & filename 56 | self.title = info['title'] 57 | self.filename = format_filename(self.title, info['id'], '.mp4') 58 | 59 | self._url = url_video 60 | 61 | return self._url 62 | 63 | -------------------------------------------------------------------------------- /src/extractor/m3u8_downloader.py: -------------------------------------------------------------------------------- 1 | from utils import Downloader, LazyUrl, clean_title 2 | import utils 3 | from m3u8_tools import playlist2stream, M3u8_stream 4 | import os 5 | from hashlib import md5 6 | from translator import tr_ 7 | DEFAULT_N_THREAD = 2 8 | 9 | 10 | @Downloader.register 11 | class Downloader_m3u8(Downloader): 12 | type = 'm3u8' 13 | URLS = ['.m3u8'] 14 | single = True 15 | display_name = 'M3U8' 16 | 17 | @classmethod 18 | def fix_url(cls, url): 19 | if '://' not in url: 20 | url = 
'http://' + url 21 | return url 22 | 23 | def read(self): 24 | n_thread = self.cw.format or DEFAULT_N_THREAD 25 | self.print_('n_thread: {}'.format(n_thread)) 26 | video = Video(self.url, n_thread) 27 | self.urls.append(video.url) 28 | self.title = '{} ({})'.format(video.title, video.id_) 29 | 30 | 31 | class Video(object): 32 | def __init__(self, url, n_thread): 33 | try: 34 | m = playlist2stream(url, n_thread=n_thread) 35 | except: 36 | m = M3u8_stream(url, n_thread=n_thread) 37 | self.url = LazyUrl(url, lambda _: m, self) 38 | self.title = os.path.splitext(os.path.basename(url))[0] 39 | self.id_ = md5(url.encode('utf8')).hexdigest()[:8] 40 | tail = ' ({}).mp4'.format(self.id_) 41 | self.filename = clean_title(self.title, n=-len(tail)) + tail 42 | 43 | 44 | import selector 45 | @selector.options('m3u8') 46 | def options(): 47 | def f(urls): 48 | n_thread, ok = utils.QInputDialog.getInt(Downloader.mainWindow, tr_('Set number of threads'), tr_('Number of threads?'), value=DEFAULT_N_THREAD, min=1, max=4, step=1) 49 | if not ok: 50 | return 51 | return n_thread 52 | return [ 53 | {'text': 'Set number of threads...', 'format': f}, 54 | ] 55 | -------------------------------------------------------------------------------- /src/extractor/vimeo_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | import ree as re 3 | from io import BytesIO as IO 4 | from error_printer import print_error 5 | from utils import Downloader, LazyUrl, get_ext, format_filename, try_n 6 | import ytdl 7 | 8 | 9 | 10 | @Downloader.register 11 | class Downloader_vimeo(Downloader): 12 | type = 'vimeo' 13 | URLS = ['vimeo.com'] 14 | single = True 15 | 16 | def init(self): 17 | if 'vimeo.com' not in self.url.lower(): 18 | self.url = u'https://vimeo.com/{}'.format(self.url) 19 | 20 | def read(self): 21 | video = Video(self.url, cw=self.cw) 22 | video.url()# 23 | 24 | self.urls.append(video.url) 25 | self.setIcon(video.thumb) 26 | 27 | self.enableSegment() 28 | 29 | self.title = video.title 30 | 31 | 32 | class Video(object): 33 | _url = None 34 | 35 | def __init__(self, url, cw=None): 36 | self.url = LazyUrl(url, self.get, self) 37 | self.cw = cw 38 | 39 | @try_n(4) 40 | def get(self, url): 41 | if self._url: 42 | return self._url 43 | 44 | ydl = ytdl.YoutubeDL(cw=self.cw) 45 | info = ydl.extract_info(url) 46 | fs = [f for f in info['formats'] if f['protocol'] in ['http', 'https']] 47 | fs = sorted(fs, key=lambda f: int(f.get('width', 0)), reverse=True) 48 | if not fs: 49 | raise Exception('No MP4 videos') 50 | f = fs[0] 51 | 52 | self.thumb_url = info['thumbnails'][0]['url'] 53 | self.thumb = IO() 54 | downloader.download(self.thumb_url, buffer=self.thumb) 55 | self.title = info['title'] 56 | url_video = f['url'] 57 | ext = get_ext(url) or '.mp4' 58 | self.filename = format_filename(self.title, info['id'], ext) 59 | self._url = url_video 60 | return self._url 61 | -------------------------------------------------------------------------------- /src/extractor/talk_op_gg_downloader.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | # title: Download talk op.gg image 3 | # author: SaidBySolo 4 | # comment: op.gg 커뮤니티의 이미지를 다운로드합니다 5 | 6 | """ 7 | MIT License 8 | 9 | Copyright (c) 2020 SaidBySolo 10 | 11 | Permission is hereby granted, free of charge, to any person obtaining a copy 12 | of this software and associated documentation files (the "Software"), to deal 13 | in the Software without restriction, 
including without limitation the rights 14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 15 | copies of the Software, and to permit persons to whom the Software is 16 | furnished to do so, subject to the following conditions: 17 | 18 | The above copyright notice and this permission notice shall be included in all 19 | copies or substantial portions of the Software. 20 | 21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 27 | SOFTWARE. 28 | """ 29 | 30 | import requests 31 | from utils import Downloader, Soup 32 | 33 | 34 | @Downloader.register 35 | class DownloaderTalkOPGG(Downloader): 36 | type = "talkopgg" 37 | URLS = ["talk.op.gg"] 38 | 39 | def init(self) -> None: 40 | pass 41 | 42 | def read(self) -> None: 43 | response = requests.get(self.url) 44 | soup = Soup(response.text) 45 | 46 | self.title = soup.find("title").text 47 | 48 | image_element_list = soup.find("div", class_="article-content").findAll("img") 49 | 50 | for image_element in image_element_list: 51 | self.urls.append(image_element["src"]) 52 | -------------------------------------------------------------------------------- /src/extractor/navertv_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | import ree as re 3 | from io import BytesIO as IO 4 | import os 5 | from constants import try_n 6 | from error_printer import print_error 7 | from utils import Downloader, compatstr, LazyUrl, get_ext, format_filename, clean_title 8 | import ytdl 9 | 10 | 11 | 12 | @Downloader.register 13 | class Downloader_navertv(Downloader): 14 | type = 'navertv' 15 | single = True 16 | URLS = ['tv.naver.com'] 17 | display_name = 'Naver TV' 18 | 19 | def init(self): 20 | if not re.match('https?://.+', self.url, re.IGNORECASE): 21 | self.url = 'https://tv.naver.com/v/{}'.format(self.url) 22 | 23 | def read(self): 24 | video = Video(self.url, cw=self.cw) 25 | video.url()# 26 | 27 | self.urls.append(video.url) 28 | self.setIcon(video.thumb) 29 | 30 | self.enableSegment() 31 | 32 | self.title = video.title 33 | 34 | 35 | 36 | class Video(object): 37 | _url = None 38 | 39 | def __init__(self, url, cw=None): 40 | self.url = LazyUrl(url, self.get, self) 41 | self.cw = cw 42 | 43 | @try_n(4) 44 | def get(self, url): 45 | if self._url: 46 | return self._url 47 | 48 | ydl = ytdl.YoutubeDL(cw=self.cw) 49 | info = ydl.extract_info(url) 50 | fs = [f for f in info['formats'] if f['protocol'] in ['http', 'https']] 51 | fs = sorted(fs, key=lambda f: int(f.get('width', 0)), reverse=True) 52 | if not fs: 53 | raise Exception('No MP4 videos') 54 | f = fs[0] 55 | self._url = f['url'] 56 | 57 | self.thumb_url = info['thumbnails'][0]['url'] 58 | self.thumb = IO() 59 | downloader.download(self.thumb_url, buffer=self.thumb) 60 | self.title = info['title'] 61 | id = info['id'] 62 | ext = get_ext(self._url) 63 | self.filename = format_filename(self.title, id, ext) 64 | return self._url 65 | -------------------------------------------------------------------------------- /src/extractor/asiansister_downloader.py: 
-------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Soup, urljoin, LazyUrl, Downloader, try_n, clean_title 3 | from timee import sleep 4 | import os 5 | import ree as re 6 | 7 | 8 | @Downloader.register 9 | class Downloader_asiansister(Downloader): 10 | type = 'asiansister' 11 | URLS = ['asiansister.com'] 12 | display_name = 'AsianSister' 13 | 14 | @try_n(4) 15 | def init(self): 16 | html = downloader.read_html(self.url) 17 | self.soup = Soup(html) 18 | 19 | @property 20 | def name(self): 21 | return clean_title(self.soup.find('title').text.replace('- ASIANSISTER.COM', '').strip()) 22 | 23 | def read(self): 24 | imgs = get_imgs(self.url, self.soup, self.name) 25 | 26 | for img in imgs: 27 | if img.type == 'video': 28 | self.single = True 29 | self.urls.append(img.url) 30 | 31 | self.title = self.name 32 | 33 | 34 | class Image(object): 35 | def __init__(self, url, referer, p, type='image'): 36 | self.url = LazyUrl(referer, lambda x: url, self) 37 | ext = os.path.splitext(url.split('?')[0])[1] 38 | self.filename = u'{:04}{}'.format(p, ext) 39 | self.type = type 40 | 41 | 42 | @try_n(4) 43 | def get_imgs(url, soup=None, name=None): 44 | if soup is None: 45 | html = downloader.read_html(url) 46 | soup = Soup(html) 47 | 48 | view = soup.findAll('div', class_='rootContant')[:2][-1] 49 | 50 | v = view.find('video') 51 | if v: 52 | img = v.find('source').attrs['src'] 53 | img = urljoin(url, img) 54 | img = Image(img, url, 0, 'video') 55 | ext = os.path.splitext(img.url().split('?')[0])[1] 56 | img.filename = u'{}{}'.format(name, ext) 57 | return [img] 58 | 59 | imgs = [] 60 | for img in view.findAll('img'): 61 | img = img.attrs['dataurl'] 62 | img = urljoin(url, img) 63 | img = re.sub('/[a-z]+images/', '/images/', img).replace('_t.', '.') 64 | img = Image(img, url, len(imgs)) 65 | imgs.append(img) 66 | 67 | return imgs 68 | -------------------------------------------------------------------------------- /src/extractor/xnxx_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Soup, cut_pair, urljoin, Downloader, LazyUrl, format_filename, clean_title 3 | import ree as re 4 | import m3u8 5 | from m3u8_tools import M3u8_stream, playlist2stream 6 | from timee import sleep 7 | import os 8 | from io import BytesIO as IO 9 | 10 | 11 | 12 | class Video(object): 13 | 14 | def __init__(self, url, url_page, title, url_thumb): 15 | self._url = url 16 | self.url = LazyUrl(url_page, self.get, self) 17 | self.id = get_id(url_page) 18 | self.title = title 19 | self.filename = format_filename(title, self.id, '.mp4') 20 | f = IO() 21 | self.url_thumb = url_thumb 22 | downloader.download(url_thumb, buffer=f) 23 | self.thumb = f 24 | 25 | def get(self, _): 26 | return self._url 27 | 28 | 29 | def get_id(url): 30 | return url.split('xnxx.com/')[1].split('/')[0] 31 | 32 | 33 | @Downloader.register 34 | class Downloader_xnxx(Downloader): 35 | type = 'xnxx' 36 | URLS = [r'regex:xnxx[0-9]*\.(com|es)'] 37 | single = True 38 | display_name = 'XNXX' 39 | 40 | @classmethod 41 | def fix_url(cls, url): 42 | return re.sub(r'xnxx[0-9]*\.(com|es)', 'xnxx.com', url) 43 | 44 | def read(self): 45 | video = get_video(self.url) 46 | self.urls.append(video.url) 47 | self.setIcon(video.thumb) 48 | self.title = video.title 49 | 50 | 51 | def get_video(url): 52 | html = downloader.read_html(url) 53 | soup = Soup(html) 54 | 55 | for script in soup.findAll('script'): 56 | script = 
script.text or script.string or '' 57 | hls = re.find(r'''html5player\.setVideoHLS\(['"](.+?)['"]''', script) 58 | if hls: 59 | break 60 | else: 61 | raise Exception('No VideoHLS') 62 | 63 | video = playlist2stream(hls) 64 | 65 | title = get_title(soup) 66 | 67 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'].strip() 68 | 69 | video = Video(video, url, title, url_thumb) 70 | return video 71 | 72 | 73 | def get_title(soup): 74 | return soup.find('meta', {'property': 'og:title'}).attrs['content'].strip() 75 | 76 | -------------------------------------------------------------------------------- /src/extractor/coub_downloader.py: -------------------------------------------------------------------------------- 1 | from utils import Downloader, LazyUrl, try_n, format_filename, get_ext 2 | import ytdl 3 | from io import BytesIO as IO 4 | import downloader 5 | import ree as re 6 | import ffmpeg 7 | PATTEN_IMAGIZER = r'coub-com-.+\.imagizer\.com' 8 | 9 | 10 | def get_id(url): 11 | return re.find(r'/view/([0-9a-z]+)', url, err='no id') 12 | 13 | 14 | @Downloader.register 15 | class Downloader_coub(Downloader): 16 | type = 'coub' 17 | URLS = ['coub.com', r'regex:'+PATTEN_IMAGIZER] 18 | single = True 19 | 20 | @classmethod 21 | def fix_url(cls, url): 22 | return re.sub(PATTEN_IMAGIZER, 'coub.com', url) 23 | 24 | @classmethod 25 | def key_id(cls, url): 26 | return get_id(url) 27 | 28 | def read(self): 29 | video = Video(self.url, cw=self.cw) 30 | video.url()# 31 | 32 | self.urls.append(video.url) 33 | self.setIcon(video.thumb) 34 | 35 | self.enableSegment() 36 | 37 | self.title = video.title 38 | 39 | 40 | 41 | class Video(object): 42 | _url = None 43 | 44 | def __init__(self, url, cw=None): 45 | self.url = LazyUrl(url, self.get, self, pp=self.pp) 46 | self.cw = cw 47 | 48 | @try_n(2) 49 | def get(self, url): 50 | if self._url: 51 | return self._url 52 | 53 | ydl = ytdl.YoutubeDL(cw=self.cw) 54 | info = ydl.extract_info(url) 55 | fs = [f for f in info['formats'] if f['ext'] == 'mp4'] 56 | f = sorted(fs, key=lambda f: int(f.get('filesize', 0)))[-1] 57 | self._url = f['url'] 58 | ## fs = [f for f in info['formats'] if f['ext'] == 'mp3'] 59 | ## self.f_audio = sorted(fs, key=lambda f: int(f.get('filesize', 0)))[-1] 60 | 61 | self.thumb_url = info['thumbnails'][0]['url'] 62 | self.thumb = IO() 63 | downloader.download(self.thumb_url, buffer=self.thumb) 64 | self.title = info['title'] 65 | ext = get_ext(self._url) 66 | self.filename = format_filename(self.title, info['id'], ext) 67 | return self._url 68 | 69 | def pp(self, filename): 70 | ## f = IO() 71 | ## downloader.download(self.f_audio['url'], buffer=f) 72 | ## ffmpeg.merge(filename, f) 73 | return filename 74 | -------------------------------------------------------------------------------- /src/extractor/kissjav_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Soup, urljoin, Downloader, LazyUrl, Session, try_n, format_filename, clean_title 3 | from timee import sleep 4 | import ree as re 5 | from io import BytesIO 6 | import clf2 7 | 8 | 9 | @Downloader.register 10 | class Downloader_kissjav(Downloader): 11 | type = 'kissjav' 12 | URLS = ['kissjav.com'] 13 | single = True 14 | display_name = 'KissJAV' 15 | 16 | def read(self): 17 | self.session = None#get_session(self.url, cw=self.cw) 18 | 19 | video = get_video(self.url, self.session) 20 | self.urls.append(video.url) 21 | self.setIcon(video.thumb) 22 | self.enableSegment(1024*1024//2) 
23 | 24 | self.title = video.title 25 | 26 | 27 | @try_n(2) 28 | def get_video(url, session): 29 | soup = downloader.read_soup(url, session=session) 30 | 31 | view = soup.find('div', id='player-container-fluid') 32 | src_best = None 33 | res_best = -1 34 | for source in view.findAll('source'): 35 | src = urljoin(url, source.attrs['src']) 36 | res = re.find('([0-9]+)p', source.attrs['title']) 37 | res = int(res) if res else 0 38 | if res > res_best: 39 | src_best = src 40 | res_best = res 41 | 42 | if src_best is None: 43 | raise Exception('No source') 44 | 45 | title = soup.find('h1').text.strip() 46 | id = soup.find('div', id='video').attrs['data-id'] 47 | 48 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'] 49 | 50 | #src_best = downloader.real_url(src_best) 51 | 52 | video = Video(src_best, url_thumb, url, title, id, session) 53 | return video 54 | 55 | 56 | class Video(object): 57 | def __init__(self, url, url_thumb, referer, title, id, session): 58 | self.title = title 59 | self.filename = format_filename(title, id, '.mp4') 60 | self.url = LazyUrl(referer, lambda x: url, self) 61 | 62 | self.thumb = BytesIO() 63 | self.url_thumb = url_thumb 64 | downloader.download(url_thumb, buffer=self.thumb, session=session) 65 | 66 | 67 | @try_n(2) 68 | def get_session(url, cw=None): 69 | session = Session() 70 | clf2.solve(url, session=session, cw=cw) 71 | return session 72 | 73 | -------------------------------------------------------------------------------- /src/extractor/avgle_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf8 2 | import downloader 3 | import os 4 | from m3u8_tools import M3u8_stream 5 | from utils import Soup, Downloader, LazyUrl, get_print, try_n, clean_title, check_alive 6 | from io import BytesIO 7 | import constants 8 | from error_printer import print_error 9 | import base64 10 | import json 11 | import webbrowser 12 | import errors 13 | 14 | 15 | @Downloader.register 16 | class Downloader_avgle(Downloader): 17 | type = 'avgle' 18 | single = True 19 | URLS = ['avgle.com'] 20 | 21 | def init(self): 22 | if not self.cw.data_: 23 | link = 'https://github.com/KurtBestor/Hitomi-Downloader/wiki/Chrome-Extension' 24 | webbrowser.open(link) 25 | raise errors.Invalid('No data; See: {}'.format(link)) 26 | 27 | def read(self): 28 | video = get_video(self.url, cw=self.cw) 29 | self.urls.append(video.url) 30 | 31 | self.setIcon(video.thumb) 32 | 33 | self.title = video.title 34 | 35 | 36 | @try_n(2) 37 | def get_video(url, cw=None): 38 | print_ = get_print(cw) 39 | 40 | check_alive(cw) 41 | 42 | data = cw.data_ 43 | version = data['version'] 44 | print_('version: {}'.format(version)) 45 | if version == '0.1': 46 | raise errors.OutdatedExtension() 47 | data = data['data'] 48 | if not isinstance(data, bytes): 49 | data = data.encode('utf8') 50 | s = base64.b64decode(data).decode('utf8') 51 | urls = json.loads(s) 52 | 53 | print_(u'\n'.join(urls[:4])) 54 | 55 | referer_seg = 'auto' if 'referer=force' in urls[0] else None # 1718 56 | 57 | stream = M3u8_stream(url, urls=urls, n_thread=4, referer_seg=referer_seg) 58 | 59 | html = downloader.read_html(url) 60 | soup = Soup(html) 61 | 62 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'] 63 | title = soup.find('meta', {'property': 'og:title'}).attrs['content'].strip() 64 | 65 | video = Video(stream, url_thumb, url, title) 66 | 67 | return video 68 | 69 | 70 | class Video(object): 71 | def __init__(self, url, url_thumb, referer, 
title): 72 | self.url = LazyUrl(referer, lambda x: url, self) 73 | self.url_thumb = url_thumb 74 | self.thumb = BytesIO() 75 | downloader.download(url_thumb, referer=referer, buffer=self.thumb) 76 | self.title = title 77 | ext = '.mp4' 78 | self.filename = u'{}{}'.format(clean_title(title, n=-len(ext)), ext) 79 | 80 | 81 | -------------------------------------------------------------------------------- /src/extractor/vlive_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | import ytdl 3 | from utils import Downloader, try_n, LazyUrl, get_ext, format_filename, clean_title, pp_subtitle 4 | from io import BytesIO 5 | import ree as re 6 | from m3u8_tools import M3u8_stream 7 | import os 8 | 9 | 10 | @Downloader.register 11 | class Downloader_vlive(Downloader): 12 | type = 'vlive' 13 | URLS = ['vlive.tv'] 14 | single = True 15 | display_name = 'V LIVE' 16 | 17 | def init(self): 18 | if 'channels.vlive.tv' in self.url: 19 | raise NotImplementedError('channel') 20 | 21 | def read(self): 22 | cw = self.cw 23 | video = get_video(self.url, cw=cw) 24 | 25 | self.urls.append(video.url) 26 | 27 | self.setIcon(video.thumb) 28 | self.enableSegment() 29 | 30 | self.title = clean_title(video.title) 31 | 32 | 33 | @try_n(4) 34 | def get_video(url, cw=None): 35 | options = { 36 | 'noplaylist': True, 37 | } 38 | 39 | ydl = ytdl.YoutubeDL(options, cw=cw) 40 | info = ydl.extract_info(url) 41 | 42 | fs = [] 43 | for f in info['formats']: 44 | if f['ext'] != 'mp4': 45 | continue 46 | f['quality'] = f.get('vbr') or re.find('([0-9]+)p', f['format'], re.IGNORECASE) 47 | print(f['format'], f['quality']) 48 | fs.append(f) 49 | 50 | if not fs: 51 | raise Exception('No videos') 52 | 53 | f = sorted(fs, key=lambda f:f['quality'])[-1] 54 | 55 | subs = {} 56 | for sub, items in info['subtitles'].items(): 57 | sub = sub.split('_')[0] 58 | for item in items: 59 | if item['ext'] != 'vtt': 60 | continue 61 | subs[sub] = item['url'] 62 | video = Video(f, info, subs, cw) 63 | 64 | return video 65 | 66 | 67 | class Video(object): 68 | def __init__(self, f, info, subs, cw=None): 69 | self.title = title = info['title'] 70 | self.id = info['id'] 71 | self.url = f['url'] 72 | self.subs = subs 73 | self.cw = cw 74 | 75 | self.thumb = BytesIO() 76 | downloader.download(info['thumbnail'], buffer=self.thumb) 77 | 78 | ext = get_ext(self.url) 79 | if ext.lower() == '.m3u8': 80 | raise NotImplementedError('stream')# 81 | url = M3u8_stream(self.url, n_thread=4) 82 | else: 83 | url = self.url 84 | self.url = LazyUrl(self.url, lambda x: url, self, pp=self.pp) 85 | self.filename = format_filename(title, self.id, ext) 86 | 87 | def pp(self, filename): 88 | pp_subtitle(self, filename, self.cw) 89 | return filename 90 | 91 | 92 | -------------------------------------------------------------------------------- /src/extractor/yandere_downloader.py: -------------------------------------------------------------------------------- 1 | from utils import Downloader, urljoin, clean_title, try_n, check_alive, LazyUrl, get_ext, get_max_range 2 | from translator import tr_ 3 | import ree as re 4 | import downloader 5 | from ratelimit import limits, sleep_and_retry 6 | 7 | 8 | @try_n(4) 9 | @sleep_and_retry 10 | @limits(4, 1) 11 | def read_soup(url): 12 | return downloader.read_soup(url) 13 | 14 | 15 | @Downloader.register 16 | class Downloader_yandere(Downloader): 17 | type = 'yande.re' 18 | URLS = ['yande.re'] 19 | MAX_CORE = 4 20 | 21 | @classmethod 22 | def fix_url(cls, url): 23 | url 
= re.sub(r'([?&])page=[0-9]+&?', r'\1', url).rstrip('?&') 24 | pool = re.find('/pool/show/([0-9]+)', url) 25 | if pool is not None: 26 | url = urljoin(url, '/post?tags=pool%3A{}'.format(pool)) 27 | return url 28 | 29 | def read(self): 30 | title = self.get_title(self.url) 31 | 32 | url = self.url 33 | n = get_max_range(self.cw) 34 | ids = set() 35 | while True: 36 | check_alive(self.cw) 37 | soup = read_soup(url) 38 | for a in soup.find_all('a', class_='thumb'): 39 | id_ = re.find(r'/show/([0-9]+)', a['href'], err='no id') 40 | if id_ in ids: 41 | self.print_(f'dup: {id_}') 42 | continue 43 | ids.add(id_) 44 | img = Image(urljoin(url, a['href']), id_) 45 | self.urls.append(img.url) 46 | if len(self.urls) >= n: 47 | del self.urls[n:] 48 | break 49 | 50 | self.cw.setTitle('{} {} - {}'.format(tr_('읽는 중...'), title, len(self.urls))) 51 | 52 | next_page = soup.find('a', attrs={'rel':'next'}, href=True) 53 | if not next_page: 54 | break 55 | else: 56 | url = urljoin(self.url, next_page['href']) 57 | 58 | self.title = title 59 | 60 | def get_id(self, url:str) -> str: 61 | id_ = url.split('yande.re%20')[1].split('%20')[0] 62 | return int(id_) 63 | 64 | def get_title(self, url:str) -> str: 65 | if "tags=" not in url: 66 | raise NotImplementedError('no tags') 67 | 68 | url_tags = url.split("tags=")[-1].split('+') 69 | 70 | return clean_title(" ".join(url_tags)) 71 | 72 | 73 | class Image: 74 | 75 | def __init__(self, url, id_): 76 | self._id = id_ 77 | self.url = LazyUrl(url, self.get, self) 78 | 79 | def get(self, url): 80 | soup = read_soup(url) 81 | img = soup.find('a', class_='original-file-unchanged') or soup.find('a', class_='original-file-changed') 82 | img = urljoin(url, img['href']) 83 | ext = get_ext(img) 84 | self.filename = clean_title(self._id, n=-len(ext)) + ext 85 | return img 86 | -------------------------------------------------------------------------------- /translation/tr_ko.hdl: -------------------------------------------------------------------------------- 1 | { 2 | "lang": "ko", 3 | "items": { 4 | "#Cancel#": "취소", 5 | "#EB#": "{} EB", 6 | "#GB#": "{} GB", 7 | "#GIFs#": "GIF / WebP", 8 | "#KB#": "{} KB", 9 | "#KB/s#": "{} KB/s", 10 | "#MB#": "{} MB", 11 | "#MB/s#": "{} MB/s", 12 | "#OK#": "확인", 13 | "#PB#": "{} PB", 14 | "#TB#": "{} TB", 15 | "#boss_invalid_pw#": "Invalid password!", 16 | "#boss_pw#": "Password:", 17 | "#byte#": "{} byte", 18 | "#bytes#": "{} bytes", 19 | "#click#": "Click", 20 | "#combo_hour#": "{} 시간", 21 | "#combo_hours#": "{} 시간", 22 | "#combo_min#": "{} 분", 23 | "#combo_mins#": "{} 분", 24 | "#date01#": "1월 {d}일", 25 | "#date01y#": "1월 {d}일, {y}", 26 | "#date02#": "2월 {d}일", 27 | "#date02y#": "2월 {d}일, {y}", 28 | "#date03#": "3월 {d}일", 29 | "#date03y#": "3월 {d}일, {y}", 30 | "#date04#": "4월 {d}일", 31 | "#date04y#": "4월 {d}일, {y}", 32 | "#date05#": "5월 {d}일", 33 | "#date05y#": "5월 {d}일, {y}", 34 | "#date06#": "6월 {d}일", 35 | "#date06y#": "6월 {d}일, {y}", 36 | "#date07#": "7월 {d}일", 37 | "#date07y#": "7월 {d}일, {y}", 38 | "#date08#": "8월 {d}일", 39 | "#date08y#": "8월 {d}일, {y}", 40 | "#date09#": "9월 {d}일", 41 | "#date09y#": "9월 {d}일, {y}", 42 | "#date10#": "10월 {d}일", 43 | "#date10y#": "10월 {d}일, {y}", 44 | "#date11#": "11월 {d}일", 45 | "#date11y#": "11월 {d}일, {y}", 46 | "#date12#": "12월 {d}일", 47 | "#date12y#": "12월 {d}일, {y}", 48 | "#eta#": "{h:02}:{m:02}:{s:02}", 49 | "#filter_cookie#": "Netscape HTTP Cookie Files (*.txt)", 50 | "#invalid_browserRequired#": "Browser required; Use --safemode", 51 | "#invalid_loginRequired#": "Login required; Update your 
cookies", 52 | "#invalid_outdatedExtension#": "Extension is outdated; Update the extension", 53 | "#invalid_unknownSite#": "Unknown site", 54 | "#loading_lib#": "Loading: {}", 55 | "#new_item#": "New Item", 56 | "#p#": "{}p", 57 | "#recomm_all_langs#": "모든 언어", 58 | "#recomm_artist#": "작가", 59 | "#recomm_main#": "저장 폴더에 있는 작품들을 분석해서 작가를 추천합니다.\n\n결과에 나오는 정확도는 주어진 작품 내에서의 정확도입니다.\n작품은 많으면 많을수록 좋습니다. (100 개 이상 권장)\n\n{item} 개의 작품이 있습니다:", 60 | "#recomm_score#": "점수", 61 | "#setting_MB/s#": "MB/s", 62 | "#setting_autosaveL#": "", 63 | "#setting_autosaveR#": "마다", 64 | "#setting_incompleteL#": "", 65 | "#setting_incompleteR#": "후", 66 | "#task_artist#": "Artist", 67 | "#task_date#": "Date", 68 | "#task_done#": "Done", 69 | "#task_folder#": "Folder", 70 | "#task_incomplete#": "Incomplete", 71 | "#task_input#": "Input", 72 | "#task_invalid#": "Invalid", 73 | "#task_multiple#": "Multiple", 74 | "#task_single#": "Single", 75 | "#task_site#": "Site", 76 | "#task_status#": "Status", 77 | "#task_title#": "Title", 78 | "#task_type#": "Type", 79 | "#task_url#": "URL", 80 | "#task_zipfile#": "ZipFile" 81 | } 82 | } -------------------------------------------------------------------------------- /src/extractor/hentaicosplay_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf8 2 | import downloader 3 | from utils import Downloader, Session, Soup, LazyUrl, urljoin, get_ext, clean_title 4 | import ree as re 5 | from translator import tr_ 6 | import clf2 7 | from ratelimit import limits, sleep_and_retry 8 | 9 | 10 | 11 | class Image: 12 | 13 | def __init__(self, url, referer, p, session): 14 | self._url = url 15 | self._p = p 16 | self.url = LazyUrl(referer, self.get, self) 17 | self.session = session 18 | 19 | @sleep_and_retry 20 | @limits(2, 1) 21 | def get(self, referer): 22 | soup = downloader.read_soup(self._url, referer, session=self.session) 23 | div = soup.find('div', id='display_image_detail') 24 | url = urljoin(self._url, div.find('img').parent['href']) 25 | ext = get_ext(url) 26 | self.filename = '{:04}{}'.format(self._p, ext) 27 | return url, self._url 28 | 29 | 30 | @Downloader.register 31 | class Downloader_hentaicosplay(Downloader): 32 | type = 'hentaicosplay' 33 | URLS = ['hentai-cosplays.com'] 34 | icon = None 35 | display_name = 'Hentai Cosplay' 36 | MAX_CORE = 4 37 | 38 | @classmethod 39 | def fix_url(cls, url): 40 | url = re.sub(r'/page/[0-9]+', '', url) 41 | url = re.sub(r'/attachment/[0-9]+', '', url) 42 | url = re.sub(r'([a-zA-Z]+\.)hentai-cosplays\.com', 'hentai-cosplays.com', url) 43 | return url 44 | 45 | def init(self): 46 | self.session = Session() 47 | 48 | def read(self): 49 | if '/image/' not in self.url: 50 | raise NotImplementedError('Not a post') 51 | 52 | res = clf2.solve(self.url, session=self.session, cw=self.cw) 53 | soup = Soup(res['html']) 54 | title = soup.find('h2').text 55 | paginator = soup.find('div', id='paginator') 56 | pages = [self.url] 57 | for a in paginator.findAll('a'): 58 | href = a.get('href') 59 | if not href: 60 | continue 61 | href = urljoin(self.url, href) 62 | if href not in pages: 63 | pages.append(href) 64 | 65 | imgs = [] 66 | for i, page in enumerate(pages): 67 | if page == self.url: 68 | soup_page = soup 69 | else: 70 | soup_page = downloader.read_soup(page, session=self.session) 71 | view = soup_page.find('div', id='post') 72 | for img in view.findAll('img'): 73 | href = img.parent['href'] 74 | href = urljoin(page, href) 75 | img = Image(href, page, len(imgs), self.session) 76 | 
imgs.append(img) 77 | self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i+1, len(pages))) 78 | 79 | for img in imgs: 80 | self.urls.append(img.url) 81 | 82 | self.title = clean_title(title) 83 | 84 | -------------------------------------------------------------------------------- /src/extractor/asmhentai_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf8 2 | import downloader 3 | import ree as re 4 | from utils import Soup, urljoin, Downloader, join 5 | import os 6 | 7 | 8 | 9 | def get_id(url): 10 | try: 11 | return int(url) 12 | except: 13 | if '/gallery/' in url: 14 | return int(re.find('/gallery/[0-9]+/([0-9]+)', url)) 15 | else: 16 | return int(re.find('/g/([0-9]+)', url)) 17 | 18 | 19 | @Downloader.register 20 | class Downloader_asmhentai(Downloader): 21 | type = 'asmhentai' 22 | URLS = ['asmhentai.com'] 23 | MAX_CORE = 8 24 | display_name = 'AsmHentai' 25 | 26 | def init(self): 27 | pass 28 | 29 | @classmethod 30 | def fix_url(cls, url): 31 | id_ = get_id(url) 32 | return 'https://asmhentai.com/g/{}/'.format(id_) 33 | 34 | def read(self): 35 | info, imgs = get_imgs(self.url) 36 | 37 | # 1225 38 | artist = join(info['artists']) 39 | self.artist = artist 40 | group = join(info['groups']) if info['groups'] else u'N/A' 41 | lang = info['language'][0] if info['language'] else u'N/A' 42 | series = info['parodies'][0] if info['parodies'] else u'N/A' 43 | title = self.format_title(info['category'][0], info['id'], info['title'], artist, group, series, lang) 44 | 45 | self.urls += imgs 46 | 47 | self.title = title 48 | 49 | 50 | 51 | def get_imgs(url): 52 | html = downloader.read_html(url) 53 | soup = Soup(html) 54 | 55 | info = get_info(url, soup) 56 | 57 | view = soup.find('div', class_='gallery') 58 | 59 | imgs = [] 60 | for img in view.findAll('div', class_='preview_thumb'): 61 | img = img.find('img').attrs.get('data-src') or img.find('img').attrs.get('src') 62 | img = urljoin(url, img).replace('t.jpg', '.jpg') 63 | imgs.append(img) 64 | 65 | return info, imgs 66 | 67 | 68 | def get_info(url, soup=None): 69 | if soup is None: 70 | html = downloader.read_html(url) 71 | soup = Soup(html) 72 | 73 | info = {} 74 | 75 | info['id'] = get_id(url) 76 | 77 | title = soup.find('h1').text.strip() 78 | info['title'] = title 79 | 80 | for tag in soup.findAll('span', class_='tag'): 81 | href = tag.parent.attrs['href'] 82 | href = urljoin(url, href).strip('/') 83 | 84 | key = href.split('/')[3] 85 | value = href.split('/')[-1] 86 | 87 | if key == 'language' and value == 'translated': 88 | continue 89 | 90 | if key in info: 91 | info[key].append(value) 92 | else: 93 | info[key] = [value] 94 | 95 | for key in ['artists', 'groups', 'parodies', 'tags', 'characters']: 96 | if key not in info: 97 | info[key] = [] 98 | 99 | return info 100 | 101 | -------------------------------------------------------------------------------- /src/extractor/fc2_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | import ree as re 3 | from utils import urljoin, Downloader, format_filename, Soup, LazyUrl, get_print, Session 4 | from m3u8_tools import M3u8_stream 5 | from io import BytesIO 6 | PATTERN_ID = r'/content/([^/]+)' 7 | 8 | 9 | @Downloader.register 10 | class Downloader_fc2(Downloader): 11 | type = 'fc2' 12 | single = True 13 | URLS = ['video.fc2.com'] 14 | 15 | @classmethod 16 | def fix_url(cls, url): 17 | if not re.match('https?://.+', url, re.IGNORECASE): 18 | url = 
'https://video.fc2.com/content/{}'.format(url) 19 | return url 20 | 21 | @classmethod 22 | def key_id(cls, url): 23 | return re.find(PATTERN_ID, url) or url 24 | 25 | def read(self): 26 | self.session = Session() 27 | self.session.cookies.set('_ac', '1', domain='.video.fc2.com') 28 | info = get_info(self.url, self.session, self.cw) 29 | 30 | video = info['videos'][0] 31 | 32 | self.urls.append(video.url) 33 | 34 | f = BytesIO() 35 | downloader.download(video.url_thumb, referer=self.url, buffer=f) 36 | self.setIcon(f) 37 | 38 | self.title = info['title'] 39 | 40 | 41 | class Video(object): 42 | 43 | def __init__(self, url, url_thumb, referer, title, id_, session): 44 | self._url = url 45 | self.url = LazyUrl(referer, self.get, self) 46 | self.filename = format_filename(title, id_, '.mp4') 47 | self.url_thumb = url_thumb 48 | self.session = session 49 | 50 | def get(self, referer): 51 | ext = downloader.get_ext(self._url, session=self.session, referer=referer) 52 | if ext == '.m3u8': 53 | video = M3u8_stream(self._url, referer=referer, session=self.session, n_thread=4) 54 | else: 55 | video = self._url 56 | return video 57 | 58 | 59 | def get_info(url, session, cw=None): 60 | print_ = get_print(cw) 61 | info = {'videos': []} 62 | html = downloader.read_html(url, session=session) 63 | soup = Soup(html) 64 | info['title'] = soup.find('h2', class_='videoCnt_title').text.strip() 65 | 66 | id_ = re.find(PATTERN_ID, url, err='no id') 67 | print_('id: {}'.format(id_)) 68 | token = re.find(r'''window.FC2VideoObject.push\(\[['"]ae['"], *['"](.+?)['"]''', html, err='no token') 69 | print_('token: {}'.format(token)) 70 | 71 | url_api = 'https://video.fc2.com/api/v3/videoplaylist/{}?sh=1&fs=0'.format(id_) 72 | hdr = { 73 | 'X-FC2-Video-Access-Token': token, 74 | } 75 | data = downloader.read_json(url_api, url, session=session, headers=hdr) 76 | 77 | pl = data['playlist'] 78 | url_video = urljoin(url, pl.get('hq') or pl.get('nq') or pl['sample']) #3784 79 | url_thumb = soup.find('meta', {'property':'og:image'})['content'] 80 | video = Video(url_video, url_thumb, url, info['title'], id_, session) 81 | info['videos'].append(video) 82 | 83 | return info 84 | -------------------------------------------------------------------------------- /src/extractor/v2ph_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from __future__ import division, print_function, unicode_literals 3 | import downloader 4 | from utils import Soup, get_ext, LazyUrl, Downloader, try_n, clean_title, get_print 5 | import ree as re 6 | from translator import tr_ 7 | from timee import sleep 8 | import errors 9 | 10 | 11 | def setPage(url, p): 12 | url = url.split('?')[0] 13 | if p > 1: 14 | url += '?page={}'.format(p) 15 | return url 16 | 17 | 18 | def getPage(url): 19 | p = re.find('page=([0-9]+)', url) 20 | return int(p or 1) 21 | 22 | 23 | class Image(object): 24 | def __init__(self, url, referer, p): 25 | self.url = LazyUrl(referer, lambda x: url, self) 26 | ext = get_ext(url) 27 | self.filename = '{:04}{}'.format(p, ext) 28 | 29 | 30 | @Downloader.register 31 | class Downloader_v2ph(Downloader): 32 | type = 'v2ph' 33 | URLS = ['v2ph.com/album/'] 34 | MAX_CORE = 4 35 | display_name = 'V2PH' 36 | 37 | @classmethod 38 | def fix_url(cls, url): 39 | return url.split('?')[0] 40 | 41 | def read(self): 42 | info = get_info(self.url) 43 | 44 | for img in get_imgs(self.url, info['title'], self.cw): 45 | self.urls.append(img.url) 46 | 47 | self.title = clean_title(info['title']) 
48 | 49 | 50 | 51 | @try_n(2) 52 | def get_info(url): 53 | html = downloader.read_html(url) 54 | soup = Soup(html) 55 | info = {} 56 | info['title'] = soup.find('h1').text.strip() 57 | return info 58 | 59 | 60 | def get_imgs(url, title, cw=None): 61 | print_ = get_print(cw) 62 | imgs = [] 63 | 64 | for p in range(1, 1001): 65 | url = setPage(url, p) 66 | print_(url) 67 | for try_ in range(4): 68 | try: 69 | html = downloader.read_html(url, user_agent=downloader.hdr['User-Agent']) 70 | #sleep(1) 71 | break 72 | except Exception as e: 73 | print(e) 74 | else: 75 | raise 76 | soup = Soup(html) 77 | 78 | view = soup.find('div', class_='photos-list') 79 | if view is None: 80 | if p == 1: 81 | raise errors.LoginRequired() 82 | else: 83 | break # Guest user 84 | for img in view.findAll('img'): 85 | img = img.attrs['data-src'] 86 | img = Image(img, url, len(imgs)) 87 | imgs.append(img) 88 | 89 | pgn = soup.find('ul', class_='pagination') 90 | ps = [getPage(a.attrs['href']) for a in pgn.findAll('a')] if pgn else [] 91 | if not ps or p >= max(ps): 92 | print('max p') 93 | break 94 | 95 | msg = '{} {} ({} / {})'.format(tr_('읽는 중...'), title, p, max(ps)) 96 | if cw: 97 | cw.setTitle(msg) 98 | else: 99 | print(msg) 100 | 101 | return imgs 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/extractor/afreeca_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Soup, Downloader, get_outdir, Session, LazyUrl, try_n, format_filename, get_print 3 | import ree as re 4 | from timee import sleep, time 5 | import os 6 | from io import BytesIO 7 | import shutil 8 | from m3u8_tools import playlist2stream, M3u8_stream 9 | import errors 10 | 11 | 12 | class Video(object): 13 | 14 | def __init__(self, stream, referer, id, title, url_thumb): 15 | self.url = LazyUrl(referer, lambda x: stream, self) 16 | self.id = id 17 | self.title = title 18 | self.filename = format_filename(title, id, '.mp4') 19 | self.url_thumb = url_thumb 20 | self.thumb = BytesIO() 21 | downloader.download(url_thumb, buffer=self.thumb) 22 | 23 | 24 | @Downloader.register 25 | class Downloader_afreeca(Downloader): 26 | type = 'afreeca' 27 | URLS = ['afreecatv.com'] 28 | single = True 29 | display_name = 'AfreecaTV' 30 | 31 | @classmethod 32 | def fix_url(cls, url): 33 | return url.rstrip(' /') 34 | 35 | def read(self): 36 | session = Session() 37 | video = get_video(self.url, session, self.cw) 38 | self.urls.append(video.url) 39 | 40 | self.setIcon(video.thumb) 41 | 42 | self.title = video.title 43 | 44 | 45 | @try_n(4) 46 | def _get_stream(url_m3u8): 47 | print('_get_stream', url_m3u8) 48 | try: 49 | stream = playlist2stream(url_m3u8) 50 | except Exception as e: 51 | print(e) 52 | stream = M3u8_stream(url_m3u8) 53 | return stream 54 | 55 | 56 | @try_n(8) 57 | def get_video(url, session, cw): 58 | print_ = get_print(cw) 59 | html = downloader.read_html(url, session=session) 60 | if "document.location.href='https://login." in html: 61 | raise errors.LoginRequired() 62 | soup = Soup(html) 63 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'] 64 | print_('url_thumb: {}'.format(url_thumb)) 65 | params = re.find('VodParameter *= *[\'"]([^\'"]+)[\'"]', html, err='No VodParameter') 66 | params += '&adultView=ADULT_VIEW&_={}'.format(int(time()*1000)) 67 | url_xml = 'http://stbbs.afreecatv.com:8080/api/video/get_video_info.php?' 
+ params 68 | print(url_xml) 69 | html = downloader.read_html(url_xml, session=session, referer=url) 70 | soup = Soup(html) 71 | if 'PARTIAL_ADULT' in html: 72 | raise errors.LoginRequired() 73 | title = soup.find('title').string.strip() 74 | urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html) 75 | if not urls_m3u8: 76 | raise Exception('no m3u8') 77 | streams = [] 78 | for url_m3u8 in urls_m3u8: 79 | try: 80 | stream = _get_stream(url_m3u8) 81 | except Exception as e: 82 | print(e) 83 | continue #2193 84 | streams.append(stream) 85 | for stream in streams[1:]: 86 | streams[0] += stream 87 | stream = streams[0] 88 | id = url.split('/')[(-1)].split('?')[0].split('#')[0] 89 | video = Video(stream, url, id, title, url_thumb) 90 | return video 91 | -------------------------------------------------------------------------------- /src/extractor/wikiart_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | import json 4 | from utils import LazyUrl, Downloader, Soup, get_print, clean_title 5 | import os 6 | from timee import sleep 7 | from translator import tr_ 8 | 9 | 10 | 11 | class Image(object): 12 | def __init__(self, url, referer, title, id): 13 | self.url = LazyUrl(referer, lambda _: url, self) 14 | ext = os.path.splitext(url.split('?')[0])[1] 15 | n = len(id) + len(ext) + 3 16 | title = clean_title(title, n=-n) 17 | self.filename = u'{} - {}{}'.format(id, title, ext) 18 | 19 | 20 | 21 | @Downloader.register 22 | class Downloader_wikiart(Downloader): 23 | type = 'wikiart' 24 | URLS = ['wikiart.org'] 25 | display_name = 'WikiArt' 26 | 27 | def init(self): 28 | self.url = u'https://www.wikiart.org/en/{}'.format(self.id_) 29 | html = downloader.read_html(self.url) 30 | self.soup = Soup(html) 31 | 32 | @property 33 | def id_(self): 34 | return get_id(self.url) 35 | 36 | def read(self): 37 | artist = get_artist(self.id_, self.soup) 38 | self.artist = artist 39 | 40 | for img in get_imgs(self.url, artist, cw=self.cw): 41 | self.urls.append(img.url) 42 | 43 | self.title = clean_title(artist) 44 | 45 | 46 | 47 | def get_id(url): 48 | userid = url.split('?')[0].split('#')[0].split('wikiart.org/')[1].split('/')[1] 49 | return userid 50 | 51 | 52 | def get_imgs(url, artist, cw=None): 53 | print_ = get_print(cw) 54 | userid = get_id(url) 55 | print(userid) 56 | 57 | imgs = [] 58 | ids = set() 59 | for p in range(1, 100): 60 | url_api = 'https://www.wikiart.org/en/{}/mode/all-paintings?json=2&layout=new&page={}&resultType=masonry'.format(userid, p) 61 | print(url_api) 62 | data_raw = downloader.read_html(url_api, referer=url) 63 | data = json.loads(data_raw) 64 | 65 | _imgs = data['Paintings'] 66 | n = data['AllPaintingsCount'] 67 | 68 | if not _imgs: 69 | print_('???') 70 | break 71 | 72 | for p in _imgs: 73 | img = p['image'] 74 | id = p['id'] 75 | referer = p['paintingUrl'] 76 | title = p['title'] 77 | if id in ids: 78 | print(u'duplicate: {}'.format(id)) 79 | continue 80 | ids.add(id) 81 | img = Image(img, referer, title, id) 82 | imgs.append(img) 83 | 84 | s = u'{} {} - {} / {}'.format(tr_(u'읽는 중...'), artist, len(imgs), n) 85 | if cw: 86 | if not cw.valid or not cw.alive: 87 | return [] 88 | cw.setTitle(s) 89 | else: 90 | print(s) 91 | 92 | if len(imgs) == n: 93 | print_('full') 94 | break 95 | 96 | return imgs 97 | 98 | 99 | def get_artist(userid, soup=None): 100 | if soup is None: 101 | url = u'https://www.wikiart.org/en/{}'.format(userid) 102 | html = downloader.read_html(url) 103 | soup = Soup(html) 
104 | 105 | return soup.find('h3').text.strip() 106 | 107 | -------------------------------------------------------------------------------- /src/extractor/navercafe_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from utils import Downloader, get_print, urljoin, Soup, get_ext, LazyUrl, clean_title, downloader, re, try_n, errors, json 3 | 4 | 5 | @Downloader.register 6 | class Downloader_navercafe(Downloader): 7 | type = 'navercafe' 8 | URLS = ['cafe.naver.com'] 9 | 10 | @classmethod 11 | def fix_url(cls, url): 12 | m = re.find(r'cafe\.naver\.com/([^/?#]+).+?articleid%3D([0-9]+)', url) 13 | if m: 14 | url = 'https://cafe.naver.com/{}/{}'.format(*m) 15 | return url 16 | 17 | def read(self): 18 | info = get_info(self.url, self.cw) 19 | for img in info['imgs']: 20 | self.urls.append(img.url) 21 | tail = ' ({}_{})'.format(info['cafename'], info['id']) 22 | self.title = clean_title(info['title'], n=-len(tail)) + tail 23 | 24 | 25 | @try_n(4) 26 | def get_info(url, cw=None): 27 | print_ = get_print(cw) 28 | info = {} 29 | 30 | html = downloader.read_html(url) 31 | if '"cafe_cautionpage"' in html: 32 | raise errors.LoginRequired() 33 | url_article = re.find(r'''//cafe\.naver\.com/ArticleRead\.nhn\?articleid=[0-9]+&clubid=[0-9]+''', html, err='no iframe') 34 | url_article = urljoin(url, url_article) 35 | 36 | print_(url_article) 37 | 38 | articleid = re.find(r'articleid=([0-9]+)', url_article) 39 | clubid = re.find(r'clubid=([0-9]+)', url_article) 40 | url_api = f'https://apis.naver.com/cafe-web/cafe-articleapi/v2/cafes/{clubid}/articles/{articleid}?query=&useCafeId=true&requestFrom=A' 41 | 42 | j = downloader.read_json(url_api, url) 43 | 44 | info['title'] = j['result']['article']['subject'] 45 | info['cafename'] = j['result']['cafe']['url'] 46 | info['cafeid'] = clubid 47 | info['id'] = articleid 48 | 49 | html_content = j['result']['article']['contentHtml'] 50 | soup = Soup(html_content) 51 | 52 | imgs = [] 53 | 54 | pairs = [] 55 | 56 | for video in soup.findAll('span', class_='_naverVideo'): 57 | vid = video.attrs['vid'] 58 | key = video.attrs['key'] 59 | pairs.append((vid, key)) 60 | 61 | for script in soup.findAll('script', class_='__se_module_data'): 62 | data_raw = script['data-module'] 63 | data = json.loads(data_raw)['data'] 64 | vid = data.get('vid') 65 | if not vid: 66 | continue 67 | key = data['inkey'] 68 | pairs.append((vid, key)) 69 | 70 | for vid, key in pairs: 71 | url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key) 72 | data_raw = downloader.read_html(url_api) 73 | data = json.loads(data_raw) 74 | fs = data['videos']['list'] 75 | fs = sorted(fs, key=lambda f: f['size'], reverse=True) 76 | video = Image(fs[0]['source'], url_article, len(imgs)) 77 | imgs.append(video) 78 | 79 | for img in soup.findAll('img'): 80 | img = Image(urljoin(url_article, img['src']), url, len(imgs)) 81 | imgs.append(img) 82 | 83 | info['imgs'] = imgs 84 | 85 | return info 86 | 87 | 88 | class Image: 89 | def __init__(self, url, referer, p): 90 | self.url = LazyUrl(referer, lambda _: url, self) 91 | ext = get_ext(url) 92 | self.filename = '{:04}{}'.format(p, ext) 93 | -------------------------------------------------------------------------------- /src/extractor/tokyomotion_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Soup, urljoin, Downloader, cut_pair, LazyUrl, clean_title 4 | from timee 
import sleep 5 | from translator import tr_ 6 | from io import BytesIO 7 | import ree as re 8 | import os 9 | 10 | 11 | @Downloader.register 12 | class Downloader_tokyomotion(Downloader): 13 | type = 'tokyomotion' 14 | URLS = ['tokyomotion.net'] 15 | single = True 16 | _type = None 17 | display_name = 'TOKYO Motion' 18 | 19 | def init(self): 20 | html = downloader.read_html(self.url) 21 | self.soup = Soup(html) 22 | if '/album/' in self.url: 23 | self._type = 'album' 24 | else: 25 | self._type = 'video' 26 | 27 | @property 28 | def name(self): 29 | title = get_title(self.soup) 30 | return clean_title(title) 31 | 32 | def read(self): 33 | if self._type == 'video': 34 | video = get_video(self.url, self.soup) 35 | self.urls.append(video.url) 36 | self.setIcon(video.thumb) 37 | elif self._type == 'album': 38 | imgs = get_imgs(self.url) 39 | for img in imgs: 40 | self.urls.append(img.url) 41 | self.single = False 42 | else: 43 | raise NotImplementedError('Unknown type: {}'.format(self._type)) 44 | 45 | self.title = self.name 46 | 47 | 48 | class Video(object): 49 | def __init__(self, url, url_thumb, referer, filename): 50 | self.url = LazyUrl(referer, lambda x: url, self) 51 | self.url_thumb = url_thumb 52 | self.thumb = BytesIO() 53 | downloader.download(url_thumb, referer=referer, buffer=self.thumb) 54 | self.filename = filename 55 | 56 | 57 | def get_title(soup): 58 | video = soup.find('video', id='vjsplayer') 59 | if video: 60 | title = soup.find('h3').text.strip() 61 | else: 62 | title = soup.find('title').text.split(' Album - ')[0].strip() 63 | return title 64 | 65 | 66 | def get_video(url, soup=None): 67 | if soup is None: 68 | html = downloader.read_html(url) 69 | soup = Soup(html) 70 | 71 | video = soup.find('video', id='vjsplayer').find('source').attrs['src'] 72 | url_thumb = soup.find('video', id='vjsplayer').attrs['poster'] 73 | title = get_title(soup) 74 | filename = u'{}.mp4'.format(clean_title(title)) 75 | video = Video(video, url_thumb, url, filename) 76 | return video 77 | 78 | 79 | class Image(object): 80 | def __init__(self, url, referer): 81 | self.url = LazyUrl(referer, lambda x: url, self) 82 | self.filename = os.path.basename(url.split('?')[0]) 83 | 84 | 85 | def get_imgs(url): 86 | id = re.find('album/.*?([0-9]+)', url) 87 | print('id:', id) 88 | url = 'https://www.tokyomotion.net/album/slideshow/{}'.format(id) 89 | 90 | html = downloader.read_html(url) 91 | soup = Soup(html) 92 | 93 | imgs = [] 94 | for a in soup.findAll('a', {'data-lightbox': 'slideshow-{}'.format(id)}): 95 | img = a.find('img').attrs['src'] 96 | img = img.replace('/tmb/', '/') 97 | img = Image(img, url) 98 | imgs.append(img) 99 | 100 | return imgs 101 | -------------------------------------------------------------------------------- /src/extractor/nhentai_com_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from __future__ import division, print_function, unicode_literals 3 | import downloader 4 | import ree as re 5 | from utils import Soup, urljoin, LazyUrl, Downloader, try_n, join, clean_title 6 | import os 7 | import json 8 | 9 | 10 | @Downloader.register 11 | class Downloader_nhentai_com(Downloader): 12 | type = 'nhentai_com' 13 | URLS = [r'regex:https?://nhentai.com'] 14 | MAX_CORE = 16 15 | display_name = 'nhentai.com' 16 | 17 | def init(self): 18 | self.info = get_info(self.url) 19 | self.url = self.info['url'] 20 | 21 | @classmethod 22 | def key_id(cls, url): 23 | url = url.lower() 24 | return re.find(r'/comic/([^/?]+)', url) 
or url 25 | 26 | def read(self): 27 | info = self.info 28 | 29 | artist = join(info['artists']) 30 | self.artist = artist if info['artists'] else None 31 | group = join(info['groups']) 32 | lang = info['lang'] or 'N/A' 33 | series = info['seriess'][0] if info['seriess'] else 'N/A' 34 | title = self.format_title(info['type'], info['id'], info['title'], artist, group, series, lang) 35 | 36 | for img in info['imgs']: 37 | self.urls.append(img.url) 38 | 39 | self.title = title 40 | 41 | 42 | @LazyUrl.register 43 | class LazyUrl_nhentai_com(LazyUrl): 44 | type = 'nhentai_com' 45 | def dump(self): 46 | referer = self._url 47 | url = self.image.url_img 48 | return { 49 | 'referer': referer, 50 | 'url': url, 51 | 'p': self.image.p, 52 | } 53 | @classmethod 54 | def load(cls, data): 55 | referer = data['referer'] 56 | url = data['url'] 57 | img = Image(referer, url, data['p']) 58 | return img.url 59 | 60 | 61 | class Image(object): 62 | def __init__(self, url_page, url_img, p): 63 | self.p = p 64 | self.referer = url_page 65 | self.filename = os.path.basename(url_img) 66 | self.url_img = url_img 67 | self.url = LazyUrl_nhentai_com(url_page, lambda _: self.url_img, self) 68 | 69 | 70 | @try_n(4) 71 | def get_info(url): 72 | url = downloader.real_url(url) 73 | q = re.find(r'/comic/([^/?]+)', url) 74 | 75 | url_api = 'https://nhentai.com/api/comics/{}'.format(q) 76 | data_raw = downloader.read_html(url_api, url) 77 | data = json.loads(data_raw) 78 | 79 | url_api = 'https://nhentai.com/api/comics/{}/images'.format(q) 80 | data_raw = downloader.read_html(url_api, url) 81 | data_images = json.loads(data_raw) 82 | 83 | info = {} 84 | info['url'] = url 85 | 86 | info['id'] = int(data['id']) 87 | info['type'] = data['category']['name'] 88 | info['title'] = data['title'] 89 | info['artists'] = [x['name'] for x in data['artists']] 90 | info['groups'] = [x['name'] for x in data['groups']] 91 | info['seriess'] = [x['name'] for x in data['parodies']] 92 | info['lang'] = data['language']['name'] 93 | 94 | imgs = [] 95 | for img in data_images['images']: 96 | img = urljoin(url, img['source_url']) 97 | img = Image(url, img, len(imgs)) 98 | imgs.append(img) 99 | info['imgs'] = imgs 100 | 101 | return info 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/extractor/pandoratv_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Session, Soup, LazyUrl, get_print, Downloader, get_ext, try_n, format_filename, clean_title 3 | import ree as re 4 | import json 5 | from io import BytesIO 6 | import errors 7 | 8 | 9 | 10 | class EmbedUrlError(Exception): pass 11 | 12 | 13 | @Downloader.register 14 | class Downloader_pandoratv(Downloader): 15 | type = 'pandoratv' 16 | URLS = ['pandora.tv'] 17 | single = True 18 | display_name = 'Pandora TV' 19 | 20 | @classmethod 21 | def fix_url(cls, url): 22 | return url.split('#')[0] 23 | 24 | def read(self): 25 | video = Video(self.url, format, cw=self.cw) 26 | try: 27 | video.url()# 28 | except EmbedUrlError as e: 29 | raise errors.Invalid(e.args[0]) 30 | 31 | self.urls.append(video.url) 32 | self.setIcon(video.thumb) 33 | 34 | self.enableSegment() 35 | 36 | self.title = video.title 37 | 38 | 39 | 40 | def extract(name, html, cw=None): 41 | print_ = get_print(cw) 42 | value = re.find(r'''{} *= *['"](.*?)['"]'''.format(name), html) 43 | if value is None: 44 | value = json.loads(re.find(r'''{} *= *(\[.*?\])'''.format(name), html)) 45 | print_('{}: 
{}'.format(name, value)) 46 | if value is None: 47 | raise Exception('No {}'.format(name)) 48 | return value 49 | 50 | 51 | class Video(object): 52 | _url_video = None 53 | 54 | def __init__(self, url, format='title', cw=None): 55 | self.url = LazyUrl(url, self.get, self) 56 | self.format = format 57 | self.cw = cw 58 | 59 | @try_n(2) 60 | def get(self, url): 61 | if self._url_video: 62 | return self._url_video 63 | cw = self.cw 64 | print_ = get_print(cw) 65 | html = downloader.read_html(url) 66 | soup = Soup(html) 67 | 68 | embedUrl = extract('embedUrl', html, cw) 69 | if embedUrl: 70 | raise EmbedUrlError('[pandoratv] EmbedUrl: {}'.format(embedUrl)) 71 | 72 | uid = extract('strLocalChUserId', html, cw) 73 | pid = extract('nLocalPrgId', html, cw) 74 | fid = extract('strFid', html, cw) 75 | resolType = extract('strResolType', html, cw) 76 | resolArr = extract('strResolArr', html, cw) 77 | vodSvr = extract('nVodSvr', html, cw) 78 | resols = extract('nInfo', html, cw) 79 | runtime = extract('runtime', html, cw) 80 | 81 | url_api = 'http://www.pandora.tv/external/getExternalApi/getVodUrl/' 82 | data = { 83 | 'userId': uid, 84 | 'prgId': pid, 85 | 'fid': fid, 86 | 'resolType': resolType, 87 | 'resolArr': ','.join(map(str, resolArr)), 88 | 'vodSvr': vodSvr, 89 | 'resol': max(resols), 90 | 'runtime': runtime, 91 | 'tvbox': 'false', 92 | 'defResol': 'true', 93 | 'embed': 'false', 94 | } 95 | session = Session() 96 | r = session.post(url_api, headers={'Referer': url}, data=data) 97 | data = json.loads(r.text) 98 | self._url_video = data['src'] 99 | 100 | self.title = soup.find('meta', {'property': 'og:description'})['content'] 101 | 102 | ext = get_ext(self._url_video) 103 | self.filename = format_filename(self.title, pid, ext) 104 | 105 | self.url_thumb = soup.find('meta', {'property': 'og:image'})['content'] 106 | self.thumb = BytesIO() 107 | downloader.download(self.url_thumb, buffer=self.thumb) 108 | 109 | return self._url_video 110 | 111 | -------------------------------------------------------------------------------- /src/extractor/novelpia_downloader.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from urllib.parse import urlparse 3 | from typing import List, cast 4 | 5 | from requests.sessions import session 6 | 7 | from errors import LoginRequired 8 | from utils import Downloader, Soup, Session, clean_title 9 | 10 | from bs4.element import Tag 11 | import requests 12 | 13 | 14 | @Downloader.register 15 | class Downloader_novelpia(Downloader): 16 | type = "novelpia" 17 | URLS = ["novelpia.com"] 18 | 19 | def __get_number(self, url: str) -> str: 20 | return url.replace("/viewer/", "") 21 | 22 | def __get_cookie(self) -> Session: 23 | session = requests.Session() 24 | user_key = Session().cookies.get("USERKEY", domain=".novelpia.com") 25 | login_key = Session().cookies.get("LOGINKEY", domain=".novelpia.com") 26 | 27 | if user_key and login_key: 28 | session.cookies.set("USERKEY", user_key, domain=".novelpia.com") 29 | session.cookies.set("LOGINKEY", login_key, domain=".novelpia.com") 30 | return session 31 | 32 | def init(self) -> None: 33 | self.parsed_url = urlparse(self.url) # url 나눔 34 | self.soup = Soup(requests.get(self.url).text) 35 | 36 | def read(self): 37 | session = self.__get_cookie() 38 | f = BytesIO() 39 | 40 | title_element = self.soup.find("b", {"class": "cut_line_one"}) 41 | 42 | if not title_element: 43 | raise LoginRequired 44 | 45 | # Maybe NavigableString? 
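# bs4's find() can return a Tag, a NavigableString or None, so the assert below
# narrows title_element to Tag before .text is read; the None case has already
# been converted into LoginRequired above.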
46 | assert isinstance(title_element, Tag) 47 | self.title = title_element.text 48 | 49 | # css selecter is not working :( 50 | ep_num = self.soup.find( 51 | "span", 52 | { 53 | "style": "background-color:rgba(155,155,155,0.5);padding: 1px 6px;border-radius: 3px;font-size: 11px; margin-right: 3px;" 54 | }, 55 | ) 56 | assert isinstance(ep_num, Tag) 57 | 58 | ep_name = self.soup.find("span", {"class": "cut_line_one"}) 59 | assert isinstance(ep_name, Tag) 60 | 61 | # Dirty but for clean filename 62 | self.print_(ep_name.text) 63 | ep_name.text.replace(ep_num.text, "") 64 | self.print_(ep_name.text) 65 | self.print_(ep_num.text) 66 | 67 | self.filenames[f] = clean_title(f"{ep_num.text}: {ep_name.text}.txt", "safe") 68 | 69 | # https://novelpia.com/viewer/:number: 70 | numbers: List[str] = [] 71 | numbers.append(self.__get_number(self.parsed_url[2])) 72 | 73 | # Get real contents 74 | # https://novelpia.com/proc/viewer_data/:number: 75 | # {"s": [{"text": ""}]} 76 | viewer_datas = map( 77 | lambda number: f"https://novelpia.com/proc/viewer_data/{number}", numbers 78 | ) 79 | for viewer_data in viewer_datas: 80 | response = session.get(viewer_data) 81 | if response.text: 82 | response = response.json() 83 | for text_dict in response["s"]: 84 | text = text_dict["text"] 85 | if "img" in text: 86 | soup = Soup(text) 87 | img = soup.find("img") 88 | # Maybe NavigableString here too? 89 | assert isinstance(img, Tag) 90 | src = img.attrs["src"] 91 | filename = img.attrs["data-filename"] 92 | f.write(f"[{filename}]".encode("UTF-8")) 93 | self.urls.append(f"https:{src}") 94 | else: 95 | f.write(text_dict["text"].encode("UTF-8")) 96 | f.seek(0) 97 | self.urls.append(f) 98 | else: 99 | raise LoginRequired 100 | -------------------------------------------------------------------------------- /src/extractor/nozomi_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from urllib.parse import quote 3 | from io import BytesIO 4 | from utils import Downloader, query_url, LazyUrl, get_ext, urljoin, clean_title, check_alive, lock, get_print, get_max_range 5 | import errors 6 | from translator import tr_ 7 | 8 | 9 | class Image: 10 | 11 | def __init__(self, id, referer): 12 | self._id = id 13 | self.url = LazyUrl(referer, self.get, self) 14 | 15 | def get(self, referer): 16 | # https://j.nozomi.la/nozomi.js 17 | s_id = str(self._id) 18 | url_post = 'https://j.nozomi.la/post/{}/{}/{}.json'.format(s_id[-1], s_id[-3:-1], self._id) 19 | j = downloader.read_json(url_post, referer) 20 | url = urljoin(referer, j['imageurl']) 21 | ext = get_ext(url) 22 | self.filename = '{}{}'.format(self._id, ext) 23 | return url 24 | 25 | 26 | @Downloader.register 27 | class Downloader_nozomi(Downloader): 28 | type = 'nozomi' 29 | URLS = ['nozomi.la'] 30 | display_name = 'Nozomi.la' 31 | MAX_CORE = 15 32 | ACC_MTIME = True 33 | 34 | @classmethod 35 | def fix_url(cls, url): 36 | return url.split('#')[0] 37 | 38 | @property 39 | def name(self): 40 | qs = query_url(self.url) 41 | name = qs['q'][0] 42 | if self._popular: 43 | name += ' - Popular' 44 | return name 45 | 46 | def read(self): 47 | if '/post/' in self.url: 48 | raise errors.Invalid(tr_('개별 다운로드는 지원하지 않습니다: {}').format(self.url)) 49 | self._popular = 'search-Popular.' 
in self.url 50 | self.title = clean_title(self.name) 51 | qs = query_url(self.url) 52 | q = qs['q'][0] 53 | for id in get_ids_multi(q, self._popular, self.cw): 54 | img = Image(id, self.url) 55 | self.urls.append(img.url) 56 | 57 | 58 | @lock 59 | def get_ids(q, popular, cw): 60 | check_alive(cw) 61 | if q is None: 62 | if popular: 63 | url_api = 'https://j.nozomi.la/index-Popular.nozomi' 64 | else: 65 | url_api = 'https://j.nozomi.la/index.nozomi' 66 | else: 67 | if popular: 68 | url_api = 'https://j.nozomi.la/nozomi/popular/{}-Popular.nozomi'.format(quote(q)) 69 | else: 70 | url_api = 'https://j.nozomi.la/nozomi/{}.nozomi'.format(quote(q)) 71 | print(url_api) 72 | f = BytesIO() 73 | downloader.download(url_api, referer='https://nozomi.la/', buffer=f) 74 | data = f.read() 75 | ids = [] 76 | for i in range(0, len(data), 4): 77 | crop = data[i:i+4] 78 | id = crop[0]*16777216 + crop[1]*65536 + crop[2]*256 + crop[3] 79 | ids.append(id) 80 | return ids 81 | 82 | 83 | def get_ids_multi(q, popular, cw=None): 84 | print_ = get_print(cw) 85 | max_pid = get_max_range(cw) 86 | qs = q.split(' ') 87 | qs_pos = [q for q in qs if not q.startswith('-')] 88 | qs_neg = [q[1:] for q in qs if q.startswith('-')] 89 | q = qs_pos[0] if qs_pos else None 90 | ids = get_ids(q, popular, cw) 91 | print_('{}: {}'.format(q, len(ids))) 92 | 93 | # Positive 94 | for q in qs_pos[1:]: 95 | ids_ = get_ids(q, popular, cw) 96 | set_ids_ = set(ids_) 97 | ids_old = ids 98 | ids = [] 99 | for id in ids_old: 100 | if id in set_ids_: 101 | ids.append(id) 102 | print_('{}: {} ({})'.format(q, len(ids_), len(ids))) 103 | 104 | # Negative 105 | for q in qs_neg: 106 | ids_ = get_ids(q, popular, cw) 107 | set_ids_ = set(ids_) 108 | ids_old = ids 109 | ids = [] 110 | for id in ids_old: 111 | if id not in set_ids_: 112 | ids.append(id) 113 | print_('-{}: {} ({})'.format(q, len(ids_), len(ids))) 114 | return ids[:max_pid] 115 | -------------------------------------------------------------------------------- /src/extractor/flickr_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import downloader 3 | import flickr_api 4 | from timee import sleep 5 | from utils import Downloader, LazyUrl, query_url, clean_title 6 | import os 7 | from translator import tr_ 8 | import ree as re 9 | from datetime import datetime 10 | import flickr_auth 11 | 12 | 13 | alphabet = '123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ' 14 | base = len(alphabet) 15 | def b58encode(div, s=''): 16 | if div >= base: 17 | div, mod = divmod(div, base) 18 | return b58encode(div, alphabet[mod] + s) 19 | return alphabet[div] + s 20 | def b58decode(s): 21 | return sum(alphabet.index(c) * pow(base, i) for i, c in enumerate(reversed(s))) 22 | 23 | 24 | 25 | class Image(object): 26 | def __init__(self, photo): 27 | self.photo = photo 28 | self.id = photo.id 29 | self.filename = None 30 | 31 | def f(_=None): 32 | url = photo.getPhotoFile() 33 | #url = 'https://flic.kr/p/{}'.format(b58encode(int(photo.id))) 34 | ext = os.path.splitext(url)[1] 35 | date = datetime.fromtimestamp(int(photo.dateuploaded)) 36 | date = u'{:02}-{:02}-{:02}'.format(date.year%100, date.month, date.day) 37 | self.filename = u'[{}] {}{}'.format(date, self.id, ext) 38 | return url 39 | self.url = LazyUrl(u'flickr_{}'.format(self.id), f, self) 40 | 41 | 42 | def find_ps(url): 43 | user = flickr_api.Person.findByUrl(url) 44 | id = re.search('/albums/([0-9]+)', url).groups()[0] 45 | pss = user.getPhotosets() 46 | for ps in pss: 47 | if 
ps.id == id: 48 | break 49 | else: 50 | raise Exception('Not found photoset id') 51 | return user, ps 52 | 53 | 54 | @Downloader.register 55 | class Downloader_flickr(Downloader): 56 | type = 'flickr' 57 | URLS = ['flickr.com'] 58 | _name = None 59 | 60 | def init(self): 61 | if 'flickr.com' in self.url.lower(): 62 | self.url = self.url.replace('http://', 'https://') 63 | else: 64 | self.url = 'https://www.flickr.com/people/{}'.format(self.url) 65 | 66 | @property 67 | def name(self): 68 | global pss 69 | if self._name is None: 70 | url = self.url 71 | flickr_auth.get_api(url, self.cw) 72 | if '/albums/' in url: 73 | user, ps = find_ps(url) 74 | self._name = u'{} (flickr_album_{}_{})'.format(ps.title, user.id, ps.id) 75 | else: 76 | user = flickr_api.Person.findByUrl(url) 77 | self._name = u'{} (flickr_{})'.format(user.username, user.id) 78 | return clean_title(self._name) 79 | 80 | 81 | def read(self): 82 | self.title = self.name 83 | 84 | imgs = get_imgs(self.url, self.title, cw=self.cw) 85 | 86 | for img in imgs: 87 | self.urls.append(img.url) 88 | 89 | self.title = self.name 90 | 91 | 92 | def get_imgs(url, title=None, cw=None): 93 | flickr_auth.get_api(title, cw) 94 | if not flickr_auth.isAuth: 95 | raise Exception('No Auth') 96 | 97 | 98 | if '/albums/' in url: 99 | user, ps = find_ps(url) 100 | handle = ps 101 | else: 102 | user = flickr_api.Person.findByUrl(url) 103 | handle = user 104 | 105 | photos = [] 106 | 107 | per_page = 500 108 | for page in range(1, 200): 109 | photos_new = handle.getPhotos(per_page=per_page, page=page) 110 | photos += photos_new 111 | if len(photos_new) < per_page: 112 | break 113 | 114 | msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(photos)) 115 | if cw: 116 | if not cw.alive: 117 | break 118 | cw.setTitle(msg) 119 | else: 120 | print(msg) 121 | 122 | imgs = [] 123 | for photo in photos: 124 | img = Image(photo) 125 | imgs.append(img) 126 | 127 | return imgs 128 | 129 | -------------------------------------------------------------------------------- /src/extractor/rule34_xxx_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | import ree as re 3 | import os 4 | from utils import Downloader, urljoin, query_url, Soup, get_max_range, get_print, clean_title, try_n 5 | from translator import tr_ 6 | try: 7 | from urllib import quote # python2 8 | except: 9 | from urllib.parse import quote # python3 10 | import sys 11 | from timee import sleep 12 | from constants import clean_url 13 | LIMIT = 100 14 | 15 | 16 | def get_tags(url): 17 | url = clean_url(url) 18 | qs = query_url(url) 19 | if 'page=favorites' in url: 20 | id = qs.get('id', ['N/A'])[0] 21 | id = u'fav_{}'.format(id) 22 | else: 23 | tags = qs.get('tags', []) 24 | tags.sort() 25 | id = u' '.join(tags) 26 | if not id: 27 | id = u'N/A' 28 | return id 29 | 30 | 31 | @Downloader.register 32 | class Downloader_rule34_xxx(Downloader): 33 | type = 'rule34_xxx' 34 | URLS = ['rule34.xxx'] 35 | MAX_CORE = 8 36 | display_name = 'Rule34.xxx' 37 | _name = None 38 | 39 | @classmethod 40 | def fix_url(cls, url): 41 | if 'rule34.xxx' in url.lower(): 42 | url = url.replace('http://', 'https://') 43 | else: 44 | url = url.replace(' ', '+') 45 | while '++' in url: 46 | url = url.replace('++', '+') 47 | url = quote(url) 48 | url = url.replace('%2B', '+') 49 | url = u'https://rule34.xxx/index.php?page=post&s=list&tags={}'.format(url) 50 | return url 51 | 52 | @property 53 | def name(self): 54 | if self._name is None: 55 | tags = 
get_tags(self.url) 56 | self._name = tags 57 | return clean_title(self._name) 58 | 59 | def read(self): 60 | self.title = self.name 61 | 62 | imgs = get_imgs(self.url, self.name, cw=self.cw) 63 | 64 | for img in imgs: 65 | self.urls.append(img.url) 66 | self.filenames[img.url] = img.filename 67 | 68 | self.title = self.name 69 | 70 | 71 | class Image(object): 72 | def __init__(self, id_, url): 73 | self.url = url 74 | ext = os.path.splitext(url)[1] 75 | self.filename = u'{}{}'.format(id_, ext) 76 | 77 | 78 | def setPage(url, page): 79 | # Always use HTTPS 80 | url = url.replace('http://', 'https://') 81 | 82 | # Change the page 83 | if 'pid=' in url: 84 | url = re.sub('pid=[0-9]*', 'pid={}'.format(page), url) 85 | else: 86 | url += '&pid={}'.format(page) 87 | 88 | return url 89 | 90 | 91 | def get_imgs(url, title=None, cw=None): 92 | url = clean_url(url) 93 | if 's=view' in url and 'page=favorites' not in url: 94 | raise NotImplementedError('Not Implemented') 95 | 96 | if 'page=dapi' not in url.lower(): 97 | tags = get_tags(url) 98 | tags = quote(tags, safe='/') 99 | tags = tags.replace('%20', '+') 100 | url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT) 101 | 102 | print_ = get_print(cw) 103 | 104 | # Range 105 | max_pid = get_max_range(cw) 106 | 107 | imgs = [] 108 | ids = set() 109 | for p in range(500): #1017 110 | url = setPage(url, p) 111 | print_(url) 112 | html = try_n(4, sleep=30)(downloader.read_html)(url) #3340 113 | 114 | soup = Soup(html) 115 | posts = soup.findAll('post') 116 | if not posts: 117 | break 118 | for post in posts: 119 | id_ = post.attrs['id'] 120 | if id_ in ids: 121 | print('duplicate:', id_) 122 | continue 123 | ids.add(id_) 124 | url_img = post.attrs['file_url'] 125 | img = Image(id_, url_img) 126 | imgs.append(img) 127 | if len(imgs) >= max_pid: 128 | break 129 | 130 | if cw is not None: 131 | if not cw.alive: 132 | break 133 | cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))) 134 | return imgs 135 | -------------------------------------------------------------------------------- /src/extractor/likee_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Session, Downloader, get_ext, LazyUrl, get_print 3 | import ree as re 4 | import json 5 | from io import BytesIO 6 | from translator import tr_ 7 | 8 | 9 | @Downloader.register 10 | class Downloader_likee(Downloader): 11 | type = 'likee' 12 | URLS = ['likee.video'] 13 | single = True 14 | display_name = 'Likee' 15 | 16 | def init(self): 17 | self.session = Session() 18 | 19 | def read(self): 20 | info = get_info(self.url, self.session, self.cw) 21 | self.print_('type: {}'.format(info['type'])) 22 | self.artist = info['artist'] 23 | 24 | if info['type'] != 'single': 25 | video = self.process_playlist(info['title'], info['videos']) 26 | else: 27 | video = info['videos'][0] 28 | video.url() 29 | self.urls.append(video.url) 30 | self.title = info['title'] 31 | 32 | thumb = BytesIO() 33 | downloader.download(video.url_thumb, referer=self.url, buffer=thumb) 34 | self.setIcon(thumb) 35 | 36 | 37 | def get_info(url, session, cw=None): 38 | print_ = get_print(cw) 39 | 40 | info = {} 41 | info['videos'] = [] 42 | 43 | if '/video/' in url: 44 | info['type'] = 'single' 45 | video = Video(url, session) 46 | video.url() 47 | info['videos'].append(video) 48 | info['title'] = video.id_ 49 | info['artist'] = video.artist 50 | return info 51 | 52 | info['type'] = 
'channel' 53 | html = downloader.read_html(url, session=session) 54 | data_raw = html.split('window.data = ')[1].split('};')[0]+'}' 55 | data = json.loads(data_raw) 56 | info['uid'] = data['userinfo']['uid'] 57 | info['username'] = data['userinfo']['yyuid'] 58 | info['artist'] = data['userinfo']['nick_name'] 59 | info['title'] = '{} (likee_{})'.format(info['artist'], info['username']) 60 | 61 | lastPostId = '' 62 | urls = set() 63 | while True: 64 | url_api = 'https://likee.video/official_website/VideoApi/getUserVideo' 65 | r = session.post(url_api, data={'uid': info['uid'], 'count': '30', 'lastPostId': lastPostId}) 66 | data = json.loads(r.text) 67 | 68 | videos = data['data']['videoList'] 69 | if not videos: 70 | break 71 | 72 | for data in videos: 73 | url_post = 'https://likee.video/@{}/video/{}'.format(data['likeeId'], data['postId']) 74 | if url_post in urls: 75 | print_('duplicate: {}'.format(url_post)) 76 | continue 77 | urls.add(url_post) 78 | video = Video(url_post, session, data) 79 | video.url() 80 | info['videos'].append(video) 81 | lastPostId = data['postId'] 82 | 83 | msg = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(info['videos'])) 84 | if cw: 85 | if not cw.alive: 86 | return 87 | cw.setTitle(msg) 88 | else: 89 | print(msg) 90 | 91 | return info 92 | 93 | 94 | class Video(object): 95 | def __init__(self, url, session, data=None): 96 | self.id_ = re.find('/video/([0-9]+)', url, err='no id') 97 | self._session = session 98 | self._data = data 99 | self.url = LazyUrl(url, self.get, self) 100 | 101 | def get(self, url): 102 | if self._data: 103 | video = self._data 104 | else: 105 | url_api = 'https://likee.video/official_website/VideoApi/getVideoInfo' 106 | r = self._session.post(url_api, data={'postIds': str(self.id_)}) 107 | 108 | data = json.loads(r.text) 109 | video = data['data']['videoList'][0] 110 | 111 | url_video = video['videoUrl'] 112 | self.url_thumb = video['coverUrl'] 113 | self.artist = video['nickname'] 114 | ext = get_ext(url_video) 115 | self.title = self.id_ 116 | self.filename = '{}{}'.format(self.id_, ext) 117 | 118 | return url_video 119 | 120 | -------------------------------------------------------------------------------- /src/extractor/nhentai_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from __future__ import division, print_function, unicode_literals 3 | import downloader 4 | import ree as re 5 | from utils import Soup, urljoin, LazyUrl, Downloader, try_n, join, get_ext 6 | import os 7 | import json 8 | 9 | 10 | @Downloader.register 11 | class Downloader_nhentai(Downloader): 12 | type = 'nhentai' 13 | URLS = ['nhentai.net'] 14 | MAX_CORE = 16 15 | display_name = 'nhentai' 16 | 17 | def init(self): 18 | self.url = 'https://nhentai.net/g/{}/'.format(self.id_) 19 | 20 | @property 21 | def id_(self): 22 | try: 23 | return int(self.url) 24 | except: 25 | return int(re.find('/g/([0-9]+)', self.url)) 26 | 27 | def read(self): 28 | info, imgs = get_imgs(self.id_) 29 | 30 | # 1225 31 | artist = join(info.artists) 32 | self.artist = artist if info.artists else None 33 | group = join(info.groups) 34 | lang = info.lang or 'N/A' 35 | series = info.seriess[0] if info.seriess else 'N/A' 36 | title = self.format_title(info.type, info.id, info.title, artist, group, series, lang) 37 | 38 | for img in imgs: 39 | self.urls.append(img.url) 40 | 41 | self.title = title 42 | 43 | 44 | @LazyUrl.register 45 | class LazyUrl_nhentai(LazyUrl): 46 | type = 'nhentai' 47 | def dump(self): 48 | 
referer = self._url 49 | url = self.image.url_img 50 | return { 51 | 'referer': referer, 52 | 'url': url, 53 | 'p': self.image.p, 54 | } 55 | @classmethod 56 | def load(cls, data): 57 | referer = data['referer'] 58 | url = data['url'] 59 | img = Image(referer, url, data['p']) 60 | return img.url 61 | 62 | 63 | class Image(object): 64 | def __init__(self, url_page, url_img, p): 65 | self.p = p 66 | self.url = LazyUrl_nhentai(url_page, lambda _: url_img, self) 67 | self.filename = '{:04}{}'.format(p, get_ext(url_img)) 68 | 69 | 70 | class Info(object): 71 | def __init__(self, host, id, id_media, title, p, artists, groups, seriess, lang, type, formats): 72 | self.host = host 73 | self.id = id 74 | self.id_media = id_media 75 | self.title = title 76 | self.p = p 77 | self.artists = artists 78 | self.groups = groups 79 | self.seriess = seriess 80 | self.lang = lang 81 | self.type = type 82 | self.formats = formats 83 | 84 | 85 | @try_n(4) 86 | def get_info(id): 87 | url = 'https://nhentai.net/g/{}/1/'.format(id) 88 | referer = 'https://nhentai.net/g/{}/'.format(id) 89 | html = downloader.read_html(url, referer=referer) 90 | 91 | data = html.split('JSON.parse(')[1].split(');')[0] 92 | gal = json.loads(json.loads(data)) 93 | host = 'https://i.nhentai.net'#re.find('''media_url: *['"]([^'"]+)''', html, err='no host') 94 | 95 | id = int(gal['id']) 96 | id_media = int(gal['media_id']) 97 | title = gal['title']['english'] 98 | p = len(gal['images']['pages']) 99 | artists = [] 100 | groups = [] 101 | seriess = [] 102 | for tag in gal['tags']: 103 | type = tag['type'] 104 | if type == 'artist': 105 | artists.append(tag['name']) 106 | elif type == 'group': 107 | groups.append(tag['name']) 108 | elif type == 'parody' and tag['name'] != 'original': 109 | seriess.append(tag['name']) 110 | elif type == 'language': 111 | lang = tag['name'] 112 | elif type == 'category': 113 | type_ = tag['name'] 114 | formats = [] 115 | for img in gal['images']['pages']: 116 | type = img['t'] 117 | format = {'j':'jpg', 'p':'png', 'g':'gif'}[type] 118 | formats.append(format) 119 | info = Info(host, id, id_media, title, p, artists, groups, seriess, lang, type_, formats) 120 | return info 121 | 122 | 123 | def get_imgs(id): 124 | info = get_info(id) 125 | 126 | imgs = [] 127 | for p in range(1, info.p+1): 128 | name = '/galleries/{}/{}.{}'.format(info.id_media, p, info.formats[p-1]) 129 | url_page = 'https://nhentai.net/g/{}/{}/'.format(id, p) 130 | url_img = urljoin(info.host, name) 131 | img = Image(url_page, url_img, p) 132 | imgs.append(img) 133 | 134 | return info, imgs 135 | 136 | 137 | -------------------------------------------------------------------------------- /src/extractor/nico_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | import nndownload 4 | from io import BytesIO 5 | import ree as re 6 | from utils import Downloader, get_print, compatstr, format_filename, try_n, LazyUrl, get_abr 7 | import utils 8 | from nico_login import login, logout 9 | import ffmpeg 10 | import os 11 | import errors 12 | 13 | 14 | def get_id(url): 15 | if '/watch/' in url: 16 | return re.find('/watch/([a-zA-Z0-9]+)', url) 17 | 18 | 19 | class Video(object): 20 | def __init__(self, session, info, format, cw): 21 | self.session = session 22 | self.info = info 23 | self.title = info['title'] 24 | self.ext = info['ext'] 25 | self.id = info['id'] 26 | self.format = format 27 | self.username = info['uploader'] 28 | self.url = 
LazyUrl('https://www.nicovideo.jp/watch/{}'.format(self.id), lambda _: info['url'], self, pp=self.pp) 29 | self.cw = cw 30 | 31 | self.filename = format_filename(self.title, self.id, self.ext) 32 | 33 | self.url_thumb = info['thumbnail_url'] 34 | print('thumb:', self.url_thumb) 35 | self.thumb = BytesIO() 36 | downloader.download(self.url_thumb, buffer=self.thumb) 37 | 38 | def pp(self, filename): 39 | if self.format == 'mp4': 40 | return 41 | name, ext_old = os.path.splitext(filename) 42 | filename_new = '{}.mp3'.format(name) 43 | ffmpeg.convert(filename, filename_new, '-shortest -preset ultrafast -b:a {}k'.format(get_abr()), cw=self.cw) 44 | 45 | if utils.ui_setting.albumArt.isChecked(): 46 | self.thumb.seek(0)# 47 | ffmpeg.add_cover(filename_new, self.thumb, {'artist':self.username, 'title':self.title}, cw=self.cw) 48 | 49 | return filename_new 50 | 51 | def __repr__(self): 52 | return u'Video({})'.format(self.id) 53 | 54 | 55 | def suitable(url): 56 | if 'live.nico' in url: #3986 57 | return False 58 | if 'nicovideo.jp' not in url.lower(): 59 | return False 60 | return get_id(url) is not None 61 | 62 | 63 | @Downloader.register 64 | class Downloader_nico(Downloader): 65 | type = 'nico' 66 | single = True 67 | URLS = [suitable] 68 | display_name = 'Niconico' 69 | _format = 'mp4' 70 | MAX_SPEED = 2.0 71 | 72 | @classmethod 73 | def fix_url(cls, url): 74 | id_ = get_id(url) 75 | if re.find(r'^https?://', id_): 76 | return url 77 | if re.find(r'^https?://', url): 78 | domain = utils.domain(url) 79 | else: 80 | domain = 'www.nicovideo.jp' 81 | return 'https://{}/watch/{}'.format(domain, id_) 82 | 83 | def read(self): 84 | ui_setting = self.ui_setting 85 | if self.cw.format: 86 | self._format = self.cw.format 87 | 88 | if self._format == 'mp3': 89 | self.cw.setMusic(True) 90 | 91 | if ui_setting.nicoBox.isChecked(): 92 | username = compatstr(ui_setting.nico_id.text()) 93 | password = compatstr(ui_setting.nico_pw.text()) 94 | else: 95 | username = '' 96 | password = '' 97 | 98 | try: 99 | session = login(username, password) 100 | except Exception as e: 101 | logout() 102 | raise errors.Invalid(u'Failed to login: {}'.format(self.url), fail=True) 103 | 104 | self.session = session 105 | try: 106 | video = get_video(session, self.url, self._format, self.cw) 107 | except Exception as e: 108 | logout() 109 | raise 110 | 111 | self.urls.append(video.url) 112 | self.setIcon(video.thumb) 113 | 114 | self.enableSegment() 115 | 116 | self.title = video.title 117 | 118 | 119 | @try_n(2) 120 | def get_video(session, url, format, cw=None): 121 | print_ = get_print(cw) 122 | 123 | id = get_id(url) 124 | if 'live.nico' in url: #3986 125 | raise NotImplementedError('nama') 126 | #info = nndownload.request_nama(session, id) 127 | else: 128 | info = nndownload.request_video(session, id) 129 | video = Video(session, info, format, cw) 130 | 131 | return video 132 | 133 | 134 | import selector 135 | @selector.options('nico') 136 | def options(): 137 | return [ 138 | {'text': 'MP4 (동영상)', 'format': 'mp4'}, 139 | {'text': 'MP3 (음원)', 'format': 'mp3'}, 140 | ] 141 | -------------------------------------------------------------------------------- /src/extractor/hanime_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Session, Downloader, get_outdir, try_n, Soup, format_filename, clean_title, get_print, get_resolution 3 | import ree as re, json 4 | from io import BytesIO 5 | import os 6 | from timee import time 7 | from 
m3u8_tools import M3u8_stream 8 | from random import randrange 9 | 10 | 11 | class Video(object): 12 | 13 | def __init__(self, info, stream): 14 | self.info = info 15 | self.id = info['id'] 16 | self.title = info['name'] 17 | self.brand = info['brand'] 18 | self.url = stream['url'] 19 | self.url_thumb = info['poster_url'] 20 | self.thumb = BytesIO() 21 | downloader.download(self.url_thumb, buffer=self.thumb) 22 | ext = os.path.splitext(self.url.split('?')[0].split('#')[0])[1] 23 | if ext.lower() == '.m3u8': 24 | print('read m3u8:', self.url) 25 | ext = '.mp4' 26 | self.url = M3u8_stream(self.url, n_thread=4) 27 | else: 28 | size = downloader.get_size(self.url) 29 | if size <= 0: 30 | raise Exception('Size is 0') 31 | self.filename = format_filename('[{}] {}'.format(self.brand, self.title), self.id, ext) 32 | 33 | def __repr__(self): 34 | return ('Video({})').format(self.id) 35 | 36 | 37 | @Downloader.register 38 | class Downloader_hanime(Downloader): 39 | type = 'hanime' 40 | URLS = ['hanime.tv/hentai-videos/', 'hanime.tv/videos/'] 41 | single = True 42 | display_name = 'hanime.tv' 43 | 44 | def read(self): 45 | video, session = get_video(self.url, cw=self.cw) 46 | self.video = video 47 | 48 | self.urls.append(video.url) 49 | self.filenames[video.url] = video.filename 50 | 51 | self.setIcon(video.thumb) 52 | self.title = u'[{}] {}'.format(video.brand, video.title) 53 | 54 | 55 | @try_n(8) 56 | def get_video(url, session=None, cw=None): 57 | print_ = get_print(cw) 58 | if session is None: 59 | session = Session() 60 | session.headers['User-Agent'] = downloader.hdr['User-Agent'] 61 | session.headers['X-Directive'] = 'api' 62 | html = downloader.read_html(url, session=session) 63 | soup = Soup(html) 64 | for script in soup.findAll('script'): 65 | script = script.text or script.string or '' 66 | data = re.find('window.__NUXT__=(.+)', script) 67 | if data is not None: 68 | data = data.strip() 69 | if data.endswith(';'): 70 | data = data[:-1] 71 | data = json.loads(data) 72 | break 73 | else: 74 | raise Exception('No __NUXT__') 75 | 76 | info = data['state']['data']['video']['hentai_video'] 77 | query = info['slug'] 78 | #url_api = 'https://members.hanime.tv/api/v3/videos_manifests/{}?'.format(query) # old 79 | url_api = 'https://hanime.tv/rapi/v7/videos_manifests/{}?'.format(query) # new 80 | print(url_api) 81 | hdr = { 82 | 'x-signature': ''.join('{:x}'.format(randrange(16)) for i in range(32)), 83 | 'x-signature-version': 'web2', 84 | 'x-time': str(int(time())), 85 | } 86 | r = session.get(url_api, headers=hdr) 87 | print(r) 88 | data = json.loads(r.text) 89 | streams = [] 90 | for server in data['videos_manifest']['servers']: 91 | streams += server['streams'] 92 | 93 | streams_good = [] 94 | for stream in streams: 95 | url_video = stream['url'] 96 | if not url_video or 'deprecated.' 
in url_video: 97 | continue 98 | stream['height'] = int(stream['height']) 99 | streams_good.append(stream) 100 | 101 | if not streams_good: 102 | raise Exception('No video available') 103 | print('len(streams_good):', len(streams_good)) 104 | res = get_resolution() 105 | 106 | def print_stream(stream): 107 | print_([stream['extension'], stream['height'], stream['filesize_mbs'], stream['url']]) 108 | 109 | steams_filtered = [] 110 | for stream in streams_good: 111 | print_stream(stream) 112 | if stream['height'] <= res: #3712 113 | steams_filtered.append(stream) 114 | 115 | if steams_filtered: 116 | stream = sorted(steams_filtered, key=lambda _: _['height'])[-1] 117 | else: 118 | stream = sorted(streams_good, key=lambda _: _['height'])[0] 119 | 120 | print_('Final stream:') 121 | print_stream(stream) 122 | return Video(info, stream), session 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /src/extractor/kakuyomu_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | import utils 4 | from utils import Soup, urljoin, Downloader, LazyUrl, get_outdir, try_n, clean_title, get_print 5 | import os 6 | from timee import sleep 7 | from io import BytesIO 8 | from translator import tr_ 9 | 10 | 11 | 12 | class Page(object): 13 | def __init__(self, url, title, date, p): 14 | self.url = url 15 | self.title = clean_title(u'[{:04}] {}'.format(p, title), n=-4) 16 | self.date = date 17 | self.filename = u'{}.txt'.format(self.title) 18 | self.file = LazyUrl(self.url, self.get_file, self) 19 | 20 | def get_file(self, url): 21 | text = get_text(self) 22 | f = BytesIO() 23 | f.write(text.encode('utf8')) 24 | f.seek(0) 25 | #f.mode = 'w' 26 | return f 27 | 28 | 29 | @Downloader.register 30 | class Downloader_kakuyomu(Downloader): 31 | type = 'kakuyomu' 32 | URLS = ['kakuyomu.jp'] 33 | MAX_CORE = 2 34 | detect_removed = False 35 | display_name = 'カクヨム' 36 | 37 | def init(self): 38 | self.info = get_info(self.url, cw=self.cw) 39 | 40 | def read(self): 41 | outdir = get_outdir('kakuyomu') 42 | 43 | self.artist = self.info['artist'] 44 | title_dir = clean_title(u'[{}] {}'.format(self.artist, self.info['title'])) 45 | 46 | for page in self.info['pages']: 47 | file = os.path.join(outdir, title_dir, page.filename) 48 | if os.path.isfile(file): 49 | self.urls.append(file) 50 | else: 51 | self.urls.append(page.file) 52 | 53 | self.title = title_dir 54 | 55 | def post_processing(self): 56 | names = self.cw.names 57 | filename = clean_title(u'[merged] [{}] {}'.format(self.artist, self.info['title']), n=-4) + '.txt' 58 | filename = os.path.join(self.dir, filename) 59 | try: 60 | with utils.open(filename, 'wb') as f: 61 | f.write(u' {}\n\n \u4f5c\u8005\uff1a{}\n\n\n'.format(self.info['title'], self.artist).encode('utf8')) 62 | f.write(self.info['description'].encode('utf8')) 63 | for i, file in enumerate(names): 64 | self.cw.pbar.setFormat('[%v/%m] {} [{}/{}]'.format(tr_(u'\ubcd1\ud569...'), i, len(names))) 65 | with open(file, 'rb') as f_: 66 | text = f_.read() 67 | f.write(b'\n\n\n\n') 68 | f.write(text) 69 | finally: 70 | self.cw.pbar.setFormat('[%v/%m]') 71 | 72 | 73 | @try_n(4, sleep=30) 74 | def get_text(page): 75 | html = downloader.read_html(page.url) 76 | soup = Soup(html) 77 | view = soup.find('div', class_='widget-episodeBody') 78 | story = view.text.strip() 79 | text =u'''──────────────────────────────── 80 | 81 | ◆ {} {} 82 | 83 | ──────────────────────────────── 84 | 85 | 
86 | {}'''.format(page.title, page.date, story) 87 | return text 88 | 89 | 90 | def get_info(url, soup=None, cw=None): 91 | print_ = get_print(cw) 92 | if soup is None: 93 | html = downloader.read_html(url) 94 | soup = Soup(html) 95 | 96 | info = {} 97 | 98 | info['title'] = soup.find('h1', id='workTitle').text.strip() 99 | info['artist'] = soup.find('span', id='workAuthor-activityName').text.strip() 100 | 101 | desc = soup.find('section', id='description') 102 | button = desc.find('span', class_='ui-truncateTextButton-expandButton') 103 | if button: 104 | print('decompose button') 105 | button.decompose() 106 | catch = desc.find('span', id='catchphrase-body') 107 | if catch is None: #4445 108 | print_('no catch') 109 | catch = '' 110 | else: 111 | catch = catch.text.strip() 112 | intro = desc.find('p', id='introduction') 113 | if intro is None: #4262 114 | print_('no intro') 115 | intro = '' 116 | else: 117 | intro = intro.text.strip() 118 | desc = u' {}{}'.format(catch, ('\n\n\n'+intro) if intro else '') 119 | info['description'] = desc 120 | 121 | pages = [] 122 | for a in soup.findAll('a', class_='widget-toc-episode-episodeTitle'): 123 | href = urljoin(url, a.attrs['href']) 124 | subtitle = a.find('span', class_='widget-toc-episode-titleLabel').text.strip() 125 | date = a.find('time', class_='widget-toc-episode-datePublished').text.strip() 126 | page = Page(href, subtitle, date, len(pages)+1) 127 | pages.append(page) 128 | 129 | info['pages'] = pages 130 | 131 | return info 132 | 133 | -------------------------------------------------------------------------------- /src/extractor/webtoon_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Soup, LazyUrl, clean_title, get_ext, get_imgs_already, urljoin, try_n, Downloader 3 | import os 4 | import page_selector 5 | from translator import tr_ 6 | import ree as re 7 | 8 | 9 | 10 | @Downloader.register 11 | class Downloader_webtoon(Downloader): 12 | type = 'webtoon' 13 | URLS = ['webtoon.com', 'webtoons.com'] 14 | MAX_CORE = 8 15 | MAX_SPEED = 4.0 16 | display_name = 'WEBTOON' 17 | 18 | def init(self): 19 | self.url = get_main(self.url) 20 | self.soup = downloader.read_soup(self.url) 21 | 22 | @classmethod 23 | def fix_url(cls, url): 24 | return url.replace('webtoon.com', 'webtoons.com') 25 | 26 | def read(self): 27 | title = clean_title(self.soup.find('h1').text.strip()) 28 | self.title = tr_(u'\uc77d\ub294 \uc911... 
{}').format(title) 29 | imgs = get_imgs_all(self.url, title, cw=self.cw) 30 | for img in imgs: 31 | if isinstance(img, Image): 32 | self.urls.append(img.url) 33 | else: 34 | self.urls.append(img) 35 | 36 | self.title = title 37 | 38 | 39 | class Page(object): 40 | 41 | def __init__(self, url, title): 42 | self.url = url 43 | self.title = title 44 | 45 | 46 | class Image(object): 47 | 48 | def __init__(self, url, page, p): 49 | ext = get_ext(url) or downloader.get_ext(url, referer=page.url) 50 | self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, ext) 51 | 52 | self.url = LazyUrl(page.url, lambda _: url, self) 53 | 54 | 55 | @try_n(2) 56 | def get_imgs(page): 57 | html = downloader.read_html(page.url) 58 | if 'window.__motiontoonViewerState__' in html: 59 | raise NotImplementedError('motiontoon') 60 | soup = Soup(html) 61 | view = soup.find('div', class_='viewer_img') 62 | imgs = [] 63 | for img in view.findAll('img'): 64 | src = img.get('data-url') or img['src'] 65 | img = Image(urljoin(page.url, src), page, len(imgs)) 66 | imgs.append(img) 67 | return imgs 68 | 69 | 70 | def get_main(url): 71 | if 'episode_no=' in url: 72 | soup = downloader.read_soup(url) 73 | url = urljoin(url, soup.find('div', class_='subj_info').find('a')['href']) 74 | return url 75 | 76 | 77 | def set_page(url, p): 78 | if '&page=' not in url: 79 | url = url + '&page={}'.format(p) 80 | else: 81 | url = re.sub('&page=[0-9]+', '&page={}'.format(p), url) 82 | if p == 1: 83 | url = url.replace('&page=1', '') 84 | return url 85 | 86 | 87 | def get_pages(url): 88 | pages = [] 89 | urls = set() 90 | for p in range(1, 101): 91 | url_page = set_page(url, p) 92 | print(url_page) 93 | for try_ in range(4): 94 | try: 95 | soup = downloader.read_soup(url_page) 96 | view = soup.find('ul', id='_listUl') 97 | if view is None: 98 | raise Exception('no view') 99 | break 100 | except Exception as e: 101 | e_ = e 102 | print(e) 103 | else: 104 | raise e_ 105 | pages_new = [] 106 | for li in view.findAll('li', recursive=False): 107 | href = urljoin(url, li.find('a')['href']) 108 | title = li.find('span', class_='subj').text.strip() 109 | if href in urls: 110 | continue 111 | urls.add(href) 112 | no = int(li['data-episode-no']) 113 | title = '{:04} - {}'.format(no, title) 114 | page = Page(href, title) 115 | pages_new.append(page) 116 | if not pages_new: 117 | break 118 | pages += pages_new 119 | return pages[::-1] 120 | 121 | 122 | @page_selector.register('webtoon') 123 | @try_n(4) 124 | def f(url): 125 | url = get_main(url) 126 | return get_pages(url) 127 | 128 | 129 | def get_imgs_all(url, title, cw=None): 130 | pages = get_pages(url) 131 | pages = page_selector.filter(pages, cw) 132 | imgs = [] 133 | for p, page in enumerate(pages): 134 | imgs_already = get_imgs_already('webtoon', title, page, cw) 135 | if imgs_already: 136 | imgs += imgs_already 137 | continue 138 | imgs += get_imgs(page) 139 | msg = tr_(u'\uc77d\ub294 \uc911... {} / {} ({}/{})').format(title, page.title, p + 1, len(pages)) 140 | if cw is not None: 141 | cw.setTitle(msg) 142 | if not cw.alive: 143 | break 144 | else: 145 | print(msg) 146 | 147 | return imgs 148 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |
4 |

5 | 6 | [![GitHub release](https://img.shields.io/github/release/KurtBestor/Hitomi-Downloader.svg?logo=github)](https://github.com/KurtBestor/Hitomi-Downloader/releases/latest) 7 | [![GitHub downloads](https://img.shields.io/github/downloads/KurtBestor/Hitomi-Downloader/latest/total.svg?logo=github)](https://github.com/KurtBestor/Hitomi-Downloader/releases/latest) 8 | [![GitHub downloads](https://img.shields.io/github/downloads/KurtBestor/Hitomi-Downloader/total.svg?logo=github)](https://github.com/KurtBestor/Hitomi-Downloader/releases) 9 | 10 | ## Links 11 | - [Download](https://github.com/KurtBestor/Hitomi-Downloader/releases/latest) 12 | - [Issues](https://github.com/KurtBestor/Hitomi-Downloader/issues) 13 | - [Scripts](https://github.com/KurtBestor/Hitomi-Downloader/wiki/Scripts) 14 | - [Chrome Extension](https://github.com/KurtBestor/Hitomi-Downloader/wiki/Chrome-Extension) 15 | 16 | ## Demo 17 | 18 | 19 | ## Features 20 | - 🍰 Simple and clear user interface 21 | - 🚀 Download acceleration 22 | - 💻 Supports 24 threads in a single task 23 | - 🚥 Supports speed limit 24 | - 📜 Supports user scripts 25 | - 🧲 Supports BitTorrent & Magnet 26 | - 🎞️ Supports M3U8 & MPD format videos 27 | - 🌙 Dark mode 28 | - 🧳 Portable 29 | - 📋 Clipboard monitor 30 | - 🗃️ Easy to organize tasks 31 | 32 | ## Supported Sites 33 | | Site | URL | 34 | | :--: | -- | 35 | | **4chan** | | 36 | | **AfreecaTV** | | 37 | | **ArtStation** | | 38 | | **AsianSister** | | 39 | | **AsmHentai** | | 40 | | **Avgle** | | 41 | | **baraag.net** | | 42 | | **半次元** | | 43 | | **BDSMlr** | | 44 | | **bilibili** | | 45 | | **ComicWalker** | | 46 | | **Coub** | | 47 | | **Danbooru** | | 48 | | **Kakao Webtoon** | | 49 | | **DeviantArt** | | 50 | | **E(x)Hentai Galleries** |
| 51 | | **Facebook** | | 52 | | **FC2 Video** | | 53 | | **Flickr** | | 54 | | **Gelbooru** | | 55 | | **Hameln** | | 56 | | **hanime.tv** | | 57 | | **Hentai Foundry** | | 58 | | **Hitomi.la** | | 59 | | **Hiyobi.me** | | 60 | | **Imgur** | | 61 | | **Instagram** | | 62 | | **Iwara** |
| 63 | | **Jmana** | | 64 | | **カクヨム** | | 65 | | **LHScan** | | 66 | | **Likee** | | 67 | | **Luscious** | | 68 | | **MyReadingManga** | | 69 | | **Naver Blog** | | 70 | | **Naver Cafe** | | 71 | | **Naver Post** | | 72 | | **Naver Webtoon** | | 73 | | **Naver TV** | | 74 | | **nhentai** | | 75 | | **nhentai.com** | | 76 | | **Niconico** | | 77 | | **ニジエ** | | 78 | | **Nozomi.la** | | 79 | | **Pawoo** | | 80 | | **Pinterest** | | 81 | | **Pixiv** | | 82 | | **pixivコミック** | | 83 | | **Pornhub** |
| 84 | | **Rule34.xxx** | | 85 | | **Sankaku Complex** |

| 86 | | **Soundcloud** | | 87 | | **小説家になろう** | | 88 | | **TOKYO Motion** | | 89 | | **Tumblr** | | 90 | | **Twitch** | | 91 | | **Twitter** | | 92 | | **Vimeo** | | 93 | | **V LIVE** | | 94 | | **Wayback Machine** | | 95 | | **Weibo** | | 96 | | **WikiArt** | | 97 | | **xHamster** | | 98 | | **XNXX** | | 99 | | **XVideos** | | 100 | | **Yande.re** | | 101 | | **Youku** | | 102 | | **YouTube** | | 103 | | **and more...** | [Supported sites by youtube-dl](http://ytdl-org.github.io/youtube-dl/supportedsites.html) | 104 | -------------------------------------------------------------------------------- /src/extractor/comicwalker_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Soup, LazyUrl, urljoin, try_n, Downloader, get_print, clean_title, get_imgs_already 4 | import ree as re 5 | from itertools import cycle 6 | from io import BytesIO 7 | import json 8 | from timee import sleep 9 | from translator import tr_ 10 | import page_selector 11 | import os 12 | 13 | 14 | # https://static.comic-walker.com/viewer/cw-viewer.min.js 15 | def decode(s, hash): 16 | # generateKey 17 | key = int(hash[:16], 16) 18 | 19 | filter = [int((key>>i*8)%256) for i in range(8)][::-1] # 20 | s2 = bytes(x^y for x, y in zip(s, cycle(filter))) 21 | return s2 22 | 23 | 24 | class Image(object): 25 | def __init__(self, src, hash, p, page): 26 | def f(_): 27 | f = BytesIO() 28 | downloader.download(src, referer=page.url, buffer=f) 29 | s = f.read() 30 | s2 = decode(s, hash) 31 | f.seek(0) 32 | f.write(s2) 33 | f.seek(0) 34 | return f 35 | self.url = LazyUrl(page.url, f, self) 36 | self.filename = u'{}/{:04}.jpg'.format(page.title, p) 37 | 38 | 39 | class Page(object): 40 | def __init__(self, url, title): 41 | self.url = url 42 | self.title = clean_title(title) 43 | 44 | 45 | @Downloader.register 46 | class Downloader_comicwalker(Downloader): 47 | type = 'comicwalker' 48 | URLS = ['comic-walker.com/contents/detail/', 'comic-walker.jp/contents/detail/'] 49 | MAX_CORE = 4 50 | display_name = 'ComicWalker' 51 | _soup = None 52 | pages = None 53 | 54 | @property 55 | def soup(self): 56 | if self._soup is None: 57 | html = downloader.read_html(self.url) 58 | self._soup = Soup(html) 59 | return self._soup 60 | 61 | def read(self): 62 | cw = self.cw 63 | title = get_title(self.soup, cw) 64 | 65 | self.imgs = get_imgs(self.url, self.soup, cw) 66 | 67 | for img in self.imgs: 68 | if isinstance(img, Image): 69 | self.urls.append(img.url) 70 | else: 71 | self.urls.append(img) 72 | 73 | self.title = title 74 | 75 | 76 | def get_imgs_page(page): 77 | cid = re.find('[?&]cid=([a-zA-Z0-9_]+)', page.url) 78 | url_api = 'https://ssl.seiga.nicovideo.jp/api/v1/comicwalker/episodes/{}/frames'.format(cid) 79 | 80 | html = downloader.read_html(url_api, referer=page.url) 81 | 82 | meta = json.loads(html) 83 | data = meta['data'] 84 | imgs = [] 85 | for item in data['result']: 86 | src = item['meta']['source_url'] 87 | hash = item['meta']['drm_hash'] 88 | img = Image(src, hash, len(imgs), page) 89 | imgs.append(img) 90 | 91 | return imgs 92 | 93 | 94 | def get_pages(url, soup=None): 95 | if soup is None: 96 | html = downloader.read_html(url) 97 | soup = Soup(html) 98 | 99 | pages = [] 100 | for item in soup.findAll('div', class_='acBacknumber-item-leftbox'): 101 | item = item.parent 102 | a = item.find('a') 103 | title = a.attrs['title'] 104 | href = a.attrs['href'] 105 | href = urljoin(url, href) 106 | page = Page(href, title) 107 | 
pages.append(page) 108 | 109 | return pages 110 | 111 | 112 | def get_title(soup, cw=None): 113 | print_ = get_print(cw) 114 | for h1 in soup.findAll('h1'): 115 | title = h1.text.strip() 116 | if title: 117 | break 118 | else: 119 | raise Exception('no title') 120 | title_clean = clean_title(title) 121 | print_('get_title: "{}"({}) "{}"({})'.format(title, title.encode('utf8'), title_clean, title_clean.encode('utf8'))) 122 | return title_clean 123 | 124 | 125 | @page_selector.register('comicwalker') 126 | @try_n(4) 127 | def f(url): 128 | if '/viewer/' in url: 129 | raise Exception(tr_(u'목록 주소를 입력해주세요')) 130 | pages = get_pages(url) 131 | return pages 132 | 133 | 134 | def get_imgs(url, soup=None, cw=None): 135 | if soup is None: 136 | html = downloader.read_html(url) 137 | soup = Soup(html) 138 | 139 | title = get_title(soup, cw) 140 | 141 | pages = get_pages(url, soup) 142 | pages = page_selector.filter(pages, cw) 143 | 144 | imgs = [] 145 | for i, page in enumerate(pages): 146 | imgs_already = get_imgs_already('comicwalker', title, page, cw) 147 | if imgs_already: 148 | imgs += imgs_already 149 | continue 150 | 151 | if cw is not None: 152 | if not cw.alive: 153 | return 154 | cw.setTitle(u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages))) 155 | 156 | imgs += get_imgs_page(page) 157 | 158 | return imgs 159 | 160 | -------------------------------------------------------------------------------- /src/extractor/hameln_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf8 2 | from __future__ import division, print_function, unicode_literals 3 | import downloader 4 | import os 5 | import utils 6 | from utils import Soup, urljoin, get_text, LazyUrl, try_n, Downloader, lazy, clean_title 7 | import ree as re 8 | from io import BytesIO 9 | from timee import sleep 10 | from translator import tr_ 11 | 12 | 13 | 14 | @Downloader.register 15 | class Downloader_hameln(Downloader): 16 | type = 'hameln' 17 | URLS = ['syosetu.org'] 18 | MAX_CORE = 2 19 | detect_removed = False 20 | 21 | def init(self): 22 | id_ = re.find('/novel/([^/]+)', self.url) 23 | if id_ is not None: 24 | self.url = 'https://syosetu.org/novel/{}/'.format(id_) 25 | 26 | @lazy 27 | def soup(self): 28 | html = read_html(self.url) 29 | soup = Soup(html) 30 | return soup 31 | 32 | @lazy 33 | def info(self): 34 | return get_info(self.url, self.soup) 35 | 36 | def read(self): 37 | for page in get_pages(self.url, self.soup): 38 | text = Text(page, len(self.urls)+1) 39 | self.urls.append(text.url) 40 | 41 | self.artist = self.info['artist'] 42 | self.title = clean_title('[{}] {}'.format(self.artist, self.info['title']), n=-len('[merged] .txt')) 43 | 44 | def post_processing(self): 45 | names = self.cw.names 46 | filename = os.path.join(self.dir, '[merged] {}.txt'.format(self.title)) 47 | try: 48 | with utils.open(filename, 'wb') as f: 49 | f.write(' {}\n\n 作者:{}\n\n\n'.format(self.info['title'], self.artist).encode('utf8')) 50 | if self.info['novel_ex']: 51 | f.write(self.info['novel_ex'].encode('utf8')) 52 | for i, file in enumerate(names): 53 | self.cw.pbar.setFormat('[%v/%m] {} [{}/{}]'.format(tr_('병합...'), i, len(names))) 54 | with open(file, 'rb') as f_: 55 | text = f_.read() 56 | f.write(b'\n\n\n\n') 57 | f.write(text) 58 | finally: 59 | self.cw.pbar.setFormat('[%v/%m]') 60 | 61 | 62 | class Text(object): 63 | def __init__(self, page, p): 64 | self.page = page 65 | self.url = LazyUrl(page.url, self.get, self) 66 | self.filename = 
clean_title('[{:04}] {}'.format(p, page.title), n=-4) + '.txt' 67 | 68 | def get(self, url): 69 | text = read_page(self.page) 70 | f = BytesIO() 71 | f.write(text.encode('utf8')) 72 | f.seek(0) 73 | return f 74 | 75 | 76 | class Page(object): 77 | def __init__(self, title, url): 78 | self.title = clean_title(title) 79 | self.url = url 80 | 81 | 82 | 83 | def read_html(url): 84 | return downloader.read_html(url, cookies={'over18': 'off'}, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'}) 85 | 86 | 87 | def get_sss(soup): 88 | sss = [ss for ss in soup.findAll('div', class_='ss') if ss.attrs.get('id')!='fmenu'] 89 | return sss 90 | 91 | 92 | def get_pages(url, soup=None): 93 | if soup is None: 94 | html = read_html(url) 95 | soup = Soup(html) 96 | 97 | sss = get_sss(soup) 98 | list = sss[-1] 99 | 100 | pages = [] 101 | for tr in list.findAll('tr'): 102 | a = tr.find('a') 103 | if a is None: 104 | continue 105 | text =a.text.strip() 106 | href = urljoin(url, a.attrs['href']) 107 | page = Page(text, href) 108 | pages.append(page) 109 | 110 | return pages 111 | 112 | 113 | @try_n(22, sleep=30) 114 | def read_page(page): 115 | html = read_html(page.url) 116 | soup = Soup(html) 117 | 118 | text_top = get_text(soup.find('div', id='maegaki')) 119 | print(text_top.count('\n')) 120 | text_mid = get_text(soup.find('div', id='honbun')) 121 | text_bot = get_text(soup.find('div', id='atogaki')) 122 | 123 | texts = [text for text in (text_top, text_mid, text_bot) if text] 124 | 125 | story = ''' 126 | 127 | ──────────────────────────────── 128 | 129 | '''.join(texts) 130 | 131 | text = '''──────────────────────────────── 132 | 133 | ◆ {} 134 | 135 | ──────────────────────────────── 136 | 137 | 138 | {}'''.format(page.title, story) 139 | 140 | return text 141 | 142 | 143 | def get_info(url, soup=None): 144 | if soup is None: 145 | html = read_html(url) 146 | soup = Soup(html) 147 | 148 | info = {} 149 | info['artist'] = soup.find('span', {'itemprop':'author'}).text.strip() 150 | info['title'] = soup.find('span', {'itemprop':'name'}).text.strip() 151 | sss = get_sss(soup) 152 | info['novel_ex'] = get_text(sss[-2], '') 153 | return info 154 | 155 | -------------------------------------------------------------------------------- /src/extractor/imgur_downloader.py: -------------------------------------------------------------------------------- 1 | # uncompyle6 version 3.5.0 2 | # Python bytecode 2.7 (62211) 3 | # Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)] 4 | # Embedded file name: imgur_downloader.pyo 5 | # Compiled at: 2019-10-07 05:58:14 6 | import downloader 7 | from utils import Downloader, Soup, try_n, urljoin, get_max_range, clean_title, cut_pair 8 | import ree as re, json, os 9 | from timee import sleep 10 | from translator import tr_ 11 | 12 | @Downloader.register 13 | class Downloader_imgur(Downloader): 14 | type = 'imgur' 15 | URLS = ['imgur.com'] 16 | MAX_CORE = 16 17 | 18 | def init(self): 19 | self.info = get_info(self.url) 20 | 21 | @property 22 | def id_(self): 23 | return re.find('imgur.com/.+?/([0-9a-zA-Z]+)', self.url) 24 | 25 | @property 26 | def name(self): 27 | title = self.info['title'] or 'N/A' 28 | return clean_title(title, n=100) 29 | 30 | def read(self): 31 | imgs = get_imgs(self.url, self.info, self.cw) 32 | for img in imgs: 33 | ext = os.path.splitext(img.split('?')[0])[1] 34 | if len(imgs) > 1: 35 | self.filenames[img] = 
(u'{:04}{}').format(len(self.urls), ext) 36 | else: 37 | self.filenames[img] = clean_title(self.name, n=-len(ext)) + ext 38 | self.urls.append(img) 39 | 40 | self.single = len(imgs) == 1 41 | self.referer = self.url 42 | self.title = u'{} (imgur_{})'.format(self.name, self.id_) 43 | 44 | 45 | @try_n(4) 46 | def get_info(url): 47 | url = url.replace('/gallery/', '/a/') 48 | if '/r/' in url and url.split('/r/')[1].strip('/').count('/') == 0: 49 | title = re.find(r'/r/([^/]+)', url) 50 | info = {} 51 | info['title'] = title 52 | info['type'] = 'r' 53 | else: 54 | try: # legacy 55 | html = downloader.read_html(url, cookies={'over18':'1'}) 56 | s = re.find('image *: *({.+)', html) 57 | info_raw = cut_pair(s) 58 | except Exception as e: # new 59 | print(e) 60 | id_ = re.find(r'/a/([0-9a-zA-Z_]+)', url) or re.find(r'/r/[0-9a-zA-Z_]+/([0-9a-zA-Z_]+)', url, err='no id') 61 | url_api = 'https://api.imgur.com/post/v1/albums/{}?client_id=546c25a59c58ad7&include=media%2Cadconfig%2Caccount'.format(id_) 62 | info_raw = downloader.read_html(url_api, cookies={'over18':'1'}) 63 | info = json.loads(info_raw) 64 | info['type'] = 'a' 65 | return info 66 | 67 | 68 | def get_imgs(url, info=None, cw=None): 69 | print('get_imgs', url) 70 | if info is None: 71 | info = get_info(url) 72 | imgs = [] 73 | 74 | # Range 75 | max_pid = get_max_range(cw) 76 | 77 | if info['type'] == 'a': 78 | if 'album_images' in info: # legacy 79 | imgs_ = info['album_images']['images'] 80 | elif 'media' in info: # new 81 | imgs_ = info['media'] 82 | else: # legacy 83 | imgs_ = [info] 84 | 85 | for img in imgs_: 86 | img_url = img.get('url') # new 87 | if not img_url: # legacy 88 | hash = img['hash'] 89 | ext = img['ext'] 90 | img_url = 'https://i.imgur.com/{}{}'.format(hash, ext) 91 | if img_url in imgs: 92 | continue 93 | imgs.append(img_url) 94 | 95 | elif info['type'] == 'r': 96 | urls = set() 97 | for p in range(100): 98 | url_api = 'https://imgur.com/r/{}/new/page/{}/hit?scrolled'.format(info['title'], p) 99 | print(url_api) 100 | html = downloader.read_html(url_api, referer=url) 101 | soup = Soup(html) 102 | 103 | c = 0 104 | for post in soup.findAll('div', class_='post'): 105 | a = post.find('a', class_='image-list-link') 106 | url_post = urljoin(url, a.attrs['href']) 107 | if url_post in urls: 108 | continue 109 | urls.add(url_post) 110 | c += 1 111 | 112 | try: # for r18 images 113 | imgs += get_imgs(url_post) 114 | except Exception as e: 115 | print(e) 116 | 117 | s = (u'{} {} ({})').format(tr_(u'\uc77d\ub294 \uc911...'), info['title'], len(imgs)) 118 | if cw is not None: 119 | if cw.alive: 120 | cw.setTitle(s) 121 | else: 122 | return [] 123 | else: 124 | print(s) 125 | 126 | if c == 0: 127 | print('same; break') 128 | break 129 | 130 | return imgs 131 | 132 | -------------------------------------------------------------------------------- /src/extractor/discord_emoji_downloader.py: -------------------------------------------------------------------------------- 1 | # coding: UTF-8 2 | # title: Discord 서버 커스텀 이모지 다운로드 3 | # author: SaidBySolo 4 | 5 | """ 6 | MIT License 7 | 8 | Copyright (c) 2020 SaidBySolo 9 | 10 | Permission is hereby granted, free of charge, to any person obtaining a copy 11 | of this software and associated documentation files (the "Software"), to deal 12 | in the Software without restriction, including without limitation the rights 13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | copies of the Software, and to permit persons to whom the Software is 15 | furnished to do 
so, subject to the following conditions: 16 | 17 | The above copyright notice and this permission notice shall be included in all 18 | copies or substantial portions of the Software. 19 | 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | SOFTWARE. 27 | """ 28 | from utils import Downloader, clean_title 29 | import requests 30 | import errors 31 | 32 | 33 | @Downloader.register 34 | class DownloaderDiscordEmoji(Downloader): 35 | type = "discord" 36 | 37 | def init(self): 38 | pass 39 | 40 | def read(self): 41 | token_guild_id_list = self.url.split( 42 | "/" 43 | ) # 값을 어떻게 받을지 몰라서 일단 나눴어요. discord_이메일/비밀번호/서버아이디 또는 discord_토큰/서버아이디 이런식으로 받게 해놨어요. 44 | 45 | if len(token_guild_id_list) == 2: 46 | token = token_guild_id_list[0] 47 | guild_id = token_guild_id_list[1] 48 | elif len(token_guild_id_list) == 3: 49 | email = token_guild_id_list[0] 50 | password = token_guild_id_list[1] 51 | guild_id = token_guild_id_list[2] 52 | 53 | response = self.post_account_info(email, password) 54 | account_info = response.json() 55 | if response.status_code == 400: 56 | if account_info.get("captcha_key"): 57 | raise errors.Invalid( 58 | "먼저 웹 또는 디스코드 앱에서 로그인하신후 캡차를 인증해주세요." 59 | ) # 메세지 박스 return하니까 멈춰서 raise로 해놨어요 60 | else: 61 | raise errors.Invalid("이메일 또는 비밀번호가 잘못되었습니다. 확인후 다시 시도해주세요.") 62 | else: 63 | if not account_info["token"]: 64 | raise errors.Invalid("토큰을 받아오지 못했어요. 2단계인증을 사용중이신경우 토큰을 이용해 요청해주세요.") 65 | else: 66 | token = account_info["token"] 67 | else: 68 | raise errors.Invalid("인자값이 더 많이왔어요.") 69 | 70 | guild_info_response = self.get_emoji_list(token, int(guild_id)) # 토큰과 함께 get요청함 71 | 72 | if guild_info_response.status_code != 200: 73 | raise errors.Invalid("정상적인 토큰이 아니거나 서버를 찾을수없어요. 
맞는 토큰인지, 해당 서버에 접속해있는지 확인해주세요.") 74 | else: 75 | guild_info = guild_info_response.json() 76 | 77 | if guild_info["emojis"]: 78 | base_url = "https://cdn.discordapp.com/emojis/" 79 | for emoji in guild_info["emojis"]: # 이모지 리스트로 가져옴 80 | if emoji["animated"] is True: # 만약 gif면 gif 다운로드 81 | param = emoji["id"] + ".gif" 82 | else: # 아닐경우 png로 83 | param = emoji["id"] + ".png" 84 | 85 | self.title = clean_title( 86 | f'{guild_info["name"]}({guild_info["id"]})' # 폴더 이름은 서버 이름, id 87 | ) 88 | self.urls.append(base_url + param + "?v=1") # 인자 합치기 89 | else: 90 | raise errors.Invalid("해당 서버에는 이모지가 없어요") 91 | 92 | def get_emoji_list(self, token: str, guild_id: int) -> dict: 93 | response = requests.get( 94 | f"https://discordapp.com/api/v6/guilds/{guild_id}", 95 | headers={"Authorization": token}, 96 | ) 97 | if response.status_code == 401: 98 | response = requests.get( 99 | f"https://discordapp.com/api/v6/guilds/{guild_id}", 100 | headers={"Authorization": f"Bot {token}"}, 101 | ) 102 | 103 | return response 104 | 105 | def post_account_info(self, email: str, password: str) -> dict: 106 | response = requests.post( 107 | "https://discordapp.com/api/v8/auth/login", 108 | json={ 109 | "email": email, 110 | "password": password, 111 | "undelete": False, 112 | "captcha_key": None, 113 | "login_source": None, 114 | "gift_code_sku_id": None, 115 | }, 116 | ) 117 | 118 | return response 119 | -------------------------------------------------------------------------------- /src/extractor/bdsmlr_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Session, Soup, LazyUrl, Downloader, get_max_range, try_n, get_print, clean_title 4 | from datetime import datetime 5 | import ree as re 6 | import os 7 | from translator import tr_ 8 | from timee import sleep 9 | from error_printer import print_error 10 | import clf2 11 | import errors 12 | 13 | 14 | @Downloader.register 15 | class Downloader_bdsmlr(Downloader): 16 | type = 'bdsmlr' 17 | URLS = ['bdsmlr.com'] 18 | display_name = 'BDSMlr' 19 | 20 | def init(self): 21 | if u'bdsmlr.com/post/' in self.url: 22 | raise errors.Invalid(tr_(u'개별 다운로드는 지원하지 않습니다: {}').format(self.url)) 23 | 24 | self.url = 'https://{}.bdsmlr.com'.format(self.id_) 25 | self.session = Session() 26 | clf2.solve(self.url, session=self.session, cw=self.cw) 27 | 28 | @property 29 | def id_(self): 30 | url = self.url 31 | if 'bdsmlr.com' in url: 32 | if 'www.bdsmlr.com' in url: 33 | raise Exception('www.bdsmlr.com') 34 | gal_num = url.split('.bdsmlr.com')[0].split('/')[(-1)] 35 | else: 36 | gal_num = url 37 | return gal_num 38 | 39 | def read(self): 40 | info = get_imgs(self.id_, session=self.session, cw=self.cw) 41 | 42 | for post in info['posts']: 43 | self.urls.append(post.url) 44 | 45 | self.title = u'{} (bdsmlr_{})'.format(clean_title(info['username']), self.id_) 46 | 47 | 48 | class Post(object): 49 | def __init__(self, url, referer, id, p): 50 | self.id = id 51 | self.url = LazyUrl(referer, lambda x: url, self) 52 | ext = os.path.splitext(url)[1] 53 | self.filename = u'{}_p{}{}'.format(id, p, ext) 54 | 55 | 56 | def foo(url, soup, info, reblog=False): 57 | #print('foo', info['c'], len(info['ids'])) 58 | for post in soup.findAll('div', class_='wrap-post'): 59 | try: 60 | id = int(re.find('[0-9]+', post.attrs['class'][1])) 61 | except Exception as e: 62 | print(print_error(e)[-1]) 63 | continue 64 | if id in info['ids']: 65 | continue 66 | info['ids'].add(id) 67 | info['last'] = id 68 | if 
not reblog and post.find('div', class_='ogname'): 69 | continue 70 | for p, mag in enumerate(post.findAll(['a', 'div'], class_='magnify')): 71 | post = Post(mag.attrs['href'], url, id, p) 72 | info['posts'].append(post) 73 | info['c'] += 20 if info['c'] else 5 74 | 75 | 76 | @try_n(2) 77 | def get_imgs(user_id, session, cw=None): 78 | print_ = get_print(cw) 79 | url = 'https://{}.bdsmlr.com/'.format(user_id) 80 | info = {'c': 0, 'posts': [], 'ids': set()} 81 | 82 | html = downloader.read_html(url, session=session) 83 | soup = Soup(html) 84 | 85 | sorry = soup.find('div', class_='sorry') 86 | if sorry: 87 | raise Exception(sorry.text.strip()) 88 | 89 | username = soup.find('title').text.strip()### 90 | print('username:', username) 91 | info['username'] = username 92 | 93 | token = soup.find('meta', {'name': 'csrf-token'}).attrs['content'] 94 | print_(u'token: {}'.format(token)) 95 | 96 | max_pid = get_max_range(cw) 97 | 98 | n = len(info['ids']) 99 | for p in range(1000): 100 | if p == 0: 101 | url_api = 'https://{}.bdsmlr.com/loadfirst'.format(user_id) 102 | else: 103 | url_api = 'https://{}.bdsmlr.com/infinitepb2/{}'.format(user_id, user_id) 104 | data = { 105 | 'scroll': str(info['c']), 106 | 'timenow': datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 107 | } 108 | if 'last' in info: 109 | data['last'] = str(info['last']) 110 | print_(u'n:{}, scroll:{}, last:{}'.format(len(info['posts']), data['scroll'], data.get('last'))) 111 | headers = { 112 | 'Referer': url, 113 | 'X-CSRF-TOKEN': token, 114 | } 115 | for try_ in range(4): 116 | try: 117 | r = session.post(url_api, data=data, headers=headers) 118 | if p == 0: 119 | r.raise_for_status() 120 | break 121 | except Exception as e: 122 | print(e) 123 | else: 124 | raise 125 | soup = Soup(r.text) 126 | foo(url, soup, info) 127 | if len(info['ids']) == n: 128 | print('same; break') 129 | break 130 | n = len(info['ids']) 131 | 132 | s = u'{} {} (tumblr_{}) - {}'.format(tr_(u'읽는 중...'), username, user_id, len(info['posts'])) 133 | if cw is not None: 134 | if not cw.alive: 135 | return 136 | cw.setTitle(s) 137 | else: 138 | print(s) 139 | 140 | if len(info['posts']) > max_pid: 141 | break 142 | 143 | return info 144 | 145 | -------------------------------------------------------------------------------- /src/extractor/nijie_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import downloader 3 | from utils import Downloader, urljoin, get_max_range, query_url, Soup, Session, LazyUrl, get_print, clean_title, try_n, get_ext 4 | from translator import tr_ 5 | from constants import clean_url 6 | import ree as re 7 | from errors import LoginRequired 8 | 9 | 10 | def get_id(url): 11 | return re.find('id=([0-9]+)', url) 12 | 13 | 14 | def get_name(soup): 15 | return soup.find('p', class_='user_icon').find('a', class_='name').text.strip() 16 | 17 | 18 | def isLogin(soup): 19 | if soup.find('ul', id="sub-menu"): 20 | return True 21 | return False 22 | 23 | 24 | @Downloader.register 25 | class Downloader_nijie(Downloader): 26 | type = 'nijie' 27 | URLS = ['nijie.info'] 28 | MAX_CORE = 4 29 | display_name = 'ニジエ' 30 | 31 | def init(self): 32 | if 'members.php' not in self.url and 'members_illust.php' not in self.url: 33 | raise NotImplementedError() 34 | id = get_id(self.url) 35 | html = downloader.read_html('https://nijie.info/members.php?id={}'.format(id)) 36 | self.soup = Soup(html) 37 | 38 | if not isLogin(self.soup): 39 | raise LoginRequired() 40 | 41 | @classmethod 42 | def fix_url(cls, 
url): 43 | if 'nijie.info' not in url.lower(): 44 | url = 'https://nijie.info/members.php?id={}'.format(url) 45 | return url.replace('http://', 'https://') 46 | 47 | @property 48 | def name(self): 49 | name = u'{} (nijie_{})'.format(get_name(self.soup), get_id(self.url)) 50 | return clean_title(name) 51 | 52 | def read(self): 53 | self.title = self.name 54 | 55 | imgs = get_imgs(self.url, self.name, cw=self.cw) 56 | 57 | for img in imgs: 58 | self.urls.append(img.url) 59 | 60 | self.title = self.name 61 | 62 | 63 | 64 | class Image(object): 65 | def __init__(self, id, url, p, lazy=True, img=None): 66 | self.id = id 67 | self.p = p 68 | if lazy: 69 | self.url = LazyUrl(url, self.get_single, self) 70 | else: 71 | self.url = LazyUrl(url, lambda _:img, self) 72 | ext = get_ext(img) 73 | self.filename = '{}_p{}{}'.format(id, p, ext) 74 | 75 | def get_single(self, url): # single 76 | img = get_imgs_post(self.id, url)[0].url() 77 | ext = get_ext(img) 78 | self.filename = '{}_p{}{}'.format(self.id, self.p, ext) 79 | return img 80 | 81 | 82 | @try_n(8, sleep=10) 83 | def get_imgs_post(id, url): 84 | #print('get_imgs_post', id, url) 85 | html = downloader.read_html(url) 86 | soup = Soup(html) 87 | view = soup.find('div', id='gallery') 88 | imgs = [] 89 | for img in view.findAll(class_='mozamoza'): 90 | url_img = urljoin(url, img['src']) 91 | url_img = re.sub('__rs_l[0-9]+x[0-9]+/', '', url_img) 92 | img = Image(id, url, len(imgs), False, url_img) 93 | imgs.append(img) 94 | return imgs 95 | 96 | 97 | def setPage(url, page): 98 | # Always use HTTPS 99 | url = url.replace('http://', 'https://') 100 | 101 | # Change the page 102 | if 'p=' in url: 103 | url = re.sub('p=[0-9]*', 'p={}'.format(page), url) 104 | else: 105 | url += '&p={}'.format(page) 106 | 107 | return url 108 | 109 | 110 | def get_imgs(url, title=None, cw=None): 111 | print_ = get_print(cw) 112 | url = clean_url(url) 113 | 114 | id = get_id(url) 115 | url = u'https://nijie.info/members_illust.php?id={}'.format(id) 116 | 117 | # Range 118 | max_pid = get_max_range(cw) 119 | 120 | imgs = [] 121 | url_imgs = set() 122 | for p in range(1, 1+100): 123 | url = setPage(url, p) 124 | print_(url) 125 | html = downloader.read_html(url) 126 | 127 | soup = Soup(html) 128 | posts = soup.findAll('div', class_='nijie') 129 | if not posts: 130 | print('no posts') 131 | break 132 | c = 0 133 | for post in posts: 134 | url_img = urljoin(url, post.a.attrs['href']) 135 | if url_img in url_imgs: 136 | print('duplicate:', url_img) 137 | continue 138 | url_imgs.add(url_img) 139 | id = int(re.find('[?&]id=([0-9]+)', url_img)) 140 | multi = post.find('div', class_='thumbnail-icon') 141 | if multi: 142 | imgs_ = get_imgs_post(id, url_img)# 143 | else: 144 | imgs_ = [Image(id, url_img, 0)] 145 | 146 | imgs += imgs_ 147 | c += 1 148 | 149 | if len(imgs) >= max_pid: 150 | break 151 | 152 | msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)) 153 | if cw: 154 | if not cw.alive: 155 | return 156 | cw.setTitle(msg) 157 | else: 158 | print(msg) 159 | 160 | if len(imgs) >= max_pid or c == 0: 161 | break 162 | return imgs 163 | 164 | 165 | -------------------------------------------------------------------------------- /src/extractor/hf_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Soup, urljoin, Session, LazyUrl, Downloader, lazy, try_n, clean_title 4 | import ree as re 5 | import os 6 | from translator import tr_ 7 | from timee import sleep 8 | 
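# A standalone sketch of the setPage() pagination helper in nijie_downloader.py
# above, using the standard-library re module in place of the project's "ree"
# wrapper; the member id in the example URL is arbitrary.
import re

def set_page(url: str, page: int) -> str:
    url = url.replace('http://', 'https://')                   # always use HTTPS
    if 'p=' in url:
        return re.sub(r'p=[0-9]*', 'p={}'.format(page), url)   # rewrite existing p=
    return url + '&p={}'.format(page)                          # otherwise append one

print(set_page('https://nijie.info/members_illust.php?id=44', 2))
# -> https://nijie.info/members_illust.php?id=44&p=2
print(set_page('https://nijie.info/members_illust.php?id=44&p=2', 3))
# -> https://nijie.info/members_illust.php?id=44&p=3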
URL_ENTER = 'https://www.hentai-foundry.com/site/index?enterAgree=1&size=1550' 9 | URL_FILTER = 'https://www.hentai-foundry.com/site/filters' 10 | 11 | 12 | class Image(object): 13 | def __init__(self, url, session): 14 | @try_n(4) 15 | def f(_): 16 | html = downloader.read_html(url, session=session) 17 | soup = Soup(html) 18 | 19 | box = soup.find('section', id='picBox') 20 | img = box.find('img') 21 | if img is None: 22 | raise Exception('No img') 23 | 24 | onclick = img.attrs.get('onclick', '') 25 | if onclick and '.src' in onclick: 26 | print('onclick', onclick) 27 | img = re.find('''.src *= *['"](.+?)['"]''', onclick) 28 | else: 29 | img = img.attrs['src'] 30 | img = urljoin(url, img) 31 | 32 | filename = clean_title(os.path.basename(img.split('?')[0])) 33 | name, ext = os.path.splitext(filename) 34 | 35 | # https://www.hentai-foundry.com/pictures/user/DrGraevling/74069/Eversong-Interrogation-pg.-13 36 | if ext.lower() not in ['.bmp', '.png', '.gif', '.jpg', '.jpeg', '.webp', '.webm', '.avi', '.mp4', '.mkv', '.wmv']: 37 | filename = u'{}.jpg'.format(name) 38 | 39 | self.filename = filename 40 | return img 41 | self.url = LazyUrl(url, f, self) 42 | 43 | 44 | def get_username(url): 45 | if 'user/' in url: 46 | username = url.split('user/')[1].split('?')[0].split('/')[0] 47 | return username 48 | 49 | 50 | @Downloader.register 51 | class Downloader_hf(Downloader): 52 | type = 'hf' 53 | URLS = ['hentai-foundry.com'] 54 | MAX_CORE = 16 55 | display_name = 'Hentai Foundry' 56 | 57 | def init(self): 58 | self.session = enter() 59 | 60 | @classmethod 61 | def fix_url(cls, url): 62 | username = get_username(url) 63 | return 'https://www.hentai-foundry.com/user/{}'.format(username) 64 | 65 | def read(self): 66 | username = get_username(self.url) 67 | self.title = username 68 | 69 | imgs = get_imgs(username, self.title, self.session, cw=self.cw) 70 | 71 | for img in imgs: 72 | self.urls.append(img.url) 73 | 74 | self.title = username 75 | 76 | 77 | @try_n(2) 78 | def enter(): 79 | print('enter') 80 | session = Session() 81 | 82 | r = session.get(URL_ENTER) 83 | 84 | # 862 85 | html = r.text 86 | soup = Soup(html) 87 | box = soup.find('aside', id='FilterBox') 88 | data = {} 89 | for select in box.findAll('select'): 90 | name = select.attrs['name'] 91 | value = select.findAll('option')[-1].attrs['value'] 92 | print(name, value) 93 | data[name] = value 94 | for input in box.findAll('input'): 95 | name = input.attrs['name'] 96 | value = input.attrs['value'] 97 | if name.startswith('rating_') or 'CSRF_TOKEN' in name: 98 | print(name, value) 99 | data[name] = value 100 | data.update({ 101 | 'filter_media': 'A', 102 | 'filter_order': 'date_new', 103 | 'filter_type': '0', 104 | }) 105 | r = session.post(URL_FILTER, data=data, headers={'Referer': r.url}) 106 | print(r) 107 | 108 | return session 109 | 110 | 111 | def get_imgs(username, title, session, cw=None): 112 | url = 'https://www.hentai-foundry.com/pictures/user/{}'.format(username) 113 | 114 | #downloader.read_html(url_enter, session=session) 115 | 116 | hrefs = [] 117 | for p in range(100): 118 | print(url) 119 | html = downloader.read_html(url, session=session) 120 | soup = Soup(html) 121 | 122 | if soup.find('div', id='entryButtonContainer'): 123 | session = enter() 124 | continue 125 | 126 | tab = soup.find('a', class_='active') 127 | n = re.find(r'\(([0-9]+)', tab.text) 128 | 129 | view = soup.find('div', class_='galleryViewTable') 130 | for a in view.findAll('a', class_='thumbLink'): 131 | href = urljoin(url, a.attrs['href']) 132 | if href 
in hrefs: 133 | print('dup') 134 | continue 135 | hrefs.append(href) 136 | 137 | next = soup.find(lambda tag: tag.name == 'li' and tag.get('class') == ['next']) 138 | if next is None: 139 | break 140 | url = urljoin(url, next.a.attrs['href']) 141 | 142 | s = u'{} {} ({} / {})'.format(tr_(u'읽는 중...'), title, len(hrefs), n) 143 | if cw: 144 | if not cw.alive: 145 | return [] 146 | cw.setTitle(s) 147 | else: 148 | print(s) 149 | 150 | imgs = [] 151 | for href in hrefs: 152 | img = Image(href, session) 153 | imgs.append(img) 154 | 155 | return imgs 156 | 157 | -------------------------------------------------------------------------------- /src/extractor/luscious_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Soup, Downloader, LazyUrl, urljoin, try_n, get_outdir, clean_title 4 | import ree as re 5 | import os 6 | from timee import sleep 7 | from translator import tr_ 8 | from io import BytesIO 9 | import json 10 | 11 | 12 | class Image(object): 13 | def __init__(self, item, referer): 14 | self.item = item 15 | self.id = str(item['id']) 16 | self.referer = referer 17 | self.url = LazyUrl(referer, self.get, self) 18 | 19 | def get(self, url): 20 | img = urljoin(url, self.item['url_to_original']) 21 | ext = os.path.splitext(img.split('?')[0])[1] 22 | self.filename = u'{}{}'.format(self.id, ext) 23 | return img 24 | 25 | 26 | class Video(object): 27 | def __init__(self, url, title, url_thumb): 28 | self.url = url 29 | self.title = title 30 | ext = os.path.splitext(url.split('?')[0])[1] 31 | self.filename = u'{}{}'.format(clean_title(title), ext) 32 | self.url_thumb = url_thumb 33 | self.thumb = BytesIO() 34 | downloader.download(self.url_thumb, buffer=self.thumb) 35 | 36 | 37 | @Downloader.register 38 | class Downloader_luscious(Downloader): 39 | type = 'luscious' 40 | URLS = ['luscious.net'] 41 | MAX_CORE = 4 42 | 43 | @classmethod 44 | def fix_url(cls, url): 45 | url = url.replace('members.luscious.', 'www.luscious.') 46 | return url 47 | 48 | def read(self): 49 | for try_ in range(8): 50 | try: 51 | html = downloader.read_html(self.url) 52 | break 53 | except Exception as e: 54 | e_ = e 55 | self.print_error(e) 56 | self.print_('retry...') 57 | else: 58 | raise e_ 59 | soup = Soup(html) 60 | title = clean_title(get_title(soup)) 61 | 62 | self.title = tr_(u'읽는 중... 
{}').format(title) 63 | 64 | if '/videos/' in self.url: 65 | video = get_video(self.url, soup) 66 | imgs = [video] 67 | self.setIcon(video.thumb) 68 | else: 69 | imgs = get_imgs(self.url, soup, self.cw) 70 | 71 | dir = os.path.join(get_outdir(self.type), title) 72 | names = {} 73 | try: 74 | for name in os.listdir(dir): 75 | id = os.path.splitext(name)[0] 76 | names[id] = name 77 | except: 78 | pass 79 | 80 | for img in imgs: 81 | if img.id in names: 82 | url = os.path.join(dir, names[img.id]) 83 | else: 84 | url = img.url 85 | self.urls.append(url) 86 | 87 | self.title = title# 88 | 89 | 90 | def update(cw, title, imgs): 91 | s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)) 92 | if cw is not None: 93 | cw.setTitle(s) 94 | else: 95 | print(s) 96 | 97 | def get_imgs(url, soup=None, cw=None): 98 | if soup is None: 99 | html = downloader.read_html(url) 100 | soup = Soup(html) 101 | title = get_title(soup) 102 | 103 | imgs = [] 104 | for p in range(1, 81): 105 | imgs_new = get_imgs_p(url, p) 106 | if not imgs_new: 107 | break 108 | imgs += imgs_new 109 | update(cw, title, imgs) 110 | return imgs 111 | 112 | 113 | @try_n(4, sleep=30) 114 | def get_imgs_p(url, p=1): 115 | id = re.find('/albums/[^/]+?([0-9]+)/', url+'/') 116 | print(url, id) 117 | url_api = 'https://api.luscious.net/graphql/nobatch/?operationName=AlbumListOwnPictures&query=+query+AlbumListOwnPictures%28%24input%3A+PictureListInput%21%29+%7B+picture+%7B+list%28input%3A+%24input%29+%7B+info+%7B+...FacetCollectionInfo+%7D+items+%7B+...PictureStandardWithoutAlbum+%7D+%7D+%7D+%7D+fragment+FacetCollectionInfo+on+FacetCollectionInfo+%7B+page+has_next_page+has_previous_page+total_items+total_pages+items_per_page+url_complete+%7D+fragment+PictureStandardWithoutAlbum+on+Picture+%7B+__typename+id+title+created+like_status+number_of_comments+number_of_favorites+status+width+height+resolution+aspect_ratio+url_to_original+url_to_video+is_animated+position+tags+%7B+category+text+url+%7D+permissions+url+thumbnails+%7B+width+height+size+url+%7D+%7D+&variables=%7B%22input%22%3A%7B%22filters%22%3A%5B%7B%22name%22%3A%22album_id%22%2C%22value%22%3A%22{}%22%7D%5D%2C%22display%22%3A%22position%22%2C%22page%22%3A{}%7D%7D'.format(id, p) 118 | data_raw = downloader.read_html(url_api, referer=url) 119 | data = json.loads(data_raw) 120 | has_next_page = data['data']['picture']['list']['info']['has_next_page'] 121 | imgs = [] 122 | for item in data['data']['picture']['list']['items']: 123 | img = Image(item, url) 124 | imgs.append(img) 125 | 126 | return imgs 127 | 128 | 129 | def get_video(url, soup): 130 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'] 131 | 132 | title = re.find('videos/([^/]+)', url) 133 | video = soup.find('video') 134 | url = video.source.attrs['src'] 135 | video = Video(url, title, url_thumb) 136 | return video 137 | 138 | 139 | def get_title(soup): 140 | return soup.find('h1').text.strip() 141 | -------------------------------------------------------------------------------- /src/extractor/xvideo_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | from utils import Downloader, Soup, LazyUrl, urljoin, format_filename, Session, get_ext, get_print, get_max_range, html_unescape 3 | from io import BytesIO 4 | from constants import try_n 5 | import ree as re 6 | from m3u8_tools import playlist2stream 7 | from translator import tr_ 8 | import json 9 | from timee import sleep 10 | from ratelimit import limits, sleep_and_retry 11 | 
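# A small sketch of how get_imgs_p() in luscious_downloader.py above derives the
# numeric album id that is interpolated into the AlbumListOwnPictures GraphQL
# query, using the standard re module instead of the project's "ree" wrapper;
# the album URL is a made-up example.
import re

def album_id(url: str) -> str:
    # album slugs end in the numeric id, e.g. /albums/<slug>_<id>/
    m = re.search(r'/albums/[^/]+?([0-9]+)/', url + '/')
    if m is None:
        raise ValueError('not an album url: {}'.format(url))
    return m.group(1)

print(album_id('https://www.luscious.net/albums/example-title_123456'))  # -> 123456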
CHANNEL_PATTERN = r'/(profiles|[^/]*channels)/([0-9a-zA-Z_]+)' 12 | 13 | 14 | def get_id(url): 15 | url = url.lower() 16 | if '/prof-video-click/' in url: 17 | return url.split('/prof-video-click/')[1].split('/')[2] 18 | return re.find(r'xvideos[0-9]*\.[^/]+/video([0-9]+)', url, err='no id') 19 | 20 | 21 | class Video(object): 22 | _url = None 23 | 24 | def __init__(self, url_page): 25 | url_page = Downloader_xvideo.fix_url(url_page) 26 | self.url = LazyUrl(url_page, self.get, self) 27 | 28 | def get(self, url_page): 29 | if not self._url: 30 | self._get(url_page) 31 | return self._url 32 | 33 | @try_n(4) 34 | @sleep_and_retry 35 | @limits(1, 2) 36 | def _get(self, url_page): 37 | id = get_id(url_page) 38 | html = downloader.read_html(url_page) 39 | soup = Soup(html) 40 | self.title = html_unescape(soup.find('title').text).replace('- XVIDEOS.COM', '').strip() 41 | url = re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html) or re.find(r'''.setVideoUrlLow\(['"](.+?)['"]\)''', html) #https://www.xvideos.com/video65390539/party_night 42 | if not url: 43 | raise Exception('no video url') 44 | ext = get_ext(url) 45 | if ext.lower() == '.m3u8': 46 | url = playlist2stream(url, n_thread=5) 47 | self.url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'] 48 | self.filename = format_filename(self.title, id, '.mp4') 49 | self._url= url 50 | 51 | @property 52 | def thumb(self): 53 | self.url() 54 | f = BytesIO() 55 | downloader.download(self.url_thumb, buffer=f) 56 | return f 57 | 58 | 59 | 60 | @Downloader.register 61 | class Downloader_xvideo(Downloader): 62 | type = 'xvideo' 63 | URLS = [r'regex:[./]xvideos[0-9]*\.(com|in|es)'] 64 | single = True 65 | display_name = 'XVideos' 66 | 67 | def init(self): 68 | if 'xvideos.' in self.url.lower(): 69 | self.url = self.url.replace('http://', 'https://') 70 | else: 71 | self.url = 'https://www.xvideos.com/{}'.format(self.url) 72 | 73 | @classmethod 74 | def fix_url(cls, url): 75 | url = re.sub(r'[^/]*xvideos[0-9]*\.[^/]+', 'www.xvideos.com', url).replace('http://', 'https://') 76 | url = url.replace('/THUMBNUM/', '/') 77 | return url 78 | 79 | @classmethod 80 | def key_id(cls, url): 81 | res = re.find(CHANNEL_PATTERN, url) 82 | if res: 83 | return '_'.join(res) 84 | return url 85 | 86 | def read(self): 87 | res = re.find(CHANNEL_PATTERN, self.url) 88 | if res: 89 | header, username = res 90 | info = read_channel(self.url, self.cw) 91 | videos = [Video(url) for url in info['urls']] 92 | video = self.process_playlist('[Channel] {}'.format(info['name']), videos) 93 | else: 94 | video = Video(self.url) 95 | video.url() 96 | self.title = video.title 97 | self.urls.append(video.url) 98 | 99 | self.setIcon(video.thumb) 100 | 101 | 102 | def read_channel(url_page, cw=None): 103 | print_ = get_print(cw) 104 | res = re.find(CHANNEL_PATTERN, url_page) 105 | if res is None: 106 | raise Exception('Not channel') 107 | header, username = res 108 | print(header, username) 109 | max_pid = get_max_range(cw) 110 | info = {} 111 | info['header'] = header 112 | info['username'] = username 113 | session = Session() 114 | urls = [] 115 | ids = set() 116 | for p in range(100): 117 | url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p)) 118 | print_(url_api) 119 | r = session.post(url_api) 120 | data = json.loads(r.text) 121 | 122 | videos = data['videos'] 123 | if not videos: 124 | print_('empty') 125 | break 126 | 127 | for video in videos: 128 | id_ = video['id'] 129 | if id_ in ids: 130 | print_('duplicate: {}'.format(id_)) 131 | 
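# A quick, self-contained check of the two URL patterns used in
# xvideo_downloader.py above -- get_id() for single-video pages and
# CHANNEL_PATTERN for profile/channel pages -- written against the standard re
# module. The video URL mirrors the one quoted in a comment in the code above;
# the profile URL is a made-up example.
import re

VIDEO_ID = r'xvideos[0-9]*\.[^/]+/video([0-9]+)'
CHANNEL = r'/(profiles|[^/]*channels)/([0-9a-zA-Z_]+)'

m = re.search(VIDEO_ID, 'https://www.xvideos.com/video65390539/party_night'.lower())
print(m.group(1))                 # -> 65390539

m = re.search(CHANNEL, 'https://www.xvideos.com/profiles/some_user')
print(m.group(1), m.group(2))     # -> profiles some_user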
continue 132 | ids.add(id_) 133 | info['name'] = video['pn'] 134 | urls.append(urljoin(url_page, video['u'])) 135 | 136 | if len(urls) >= max_pid: 137 | break 138 | 139 | n = data['nb_videos'] 140 | 141 | s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls)) 142 | if cw: 143 | cw.setTitle(s) 144 | else: 145 | print(s) 146 | if len(ids) >= n: 147 | break 148 | sleep(1, cw) 149 | if not urls: 150 | raise Exception('no videos') 151 | info['urls'] = urls[:max_pid] 152 | return info 153 | -------------------------------------------------------------------------------- /src/extractor/gelbooru_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import downloader 3 | import ree as re 4 | import os 5 | from utils import Downloader, urljoin, query_url, Soup, get_max_range, get_print, LazyUrl, get_ext, clean_title, Session 6 | from translator import tr_ 7 | try: 8 | from urllib import quote # python2 9 | except: 10 | from urllib.parse import quote # python3 11 | import sys 12 | from timee import sleep 13 | from constants import clean_url 14 | 15 | 16 | def get_tags(url): 17 | url = clean_url(url) 18 | qs = query_url(url) 19 | if 'page=favorites' in url: 20 | id = qs.get('id', ['N/A'])[0] 21 | id = u'fav_{}'.format(id) 22 | else: 23 | tags = qs.get('tags', []) 24 | tags.sort() 25 | id = u' '.join(tags) 26 | if not id: 27 | id = u'N/A' 28 | return id 29 | 30 | 31 | @Downloader.register 32 | class Downloader_gelbooru(Downloader): 33 | type = 'gelbooru' 34 | URLS = ['gelbooru.com'] 35 | MAX_CORE = 8 36 | _name = None 37 | 38 | @classmethod 39 | def fix_url(cls, url): 40 | if 'gelbooru.com' in url.lower(): 41 | url = url.replace('http://', 'https://') 42 | else: 43 | url = url.replace(' ', '+') 44 | while '++' in url: 45 | url = url.replace('++', '+') 46 | url = quote(url) 47 | url = url.replace('%2B', '+') 48 | url = 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(url) 49 | return url 50 | 51 | @property 52 | def name(self): 53 | if self._name is None: 54 | tags = get_tags(self.url) 55 | self._name = tags 56 | return clean_title(self._name) 57 | 58 | def read(self): 59 | self.title = self.name 60 | 61 | imgs = get_imgs(self.url, self.name, cw=self.cw) 62 | 63 | for img in imgs: 64 | self.urls.append(img.url) 65 | 66 | self.title = self.name 67 | 68 | 69 | @LazyUrl.register 70 | class LazyUrl_gelbooru(LazyUrl): 71 | type = 'gelbooru' 72 | def dump(self): 73 | return { 74 | 'id': self.image.id_, 75 | 'url': self.image._url, 76 | } 77 | @classmethod 78 | def load(cls, data): 79 | img = Image(data['id'], data['url']) 80 | return img.url 81 | 82 | 83 | class Image(object): 84 | def __init__(self, id_, url): 85 | self.id_ = id_ 86 | self._url = url 87 | self.url = LazyUrl_gelbooru(url, self.get, self) 88 | 89 | def get(self, url): 90 | html = downloader.read_html(url) 91 | soup = Soup(html) 92 | for li in soup.findAll('li'): 93 | if li.text.strip() == 'Original image': 94 | break 95 | else: 96 | raise Exception('no Original image') 97 | url = li.find('a')['href'] 98 | ext = get_ext(url) 99 | self.filename = u'{}{}'.format(self.id_, ext) 100 | return url 101 | 102 | 103 | def setPage(url, page): 104 | # Always use HTTPS 105 | url = url.replace('http://', 'https://') 106 | 107 | # Change the page 108 | if 'pid=' in url: 109 | url = re.sub('pid=[0-9]*', 'pid={}'.format(page), url) 110 | else: 111 | url += '&pid={}'.format(page) 112 | 113 | if page == 0: 114 | url = url.replace('&pid=0', '') 115 | 116 | return url 117 | 
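# A self-contained sketch of the tag normalisation done by fix_url() in
# gelbooru_downloader.py above: a bare tag query is collapsed onto '+'
# separators, percent-encoded, and wrapped into a post-list URL. The tag string
# is an arbitrary example.
from urllib.parse import quote

def tags_to_list_url(tags: str) -> str:
    tags = tags.replace(' ', '+')            # spaces become '+' separators
    while '++' in tags:                      # collapse runs of separators
        tags = tags.replace('++', '+')
    tags = quote(tags).replace('%2B', '+')   # percent-encode, but keep '+' literal
    return 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(tags)

print(tags_to_list_url('blue_sky  1girl'))
# -> https://gelbooru.com/index.php?page=post&s=list&tags=blue_sky+1girl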
118 | 119 | def get_imgs(url, title=None, cw=None): 120 | print_ = get_print(cw) 121 | url = clean_url(url) 122 | if 's=view' in url and 'page=favorites' not in url: 123 | raise NotImplementedError('Not Implemented') 124 | 125 | tags = get_tags(url) 126 | tags = quote(tags, safe='/') 127 | tags = tags.replace('%20', '+') 128 | url = 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(tags) 129 | 130 | # 2566 131 | user_id = Session().cookies.get('user_id', domain='gelbooru.com') 132 | if user_id: 133 | cookies = None 134 | else: 135 | cookies = {'fringeBenefits': 'yup'} 136 | print_('user_id: {}'.format(user_id)) 137 | 138 | # Range 139 | max_pid = get_max_range(cw) 140 | 141 | imgs = [] 142 | ids = set() 143 | count_no_imgs = 0 144 | for p in range(500): #1017 145 | url = setPage(url, len(ids)) 146 | print_(url) 147 | html = downloader.read_html(url, cookies=cookies) 148 | 149 | soup = Soup(html) 150 | posts = soup.findAll(class_='thumbnail-preview') 151 | imgs_new = [] 152 | for post in posts: 153 | id_ = int(re.find('[0-9]+', post.find('a')['id'], err='no id')) 154 | if id_ in ids: 155 | print('duplicate:', id_) 156 | continue 157 | ids.add(id_) 158 | url_img = urljoin(url, post.find('a')['href']) 159 | img = Image(id_, url_img) 160 | imgs_new.append(img) 161 | if imgs_new: 162 | imgs += imgs_new 163 | count_no_imgs = 0 164 | else: 165 | print('no imgs') 166 | count_no_imgs += 1 167 | if count_no_imgs > 1: 168 | print('break') 169 | break 170 | 171 | if len(imgs) >= max_pid: 172 | break 173 | 174 | if cw is not None: 175 | if not cw.alive: 176 | break 177 | cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))) 178 | 179 | return imgs[:max_pid] 180 | -------------------------------------------------------------------------------- /src/extractor/bcy_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from __future__ import print_function 3 | import downloader 4 | from utils import Soup, cut_pair, LazyUrl, Downloader, get_print, get_max_range, try_n, clean_title, check_alive 5 | import json 6 | import ree as re 7 | import os 8 | from translator import tr_ 9 | 10 | 11 | @Downloader.register 12 | class Downloader_bcy(Downloader): 13 | type = 'bcy' 14 | URLS = ['bcy.net/item/detail/', 'bcy.net/u/'] 15 | MAX_CORE = 8 16 | display_name = '半次元' 17 | 18 | def init(self): 19 | self.html = downloader.read_html(self.url) 20 | self.info = get_info(self.url, self.html) 21 | 22 | @property 23 | def name(self): 24 | info = self.info 25 | if '/detail/' in self.url: 26 | title = u'{} (bcy_{}) - {}'.format(clean_title(info['artist']), info['uid'], info['id']) 27 | else: 28 | title = u'{} (bcy_{})'.format(clean_title(info['artist']), info['uid']) 29 | return title 30 | 31 | def read(self): 32 | imgs = get_imgs(self.url, self.html, cw=self.cw) 33 | 34 | for img in imgs: 35 | self.urls.append(img.url) 36 | 37 | self.title = self.name 38 | self.artist = self.info['artist'] 39 | 40 | 41 | def get_ssr_data(html): 42 | s = html.split('window.__ssr_data = JSON.parse("')[1].replace('\\"', '"') 43 | s = cut_pair(s).replace('"', '\\"') 44 | data = json.loads(json.loads('"{}"'.format(s))) 45 | return data 46 | 47 | 48 | @try_n(2) 49 | def get_imgs(url, html=None, cw=None): 50 | if '/detail/' not in url: 51 | return get_imgs_channel(url, html, cw) 52 | 53 | if html is None: 54 | html = downloader.read_html(url) 55 | 56 | data = get_ssr_data(html) 57 | 58 | multi = data['detail']['post_data']['multi'] 59 | 60 | imgs = [] 61 | 62 
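# A self-contained illustration of the two-step unescaping in get_ssr_data() in
# bcy_downloader.py above: the page embeds JSON.parse("...") with an escaped JSON
# string, so the code unescapes the quotes, cuts the balanced {...} block,
# re-escapes it, and runs json.loads twice (once to undo the string escaping,
# once to parse the JSON). balanced_prefix() below is a simplified stand-in for
# the project's cut_pair() helper and is an assumption about its behaviour, not
# its real implementation.
import json

def balanced_prefix(s: str) -> str:
    # return s up to and including the brace matching the first '{'
    depth = 0
    for i, ch in enumerate(s):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return s[:i + 1]
    raise ValueError('unbalanced input')

html = 'window.__ssr_data = JSON.parse("{\\"detail\\":{\\"post_data\\":{\\"multi\\":[]}}}");'
raw = html.split('window.__ssr_data = JSON.parse("')[1].replace('\\"', '"')
raw = balanced_prefix(raw).replace('"', '\\"')
data = json.loads(json.loads('"{}"'.format(raw)))
print(data['detail']['post_data'])        # -> {'multi': []}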
| for m in multi: 63 | path = m['original_path'] 64 | img = json.loads(u'"{}"'.format(path)) 65 | img = Image_single(img, url, len(imgs)) 66 | imgs.append(img) 67 | 68 | return imgs 69 | 70 | 71 | class Image_single(object): 72 | def __init__(self, url ,referer, p): 73 | self._url = url 74 | self.p = p 75 | self.url = LazyUrl(referer, self.get, self) 76 | 77 | def get(self, referer): 78 | ext = get_ext(self._url, referer) 79 | self.filename = '{:04}{}'.format(self.p, ext) 80 | return self._url 81 | 82 | 83 | class Image(object): 84 | def __init__(self, url, referer, id, p): 85 | self.id = id 86 | self.p = p 87 | self._url = url 88 | self.url = LazyUrl(referer, self.get, self) 89 | 90 | def get(self, referer): 91 | ext = get_ext(self._url, referer) 92 | self.filename = u'{}_p{}{}'.format(self.id, self.p, ext) 93 | return self._url 94 | 95 | 96 | def get_ext(url, referer=None): 97 | ext = os.path.splitext(url.split('?')[0].replace('~noop.image', ''))[1] 98 | if ext in ['.image', '']: 99 | ext = downloader.get_ext(url, referer=referer) 100 | return ext 101 | 102 | 103 | def get_info(url, html): 104 | soup = Soup(html) 105 | info = {} 106 | 107 | uname = soup.find('div', class_='user-name') or soup.find('p', class_='uname') or soup.find('div', class_='user-info-name') 108 | 109 | info['artist'] = uname.text.strip() 110 | 111 | j = get_ssr_data(html) 112 | 113 | if '/detail/' in url: 114 | info['uid'] = j['detail']['detail_user']['uid'] 115 | info['id'] = j['detail']['post_data']['item_id'] 116 | else: 117 | info['uid'] = j['homeInfo']['uid'] 118 | 119 | return info 120 | 121 | 122 | def get_imgs_channel(url, html=None, cw=None): 123 | print_ = get_print(cw) 124 | if html is None: 125 | html = downloader.read_html(url) 126 | info = get_info(url, html) 127 | 128 | # Range 129 | max_pid = get_max_range(cw) 130 | 131 | ids = set() 132 | imgs = [] 133 | for p in range(1000): 134 | url_api = 'https://bcy.net/apiv3/user/selfPosts?uid={}'.format(info['uid']) 135 | if imgs: 136 | url_api += '&since={}'.format(imgs[-1].id) 137 | data_raw = downloader.read_html(url_api, url) 138 | data = json.loads(data_raw)['data'] 139 | items = data['items'] 140 | if not items: 141 | print('no items') 142 | break 143 | c = 0 144 | for item in items: 145 | check_alive(cw) 146 | id = item['item_detail']['item_id'] 147 | if id in ids: 148 | print('duplicate') 149 | continue 150 | c += 1 151 | ids.add(id) 152 | url_single = u'https://bcy.net/item/detail/{}'.format(id) 153 | imgs_single = get_imgs(url_single, cw=cw) 154 | print_(str(id)) 155 | for p, img in enumerate(imgs_single): 156 | img = Image(img._url, url_single, id, p) 157 | imgs.append(img) 158 | s = u'{} {} - {}'.format(tr_(u'읽는 중...'), info['artist'], min(len(imgs), max_pid)) 159 | if cw: 160 | cw.setTitle(s) 161 | else: 162 | print(s) 163 | 164 | if len(imgs) >= max_pid: 165 | break 166 | if not c: 167 | print('not c') 168 | break 169 | if len(imgs) >= max_pid: 170 | print('over max_pid:', max_pid) 171 | break 172 | return imgs[:max_pid] 173 | 174 | -------------------------------------------------------------------------------- /src/extractor/danbooru_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | import downloader 3 | import ree as re 4 | from utils import Downloader, get_max_range, clean_title, get_print, try_n, urljoin, check_alive, LazyUrl, get_ext 5 | from translator import tr_ 6 | from urllib.parse import quote 7 | from urllib.parse import urlparse, parse_qs 8 | from ratelimit import 
limits, sleep_and_retry 9 | 10 | 11 | 12 | @Downloader.register 13 | class Downloader_danbooru(Downloader): 14 | type='danbooru' 15 | URLS = ['danbooru.donmai.us'] 16 | MAX_CORE = 8 17 | _name = None 18 | 19 | @classmethod 20 | def fix_url(cls, url): 21 | if 'donmai.us' in url: 22 | url = url.replace('http://', 'https://') 23 | else: 24 | url = url.replace(' ', '+') 25 | while '++' in url: 26 | url = url.replace('++', '+') 27 | url = 'https://danbooru.donmai.us/?tags={}'.format(quote(url)) 28 | return url.strip('+') 29 | 30 | @property 31 | def name(self): 32 | if self._name is None: 33 | parsed_url = urlparse(self.url) 34 | qs = parse_qs(parsed_url.query) 35 | if 'donmai.us/favorites' in self.url: 36 | id = qs.get('user_id', [''])[0] 37 | print('len(id) =', len(id), '"{}"'.format(id)) 38 | assert len(id) > 0, '[Fav] User id is not specified' 39 | id = 'fav_{}'.format(id) 40 | elif 'donmai.us/explore/posts/popular' in self.url: #4160 41 | soup = read_soup(self.url, self.cw) 42 | id = soup.find('h1').text 43 | else: 44 | tags = qs.get('tags', []) 45 | tags.sort() 46 | id = ' '.join(tags) 47 | if not id: 48 | id = 'N/A' 49 | self._name = id 50 | return clean_title(self._name) 51 | 52 | def read(self): 53 | self.title = self.name 54 | 55 | imgs = get_imgs(self.url, self.name, cw=self.cw) 56 | 57 | for img in imgs: 58 | self.urls.append(img.url) 59 | 60 | self.title = self.name 61 | 62 | 63 | class Image(object): 64 | def __init__(self, id, url, cw): 65 | self._cw = cw 66 | self.id = id 67 | self.url = LazyUrl(url, self.get, self) 68 | 69 | def get(self, url): 70 | soup = read_soup(url, self._cw) 71 | ori = soup.find('li', id='post-option-view-original') 72 | if ori: 73 | img = ori.find('a')['href'] 74 | else: 75 | img = soup.find('li', id='post-info-size').find('a')['href'] 76 | img = urljoin(url, img) 77 | ext = get_ext(img) 78 | self.filename = '{}{}'.format(self.id, ext) 79 | return img 80 | 81 | 82 | 83 | @sleep_and_retry 84 | @limits(2, 1) 85 | def wait(cw): 86 | check_alive(cw) 87 | 88 | 89 | def setPage(url, page): 90 | # Always use HTTPS 91 | url = url.replace('http://', 'https://') 92 | 93 | # Main page 94 | if re.findall(r'https://[\w]*[.]?donmai.us/?$', url): 95 | url = 'https://{}donmai.us/posts?page=1'.format('danbooru.' if 'danbooru.' in url else '') 96 | 97 | # Change the page 98 | if 'page=' in url: 99 | url = re.sub('page=[0-9]*', 'page={}'.format(page), url) 100 | else: 101 | url += '&page={}'.format(page) 102 | 103 | return url 104 | 105 | 106 | @try_n(4) #4103 107 | def read_soup(url, cw): 108 | check_alive(cw) 109 | wait(cw) 110 | return downloader.read_soup(url) 111 | 112 | 113 | def get_imgs(url, title=None, range_=None, cw=None): 114 | if 'donmai.us/artists' in url: 115 | raise NotImplementedError('Not Implemented') 116 | if 'donmai.us/posts/' in url: 117 | raise NotImplementedError('Not Implemented') 118 | 119 | print_ = get_print(cw) 120 | 121 | # Range 122 | max_pid = get_max_range(cw) 123 | 124 | if range_ is None: 125 | range_ = range(1, 1001) 126 | print(range_) 127 | imgs = [] 128 | i = 0 129 | empty_count = 0 130 | empty_count_global = 0 131 | url_imgs = set() 132 | while i < len(range_): 133 | check_alive(cw) 134 | p = range_[i] 135 | url = setPage(url, p) 136 | print_(url) 137 | soup = read_soup(url, cw) 138 | articles = soup.findAll('article') 139 | if articles: 140 | empty_count_global = 0 141 | else: 142 | empty_count += 1 143 | if empty_count < 4: 144 | s = 'empty page; retry... 
{}'.format(p) 145 | print_(s) 146 | continue 147 | else: 148 | empty_count = 0 149 | empty_count_global += 1 150 | 151 | if empty_count_global >= 6: 152 | break 153 | 154 | for article in articles: 155 | id = article.attrs['data-id'] 156 | 157 | #url_img = article.attrs['data-file-url'].strip() 158 | url_img = urljoin(url, article.find('a', class_='post-preview-link')['href']) #4160 159 | 160 | #print(url_img) 161 | if url_img not in url_imgs: 162 | url_imgs.add(url_img) 163 | img = Image(id, url_img, cw) 164 | imgs.append(img) 165 | 166 | if len(imgs) >= max_pid: 167 | break 168 | 169 | if cw is not None: 170 | cw.setTitle('{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs))) 171 | i += 1 172 | 173 | return imgs[:max_pid] 174 | -------------------------------------------------------------------------------- /src/extractor/soundcloud_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf8 2 | import downloader 3 | import json 4 | from io import BytesIO 5 | from utils import Downloader, LazyUrl, get_print, try_n, lock, clean_title 6 | from error_printer import print_error 7 | import os 8 | from timee import sleep 9 | import ffmpeg 10 | import ytdl 11 | from m3u8_tools import M3u8_stream 12 | CLIENT_ID = None 13 | 14 | 15 | @lock 16 | def get_cid(force=False): 17 | global CLIENT_ID 18 | if CLIENT_ID is None or force: 19 | print('update cid...') 20 | d = ytdl.YoutubeDL() 21 | e = ytdl.extractor.soundcloud.SoundcloudIE(d) 22 | e._update_client_id() 23 | CLIENT_ID = e._CLIENT_ID 24 | return CLIENT_ID 25 | 26 | 27 | class Audio(object): 28 | _url = None 29 | 30 | def __init__(self, info, album_art, cw=None): 31 | self.info = info 32 | self.album_art = album_art 33 | self.cw = cw 34 | self.url = LazyUrl(info['webpage_url'], self.get, self, pp=self.pp) 35 | 36 | def get(self, url): 37 | print_ = get_print(self.cw) 38 | if self._url: 39 | return self._url 40 | 41 | info = self.info 42 | 43 | ## ydl = ytdl.YoutubeDL() 44 | ## info = ydl.extract_info(url) 45 | 46 | formats = info['formats'] 47 | print(formats) 48 | formats = sorted(formats, key=lambda x: int(x.get('abr', 0)), reverse=True) 49 | url_audio = None 50 | 51 | for format in formats: 52 | protocol = format['protocol'] 53 | print_(u'【{}】 format【{}】 abr【{}】'.format(protocol, format['format'], format.get('abr', 0))) 54 | if not url_audio and protocol in ['http', 'https']: 55 | url_audio = format['url'] 56 | 57 | if not url_audio: 58 | url_audio = M3u8_stream(formats[0]['url']) 59 | self.album_art = False# 60 | 61 | self.username = info['uploader'] 62 | self.title = u'{} - {}'.format(self.username, info['title']) 63 | self.filename = u'{}{}'.format(clean_title(self.title, allow_dot=True, n=-4), '.mp3') 64 | 65 | thumb = None 66 | for t in info['thumbnails'][::-1]: 67 | width = t.get('width', 1080) 68 | if not 100 <= width <= 500: 69 | continue 70 | url_thumb = t['url'] 71 | thumb = BytesIO() 72 | try: 73 | downloader.download(url_thumb, buffer=thumb) 74 | break 75 | except Exception as e: 76 | print(e) 77 | thumb = None 78 | self.thumb = thumb 79 | 80 | self._url = url_audio 81 | return self._url 82 | 83 | def pp(self, filename): 84 | if self.thumb and self.album_art: 85 | self.thumb.seek(0)# 86 | ffmpeg.add_cover(filename, self.thumb, {'artist':self.username, 'title':self.info['title']}, cw=self.cw) 87 | 88 | 89 | @Downloader.register 90 | class Downloader_soundcloud(Downloader): 91 | type = 'soundcloud' 92 | single = True 93 | URLS = ['soundcloud.com'] 94 | #lock = True 95 | audio = 
None 96 | display_name = 'SoundCloud' 97 | 98 | def init(self): 99 | if 'soundcloud.com' in self.url.lower(): 100 | self.url = self.url.replace('http://', 'https://') 101 | else: 102 | self.url = 'https://soundcloud.com/{}'.format(self.url) 103 | 104 | def read(self): 105 | album_art = self.ui_setting.albumArt.isChecked() 106 | info = get_audios(self.url, self.cw, album_art) 107 | audios = info['audios'] 108 | 109 | if not audios: 110 | raise Exception('no audios') 111 | 112 | # first audio must be valid 113 | while audios: 114 | audio = audios[0] 115 | try: 116 | audio.url() 117 | break 118 | except Exception as e: 119 | e_ = e 120 | print(e) 121 | audios.remove(audio) 122 | else: 123 | raise e_ 124 | 125 | if len(audios) > 1: 126 | audio = self.process_playlist(info['title'], audios) 127 | else: 128 | self.urls.append(audio.url) 129 | self.title = audio.title 130 | 131 | self.artist = audio.username 132 | self.setIcon(audio.thumb) 133 | 134 | 135 | @try_n(2) 136 | def get_audios(url, cw, album_art): 137 | print_ = get_print(cw) 138 | url = url.rstrip('/') 139 | if url.count('/') == 3: 140 | url += '/tracks' 141 | 142 | info = { 143 | #'extract_flat': True, 144 | } 145 | 146 | ydl = ytdl.YoutubeDL(cw=cw) 147 | info = ydl.extract_info(url) 148 | if 'entries' in info: 149 | entries = info['entries'] 150 | title = info['title'] 151 | for _type in ['All', 'Tracks', 'Albums', 'Sets', 'Reposts', 'Likes', 'Spotlight']: 152 | x = '({})'.format(_type) 153 | if x in title: 154 | title = title.replace(x, '') 155 | kind = _type 156 | break 157 | else: 158 | kind = 'Playlist' 159 | print_(u'kind: {}'.format(kind)) 160 | info['title'] = u'[{}] {}'.format(kind.capitalize(), title) 161 | else: 162 | entries = [info] 163 | 164 | audios = [] 165 | for e in entries: 166 | if '/sets/' in e['webpage_url']: 167 | continue 168 | audio = Audio(e, album_art, cw=cw) 169 | audios.append(audio) 170 | 171 | info['audios'] = audios 172 | 173 | return info 174 | 175 | 176 | -------------------------------------------------------------------------------- /src/extractor/tiktok_downloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function, unicode_literals 2 | import downloader 3 | import ree as re 4 | from utils import Soup, LazyUrl, Downloader, try_n, compatstr, get_print, clean_title, Session, get_max_range, format_filename 5 | from io import BytesIO 6 | import clf2 7 | from translator import tr_ 8 | from timee import sleep 9 | from error_printer import print_error 10 | import ytdl 11 | PATTERN_VID = '/(v|video)/(?P[0-9]+)' 12 | SHOW = True 13 | 14 | 15 | def is_captcha(soup): 16 | return soup.find('div', class_="verify-wrap") is not None 17 | 18 | 19 | @Downloader.register 20 | class Downloader_tiktok(Downloader): 21 | type = 'tiktok' 22 | single = True 23 | URLS = ['tiktok.com'] 24 | display_name = 'TikTok' 25 | 26 | def init(self): 27 | cw = self.cw 28 | self.session = Session() 29 | res = clf2.solve(self.url, self.session, cw) 30 | self.url = self.fix_url(res['url']) #4324 31 | soup = Soup(res['html']) 32 | if is_captcha(soup): 33 | def f(html): 34 | return not is_captcha(Soup(html)) 35 | clf2.solve(self.url, self.session, cw, show=True, f=f) 36 | 37 | @classmethod 38 | def fix_url(cls, url): 39 | url = url.split('?')[0].split('#')[0].strip('/') 40 | if 'tiktok.com' not in url.lower(): 41 | url = 'https://www.tiktok.com/@{}'.format(url) 42 | return url 43 | 44 | def read(self): 45 | format = 
compatstr(self.ui_setting.youtubeFormat.currentText()).lower().strip() 46 | 47 | if re.search(PATTERN_VID, self.url) is None: 48 | info = read_channel(self.url, self.session, self.cw) 49 | items = info['items'] 50 | videos = [Video('https://www.tiktok.com/@{}/video/{}'.format(info['uid'], item['id']), self.session, format) for item in items] 51 | title = '{} (tiktok_{})'.format(info['nickname'], info['uid']) 52 | video = self.process_playlist(title, videos) 53 | else: 54 | video = Video(self.url, self.session, format) 55 | video.url() 56 | self.urls.append(video.url) 57 | self.title = clean_title(video.title) 58 | 59 | 60 | class Video(object): 61 | _url = None 62 | 63 | def __init__(self, url, session, format='title (id)'): 64 | self.url = LazyUrl(url, self.get, self) 65 | self.session = session 66 | self.format = format 67 | 68 | @try_n(2) 69 | def get(self, url): 70 | if self._url: 71 | return self._url 72 | m = re.search(PATTERN_VID, url) 73 | id = m.group('id') 74 | ext = '.mp4' 75 | self.title = id# 76 | self.filename = format_filename(self.title, id, ext) 77 | 78 | ydl = ytdl.YoutubeDL() 79 | info = ydl.extract_info(url) 80 | 81 | self._url = info['url'] 82 | 83 | return self._url 84 | 85 | 86 | def read_channel(url, session, cw=None): 87 | print_ = get_print(cw) 88 | 89 | info = {} 90 | info['items'] = [] 91 | 92 | ids = set() 93 | info['items'] = [] 94 | sd = { 95 | 'count_empty': 0, 96 | 'shown': SHOW, 97 | } 98 | 99 | max_pid = get_max_range(cw) 100 | 101 | def f(html, browser=None): 102 | soup = Soup(html) 103 | if is_captcha(soup): 104 | print('captcha') 105 | browser.show() 106 | sd['shown'] = True 107 | elif sd['shown'] and not SHOW: 108 | browser.hide() 109 | sd['shown'] = False 110 | try: 111 | st = soup.find('h2', class_='share-title') 112 | if st is None: 113 | st = soup.find('h2', class_=lambda c: c and 'ShareTitle' in c) 114 | info['uid'] = st.text.strip() 115 | st = soup.find('h1', class_='share-sub-title') 116 | if st is None: 117 | st = soup.find('h1', class_=lambda c: c and 'ShareSubTitle' in c) 118 | info['nickname'] = st.text.strip() 119 | except Exception as e: 120 | print_(print_error(e)[0]) 121 | c = 0 122 | ids_now = set() 123 | items = soup.findAll('div', class_='video-feed-item') + soup.findAll('div', class_=lambda c: c and 'DivItemContainer' in c) 124 | for div in items: 125 | a = div.find('a') 126 | if a is None: 127 | continue 128 | href = a['href'] 129 | if not href: 130 | continue 131 | m = re.search(PATTERN_VID, href) 132 | if m is None: 133 | continue 134 | id_video = int(m.group('id')) 135 | ids_now.add(id_video) 136 | if id_video in ids: 137 | continue 138 | ids.add(id_video) 139 | info['items'].append({'id': id_video}) 140 | c += 1 141 | 142 | print_('items: {}'.format(len(info['items']))) 143 | if len(info['items']) >= max_pid: 144 | info['items'] = info['items'][:max_pid] 145 | return True 146 | 147 | browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);') 148 | sleep(15, cw) 149 | 150 | if c or (ids_now and min(ids_now) > min(ids)): 151 | sd['count_empty'] = 0 152 | else: 153 | print_('empty') 154 | sd['count_empty'] += 1 155 | msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items'])) 156 | if cw: 157 | if not cw.alive: 158 | raise Exception('cw dead') 159 | cw.setTitle(msg) 160 | else: 161 | print(msg) 162 | return sd['count_empty'] > 4 163 | res = clf2.solve(url, session, cw, f=f, timeout=1800, show=SHOW, delay=0) 164 | 165 | if not info['items']: 166 | raise 
Exception('no items') 167 | 168 | return info 169 | 170 | -------------------------------------------------------------------------------- /src/extractor/naver_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf-8 2 | import downloader 3 | import ree as re 4 | from utils import urljoin, Downloader, Soup, LazyUrl, clean_title, get_ext 5 | import json 6 | from timee import sleep 7 | import collections 8 | import errors 9 | PATTERNS = ['.*blog.naver.com/(?P.+)/(?P[0-9]+)', 10 | '.*blog.naver.com/.+?blogId=(?P[^&]+).+?logNo=(?P[0-9]+)', 11 | '.*?(?P[0-9a-zA-Z_-]+)\.blog\.me/(?P[0-9]+)'] 12 | HDR = { 13 | 'Accept': 'text/html, application/xhtml+xml, image/jxr, */*', 14 | 'Accept-Encoding': 'gzip, deflate', 15 | 'Accept-Language': 'ko, en-US; q=0.7, en; q=0.3', 16 | 'Connection': 'Keep-Alive', 17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393', 18 | } 19 | 20 | def get_id(url): 21 | for pattern in PATTERNS: 22 | m = re.match(pattern, url) 23 | if m is None: 24 | continue 25 | username = m.group('username') 26 | pid = m.group('pid') 27 | break 28 | else: 29 | username, pid = None, None 30 | return username, pid 31 | 32 | 33 | @Downloader.register 34 | class Downloader_naver(Downloader): 35 | type = 'naver' 36 | URLS = ['blog.naver.', '.blog.me'] 37 | display_name = 'Naver Blog' 38 | 39 | def init(self): 40 | username, pid = get_id(self.url) 41 | if username is None: 42 | raise errors.Invalid('Invalid format: {}'.format(self.url)) 43 | self.url = 'https://blog.naver.com/{}/{}'.format(username, pid) 44 | self.headers = {'User-Agent': downloader.hdr['User-Agent']} 45 | 46 | @property 47 | def name(self): 48 | username, pid = get_id(self.url) 49 | return clean_title(u'{}/{}'.format(username, pid)) 50 | 51 | def read(self): 52 | self.title = u'읽는 중... 
{}'.format(self.name) 53 | 54 | imgs = get_imgs(self.url) 55 | 56 | filenames = {} 57 | for img in imgs: 58 | self.urls.append(img.url) 59 | 60 | self.title = self.name 61 | 62 | 63 | class Image(object): 64 | def __init__(self, url, referer, p): 65 | self.url = LazyUrl(referer, lambda _: url, self) 66 | #3788, #3817 67 | ext = get_ext(url) 68 | self.filename = '{:04}{}'.format(p, ext) 69 | 70 | 71 | class Video(object): 72 | def __init__(self, url, referer, p): 73 | self.url = LazyUrl(referer, lambda _: url, self) 74 | self.filename = 'video_{}.mp4'.format(p) 75 | 76 | 77 | def read_page(url, depth=0): 78 | print('read_page', url, depth) 79 | if depth > 10: 80 | raise Exception('Too deep') 81 | html = downloader.read_html(url, header=HDR) 82 | 83 | if len(html) < 5000: 84 | id = re.find('logNo=([0-9]+)', html, err='no id') 85 | username = re.find('blog.naver.com/([0-9a-zA-Z]+)', url) or re.find('blogId=([0-9a-zA-Z]+)', url, err='no username') 86 | url = 'https://m.blog.naver.com/PostView.nhn?blogId={}&logNo={}&proxyReferer='.format(username, id) 87 | print('###', username, id, url) 88 | 89 | soup = Soup(html) 90 | if soup.find('div', {'id': 'viewTypeSelector'}): 91 | return url, soup 92 | frame = soup.find('frame') 93 | if frame is None: 94 | print('frame is None') 95 | return read_page(url, depth+1) 96 | return read_page(urljoin('https://blog.naver.com', frame.attrs['src']), depth+1) 97 | 98 | 99 | 100 | def get_imgs(url): 101 | url = url.replace('blog.naver', 'm.blog.naver') 102 | referer = url 103 | url_frame, soup = read_page(url) 104 | 105 | imgs = [] 106 | urls = set() 107 | view = soup.find('div', {'id': 'viewTypeSelector'}) 108 | print('view', view is not None) 109 | 110 | imgs_ = view.findAll('span', class_='_img') + view.findAll('img') 111 | 112 | for img in imgs_: 113 | url = img.attrs.get('src', None) 114 | if url is None: 115 | url = img.attrs.get('thumburl', None) 116 | if url is None: 117 | print(u'invalid img: {}'.format(url)) 118 | continue 119 | 120 | if 'ssl.pstatic.net' in url: # 121 | continue 122 | 123 | if 'blogpfthumb-phinf.pstatic.net' in url: # profile 124 | continue 125 | 126 | if 'dthumb-phinf.pstatic.net' in url: # link 127 | continue 128 | 129 | if 'storep-phinf.pstatic.net' in url: # emoticon 130 | continue 131 | 132 | url = url.replace('mblogthumb-phinf', 'blogfiles') 133 | #url = re.sub('\?type=[a-zA-Z0-9]*', '?type=w1@2x', url) 134 | #url = re.sub('\?type=[a-zA-Z0-9]*', '', url) 135 | url = url.split('?')[0] 136 | 137 | if url in urls: 138 | print('### Duplicate:', url) 139 | continue 140 | 141 | urls.add(url) 142 | #url = url.split('?type=')[0] 143 | img = Image(url, referer, len(imgs)) 144 | imgs.append(img) 145 | 146 | pairs = [] 147 | 148 | for video in soup.findAll('span', class_='_naverVideo'): 149 | vid = video.attrs['vid'] 150 | key = video.attrs['key'] 151 | pairs.append((vid, key)) 152 | 153 | for script in soup.findAll('script', class_='__se_module_data'): 154 | data_raw = script['data-module'] 155 | data = json.loads(data_raw)['data'] 156 | vid = data.get('vid') 157 | if not vid: 158 | continue 159 | key = data['inkey'] 160 | pairs.append((vid, key)) 161 | 162 | videos = [] 163 | for vid, key in pairs: 164 | url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key) 165 | data_raw = downloader.read_html(url_api) 166 | data = json.loads(data_raw) 167 | fs = data['videos']['list'] 168 | fs = sorted(fs, key=lambda f: f['size'], reverse=True) 169 | video = Video(fs[0]['source'], url_frame, len(videos)) 170 | 
videos.append(video) 171 | 172 | return imgs + videos 173 | 174 | -------------------------------------------------------------------------------- /src/extractor/twitch_downloader.py: -------------------------------------------------------------------------------- 1 | #coding: utf8 2 | import downloader 3 | import ytdl 4 | from utils import Downloader, get_outdir, Soup, LazyUrl, try_n, compatstr, format_filename, get_ext, clean_title, Session, get_print, get_resolution, get_max_range 5 | from io import BytesIO 6 | from m3u8_tools import M3u8_stream 7 | import ree as re 8 | from translator import tr_ 9 | import errors 10 | import utils 11 | 12 | 13 | @Downloader.register 14 | class Downloader_twitch(Downloader): 15 | type = 'twitch' 16 | URLS = ['twitch.tv'] 17 | single = True 18 | 19 | def init(self): 20 | url = self.url 21 | if 'twitch.tv' in url: 22 | if not url.startswith('http://') and not url.startswith('https://'): 23 | url = 'https://' + url 24 | self.url = url 25 | else: 26 | url = 'https://www.twitch.tv/videos/{}'.format(url) 27 | self.url = url 28 | 29 | @classmethod 30 | def fix_url(cls, url): 31 | if re.search(r'/(videos|clips)\?filter=', url): 32 | return url.strip('/') 33 | return url.split('?')[0].strip('/') 34 | 35 | def read(self): 36 | if '/directory/' in self.url.lower(): 37 | raise errors.Invalid('[twitch] Directory is unsupported: {}'.format(self.url)) 38 | 39 | if self.url.count('/') == 3: 40 | if 'www.twitch.tv' in self.url or '//twitch.tv' in self.url: 41 | filter = 'live' 42 | else: 43 | filter = None 44 | elif self.url.count('/') == 4: 45 | filter = re.find(r'filter=([0-9a-zA-Z_]+)', self.url) or re.find(r'[0-9a-zA-Z_]+', self.url.split('/')[-1]) 46 | if filter is not None and filter.isdigit(): 47 | filter = None 48 | else: 49 | filter = None 50 | 51 | if filter is None: 52 | video = Video(self.url, self.cw) 53 | video.url() 54 | self.urls.append(video.url) 55 | self.title = video.title 56 | elif filter == 'live': 57 | video = Video(self.url, self.cw, live=True) 58 | video.url() 59 | self.urls.append(video.url) 60 | self.title = video.title 61 | elif filter == 'clips': 62 | info = get_videos(self.url, cw=self.cw) 63 | video = self.process_playlist('[Clip] {}'.format(info['name']), info['videos']) 64 | else: 65 | raise NotImplementedError(filter) 66 | 67 | self.setIcon(video.thumb) 68 | 69 | 70 | @try_n(2) 71 | def get_videos(url, cw=None): 72 | print_ = get_print(cw) 73 | print_(f'get_videos: {url}') 74 | info = {} 75 | options = { 76 | 'extract_flat': True, 77 | 'playlistend': get_max_range(cw), 78 | } 79 | videos = [] 80 | ydl = ytdl.YoutubeDL(options, cw=cw) 81 | info = ydl.extract_info(url) 82 | for e in info['entries']: 83 | video = Video(e['url'], cw) 84 | video.id = int(e['id']) 85 | videos.append(video) 86 | if 'name' not in info: 87 | info['name'] = ydl.extract_info(e['url'])['creator'] 88 | if not videos: 89 | raise Exception('no videos') 90 | info['videos'] = sorted(videos, key=lambda video: video.id, reverse=True) 91 | return info 92 | 93 | 94 | def alter(seg): 95 | segs = [] 96 | if '-muted' in seg.url: 97 | seg_ = seg.copy() 98 | seg_.url = seg.url.replace('-muted', '') 99 | segs.append(seg_) 100 | segs.append(seg) 101 | return segs 102 | 103 | 104 | def extract_info(url, cw=None): 105 | print_ = get_print(cw) 106 | ydl = ytdl.YoutubeDL(cw=cw) 107 | try: 108 | info = ydl.extract_info(url) 109 | except Exception as e: 110 | ex = type(ytdl.get_extractor(url))(ydl) 111 | _download_info = getattr(ex, '_download_info', None) 112 | if _download_info is not 
None: 113 | vod_id = ex._match_id(url) 114 | info = _download_info(vod_id) 115 | print_(info) 116 | if 'HTTPError 403' in str(e): 117 | raise errors.LoginRequired() 118 | raise 119 | return info 120 | 121 | 122 | class Video(object): 123 | _url = None 124 | 125 | def __init__(self, url, cw, live=False): 126 | self.url = LazyUrl(url, self.get, self) 127 | self.cw = cw 128 | self._live = live 129 | 130 | @try_n(4) 131 | def get(self, url): 132 | print_ = get_print(self.cw) 133 | if self._url: 134 | return self._url 135 | info = extract_info(url, self.cw) 136 | 137 | def print_video(video): 138 | #print_(video)# 139 | print_('{}[{}] [{}] [{}] {}'.format('LIVE ', video['format_id'], video.get('height'), video.get('tbr'), video['url'])) 140 | 141 | videos = [video for video in info['formats'] if video.get('height')] 142 | 143 | videos = sorted(videos, key=lambda video:(video.get('height', 0), video.get('tbr', 0)), reverse=True) 144 | 145 | for video in videos: 146 | print_video(video) 147 | 148 | for video in videos: 149 | if video.get('height', 0) <= get_resolution(): #3723 150 | video_best = video 151 | break 152 | else: 153 | video_best = videos[-1] 154 | print_video(video) 155 | 156 | video = video_best['url'] 157 | 158 | ext = get_ext(video) 159 | self.title = info['title'] 160 | id = info['display_id'] 161 | 162 | if self._live: 163 | video = utils.LiveStream(video, headers=video_best.get('http_headers')) 164 | ext = '.mp4' 165 | else: 166 | if ext.lower() == '.m3u8': 167 | video = M3u8_stream(video, n_thread=4, alter=alter) 168 | ext = '.mp4' 169 | self.filename = format_filename(self.title, id, ext) 170 | self.url_thumb = info['thumbnail'] 171 | self.thumb = BytesIO() 172 | downloader.download(self.url_thumb, buffer=self.thumb) 173 | self._url = video 174 | return self._url 175 | -------------------------------------------------------------------------------- /src/extractor/mrm_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Soup, urljoin, LazyUrl, Downloader, query_url, try_n, Session, get_print, clean_title, get_ext 4 | import os 5 | from translator import tr_ 6 | from timee import sleep 7 | import requests 8 | import ree as re 9 | import clf2# 10 | 11 | 12 | class Image(object): 13 | def __init__(self, url, p, page, cw): 14 | self.cw = cw 15 | ext = get_ext(url) 16 | self.filename = '{:04}{}'.format(p, ext) 17 | if page.title is not None: 18 | self.filename = '{}/{}'.format(page.title, self.filename) 19 | self._url = url 20 | self.url = LazyUrl(page.url, self.get, self) 21 | 22 | def get(self, _): 23 | return self._url#'tmp://' + clf2.download(self._url, cw=self.cw) 24 | 25 | 26 | class Page(object): 27 | def __init__(self, title, url, soup=None): 28 | self.title = clean_title(title) 29 | self.url = url 30 | self.soup = soup 31 | 32 | 33 | 34 | 35 | @Downloader.register 36 | class Downloader_mrm(Downloader): 37 | type = 'mrm' 38 | URLS = ['myreadingmanga.info'] 39 | _soup = None 40 | MAX_CORE = 4 41 | display_name = 'MyReadingManga' 42 | 43 | def init(self): 44 | self.session = get_session(self.url, self.cw) 45 | 46 | @classmethod 47 | def fix_url(cls, url): 48 | return re.find('https?://myreadingmanga.info/[^/]+', url, err='err') 49 | 50 | @property 51 | def soup(self): 52 | if self._soup is None: 53 | for try_ in range(8): 54 | try: 55 | html = read_html(self.url, session=self.session, cw=self.cw) 56 | break 57 | except Exception as e: 58 | e_ = e 59 | self.print_(e) 60 | 
else: 61 | raise e_ 62 | self._soup = Soup(html) 63 | return self._soup 64 | 65 | @property 66 | def name(self): 67 | title = get_title(self.soup) 68 | return title 69 | 70 | def read(self): 71 | self.title = '읽는 중... {}'.format(self.name) 72 | 73 | imgs = get_imgs(self.url, self.soup, self.session, self.cw) 74 | 75 | for img in imgs: 76 | self.urls.append(img.url) 77 | 78 | self.title = self.name 79 | 80 | 81 | def get_title(soup): 82 | title = soup.find('h1', class_='entry-title').text.strip() 83 | title = fix_title(title) 84 | title = clean_title(title) 85 | return title 86 | 87 | 88 | def get_imgs(url, soup=None, session=None, cw=None): 89 | if soup is None: 90 | html = read_html(url, session=session, cw=cw) 91 | soup = Soup(html) 92 | 93 | title = get_title(soup) 94 | 95 | pagination = soup.find('div', class_='pagination') 96 | 97 | if pagination is None: 98 | page = Page(None, url, soup) 99 | imgs = get_imgs_page(page, session=session, cw=cw) 100 | else: 101 | pages = get_pages(url, soup, session=session) 102 | imgs = [] 103 | for i, page in enumerate(pages): 104 | s = '{} {} / {} ({} / {})'.format(tr_('읽는 중...'), title, page.title, i+1, len(pages)) 105 | 106 | if cw: 107 | if not cw.alive: 108 | return 109 | cw.setTitle(s) 110 | else: 111 | print(s) 112 | 113 | imgs += get_imgs_page(page, session=session, cw=cw) 114 | 115 | if not imgs: 116 | raise Exception('no imgs') 117 | 118 | return imgs 119 | 120 | 121 | def get_pages(url, soup=None, session=None): 122 | if soup is None: 123 | html = read_html(url, session=session, cw=None) 124 | soup = Soup(html) 125 | pagination = soup.find('div', class_='pagination') 126 | 127 | pages = [] 128 | hrefs = set() 129 | for a in pagination.findAll('a'): 130 | href = a.attrs.get('href', '') 131 | href = urljoin(url, href) 132 | if not href.startswith(url): 133 | print('not match', href) 134 | continue 135 | while href.endswith('/'): 136 | href = href[:-1] 137 | if href in hrefs: 138 | print('duplicate', href) 139 | continue 140 | hrefs.add(href) 141 | text = a.text.strip() 142 | page = Page(text, href) 143 | pages.append(page) 144 | 145 | if url not in hrefs: 146 | page = Page('1', url, soup) 147 | pages.insert(0, page) 148 | 149 | return pages 150 | 151 | 152 | @try_n(4) 153 | def get_imgs_page(page, session=None, cw=None): 154 | url = page.url 155 | soup = page.soup 156 | if soup is None: 157 | html = read_html(url, session=session, cw=None) 158 | soup = Soup(html) 159 | page.soup = soup 160 | 161 | view = soup.find('div', class_='entry-content') 162 | 163 | imgs = [] 164 | for img in view.findAll('img'): 165 | img = img.attrs.get('data-lazy-src') or img.attrs.get('data-src') 166 | if img is None: 167 | continue 168 | img = urljoin(url, img) 169 | img = Image(img, len(imgs), page, cw) 170 | imgs.append(img) 171 | print(page.title, len(imgs), page.url) 172 | 173 | return imgs 174 | 175 | 176 | def fix_title(title): 177 | title = re.sub(r'\(?[^()]*?c\.[^() ]+\)?', '', title) 178 | while ' ' in title: 179 | title = title.replace(' ', ' ') 180 | return title 181 | 182 | 183 | def read_html(url, session, cw): 184 | ## html = downloader.read_html(url, session=session) 185 | ## soup = Soup(html) 186 | ## 187 | ## cf = soup.find('div', class_='cf-browser-verification') 188 | ## if cf is None: 189 | ## return html 190 | 191 | r = clf2.solve(url, cw=cw, session=session) 192 | 193 | return r['html'] 194 | 195 | 196 | @try_n(4) 197 | def get_session(url, cw=None): 198 | print_ = get_print(cw) 199 | ## html = downloader.read_html(url) 200 | ## soup = 
Soup(html) 201 | ## 202 | ## cf = soup.find('div', class_='cf-browser-verification') 203 | ## if cf is None: 204 | ## print_('no cf protection') 205 | ## return None 206 | 207 | print_('cf protection') 208 | r = clf2.solve(url, cw=cw) 209 | session = r['session'] 210 | 211 | return session 212 | 213 | -------------------------------------------------------------------------------- /src/extractor/kakaopage_downloader.py: -------------------------------------------------------------------------------- 1 | import downloader 2 | import ree as re 3 | from utils import Session, LazyUrl, Soup, Downloader, try_n, get_print, clean_title, print_error, urljoin, get_imgs_already 4 | from time import sleep 5 | from translator import tr_ 6 | import page_selector 7 | import json 8 | import clf2 9 | 10 | 11 | class Page(object): 12 | 13 | def __init__(self, id_, title): 14 | self.id_ = id_ 15 | self.title = title 16 | self.url = 'https://page.kakao.com/viewer?productId={}'.format(id_) 17 | 18 | 19 | class Image(object): 20 | 21 | def __init__(self, url, page, p): 22 | self.url = LazyUrl('https://page.kakao.com/', lambda _: url, self) 23 | ext = '.jpg' 24 | self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, ext) 25 | 26 | 27 | @Downloader.register 28 | class Downloader_kakaopage(Downloader): 29 | type = 'kakaopage' 30 | URLS = ['page.kakao.com/home'] 31 | MAX_CORE = 4 32 | MAX_SPEED = 4.0 33 | display_name = 'KakaoPage' 34 | 35 | def init(self): 36 | self.session = Session() 37 | 38 | @classmethod 39 | def fix_url(cls, url): 40 | id = re.find('/home/.+?/([0-9]+)', url) 41 | if id is not None: 42 | url = id 43 | if url.isdecimal(): 44 | url = 'https://page.kakao.com/home?seriesId={}'.format(url) 45 | return url 46 | 47 | def read(self): 48 | info = get_info(self.url, self.session, cw=self.cw) 49 | 50 | for img in info['imgs']: 51 | if isinstance(img, Image): 52 | img = img.url 53 | self.urls.append(img) 54 | 55 | self.artist = info['artist'] 56 | 57 | self.title = info['title'] 58 | 59 | 60 | 61 | def get_id(url): 62 | id_ = re.find('seriesId=([0-9]+)', url, err='No seriesId') 63 | return id_ 64 | 65 | 66 | 67 | def get_pages(url, session): 68 | id_ = get_id(url) 69 | 70 | pages = [] 71 | ids = set() 72 | for p in range(500): #2966 73 | url_api = 'https://api2-page.kakao.com/api/v5/store/singles' 74 | data = { 75 | 'seriesid': id_, 76 | 'page': str(p), 77 | 'direction': 'asc', 78 | 'page_size': '20', 79 | 'without_hidden': 'true', 80 | } 81 | r = session.post(url_api, data=data, headers={'Referer': url}) 82 | print(p, r) 83 | data = r.json() 84 | 85 | singles = data['singles'] 86 | if not singles: 87 | print('no singles') 88 | break 89 | 90 | for single in singles: 91 | title_page = single['title'] 92 | id_page = single['id'] 93 | if id_page in ids: 94 | print('dup id') 95 | continue 96 | ids.add(id_page) 97 | page = Page(id_page, title_page) 98 | pages.append(page) 99 | sleep(.5) 100 | return pages 101 | 102 | 103 | def read_html(url, session): 104 | res = clf2.solve(url, session=session) 105 | return res['html'] 106 | 107 | 108 | @try_n(2) 109 | def get_imgs_page(page, session): 110 | html = read_html(page.url, session=session) 111 | did = re.find('"did" *: *"(.+?)"', html, err='no did') 112 | url_api = 'https://api2-page.kakao.com/api/v1/inven/get_download_data/web' 113 | data = { 114 | 'productId': page.id_, 115 | 'device_mgr_uid': 'Windows - Chrome', 116 | 'device_model': 'Windows - Chrome', 117 | 'deviceId': did, 118 | } 119 | print(data) 120 | r = session.post(url_api, data=data, 
headers={'Referer': page.url}) 121 | data = r.json() 122 | if data['result_code']: 123 | raise Exception(data['message']) 124 | imgs = [] 125 | for file in data['downloadData']['members']['files']: 126 | url = file['secureUrl'] 127 | url = urljoin('https://page-edge-jz.kakao.com/sdownload/resource/', url) 128 | img = Image(url, page, len(imgs)) 129 | imgs.append(img) 130 | return imgs 131 | 132 | 133 | def get_info(url, session, cw=None): 134 | print_ = get_print(cw) 135 | pages = get_pages(url, session) 136 | pages = page_selector.filter(pages, cw) 137 | if not pages: 138 | raise Exception('no pages') 139 | 140 | info = {} 141 | 142 | html = read_html(url, session=session) 143 | soup = Soup(html) 144 | 145 | __NEXT_DATA__ = soup.find('script', id='__NEXT_DATA__') 146 | if __NEXT_DATA__: 147 | data = json.loads(__NEXT_DATA__.string) 148 | tid = data['props']['initialState']['common']['constant']['tid'] 149 | print_('tid: {}'.format(tid)) 150 | session.cookies['_kptid'] = tid 151 | html = read_html(url, session=session) 152 | soup = Soup(html) 153 | 154 | title = soup.find('h2').text.strip() 155 | artist = soup.find('meta', {'name': 'author'})['content'] 156 | for x in [' ,', ', ']: 157 | while x in artist: 158 | artist = artist.replace(x, ',') 159 | artist = artist.replace(',', ', ') 160 | info['artist'] = artist 161 | info['title_raw'] = title 162 | info['title'] = clean_title('[{}] {}'.format(artist, title)) 163 | 164 | imgs = [] 165 | 166 | for i, page in enumerate(pages): 167 | if cw is not None: 168 | if not cw.alive: 169 | return 170 | cw.setTitle('{} {} / {} ({} / {})'.format(tr_('읽는 중...'), info['title'], page.title, i + 1, len(pages))) 171 | 172 | #3463 173 | imgs_already = get_imgs_already('kakaopage', info['title'], page, cw) 174 | if imgs_already: 175 | imgs += imgs_already 176 | continue 177 | 178 | try: 179 | _imgs = get_imgs_page(page, session) 180 | e_msg = None 181 | except Exception as e: 182 | _imgs = [] 183 | e_msg = print_error(e)[0] 184 | print_('{} {}'.format(page.title, len(_imgs))) 185 | if e_msg: 186 | print_(e_msg) 187 | 188 | imgs += _imgs 189 | sleep(.2) 190 | 191 | if not imgs: 192 | raise Exception('no imgs') 193 | 194 | info['imgs'] = imgs 195 | 196 | return info 197 | 198 | 199 | @page_selector.register('kakaopage') 200 | @try_n(4) 201 | def f(url): 202 | if 'seriesId=' not in url: 203 | raise Exception(tr_('목록 주소를 입력해주세요')) 204 | pages = get_pages(url, Session()) 205 | return pages 206 | -------------------------------------------------------------------------------- /src/extractor/lhscan_downloader.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | import downloader 3 | from utils import Soup, urljoin, LazyUrl, Downloader, try_n, Session, clean_title, get_print 4 | import os 5 | from translator import tr_ 6 | import page_selector 7 | import clf2 8 | import utils 9 | import base64 10 | import ree as re 11 | import errors 12 | ##from image_reader import QPixmap 13 | 14 | 15 | class Image(object): 16 | def __init__(self, url, page, p): 17 | self._url = url 18 | self.url = LazyUrl(page.url, self.get, self)#, pp=self.pp) 19 | ext = os.path.splitext(url)[1] 20 | if ext.lower()[1:] not in ['jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp']: 21 | ext = '.jpg' 22 | self.filename = u'{}/{:04}{}'.format(page.title, p, ext) 23 | 24 | def get(self, _): 25 | return self._url 26 | 27 | ## def pp(self, filename): 28 | ## pixmap = QPixmap(filename) 29 | ## pixmap.save(filename) 30 | ## return filename 31 | 32 | 33 | 
class Page(object): 34 | def __init__(self, title, url): 35 | self.title = clean_title(title) 36 | self.url = url 37 | 38 | 39 | def get_soup_session(url, cw=None): 40 | print_ = get_print(cw) 41 | session = Session() 42 | res = clf2.solve(url, session=session, cw=cw) 43 | print_('{} -> {}'.format(url, res['url'])) 44 | if res['url'].rstrip('/') == 'https://welovemanga.one': 45 | raise errors.LoginRequired() 46 | return Soup(res['html']), session 47 | 48 | 49 | @Downloader.register 50 | class Downloader_lhscan(Downloader): 51 | type = 'lhscan' 52 | URLS = [ 53 | #'lhscan.net', 'loveheaven.net', 54 | 'lovehug.net', 'welovemanga.', 55 | ] 56 | MAX_CORE = 16 57 | display_name = 'LHScan' 58 | _soup = None 59 | 60 | def init(self): 61 | self._soup, self.session = get_soup_session(self.url, self.cw) 62 | if not self.soup.find('ul', class_='manga-info'): 63 | raise errors.Invalid(u'{}: {}'.format(tr_(u'목록 주소를 입력해주세요'), self.url)) 64 | 65 | @classmethod 66 | def fix_url(cls, url): 67 | url = url.replace('lovehug.net', 'welovemanga.one') 68 | url = url.replace('welovemanga.net', 'welovemanga.one') #4298 69 | return url 70 | 71 | @property 72 | def soup(self): 73 | if self._soup is None: 74 | for try_ in range(8): 75 | try: 76 | html = downloader.read_html(self.url, session=self.session) 77 | break 78 | except Exception as e: 79 | e_ = e 80 | print(e) 81 | else: 82 | raise e_ 83 | self._soup = Soup(html) 84 | return self._soup 85 | 86 | @property 87 | def name(self): 88 | title = self.soup.find('ul', class_='manga-info').find('h3').text 89 | return clean_title(title) 90 | 91 | def read(self): 92 | self.title = tr_(u'읽는 중... {}').format(self.name) 93 | 94 | imgs = get_imgs(self.url, self.name, self.session, self.soup, self.cw) 95 | 96 | for img in imgs: 97 | self.urls.append(img.url) 98 | 99 | self.title = self.name 100 | 101 | 102 | @try_n(8) 103 | def get_imgs_page(page, referer, session, cw=None): 104 | print_ = get_print(cw) 105 | print_(page.title) 106 | 107 | html = downloader.read_html(page.url, referer, session=session) 108 | if clf2._is_captcha(Soup(html)): #4124 109 | html = clf2.solve(page.url, session, cw)['html'] 110 | if not html: 111 | raise Exception('empty html') 112 | html = html.replace('{}='.format(re.find(r"\$\(this\)\.attr\('(.+?)'", html, err='no cn')), 'data-src=') 113 | soup = Soup(html) 114 | 115 | view = soup.find('div', class_='chapter-content') 116 | 117 | if not view: 118 | raise Exception('no chapter-content') 119 | 120 | imgs = [] 121 | for img in soup.findAll('img', class_='chapter-img'): 122 | src = img.get('data-pagespeed-lazy-src') or img.get('data-src') or img.get('data-srcset') or img.get('data-aload') or img['src'] 123 | try: 124 | src = base64.b64decode(src).strip().decode('utf8') 125 | except: 126 | pass 127 | src0 = src 128 | src = src.replace('welovemanga.one', '1')# 129 | src = urljoin(page.url, src).strip() 130 | if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src: 131 | continue 132 | if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src: 133 | continue 134 | if 'LoveHug_600cfd96e98ff.jpg' in src: 135 | continue 136 | if 'image_5f0ecf23aed2e.png' in src: 137 | continue 138 | if '/uploads/lazy_loading.gif' in src: 139 | continue 140 | if not imgs: 141 | print_(src0) 142 | img = Image(src, page, len(imgs)) 143 | imgs.append(img) 144 | 145 | return imgs 146 | 147 | 148 | def get_pages(url, session, soup=None, cw=None): 149 | if soup is None: 150 | html = downloader.read_html(url, session=session) 151 | soup = Soup(html) 152 | 153 | tab = 
soup.find('ul', class_='list-chapters') 154 | 155 | pages = [] 156 | for li in tab.findAll('li'): 157 | text = li.find('div', class_='chapter-name').text.strip() 158 | href = li.parent['href'] 159 | href = urljoin(url, href) 160 | page = Page(text, href) 161 | pages.append(page) 162 | 163 | if not pages: 164 | raise Exception('no pages') 165 | 166 | return pages[::-1] 167 | 168 | 169 | @page_selector.register('lhscan') 170 | @try_n(4) 171 | def f(url): 172 | soup, session = get_soup_session(url) 173 | pages = get_pages(url, session, soup=soup) 174 | return pages 175 | 176 | 177 | @try_n(2) 178 | def get_imgs(url, title, session, soup=None, cw=None): 179 | if soup is None: 180 | html = downloader.read_html(url, session=session) 181 | soup = Soup(html) 182 | 183 | pages = get_pages(url, session, soup, cw) 184 | pages = page_selector.filter(pages, cw) 185 | 186 | imgs = [] 187 | for i, page in enumerate(pages): 188 | imgs += get_imgs_page(page, url, session, cw) 189 | s = u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages)) 190 | if cw is not None: 191 | if not cw.alive: 192 | return 193 | cw.setTitle(s) 194 | else: 195 | print(s) 196 | 197 | return imgs 198 | --------------------------------------------------------------------------------
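
Note on the shared structure: each extractor module in this tree follows the same contract. It subclasses Downloader, registers itself with @Downloader.register, declares type and URLS for URL matching, and implements read() so that it appends LazyUrl objects to self.urls and sets self.title. The sketch below illustrates that pattern; it assumes the project-local helpers (Downloader, LazyUrl, Soup, clean_title, urljoin, get_ext and downloader.read_html) behave as they are used in the modules above, and the site name, CSS class names and Downloader_example are hypothetical placeholders rather than a real extractor.

#coding:utf8
# Minimal sketch of the extractor pattern used by the modules above.
# Assumptions: the project-local modules downloader/utils behave as they are
# used elsewhere in this tree; 'example.com', the CSS class names and
# Downloader_example are hypothetical placeholders.
import downloader
from utils import Downloader, LazyUrl, Soup, clean_title, urljoin, get_ext


class Image(object):
    def __init__(self, url, referer, p):
        self._url = url
        # LazyUrl keeps the page URL as referer and resolves the file URL lazily
        self.url = LazyUrl(referer, self.get, self)
        self.filename = '{:04}{}'.format(p, get_ext(url))

    def get(self, _):
        return self._url


@Downloader.register
class Downloader_example(Downloader):
    type = 'example'          # key used for settings and cache lookups
    URLS = ['example.com']    # substrings (or 'regex:' patterns) matched against the input URL
    display_name = 'Example'

    def read(self):
        # read() must fill self.urls with LazyUrl objects and set self.title
        html = downloader.read_html(self.url)
        soup = Soup(html)
        for a in soup.findAll('a', class_='post'):   # hypothetical markup
            href = a.attrs.get('href', '')
            if not href:
                continue
            img = Image(urljoin(self.url, href), self.url, len(self.urls))
            self.urls.append(img.url)
        self.title = clean_title(soup.find('h1').text.strip())

Deferring resolution through LazyUrl is the same design the heavier extractors above rely on: soundcloud, tiktok and twitch only call ytdl.extract_info inside their get() callbacks, so the expensive extraction happens when the download worker actually requests the file rather than when the item is queued.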