├── FUNDING.yml
├── imgs
│   ├── card_crop.png
│   └── how_to_download.gif
├── push_^q^.bat
├── .github
│   └── stale.yml
├── src
│   └── extractor
│       ├── _4chan_downloader.py
│       ├── pawoo_downloader.py
│       ├── baraag_downloader.py
│       ├── file_downloader.py
│       ├── youporn_downloader.py
│       ├── kakaotv_downloader.py
│       ├── youku_downloader.py
│       ├── m3u8_downloader.py
│       ├── vimeo_downloader.py
│       ├── talk_op_gg_downloader.py
│       ├── navertv_downloader.py
│       ├── asiansister_downloader.py
│       ├── xnxx_downloader.py
│       ├── coub_downloader.py
│       ├── kissjav_downloader.py
│       ├── avgle_downloader.py
│       ├── vlive_downloader.py
│       ├── yandere_downloader.py
│       ├── hentaicosplay_downloader.py
│       ├── asmhentai_downloader.py
│       ├── fc2_downloader.py
│       ├── v2ph_downloader.py
│       ├── afreeca_downloader.py
│       ├── wikiart_downloader.py
│       ├── navercafe_downloader.py
│       ├── tokyomotion_downloader.py
│       ├── nhentai_com_downloader.py
│       ├── pandoratv_downloader.py
│       ├── novelpia_downloader.py
│       ├── nozomi_downloader.py
│       ├── flickr_downloader.py
│       ├── rule34_xxx_downloader.py
│       ├── likee_downloader.py
│       ├── nhentai_downloader.py
│       ├── nico_downloader.py
│       ├── hanime_downloader.py
│       ├── kakuyomu_downloader.py
│       ├── webtoon_downloader.py
│       ├── comicwalker_downloader.py
│       ├── hameln_downloader.py
│       ├── imgur_downloader.py
│       ├── discord_emoji_downloader.py
│       ├── bdsmlr_downloader.py
│       ├── nijie_downloader.py
│       ├── hf_downloader.py
│       ├── luscious_downloader.py
│       ├── xvideo_downloader.py
│       ├── gelbooru_downloader.py
│       ├── bcy_downloader.py
│       ├── danbooru_downloader.py
│       ├── soundcloud_downloader.py
│       ├── tiktok_downloader.py
│       ├── naver_downloader.py
│       ├── twitch_downloader.py
│       ├── mrm_downloader.py
│       ├── kakaopage_downloader.py
│       └── lhscan_downloader.py
├── .gitignore
├── translation
│   └── tr_ko.hdl
└── README.md
/FUNDING.yml:
--------------------------------------------------------------------------------
1 | patreon: KurtBestor
--------------------------------------------------------------------------------
/imgs/card_crop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coko8023/Hitomi-Downloader/master/imgs/card_crop.png
--------------------------------------------------------------------------------
/imgs/how_to_download.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/coko8023/Hitomi-Downloader/master/imgs/how_to_download.gif
--------------------------------------------------------------------------------
/push_^q^.bat:
--------------------------------------------------------------------------------
1 | @echo off
2 |
3 | git add .
4 | git commit -m "^q^"
5 | git push
6 |
7 | echo Done!
8 | pause>nul
--------------------------------------------------------------------------------
/.github/stale.yml:
--------------------------------------------------------------------------------
1 | # Configuration for probot-stale - https://github.com/probot/stale
2 |
3 | # Number of days of inactivity before an Issue or Pull Request becomes stale
4 | daysUntilStale: 90
5 | # Number of days of inactivity before a stale Issue or Pull Request is closed
6 | daysUntilClose: 30
7 | # Issues or Pull Requests with these labels will never be considered stale
8 | exemptLabels:
9 | - help wanted
10 | - notice
11 | # Label to use when marking as stale
12 | staleLabel: stale
13 | # Comment to post when marking as stale. Set to `false` to disable
14 | markComment: >
15 | This issue has been automatically marked as stale because it has not had
16 | recent activity. It will be closed after 30 days if no further activity
17 | occurs, but feel free to re-open a closed issue if needed.
--------------------------------------------------------------------------------
/src/extractor/_4chan_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Downloader, LazyUrl, clean_title, urljoin, get_ext
3 | from ratelimit import limits, sleep_and_retry
4 |
5 |
6 | class Image:
7 | def __init__(self, url, ref, n):
8 | self._url = url
9 | self.url = LazyUrl(ref, self.get, self)
10 | self.filename = '{:04}{}'.format(n, get_ext(url))
11 |
12 | @sleep_and_retry
13 | @limits(2, 1)
14 | def get(self, _):
15 | return self._url
16 |
17 |
18 |
19 | @Downloader.register
20 | class Downloader_4chan(Downloader):
21 | type = '4chan'
22 | URLS = [r'regex:boards.(4chan|4channel).org']
23 | MAX_CORE = 4
24 | display_name = '4chan'
25 |
26 | @classmethod
27 | def fix_url(cls, url):
28 | return url.split('#')[0]
29 |
30 | def read(self):
31 | soup = downloader.read_soup(self.url)
32 | for div in soup.findAll('div', class_='fileText'):
33 | href = urljoin(self.url, div.a['href'])
34 | img = Image(href, self.url, len(self.urls))
35 | self.urls.append(img.url)
36 |
37 | board = self.url.split('/')[3]
38 | title = soup.find('span', class_='subject').text
39 | id_ = int(self.url.split('/thread/')[1].split('/')[0])
40 | self.title = clean_title(f'[{board}] {title} ({id_})')
41 |
--------------------------------------------------------------------------------
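
A note on the throttling used in Image.get above: the @sleep_and_retry / @limits(2, 1) pair from the ratelimit package caps lazy URL resolution at two calls per second. A minimal, self-contained sketch of that behavior, assuming the ratelimit package is installed (fetch and the timing loop are illustrative, not part of the repo):

from ratelimit import limits, sleep_and_retry
import time

@sleep_and_retry              # on RateLimitException, sleep out the window and retry
@limits(calls=2, period=1)    # allow at most 2 calls per 1-second window
def fetch(n):
    return n                  # stand-in for the real HTTP request

start = time.time()
for i in range(6):
    fetch(i)
print('elapsed: {:.1f}s'.format(time.time() - start))  # roughly 2 s for 6 calls at 2/second
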
/src/extractor/pawoo_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Downloader, lazy, clean_title
4 | import ree as re
5 | from translator import tr_
6 | from mastodon import get_imgs
7 | import json
8 |
9 |
10 | @Downloader.register
11 | class Downloader_pawoo(Downloader):
12 | type = 'pawoo'
13 | URLS = ['pawoo.net']
14 |
15 | def init(self):
16 | self.url = 'https://pawoo.net/{}'.format(self.id_)
17 | self.referer = self.url
18 |
19 | @property
20 | def id_(self):
21 | return re.find('pawoo.net/([^/]+)', self.url.lower(), default=self.url)
22 |
23 | @lazy
24 | def soup(self):
25 | return downloader.read_soup(self.url)
26 |
27 | @property
28 | def name(self):
29 | name_raw = re.find(r'''['"]name['"] *: *['"](.+?)['"]''', str(self.soup), err='no name')
30 | name = json.loads('"{}"'.format(name_raw))
31 | title = '{} (pawoo_{})'.format(name, self.id_)
32 | return clean_title(title)
33 |
34 | def read(self):
35 | self.title = tr_('읽는 중... {}').format(self.name)
36 |
37 | imgs = get_imgs('pawoo.net', self.id_, self.name, cw=self.cw)
38 |
39 | for img in imgs:
40 | self.urls.append(img.url)
41 | self.filenames[img.url] = img.filename
42 |
43 | self.title = self.name
44 |
45 |
46 |
--------------------------------------------------------------------------------
/src/extractor/baraag_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Soup, Downloader, lazy, clean_title
4 | import ree as re
5 | from translator import tr_
6 | from mastodon import get_imgs
7 |
8 |
9 |
10 | def get_id(url):
11 | return re.find('baraag.net/([^/]+)', url.lower())
12 |
13 |
14 | @Downloader.register
15 | class Downloader_baraag(Downloader):
16 | type = 'baraag'
17 | URLS = ['baraag.net']
18 | display_name = 'baraag.net'
19 |
20 | def init(self):
21 | self.referer = self.url
22 |
23 | @classmethod
24 | def fix_url(cls, url):
25 | id_ = get_id(url) or url
26 | return 'https://baraag.net/{}'.format(id_)
27 |
28 | @lazy
29 | def id(self):
30 | return get_id(self.url)
31 |
32 | @lazy
33 | def soup(self):
34 | return Soup(downloader.read_html(self.url))
35 |
36 | @property
37 | def name(self):
38 | title = self.soup.find('h1').text.strip().split('\n')[0].strip()
39 | title = u'{} (baraag_{})'.format(title, self.id)
40 | return clean_title(title)
41 |
42 | def read(self):
43 | self.title = tr_(u'읽는 중... {}').format(self.name)
44 |
45 | imgs = get_imgs('baraag.net', self.id, self.name, cw=self.cw)
46 |
47 | for img in imgs:
48 | self.urls.append(img.url)
49 | self.filenames[img.url] = img.filename
50 |
51 | self.title = self.name
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/src/extractor/file_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader, json, os
2 | from constants import try_n
3 | from utils import Downloader, query_url, clean_title, get_ext
4 | from timee import sleep
5 | from hashlib import md5
6 |
7 |
8 | @Downloader.register
9 | class Downloader_file(Downloader):
10 | type = 'file'
11 | single = True
12 | URLS = []
13 |
14 | @classmethod
15 | def fix_url(cls, url):
16 | if '://' not in url:
17 | url = 'https://' + url.lstrip('/')
18 | return 'file_' + url
19 |
20 | def read(self):
21 | qs = query_url(self.url)
22 | for key in qs:
23 | if key.lower() in ('file', 'filename'):
24 | name = qs[key][(-1)]
25 | break
26 | else:
27 | name = self.url
28 | for esc in ['?', '#']:
29 | name = name.split(esc)[0]
30 | name = os.path.basename(name.strip('/'))
31 |
32 | try:
33 | ext = downloader.get_ext(self.url)
34 | except:
35 | ext = ''
36 | if not ext:
37 | ext = get_ext(name)
38 |
39 | name = os.path.splitext(name)[0]
40 |
41 | self.urls.append(self.url)
42 |
43 | id_ = md5(self.url.encode('utf8')).hexdigest()[:8]
44 | tail = ' ({}){}'.format(id_, ext)
45 | filename = clean_title(name, n=-len(tail)) + tail
46 |
47 | self.filenames[self.url] = filename
48 |
49 | self.title = filename
50 |
--------------------------------------------------------------------------------
/src/extractor/youporn_downloader.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, print_function, unicode_literals
2 | import downloader
3 | import ree as re
4 | from io import BytesIO
5 | import os
6 | from constants import try_n
7 | from utils import Downloader, LazyUrl, get_ext, format_filename, clean_title
8 | import ytdl
9 |
10 |
11 |
12 | @Downloader.register
13 | class Downloader_youporn(Downloader):
14 | type = 'youporn'
15 | single = True
16 | URLS = ['youporn.com']
17 | display_name = 'YouPorn'
18 |
19 | @classmethod
20 | def fix_url(cls, url):
21 | if 'youporn.com' not in url.lower():
22 | url = 'https://www.youporn.com/watch/{}'.format(url)
23 | return url
24 |
25 | def read(self):
26 | video = Video(self.url, cw=self.cw)
27 |
28 | self.urls.append(video.url)
29 | self.setIcon(video.thumb)
30 |
31 | self.enableSegment()
32 |
33 | self.title = video.title
34 |
35 |
36 | class Video(object):
37 | @try_n(4)
38 | def __init__(self, url, cw=None):
39 | ydl = ytdl.YoutubeDL(cw=cw)
40 | info = ydl.extract_info(url)
41 |
42 | f = info['formats'][-1]
43 | url_video = f['url']
44 | self.url = LazyUrl(url, lambda _: url_video, self)
45 |
46 | self.url_thumb = info['thumbnails'][0]['url']
47 | self.thumb = BytesIO()
48 | downloader.download(self.url_thumb, buffer=self.thumb)
49 | self.title = info['title']
50 | ext = get_ext(url_video)
51 | self.filename = format_filename(self.title, info['id'], ext)
52 |
--------------------------------------------------------------------------------
/src/extractor/kakaotv_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | import ytdl
3 | from utils import Downloader, try_n, LazyUrl, get_ext, format_filename
4 | from io import BytesIO as IO
5 | from m3u8_tools import M3u8_stream
6 |
7 |
8 | @Downloader.register
9 | class Downloader_vlive(Downloader):
10 | type = 'kakaotv'
11 | URLS = ['tv.kakao']
12 | single = True
13 | display_name = 'KakaoTV'
14 |
15 | @classmethod
16 | def fix_url(cls, url):
17 | return url.split('?')[0].strip('/')
18 |
19 | def read(self):
20 | video = Video(self.url, cw=self.cw)
21 |         video.url()  # resolve LazyUrl now to fetch title/thumb
22 |
23 | self.urls.append(video.url)
24 | self.setIcon(video.thumb)
25 |
26 | self.enableSegment()
27 |
28 | self.title = video.title
29 |
30 |
31 |
32 | class Video(object):
33 | _url = None
34 |
35 | def __init__(self, url, cw=None):
36 | self.url = LazyUrl(url, self.get, self)
37 | self.cw = cw
38 |
39 | @try_n(2)
40 | def get(self, url):
41 | if self._url:
42 | return self._url
43 |
44 | ydl = ytdl.YoutubeDL(cw=self.cw)
45 | info = ydl.extract_info(url)
46 | fs = [f for f in info['formats'] if f['ext'] == 'mp4']
47 | f = sorted(fs, key=lambda f: f['height'])[-1]
48 | self._url = f['url']
49 |
50 | self.thumb_url = info['thumbnails'][0]['url']
51 | self.thumb = IO()
52 | downloader.download(self.thumb_url, buffer=self.thumb)
53 | self.title = info['title']
54 | ext = get_ext(self._url)
55 | self.filename = format_filename(self.title, info['id'], ext)
56 | return self._url
57 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # SageMath parsed files
80 | *.sage.py
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | .venv
87 | venv/
88 | ENV/
89 |
90 | # Spyder project settings
91 | .spyderproject
92 | .spyproject
93 |
94 | # Rope project settings
95 | .ropeproject
96 |
97 | # mkdocs documentation
98 | /site
99 |
100 | # mypy
101 | .mypy_cache/
102 |
103 | # etc
104 | #*.bat
105 | call cmd.bat
106 |
--------------------------------------------------------------------------------
/src/extractor/youku_downloader.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, print_function, unicode_literals
2 | import downloader
3 | import ytdl
4 | from m3u8_tools import M3u8_stream
5 | from utils import LazyUrl, get_ext, Downloader, format_filename, clean_title
6 | from io import BytesIO
7 |
8 |
9 | @Downloader.register
10 | class Downloader_youku(Downloader):
11 | type = 'youku'
12 | single = True
13 | URLS = ['v.youku.com']
14 |
15 | def read(self):
16 | video = Video(self.url, cw=self.cw)
17 | video.url()# get thumb
18 |
19 | self.urls.append(video.url)
20 | self.setIcon(video.thumb)
21 |
22 | self.title = video.title
23 |
24 |
25 | class Video(object):
26 | _url = None
27 |
28 | def __init__(self, url, cw=None):
29 | self.url = LazyUrl(url, self.get, self)
30 | self.cw = cw
31 |
32 | def get(self, url):
33 | if self._url:
34 | return self._url
35 |
36 | ydl = ytdl.YoutubeDL(cw=self.cw)
37 | info = ydl.extract_info(url)
38 |
39 | # get best video
40 | fs = info['formats']
41 | fs = sorted(fs, key=lambda x: int(x['width']), reverse=True)
42 | f = fs[0]
43 | url_video = f['url']
44 |
45 | # thumb
46 | self.thumb_url = info['thumbnails'][0]['url']
47 | self.thumb = BytesIO()
48 | downloader.download(self.thumb_url, buffer=self.thumb)
49 |
50 | # m3u8
51 | print(f['protocol'])
52 | if 'm3u8' in f['protocol']:
53 | url_video = M3u8_stream(url_video, referer=url)
54 |
55 | # title & filename
56 | self.title = info['title']
57 | self.filename = format_filename(self.title, info['id'], '.mp4')
58 |
59 | self._url = url_video
60 |
61 | return self._url
62 |
63 |
--------------------------------------------------------------------------------
/src/extractor/m3u8_downloader.py:
--------------------------------------------------------------------------------
1 | from utils import Downloader, LazyUrl, clean_title
2 | import utils
3 | from m3u8_tools import playlist2stream, M3u8_stream
4 | import os
5 | from hashlib import md5
6 | from translator import tr_
7 | DEFAULT_N_THREAD = 2
8 |
9 |
10 | @Downloader.register
11 | class Downloader_m3u8(Downloader):
12 | type = 'm3u8'
13 | URLS = ['.m3u8']
14 | single = True
15 | display_name = 'M3U8'
16 |
17 | @classmethod
18 | def fix_url(cls, url):
19 | if '://' not in url:
20 | url = 'http://' + url
21 | return url
22 |
23 | def read(self):
24 | n_thread = self.cw.format or DEFAULT_N_THREAD
25 | self.print_('n_thread: {}'.format(n_thread))
26 | video = Video(self.url, n_thread)
27 | self.urls.append(video.url)
28 | self.title = '{} ({})'.format(video.title, video.id_)
29 |
30 |
31 | class Video(object):
32 | def __init__(self, url, n_thread):
33 | try:
34 | m = playlist2stream(url, n_thread=n_thread)
35 | except:
36 | m = M3u8_stream(url, n_thread=n_thread)
37 | self.url = LazyUrl(url, lambda _: m, self)
38 | self.title = os.path.splitext(os.path.basename(url))[0]
39 | self.id_ = md5(url.encode('utf8')).hexdigest()[:8]
40 | tail = ' ({}).mp4'.format(self.id_)
41 | self.filename = clean_title(self.title, n=-len(tail)) + tail
42 |
43 |
44 | import selector
45 | @selector.options('m3u8')
46 | def options():
47 | def f(urls):
48 | n_thread, ok = utils.QInputDialog.getInt(Downloader.mainWindow, tr_('Set number of threads'), tr_('Number of threads?'), value=DEFAULT_N_THREAD, min=1, max=4, step=1)
49 | if not ok:
50 | return
51 | return n_thread
52 | return [
53 | {'text': 'Set number of threads...', 'format': f},
54 | ]
55 |
--------------------------------------------------------------------------------
/src/extractor/vimeo_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | import ree as re
3 | from io import BytesIO as IO
4 | from error_printer import print_error
5 | from utils import Downloader, LazyUrl, get_ext, format_filename, try_n
6 | import ytdl
7 |
8 |
9 |
10 | @Downloader.register
11 | class Downloader_vimeo(Downloader):
12 | type = 'vimeo'
13 | URLS = ['vimeo.com']
14 | single = True
15 |
16 | def init(self):
17 | if 'vimeo.com' not in self.url.lower():
18 | self.url = u'https://vimeo.com/{}'.format(self.url)
19 |
20 | def read(self):
21 | video = Video(self.url, cw=self.cw)
22 |         video.url()  # resolve LazyUrl now to fetch title/thumb
23 |
24 | self.urls.append(video.url)
25 | self.setIcon(video.thumb)
26 |
27 | self.enableSegment()
28 |
29 | self.title = video.title
30 |
31 |
32 | class Video(object):
33 | _url = None
34 |
35 | def __init__(self, url, cw=None):
36 | self.url = LazyUrl(url, self.get, self)
37 | self.cw = cw
38 |
39 | @try_n(4)
40 | def get(self, url):
41 | if self._url:
42 | return self._url
43 |
44 | ydl = ytdl.YoutubeDL(cw=self.cw)
45 | info = ydl.extract_info(url)
46 | fs = [f for f in info['formats'] if f['protocol'] in ['http', 'https']]
47 | fs = sorted(fs, key=lambda f: int(f.get('width', 0)), reverse=True)
48 | if not fs:
49 | raise Exception('No MP4 videos')
50 | f = fs[0]
51 |
52 | self.thumb_url = info['thumbnails'][0]['url']
53 | self.thumb = IO()
54 | downloader.download(self.thumb_url, buffer=self.thumb)
55 | self.title = info['title']
56 | url_video = f['url']
57 |         ext = get_ext(url_video) or '.mp4'
58 | self.filename = format_filename(self.title, info['id'], ext)
59 | self._url = url_video
60 | return self._url
61 |
--------------------------------------------------------------------------------
/src/extractor/talk_op_gg_downloader.py:
--------------------------------------------------------------------------------
1 | # coding: UTF-8
2 | # title: Download talk op.gg image
3 | # author: SaidBySolo
4 | # comment: Downloads images from the op.gg community
5 |
6 | """
7 | MIT License
8 |
9 | Copyright (c) 2020 SaidBySolo
10 |
11 | Permission is hereby granted, free of charge, to any person obtaining a copy
12 | of this software and associated documentation files (the "Software"), to deal
13 | in the Software without restriction, including without limitation the rights
14 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
15 | copies of the Software, and to permit persons to whom the Software is
16 | furnished to do so, subject to the following conditions:
17 |
18 | The above copyright notice and this permission notice shall be included in all
19 | copies or substantial portions of the Software.
20 |
21 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
22 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
24 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
26 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
27 | SOFTWARE.
28 | """
29 |
30 | import requests
31 | from utils import Downloader, Soup
32 |
33 |
34 | @Downloader.register
35 | class DownloaderTalkOPGG(Downloader):
36 | type = "talkopgg"
37 | URLS = ["talk.op.gg"]
38 |
39 | def init(self) -> None:
40 | pass
41 |
42 | def read(self) -> None:
43 | response = requests.get(self.url)
44 | soup = Soup(response.text)
45 |
46 | self.title = soup.find("title").text
47 |
48 | image_element_list = soup.find("div", class_="article-content").findAll("img")
49 |
50 | for image_element in image_element_list:
51 | self.urls.append(image_element["src"])
52 |
--------------------------------------------------------------------------------
/src/extractor/navertv_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | import ree as re
3 | from io import BytesIO as IO
4 | import os
5 | from constants import try_n
6 | from error_printer import print_error
7 | from utils import Downloader, compatstr, LazyUrl, get_ext, format_filename, clean_title
8 | import ytdl
9 |
10 |
11 |
12 | @Downloader.register
13 | class Downloader_navertv(Downloader):
14 | type = 'navertv'
15 | single = True
16 | URLS = ['tv.naver.com']
17 | display_name = 'Naver TV'
18 |
19 | def init(self):
20 | if not re.match('https?://.+', self.url, re.IGNORECASE):
21 | self.url = 'https://tv.naver.com/v/{}'.format(self.url)
22 |
23 | def read(self):
24 | video = Video(self.url, cw=self.cw)
25 |         video.url()  # resolve LazyUrl now to fetch title/thumb
26 |
27 | self.urls.append(video.url)
28 | self.setIcon(video.thumb)
29 |
30 | self.enableSegment()
31 |
32 | self.title = video.title
33 |
34 |
35 |
36 | class Video(object):
37 | _url = None
38 |
39 | def __init__(self, url, cw=None):
40 | self.url = LazyUrl(url, self.get, self)
41 | self.cw = cw
42 |
43 | @try_n(4)
44 | def get(self, url):
45 | if self._url:
46 | return self._url
47 |
48 | ydl = ytdl.YoutubeDL(cw=self.cw)
49 | info = ydl.extract_info(url)
50 | fs = [f for f in info['formats'] if f['protocol'] in ['http', 'https']]
51 | fs = sorted(fs, key=lambda f: int(f.get('width', 0)), reverse=True)
52 | if not fs:
53 | raise Exception('No MP4 videos')
54 | f = fs[0]
55 | self._url = f['url']
56 |
57 | self.thumb_url = info['thumbnails'][0]['url']
58 | self.thumb = IO()
59 | downloader.download(self.thumb_url, buffer=self.thumb)
60 | self.title = info['title']
61 | id = info['id']
62 | ext = get_ext(self._url)
63 | self.filename = format_filename(self.title, id, ext)
64 | return self._url
65 |
--------------------------------------------------------------------------------
/src/extractor/asiansister_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Soup, urljoin, LazyUrl, Downloader, try_n, clean_title
3 | from timee import sleep
4 | import os
5 | import ree as re
6 |
7 |
8 | @Downloader.register
9 | class Downloader_asiansister(Downloader):
10 | type = 'asiansister'
11 | URLS = ['asiansister.com']
12 | display_name = 'AsianSister'
13 |
14 | @try_n(4)
15 | def init(self):
16 | html = downloader.read_html(self.url)
17 | self.soup = Soup(html)
18 |
19 | @property
20 | def name(self):
21 | return clean_title(self.soup.find('title').text.replace('- ASIANSISTER.COM', '').strip())
22 |
23 | def read(self):
24 | imgs = get_imgs(self.url, self.soup, self.name)
25 |
26 | for img in imgs:
27 | if img.type == 'video':
28 | self.single = True
29 | self.urls.append(img.url)
30 |
31 | self.title = self.name
32 |
33 |
34 | class Image(object):
35 | def __init__(self, url, referer, p, type='image'):
36 | self.url = LazyUrl(referer, lambda x: url, self)
37 | ext = os.path.splitext(url.split('?')[0])[1]
38 | self.filename = u'{:04}{}'.format(p, ext)
39 | self.type = type
40 |
41 |
42 | @try_n(4)
43 | def get_imgs(url, soup=None, name=None):
44 | if soup is None:
45 | html = downloader.read_html(url)
46 | soup = Soup(html)
47 |
48 | view = soup.findAll('div', class_='rootContant')[:2][-1]
49 |
50 | v = view.find('video')
51 | if v:
52 | img = v.find('source').attrs['src']
53 | img = urljoin(url, img)
54 | img = Image(img, url, 0, 'video')
55 | ext = os.path.splitext(img.url().split('?')[0])[1]
56 | img.filename = u'{}{}'.format(name, ext)
57 | return [img]
58 |
59 | imgs = []
60 | for img in view.findAll('img'):
61 | img = img.attrs['dataurl']
62 | img = urljoin(url, img)
63 | img = re.sub('/[a-z]+images/', '/images/', img).replace('_t.', '.')
64 | img = Image(img, url, len(imgs))
65 | imgs.append(img)
66 |
67 | return imgs
68 |
--------------------------------------------------------------------------------
/src/extractor/xnxx_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Soup, cut_pair, urljoin, Downloader, LazyUrl, format_filename, clean_title
3 | import ree as re
4 | import m3u8
5 | from m3u8_tools import M3u8_stream, playlist2stream
6 | from timee import sleep
7 | import os
8 | from io import BytesIO as IO
9 |
10 |
11 |
12 | class Video(object):
13 |
14 | def __init__(self, url, url_page, title, url_thumb):
15 | self._url = url
16 | self.url = LazyUrl(url_page, self.get, self)
17 | self.id = get_id(url_page)
18 | self.title = title
19 | self.filename = format_filename(title, self.id, '.mp4')
20 | f = IO()
21 | self.url_thumb = url_thumb
22 | downloader.download(url_thumb, buffer=f)
23 | self.thumb = f
24 |
25 | def get(self, _):
26 | return self._url
27 |
28 |
29 | def get_id(url):
30 | return url.split('xnxx.com/')[1].split('/')[0]
31 |
32 |
33 | @Downloader.register
34 | class Downloader_xnxx(Downloader):
35 | type = 'xnxx'
36 | URLS = [r'regex:xnxx[0-9]*\.(com|es)']
37 | single = True
38 | display_name = 'XNXX'
39 |
40 | @classmethod
41 | def fix_url(cls, url):
42 | return re.sub(r'xnxx[0-9]*\.(com|es)', 'xnxx.com', url)
43 |
44 | def read(self):
45 | video = get_video(self.url)
46 | self.urls.append(video.url)
47 | self.setIcon(video.thumb)
48 | self.title = video.title
49 |
50 |
51 | def get_video(url):
52 | html = downloader.read_html(url)
53 | soup = Soup(html)
54 |
55 | for script in soup.findAll('script'):
56 | script = script.text or script.string or ''
57 | hls = re.find(r'''html5player\.setVideoHLS\(['"](.+?)['"]''', script)
58 | if hls:
59 | break
60 | else:
61 | raise Exception('No VideoHLS')
62 |
63 | video = playlist2stream(hls)
64 |
65 | title = get_title(soup)
66 |
67 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content'].strip()
68 |
69 | video = Video(video, url, title, url_thumb)
70 | return video
71 |
72 |
73 | def get_title(soup):
74 | return soup.find('meta', {'property': 'og:title'}).attrs['content'].strip()
75 |
76 |
--------------------------------------------------------------------------------
/src/extractor/coub_downloader.py:
--------------------------------------------------------------------------------
1 | from utils import Downloader, LazyUrl, try_n, format_filename, get_ext
2 | import ytdl
3 | from io import BytesIO as IO
4 | import downloader
5 | import ree as re
6 | import ffmpeg
7 | PATTERN_IMAGIZER = r'coub-com-.+\.imagizer\.com'
8 |
9 |
10 | def get_id(url):
11 | return re.find(r'/view/([0-9a-z]+)', url, err='no id')
12 |
13 |
14 | @Downloader.register
15 | class Downloader_coub(Downloader):
16 | type = 'coub'
17 |     URLS = ['coub.com', r'regex:'+PATTERN_IMAGIZER]
18 | single = True
19 |
20 | @classmethod
21 | def fix_url(cls, url):
22 |         return re.sub(PATTERN_IMAGIZER, 'coub.com', url)
23 |
24 | @classmethod
25 | def key_id(cls, url):
26 | return get_id(url)
27 |
28 | def read(self):
29 | video = Video(self.url, cw=self.cw)
30 |         video.url()  # resolve LazyUrl now to fetch title/thumb
31 |
32 | self.urls.append(video.url)
33 | self.setIcon(video.thumb)
34 |
35 | self.enableSegment()
36 |
37 | self.title = video.title
38 |
39 |
40 |
41 | class Video(object):
42 | _url = None
43 |
44 | def __init__(self, url, cw=None):
45 | self.url = LazyUrl(url, self.get, self, pp=self.pp)
46 | self.cw = cw
47 |
48 | @try_n(2)
49 | def get(self, url):
50 | if self._url:
51 | return self._url
52 |
53 | ydl = ytdl.YoutubeDL(cw=self.cw)
54 | info = ydl.extract_info(url)
55 | fs = [f for f in info['formats'] if f['ext'] == 'mp4']
56 | f = sorted(fs, key=lambda f: int(f.get('filesize', 0)))[-1]
57 | self._url = f['url']
58 | ## fs = [f for f in info['formats'] if f['ext'] == 'mp3']
59 | ## self.f_audio = sorted(fs, key=lambda f: int(f.get('filesize', 0)))[-1]
60 |
61 | self.thumb_url = info['thumbnails'][0]['url']
62 | self.thumb = IO()
63 | downloader.download(self.thumb_url, buffer=self.thumb)
64 | self.title = info['title']
65 | ext = get_ext(self._url)
66 | self.filename = format_filename(self.title, info['id'], ext)
67 | return self._url
68 |
69 | def pp(self, filename):
70 | ## f = IO()
71 | ## downloader.download(self.f_audio['url'], buffer=f)
72 | ## ffmpeg.merge(filename, f)
73 | return filename
74 |
--------------------------------------------------------------------------------
/src/extractor/kissjav_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Soup, urljoin, Downloader, LazyUrl, Session, try_n, format_filename, clean_title
3 | from timee import sleep
4 | import ree as re
5 | from io import BytesIO
6 | import clf2
7 |
8 |
9 | @Downloader.register
10 | class Downloader_kissjav(Downloader):
11 | type = 'kissjav'
12 | URLS = ['kissjav.com']
13 | single = True
14 | display_name = 'KissJAV'
15 |
16 | def read(self):
17 | self.session = None#get_session(self.url, cw=self.cw)
18 |
19 | video = get_video(self.url, self.session)
20 | self.urls.append(video.url)
21 | self.setIcon(video.thumb)
22 | self.enableSegment(1024*1024//2)
23 |
24 | self.title = video.title
25 |
26 |
27 | @try_n(2)
28 | def get_video(url, session):
29 | soup = downloader.read_soup(url, session=session)
30 |
31 | view = soup.find('div', id='player-container-fluid')
32 | src_best = None
33 | res_best = -1
34 | for source in view.findAll('source'):
35 | src = urljoin(url, source.attrs['src'])
36 | res = re.find('([0-9]+)p', source.attrs['title'])
37 | res = int(res) if res else 0
38 | if res > res_best:
39 | src_best = src
40 | res_best = res
41 |
42 | if src_best is None:
43 | raise Exception('No source')
44 |
45 | title = soup.find('h1').text.strip()
46 | id = soup.find('div', id='video').attrs['data-id']
47 |
48 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
49 |
50 | #src_best = downloader.real_url(src_best)
51 |
52 | video = Video(src_best, url_thumb, url, title, id, session)
53 | return video
54 |
55 |
56 | class Video(object):
57 | def __init__(self, url, url_thumb, referer, title, id, session):
58 | self.title = title
59 | self.filename = format_filename(title, id, '.mp4')
60 | self.url = LazyUrl(referer, lambda x: url, self)
61 |
62 | self.thumb = BytesIO()
63 | self.url_thumb = url_thumb
64 | downloader.download(url_thumb, buffer=self.thumb, session=session)
65 |
66 |
67 | @try_n(2)
68 | def get_session(url, cw=None):
69 | session = Session()
70 | clf2.solve(url, session=session, cw=cw)
71 | return session
72 |
73 |
--------------------------------------------------------------------------------
/src/extractor/avgle_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf8
2 | import downloader
3 | import os
4 | from m3u8_tools import M3u8_stream
5 | from utils import Soup, Downloader, LazyUrl, get_print, try_n, clean_title, check_alive
6 | from io import BytesIO
7 | import constants
8 | from error_printer import print_error
9 | import base64
10 | import json
11 | import webbrowser
12 | import errors
13 |
14 |
15 | @Downloader.register
16 | class Downloader_avgle(Downloader):
17 | type = 'avgle'
18 | single = True
19 | URLS = ['avgle.com']
20 |
21 | def init(self):
22 | if not self.cw.data_:
23 | link = 'https://github.com/KurtBestor/Hitomi-Downloader/wiki/Chrome-Extension'
24 | webbrowser.open(link)
25 | raise errors.Invalid('No data; See: {}'.format(link))
26 |
27 | def read(self):
28 | video = get_video(self.url, cw=self.cw)
29 | self.urls.append(video.url)
30 |
31 | self.setIcon(video.thumb)
32 |
33 | self.title = video.title
34 |
35 |
36 | @try_n(2)
37 | def get_video(url, cw=None):
38 | print_ = get_print(cw)
39 |
40 | check_alive(cw)
41 |
42 | data = cw.data_
43 | version = data['version']
44 | print_('version: {}'.format(version))
45 | if version == '0.1':
46 | raise errors.OutdatedExtension()
47 | data = data['data']
48 | if not isinstance(data, bytes):
49 | data = data.encode('utf8')
50 | s = base64.b64decode(data).decode('utf8')
51 | urls = json.loads(s)
52 |
53 | print_(u'\n'.join(urls[:4]))
54 |
55 | referer_seg = 'auto' if 'referer=force' in urls[0] else None # 1718
56 |
57 | stream = M3u8_stream(url, urls=urls, n_thread=4, referer_seg=referer_seg)
58 |
59 | html = downloader.read_html(url)
60 | soup = Soup(html)
61 |
62 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
63 | title = soup.find('meta', {'property': 'og:title'}).attrs['content'].strip()
64 |
65 | video = Video(stream, url_thumb, url, title)
66 |
67 | return video
68 |
69 |
70 | class Video(object):
71 | def __init__(self, url, url_thumb, referer, title):
72 | self.url = LazyUrl(referer, lambda x: url, self)
73 | self.url_thumb = url_thumb
74 | self.thumb = BytesIO()
75 | downloader.download(url_thumb, referer=referer, buffer=self.thumb)
76 | self.title = title
77 | ext = '.mp4'
78 | self.filename = u'{}{}'.format(clean_title(title, n=-len(ext)), ext)
79 |
80 |
81 |
--------------------------------------------------------------------------------
/src/extractor/vlive_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | import ytdl
3 | from utils import Downloader, try_n, LazyUrl, get_ext, format_filename, clean_title, pp_subtitle
4 | from io import BytesIO
5 | import ree as re
6 | from m3u8_tools import M3u8_stream
7 | import os
8 |
9 |
10 | @Downloader.register
11 | class Downloader_vlive(Downloader):
12 | type = 'vlive'
13 | URLS = ['vlive.tv']
14 | single = True
15 | display_name = 'V LIVE'
16 |
17 | def init(self):
18 | if 'channels.vlive.tv' in self.url:
19 | raise NotImplementedError('channel')
20 |
21 | def read(self):
22 | cw = self.cw
23 | video = get_video(self.url, cw=cw)
24 |
25 | self.urls.append(video.url)
26 |
27 | self.setIcon(video.thumb)
28 | self.enableSegment()
29 |
30 | self.title = clean_title(video.title)
31 |
32 |
33 | @try_n(4)
34 | def get_video(url, cw=None):
35 | options = {
36 | 'noplaylist': True,
37 | }
38 |
39 | ydl = ytdl.YoutubeDL(options, cw=cw)
40 | info = ydl.extract_info(url)
41 |
42 | fs = []
43 | for f in info['formats']:
44 | if f['ext'] != 'mp4':
45 | continue
46 | f['quality'] = f.get('vbr') or re.find('([0-9]+)p', f['format'], re.IGNORECASE)
47 | print(f['format'], f['quality'])
48 | fs.append(f)
49 |
50 | if not fs:
51 | raise Exception('No videos')
52 |
53 | f = sorted(fs, key=lambda f:f['quality'])[-1]
54 |
55 | subs = {}
56 | for sub, items in info['subtitles'].items():
57 | sub = sub.split('_')[0]
58 | for item in items:
59 | if item['ext'] != 'vtt':
60 | continue
61 | subs[sub] = item['url']
62 | video = Video(f, info, subs, cw)
63 |
64 | return video
65 |
66 |
67 | class Video(object):
68 | def __init__(self, f, info, subs, cw=None):
69 | self.title = title = info['title']
70 | self.id = info['id']
71 | self.url = f['url']
72 | self.subs = subs
73 | self.cw = cw
74 |
75 | self.thumb = BytesIO()
76 | downloader.download(info['thumbnail'], buffer=self.thumb)
77 |
78 | ext = get_ext(self.url)
79 | if ext.lower() == '.m3u8':
80 | raise NotImplementedError('stream')#
81 | url = M3u8_stream(self.url, n_thread=4)
82 | else:
83 | url = self.url
84 | self.url = LazyUrl(self.url, lambda x: url, self, pp=self.pp)
85 | self.filename = format_filename(title, self.id, ext)
86 |
87 | def pp(self, filename):
88 | pp_subtitle(self, filename, self.cw)
89 | return filename
90 |
91 |
92 |
--------------------------------------------------------------------------------
/src/extractor/yandere_downloader.py:
--------------------------------------------------------------------------------
1 | from utils import Downloader, urljoin, clean_title, try_n, check_alive, LazyUrl, get_ext, get_max_range
2 | from translator import tr_
3 | import ree as re
4 | import downloader
5 | from ratelimit import limits, sleep_and_retry
6 |
7 |
8 | @try_n(4)
9 | @sleep_and_retry
10 | @limits(4, 1)
11 | def read_soup(url):
12 | return downloader.read_soup(url)
13 |
14 |
15 | @Downloader.register
16 | class Downloader_yandere(Downloader):
17 | type = 'yande.re'
18 | URLS = ['yande.re']
19 | MAX_CORE = 4
20 |
21 | @classmethod
22 | def fix_url(cls, url):
23 | url = re.sub(r'([?&])page=[0-9]+&?', r'\1', url).rstrip('?&')
24 | pool = re.find('/pool/show/([0-9]+)', url)
25 | if pool is not None:
26 | url = urljoin(url, '/post?tags=pool%3A{}'.format(pool))
27 | return url
28 |
29 | def read(self):
30 | title = self.get_title(self.url)
31 |
32 | url = self.url
33 | n = get_max_range(self.cw)
34 | ids = set()
35 | while True:
36 | check_alive(self.cw)
37 | soup = read_soup(url)
38 | for a in soup.find_all('a', class_='thumb'):
39 | id_ = re.find(r'/show/([0-9]+)', a['href'], err='no id')
40 | if id_ in ids:
41 | self.print_(f'dup: {id_}')
42 | continue
43 | ids.add(id_)
44 | img = Image(urljoin(url, a['href']), id_)
45 | self.urls.append(img.url)
46 | if len(self.urls) >= n:
47 | del self.urls[n:]
48 | break
49 |
50 | self.cw.setTitle('{} {} - {}'.format(tr_('읽는 중...'), title, len(self.urls)))
51 |
52 | next_page = soup.find('a', attrs={'rel':'next'}, href=True)
53 |             if not next_page or len(self.urls) >= n:  # stop paging once the max range is reached
54 | break
55 | else:
56 | url = urljoin(self.url, next_page['href'])
57 |
58 | self.title = title
59 |
60 |     def get_id(self, url:str) -> int:
61 | id_ = url.split('yande.re%20')[1].split('%20')[0]
62 | return int(id_)
63 |
64 | def get_title(self, url:str) -> str:
65 | if "tags=" not in url:
66 | raise NotImplementedError('no tags')
67 |
68 | url_tags = url.split("tags=")[-1].split('+')
69 |
70 | return clean_title(" ".join(url_tags))
71 |
72 |
73 | class Image:
74 |
75 | def __init__(self, url, id_):
76 | self._id = id_
77 | self.url = LazyUrl(url, self.get, self)
78 |
79 | def get(self, url):
80 | soup = read_soup(url)
81 | img = soup.find('a', class_='original-file-unchanged') or soup.find('a', class_='original-file-changed')
82 | img = urljoin(url, img['href'])
83 | ext = get_ext(img)
84 | self.filename = clean_title(self._id, n=-len(ext)) + ext
85 | return img
86 |
--------------------------------------------------------------------------------
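
For reference, the URL normalization performed by Downloader_yandere.fix_url above can be exercised standalone; this sketch reimplements it with the stdlib re module only (the fix_url helper here is hypothetical, not the repo's):

import re

def fix_url(url):
    url = re.sub(r'([?&])page=[0-9]+&?', r'\1', url).rstrip('?&')  # drop the page= parameter
    m = re.search(r'/pool/show/([0-9]+)', url)
    if m:  # pool pages become tag searches, as in the original
        url = 'https://yande.re/post?tags=pool%3A{}'.format(m.group(1))
    return url

print(fix_url('https://yande.re/post?tags=landscape&page=3'))  # -> .../post?tags=landscape
print(fix_url('https://yande.re/pool/show/1234'))              # -> .../post?tags=pool%3A1234
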
/translation/tr_ko.hdl:
--------------------------------------------------------------------------------
1 | {
2 | "lang": "ko",
3 | "items": {
4 | "#Cancel#": "취소",
5 | "#EB#": "{} EB",
6 | "#GB#": "{} GB",
7 | "#GIFs#": "GIF / WebP",
8 | "#KB#": "{} KB",
9 | "#KB/s#": "{} KB/s",
10 | "#MB#": "{} MB",
11 | "#MB/s#": "{} MB/s",
12 | "#OK#": "확인",
13 | "#PB#": "{} PB",
14 | "#TB#": "{} TB",
15 | "#boss_invalid_pw#": "Invalid password!",
16 | "#boss_pw#": "Password:",
17 | "#byte#": "{} byte",
18 | "#bytes#": "{} bytes",
19 | "#click#": "Click",
20 | "#combo_hour#": "{} 시간",
21 | "#combo_hours#": "{} 시간",
22 | "#combo_min#": "{} 분",
23 | "#combo_mins#": "{} 분",
24 | "#date01#": "1월 {d}일",
25 | "#date01y#": "1월 {d}일, {y}",
26 | "#date02#": "2월 {d}일",
27 | "#date02y#": "2월 {d}일, {y}",
28 | "#date03#": "3월 {d}일",
29 | "#date03y#": "3월 {d}일, {y}",
30 | "#date04#": "4월 {d}일",
31 | "#date04y#": "4월 {d}일, {y}",
32 | "#date05#": "5월 {d}일",
33 | "#date05y#": "5월 {d}일, {y}",
34 | "#date06#": "6월 {d}일",
35 | "#date06y#": "6월 {d}일, {y}",
36 | "#date07#": "7월 {d}일",
37 | "#date07y#": "7월 {d}일, {y}",
38 | "#date08#": "8월 {d}일",
39 | "#date08y#": "8월 {d}일, {y}",
40 | "#date09#": "9월 {d}일",
41 | "#date09y#": "9월 {d}일, {y}",
42 | "#date10#": "10월 {d}일",
43 | "#date10y#": "10월 {d}일, {y}",
44 | "#date11#": "11월 {d}일",
45 | "#date11y#": "11월 {d}일, {y}",
46 | "#date12#": "12월 {d}일",
47 | "#date12y#": "12월 {d}일, {y}",
48 | "#eta#": "{h:02}:{m:02}:{s:02}",
49 | "#filter_cookie#": "Netscape HTTP Cookie Files (*.txt)",
50 | "#invalid_browserRequired#": "Browser required; Use --safemode",
51 | "#invalid_loginRequired#": "Login required; Update your cookies",
52 | "#invalid_outdatedExtension#": "Extension is outdated; Update the extension",
53 | "#invalid_unknownSite#": "Unknown site",
54 | "#loading_lib#": "Loading: {}",
55 | "#new_item#": "New Item",
56 | "#p#": "{}p",
57 | "#recomm_all_langs#": "모든 언어",
58 | "#recomm_artist#": "작가",
59 | "#recomm_main#": "저장 폴더에 있는 작품들을 분석해서 작가를 추천합니다.\n\n결과에 나오는 정확도는 주어진 작품 내에서의 정확도입니다.\n작품은 많으면 많을수록 좋습니다. (100 개 이상 권장)\n\n{item} 개의 작품이 있습니다:",
60 | "#recomm_score#": "점수",
61 | "#setting_MB/s#": "MB/s",
62 | "#setting_autosaveL#": "",
63 | "#setting_autosaveR#": "마다",
64 | "#setting_incompleteL#": "",
65 | "#setting_incompleteR#": "후",
66 | "#task_artist#": "Artist",
67 | "#task_date#": "Date",
68 | "#task_done#": "Done",
69 | "#task_folder#": "Folder",
70 | "#task_incomplete#": "Incomplete",
71 | "#task_input#": "Input",
72 | "#task_invalid#": "Invalid",
73 | "#task_multiple#": "Multiple",
74 | "#task_single#": "Single",
75 | "#task_site#": "Site",
76 | "#task_status#": "Status",
77 | "#task_title#": "Title",
78 | "#task_type#": "Type",
79 | "#task_url#": "URL",
80 | "#task_zipfile#": "ZipFile"
81 | }
82 | }
--------------------------------------------------------------------------------
/src/extractor/hentaicosplay_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf8
2 | import downloader
3 | from utils import Downloader, Session, Soup, LazyUrl, urljoin, get_ext, clean_title
4 | import ree as re
5 | from translator import tr_
6 | import clf2
7 | from ratelimit import limits, sleep_and_retry
8 |
9 |
10 |
11 | class Image:
12 |
13 | def __init__(self, url, referer, p, session):
14 | self._url = url
15 | self._p = p
16 | self.url = LazyUrl(referer, self.get, self)
17 | self.session = session
18 |
19 | @sleep_and_retry
20 | @limits(2, 1)
21 | def get(self, referer):
22 | soup = downloader.read_soup(self._url, referer, session=self.session)
23 | div = soup.find('div', id='display_image_detail')
24 | url = urljoin(self._url, div.find('img').parent['href'])
25 | ext = get_ext(url)
26 | self.filename = '{:04}{}'.format(self._p, ext)
27 | return url, self._url
28 |
29 |
30 | @Downloader.register
31 | class Downloader_hentaicosplay(Downloader):
32 | type = 'hentaicosplay'
33 | URLS = ['hentai-cosplays.com']
34 | icon = None
35 | display_name = 'Hentai Cosplay'
36 | MAX_CORE = 4
37 |
38 | @classmethod
39 | def fix_url(cls, url):
40 | url = re.sub(r'/page/[0-9]+', '', url)
41 | url = re.sub(r'/attachment/[0-9]+', '', url)
42 | url = re.sub(r'([a-zA-Z]+\.)hentai-cosplays\.com', 'hentai-cosplays.com', url)
43 | return url
44 |
45 | def init(self):
46 | self.session = Session()
47 |
48 | def read(self):
49 | if '/image/' not in self.url:
50 | raise NotImplementedError('Not a post')
51 |
52 | res = clf2.solve(self.url, session=self.session, cw=self.cw)
53 | soup = Soup(res['html'])
54 | title = soup.find('h2').text
55 | paginator = soup.find('div', id='paginator')
56 | pages = [self.url]
57 | for a in paginator.findAll('a'):
58 | href = a.get('href')
59 | if not href:
60 | continue
61 | href = urljoin(self.url, href)
62 | if href not in pages:
63 | pages.append(href)
64 |
65 | imgs = []
66 | for i, page in enumerate(pages):
67 | if page == self.url:
68 | soup_page = soup
69 | else:
70 | soup_page = downloader.read_soup(page, session=self.session)
71 | view = soup_page.find('div', id='post')
72 | for img in view.findAll('img'):
73 | href = img.parent['href']
74 | href = urljoin(page, href)
75 | img = Image(href, page, len(imgs), self.session)
76 | imgs.append(img)
77 | self.cw.setTitle('{} {} ({} / {})'.format(tr_('읽는 중...'), title, i+1, len(pages)))
78 |
79 | for img in imgs:
80 | self.urls.append(img.url)
81 |
82 | self.title = clean_title(title)
83 |
84 |
--------------------------------------------------------------------------------
/src/extractor/asmhentai_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf8
2 | import downloader
3 | import ree as re
4 | from utils import Soup, urljoin, Downloader, join
5 | import os
6 |
7 |
8 |
9 | def get_id(url):
10 | try:
11 | return int(url)
12 | except:
13 | if '/gallery/' in url:
14 | return int(re.find('/gallery/[0-9]+/([0-9]+)', url))
15 | else:
16 | return int(re.find('/g/([0-9]+)', url))
17 |
18 |
19 | @Downloader.register
20 | class Downloader_asmhentai(Downloader):
21 | type = 'asmhentai'
22 | URLS = ['asmhentai.com']
23 | MAX_CORE = 8
24 | display_name = 'AsmHentai'
25 |
26 | def init(self):
27 | pass
28 |
29 | @classmethod
30 | def fix_url(cls, url):
31 | id_ = get_id(url)
32 | return 'https://asmhentai.com/g/{}/'.format(id_)
33 |
34 | def read(self):
35 | info, imgs = get_imgs(self.url)
36 |
37 | # 1225
38 | artist = join(info['artists'])
39 | self.artist = artist
40 | group = join(info['groups']) if info['groups'] else u'N/A'
41 | lang = info['language'][0] if info['language'] else u'N/A'
42 | series = info['parodies'][0] if info['parodies'] else u'N/A'
43 | title = self.format_title(info['category'][0], info['id'], info['title'], artist, group, series, lang)
44 |
45 | self.urls += imgs
46 |
47 | self.title = title
48 |
49 |
50 |
51 | def get_imgs(url):
52 | html = downloader.read_html(url)
53 | soup = Soup(html)
54 |
55 | info = get_info(url, soup)
56 |
57 | view = soup.find('div', class_='gallery')
58 |
59 | imgs = []
60 | for img in view.findAll('div', class_='preview_thumb'):
61 | img = img.find('img').attrs.get('data-src') or img.find('img').attrs.get('src')
62 | img = urljoin(url, img).replace('t.jpg', '.jpg')
63 | imgs.append(img)
64 |
65 | return info, imgs
66 |
67 |
68 | def get_info(url, soup=None):
69 | if soup is None:
70 | html = downloader.read_html(url)
71 | soup = Soup(html)
72 |
73 | info = {}
74 |
75 | info['id'] = get_id(url)
76 |
77 | title = soup.find('h1').text.strip()
78 | info['title'] = title
79 |
80 | for tag in soup.findAll('span', class_='tag'):
81 | href = tag.parent.attrs['href']
82 | href = urljoin(url, href).strip('/')
83 |
84 | key = href.split('/')[3]
85 | value = href.split('/')[-1]
86 |
87 | if key == 'language' and value == 'translated':
88 | continue
89 |
90 | if key in info:
91 | info[key].append(value)
92 | else:
93 | info[key] = [value]
94 |
95 | for key in ['artists', 'groups', 'parodies', 'tags', 'characters']:
96 | if key not in info:
97 | info[key] = []
98 |
99 | return info
100 |
101 |
--------------------------------------------------------------------------------
/src/extractor/fc2_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | import ree as re
3 | from utils import urljoin, Downloader, format_filename, Soup, LazyUrl, get_print, Session
4 | from m3u8_tools import M3u8_stream
5 | from io import BytesIO
6 | PATTERN_ID = r'/content/([^/]+)'
7 |
8 |
9 | @Downloader.register
10 | class Downloader_fc2(Downloader):
11 | type = 'fc2'
12 | single = True
13 | URLS = ['video.fc2.com']
14 |
15 | @classmethod
16 | def fix_url(cls, url):
17 | if not re.match('https?://.+', url, re.IGNORECASE):
18 | url = 'https://video.fc2.com/content/{}'.format(url)
19 | return url
20 |
21 | @classmethod
22 | def key_id(cls, url):
23 | return re.find(PATTERN_ID, url) or url
24 |
25 | def read(self):
26 | self.session = Session()
27 | self.session.cookies.set('_ac', '1', domain='.video.fc2.com')
28 | info = get_info(self.url, self.session, self.cw)
29 |
30 | video = info['videos'][0]
31 |
32 | self.urls.append(video.url)
33 |
34 | f = BytesIO()
35 | downloader.download(video.url_thumb, referer=self.url, buffer=f)
36 | self.setIcon(f)
37 |
38 | self.title = info['title']
39 |
40 |
41 | class Video(object):
42 |
43 | def __init__(self, url, url_thumb, referer, title, id_, session):
44 | self._url = url
45 | self.url = LazyUrl(referer, self.get, self)
46 | self.filename = format_filename(title, id_, '.mp4')
47 | self.url_thumb = url_thumb
48 | self.session = session
49 |
50 | def get(self, referer):
51 | ext = downloader.get_ext(self._url, session=self.session, referer=referer)
52 | if ext == '.m3u8':
53 | video = M3u8_stream(self._url, referer=referer, session=self.session, n_thread=4)
54 | else:
55 | video = self._url
56 | return video
57 |
58 |
59 | def get_info(url, session, cw=None):
60 | print_ = get_print(cw)
61 | info = {'videos': []}
62 | html = downloader.read_html(url, session=session)
63 | soup = Soup(html)
64 | info['title'] = soup.find('h2', class_='videoCnt_title').text.strip()
65 |
66 | id_ = re.find(PATTERN_ID, url, err='no id')
67 | print_('id: {}'.format(id_))
68 | token = re.find(r'''window.FC2VideoObject.push\(\[['"]ae['"], *['"](.+?)['"]''', html, err='no token')
69 | print_('token: {}'.format(token))
70 |
71 | url_api = 'https://video.fc2.com/api/v3/videoplaylist/{}?sh=1&fs=0'.format(id_)
72 | hdr = {
73 | 'X-FC2-Video-Access-Token': token,
74 | }
75 | data = downloader.read_json(url_api, url, session=session, headers=hdr)
76 |
77 | pl = data['playlist']
78 | url_video = urljoin(url, pl.get('hq') or pl.get('nq') or pl['sample']) #3784
79 | url_thumb = soup.find('meta', {'property':'og:image'})['content']
80 | video = Video(url_video, url_thumb, url, info['title'], id_, session)
81 | info['videos'].append(video)
82 |
83 | return info
84 |
--------------------------------------------------------------------------------
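
The playlist lookup in get_info() above boils down to one authenticated GET against the v3 videoplaylist endpoint. A hedged sketch of the same call with plain requests (fetch_playlist is illustrative; the endpoint, header name, and playlist keys are taken from the code above):

import requests

def fetch_playlist(video_id, token, referer):
    url_api = 'https://video.fc2.com/api/v3/videoplaylist/{}?sh=1&fs=0'.format(video_id)
    r = requests.get(url_api, headers={
        'X-FC2-Video-Access-Token': token,  # token scraped from window.FC2VideoObject
        'Referer': referer,
    })
    r.raise_for_status()
    return r.json()  # expected to expose data['playlist'] with 'hq' / 'nq' / 'sample' keys
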
/src/extractor/v2ph_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from __future__ import division, print_function, unicode_literals
3 | import downloader
4 | from utils import Soup, get_ext, LazyUrl, Downloader, try_n, clean_title, get_print
5 | import ree as re
6 | from translator import tr_
7 | from timee import sleep
8 | import errors
9 |
10 |
11 | def setPage(url, p):
12 | url = url.split('?')[0]
13 | if p > 1:
14 | url += '?page={}'.format(p)
15 | return url
16 |
17 |
18 | def getPage(url):
19 | p = re.find('page=([0-9]+)', url)
20 | return int(p or 1)
21 |
22 |
23 | class Image(object):
24 | def __init__(self, url, referer, p):
25 | self.url = LazyUrl(referer, lambda x: url, self)
26 | ext = get_ext(url)
27 | self.filename = '{:04}{}'.format(p, ext)
28 |
29 |
30 | @Downloader.register
31 | class Downloader_v2ph(Downloader):
32 | type = 'v2ph'
33 | URLS = ['v2ph.com/album/']
34 | MAX_CORE = 4
35 | display_name = 'V2PH'
36 |
37 | @classmethod
38 | def fix_url(cls, url):
39 | return url.split('?')[0]
40 |
41 | def read(self):
42 | info = get_info(self.url)
43 |
44 | for img in get_imgs(self.url, info['title'], self.cw):
45 | self.urls.append(img.url)
46 |
47 | self.title = clean_title(info['title'])
48 |
49 |
50 |
51 | @try_n(2)
52 | def get_info(url):
53 | html = downloader.read_html(url)
54 | soup = Soup(html)
55 | info = {}
56 | info['title'] = soup.find('h1').text.strip()
57 | return info
58 |
59 |
60 | def get_imgs(url, title, cw=None):
61 | print_ = get_print(cw)
62 | imgs = []
63 |
64 | for p in range(1, 1001):
65 | url = setPage(url, p)
66 | print_(url)
67 | for try_ in range(4):
68 | try:
69 | html = downloader.read_html(url, user_agent=downloader.hdr['User-Agent'])
70 | #sleep(1)
71 | break
72 | except Exception as e:
73 | print(e)
74 | else:
75 | raise
76 | soup = Soup(html)
77 |
78 | view = soup.find('div', class_='photos-list')
79 | if view is None:
80 | if p == 1:
81 | raise errors.LoginRequired()
82 | else:
83 | break # Guest user
84 | for img in view.findAll('img'):
85 | img = img.attrs['data-src']
86 | img = Image(img, url, len(imgs))
87 | imgs.append(img)
88 |
89 | pgn = soup.find('ul', class_='pagination')
90 | ps = [getPage(a.attrs['href']) for a in pgn.findAll('a')] if pgn else []
91 | if not ps or p >= max(ps):
92 | print('max p')
93 | break
94 |
95 | msg = '{} {} ({} / {})'.format(tr_('읽는 중...'), title, p, max(ps))
96 | if cw:
97 | cw.setTitle(msg)
98 | else:
99 | print(msg)
100 |
101 | return imgs
102 |
103 |
104 |
--------------------------------------------------------------------------------
/src/extractor/afreeca_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Soup, Downloader, get_outdir, Session, LazyUrl, try_n, format_filename, get_print
3 | import ree as re
4 | from timee import sleep, time
5 | import os
6 | from io import BytesIO
7 | import shutil
8 | from m3u8_tools import playlist2stream, M3u8_stream
9 | import errors
10 |
11 |
12 | class Video(object):
13 |
14 | def __init__(self, stream, referer, id, title, url_thumb):
15 | self.url = LazyUrl(referer, lambda x: stream, self)
16 | self.id = id
17 | self.title = title
18 | self.filename = format_filename(title, id, '.mp4')
19 | self.url_thumb = url_thumb
20 | self.thumb = BytesIO()
21 | downloader.download(url_thumb, buffer=self.thumb)
22 |
23 |
24 | @Downloader.register
25 | class Downloader_afreeca(Downloader):
26 | type = 'afreeca'
27 | URLS = ['afreecatv.com']
28 | single = True
29 | display_name = 'AfreecaTV'
30 |
31 | @classmethod
32 | def fix_url(cls, url):
33 | return url.rstrip(' /')
34 |
35 | def read(self):
36 | session = Session()
37 | video = get_video(self.url, session, self.cw)
38 | self.urls.append(video.url)
39 |
40 | self.setIcon(video.thumb)
41 |
42 | self.title = video.title
43 |
44 |
45 | @try_n(4)
46 | def _get_stream(url_m3u8):
47 | print('_get_stream', url_m3u8)
48 | try:
49 | stream = playlist2stream(url_m3u8)
50 | except Exception as e:
51 | print(e)
52 | stream = M3u8_stream(url_m3u8)
53 | return stream
54 |
55 |
56 | @try_n(8)
57 | def get_video(url, session, cw):
58 | print_ = get_print(cw)
59 | html = downloader.read_html(url, session=session)
60 | if "document.location.href='https://login." in html:
61 | raise errors.LoginRequired()
62 | soup = Soup(html)
63 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
64 | print_('url_thumb: {}'.format(url_thumb))
65 | params = re.find('VodParameter *= *[\'"]([^\'"]+)[\'"]', html, err='No VodParameter')
66 | params += '&adultView=ADULT_VIEW&_={}'.format(int(time()*1000))
67 | url_xml = 'http://stbbs.afreecatv.com:8080/api/video/get_video_info.php?' + params
68 | print(url_xml)
69 | html = downloader.read_html(url_xml, session=session, referer=url)
70 | soup = Soup(html)
71 | if 'PARTIAL_ADULT' in html:
72 | raise errors.LoginRequired()
73 | title = soup.find('title').string.strip()
74 | urls_m3u8 = re.findall('https?://[^>]+playlist.m3u8', html)
75 | if not urls_m3u8:
76 | raise Exception('no m3u8')
77 | streams = []
78 | for url_m3u8 in urls_m3u8:
79 | try:
80 | stream = _get_stream(url_m3u8)
81 | except Exception as e:
82 | print(e)
83 | continue #2193
84 | streams.append(stream)
85 | for stream in streams[1:]:
86 | streams[0] += stream
87 | stream = streams[0]
88 | id = url.split('/')[(-1)].split('?')[0].split('#')[0]
89 | video = Video(stream, url, id, title, url_thumb)
90 | return video
91 |
--------------------------------------------------------------------------------
/src/extractor/wikiart_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | import json
4 | from utils import LazyUrl, Downloader, Soup, get_print, clean_title
5 | import os
6 | from timee import sleep
7 | from translator import tr_
8 |
9 |
10 |
11 | class Image(object):
12 | def __init__(self, url, referer, title, id):
13 | self.url = LazyUrl(referer, lambda _: url, self)
14 | ext = os.path.splitext(url.split('?')[0])[1]
15 | n = len(id) + len(ext) + 3
16 | title = clean_title(title, n=-n)
17 | self.filename = u'{} - {}{}'.format(id, title, ext)
18 |
19 |
20 |
21 | @Downloader.register
22 | class Downloader_wikiart(Downloader):
23 | type = 'wikiart'
24 | URLS = ['wikiart.org']
25 | display_name = 'WikiArt'
26 |
27 | def init(self):
28 | self.url = u'https://www.wikiart.org/en/{}'.format(self.id_)
29 | html = downloader.read_html(self.url)
30 | self.soup = Soup(html)
31 |
32 | @property
33 | def id_(self):
34 | return get_id(self.url)
35 |
36 | def read(self):
37 | artist = get_artist(self.id_, self.soup)
38 | self.artist = artist
39 |
40 | for img in get_imgs(self.url, artist, cw=self.cw):
41 | self.urls.append(img.url)
42 |
43 | self.title = clean_title(artist)
44 |
45 |
46 |
47 | def get_id(url):
48 | userid = url.split('?')[0].split('#')[0].split('wikiart.org/')[1].split('/')[1]
49 | return userid
50 |
51 |
52 | def get_imgs(url, artist, cw=None):
53 | print_ = get_print(cw)
54 | userid = get_id(url)
55 | print(userid)
56 |
57 | imgs = []
58 | ids = set()
59 | for p in range(1, 100):
60 | url_api = 'https://www.wikiart.org/en/{}/mode/all-paintings?json=2&layout=new&page={}&resultType=masonry'.format(userid, p)
61 | print(url_api)
62 | data_raw = downloader.read_html(url_api, referer=url)
63 | data = json.loads(data_raw)
64 |
65 | _imgs = data['Paintings']
66 | n = data['AllPaintingsCount']
67 |
68 | if not _imgs:
69 |             print_('no paintings in response; stop')
70 | break
71 |
72 |         for painting in _imgs:
73 |             img = painting['image']
74 |             id = painting['id']
75 |             referer = painting['paintingUrl']
76 |             title = painting['title']
77 | if id in ids:
78 | print(u'duplicate: {}'.format(id))
79 | continue
80 | ids.add(id)
81 | img = Image(img, referer, title, id)
82 | imgs.append(img)
83 |
84 | s = u'{} {} - {} / {}'.format(tr_(u'읽는 중...'), artist, len(imgs), n)
85 | if cw:
86 | if not cw.valid or not cw.alive:
87 | return []
88 | cw.setTitle(s)
89 | else:
90 | print(s)
91 |
92 | if len(imgs) == n:
93 | print_('full')
94 | break
95 |
96 | return imgs
97 |
98 |
99 | def get_artist(userid, soup=None):
100 | if soup is None:
101 | url = u'https://www.wikiart.org/en/{}'.format(userid)
102 | html = downloader.read_html(url)
103 | soup = Soup(html)
104 |
105 | return soup.find('h3').text.strip()
106 |
107 |
--------------------------------------------------------------------------------
/src/extractor/navercafe_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from utils import Downloader, get_print, urljoin, Soup, get_ext, LazyUrl, clean_title, downloader, re, try_n, errors, json
3 |
4 |
5 | @Downloader.register
6 | class Downloader_navercafe(Downloader):
7 | type = 'navercafe'
8 | URLS = ['cafe.naver.com']
9 |
10 | @classmethod
11 | def fix_url(cls, url):
12 | m = re.find(r'cafe\.naver\.com/([^/?#]+).+?articleid%3D([0-9]+)', url)
13 | if m:
14 | url = 'https://cafe.naver.com/{}/{}'.format(*m)
15 | return url
16 |
17 | def read(self):
18 | info = get_info(self.url, self.cw)
19 | for img in info['imgs']:
20 | self.urls.append(img.url)
21 | tail = ' ({}_{})'.format(info['cafename'], info['id'])
22 | self.title = clean_title(info['title'], n=-len(tail)) + tail
23 |
24 |
25 | @try_n(4)
26 | def get_info(url, cw=None):
27 | print_ = get_print(cw)
28 | info = {}
29 |
30 | html = downloader.read_html(url)
31 | if '"cafe_cautionpage"' in html:
32 | raise errors.LoginRequired()
33 | url_article = re.find(r'''//cafe\.naver\.com/ArticleRead\.nhn\?articleid=[0-9]+&clubid=[0-9]+''', html, err='no iframe')
34 | url_article = urljoin(url, url_article)
35 |
36 | print_(url_article)
37 |
38 | articleid = re.find(r'articleid=([0-9]+)', url_article)
39 | clubid = re.find(r'clubid=([0-9]+)', url_article)
40 | url_api = f'https://apis.naver.com/cafe-web/cafe-articleapi/v2/cafes/{clubid}/articles/{articleid}?query=&useCafeId=true&requestFrom=A'
41 |
42 | j = downloader.read_json(url_api, url)
43 |
44 | info['title'] = j['result']['article']['subject']
45 | info['cafename'] = j['result']['cafe']['url']
46 | info['cafeid'] = clubid
47 | info['id'] = articleid
48 |
49 | html_content = j['result']['article']['contentHtml']
50 | soup = Soup(html_content)
51 |
52 | imgs = []
53 |
54 | pairs = []
55 |
56 | for video in soup.findAll('span', class_='_naverVideo'):
57 | vid = video.attrs['vid']
58 | key = video.attrs['key']
59 | pairs.append((vid, key))
60 |
61 | for script in soup.findAll('script', class_='__se_module_data'):
62 | data_raw = script['data-module']
63 | data = json.loads(data_raw)['data']
64 | vid = data.get('vid')
65 | if not vid:
66 | continue
67 | key = data['inkey']
68 | pairs.append((vid, key))
69 |
70 | for vid, key in pairs:
71 | url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key)
72 | data_raw = downloader.read_html(url_api)
73 | data = json.loads(data_raw)
74 | fs = data['videos']['list']
75 | fs = sorted(fs, key=lambda f: f['size'], reverse=True)
76 | video = Image(fs[0]['source'], url_article, len(imgs))
77 | imgs.append(video)
78 |
79 | for img in soup.findAll('img'):
80 | img = Image(urljoin(url_article, img['src']), url, len(imgs))
81 | imgs.append(img)
82 |
83 | info['imgs'] = imgs
84 |
85 | return info
86 |
87 |
88 | class Image:
89 | def __init__(self, url, referer, p):
90 | self.url = LazyUrl(referer, lambda _: url, self)
91 | ext = get_ext(url)
92 | self.filename = '{:04}{}'.format(p, ext)
93 |
--------------------------------------------------------------------------------
/src/extractor/tokyomotion_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Soup, urljoin, Downloader, cut_pair, LazyUrl, clean_title
4 | from timee import sleep
5 | from translator import tr_
6 | from io import BytesIO
7 | import ree as re
8 | import os
9 |
10 |
11 | @Downloader.register
12 | class Downloader_tokyomotion(Downloader):
13 | type = 'tokyomotion'
14 | URLS = ['tokyomotion.net']
15 | single = True
16 | _type = None
17 | display_name = 'TOKYO Motion'
18 |
19 | def init(self):
20 | html = downloader.read_html(self.url)
21 | self.soup = Soup(html)
22 | if '/album/' in self.url:
23 | self._type = 'album'
24 | else:
25 | self._type = 'video'
26 |
27 | @property
28 | def name(self):
29 | title = get_title(self.soup)
30 | return clean_title(title)
31 |
32 | def read(self):
33 | if self._type == 'video':
34 | video = get_video(self.url, self.soup)
35 | self.urls.append(video.url)
36 | self.setIcon(video.thumb)
37 | elif self._type == 'album':
38 | imgs = get_imgs(self.url)
39 | for img in imgs:
40 | self.urls.append(img.url)
41 | self.single = False
42 | else:
43 | raise NotImplementedError('Unknown type: {}'.format(self._type))
44 |
45 | self.title = self.name
46 |
47 |
48 | class Video(object):
49 | def __init__(self, url, url_thumb, referer, filename):
50 | self.url = LazyUrl(referer, lambda x: url, self)
51 | self.url_thumb = url_thumb
52 | self.thumb = BytesIO()
53 | downloader.download(url_thumb, referer=referer, buffer=self.thumb)
54 | self.filename = filename
55 |
56 |
57 | def get_title(soup):
58 | video = soup.find('video', id='vjsplayer')
59 | if video:
60 | title = soup.find('h3').text.strip()
61 | else:
62 | title = soup.find('title').text.split(' Album - ')[0].strip()
63 | return title
64 |
65 |
66 | def get_video(url, soup=None):
67 | if soup is None:
68 | html = downloader.read_html(url)
69 | soup = Soup(html)
70 |
71 | video = soup.find('video', id='vjsplayer').find('source').attrs['src']
72 | url_thumb = soup.find('video', id='vjsplayer').attrs['poster']
73 | title = get_title(soup)
74 | filename = u'{}.mp4'.format(clean_title(title))
75 | video = Video(video, url_thumb, url, filename)
76 | return video
77 |
78 |
79 | class Image(object):
80 | def __init__(self, url, referer):
81 | self.url = LazyUrl(referer, lambda x: url, self)
82 | self.filename = os.path.basename(url.split('?')[0])
83 |
84 |
85 | def get_imgs(url):
86 | id = re.find('album/.*?([0-9]+)', url)
87 | print('id:', id)
88 | url = 'https://www.tokyomotion.net/album/slideshow/{}'.format(id)
89 |
90 | html = downloader.read_html(url)
91 | soup = Soup(html)
92 |
93 | imgs = []
94 | for a in soup.findAll('a', {'data-lightbox': 'slideshow-{}'.format(id)}):
95 | img = a.find('img').attrs['src']
96 | img = img.replace('/tmb/', '/')
97 | img = Image(img, url)
98 | imgs.append(img)
99 |
100 | return imgs
101 |
--------------------------------------------------------------------------------
/src/extractor/nhentai_com_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from __future__ import division, print_function, unicode_literals
3 | import downloader
4 | import ree as re
5 | from utils import Soup, urljoin, LazyUrl, Downloader, try_n, join, clean_title
6 | import os
7 | import json
8 |
9 |
10 | @Downloader.register
11 | class Downloader_nhentai_com(Downloader):
12 | type = 'nhentai_com'
13 | URLS = [r'regex:https?://nhentai.com']
14 | MAX_CORE = 16
15 | display_name = 'nhentai.com'
16 |
17 | def init(self):
18 | self.info = get_info(self.url)
19 | self.url = self.info['url']
20 |
21 | @classmethod
22 | def key_id(cls, url):
23 | url = url.lower()
24 | return re.find(r'/comic/([^/?]+)', url) or url
25 |
26 | def read(self):
27 | info = self.info
28 |
29 | artist = join(info['artists'])
30 | self.artist = artist if info['artists'] else None
31 | group = join(info['groups'])
32 | lang = info['lang'] or 'N/A'
33 | series = info['seriess'][0] if info['seriess'] else 'N/A'
34 | title = self.format_title(info['type'], info['id'], info['title'], artist, group, series, lang)
35 |
36 | for img in info['imgs']:
37 | self.urls.append(img.url)
38 |
39 | self.title = title
40 |
41 |
42 | @LazyUrl.register
43 | class LazyUrl_nhentai_com(LazyUrl):
44 | type = 'nhentai_com'
45 | def dump(self):
46 | referer = self._url
47 | url = self.image.url_img
48 | return {
49 | 'referer': referer,
50 | 'url': url,
51 | 'p': self.image.p,
52 | }
53 | @classmethod
54 | def load(cls, data):
55 | referer = data['referer']
56 | url = data['url']
57 | img = Image(referer, url, data['p'])
58 | return img.url
59 |
60 |
61 | class Image(object):
62 | def __init__(self, url_page, url_img, p):
63 | self.p = p
64 | self.referer = url_page
65 | self.filename = os.path.basename(url_img)
66 | self.url_img = url_img
67 | self.url = LazyUrl_nhentai_com(url_page, lambda _: self.url_img, self)
68 |
69 |
70 | @try_n(4)
71 | def get_info(url):
72 | url = downloader.real_url(url)
73 | q = re.find(r'/comic/([^/?]+)', url)
74 |
75 | url_api = 'https://nhentai.com/api/comics/{}'.format(q)
76 | data_raw = downloader.read_html(url_api, url)
77 | data = json.loads(data_raw)
78 |
79 | url_api = 'https://nhentai.com/api/comics/{}/images'.format(q)
80 | data_raw = downloader.read_html(url_api, url)
81 | data_images = json.loads(data_raw)
82 |
83 | info = {}
84 | info['url'] = url
85 |
86 | info['id'] = int(data['id'])
87 | info['type'] = data['category']['name']
88 | info['title'] = data['title']
89 | info['artists'] = [x['name'] for x in data['artists']]
90 | info['groups'] = [x['name'] for x in data['groups']]
91 | info['seriess'] = [x['name'] for x in data['parodies']]
92 | info['lang'] = data['language']['name']
93 |
94 | imgs = []
95 | for img in data_images['images']:
96 | img = urljoin(url, img['source_url'])
97 | img = Image(url, img, len(imgs))
98 | imgs.append(img)
99 | info['imgs'] = imgs
100 |
101 | return info
102 |
103 |
104 |
--------------------------------------------------------------------------------
/src/extractor/pandoratv_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Session, Soup, LazyUrl, get_print, Downloader, get_ext, try_n, format_filename, clean_title
3 | import ree as re
4 | import json
5 | from io import BytesIO
6 | import errors
7 |
8 |
9 |
10 | class EmbedUrlError(Exception): pass
11 |
12 |
13 | @Downloader.register
14 | class Downloader_pandoratv(Downloader):
15 | type = 'pandoratv'
16 | URLS = ['pandora.tv']
17 | single = True
18 | display_name = 'Pandora TV'
19 |
20 | @classmethod
21 | def fix_url(cls, url):
22 | return url.split('#')[0]
23 |
24 | def read(self):
25 |         video = Video(self.url, cw=self.cw)
26 |         try:
27 |             video.url()
28 | except EmbedUrlError as e:
29 | raise errors.Invalid(e.args[0])
30 |
31 | self.urls.append(video.url)
32 | self.setIcon(video.thumb)
33 |
34 | self.enableSegment()
35 |
36 | self.title = video.title
37 |
38 |
39 |
40 | def extract(name, html, cw=None):
41 | print_ = get_print(cw)
42 | value = re.find(r'''{} *= *['"](.*?)['"]'''.format(name), html)
43 | if value is None:
44 | value = json.loads(re.find(r'''{} *= *(\[.*?\])'''.format(name), html))
45 | print_('{}: {}'.format(name, value))
46 | if value is None:
47 | raise Exception('No {}'.format(name))
48 | return value
49 |
50 |
51 | class Video(object):
52 | _url_video = None
53 |
54 | def __init__(self, url, format='title', cw=None):
55 | self.url = LazyUrl(url, self.get, self)
56 | self.format = format
57 | self.cw = cw
58 |
59 | @try_n(2)
60 | def get(self, url):
61 | if self._url_video:
62 | return self._url_video
63 | cw = self.cw
64 | print_ = get_print(cw)
65 | html = downloader.read_html(url)
66 | soup = Soup(html)
67 |
68 | embedUrl = extract('embedUrl', html, cw)
69 | if embedUrl:
70 | raise EmbedUrlError('[pandoratv] EmbedUrl: {}'.format(embedUrl))
71 |
72 | uid = extract('strLocalChUserId', html, cw)
73 | pid = extract('nLocalPrgId', html, cw)
74 | fid = extract('strFid', html, cw)
75 | resolType = extract('strResolType', html, cw)
76 | resolArr = extract('strResolArr', html, cw)
77 | vodSvr = extract('nVodSvr', html, cw)
78 | resols = extract('nInfo', html, cw)
79 | runtime = extract('runtime', html, cw)
80 |
81 | url_api = 'http://www.pandora.tv/external/getExternalApi/getVodUrl/'
82 | data = {
83 | 'userId': uid,
84 | 'prgId': pid,
85 | 'fid': fid,
86 | 'resolType': resolType,
87 | 'resolArr': ','.join(map(str, resolArr)),
88 | 'vodSvr': vodSvr,
89 | 'resol': max(resols),
90 | 'runtime': runtime,
91 | 'tvbox': 'false',
92 | 'defResol': 'true',
93 | 'embed': 'false',
94 | }
95 | session = Session()
96 | r = session.post(url_api, headers={'Referer': url}, data=data)
97 | data = json.loads(r.text)
98 | self._url_video = data['src']
99 |
100 | self.title = soup.find('meta', {'property': 'og:description'})['content']
101 |
102 | ext = get_ext(self._url_video)
103 | self.filename = format_filename(self.title, pid, ext)
104 |
105 | self.url_thumb = soup.find('meta', {'property': 'og:image'})['content']
106 | self.thumb = BytesIO()
107 | downloader.download(self.url_thumb, buffer=self.thumb)
108 |
109 | return self._url_video
110 |
111 |
--------------------------------------------------------------------------------
/src/extractor/novelpia_downloader.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from urllib.parse import urlparse
3 | from typing import List, cast
4 |
5 | from requests.sessions import session
6 |
7 | from errors import LoginRequired
8 | from utils import Downloader, Soup, Session, clean_title
9 |
10 | from bs4.element import Tag
11 | import requests
12 |
13 |
14 | @Downloader.register
15 | class Downloader_novelpia(Downloader):
16 | type = "novelpia"
17 | URLS = ["novelpia.com"]
18 |
19 | def __get_number(self, url: str) -> str:
20 | return url.replace("/viewer/", "")
21 |
22 | def __get_cookie(self) -> Session:
23 | session = requests.Session()
24 | user_key = Session().cookies.get("USERKEY", domain=".novelpia.com")
25 | login_key = Session().cookies.get("LOGINKEY", domain=".novelpia.com")
26 |
27 | if user_key and login_key:
28 | session.cookies.set("USERKEY", user_key, domain=".novelpia.com")
29 | session.cookies.set("LOGINKEY", login_key, domain=".novelpia.com")
30 | return session
31 |
32 | def init(self) -> None:
33 | self.parsed_url = urlparse(self.url) # url 나눔
34 | self.soup = Soup(requests.get(self.url).text)
35 |
36 | def read(self):
37 | session = self.__get_cookie()
38 | f = BytesIO()
39 |
40 | title_element = self.soup.find("b", {"class": "cut_line_one"})
41 |
42 | if not title_element:
43 | raise LoginRequired
44 |
45 | # Maybe NavigableString?
46 | assert isinstance(title_element, Tag)
47 | self.title = title_element.text
48 |
49 |         # css selector is not working :(
50 | ep_num = self.soup.find(
51 | "span",
52 | {
53 | "style": "background-color:rgba(155,155,155,0.5);padding: 1px 6px;border-radius: 3px;font-size: 11px; margin-right: 3px;"
54 | },
55 | )
56 | assert isinstance(ep_num, Tag)
57 |
58 | ep_name = self.soup.find("span", {"class": "cut_line_one"})
59 | assert isinstance(ep_name, Tag)
60 |
61 |         # Strip the episode number from the episode name for a clean filename.
62 |         # str.replace() returns a new string, so the result must be kept.
63 |         ep_name_text = ep_name.text.replace(ep_num.text, "")
64 |         self.print_(ep_name_text)
65 |         self.print_(ep_num.text)
66 |
67 |         self.filenames[f] = clean_title(f"{ep_num.text}: {ep_name_text}.txt", "safe")
68 |
69 | # https://novelpia.com/viewer/:number:
70 | numbers: List[str] = []
71 | numbers.append(self.__get_number(self.parsed_url[2]))
72 |
73 | # Get real contents
74 | # https://novelpia.com/proc/viewer_data/:number:
75 | # {"s": [{"text": ""}]}
76 | viewer_datas = map(
77 | lambda number: f"https://novelpia.com/proc/viewer_data/{number}", numbers
78 | )
79 | for viewer_data in viewer_datas:
80 | response = session.get(viewer_data)
81 | if response.text:
82 | response = response.json()
83 | for text_dict in response["s"]:
84 | text = text_dict["text"]
85 | if "img" in text:
86 | soup = Soup(text)
87 | img = soup.find("img")
88 | # Maybe NavigableString here too?
89 | assert isinstance(img, Tag)
90 | src = img.attrs["src"]
91 | filename = img.attrs["data-filename"]
92 | f.write(f"[{filename}]".encode("UTF-8"))
93 | self.urls.append(f"https:{src}")
94 | else:
95 | f.write(text_dict["text"].encode("UTF-8"))
96 | f.seek(0)
97 | self.urls.append(f)
98 | else:
99 | raise LoginRequired
100 |
--------------------------------------------------------------------------------
/src/extractor/nozomi_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from urllib.parse import quote
3 | from io import BytesIO
4 | from utils import Downloader, query_url, LazyUrl, get_ext, urljoin, clean_title, check_alive, lock, get_print, get_max_range
5 | import errors
6 | from translator import tr_
7 |
8 |
9 | class Image:
10 |
11 | def __init__(self, id, referer):
12 | self._id = id
13 | self.url = LazyUrl(referer, self.get, self)
14 |
15 | def get(self, referer):
16 | # https://j.nozomi.la/nozomi.js
17 | s_id = str(self._id)
18 | url_post = 'https://j.nozomi.la/post/{}/{}/{}.json'.format(s_id[-1], s_id[-3:-1], self._id)
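        # e.g. id 12345 -> https://j.nozomi.la/post/5/34/12345.json
        # (last digit of the id, then the two digits before it, per nozomi.js above)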
19 | j = downloader.read_json(url_post, referer)
20 | url = urljoin(referer, j['imageurl'])
21 | ext = get_ext(url)
22 | self.filename = '{}{}'.format(self._id, ext)
23 | return url
24 |
25 |
26 | @Downloader.register
27 | class Downloader_nozomi(Downloader):
28 | type = 'nozomi'
29 | URLS = ['nozomi.la']
30 | display_name = 'Nozomi.la'
31 | MAX_CORE = 15
32 | ACC_MTIME = True
33 |
34 | @classmethod
35 | def fix_url(cls, url):
36 | return url.split('#')[0]
37 |
38 | @property
39 | def name(self):
40 | qs = query_url(self.url)
41 | name = qs['q'][0]
42 | if self._popular:
43 | name += ' - Popular'
44 | return name
45 |
46 | def read(self):
47 | if '/post/' in self.url:
48 | raise errors.Invalid(tr_('개별 다운로드는 지원하지 않습니다: {}').format(self.url))
49 | self._popular = 'search-Popular.' in self.url
50 | self.title = clean_title(self.name)
51 | qs = query_url(self.url)
52 | q = qs['q'][0]
53 | for id in get_ids_multi(q, self._popular, self.cw):
54 | img = Image(id, self.url)
55 | self.urls.append(img.url)
56 |
57 |
58 | @lock
59 | def get_ids(q, popular, cw):
60 | check_alive(cw)
61 | if q is None:
62 | if popular:
63 | url_api = 'https://j.nozomi.la/index-Popular.nozomi'
64 | else:
65 | url_api = 'https://j.nozomi.la/index.nozomi'
66 | else:
67 | if popular:
68 | url_api = 'https://j.nozomi.la/nozomi/popular/{}-Popular.nozomi'.format(quote(q))
69 | else:
70 | url_api = 'https://j.nozomi.la/nozomi/{}.nozomi'.format(quote(q))
71 | print(url_api)
72 | f = BytesIO()
73 | downloader.download(url_api, referer='https://nozomi.la/', buffer=f)
74 | data = f.read()
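    # The .nozomi index is a flat array of 4-byte big-endian post ids; the loop
    # below is equivalent to this illustration (assuming len(data) is a multiple of 4):
    # >>> import struct
    # >>> ids = [struct.unpack('>I', data[i:i+4])[0] for i in range(0, len(data), 4)]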
75 | ids = []
76 | for i in range(0, len(data), 4):
77 | crop = data[i:i+4]
78 | id = crop[0]*16777216 + crop[1]*65536 + crop[2]*256 + crop[3]
79 | ids.append(id)
80 | return ids
81 |
82 |
83 | def get_ids_multi(q, popular, cw=None):
84 | print_ = get_print(cw)
85 | max_pid = get_max_range(cw)
86 | qs = q.split(' ')
87 | qs_pos = [q for q in qs if not q.startswith('-')]
88 | qs_neg = [q[1:] for q in qs if q.startswith('-')]
89 | q = qs_pos[0] if qs_pos else None
90 | ids = get_ids(q, popular, cw)
91 | print_('{}: {}'.format(q, len(ids)))
92 |
93 | # Positive
94 | for q in qs_pos[1:]:
95 | ids_ = get_ids(q, popular, cw)
96 | set_ids_ = set(ids_)
97 | ids_old = ids
98 | ids = []
99 | for id in ids_old:
100 | if id in set_ids_:
101 | ids.append(id)
102 | print_('{}: {} ({})'.format(q, len(ids_), len(ids)))
103 |
104 | # Negative
105 | for q in qs_neg:
106 | ids_ = get_ids(q, popular, cw)
107 | set_ids_ = set(ids_)
108 | ids_old = ids
109 | ids = []
110 | for id in ids_old:
111 | if id not in set_ids_:
112 | ids.append(id)
113 | print_('-{}: {} ({})'.format(q, len(ids_), len(ids)))
114 | return ids[:max_pid]
115 |
--------------------------------------------------------------------------------
/src/extractor/flickr_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import downloader
3 | import flickr_api
4 | from timee import sleep
5 | from utils import Downloader, LazyUrl, query_url, clean_title
6 | import os
7 | from translator import tr_
8 | import ree as re
9 | from datetime import datetime
10 | import flickr_auth
11 |
12 |
13 | alphabet = '123456789abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRSTUVWXYZ'
14 | base = len(alphabet)
15 | def b58encode(div, s=''):
16 | if div >= base:
17 | div, mod = divmod(div, base)
18 | return b58encode(div, alphabet[mod] + s)
19 | return alphabet[div] + s
20 | def b58decode(s):
21 | return sum(alphabet.index(c) * pow(base, i) for i, c in enumerate(reversed(s)))
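# The helpers above round-trip a non-negative integer through the custom
# 58-character alphabet; b58encode(int(photo.id)) is what the commented-out
# flic.kr short-link code below relies on. Illustration only:
# >>> b58decode(b58encode(123456789)) == 123456789
# True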
22 |
23 |
24 |
25 | class Image(object):
26 | def __init__(self, photo):
27 | self.photo = photo
28 | self.id = photo.id
29 | self.filename = None
30 |
31 | def f(_=None):
32 | url = photo.getPhotoFile()
33 | #url = 'https://flic.kr/p/{}'.format(b58encode(int(photo.id)))
34 | ext = os.path.splitext(url)[1]
35 | date = datetime.fromtimestamp(int(photo.dateuploaded))
36 | date = u'{:02}-{:02}-{:02}'.format(date.year%100, date.month, date.day)
37 | self.filename = u'[{}] {}{}'.format(date, self.id, ext)
38 | return url
39 | self.url = LazyUrl(u'flickr_{}'.format(self.id), f, self)
40 |
41 |
42 | def find_ps(url):
43 | user = flickr_api.Person.findByUrl(url)
44 | id = re.search('/albums/([0-9]+)', url).groups()[0]
45 | pss = user.getPhotosets()
46 | for ps in pss:
47 | if ps.id == id:
48 | break
49 | else:
50 | raise Exception('Not found photoset id')
51 | return user, ps
52 |
53 |
54 | @Downloader.register
55 | class Downloader_flickr(Downloader):
56 | type = 'flickr'
57 | URLS = ['flickr.com']
58 | _name = None
59 |
60 | def init(self):
61 | if 'flickr.com' in self.url.lower():
62 | self.url = self.url.replace('http://', 'https://')
63 | else:
64 | self.url = 'https://www.flickr.com/people/{}'.format(self.url)
65 |
66 | @property
67 | def name(self):
68 | global pss
69 | if self._name is None:
70 | url = self.url
71 | flickr_auth.get_api(url, self.cw)
72 | if '/albums/' in url:
73 | user, ps = find_ps(url)
74 | self._name = u'{} (flickr_album_{}_{})'.format(ps.title, user.id, ps.id)
75 | else:
76 | user = flickr_api.Person.findByUrl(url)
77 | self._name = u'{} (flickr_{})'.format(user.username, user.id)
78 | return clean_title(self._name)
79 |
80 |
81 | def read(self):
82 | self.title = self.name
83 |
84 | imgs = get_imgs(self.url, self.title, cw=self.cw)
85 |
86 | for img in imgs:
87 | self.urls.append(img.url)
88 |
89 | self.title = self.name
90 |
91 |
92 | def get_imgs(url, title=None, cw=None):
93 | flickr_auth.get_api(title, cw)
94 | if not flickr_auth.isAuth:
95 | raise Exception('No Auth')
96 |
97 |
98 | if '/albums/' in url:
99 | user, ps = find_ps(url)
100 | handle = ps
101 | else:
102 | user = flickr_api.Person.findByUrl(url)
103 | handle = user
104 |
105 | photos = []
106 |
107 | per_page = 500
108 | for page in range(1, 200):
109 | photos_new = handle.getPhotos(per_page=per_page, page=page)
110 | photos += photos_new
111 | if len(photos_new) < per_page:
112 | break
113 |
114 | msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(photos))
115 | if cw:
116 | if not cw.alive:
117 | break
118 | cw.setTitle(msg)
119 | else:
120 | print(msg)
121 |
122 | imgs = []
123 | for photo in photos:
124 | img = Image(photo)
125 | imgs.append(img)
126 |
127 | return imgs
128 |
129 |
--------------------------------------------------------------------------------
/src/extractor/rule34_xxx_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | import ree as re
3 | import os
4 | from utils import Downloader, urljoin, query_url, Soup, get_max_range, get_print, clean_title, try_n
5 | from translator import tr_
6 | try:
7 | from urllib import quote # python2
8 | except:
9 | from urllib.parse import quote # python3
10 | import sys
11 | from timee import sleep
12 | from constants import clean_url
13 | LIMIT = 100
14 |
15 |
16 | def get_tags(url):
17 | url = clean_url(url)
18 | qs = query_url(url)
19 | if 'page=favorites' in url:
20 | id = qs.get('id', ['N/A'])[0]
21 | id = u'fav_{}'.format(id)
22 | else:
23 | tags = qs.get('tags', [])
24 | tags.sort()
25 | id = u' '.join(tags)
26 | if not id:
27 | id = u'N/A'
28 | return id
29 |
30 |
31 | @Downloader.register
32 | class Downloader_rule34_xxx(Downloader):
33 | type = 'rule34_xxx'
34 | URLS = ['rule34.xxx']
35 | MAX_CORE = 8
36 | display_name = 'Rule34.xxx'
37 | _name = None
38 |
39 | @classmethod
40 | def fix_url(cls, url):
41 | if 'rule34.xxx' in url.lower():
42 | url = url.replace('http://', 'https://')
43 | else:
44 | url = url.replace(' ', '+')
45 | while '++' in url:
46 | url = url.replace('++', '+')
47 | url = quote(url)
48 | url = url.replace('%2B', '+')
49 | url = u'https://rule34.xxx/index.php?page=post&s=list&tags={}'.format(url)
50 | return url
51 |
52 | @property
53 | def name(self):
54 | if self._name is None:
55 | tags = get_tags(self.url)
56 | self._name = tags
57 | return clean_title(self._name)
58 |
59 | def read(self):
60 | self.title = self.name
61 |
62 | imgs = get_imgs(self.url, self.name, cw=self.cw)
63 |
64 | for img in imgs:
65 | self.urls.append(img.url)
66 | self.filenames[img.url] = img.filename
67 |
68 | self.title = self.name
69 |
70 |
71 | class Image(object):
72 | def __init__(self, id_, url):
73 | self.url = url
74 | ext = os.path.splitext(url)[1]
75 | self.filename = u'{}{}'.format(id_, ext)
76 |
77 |
78 | def setPage(url, page):
79 | # Always use HTTPS
80 | url = url.replace('http://', 'https://')
81 |
82 | # Change the page
83 | if 'pid=' in url:
84 | url = re.sub('pid=[0-9]*', 'pid={}'.format(page), url)
85 | else:
86 | url += '&pid={}'.format(page)
87 |
88 | return url
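# Illustration of how setPage() rewrites the dapi page index carried in the pid parameter:
# >>> setPage('https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags=foo&pid=0&limit=100', 3)
# 'https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags=foo&pid=3&limit=100'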
89 |
90 |
91 | def get_imgs(url, title=None, cw=None):
92 | url = clean_url(url)
93 | if 's=view' in url and 'page=favorites' not in url:
94 | raise NotImplementedError('Not Implemented')
95 |
96 | if 'page=dapi' not in url.lower():
97 | tags = get_tags(url)
98 | tags = quote(tags, safe='/')
99 | tags = tags.replace('%20', '+')
100 | url = "https://rule34.xxx/index.php?page=dapi&s=post&q=index&tags={}&pid={}&limit={}".format(tags, 0, LIMIT)
101 |
102 | print_ = get_print(cw)
103 |
104 | # Range
105 | max_pid = get_max_range(cw)
106 |
107 | imgs = []
108 | ids = set()
109 | for p in range(500): #1017
110 | url = setPage(url, p)
111 | print_(url)
112 | html = try_n(4, sleep=30)(downloader.read_html)(url) #3340
113 |
114 | soup = Soup(html)
115 | posts = soup.findAll('post')
116 | if not posts:
117 | break
118 | for post in posts:
119 | id_ = post.attrs['id']
120 | if id_ in ids:
121 | print('duplicate:', id_)
122 | continue
123 | ids.add(id_)
124 | url_img = post.attrs['file_url']
125 | img = Image(id_, url_img)
126 | imgs.append(img)
127 | if len(imgs) >= max_pid:
128 | break
129 |
130 | if cw is not None:
131 | if not cw.alive:
132 | break
133 | cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
134 | return imgs
135 |
--------------------------------------------------------------------------------
/src/extractor/likee_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Session, Downloader, get_ext, LazyUrl, get_print
3 | import ree as re
4 | import json
5 | from io import BytesIO
6 | from translator import tr_
7 |
8 |
9 | @Downloader.register
10 | class Downloader_likee(Downloader):
11 | type = 'likee'
12 | URLS = ['likee.video']
13 | single = True
14 | display_name = 'Likee'
15 |
16 | def init(self):
17 | self.session = Session()
18 |
19 | def read(self):
20 | info = get_info(self.url, self.session, self.cw)
21 | self.print_('type: {}'.format(info['type']))
22 | self.artist = info['artist']
23 |
24 | if info['type'] != 'single':
25 | video = self.process_playlist(info['title'], info['videos'])
26 | else:
27 | video = info['videos'][0]
28 | video.url()
29 | self.urls.append(video.url)
30 | self.title = info['title']
31 |
32 | thumb = BytesIO()
33 | downloader.download(video.url_thumb, referer=self.url, buffer=thumb)
34 | self.setIcon(thumb)
35 |
36 |
37 | def get_info(url, session, cw=None):
38 | print_ = get_print(cw)
39 |
40 | info = {}
41 | info['videos'] = []
42 |
43 | if '/video/' in url:
44 | info['type'] = 'single'
45 | video = Video(url, session)
46 | video.url()
47 | info['videos'].append(video)
48 | info['title'] = video.id_
49 | info['artist'] = video.artist
50 | return info
51 |
52 | info['type'] = 'channel'
53 | html = downloader.read_html(url, session=session)
54 | data_raw = html.split('window.data = ')[1].split('};')[0]+'}'
55 | data = json.loads(data_raw)
56 | info['uid'] = data['userinfo']['uid']
57 | info['username'] = data['userinfo']['yyuid']
58 | info['artist'] = data['userinfo']['nick_name']
59 | info['title'] = '{} (likee_{})'.format(info['artist'], info['username'])
60 |
61 | lastPostId = ''
62 | urls = set()
63 | while True:
64 | url_api = 'https://likee.video/official_website/VideoApi/getUserVideo'
65 | r = session.post(url_api, data={'uid': info['uid'], 'count': '30', 'lastPostId': lastPostId})
66 | data = json.loads(r.text)
67 |
68 | videos = data['data']['videoList']
69 | if not videos:
70 | break
71 |
72 | for data in videos:
73 | url_post = 'https://likee.video/@{}/video/{}'.format(data['likeeId'], data['postId'])
74 | if url_post in urls:
75 | print_('duplicate: {}'.format(url_post))
76 | continue
77 | urls.add(url_post)
78 | video = Video(url_post, session, data)
79 | video.url()
80 | info['videos'].append(video)
81 | lastPostId = data['postId']
82 |
83 | msg = '{} {} - {}'.format(tr_('읽는 중...'), info['title'], len(info['videos']))
84 | if cw:
85 | if not cw.alive:
86 | return
87 | cw.setTitle(msg)
88 | else:
89 | print(msg)
90 |
91 | return info
92 |
93 |
94 | class Video(object):
95 | def __init__(self, url, session, data=None):
96 | self.id_ = re.find('/video/([0-9]+)', url, err='no id')
97 | self._session = session
98 | self._data = data
99 | self.url = LazyUrl(url, self.get, self)
100 |
101 | def get(self, url):
102 | if self._data:
103 | video = self._data
104 | else:
105 | url_api = 'https://likee.video/official_website/VideoApi/getVideoInfo'
106 | r = self._session.post(url_api, data={'postIds': str(self.id_)})
107 |
108 | data = json.loads(r.text)
109 | video = data['data']['videoList'][0]
110 |
111 | url_video = video['videoUrl']
112 | self.url_thumb = video['coverUrl']
113 | self.artist = video['nickname']
114 | ext = get_ext(url_video)
115 | self.title = self.id_
116 | self.filename = '{}{}'.format(self.id_, ext)
117 |
118 | return url_video
119 |
120 |
--------------------------------------------------------------------------------
/src/extractor/nhentai_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from __future__ import division, print_function, unicode_literals
3 | import downloader
4 | import ree as re
5 | from utils import Soup, urljoin, LazyUrl, Downloader, try_n, join, get_ext
6 | import os
7 | import json
8 |
9 |
10 | @Downloader.register
11 | class Downloader_nhentai(Downloader):
12 | type = 'nhentai'
13 | URLS = ['nhentai.net']
14 | MAX_CORE = 16
15 | display_name = 'nhentai'
16 |
17 | def init(self):
18 | self.url = 'https://nhentai.net/g/{}/'.format(self.id_)
19 |
20 | @property
21 | def id_(self):
22 | try:
23 | return int(self.url)
24 | except:
25 | return int(re.find('/g/([0-9]+)', self.url))
26 |
27 | def read(self):
28 | info, imgs = get_imgs(self.id_)
29 |
30 | # 1225
31 | artist = join(info.artists)
32 | self.artist = artist if info.artists else None
33 | group = join(info.groups)
34 | lang = info.lang or 'N/A'
35 | series = info.seriess[0] if info.seriess else 'N/A'
36 | title = self.format_title(info.type, info.id, info.title, artist, group, series, lang)
37 |
38 | for img in imgs:
39 | self.urls.append(img.url)
40 |
41 | self.title = title
42 |
43 |
44 | @LazyUrl.register
45 | class LazyUrl_nhentai(LazyUrl):
46 | type = 'nhentai'
47 | def dump(self):
48 | referer = self._url
49 | url = self.image.url_img
50 | return {
51 | 'referer': referer,
52 | 'url': url,
53 | 'p': self.image.p,
54 | }
55 | @classmethod
56 | def load(cls, data):
57 | referer = data['referer']
58 | url = data['url']
59 | img = Image(referer, url, data['p'])
60 | return img.url
61 |
62 |
63 | class Image(object):
64 | def __init__(self, url_page, url_img, p):
65 | self.p = p
66 | self.url = LazyUrl_nhentai(url_page, lambda _: url_img, self)
67 | self.filename = '{:04}{}'.format(p, get_ext(url_img))
68 |
69 |
70 | class Info(object):
71 | def __init__(self, host, id, id_media, title, p, artists, groups, seriess, lang, type, formats):
72 | self.host = host
73 | self.id = id
74 | self.id_media = id_media
75 | self.title = title
76 | self.p = p
77 | self.artists = artists
78 | self.groups = groups
79 | self.seriess = seriess
80 | self.lang = lang
81 | self.type = type
82 | self.formats = formats
83 |
84 |
85 | @try_n(4)
86 | def get_info(id):
87 | url = 'https://nhentai.net/g/{}/1/'.format(id)
88 | referer = 'https://nhentai.net/g/{}/'.format(id)
89 | html = downloader.read_html(url, referer=referer)
90 |
91 | data = html.split('JSON.parse(')[1].split(');')[0]
92 | gal = json.loads(json.loads(data))
93 | host = 'https://i.nhentai.net'#re.find('''media_url: *['"]([^'"]+)''', html, err='no host')
94 |
95 | id = int(gal['id'])
96 | id_media = int(gal['media_id'])
97 | title = gal['title']['english']
98 | p = len(gal['images']['pages'])
99 | artists = []
100 | groups = []
101 | seriess = []
102 | for tag in gal['tags']:
103 | type = tag['type']
104 | if type == 'artist':
105 | artists.append(tag['name'])
106 | elif type == 'group':
107 | groups.append(tag['name'])
108 | elif type == 'parody' and tag['name'] != 'original':
109 | seriess.append(tag['name'])
110 | elif type == 'language':
111 | lang = tag['name']
112 | elif type == 'category':
113 | type_ = tag['name']
114 | formats = []
115 | for img in gal['images']['pages']:
116 | type = img['t']
117 | format = {'j':'jpg', 'p':'png', 'g':'gif'}[type]
118 | formats.append(format)
119 | info = Info(host, id, id_media, title, p, artists, groups, seriess, lang, type_, formats)
120 | return info
121 |
122 |
123 | def get_imgs(id):
124 | info = get_info(id)
125 |
126 | imgs = []
127 | for p in range(1, info.p+1):
128 | name = '/galleries/{}/{}.{}'.format(info.id_media, p, info.formats[p-1])
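        # e.g. media id 987654, page 3, format 'j' -> https://i.nhentai.net/galleries/987654/3.jpg
        # (the media id here is a hypothetical value for illustration)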
129 | url_page = 'https://nhentai.net/g/{}/{}/'.format(id, p)
130 | url_img = urljoin(info.host, name)
131 | img = Image(url_page, url_img, p)
132 | imgs.append(img)
133 |
134 | return info, imgs
135 |
136 |
137 |
--------------------------------------------------------------------------------
/src/extractor/nico_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | import nndownload
4 | from io import BytesIO
5 | import ree as re
6 | from utils import Downloader, get_print, compatstr, format_filename, try_n, LazyUrl, get_abr
7 | import utils
8 | from nico_login import login, logout
9 | import ffmpeg
10 | import os
11 | import errors
12 |
13 |
14 | def get_id(url):
15 | if '/watch/' in url:
16 | return re.find('/watch/([a-zA-Z0-9]+)', url)
17 |
18 |
19 | class Video(object):
20 | def __init__(self, session, info, format, cw):
21 | self.session = session
22 | self.info = info
23 | self.title = info['title']
24 | self.ext = info['ext']
25 | self.id = info['id']
26 | self.format = format
27 | self.username = info['uploader']
28 | self.url = LazyUrl('https://www.nicovideo.jp/watch/{}'.format(self.id), lambda _: info['url'], self, pp=self.pp)
29 | self.cw = cw
30 |
31 | self.filename = format_filename(self.title, self.id, self.ext)
32 |
33 | self.url_thumb = info['thumbnail_url']
34 | print('thumb:', self.url_thumb)
35 | self.thumb = BytesIO()
36 | downloader.download(self.url_thumb, buffer=self.thumb)
37 |
38 | def pp(self, filename):
39 | if self.format == 'mp4':
40 | return
41 | name, ext_old = os.path.splitext(filename)
42 | filename_new = '{}.mp3'.format(name)
43 | ffmpeg.convert(filename, filename_new, '-shortest -preset ultrafast -b:a {}k'.format(get_abr()), cw=self.cw)
44 |
45 | if utils.ui_setting.albumArt.isChecked():
46 | self.thumb.seek(0)#
47 | ffmpeg.add_cover(filename_new, self.thumb, {'artist':self.username, 'title':self.title}, cw=self.cw)
48 |
49 | return filename_new
50 |
51 | def __repr__(self):
52 | return u'Video({})'.format(self.id)
53 |
54 |
55 | def suitable(url):
56 | if 'live.nico' in url: #3986
57 | return False
58 | if 'nicovideo.jp' not in url.lower():
59 | return False
60 | return get_id(url) is not None
61 |
62 |
63 | @Downloader.register
64 | class Downloader_nico(Downloader):
65 | type = 'nico'
66 | single = True
67 | URLS = [suitable]
68 | display_name = 'Niconico'
69 | _format = 'mp4'
70 | MAX_SPEED = 2.0
71 |
72 | @classmethod
73 | def fix_url(cls, url):
74 | id_ = get_id(url)
75 | if re.find(r'^https?://', id_):
76 | return url
77 | if re.find(r'^https?://', url):
78 | domain = utils.domain(url)
79 | else:
80 | domain = 'www.nicovideo.jp'
81 | return 'https://{}/watch/{}'.format(domain, id_)
82 |
83 | def read(self):
84 | ui_setting = self.ui_setting
85 | if self.cw.format:
86 | self._format = self.cw.format
87 |
88 | if self._format == 'mp3':
89 | self.cw.setMusic(True)
90 |
91 | if ui_setting.nicoBox.isChecked():
92 | username = compatstr(ui_setting.nico_id.text())
93 | password = compatstr(ui_setting.nico_pw.text())
94 | else:
95 | username = ''
96 | password = ''
97 |
98 | try:
99 | session = login(username, password)
100 | except Exception as e:
101 | logout()
102 | raise errors.Invalid(u'Failed to login: {}'.format(self.url), fail=True)
103 |
104 | self.session = session
105 | try:
106 | video = get_video(session, self.url, self._format, self.cw)
107 | except Exception as e:
108 | logout()
109 | raise
110 |
111 | self.urls.append(video.url)
112 | self.setIcon(video.thumb)
113 |
114 | self.enableSegment()
115 |
116 | self.title = video.title
117 |
118 |
119 | @try_n(2)
120 | def get_video(session, url, format, cw=None):
121 | print_ = get_print(cw)
122 |
123 | id = get_id(url)
124 | if 'live.nico' in url: #3986
125 | raise NotImplementedError('nama')
126 | #info = nndownload.request_nama(session, id)
127 | else:
128 | info = nndownload.request_video(session, id)
129 | video = Video(session, info, format, cw)
130 |
131 | return video
132 |
133 |
134 | import selector
135 | @selector.options('nico')
136 | def options():
137 | return [
138 | {'text': 'MP4 (동영상)', 'format': 'mp4'},
139 | {'text': 'MP3 (음원)', 'format': 'mp3'},
140 | ]
141 |
--------------------------------------------------------------------------------
/src/extractor/hanime_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Session, Downloader, get_outdir, try_n, Soup, format_filename, clean_title, get_print, get_resolution
3 | import ree as re, json
4 | from io import BytesIO
5 | import os
6 | from timee import time
7 | from m3u8_tools import M3u8_stream
8 | from random import randrange
9 |
10 |
11 | class Video(object):
12 |
13 | def __init__(self, info, stream):
14 | self.info = info
15 | self.id = info['id']
16 | self.title = info['name']
17 | self.brand = info['brand']
18 | self.url = stream['url']
19 | self.url_thumb = info['poster_url']
20 | self.thumb = BytesIO()
21 | downloader.download(self.url_thumb, buffer=self.thumb)
22 | ext = os.path.splitext(self.url.split('?')[0].split('#')[0])[1]
23 | if ext.lower() == '.m3u8':
24 | print('read m3u8:', self.url)
25 | ext = '.mp4'
26 | self.url = M3u8_stream(self.url, n_thread=4)
27 | else:
28 | size = downloader.get_size(self.url)
29 | if size <= 0:
30 | raise Exception('Size is 0')
31 | self.filename = format_filename('[{}] {}'.format(self.brand, self.title), self.id, ext)
32 |
33 | def __repr__(self):
34 | return ('Video({})').format(self.id)
35 |
36 |
37 | @Downloader.register
38 | class Downloader_hanime(Downloader):
39 | type = 'hanime'
40 | URLS = ['hanime.tv/hentai-videos/', 'hanime.tv/videos/']
41 | single = True
42 | display_name = 'hanime.tv'
43 |
44 | def read(self):
45 | video, session = get_video(self.url, cw=self.cw)
46 | self.video = video
47 |
48 | self.urls.append(video.url)
49 | self.filenames[video.url] = video.filename
50 |
51 | self.setIcon(video.thumb)
52 | self.title = u'[{}] {}'.format(video.brand, video.title)
53 |
54 |
55 | @try_n(8)
56 | def get_video(url, session=None, cw=None):
57 | print_ = get_print(cw)
58 | if session is None:
59 | session = Session()
60 | session.headers['User-Agent'] = downloader.hdr['User-Agent']
61 | session.headers['X-Directive'] = 'api'
62 | html = downloader.read_html(url, session=session)
63 | soup = Soup(html)
64 | for script in soup.findAll('script'):
65 | script = script.text or script.string or ''
66 | data = re.find('window.__NUXT__=(.+)', script)
67 | if data is not None:
68 | data = data.strip()
69 | if data.endswith(';'):
70 | data = data[:-1]
71 | data = json.loads(data)
72 | break
73 | else:
74 | raise Exception('No __NUXT__')
75 |
76 | info = data['state']['data']['video']['hentai_video']
77 | query = info['slug']
78 | #url_api = 'https://members.hanime.tv/api/v3/videos_manifests/{}?'.format(query) # old
79 | url_api = 'https://hanime.tv/rapi/v7/videos_manifests/{}?'.format(query) # new
80 | print(url_api)
81 | hdr = {
82 | 'x-signature': ''.join('{:x}'.format(randrange(16)) for i in range(32)),
83 | 'x-signature-version': 'web2',
84 | 'x-time': str(int(time())),
85 | }
86 | r = session.get(url_api, headers=hdr)
87 | print(r)
88 | data = json.loads(r.text)
89 | streams = []
90 | for server in data['videos_manifest']['servers']:
91 | streams += server['streams']
92 |
93 | streams_good = []
94 | for stream in streams:
95 | url_video = stream['url']
96 | if not url_video or 'deprecated.' in url_video:
97 | continue
98 | stream['height'] = int(stream['height'])
99 | streams_good.append(stream)
100 |
101 | if not streams_good:
102 | raise Exception('No video available')
103 | print('len(streams_good):', len(streams_good))
104 | res = get_resolution()
105 |
106 | def print_stream(stream):
107 | print_([stream['extension'], stream['height'], stream['filesize_mbs'], stream['url']])
108 |
109 |     streams_filtered = []
110 |     for stream in streams_good:
111 |         print_stream(stream)
112 |         if stream['height'] <= res: #3712
113 |             streams_filtered.append(stream)
114 |
115 |     if streams_filtered:
116 |         stream = sorted(streams_filtered, key=lambda _: _['height'])[-1]
117 | else:
118 | stream = sorted(streams_good, key=lambda _: _['height'])[0]
119 |
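    # Net effect: pick the largest stream whose height fits within get_resolution();
    # if nothing fits, fall back to the smallest stream available.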
120 | print_('Final stream:')
121 | print_stream(stream)
122 | return Video(info, stream), session
123 |
124 |
125 |
126 |
--------------------------------------------------------------------------------
/src/extractor/kakuyomu_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | import utils
4 | from utils import Soup, urljoin, Downloader, LazyUrl, get_outdir, try_n, clean_title, get_print
5 | import os
6 | from timee import sleep
7 | from io import BytesIO
8 | from translator import tr_
9 |
10 |
11 |
12 | class Page(object):
13 | def __init__(self, url, title, date, p):
14 | self.url = url
15 | self.title = clean_title(u'[{:04}] {}'.format(p, title), n=-4)
16 | self.date = date
17 | self.filename = u'{}.txt'.format(self.title)
18 | self.file = LazyUrl(self.url, self.get_file, self)
19 |
20 | def get_file(self, url):
21 | text = get_text(self)
22 | f = BytesIO()
23 | f.write(text.encode('utf8'))
24 | f.seek(0)
25 | #f.mode = 'w'
26 | return f
27 |
28 |
29 | @Downloader.register
30 | class Downloader_kakuyomu(Downloader):
31 | type = 'kakuyomu'
32 | URLS = ['kakuyomu.jp']
33 | MAX_CORE = 2
34 | detect_removed = False
35 | display_name = 'カクヨム'
36 |
37 | def init(self):
38 | self.info = get_info(self.url, cw=self.cw)
39 |
40 | def read(self):
41 | outdir = get_outdir('kakuyomu')
42 |
43 | self.artist = self.info['artist']
44 | title_dir = clean_title(u'[{}] {}'.format(self.artist, self.info['title']))
45 |
46 | for page in self.info['pages']:
47 | file = os.path.join(outdir, title_dir, page.filename)
48 | if os.path.isfile(file):
49 | self.urls.append(file)
50 | else:
51 | self.urls.append(page.file)
52 |
53 | self.title = title_dir
54 |
55 | def post_processing(self):
56 | names = self.cw.names
57 | filename = clean_title(u'[merged] [{}] {}'.format(self.artist, self.info['title']), n=-4) + '.txt'
58 | filename = os.path.join(self.dir, filename)
59 | try:
60 | with utils.open(filename, 'wb') as f:
61 | f.write(u' {}\n\n \u4f5c\u8005\uff1a{}\n\n\n'.format(self.info['title'], self.artist).encode('utf8'))
62 | f.write(self.info['description'].encode('utf8'))
63 | for i, file in enumerate(names):
64 | self.cw.pbar.setFormat('[%v/%m] {} [{}/{}]'.format(tr_(u'\ubcd1\ud569...'), i, len(names)))
65 | with open(file, 'rb') as f_:
66 | text = f_.read()
67 | f.write(b'\n\n\n\n')
68 | f.write(text)
69 | finally:
70 | self.cw.pbar.setFormat('[%v/%m]')
71 |
72 |
73 | @try_n(4, sleep=30)
74 | def get_text(page):
75 | html = downloader.read_html(page.url)
76 | soup = Soup(html)
77 | view = soup.find('div', class_='widget-episodeBody')
78 | story = view.text.strip()
79 | text =u'''────────────────────────────────
80 |
81 | ◆ {} {}
82 |
83 | ────────────────────────────────
84 |
85 |
86 | {}'''.format(page.title, page.date, story)
87 | return text
88 |
89 |
90 | def get_info(url, soup=None, cw=None):
91 | print_ = get_print(cw)
92 | if soup is None:
93 | html = downloader.read_html(url)
94 | soup = Soup(html)
95 |
96 | info = {}
97 |
98 | info['title'] = soup.find('h1', id='workTitle').text.strip()
99 | info['artist'] = soup.find('span', id='workAuthor-activityName').text.strip()
100 |
101 | desc = soup.find('section', id='description')
102 | button = desc.find('span', class_='ui-truncateTextButton-expandButton')
103 | if button:
104 | print('decompose button')
105 | button.decompose()
106 | catch = desc.find('span', id='catchphrase-body')
107 | if catch is None: #4445
108 | print_('no catch')
109 | catch = ''
110 | else:
111 | catch = catch.text.strip()
112 | intro = desc.find('p', id='introduction')
113 | if intro is None: #4262
114 | print_('no intro')
115 | intro = ''
116 | else:
117 | intro = intro.text.strip()
118 | desc = u' {}{}'.format(catch, ('\n\n\n'+intro) if intro else '')
119 | info['description'] = desc
120 |
121 | pages = []
122 | for a in soup.findAll('a', class_='widget-toc-episode-episodeTitle'):
123 | href = urljoin(url, a.attrs['href'])
124 | subtitle = a.find('span', class_='widget-toc-episode-titleLabel').text.strip()
125 | date = a.find('time', class_='widget-toc-episode-datePublished').text.strip()
126 | page = Page(href, subtitle, date, len(pages)+1)
127 | pages.append(page)
128 |
129 | info['pages'] = pages
130 |
131 | return info
132 |
133 |
--------------------------------------------------------------------------------
/src/extractor/webtoon_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Soup, LazyUrl, clean_title, get_ext, get_imgs_already, urljoin, try_n, Downloader
3 | import os
4 | import page_selector
5 | from translator import tr_
6 | import ree as re
7 |
8 |
9 |
10 | @Downloader.register
11 | class Downloader_webtoon(Downloader):
12 | type = 'webtoon'
13 | URLS = ['webtoon.com', 'webtoons.com']
14 | MAX_CORE = 8
15 | MAX_SPEED = 4.0
16 | display_name = 'WEBTOON'
17 |
18 | def init(self):
19 | self.url = get_main(self.url)
20 | self.soup = downloader.read_soup(self.url)
21 |
22 | @classmethod
23 | def fix_url(cls, url):
24 | return url.replace('webtoon.com', 'webtoons.com')
25 |
26 | def read(self):
27 | title = clean_title(self.soup.find('h1').text.strip())
28 | self.title = tr_(u'\uc77d\ub294 \uc911... {}').format(title)
29 | imgs = get_imgs_all(self.url, title, cw=self.cw)
30 | for img in imgs:
31 | if isinstance(img, Image):
32 | self.urls.append(img.url)
33 | else:
34 | self.urls.append(img)
35 |
36 | self.title = title
37 |
38 |
39 | class Page(object):
40 |
41 | def __init__(self, url, title):
42 | self.url = url
43 | self.title = title
44 |
45 |
46 | class Image(object):
47 |
48 | def __init__(self, url, page, p):
49 | ext = get_ext(url) or downloader.get_ext(url, referer=page.url)
50 | self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, ext)
51 |
52 | self.url = LazyUrl(page.url, lambda _: url, self)
53 |
54 |
55 | @try_n(2)
56 | def get_imgs(page):
57 | html = downloader.read_html(page.url)
58 | if 'window.__motiontoonViewerState__' in html:
59 | raise NotImplementedError('motiontoon')
60 | soup = Soup(html)
61 | view = soup.find('div', class_='viewer_img')
62 | imgs = []
63 | for img in view.findAll('img'):
64 | src = img.get('data-url') or img['src']
65 | img = Image(urljoin(page.url, src), page, len(imgs))
66 | imgs.append(img)
67 | return imgs
68 |
69 |
70 | def get_main(url):
71 | if 'episode_no=' in url:
72 | soup = downloader.read_soup(url)
73 | url = urljoin(url, soup.find('div', class_='subj_info').find('a')['href'])
74 | return url
75 |
76 |
77 | def set_page(url, p):
78 | if '&page=' not in url:
79 | url = url + '&page={}'.format(p)
80 | else:
81 | url = re.sub('&page=[0-9]+', '&page={}'.format(p), url)
82 | if p == 1:
83 | url = url.replace('&page=1', '')
84 | return url
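# Illustration only (hypothetical URL): later pages carry &page=N, page 1 is the bare list URL.
# >>> set_page('https://www.webtoons.com/en/example/list?title_no=1&page=3', 2)
# 'https://www.webtoons.com/en/example/list?title_no=1&page=2'
# >>> set_page('https://www.webtoons.com/en/example/list?title_no=1&page=3', 1)
# 'https://www.webtoons.com/en/example/list?title_no=1'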
85 |
86 |
87 | def get_pages(url):
88 | pages = []
89 | urls = set()
90 | for p in range(1, 101):
91 | url_page = set_page(url, p)
92 | print(url_page)
93 | for try_ in range(4):
94 | try:
95 | soup = downloader.read_soup(url_page)
96 | view = soup.find('ul', id='_listUl')
97 | if view is None:
98 | raise Exception('no view')
99 | break
100 | except Exception as e:
101 | e_ = e
102 | print(e)
103 | else:
104 | raise e_
105 | pages_new = []
106 | for li in view.findAll('li', recursive=False):
107 | href = urljoin(url, li.find('a')['href'])
108 | title = li.find('span', class_='subj').text.strip()
109 | if href in urls:
110 | continue
111 | urls.add(href)
112 | no = int(li['data-episode-no'])
113 | title = '{:04} - {}'.format(no, title)
114 | page = Page(href, title)
115 | pages_new.append(page)
116 | if not pages_new:
117 | break
118 | pages += pages_new
119 | return pages[::-1]
120 |
121 |
122 | @page_selector.register('webtoon')
123 | @try_n(4)
124 | def f(url):
125 | url = get_main(url)
126 | return get_pages(url)
127 |
128 |
129 | def get_imgs_all(url, title, cw=None):
130 | pages = get_pages(url)
131 | pages = page_selector.filter(pages, cw)
132 | imgs = []
133 | for p, page in enumerate(pages):
134 | imgs_already = get_imgs_already('webtoon', title, page, cw)
135 | if imgs_already:
136 | imgs += imgs_already
137 | continue
138 | imgs += get_imgs(page)
139 | msg = tr_(u'\uc77d\ub294 \uc911... {} / {} ({}/{})').format(title, page.title, p + 1, len(pages))
140 | if cw is not None:
141 | cw.setTitle(msg)
142 | if not cw.alive:
143 | break
144 | else:
145 | print(msg)
146 |
147 | return imgs
148 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | [](https://github.com/KurtBestor/Hitomi-Downloader/releases/latest)
7 | [](https://github.com/KurtBestor/Hitomi-Downloader/releases/latest)
8 | [](https://github.com/KurtBestor/Hitomi-Downloader/releases)
9 |
10 | ## Links
11 | - [Download](https://github.com/KurtBestor/Hitomi-Downloader/releases/latest)
12 | - [Issues](https://github.com/KurtBestor/Hitomi-Downloader/issues)
13 | - [Scripts](https://github.com/KurtBestor/Hitomi-Downloader/wiki/Scripts)
14 | - [Chrome Extension](https://github.com/KurtBestor/Hitomi-Downloader/wiki/Chrome-Extension)
15 |
16 | ## Demo
17 |
18 |
19 | ## Features
20 | - 🍰 Simple and clear user interface
21 | - 🚀 Download acceleration
22 | - 💻 Supports 24 threads in a single task
23 | - 🚥 Supports speed limit
24 | - 📜 Supports user scripts
25 | - 🧲 Supports BitTorrent & Magnet
26 | - 🎞️ Supports M3U8 & MPD format videos
27 | - 🌙 Dark mode
28 | - 🧳 Portable
29 | - 📋 Clipboard monitor
30 | - 🗃️ Easy to organize tasks
31 |
32 | ## Supported Sites
33 | | Site | URL |
34 | | :--: | -- |
35 | | **4chan** | |
36 | | **AfreecaTV** | |
37 | | **ArtStation** | |
38 | | **AsianSister** | |
39 | | **AsmHentai** | |
40 | | **Avgle** | |
41 | | **baraag.net** | |
42 | | **半次元** | |
43 | | **BDSMlr** | |
44 | | **bilibili** | |
45 | | **ComicWalker** | |
46 | | **Coub** | |
47 | | **Danbooru** | |
48 | | **Kakao Webtoon** | |
49 | | **DeviantArt** | |
50 | | **E(x)Hentai Galleries** | |
51 | | **Facebook** | |
52 | | **FC2 Video** | |
53 | | **Flickr** | |
54 | | **Gelbooru** | |
55 | | **Hameln** | |
56 | | **hanime.tv** | |
57 | | **Hentai Foundry** | |
58 | | **Hitomi.la** | |
59 | | **Hiyobi.me** | |
60 | | **Imgur** | |
61 | | **Instagram** | |
62 | | **Iwara** | |
63 | | **Jmana** | |
64 | | **カクヨム** | |
65 | | **LHScan** | |
66 | | **Likee** | |
67 | | **Luscious** | |
68 | | **MyReadingManga** | |
69 | | **Naver Blog** | |
70 | | **Naver Cafe** | |
71 | | **Naver Post** | |
72 | | **Naver Webtoon** | |
73 | | **Naver TV** | |
74 | | **nhentai** | |
75 | | **nhentai.com** | |
76 | | **Niconico** | |
77 | | **ニジエ** | |
78 | | **Nozomi.la** | |
79 | | **Pawoo** | |
80 | | **Pinterest** | |
81 | | **Pixiv** | |
82 | | **pixivコミック** | |
83 | | **Pornhub** | |
84 | | **Rule34.xxx** | |
85 | | **Sankaku Complex** | |
86 | | **Soundcloud** | |
87 | | **小説家になろう** | |
88 | | **TOKYO Motion** | |
89 | | **Tumblr** | |
90 | | **Twitch** | |
91 | | **Twitter** | |
92 | | **Vimeo** | |
93 | | **V LIVE** | |
94 | | **Wayback Machine** | |
95 | | **Weibo** | |
96 | | **WikiArt** | |
97 | | **xHamster** | |
98 | | **XNXX** | |
99 | | **XVideos** | |
100 | | **Yande.re** | |
101 | | **Youku** | |
102 | | **YouTube** | |
103 | | **and more...** | [Supported sites by youtube-dl](http://ytdl-org.github.io/youtube-dl/supportedsites.html) |
104 |
--------------------------------------------------------------------------------
/src/extractor/comicwalker_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Soup, LazyUrl, urljoin, try_n, Downloader, get_print, clean_title, get_imgs_already
4 | import ree as re
5 | from itertools import cycle
6 | from io import BytesIO
7 | import json
8 | from timee import sleep
9 | from translator import tr_
10 | import page_selector
11 | import os
12 |
13 |
14 | # https://static.comic-walker.com/viewer/cw-viewer.min.js
15 | def decode(s, hash):
16 | # generateKey
17 | key = int(hash[:16], 16)
18 |
19 |     key_bytes = [(key>>i*8)%256 for i in range(8)][::-1]
20 |     s2 = bytes(x^y for x, y in zip(s, cycle(key_bytes)))
21 | return s2
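# A minimal sanity check for decode(), using a made-up hash: the key is the
# first 16 hex digits of drm_hash, applied as a repeating XOR mask, and XOR is
# its own inverse, so decoding twice should return the original bytes.
#   data = b'\x00\x01\x02\x03'
#   assert decode(decode(data, '00ff00ff00ff00ff'), '00ff00ff00ff00ff') == data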
22 |
23 |
24 | class Image(object):
25 | def __init__(self, src, hash, p, page):
26 | def f(_):
27 | f = BytesIO()
28 | downloader.download(src, referer=page.url, buffer=f)
29 | s = f.read()
30 | s2 = decode(s, hash)
31 | f.seek(0)
32 | f.write(s2)
33 | f.seek(0)
34 | return f
35 | self.url = LazyUrl(page.url, f, self)
36 | self.filename = u'{}/{:04}.jpg'.format(page.title, p)
37 |
38 |
39 | class Page(object):
40 | def __init__(self, url, title):
41 | self.url = url
42 | self.title = clean_title(title)
43 |
44 |
45 | @Downloader.register
46 | class Downloader_comicwalker(Downloader):
47 | type = 'comicwalker'
48 | URLS = ['comic-walker.com/contents/detail/', 'comic-walker.jp/contents/detail/']
49 | MAX_CORE = 4
50 | display_name = 'ComicWalker'
51 | _soup = None
52 | pages = None
53 |
54 | @property
55 | def soup(self):
56 | if self._soup is None:
57 | html = downloader.read_html(self.url)
58 | self._soup = Soup(html)
59 | return self._soup
60 |
61 | def read(self):
62 | cw = self.cw
63 | title = get_title(self.soup, cw)
64 |
65 | self.imgs = get_imgs(self.url, self.soup, cw)
66 |
67 | for img in self.imgs:
68 | if isinstance(img, Image):
69 | self.urls.append(img.url)
70 | else:
71 | self.urls.append(img)
72 |
73 | self.title = title
74 |
75 |
76 | def get_imgs_page(page):
77 | cid = re.find('[?&]cid=([a-zA-Z0-9_]+)', page.url)
78 | url_api = 'https://ssl.seiga.nicovideo.jp/api/v1/comicwalker/episodes/{}/frames'.format(cid)
79 |
80 | html = downloader.read_html(url_api, referer=page.url)
81 |
82 | meta = json.loads(html)
83 | data = meta['data']
84 | imgs = []
85 | for item in data['result']:
86 | src = item['meta']['source_url']
87 | hash = item['meta']['drm_hash']
88 | img = Image(src, hash, len(imgs), page)
89 | imgs.append(img)
90 |
91 | return imgs
92 |
93 |
94 | def get_pages(url, soup=None):
95 | if soup is None:
96 | html = downloader.read_html(url)
97 | soup = Soup(html)
98 |
99 | pages = []
100 | for item in soup.findAll('div', class_='acBacknumber-item-leftbox'):
101 | item = item.parent
102 | a = item.find('a')
103 | title = a.attrs['title']
104 | href = a.attrs['href']
105 | href = urljoin(url, href)
106 | page = Page(href, title)
107 | pages.append(page)
108 |
109 | return pages
110 |
111 |
112 | def get_title(soup, cw=None):
113 | print_ = get_print(cw)
114 | for h1 in soup.findAll('h1'):
115 | title = h1.text.strip()
116 | if title:
117 | break
118 | else:
119 | raise Exception('no title')
120 | title_clean = clean_title(title)
121 | print_('get_title: "{}"({}) "{}"({})'.format(title, title.encode('utf8'), title_clean, title_clean.encode('utf8')))
122 | return title_clean
123 |
124 |
125 | @page_selector.register('comicwalker')
126 | @try_n(4)
127 | def f(url):
128 | if '/viewer/' in url:
129 | raise Exception(tr_(u'목록 주소를 입력해주세요'))
130 | pages = get_pages(url)
131 | return pages
132 |
133 |
134 | def get_imgs(url, soup=None, cw=None):
135 | if soup is None:
136 | html = downloader.read_html(url)
137 | soup = Soup(html)
138 |
139 | title = get_title(soup, cw)
140 |
141 | pages = get_pages(url, soup)
142 | pages = page_selector.filter(pages, cw)
143 |
144 | imgs = []
145 | for i, page in enumerate(pages):
146 | imgs_already = get_imgs_already('comicwalker', title, page, cw)
147 | if imgs_already:
148 | imgs += imgs_already
149 | continue
150 |
151 | if cw is not None:
152 | if not cw.alive:
153 | return
154 | cw.setTitle(u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages)))
155 |
156 | imgs += get_imgs_page(page)
157 |
158 | return imgs
159 |
160 |
--------------------------------------------------------------------------------
/src/extractor/hameln_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf8
2 | from __future__ import division, print_function, unicode_literals
3 | import downloader
4 | import os
5 | import utils
6 | from utils import Soup, urljoin, get_text, LazyUrl, try_n, Downloader, lazy, clean_title
7 | import ree as re
8 | from io import BytesIO
9 | from timee import sleep
10 | from translator import tr_
11 |
12 |
13 |
14 | @Downloader.register
15 | class Downloader_hameln(Downloader):
16 | type = 'hameln'
17 | URLS = ['syosetu.org']
18 | MAX_CORE = 2
19 | detect_removed = False
20 |
21 | def init(self):
22 | id_ = re.find('/novel/([^/]+)', self.url)
23 | if id_ is not None:
24 | self.url = 'https://syosetu.org/novel/{}/'.format(id_)
25 |
26 | @lazy
27 | def soup(self):
28 | html = read_html(self.url)
29 | soup = Soup(html)
30 | return soup
31 |
32 | @lazy
33 | def info(self):
34 | return get_info(self.url, self.soup)
35 |
36 | def read(self):
37 | for page in get_pages(self.url, self.soup):
38 | text = Text(page, len(self.urls)+1)
39 | self.urls.append(text.url)
40 |
41 | self.artist = self.info['artist']
42 | self.title = clean_title('[{}] {}'.format(self.artist, self.info['title']), n=-len('[merged] .txt'))
43 |
44 | def post_processing(self):
45 | names = self.cw.names
46 | filename = os.path.join(self.dir, '[merged] {}.txt'.format(self.title))
47 | try:
48 | with utils.open(filename, 'wb') as f:
49 | f.write(' {}\n\n 作者:{}\n\n\n'.format(self.info['title'], self.artist).encode('utf8'))
50 | if self.info['novel_ex']:
51 | f.write(self.info['novel_ex'].encode('utf8'))
52 | for i, file in enumerate(names):
53 | self.cw.pbar.setFormat('[%v/%m] {} [{}/{}]'.format(tr_('병합...'), i, len(names)))
54 | with open(file, 'rb') as f_:
55 | text = f_.read()
56 | f.write(b'\n\n\n\n')
57 | f.write(text)
58 | finally:
59 | self.cw.pbar.setFormat('[%v/%m]')
60 |
61 |
62 | class Text(object):
63 | def __init__(self, page, p):
64 | self.page = page
65 | self.url = LazyUrl(page.url, self.get, self)
66 | self.filename = clean_title('[{:04}] {}'.format(p, page.title), n=-4) + '.txt'
67 |
68 | def get(self, url):
69 | text = read_page(self.page)
70 | f = BytesIO()
71 | f.write(text.encode('utf8'))
72 | f.seek(0)
73 | return f
74 |
75 |
76 | class Page(object):
77 | def __init__(self, title, url):
78 | self.title = clean_title(title)
79 | self.url = url
80 |
81 |
82 |
83 | def read_html(url):
84 | return downloader.read_html(url, cookies={'over18': 'off'}, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'})
85 |
86 |
87 | def get_sss(soup):
88 |     sss = [ss for ss in soup.findAll('div', class_='ss') if ss.attrs.get('id') != 'fmenu']
89 | return sss
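# The 'ss' divs appear to be the main blocks of a syosetu.org novel page:
# get_pages() below reads the chapter list from the last one (sss[-1]) and
# get_info() reads the synopsis from the one before it (sss[-2]); the block
# with id 'fmenu' looks like a menu, so it is skipped here.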
90 |
91 |
92 | def get_pages(url, soup=None):
93 | if soup is None:
94 | html = read_html(url)
95 | soup = Soup(html)
96 |
97 | sss = get_sss(soup)
98 |     chapter_list = sss[-1]
99 |
100 |     pages = []
101 |     for tr in chapter_list.findAll('tr'):
102 | a = tr.find('a')
103 | if a is None:
104 | continue
105 |         text = a.text.strip()
106 | href = urljoin(url, a.attrs['href'])
107 | page = Page(text, href)
108 | pages.append(page)
109 |
110 | return pages
111 |
112 |
113 | @try_n(22, sleep=30)
114 | def read_page(page):
115 | html = read_html(page.url)
116 | soup = Soup(html)
117 |
118 | text_top = get_text(soup.find('div', id='maegaki'))
119 | print(text_top.count('\n'))
120 | text_mid = get_text(soup.find('div', id='honbun'))
121 | text_bot = get_text(soup.find('div', id='atogaki'))
122 |
123 | texts = [text for text in (text_top, text_mid, text_bot) if text]
124 |
125 | story = '''
126 |
127 | ────────────────────────────────
128 |
129 | '''.join(texts)
130 |
131 | text = '''────────────────────────────────
132 |
133 | ◆ {}
134 |
135 | ────────────────────────────────
136 |
137 |
138 | {}'''.format(page.title, story)
139 |
140 | return text
141 |
142 |
143 | def get_info(url, soup=None):
144 | if soup is None:
145 | html = read_html(url)
146 | soup = Soup(html)
147 |
148 | info = {}
149 | info['artist'] = soup.find('span', {'itemprop':'author'}).text.strip()
150 | info['title'] = soup.find('span', {'itemprop':'name'}).text.strip()
151 | sss = get_sss(soup)
152 | info['novel_ex'] = get_text(sss[-2], '')
153 | return info
154 |
155 |
--------------------------------------------------------------------------------
/src/extractor/imgur_downloader.py:
--------------------------------------------------------------------------------
1 | # uncompyle6 version 3.5.0
2 | # Python bytecode 2.7 (62211)
3 | # Decompiled from: Python 2.7.16 (v2.7.16:413a49145e, Mar 4 2019, 01:30:55) [MSC v.1500 32 bit (Intel)]
4 | # Embedded file name: imgur_downloader.pyo
5 | # Compiled at: 2019-10-07 05:58:14
6 | import downloader
7 | from utils import Downloader, Soup, try_n, urljoin, get_max_range, clean_title, cut_pair
8 | import ree as re, json, os
9 | from timee import sleep
10 | from translator import tr_
11 |
12 | @Downloader.register
13 | class Downloader_imgur(Downloader):
14 | type = 'imgur'
15 | URLS = ['imgur.com']
16 | MAX_CORE = 16
17 |
18 | def init(self):
19 | self.info = get_info(self.url)
20 |
21 | @property
22 | def id_(self):
23 | return re.find('imgur.com/.+?/([0-9a-zA-Z]+)', self.url)
24 |
25 | @property
26 | def name(self):
27 | title = self.info['title'] or 'N/A'
28 | return clean_title(title, n=100)
29 |
30 | def read(self):
31 | imgs = get_imgs(self.url, self.info, self.cw)
32 | for img in imgs:
33 | ext = os.path.splitext(img.split('?')[0])[1]
34 | if len(imgs) > 1:
35 | self.filenames[img] = (u'{:04}{}').format(len(self.urls), ext)
36 | else:
37 | self.filenames[img] = clean_title(self.name, n=-len(ext)) + ext
38 | self.urls.append(img)
39 |
40 | self.single = len(imgs) == 1
41 | self.referer = self.url
42 | self.title = u'{} (imgur_{})'.format(self.name, self.id_)
43 |
44 |
45 | @try_n(4)
46 | def get_info(url):
47 | url = url.replace('/gallery/', '/a/')
48 | if '/r/' in url and url.split('/r/')[1].strip('/').count('/') == 0:
49 | title = re.find(r'/r/([^/]+)', url)
50 | info = {}
51 | info['title'] = title
52 | info['type'] = 'r'
53 | else:
54 | try: # legacy
55 | html = downloader.read_html(url, cookies={'over18':'1'})
56 | s = re.find('image *: *({.+)', html)
57 | info_raw = cut_pair(s)
58 | except Exception as e: # new
59 | print(e)
60 | id_ = re.find(r'/a/([0-9a-zA-Z_]+)', url) or re.find(r'/r/[0-9a-zA-Z_]+/([0-9a-zA-Z_]+)', url, err='no id')
61 | url_api = 'https://api.imgur.com/post/v1/albums/{}?client_id=546c25a59c58ad7&include=media%2Cadconfig%2Caccount'.format(id_)
62 | info_raw = downloader.read_html(url_api, cookies={'over18':'1'})
63 | info = json.loads(info_raw)
64 | info['type'] = 'a'
65 | return info
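# get_info() seems to cover two page generations: older pages embed an
# `image : {...}` JSON object directly in the HTML (the "legacy" branch),
# while newer albums are fetched from the album API, roughly
#   https://api.imgur.com/post/v1/albums/<album id>?client_id=...&include=media
# Either way the result is a dict whose 'media' / 'album_images' entries (or
# the single image itself) are iterated by get_imgs() below.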
66 |
67 |
68 | def get_imgs(url, info=None, cw=None):
69 | print('get_imgs', url)
70 | if info is None:
71 | info = get_info(url)
72 | imgs = []
73 |
74 | # Range
75 | max_pid = get_max_range(cw)
76 |
77 | if info['type'] == 'a':
78 | if 'album_images' in info: # legacy
79 | imgs_ = info['album_images']['images']
80 | elif 'media' in info: # new
81 | imgs_ = info['media']
82 | else: # legacy
83 | imgs_ = [info]
84 |
85 | for img in imgs_:
86 | img_url = img.get('url') # new
87 | if not img_url: # legacy
88 | hash = img['hash']
89 | ext = img['ext']
90 | img_url = 'https://i.imgur.com/{}{}'.format(hash, ext)
91 | if img_url in imgs:
92 | continue
93 | imgs.append(img_url)
94 |
95 | elif info['type'] == 'r':
96 | urls = set()
97 | for p in range(100):
98 | url_api = 'https://imgur.com/r/{}/new/page/{}/hit?scrolled'.format(info['title'], p)
99 | print(url_api)
100 | html = downloader.read_html(url_api, referer=url)
101 | soup = Soup(html)
102 |
103 | c = 0
104 | for post in soup.findAll('div', class_='post'):
105 | a = post.find('a', class_='image-list-link')
106 | url_post = urljoin(url, a.attrs['href'])
107 | if url_post in urls:
108 | continue
109 | urls.add(url_post)
110 | c += 1
111 |
112 | try: # for r18 images
113 | imgs += get_imgs(url_post)
114 | except Exception as e:
115 | print(e)
116 |
117 | s = (u'{} {} ({})').format(tr_(u'\uc77d\ub294 \uc911...'), info['title'], len(imgs))
118 | if cw is not None:
119 | if cw.alive:
120 | cw.setTitle(s)
121 | else:
122 | return []
123 | else:
124 | print(s)
125 |
126 | if c == 0:
127 | print('same; break')
128 | break
129 |
130 | return imgs
131 |
132 |
--------------------------------------------------------------------------------
/src/extractor/discord_emoji_downloader.py:
--------------------------------------------------------------------------------
1 | # coding: UTF-8
2 | # title: Discord server custom emoji downloader
3 | # author: SaidBySolo
4 |
5 | """
6 | MIT License
7 |
8 | Copyright (c) 2020 SaidBySolo
9 |
10 | Permission is hereby granted, free of charge, to any person obtaining a copy
11 | of this software and associated documentation files (the "Software"), to deal
12 | in the Software without restriction, including without limitation the rights
13 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14 | copies of the Software, and to permit persons to whom the Software is
15 | furnished to do so, subject to the following conditions:
16 |
17 | The above copyright notice and this permission notice shall be included in all
18 | copies or substantial portions of the Software.
19 |
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26 | SOFTWARE.
27 | """
28 | from utils import Downloader, clean_title
29 | import requests
30 | import errors
31 |
32 |
33 | @Downloader.register
34 | class DownloaderDiscordEmoji(Downloader):
35 | type = "discord"
36 |
37 | def init(self):
38 | pass
39 |
40 | def read(self):
41 | token_guild_id_list = self.url.split(
42 | "/"
43 |         ) # Not sure how the input will be given, so just split it for now: discord_email/password/guild_id or discord_token/guild_id.
44 |
45 | if len(token_guild_id_list) == 2:
46 | token = token_guild_id_list[0]
47 | guild_id = token_guild_id_list[1]
48 | elif len(token_guild_id_list) == 3:
49 | email = token_guild_id_list[0]
50 | password = token_guild_id_list[1]
51 | guild_id = token_guild_id_list[2]
52 |
53 | response = self.post_account_info(email, password)
54 | account_info = response.json()
55 | if response.status_code == 400:
56 | if account_info.get("captcha_key"):
57 | raise errors.Invalid(
58 | "먼저 웹 또는 디스코드 앱에서 로그인하신후 캡차를 인증해주세요."
59 |                     ) # Returning a message box here stalls the task, so raise instead.
60 | else:
61 | raise errors.Invalid("이메일 또는 비밀번호가 잘못되었습니다. 확인후 다시 시도해주세요.")
62 | else:
63 | if not account_info["token"]:
64 | raise errors.Invalid("토큰을 받아오지 못했어요. 2단계인증을 사용중이신경우 토큰을 이용해 요청해주세요.")
65 | else:
66 | token = account_info["token"]
67 | else:
68 | raise errors.Invalid("인자값이 더 많이왔어요.")
69 |
70 |         guild_info_response = self.get_emoji_list(token, int(guild_id)) # GET request with the token
71 |
72 | if guild_info_response.status_code != 200:
73 | raise errors.Invalid("정상적인 토큰이 아니거나 서버를 찾을수없어요. 맞는 토큰인지, 해당 서버에 접속해있는지 확인해주세요.")
74 | else:
75 | guild_info = guild_info_response.json()
76 |
77 | if guild_info["emojis"]:
78 | base_url = "https://cdn.discordapp.com/emojis/"
79 |             for emoji in guild_info["emojis"]: # iterate over the emoji list
80 |                 if emoji["animated"] is True: # animated emojis are downloaded as gif
81 |                     param = emoji["id"] + ".gif"
82 |                 else: # otherwise as png
83 |                     param = emoji["id"] + ".png"
84 |
85 | self.title = clean_title(
86 |                     f'{guild_info["name"]}({guild_info["id"]})' # folder name: guild name and id
87 |                 )
88 |                 self.urls.append(base_url + param + "?v=1") # join the URL parts
89 | else:
90 | raise errors.Invalid("해당 서버에는 이모지가 없어요")
91 |
92 |     def get_emoji_list(self, token: str, guild_id: int) -> requests.Response:
93 | response = requests.get(
94 | f"https://discordapp.com/api/v6/guilds/{guild_id}",
95 | headers={"Authorization": token},
96 | )
97 | if response.status_code == 401:
98 | response = requests.get(
99 | f"https://discordapp.com/api/v6/guilds/{guild_id}",
100 | headers={"Authorization": f"Bot {token}"},
101 | )
102 |
103 | return response
104 |
105 |     def post_account_info(self, email: str, password: str) -> requests.Response:
106 | response = requests.post(
107 | "https://discordapp.com/api/v8/auth/login",
108 | json={
109 | "email": email,
110 | "password": password,
111 | "undelete": False,
112 | "captcha_key": None,
113 | "login_source": None,
114 | "gift_code_sku_id": None,
115 | },
116 | )
117 |
118 | return response
119 |
--------------------------------------------------------------------------------
/src/extractor/bdsmlr_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Session, Soup, LazyUrl, Downloader, get_max_range, try_n, get_print, clean_title
4 | from datetime import datetime
5 | import ree as re
6 | import os
7 | from translator import tr_
8 | from timee import sleep
9 | from error_printer import print_error
10 | import clf2
11 | import errors
12 |
13 |
14 | @Downloader.register
15 | class Downloader_bdsmlr(Downloader):
16 | type = 'bdsmlr'
17 | URLS = ['bdsmlr.com']
18 | display_name = 'BDSMlr'
19 |
20 | def init(self):
21 | if u'bdsmlr.com/post/' in self.url:
22 | raise errors.Invalid(tr_(u'개별 다운로드는 지원하지 않습니다: {}').format(self.url))
23 |
24 | self.url = 'https://{}.bdsmlr.com'.format(self.id_)
25 | self.session = Session()
26 | clf2.solve(self.url, session=self.session, cw=self.cw)
27 |
28 | @property
29 | def id_(self):
30 | url = self.url
31 | if 'bdsmlr.com' in url:
32 | if 'www.bdsmlr.com' in url:
33 | raise Exception('www.bdsmlr.com')
34 | gal_num = url.split('.bdsmlr.com')[0].split('/')[(-1)]
35 | else:
36 | gal_num = url
37 | return gal_num
38 |
39 | def read(self):
40 | info = get_imgs(self.id_, session=self.session, cw=self.cw)
41 |
42 | for post in info['posts']:
43 | self.urls.append(post.url)
44 |
45 | self.title = u'{} (bdsmlr_{})'.format(clean_title(info['username']), self.id_)
46 |
47 |
48 | class Post(object):
49 | def __init__(self, url, referer, id, p):
50 | self.id = id
51 | self.url = LazyUrl(referer, lambda x: url, self)
52 | ext = os.path.splitext(url)[1]
53 | self.filename = u'{}_p{}{}'.format(id, p, ext)
54 |
55 |
56 | def foo(url, soup, info, reblog=False):
57 | #print('foo', info['c'], len(info['ids']))
58 | for post in soup.findAll('div', class_='wrap-post'):
59 | try:
60 | id = int(re.find('[0-9]+', post.attrs['class'][1]))
61 | except Exception as e:
62 | print(print_error(e)[-1])
63 | continue
64 | if id in info['ids']:
65 | continue
66 | info['ids'].add(id)
67 | info['last'] = id
68 | if not reblog and post.find('div', class_='ogname'):
69 | continue
70 | for p, mag in enumerate(post.findAll(['a', 'div'], class_='magnify')):
71 | post = Post(mag.attrs['href'], url, id, p)
72 | info['posts'].append(post)
73 | info['c'] += 20 if info['c'] else 5
74 |
75 |
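# Sketch of the scroll API used below, as read from this code rather than any
# documented endpoint: the first batch is POSTed to /loadfirst and later
# batches to /infinitepb2/<user>, with form fields 'scroll', 'timenow' and
# (after the first batch) 'last', plus the page's CSRF token in the
# X-CSRF-TOKEN header; foo() above then pulls the full-size links out of each
# returned HTML fragment.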
76 | @try_n(2)
77 | def get_imgs(user_id, session, cw=None):
78 | print_ = get_print(cw)
79 | url = 'https://{}.bdsmlr.com/'.format(user_id)
80 | info = {'c': 0, 'posts': [], 'ids': set()}
81 |
82 | html = downloader.read_html(url, session=session)
83 | soup = Soup(html)
84 |
85 | sorry = soup.find('div', class_='sorry')
86 | if sorry:
87 | raise Exception(sorry.text.strip())
88 |
89 |         username = soup.find('title').text.strip() # use the page <title> as the username
90 | print('username:', username)
91 | info['username'] = username
92 |
93 | token = soup.find('meta', {'name': 'csrf-token'}).attrs['content']
94 | print_(u'token: {}'.format(token))
95 |
96 | max_pid = get_max_range(cw)
97 |
98 | n = len(info['ids'])
99 | for p in range(1000):
100 | if p == 0:
101 | url_api = 'https://{}.bdsmlr.com/loadfirst'.format(user_id)
102 | else:
103 | url_api = 'https://{}.bdsmlr.com/infinitepb2/{}'.format(user_id, user_id)
104 | data = {
105 | 'scroll': str(info['c']),
106 | 'timenow': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
107 | }
108 | if 'last' in info:
109 | data['last'] = str(info['last'])
110 | print_(u'n:{}, scroll:{}, last:{}'.format(len(info['posts']), data['scroll'], data.get('last')))
111 | headers = {
112 | 'Referer': url,
113 | 'X-CSRF-TOKEN': token,
114 | }
115 | for try_ in range(4):
116 | try:
117 | r = session.post(url_api, data=data, headers=headers)
118 | if p == 0:
119 | r.raise_for_status()
120 | break
121 | except Exception as e:
122 | print(e)
123 | else:
124 | raise
125 | soup = Soup(r.text)
126 | foo(url, soup, info)
127 | if len(info['ids']) == n:
128 | print('same; break')
129 | break
130 | n = len(info['ids'])
131 |
132 |         s = u'{} {} (bdsmlr_{}) - {}'.format(tr_(u'읽는 중...'), username, user_id, len(info['posts']))
133 | if cw is not None:
134 | if not cw.alive:
135 | return
136 | cw.setTitle(s)
137 | else:
138 | print(s)
139 |
140 | if len(info['posts']) > max_pid:
141 | break
142 |
143 | return info
144 |
145 |
--------------------------------------------------------------------------------
/src/extractor/nijie_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import downloader
3 | from utils import Downloader, urljoin, get_max_range, query_url, Soup, Session, LazyUrl, get_print, clean_title, try_n, get_ext
4 | from translator import tr_
5 | from constants import clean_url
6 | import ree as re
7 | from errors import LoginRequired
8 |
9 |
10 | def get_id(url):
11 | return re.find('id=([0-9]+)', url)
12 |
13 |
14 | def get_name(soup):
15 | return soup.find('p', class_='user_icon').find('a', class_='name').text.strip()
16 |
17 |
18 | def isLogin(soup):
19 | if soup.find('ul', id="sub-menu"):
20 | return True
21 | return False
22 |
23 |
24 | @Downloader.register
25 | class Downloader_nijie(Downloader):
26 | type = 'nijie'
27 | URLS = ['nijie.info']
28 | MAX_CORE = 4
29 | display_name = 'ニジエ'
30 |
31 | def init(self):
32 | if 'members.php' not in self.url and 'members_illust.php' not in self.url:
33 | raise NotImplementedError()
34 | id = get_id(self.url)
35 | html = downloader.read_html('https://nijie.info/members.php?id={}'.format(id))
36 | self.soup = Soup(html)
37 |
38 | if not isLogin(self.soup):
39 | raise LoginRequired()
40 |
41 | @classmethod
42 | def fix_url(cls, url):
43 | if 'nijie.info' not in url.lower():
44 | url = 'https://nijie.info/members.php?id={}'.format(url)
45 | return url.replace('http://', 'https://')
46 |
47 | @property
48 | def name(self):
49 | name = u'{} (nijie_{})'.format(get_name(self.soup), get_id(self.url))
50 | return clean_title(name)
51 |
52 | def read(self):
53 | self.title = self.name
54 |
55 | imgs = get_imgs(self.url, self.name, cw=self.cw)
56 |
57 | for img in imgs:
58 | self.urls.append(img.url)
59 |
60 | self.title = self.name
61 |
62 |
63 |
64 | class Image(object):
65 | def __init__(self, id, url, p, lazy=True, img=None):
66 | self.id = id
67 | self.p = p
68 | if lazy:
69 | self.url = LazyUrl(url, self.get_single, self)
70 | else:
71 | self.url = LazyUrl(url, lambda _:img, self)
72 | ext = get_ext(img)
73 | self.filename = '{}_p{}{}'.format(id, p, ext)
74 |
75 | def get_single(self, url): # single
76 | img = get_imgs_post(self.id, url)[0].url()
77 | ext = get_ext(img)
78 | self.filename = '{}_p{}{}'.format(self.id, self.p, ext)
79 | return img
80 |
81 |
82 | @try_n(8, sleep=10)
83 | def get_imgs_post(id, url):
84 | #print('get_imgs_post', id, url)
85 | html = downloader.read_html(url)
86 | soup = Soup(html)
87 | view = soup.find('div', id='gallery')
88 | imgs = []
89 | for img in view.findAll(class_='mozamoza'):
90 | url_img = urljoin(url, img['src'])
91 | url_img = re.sub('__rs_l[0-9]+x[0-9]+/', '', url_img)
92 | img = Image(id, url, len(imgs), False, url_img)
93 | imgs.append(img)
94 | return imgs
95 |
96 |
97 | def setPage(url, page):
98 | # Always use HTTPS
99 | url = url.replace('http://', 'https://')
100 |
101 | # Change the page
102 | if 'p=' in url:
103 | url = re.sub('p=[0-9]*', 'p={}'.format(page), url)
104 | else:
105 | url += '&p={}'.format(page)
106 |
107 | return url
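# Example with a made-up id: setPage('https://nijie.info/members_illust.php?id=44', 2)
# should return 'https://nijie.info/members_illust.php?id=44&p=2', and calling
# setPage() again on that result replaces the existing p= value instead of
# appending a second one.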
108 |
109 |
110 | def get_imgs(url, title=None, cw=None):
111 | print_ = get_print(cw)
112 | url = clean_url(url)
113 |
114 | id = get_id(url)
115 | url = u'https://nijie.info/members_illust.php?id={}'.format(id)
116 |
117 | # Range
118 | max_pid = get_max_range(cw)
119 |
120 | imgs = []
121 | url_imgs = set()
122 | for p in range(1, 1+100):
123 | url = setPage(url, p)
124 | print_(url)
125 | html = downloader.read_html(url)
126 |
127 | soup = Soup(html)
128 | posts = soup.findAll('div', class_='nijie')
129 | if not posts:
130 | print('no posts')
131 | break
132 | c = 0
133 | for post in posts:
134 | url_img = urljoin(url, post.a.attrs['href'])
135 | if url_img in url_imgs:
136 | print('duplicate:', url_img)
137 | continue
138 | url_imgs.add(url_img)
139 | id = int(re.find('[?&]id=([0-9]+)', url_img))
140 | multi = post.find('div', class_='thumbnail-icon')
141 | if multi:
142 | imgs_ = get_imgs_post(id, url_img)#
143 | else:
144 | imgs_ = [Image(id, url_img, 0)]
145 |
146 | imgs += imgs_
147 | c += 1
148 |
149 | if len(imgs) >= max_pid:
150 | break
151 |
152 | msg = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
153 | if cw:
154 | if not cw.alive:
155 | return
156 | cw.setTitle(msg)
157 | else:
158 | print(msg)
159 |
160 | if len(imgs) >= max_pid or c == 0:
161 | break
162 | return imgs
163 |
164 |
165 |
--------------------------------------------------------------------------------
/src/extractor/hf_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Soup, urljoin, Session, LazyUrl, Downloader, lazy, try_n, clean_title
4 | import ree as re
5 | import os
6 | from translator import tr_
7 | from timee import sleep
8 | URL_ENTER = 'https://www.hentai-foundry.com/site/index?enterAgree=1&size=1550'
9 | URL_FILTER = 'https://www.hentai-foundry.com/site/filters'
10 |
11 |
12 | class Image(object):
13 | def __init__(self, url, session):
14 | @try_n(4)
15 | def f(_):
16 | html = downloader.read_html(url, session=session)
17 | soup = Soup(html)
18 |
19 | box = soup.find('section', id='picBox')
20 | img = box.find('img')
21 | if img is None:
22 | raise Exception('No img')
23 |
24 | onclick = img.attrs.get('onclick', '')
25 | if onclick and '.src' in onclick:
26 | print('onclick', onclick)
27 | img = re.find('''.src *= *['"](.+?)['"]''', onclick)
28 | else:
29 | img = img.attrs['src']
30 | img = urljoin(url, img)
31 |
32 | filename = clean_title(os.path.basename(img.split('?')[0]))
33 | name, ext = os.path.splitext(filename)
34 |
35 | # https://www.hentai-foundry.com/pictures/user/DrGraevling/74069/Eversong-Interrogation-pg.-13
36 | if ext.lower() not in ['.bmp', '.png', '.gif', '.jpg', '.jpeg', '.webp', '.webm', '.avi', '.mp4', '.mkv', '.wmv']:
37 | filename = u'{}.jpg'.format(name)
38 |
39 | self.filename = filename
40 | return img
41 | self.url = LazyUrl(url, f, self)
42 |
43 |
44 | def get_username(url):
45 | if 'user/' in url:
46 | username = url.split('user/')[1].split('?')[0].split('/')[0]
47 | return username
48 |
49 |
50 | @Downloader.register
51 | class Downloader_hf(Downloader):
52 | type = 'hf'
53 | URLS = ['hentai-foundry.com']
54 | MAX_CORE = 16
55 | display_name = 'Hentai Foundry'
56 |
57 | def init(self):
58 | self.session = enter()
59 |
60 | @classmethod
61 | def fix_url(cls, url):
62 | username = get_username(url)
63 | return 'https://www.hentai-foundry.com/user/{}'.format(username)
64 |
65 | def read(self):
66 | username = get_username(self.url)
67 | self.title = username
68 |
69 | imgs = get_imgs(username, self.title, self.session, cw=self.cw)
70 |
71 | for img in imgs:
72 | self.urls.append(img.url)
73 |
74 | self.title = username
75 |
76 |
77 | @try_n(2)
78 | def enter():
79 | print('enter')
80 | session = Session()
81 |
82 | r = session.get(URL_ENTER)
83 |
84 | # 862
85 | html = r.text
86 | soup = Soup(html)
87 | box = soup.find('aside', id='FilterBox')
88 | data = {}
89 | for select in box.findAll('select'):
90 | name = select.attrs['name']
91 | value = select.findAll('option')[-1].attrs['value']
92 | print(name, value)
93 | data[name] = value
94 | for input in box.findAll('input'):
95 | name = input.attrs['name']
96 | value = input.attrs['value']
97 | if name.startswith('rating_') or 'CSRF_TOKEN' in name:
98 | print(name, value)
99 | data[name] = value
100 | data.update({
101 | 'filter_media': 'A',
102 | 'filter_order': 'date_new',
103 | 'filter_type': '0',
104 | })
105 | r = session.post(URL_FILTER, data=data, headers={'Referer': r.url})
106 | print(r)
107 |
108 | return session
109 |
110 |
111 | def get_imgs(username, title, session, cw=None):
112 | url = 'https://www.hentai-foundry.com/pictures/user/{}'.format(username)
113 |
114 | #downloader.read_html(url_enter, session=session)
115 |
116 | hrefs = []
117 | for p in range(100):
118 | print(url)
119 | html = downloader.read_html(url, session=session)
120 | soup = Soup(html)
121 |
122 | if soup.find('div', id='entryButtonContainer'):
123 | session = enter()
124 | continue
125 |
126 | tab = soup.find('a', class_='active')
127 | n = re.find(r'\(([0-9]+)', tab.text)
128 |
129 | view = soup.find('div', class_='galleryViewTable')
130 | for a in view.findAll('a', class_='thumbLink'):
131 | href = urljoin(url, a.attrs['href'])
132 | if href in hrefs:
133 | print('dup')
134 | continue
135 | hrefs.append(href)
136 |
137 | next = soup.find(lambda tag: tag.name == 'li' and tag.get('class') == ['next'])
138 | if next is None:
139 | break
140 | url = urljoin(url, next.a.attrs['href'])
141 |
142 | s = u'{} {} ({} / {})'.format(tr_(u'읽는 중...'), title, len(hrefs), n)
143 | if cw:
144 | if not cw.alive:
145 | return []
146 | cw.setTitle(s)
147 | else:
148 | print(s)
149 |
150 | imgs = []
151 | for href in hrefs:
152 | img = Image(href, session)
153 | imgs.append(img)
154 |
155 | return imgs
156 |
157 |
--------------------------------------------------------------------------------
/src/extractor/luscious_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Soup, Downloader, LazyUrl, urljoin, try_n, get_outdir, clean_title
4 | import ree as re
5 | import os
6 | from timee import sleep
7 | from translator import tr_
8 | from io import BytesIO
9 | import json
10 |
11 |
12 | class Image(object):
13 | def __init__(self, item, referer):
14 | self.item = item
15 | self.id = str(item['id'])
16 | self.referer = referer
17 | self.url = LazyUrl(referer, self.get, self)
18 |
19 | def get(self, url):
20 | img = urljoin(url, self.item['url_to_original'])
21 | ext = os.path.splitext(img.split('?')[0])[1]
22 | self.filename = u'{}{}'.format(self.id, ext)
23 | return img
24 |
25 |
26 | class Video(object):
27 | def __init__(self, url, title, url_thumb):
28 | self.url = url
29 | self.title = title
30 | ext = os.path.splitext(url.split('?')[0])[1]
31 | self.filename = u'{}{}'.format(clean_title(title), ext)
32 | self.url_thumb = url_thumb
33 | self.thumb = BytesIO()
34 | downloader.download(self.url_thumb, buffer=self.thumb)
35 |
36 |
37 | @Downloader.register
38 | class Downloader_luscious(Downloader):
39 | type = 'luscious'
40 | URLS = ['luscious.net']
41 | MAX_CORE = 4
42 |
43 | @classmethod
44 | def fix_url(cls, url):
45 | url = url.replace('members.luscious.', 'www.luscious.')
46 | return url
47 |
48 | def read(self):
49 | for try_ in range(8):
50 | try:
51 | html = downloader.read_html(self.url)
52 | break
53 | except Exception as e:
54 | e_ = e
55 | self.print_error(e)
56 | self.print_('retry...')
57 | else:
58 | raise e_
59 | soup = Soup(html)
60 | title = clean_title(get_title(soup))
61 |
62 | self.title = tr_(u'읽는 중... {}').format(title)
63 |
64 | if '/videos/' in self.url:
65 | video = get_video(self.url, soup)
66 | imgs = [video]
67 | self.setIcon(video.thumb)
68 | else:
69 | imgs = get_imgs(self.url, soup, self.cw)
70 |
71 | dir = os.path.join(get_outdir(self.type), title)
72 | names = {}
73 | try:
74 | for name in os.listdir(dir):
75 | id = os.path.splitext(name)[0]
76 | names[id] = name
77 |         except: # the output directory may not exist yet
78 | pass
79 |
80 | for img in imgs:
81 | if img.id in names:
82 | url = os.path.join(dir, names[img.id])
83 | else:
84 | url = img.url
85 | self.urls.append(url)
86 |
87 |         self.title = title
88 |
89 |
90 | def update(cw, title, imgs):
91 | s = u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs))
92 | if cw is not None:
93 | cw.setTitle(s)
94 | else:
95 | print(s)
96 |
97 | def get_imgs(url, soup=None, cw=None):
98 | if soup is None:
99 | html = downloader.read_html(url)
100 | soup = Soup(html)
101 | title = get_title(soup)
102 |
103 | imgs = []
104 | for p in range(1, 81):
105 | imgs_new = get_imgs_p(url, p)
106 | if not imgs_new:
107 | break
108 | imgs += imgs_new
109 | update(cw, title, imgs)
110 | return imgs
111 |
112 |
113 | @try_n(4, sleep=30)
114 | def get_imgs_p(url, p=1):
115 | id = re.find('/albums/[^/]+?([0-9]+)/', url+'/')
116 | print(url, id)
117 | url_api = 'https://api.luscious.net/graphql/nobatch/?operationName=AlbumListOwnPictures&query=+query+AlbumListOwnPictures%28%24input%3A+PictureListInput%21%29+%7B+picture+%7B+list%28input%3A+%24input%29+%7B+info+%7B+...FacetCollectionInfo+%7D+items+%7B+...PictureStandardWithoutAlbum+%7D+%7D+%7D+%7D+fragment+FacetCollectionInfo+on+FacetCollectionInfo+%7B+page+has_next_page+has_previous_page+total_items+total_pages+items_per_page+url_complete+%7D+fragment+PictureStandardWithoutAlbum+on+Picture+%7B+__typename+id+title+created+like_status+number_of_comments+number_of_favorites+status+width+height+resolution+aspect_ratio+url_to_original+url_to_video+is_animated+position+tags+%7B+category+text+url+%7D+permissions+url+thumbnails+%7B+width+height+size+url+%7D+%7D+&variables=%7B%22input%22%3A%7B%22filters%22%3A%5B%7B%22name%22%3A%22album_id%22%2C%22value%22%3A%22{}%22%7D%5D%2C%22display%22%3A%22position%22%2C%22page%22%3A{}%7D%7D'.format(id, p)
118 | data_raw = downloader.read_html(url_api, referer=url)
119 | data = json.loads(data_raw)
120 | has_next_page = data['data']['picture']['list']['info']['has_next_page']
121 | imgs = []
122 | for item in data['data']['picture']['list']['items']:
123 | img = Image(item, url)
124 | imgs.append(img)
125 |
126 | return imgs
127 |
128 |
129 | def get_video(url, soup):
130 | url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
131 |
132 | title = re.find('videos/([^/]+)', url)
133 | video = soup.find('video')
134 | url = video.source.attrs['src']
135 | video = Video(url, title, url_thumb)
136 | return video
137 |
138 |
139 | def get_title(soup):
140 | return soup.find('h1').text.strip()
141 |
--------------------------------------------------------------------------------
/src/extractor/xvideo_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | from utils import Downloader, Soup, LazyUrl, urljoin, format_filename, Session, get_ext, get_print, get_max_range, html_unescape
3 | from io import BytesIO
4 | from constants import try_n
5 | import ree as re
6 | from m3u8_tools import playlist2stream
7 | from translator import tr_
8 | import json
9 | from timee import sleep
10 | from ratelimit import limits, sleep_and_retry
11 | CHANNEL_PATTERN = r'/(profiles|[^/]*channels)/([0-9a-zA-Z_]+)'
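# With a made-up URL, re.find(CHANNEL_PATTERN, 'https://www.xvideos.com/channels/somechannel')
# should yield ('channels', 'somechannel'); read() and read_channel() below use
# that pair as (header, username) when paging /<header>/<username>/videos/best/<p>.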
12 |
13 |
14 | def get_id(url):
15 | url = url.lower()
16 | if '/prof-video-click/' in url:
17 | return url.split('/prof-video-click/')[1].split('/')[2]
18 | return re.find(r'xvideos[0-9]*\.[^/]+/video([0-9]+)', url, err='no id')
19 |
20 |
21 | class Video(object):
22 | _url = None
23 |
24 | def __init__(self, url_page):
25 | url_page = Downloader_xvideo.fix_url(url_page)
26 | self.url = LazyUrl(url_page, self.get, self)
27 |
28 | def get(self, url_page):
29 | if not self._url:
30 | self._get(url_page)
31 | return self._url
32 |
33 | @try_n(4)
34 | @sleep_and_retry
35 | @limits(1, 2)
36 | def _get(self, url_page):
37 | id = get_id(url_page)
38 | html = downloader.read_html(url_page)
39 | soup = Soup(html)
40 | self.title = html_unescape(soup.find('title').text).replace('- XVIDEOS.COM', '').strip()
41 | url = re.find(r'''.setVideoHLS\(['"](.+?)['"]\)''', html) or re.find(r'''.setVideoUrlLow\(['"](.+?)['"]\)''', html) #https://www.xvideos.com/video65390539/party_night
42 | if not url:
43 | raise Exception('no video url')
44 | ext = get_ext(url)
45 | if ext.lower() == '.m3u8':
46 | url = playlist2stream(url, n_thread=5)
47 | self.url_thumb = soup.find('meta', {'property': 'og:image'}).attrs['content']
48 | self.filename = format_filename(self.title, id, '.mp4')
49 |         self._url = url
50 |
51 | @property
52 | def thumb(self):
53 | self.url()
54 | f = BytesIO()
55 | downloader.download(self.url_thumb, buffer=f)
56 | return f
57 |
58 |
59 |
60 | @Downloader.register
61 | class Downloader_xvideo(Downloader):
62 | type = 'xvideo'
63 | URLS = [r'regex:[./]xvideos[0-9]*\.(com|in|es)']
64 | single = True
65 | display_name = 'XVideos'
66 |
67 | def init(self):
68 | if 'xvideos.' in self.url.lower():
69 | self.url = self.url.replace('http://', 'https://')
70 | else:
71 | self.url = 'https://www.xvideos.com/{}'.format(self.url)
72 |
73 | @classmethod
74 | def fix_url(cls, url):
75 | url = re.sub(r'[^/]*xvideos[0-9]*\.[^/]+', 'www.xvideos.com', url).replace('http://', 'https://')
76 | url = url.replace('/THUMBNUM/', '/')
77 | return url
78 |
79 | @classmethod
80 | def key_id(cls, url):
81 | res = re.find(CHANNEL_PATTERN, url)
82 | if res:
83 | return '_'.join(res)
84 | return url
85 |
86 | def read(self):
87 | res = re.find(CHANNEL_PATTERN, self.url)
88 | if res:
89 | header, username = res
90 | info = read_channel(self.url, self.cw)
91 | videos = [Video(url) for url in info['urls']]
92 | video = self.process_playlist('[Channel] {}'.format(info['name']), videos)
93 | else:
94 | video = Video(self.url)
95 | video.url()
96 | self.title = video.title
97 | self.urls.append(video.url)
98 |
99 | self.setIcon(video.thumb)
100 |
101 |
102 | def read_channel(url_page, cw=None):
103 | print_ = get_print(cw)
104 | res = re.find(CHANNEL_PATTERN, url_page)
105 | if res is None:
106 | raise Exception('Not channel')
107 | header, username = res
108 | print(header, username)
109 | max_pid = get_max_range(cw)
110 | info = {}
111 | info['header'] = header
112 | info['username'] = username
113 | session = Session()
114 | urls = []
115 | ids = set()
116 | for p in range(100):
117 | url_api = urljoin(url_page, '/{}/{}/videos/best/{}'.format(header, username, p))
118 | print_(url_api)
119 | r = session.post(url_api)
120 | data = json.loads(r.text)
121 |
122 | videos = data['videos']
123 | if not videos:
124 | print_('empty')
125 | break
126 |
127 | for video in videos:
128 | id_ = video['id']
129 | if id_ in ids:
130 | print_('duplicate: {}'.format(id_))
131 | continue
132 | ids.add(id_)
133 | info['name'] = video['pn']
134 | urls.append(urljoin(url_page, video['u']))
135 |
136 | if len(urls) >= max_pid:
137 | break
138 |
139 | n = data['nb_videos']
140 |
141 | s = '{} {} - {}'.format(tr_('읽는 중...'), info['name'], len(urls))
142 | if cw:
143 | cw.setTitle(s)
144 | else:
145 | print(s)
146 | if len(ids) >= n:
147 | break
148 | sleep(1, cw)
149 | if not urls:
150 | raise Exception('no videos')
151 | info['urls'] = urls[:max_pid]
152 | return info
153 |
--------------------------------------------------------------------------------
/src/extractor/gelbooru_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import downloader
3 | import ree as re
4 | import os
5 | from utils import Downloader, urljoin, query_url, Soup, get_max_range, get_print, LazyUrl, get_ext, clean_title, Session
6 | from translator import tr_
7 | try:
8 | from urllib import quote # python2
9 | except:
10 | from urllib.parse import quote # python3
11 | import sys
12 | from timee import sleep
13 | from constants import clean_url
14 |
15 |
16 | def get_tags(url):
17 | url = clean_url(url)
18 | qs = query_url(url)
19 | if 'page=favorites' in url:
20 | id = qs.get('id', ['N/A'])[0]
21 | id = u'fav_{}'.format(id)
22 | else:
23 | tags = qs.get('tags', [])
24 | tags.sort()
25 | id = u' '.join(tags)
26 | if not id:
27 | id = u'N/A'
28 | return id
29 |
30 |
31 | @Downloader.register
32 | class Downloader_gelbooru(Downloader):
33 | type = 'gelbooru'
34 | URLS = ['gelbooru.com']
35 | MAX_CORE = 8
36 | _name = None
37 |
38 | @classmethod
39 | def fix_url(cls, url):
40 | if 'gelbooru.com' in url.lower():
41 | url = url.replace('http://', 'https://')
42 | else:
43 | url = url.replace(' ', '+')
44 | while '++' in url:
45 | url = url.replace('++', '+')
46 | url = quote(url)
47 | url = url.replace('%2B', '+')
48 | url = 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(url)
49 | return url
50 |
51 | @property
52 | def name(self):
53 | if self._name is None:
54 | tags = get_tags(self.url)
55 | self._name = tags
56 | return clean_title(self._name)
57 |
58 | def read(self):
59 | self.title = self.name
60 |
61 | imgs = get_imgs(self.url, self.name, cw=self.cw)
62 |
63 | for img in imgs:
64 | self.urls.append(img.url)
65 |
66 | self.title = self.name
67 |
68 |
69 | @LazyUrl.register
70 | class LazyUrl_gelbooru(LazyUrl):
71 | type = 'gelbooru'
72 | def dump(self):
73 | return {
74 | 'id': self.image.id_,
75 | 'url': self.image._url,
76 | }
77 | @classmethod
78 | def load(cls, data):
79 | img = Image(data['id'], data['url'])
80 | return img.url
81 |
82 |
83 | class Image(object):
84 | def __init__(self, id_, url):
85 | self.id_ = id_
86 | self._url = url
87 | self.url = LazyUrl_gelbooru(url, self.get, self)
88 |
89 | def get(self, url):
90 | html = downloader.read_html(url)
91 | soup = Soup(html)
92 | for li in soup.findAll('li'):
93 | if li.text.strip() == 'Original image':
94 | break
95 | else:
96 | raise Exception('no Original image')
97 | url = li.find('a')['href']
98 | ext = get_ext(url)
99 | self.filename = u'{}{}'.format(self.id_, ext)
100 | return url
101 |
102 |
103 | def setPage(url, page):
104 | # Always use HTTPS
105 | url = url.replace('http://', 'https://')
106 |
107 | # Change the page
108 | if 'pid=' in url:
109 | url = re.sub('pid=[0-9]*', 'pid={}'.format(page), url)
110 | else:
111 | url += '&pid={}'.format(page)
112 |
113 | if page == 0:
114 | url = url.replace('&pid=0', '')
115 |
116 | return url
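# Example with a made-up tag: for
#   url = 'https://gelbooru.com/index.php?page=post&s=list&tags=cat'
# setPage(url, 42) should append '&pid=42', while setPage(url, 0) leaves the
# URL without a pid parameter. get_imgs() below passes len(ids) as the page,
# so pid effectively acts as a post offset rather than a page number here.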
117 |
118 |
119 | def get_imgs(url, title=None, cw=None):
120 | print_ = get_print(cw)
121 | url = clean_url(url)
122 | if 's=view' in url and 'page=favorites' not in url:
123 | raise NotImplementedError('Not Implemented')
124 |
125 | tags = get_tags(url)
126 | tags = quote(tags, safe='/')
127 | tags = tags.replace('%20', '+')
128 | url = 'https://gelbooru.com/index.php?page=post&s=list&tags={}'.format(tags)
129 |
130 | # 2566
131 | user_id = Session().cookies.get('user_id', domain='gelbooru.com')
132 | if user_id:
133 | cookies = None
134 | else:
135 | cookies = {'fringeBenefits': 'yup'}
136 | print_('user_id: {}'.format(user_id))
137 |
138 | # Range
139 | max_pid = get_max_range(cw)
140 |
141 | imgs = []
142 | ids = set()
143 | count_no_imgs = 0
144 | for p in range(500): #1017
145 | url = setPage(url, len(ids))
146 | print_(url)
147 | html = downloader.read_html(url, cookies=cookies)
148 |
149 | soup = Soup(html)
150 | posts = soup.findAll(class_='thumbnail-preview')
151 | imgs_new = []
152 | for post in posts:
153 | id_ = int(re.find('[0-9]+', post.find('a')['id'], err='no id'))
154 | if id_ in ids:
155 | print('duplicate:', id_)
156 | continue
157 | ids.add(id_)
158 | url_img = urljoin(url, post.find('a')['href'])
159 | img = Image(id_, url_img)
160 | imgs_new.append(img)
161 | if imgs_new:
162 | imgs += imgs_new
163 | count_no_imgs = 0
164 | else:
165 | print('no imgs')
166 | count_no_imgs += 1
167 | if count_no_imgs > 1:
168 | print('break')
169 | break
170 |
171 | if len(imgs) >= max_pid:
172 | break
173 |
174 | if cw is not None:
175 | if not cw.alive:
176 | break
177 | cw.setTitle(u'{} {} - {}'.format(tr_(u'읽는 중...'), title, len(imgs)))
178 |
179 | return imgs[:max_pid]
180 |
--------------------------------------------------------------------------------
/src/extractor/bcy_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from __future__ import print_function
3 | import downloader
4 | from utils import Soup, cut_pair, LazyUrl, Downloader, get_print, get_max_range, try_n, clean_title, check_alive
5 | import json
6 | import ree as re
7 | import os
8 | from translator import tr_
9 |
10 |
11 | @Downloader.register
12 | class Downloader_bcy(Downloader):
13 | type = 'bcy'
14 | URLS = ['bcy.net/item/detail/', 'bcy.net/u/']
15 | MAX_CORE = 8
16 | display_name = '半次元'
17 |
18 | def init(self):
19 | self.html = downloader.read_html(self.url)
20 | self.info = get_info(self.url, self.html)
21 |
22 | @property
23 | def name(self):
24 | info = self.info
25 | if '/detail/' in self.url:
26 | title = u'{} (bcy_{}) - {}'.format(clean_title(info['artist']), info['uid'], info['id'])
27 | else:
28 | title = u'{} (bcy_{})'.format(clean_title(info['artist']), info['uid'])
29 | return title
30 |
31 | def read(self):
32 | imgs = get_imgs(self.url, self.html, cw=self.cw)
33 |
34 | for img in imgs:
35 | self.urls.append(img.url)
36 |
37 | self.title = self.name
38 | self.artist = self.info['artist']
39 |
40 |
41 | def get_ssr_data(html):
42 | s = html.split('window.__ssr_data = JSON.parse("')[1].replace('\\"', '"')
43 | s = cut_pair(s).replace('"', '\\"')
44 | data = json.loads(json.loads('"{}"'.format(s)))
45 | return data
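# get_ssr_data() pulls the page state out of a line shaped roughly like
#   window.__ssr_data = JSON.parse("{\"detail\": ...}");
# (illustrative, not copied from a real page): cut_pair() isolates the
# balanced {...} part, and the double json.loads() first un-escapes the quoted
# string literal and then parses the JSON inside it.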
46 |
47 |
48 | @try_n(2)
49 | def get_imgs(url, html=None, cw=None):
50 | if '/detail/' not in url:
51 | return get_imgs_channel(url, html, cw)
52 |
53 | if html is None:
54 | html = downloader.read_html(url)
55 |
56 | data = get_ssr_data(html)
57 |
58 | multi = data['detail']['post_data']['multi']
59 |
60 | imgs = []
61 |
62 | for m in multi:
63 | path = m['original_path']
64 | img = json.loads(u'"{}"'.format(path))
65 | img = Image_single(img, url, len(imgs))
66 | imgs.append(img)
67 |
68 | return imgs
69 |
70 |
71 | class Image_single(object):
72 |     def __init__(self, url, referer, p):
73 | self._url = url
74 | self.p = p
75 | self.url = LazyUrl(referer, self.get, self)
76 |
77 | def get(self, referer):
78 | ext = get_ext(self._url, referer)
79 | self.filename = '{:04}{}'.format(self.p, ext)
80 | return self._url
81 |
82 |
83 | class Image(object):
84 | def __init__(self, url, referer, id, p):
85 | self.id = id
86 | self.p = p
87 | self._url = url
88 | self.url = LazyUrl(referer, self.get, self)
89 |
90 | def get(self, referer):
91 | ext = get_ext(self._url, referer)
92 | self.filename = u'{}_p{}{}'.format(self.id, self.p, ext)
93 | return self._url
94 |
95 |
96 | def get_ext(url, referer=None):
97 | ext = os.path.splitext(url.split('?')[0].replace('~noop.image', ''))[1]
98 | if ext in ['.image', '']:
99 | ext = downloader.get_ext(url, referer=referer)
100 | return ext
101 |
102 |
103 | def get_info(url, html):
104 | soup = Soup(html)
105 | info = {}
106 |
107 | uname = soup.find('div', class_='user-name') or soup.find('p', class_='uname') or soup.find('div', class_='user-info-name')
108 |
109 | info['artist'] = uname.text.strip()
110 |
111 | j = get_ssr_data(html)
112 |
113 | if '/detail/' in url:
114 | info['uid'] = j['detail']['detail_user']['uid']
115 | info['id'] = j['detail']['post_data']['item_id']
116 | else:
117 | info['uid'] = j['homeInfo']['uid']
118 |
119 | return info
120 |
121 |
122 | def get_imgs_channel(url, html=None, cw=None):
123 | print_ = get_print(cw)
124 | if html is None:
125 | html = downloader.read_html(url)
126 | info = get_info(url, html)
127 |
128 | # Range
129 | max_pid = get_max_range(cw)
130 |
131 | ids = set()
132 | imgs = []
133 | for p in range(1000):
134 | url_api = 'https://bcy.net/apiv3/user/selfPosts?uid={}'.format(info['uid'])
135 | if imgs:
136 | url_api += '&since={}'.format(imgs[-1].id)
137 | data_raw = downloader.read_html(url_api, url)
138 | data = json.loads(data_raw)['data']
139 | items = data['items']
140 | if not items:
141 | print('no items')
142 | break
143 | c = 0
144 | for item in items:
145 | check_alive(cw)
146 | id = item['item_detail']['item_id']
147 | if id in ids:
148 | print('duplicate')
149 | continue
150 | c += 1
151 | ids.add(id)
152 | url_single = u'https://bcy.net/item/detail/{}'.format(id)
153 | imgs_single = get_imgs(url_single, cw=cw)
154 | print_(str(id))
155 | for p, img in enumerate(imgs_single):
156 | img = Image(img._url, url_single, id, p)
157 | imgs.append(img)
158 | s = u'{} {} - {}'.format(tr_(u'읽는 중...'), info['artist'], min(len(imgs), max_pid))
159 | if cw:
160 | cw.setTitle(s)
161 | else:
162 | print(s)
163 |
164 | if len(imgs) >= max_pid:
165 | break
166 | if not c:
167 | print('not c')
168 | break
169 | if len(imgs) >= max_pid:
170 | print('over max_pid:', max_pid)
171 | break
172 | return imgs[:max_pid]
173 |
174 |
--------------------------------------------------------------------------------
/src/extractor/danbooru_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 | import downloader
3 | import ree as re
4 | from utils import Downloader, get_max_range, clean_title, get_print, try_n, urljoin, check_alive, LazyUrl, get_ext
5 | from translator import tr_
6 | from urllib.parse import quote
7 | from urllib.parse import urlparse, parse_qs
8 | from ratelimit import limits, sleep_and_retry
9 |
10 |
11 |
12 | @Downloader.register
13 | class Downloader_danbooru(Downloader):
14 |     type = 'danbooru'
15 | URLS = ['danbooru.donmai.us']
16 | MAX_CORE = 8
17 | _name = None
18 |
19 | @classmethod
20 | def fix_url(cls, url):
21 | if 'donmai.us' in url:
22 | url = url.replace('http://', 'https://')
23 | else:
24 | url = url.replace(' ', '+')
25 | while '++' in url:
26 | url = url.replace('++', '+')
27 | url = 'https://danbooru.donmai.us/?tags={}'.format(quote(url))
28 | return url.strip('+')
29 |
30 | @property
31 | def name(self):
32 | if self._name is None:
33 | parsed_url = urlparse(self.url)
34 | qs = parse_qs(parsed_url.query)
35 | if 'donmai.us/favorites' in self.url:
36 | id = qs.get('user_id', [''])[0]
37 | print('len(id) =', len(id), '"{}"'.format(id))
38 | assert len(id) > 0, '[Fav] User id is not specified'
39 | id = 'fav_{}'.format(id)
40 | elif 'donmai.us/explore/posts/popular' in self.url: #4160
41 | soup = read_soup(self.url, self.cw)
42 | id = soup.find('h1').text
43 | else:
44 | tags = qs.get('tags', [])
45 | tags.sort()
46 | id = ' '.join(tags)
47 | if not id:
48 | id = 'N/A'
49 | self._name = id
50 | return clean_title(self._name)
51 |
52 | def read(self):
53 | self.title = self.name
54 |
55 | imgs = get_imgs(self.url, self.name, cw=self.cw)
56 |
57 | for img in imgs:
58 | self.urls.append(img.url)
59 |
60 | self.title = self.name
61 |
62 |
63 | class Image(object):
64 | def __init__(self, id, url, cw):
65 | self._cw = cw
66 | self.id = id
67 | self.url = LazyUrl(url, self.get, self)
68 |
69 | def get(self, url):
70 | soup = read_soup(url, self._cw)
71 | ori = soup.find('li', id='post-option-view-original')
72 | if ori:
73 | img = ori.find('a')['href']
74 | else:
75 | img = soup.find('li', id='post-info-size').find('a')['href']
76 | img = urljoin(url, img)
77 | ext = get_ext(img)
78 | self.filename = '{}{}'.format(self.id, ext)
79 | return img
80 |
81 |
82 |
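# wait() is throttled with the ratelimit package: @limits(2, 1) allows at most
# 2 calls per 1-second window and @sleep_and_retry makes callers sleep until
# the window frees up instead of raising, so read_soup() below shouldn't hit
# the site faster than that.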
83 | @sleep_and_retry
84 | @limits(2, 1)
85 | def wait(cw):
86 | check_alive(cw)
87 |
88 |
89 | def setPage(url, page):
90 | # Always use HTTPS
91 | url = url.replace('http://', 'https://')
92 |
93 | # Main page
94 | if re.findall(r'https://[\w]*[.]?donmai.us/?$', url):
95 | url = 'https://{}donmai.us/posts?page=1'.format('danbooru.' if 'danbooru.' in url else '')
96 |
97 | # Change the page
98 | if 'page=' in url:
99 | url = re.sub('page=[0-9]*', 'page={}'.format(page), url)
100 | else:
101 | url += '&page={}'.format(page)
102 |
103 | return url
104 |
105 |
106 | @try_n(4) #4103
107 | def read_soup(url, cw):
108 | check_alive(cw)
109 | wait(cw)
110 | return downloader.read_soup(url)
111 |
112 |
113 | def get_imgs(url, title=None, range_=None, cw=None):
114 | if 'donmai.us/artists' in url:
115 | raise NotImplementedError('Not Implemented')
116 | if 'donmai.us/posts/' in url:
117 | raise NotImplementedError('Not Implemented')
118 |
119 | print_ = get_print(cw)
120 |
121 | # Range
122 | max_pid = get_max_range(cw)
123 |
124 | if range_ is None:
125 | range_ = range(1, 1001)
126 | print(range_)
127 | imgs = []
128 | i = 0
129 | empty_count = 0
130 | empty_count_global = 0
131 | url_imgs = set()
132 | while i < len(range_):
133 | check_alive(cw)
134 | p = range_[i]
135 | url = setPage(url, p)
136 | print_(url)
137 | soup = read_soup(url, cw)
138 | articles = soup.findAll('article')
139 | if articles:
140 | empty_count_global = 0
141 | else:
142 | empty_count += 1
143 | if empty_count < 4:
144 | s = 'empty page; retry... {}'.format(p)
145 | print_(s)
146 | continue
147 | else:
148 | empty_count = 0
149 | empty_count_global += 1
150 |
151 | if empty_count_global >= 6:
152 | break
153 |
154 | for article in articles:
155 | id = article.attrs['data-id']
156 |
157 | #url_img = article.attrs['data-file-url'].strip()
158 | url_img = urljoin(url, article.find('a', class_='post-preview-link')['href']) #4160
159 |
160 | #print(url_img)
161 | if url_img not in url_imgs:
162 | url_imgs.add(url_img)
163 | img = Image(id, url_img, cw)
164 | imgs.append(img)
165 |
166 | if len(imgs) >= max_pid:
167 | break
168 |
169 | if cw is not None:
170 | cw.setTitle('{} {} - {}'.format(tr_('읽는 중...'), title, len(imgs)))
171 | i += 1
172 |
173 | return imgs[:max_pid]
174 |
--------------------------------------------------------------------------------
/src/extractor/soundcloud_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf8
2 | import downloader
3 | import json
4 | from io import BytesIO
5 | from utils import Downloader, LazyUrl, get_print, try_n, lock, clean_title
6 | from error_printer import print_error
7 | import os
8 | from timee import sleep
9 | import ffmpeg
10 | import ytdl
11 | from m3u8_tools import M3u8_stream
12 | CLIENT_ID = None
13 |
14 |
15 | @lock
16 | def get_cid(force=False):
17 | global CLIENT_ID
18 | if CLIENT_ID is None or force:
19 | print('update cid...')
20 | d = ytdl.YoutubeDL()
21 | e = ytdl.extractor.soundcloud.SoundcloudIE(d)
22 | e._update_client_id()
23 | CLIENT_ID = e._CLIENT_ID
24 | return CLIENT_ID
25 |
26 |
27 | class Audio(object):
28 | _url = None
29 |
30 | def __init__(self, info, album_art, cw=None):
31 | self.info = info
32 | self.album_art = album_art
33 | self.cw = cw
34 | self.url = LazyUrl(info['webpage_url'], self.get, self, pp=self.pp)
35 |
36 | def get(self, url):
37 | print_ = get_print(self.cw)
38 | if self._url:
39 | return self._url
40 |
41 | info = self.info
42 |
43 | ## ydl = ytdl.YoutubeDL()
44 | ## info = ydl.extract_info(url)
45 |
46 | formats = info['formats']
47 | print(formats)
48 | formats = sorted(formats, key=lambda x: int(x.get('abr', 0)), reverse=True)
49 | url_audio = None
50 |
51 | for format in formats:
52 | protocol = format['protocol']
53 | print_(u'【{}】 format【{}】 abr【{}】'.format(protocol, format['format'], format.get('abr', 0)))
54 | if not url_audio and protocol in ['http', 'https']:
55 | url_audio = format['url']
56 |
57 | if not url_audio:
58 | url_audio = M3u8_stream(formats[0]['url'])
59 |             self.album_art = False # no album art when falling back to an HLS stream
60 |
61 | self.username = info['uploader']
62 | self.title = u'{} - {}'.format(self.username, info['title'])
63 | self.filename = u'{}{}'.format(clean_title(self.title, allow_dot=True, n=-4), '.mp3')
64 |
65 | thumb = None
66 | for t in info['thumbnails'][::-1]:
67 | width = t.get('width', 1080)
68 | if not 100 <= width <= 500:
69 | continue
70 | url_thumb = t['url']
71 | thumb = BytesIO()
72 | try:
73 | downloader.download(url_thumb, buffer=thumb)
74 | break
75 | except Exception as e:
76 | print(e)
77 | thumb = None
78 | self.thumb = thumb
79 |
80 | self._url = url_audio
81 | return self._url
82 |
83 | def pp(self, filename):
84 | if self.thumb and self.album_art:
85 |             self.thumb.seek(0) # rewind before embedding as cover art
86 | ffmpeg.add_cover(filename, self.thumb, {'artist':self.username, 'title':self.info['title']}, cw=self.cw)
87 |
88 |
89 | @Downloader.register
90 | class Downloader_soundcloud(Downloader):
91 | type = 'soundcloud'
92 | single = True
93 | URLS = ['soundcloud.com']
94 | #lock = True
95 | audio = None
96 | display_name = 'SoundCloud'
97 |
98 | def init(self):
99 | if 'soundcloud.com' in self.url.lower():
100 | self.url = self.url.replace('http://', 'https://')
101 | else:
102 | self.url = 'https://soundcloud.com/{}'.format(self.url)
103 |
104 | def read(self):
105 | album_art = self.ui_setting.albumArt.isChecked()
106 | info = get_audios(self.url, self.cw, album_art)
107 | audios = info['audios']
108 |
109 | if not audios:
110 | raise Exception('no audios')
111 |
112 | # first audio must be valid
113 | while audios:
114 | audio = audios[0]
115 | try:
116 | audio.url()
117 | break
118 | except Exception as e:
119 | e_ = e
120 | print(e)
121 | audios.remove(audio)
122 | else:
123 | raise e_
124 |
125 | if len(audios) > 1:
126 | audio = self.process_playlist(info['title'], audios)
127 | else:
128 | self.urls.append(audio.url)
129 | self.title = audio.title
130 |
131 | self.artist = audio.username
132 | self.setIcon(audio.thumb)
133 |
134 |
135 | @try_n(2)
136 | def get_audios(url, cw, album_art):
137 | print_ = get_print(cw)
138 | url = url.rstrip('/')
139 | if url.count('/') == 3:
140 | url += '/tracks'
141 |
142 |     ## options = {
143 |     ##     'extract_flat': True,
144 |     ##     }
145 |
146 | ydl = ytdl.YoutubeDL(cw=cw)
147 | info = ydl.extract_info(url)
148 | if 'entries' in info:
149 | entries = info['entries']
150 | title = info['title']
151 | for _type in ['All', 'Tracks', 'Albums', 'Sets', 'Reposts', 'Likes', 'Spotlight']:
152 | x = '({})'.format(_type)
153 | if x in title:
154 | title = title.replace(x, '')
155 | kind = _type
156 | break
157 | else:
158 | kind = 'Playlist'
159 | print_(u'kind: {}'.format(kind))
160 | info['title'] = u'[{}] {}'.format(kind.capitalize(), title)
161 | else:
162 | entries = [info]
163 |
164 | audios = []
165 | for e in entries:
166 | if '/sets/' in e['webpage_url']:
167 | continue
168 | audio = Audio(e, album_art, cw=cw)
169 | audios.append(audio)
170 |
171 | info['audios'] = audios
172 |
173 | return info
174 |
175 |
176 |
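A minimal, standalone sketch of the format selection performed in Audio.get() above: prefer a direct HTTP(S) format with the highest abr, falling back to the first (typically HLS) entry. The format dicts follow the yt-dlp shape; the sample values are illustrative only.

def pick_audio_url(formats):
    # Highest average bitrate first; a missing or None 'abr' sorts last.
    formats = sorted(formats, key=lambda f: f.get('abr') or 0, reverse=True)
    for f in formats:
        # A plain progressive download is preferred over an HLS stream.
        if f.get('protocol') in ('http', 'https'):
            return f['url']
    # Nothing progressive: fall back to the best remaining (HLS) format.
    return formats[0]['url']

sample = [
    {'format': 'hls_mp3', 'protocol': 'm3u8_native', 'abr': 128, 'url': 'https://example.com/a.m3u8'},
    {'format': 'http_mp3', 'protocol': 'http', 'abr': 128, 'url': 'https://example.com/a.mp3'},
]
print(pick_audio_url(sample))  # -> https://example.com/a.mp3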
--------------------------------------------------------------------------------
/src/extractor/tiktok_downloader.py:
--------------------------------------------------------------------------------
1 | from __future__ import division, print_function, unicode_literals
2 | import downloader
3 | import ree as re
4 | from utils import Soup, LazyUrl, Downloader, try_n, compatstr, get_print, clean_title, Session, get_max_range, format_filename
5 | from io import BytesIO
6 | import clf2
7 | from translator import tr_
8 | from timee import sleep
9 | from error_printer import print_error
10 | import ytdl
11 | PATTERN_VID = r'/(v|video)/(?P<id>[0-9]+)'  # named group 'id': the numeric video id
12 | SHOW = True
13 |
14 |
15 | def is_captcha(soup):
16 | return soup.find('div', class_="verify-wrap") is not None
17 |
18 |
19 | @Downloader.register
20 | class Downloader_tiktok(Downloader):
21 | type = 'tiktok'
22 | single = True
23 | URLS = ['tiktok.com']
24 | display_name = 'TikTok'
25 |
26 | def init(self):
27 | cw = self.cw
28 | self.session = Session()
29 | res = clf2.solve(self.url, self.session, cw)
30 | self.url = self.fix_url(res['url']) #4324
31 | soup = Soup(res['html'])
32 | if is_captcha(soup):
33 | def f(html):
34 | return not is_captcha(Soup(html))
35 | clf2.solve(self.url, self.session, cw, show=True, f=f)
36 |
37 | @classmethod
38 | def fix_url(cls, url):
39 | url = url.split('?')[0].split('#')[0].strip('/')
40 | if 'tiktok.com' not in url.lower():
41 | url = 'https://www.tiktok.com/@{}'.format(url)
42 | return url
43 |
44 | def read(self):
45 | format = compatstr(self.ui_setting.youtubeFormat.currentText()).lower().strip()
46 |
47 | if re.search(PATTERN_VID, self.url) is None:
48 | info = read_channel(self.url, self.session, self.cw)
49 | items = info['items']
50 | videos = [Video('https://www.tiktok.com/@{}/video/{}'.format(info['uid'], item['id']), self.session, format) for item in items]
51 | title = '{} (tiktok_{})'.format(info['nickname'], info['uid'])
52 | video = self.process_playlist(title, videos)
53 | else:
54 | video = Video(self.url, self.session, format)
55 | video.url()
56 | self.urls.append(video.url)
57 | self.title = clean_title(video.title)
58 |
59 |
60 | class Video(object):
61 | _url = None
62 |
63 | def __init__(self, url, session, format='title (id)'):
64 | self.url = LazyUrl(url, self.get, self)
65 | self.session = session
66 | self.format = format
67 |
68 | @try_n(2)
69 | def get(self, url):
70 | if self._url:
71 | return self._url
72 | m = re.search(PATTERN_VID, url)
73 | id = m.group('id')
74 | ext = '.mp4'
75 | self.title = id#
76 | self.filename = format_filename(self.title, id, ext)
77 |
78 | ydl = ytdl.YoutubeDL()
79 | info = ydl.extract_info(url)
80 |
81 | self._url = info['url']
82 |
83 | return self._url
84 |
85 |
86 | def read_channel(url, session, cw=None):
87 | print_ = get_print(cw)
88 |
89 | info = {}
90 | info['items'] = []
91 |
92 | ids = set()
93 | info['items'] = []
94 | sd = {
95 | 'count_empty': 0,
96 | 'shown': SHOW,
97 | }
98 |
99 | max_pid = get_max_range(cw)
100 |
101 | def f(html, browser=None):
102 | soup = Soup(html)
103 | if is_captcha(soup):
104 | print('captcha')
105 | browser.show()
106 | sd['shown'] = True
107 | elif sd['shown'] and not SHOW:
108 | browser.hide()
109 | sd['shown'] = False
110 | try:
111 | st = soup.find('h2', class_='share-title')
112 | if st is None:
113 | st = soup.find('h2', class_=lambda c: c and 'ShareTitle' in c)
114 | info['uid'] = st.text.strip()
115 | st = soup.find('h1', class_='share-sub-title')
116 | if st is None:
117 | st = soup.find('h1', class_=lambda c: c and 'ShareSubTitle' in c)
118 | info['nickname'] = st.text.strip()
119 | except Exception as e:
120 | print_(print_error(e)[0])
121 | c = 0
122 | ids_now = set()
123 | items = soup.findAll('div', class_='video-feed-item') + soup.findAll('div', class_=lambda c: c and 'DivItemContainer' in c)
124 | for div in items:
125 | a = div.find('a')
126 | if a is None:
127 | continue
128 | href = a['href']
129 | if not href:
130 | continue
131 | m = re.search(PATTERN_VID, href)
132 | if m is None:
133 | continue
134 | id_video = int(m.group('id'))
135 | ids_now.add(id_video)
136 | if id_video in ids:
137 | continue
138 | ids.add(id_video)
139 | info['items'].append({'id': id_video})
140 | c += 1
141 |
142 | print_('items: {}'.format(len(info['items'])))
143 | if len(info['items']) >= max_pid:
144 | info['items'] = info['items'][:max_pid]
145 | return True
146 |
147 | browser.runJavaScript('window.scrollTo(0, document.body.scrollHeight);')
148 | sleep(15, cw)
149 |
150 | if c or (ids_now and min(ids_now) > min(ids)):
151 | sd['count_empty'] = 0
152 | else:
153 | print_('empty')
154 | sd['count_empty'] += 1
155 | msg = '{} {} (tiktok_{}) - {}'.format(tr_('읽는 중...'), info.get('nickname'), info.get('uid'), len(info['items']))
156 | if cw:
157 | if not cw.alive:
158 | raise Exception('cw dead')
159 | cw.setTitle(msg)
160 | else:
161 | print(msg)
162 | return sd['count_empty'] > 4
163 | res = clf2.solve(url, session, cw, f=f, timeout=1800, show=SHOW, delay=0)
164 |
165 | if not info['items']:
166 | raise Exception('no items')
167 |
168 | return info
169 |
170 |
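A short sketch of the URL handling used by Downloader_tiktok above: bare usernames are expanded to channel URLs, and the numeric video id comes from the named group in PATTERN_VID. The example URLs are placeholders.

import re

PATTERN_VID = r'/(v|video)/(?P<id>[0-9]+)'

def fix_url(url):
    # Drop query string, fragment and trailing slash; expand bare usernames.
    url = url.split('?')[0].split('#')[0].strip('/')
    if 'tiktok.com' not in url.lower():
        url = 'https://www.tiktok.com/@{}'.format(url)
    return url

print(fix_url('someuser'))  # https://www.tiktok.com/@someuser
m = re.search(PATTERN_VID, 'https://www.tiktok.com/@someuser/video/1234567890')
print(m.group('id'))        # 1234567890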
--------------------------------------------------------------------------------
/src/extractor/naver_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf-8
2 | import downloader
3 | import ree as re
4 | from utils import urljoin, Downloader, Soup, LazyUrl, clean_title, get_ext
5 | import json
6 | from timee import sleep
7 | import collections
8 | import errors
9 | PATTERNS = [r'.*blog.naver.com/(?P<username>.+)/(?P<pid>[0-9]+)',
10 |             r'.*blog.naver.com/.+?blogId=(?P<username>[^&]+).+?logNo=(?P<pid>[0-9]+)',
11 |             r'.*?(?P<username>[0-9a-zA-Z_-]+)\.blog\.me/(?P<pid>[0-9]+)']
12 | HDR = {
13 | 'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
14 | 'Accept-Encoding': 'gzip, deflate',
15 | 'Accept-Language': 'ko, en-US; q=0.7, en; q=0.3',
16 | 'Connection': 'Keep-Alive',
17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
18 | }
19 |
20 | def get_id(url):
21 | for pattern in PATTERNS:
22 | m = re.match(pattern, url)
23 | if m is None:
24 | continue
25 | username = m.group('username')
26 | pid = m.group('pid')
27 | break
28 | else:
29 | username, pid = None, None
30 | return username, pid
31 |
32 |
33 | @Downloader.register
34 | class Downloader_naver(Downloader):
35 | type = 'naver'
36 | URLS = ['blog.naver.', '.blog.me']
37 | display_name = 'Naver Blog'
38 |
39 | def init(self):
40 | username, pid = get_id(self.url)
41 | if username is None:
42 | raise errors.Invalid('Invalid format: {}'.format(self.url))
43 | self.url = 'https://blog.naver.com/{}/{}'.format(username, pid)
44 | self.headers = {'User-Agent': downloader.hdr['User-Agent']}
45 |
46 | @property
47 | def name(self):
48 | username, pid = get_id(self.url)
49 | return clean_title(u'{}/{}'.format(username, pid))
50 |
51 | def read(self):
52 | self.title = u'읽는 중... {}'.format(self.name)
53 |
54 | imgs = get_imgs(self.url)
55 |
56 | filenames = {}
57 | for img in imgs:
58 | self.urls.append(img.url)
59 |
60 | self.title = self.name
61 |
62 |
63 | class Image(object):
64 | def __init__(self, url, referer, p):
65 | self.url = LazyUrl(referer, lambda _: url, self)
66 | #3788, #3817
67 | ext = get_ext(url)
68 | self.filename = '{:04}{}'.format(p, ext)
69 |
70 |
71 | class Video(object):
72 | def __init__(self, url, referer, p):
73 | self.url = LazyUrl(referer, lambda _: url, self)
74 | self.filename = 'video_{}.mp4'.format(p)
75 |
76 |
77 | def read_page(url, depth=0):
78 | print('read_page', url, depth)
79 | if depth > 10:
80 | raise Exception('Too deep')
81 | html = downloader.read_html(url, header=HDR)
82 |
83 |     if len(html) < 5000:  # too small to be a full post page; rebuild the mobile PostView URL and retry
84 | id = re.find('logNo=([0-9]+)', html, err='no id')
85 | username = re.find('blog.naver.com/([0-9a-zA-Z]+)', url) or re.find('blogId=([0-9a-zA-Z]+)', url, err='no username')
86 | url = 'https://m.blog.naver.com/PostView.nhn?blogId={}&logNo={}&proxyReferer='.format(username, id)
87 | print('###', username, id, url)
88 |
89 | soup = Soup(html)
90 | if soup.find('div', {'id': 'viewTypeSelector'}):
91 | return url, soup
92 | frame = soup.find('frame')
93 | if frame is None:
94 | print('frame is None')
95 | return read_page(url, depth+1)
96 | return read_page(urljoin('https://blog.naver.com', frame.attrs['src']), depth+1)
97 |
98 |
99 |
100 | def get_imgs(url):
101 | url = url.replace('blog.naver', 'm.blog.naver')
102 | referer = url
103 | url_frame, soup = read_page(url)
104 |
105 | imgs = []
106 | urls = set()
107 | view = soup.find('div', {'id': 'viewTypeSelector'})
108 | print('view', view is not None)
109 |
110 | imgs_ = view.findAll('span', class_='_img') + view.findAll('img')
111 |
112 | for img in imgs_:
113 | url = img.attrs.get('src', None)
114 | if url is None:
115 | url = img.attrs.get('thumburl', None)
116 | if url is None:
117 | print(u'invalid img: {}'.format(url))
118 | continue
119 |
120 | if 'ssl.pstatic.net' in url: #
121 | continue
122 |
123 | if 'blogpfthumb-phinf.pstatic.net' in url: # profile
124 | continue
125 |
126 | if 'dthumb-phinf.pstatic.net' in url: # link
127 | continue
128 |
129 | if 'storep-phinf.pstatic.net' in url: # emoticon
130 | continue
131 |
132 | url = url.replace('mblogthumb-phinf', 'blogfiles')
133 | #url = re.sub('\?type=[a-zA-Z0-9]*', '?type=w1@2x', url)
134 | #url = re.sub('\?type=[a-zA-Z0-9]*', '', url)
135 | url = url.split('?')[0]
136 |
137 | if url in urls:
138 | print('### Duplicate:', url)
139 | continue
140 |
141 | urls.add(url)
142 | #url = url.split('?type=')[0]
143 | img = Image(url, referer, len(imgs))
144 | imgs.append(img)
145 |
146 | pairs = []
147 |
148 | for video in soup.findAll('span', class_='_naverVideo'):
149 | vid = video.attrs['vid']
150 | key = video.attrs['key']
151 | pairs.append((vid, key))
152 |
153 | for script in soup.findAll('script', class_='__se_module_data'):
154 | data_raw = script['data-module']
155 | data = json.loads(data_raw)['data']
156 | vid = data.get('vid')
157 | if not vid:
158 | continue
159 | key = data['inkey']
160 | pairs.append((vid, key))
161 |
162 | videos = []
163 | for vid, key in pairs:
164 | url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key)
165 | data_raw = downloader.read_html(url_api)
166 | data = json.loads(data_raw)
167 | fs = data['videos']['list']
168 | fs = sorted(fs, key=lambda f: f['size'], reverse=True)
169 | video = Video(fs[0]['source'], url_frame, len(videos))
170 | videos.append(video)
171 |
172 | return imgs + videos
173 |
174 |
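A standalone sketch of the video lookup done at the end of get_imgs() above: each vid/inkey pair scraped from the post is resolved through the play API and the largest rendition is kept. requests is assumed here only for illustration; the vid/key values are placeholders.

import requests  # assumed for this sketch; the extractor routes requests through its own downloader module

def best_video_source(vid, key):
    url_api = 'https://apis.naver.com/rmcnmv/rmcnmv/vod/play/v2.0/{}?key={}'.format(vid, key)
    data = requests.get(url_api, timeout=30).json()
    files = data['videos']['list']
    # Largest file first, matching the sort in get_imgs().
    files = sorted(files, key=lambda f: f['size'], reverse=True)
    return files[0]['source']

## best_video_source('PLACEHOLDER_VID', 'PLACEHOLDER_KEY')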
--------------------------------------------------------------------------------
/src/extractor/twitch_downloader.py:
--------------------------------------------------------------------------------
1 | #coding: utf8
2 | import downloader
3 | import ytdl
4 | from utils import Downloader, get_outdir, Soup, LazyUrl, try_n, compatstr, format_filename, get_ext, clean_title, Session, get_print, get_resolution, get_max_range
5 | from io import BytesIO
6 | from m3u8_tools import M3u8_stream
7 | import ree as re
8 | from translator import tr_
9 | import errors
10 | import utils
11 |
12 |
13 | @Downloader.register
14 | class Downloader_twitch(Downloader):
15 | type = 'twitch'
16 | URLS = ['twitch.tv']
17 | single = True
18 |
19 | def init(self):
20 | url = self.url
21 | if 'twitch.tv' in url:
22 | if not url.startswith('http://') and not url.startswith('https://'):
23 | url = 'https://' + url
24 | self.url = url
25 | else:
26 | url = 'https://www.twitch.tv/videos/{}'.format(url)
27 | self.url = url
28 |
29 | @classmethod
30 | def fix_url(cls, url):
31 | if re.search(r'/(videos|clips)\?filter=', url):
32 | return url.strip('/')
33 | return url.split('?')[0].strip('/')
34 |
35 | def read(self):
36 | if '/directory/' in self.url.lower():
37 | raise errors.Invalid('[twitch] Directory is unsupported: {}'.format(self.url))
38 |
39 | if self.url.count('/') == 3:
40 | if 'www.twitch.tv' in self.url or '//twitch.tv' in self.url:
41 | filter = 'live'
42 | else:
43 | filter = None
44 | elif self.url.count('/') == 4:
45 | filter = re.find(r'filter=([0-9a-zA-Z_]+)', self.url) or re.find(r'[0-9a-zA-Z_]+', self.url.split('/')[-1])
46 | if filter is not None and filter.isdigit():
47 | filter = None
48 | else:
49 | filter = None
50 |
51 | if filter is None:
52 | video = Video(self.url, self.cw)
53 | video.url()
54 | self.urls.append(video.url)
55 | self.title = video.title
56 | elif filter == 'live':
57 | video = Video(self.url, self.cw, live=True)
58 | video.url()
59 | self.urls.append(video.url)
60 | self.title = video.title
61 | elif filter == 'clips':
62 | info = get_videos(self.url, cw=self.cw)
63 | video = self.process_playlist('[Clip] {}'.format(info['name']), info['videos'])
64 | else:
65 | raise NotImplementedError(filter)
66 |
67 | self.setIcon(video.thumb)
68 |
69 |
70 | @try_n(2)
71 | def get_videos(url, cw=None):
72 | print_ = get_print(cw)
73 | print_(f'get_videos: {url}')
74 | info = {}
75 | options = {
76 | 'extract_flat': True,
77 | 'playlistend': get_max_range(cw),
78 | }
79 | videos = []
80 | ydl = ytdl.YoutubeDL(options, cw=cw)
81 | info = ydl.extract_info(url)
82 | for e in info['entries']:
83 | video = Video(e['url'], cw)
84 | video.id = int(e['id'])
85 | videos.append(video)
86 | if 'name' not in info:
87 | info['name'] = ydl.extract_info(e['url'])['creator']
88 | if not videos:
89 | raise Exception('no videos')
90 | info['videos'] = sorted(videos, key=lambda video: video.id, reverse=True)
91 | return info
92 |
93 |
94 | def alter(seg):  # prefer the un-muted variant of '-muted' VOD segments, keeping the original as fallback
95 | segs = []
96 | if '-muted' in seg.url:
97 | seg_ = seg.copy()
98 | seg_.url = seg.url.replace('-muted', '')
99 | segs.append(seg_)
100 | segs.append(seg)
101 | return segs
102 |
103 |
104 | def extract_info(url, cw=None):
105 | print_ = get_print(cw)
106 | ydl = ytdl.YoutubeDL(cw=cw)
107 | try:
108 | info = ydl.extract_info(url)
109 | except Exception as e:
110 | ex = type(ytdl.get_extractor(url))(ydl)
111 | _download_info = getattr(ex, '_download_info', None)
112 | if _download_info is not None:
113 | vod_id = ex._match_id(url)
114 | info = _download_info(vod_id)
115 | print_(info)
116 | if 'HTTPError 403' in str(e):
117 | raise errors.LoginRequired()
118 | raise
119 | return info
120 |
121 |
122 | class Video(object):
123 | _url = None
124 |
125 | def __init__(self, url, cw, live=False):
126 | self.url = LazyUrl(url, self.get, self)
127 | self.cw = cw
128 | self._live = live
129 |
130 | @try_n(4)
131 | def get(self, url):
132 | print_ = get_print(self.cw)
133 | if self._url:
134 | return self._url
135 | info = extract_info(url, self.cw)
136 |
137 | def print_video(video):
138 | #print_(video)#
139 |             print_('{}[{}] [{}] [{}] {}'.format('LIVE ' if self._live else '', video['format_id'], video.get('height'), video.get('tbr'), video['url']))
140 |
141 | videos = [video for video in info['formats'] if video.get('height')]
142 |
143 | videos = sorted(videos, key=lambda video:(video.get('height', 0), video.get('tbr', 0)), reverse=True)
144 |
145 | for video in videos:
146 | print_video(video)
147 |
148 | for video in videos:
149 | if video.get('height', 0) <= get_resolution(): #3723
150 | video_best = video
151 | break
152 | else:
153 | video_best = videos[-1]
154 |             print_video(video_best)
155 |
156 | video = video_best['url']
157 |
158 | ext = get_ext(video)
159 | self.title = info['title']
160 | id = info['display_id']
161 |
162 | if self._live:
163 | video = utils.LiveStream(video, headers=video_best.get('http_headers'))
164 | ext = '.mp4'
165 | else:
166 | if ext.lower() == '.m3u8':
167 | video = M3u8_stream(video, n_thread=4, alter=alter)
168 | ext = '.mp4'
169 | self.filename = format_filename(self.title, id, ext)
170 | self.url_thumb = info['thumbnail']
171 | self.thumb = BytesIO()
172 | downloader.download(self.url_thumb, buffer=self.thumb)
173 | self._url = video
174 | return self._url
175 |
--------------------------------------------------------------------------------
/src/extractor/mrm_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Soup, urljoin, LazyUrl, Downloader, query_url, try_n, Session, get_print, clean_title, get_ext
4 | import os
5 | from translator import tr_
6 | from timee import sleep
7 | import requests
8 | import ree as re
9 | import clf2#
10 |
11 |
12 | class Image(object):
13 | def __init__(self, url, p, page, cw):
14 | self.cw = cw
15 | ext = get_ext(url)
16 | self.filename = '{:04}{}'.format(p, ext)
17 | if page.title is not None:
18 | self.filename = '{}/{}'.format(page.title, self.filename)
19 | self._url = url
20 | self.url = LazyUrl(page.url, self.get, self)
21 |
22 | def get(self, _):
23 | return self._url#'tmp://' + clf2.download(self._url, cw=self.cw)
24 |
25 |
26 | class Page(object):
27 | def __init__(self, title, url, soup=None):
28 | self.title = clean_title(title)
29 | self.url = url
30 | self.soup = soup
31 |
32 |
33 |
34 |
35 | @Downloader.register
36 | class Downloader_mrm(Downloader):
37 | type = 'mrm'
38 | URLS = ['myreadingmanga.info']
39 | _soup = None
40 | MAX_CORE = 4
41 | display_name = 'MyReadingManga'
42 |
43 | def init(self):
44 | self.session = get_session(self.url, self.cw)
45 |
46 | @classmethod
47 | def fix_url(cls, url):
48 | return re.find('https?://myreadingmanga.info/[^/]+', url, err='err')
49 |
50 | @property
51 | def soup(self):
52 | if self._soup is None:
53 | for try_ in range(8):
54 | try:
55 | html = read_html(self.url, session=self.session, cw=self.cw)
56 | break
57 | except Exception as e:
58 | e_ = e
59 | self.print_(e)
60 | else:
61 | raise e_
62 | self._soup = Soup(html)
63 | return self._soup
64 |
65 | @property
66 | def name(self):
67 | title = get_title(self.soup)
68 | return title
69 |
70 | def read(self):
71 | self.title = '읽는 중... {}'.format(self.name)
72 |
73 | imgs = get_imgs(self.url, self.soup, self.session, self.cw)
74 |
75 | for img in imgs:
76 | self.urls.append(img.url)
77 |
78 | self.title = self.name
79 |
80 |
81 | def get_title(soup):
82 | title = soup.find('h1', class_='entry-title').text.strip()
83 | title = fix_title(title)
84 | title = clean_title(title)
85 | return title
86 |
87 |
88 | def get_imgs(url, soup=None, session=None, cw=None):
89 | if soup is None:
90 | html = read_html(url, session=session, cw=cw)
91 | soup = Soup(html)
92 |
93 | title = get_title(soup)
94 |
95 | pagination = soup.find('div', class_='pagination')
96 |
97 | if pagination is None:
98 | page = Page(None, url, soup)
99 | imgs = get_imgs_page(page, session=session, cw=cw)
100 | else:
101 | pages = get_pages(url, soup, session=session)
102 | imgs = []
103 | for i, page in enumerate(pages):
104 | s = '{} {} / {} ({} / {})'.format(tr_('읽는 중...'), title, page.title, i+1, len(pages))
105 |
106 | if cw:
107 | if not cw.alive:
108 | return
109 | cw.setTitle(s)
110 | else:
111 | print(s)
112 |
113 | imgs += get_imgs_page(page, session=session, cw=cw)
114 |
115 | if not imgs:
116 | raise Exception('no imgs')
117 |
118 | return imgs
119 |
120 |
121 | def get_pages(url, soup=None, session=None):
122 | if soup is None:
123 | html = read_html(url, session=session, cw=None)
124 | soup = Soup(html)
125 | pagination = soup.find('div', class_='pagination')
126 |
127 | pages = []
128 | hrefs = set()
129 | for a in pagination.findAll('a'):
130 | href = a.attrs.get('href', '')
131 | href = urljoin(url, href)
132 | if not href.startswith(url):
133 | print('not match', href)
134 | continue
135 | while href.endswith('/'):
136 | href = href[:-1]
137 | if href in hrefs:
138 | print('duplicate', href)
139 | continue
140 | hrefs.add(href)
141 | text = a.text.strip()
142 | page = Page(text, href)
143 | pages.append(page)
144 |
145 | if url not in hrefs:
146 | page = Page('1', url, soup)
147 | pages.insert(0, page)
148 |
149 | return pages
150 |
151 |
152 | @try_n(4)
153 | def get_imgs_page(page, session=None, cw=None):
154 | url = page.url
155 | soup = page.soup
156 | if soup is None:
157 | html = read_html(url, session=session, cw=None)
158 | soup = Soup(html)
159 | page.soup = soup
160 |
161 | view = soup.find('div', class_='entry-content')
162 |
163 | imgs = []
164 | for img in view.findAll('img'):
165 | img = img.attrs.get('data-lazy-src') or img.attrs.get('data-src')
166 | if img is None:
167 | continue
168 | img = urljoin(url, img)
169 | img = Image(img, len(imgs), page, cw)
170 | imgs.append(img)
171 | print(page.title, len(imgs), page.url)
172 |
173 | return imgs
174 |
175 |
176 | def fix_title(title):
177 | title = re.sub(r'\(?[^()]*?c\.[^() ]+\)?', '', title)
178 | while ' ' in title:
179 | title = title.replace(' ', ' ')
180 | return title
181 |
182 |
183 | def read_html(url, session, cw):
184 | ## html = downloader.read_html(url, session=session)
185 | ## soup = Soup(html)
186 | ##
187 | ## cf = soup.find('div', class_='cf-browser-verification')
188 | ## if cf is None:
189 | ## return html
190 |
191 | r = clf2.solve(url, cw=cw, session=session)
192 |
193 | return r['html']
194 |
195 |
196 | @try_n(4)
197 | def get_session(url, cw=None):
198 | print_ = get_print(cw)
199 | ## html = downloader.read_html(url)
200 | ## soup = Soup(html)
201 | ##
202 | ## cf = soup.find('div', class_='cf-browser-verification')
203 | ## if cf is None:
204 | ## print_('no cf protection')
205 | ## return None
206 |
207 | print_('cf protection')
208 | r = clf2.solve(url, cw=cw)
209 | session = r['session']
210 |
211 | return session
212 |
213 |
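A quick illustration of fix_title() above, which strips chapter markers such as "(c.95)" before the title is cleaned. The sample title is made up.

import re

def fix_title(title):
    # Remove tokens like "(c.95)" and collapse repeated spaces.
    title = re.sub(r'\(?[^()]*?c\.[^() ]+\)?', '', title)
    while '  ' in title:
        title = title.replace('  ', ' ')
    return title

print(repr(fix_title('[Author] Some Title (c.101)')))  # '[Author] Some Title '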
--------------------------------------------------------------------------------
/src/extractor/kakaopage_downloader.py:
--------------------------------------------------------------------------------
1 | import downloader
2 | import ree as re
3 | from utils import Session, LazyUrl, Soup, Downloader, try_n, get_print, clean_title, print_error, urljoin, get_imgs_already
4 | from time import sleep
5 | from translator import tr_
6 | import page_selector
7 | import json
8 | import clf2
9 |
10 |
11 | class Page(object):
12 |
13 | def __init__(self, id_, title):
14 | self.id_ = id_
15 | self.title = title
16 | self.url = 'https://page.kakao.com/viewer?productId={}'.format(id_)
17 |
18 |
19 | class Image(object):
20 |
21 | def __init__(self, url, page, p):
22 | self.url = LazyUrl('https://page.kakao.com/', lambda _: url, self)
23 | ext = '.jpg'
24 | self.filename = '{}/{:04}{}'.format(clean_title(page.title), p, ext)
25 |
26 |
27 | @Downloader.register
28 | class Downloader_kakaopage(Downloader):
29 | type = 'kakaopage'
30 | URLS = ['page.kakao.com/home']
31 | MAX_CORE = 4
32 | MAX_SPEED = 4.0
33 | display_name = 'KakaoPage'
34 |
35 | def init(self):
36 | self.session = Session()
37 |
38 | @classmethod
39 | def fix_url(cls, url):
40 | id = re.find('/home/.+?/([0-9]+)', url)
41 | if id is not None:
42 | url = id
43 | if url.isdecimal():
44 | url = 'https://page.kakao.com/home?seriesId={}'.format(url)
45 | return url
46 |
47 | def read(self):
48 | info = get_info(self.url, self.session, cw=self.cw)
49 |
50 | for img in info['imgs']:
51 | if isinstance(img, Image):
52 | img = img.url
53 | self.urls.append(img)
54 |
55 | self.artist = info['artist']
56 |
57 | self.title = info['title']
58 |
59 |
60 |
61 | def get_id(url):
62 | id_ = re.find('seriesId=([0-9]+)', url, err='No seriesId')
63 | return id_
64 |
65 |
66 |
67 | def get_pages(url, session):
68 | id_ = get_id(url)
69 |
70 | pages = []
71 | ids = set()
72 | for p in range(500): #2966
73 | url_api = 'https://api2-page.kakao.com/api/v5/store/singles'
74 | data = {
75 | 'seriesid': id_,
76 | 'page': str(p),
77 | 'direction': 'asc',
78 | 'page_size': '20',
79 | 'without_hidden': 'true',
80 | }
81 | r = session.post(url_api, data=data, headers={'Referer': url})
82 | print(p, r)
83 | data = r.json()
84 |
85 | singles = data['singles']
86 | if not singles:
87 | print('no singles')
88 | break
89 |
90 | for single in singles:
91 | title_page = single['title']
92 | id_page = single['id']
93 | if id_page in ids:
94 | print('dup id')
95 | continue
96 | ids.add(id_page)
97 | page = Page(id_page, title_page)
98 | pages.append(page)
99 | sleep(.5)
100 | return pages
101 |
102 |
103 | def read_html(url, session):
104 | res = clf2.solve(url, session=session)
105 | return res['html']
106 |
107 |
108 | @try_n(2)
109 | def get_imgs_page(page, session):
110 | html = read_html(page.url, session=session)
111 | did = re.find('"did" *: *"(.+?)"', html, err='no did')
112 | url_api = 'https://api2-page.kakao.com/api/v1/inven/get_download_data/web'
113 | data = {
114 | 'productId': page.id_,
115 | 'device_mgr_uid': 'Windows - Chrome',
116 | 'device_model': 'Windows - Chrome',
117 | 'deviceId': did,
118 | }
119 | print(data)
120 | r = session.post(url_api, data=data, headers={'Referer': page.url})
121 | data = r.json()
122 | if data['result_code']:
123 | raise Exception(data['message'])
124 | imgs = []
125 | for file in data['downloadData']['members']['files']:
126 | url = file['secureUrl']
127 | url = urljoin('https://page-edge-jz.kakao.com/sdownload/resource/', url)
128 | img = Image(url, page, len(imgs))
129 | imgs.append(img)
130 | return imgs
131 |
132 |
133 | def get_info(url, session, cw=None):
134 | print_ = get_print(cw)
135 | pages = get_pages(url, session)
136 | pages = page_selector.filter(pages, cw)
137 | if not pages:
138 | raise Exception('no pages')
139 |
140 | info = {}
141 |
142 | html = read_html(url, session=session)
143 | soup = Soup(html)
144 |
145 | __NEXT_DATA__ = soup.find('script', id='__NEXT_DATA__')
146 | if __NEXT_DATA__:
147 | data = json.loads(__NEXT_DATA__.string)
148 | tid = data['props']['initialState']['common']['constant']['tid']
149 | print_('tid: {}'.format(tid))
150 | session.cookies['_kptid'] = tid
151 | html = read_html(url, session=session)
152 | soup = Soup(html)
153 |
154 | title = soup.find('h2').text.strip()
155 | artist = soup.find('meta', {'name': 'author'})['content']
156 | for x in [' ,', ', ']:
157 | while x in artist:
158 | artist = artist.replace(x, ',')
159 | artist = artist.replace(',', ', ')
160 | info['artist'] = artist
161 | info['title_raw'] = title
162 | info['title'] = clean_title('[{}] {}'.format(artist, title))
163 |
164 | imgs = []
165 |
166 | for i, page in enumerate(pages):
167 | if cw is not None:
168 | if not cw.alive:
169 | return
170 | cw.setTitle('{} {} / {} ({} / {})'.format(tr_('읽는 중...'), info['title'], page.title, i + 1, len(pages)))
171 |
172 | #3463
173 | imgs_already = get_imgs_already('kakaopage', info['title'], page, cw)
174 | if imgs_already:
175 | imgs += imgs_already
176 | continue
177 |
178 | try:
179 | _imgs = get_imgs_page(page, session)
180 | e_msg = None
181 | except Exception as e:
182 | _imgs = []
183 | e_msg = print_error(e)[0]
184 | print_('{} {}'.format(page.title, len(_imgs)))
185 | if e_msg:
186 | print_(e_msg)
187 |
188 | imgs += _imgs
189 | sleep(.2)
190 |
191 | if not imgs:
192 | raise Exception('no imgs')
193 |
194 | info['imgs'] = imgs
195 |
196 | return info
197 |
198 |
199 | @page_selector.register('kakaopage')
200 | @try_n(4)
201 | def f(url):
202 | if 'seriesId=' not in url:
203 | raise Exception(tr_('목록 주소를 입력해주세요'))
204 | pages = get_pages(url, Session())
205 | return pages
206 |
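A standalone sketch of the chapter-listing call used by get_pages() above: one POST per page to the singles endpoint, 20 entries at a time. requests is assumed here for illustration and the seriesId is a placeholder; the real extractor additionally paginates, deduplicates ids and sleeps between calls.

import requests  # assumed for this sketch; the extractor uses its own Session wrapper

def list_singles(series_id, page=0):
    url_api = 'https://api2-page.kakao.com/api/v5/store/singles'
    data = {
        'seriesid': str(series_id),
        'page': str(page),
        'direction': 'asc',
        'page_size': '20',
        'without_hidden': 'true',
    }
    referer = 'https://page.kakao.com/home?seriesId={}'.format(series_id)
    r = requests.post(url_api, data=data, headers={'Referer': referer})
    return r.json().get('singles', [])

## for single in list_singles(12345678):  # placeholder seriesId
##     print(single['id'], single['title'])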
--------------------------------------------------------------------------------
/src/extractor/lhscan_downloader.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | import downloader
3 | from utils import Soup, urljoin, LazyUrl, Downloader, try_n, Session, clean_title, get_print
4 | import os
5 | from translator import tr_
6 | import page_selector
7 | import clf2
8 | import utils
9 | import base64
10 | import ree as re
11 | import errors
12 | ##from image_reader import QPixmap
13 |
14 |
15 | class Image(object):
16 | def __init__(self, url, page, p):
17 | self._url = url
18 | self.url = LazyUrl(page.url, self.get, self)#, pp=self.pp)
19 | ext = os.path.splitext(url)[1]
20 | if ext.lower()[1:] not in ['jpg', 'jpeg', 'bmp', 'png', 'gif', 'webm', 'webp']:
21 | ext = '.jpg'
22 | self.filename = u'{}/{:04}{}'.format(page.title, p, ext)
23 |
24 | def get(self, _):
25 | return self._url
26 |
27 | ## def pp(self, filename):
28 | ## pixmap = QPixmap(filename)
29 | ## pixmap.save(filename)
30 | ## return filename
31 |
32 |
33 | class Page(object):
34 | def __init__(self, title, url):
35 | self.title = clean_title(title)
36 | self.url = url
37 |
38 |
39 | def get_soup_session(url, cw=None):
40 | print_ = get_print(cw)
41 | session = Session()
42 | res = clf2.solve(url, session=session, cw=cw)
43 | print_('{} -> {}'.format(url, res['url']))
44 | if res['url'].rstrip('/') == 'https://welovemanga.one':
45 | raise errors.LoginRequired()
46 | return Soup(res['html']), session
47 |
48 |
49 | @Downloader.register
50 | class Downloader_lhscan(Downloader):
51 | type = 'lhscan'
52 | URLS = [
53 | #'lhscan.net', 'loveheaven.net',
54 | 'lovehug.net', 'welovemanga.',
55 | ]
56 | MAX_CORE = 16
57 | display_name = 'LHScan'
58 | _soup = None
59 |
60 | def init(self):
61 | self._soup, self.session = get_soup_session(self.url, self.cw)
62 | if not self.soup.find('ul', class_='manga-info'):
63 | raise errors.Invalid(u'{}: {}'.format(tr_(u'목록 주소를 입력해주세요'), self.url))
64 |
65 | @classmethod
66 | def fix_url(cls, url):
67 | url = url.replace('lovehug.net', 'welovemanga.one')
68 | url = url.replace('welovemanga.net', 'welovemanga.one') #4298
69 | return url
70 |
71 | @property
72 | def soup(self):
73 | if self._soup is None:
74 | for try_ in range(8):
75 | try:
76 | html = downloader.read_html(self.url, session=self.session)
77 | break
78 | except Exception as e:
79 | e_ = e
80 | print(e)
81 | else:
82 | raise e_
83 | self._soup = Soup(html)
84 | return self._soup
85 |
86 | @property
87 | def name(self):
88 | title = self.soup.find('ul', class_='manga-info').find('h3').text
89 | return clean_title(title)
90 |
91 | def read(self):
92 | self.title = tr_(u'읽는 중... {}').format(self.name)
93 |
94 | imgs = get_imgs(self.url, self.name, self.session, self.soup, self.cw)
95 |
96 | for img in imgs:
97 | self.urls.append(img.url)
98 |
99 | self.title = self.name
100 |
101 |
102 | @try_n(8)
103 | def get_imgs_page(page, referer, session, cw=None):
104 | print_ = get_print(cw)
105 | print_(page.title)
106 |
107 | html = downloader.read_html(page.url, referer, session=session)
108 | if clf2._is_captcha(Soup(html)): #4124
109 | html = clf2.solve(page.url, session, cw)['html']
110 | if not html:
111 | raise Exception('empty html')
112 |     html = html.replace('{}='.format(re.find(r"\$\(this\)\.attr\('(.+?)'", html, err='no cn')), 'data-src=')  # normalize the obfuscated lazy-load attribute name to data-src
113 | soup = Soup(html)
114 |
115 | view = soup.find('div', class_='chapter-content')
116 |
117 | if not view:
118 | raise Exception('no chapter-content')
119 |
120 | imgs = []
121 | for img in soup.findAll('img', class_='chapter-img'):
122 | src = img.get('data-pagespeed-lazy-src') or img.get('data-src') or img.get('data-srcset') or img.get('data-aload') or img['src']
123 | try:
124 | src = base64.b64decode(src).strip().decode('utf8')
125 | except:
126 | pass
127 | src0 = src
128 | src = src.replace('welovemanga.one', '1')#
129 | src = urljoin(page.url, src).strip()
130 | if 'Credit_LHScan_' in src or '5e1ad960d67b2_5e1ad962338c7' in src:
131 | continue
132 | if 'fe132b3d32acc39f5adcea9075bedad4LoveHeaven' in src:
133 | continue
134 | if 'LoveHug_600cfd96e98ff.jpg' in src:
135 | continue
136 | if 'image_5f0ecf23aed2e.png' in src:
137 | continue
138 | if '/uploads/lazy_loading.gif' in src:
139 | continue
140 | if not imgs:
141 | print_(src0)
142 | img = Image(src, page, len(imgs))
143 | imgs.append(img)
144 |
145 | return imgs
146 |
147 |
148 | def get_pages(url, session, soup=None, cw=None):
149 | if soup is None:
150 | html = downloader.read_html(url, session=session)
151 | soup = Soup(html)
152 |
153 | tab = soup.find('ul', class_='list-chapters')
154 |
155 | pages = []
156 | for li in tab.findAll('li'):
157 | text = li.find('div', class_='chapter-name').text.strip()
158 | href = li.parent['href']
159 | href = urljoin(url, href)
160 | page = Page(text, href)
161 | pages.append(page)
162 |
163 | if not pages:
164 | raise Exception('no pages')
165 |
166 | return pages[::-1]
167 |
168 |
169 | @page_selector.register('lhscan')
170 | @try_n(4)
171 | def f(url):
172 | soup, session = get_soup_session(url)
173 | pages = get_pages(url, session, soup=soup)
174 | return pages
175 |
176 |
177 | @try_n(2)
178 | def get_imgs(url, title, session, soup=None, cw=None):
179 | if soup is None:
180 | html = downloader.read_html(url, session=session)
181 | soup = Soup(html)
182 |
183 | pages = get_pages(url, session, soup, cw)
184 | pages = page_selector.filter(pages, cw)
185 |
186 | imgs = []
187 | for i, page in enumerate(pages):
188 | imgs += get_imgs_page(page, url, session, cw)
189 | s = u'{} {} / {} ({} / {})'.format(tr_(u'읽는 중...'), title, page.title, i+1, len(pages))
190 | if cw is not None:
191 | if not cw.alive:
192 | return
193 | cw.setTitle(s)
194 | else:
195 | print(s)
196 |
197 | return imgs
198 |
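A small illustration of the image-source handling in get_imgs_page() above: sources may be base64-encoded, so decoding is attempted and the raw value is kept when it is not valid base64. The example values are made up.

import base64

def decode_src(src):
    # Some chapter pages serve base64-encoded image URLs; plain paths are left as-is.
    try:
        return base64.b64decode(src).strip().decode('utf8')
    except Exception:
        return src

print(decode_src('aHR0cHM6Ly9leGFtcGxlLmNvbS8wMDEuanBn'))  # https://example.com/001.jpg
print(decode_src('/uploads/001.jpg'))                      # left unchanged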
--------------------------------------------------------------------------------