├── docs
│   └── .gitkeep
├── test
│   ├── __init__.py
│   ├── bootstrap.py
│   ├── testserver
│   │   ├── __init__.py
│   │   ├── static
│   │   │   └── test.png
│   │   ├── templates
│   │   │   ├── index.html
│   │   │   └── chapter.html
│   │   └── server.py
│   ├── test_utils.py
│   └── test_scheduler.py
├── dcdownloader
│   ├── __init__.py
│   ├── parser
│   │   ├── __init__.py
│   │   ├── SimpleParser.py
│   │   ├── EhentaiParser.py
│   │   ├── DmzjParser.py
│   │   └── BaseParser.py
│   ├── parser_selector.py
│   ├── base_logger.py
│   ├── main.py
│   ├── title.py
│   ├── version.py
│   ├── arg_parse.py
│   ├── aiohttp_proxy_connector.py
│   ├── config.py
│   ├── utils.py
│   └── scheduler.py
├── _config.yml
├── devscript
│   ├── run_test.sh
│   ├── build.sh
│   └── build_pyinstaller_windows.sh
├── requirements.txt
├── setup.py
├── LICENSE
├── .travis.yml
├── .gitignore
└── README.md
/docs/.gitkeep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dcdownloader/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/dcdownloader/parser/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-hacker
--------------------------------------------------------------------------------
/test/bootstrap.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | sys.path.append(sys.path[0] + '/../')
4 | import dcdownloader
--------------------------------------------------------------------------------
/test/testserver/__init__.py:
--------------------------------------------------------------------------------
1 | import test.testserver.server
2 |
3 | if __name__ == '__main__':
4 |     test.testserver.server.launch()
--------------------------------------------------------------------------------
/test/testserver/static/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dev-techmoe/python-dcdownloader/HEAD/test/testserver/static/test.png
--------------------------------------------------------------------------------
/devscript/run_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "Launch Web Test Server"
4 | pip install flask
5 | python3 -m test.testserver.server &
6 |
7 | pip install .
8 | pytest test
--------------------------------------------------------------------------------
/test/testserver/templates/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 | </head>
 5 | <body>
 6 | <h1>test_comic</h1>
 7 | <ul class="chapter_list">
 8 | {% for i in chapter_list %}
 9 | <li><a href="/{{ i }}">{{ i }}</a></li>
10 | {% endfor %}
11 | </ul>
12 | </body>
13 | </html>
--------------------------------------------------------------------------------
/test/testserver/templates/chapter.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 | </head>
 5 | <body>
 6 | <h1>{{ title }}</h1>
 7 | <ul>
 8 | {% for i in images.keys() %}
 9 | <li><img alt="{{ i }}" src="{{ images[i] }}"></li>
10 | {% endfor %}
11 | </ul>
12 | </body>
13 | </html>
--------------------------------------------------------------------------------
/dcdownloader/parser_selector.py:
--------------------------------------------------------------------------------
 1 | import re
 2 |
 3 | regular = {
 4 |     'manhua.dmzj.com': 'DmzjParser',
 5 |     'e-hentai.org': 'EhentaiParser'
 6 | }
 7 |
 8 | def get_parser(url):
 9 |     for (k, v) in regular.items():
10 |         if re.search(k, url):
11 |             module = __import__('dcdownloader.parser.' + v, fromlist=[v])
12 |
13 |             return getattr(module, v)()
--------------------------------------------------------------------------------
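A quick usage sketch of `get_parser` (the URL below is illustrative): the URL is matched against each pattern key in `regular`, and the first matching parser class is imported and instantiated; unmatched URLs fall through and return `None`.

```python
from dcdownloader import parser_selector

# Matches the 'manhua.dmzj.com' pattern, so a DmzjParser instance is returned.
parser = parser_selector.get_parser('https://manhua.dmzj.com/some_comic/')
print(type(parser).__name__)  # -> 'DmzjParser'
```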
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiofiles==0.4.0
2 | aiohttp==3.4.4
3 | astroid==2.0.4
4 | async-timeout==3.0.1
5 | attrs==18.2.0
6 | chardet==3.0.4
7 | colorlog==3.1.4
8 | cssselect==1.0.3
9 | filetype==1.0.1
10 | idna==2.7
11 | isort==4.3.4
12 | lazy-object-proxy==1.3.1
13 | lxml==4.2.5
14 | mccabe==0.6.1
15 | multidict==4.4.2
16 | pylint==2.1.1
17 | pyquery==1.4.0
18 | PyYAML==5.1
19 | six==1.11.0
20 | wrapt==1.10.11
21 | yarl==1.2.6
22 |
--------------------------------------------------------------------------------
/dcdownloader/base_logger.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from colorlog import ColoredFormatter
 3 |
 4 | logging_level = logging.INFO
 5 |
 6 | def getLogger(name=__name__):
 7 |     logger_base = logging.getLogger(name)
 8 |     logger_base.setLevel(logging_level)
 9 |     stream_handler = logging.StreamHandler()
10 |
11 |     color_formatter = ColoredFormatter('%(log_color)s[%(levelname)-8s] %(message)s')
12 |
13 |     stream_handler.setFormatter(color_formatter)
14 |
15 |     logger_base.addHandler(stream_handler)
16 |
17 |     return logger_base
--------------------------------------------------------------------------------
/test/test_utils.py:
--------------------------------------------------------------------------------
 1 | from test.bootstrap import dcdownloader
 2 | from dcdownloader import utils
 3 | import os
 4 |
 5 | def test_update_window_title():
 6 |     utils.update_window_title(msg='test')
 7 |
 8 | def test_generate_aiohttp_session_config():
 9 |     out = utils.generate_aiohttp_session_config(key='value')
10 |     assert out['key'] == 'value'
11 |     assert out['proxy'] == dcdownloader.config.get('proxy')
12 |
13 | def test_mkdir():
14 |     test_path = '/tmp/abc/def'
15 |     utils.mkdir(test_path)
16 |
17 |     assert os.path.exists(test_path)
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import find_packages, setup
 2 |
 3 | # load requirement list
 4 | with open('requirements.txt') as f:
 5 |     required_modules = f.read().splitlines()
 6 |
 7 | setup(
 8 |     name='DCDownloader',
 9 |     version='1.0',
10 |     description="a downloader that supports many comic sites",
11 |     author='techmoe',
12 |     url='https://github.com/dev-techmoe/python-dcdownloader',
13 |     license='MIT',
14 |     # include expects a list of patterns; a bare string would be split per character
15 |     packages=find_packages(include=['dcdownloader', 'dcdownloader.*']),
16 |     install_requires=required_modules,
17 |     entry_points="""
18 |         [console_scripts]
19 |         dcdownloader = dcdownloader.main:main
20 |     """
21 | )
--------------------------------------------------------------------------------
/dcdownloader/main.py:
--------------------------------------------------------------------------------
 1 | import sys, os
 2 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../'))
 3 |
 4 | from dcdownloader import arg_parse, version
 5 |
 6 | # for unittest
 7 | cmd_args = None
 8 |
 9 | def main():
10 |     args = arg_parse.parser.parse_args(cmd_args)
11 |
12 |     version.show_welcome()
13 |
14 |     from dcdownloader.scheduler import Scheduler
15 |     from dcdownloader import parser_selector
16 |     s = Scheduler(url=args.url, output_path=args.output_path, parser=parser_selector.get_parser(args.url),
17 |                   fetch_only=args.fetch_only, proxy=args.proxy, verify_ssl=args.verify_ssl)
18 |     s.run()
19 |
20 |
21 | if __name__ == '__main__':
22 |     main()
23 |
--------------------------------------------------------------------------------
/dcdownloader/title.py:
--------------------------------------------------------------------------------
 1 | import sys, platform
 2 |
 3 | def for_linux(title):
 4 |     sys.stdout.write("\x1b]2;%s\x07" % title)
 5 |     sys.stdout.flush()
 6 |
 7 | def for_windows(title):
 8 |     import ctypes
 9 |     ctypes.windll.kernel32.SetConsoleTitleW(str(title))
10 |
11 | def check_platform():
12 |     platform_flag = platform.system()
13 |     if platform_flag == 'Windows':
14 |         return False
15 |     else:
16 |         return True
17 |
18 | def update(title):
19 |     if check_platform():
20 |         for_linux(title)
21 |     else:
22 |         for_windows(title)
23 |
24 | if __name__ == '__main__':
25 |     while True:
26 |         import time
27 |         update('test - %s' % time.time())
28 |         time.sleep(1)
29 |
--------------------------------------------------------------------------------
/devscript/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | DOCKER_IMAGE="cdrx/pyinstaller-windows:python3-32bit"
4 |
5 | chmod +x ./devscript/*.sh
6 |
7 | echo "TRAVIS_BUILD_NUMBER=${TRAVIS_BUILD_NUMBER}"
8 | echo "BUILD_APP_ENTRY=${BUILD_APP_ENTRY}"
9 | echo "BUILD_OUTPUT_FILE_NAME=${BUILD_OUTPUT_FILE_NAME}"
10 | echo "DOCKER_IMAGE=${DOCKER_IMAGE}"
11 |
12 | # pull the docker images and create the container for build
13 | docker pull ${DOCKER_IMAGE}
14 | docker run -v "$(pwd):/src/" \
15 |     -e "TRAVIS_BUILD_NUMBER=${TRAVIS_BUILD_NUMBER}" \
16 |     -e "BUILD_APP_ENTRY=${BUILD_APP_ENTRY}" \
17 |     -e "BUILD_OUTPUT_FILE_NAME=${BUILD_OUTPUT_FILE_NAME}" \
18 |     ${DOCKER_IMAGE} "chmod +x devscript/build_pyinstaller_windows.sh && devscript/build_pyinstaller_windows.sh"
--------------------------------------------------------------------------------
/dcdownloader/version.py:
--------------------------------------------------------------------------------
1 |
2 | version = '2.0'
3 |
4 |
5 | ascii_art = r'''
6 | _____ _____ _____ _ _
7 | | __ \ / ____| __ \ | | | |
8 | | | | | | | | | | _____ ___ __ | | ___ __ _ __| | ___ _ __
9 | | | | | | | | | |/ _ \ \ /\ / / '_ \| |/ _ \ / _` |/ _` |/ _ \ '__|
10 | | |__| | |____| |__| | (_) \ V V /| | | | | (_) | (_| | (_| | __/ |
11 | |_____/ \_____|_____/ \___/ \_/\_/ |_| |_|_|\___/ \__,_|\__,_|\___|_|
12 | '''
13 |
14 | def show_welcome():
15 |     print(ascii_art)
16 |     print('DCDownloader version {version}'.format(version=version))
17 |     print('Homepage: https://github.com/dev-techmoe/python-dcdownloader')
18 |     print('Author: techmoe https://lolicookie.com')
19 |     print()
--------------------------------------------------------------------------------
/devscript/build_pyinstaller_windows.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
 3 | # NOTICE: this script is only meant to run inside the Docker container created by "cdrx/docker-pyinstaller"
 4 |
 5 | echo "Install dependencies"
 6 | pip install -e .
 7 | echo "Build windows executable files"
 8 |
 9 | # PyInstaller doesn't support wildcards in the `--hidden-import` CLI argument;
10 | # I will try to fix this in a future version
11 |
12 | pyinstaller -F ${BUILD_APP_ENTRY} \
13 |     --distpath pyinstaller/dist \
14 |     --specpath pyinstaller/spec \
15 |     --workpath pyinstaller/build \
16 |     --hidden-import="dcdownloader.parser.DmzjParser" \
17 |     --hidden-import="dcdownloader.parser.EhentaiParser"
18 |
19 | echo "Rename output file"
20 | mv pyinstaller/dist/main.exe pyinstaller/dist/${BUILD_OUTPUT_FILE_NAME}
--------------------------------------------------------------------------------
/dcdownloader/arg_parse.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 |
4 | parser = argparse.ArgumentParser()
5 |
6 | parser.add_argument('url', metavar='URL', help='target URL')
7 | parser.add_argument('output_path', metavar="OUTPUT_PATH", nargs='?', default='.', help='output path of downloaded file (default: current directory)')
8 | parser.add_argument('--proxy', help='HTTP proxy address for connection')
9 | parser.add_argument('--no-verify-ssl', dest='verify_ssl', action='store_false', help='Disable the SSL certificate verifying when connecting')
10 | parser.add_argument('-v', '--version', action='version', help='show version', version='dcdownloader 1.0')
11 | parser.add_argument('-V', '--verbose', action='store_true', help='show more running detail')
12 | parser.add_argument('--fetch-only', dest='fetch_only', action='store_true', help='Ignore all download process (only fetch chapter and image urls)')
--------------------------------------------------------------------------------
/dcdownloader/parser/SimpleParser.py:
--------------------------------------------------------------------------------
 1 | from dcdownloader.parser.BaseParser import BaseParser
 2 | from pyquery import PyQuery as pq
 3 |
 4 | class SimpleParser(BaseParser):
 5 |
 6 |     async def parse_info(self, data):
 7 |         doc = pq(data)
 8 |         name = doc('h1').text()
 9 |
10 |         output = {}
11 |         if name:
12 |             output.setdefault('name', name)
13 |
14 |         return output
15 |
16 |     async def parse_chapter(self, response_data):
17 |         doc = pq(response_data)
18 |         data = {}
19 |         for i in doc('.chapter_list a'):
20 |             data.setdefault(i.text, pq(i).attr('href'))
21 |         return (data,)
22 |
23 |     async def parse_image_list(self, response_data):
24 |         doc = pq(response_data)
25 |         data = {}
26 |
27 |         for i in doc('ul li img'):
28 |             data.setdefault(pq(i).attr('alt'), pq(i).attr('src'))
29 |
30 |         return data
31 |
32 |     async def parse_downloaded_data(self, data):
33 |         pass
--------------------------------------------------------------------------------
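To see the parser contract in action, here is a minimal sketch that feeds SimpleParser a fragment of HTML shaped like the test server's `index.html`:

```python
import asyncio
from dcdownloader.parser.SimpleParser import SimpleParser

html = ('<h1>test_comic</h1>'
        '<ul class="chapter_list">'
        '<li><a href="/chapter_1">chapter_1</a></li>'
        '</ul>')

parser = SimpleParser()
loop = asyncio.get_event_loop()
print(loop.run_until_complete(parser.parse_info(html)))     # {'name': 'test_comic'}
print(loop.run_until_complete(parser.parse_chapter(html)))  # ({'chapter_1': '/chapter_1'},)
```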
/dcdownloader/aiohttp_proxy_connector.py:
--------------------------------------------------------------------------------
 1 | import aiohttp
 2 | from yarl import URL
 3 |
 4 |
 5 | class ProxyConnector(aiohttp.connector.TCPConnector):
 6 |
 7 |     def __init__(self, *args, **kwargs):
 8 |         if kwargs.get('proxy'):
 9 |             self.proxy = kwargs.get('proxy')
10 |
11 |         kwargs.pop('proxy', None)
12 |
13 |         super().__init__(*args, **kwargs)
14 |
15 |     async def _create_connection(self, req, traces=None, timeout=None):
16 |         # fall back to the connector-level proxy when the request has none
17 |         if req.proxy is None and hasattr(self, 'proxy'):
18 |             req.proxy = URL(self.proxy)
19 |
20 |         if req.proxy:
21 |             _, proto = await super()._create_proxy_connection(
22 |                 req,
23 |                 traces=traces,
24 |                 timeout=timeout
25 |             )
26 |         else:
27 |             _, proto = await super()._create_direct_connection(
28 |                 req,
29 |                 traces=traces,
30 |                 timeout=timeout
31 |             )
32 |
33 |         return proto
34 |
--------------------------------------------------------------------------------
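A minimal usage sketch of the connector (the proxy address is illustrative): it injects the proxy at the connection level so callers do not have to pass `proxy=` on every request, which is how `Scheduler` builds its session.

```python
import asyncio
import aiohttp
from dcdownloader.aiohttp_proxy_connector import ProxyConnector

async def demo():
    # Every request on this session is routed through the proxy.
    session = aiohttp.ClientSession(
        connector=ProxyConnector(proxy='http://127.0.0.1:1081'))
    async with session.get('https://example.com/') as resp:
        print(resp.status)
    await session.close()

asyncio.get_event_loop().run_until_complete(demo())
```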
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 techmoe
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/dcdownloader/config.py:
--------------------------------------------------------------------------------
 1 | import yaml
 2 |
 3 | # default configuration values
 4 | default_config = {
 5 |     'debug_mode': False,
 6 |     'save_manifest_file': True,
 7 |     'output_path': './output',
 8 |     'proxy': None,
 9 |     'downloader_max_connection_number': 5,
10 |     'downloader_max_retry_number': 5,
11 |     'friendly_console_output': False,
12 |     'header': {
13 |         'referer': 'https://manhua.dmzj.com/',
14 |         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'
15 |     },
16 | }
17 |
18 | config = {}
19 |
20 | def load(text):
21 |     global config
22 |     config = yaml.safe_load(text)
23 |
24 | def load_file(file_path):
25 |     with open(file_path) as f:
26 |         load(f.read())
27 |
28 | def get(key, fallback=False):
29 |     keys = key.split('.')
30 |
31 |     if fallback:
32 |         target = default_config
33 |     else:
34 |         target = config
35 |
36 |     for k in keys:
37 |         target = target.get(k)
38 |         if target is None:
39 |             break
40 |
41 |     if target is None and not fallback:
42 |         target = get(key, fallback=True)
43 |     return target
--------------------------------------------------------------------------------
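A short sketch of the dotted-key lookup with fallback (the YAML is illustrative): keys missing from the loaded config fall back to `default_config`.

```python
from dcdownloader import config

config.load('proxy: http://127.0.0.1:1081')

print(config.get('proxy'))           # -> 'http://127.0.0.1:1081' (from the loaded config)
print(config.get('header.referer'))  # -> 'https://manhua.dmzj.com/' (from default_config)
```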
/dcdownloader/parser/EhentaiParser.py:
--------------------------------------------------------------------------------
 1 | from pyquery import PyQuery as pq
 2 |
 3 | from abc import ABCMeta, abstractmethod
 4 |
 5 | class EhentaiParser(metaclass=ABCMeta):
 6 |     base_url = ''
 7 |     chapter_mode = False
 8 |
 9 |     async def parse_info(self, data):
10 |         doc = pq(data)
11 |         name = doc('#gn').text()
12 |
13 |         return {'name': name}
14 |
15 |     async def parse_chapter(self, data):
16 |         doc = pq(data)
17 |         _img_list = doc('.gdtm a')
18 |
19 |         img_list = []
20 |
21 |         for img in _img_list:
22 |             url = pq(img).attr('href')
23 |             img_list.append(url)
24 |
25 |         next_page = doc('.ptds + td a')
26 |         next_page_url = None
27 |         if next_page is not None:
28 |             next_page_url = pq(next_page).attr('href')
29 |
30 |         return (img_list, next_page_url)
31 |
32 |     async def parse_image_list(self, data):
33 |         doc = pq(data)
34 |         img_name = pq(doc('#i2 div')[2]).text().split('.')[0]
35 |         img_url = doc('#i3 img').attr('src')
36 |
37 |         return {
38 |             img_name: img_url
39 |         }
40 |         # return {
41 |         #     'file_name': 'url'
42 |         # }
--------------------------------------------------------------------------------
/test/test_scheduler.py:
--------------------------------------------------------------------------------
 1 | from test.bootstrap import dcdownloader
 2 | from dcdownloader.scheduler import Scheduler
 3 | from test.testserver.server import book
 4 |
 5 | class TestScheduler(object):
 6 |
 7 |     test_server_url = 'http://localhost:32321'
 8 |     s = Scheduler(test_server_url, output_path='/tmp')
 9 |
10 |     def test___get_chapter_list(self):
11 |         correct_result = {}
12 |         for k in book.keys():
13 |             correct_result.setdefault(k, '/' + k)
14 |
15 |         result = self.s._get_chapter_list(self.test_server_url)
16 |
17 |         assert result == correct_result
18 |
19 |     def test__get_image_url_list(self):
20 |         target_url_list = {}
21 |         for k in book.keys():
22 |             target_url_list.setdefault(k, self.test_server_url + '/' + k)
23 |
24 |         result = self.s._get_image_url_list(target_url_list)
25 |
26 |         assert result == book
27 |
28 |     def test__start_download(self):
29 |         test_data = book
30 |         for (a, b) in test_data.items():
31 |             for (c, d) in test_data[a].items():
32 |                 test_data[a][c] = self.test_server_url + test_data[a][c]
33 |
34 |         self.s._start_download(test_data, 'test')
35 |
36 |     def test__get_info(self):
37 |         info = self.s._get_info(self.test_server_url)
38 |
39 |         assert info['name'] == 'test_comic'
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - 3.6
 4 | sudo: required
 5 | services:
 6 |   - docker
 7 |
 8 | env:
 9 |   global:
10 |     - BUILD_APP_ENTRY="./dcdownloader/main.py"
11 |     - BUILD_OUTPUT_FILE_NAME="dcdownloader_windows_build_${TRAVIS_BUILD_NUMBER}.exe"
12 | script:
13 |   - chmod +x ./devscript/run_test.sh
14 |   - devscript/run_test.sh
15 |
16 | before_deploy:
17 |   - chmod +x ./devscript/build.sh
18 |   - "./devscript/build.sh"
19 |
20 | deploy:
21 |   provider: releases
22 |   api_key:
23 |     secure: rsE8hL+4KQBaxcwBLbF0zMskg1y23Ek89enYUHWg4DhTJVm4aWLkkueRoysUJ8whkwzrxy9Ng/+rF8jQpP9Jn+4Yiu2IyK5UTGYOGOIJBHcMbtFSC461cq8Yp7Hy5NBE4HKzt6q/Ne1AfNOQyCiLD95wH+T46JX9rBWUT/m5y/VYfgOzNV+AkG8brcNLbS4OGZ2drBG1I8js16mFQg8k5VbyapP+wQhDe1rtvf9FAtdAWUL8/IOJUrvUItYr7xsnmmiK4I3D1/nUPuf7Ltqywpe6IOEqZlxGmMtuqhm79dequOtb29OD6AcjgCOLKdjUlQqR/IzPTpkyD0bmDCbQe+d+5ASejQ7gZrCWZ/obC8dJI7r64Z89Y/NriW6l+l+eAg1R++KTsh3jWW4pZwyANDnlJIRszGk2lTosgPqwE3NcFHsl+GclasaL3ADoVZJR/ibWXbewbj28nJr5STINTFKxprYwXCELH7Qyq/Y6UtDiDEAeBJOyJ7IPtLQaHSdSOV+giIsJBLEdcfU+ky+7Gkk6ZosSiNDbcMbTDOxRTLw1XVx1HaJHzT9zD3aKSs/NrwhTLTpJ/qv6UKaZe9F6pOQLY2BDnl4kDOB9Pjs9M+aZUWiWE+grpB7ucQCWvYcJx76Xd3jTcYDDRrKYZlcE0uSVoy7bbJMYRFfInreXmGI=
24 |   file: pyinstaller/dist/${BUILD_OUTPUT_FILE_NAME}
25 |   on:
26 |     repo: dev-techmoe/python-dcdownloader
27 |     tags: true
28 |     all_branches: true
29 |   body: "auto published by CI, no more description now."
30 |   prerelease: true
31 |
--------------------------------------------------------------------------------
/dcdownloader/parser/DmzjParser.py:
--------------------------------------------------------------------------------
 1 | from dcdownloader.parser.BaseParser import BaseParser
 2 | from pyquery import PyQuery as pq
 3 | from dcdownloader import utils
 4 | import json, re, urllib.parse
 5 |
 6 | class DmzjParser(BaseParser):
 7 |     image_base_url = 'https://images.dmzj.com'
 8 |     page_base_url = 'https://manhua.dmzj.com'
 9 |     filename_extension = 'jpg'
10 |     request_header = {
11 |         'referer': 'https://manhua.dmzj.com/',
12 |         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.167 Safari/537.36'
13 |     }
14 |
15 |     async def parse_info(self, data):
16 |         doc = pq(data)
17 |         comic_name = doc('.anim_title_text h1').text()
18 |
19 |         return {
20 |             'name': comic_name
21 |         }
22 |
23 |     async def parse_chapter(self, data):
24 |         doc = pq(data)
25 |         url_list = {}
26 |
27 |         for u in doc('.cartoon_online_border ul li a'):
28 |             url_list.setdefault(pq(u).text(), self.page_base_url + pq(u).attr('href'))
29 |
30 |         return (url_list, )
31 |
32 |     async def parse_image_list(self, data):
33 |         jspacker_string = re.search(r'(eval\(.+\))', data).group()
34 |         jspacker_string = utils.decode_packed_codes(jspacker_string)
35 |
36 |         image_list = re.search(r'(\[.+\])', jspacker_string).group()
37 |         image_list = urllib.parse.unquote(image_list).replace('\\', '')
38 |         image_list = json.loads(image_list)
39 |
40 |         images = {}
41 |
42 |         for k in image_list:
43 |             images.setdefault(k.split('/')[-1].split('.')[0], self.image_base_url + '/' + k)
44 |         return images
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 |
107 | tests/
108 | .vscode/
109 |
110 | *_temp*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DCDownloader
2 |
3 | 
4 | [](https://github.com/dev-techmoe/python-dcdownloader/blob/master/LICENSE)
5 | [](https://travis-ci.org/dev-techmoe/python-dcdownloader)
6 | [](https://github.com/dev-techmoe/python-dcdownloader/stargazers)
7 |
 8 | A batch downloader framework focused on comic sites, image boards, and similar content sites.
 9 |
10 | ## About
11 | This project started as a batch downloader the author wrote for one specific comic site. After requests came in to support more sites, the author reworked the code so that it now exists as a framework. DCDownloader is an asynchronous batch downloader for comic sites, image boards, and similar content sites; it is adapted to different sites by writing custom Parsers.
12 | Three Parsers are currently built in:
13 |
14 | * SimpleParser: an example Parser; refer to its implementation if you want to write your own. It is also used in the unit tests.
15 | * DmzjParser: the dmzj.com comic site (non-original section)
16 | * EhentaiParser: E-Hentai
17 |
18 | ## Installation
19 | ### Windows
20 | [Download the Windows executable](https://github.com/dev-techmoe/python-dcdownloader/releases)
21 |
22 | ### Linux/OSX
23 | Make sure Python (>= 3.4.3) and pip are installed on your machine:
24 | ```bash
25 | $ pip3 install https://github.com/dev-techmoe/python-dcdownloader/archive/master.zip
26 | $ dcdownloader -h
27 | ```
28 |
29 | ## Available commands
30 | ```
31 | usage: dcdownloader [-h] [--proxy PROXY] [--no-verify-ssl] [-v] [-V] [--fetch-only]
32 |                     URL [OUTPUT_PATH]
33 |
34 | positional arguments:
35 |   URL              target URL
36 |   OUTPUT_PATH      output path of downloaded file (default: current directory)
37 |
38 | optional arguments:
39 |   -h, --help       show this help message and exit
40 |   --proxy PROXY    HTTP proxy address for connection
41 |   --no-verify-ssl  Disable the SSL certificate verifying when connecting
42 |   -v, --version    show version
43 |   -V, --verbose    show more running detail
44 |   --fetch-only     Ignore all download process (only fetch chapter and image urls)
45 |
46 | ```
47 |
48 | ## Disclaimer
49 | This project exists mainly as the author's personal practice project; being convenient to use comes second. To avoid causing trouble for target sites, the author has set the default concurrency to a level that should not affect them. The author takes no responsibility for problems arising from your own use, and assumes no responsibility for any downloaded content.
50 |
51 | ## Contributing
52 | Pull requests are welcome, and your help in improving this project is very much appreciated. You can help adapt more sites, but please understand that Parsers for sites hosting original comics are currently **not** wanted.
53 |
54 | ## License
55 | MIT
--------------------------------------------------------------------------------
/dcdownloader/parser/BaseParser.py:
--------------------------------------------------------------------------------
 1 | from abc import ABCMeta, abstractmethod
 2 |
 3 | class BaseParser(metaclass=ABCMeta):
 4 |
 5 |     # filename extension for downloaded files
 6 |     filename_extension = None
 7 |     # request header
 8 |     request_header = None
 9 |
10 |     # if the target website has no chapter list
11 |     # (e.g. Ehentai), set this attribute to False
12 |     chapter_mode = True
13 |
14 |     @abstractmethod
15 |     async def parse_info(self, data):
16 |         """ Get details of the target page (e.g. name, author, etc.)
17 |         Args:
18 |             data: data returned from requesting the specified URL.
19 |
20 |         Returns:
21 |             {
22 |                 'name': '...'
23 |             }
24 |         """
25 |
26 |     @abstractmethod
27 |     async def parse_chapter(self, data):
28 |         """ Parse chapter data from the received data.
29 |         Args:
30 |             data: data returned from requesting the specified URL.
31 |
32 |         Returns:
33 |             if self.chapter_mode == True (default):
34 |                 (
35 |                     {
36 |                         'chapter_name': 'url'
37 |                     },
38 |                     'url_of_next_chapter_page (optional)'
39 |                 )
40 |             else:
41 |                 (
42 |                     (<url>, <url>, <url>, ...),
43 |                     'url_of_next_chapter_page (optional)'
44 |                 )
45 |             The return value must be a tuple.
46 |         """
47 |
48 |     @abstractmethod
49 |     async def parse_image_list(self, data):
50 |         """ Parse image URLs from the received data.
51 |         Args:
52 |             data: data returned from requesting the specified URL.
53 |
54 |         Returns:
55 |             {
56 |                 'file_name': 'url'
57 |             }
58 |         """
59 |
60 |     async def parse_downloaded_data(self, data):
61 |         """ Process the downloaded data. (optional)
62 |         You can do some actions (such as decrypting or unzipping) here.
63 |         Args:
64 |             data: the downloaded binary data.
65 |
66 |         Returns:
67 |             processed data
68 |         """
--------------------------------------------------------------------------------
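To make the contract above concrete, here is a minimal hypothetical parser that follows the docstrings; the CSS selectors are placeholders, not taken from any real site:

```python
from dcdownloader.parser.BaseParser import BaseParser
from pyquery import PyQuery as pq

class ExampleParser(BaseParser):
    # downloaded files will be saved with this extension
    filename_extension = 'jpg'

    async def parse_info(self, data):
        # {'name': ...} as described in parse_info's docstring
        return {'name': pq(data)('h1.title').text()}

    async def parse_chapter(self, data):
        doc = pq(data)
        chapters = {pq(a).text(): pq(a).attr('href') for a in doc('.chapters a')}
        # chapter_mode defaults to True, so return ({'chapter_name': 'url'}, ...)
        return (chapters,)

    async def parse_image_list(self, data):
        doc = pq(data)
        return {pq(img).attr('alt'): pq(img).attr('src') for img in doc('.pages img')}
```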
/test/testserver/server.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, send_from_directory
 2 | from flask import render_template
 3 | from threading import Thread
 4 |
 5 | app = Flask(__name__)
 6 |
 7 | book = {
 8 |     'chapter_1': {
 9 |         '1': '/static/test.png',
10 |         '2': '/static/test.png',
11 |         '3': '/static/test.png',
12 |         '4': '/static/test.png',
13 |         '5': '/static/test.png'
14 |     },
15 |     'chapter_2': {
16 |         '1': '/static/test.png',
17 |         '2': '/static/test.png',
18 |         '3': '/static/test.png',
19 |         '4': '/static/test.png',
20 |         '5': '/static/test.png'
21 |     },
22 |     'chapter_3': {
23 |         '1': '/static/test.png',
24 |         '2': '/static/test.png',
25 |         '3': '/static/test.png',
26 |         '4': '/static/test.png',
27 |         '5': '/static/test.png'
28 |     },
29 |     'chapter_4': {
30 |         '1': '/static/test.png',
31 |         '2': '/static/test.png',
32 |         '3': '/static/test.png',
33 |         '4': '/static/test.png',
34 |         '5': '/static/test.png'
35 |     },
36 |     'chapter_5': {
37 |         '1': '/static/test.png',
38 |         '2': '/static/test.png',
39 |         '3': '/static/test.png',
40 |         '4': '/static/test.png',
41 |         '5': '/static/test.png'
42 |     }
43 | }
44 |
45 | index_template = r"""
46 | """
47 |
48 | chapter_num_per_page = 3
49 |
50 | @app.route('/static/<path:path>')
51 | def re_static(path):
52 |     return send_from_directory('static', path)
53 |
54 | @app.route('/')
55 | def main():
56 |     return render_template('index.html', chapter_list=book.keys())
57 |
58 | @app.route('/<chapter_id>')
59 | def chapter(chapter_id):
60 |     if chapter_id in book:
61 |         return render_template('chapter.html', images=book[chapter_id])
62 |     else:
63 |         return 'chapter not found'
64 |
65 | class ServerThread(Thread):
66 |     def run(self):
67 |         app.debug = False
68 |         app.run(port=32321, debug=False, use_reloader=False, threaded=True)
69 |
70 | def launch():
71 |     ServerThread().start()
72 |
73 | if __name__ == '__main__':
74 |     launch()
--------------------------------------------------------------------------------
/dcdownloader/utils.py:
--------------------------------------------------------------------------------
 1 | import re, os, traceback
 2 | from dcdownloader import config, title
 3 |
 4 | def decode_packed_codes(code):
 5 |     def encode_base_n(num, n, table=None):
 6 |         FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
 7 |         if not table:
 8 |             table = FULL_TABLE[:n]
 9 |
10 |         if n > len(table):
11 |             raise ValueError('base %d exceeds table length %d' % (n, len(table)))
12 |
13 |         if num == 0:
14 |             return table[0]
15 |
16 |         ret = ''
17 |         while num:
18 |             ret = table[num % n] + ret
19 |             num = num // n
20 |         return ret
21 |
22 |     pattern = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
23 |     mobj = re.search(pattern, code)
24 |     obfuscated_code, base, count, symbols = mobj.groups()
25 |     base = int(base)
26 |     count = int(count)
27 |     symbols = symbols.split('|')
28 |     symbol_table = {}
29 |
30 |     while count:
31 |         count -= 1
32 |         base_n_count = encode_base_n(count, base)
33 |         symbol_table[base_n_count] = symbols[count] or base_n_count
34 |
35 |     return re.sub(
36 |         r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
37 |         obfuscated_code)
38 |
39 | def generate_aiohttp_session_config(**kwargs):
40 |     params = {
41 |         'timeout': 50,
42 |         'verify_ssl': config.get('debug_mode'),
43 |         'proxy': config.get('proxy')
44 |     }
45 |     params.update(kwargs)
46 |
47 |     return params
48 |
49 | def update_window_title(mode=None, msg=None):
50 |     app_name = 'DCDownloader'
51 |
52 |     window_title = app_name
53 |
54 |     if mode is not None:
55 |         window_title = window_title + ': %s' % mode
56 |
57 |     if msg is not None:
58 |         window_title = window_title + ' - %s' % msg
59 |
60 |     title.update(window_title)
61 |
62 | def mkdir(path):
63 |     path_ = path.split('/')
64 |
65 |     for i in range(0, len(path_)):
66 |         p = '/'.join(path_[0:i+1])
67 |         if p and not os.path.exists(p):
68 |             os.mkdir(p)
69 |
70 | def retry(max_num=5, on_retry=None, on_fail=None, on_fail_exit=False):
71 |     remaining_num = max_num
72 |     def decorate(func):
73 |         async def _retry(*args, **kwargs):
74 |             nonlocal max_num, remaining_num
75 |             try:
76 |                 return await func(*args, **kwargs)
77 |             except Exception as err:
78 |                 if on_retry is not None:
79 |                     on_retry(err=err, args=[args, kwargs], retry_num=max_num - remaining_num)
80 |
81 |                 if remaining_num > 1:
82 |                     remaining_num -= 1
83 |                     return await _retry(*args, **kwargs)
84 |                 else:
85 |                     if on_fail is not None:
86 |                         on_fail(err=err, args=[args, kwargs], retry_num=max_num - remaining_num)
87 |                     remaining_num = max_num
88 |                     if on_fail_exit:
89 |                         exit()
90 |
91 |         return _retry
92 |
93 |     return decorate
--------------------------------------------------------------------------------
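Two small usage sketches for the helpers above. The `retry` decorator wraps a coroutine and re-invokes it on any exception, up to `max_num` attempts; `decode_packed_codes` unpacks Dean Edwards style packed JavaScript (the packed string below is a contrived minimal example, not real dmzj output):

```python
import asyncio
from dcdownloader.utils import retry, decode_packed_codes

attempts = 0

@retry(max_num=3,
       on_retry=lambda err, args, retry_num: print('retrying after:', err),
       on_fail=lambda err, args, retry_num: print('gave up:', err))
async def flaky():
    global attempts
    attempts += 1
    if attempts < 3:
        raise RuntimeError('boom')
    return 'ok'

loop = asyncio.get_event_loop()
print(loop.run_until_complete(flaky()))  # fails twice, then prints 'ok'

# '0 1' is the packed body; 'hello|world' is the symbol table.
print(decode_packed_codes("}('0 1',62,2,'hello|world'.split('|')"))  # -> 'hello world'
```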
/dcdownloader/scheduler.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import logging
  3 |
  4 | import aiofiles
  5 | import aiohttp
  6 | import filetype
  7 |
  8 | from dcdownloader import base_logger, utils
  9 | # for test
 10 | from dcdownloader.parser.SimpleParser import SimpleParser
 11 | from dcdownloader.utils import retry
 12 | from .aiohttp_proxy_connector import ProxyConnector
 13 |
 14 | logger = base_logger.getLogger(__name__)
 15 |
 16 | class Scheduler(object):
 17 |     download_total_number = 0
 18 |     download_complete_number = 0
 19 |
 20 |     def __init__(self, url, output_path='.', name='Scheduler', max_connection_num=10, max_retry_num=5,
 21 |                  proxy=None, header=None, save_manifest_file=False, parser=SimpleParser(),
 22 |                  fetch_only=False, verify_ssl=True):
 23 |
 24 |         # usable config:
 25 |         #   name: Scheduler instance name
 26 |         #   url: URL of the target comic
 27 |         #   max_connection_num: max number of concurrent connections for downloading
 28 |         #   max_retry_num: max number of retries for downloading
 29 |         #   proxy: proxy setting (e.g. http://127.0.0.1:1081)
 30 |         #   header: HTTP request header
 31 |         #   save_manifest_file: (not complete)
 32 |         self.url = url
 33 |         self.output_path = output_path
 34 |         self.name = name
 35 |         self.max_connection_num = max_connection_num
 36 |         self.max_retry_num = max_retry_num
 37 |         self.proxy = proxy
 38 |         self.header = header
 39 |         self.save_manifest_file = save_manifest_file
 40 |         self.fetch_only = fetch_only
 41 |         self.verify_ssl = verify_ssl
 42 |
 43 |         self.sema = asyncio.Semaphore(self.max_connection_num)
 44 |
 45 |         self.parser = parser
 46 |
 47 |         # a parser-level request header overrides the one passed in
 48 |         if getattr(self.parser, 'request_header', None):
 49 |             self.header = self.parser.request_header
 50 |
 51 |         self.aiohttp_session = aiohttp.ClientSession(
 52 |             connector=ProxyConnector(proxy=proxy, verify_ssl=self.verify_ssl), headers=self.header, read_timeout=30)
 53 |
 54 |     def run(self):
 55 |         logger.info('Using parser %s ...', type(self.parser).__name__)
 56 |         logger.info('Fetch information')
 57 |         info = self._get_info(self.url)
 58 |
 59 |         if not info:
 60 |             logger.error('No comic information found.')
 61 |             return
 62 |         else:
 63 |             logger.info('Comic name: %s', info.get('name'))
 64 |
 65 |         logger.info('Fetch chapter list')
 66 |         clist = self._get_chapter_list(base_url=self.url)
 67 |
 68 |         if not clist:
 69 |             logger.error('No chapter list found')
 70 |             return
 71 |         else:
 72 |             logger.info('Chapter number: %d', len(clist))
 73 |
 74 |         logger.info('Fetch image url list')
 75 |         img_list = self._get_image_url_list(clist)
 76 |         logger.info('Total image number: %s', self.total_image_num)
 77 |         logger.info('Start download images')
 78 |         self._start_download(img_list, info['name'])
 79 |         logger.info('Download completed')
 80 |
 81 |         self._close_request_session()
 82 |
 83 |     def _get_info(self, base_url):
 84 |         info = {}
 85 |
 86 |         logger.debug('Fetching target information')
 87 |
 88 |         @retry(max_num=self.max_retry_num,
 89 |                on_retry=lambda err, args, retry_num: logger.warning('Failed to get info %s (%s), retrying.', args[0][0], str(err)),
 90 |                on_fail=lambda err, args, retry_num: logger.error('Failed to get info %s (%s)', args[0][0], str(err)),
 91 |                on_fail_exit=True)
 92 |         async def fetch(url):
 93 |             async with self.aiohttp_session.get(url, verify_ssl=self.verify_ssl) as resp:
 94 |                 nonlocal info
 95 |                 ret_data = await resp.text()
 96 |                 info = await self.parser.parse_info(ret_data)
 97 |
 98 |         loop = asyncio.get_event_loop()
 99 |         loop.run_until_complete(asyncio.gather(fetch(base_url)))
100 |
101 |         return info
102 |
103 |     def _get_chapter_list(self, base_url):
104 |         logger.debug('Starting to fetch the chapter list')
105 |         chapter_list = {}
106 |
107 |         # chapter_list = {
108 |         #     'chapter_name': 'url'
109 |         # }
110 |
111 |         @retry(max_num=self.max_retry_num,
112 |                on_retry=lambda err, args, retry_num: logger.warning('Failed to fetch chapter list %s (%s), retrying.', args[0][0], str(err)),
113 |                on_fail=lambda err, args, retry_num: logger.error('Failed to fetch chapter list %s (%s)', args[0][0], str(err)),
114 |                on_fail_exit=True)
115 |         async def fetch(url, asyncio_loop, page=1):
116 |             async with self.sema:
117 |                 async with self.aiohttp_session.get(url) as ret:
118 |                     ret_data = await ret.text()
119 |                     parsed_data = await self.parser.parse_chapter(ret_data)
120 |
121 |                     if self.parser.chapter_mode:
122 |                         chapter_list.update(parsed_data[0])
123 |                     else:
124 |                         for i in parsed_data[0]:
125 |                             chapter_list.setdefault('{}-{}'.format(page, parsed_data[0].index(i)), i)
126 |
127 |                     # follow the optional next-page URL recursively
128 |                     if len(parsed_data) > 1 and parsed_data[1] is not None:
129 |                         page += 1
130 |                         await fetch(parsed_data[1], asyncio_loop, page)
131 |
132 |         loop = asyncio.get_event_loop()
133 |         loop.run_until_complete(asyncio.gather(fetch(base_url, loop)))
134 |
135 |         return chapter_list
136 |
137 |     def _get_image_url_list(self, chapter_list):
138 |         image_url_list = {}
139 |
140 |         # image_url_list = {
141 |         #     'chapter_name': {
142 |         #         'file_name': 'url'
143 |         #     }
144 |         #     # ...
145 |         # }
146 |
147 |         total_image_num = 0
148 |
149 |         @retry(max_num=self.max_retry_num,
150 |                on_retry=lambda err, args, retry_num: logger.warning('Failed to fetch image list "%s" (%s), retrying.', str(args[0]), str(err)),
151 |                on_fail=lambda err, args, retry_num: logger.error('Failed to fetch image list "%s" (%s)', str(args[0]), str(err)),
152 |                on_fail_exit=True)
153 |         async def fetch(chapter_name, chapter_url):
154 |             nonlocal total_image_num
155 |             async with self.sema:
156 |                 async with self.aiohttp_session.get(chapter_url, verify_ssl=self.verify_ssl) as resp:
157 |                     image_list = await self.parser.parse_image_list(await resp.text())
158 |                     total_image_num += len(image_list)
159 |                     image_url_list.update({chapter_name: image_list})
160 |
161 |         loop = asyncio.get_event_loop()
162 |         future_list = []
163 |
164 |         for k, v in chapter_list.items():
165 |             future_list.append(fetch(k, v))
166 |
167 |         loop.run_until_complete(asyncio.gather(*future_list))
168 |         self.total_image_num = total_image_num
169 |         return image_url_list
170 |
171 |     def _start_download(self, image_url_list, comic_name):
172 |         # hope to decouple this from the Scheduler eventually
173 |
174 |         @retry(max_num=self.max_retry_num,
175 |                on_retry=lambda err, args, retry_num: logger.warning('Failed to update downloading status (%s), retrying.', str(err)),
176 |                on_fail=lambda err, args, retry_num: logger.error('Failed to update downloading status (%s)', str(err)))
177 |         async def update_count(save_path, name):
178 |             logger.info('Download complete: %s', self._generate_download_info(name, save_path))
179 |             self.download_complete_number += 1
180 |             if '_on_download_complete' in dir(self.parser):
181 |                 getattr(self, '_on_download_complete')()
182 |
183 |         @retry(max_num=self.max_retry_num,
184 |                on_retry=lambda err, args, retry_num: logger.warning('Failed to save file "%s" (%s), retrying.', args[1]['save_path'], str(err)),
185 |                on_fail=lambda err, args, retry_num: logger.error('Failed to save file "%s" (%s)', args[1]['save_path'], str(err)))
186 |         async def save_file(binary, save_path, name):
187 |             logger.debug('Saving file %s', self._generate_download_info(name, save_path))
188 |             async with aiofiles.open(save_path, 'wb') as f:
189 |                 await f.write(binary)
190 |             await update_count(save_path=save_path, name=name)
191 |
192 |         @retry(max_num=self.max_retry_num,
193 |                on_retry=lambda err, args, retry_num: logger.warning('Failed to request url "%s" (%s), retrying.', args[1]['image_url'], str(err)),
194 |                on_fail=lambda err, args, retry_num: logger.error('Failed to request target "%s" (%s)', args[1]['image_url'], str(err)))
195 |         async def download(image_url, save_path, name):
196 |             async with self.sema:
197 |                 logger.info('Start download: %s', self._generate_download_info(name, save_path))
198 |                 utils.mkdir('/'.join(save_path.split('/')[:-1]))
199 |
200 |                 if self.fetch_only:
201 |                     logger.warning('Fetch-only mode is on, all downloading will be skipped')
202 |                     return
203 |
204 |                 async with self.aiohttp_session.get(image_url, verify_ssl=self.verify_ssl) as resp:
205 |                     resp_data = await resp.content.read()
206 |                     if 'on_download_complete' in dir(self.parser):
207 |                         resp_data = getattr(self.parser, 'on_download_complete')(resp_data)
208 |
209 |                     # prefer the parser's fixed extension, otherwise guess from the data
210 |                     filename_extension = getattr(self.parser, 'filename_extension', None)
211 |                     if not filename_extension:
212 |                         kind = filetype.guess(resp_data)
213 |                         filename_extension = kind.extension if kind else None
214 |
215 |                     if filename_extension:
216 |                         save_path += '.' + filename_extension
217 |                     else:
218 |                         logger.warning('unknown filetype')
219 |
220 |                     await save_file(binary=resp_data, save_path=save_path, name=name)
221 |
222 |         loop = asyncio.get_event_loop()
223 |         future_list = []
224 |
225 |         for k, v in image_url_list.items():
226 |             for name, url in v.items():
227 |                 if 'chapter_mode' in dir(self.parser) and not self.parser.chapter_mode:
228 |                     path = '/'.join([self.output_path, comic_name, name])
229 |                 else:
230 |                     path = '/'.join([self.output_path, comic_name, k, name])
231 |
232 |                 future_list.append(download(image_url=url, save_path=path, name=name))
233 |
234 |         loop.run_until_complete(asyncio.gather(*future_list))
235 |
236 |     def _generate_download_info(self, name, path):
237 |         return name + ' => ' + path
238 |
239 |     def _downloader_on_retry(self, err, args, retry_num):
240 |         logger.warning('Download failed (%s) %s, retry number: %s', str(err),
241 |                        self._generate_download_info(args[1]['name'], args[1]['save_path']), retry_num)
242 |
243 |     def _close_request_session(self):
244 |         asyncio.get_event_loop().run_until_complete(asyncio.gather(self.aiohttp_session.close()))
245 |
246 |     def __del__(self):
247 |         self._close_request_session()
248 |
249 |     def _on_download_complete(self):
250 |         pass
251 |
252 |     def _call_parser_hook(self, hook_name):
253 |         pass
--------------------------------------------------------------------------------
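For programmatic use of the Scheduler (mirroring what main.py does), a minimal sketch with an illustrative URL:

```python
from dcdownloader.scheduler import Scheduler
from dcdownloader import parser_selector

url = 'https://manhua.dmzj.com/some_comic/'  # illustrative
s = Scheduler(url=url, output_path='./output',
              parser=parser_selector.get_parser(url))
s.run()
```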