├── test
    ├── __init__.py
    ├── test_config.py
    ├── test_oauth.py
    └── test_cookies.py
├── requirements.txt
├── setup.cfg
├── bin
    └── gallery-dl
├── scripts
    ├── hook-gallery_dl.py
    ├── util.py
    ├── pyinstaller.py
    ├── run_tests.sh
    ├── build_testresult_db.py
    ├── bash_completion.py
    ├── create_test_data.py
    └── release.sh
├── gallery_dl
    ├── version.py
    ├── __main__.py
    ├── downloader
    │   ├── text.py
    │   ├── __init__.py
    │   ├── common.py
    │   └── ytdl.py
    ├── postprocessor
    │   ├── mtime.py
    │   ├── common.py
    │   ├── __init__.py
    │   ├── classify.py
    │   ├── metadata.py
    │   ├── exec.py
    │   └── zip.py
    ├── extractor
    │   ├── 8chan.py
    │   ├── 4chan.py
    │   ├── mangapanda.py
    │   ├── ngomik.py
    │   ├── recursive.py
    │   ├── message.py
    │   ├── mangastream.py
    │   ├── chan.py
    │   ├── idolcomplex.py
    │   ├── directlink.py
    │   ├── adultempire.py
    │   ├── nsfwalbum.py
    │   ├── imgth.py
    │   ├── mangafox.py
    │   ├── rule34.py
    │   ├── lineblog.py
    │   ├── erolord.py
    │   ├── yandere.py
    │   ├── safebooru.py
    │   ├── senmanga.py
    │   ├── e621.py
    │   ├── hypnohub.py
    │   ├── test.py
    │   ├── khinsider.py
    │   ├── gfycat.py
    │   ├── konachan.py
    │   ├── 3dbooru.py
    │   ├── 2chan.py
    │   ├── slideshare.py
    │   ├── danbooru.py
    │   ├── vanillarock.py
    │   ├── myportfolio.py
    │   ├── hentaicafe.py
    │   ├── readcomiconline.py
    │   ├── hentainexus.py
    │   ├── fuskator.py
    │   ├── hentai2read.py
    │   ├── pururin.py
    │   ├── hbrowse.py
    │   ├── hentaihere.py
    │   ├── fallenangels.py
    │   ├── bobx.py
    │   ├── warosu.py
    │   ├── paheal.py
    │   ├── komikcast.py
    │   ├── plurk.py
    │   ├── sankakucomplex.py
    │   └── shopify.py
    ├── exception.py
    ├── config.py
    └── oauth.py
├── .travis.yml
├── .gitignore
├── Makefile
├── snap
    ├── local
    │   └── launchers
    │   │   └── gallery-dl-launch
    └── snapcraft.yaml
└── setup.py


/test/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.11.0
2 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = gallery_dl/__init__.py,gallery_dl/__main__.py,setup.py,build,scripts,archive
3 | ignore = E203,E226,W504
4 | 


--------------------------------------------------------------------------------
/bin/gallery-dl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: UTF-8 -*-
3 | 
4 | import gallery_dl
5 | 
6 | if __name__ == '__main__':
7 |     gallery_dl.main()
8 | 


--------------------------------------------------------------------------------
/scripts/hook-gallery_dl.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from gallery_dl import extractor, downloader, postprocessor
 4 | 
 5 | hiddenimports = [
 6 |     package.__name__ + "." + module  # dotted "<package>.<module>" name
 7 |     for package in (extractor, downloader, postprocessor)
 8 |     for module in package.modules  # names listed by each package
 9 | ]
10 | 


--------------------------------------------------------------------------------
/scripts/util.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | import sys
 4 | import os.path
 5 | 
 6 | ROOTDIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 7 | sys.path.insert(0, os.path.realpath(ROOTDIR))
 8 | 
 9 | 
10 | def path(*segments, join=os.path.join):
11 |     return join(ROOTDIR, *segments)  # ROOTDIR: parent of this script's dir
12 | 


--------------------------------------------------------------------------------
/gallery_dl/version.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2016-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | __version__ = "1.11.0-dev"
10 | 


--------------------------------------------------------------------------------
/scripts/pyinstaller.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Build a standalone executable using PyInstaller"""
 5 | 
 6 | import PyInstaller.__main__
 7 | import util
 8 | 
 9 | PyInstaller.__main__.run([
10 |     "--onefile",
11 |     "--console",
12 |     "--name", "gallery-dl." + ("exe" if PyInstaller.is_win else "bin"),
13 |     "--additional-hooks-dir", util.path("scripts"),  # hook-gallery_dl.py
14 |     "--distpath", util.path("dist"),
15 |     "--workpath", util.path("build"),  # work and spec files share "build"
16 |     "--specpath", util.path("build"),
17 |     util.path("gallery_dl", "__main__.py"),  # script to bundle
18 | ])
19 | 


--------------------------------------------------------------------------------
/gallery_dl/__main__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2017 Mike Fährmann
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License version 2 as
 8 | # published by the Free Software Foundation.
 9 | 
10 | import sys
11 | 
12 | if __package__ is None and not hasattr(sys, "frozen"):  # plain script run
13 |     import os.path
14 |     path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
15 |     sys.path.insert(0, os.path.realpath(path))  # parent dir searched first
16 | 
17 | import gallery_dl  # deliberately imported after the sys.path adjustment
18 | 
19 | if __name__ == "__main__":
20 |     gallery_dl.main()
21 | 


--------------------------------------------------------------------------------
/scripts/run_tests.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"  # directory of this script
 4 | 
 5 | TESTS_CORE=(config cookies downloader extractor oauth postprocessor text util)
 6 | TESTS_RESULTS=(results)
 7 | 
 8 | 
 9 | # select tests: first argument, else $GALLERYDL_TESTS, else "core"
10 | case "${1:-${GALLERYDL_TESTS:-core}}" in
11 |     core)    TESTS=( ${TESTS_CORE[@]}    );;
12 |     results) TESTS=( ${TESTS_RESULTS[@]} );;
13 |     *)       TESTS=(                     );;  # any other value: empty selection
14 | esac
15 | 
16 | 
17 | # transform each array element to test_###.py
18 | TESTS=( ${TESTS[@]/#/test_} )
19 | TESTS=( ${TESTS[@]/%/.py}   )
20 | 
21 | 
22 | # run 'nosetests' with selected tests
23 | # (or all tests if ${TESTS} is empty)
24 | nosetests --verbose -w "${DIR}/../test" ${TESTS[@]}
25 | 


--------------------------------------------------------------------------------
/gallery_dl/downloader/text.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2014-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Downloader module for text: URLs"""
10 | 
11 | from .common import DownloaderBase
12 | 
13 | 
14 | class TextDownloader(DownloaderBase):
15 |     scheme = "text"
16 | 
17 |     def download(self, url, pathfmt):
18 |         if self.part:
19 |             pathfmt.part_enable(self.partdir)
20 |         self.out.start(pathfmt.path)
21 |         with pathfmt.open("wb") as file:
22 |             file.write(url.encode()[5:])  # drops first 5 bytes, i.e. "text:"
23 |         return True  # no failure path: always True
24 | 
25 | 
26 | __downloader__ = TextDownloader
27 | 


--------------------------------------------------------------------------------
/gallery_dl/postprocessor/mtime.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Use metadata as file modification time"""
10 | 
11 | from .common import PostProcessor
12 | from ..text import parse_int
13 | 
14 | 
15 | class MtimePP(PostProcessor):
16 | 
17 |     def __init__(self, pathfmt, options):  # 'pathfmt' is not used here
18 |         PostProcessor.__init__(self)
19 |         self.key = options.get("key", "date")  # metadata key to read
20 | 
21 |     def run(self, pathfmt):
22 |         mtime = pathfmt.kwdict.get(self.key)  # None if the key is missing
23 |         ts = getattr(mtime, "timestamp", None)  # callable on some values
24 |         pathfmt.kwdict["_mtime"] = ts() if ts else parse_int(mtime)
25 | 
26 | 
27 | __postprocessor__ = MtimePP
28 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |   - "3.4"
 4 |   - "3.5"
 5 |   - "3.6"
 6 |   - "pypy3"
 7 | env:
 8 |   - GALLERYDL_TESTS=core
 9 | matrix:
10 |   include:
11 |     - python: "3.7"
12 |       dist: xenial
13 |     - python: "3.8-dev"
14 |       dist: xenial
15 |     - python: "3.6"
16 |       env: GALLERYDL_TESTS=results
17 |     - language: minimal
18 |       dist: xenial
19 |       env: GALLERYDL_TESTS=snap
20 |       addons:
21 |         snaps:
22 |           - name: snapcraft
23 |             classic: true
24 |       install:
25 |         - true
26 |       script:
27 |         - sudo apt update
28 |         - snapcraft --destructive-mode
29 |         - sudo snap try
30 |         - snap run gallery-dl --verbose https://twitter.com/ubuntu/status/1121001597092364288
31 | 
32 | git:
33 |   depth: 3
34 |   quiet: true
35 | branches:
36 |   only:
37 |   - master
38 |   - /^v\d+\.\d+\.\d+(-\S*)?$/
39 |   - /^test(-\w+)+$/
40 | 
41 | install:
42 |   - pip install -r requirements.txt pyOpenSSL
43 | script:
44 |   - ./scripts/run_tests.sh
45 | 


--------------------------------------------------------------------------------
/gallery_dl/postprocessor/common.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Common classes and constants used by postprocessor modules."""
10 | 
11 | import logging
12 | 
13 | 
14 | class PostProcessor():
15 |     """Base class for postprocessors"""
16 | 
17 |     def __init__(self):
18 |         name = self.__class__.__name__[:-2].lower()  # "MtimePP" -> "mtime"
19 |         self.log = logging.getLogger("postprocessor." + name)
20 | 
21 |     @staticmethod  # the hooks below do nothing in this base class
22 |     def prepare(pathfmt):
23 |         """Update file paths, etc."""
24 | 
25 |     @staticmethod
26 |     def run(pathfmt):
27 |         """Execute the postprocessor for a file"""
28 | 
29 |     @staticmethod
30 |     def run_after(pathfmt):
31 |         """Execute postprocessor after moving a file to its target location"""
32 | 
33 |     @staticmethod
34 |     def finalize():
35 |         """Cleanup"""
36 | 
37 |     def __repr__(self):
38 |         return self.__class__.__name__  # bare class name, no module path
39 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/8chan.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2014-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images and videos from https://8ch.net/"""
10 | 
11 | from . import chan
12 | 
13 | 
14 | class InfinitychanThreadExtractor(chan.ChanThreadExtractor):
15 |     """Extractor for images from threads from 8ch.net"""
16 |     category = "8chan"
17 |     filename_fmt = "{time}-{filename}{ext}"
18 |     pattern = r"(?:https?://)?(?:www\.)?8ch\.net/([^/]+)/res/(\d+)"
19 |     test = ("https://8ch.net/builders/res/3.html", {
20 |         "url": "5d85c0509f907f217aea379f862b41bf3d01f645",
21 |         "keyword": "0c497190c0c0f826925fde09815351d01869c783",
22 |     })
23 |     api_url = "https://8ch.net/{board}/res/{thread}.json"
24 |     file_url = "https://media.8ch.net/{board}/src/{tim}{ext}"
25 |     file_url_v2 = "https://media.8ch.net/file_store/{tim}{ext}"
26 | 
27 |     def build_url(self, post):  # "tim" of 64+ chars selects file_url_v2
28 |         fmt = self.file_url if len(post["tim"]) < 64 else self.file_url_v2
29 |         return fmt.format_map(post)  # fill placeholders from 'post'
30 | 


--------------------------------------------------------------------------------
/gallery_dl/postprocessor/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Post-processing modules"""
10 | 
11 | import importlib
12 | import logging
13 | 
14 | modules = [
15 |     "classify",
16 |     "exec",
17 |     "metadata",
18 |     "mtime",
19 |     "ugoira",
20 |     "zip",
21 | ]
22 | 
23 | log = logging.getLogger("postprocessor")
24 | 
25 | 
26 | def find(name):
27 |     """Return a postprocessor class with the given name"""
28 |     try:
29 |         return _cache[name]  # may be a cached None from a failed lookup
30 |     except KeyError:
31 |         pass
32 | 
33 |     klass = None
34 |     if name in modules:  # prevent unwanted imports
35 |         try:
36 |             module = importlib.import_module("." + name, __package__)
37 |         except ImportError:
38 |             pass  # import failed: klass stays None
39 |         else:
40 |             klass = module.__postprocessor__
41 |     _cache[name] = klass  # unknown names are cached as None, too
42 |     return klass
43 | 
44 | 
45 | # --------------------------------------------------------------------
46 | # internals
47 | 
48 | _cache = {}
49 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | archive/
 2 | 
 3 | # Byte-compiled / optimized / DLL files
 4 | __pycache__/
 5 | *.py[cod]
 6 | 
 7 | # C extensions
 8 | *.so
 9 | 
10 | # Distribution / packaging
11 | .Python
12 | env/
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | 
28 | # PyInstaller
29 | #  Usually these files are written by a python script from a template
30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | 
55 | # Sphinx documentation
56 | docs/_build/
57 | 
58 | # PyBuilder
59 | target/
60 | 
61 | # Manpages
62 | gallery-dl.1
63 | gallery-dl.conf.5
64 | 
65 | # Bash completion
66 | gallery-dl.bash_completion
67 | 
68 | # Snap packaging specific
69 | /snap/.snapcraft/
70 | /parts/
71 | /stage/
72 | /prime/
73 | 
74 | /*.snap
75 | /*_source.tar.bz2
76 | 


--------------------------------------------------------------------------------
/gallery_dl/downloader/__init__.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Downloader modules"""
10 | 
11 | import importlib
12 | 
13 | modules = [
14 |     "http",
15 |     "text",
16 |     "ytdl",
17 | ]
18 | 
19 | 
20 | def find(scheme):
21 |     """Return downloader class suitable for handling the given scheme"""
22 |     try:
23 |         return _cache[scheme]  # may be a cached None from a failed lookup
24 |     except KeyError:
25 |         pass
26 | 
27 |     klass = None
28 |     if scheme == "https":
29 |         scheme = "http"  # https is served by the "http" module
30 |     if scheme in modules:  # prevent unwanted imports
31 |         try:
32 |             module = importlib.import_module("." + scheme, __package__)
33 |         except ImportError:
34 |             pass  # import failed: klass stays None
35 |         else:
36 |             klass = module.__downloader__
37 | 
38 |     if scheme == "http":
39 |         _cache["http"] = _cache["https"] = klass  # one result, both keys
40 |     else:
41 |         _cache[scheme] = klass
42 |     return klass
43 | 
44 | 
45 | # --------------------------------------------------------------------
46 | # internals
47 | 
48 | _cache = {}
49 | 


--------------------------------------------------------------------------------
/gallery_dl/downloader/common.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2014-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Common classes and constants used by downloader modules."""
10 | 
11 | import os
12 | import logging
13 | from .. import config, util
14 | 
15 | 
16 | class DownloaderBase():
17 |     """Base class for downloaders"""
18 |     scheme = ""
19 | 
20 |     def __init__(self, extractor, output):
21 |         self.session = extractor.session  # reuse the extractor's session
22 |         self.out = output
23 |         self.log = logging.getLogger("downloader." + self.scheme)
24 |         self.part = self.config("part", True)  # defaults to True
25 |         self.partdir = self.config("part-directory")  # None if not set
26 | 
27 |         if self.partdir:
28 |             self.partdir = util.expand_path(self.partdir)
29 |             os.makedirs(self.partdir, exist_ok=True)  # create if missing
30 | 
31 |     def config(self, key, default=None):
32 |         """Interpolate downloader config value for 'key'"""
33 |         return config.interpolate(("downloader", self.scheme, key), default)
34 | 
35 |     def download(self, url, pathfmt):  # no-op in this base class
36 |         """Write data from 'url' into the file specified by 'pathfmt'"""
37 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | PREFIX ?= /usr/local
 3 | BINDIR ?= $(PREFIX)/bin
 4 | MANDIR ?= $(PREFIX)/man
 5 | SHAREDIR ?= $(PREFIX)/share
 6 | PYTHON ?= /usr/bin/env python3
 7 | 
 8 | # set SYSCONFDIR to /etc if PREFIX=/usr or PREFIX=/usr/local
 9 | SYSCONFDIR = $(shell if [ $(PREFIX) = /usr -o $(PREFIX) = /usr/local ]; then echo /etc; else echo $(PREFIX)/etc; fi)
10 | 
11 | all: man completion docs/supportedsites.rst
12 | 
13 | clean:
14 | 	$(RM) gallery-dl.1 gallery-dl.conf.5 gallery-dl.bash_completion
15 | 	$(RM) -r build/
16 | 
17 | install: man completion
18 | 	$(PYTHON) setup.py install
19 | 
20 | release: man completion docs/supportedsites.rst
21 | 	scripts/release.sh
22 | 
23 | test:
24 | 	scripts/run_tests.sh
25 | 
26 | executable:
27 | 	scripts/pyinstaller.py
28 | 
29 | completion: gallery-dl.bash_completion
30 | 
31 | man: gallery-dl.1 gallery-dl.conf.5
32 | 
33 | .PHONY: all clean install release test executable completion man
34 | 
35 | docs/supportedsites.rst: gallery_dl/*/*.py scripts/supportedsites.py
36 | 	$(PYTHON) scripts/supportedsites.py
37 | 
38 | gallery-dl.1: gallery_dl/option.py scripts/man.py
39 | 	$(PYTHON) scripts/man.py
40 | 
41 | gallery-dl.conf.5: docs/configuration.rst scripts/man.py
42 | 	$(PYTHON) scripts/man.py
43 | 
44 | gallery-dl.bash_completion: gallery_dl/option.py scripts/bash_completion.py
45 | 	$(PYTHON) scripts/bash_completion.py
46 | 


--------------------------------------------------------------------------------
/snap/local/launchers/gallery-dl-launch:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | # This is the maintenance launcher for the snap, make necessary runtime environment changes to make the snap work here.  You may also insert security confinement/deprecation/obsoletion notice of the snap here.
 3 | 
 4 | set \
 5 | 	-o errexit \
 6 | 	-o errtrace \
 7 | 	-o nounset \
 8 | 	-o pipefail
 9 | 
10 | # Use user's real home directory for canonical configuration path access
11 | declare REALHOME="$(
12 | 	getent passwd "${USER}" \
13 | 		| cut --delimiter=: --fields=6
14 | )"
15 | HOME="${REALHOME}"
16 | 
17 | if ! test -f "${SNAP_USER_COMMON}"/marker_disable_interface_warning; then
18 | 	# Warn if the `removable-media` interface isn't connected
19 | 	if ! ls /media &>/dev/null; then
20 | 		printf -- \
21 | 			"It seems that this snap isn't connected to the \`removable-media\` security confinement interface.  If you want to save the files under \`/media\`, \`/run/media\`, or \`/mnt\` directories you need to connect this snap to the \`removable-media\` interface by running the following command in a terminal:\\n\\n    sudo snap connect %s:removable-media\\n\\n" \
22 | 			"${SNAP_NAME}" \
23 | 			>&2
24 | 		printf -- \
25 | 			"To disable this warning create an empty file at the following path:\\n\\n    %s/marker_disable_interface_warning\\n\\n" \
26 | 			"${SNAP_USER_COMMON}" \
27 | 			>&2
28 | 	fi
29 | fi
30 | 
31 | # Finally run the next part of the command chain
32 | exec "${@}"
33 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/4chan.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images and videos from https://www.4chan.org/"""
10 | 
11 | from . import chan
12 | from .. import text
13 | 
14 | 
15 | class FourchanThreadExtractor(chan.ChanThreadExtractor):
16 |     """Extractor for images from threads from 4chan.org"""
17 |     category = "4chan"
18 |     pattern = (r"(?:https?://)?boards\.4chan(?:nel)?\.org"  # 4chan/4channel
19 |                r"/([^/]+)/thread/(\d+)")
20 |     test = (
21 |         ("https://boards.4chan.org/tg/thread/15396072/", {
22 |             "url": "39082ad166161966d7ba8e37f2173a824eb540f0",
23 |             "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",
24 |             "content": "20b7b51afa51c9c31a0020a0737b889532c8d7ec",
25 |         }),
26 |         ("https://boards.4channel.org/tg/thread/15396072/", {
27 |             "url": "39082ad166161966d7ba8e37f2173a824eb540f0",
28 |             "keyword": "7ae2f4049adf0d2f835eb91b6b26b7f4ec882e0a",
29 |         }),
30 |     )
31 |     api_url = "https://a.4cdn.org/{board}/thread/{thread}.json"
32 |     file_url = "https://i.4cdn.org/{board}/{tim}{ext}"
33 | 
34 |     def update(self, post, data=None):
35 |         chan.ChanThreadExtractor.update(self, post, data)  # base update first
36 |         post["filename"] = text.unescape(post["filename"])
37 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/mangapanda.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract manga-chapters and entire manga from https://www.mangapanda.com/"""
10 | 
11 | from .mangareader import MangareaderMangaExtractor, MangareaderChapterExtractor
12 | 
13 | 
14 | class MangapandaBase():  # listed first in the bases of both extractors
15 |     """Base class for mangapanda extractors"""
16 |     category = "mangapanda"
17 |     root = "https://www.mangapanda.com"
18 | 
19 | 
20 | class MangapandaChapterExtractor(MangapandaBase, MangareaderChapterExtractor):
21 |     """Extractor for manga-chapters from mangapanda.com"""
22 |     pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com((/[^/?&#]+)/(\d+))"
23 |     test = ("https://www.mangapanda.com/red-storm/2", {
24 |         "url": "1f633f776e950531ba9b1e81965316458e785261",
25 |         "keyword": "b24df4b9cc36383fb6a44e06d32a3884a4dcb5fb",
26 |     })
27 | 
28 | 
29 | class MangapandaMangaExtractor(MangapandaBase, MangareaderMangaExtractor):
30 |     """Extractor for manga from mangapanda.com"""
31 |     chapterclass = MangapandaChapterExtractor  # presumably used by the parent
32 |     pattern = r"(?:https?://)?(?:www\.)?mangapanda\.com(/[^/?&#]+)/?$"
33 |     test = ("https://www.mangapanda.com/mushishi", {
34 |         "url": "357f965732371cac1990fee8b480f62e29141a42",
35 |         "keyword": "031b3ea085921c552de017ecbb9b906e462229c9",
36 |     })
37 | 


--------------------------------------------------------------------------------
/scripts/build_testresult_db.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | """Collect results of extractor unit tests"""
 5 | 
 6 | import sys
 7 | import os.path
 8 | import datetime
 9 | 
10 | import util
11 | from gallery_dl import extractor, job, config
12 | from test.test_results import setup_test_config
13 | 
14 | 
15 | # filter test cases
16 | 
17 | tests = [
18 |     (idx, extr, url, result)
19 | 
20 |     for extr in extractor.extractors()
21 |     if hasattr(extr, "test") and extr.test  # skip extractors without tests
22 |     if len(sys.argv) <= 1 or extr.category in sys.argv  # optional filter
23 | 
24 |     for idx, (url, result) in enumerate(extr._get_tests())
25 |     if result  # skip test cases with a falsy result
26 | ]
27 | 
28 | 
29 | # setup target directory
30 | 
31 | path = util.path("archive", "testdb", str(datetime.date.today()))
32 | os.makedirs(path, exist_ok=True)
33 | 
34 | 
35 | for idx, extr, url, result in tests:
36 | 
37 |     # filename: <category>-<subcategory>-<test index>.json
38 |     name = "{}-{}-{}.json".format(extr.category, extr.subcategory, idx)
39 |     print(name)
40 | 
41 |     # config values
42 |     setup_test_config()  # called once per test case
43 | 
44 |     if "options" in result:
45 |         for key, value in result["options"]:
46 |             config.set(key.split("."), value)  # dotted key -> list of parts
47 |     if "range" in result:
48 |         config.set(("image-range",), result["range"])  # same range for both
49 |         config.set(("chapter-range",), result["range"])
50 | 
51 |     # write test data
52 |     try:
53 |         with open(os.path.join(path, name), "w") as outfile:
54 |             job.DataJob(url, file=outfile, ensure_ascii=False).run()
55 |     except KeyboardInterrupt:
56 |         sys.exit()  # stop the whole run on Ctrl-C
57 | 


--------------------------------------------------------------------------------
/scripts/bash_completion.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2019 Mike Fährmann
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License version 2 as
 8 | # published by the Free Software Foundation.
 9 | 
10 | """Generate bash completion script from gallery-dl's argument parser"""
11 | 
12 | import util
13 | from gallery_dl import option
14 | 
15 | 
16 | TEMPLATE = """_gallery_dl()
17 | {
18 |     local cur prev
19 |     COMPREPLY=()
20 |     cur="${COMP_WORDS[COMP_CWORD]}"
21 |     prev="${COMP_WORDS[COMP_CWORD-1]}"
22 | 
23 |     if [[ "${prev}" =~ ^(%(fileopts)s)$ ]]; then
24 |         COMPREPLY=( $(compgen -f -- "${cur}") )
25 |     elif [[ "${prev}" =~ ^(%(diropts)s)$ ]]; then
26 |         COMPREPLY=( $(compgen -d -- "${cur}") )
27 |     else
28 |         COMPREPLY=( $(compgen -W "%(opts)s" -- "${cur}") )
29 |     fi
30 | }
31 | 
32 | complete -F _gallery_dl gallery-dl
33 | """
34 | 
35 | opts = []
36 | diropts = []
37 | fileopts = []
38 | for action in option.build_parser()._actions:  # private parser attribute
39 | 
40 |     if action.metavar in ("DEST",):  # completed with 'compgen -d'
41 |         diropts.extend(action.option_strings)
42 | 
43 |     elif action.metavar in ("FILE", "CFG"):  # completed with 'compgen -f'
44 |         fileopts.extend(action.option_strings)
45 | 
46 |     for opt in action.option_strings:
47 |         if opt.startswith("--"):  # only long options go into the word list
48 |             opts.append(opt)
49 | 
50 | PATH = util.path("gallery-dl.bash_completion")
51 | with open(PATH, "w", encoding="utf-8") as file:
52 |     file.write(TEMPLATE % {
53 |         "opts"    : " ".join(opts),
54 |         "diropts" : "|".join(diropts),  # "|"-joined for the regex in TEMPLATE
55 |         "fileopts": "|".join(fileopts),
56 |     })
57 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/ngomik.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract manga-chapters and entire manga from http://ngomik.in/"""
10 | 
11 | from .common import ChapterExtractor
12 | from .. import text
13 | import re
14 | 
15 | 
16 | class NgomikChapterExtractor(ChapterExtractor):
17 |     """Extractor for manga-chapters from ngomik.in"""
18 |     category = "ngomik"
19 |     root = "http://ngomik.in"
20 |     pattern = (r"(?:https?://)?(?:www\.)?ngomik\.in"
21 |                r"(/[^/?&#]+-chapter-[^/?&#]+)")
22 |     test = (
23 |         ("https://www.ngomik.in/14-sai-no-koi-chapter-1-6/", {
24 |             "url": "8e67fdf751bbc79bc6f4dead7675008ddb8e32a4",
25 |             "keyword": "204d177f09d438fd50c9c28d98c73289194640d8",
26 |         }),
27 |         ("https://ngomik.in/break-blade-chapter-26/", {
28 |             "count": 34,
29 |         }),
30 |     )
31 | 
32 |     def metadata(self, page):
33 |         info = text.extract(page, '<title>', "</title>")[0]
34 |         manga, _, chapter = info.partition(" Chapter ")  # split the title
35 |         chapter, sep, minor = chapter.partition(" ")[0].partition(".")
36 | 
37 |         return {
38 |             "manga": text.unescape(manga),
39 |             "chapter": text.parse_int(chapter),
40 |             "chapter_minor": sep + minor,  # "" or ".<minor>"
41 |             "lang": "id",
42 |             "language": "Indonesian",
43 |         }
44 | 
45 |     @staticmethod
46 |     def images(page):
47 |         readerarea = text.extract(page, 'id="readerarea"', 'class="chnav"')[0]
48 |         return [
49 |             (text.unescape(url), None)  # (url, None) pairs
50 |             for url in re.findall(r"\ssrc=[\"']?([^\"' >]+)", readerarea)
51 |         ]
52 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/recursive.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Recursive extractor"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import extractor, util
13 | import requests
14 | import re
15 | 
16 | 
class RecursiveExtractor(Extractor):
    """Extractor that fetches URLs from a remote or local source"""
    category = "recursive"
    pattern = r"r(?:ecursive)?:"
    test = ("recursive:https://pastebin.com/raw/FLwrCYsT", {
        "url": "eee86d65c346361b818e8f4b2b307d9429f136a2",
    })

    def items(self):
        """Queue every http(s) URL found in the document behind the prefix

        The part of the input URL after its first ":" is requested (local
        files are supported through a 'file://' adapter) and each URL
        found in the response text is yielded as a Queue message while
        the configured extractor blacklist is active.
        """
        default_blacklist = {"directlink"} | util.SPECIAL_EXTRACTORS
        blist = self.config("blacklist", default_blacklist)

        self.session.mount("file://", FileAdapter())
        source = self.url.partition(":")[2]
        page = self.request(source).text

        yield Message.Version, 1
        with extractor.blacklist(blist):
            for url in re.findall(r"https?://[^\s\"']+", page):
                yield Message.Queue, url, {}
36 | 
37 | 
class FileAdapter(requests.adapters.BaseAdapter):
    """Requests adapter for local files"""

    def send(self, request, **kwargs):
        """Return a Response whose raw body is the requested local file

        The first 7 characters of the request URL (the length of
        "file://") are dropped and the remainder is opened as a path.
        If opening fails with OSError, the response has an empty body and
        a 'bad_request' status code; otherwise its status code is 'ok'.
        """
        response = requests.Response()
        try:
            fileobj = open(request.url[7:], "rb")
        except OSError:
            import io
            response.raw = io.BytesIO()
            response.status_code = requests.codes.bad_request
            return response

        # 'release_conn' is made an alias of the file object's close()
        fileobj.release_conn = fileobj.close
        response.raw = fileobj
        response.status_code = requests.codes.ok
        return response

    def close(self):
        """Nothing to clean up"""
        pass
56 | 


--------------------------------------------------------------------------------
/scripts/create_test_data.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2015-2019 Mike Fährmann
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License version 2 as
 8 | # published by the Free Software Foundation.
 9 | 
10 | """Create testdata for extractor tests"""
11 | 
12 | import argparse
13 | 
14 | import util  # noqa
15 | from gallery_dl import extractor
16 | from test.test_results import ResultJob, setup_test_config
17 | 
18 | 
# Output template for a job that finished without raising. The positional
# fields are filled by main() with the test URL followed by the hex digests
# of the URL, keyword and content hashes. Doubled braces are literal braces
# after str.format().
TESTDATA_FMT = """
    test = ("{}", {{
        "url": "{}",
        "keyword": "{}",
        "content": "{}",
    }})
"""

# Output template for a job that raised. The positional fields are filled
# by main() with the test URL and the name of the raised exception class.
TESTDATA_EXCEPTION_FMT = """
    test = ("{}", {{
        "exception": exception.{},
    }})
"""
32 | 
33 | 
def iter_test_urls(extr):
    """Yield the URL of every test case declared by an extractor class

    An extractor's 'test' attribute comes in several shapes:
      - nothing at all (attribute missing, None or empty)
      - a single URL string
      - a single (URL, RESULTS) pair, RESULTS being a dict or None
      - a tuple whose elements are URL strings and/or (URL, RESULTS) pairs
    All of them are reduced to a flat sequence of URL strings.
    """
    tests = getattr(extr, "test", None)
    if not tests:
        return

    if isinstance(tests, str):
        tests = (tests,)
    elif (len(tests) == 2 and isinstance(tests[0], str) and
            (tests[1] is None or isinstance(tests[1], dict))):
        # one (URL, RESULTS) pair instead of a collection of test cases
        tests = (tests,)

    for test in tests:
        yield test if isinstance(test, str) else test[0]


def main():
    """Run extractor jobs and print test data for them

    Command line:
      --content   passed on to each ResultJob as its 'content' argument
      --recreate  treat the positional arguments as extractor categories
                  and use the test URLs of all extractors in those
                  categories
      urls        URLs to process (categories when --recreate is given)

    For each URL the extractor class name is printed, followed by either
    a 'test = ...' block holding the URL, keyword and content hash digests
    or, if running the job raised, one naming the exception class.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--content", action="store_true")
    parser.add_argument("--recreate", action="store_true")
    parser.add_argument("urls", nargs="*")
    args = parser.parse_args()

    if args.recreate:
        # 'test' attributes are not uniformly shaped, so the URLs are
        # collected through iter_test_urls() instead of indexing directly
        urls = [
            url
            for extr in extractor.extractors() if extr.category in args.urls
            for url in iter_test_urls(extr)
        ]
    else:
        urls = args.urls

    setup_test_config()

    for url in urls:
        tjob = ResultJob(url, content=args.content)
        try:
            tjob.run()
        except Exception as exc:
            # a failing job is reported as an expected-exception test
            fmt = TESTDATA_EXCEPTION_FMT
            data = (exc.__class__.__name__,)
        else:
            fmt = TESTDATA_FMT
            data = (tjob.hash_url.hexdigest(),
                    tjob.hash_keyword.hexdigest(),
                    tjob.hash_content.hexdigest())
        print(tjob.extractor.__class__.__name__)
        print(fmt.format(url, *data))
66 | 
67 | 
# Call main() only when this file is executed as a script.
if __name__ == '__main__':
    main()
70 | 


--------------------------------------------------------------------------------
/gallery_dl/postprocessor/classify.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Categorize files by file extension"""
10 | 
11 | from .common import PostProcessor
12 | import os
13 | 
14 | 
class ClassifyPP(PostProcessor):
    """Post processor placing files in subdirectories by file extension"""

    DEFAULT_MAPPING = {
        "Music" : ("mp3", "aac", "flac", "ogg", "wma", "m4a", "wav"),
        "Video" : ("flv", "ogv", "avi", "mp4", "mpg", "mpeg", "3gp", "mkv",
                   "webm", "vob", "wmv"),
        "Pictures" : ("jpg", "jpeg", "png", "gif", "bmp", "svg", "webp"),
        "Archives" : ("zip", "rar", "7z", "tar", "gz", "bz2"),
    }

    def __init__(self, pathfmt, options):
        """Invert the directory->extensions mapping for lookup by extension

        The "mapping" option replaces DEFAULT_MAPPING when present.
        """
        PostProcessor.__init__(self)
        categories = options.get("mapping", self.DEFAULT_MAPPING)

        lookup = {}
        for directory, extensions in categories.items():
            for extension in extensions:
                lookup[extension] = directory
        self.mapping = lookup

    def prepare(self, pathfmt):
        """Redirect the paths of 'pathfmt' if its extension is mapped"""
        ext = pathfmt.extension
        if ext not in self.mapping:
            return
        # set initial paths to enable download skips
        self._build_paths(pathfmt, self.mapping[ext])

    def run(self, pathfmt):
        """Redirect the paths again and create the target directory"""
        ext = pathfmt.extension
        if ext not in self.mapping:
            return
        # rebuild paths in case the filename extension changed
        target = self._build_paths(pathfmt, self.mapping[ext])
        os.makedirs(target, exist_ok=True)

    @staticmethod
    def _build_paths(pathfmt, extra):
        """Append 'extra' to both directories of 'pathfmt'

        Updates 'pathfmt.realpath' and 'pathfmt.path' and returns the new
        real directory (without the filename).
        """
        target = pathfmt.realdirectory + extra
        tail = os.sep + pathfmt.filename
        pathfmt.realpath = target + tail
        pathfmt.path = pathfmt.directory + extra + tail
        return target
54 | 
55 | 
# Module-level name bound to this module's post processor class.
# NOTE(review): presumably looked up by whatever loads post processor
# modules - confirm against the package's loader.
__postprocessor__ = ClassifyPP
57 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/message.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2018 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | 
class Message():
    """Enum for message identifiers

    Extractors yield their results as message-tuples, where the first element
    is one of the following identifiers. This message-identifier determines
    the type and meaning of the other elements in such a tuple.

    - Message.Version:
      - Message protocol version (currently always '1')
      - 2nd element specifies the version of all following messages as integer

    - Message.Directory:
      - Sets the target directory for all following images
      - 2nd element is a dictionary containing general metadata

    - Message.Url:
      - Image URL and its metadata
      - 2nd element is the URL as a string
      - 3rd element is a dictionary with image-specific metadata

    - Message.Headers:  # obsolete
      - HTTP headers to use while downloading
      - 2nd element is a dictionary with header-name and -value pairs

    - Message.Cookies:  # obsolete
      - Cookies to use while downloading
      - 2nd element is a dictionary with cookie-name and -value pairs

    - Message.Queue:
      - (External) URL that should be handled by another extractor
      - 2nd element is the (external) URL as a string
      - 3rd element is a dictionary containing URL-specific metadata

    - Message.Urllist:
      - Same as Message.Url, but its 2nd element is a list of multiple URLs
      - The additional URLs serve as a fallback if the primary one fails
    """

    # Plain integer class attributes; this is not an 'enum.Enum' subclass.
    Version = 1
    Directory = 2
    Url = 3
    # 4 and 5 belonged to the obsolete Headers / Cookies messages described
    # in the docstring and are no longer assigned.
    #  Headers = 4
    #  Cookies = 5
    Queue = 6
    Urllist = 7
55 | 


--------------------------------------------------------------------------------
/gallery_dl/postprocessor/metadata.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Write metadata to JSON files"""
10 | 
11 | from .common import PostProcessor
12 | from .. import util
13 | 
14 | 
class MetadataPP(PostProcessor):
    """Post processor writing a file's metadata to a separate file"""

    def __init__(self, pathfmt, options):
        """Select the writer function and file extension from 'options'

        "mode" is one of "custom", "tags" or anything else (JSON output).
        The default extension is "json" for JSON output and "txt" for the
        other modes; the "extension" option overrides it.
        """
        PostProcessor.__init__(self)

        mode = options.get("mode", "json")

        if mode == "custom":
            default_ext = "txt"
            self.write = self._write_custom
            self.formatter = util.Formatter(options.get("format"))
        elif mode == "tags":
            default_ext = "txt"
            self.write = self._write_tags
        else:
            default_ext = "json"
            self.write = self._write_json
            self.indent = options.get("indent", 4)
            self.ascii = options.get("ascii", False)

        self.extension = options.get("extension", default_ext)

    def run(self, pathfmt):
        """Write the metadata of 'pathfmt' to '<realpath>.<extension>'"""
        target = "{}.{}".format(pathfmt.realpath, self.extension)
        with open(target, "w", encoding="utf-8") as file:
            self.write(file, pathfmt.kwdict)

    def _write_custom(self, file, kwdict):
        """Write the result of applying the format string to 'kwdict'"""
        file.write(self.formatter.format_map(kwdict))

    def _write_tags(self, file, kwdict):
        """Write one tag per line, taken from "tags" or "tag_string"

        Non-list values are split at ", "; if that yields fewer than one
        piece per 16 characters, they are split at single spaces instead.
        Nothing is written if there are no tags.
        """
        tags = kwdict.get("tags") or kwdict.get("tag_string")
        if not tags:
            return

        if isinstance(tags, list):
            lines = tags
        else:
            lines = tags.split(", ")
            if len(lines) < len(tags) / 16:
                lines = tags.split(" ")

        file.write("\n".join(lines))
        file.write("\n")

    def _write_json(self, file, kwdict):
        """Dump 'kwdict' as JSON using the configured ascii/indent values"""
        util.dump_json(kwdict, file, self.ascii, self.indent)
62 | 
63 | 
# Module-level name bound to this module's post processor class.
# NOTE(review): presumably looked up by whatever loads post processor
# modules - confirm against the package's loader.
__postprocessor__ = MetadataPP
65 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/mangastream.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract manga-chapters from https://readms.net/"""
10 | 
11 | from .common import ChapterExtractor
12 | from .. import text
13 | 
14 | 
class MangastreamChapterExtractor(ChapterExtractor):
    """Extractor for manga-chapters from mangastream.com"""
    category = "mangastream"
    archive_fmt = "{chapter_id}_{page}"
    pattern = (r"(?:https?://)?(?:www\.)?(?:readms\.net|mangastream\.com)"
               r"/r(?:ead)?/([^/]*/([^/]+)/(\d+))")
    test = (
        "https://readms.net/r/onepunch_man/087/4874/1",
        "https://mangastream.com/r/onepunch_man/087/4874/1",
    )
    root = "https://readms.net"

    def __init__(self, match):
        """Store the URL parts and point the extractor at readms.net"""
        self.part, self.chapter, self.chapter_id = match.groups()
        ChapterExtractor.__init__(
            self, match, "{}/r/{}".format(self.root, self.part))

    def metadata(self, page):
        """Collect chapter metadata from the first page of a chapter"""
        manga, offset = text.extract(
            page, '<span class="hidden-xs hidden-sm">', "<")
        # continue searching from where this chapter's URL part occurs
        offset = page.find(self.part, offset)
        title, offset = text.extract(page, ' - ', '<', offset)
        count = text.extract(page, 'Last Page (', ')', offset)[0]

        return {
            "manga": manga,
            "chapter": text.unquote(self.chapter),
            "chapter_id": text.parse_int(self.chapter_id),
            "title": title,
            "count": text.parse_int(count, 1),
            "lang": "en",
            "language": "English",
        }

    def images(self, page):
        """Yield (image-URL, None) tuples, following each page's link

        The generator has no end condition of its own: after yielding an
        image it requests the linked next page and continues from there.
        """
        while True:
            marker = page.index(' class="page"')
            link = text.extract(page, ' href="', '"', marker)[0]
            source = text.extract(page, ' src="', '"', marker)[0]
            yield text.urljoin(self.root, source), None
            page = self.request(text.urljoin(self.root, link)).text
55 | 


--------------------------------------------------------------------------------
/gallery_dl/exception.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2018 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Exception classes used by gallery-dl
10 | 
11 | Class Hierarchy:
12 | 
13 | Exception
14 |  +-- GalleryDLException
15 |       +-- ExtractionError
16 |       |    +-- AuthenticationError
17 |       |    +-- AuthorizationError
18 |       |    +-- NotFoundError
19 |       |    +-- HttpError
20 |       +-- DownloadError
21 |       |    +-- DownloadComplete
22 |       |    +-- DownloadRetry
23 |       +-- NoExtractorError
24 |       +-- FormatError
25 |       +-- FilterError
26 |       +-- StopExtraction
27 | """
28 | 
29 | 
class GalleryDLException(Exception):
    """Base class for GalleryDL exceptions

    Every other exception class in this module derives from it, either
    directly or through ExtractionError / DownloadError.
    """
32 | 
33 | 
class ExtractionError(GalleryDLException):
    """Base class for exceptions during information extraction

    Parent of AuthenticationError, AuthorizationError, NotFoundError and
    HttpError.
    """
36 | 
37 | 
class AuthenticationError(ExtractionError):
    """Invalid or missing login information (an ExtractionError)"""
40 | 
41 | 
class AuthorizationError(ExtractionError):
    """Insufficient privileges to access a resource (an ExtractionError)"""
44 | 
45 | 
class NotFoundError(ExtractionError):
    """Requested resource (gallery/image) does not exist

    Part of the ExtractionError branch of the exception hierarchy.
    """
48 | 
49 | 
class HttpError(ExtractionError):
    """HTTP request during extraction failed (an ExtractionError)"""
52 | 
53 | 
class DownloadError(GalleryDLException):
    """Base class for exceptions during file downloads

    Parent of DownloadRetry and DownloadComplete.
    """
56 | 
57 | 
class DownloadRetry(DownloadError):
    """Download attempt failed and should be retried (a DownloadError)"""
60 | 
61 | 
class DownloadComplete(DownloadError):
    """Output file of attempted download is already complete

    Part of the DownloadError branch of the exception hierarchy.
    """
64 | 
65 | 
class NoExtractorError(GalleryDLException):
    """No extractor can handle the given URL

    Derives directly from GalleryDLException.
    """
68 | 
69 | 
class FormatError(GalleryDLException):
    """Error while building output path

    Derives directly from GalleryDLException.
    """
72 | 
73 | 
class FilterError(GalleryDLException):
    """Error while evaluating a filter expression

    Derives directly from GalleryDLException.
    """
76 | 
77 | 
class StopExtraction(GalleryDLException):
    """Extraction should stop

    Derives directly from GalleryDLException.
    """
80 | 


--------------------------------------------------------------------------------
/gallery_dl/postprocessor/exec.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Execute processes"""
10 | 
11 | from .common import PostProcessor
12 | from .. import util
13 | import subprocess
14 | import os
15 | 
16 | 
# quote(): wrap a string so it can be inserted into a shell command line.
# Everywhere except Windows this is shlex.quote(); on Windows ("nt") the
# string is put in double quotes with embedded double quotes escaped by a
# backslash.
if os.name != "nt":
    from shlex import quote
else:
    def quote(s):
        return '"{}"'.format(s.replace('"', '\\"'))
22 | 
23 | 
class ExecPP(PostProcessor):
    """Post processor running an external command"""

    def __init__(self, pathfmt, options):
        """Prepare the command given by the "command" option

        A string is run through the shell, with "{}" standing for the
        quoted file path (" {}" is appended if it is missing). Any other
        value is treated as a list of format strings and run without a
        shell. A true "async" option disables waiting for the process.
        """
        PostProcessor.__init__(self)
        command = options["command"]

        if not isinstance(command, str):
            self.args = [util.Formatter(part) for part in command]
            self.shell = False
            self._format = self._format_args_list
        else:
            if "{}" not in command:
                command += " {}"
            self.args = command
            self.shell = True
            self._format = self._format_args_string

        if options.get("async", False):
            self._exec = self._exec_async

    def run_after(self, pathfmt):
        """Build the command for 'pathfmt' and execute it"""
        arguments = self._format(pathfmt)
        self._exec(arguments)

    def _format_args_string(self, pathfmt):
        """Return the command string with "{}" replaced by the file path"""
        return self.args.replace("{}", quote(pathfmt.realpath))

    def _format_args_list(self, pathfmt):
        """Return the argument list formatted with the file's metadata

        '_directory', '_filename' and '_path' entries are stored in
        'pathfmt.kwdict' itself before formatting.
        """
        kwdict = pathfmt.kwdict
        extras = (
            ("_directory", pathfmt.realdirectory),
            ("_filename", pathfmt.filename),
            ("_path", pathfmt.realpath),
        )
        for key, value in extras:
            kwdict[key] = value
        return [formatter.format_map(kwdict) for formatter in self.args]

    def _exec(self, args):
        """Run 'args', wait for it, and warn about a non-zero exit status"""
        self.log.debug("Running '%s'", args)
        process = subprocess.Popen(args, shell=self.shell)
        retcode = process.wait()
        if not retcode:
            return
        command = " ".join(args) if isinstance(args, list) else args
        self.log.warning(
            "Executing '%s' returned with non-zero exit status (%d)",
            command, retcode)

    def _exec_async(self, args):
        """Start 'args' without waiting for it to finish"""
        subprocess.Popen(args, shell=self.shell)
67 | 
68 | 
# Module-level name bound to this module's post processor class.
# NOTE(review): presumably looked up by whatever loads post processor
# modules - confirm against the package's loader.
__postprocessor__ = ExecPP
70 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/chan.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Base classes for extractors for different Futaba Channel-like boards"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import text
13 | 
14 | 
class ChanThreadExtractor(Extractor):
    """Base class for extractors for Futaba Channel-like boards"""
    category = "chan"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board}", "{thread} - {title}")
    filename_fmt = "{tim}-{filename}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    api_url = ""
    file_url = ""

    def __init__(self, match):
        """Take board and thread from the first two match groups"""
        Extractor.__init__(self, match)
        board, thread = match.group(1), match.group(2)
        self.metadata = {
            "board": board,
            "thread": thread,
        }

    def items(self):
        """Yield one Url message per file attached to the thread's posts

        Posts without a "filename" entry are skipped. Files listed under
        a post's "extra_files" are yielded after the post's main file,
        re-using the (updated) post dictionary as their metadata.
        """
        yield Message.Version, 1
        endpoint = self.api_url.format_map(self.metadata)
        posts = self.request(endpoint).json()["posts"]
        self.metadata["title"] = self.get_thread_title(posts[0])
        yield Message.Directory, self.metadata

        for post in posts:
            if "filename" in post:
                self.update(post)
                yield Message.Url, self.build_url(post), post
                for extra in post.get("extra_files", ()):
                    self.update(post, extra)
                    yield Message.Url, self.build_url(post), post

    def update(self, post, data=None):
        """Update keyword dictionary"""
        source = data if data else self.metadata
        post.update(source)
        # "ext" minus its first character becomes the extension
        post["extension"] = post["ext"][1:]

    def build_url(self, post):
        """Construct an image url out of a post object"""
        template = self.file_url
        return template.format_map(post)

    @staticmethod
    def get_thread_title(post):
        """Return thread title from first post"""
        if "sub" in post:
            title = post["sub"]
        else:
            title = text.remove_html(post["com"])
        # limited to the first 50 characters
        return text.unescape(title)[:50]
62 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/idolcomplex.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://idol.sankakucomplex.com/"""
10 | 
11 | from . import sankaku
12 | 
13 | 
class IdolcomplexExtractor(sankaku.SankakuExtractor):
    """Base class for idolcomplex extractors"""
    # Only class attributes are overridden here; all behavior comes from
    # sankaku.SankakuExtractor.
    category = "idolcomplex"
    cookiedomain = "idol.sankakucomplex.com"
    # NOTE(review): presumably selects the 'idol.' host in the base
    # class's URLs - confirm in sankaku.SankakuExtractor
    subdomain = "idol"
19 | 
20 | 
class IdolcomplexTagExtractor(IdolcomplexExtractor,
                              sankaku.SankakuTagExtractor):
    """Extractor for images from idol.sankakucomplex.com by search-tags"""
    # captures the query string (everything after "?" up to a "#")
    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/\?([^#]*)"
    test = (
        ("https://idol.sankakucomplex.com/?tags=lyumos+wreath", {
            "count": ">= 6",
            "pattern": r"https://is\.sankakucomplex\.com/data/[^/]{2}/[^/]{2}"
                       r"/[^/]{32}\.\w+\?e=\d+&m=[^&#]+",
        }),
        # second test case is a bare URL without expected results
        ("https://idol.sankakucomplex.com"
         "/?tags=lyumos+wreath&page=3&next=694215"),
    )
34 | 
35 | 
class IdolcomplexPoolExtractor(IdolcomplexExtractor,
                               sankaku.SankakuPoolExtractor):
    """Extractor for image-pools from idol.sankakucomplex.com"""
    # captures the numeric pool ID
    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/pool/show/(\d+)"
    test = ("https://idol.sankakucomplex.com/pool/show/145", {
        "count": 3,
    })
43 | 
44 | 
class IdolcomplexPostExtractor(IdolcomplexExtractor,
                               sankaku.SankakuPostExtractor):
    """Extractor for single images from idol.sankakucomplex.com"""
    # captures the numeric post ID
    pattern = r"(?:https?://)?idol\.sankakucomplex\.com/post/show/(\d+)"
    test = ("https://idol.sankakucomplex.com/post/show/694215", {
        "content": "694ec2491240787d75bf5d0c75d0082b53a85afd",
        # the test runs with the "tags" option set to True
        "options": (("tags", True),),
        "keyword": {
            "tags_character": "shani_(the_witcher)",
            "tags_copyright": "the_witcher",
            "tags_idol": str,
            "tags_medium": str,
            "tags_general": str,
        },
    })
60 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/directlink.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2017-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Direct link handling"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import text
13 | 
14 | 
class DirectlinkExtractor(Extractor):
    """Extractor for direct links to images and other media files"""
    category = "directlink"
    filename_fmt = "{domain}/{path}"
    archive_fmt = "{domain}/{path}"
    pattern = (r"(?i)https?://(?P<domain>[^/?&#]+)/(?P<path>[^?&#]+\."
               r"(?:jpe?g|jpe|png|gif|web[mp]|mp4|mkv|og[gmv]|opus))"
               r"(?:\?(?P<query>[^/?#]*))?(?:#(?P<fragment>.*))?$")
    test = (
        ("https://en.wikipedia.org/static/images/project-logos/enwiki.png", {
            "url": "18c5d00077332e98e53be9fed2ee4be66154b88d",
            "keyword": "e81b9fe3022e971365dd859f38e4ef717a6c69ed",
        }),
        # more complex example
        ("https://example.org/path/file.webm?que=1&ry=2#fragment", {
            "url": "fd4aec8a32842343394e6078a06c3e6b647bf671",
            "keyword": "ff75764b1ae66615b723a6357b8193fa2de84678",
        }),
        # percent-encoded characters
        ("https://example.org/%27%3C%23/%23%3E%27.jpg?key=%3C%26%3E", {
            "url": "2627e8140727fdf743f86fe18f69f99a052c9718",
            "keyword": "4d19dc12e41ffcb4cbec2013e335cf482377c35e",
        }),
        # upper case file extension (#296)
        ("https://post-phinf.pstatic.net/MjAxOTA1MjlfMTQ4/MDAxNTU5MTI2NjcyNTkw"
         ".JUzkGb4V6dj9DXjLclrOoqR64uDxHFUO5KDriRdKpGwg.88mCtd4iT1NHlpVKSCaUpP"
         "mZPiDgT8hmQdQ5K_gYyu0g.JPEG/2.JPG"),
    )

    def __init__(self, match):
        """Keep the named groups of the URL match as metadata"""
        Extractor.__init__(self, match)
        self.data = match.groupdict()

    def items(self):
        """Yield the matched URL itself as the only file

        The metadata consists of the pattern's named groups plus whatever
        text.nameext_from_url() adds; every non-empty value is passed
        through text.unquote().
        """
        data = self.data
        text.nameext_from_url(self.url, data)
        data.update({
            key: text.unquote(value)
            for key, value in data.items()
            if value
        })

        yield Message.Version, 1
        yield Message.Directory, data
        yield Message.Url, self.url, data
57 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/adultempire.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extractors for https://www.adultempire.com/"""
10 | 
11 | from .common import GalleryExtractor
12 | from .. import text
13 | 
14 | 
class AdultempireGalleryExtractor(GalleryExtractor):
    """Extractor for image galleries from www.adultempire.com"""
    category = "adultempire"
    root = "https://www.adultempire.com"
    pattern = (r"(?:https?://)?(?:www\.)?adult(?:dvd)?empire\.com"
               r"(/(\d+)/gallery\.html)")
    test = (
        ("https://www.adultempire.com/5998/gallery.html", {
            "range": "1",
            "keyword": "5b3266e69801db0d78c22181da23bc102886e027",
            "content": "5c6beb31e5e3cdc90ee5910d5c30f9aaec977b9e",
        }),
        ("https://www.adultdvdempire.com/5683/gallery.html", {
            "url": "b12cd1a65cae8019d837505adb4d6a2c1ed4d70d",
            "keyword": "8d448d79c4ac5f5b10a3019d5b5129ddb43655e5",
        }),
    )

    def __init__(self, match):
        """Remember the numeric gallery ID from the URL"""
        GalleryExtractor.__init__(self, match)
        self.gallery_id = match.group(2)

    def metadata(self, page):
        """Collect gallery metadata, reading the page front to back

        Extraction starts at '<div id="content">'; the fields are taken
        in the order title, studio, release date, cast list.
        """
        extr = text.extract_from(page, page.index('<div id="content">'))
        title = extr('title="', '"')
        studio = extr(">studio</small>", "<")
        released = extr(">released</small>", "<")
        cast = extr('<ul class="item-details item-cast-list ', '</ul>')

        return {
            "gallery_id": text.parse_int(self.gallery_id),
            "title"     : text.unescape(title),
            "studio"    : studio.strip(),
            "date"      : text.parse_datetime(released.strip(), "%m/%d/%Y"),
            # the first element of the split cast list is dropped
            "actors"    : sorted(text.split_html(cast)[1:]),
        }

    def images(self, page):
        """Yield (image-URL, None) tuples from all gallery pages

        "_200." in each thumbnail URL is replaced with "_9600.". A page
        with fewer than 24 images is the last one; otherwise the next
        page number is requested.
        """
        pnum = 1
        while True:
            thumbs = list(text.extract_iter(page, 'rel="L"><img src="', '"'))
            for thumb in thumbs:
                yield thumb.replace("_200.", "_9600."), None
            if len(thumbs) < 24:
                break
            pnum += 1
            page = self.request(
                self.gallery_url, params={"page": pnum}).text
59 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/nsfwalbum.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extractors for https://nsfwalbum.com/"""
10 | 
11 | from .common import GalleryExtractor
12 | from .. import text
13 | 
14 | 
class NsfwalbumAlbumExtractor(GalleryExtractor):
    """Extractor for image albums on nsfwalbum.com"""
    category = "nsfwalbum"
    subcategory = "album"
    root = "https://nsfwalbum.com"
    filename_fmt = "{album_id}_{num:>03}_{id}.{extension}"
    directory_fmt = ("{category}", "{album_id} {title}")
    archive_fmt = "{id}"
    pattern = r"(?:https?://)?(?:www\.)?nsfwalbum\.com(/album/(\d+))"
    test = ("https://nsfwalbum.com/album/401611", {
        "range": "1-5",
        "url": "b0481fc7fad5982da397b6359fbed8421b8ba284",
        "keyword": "e98f9b0d473c00000831618d0235863b1dd78294",
    })

    def __init__(self, match):
        """Remember the numeric album ID from the URL"""
        self.album_id = match.group(2)
        GalleryExtractor.__init__(self, match)

    def metadata(self, page):
        """Collect album metadata: title, models and studio, in page order"""
        extr = text.extract_from(page)
        title = extr('<h6>', '</h6>')
        models = extr('"models"> Models:', '</div>')
        studio = extr('"models"> Studio:', '</div>')

        return {
            "album_id": text.parse_int(self.album_id),
            "title"   : text.unescape(title),
            "models"  : text.split_html(models),
            "studio"  : text.remove_html(studio),
        }

    def images(self, page):
        """Yield (image-URL, metadata) tuples for every image of the album

        For each 'data-img-id' on the page, the image's iframe document
        is fetched to obtain the argument of 'giraffe.annihilate("...")';
        its decoded form is sent to the backend together with the image
        ID. The backend's JSON answer is indexed as [URL, width, height].
        """
        iframe_url = self.root + "/iframe_image.php?id="
        backend_url = self.root + "/backend.php"

        for image_id in text.extract_iter(page, 'data-img-id="', '"'):
            frame = self.request(iframe_url + image_id).text
            spirit = text.extract(frame, 'giraffe.annihilate("', '"')[0]
            query = {
                "spirit": self._annihilate(spirit),
                "photo": image_id,
            }
            info = self.request(backend_url, params=query).json()
            yield info[0], {
                "id"    : text.parse_int(image_id),
                "width" : text.parse_int(info[1]),
                "height": text.parse_int(info[2]),
            }

    @staticmethod
    def _annihilate(value, base=6):
        """XOR the code point of every character in 'value' with 'base'"""
        decoded = [chr(ord(char) ^ base) for char in value]
        return "".join(decoded)
63 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/imgth.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://imgth.com/"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import text
13 | 
14 | 
class ImgthGalleryExtractor(Extractor):
    """Extractor for image galleries from imgth.com"""
    category = "imgth"
    subcategory = "gallery"
    directory_fmt = ("{category}", "{gallery_id} {title}")
    filename_fmt = "{category}_{gallery_id}_{num:>03}.{extension}"
    archive_fmt = "{gallery_id}_{num}"
    pattern = r"(?:https?://)?imgth\.com/gallery/(\d+)"
    test = (
        "http://imgth.com/gallery/37/wallpaper-anime",
        {
            "url": "4ae1d281ca2b48952cf5cca57e9914402ad72748",
            "keyword": "6f8c00d6849ea89d1a028764675ec1fe9dbd87e2",
        },
    )

    def __init__(self, match):
        """Store the gallery ID and the base URL of its paginated listing"""
        Extractor.__init__(self, match)
        self.gid = match.group(1)
        # page numbers get appended to this prefix (see items() / images())
        self.url_base = "".join(
            ("https://imgth.com/gallery/", self.gid, "/g/page/"))

    def items(self):
        """Yield the version, directory and one URL message per image"""
        first_page = self.request(self.url_base + "0").text
        data = self.metadata(first_page)

        yield Message.Version, 1
        yield Message.Directory, data

        # 'num' is a 1-based running index over all images of the gallery;
        # the same 'data' dict is updated and passed on for every image
        for num, url in enumerate(self.images(first_page), 1):
            data["num"] = num
            yield Message.Url, url, text.nameext_from_url(url, data)

    def images(self, page):
        """Yield all image urls for this gallery

        Starts with the already fetched 'page' and requests follow-up
        pages for as long as the current one contains a "next" list item.
        """
        page_number = 0
        has_next = True
        while has_next:
            thumbs = text.extract(
                page, '<ul class="thumbnails">', '</ul>')[0]
            for thumb_url in text.extract_iter(thumbs, '<img src="', '"'):
                # drop the first 24 characters of the thumbnail URL and
                # put the remainder behind the "/images" prefix
                yield "https://imgth.com/images" + thumb_url[24:]

            has_next = '<li class="next">' in page
            if has_next:
                page_number += 1
                page = self.request(self.url_base + str(page_number)).text

    def metadata(self, page):
        """Collect metadata for extractor-job"""
        fields = (
            ("title", '<h1>', '</h1>'),
            ("count", 'total of images in this gallery: ', ' '),
            ("date" , 'created on ', ' by <'),
            (None   , 'href="/users/', ''),
            ("user" , '>', '<'),
        )
        result = text.extract_all(
            page, fields, values={"gallery_id": self.gid})
        return result[0]
62 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/mangafox.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2017-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract manga-chapters and entire manga from https://fanfox.net/"""
10 | 
11 | from .common import ChapterExtractor
12 | from .. import text
13 | 
14 | 
class MangafoxChapterExtractor(ChapterExtractor):
    """Extractor for manga-chapters from fanfox.net"""
    category = "mangafox"
    pattern = (
        r"(?:https?://)?(?:www\.|m\.)?(?:mangafox\.me|fanfox\.net)"
        r"(/manga/[^/]+/((?:v(\d+)/)?c(\d+)([^/?&#]*)))"
    )
    test = (
        ("http://fanfox.net/manga/kidou_keisatsu_patlabor/v05/c006.2/1.html", {
            "keyword": "5661dab258d42d09d98f194f7172fb9851a49766",
            "content": "5c50c252dcf12ffecf68801f4db8a2167265f66c",
        }),
        ("http://mangafox.me/manga/kidou_keisatsu_patlabor/v05/c006.2/"),
    )
    root = "https://m.fanfox.net"

    def __init__(self, match):
        """Split the URL match into its parts and set up the chapter URL"""
        # groups: path, chapter string, volume, chapter number, minor suffix
        base, cstr, volume, chapter, minor = match.groups()
        self.cstr = cstr
        self.volume = volume
        self.chapter = chapter
        self.minor = minor
        self.urlbase = self.root + base
        ChapterExtractor.__init__(self, match, self.urlbase + "/1.html")

    def metadata(self, page):
        """Collect chapter metadata from the first page of the chapter"""
        manga, pos = text.extract(page, "<title>", "</title>")
        # the page count is read from the text between '>' and '<',
        # searching from 20 characters before the closing </select> tag
        select_end = page.find("</select>", pos)
        count, pos = text.extract(page, ">", "<", select_end - 20)
        series_id, pos = text.extract(page, "var series_id =", ";", pos)
        chapter_id, pos = text.extract(page, "var chapter_id =", ";", pos)

        return {
            "manga": text.unescape(manga),
            "volume": text.parse_int(self.volume),
            "chapter": text.parse_int(self.chapter),
            "chapter_minor": self.minor or "",
            "chapter_string": self.cstr,
            "count": text.parse_int(count),
            "sid": text.parse_int(series_id),
            "cid": text.parse_int(chapter_id),
        }

    def images(self, page):
        """Yield (url, None) pairs, two per requested page

        NOTE(review): this generator has no stop condition of its own;
        presumably the consumer limits it through the 'count' value from
        metadata() -- confirm against the base class.
        """
        page_number = 1
        while True:
            first_url, offset = text.extract(page, '<img src="', '"')
            yield first_url, None
            second_url, offset = text.extract(page, ' src="', '"', offset)
            yield second_url, None

            # every document holds two images, so advance by two
            page_number += 2
            next_url = "{}/{}.html".format(self.urlbase, page_number)
            page = self.request(next_url).text
62 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/rule34.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2016-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://rule34.xxx/"""
10 | 
11 | from . import booru
12 | 
13 | 
class Rule34Extractor(booru.XmlParserMixin,
                      booru.GelbooruPageMixin,
                      booru.BooruExtractor):
    """Shared settings for all rule34.xxx extractors"""
    category = "rule34"
    api_url = "https://rule34.xxx/index.php"
    post_url = "https://rule34.xxx/index.php?page=post&s=view&id={}"
    pool_url = "https://rule34.xxx/index.php?page=pool&s=show&id={}"
    page_limit = 4000

    def __init__(self, match):
        """Run the inherited setup and add the fixed API query parameters"""
        super().__init__(match)
        # merged into the parameter dict prepared by the base classes
        api_query = {"page": "dapi", "s": "post", "q": "index"}
        self.params.update(api_query)
27 | 
28 | 
class Rule34TagExtractor(booru.TagMixin, Rule34Extractor):
    """Extractor for rule34.xxx posts found through a tag search"""
    # post-list URLs; the search string is captured as named group 'tags'
    pattern = (
        r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
        r"\?page=post&s=list&tags=(?P<tags>[^&#]+)"
    )
    test = (
        "https://rule34.xxx/index.php?page=post&s=list&tags=danraku",
        {
            "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
            "pattern": (
                r"https?://([^.]+\.)?rule34\.xxx/images/\d+/[0-9a-f]+\.jpg"
            ),
            "count": 1,
        },
    )
38 | 
39 | 
class Rule34PoolExtractor(booru.GelbooruPoolMixin, Rule34Extractor):
    """Extractor for rule34.xxx image pools"""
    # pool URLs; the numeric pool ID is captured as named group 'pool'
    pattern = (
        r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
        r"\?page=pool&s=show&id=(?P<pool>\d+)"
    )
    test = (
        "https://rule34.xxx/index.php?page=pool&s=show&id=179",
        {
            "count": 3,
        },
    )
47 | 
48 | 
class Rule34PostExtractor(booru.PostMixin, Rule34Extractor):
    """Extractor for individual rule34.xxx posts"""
    # post-view URLs; the numeric post ID is captured as named group 'post'
    pattern = (
        r"(?:https?://)?(?:www\.)?rule34\.xxx/(?:index\.php)?"
        r"\?page=post&s=view&id=(?P<post>\d+)"
    )
    test = (
        "https://rule34.xxx/index.php?page=post&s=view&id=1995545",
        {
            "content": "97e4bbf86c3860be18de384d02d544251afe1d45",
            "options": (("tags", True),),
            "keyword": {
                "tags_artist": "danraku",
                "tags_character": "kashima_(kantai_collection)",
                "tags_copyright": "kantai_collection",
                "tags_general": str,
                "tags_metadata": str,
            },
        },
    )
64 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/lineblog.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extractors for https://www.lineblog.me/"""
10 | 
11 | from .livedoor import LivedoorBlogExtractor, LivedoorPostExtractor
12 | from .. import text
13 | 
14 | 
class LineblogBase():
    """Settings and image handling shared by the lineblog extractors"""
    category = "lineblog"
    root = "https://lineblog.me"

    def _images(self, post):
        """Return one metadata dict per usable <img> tag in the post body

        The "body" entry is removed from 'post' in the process.  'num' is
        the 1-based position among all <img> tags, including those that
        are skipped for lacking a 'src' value.
        """
        body = post.pop("body")
        results = []

        for num, tag in enumerate(text.extract_iter(body, "<img ", ">"), 1):
            src = text.extract(tag, 'src="', '"')[0]
            alt = text.extract(tag, 'alt="', '"')[0]

            if src:
                is_scdn = src.startswith("https://obs.line-scdn.")
                if is_scdn and src.count("/") > 3:
                    # cut off the last path segment of line-scdn URLs
                    src = src.rpartition("/")[0]

                image = {
                    "url" : src,
                    "num" : num,
                    "hash": src.rpartition("/")[2],
                    "post": post,
                }
                # name and extension come from 'alt' when it is non-empty
                results.append(text.nameext_from_url(alt or src, image))

        return results
41 | 
42 | 
class LineblogBlogExtractor(LineblogBase, LivedoorBlogExtractor):
    """Extractor for all posts of a user's blog on lineblog.me"""
    # a user name directly below the domain, optionally followed by '/'
    pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?&#])"
    test = (
        "https://lineblog.me/mamoru_miyano/",
        {
            "range": "1-20",
            "count": 20,
            "pattern": r"https://obs.line-scdn.net/[\w-]+$",
            "keyword": {
                "post": {
                    "categories" : tuple,
                    "date"       : "type:datetime",
                    "description": str,
                    "id"         : int,
                    "tags"       : list,
                    "title"      : str,
                    "user"       : "mamoru_miyano"
                },
                "filename": str,
                "hash"    : r"re:\w{32,}",
                "num"     : int,
            },
        },
    )
65 | 
66 | 
class LineblogPostExtractor(LineblogBase, LivedoorPostExtractor):
    """Extractor for a single blog post on lineblog.me"""
    # captures the user name and the numeric archive ID
    pattern = r"(?:https?://)?lineblog\.me/(\w+)/archives/(\d+)"
    test = (
        "https://lineblog.me/mamoru_miyano/archives/1919150.html",
        {
            "url": "24afeb4044c554f80c374b52bf8109c6f1c0c757",
            "keyword": "76a38e2c0074926bd3362f66f9fc0e6c41591dcb",
        },
    )
74 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/erolord.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from http://erolord.com/"""
10 | 
11 | from .common import GalleryExtractor
12 | from .. import text, util
13 | import json
14 | 
15 | 
class ErolordGalleryExtractor(GalleryExtractor):
    """Extractor for image galleries from erolord.com"""
    category = "erolord"
    root = "http://erolord.com"
    # The dot in the domain is escaped so that only a literal
    # "erolord.com" is accepted; an unescaped '.' matches any character.
    # group(1) is the gallery path, group(2) the numeric gallery ID.
    pattern = r"(?:https?://)?(?:www\.)?erolord\.com(/doujin/(\d+)/?)"
    test = ("http://erolord.com/doujin/2189055/", {
        "url": "7ce6d10a3934102b95c9718a34ccd3d35f55d85f",
        "keyword": {
            "title"     : "Amazon No Hiyaku | Amazon Elixir",
            "gallery_id": 2189055,
            "count"     : 16,
            "artist"    : ["Morris"],
            "group"     : list,
            "parody"    : list,
            "characters": list,
            "tags"      : list,
            "lang"      : "en",
            "language"  : "English",
        },
    })

    def __init__(self, match):
        """Run the base setup and remember the numeric gallery ID"""
        GalleryExtractor.__init__(self, match)
        self.gallery_id = match.group(2)

    def metadata(self, page):
        """Collect gallery metadata from the gallery page's HTML

        The heading is split at its last space: everything before it is
        the title, the last word (with surrounding square brackets
        stripped) is the language name.
        """
        extr = text.extract_from(page)
        split = text.split_html
        title, _, language = extr('<h1 class="t64">', '</h1>').rpartition(" ")
        language = language.strip("[]")

        # the extr() calls below are made in dict order, top to bottom
        return {
            "gallery_id": text.parse_int(self.gallery_id),
            "title"     : text.unescape(title),
            # double quotes for anime, circle, tags
            # single quotes for characters, artist
            "parody"    : split(extr('class="sp1">Anime:'     , "</div>\r")),
            "characters": split(extr("class='sp1'>Characters:", "</div>\r")),
            "artist"    : split(extr("class='sp1'>Artist:"    , "</div>\r")),
            "group"     : split(extr('class="sp1">Circle:'    , "</div>\r")),
            "tags"      : split(extr('class="sp1">Tags:'      , "</div>\r")),
            "lang"      : util.language_to_code(language),
            "language"  : language,
        }

    def images(self, page):
        """Return a list of (image URL, None) pairs for the gallery

        Follows the first link inside the 'd1' element and decodes the
        JSON array assigned to 'var imgs' on that page; every entry is a
        path relative to 'root'.
        """
        url = self.root + text.extract(page, 'id="d1"><a href="', '"')[0]
        imgs = text.extract(self.request(url).text, 'var imgs=', ';')[0]
        return [(self.root + path, None) for path in json.loads(imgs)]
65 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/yandere.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://yande.re/"""
10 | 
11 | from . import booru
12 | 
13 | 
class YandereExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    """Shared settings for all yande.re extractors"""
    category = "yandere"
    # JSON post listing endpoint
    api_url = "https://yande.re/post.json"
    # '{}' is a placeholder in the URL of a single post's page
    post_url = "https://yande.re/post/show/{}"
19 | 
20 | 
class YandereTagExtractor(booru.TagMixin, YandereExtractor):
    """Extractor for yande.re posts found through a tag search"""
    # 'tags' may appear anywhere in the query string
    pattern = (
        r"(?:https?://)?(?:www\.)?yande\.re"
        r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"
    )
    test = (
        "https://yande.re/post?tags=ouzoku+armor",
        {
            "content": "59201811c728096b2d95ce6896fd0009235fe683",
        },
    )
28 | 
29 | 
class YanderePoolExtractor(booru.PoolMixin, YandereExtractor):
    """Extractor for yande.re image pools"""
    # the numeric pool ID is captured as named group 'pool'
    pattern = r"(?:https?://)?(?:www\.)?yande\.re/pool/show/(?P<pool>\d+)"
    test = (
        "https://yande.re/pool/show/318",
        {
            "content": "2a35b9d6edecce11cc2918c6dce4de2198342b68",
        },
    )
36 | 
37 | 
class YanderePostExtractor(booru.PostMixin, YandereExtractor):
    """Extractor for individual yande.re posts"""
    # the numeric post ID is captured as named group 'post'
    pattern = r"(?:https?://)?(?:www\.)?yande\.re/post/show/(?P<post>\d+)"
    test = (
        "https://yande.re/post/show/51824",
        {
            "content": "59201811c728096b2d95ce6896fd0009235fe683",
            "options": (("tags", True),),
            "keyword": {
                "tags_artist": "sasaki_tamaru",
                "tags_circle": "softhouse_chara",
                "tags_copyright": "ouzoku",
                "tags_general": str,
            },
        },
    )
51 | 
52 | 
class YanderePopularExtractor(booru.MoebooruPopularMixin, YandereExtractor):
    """Extractor for the popular-post listings of yande.re"""
    # 'scale' is one of by_day/by_week/by_month/recent; an optional query
    # string is captured as 'query'
    pattern = (
        r"(?:https?://)?(?:www\.)?yande\.re"
        r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
        r"(?:\?(?P<query>[^#]*))?"
    )
    test = (
        ("https://yande.re/post/popular_by_month?month=6&year=2014", {
            "count": 40,
        }),
        ("https://yande.re/post/popular_recent"),
    )

    def __init__(self, match):
        """Run the inherited setup and select the matching JSON endpoint"""
        super().__init__(match)
        # self.scale is not set here; presumably the inherited initializer
        # derives it from the 'scale' group -- verify against the mixin
        endpoint = "https://yande.re/post/popular_{scale}.json"
        self.api_url = endpoint.format(scale=self.scale)


--------------------------------------------------------------------------------
/gallery_dl/extractor/safebooru.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://safebooru.org/"""
10 | 
11 | from . import booru
12 | 
13 | 
class SafebooruExtractor(booru.XmlParserMixin,
                         booru.GelbooruPageMixin,
                         booru.BooruExtractor):
    """Shared settings for all safebooru.org extractors"""
    category = "safebooru"
    api_url = "https://safebooru.org/index.php"
    post_url = "https://safebooru.org/index.php?page=post&s=view&id={}"
    pool_url = "https://safebooru.org/index.php?page=pool&s=show&id={}"

    def __init__(self, match):
        """Run the inherited setup and add the fixed API query parameters"""
        super().__init__(match)
        # merged into the parameter dict prepared by the base classes
        api_query = {"page": "dapi", "s": "post", "q": "index"}
        self.params.update(api_query)
26 | 
27 | 
class SafebooruTagExtractor(booru.TagMixin, SafebooruExtractor):
    """Extractor for safebooru.org posts found through a tag search"""
    # post-list URLs; the search string is captured as named group 'tags'
    pattern = (
        r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
        r"\?page=post&s=list&tags=(?P<tags>[^&#]+)"
    )
    test = (
        "https://safebooru.org/index.php?page=post&s=list&tags=bonocho",
        {
            "url": "17c61b386530cf4c30842c9f580d15ef1cd09586",
            "content": "e5ad4c5bf241b1def154958535bef6c2f6b733eb",
        },
    )
36 | 
37 | 
class SafebooruPoolExtractor(booru.GelbooruPoolMixin, SafebooruExtractor):
    """Extractor for safebooru.org image pools"""
    # pool URLs; the numeric pool ID is captured as named group 'pool'
    pattern = (
        r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
        r"\?page=pool&s=show&id=(?P<pool>\d+)"
    )
    test = (
        "https://safebooru.org/index.php?page=pool&s=show&id=11",
        {
            "count": 5,
        },
    )
45 | 
46 | 
class SafebooruPostExtractor(booru.PostMixin, SafebooruExtractor):
    """Extractor for individual safebooru.org posts"""
    # post-view URLs; the numeric post ID is captured as named group 'post'
    pattern = (
        r"(?:https?://)?(?:www\.)?safebooru\.org/(?:index\.php)?"
        r"\?page=post&s=view&id=(?P<post>\d+)"
    )
    test = (
        "https://safebooru.org/index.php?page=post&s=view&id=1169132",
        {
            "url": "cf05e37a3c62b2d55788e2080b8eabedb00f999b",
            "content": "93b293b27dabd198afafabbaf87c49863ac82f27",
            "options": (("tags", True),),
            "keyword": {
                "tags_artist": "kawanakajima",
                "tags_character": "heath_ledger ronald_mcdonald the_joker",
                "tags_copyright": "dc_comics mcdonald's the_dark_knight",
                "tags_general": str,
            },
        },
    )
62 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/senmanga.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2016-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract manga-chapters from https://raw.senmanga.com/"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import text
13 | 
14 | 
class SenmangaChapterExtractor(Extractor):
    """Extractor for manga-chapters from raw.senmanga.com"""
    category = "senmanga"
    subcategory = "chapter"
    directory_fmt = ("{category}", "{manga}", "{chapter_string}")
    filename_fmt = "{manga}_{chapter_string}_{page:>03}.{extension}"
    archive_fmt = "{manga}_{chapter_string}_{page}"
    pattern = r"(?:https?://)?raw\.senmanga\.com/([^/]+/[^/]+)"
    test = (
        ("http://raw.senmanga.com/Bokura-wa-Minna-Kawaisou/37A/1", {
            "url": "5f95140ff511d8497e2ec08fa7267c6bb231faec",
            "keyword": "705d941a150765edb33cd2707074bd703a93788c",
            "content": "0e37b1995708ffc175f2e175d91a518e6948c379",
        }),
        ("http://raw.senmanga.com/Love-Lab/2016-03/1", {
            "url": "8347b9f00c14b864dd3c19a1f5ae52adb2ef00de",
            "keyword": "8a8ab2529ba2edfc83a6b3a8bede1d6c580db7b4",
        }),
    )
    root = "https://raw.senmanga.com"

    def __init__(self, match):
        """Derive the chapter and viewer URLs from the matched path"""
        Extractor.__init__(self, match)
        # "<manga>/<chapter>" part of the URL
        chapter_path = match.group(1)
        self.chapter_url = "{}/{}/".format(self.root, chapter_path)
        self.img_url = "{}/viewer/{}/".format(self.root, chapter_path)
        # all requests of this session carry the chapter page as referrer
        self.session.headers["Referer"] = self.chapter_url

    def items(self):
        """Yield the version, directory and one URL message per page"""
        data = self.metadata()
        yield Message.Version, 1
        yield Message.Directory, data

        last_page = data["count"]
        for page_number in range(1, last_page + 1):
            # the same dict is updated in place and reused for every page
            data["page"] = page_number
            data["extension"] = None
            yield Message.Url, self.img_url + str(page_number), data

    def metadata(self):
        """Collect metadata for extractor-job"""
        page = self.request(self.chapter_url).text
        # drop whatever cookies the chapter request left in the session
        self.session.cookies.clear()

        title, pos = text.extract(page, '<title>', '</title>')
        count, pos = text.extract(page, '</select> of ', '\n', pos)
        # the title is split at " - Chapter "; a trailing " - Page ..."
        # part is cut from the chapter string below
        manga, _, chapter = title.partition(" - Chapter ")
        chapter_string = chapter.partition(" - Page ")[0]

        # NOTE(review): "jp" is kept as is; the ISO 639-1 code for
        # Japanese would be "ja" -- confirm before changing, the test
        # hashes above depend on this value
        return {
            "manga": text.unescape(manga).replace("-", " "),
            "chapter_string": chapter_string,
            "count": text.parse_int(count),
            "lang": "jp",
            "language": "Japanese",
        }
66 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/e621.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2014-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://e621.net/"""
10 | 
11 | from . import booru
12 | 
13 | 
class E621Extractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    """Shared settings for all e621.net extractors"""
    category = "e621"
    # JSON post listing endpoint
    api_url = "https://e621.net/post/index.json"
    # '{}' is a placeholder in the URL of a single post's page
    post_url = "https://e621.net/post/show/{}"
    page_limit = 750
20 | 
21 | 
class E621TagExtractor(booru.TagMixin, E621Extractor):
    """Extractor for e621.net posts found through a tag search"""
    # accepts both the '/post/index/<page>/<tags>' and '/post?tags=<tags>'
    # URL forms; the search string is captured as named group 'tags'
    pattern = (
        r"(?:https?://)?(?:www\.)?e621\.net/post"
        r"(?:/index/\d+/|\?tags=)(?P<tags>[^/?&#]+)"
    )
    test = (
        ("https://e621.net/post/index/1/anry", {
            "url": "8021e5ea28d47c474c1ffc9bd44863c4d45700ba",
            "content": "501d1e5d922da20ee8ff9806f5ed3ce3a684fd58",
        }),
        ("https://e621.net/post?tags=anry"),
    )
33 | 
34 | 
class E621PoolExtractor(booru.PoolMixin, E621Extractor):
    """Extractor for e621.net image pools"""
    # the numeric pool ID is captured as named group 'pool'
    pattern = r"(?:https?://)?(?:www\.)?e621\.net/pool/show/(?P<pool>\d+)"
    test = (
        "https://e621.net/pool/show/73",
        {
            "url": "842f2fb065c7c339486a9b1d689020b8569888ed",
            "content": "c2c87b7a9150509496cddc75ccab08109922876a",
        },
    )
42 | 
43 | 
class E621PostExtractor(booru.PostMixin, E621Extractor):
    """Extractor for individual e621.net posts"""
    # the numeric post ID is captured as named group 'post'
    pattern = r"(?:https?://)?(?:www\.)?e621\.net/post/show/(?P<post>\d+)"
    test = (
        "https://e621.net/post/show/535",
        {
            "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529",
            "content": "66f46e96a893fba8e694c4e049b23c2acc9af462",
            "options": (("tags", True),),
            "keyword": {
                "tags_artist": "anry",
                "tags_general": str,
                "tags_species": str,
            },
        },
    )
57 | 
58 | 
class E621PopularExtractor(booru.MoebooruPopularMixin, E621Extractor):
    """Extractor for popular images from e621.net"""
    # 'scale' is one of day/week/month; an optional query string is
    # captured as 'query'
    pattern = (
        r"(?:https?://)?(?:www\.)?e621\.net"
        r"/post/popular_by_(?P<scale>day|week|month)"
        r"(?:\?(?P<query>[^#]*))?"
    )
    test = (
        "https://e621.net/post/popular_by_month?month=6&year=2013",
        {
            "count": 32,
        },
    )

    def __init__(self, match):
        """Run the inherited setup and select the matching JSON endpoint"""
        super().__init__(match)
        # self.scale is not set here; presumably the inherited initializer
        # derives it from the 'scale' group -- verify against the mixin
        endpoint = "https://e621.net/post/popular_by_{scale}.json"
        self.api_url = endpoint.format(scale=self.scale)
72 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/hypnohub.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extractors for https://hypnohub.net/"""
10 | 
11 | from . import booru
12 | 
13 | 
class HypnohubExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    """Shared settings for all hypnohub.net extractors"""
    category = "hypnohub"
    # JSON post listing endpoint
    api_url = "https://hypnohub.net/post.json"
    # '{}' is a placeholder in the URL of a single post's page
    post_url = "https://hypnohub.net/post/show/{}"
19 | 
20 | 
class HypnohubTagExtractor(booru.TagMixin, HypnohubExtractor):
    """Extractor for hypnohub.net posts found through a tag search"""
    # 'tags' may appear anywhere in the query string
    pattern = (
        r"(?:https?://)?(?:www\.)?hypnohub\.net"
        r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"
    )
    test = (
        "https://hypnohub.net/post?tags=gonoike_biwa",
        {
            "url": "0deaf1a2f832cfc4354c531259b949e850da1e7e",
        },
    )
28 | 
29 | 
class HypnohubPoolExtractor(booru.PoolMixin, HypnohubExtractor):
    """Extractor for hypnohub.net image pools"""
    # the numeric pool ID is captured as named group 'pool'
    pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/pool/show/(?P<pool>\d+)"
    test = (
        "https://hypnohub.net/pool/show/61",
        {
            "url": "fd74991c8729e77acd3c35eb6ddc4128ff445adf",
        },
    )
36 | 
37 | 
class HypnohubPostExtractor(booru.PostMixin, HypnohubExtractor):
    """Extractor for individual hypnohub.net posts"""
    # the numeric post ID is captured as named group 'post'
    pattern = r"(?:https?://)?(?:www\.)?hypnohub\.net/post/show/(?P<post>\d+)"
    test = (
        "https://hypnohub.net/post/show/73964",
        {
            "content": "02d5f5a8396b621a6efc04c5f8ef1b7225dfc6ee",
            "options": (("tags", True),),
            "keyword": {
                "tags_artist": "gonoike_biwa icontrol_(manipper)",
                "tags_character": "komaru_naegi",
                "tags_copyright": "dangan_ronpa dangan_ronpa_another_episode",
                "tags_general": str,
            },
        },
    )
51 | 
52 | 
class HypnohubPopularExtractor(booru.MoebooruPopularMixin, HypnohubExtractor):
    """Extractor for the popular-post listings of hypnohub.net"""
    # 'scale' is one of by_day/by_week/by_month/recent; an optional query
    # string is captured as 'query'
    pattern = (
        r"(?:https?://)?(?:www\.)?hypnohub\.net"
        r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
        r"(?:\?(?P<query>[^#]*))?"
    )
    test = (
        ("https://hypnohub.net/post/popular_by_month?month=6&year=2014", {
            "count": 20,
        }),
        ("https://hypnohub.net/post/popular_recent"),
    )

    def __init__(self, match):
        """Run the inherited setup and select the matching JSON endpoint"""
        super().__init__(match)
        # self.scale is not set here; presumably the inherited initializer
        # derives it from the 'scale' group -- verify against the mixin
        endpoint = "https://hypnohub.net/post/popular_{scale}.json"
        self.api_url = endpoint.format(scale=self.scale)
69 | 


--------------------------------------------------------------------------------
/gallery_dl/postprocessor/zip.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Store files in ZIP archives"""
10 | 
11 | from .common import PostProcessor
12 | import zipfile
13 | import os
14 | 
15 | 
class ZipPP(PostProcessor):
    """Post-processor that collects downloaded files in a ZIP archive"""

    # maps the "compression" option value to the zipfile constant
    COMPRESSION_ALGORITHMS = {
        "store": zipfile.ZIP_STORED,
        "zip": zipfile.ZIP_DEFLATED,
        "bzip2": zipfile.ZIP_BZIP2,
        "lzma": zipfile.ZIP_LZMA,
    }

    def __init__(self, pathfmt, options):
        """Read the options and prepare (or directly open) the archive

        Options read: "keep-files", "extension", "compression" and "mode".
        With mode "safe" the archive is opened anew for every file;
        otherwise it is opened once here and closed in finalize().
        """
        PostProcessor.__init__(self)
        self.delete = not options.get("keep-files", False)
        extension = "." + options.get("extension", "zip")
        algorithm = options.get("compression", "store")

        try:
            compression = self.COMPRESSION_ALGORITHMS[algorithm]
        except KeyError:
            self.log.warning(
                "unknown compression algorithm '%s'; falling back to 'store'",
                algorithm)
            compression = self.COMPRESSION_ALGORITHMS["store"]

        self.path = pathfmt.realdirectory
        # archive path: the directory path minus its last character, plus
        # the extension; opened in append mode with the ZIP64 flag enabled
        zip_args = (self.path[:-1] + extension, "a", compression, True)

        if options.get("mode") == "safe":
            self.run = self._write_safe
            self.zfile = None
            self.args = zip_args
        else:
            self.run = self._write
            self.zfile = zipfile.ZipFile(*zip_args)

    def _write(self, pathfmt, zfile=None):
        """Add the current file to 'zfile' unless its name is already stored

        Falls back to the archive opened in __init__ when 'zfile' is None.
        """
        # 'NameToInfo' is not officially documented, but it's available
        # for all supported Python versions and using it directly is a lot
        # faster than calling getinfo()
        archive = self.zfile if zfile is None else zfile
        if pathfmt.filename in archive.NameToInfo:
            return
        archive.write(pathfmt.temppath, pathfmt.filename)
        pathfmt.delete = self.delete

    def _write_safe(self, pathfmt):
        """Open the archive, add the current file and close it again"""
        with zipfile.ZipFile(*self.args) as archive:
            self._write(pathfmt, archive)

    def finalize(self):
        """Close the archive and, unless files are kept, clean up"""
        if self.zfile:
            self.zfile.close()

        if not self.delete:
            return

        try:
            # remove target directory
            os.rmdir(self.path)
        except OSError:
            pass

        if self.zfile and not self.zfile.NameToInfo:
            try:
                # delete empty zip archive
                os.unlink(self.zfile.filename)
            except OSError:
                pass


__postprocessor__ = ZipPP
82 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/test.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2016-2017 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Utility extractor to execute tests of other extractors"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import extractor, exception
13 | 
14 | 
class TestExtractor(Extractor):
    """Extractor to select and run the test URLs of other extractors

    The general form is 'test:<categories>:<subcategories>:<indices>', where
    <categories> and <subcategories> are comma-separated (sub)category names
    and <indices> is a comma-separated list of array indices.
    To select all possible values for a field use the star '*' character or
    leave the field empty.

    Examples:
        - test:pixiv
            run all pixiv tests

        - test:pixiv:user,favorite:0
            run the first test of the PixivUser- and PixivFavoriteExtractor

        - test:
            run all tests
    """
    category = "test"
    pattern = r"t(?:est)?:([^:]*)(?::([^:]*)(?::(\*|[\d,]*))?)?$"
    test = (
        "test:pixiv",
        "test:pixiv:user,favorite:0",
        "test:",
    )

    def __init__(self, match):
        Extractor.__init__(self, match)
        cats, subcats, idx = match.groups()
        self.categories = self._split(cats)
        self.subcategories = self._split(subcats)
        # Without an explicit index list, the extractor itself stands in
        # as a container that reports every index as present
        # (see __contains__ below).
        self.indices = self._split(idx) or self

    def items(self):
        """Queue the URL of every selected test"""
        candidates = extractor.extractors()

        wanted = self.categories
        if wanted:
            candidates = [
                cls for cls in candidates
                if cls.category in wanted
            ]

        wanted = self.subcategories
        if wanted:
            candidates = [
                cls for cls in candidates
                if cls.subcategory in wanted
            ]

        # indices are compared as strings, since they come from the URL
        selected = []
        for cls in candidates:
            for index, test in enumerate(cls._get_tests()):
                if str(index) in self.indices:
                    selected.append(test)

        if not selected:
            raise exception.NotFoundError("test")

        yield Message.Version, 1
        for test in selected:
            yield Message.Queue, test[0], {}

    @staticmethod
    def __contains__(_):
        # membership test used when 'self.indices' is the extractor itself
        return True

    @staticmethod
    def _split(value):
        """Split a comma-separated field; None for empty, missing or '*'"""
        if not value or value == "*":
            return None
        return value.split(",")
87 | 


--------------------------------------------------------------------------------
/test/test_config.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | # -*- coding: utf-8 -*-
 3 | 
 4 | # Copyright 2015-2017 Mike Fährmann
 5 | #
 6 | # This program is free software; you can redistribute it and/or modify
 7 | # it under the terms of the GNU General Public License version 2 as
 8 | # published by the Free Software Foundation.
 9 | 
10 | import unittest
11 | import gallery_dl.config as config
12 | import os
13 | import tempfile
14 | 
15 | 
class TestConfig(unittest.TestCase):
    """Tests for the accessor functions of gallery_dl.config

    Every test starts from the configuration
        {"a": "1", "b": {"a": 2, "c": "text"}}
    which setUp() writes to a temporary file and loads.
    """

    def setUp(self):
        fd, self._configfile = tempfile.mkstemp()
        # Cleanups are registered right away instead of living in
        # tearDown(): unittest skips tearDown() when setUp() raises, but it
        # still runs registered cleanups, so neither the temporary file nor
        # partially loaded config values outlive a failed setUp().
        # Cleanups run last-in-first-out: config.clear(), then os.remove().
        self.addCleanup(os.remove, self._configfile)
        self.addCleanup(config.clear)

        with os.fdopen(fd, "w") as file:
            file.write('{"a": "1", "b": {"a": 2, "c": "text"}}')
        config.load((self._configfile,))

    def test_get(self):
        # existing values, top-level and nested
        self.assertEqual(config.get(["a"]), "1")
        self.assertEqual(config.get(["b", "c"]), "text")
        # missing keys yield None or the supplied default
        self.assertEqual(config.get(["d"]), None)
        self.assertEqual(config.get(["e", "f", "g"], 123), 123)

    def test_interpolate(self):
        self.assertEqual(config.interpolate(["a"]), "1")
        # the top-level "a" is returned here, not the nested value 2
        self.assertEqual(config.interpolate(["b", "a"]), "1")
        self.assertEqual(config.interpolate(["b", "c"], "2"), "text")
        self.assertEqual(config.interpolate(["b", "d"], "2"), "2")
        # once a top-level "d" exists, it is found for nested lookups too
        config.set(["d"], 123)
        self.assertEqual(config.interpolate(["b", "d"], "2"), 123)
        self.assertEqual(config.interpolate(["d", "d"], "2"), 123)

    def test_set(self):
        # overwrite an existing value and create a new nested one
        config.set(["b", "c"], [1, 2, 3])
        config.set(["e", "f", "g"], value=234)
        self.assertEqual(config.get(["b", "c"]), [1, 2, 3])
        self.assertEqual(config.get(["e", "f", "g"]), 234)

    def test_setdefault(self):
        # existing values are kept; only missing ones get set
        config.setdefault(["b", "c"], [1, 2, 3])
        config.setdefault(["e", "f", "g"], value=234)
        self.assertEqual(config.get(["b", "c"]), "text")
        self.assertEqual(config.get(["e", "f", "g"]), 234)

    def test_unset(self):
        config.unset(["a"])
        config.unset(["b", "c"])
        # unsetting a path that does not exist must not raise
        config.unset(["c", "d"])
        self.assertEqual(config.get(["a"]), None)
        self.assertEqual(config.get(["b", "a"]), 2)
        self.assertEqual(config.get(["b", "c"]), None)

    def test_apply(self):
        options = (
            (["b", "c"], [1, 2, 3]),
            (["e", "f", "g"], 234),
        )

        self.assertEqual(config.get(["b", "c"]), "text")
        self.assertEqual(config.get(["e", "f", "g"]), None)

        # the options are only in effect inside the 'with' block
        with config.apply(options):
            self.assertEqual(config.get(["b", "c"]), [1, 2, 3])
            self.assertEqual(config.get(["e", "f", "g"]), 234)

        self.assertEqual(config.get(["b", "c"]), "text")
        self.assertEqual(config.get(["e", "f", "g"]), None)
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     unittest.main()
82 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/khinsider.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2016-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract soundtracks from https://downloads.khinsider.com/"""
10 | 
11 | from .common import Extractor, Message, AsynchronousMixin
12 | from .. import text, exception
13 | 
14 | 
class KhinsiderSoundtrackExtractor(AsynchronousMixin, Extractor):
    """Extractor for soundtracks from khinsider.com"""
    category = "khinsider"
    subcategory = "soundtrack"
    directory_fmt = ("{category}", "{album}")
    archive_fmt = "{album}_{filename}.{extension}"
    pattern = (
        r"(?:https?://)?downloads\.khinsider\.com"
        r"/game-soundtracks/album/([^/?&#]+)"
    )
    test = (
        ("https://downloads.khinsider.com"
         "/game-soundtracks/album/horizon-riders-wii"),
        {
            "pattern": (
                r"https?://\d+\.\d+\.\d+\.\d+/ost/horizon-riders-wii/[^/]+"
                r"/Horizon%20Riders%20Wii%20-%20Full%20Soundtrack\.mp3"
            ),
            "count": 1,
            "keyword": "b4f460c78dd23e1f1121f4ac784dd67ded7c2679",
        },
    )
    root = "https://downloads.khinsider.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.album = match.group(1)

    def items(self):
        """Yield the album metadata followed by one URL message per track"""
        album_url = self.root + "/game-soundtracks/album/" + self.album
        page = self.request(album_url, encoding="utf-8").text
        data = self.get_job_metadata(page)

        yield Message.Version, 1
        yield Message.Directory, data

        for track_url, track in self.get_album_tracks(page):
            # album-wide metadata is merged into every track
            track.update(data)
            yield Message.Url, track_url, track

    def get_job_metadata(self, page):
        """Extract album metadata from the album page

        Raises NotFoundError if the page lacks the download notice that
        is expected on soundtrack pages.
        """
        if "Download all songs at once:" not in page:
            raise exception.NotFoundError("soundtrack")

        fields = (
            ("album", "Album name: <b>", "</b>"),
            ("count", "Number of Files: <b>", "</b>"),
            ("size" , "Total Filesize: <b>", "</b>"),
            ("date" , "Date added: <b>", "</b>"),
            ("type" , "Album type: <b>", "</b>"),
        )
        data = text.extract_all(page, fields)[0]
        data["album"] = text.unescape(data["album"])
        return data

    def get_album_tracks(self, page):
        """Yield (download-url, metadata) for every track in the songlist

        Each track has its own page, which is requested to find the
        actual download URL.
        """
        songlist = text.extract(page, '<table id="songlist">', '</table>')[0]
        links = text.extract_iter(
            songlist, '<td class="clickable-row"><a href="', '"')

        for num, href in enumerate(links, 1):
            track_page_url = text.urljoin(self.root, href)
            track_page = self.request(track_page_url, encoding="utf-8").text
            download_url = text.extract(
                track_page, '<p><a style="color: #21363f;" href="', '"')[0]
            yield download_url, text.nameext_from_url(
                download_url, {"num": num})
70 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/gfycat.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2017-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://gfycat.com/"""
10 | 
11 | from .common import Extractor, Message
12 | 
13 | 
class GfycatExtractor(Extractor):
    """Base class for gfycat extractors"""
    category = "gfycat"
    filename_fmt = "{category}_{gfyName}{title:?_//}.{extension}"
    archive_fmt = "{gfyName}"
    root = "https://gfycat.com"

    def __init__(self, match):
        Extractor.__init__(self, match)
        # the configured format is tried first, then the fixed fallbacks
        preferred = self.config("format", "mp4")
        self.formats = (preferred, "mp4", "webm", "gif")

    def _select_format(self, gfyitem):
        """Return the URL of the first available format

        Also stores that URL's file extension in gfyitem["extension"].
        Returns an empty string if no '<format>Url' key is present.
        """
        for fmt in self.formats:
            key = fmt + "Url"
            if key not in gfyitem:
                continue
            url = gfyitem[key]
            gfyitem["extension"] = url.rpartition(".")[2]
            return url
        return ""

    def _get_info(self, gfycat_id):
        """Fetch the 'gfyItem' object for 'gfycat_id' from the gfycat API"""
        response = self.request(
            "https://api.gfycat.com/v1/gfycats/" + gfycat_id)
        return response.json()["gfyItem"]
37 | 
38 | 
class GfycatImageExtractor(GfycatExtractor):
    """Extractor for individual images from gfycat.com"""
    subcategory = "image"
    pattern = (
        r"(?:https?://)?(?:\w+\.)?gfycat\.com"
        r"/(?:gifs/detail/|\w+/)?([A-Za-z]+)"
    )
    test = (
        (
            "https://gfycat.com/GrayGenerousCowrie",
            {
                "url": "e0b5e1d7223108249b15c3c7898dd358dbfae045",
                "content": "5786028e04b155baa20b87c5f4f77453cd5edc37",
                "keyword": {
                    "gfyId": "graygenerouscowrie",
                    "gfyName": "GrayGenerousCowrie",
                    "gfyNumber": "755075459",
                    "title": "Bottom's up",
                    "userName": "jackson3oh3",
                    "createDate": 1495884169,
                    "md5": "a4796e05b0db9ba9ce5140145cd318aa",
                    "width": 400,
                    "height": 224,
                    "frameRate": 23,
                    "numFrames": 158,
                    "views": int,
                },
            },
        ),
        (
            ("https://thumbs.gfycat.com/SillyLameIsabellinewheatear"
             "-size_restricted.gif"),
            {"url": "13b32e6cc169d086577d7dd3fd36ee6cdbc02726"},
        ),
        (
            "https://gfycat.com/detail/UnequaledHastyAnkole?tagname=aww",
            {"url": "e24c9f69897fd223343782425a429c5cab6a768e"},
        ),
        "https://gfycat.com/gifs/detail/UnequaledHastyAnkole",
        "https://gfycat.com/ifr/UnequaledHastyAnkole",
        "https://gfycat.com/ru/UnequaledHastyAnkole",
    )

    def __init__(self, match):
        GfycatExtractor.__init__(self, match)
        self.gfycat_id = match.group(1)

    def items(self):
        """Yield the messages for the single requested gfycat item"""
        info = self._get_info(self.gfycat_id)
        yield Message.Version, 1
        yield Message.Directory, info
        # _select_format() also adds the "extension" entry to 'info'
        yield Message.Url, self._select_format(info), info
84 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/konachan.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://konachan.com/"""
10 | 
11 | from . import booru
12 | 
13 | 
class KonachanExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    """Base class for konachan extractors"""
    category = "konachan"

    def __init__(self, match):
        # Build the endpoint URLs for whichever top-level domain the
        # matched URL used; they are assigned before the base-class
        # initializer runs.
        base = "https://konachan." + match.group("tld")
        self.api_url = base + "/post.json"
        self.post_url = base + "/post/show/{}"
        super().__init__(match)
23 | 
24 | 
class KonachanTagExtractor(booru.TagMixin, KonachanExtractor):
    """Extractor for images from konachan.com based on search-tags"""
    pattern = (
        r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
        r"/post\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"
    )
    test = (
        (
            "https://konachan.com/post?tags=patata",
            {"content": "838cfb815e31f48160855435655ddf7bfc4ecb8d"},
        ),
        "https://konachan.net/post?tags=patata",
    )
35 | 
36 | 
class KonachanPoolExtractor(booru.PoolMixin, KonachanExtractor):
    """Extractor for image-pools from konachan.com"""
    pattern = (
        r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
        r"/pool/show/(?P<pool>\d+)"
    )
    test = (
        (
            "https://konachan.com/pool/show/95",
            {"content": "cf0546e38a93c2c510a478f8744e60687b7a8426"},
        ),
        "https://konachan.net/pool/show/95",
    )
47 | 
48 | 
class KonachanPostExtractor(booru.PostMixin, KonachanExtractor):
    """Extractor for single images from konachan.com"""
    pattern = (
        r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
        r"/post/show/(?P<post>\d+)"
    )
    test = (
        (
            "https://konachan.com/post/show/205189",
            {
                "content": "674e75a753df82f5ad80803f575818b8e46e4b65",
                # run this test with the "tags" option enabled
                "options": (("tags", True),),
                "keyword": {
                    "tags_artist": "patata",
                    "tags_character": "clownpiece",
                    "tags_copyright": "touhou",
                    "tags_general": str,
                },
            },
        ),
        "https://konachan.net/post/show/205189",
    )
66 | 
67 | 
class KonachanPopularExtractor(booru.MoebooruPopularMixin, KonachanExtractor):
    """Extractor for popular images from konachan.com"""
    pattern = (
        r"(?:https?://)?(?:www\.)?konachan\.(?P<tld>com|net)"
        r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
        r"(?:\?(?P<query>[^#]*))?"
    )
    test = (
        (
            "https://konachan.com/post/popular_by_month?month=11&year=2010",
            {"count": 20},
        ),
        "https://konachan.com/post/popular_recent",
        "https://konachan.net/post/popular_recent",
    )

    def __init__(self, match):
        super().__init__(match)
        # replace the generic API URL with the popular-posts endpoint for
        # the requested scale; 'self.scale' is available once the base
        # initializer has run
        template = "https://konachan.{tld}/post/popular_{scale}.json"
        self.api_url = template.format(
            tld=match.group("tld"), scale=self.scale)
86 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/3dbooru.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2015-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from http://behoimi.org/"""
10 | 
11 | from . import booru
12 | 
13 | 
class ThreedeebooruExtractor(booru.MoebooruPageMixin, booru.BooruExtractor):
    """Base class for 3dbooru extractors"""
    category = "3dbooru"
    api_url = "http://behoimi.org/post/index.json"
    post_url = "http://behoimi.org/post/show/{}"
    page_limit = 1000

    def __init__(self, match):
        super().__init__(match)
        # extra headers sent with every request of this extractor's session
        extra_headers = {
            "Referer": "http://behoimi.org/post/show/",
            "Accept-Encoding": "identity",
        }
        self.session.headers.update(extra_headers)
27 | 
28 | 
class ThreedeebooruTagExtractor(booru.TagMixin, ThreedeebooruExtractor):
    """Extractor for images from behoimi.org based on search-tags"""
    pattern = (
        r"(?:https?://)?(?:www\.)?behoimi\.org/post"
        r"(?:/(?:index)?)?\?tags=(?P<tags>[^&#]+)"
    )
    test = (
        "http://behoimi.org/post?tags=himekawa_azuru+dress",
        {
            "url": "ecb30c6aaaf8a6ff8f55255737a9840832a483c1",
            "content": "11cbda40c287e026c1ce4ca430810f761f2d0b2a",
        },
    )
38 | 
39 | 
class ThreedeebooruPoolExtractor(booru.PoolMixin, ThreedeebooruExtractor):
    """Extractor for image-pools from behoimi.org"""
    pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/pool/show/(?P<pool>\d+)"
    test = (
        "http://behoimi.org/pool/show/27",
        {
            "url": "da75d2d1475449d5ef0c266cb612683b110a30f2",
            "content": "fd5b37c5c6c2de4b4d6f1facffdefa1e28176554",
        },
    )
48 | 
49 | 
class ThreedeebooruPostExtractor(booru.PostMixin, ThreedeebooruExtractor):
    """Extractor for single images from behoimi.org"""
    pattern = r"(?:https?://)?(?:www\.)?behoimi\.org/post/show/(?P<post>\d+)"
    test = (
        "http://behoimi.org/post/show/140852",
        {
            "url": "ce874ea26f01d6c94795f3cc3aaaaa9bc325f2f6",
            "content": "26549d55b82aa9a6c1686b96af8bfcfa50805cd4",
            # run this test with the "tags" option enabled
            "options": (("tags", True),),
            "keyword": {
                "tags_character": "furude_rika",
                "tags_copyright": "higurashi_no_naku_koro_ni",
                "tags_model": "himekawa_azuru",
                "tags_general": str,
            },
        },
    )
65 | 
66 | 
class ThreedeebooruPopularExtractor(booru.MoebooruPopularMixin,
                                    ThreedeebooruExtractor):
    """Extractor for popular images from behoimi.org"""
    pattern = (
        r"(?:https?://)?(?:www\.)?behoimi\.org"
        r"/post/popular_(?P<scale>by_(?:day|week|month)|recent)"
        r"(?:\?(?P<query>[^#]*))?"
    )
    test = (
        "http://behoimi.org/post/popular_by_month?month=2&year=2013",
        {
            "url": "f5a26c624da9a3d1dbc610e4a614bc57df6251c5",
            "count": 20,
        },
    )

    def __init__(self, match):
        super().__init__(match)
        # replace the class-level API URL with the popular-posts endpoint
        # for the requested scale; 'self.scale' is available once the
        # base initializer has run
        template = "http://behoimi.org/post/popular_{scale}.json"
        self.api_url = template.format(scale=self.scale)
82 | 


--------------------------------------------------------------------------------
/snap/snapcraft.yaml:
--------------------------------------------------------------------------------
  1 | %YAML 1.1
  2 | ---
  3 | # Snapcraft Recipe for gallery-dl
  4 | # ------------------------------
  5 | # This file is in the YAML data serialization format:
  6 | # http://yaml.org
  7 | # For the specification of this file format, refer to the following documentation:
  8 | # * The snapcraft format
  9 | #   https://docs.snapcraft.io/the-snapcraft-format/8337
 10 | # * Snap Documentation
 11 | #   https://docs.snapcraft.io
 12 | # * Topics under the doc category in the Snapcraft Forum
 13 | #   https://forum.snapcraft.io/c/doc
 14 | # For support refer to the snapcraft section in the Snapcraft Forum:
 15 | # https://forum.snapcraft.io/c/snapcraft
 16 | name: gallery-dl
 17 | license: GPL-2.0
 18 | base: core
 19 | summary: Download image-galleries and -collections from several image hosting sites
 20 | description: |
 21 |   `gallery-dl` is a command-line program to download image-galleries and -collections from several image hosting sites (see [Supported Sites][1]). It is a cross-platform tool with many configuration options and powerful filenaming capabilities.
 22 | 
 23 |   [1]: https://github.com/mikf/gallery-dl/blob/master/docs/supportedsites.rst
 24 | 
 25 | adopt-info: gallery-dl
 26 | confinement: strict
 27 | grade: stable
 28 | 
 29 | plugs:
 30 |   # For `xdg-open` command access for opening OAuth authentication webpages
 31 |   desktop:
 32 | 
 33 |   # Storage access
 34 |   home:
 35 |   removable-media: # Non-A/C
 36 | 
 37 |   # Network access
 38 |   network:
 39 | 
 40 |   # For the network service that receives OAuth callback tokens
 41 |   network-bind:
 42 | 
 43 |   # Configuration access
 44 |   config-gallery-dl:
 45 |     interface: personal-files
 46 |     read:
 47 |     - $HOME/.config/gallery-dl
 48 |     - $HOME/.gallery-dl.conf
 49 |   etc-gallery-dl:
 50 |     interface: system-files
 51 |     read:
 52 |     - /etc/gallery-dl.conf
 53 | 
 54 | parts:
 55 |   # Launcher programs to fix problems at runtime
 56 |   launchers:
 57 |     source: snap/local/launchers
 58 |     plugin: dump
 59 |     organize:
 60 |       '*': bin/
 61 | 
 62 |   # Check out the tagged release revision if it isn’t promoted to the stable channel
 63 |   # https://forum.snapcraft.io/t/selective-checkout-check-out-the-tagged-release-revision-if-it-isnt-promoted-to-the-stable-channel/10617
 64 |   selective-checkout:
 65 |     plugin: nil
 66 |     build-packages:
 67 |     - git
 68 |     stage-snaps:
 69 |     - selective-checkout
 70 |     prime:
 71 |     - -*
 72 | 
 73 |   gallery-dl:
 74 |     after:
 75 |     - selective-checkout
 76 | 
 77 |     source: .
 78 |     override-pull: |
 79 |       snapcraftctl pull
 80 |       $SNAPCRAFT_STAGE/scriptlets/selective-checkout
 81 | 
 82 |     plugin: python
 83 |     build-packages:
 84 |     - make
 85 |     python-packages:
 86 |     - youtube_dl
 87 |     override-build: |
 88 |       # build manpages and bash completion
 89 |       make man completion
 90 | 
 91 |       snapcraftctl build
 92 | 
 93 |   ffmpeg:
 94 |     plugin: nil
 95 |     stage-packages:
 96 |     - ffmpeg
 97 | 
 98 | apps:
 99 |   gallery-dl:
100 |     adapter: full
101 |     command-chain:
102 |     - bin/gallery-dl-launch
103 |     command: bin/gallery-dl
104 |     completer: etc/bash_completion.d/gallery-dl.bash_completion
105 |     environment:
106 |       LANG: C.UTF-8
107 |       LC_ALL: C.UTF-8
108 | 
109 |       # Satisfy FFmpeg's libpulsecommon dependency
110 |       LD_LIBRARY_PATH: $LD_LIBRARY_PATH:$SNAP/usr/lib/$SNAPCRAFT_ARCH_TRIPLET/pulseaudio
111 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/2chan.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2017-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://www.2chan.net/"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import text
13 | 
14 | 
class FutabaThreadExtractor(Extractor):
    """Extractor for images from threads on www.2chan.net"""
    category = "2chan"
    subcategory = "thread"
    directory_fmt = ("{category}", "{board_name}", "{thread}")
    filename_fmt = "{tim}.{extension}"
    archive_fmt = "{board}_{thread}_{tim}"
    url_fmt = "https://{server}.2chan.net/{board}/src/{filename}"
    pattern = r"(?:https?://)?([^.]+)\.2chan\.net/([^/]+)/res/(\d+)"
    test = (
        "http://dec.2chan.net/70/res/947.htm",
        {
            "url": "c5c12b80b290e224b6758507b3bb952044f4595b",
            "keyword": "4bd22e7a9c3636faecd6ea7082509e8655e10dd0",
        },
    )

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.server, self.board, self.thread = match.groups()

    def items(self):
        """Yield thread metadata and one URL message per post with a file"""
        thread_url = "https://{}.2chan.net/{}/res/{}.htm".format(
            self.server, self.board, self.thread)
        page = self.request(thread_url).text
        data = self.metadata(page)

        yield Message.Version, 1
        yield Message.Directory, data

        for post in self.posts(page):
            # only posts with an attached file carry a "filename" entry
            if "filename" in post:
                post.update(data)
                yield Message.Url, self.url_fmt.format_map(post), post

    def metadata(self, page):
        """Collect metadata for extractor-job"""
        page_title = text.extract(page, "<title>", "</title>")[0]
        # the page title has the form '<thread title> - <board name>'
        title, _, boardname = page_title.rpartition(" - ")
        return {
            "server": self.server,
            "title": title,
            "board": self.board,
            # the last four characters of the board part are dropped
            "board_name": boardname[:-4],
            "thread": self.thread,
        }

    def posts(self, page):
        """Build a list of all post-objects"""
        thread_html = text.extract(
            page, '<div class="thre"', '<div style="clear:left"></div>')[0]
        chunks = thread_html.split('<table border=0>')
        return list(map(self.parse, chunks))

    def parse(self, post):
        """Build post-object by extracting data from an HTML post"""
        data = self._extract_post(post)
        if '<a href="/' not in post:
            # no file link in this post
            return data

        self._extract_image(post, data)
        tim, _, extension = data["filename"].partition(".")
        data["tim"] = tim
        data["extension"] = extension
        data["time"] = tim[:-3]
        data["ext"] = "." + extension
        return data

    @staticmethod
    def _extract_post(post):
        """Extract the text fields of a post into a new dict"""
        rules = (
            ("no"  , 'name="', '"'),
            ("post", '<b>', '</b>'),
            ("name", '<b>', ' </b>'),
            ("now" , '</font> ', ' '),
            (None  , '<blockquote', ''),
            ("com" , '>', '</blockquote>'),
        )
        return text.extract_all(post, rules)[0]

    @staticmethod
    def _extract_image(post, data):
        """Add the file-related fields of a post to 'data'"""
        rules = (
            (None      , '_blank', ''),
            ("filename", '>', '<'),
            ("fsize"   , '(', ' '),
        )
        text.extract_all(post, rules, 0, data)
96 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/slideshare.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2016-2019 Mike Fährmann, Leonardo Taccari
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://www.slideshare.net/"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import text
13 | 
14 | 
class SlidesharePresentationExtractor(Extractor):
    """Extractor for images from a presentation on slideshare.net"""
    category = "slideshare"
    subcategory = "presentation"
    directory_fmt = ("{category}", "{user}")
    filename_fmt = "{presentation}-{num:>02}.{extension}"
    archive_fmt = "{presentation}_{num}"
    pattern = (
        r"(?:https?://)?(?:www\.)?slideshare\.net"
        r"/(?:mobile/)?([^/?&#]+)/([^/?&#]+)"
    )
    test = (
        (
            ("https://www.slideshare.net"
             "/Slideshare/get-started-with-slide-share"),
            {
                "url": "23685fb9b94b32c77a547d45dc3a82fe7579ea18",
                "content": "ee54e54898778e92696a7afec3ffabdbd98eb0cc",
            },
        ),
        # long title
        (
            ("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren"
             "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"),
            {"url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7"},
        ),
        # mobile URL
        (
            ("https://www.slideshare.net"
             "/mobile/uqudent/introduction-to-fixed-prosthodontics"),
            {"url": "59993ad7b0cb93c73011547eedcd02c622649e9d"},
        ),
    )

    def __init__(self, match):
        Extractor.__init__(self, match)
        self.user, self.presentation = match.groups()

    def items(self):
        """Yield presentation metadata and one URL message per slide"""
        presentation_url = ("https://www.slideshare.net/" + self.user +
                            "/" + self.presentation)
        page = self.request(presentation_url).text
        data = self.get_job_metadata(page)
        slides = self.get_image_urls(page)
        data["count"] = len(slides)

        yield Message.Version, 1
        yield Message.Directory, data

        # the same metadata dict is reused for every slide; only its
        # "num" entry (1-based) changes between messages
        for num, url in enumerate(slides, 1):
            data["num"] = num
            yield Message.Url, url, text.nameext_from_url(url, data)

    def get_job_metadata(self, page):
        """Collect metadata for extractor-job"""
        # every search after the first continues from the position
        # at which the previous one ended
        descr, pos = text.extract(
            page, '<meta name="description" content="', '"')
        title, pos = text.extract(
            page, '<span class="j-title-breadcrumb">', '</span>', pos)
        views, pos = text.extract(
            page, '<span class="notranslate pippin-data">', 'views<', pos)
        published, pos = text.extract(
            page, '<time datetime="', '"', pos)
        alt_descr, pos = text.extract(
            page, 'id="slideshow-description-paragraph" class="notranslate">',
            '</p>', pos)

        # a description ending in an ellipsis is swapped for the
        # description paragraph, if the page has one
        truncated = descr.endswith("…")
        if truncated and alt_descr:
            descr = text.remove_html(alt_descr).strip()

        return {
            "user": self.user,
            "presentation": self.presentation,
            "title": text.unescape(title.strip()),
            "description": text.unescape(descr),
            "views": text.parse_int(views.replace(",", "")),
            "published": published,
        }

    @staticmethod
    def get_image_urls(page):
        """Extract and return a list of all image-urls"""
        urls = text.extract_iter(page, 'data-full="', '"')
        return list(urls)
87 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/danbooru.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2014-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://danbooru.donmai.us/"""
10 | 
11 | from . import booru
12 | 
13 | 
# URL prefix shared by the extractor patterns below; the named group
# 'subdomain' captures which *.donmai.us host was used
BASE_PATTERN = r"(?:https?://)?(?P<subdomain>{})\.donmai\.us".format(
    "|".join(("danbooru", "hijiribe", "sonohara", "safebooru")))
18 | 
19 | 
class DanbooruExtractor(booru.DanbooruPageMixin, booru.BooruExtractor):
    """Common base for all danbooru extractors"""
    category = "danbooru"
    page_limit = 1000

    def __init__(self, match):
        """Derive scheme and API URL from the matched subdomain"""
        super().__init__(match)

        subdomain = match.group("subdomain")
        # every subdomain except 'danbooru' is accessed over plain HTTP
        scheme = "http" if subdomain != "danbooru" else "https"

        self.subdomain = subdomain
        self.scheme = scheme
        self.api_url = "{scheme}://{subdomain}.donmai.us/posts.json".format(
            scheme=scheme, subdomain=subdomain)
        self.ugoira = self.config("ugoira", True)

        username, api_key = self._get_auth_info()
        if not username:
            return
        self.log.debug("Using HTTP Basic Auth for user '%s'", username)
        self.session.auth = (username, api_key)
37 | 
38 | 
class DanbooruTagExtractor(booru.TagMixin, DanbooruExtractor):
    """Extractor for danbooru posts selected by search-tags"""
    pattern = BASE_PATTERN + r"/posts\?(?:[^&#]*&)*tags=(?P<tags>[^&#]+)"
    test = (
        (
            "https://danbooru.donmai.us/posts?tags=bonocho",
            {"content": "b196fb9f1668109d7774a0a82efea3ffdda07746"},
        ),
        # test page transitions
        (
            "https://danbooru.donmai.us/posts?tags=canvas_%28cocktail_soft%29",
            {"count": ">= 50"},
        ),
        # plain URL strings without an expected-results dict
        "https://hijiribe.donmai.us/posts?tags=bonocho",
        "https://sonohara.donmai.us/posts?tags=bonocho",
        "https://safebooru.donmai.us/posts?tags=bonocho",
    )
54 | 
55 | 
class DanbooruPoolExtractor(booru.PoolMixin, DanbooruExtractor):
    """Extractor for the images of a danbooru pool"""
    pattern = BASE_PATTERN + r"/pools/(?P<pool>\d+)"
    test = (
        "https://danbooru.donmai.us/pools/7659",
        {"content": "b16bab12bea5f7ea9e0a836bf8045f280e113d99"},
    )
62 | 
63 | 
class DanbooruPostExtractor(booru.PostMixin, DanbooruExtractor):
    """Extractor for a single danbooru post"""
    pattern = BASE_PATTERN + r"/posts/(?P<post>\d+)"
    test = (
        (
            "https://danbooru.donmai.us/posts/294929",
            {"content": "5e255713cbf0a8e0801dc423563c34d896bb9229"},
        ),
        (
            "https://danbooru.donmai.us/posts/3613024",
            {
                "pattern": r"https?://.+\.webm$",
                "options": (("ugoira", False),),
            },
        ),
    )
76 | 
77 | 
class DanbooruPopularExtractor(booru.PopularMixin, DanbooruExtractor):
    """Extractor for danbooru's listing of popular posts"""
    pattern = BASE_PATTERN + r"/explore/posts/popular(?:\?(?P<query>[^#]*))?"
    test = (
        "https://danbooru.donmai.us/explore/posts/popular",
        (
            ("https://danbooru.donmai.us/explore/posts/popular"
             "?date=2013-06-06+03%3A34%3A22+-0400&scale=week"),
            {"count": ">= 1"},
        ),
    )

    def __init__(self, match):
        """Replace the api_url set by the base class with the popular one"""
        super().__init__(match)
        self.api_url = (
            "{scheme}://{subdomain}.donmai.us/explore/posts/popular.json"
        ).format(scheme=self.scheme, subdomain=self.subdomain)
94 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/vanillarock.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extractors for https://vanilla-rock.com/"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import text
13 | 
14 | 
class VanillarockExtractor(Extractor):
    """Shared base of the vanillarock extractors"""
    category = "vanillarock"
    root = "https://vanilla-rock.com"

    def __init__(self, match):
        """Keep the first group of the URL match as request path"""
        Extractor.__init__(self, match)
        first_group = match.group(1)
        self.path = first_group
23 | 
24 | 
class VanillarockPostExtractor(VanillarockExtractor):
    """Extractor for a single blogpost on vanilla-rock.com"""
    subcategory = "post"
    directory_fmt = ("{category}", "{path}")
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{filename}"
    pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com"
               r"(/(?!category/|tag/)[^/?&#]+)/?$")
    test = (
        "https://vanilla-rock.com/mizuhashi_parsee-5",
        {
            "url": "7fb9a4d18d9fa22d7295fee8d94ab5a7a52265dd",
            "keyword": "b91df99b714e1958d9636748b1c81a07c3ef52c9",
        },
    )

    def items(self):
        """Yield the images of a blogpost together with its metadata"""
        response = self.request(self.root + self.path)
        extr = text.extract_from(response.text)
        name = extr("<title>", "</title>")

        # collect the first href of every 'main-img' container
        imgs = []
        img = extr('<div class="main-img">', '</div>')
        while img:
            imgs.append(text.extract(img, 'href="', '"')[0])
            img = extr('<div class="main-img">', '</div>')

        # the extr() calls are sequential: 'date' is requested before 'tags',
        # and both only after the image blocks
        date = text.parse_datetime(
            extr('<div class="date">', '</div>'), "%Y-%m-%d %H:%M")
        tags = text.split_html(extr('<div class="cat-tag">', '</div>'))[::2]

        data = {
            "count": len(imgs),
            "title": text.unescape(name.rpartition(" | ")[0]),
            "path": self.path.strip("/"),
            "date": date,
            "tags": tags,
        }

        yield Message.Version, 1
        yield Message.Directory, data
        for num, url in enumerate(imgs, 1):
            data["num"] = num
            yield Message.Url, url, text.nameext_from_url(url, data)
63 | 
64 | 
class VanillarockTagExtractor(VanillarockExtractor):
    """Extractor for all blogposts of a vanillarock tag or category"""
    subcategory = "tag"
    pattern = (r"(?:https?://)?(?:www\.)?vanilla-rock\.com"
               r"(/(?:tag|category)/[^?&#]+)")
    test = (
        (
            ("https://vanilla-rock.com/tag/"
             "%e5%b0%84%e5%91%bd%e4%b8%b8%e6%96%87"),
            {
                "pattern": VanillarockPostExtractor.pattern,
                "count": ">= 12",
            },
        ),
        (
            ("https://vanilla-rock.com/category/%e4%ba%8c%e6%ac%a1%e3%82%a8"
             "%e3%83%ad%e7%94%bb%e5%83%8f/%e8%90%8c%e3%81%88%e3%83%bb%e3%82"
             "%bd%e3%83%95%e3%83%88%e3%82%a8%e3%83%ad"),
            {
                "pattern": VanillarockPostExtractor.pattern,
                "count": 3,
            },
        ),
    )

    def items(self):
        """Queue every post linked from the listing and its 'next' pages"""
        data = {"_extractor": VanillarockPostExtractor}
        url = self.root + self.path

        yield Message.Version, 1
        while url:
            extr = text.extract_from(self.request(url).text)

            post = extr('<h2 class="entry-title">', '</h2>')
            while post:
                yield Message.Queue, text.extract(post, 'href="', '"')[0], data
                post = extr('<h2 class="entry-title">', '</h2>')

            # an empty result for the 'next' link ends the outer loop
            url = text.unescape(extr('class="next page-numbers" href="', '"'))
96 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/myportfolio.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract images from https://www.myportfolio.com/"""
10 | 
11 | from .common import Extractor, Message
12 | from .. import text
13 | 
14 | 
class MyportfolioGalleryExtractor(Extractor):
    """Extractor for an image gallery on www.myportfolio.com"""
    category = "myportfolio"
    subcategory = "gallery"
    directory_fmt = ("{category}", "{user}", "{title}")
    filename_fmt = "{num:>02}.{extension}"
    archive_fmt = "{user}_{filename}"
    # group 1: custom domain given with a 'myportfolio:' prefix
    # group 2: regular '<name>.myportfolio.com' domain
    # group 3: optional path of a single gallery
    pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|"
               r"(?:https?://)?([^.]+\.myportfolio\.com))"
               r"(/[^/?&#]+)?")
    test = (
        ("https://hannahcosgrove.myportfolio.com/niamh-1", {
            "url": "8cbd73a73e5bf3b4f5d1b1d4a1eb114c01a72a66",
            "keyword": "7a460bb5641e648ae70702ff91c2fb11054b0e0b",
        }),
        ("https://hannahcosgrove.myportfolio.com/lfw", {
            "pattern": r"https://hannahcosgrove\.myportfolio\.com/[^/?&#+]+$",
            "count": ">= 8",
        }),
        ("myportfolio:https://tooco.com.ar/6-of-diamonds-paradise-bird", {
            "count": 3,
        }),
        ("myportfolio:https://tooco.com.ar/", {
            "count": ">= 40",
        }),
    )

    def __init__(self, match):
        """Store domain, path and URL prefix taken from the URL match"""
        Extractor.__init__(self, match)
        domain1, domain2, self.path = match.groups()
        self.domain = domain1 or domain2
        # queued URLs on a custom domain need the prefix to match again
        self.prefix = "myportfolio:" if domain1 else ""

    def items(self):
        """Yield either queued project URLs or the images of one gallery"""
        yield Message.Version, 1
        url = "https://" + self.domain + (self.path or "")
        page = self.request(url).text

        projects = text.extract(
            page, '<section class="project-covers', '</section>')[0]

        if projects:
            # overview page: delegate every linked project to this extractor
            data = {"_extractor": MyportfolioGalleryExtractor}
            base = self.prefix + "https://" + self.domain
            for path in text.extract_iter(projects, ' href="', '"'):
                yield Message.Queue, base + path, data
        else:
            data = self.metadata(page)
            imgs = self.images(page)
            data["count"] = len(imgs)
            yield Message.Directory, data
            for data["num"], url in enumerate(imgs, 1):
                yield Message.Url, url, text.nameext_from_url(url, data)

    @staticmethod
    def metadata(page):
        """Collect general image metadata

        Returns a dict with 'user', 'title' and 'description'. Values
        that cannot be found in 'page' are returned as empty strings.
        """
        # og:title contains data as "<user> - <title>", but both
        # <user> and <title> can contain a "-" as well, so we get the title
        # from somewhere else and cut that amount from the og:title content

        user, pos = text.extract(
            page, 'property=og:title content="', '"')
        desc, pos = text.extract(
            page, 'property=og:description content="', '"', pos)
        title, pos = text.extract(
            page, '<h1 ', '</h1>', pos)

        # text.extract() is expected to return None for missing markers
        user = user or ""
        if title is not None:
            title = title.partition(">")[2]
            # strip "<title>" and the 3-character " - " separator
            user = user[:-len(title)-3]
        else:
            # no <h1> heading: fall back to splitting og:title directly
            user, _, title = user.partition(" - ")

        return {
            "user": text.unescape(user),
            "title": text.unescape(title),
            "description": text.unescape(desc or ""),
        }

    @staticmethod
    def images(page):
        """Extract and return a list of all image-urls"""
        return list(text.extract_iter(page, 'js-lightbox" data-src="', '"'))
96 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/hentaicafe.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2018-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extractors for https://hentai.cafe/"""
10 | 
11 | from . import foolslide
12 | from .. import text
13 | from .common import Extractor
14 | from ..cache import memcache
15 | import re
16 | 
17 | 
class HentaicafeChapterExtractor(foolslide.FoolslideChapterExtractor):
    """Extractor for a single manga-chapter on hentai.cafe"""
    category = "hentaicafe"
    directory_fmt = ("{category}", "{manga}")
    pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe"
               r"(/manga/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
    test = (
        "https://hentai.cafe/manga/read/saitom-box/en/0/1/",
        {
            "url": "8c6a8c56875ba3ed7ab0a74a64f9960077767fc2",
            "keyword": "6913608267d883c82b887303b9ced13821188329",
        },
    )
    root = "https://hentai.cafe"

    def metadata(self, page):
        """Combine page title information with the cached manga data"""
        page_title = text.extract(page, '<title>', '</title>')[0]
        manga, _, chapter_string = text.unescape(page_title).partition(" :: ")

        # sixth "/"-separated component of the gallery URL is the cache key
        manga_key = self.gallery_url.split("/")[5]
        data = self._data(manga_key)
        data["manga"] = manga
        data["chapter_string"] = chapter_string.rstrip(" :")
        return self.parse_chapter_url(self.gallery_url, data)

    @memcache(keyarg=1)
    def _data(self, manga):
        """Return a dict with empty 'artist' and 'tags' lists"""
        return {"artist": [], "tags": []}
42 | 
43 | 
class HentaicafeMangaExtractor(foolslide.FoolslideMangaExtractor):
    """Extractor for all chapters of a manga on hentai.cafe"""
    category = "hentaicafe"
    pattern = (r"(?:https?://)?(?:www\.)?hentai\.cafe"
               r"(/hc\.fyi/\d+|(?:/manga/series)?/[^/?&#]+)/?$")
    test = (
        # single chapter
        ("https://hentai.cafe/hazuki-yuuto-summer-blues/", {
            "url": "f8e24a07d6fbb7c6a6ec5ad8ad8faf2436f8751b",
            "keyword": "eb9f98544098c961bd8cf5dbe69e6da51c4fb2f6",
        }),
        # multi-chapter
        ("https://hentai.cafe/saitom-saitom-box/", {
            "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
            "keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb",
        }),
        # new-style URL
        ("https://hentai.cafe/hc.fyi/2782", {
            "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
            "keyword": "28271062d7b4a2f99a0e1a894f69af8c5581a6bb",
        }),
        # foolslide URL
        ("https://hentai.cafe/manga/series/saitom-box/", {
            "url": "ca3e8a91531fd6acd863d93ac3afbd8ead06a076",
            "keyword": "f0ece32d958f889d8229ed4052716d398a0a875c",
        }),
    )
    root = "https://hentai.cafe"
    reverse = False
    request = Extractor.request
    chapterclass = HentaicafeChapterExtractor

    def chapters(self, page):
        """Return a list of (chapter-url, metadata) tuples for 'page'"""
        if "/manga/series/" in self.manga_url:
            # foolslide-style URL: use the parent's result in reverse order
            result = foolslide.FoolslideMangaExtractor.chapters(self, page)
            result.reverse()
            return result

        tags, pos = text.extract(page, "<p>Tags: ", "</br>")
        artist, pos = text.extract(page, "\nArtists: ", "</br>", pos)
        manga, pos = text.extract(page, "/manga/read/", "/", pos)

        data = {
            "tags": text.split_html(tags)[::2],
            "artist": text.split_html(artist),
        }
        # store tags and artist in the dict returned by the chapter
        # extractor's _data() for this manga
        HentaicafeChapterExtractor._data(manga).update(data)

        chapter_urls = re.findall(
            r'<a +class="x-btn[^"]*" +href="([^"]+)"', page)
        return [(chapter_url, data) for chapter_url in chapter_urls]
97 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/readcomiconline.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2016-2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extract comic-issues and entire comics from https://readcomiconline.to/"""
10 | 
11 | from .common import ChapterExtractor, MangaExtractor
12 | from .kissmanga import RedirectMixin
13 | from .. import text
14 | import re
15 | 
16 | 
class ReadcomiconlineBase(RedirectMixin):
    """Settings shared by the readcomiconline extractors"""
    root = "https://readcomiconline.to"
    category = "readcomiconline"
    # format definitions for directories, filenames and archive entries
    directory_fmt = ("{category}", "{comic}", "{issue:>03}")
    filename_fmt = "{comic}_{issue:>03}_{page:>03}.{extension}"
    archive_fmt = "{issue_id}_{page}"
24 | 
25 | 
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
    """Extractor for a single comic-issue on readcomiconline.to"""
    subcategory = "issue"
    pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
               r"(/Comic/[^/?&#]+/[^/?&#]+\?id=(\d+))")
    test = (
        "https://readcomiconline.to/Comic/W-i-t-c-h/Issue-130?id=22289",
        {
            "url": "2bbab6ec4fbc05d269cca420a82a9b5acda28682",
            "keyword": "30fe110273e871305001f33c18634516a0a51421",
        },
    )

    def __init__(self, match):
        """Remember the numeric 'id' query value of the issue URL"""
        ChapterExtractor.__init__(self, match)
        self.issue_id = match.group(2)

    def metadata(self, page):
        """Return comic name, issue and issue ID as metadata dict"""
        comic, pos = text.extract(page, "   - Read\r\n    ", "\r\n")
        iinfo, pos = text.extract(page, "    ", "\r\n", pos)

        # group 1: digits of "Issue #<n>" or "#<n>"; group 2: any other text
        number, other = re.match(
            r"(?:Issue )?#(\d+)|(.+)", iinfo).groups()

        return {
            "comic": comic,
            "issue": number or other,
            "issue_id": text.parse_int(self.issue_id),
            "lang": "en",
            "language": "English",
        }

    def images(self, page):
        """Return (url, None) tuples for all 'lstImages.push' entries"""
        results = []
        for url in text.extract_iter(page, 'lstImages.push("', '"'):
            results.append((url, None))
        return results
59 | 
60 | 
class ReadcomiconlineComicExtractor(ReadcomiconlineBase, MangaExtractor):
    """Extractor for all issues of a comic on readcomiconline.to"""
    chapterclass = ReadcomiconlineIssueExtractor
    subcategory = "comic"
    pattern = (r"(?i)(?:https?://)?(?:www\.)?readcomiconline\.to"
               r"(/Comic/[^/?&#]+/?)$")
    test = (
        (
            "https://readcomiconline.to/Comic/W-i-t-c-h",
            {
                "url": "e231bc2a293edb465133c37a8e36a7e7d94cab14",
                "keyword": "3986248e4458fa44a201ec073c3684917f48ee0c",
            },
        ),
        (
            "https://readcomiconline.to/Comic/Bazooka-Jules",
            {
                "url": "711674cb78ed10bd2557315f7a67552d01b33985",
                "keyword": "f5ba5246cd787bb750924d9690cb1549199bd516",
            },
        ),
    )

    def chapters(self, page):
        """Return (issue-url, metadata) tuples from the comic's listing"""
        bar_title, pos = text.extract(page, ' class="barTitle">', '<')
        listing, pos = text.extract(
            page, ' class="listing">', '</table>', pos)

        # the needle is built from the raw name; it gets unescaped afterwards
        raw_name = bar_title.rpartition("information")[0].strip()
        needle = ' title="Read {} '.format(raw_name)
        comic = text.unescape(raw_name)

        prefix = 'Issue #'
        results = []
        for item in text.extract_iter(listing, ' href="', ' comic online '):
            link, _, issue = item.partition(needle)
            path = link.rpartition('"')[0]
            if issue.startswith(prefix):
                issue = issue[len(prefix):]

            metadata = {
                "comic": comic,
                "issue": issue,
                "issue_id": text.parse_int(path.rpartition("=")[2]),
                "lang": "en",
                "language": "English",
            }
            results.append((self.root + path, metadata))
        return results
98 | 


--------------------------------------------------------------------------------
/test/test_oauth.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2018 Mike Fährmann
  5 | #
  6 | # This program is free software; you can redistribute it and/or modify
  7 | # it under the terms of the GNU General Public License version 2 as
  8 | # published by the Free Software Foundation.
  9 | 
 10 | import unittest
 11 | 
 12 | from gallery_dl import oauth, text
 13 | 
 14 | TESTSERVER = "http://term.ie/oauth/example"
 15 | CONSUMER_KEY = "key"
 16 | CONSUMER_SECRET = "secret"
 17 | REQUEST_TOKEN = "requestkey"
 18 | REQUEST_TOKEN_SECRET = "requestsecret"
 19 | ACCESS_TOKEN = "accesskey"
 20 | ACCESS_TOKEN_SECRET = "accesssecret"
 21 | 
 22 | 
 23 | class TestOAuthSession(unittest.TestCase):
 24 | 
 25 |     def test_concat(self):
 26 |         concat = oauth.concat
 27 | 
 28 |         self.assertEqual(concat(), "")
 29 |         self.assertEqual(concat("str"), "str")
 30 |         self.assertEqual(concat("str1", "str2"), "str1&str2")
 31 | 
 32 |         self.assertEqual(concat("&", "?/"), "%26&%3F%2F")
 33 |         self.assertEqual(
 34 |             concat("GET", "http://example.org/", "foo=bar&baz=a"),
 35 |             "GET&http%3A%2F%2Fexample.org%2F&foo%3Dbar%26baz%3Da"
 36 |         )
 37 | 
 38 |     def test_nonce(self, size=16):
 39 |         nonce_values = set(oauth.nonce(size) for _ in range(size))
 40 | 
 41 |         # uniqueness
 42 |         self.assertEqual(len(nonce_values), size)
 43 | 
 44 |         # length
 45 |         for nonce in nonce_values:
 46 |             self.assertEqual(len(nonce), size)
 47 | 
 48 |     def test_quote(self):
 49 |         quote = oauth.quote
 50 | 
 51 |         reserved = ",;:!\"§$%&/(){}[]=?`´+*'äöü"
 52 |         unreserved = ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 53 |                       "abcdefghijklmnopqrstuvwxyz"
 54 |                       "0123456789-._~")
 55 | 
 56 |         for char in unreserved:
 57 |             self.assertEqual(quote(char), char)
 58 | 
 59 |         for char in reserved:
 60 |             quoted = quote(char)
 61 |             quoted_hex = quoted.replace("%", "")
 62 |             self.assertTrue(quoted.startswith("%"))
 63 |             self.assertTrue(len(quoted) >= 3)
 64 |             self.assertEqual(quoted_hex.upper(), quoted_hex)
 65 | 
 66 |     def test_request_token(self):
 67 |         response = self._oauth_request(
 68 |             "/request_token.php", {})
 69 |         expected = "oauth_token=requestkey&oauth_token_secret=requestsecret"
 70 |         self.assertEqual(response, expected, msg=response)
 71 | 
 72 |         data = text.parse_query(response)
 73 |         self.assertTrue(data["oauth_token"], REQUEST_TOKEN)
 74 |         self.assertTrue(data["oauth_token_secret"], REQUEST_TOKEN_SECRET)
 75 | 
 76 |     def test_access_token(self):
 77 |         response = self._oauth_request(
 78 |             "/access_token.php", {}, REQUEST_TOKEN, REQUEST_TOKEN_SECRET)
 79 |         expected = "oauth_token=accesskey&oauth_token_secret=accesssecret"
 80 |         self.assertEqual(response, expected, msg=response)
 81 | 
 82 |         data = text.parse_query(response)
 83 |         self.assertTrue(data["oauth_token"], ACCESS_TOKEN)
 84 |         self.assertTrue(data["oauth_token_secret"], ACCESS_TOKEN_SECRET)
 85 | 
 86 |     def test_authenticated_call(self):
 87 |         params = {"method": "foo", "a": "äöüß/?&#", "äöüß/?&#": "a"}
 88 |         response = self._oauth_request(
 89 |             "/echo_api.php", params, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
 90 | 
 91 |         self.assertEqual(text.parse_query(response), params)
 92 | 
 93 |     def _oauth_request(self, endpoint, params=None,
 94 |                        oauth_token=None, oauth_token_secret=None):
 95 |         session = oauth.OAuth1Session(
 96 |             CONSUMER_KEY, CONSUMER_SECRET,
 97 |             oauth_token, oauth_token_secret,
 98 |         )
 99 |         url = TESTSERVER + endpoint
100 |         return session.get(url, params=params).text
101 | 
102 | 
# run the tests of this module when it is executed as a script
if __name__ == "__main__":
    unittest.main(warnings="ignore")
105 | 


--------------------------------------------------------------------------------
/gallery_dl/downloader/ytdl.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2018 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Downloader module for URLs requiring youtube-dl support"""
 10 | 
 11 | from youtube_dl import YoutubeDL, DEFAULT_OUTTMPL
 12 | from .common import DownloaderBase
 13 | from .. import text
 14 | import os
 15 | 
 16 | 
 17 | class YoutubeDLDownloader(DownloaderBase):
 18 |     scheme = "ytdl"
 19 | 
 20 |     def __init__(self, extractor, output):
 21 |         DownloaderBase.__init__(self, extractor, output)
 22 | 
 23 |         retries = self.config("retries", extractor._retries)
 24 |         options = {
 25 |             "format": self.config("format") or None,
 26 |             "ratelimit": text.parse_bytes(self.config("rate"), None),
 27 |             "retries": retries+1 if retries >= 0 else float("inf"),
 28 |             "socket_timeout": self.config("timeout", extractor._timeout),
 29 |             "nocheckcertificate": not self.config("verify", extractor._verify),
 30 |             "nopart": not self.part,
 31 |             "updatetime": self.config("mtime", True),
 32 |         }
 33 |         options.update(self.config("raw-options") or {})
 34 | 
 35 |         if self.config("logging", True):
 36 |             options["logger"] = self.log
 37 |         self.forward_cookies = self.config("forward-cookies", True)
 38 | 
 39 |         outtmpl = self.config("outtmpl")
 40 |         self.outtmpl = DEFAULT_OUTTMPL if outtmpl == "default" else outtmpl
 41 | 
 42 |         self.ytdl = YoutubeDL(options)
 43 | 
 44 |     def download(self, url, pathfmt):
 45 |         if self.forward_cookies:
 46 |             set_cookie = self.ytdl.cookiejar.set_cookie
 47 |             for cookie in self.session.cookies:
 48 |                 set_cookie(cookie)
 49 | 
 50 |         try:
 51 |             info_dict = self.ytdl.extract_info(url[5:], download=False)
 52 |         except Exception:
 53 |             return False
 54 | 
 55 |         if "entries" in info_dict:
 56 |             index = pathfmt.kwdict.get("_ytdl_index")
 57 |             if index is None:
 58 |                 return self._download_playlist(pathfmt, info_dict)
 59 |             else:
 60 |                 info_dict = info_dict["entries"][index]
 61 |         return self._download_video(pathfmt, info_dict)
 62 | 
 63 |     def _download_video(self, pathfmt, info_dict):
 64 |         if "url" in info_dict:
 65 |             text.nameext_from_url(info_dict["url"], pathfmt.kwdict)
 66 | 
 67 |         if self.outtmpl:
 68 |             self.ytdl.params["outtmpl"] = self.outtmpl
 69 |             pathfmt.filename = filename = self.ytdl.prepare_filename(info_dict)
 70 |             pathfmt.extension = info_dict["ext"]
 71 |             pathfmt.path = pathfmt.directory + filename
 72 |             pathfmt.realpath = pathfmt.temppath = (
 73 |                 pathfmt.realdirectory + filename)
 74 |         else:
 75 |             pathfmt.set_extension(info_dict["ext"])
 76 | 
 77 |         if pathfmt.exists():
 78 |             pathfmt.temppath = ""
 79 |             return True
 80 |         if self.part and self.partdir:
 81 |             pathfmt.temppath = os.path.join(
 82 |                 self.partdir, pathfmt.filename)
 83 |         self.ytdl.params["outtmpl"] = pathfmt.temppath.replace("%", "%%")
 84 | 
 85 |         self.out.start(pathfmt.path)
 86 |         try:
 87 |             self.ytdl.process_info(info_dict)
 88 |         except Exception:
 89 |             self.log.debug("Traceback", exc_info=True)
 90 |             return False
 91 |         return True
 92 | 
 93 |     def _download_playlist(self, pathfmt, info_dict):
 94 |         pathfmt.set_extension("%(playlist_index)s.%(ext)s")
 95 |         self.ytdl.params["outtmpl"] = pathfmt.realpath
 96 | 
 97 |         for entry in info_dict["entries"]:
 98 |             self.ytdl.process_info(entry)
 99 |         return True
100 | 
101 | 
102 | __downloader__ = YoutubeDLDownloader
103 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/hentainexus.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Copyright 2019 Mike Fährmann
 4 | #
 5 | # This program is free software; you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License version 2 as
 7 | # published by the Free Software Foundation.
 8 | 
 9 | """Extractors for https://hentainexus.com/"""
10 | 
11 | from .common import GalleryExtractor, Extractor, Message
12 | from .. import text, util
13 | import json
14 | 
15 | 
class HentainexusGalleryExtractor(GalleryExtractor):
    """Extractor for a single image gallery on hentainexus.com"""
    category = "hentainexus"
    root = "https://hentainexus.com"
    pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
               r"/(?:view|read)/(\d+)")
    test = (
        (
            "https://hentainexus.com/view/5688",
            {
                "url": "746d0043e20030f1171aae5ea113176607302517",
                "keyword": "9512cf5f258130e5f75de9954d7a13217c2405e7",
            },
        ),
        "https://hentainexus.com/read/5688",
    )

    def __init__(self, match):
        """Build the '/view/<id>' URL and pass it to GalleryExtractor"""
        self.gallery_id = match.group(1)
        view_url = "{}/view/{}".format(self.root, self.gallery_id)
        GalleryExtractor.__init__(self, match, view_url)

    def metadata(self, page):
        """Return the gallery metadata found on the '/view/' page"""
        extr = text.extract_from(page)

        # extr() calls are sequential; keep them in this exact order
        data = {
            "gallery_id": text.parse_int(self.gallery_id),
            "tags": extr('"og:description" content="', '"').split(", "),
            "thumbnail": extr('"og:image" content="', '"'),
            "title": extr('<h1 class="title">', '</h1>'),
        }
        # one table row per field; the key is the lower-cased column name
        for column in ("Artist", "Book", "Language", "Magazine",
                       "Parody", "Publisher", "Description"):
            cell = extr('viewcolumn">' + column + '</td>', '</td>')
            data[column.lower()] = text.remove_html(cell)

        data["lang"] = util.language_to_code(data["language"])
        return data

    def images(self, page):
        """Return (url, None) tuples from the list given to initReader()"""
        reader_url = "{}/read/{}".format(self.root, self.gallery_id)
        extr = text.extract_from(self.request(reader_url).text)
        # the end marker "]" is not part of the result; re-append it
        url_list = extr("initReader(", "]") + "]"
        return [(image_url, None) for image_url in json.loads(url_list)]
59 | 
60 | 
class HentainexusSearchExtractor(Extractor):
    """Extractor for search results on hentainexus.com"""
    category = "hentainexus"
    subcategory = "search"
    root = "https://hentainexus.com"
    # group 1 is the optional 'q=...' part of the query string
    pattern = (r"(?i)(?:https?://)?(?:www\.)?hentainexus\.com"
               r"(?:/page/\d+)?/?(?:\?(q=[^/?#]+))?$")
    test = (
        ("https://hentainexus.com/?q=tag:%22heart+pupils%22%20tag:group", {
            "pattern": HentainexusGalleryExtractor.pattern,
            "count": ">= 50",
        }),
        ("https://hentainexus.com/page/3?q=tag:%22heart+pupils%22"),
    )

    def __init__(self, match):
        """Parse the captured query string into request parameters"""
        Extractor.__init__(self, match)
        self.params = text.parse_query(match.group(1))

    def items(self):
        """Yield a Queue message for every gallery on all result pages"""
        yield Message.Version, 1
        query = self.params
        next_path = "/"

        # requests always start at the site root; afterwards the href of
        # the "pagination-next" link is followed until it comes back empty
        while next_path:
            response = self.request(self.root + next_path, params=query)
            extr = text.extract_from(response.text)
            data = {"_extractor": HentainexusGalleryExtractor}

            gallery_id = extr('<a href="/view/', '"')
            while gallery_id:
                yield Message.Queue, self.root + "/view/" + gallery_id, data
                gallery_id = extr('<a href="/view/', '"')

            next_path = extr('class="pagination-next" href="', '"')
97 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/fuskator.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extractors for https://fuskator.com/"""
 10 | 
 11 | from .common import GalleryExtractor, Extractor, Message
 12 | from .. import text
 13 | import time
 14 | 
 15 | 
 16 | class FuskatorGalleryExtractor(GalleryExtractor):
 17 |     """Extractor for image galleries on fuskator.com"""
 18 |     category = "fuskator"
 19 |     root = "https://fuskator.com"
 20 |     pattern = r"(?:https?://)?fuskator\.com/(?:thumbs|expanded)/([^/?&#]+)"
 21 |     test = (
 22 |         ("https://fuskator.com/thumbs/d0GnIzXrSKU/", {
 23 |             "pattern": r"https://i\d+.fuskator.com/large/d0GnIzXrSKU/.+\.jpg",
 24 |             "count": 22,
 25 |             "keyword": {
 26 |                 "gallery_id": 473023,
 27 |                 "gallery_hash": "d0GnIzXrSKU",
 28 |                 "title": "re:Shaved Brunette Babe Maria Ryabushkina with ",
 29 |                 "views": int,
 30 |                 "score": float,
 31 |                 "count": 22,
 32 |                 "tags": list,
 33 |             },
 34 |         }),
 35 |         ("https://fuskator.com/expanded/gXpKzjgIidA/index.html"),
 36 |     )
 37 | 
 38 |     def __init__(self, match):
 39 |         self.gallery_hash = match.group(1)
 40 |         url = "{}/thumbs/{}/".format(self.root, self.gallery_hash)
 41 |         GalleryExtractor.__init__(self, match, url)
 42 | 
 43 |     def metadata(self, page):
 44 |         headers = {
 45 |             "Referer"         : self.gallery_url,
 46 |             "X-Requested-With": "XMLHttpRequest",
 47 |         }
 48 |         auth = self.request(
 49 |             self.root + "/ajax/auth.aspx", method="POST", headers=headers,
 50 |         ).text
 51 | 
 52 |         params = {
 53 |             "X-Auth": auth,
 54 |             "hash"  : self.gallery_hash,
 55 |             "_"     : int(time.time()),
 56 |         }
 57 |         self.data = data = self.request(
 58 |             self.root + "/ajax/gal.aspx", params=params, headers=headers,
 59 |         ).json()
 60 | 
 61 |         title = text.extract(page, "<title>", "</title>")[0].strip()
 62 |         title, _, gallery_id = title.rpartition("#")
 63 | 
 64 |         return {
 65 |             "gallery_id"  : text.parse_int(gallery_id),
 66 |             "gallery_hash": self.gallery_hash,
 67 |             "title"       : text.unescape(title[:-15]),
 68 |             "views"       : data["hits"],
 69 |             "score"       : data["rating"],
 70 |             "tags"        : data["tags"].split(","),
 71 |             "count"       : len(data["images"]),
 72 |         }
 73 | 
 74 |     def images(self, page):
 75 |         for image in self.data["images"]:
 76 |             yield "https:" + image["imageUrl"], image
 77 | 
 78 | 
 79 | class FuskatorSearchExtractor(Extractor):
 80 |     """Extractor for search results on fuskator.com"""
 81 |     category = "fuskator"
 82 |     subcategory = "search"
 83 |     root = "https://fuskator.com"
 84 |     pattern = r"(?:https?://)?fuskator\.com(/(?:search|page)/.+)"
 85 |     test = (
 86 |         ("https://fuskator.com/search/red_swimsuit/", {
 87 |             "pattern": FuskatorGalleryExtractor.pattern,
 88 |             "count": ">= 40",
 89 |         }),
 90 |         ("https://fuskator.com/page/3/swimsuit/quality/"),
 91 |     )
 92 | 
 93 |     def __init__(self, match):
 94 |         Extractor.__init__(self, match)
 95 |         self.path = match.group(1)
 96 | 
 97 |     def items(self):
 98 |         url = self.root + self.path
 99 |         data = {"_extractor": FuskatorGalleryExtractor}
100 | 
101 |         while True:
102 |             page = self.request(url).text
103 |             for path in text.extract_iter(
104 |                     page, 'class="pic_pad"><a href="', '"'):
105 |                 yield Message.Queue, self.root + path, data
106 | 
107 |             pages = text.extract(page, 'class="pages"><span>', '>&gt;&gt;<')[0]
108 |             if not pages:
109 |                 return
110 |             url = self.root + text.rextract(pages, 'href="', '"')[0]
111 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/hentai2read.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2016-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extract hentai-manga from https://hentai2read.com/"""
 10 | 
 11 | from .common import ChapterExtractor, MangaExtractor
 12 | from .. import text
 13 | import json
 14 | import re
 15 | 
 16 | 
 17 | class Hentai2readBase():
 18 |     """Base class for hentai2read extractors"""
 19 |     category = "hentai2read"
 20 |     root = "https://hentai2read.com"
 21 | 
 22 | 
 23 | class Hentai2readChapterExtractor(Hentai2readBase, ChapterExtractor):
 24 |     """Extractor for a single manga chapter from hentai2read.com"""
 25 |     archive_fmt = "{chapter_id}_{page}"
 26 |     pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+/(\d+))"
 27 |     test = ("https://hentai2read.com/amazon_elixir/1/", {
 28 |         "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
 29 |         "keyword": "ff84b8f751f0e4ee37717efc4332ff1db71951d9",
 30 |     })
 31 | 
 32 |     def __init__(self, match):
 33 |         self.chapter = match.group(2)
 34 |         ChapterExtractor.__init__(self, match)
 35 | 
 36 |     def metadata(self, page):
 37 |         title, pos = text.extract(page, "<title>", "</title>")
 38 |         manga_id, pos = text.extract(page, 'data-mid="', '"', pos)
 39 |         chapter_id, pos = text.extract(page, 'data-cid="', '"', pos)
 40 |         match = re.match(r"Reading (.+) \(([^)]+)\) Hentai(?: by (.+))? - "
 41 |                          r"(\d+): (.+) . Page 1 ", title)
 42 |         return {
 43 |             "manga": match.group(1),
 44 |             "manga_id": text.parse_int(manga_id),
 45 |             "chapter": text.parse_int(self.chapter),
 46 |             "chapter_id": text.parse_int(chapter_id),
 47 |             "type": match.group(2),
 48 |             "author": match.group(3),
 49 |             "title": match.group(5),
 50 |             "lang": "en",
 51 |             "language": "English",
 52 |         }
 53 | 
 54 |     @staticmethod
 55 |     def images(page):
 56 |         images = text.extract(page, "'images' : ", ",\n")[0]
 57 |         return [
 58 |             ("https://hentaicdn.com/hentai" + part, None)
 59 |             for part in json.loads(images)
 60 |         ]
 61 | 
 62 | 
 63 | class Hentai2readMangaExtractor(Hentai2readBase, MangaExtractor):
 64 |     """Extractor for hmanga from hentai2read.com"""
 65 |     chapterclass = Hentai2readChapterExtractor
 66 |     pattern = r"(?:https?://)?(?:www\.)?hentai2read\.com(/[^/?&#]+)/?$"
 67 |     test = (
 68 |         ("https://hentai2read.com/amazon_elixir/", {
 69 |             "url": "273073752d418ec887d7f7211e42b832e8c403ba",
 70 |             "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac",
 71 |         }),
 72 |         ("https://hentai2read.com/oshikage_riot/", {
 73 |             "url": "6595f920a3088a15c2819c502862d45f8eb6bea6",
 74 |             "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36",
 75 |         }),
 76 |     )
 77 | 
 78 |     def chapters(self, page):
 79 |         results = []
 80 |         manga, pos = text.extract(
 81 |             page, '<span itemprop="name">', '</span>')
 82 |         mtype, pos = text.extract(
 83 |             page, '<small class="text-danger">[', ']</small>', pos)
 84 |         manga_id = text.parse_int(text.extract(
 85 |             page, 'data-mid="', '"', pos)[0])
 86 | 
 87 |         while True:
 88 |             chapter_id, pos = text.extract(page, ' data-cid="', '"', pos)
 89 |             if not chapter_id:
 90 |                 return results
 91 |             _  , pos = text.extract(page, ' href="', '"', pos)
 92 |             url, pos = text.extract(page, ' href="', '"', pos)
 93 |             chapter, pos = text.extract(page, '>', '<', pos)
 94 | 
 95 |             chapter, _, title = text.unescape(chapter).strip().partition(" - ")
 96 |             results.append((url, {
 97 |                 "manga_id": manga_id, "manga": manga, "type": mtype,
 98 |                 "chapter_id": text.parse_int(chapter_id),
 99 |                 "chapter": text.parse_int(chapter),
100 |                 "title": title, "lang": "en", "language": "English",
101 |             }))
102 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/pururin.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extractors for https://pururin.io/"""
 10 | 
 11 | from .common import GalleryExtractor
 12 | from .. import text, util
 13 | import json
 14 | 
 15 | 
 16 | class PururinGalleryExtractor(GalleryExtractor):
 17 |     """Extractor for image galleries on pururin.io"""
 18 |     category = "pururin"
 19 |     pattern = r"(?:https?://)?(?:www\.)?pururin\.io/(?:gallery|read)/(\d+)"
 20 |     test = (
 21 |         ("https://pururin.io/gallery/38661/iowant-2", {
 22 |             "pattern": r"https://cdn.pururin.io/\w+/images/data/\d+/\d+\.jpg",
 23 |             "keyword": {
 24 |                 "title"     : "Iowant 2!!",
 25 |                 "title_en"  : "Iowant 2!!",
 26 |                 "title_jp"  : "",
 27 |                 "gallery_id": 38661,
 28 |                 "count"     : 19,
 29 |                 "artist"    : ["Shoda Norihiro"],
 30 |                 "group"     : ["Obsidian Order"],
 31 |                 "parody"    : ["Kantai Collection"],
 32 |                 "characters": ["Admiral", "Iowa"],
 33 |                 "tags"      : list,
 34 |                 "type"      : "Doujinshi",
 35 |                 "collection": "",
 36 |                 "convention": "C92",
 37 |                 "rating"    : float,
 38 |                 "uploader"  : "demo",
 39 |                 "scanlator" : "",
 40 |                 "lang"      : "en",
 41 |                 "language"  : "English",
 42 |             }
 43 |         }),
 44 |         ("https://pururin.io/gallery/7661/unisis-team-vanilla", {
 45 |             "count": 17,
 46 |         }),
 47 |     )
 48 |     root = "https://pururin.io"
 49 | 
 50 |     def __init__(self, match):
 51 |         self.gallery_id = match.group(1)
 52 |         url = "{}/gallery/{}/x".format(self.root, self.gallery_id)
 53 |         GalleryExtractor.__init__(self, match, url)
 54 | 
 55 |         self._ext = ""
 56 |         self._cnt = 0
 57 | 
 58 |     def metadata(self, page):
 59 |         extr = text.extract_from(page)
 60 | 
 61 |         def _lst(key, e=extr):
 62 |             return [
 63 |                 text.unescape(item)
 64 |                 for item in text.extract_iter(e(key, "</td>"), 'title="', '"')
 65 |             ]
 66 | 
 67 |         def _str(key, e=extr):
 68 |             return text.unescape(text.extract(
 69 |                 e(key, "</td>"), 'title="', '"')[0] or "")
 70 | 
 71 |         url = "{}/read/{}/01/x".format(self.root, self.gallery_id)
 72 |         page = self.request(url).text
 73 |         info = json.loads(text.unescape(text.extract(
 74 |             page, ':gallery="', '"')[0]))
 75 |         self._ext = info["image_extension"]
 76 |         self._cnt = info["total_pages"]
 77 | 
 78 |         data = {
 79 |             "gallery_id": text.parse_int(self.gallery_id),
 80 |             "title"     : info["title"] or info.get("j_title") or "",
 81 |             "title_en"  : info["title"],
 82 |             "title_jp"  : info.get("j_title") or "",
 83 |             "artist"    : _lst("<td>Artist</td>"),
 84 |             "group"     : _lst("<td>Circle</td>"),
 85 |             "parody"    : _lst("<td>Parody</td>"),
 86 |             "tags"      : _lst("<td>Contents</td>"),
 87 |             "type"      : _str("<td>Category</td>"),
 88 |             "characters": _lst("<td>Character</td>"),
 89 |             "collection": _str("<td>Collection</td>"),
 90 |             "language"  : _str("<td>Language</td>"),
 91 |             "scanlator" : _str("<td>Scanlator</td>"),
 92 |             "convention": _str("<td>Convention</td>"),
 93 |             "uploader"  : text.remove_html(extr("<td>Uploader</td>", "</td>")),
 94 |             "rating"    : text.parse_float(extr(" :rating='"       , "'")),
 95 |         }
 96 |         data["lang"] = util.language_to_code(data["language"])
 97 |         return data
 98 | 
 99 |     def images(self, _):
100 |         ufmt = "https://cdn.pururin.io/assets/images/data/{}/{{}}.{}".format(
101 |             self.gallery_id, self._ext)
102 |         return [(ufmt.format(num), None) for num in range(1, self._cnt + 1)]
103 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/hbrowse.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2015-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extract images from https://www.hbrowse.com/"""
 10 | 
 11 | from .common import ChapterExtractor, MangaExtractor
 12 | from .. import text, exception
 13 | import json
 14 | 
 15 | 
 16 | class HbrowseBase():
 17 |     """Base class for hbrowse extractors"""
 18 |     category = "hbrowse"
 19 |     root = "https://www.hbrowse.com"
 20 | 
 21 |     def parse_page(self, page, data):
 22 |         """Parse metadata on 'page' and add it to 'data'"""
 23 |         data, pos = text.extract_all(page, (
 24 |             ('manga' , '<td class="listLong">', '</td>'),
 25 |             ('artist', '<td class="listLong">', '</td>'),
 26 |             ('total' , '<td class="listLong">', ' '),
 27 |             ('origin', '<td class="listLong">', '</td>'),
 28 |         ), values=data)
 29 | 
 30 |         if not data["manga"] and "<b>Warning</b>" in page:
 31 |             msg = page.rpartition(">")[2].strip()
 32 |             self.log.error("Site is not accessible: '%s'", msg)
 33 |             raise exception.StopExtraction()
 34 | 
 35 |         tags = text.extract(page, 'class="listTable"', '</table>', pos)[0]
 36 | 
 37 |         data["manga"] = text.unescape(data["manga"])
 38 |         data["total"] = text.parse_int(data["total"])
 39 |         data["artist"] = text.remove_html(data["artist"])
 40 |         data["origin"] = text.remove_html(data["origin"])
 41 |         data["tags"] = list(text.extract_iter(tags, 'href="/browse/', '"'))
 42 |         return data
 43 | 
 44 | 
 45 | class HbrowseChapterExtractor(HbrowseBase, ChapterExtractor):
 46 |     """Extractor for manga-chapters from hbrowse.com"""
 47 |     directory_fmt = ("{category}", "{manga_id} {manga}", "c{chapter:>05}")
 48 |     filename_fmt = ("{category}_{manga_id}_{chapter:>05}_"
 49 |                     "{page:>03}.{extension}")
 50 |     archive_fmt = "{manga_id}_{chapter}_{page}"
 51 |     pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/(\d+)/c(\d+))"
 52 |     test = ("https://www.hbrowse.com/10363/c00000", {
 53 |         "url": "6feefbc9f4b98e20d8425ddffa9dd111791dc3e6",
 54 |         "keyword": "274996f6c809e5250b6ff3abbc5147e29f89d9a5",
 55 |         "content": "44578ebbe176c2c27434966aef22945787e2781e",
 56 |     })
 57 | 
 58 |     def __init__(self, match):
 59 |         self.path, self.gid, self.chapter = match.groups()
 60 |         self.path += "/"
 61 |         ChapterExtractor.__init__(self, match)
 62 | 
 63 |     def metadata(self, page):
 64 |         return self.parse_page(page, {
 65 |             "manga_id": text.parse_int(self.gid),
 66 |             "chapter": text.parse_int(self.chapter)
 67 |         })
 68 | 
 69 |     def images(self, page):
 70 |         base = self.root + "/data" + self.path
 71 |         json_data = text.extract(page, ';list = ', ',"zzz"')[0] + "]"
 72 |         return [(base + name, None) for name in json.loads(json_data)]
 73 | 
 74 | 
 75 | class HbrowseMangaExtractor(HbrowseBase, MangaExtractor):
 76 |     """Extractor for manga from hbrowse.com"""
 77 |     chapterclass = HbrowseChapterExtractor
 78 |     reverse = False
 79 |     pattern = r"(?:https?://)?(?:www\.)?hbrowse\.com(/\d+)/?$"
 80 |     test = ("https://www.hbrowse.com/10363", {
 81 |         "url": "b89682bfb86c11d2af0dc47463804ec3ac4aadd6",
 82 |         "keyword": "4b15fda1858a69de1fbf5afddfe47dd893397312",
 83 |     })
 84 | 
 85 |     def chapters(self, page):
 86 |         results = []
 87 |         data = self.parse_page(page, {
 88 |             "manga_id": text.parse_int(
 89 |                 self.manga_url.rstrip("/").rpartition("/")[2])
 90 |         })
 91 | 
 92 |         pos = 0
 93 |         needle = '<td class="listMiddle">\n<a class="listLink" href="'
 94 |         while True:
 95 |             url, pos = text.extract(page, needle, '"', pos)
 96 |             if not url:
 97 |                 return results
 98 |             title, pos = text.extract(page, '>View ', '<', pos)
 99 |             data["chapter"] = text.parse_int(url.rpartition("/")[2][1:])
100 |             data["title"] = title
101 |             results.append((text.urljoin(self.root, url), data.copy()))
102 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/hentaihere.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2016-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extract hentai-manga from https://hentaihere.com/"""
 10 | 
 11 | from .common import ChapterExtractor, MangaExtractor
 12 | from .. import text
 13 | import json
 14 | import re
 15 | 
 16 | 
 17 | class HentaihereBase():
 18 |     """Base class for hentaihere extractors"""
 19 |     category = "hentaihere"
 20 |     root = "https://hentaihere.com"
 21 | 
 22 | 
 23 | class HentaihereChapterExtractor(HentaihereBase, ChapterExtractor):
 24 |     """Extractor for a single manga chapter from hentaihere.com"""
 25 |     archive_fmt = "{chapter_id}_{page}"
 26 |     pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com/m/S(\d+)/(\d+)"
 27 |     test = ("https://hentaihere.com/m/S13812/1/1/", {
 28 |         "url": "964b942cf492b3a129d2fe2608abfc475bc99e71",
 29 |         "keyword": "cbcee0c0eb178c4b87f06a834085784f8dddad24",
 30 |     })
 31 | 
 32 |     def __init__(self, match):
 33 |         self.manga_id, self.chapter = match.groups()
 34 |         url = "{}/m/S{}/{}/1".format(self.root, self.manga_id, self.chapter)
 35 |         ChapterExtractor.__init__(self, match, url)
 36 | 
 37 |     def metadata(self, page):
 38 |         title = text.extract(page, "<title>", "</title>")[0]
 39 |         chapter_id = text.extract(page, 'report/C', '"')[0]
 40 |         pattern = r"Page 1 \| (.+) \(([^)]+)\) - Chapter \d+: (.+) by (.+) at "
 41 |         match = re.match(pattern, title)
 42 |         return {
 43 |             "manga": match.group(1),
 44 |             "manga_id": text.parse_int(self.manga_id),
 45 |             "chapter": text.parse_int(self.chapter),
 46 |             "chapter_id": text.parse_int(chapter_id),
 47 |             "type": match.group(2),
 48 |             "title": match.group(3),
 49 |             "author": match.group(4),
 50 |             "lang": "en",
 51 |             "language": "English",
 52 |         }
 53 | 
 54 |     @staticmethod
 55 |     def images(page):
 56 |         images = text.extract(page, "var rff_imageList = ", ";")[0]
 57 |         return [
 58 |             ("https://hentaicdn.com/hentai" + part, None)
 59 |             for part in json.loads(images)
 60 |         ]
 61 | 
 62 | 
 63 | class HentaihereMangaExtractor(HentaihereBase, MangaExtractor):
 64 |     """Extractor for hmanga from hentaihere.com"""
 65 |     chapterclass = HentaihereChapterExtractor
 66 |     pattern = r"(?:https?://)?(?:www\.)?hentaihere\.com(/m/S\d+)/?$"
 67 |     test = (
 68 |         ("https://hentaihere.com/m/S13812", {
 69 |             "url": "d1ba6e28bb2162e844f8559c2b2725ba0a093559",
 70 |             "keyword": "13c1ce7e15cbb941f01c843b0e89adc993d939ac",
 71 |         }),
 72 |         ("https://hentaihere.com/m/S7608", {
 73 |             "url": "6c5239758dc93f6b1b4175922836c10391b174f7",
 74 |             "keyword": "675c7b7a4fa52cf569c283553bd16b4200a5cd36",
 75 |         }),
 76 |     )
 77 | 
 78 |     def chapters(self, page):
 79 |         results = []
 80 |         manga_id = text.parse_int(
 81 |             self.manga_url.rstrip("/").rpartition("/")[2][1:])
 82 |         manga, pos = text.extract(
 83 |             page, '<span itemprop="name">', '</span>')
 84 |         mtype, pos = text.extract(
 85 |             page, '<span class="mngType text-danger">[', ']</span>', pos)
 86 | 
 87 |         while True:
 88 |             marker, pos = text.extract(
 89 |                 page, '<li class="sub-chp clearfix">', '', pos)
 90 |             if marker is None:
 91 |                 return results
 92 |             url, pos = text.extract(page, '<a href="', '"', pos)
 93 |             chapter, pos = text.extract(page, 'title="Tagged: -">\n', '<', pos)
 94 |             chapter_id, pos = text.extract(page, '/C', '"', pos)
 95 |             chapter, _, title = text.unescape(chapter).strip().partition(" - ")
 96 |             results.append((url, {
 97 |                 "manga_id": manga_id, "manga": manga, "type": mtype,
 98 |                 "chapter_id": text.parse_int(chapter_id),
 99 |                 "chapter": text.parse_int(chapter),
100 |                 "title": title, "lang": "en", "language": "English",
101 |             }))
102 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/fallenangels.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2017-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extract manga-chapters from https://www.fascans.com/"""
 10 | 
 11 | from .common import ChapterExtractor, MangaExtractor
 12 | from .. import text, util
 13 | import json
 14 | 
 15 | 
 16 | class FallenangelsChapterExtractor(ChapterExtractor):
 17 |     """Extractor for manga-chapters from fascans.com"""
 18 |     category = "fallenangels"
 19 |     pattern = (r"(?:https?://)?(manga|truyen)\.fascans\.com"
 20 |                r"/manga/([^/]+)/(\d+)(\.[^/?&#]+)?")
 21 |     test = (
 22 |         ("https://manga.fascans.com/manga/chronos-ruler/20/1", {
 23 |             "url": "4604a7914566cc2da0ff789aa178e2d1c8c241e3",
 24 |             "keyword": "2dfcc50020e32cd207be88e2a8fac0933e36bdfb",
 25 |         }),
 26 |         ("http://truyen.fascans.com/manga/hungry-marie/8", {
 27 |             "url": "1f923d9cb337d5e7bbf4323719881794a951c6ae",
 28 |             "keyword": "2bdb7334c0e3eceb9946ffd3132df679b4a94f6a",
 29 |         }),
 30 |         ("http://manga.fascans.com/manga/rakudai-kishi-no-eiyuutan/19.5", {
 31 |             "keyword": "9fcca4c1a90d11f00764f62477ebe10bd408021c",
 32 |         }),
 33 |     )
 34 | 
 35 |     def __init__(self, match):
 36 |         self.version, self.manga, self.chapter, self.minor = match.groups()
 37 |         url = "https://{}.fascans.com/manga/{}/{}/1".format(
 38 |             self.version, self.manga, self.chapter)
 39 |         ChapterExtractor.__init__(self, match, url)
 40 | 
 41 |     def metadata(self, page):
 42 |         extr = text.extract_from(page)
 43 |         lang = "vi" if self.version == "truyen" else "en"
 44 |         return {
 45 |             "manga"   : extr('name="description" content="', ' Chapter '),
 46 |             "title"   : extr(':  ', ' - Page 1'),
 47 |             "chapter" : self.chapter,
 48 |             "chapter_minor": self.minor or "",
 49 |             "lang"    : lang,
 50 |             "language": util.code_to_language(lang),
 51 |         }
 52 | 
 53 |     @staticmethod
 54 |     def images(page):
 55 |         return [
 56 |             (img["page_image"], None)
 57 |             for img in json.loads(
 58 |                 text.extract(page, "var pages = ", ";")[0]
 59 |             )
 60 |         ]
 61 | 
 62 | 
 63 | class FallenangelsMangaExtractor(MangaExtractor):
 64 |     """Extractor for manga from fascans.com"""
 65 |     chapterclass = FallenangelsChapterExtractor
 66 |     category = "fallenangels"
 67 |     pattern = r"(?:https?://)?((manga|truyen)\.fascans\.com/manga/[^/]+)/?$"
 68 |     test = (
 69 |         ("http://manga.fascans.com/manga/trinity-seven", {
 70 |             "url": "293057f264de6c438b979bd1c3de4719568db452",
 71 |             "keyword": "50e0374dba60734230e4284b5ffdadef5104ae62",
 72 |         }),
 73 |         ("https://truyen.fascans.com/manga/rakudai-kishi-no-eiyuutan", {
 74 |             "url": "51a731a6b82d5eb7a335fbae6b02d06aeb2ab07b",
 75 |             "keyword": "2d2a2a5d9ea5925eb9a47bb13d848967f3af086c",
 76 |         }),
 77 |     )
 78 | 
 79 |     def __init__(self, match):
 80 |         url = "https://" + match.group(1)
 81 |         self.lang = "vi" if match.group(2) == "truyen" else "en"
 82 |         MangaExtractor.__init__(self, match, url)
 83 | 
 84 |     def chapters(self, page):
 85 |         extr = text.extract_from(page)
 86 |         results = []
 87 |         language = util.code_to_language(self.lang)
 88 |         while extr('<li style="', '"'):
 89 |             vol = extr('class="volume-', '"')
 90 |             url = extr('href="', '"')
 91 |             cha = extr('>', '<')
 92 |             title = extr('<em>', '</em>')
 93 | 
 94 |             manga, _, chapter = cha.rpartition(" ")
 95 |             chapter, dot, minor = chapter.partition(".")
 96 |             results.append((url, {
 97 |                 "manga"   : manga,
 98 |                 "title"   : text.unescape(title),
 99 |                 "volume"  : text.parse_int(vol),
100 |                 "chapter" : text.parse_int(chapter),
101 |                 "chapter_minor": dot + minor,
102 |                 "lang"    : self.lang,
103 |                 "language": language,
104 |             }))
105 |         return results
106 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | from __future__ import unicode_literals, print_function
  5 | 
  6 | import sys
  7 | import os.path
  8 | import warnings
  9 | 
# Abort right away on interpreters older than Python 3.4
# (0x3040000 is the 'sys.hexversion' value of version 3.4.0).
if sys.hexversion < 0x3040000:
    sys.exit("Python 3.4+ required")

# Prefer setuptools and fall back to distutils if it cannot be imported.
# 'has_setuptools' is checked further down to decide between an
# entry-point script and the plain 'bin/gallery-dl' script.
try:
    from setuptools import setup
    has_setuptools = True
except ImportError:
    from distutils.core import setup
    has_setuptools = False
 19 | 
 20 | 
 21 | def read(fname):
 22 |     path = os.path.join(os.path.dirname(__file__), fname)
 23 |     with open(path, encoding="utf-8") as file:
 24 |         return file.read()
 25 | 
 26 | def check_file(fname):
 27 |     if os.path.exists(fname):
 28 |         return True
 29 |     warnings.warn(
 30 |         "Not including file '{}' since it is not present. "
 31 |         "Run 'make' to build all automatically generated files.".format(fname)
 32 |     )
 33 |     return False
 34 | 
 35 | 
# get version without importing the package
# NOTE(review): '__version__' is used below without being assigned anywhere
# else in this script, so it is presumably defined by the exec'd version.py.
exec(read("gallery_dl/version.py"))

# one-line summary of the project
DESCRIPTION = ("Command-line program to download image-galleries and "
               "-collections from several image hosting sites")
# full text of README.rst
LONG_DESCRIPTION = read("README.rst")
 42 | 
# Choose the build-specific settings collected in 'params'.
# NOTE(review): 'params' is presumably forwarded to setup() further down;
# confirm against the end of the setup() call.
if "py2exe" in sys.argv:
    # 'py2exe' was given on the command line: make sure the module can be
    # imported and describe the console executable to build
    try:
        import py2exe
    except ImportError:
        sys.exit("Error importing 'py2exe'")
    params = {
        "console": [{
            "script": "./gallery_dl/__main__.py",
            "dest_base": "gallery-dl",
            "version": __version__,
            "description": DESCRIPTION,
            "comments": LONG_DESCRIPTION,
            "product_name": "gallery-dl",
            "product_version": __version__,
        }],
        "options": {"py2exe": {
            "bundle_files": 0,
            "compressed": 1,
            "optimize": 1,
            "dist_dir": ".",
            "packages": ["gallery_dl"],
            "dll_excludes": ["w9xpopen.exe"],
        }},
        "zipfile": None,
    }
elif has_setuptools:
    # setuptools is available: declare a 'gallery-dl' console script
    # entry point that refers to 'gallery_dl:main'
    params = {
        "entry_points": {
            "console_scripts": [
                "gallery-dl = gallery_dl:main"
            ]
        }
    }
else:
    # plain distutils: list the launcher script from bin/ instead
    params = {
        "scripts": ["bin/gallery-dl"]
    }
 80 | 
 81 | data_files = [
 82 |     (path, [f for f in files if check_file(f)])
 83 |     for (path, files) in [
 84 |         ('etc/bash_completion.d', ['gallery-dl.bash_completion']),
 85 |         ('share/man/man1'       , ['gallery-dl.1']),
 86 |         ('share/man/man5'       , ['gallery-dl.conf.5']),
 87 |     ]
 88 | ]
 89 | 
 90 | 
# package metadata shared by all build variants; the variant-specific
# arguments collected in 'params' above are merged in at the end
setup(
    name="gallery_dl",
    version=__version__,
    description=DESCRIPTION,
    long_description=LONG_DESCRIPTION,
    url="https://github.com/mikf/gallery-dl",
    download_url="https://github.com/mikf/gallery-dl/releases/latest",
    author="Mike Fährmann",
    author_email="mike_faehrmann@web.de",
    maintainer="Mike Fährmann",
    maintainer_email="mike_faehrmann@web.de",
    license="GPLv2",
    python_requires=">=3.4",
    install_requires=[
        "requests>=2.11.0",
    ],
    packages=[
        "gallery_dl",
        "gallery_dl.extractor",
        "gallery_dl.downloader",
        "gallery_dl.postprocessor",
    ],
    data_files=data_files,
    keywords="image gallery downloader crawler scraper",
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Environment :: Console",
        "Intended Audience :: End Users/Desktop",
        "License :: OSI Approved :: GNU General Public License v2 (GPLv2)",
        "Operating System :: Microsoft :: Windows",
        "Operating System :: POSIX",
        "Operating System :: MacOS",
        "Programming Language :: Python :: 3.4",
        "Programming Language :: Python :: 3.5",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3 :: Only",
        "Topic :: Internet :: WWW/HTTP",
        "Topic :: Multimedia :: Graphics",
        "Topic :: Utilities",
    ],
    test_suite="test",
    **params
)
135 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/bobx.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2018-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extract images from http://www.bobx.com/dark/"""
 10 | 
 11 | from .common import Extractor, Message
 12 | from .. import text
 13 | 
 14 | 
 15 | class BobxExtractor(Extractor):
 16 |     """Base class for bobx extractors"""
 17 |     category = "bobx"
 18 |     root = "http://www.bobx.com"
 19 |     per_page = 80
 20 | 
 21 |     def __init__(self, match):
 22 |         Extractor.__init__(self, match)
 23 |         self.path = match.group(1)
 24 | 
 25 | 
 26 | class BobxGalleryExtractor(BobxExtractor):
 27 |     """Extractor for individual image galleries on bobx.com"""
 28 |     subcategory = "gallery"
 29 |     directory_fmt = ("{category}", "{model}", "{title}")
 30 |     filename_fmt = "{model}_{image_id}_{num:>03}.{extension}"
 31 |     archive_fmt = "{image_id}"
 32 |     pattern = (r"(?:https?://)?(?:www\.)?bobx\.com"
 33 |                r"/([^/]+/[^/]+/photoset/[\w-]+)-\d+-\d+-\d+\.html")
 34 |     test = (
 35 |         (("http://www.bobx.com/idol/mikoto-hibi"
 36 |           "/photoset/wpb-2018-_11-0-2-8.html"), {
 37 |             "url": "93972d6a661f6627e963d62c9d15531e6b36a389",
 38 |             "keyword": "6c620862db494ed05e69356ba30e604b167b0670",
 39 |             "content": "3f176b7fe752524cec21a763aa55567e41181e07",
 40 |         }),
 41 |         (("http://www.bobx.com/idol/nashiko-momotsuki"
 42 |           "/photoset/wpb-net-_221---2018-08---magic-of-summer-0-10-10.html"), {
 43 |             "url": "f5d6c0cd0881ae6f504c21a90d86e3464dc54e8e",
 44 |             "keyword": "f4819c75f494044348889ecd27771508464c0f5f",
 45 |         }),
 46 |     )
 47 | 
 48 |     def items(self):
 49 |         num = 0
 50 |         while True:
 51 |             url = "{}/{}-{}-10-8.html".format(self.root, self.path, num)
 52 |             page = self.request(url, encoding="utf-8").text
 53 | 
 54 |             if num == 0:
 55 |                 data = self.metadata(page)
 56 |                 yield Message.Version, 1
 57 |                 yield Message.Directory, data
 58 |                 data["num"] = 0
 59 | 
 60 |             for url in self.images(page):
 61 |                 url = text.urljoin(self.root, url.replace("-preview-", "-"))
 62 |                 data = text.nameext_from_url(url, data)
 63 |                 data["image_id"] = text.parse_int(
 64 |                     data["filename"].rpartition("-")[2])
 65 |                 data["num"] += 1
 66 |                 yield Message.Url, url, data
 67 | 
 68 |             num += self.per_page
 69 |             if num >= data["count"]:
 70 |                 return
 71 | 
 72 |     @staticmethod
 73 |     def metadata(page):
 74 |         """Collect metadata for extractor-job"""
 75 |         info = text.extract(page, "<title>", "</title>")[0]
 76 |         model, _, info = info.partition(" in ")
 77 |         info, _, count = info.rpartition(" of ")
 78 |         title = info.rpartition(" - @")[0]
 79 |         return {
 80 |             "title": text.unquote(title),
 81 |             "model": text.unquote(model),
 82 |             "count": text.parse_int(count),
 83 |         }
 84 | 
 85 |     @staticmethod
 86 |     def images(page):
 87 |         """Extract all image-urls"""
 88 |         page = text.extract(page, "<table CELLPADDING=", "<script ")[0]
 89 |         return text.extract_iter(page, '<img src="/thumbnail', '"')
 90 | 
 91 | 
 92 | class BobxIdolExtractor(BobxExtractor):
 93 |     """Extractor for an idol's image galleries on bobx.com"""
 94 |     subcategory = "idol"
 95 |     pattern = r"(?:https?://)?(?:www\.)?bobx\.com/([^/]+/[^/?&#]+)/?$"
 96 |     test = ("http://www.bobx.com/idol/rin-okabe/", {
 97 |         "url": "74d80bfcd53b738b31909bb42e5cc97c41b475b8",
 98 |     })
 99 | 
100 |     def items(self):
101 |         url = "{}/{}/".format(self.root, self.path)
102 |         data = {"_extractor": BobxGalleryExtractor}
103 |         page = self.request(url).text
104 |         skip = True
105 | 
106 |         yield Message.Version, 1
107 |         for part in text.extract_iter(page, '="photoset/', '"'):
108 |             # skip every other entry
109 |             skip = not skip
110 |             if skip:
111 |                 continue
112 |             yield Message.Queue, "{}photoset/{}".format(url, part), data
113 | 


--------------------------------------------------------------------------------
/scripts/release.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | set -e
  3 | 
  4 | prompt() {
  5 |     echo "root: ${ROOTDIR} old: ${OLDVERSION} - new: ${NEWVERSION}"
  6 |     read -n 1 -r -p "Proceed? [Y/n] " P
  7 |     echo
  8 |     if [ "$P" == y -o "$P" == Y -o -z "$P" ]; then
  9 |         return 0
 10 |     else
 11 |         exit 1
 12 |     fi
 13 | }
 14 | 
 15 | cleanup() {
 16 |     cd "${ROOTDIR}"
 17 |     echo Removing old build directory
 18 | 
 19 |     if [ -d ./build ]; then
 20 |         rm -rf ./build
 21 |     fi
 22 | }
 23 | 
 24 | update() {
 25 |     cd "${ROOTDIR}"
 26 |     echo Updating version to ${NEWVERSION}
 27 | 
 28 |     sed -i "s#\"${PYVERSION}\"#\"${NEWVERSION}\"#" "gallery_dl/version.py"
 29 |     sed -i "s#v${OLDVERSION}#v${NEWVERSION}#" "${README}"
 30 | }
 31 | 
 32 | update-dev() {
 33 |     cd "${ROOTDIR}"
 34 | 
 35 |     IFS="." read MAJOR MINOR BUILD <<< "${NEWVERSION}"
 36 |     BUILD=$((BUILD+1))
 37 |     # update version to -dev
 38 |     sed -i "s#\"${NEWVERSION}\"#\"${MAJOR}.${MINOR}.${BUILD}-dev\"#" "gallery_dl/version.py"
 39 |     # add 'unreleased' line to changelog
 40 |     sed -i "2i\\\n## Unreleased" "${CHANGELOG}"
 41 | 
 42 |     git add "gallery_dl/version.py" "${CHANGELOG}"
 43 | }
 44 | 
 45 | build-python() {
 46 |     cd "${ROOTDIR}"
 47 |     echo Building bdist_wheel and sdist
 48 | 
 49 |     python setup.py bdist_wheel sdist
 50 | }
 51 | 
 52 | build-linux() {
 53 |     cd "${ROOTDIR}"
 54 |     echo Building Linux executable
 55 | 
 56 |     make executable
 57 | }
 58 | 
 59 | build-windows() {
 60 |     cd "${ROOTDIR}/dist"
 61 |     echo Building Windows executable
 62 | 
 63 |     # remove old executable
 64 |     rm -f "gallery-dl.exe"
 65 | 
 66 |     # build windows exe in vm
 67 |     ln -fs "${ROOTDIR}" /tmp/
 68 |     vmstart "Windows 7" &
 69 |     disown
 70 |     while [ ! -e "gallery-dl.exe" ] ; do
 71 |         sleep 5
 72 |     done
 73 |     sleep 2
 74 | 
 75 |     # check exe version
 76 |     OUTPUT="$(wine gallery-dl.exe --version)"
 77 |     if [[ ! "${OUTPUT%?}" == "${NEWVERSION}" ]]; then
 78 |         echo "exe version mismatch: ${OUTPUT} != ${NEWVERSION}"
 79 |         exit 3
 80 |     fi
 81 | }
 82 | 
 83 | sign() {
 84 |     cd "${ROOTDIR}/dist"
 85 |     echo Signing files
 86 | 
 87 |     gpg --detach-sign --armor gallery_dl-${NEWVERSION}-py3-none-any.whl
 88 |     gpg --detach-sign --armor gallery_dl-${NEWVERSION}.tar.gz
 89 |     gpg --detach-sign --yes gallery-dl.exe
 90 |     gpg --detach-sign --yes gallery-dl.bin
 91 | }
 92 | 
 93 | changelog() {
 94 |     cd "${ROOTDIR}"
 95 |     echo Updating "${CHANGELOG}"
 96 | 
 97 |     # - replace "#NN" with link to actual issue
 98 |     # - insert new version and date
 99 |     sed -i \
100 |         -e "s*\([( ]\)#\([0-9]\+\)*\1[#\2](https://github.com/mikf/gallery-dl/issues/\2)*g" \
101 |         -e "s*^## [Uu]nreleased*## ${NEWVERSION} - $(date +%Y-%m-%d)*" \
102 |         "${CHANGELOG}"
103 | }
104 | 
# Regenerate the list of supported sites and abort with exit status 4
# if the result differs from what git has recorded for that file.
supportedsites() {
    cd "${ROOTDIR}"
    echo "Checking if ${SUPPORTEDSITES} is up to date"

    ./scripts/supportedsites.py
    if git diff --quiet "${SUPPORTEDSITES}"; then
        return
    fi
    echo "updated ${SUPPORTEDSITES} contains changes"
    exit 4
}
115 | 
# Commit the release changes, create a signed tag and push both.
git-upload() {
    local tag="v${NEWVERSION}"

    cd "${ROOTDIR}"
    echo "Pushing changes to github"

    git add "gallery_dl/version.py" "${README}" "${CHANGELOG}"
    git commit -S -m "release version ${NEWVERSION}"
    git tag -s -m "version ${NEWVERSION}" "${tag}"
    git push
    git push origin "${tag}"
}
126 | 
# Upload every file in dist/ that belongs to the new version to PyPI.
pypi-upload() {
    cd "${ROOTDIR}/dist"
    echo "Uploading to PyPI"

    # the glob is left unquoted on purpose so that it gets expanded
    twine upload "gallery_dl-${NEWVERSION}"*
}
133 | 
134 | 
# Paths used by the functions above; README, CHANGELOG and SUPPORTEDSITES
# are relative to ROOTDIR.
ROOTDIR="$(realpath "$(dirname "$0")/..")/"
README="README.rst"
CHANGELOG="CHANGELOG.md"
SUPPORTEDSITES="./docs/supportedsites.rst"

# OLDVERSION: most recent git tag without its leading "v"
# PYVERSION:  version currently reported by the gallery_dl package
LASTTAG="$(git describe --abbrev=0 --tags)"
OLDVERSION="${LASTTAG#v}"
PYVERSION="$(python -c "import gallery_dl as g; print(g.__version__)")"

# The version to release is the first argument, if given, or else the
# package version with a trailing "-dev" removed.
if [[ "$1" ]]; then
    NEWVERSION="$1"
else
    NEWVERSION="${PYVERSION%-dev}"
fi

# Anchor the pattern at both ends: without '^' anything that merely ends
# in a version number (e.g. "junk1.2.3") would be accepted.
VERSION_RE='^[0-9]+\.[0-9]+\.[0-9]+(-[a-z]+(\.[0-9]+)?)?$'
if [[ ! "${NEWVERSION}" =~ ${VERSION_RE} ]]; then
    echo "invalid version: $NEWVERSION"
    exit 2
fi


# release steps, in order; 'set -e' aborts the script on the first failure
prompt
supportedsites
cleanup
update
build-python
build-linux
build-windows
sign
changelog
git-upload
pypi-upload
update-dev
168 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/warosu.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2017-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extract images from https://warosu.org/"""
 10 | 
 11 | from .common import Extractor, Message
 12 | from .. import text
 13 | 
 14 | 
 15 | class WarosuThreadExtractor(Extractor):
 16 |     """Extractor for images from threads on warosu.org"""
 17 |     category = "warosu"
 18 |     subcategory = "thread"
 19 |     directory_fmt = ("{category}", "{board}", "{thread} - {title}")
 20 |     filename_fmt = "{tim}-{filename}.{extension}"
 21 |     archive_fmt = "{board}_{thread}_{tim}"
 22 |     pattern = r"(?:https?://)?(?:www\.)?warosu\.org/([^/]+)/thread/(\d+)"
 23 |     test = (
 24 |         ("https://warosu.org/jp/thread/16656025", {
 25 |             "url": "889d57246ed67e491e5b8f7f124e50ea7991e770",
 26 |             "keyword": "c00ea4c5460c5986994f17bb8416826d42ca57c0",
 27 |         }),
 28 |         ("https://warosu.org/jp/thread/16658073", {
 29 |             "url": "4500cf3184b067424fd9883249bd543c905fbecd",
 30 |             "keyword": "7534edf4ec51891dbf44d775b73fbbefd52eec71",
 31 |             "content": "d48df0a701e6599312bfff8674f4aa5d4fb8db1c",
 32 |         }),
 33 |     )
 34 |     root = "https://warosu.org"
 35 | 
 36 |     def __init__(self, match):
 37 |         Extractor.__init__(self, match)
 38 |         self.board, self.thread = match.groups()
 39 | 
 40 |     def items(self):
 41 |         url = "{}/{}/thread/{}".format(self.root, self.board, self.thread)
 42 |         page = self.request(url).text
 43 |         data = self.get_metadata(page)
 44 |         posts = self.posts(page)
 45 | 
 46 |         if not data["title"]:
 47 |             title = text.remove_html(posts[0]["com"])
 48 |             data["title"] = text.unescape(title)[:50]
 49 | 
 50 |         yield Message.Version, 1
 51 |         yield Message.Directory, data
 52 |         for post in posts:
 53 |             if "image" in post:
 54 |                 for key in ("w", "h", "no", "time", "tim"):
 55 |                     post[key] = text.parse_int(post[key])
 56 |                 post.update(data)
 57 |                 yield Message.Url, post["image"], post
 58 | 
 59 |     def get_metadata(self, page):
 60 |         """Collect metadata for extractor-job"""
 61 |         boardname = text.extract(page, "<title>", "</title>")[0]
 62 |         title = text.extract(page, 'filetitle" itemprop="name">', '<')[0]
 63 |         return {
 64 |             "board": self.board,
 65 |             "board_name": boardname.rpartition(" - ")[2],
 66 |             "thread": self.thread,
 67 |             "title": title,
 68 |         }
 69 | 
 70 |     def posts(self, page):
 71 |         """Build a list of all post-objects"""
 72 |         page = text.extract(page, '<div class="content">', '<table>')[0]
 73 |         needle = '<table itemscope itemtype="http://schema.org/Comment">'
 74 |         return [self.parse(post) for post in page.split(needle)]
 75 | 
 76 |     def parse(self, post):
 77 |         """Build post-object by extracting data from an HTML post"""
 78 |         data = self._extract_post(post)
 79 |         if "<span>File:" in post:
 80 |             self._extract_image(post, data)
 81 |             part = data["image"].rpartition("/")[2]
 82 |             data["tim"], _, data["extension"] = part.partition(".")
 83 |             data["ext"] = "." + data["extension"]
 84 |         return data
 85 | 
 86 |     @staticmethod
 87 |     def _extract_post(post):
 88 |         data = text.extract_all(post, (
 89 |             ("no"  , 'id="p', '"'),
 90 |             ("name", '<span itemprop="name">', '</span>'),
 91 |             ("time", '<span class="posttime" title="', '000">'),
 92 |             ("now" , '', '<'),
 93 |             ("com" , '<blockquote><p itemprop="text">', '</p></blockquote>'),
 94 |         ))[0]
 95 |         data["com"] = text.unescape(text.remove_html(data["com"].strip()))
 96 |         return data
 97 | 
 98 |     @staticmethod
 99 |     def _extract_image(post, data):
100 |         text.extract_all(post, (
101 |             ("fsize"   , '<span>File: ', ', '),
102 |             ("w"       , '', 'x'),
103 |             ("h"       , '', ', '),
104 |             ("filename", '', '<'),
105 |             ("image"   , '<br />\n<a href="', '"'),
106 |         ), 0, data)
107 |         data["filename"] = text.unquote(data["filename"].rpartition(".")[0])
108 |         data["image"] = "https:" + data["image"]
109 | 


--------------------------------------------------------------------------------
/gallery_dl/config.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2015-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Global configuration module"""
 10 | 
 11 | import sys
 12 | import json
 13 | import os.path
 14 | import logging
 15 | from . import util
 16 | 
# logger used for all messages emitted by this module
log = logging.getLogger("config")


# --------------------------------------------------------------------
# internals

# global configuration; used as the default 'conf' argument of the
# accessor functions below and filled by load()
_config = {}

# configuration files read by load() when no explicit list is given;
# the paths contain environment-variable placeholders and are passed
# through util.expand_path() before being opened
if os.name == "nt":
    _default_configs = [
        r"%USERPROFILE%\gallery-dl\config.json",
        r"%USERPROFILE%\gallery-dl.conf",
    ]
else:
    _default_configs = [
        "/etc/gallery-dl.conf",
        "${HOME}/.config/gallery-dl/config.json",
        "${HOME}/.gallery-dl.conf",
    ]
 36 | 
 37 | 
 38 | # --------------------------------------------------------------------
 39 | # public interface
 40 | 
 41 | def load(files=None, strict=False, fmt="json"):
 42 |     """Load JSON configuration files"""
 43 |     if fmt == "yaml":
 44 |         try:
 45 |             import yaml
 46 |             parsefunc = yaml.safe_load
 47 |         except ImportError:
 48 |             log.error("Could not import 'yaml' module")
 49 |             return
 50 |     else:
 51 |         parsefunc = json.load
 52 | 
 53 |     for path in files or _default_configs:
 54 |         path = util.expand_path(path)
 55 |         try:
 56 |             with open(path, encoding="utf-8") as file:
 57 |                 confdict = parsefunc(file)
 58 |         except OSError as exc:
 59 |             if strict:
 60 |                 log.error("%s", exc)
 61 |                 sys.exit(1)
 62 |         except Exception as exc:
 63 |             log.warning("Could not parse '%s': %s", path, exc)
 64 |             if strict:
 65 |                 sys.exit(2)
 66 |         else:
 67 |             if not _config:
 68 |                 _config.update(confdict)
 69 |             else:
 70 |                 util.combine_dict(_config, confdict)
 71 | 
 72 | 
 73 | def clear():
 74 |     """Reset configuration to an empty state"""
 75 |     _config.clear()
 76 | 
 77 | 
 78 | def get(keys, default=None, conf=_config):
 79 |     """Get the value of property 'key' or a default value"""
 80 |     try:
 81 |         for k in keys:
 82 |             conf = conf[k]
 83 |         return conf
 84 |     except (KeyError, AttributeError):
 85 |         return default
 86 | 
 87 | 
 88 | def interpolate(keys, default=None, conf=_config):
 89 |     """Interpolate the value of 'key'"""
 90 |     try:
 91 |         lkey = keys[-1]
 92 |         if lkey in conf:
 93 |             return conf[lkey]
 94 |         for k in keys:
 95 |             if lkey in conf:
 96 |                 default = conf[lkey]
 97 |             conf = conf[k]
 98 |         return conf
 99 |     except (KeyError, AttributeError):
100 |         return default
101 | 
102 | 
def set(keys, value, conf=_config):
    """Set the value of property 'key' for this session

    Missing intermediate dictionaries along 'keys' are created.
    """
    node = conf
    for name in keys[:-1]:
        try:
            node = node[name]
        except KeyError:
            child = {}
            node[name] = child
            node = child
    node[keys[-1]] = value
113 | 
114 | 
def setdefault(keys, value, conf=_config):
    """Set the value of property 'key' if it doesn't exist

    Missing intermediate dictionaries along 'keys' are created. Returns
    the value stored under the final key afterwards.
    """
    node = conf
    for name in keys[:-1]:
        try:
            node = node[name]
        except KeyError:
            child = {}
            node[name] = child
            node = child
    return node.setdefault(keys[-1], value)
125 | 
126 | 
def unset(keys, conf=_config):
    """Unset the value of property 'key'

    Nothing happens if the path described by 'keys' does not exist or
    leads through a value that cannot be subscripted.
    """
    try:
        for k in keys[:-1]:
            conf = conf[k]
        del conf[keys[-1]]
    # TypeError: an intermediate value is not a container and supports
    # neither subscription nor item deletion
    except (KeyError, TypeError, AttributeError):
        pass
135 | 
136 | 
class apply:
    """Context Manager: temporarily apply a collection of key-value pairs

    Every (key, value) pair in 'kvlist' is set on entering the context;
    on leaving it, each key is reset to its previous value or removed
    again if it had none.
    """
    _sentinel = object()

    def __init__(self, kvlist):
        self.original = []
        self.kvlist = kvlist

    def __enter__(self):
        remember = self.original.append
        for key, new_value in self.kvlist:
            # record the current state before overwriting it;
            # the sentinel stands for "no value present"
            remember((key, get(key, self._sentinel)))
            set(key, new_value)

    def __exit__(self, etype, value, traceback):
        for key, previous in self.original:
            if previous is not self._sentinel:
                set(key, previous)
            else:
                unset(key)
156 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/paheal.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2018-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extract images from https://rule34.paheal.net/"""
 10 | 
 11 | from .common import Extractor, Message, SharedConfigMixin
 12 | from .. import text
 13 | 
 14 | 
 15 | class PahealExtractor(SharedConfigMixin, Extractor):
 16 |     """Base class for paheal extractors"""
 17 |     basecategory = "booru"
 18 |     category = "paheal"
 19 |     filename_fmt = "{category}_{id}_{md5}.{extension}"
 20 |     archive_fmt = "{id}"
 21 |     root = "https://rule34.paheal.net"
 22 | 
 23 |     def items(self):
 24 |         yield Message.Version, 1
 25 |         yield Message.Directory, self.get_metadata()
 26 | 
 27 |         for data in self.get_posts():
 28 |             url = data["file_url"]
 29 |             for key in ("id", "width", "height"):
 30 |                 data[key] = text.parse_int(data[key])
 31 |             data["tags"] = text.unquote(data["tags"])
 32 |             yield Message.Url, url, text.nameext_from_url(url, data)
 33 | 
 34 |     def get_metadata(self):
 35 |         """Return general metadata"""
 36 |         return {}
 37 | 
 38 |     def get_posts(self):
 39 |         """Return an iterable containing data of all relevant posts"""
 40 | 
 41 | 
 42 | class PahealTagExtractor(PahealExtractor):
 43 |     """Extractor for images from rule34.paheal.net by search-tags"""
 44 |     subcategory = "tag"
 45 |     directory_fmt = ("{category}", "{search_tags}")
 46 |     pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
 47 |                r"/post/list/([^/?&#]+)")
 48 |     test = ("https://rule34.paheal.net/post/list/k-on/1", {
 49 |         "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20",
 50 |         "count": ">= 15"
 51 |     })
 52 |     per_page = 70
 53 | 
 54 |     def __init__(self, match):
 55 |         PahealExtractor.__init__(self, match)
 56 |         self.tags = text.unquote(match.group(1))
 57 | 
 58 |     def get_metadata(self):
 59 |         return {"search_tags": self.tags}
 60 | 
 61 |     def get_posts(self):
 62 |         pnum = 1
 63 |         while True:
 64 |             url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
 65 |             page = self.request(url).text
 66 | 
 67 |             for post in text.extract_iter(
 68 |                     page, '<img id="thumb_', '>Image Only<'):
 69 |                 yield self._extract_data(post)
 70 | 
 71 |             if ">Next<" not in page:
 72 |                 return
 73 |             pnum += 1
 74 | 
 75 |     @staticmethod
 76 |     def _extract_data(post):
 77 |         pid , pos = text.extract(post, '', '"')
 78 |         data, pos = text.extract(post, 'title="', '"', pos)
 79 |         md5 , pos = text.extract(post, '/_thumbs/', '/', pos)
 80 |         url , pos = text.extract(post, '<a href="', '"', pos)
 81 | 
 82 |         tags, dimensions, size, _ = data.split(" // ")
 83 |         width, _, height = dimensions.partition("x")
 84 | 
 85 |         return {
 86 |             "id": pid, "md5": md5, "tags": tags, "file_url": url,
 87 |             "width": width, "height": height,
 88 |             "size": text.parse_bytes(size[:-1]),
 89 |         }
 90 | 
 91 | 
 92 | class PahealPostExtractor(PahealExtractor):
 93 |     """Extractor for single images from rule34.paheal.net"""
 94 |     subcategory = "post"
 95 |     pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net"
 96 |                r"/post/view/(\d+)")
 97 |     test = ("https://rule34.paheal.net/post/view/481609", {
 98 |         "url": "1142779378f655ec0497d4c301836aa667f788b1",
 99 |         "keyword": "34e9e93d4fa6fa06fac1a56e78c9a52e8cd7b271",
100 |         "content": "7b924bcf150b352ac75c9d281d061e174c851a11",
101 |     })
102 | 
103 |     def __init__(self, match):
104 |         PahealExtractor.__init__(self, match)
105 |         self.post_id = match.group(1)
106 | 
107 |     def get_posts(self):
108 |         url = "{}/post/view/{}".format(self.root, self.post_id)
109 |         page = self.request(url).text
110 | 
111 |         tags  , pos = text.extract(page, ": ", "<")
112 |         md5   , pos = text.extract(page, "/_thumbs/", "/", pos)
113 |         url   , pos = text.extract(page, "id='main_image' src='", "'", pos)
114 |         width , pos = text.extract(page, "data-width='", "'", pos)
115 |         height, pos = text.extract(page, "data-height='", "'", pos)
116 | 
117 |         return ({
118 |             "id": self.post_id, "md5": md5, "tags": tags, "file_url": url,
119 |             "width": width, "height": height, "size": 0,
120 |         },)
121 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/komikcast.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2018-2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extract manga-chapters and entire manga from https://komikcast.com/"""
 10 | 
 11 | from .common import ChapterExtractor, MangaExtractor
 12 | from .. import text
 13 | import re
 14 | 
 15 | 
 16 | class KomikcastBase():
 17 |     """Base class for komikcast extractors"""
 18 |     category = "komikcast"
 19 |     root = "https://komikcast.com"
 20 | 
 21 |     @staticmethod
 22 |     def parse_chapter_string(chapter_string, data=None):
 23 |         """Parse 'chapter_string' value and add its info to 'data'"""
 24 |         if not data:
 25 |             data = {}
 26 | 
 27 |         match = re.match(
 28 |             r"(?:(.*) Chapter )?0*(\d+)([^ ]*)(?: (?:- )?(.+))?",
 29 |             text.unescape(chapter_string),
 30 |         )
 31 |         manga, chapter, data["chapter_minor"], title = match.groups()
 32 | 
 33 |         if manga:
 34 |             data["manga"] = manga.partition(" Chapter ")[0]
 35 |         if title and title.lower() != "bahasa indonesia":
 36 |             data["title"] = title.strip()
 37 |         else:
 38 |             data["title"] = ""
 39 |         data["chapter"] = text.parse_int(chapter)
 40 |         data["lang"] = "id"
 41 |         data["language"] = "Indonesian"
 42 | 
 43 |         return data
 44 | 
 45 | 
 46 | class KomikcastChapterExtractor(KomikcastBase, ChapterExtractor):
 47 |     """Extractor for manga-chapters from komikcast.com"""
 48 |     pattern = r"(?:https?://)?(?:www\.)?komikcast\.com(/chapter/[^/?&#]+/)"
 49 |     test = (
 50 |         (("https://komikcast.com/chapter/"
 51 |           "apotheosis-chapter-02-2-bahasa-indonesia/"), {
 52 |             "url": "f6b43fbc027697749b3ea1c14931c83f878d7936",
 53 |             "keyword": "f3938e1aff9ad1f302f52447e9781b21f6da26d4",
 54 |         }),
 55 |         (("https://komikcast.com/chapter/"
 56 |           "tonari-no-kashiwagi-san-chapter-18b/"), {
 57 |             "url": "aff90dd21dbb945a726778b10bdef522af7c42fe",
 58 |             "keyword": "19b5783864c4299913de436513b124b028b557c1",
 59 |         }),
 60 |         (("https://komikcast.com/chapter/090-eko-to-issho-chapter-1/"), {
 61 |             "url": "cda104a32ea2b06b3d6b096726622f519ed1fa33",
 62 |         }),
 63 |     )
 64 | 
 65 |     def metadata(self, page):
 66 |         info = text.extract(page, '<b>', "</b>")[0]
 67 |         return self.parse_chapter_string(info)
 68 | 
 69 |     @staticmethod
 70 |     def images(page):
 71 |         readerarea = text.extract(
 72 |             page, '<div id="readerarea">', '<div class="navig">')[0]
 73 |         return [
 74 |             (text.unescape(url), None)
 75 |             for url in re.findall(r"<img[^>]* src=[\"']([^\"']+)", readerarea)
 76 |             if "/Banner-" not in url and "/WM-Sampingan." not in url
 77 |         ]
 78 | 
 79 | 
 80 | class KomikcastMangaExtractor(KomikcastBase, MangaExtractor):
 81 |     """Extractor for manga from komikcast.com"""
 82 |     chapterclass = KomikcastChapterExtractor
 83 |     pattern = (r"(?:https?://)?(?:www\.)?komikcast\.com"
 84 |                r"(/(?:komik/)?[^/?&#]+)/?$")
 85 |     test = (
 86 |         ("https://komikcast.com/komik/090-eko-to-issho/", {
 87 |             "url": "dc798d107697d1f2309b14ca24ca9dba30c6600f",
 88 |             "keyword": "837a7e96867344ff59d840771c04c20dc46c0ab1",
 89 |         }),
 90 |         ("https://komikcast.com/tonari-no-kashiwagi-san/"),
 91 |     )
 92 | 
 93 |     def chapters(self, page):
 94 |         results = []
 95 |         data = self.metadata(page)
 96 | 
 97 |         for item in text.extract_iter(
 98 |                 page, '<span class="leftoff"><a href="', '</a>'):
 99 |             url, _, chapter_string = item.rpartition('">Chapter ')
100 |             self.parse_chapter_string(chapter_string, data)
101 |             results.append((url, data.copy()))
102 |         return results
103 | 
104 |     @staticmethod
105 |     def metadata(page):
106 |         """Return a dict with general metadata"""
107 |         manga , pos = text.extract(page, "<title>" , "</title>")
108 |         genres, pos = text.extract(page, ">Genres:", "</span>", pos)
109 |         author, pos = text.extract(page, ">Author:", "</span>", pos)
110 |         mtype , pos = text.extract(page, ">Type:"  , "</span>", pos)
111 | 
112 |         return {
113 |             "manga": text.unescape(manga[:-12]),
114 |             "author": text.remove_html(author),
115 |             "genres": text.split_html(genres)[::2],
116 |             "type": text.remove_html(mtype),
117 |         }
118 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/plurk.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extractors for https://www.plurk.com/"""
 10 | 
 11 | from .common import Extractor, Message
 12 | from .. import text, extractor, exception
 13 | import datetime
 14 | import json
 15 | import re
 16 | 
 17 | 
 18 | class PlurkExtractor(Extractor):
 19 |     """Base class for plurk extractors"""
 20 |     category = "plurk"
 21 |     root = "https://www.plurk.com"
 22 | 
 23 |     def items(self):
 24 |         urls = self._urls_ex if self.config("comments", False) else self._urls
 25 | 
 26 |         yield Message.Version, 1
 27 |         with extractor.blacklist(("plurk",)):
 28 |             for plurk in self.plurks():
 29 |                 for url in urls(plurk):
 30 |                     yield Message.Queue, url, plurk
 31 | 
 32 |     def plurks(self):
 33 |         """Return an iterable with all relevant 'plurk' objects"""
 34 | 
 35 |     @staticmethod
 36 |     def _urls(obj):
 37 |         """Extract URLs from a 'plurk' object"""
 38 |         return text.extract_iter(obj["content"], ' href="', '"')
 39 | 
 40 |     def _urls_ex(self, plurk):
 41 |         """Extract URLs from a 'plurk' and its comments"""
 42 |         yield from self._urls(plurk)
 43 |         for comment in self._comments(plurk):
 44 |             yield from self._urls(comment)
 45 | 
 46 |     def _comments(self, plurk):
 47 |         """Return an iterable with a 'plurk's comments"""
 48 |         url = "https://www.plurk.com/Responses/get"
 49 |         data = {"plurk_id": plurk["id"], "count": "200"}
 50 | 
 51 |         while True:
 52 |             info = self.request(url, "POST", data=data).json()
 53 |             yield from info["responses"]
 54 |             if not info["has_newer"]:
 55 |                 return
 56 |             data["from_response_id"] = info["responses"][-1]["id"]
 57 | 
 58 |     @staticmethod
 59 |     def _load(data):
 60 |         if not data:
 61 |             raise exception.NotFoundError("user")
 62 |         return json.loads(re.sub(r"new Date\(([^)]+)\)", r"\1", data))
 63 | 
 64 | 
 65 | class PlurkTimelineExtractor(PlurkExtractor):
 66 |     """Extractor for URLs from all posts in a Plurk timeline"""
 67 |     subcategory = "timeline"
 68 |     pattern = r"(?:https?://)?(?:www\.)?plurk\.com/(?!p/)(\w+)/?(?:$|[?&#])"
 69 |     test = ("https://www.plurk.com/plurkapi", {
 70 |         "pattern": r"https?://.+",
 71 |         "count": ">= 23"
 72 |     })
 73 | 
 74 |     def __init__(self, match):
 75 |         PlurkExtractor.__init__(self, match)
 76 |         self.user = match.group(1)
 77 | 
 78 |     def plurks(self):
 79 |         url = "{}/{}".format(self.root, self.user)
 80 |         page = self.request(url).text
 81 |         user_id, pos = text.extract(page, '"user_id":', ',')
 82 |         plurks = self._load(text.extract(page, "_PLURKS = ", ";\n", pos)[0])
 83 | 
 84 |         url = "https://www.plurk.com/TimeLine/getPlurks"
 85 |         data = {"user_id": user_id.strip()}
 86 |         headers = {"Referer": url, "X-Requested-With": "XMLHttpRequest"}
 87 | 
 88 |         while plurks:
 89 |             yield from plurks
 90 | 
 91 |             offset = datetime.datetime.strptime(
 92 |                 plurks[-1]["posted"], "%a, %d %b %Y %H:%M:%S %Z")
 93 |             data["offset"] = offset.strftime("%Y-%m-%dT%H:%M:%S.000Z")
 94 |             response = self.request(url, "POST", headers=headers, data=data)
 95 |             plurks = response.json()["plurks"]
 96 | 
 97 | 
 98 | class PlurkPostExtractor(PlurkExtractor):
 99 |     """Extractor for URLs from a Plurk post"""
100 |     subcategory = "post"
101 |     pattern = r"(?:https?://)?(?:www\.)?plurk\.com/p/(\w+)"
102 |     test = (
103 |         ("https://www.plurk.com/p/i701j1", {
104 |             "url": "2115f208564591b8748525c2807a84596aaaaa5f",
105 |             "count": 3,
106 |         }),
107 |         ("https://www.plurk.com/p/i701j1", {
108 |             "options": (("comments", True),),
109 |             "count": ">= 210",
110 |         }),
111 |     )
112 | 
113 |     def __init__(self, match):
114 |         PlurkExtractor.__init__(self, match)
115 |         self.plurk_id = match.group(1)
116 | 
117 |     def plurks(self):
118 |         url = "{}/p/{}".format(self.root, self.plurk_id)
119 |         page = self.request(url).text
120 |         user, pos = text.extract(page, " GLOBAL = ", "\n")
121 |         data, pos = text.extract(page, "plurk = ", ";\n", pos)
122 | 
123 |         data = self._load(data)
124 |         data["user"] = self._load(user)["page_user"]
125 |         return (data,)
126 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/sankakucomplex.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extractors for https://www.sankakucomplex.com/"""
 10 | 
 11 | from .common import Extractor, Message
 12 | from .. import text
 13 | import re
 14 | 
 15 | 
 16 | class SankakucomplexExtractor(Extractor):
 17 |     """Base class for sankakucomplex extractors"""
 18 |     category = "sankakucomplex"
 19 |     root = "https://www.sankakucomplex.com"
 20 | 
 21 |     def __init__(self, match):
 22 |         Extractor.__init__(self, match)
 23 |         self.path = match.group(1)
 24 | 
 25 | 
 26 | class SankakucomplexArticleExtractor(SankakucomplexExtractor):
 27 |     """Extractor for articles on www.sankakucomplex.com"""
 28 |     subcategory = "article"
 29 |     directory_fmt = ("{category}", "{date:%Y-%m-%d} {title}")
 30 |     filename_fmt = "{filename}.{extension}"
 31 |     archive_fmt = "{date:%Y%m%d}_{filename}"
 32 |     pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
 33 |                r"/(\d{4}/\d\d/\d\d/[^/?&#]+)")
 34 |     test = (
 35 |         ("https://www.sankakucomplex.com/2019/05/11/twitter-cosplayers", {
 36 |             "url": "4a9ecc5ae917fbce469280da5b6a482510cae84d",
 37 |             "keyword": "bfe08310e7d9a572f568f6900e0ed0eb295aa2b3",
 38 |         }),
 39 |         ("https://www.sankakucomplex.com/2009/12/01/sexy-goddesses-of-2ch", {
 40 |             "url": "a1e249173fd6c899a8134fcfbd9c925588a63f7c",
 41 |             "keyword": "e78fcc23c2711befc0969a45ea5082a29efccf68",
 42 |         }),
 43 |     )
 44 | 
 45 |     def items(self):
 46 |         url = "{}/{}/?pg=X".format(self.root, self.path)
 47 |         extr = text.extract_from(self.request(url).text)
 48 |         data = {
 49 |             "title"      : text.unescape(
 50 |                 extr('property="og:title" content="', '"')),
 51 |             "description": text.unescape(
 52 |                 extr('property="og:description" content="', '"')),
 53 |             "date"       : text.parse_datetime(
 54 |                 extr('property="article:published_time" content="', '"')),
 55 |         }
 56 |         imgs = self.images(extr)
 57 |         data["count"] = len(imgs)
 58 |         data["tags"] = text.split_html(extr('="meta-tags">', '</div>'))[::2]
 59 | 
 60 |         yield Message.Version, 1
 61 |         yield Message.Directory, data
 62 |         for img in imgs:
 63 |             img.update(data)
 64 |             yield Message.Url, img["url"], img
 65 | 
 66 |     def images(self, extr):
 67 |         num = 0
 68 |         imgs = []
 69 |         urls = set()
 70 |         orig = re.compile(r"-\d+x\d+\.")
 71 | 
 72 |         extr('<div class="entry-content">', '')
 73 |         while True:
 74 |             url = extr('data-lazy-src="', '"')
 75 |             if not url:
 76 |                 return imgs
 77 |             if url in urls:
 78 |                 continue
 79 |             if url[0] == "/":
 80 |                 url = text.urljoin(self.root, url)
 81 |             url = orig.sub(".", url)
 82 |             num += 1
 83 |             imgs.append(text.nameext_from_url(url, {
 84 |                 "url"   : url,
 85 |                 "num"   : num,
 86 |             }))
 87 |             urls.add(url)
 88 | 
 89 | 
 90 | class SankakucomplexTagExtractor(SankakucomplexExtractor):
 91 |     """Extractor for sankakucomplex blog articles by tag or author"""
 92 |     subcategory = "tag"
 93 |     pattern = (r"(?:https?://)?www\.sankakucomplex\.com"
 94 |                r"/((?:tag|category|author)/[^/&?#]+)")
 95 |     test = (
 96 |         ("https://www.sankakucomplex.com/tag/cosplay/", {
 97 |             "range": "1-50",
 98 |             "count": 50,
 99 |             "pattern": SankakucomplexArticleExtractor.pattern,
100 |         }),
101 |         ("https://www.sankakucomplex.com/category/anime/"),
102 |         ("https://www.sankakucomplex.com/author/rift/page/5/"),
103 |     )
104 | 
105 |     def items(self):
106 |         pnum = 1
107 |         last = None
108 |         data = {"_extractor": SankakucomplexArticleExtractor}
109 | 
110 |         yield Message.Version, 1
111 |         while True:
112 |             url = "{}/{}/page/{}/".format(self.root, self.path, pnum)
113 |             response = self.request(url, fatal=False)
114 |             if response.status_code >= 400:
115 |                 return
116 |             for url in text.extract_iter(response.text, 'data-direct="', '"'):
117 |                 if url != last:
118 |                     last = url
119 |                     yield Message.Queue, url, data
120 |             pnum += 1
121 | 


--------------------------------------------------------------------------------
/test/test_cookies.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # Copyright 2017 Mike Fährmann
  5 | #
  6 | # This program is free software; you can redistribute it and/or modify
  7 | # it under the terms of the GNU General Public License version 2 as
  8 | # published by the Free Software Foundation.
  9 | 
 10 | import unittest
 11 | from unittest import mock
 12 | 
 13 | import logging
 14 | import tempfile
 15 | import http.cookiejar
 16 | from os.path import join
 17 | 
 18 | import gallery_dl.config as config
 19 | import gallery_dl.extractor as extractor
 20 | 
# key passed to config.set() by the tests below to configure cookies
CKEY = ("cookies",)
 22 | 
 23 | 
 24 | class TestCookiejar(unittest.TestCase):
 25 | 
 26 |     @classmethod
 27 |     def setUpClass(cls):
 28 |         cls.path = tempfile.TemporaryDirectory()
 29 | 
 30 |         cls.cookiefile = join(cls.path.name, "cookies.txt")
 31 |         with open(cls.cookiefile, "w") as file:
 32 |             file.write("""# HTTP Cookie File
 33 | .example.org\tTRUE\t/\tFALSE\t253402210800\tNAME\tVALUE
 34 | """)
 35 | 
 36 |         cls.invalid_cookiefile = join(cls.path.name, "invalid.txt")
 37 |         with open(cls.invalid_cookiefile, "w") as file:
 38 |             file.write("""# asd
 39 | .example.org\tTRUE\t/\tFALSE\t253402210800\tNAME\tVALUE
 40 | """)
 41 | 
 42 |     @classmethod
 43 |     def tearDownClass(cls):
 44 |         cls.path.cleanup()
 45 |         config.clear()
 46 | 
 47 |     def test_cookiefile(self):
 48 |         config.set(CKEY, self.cookiefile)
 49 | 
 50 |         cookies = extractor.find("test:").session.cookies
 51 |         self.assertEqual(len(cookies), 1)
 52 | 
 53 |         cookie = next(iter(cookies))
 54 |         self.assertEqual(cookie.domain, ".example.org")
 55 |         self.assertEqual(cookie.path  , "/")
 56 |         self.assertEqual(cookie.name  , "NAME")
 57 |         self.assertEqual(cookie.value , "VALUE")
 58 | 
 59 |     def test_invalid_cookiefile(self):
 60 |         self._test_warning(self.invalid_cookiefile, http.cookiejar.LoadError)
 61 | 
 62 |     def test_invalid_filename(self):
 63 |         self._test_warning(join(self.path.name, "nothing"), FileNotFoundError)
 64 | 
 65 |     def _test_warning(self, filename, exc):
 66 |         config.set(CKEY, filename)
 67 |         log = logging.getLogger("test")
 68 |         with mock.patch.object(log, "warning") as mock_warning:
 69 |             cookies = extractor.find("test:").session.cookies
 70 |             self.assertEqual(len(cookies), 0)
 71 |             self.assertEqual(mock_warning.call_count, 1)
 72 |             self.assertEqual(mock_warning.call_args[0][0], "cookies: %s")
 73 |             self.assertIsInstance(mock_warning.call_args[0][1], exc)
 74 | 
 75 | 
 76 | class TestCookiedict(unittest.TestCase):
 77 | 
 78 |     def setUp(self):
 79 |         self.cdict = {"NAME1": "VALUE1", "NAME2": "VALUE2"}
 80 |         config.set(CKEY, self.cdict)
 81 | 
 82 |     def tearDown(self):
 83 |         config.clear()
 84 | 
 85 |     def test_dict(self):
 86 |         cookies = extractor.find("test:").session.cookies
 87 |         self.assertEqual(len(cookies), len(self.cdict))
 88 |         self.assertEqual(sorted(cookies.keys()), sorted(self.cdict.keys()))
 89 |         self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values()))
 90 | 
 91 |     def test_domain(self):
 92 |         for category in ["exhentai", "nijie", "sankaku", "seiga"]:
 93 |             extr = _get_extractor(category)
 94 |             cookies = extr.session.cookies
 95 |             for key in self.cdict:
 96 |                 self.assertTrue(key in cookies)
 97 |             for c in cookies:
 98 |                 self.assertEqual(c.domain, extr.cookiedomain)
 99 | 
100 | 
class TestCookieLogin(unittest.TestCase):
    """Tests that configured login cookies bypass '_login_impl'"""

    def tearDown(self):
        config.clear()

    def test_cookie_login(self):
        extr_cookies = {
            "exhentai": ("ipb_member_id", "ipb_pass_hash"),
            "nijie"   : ("nemail", "nlogin"),
            "sankaku" : ("login", "pass_hash"),
            "seiga"   : ("user_session",),
        }
        for category in extr_cookies:
            cookienames = extr_cookies[category]
            config.set(CKEY, dict.fromkeys(cookienames, "value"))
            extr = _get_extractor(category)
            with mock.patch.object(extr, "_login_impl") as mock_login:
                extr.login()
                mock_login.assert_not_called()
120 | 
121 | 
def _get_extractor(category):
    """Build an extractor of 'category' that defines '_login_impl'

    The instance is created from the URL of the first test of the first
    matching extractor class; None is returned if no class matches.
    """
    for extr in extractor.extractors():
        if extr.category != category or not hasattr(extr, "_login_impl"):
            continue
        first_test = next(extr._get_tests())
        return extr.from_url(first_test[0])
127 | 
128 | 
# run all tests of this module when it is executed as a script
if __name__ == "__main__":
    unittest.main()
131 | 


--------------------------------------------------------------------------------
/gallery_dl/oauth.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2018 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """OAuth helper functions and classes"""
 10 | 
 11 | import hmac
 12 | import time
 13 | import base64
 14 | import random
 15 | import string
 16 | import hashlib
 17 | import urllib.parse
 18 | 
 19 | import requests
 20 | import requests.auth
 21 | 
 22 | from . import text
 23 | 
 24 | 
 25 | def nonce(size, alphabet=string.ascii_letters):
 26 |     """Generate a nonce value with 'size' characters"""
 27 |     return "".join(random.choice(alphabet) for _ in range(size))
 28 | 
 29 | 
 30 | def quote(value, quote=urllib.parse.quote):
 31 |     """Quote 'value' according to the OAuth1.0 standard"""
 32 |     return quote(value, "~")
 33 | 
 34 | 
 35 | def concat(*args):
 36 |     """Concatenate 'args' as expected by OAuth1.0"""
 37 |     return "&".join(quote(item) for item in args)
 38 | 
 39 | 
 40 | class OAuth1Session(requests.Session):
 41 |     """Extension to requests.Session to support OAuth 1.0"""
 42 | 
 43 |     def __init__(self, consumer_key, consumer_secret,
 44 |                  token=None, token_secret=None):
 45 | 
 46 |         requests.Session.__init__(self)
 47 |         self.auth = OAuth1Client(
 48 |             consumer_key, consumer_secret,
 49 |             token, token_secret,
 50 |         )
 51 | 
 52 |     def rebuild_auth(self, prepared_request, response):
 53 |         if "Authorization" in prepared_request.headers:
 54 |             del prepared_request.headers["Authorization"]
 55 |             prepared_request.prepare_auth(self.auth)
 56 | 
 57 | 
 58 | class OAuth1Client(requests.auth.AuthBase):
 59 |     """OAuth1.0a authentication"""
 60 | 
 61 |     def __init__(self, consumer_key, consumer_secret,
 62 |                  token=None, token_secret=None):
 63 | 
 64 |         self.consumer_key = consumer_key
 65 |         self.consumer_secret = consumer_secret
 66 |         self.token = token
 67 |         self.token_secret = token_secret
 68 | 
 69 |     def __call__(self, request):
 70 |         oauth_params = [
 71 |             ("oauth_consumer_key", self.consumer_key),
 72 |             ("oauth_nonce", nonce(16)),
 73 |             ("oauth_signature_method", "HMAC-SHA1"),
 74 |             ("oauth_timestamp", str(int(time.time()))),
 75 |             ("oauth_version", "1.0"),
 76 |         ]
 77 |         if self.token:
 78 |             oauth_params.append(("oauth_token", self.token))
 79 | 
 80 |         signature = self.generate_signature(request, oauth_params)
 81 |         oauth_params.append(("oauth_signature", signature))
 82 | 
 83 |         request.headers["Authorization"] = "OAuth " + ",".join(
 84 |             key + '="' + value + '"' for key, value in oauth_params)
 85 | 
 86 |         return request
 87 | 
 88 |     def generate_signature(self, request, params):
 89 |         """Generate 'oauth_signature' value"""
 90 |         url, _, query = request.url.partition("?")
 91 | 
 92 |         params = params.copy()
 93 |         for key, value in text.parse_query(query).items():
 94 |             params.append((quote(key), quote(value)))
 95 |         params.sort()
 96 |         query = "&".join("=".join(item) for item in params)
 97 | 
 98 |         message = concat(request.method, url, query).encode()
 99 |         key = concat(self.consumer_secret, self.token_secret or "").encode()
100 |         signature = hmac.new(key, message, hashlib.sha1).digest()
101 | 
102 |         return quote(base64.b64encode(signature).decode())
103 | 
104 | 
class OAuth1API():
    """Base class for OAuth1.0 based API interfaces"""
    API_KEY = None
    API_SECRET = None

    def __init__(self, extractor):
        """Select OAuth1.0 or api_key authentication from the config"""
        self.log = extractor.log
        self.extractor = extractor

        api_key = extractor.config("api-key", self.API_KEY)
        api_secret = extractor.config("api-secret", self.API_SECRET)
        token = extractor.config("access-token")
        token_secret = extractor.config("access-token-secret")

        # OAuth is used only if all four credentials are available
        if not (api_key and api_secret and token and token_secret):
            self.log.debug("Using api_key authentication")
            self.session = extractor.session
            self.api_key = api_key
            return

        self.log.debug("Using OAuth1.0 authentication")
        self.session = OAuth1Session(
            api_key, api_secret, token, token_secret)
        self.api_key = None

    def request(self, url, method="GET", **kwargs):
        """Send a request through the extractor with this API's session"""
        kwargs.update(fatal=None, session=self.session)
        return self.extractor.request(url, method, **kwargs)
133 | 


--------------------------------------------------------------------------------
/gallery_dl/extractor/shopify.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | # Copyright 2019 Mike Fährmann
  4 | #
  5 | # This program is free software; you can redistribute it and/or modify
  6 | # it under the terms of the GNU General Public License version 2 as
  7 | # published by the Free Software Foundation.
  8 | 
  9 | """Extractors for Shopify instances"""
 10 | 
 11 | from .common import Extractor, Message, SharedConfigMixin, generate_extractors
 12 | from .. import text
 13 | import re
 14 | 
 15 | 
 16 | class ShopifyExtractor(SharedConfigMixin, Extractor):
 17 |     """Base class for Shopify extractors"""
 18 |     basecategory = "shopify"
 19 |     filename_fmt = "{product[title]}_{num:>02}_{id}.{extension}"
 20 |     archive_fmt = "{id}"
 21 | 
 22 |     def __init__(self, match):
 23 |         Extractor.__init__(self, match)
 24 |         self.item_url = self.root + match.group(1)
 25 | 
 26 |     def request(self, url, **kwargs):
 27 |         kwargs["retries"] = float("inf")
 28 |         return Extractor.request(self, url, **kwargs)
 29 | 
 30 |     def items(self):
 31 |         data = self.metadata()
 32 |         yield Message.Version, 1
 33 |         yield Message.Directory, data
 34 | 
 35 |         headers = {"X-Requested-With": "XMLHttpRequest"}
 36 |         for url in self.products():
 37 |             response = self.request(
 38 |                 url + ".json", headers=headers, fatal=False)
 39 |             if response.status_code >= 400:
 40 |                 self.log.warning('Skipping %s ("%s: %s")',
 41 |                                  url, response.status_code, response.reason)
 42 |                 continue
 43 |             product = response.json()["product"]
 44 |             del product["image"]
 45 | 
 46 |             for num, image in enumerate(product.pop("images"), 1):
 47 |                 text.nameext_from_url(image["src"], image)
 48 |                 image.update(data)
 49 |                 image["product"] = product
 50 |                 image["num"] = num
 51 |                 yield Message.Url, image["src"], image
 52 | 
 53 |     def metadata(self):
 54 |         """Return general metadata"""
 55 |         return {}
 56 | 
 57 |     def products(self):
 58 |         """Return an iterable with all relevant product URLs"""
 59 | 
 60 | 
 61 | class ShopifyCollectionExtractor(ShopifyExtractor):
 62 |     """Base class for collection extractors for Shopify based sites"""
 63 |     subcategory = "collection"
 64 |     directory_fmt = ("{category}", "{collection[title]}")
 65 |     pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)"
 66 | 
 67 |     def __init__(self, match):
 68 |         ShopifyExtractor.__init__(self, match)
 69 |         self.params = match.group(2)
 70 | 
 71 |     def metadata(self):
 72 |         return self.request(self.item_url + ".json").json()
 73 | 
 74 |     def products(self):
 75 |         params = text.parse_query(self.params)
 76 |         params["page"] = text.parse_int(params.get("page"), 1)
 77 |         search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+")
 78 | 
 79 |         while True:
 80 |             page = self.request(self.item_url, params=params).text
 81 |             urls = search_re.findall(page)
 82 |             last = None
 83 | 
 84 |             if not urls:
 85 |                 return
 86 |             for path in urls:
 87 |                 if last == path:
 88 |                     continue
 89 |                 last = path
 90 |                 yield self.root + path
 91 |             params["page"] += 1
 92 | 
 93 | 
 94 | class ShopifyProductExtractor(ShopifyExtractor):
 95 |     """Base class for product extractors for Shopify based sites"""
 96 |     subcategory = "product"
 97 |     directory_fmt = ("{category}", "Products")
 98 |     pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)"
 99 | 
100 |     def products(self):
101 |         return (self.item_url,)
102 | 
103 | 
# Per-site settings, keyed by site name: root URL, domain pattern, and the
# test cases for the product ('test-product') and collection
# ('test-collection') extractors.
EXTRACTORS = {
    "fashionnova": {
        "root": "https://www.fashionnova.com",
        "pattern": r"(?:www\.)?fashionnova\.com",
        "test-product": (
            ("https://www.fashionnova.com/products/essential-slide-red", {
                "pattern": r"https?://cdn\d*\.shopify.com/",
                "count": 3,
            }),
            ("https://www.fashionnova.com/collections/flats/products/name"),
        ),
        "test-collection": (
            ("https://www.fashionnova.com/collections/mini-dresses", {
                "range": "1-20",
                "count": 20,
            }),
            ("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
            ("https://www.fashionnova.com/collections/mini-dresses#1"),
        ),

    },
}

# generate_extractors() receives the table above, this module's globals()
# and the two base classes
generate_extractors(EXTRACTORS, globals(), (
    ShopifyProductExtractor,
    ShopifyCollectionExtractor,
))
131 | 


--------------------------------------------------------------------------------