├── Procfile ├── lncrawl ├── VERSION ├── assets │ ├── __init__.py │ ├── fonts │ │ ├── Horta.ttf │ │ ├── Sofia.otf │ │ ├── Bellota.otf │ │ ├── Caladea.ttf │ │ ├── Crimson.otf │ │ ├── Gidole.ttf │ │ ├── Orkney.ttf │ │ ├── Unique.ttf │ │ ├── Bradley Gratis.ttf │ │ ├── Liberation Serif.ttf │ │ ├── Libre Baskerville.ttf │ │ └── Glacial Indifference.otf │ ├── version.py │ ├── colors.txt │ ├── icons.py │ ├── templates │ │ ├── Simple.svg │ │ ├── Simple Dark.svg │ │ ├── Blocks.svg │ │ ├── Column.svg │ │ ├── Window.svg │ │ ├── Cross.svg │ │ ├── Tiles.svg │ │ ├── Gradient.svg │ │ └── Rings.svg │ ├── html_style.py │ └── html_style.css ├── utils │ ├── __init__.py │ ├── update_checker.py │ ├── uploader.py │ └── kindlegen_download.py ├── bots │ ├── discord │ │ ├── __init__.py │ │ └── config.py │ ├── __init__.py │ ├── console │ │ ├── __init__.py │ │ ├── login_info.py │ │ └── get_crawler.py │ ├── test │ │ ├── test_crawler.py │ │ └── post_github.py │ └── _sample.py ├── __init__.py ├── binders │ ├── text.py │ ├── __init__.py │ ├── web.py │ └── calibre.py ├── sources │ ├── anythingnovel.py │ ├── chinesefantasy.py │ ├── asianhobbyist.py │ ├── webnovelonlinecom.py │ ├── listnovel.py │ ├── novelringan.py │ ├── ranobelibme.py │ ├── webnovelonline.py │ ├── flyinglines.py │ ├── wuxialeague.py │ ├── fullnovellive.py │ ├── liberspark.py │ ├── aixdzs.py │ ├── tapread.py │ ├── tomotrans.py │ ├── wattpad.py │ ├── jpmtl.py │ ├── tiknovel.py │ ├── qidiancom.py │ ├── 9kqw.py │ ├── novelspread.py │ ├── novelv.py │ ├── machinetrans.py │ ├── readln.py │ ├── idqidian.py │ ├── yukinovel.py │ ├── fourscanlation.py │ ├── novelgo.py │ ├── gravitytales.py │ ├── machinetransorg.py │ ├── mangatoon.py │ ├── rewayatclub.py │ ├── shinsori.py │ ├── wuxiaonline.py │ ├── crescentmoon.py │ ├── meionovel.py │ ├── kissnovel.py │ ├── bestlightnovel.py │ ├── novelonlinefull.py │ ├── boxnovel.py │ ├── webnovelindonesia.py │ ├── translateindo.py │ ├── zenithnovels.py │ ├── litnet.py │ ├── __init__.py │ ├── royalroad.py │ └── wuxiasite.py └── core │ ├── __init__.py │ ├── novel_info.py │ └── novel_search.py ├── runtime.txt ├── MANIFEST.in ├── dev-requirements.txt ├── res ├── lncrawl.ico ├── lncrawl-icon.png └── lncrawl-web.png ├── __main__.py ├── .github ├── ISSUE_TEMPLATE │ ├── general.md │ ├── new-source.md │ ├── bug_report.md │ └── remove-source.md └── workflows │ └── pythonpackage.yml ├── package.json ├── scripts ├── publish.sh ├── publish.bat ├── build.sh └── build.bat ├── .gitignore ├── requirements.txt ├── .appveyor.yml ├── .env.example ├── .travis.yml ├── setup.py ├── app.json ├── setup.cfg ├── README.pip └── setup_pyi.py /Procfile: -------------------------------------------------------------------------------- 1 | bot: python . 
2 | -------------------------------------------------------------------------------- /lncrawl/VERSION: -------------------------------------------------------------------------------- 1 | 2.22.1 2 | -------------------------------------------------------------------------------- /lncrawl/assets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lncrawl/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.6.9 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include lncrawl/VERSION 2 | recursive-include lncrawl *.* 3 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | Js2Py 2 | PyInstaller 3 | cairosvg 4 | setuptools 5 | wheel 6 | -------------------------------------------------------------------------------- /res/lncrawl.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl.ico -------------------------------------------------------------------------------- /lncrawl/bots/discord/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from .discord_bot import DiscordBot 3 | -------------------------------------------------------------------------------- /res/lncrawl-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl-icon.png -------------------------------------------------------------------------------- /res/lncrawl-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl-web.png -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from lncrawl import main 4 | main() 5 | -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Horta.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Horta.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Sofia.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Sofia.otf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Bellota.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Bellota.otf 
-------------------------------------------------------------------------------- /lncrawl/assets/fonts/Caladea.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Caladea.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Crimson.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Crimson.otf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Gidole.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Gidole.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Orkney.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Orkney.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Unique.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Unique.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Bradley Gratis.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Bradley Gratis.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Liberation Serif.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Liberation Serif.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Libre Baskerville.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Libre Baskerville.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Glacial Indifference.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Glacial Indifference.otf -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General 3 | about: If you want to create a general issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new-source.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New source 3 | about: Suggest a new source to add 4 | title: Enter your desired sources here 5 | labels: source 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- 
/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lightnovel-crawler", 3 | "description": "Downloads lightnovels from various online sources and generates ebooks in many formats.", 4 | "version": "2.16.2", 5 | "engines": { 6 | "node": "12.x" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /lncrawl/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | try: 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | except Exception: 7 | pass 8 | # end try 9 | 10 | 11 | def main(): 12 | from .core import start_app 13 | start_app() 14 | # end def 15 | -------------------------------------------------------------------------------- /lncrawl/assets/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pathlib import Path 3 | 4 | ROOT = Path(__file__).parent.parent 5 | 6 | with open(str(ROOT / 'VERSION'), 'r') as f: 7 | version = f.read().strip() 8 | # end with 9 | 10 | 11 | def get_value(): 12 | return version 13 | # end def 14 | -------------------------------------------------------------------------------- /scripts/publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=$(head -n 1 lncrawl/VERSION) 4 | 5 | PY="python3" 6 | PIP="$PY -m pip --disable-pip-version-check" 7 | 8 | # . scripts/build.sh 9 | 10 | $PIP install twine 11 | $PY -m twine upload "dist/lightnovel_crawler-$VERSION-py3-none-any.whl" 12 | 13 | # FINISHED 14 | -------------------------------------------------------------------------------- /scripts/publish.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | SET /P VERSION==1.2.40 16 | lxml==4.5.1 17 | 18 | # Bot requirements 19 | discord.py==1.3.3 20 | python-telegram-bot==11.1.0 21 | PyDrive==1.3.1 22 | -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=$(head -n 1 lncrawl/VERSION) 4 | 5 | PY="python3" 6 | PIP="$PY -m pip --disable-pip-version-check" 7 | 8 | rm -rf venv build dist *.egg-info 9 | 10 | $PY -m venv venv 11 | . 
venv/bin/activate 12 | 13 | $PIP install -U pip==20.0.2 14 | $PIP install -r requirements.txt 15 | $PIP install -r dev-requirements.txt 16 | 17 | $PY setup.py clean bdist_wheel sdist package 18 | 19 | deactivate 20 | rm -rf venv build *.egg-info 21 | 22 | # FINISHED 23 | -------------------------------------------------------------------------------- /lncrawl/assets/colors.txt: -------------------------------------------------------------------------------- 1 | #d3dcf2 #829fe4 #6692c3 #4878a4 #00305a 2 | #e8d9ac #c7b07b #ffe28c #d8ab22 #382d1a 3 | #d8edb5 #abc8a4 #b1d17b #90a868 #183128 4 | #e6f1f5 #aab3b6 #a1bac4 #6a7275 #3b3e40 5 | #eaa8d3 #996185 #c964a6 #d897c1 #49223b 6 | #d3c0b8 #917569 #bc8b74 #72391e #332923 7 | #fffcfc #892323 #c42121 #2d2727 #020000 8 | #fcb0b0 #d67e7e #f7a0a0 #773535 #0a0505 9 | #2ab7ca #fed766 #cfffb3 #fe4a49 #330c2f 10 | #fde8e9 #e3bac6 #bc9ec1 #596475 #1f2232 11 | #ffffff #f9e316 #faa916 #96031a #000000 12 | #452103 #690500 #210f04 #934b00 #bb6b00 -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYTHON: C:\Python38-x64 4 | - PYTHON: C:\Python37-x64 5 | - PYTHON: C:\Python36-x64 6 | - PYTHON: C:\Python35-x64 7 | 8 | init: 9 | - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH% 10 | - SET PYTHONIOENCODING=utf-8 11 | 12 | install: 13 | - python -m pip install --no-cache-dir -r requirements.txt 14 | 15 | build: false 16 | 17 | test_script: 18 | - python __main__.py --bot test -lll 19 | 20 | branches: 21 | only: 22 | - master 23 | 24 | cache: 25 | - '%LOCALAPPDATA%\pip\Cache' 26 | -------------------------------------------------------------------------------- /scripts/build.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | SET /P VERSION=' 28 | # end def 29 | -------------------------------------------------------------------------------- /lncrawl/bots/console/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | 5 | class ConsoleBot: 6 | log = logging.getLogger('CONSOLE_BOT') 7 | 8 | from .start import start 9 | from .start import open_folder 10 | from .start import process_chapter_range 11 | 12 | from .get_crawler import get_novel_url 13 | from .get_crawler import get_crawlers_to_search 14 | from .get_crawler import choose_a_novel 15 | 16 | from .login_info import get_login_info 17 | 18 | from .output_style import get_output_path 19 | from .output_style import force_replace_old 20 | from .output_style import get_output_formats 21 | from .output_style import should_pack_by_volume 22 | 23 | from .range_selection import get_range_selection 24 | from .range_selection import get_range_using_urls 25 | from .range_selection import get_range_using_index 26 | from .range_selection import get_range_from_volumes 27 | from .range_selection import get_range_from_chapters 28 | # end class 29 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Simple.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 16 | 17 | 18 | {%- for author in authors %} {{ author }} {%- endfor %} 19 | 20 | 21 | -------------------------------------------------------------------------------- 
/lncrawl/assets/templates/Simple Dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 16 | 17 | 18 | {%- for author in authors %} {{ author }} {%- endfor %} 19 | 20 | 21 | -------------------------------------------------------------------------------- /lncrawl/binders/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | import re 5 | from bs4 import BeautifulSoup 6 | 7 | logger = logging.getLogger('TEXT_BINDER') 8 | 9 | 10 | def make_texts(app, data): 11 | text_files = [] 12 | for vol in data: 13 | dir_name = os.path.join(app.output_path, 'text', vol) 14 | os.makedirs(dir_name, exist_ok=True) 15 | for chap in data[vol]: 16 | file_name = '%s.txt' % str(chap['id']).rjust(5, '0') 17 | file_name = os.path.join(dir_name, file_name) 18 | with open(file_name, 'w', encoding='utf-8') as file: 19 | body = chap['body'].replace('

\n 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 17 | 18 | 19 | {%- for author in authors %} {{ author }} {%- endfor %} 20 | 21 | 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: linux 2 | dist: xenial 3 | language: python 4 | python: 5 | - "3.8" 6 | - "3.7" 7 | - "3.6" 8 | - "3.5" 9 | - nightly 10 | 11 | matrix: 12 | allow_failures: 13 | - python: nightly 14 | - os: osx 15 | fast_finish: true 16 | 17 | before_install: 18 | - | 19 | if [[ $TRAVIS_OS_NAME == 'osx' ]]; then 20 | brew upgrade python 21 | export PATH="/usr/local/opt/python/libexec/bin:${PATH}" 22 | fi 23 | install: 24 | - pip install -r requirements.txt 25 | - pip install flake8 26 | 27 | before_script: 28 | # stop the build if there are Python syntax errors or undefined names 29 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics 30 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 31 | - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 32 | 33 | script: 34 | - python __main__.py --bot test -lll 35 | 36 | cache: 37 | directories: 38 | - $HOME/.cache/pip 39 | - $HOME/.cache/pre-commit 40 | 41 | branches: 42 | only: 43 | - master 44 | 45 | notifications: 46 | email: false 47 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Column.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | {%- for line in title|wrap(10) %} {{ line }} {%- endfor %} 18 | 19 | 20 | {%- for author in authors %} {{ author }} {%- endfor %} 21 | 22 | 23 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | push: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: [3.5, 3.6, 3.7, 3.8] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt 29 | - name: Lint with flake8 30 | run: | 31 | pip install flake8 32 | # stop the build if there are Python syntax errors or undefined names 33 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 34 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 35 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 36 | # - name: Test with pytest 37 | # run: | 38 | # pip install pytest 39 | # pytest 40 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Window.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | {%- for line in title|wrap(16) %} {{ line }} {%- endfor %} 18 | 19 | 20 | {%- for author in authors %} {{ author }} {%- endfor %} 21 | 22 | 23 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Cross.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 18 | 19 | 20 | {%- for author in authors %} {{ author }} {%- endfor %} 21 | 22 | 23 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Tiles.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 19 | 20 | 21 | {%- for author in authors %} {{ author }} {%- endfor %} 22 | 23 | 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | if sys.version_info[:2] < (3, 5): 5 | raise RuntimeError( 6 | 'Lightnovel crawler only supports Python 3.5 and later.') 7 | else: 8 | run_pyi = 'package' in sys.argv 9 | if run_pyi: 10 | sys.argv.remove('package') 11 | # end if 12 | if len(sys.argv) == 1: 13 | sys.argv += ['build'] 14 | # end if 15 | 16 | # import required packages 17 | from pathlib import Path 18 | from setuptools import config, setup 19 | 20 | def parse_version(filename): 21 | with open(filename, 'r') as f: 22 | return f.read().strip() 23 | # end def 24 | 25 | def parse_requirements(filename): 26 | with open(filename, 'r', encoding='utf-8') as f: 27 | requirements = f.read().strip().split('\n') 28 | requirements = [ 29 | r.strip() for r in requirements 30 | if r.strip() and not r.startswith('#') 31 | ] 32 | return requirements 33 | # end def 34 | 35 | config.read_configuration('setup.cfg') 36 | 37 | setup( 38 | version=parse_version(Path('lncrawl') / 'VERSION'), 39 | install_requires=parse_requirements('requirements.txt'), 40 | ) 41 | 42 | if run_pyi: 43 | from setup_pyi import package 44 | package() 45 | # end if 46 | # end if 47 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Gradient.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 21 | 22 | 23 | {%- for author in authors %} {{ author }} {%- endfor %} 24 | 25 | 26 | -------------------------------------------------------------------------------- /lncrawl/bots/console/login_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from PyInquirer import prompt 3 | from ...core.arguments import get_args 4 | 5 | 6 | def get_login_info(self): 7 | '''Returns the (email, password) pair for login''' 8 | args = get_args() 9 | 10 | if args.login: 11 | return 
args.login 12 | # end if 13 | 14 | if args.suppress: 15 | return False 16 | # end if 17 | 18 | answer = prompt([ 19 | { 20 | 'type': 'confirm', 21 | 'name': 'login', 22 | 'message': 'Do you want to log in?', 23 | 'default': False 24 | }, 25 | ]) 26 | 27 | if answer['login']: 28 | answer = prompt([ 29 | { 30 | 'type': 'input', 31 | 'name': 'email', 32 | 'message': 'Username/Email:', 33 | 'validate': lambda val: True if len(val) 34 | else 'Email address should be not be empty' 35 | }, 36 | { 37 | 'type': 'password', 38 | 'name': 'password', 39 | 'message': 'Password:', 40 | 'validate': lambda val: True if len(val) 41 | else 'Password should be not be empty' 42 | }, 43 | ]) 44 | return answer['email'], answer['password'] 45 | # end if 46 | 47 | return None 48 | # end if 49 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Rings.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 25 | 26 | 27 | {%- for author in authors %} {{ author }} {%- endfor %} 28 | 29 | 30 | -------------------------------------------------------------------------------- /app.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lightnovel crawler", 3 | "description": "Downloads lightnovels from various online sources and generates ebooks in many formats.", 4 | "keywords": [ 5 | "discord", 6 | "bot", 7 | "telegram", 8 | "novel", 9 | "lightnovel", 10 | "crawler" 11 | ], 12 | "website": "https://github.com/dipu-bd/lightnovel-crawler", 13 | "logo": "https://github.com/dipu-bd/lightnovel-crawler/raw/master/res/lncrawl-icon.png", 14 | "env": { 15 | "LOG_LEVEL": { 16 | "description": "Available levels: NOTSET, WARN, INFO, DEBUG, FATAL, ERROR", 17 | "value": "INFO", 18 | "required": true 19 | }, 20 | "BOT": { 21 | "description": "available: console, discord, telegram", 22 | "value": "discord", 23 | "required": true 24 | }, 25 | "TELEGRAM_TOKEN": { 26 | "description": "Telegram token, only required if BOT is set to telegram", 27 | "required": false 28 | }, 29 | "DISCORD_TOKEN": { 30 | "description": "Discord token, only required if BOT is set to discord", 31 | "required": false 32 | }, 33 | "DISCORD_SIGNAL_CHAR": { 34 | "description": "Discord command prefix, only required if BOT is set to discord", 35 | "required": false, 36 | "value": "!" 
37 | } 38 | }, 39 | "buildpacks": [ 40 | { 41 | "url": "https://github.com/heroku/heroku-buildpack-nodejs" 42 | }, 43 | { 44 | "url": "https://github.com/heroku/heroku-buildpack-python" 45 | }, 46 | { 47 | "url": "https://github.com/nntin/heroku-buildpack-calibre" 48 | } 49 | ] 50 | } 51 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/remove-source.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Remove source 3 | about: If you are owner of a source added here and want to remove it 4 | title: 'Request to remove a site: ' 5 | labels: removal 6 | assignees: dipu-bd 7 | --- 8 | 9 | ## Please check all the fields that applies to you 10 | 11 | Transform `[ ]` to `[x]` to check (you can also check it after submitting the issue): 12 | 13 | - [ ] I am a translator/author 14 | - [ ] I only publish my original contents/translations 15 | - [ ] I have permission from the author to translate their contents 16 | - [ ] I do not copy contents from others or do not use machine translations 17 | - [ ] Some people are using this program to steal my translations 18 | - [ ] _I do not blame a blacksmith or the sword he made if it is used by a someone to kill people_ 19 | - [ ] _I do not blame the lockpicks if it is used by a someone to steal from my house_ 20 | - [ ] I do not blame the developer if the program he wrote is used by others to steal from my site 21 | 22 | ## Why do you translate/write novels and post them on your site? 23 | 24 | ``` 25 | 26 | ``` 27 | 28 | ## Explain why you do want to prevent people from scraping your site? 29 | 30 | ``` 31 | 32 | ``` 33 | 34 | ## Can you prove your identity as a site owner? 35 | 36 | ``` 37 | - Add a file named `lncrawl.txt` with content `Please remove this source` to your site. 38 | - Paste the link of the file here 39 | ``` 40 | 41 | ## Have some links or evidences that people are using this app to steal contents from you? 42 | 43 | ``` 44 | 45 | ``` 46 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = lightnovel-crawler 3 | url = https://github.com/dipu-bd/lightnovel-crawler 4 | license = Apache 2.0 5 | license_file = LICENSE 6 | description = An app to download novels from online sources and generate e-books. 
7 | long_description = file: README.pip 8 | long_description_content_type = text/markdown 9 | author = Sudipto Chandra 10 | author_email = dipu.sudipta@gmail.com 11 | platforms = 12 | Linux 13 | macOS 14 | Windows 15 | keywords = 16 | lightnovel 17 | crawler 18 | lncrawl 19 | ebook 20 | novel 21 | pdf 22 | epub 23 | mobi 24 | scraper 25 | web-scraper 26 | classifiers = 27 | Development Status :: 5 - Production/Stable 28 | Environment :: Console 29 | Natural Language :: English 30 | License :: OSI Approved :: Apache Software License 31 | Intended Audience :: Developers 32 | Intended Audience :: End Users/Desktop 33 | Programming Language :: Python :: 3 :: Only 34 | Programming Language :: Python :: 3.5 35 | Programming Language :: Python :: 3.6 36 | Programming Language :: Python :: 3.7 37 | Programming Language :: Python :: 3.8 38 | Topic :: Games/Entertainment 39 | Topic :: Internet :: WWW/HTTP 40 | Topic :: Multimedia :: Graphics 41 | Topic :: Printing 42 | Topic :: Text Processing :: Markup :: HTML 43 | project_urls = 44 | Source Code = https://github.com/dipu-bd/lightnovel-crawler 45 | Issue tracker = https://github.com/dipu-bd/lightnovel-crawler/issues 46 | Documentation = https://github.com/dipu-bd/lightnovel-crawler/blob/master/README.md 47 | Say Thanks! = https://saythanks.io/to/dipu-bd 48 | 49 | [options] 50 | python_requires = >= 3.5 51 | include_package_data = True 52 | packages = lncrawl 53 | package_dir = 54 | lncrawl = lncrawl 55 | 56 | [options.entry_points] 57 | console_scripts = 58 | lncrawl = lncrawl:main 59 | lightnovel-crawler = lncrawl:main 60 | -------------------------------------------------------------------------------- /lncrawl/assets/html_style.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pathlib import Path 4 | 5 | ROOT = Path(__file__).parent 6 | 7 | with open(str(ROOT / 'html_style.css'), 'r') as f: 8 | style = f.read() 9 | # end with 10 | 11 | 12 | def get_value(): 13 | return _minify(style) 14 | # end def 15 | 16 | 17 | def _minify(css): 18 | result = '' 19 | 20 | # remove comments - this will break IE<6 comment hacks 21 | css = re.sub(r'/\*[\s\S]*?\*/', "", css) 22 | 23 | # url() doesn't need quotes 24 | #css = re.sub(r'url\((["\'])([^)]*)\1\)', r'url(\2)', css) 25 | 26 | # spaces may be safely collapsed as generated content will collapse them anyway 27 | css = re.sub(r'\s+', ' ', css) 28 | 29 | # shorten collapsable colors: #aabbcc to #abc 30 | css = re.sub( 31 | r'#([0-9a-f])\1([0-9a-f])\2([0-9a-f])\3(\s|;)', r'#\1\2\3\4', css) 32 | 33 | # fragment values can loose zeros 34 | css = re.sub(r':\s*0(\.\d+([cm]m|e[mx]|in|p[ctx]))\s*;', r':\1;', css) 35 | 36 | for rule in re.findall(r'([^{]+){([^}]*)}', css): 37 | # we don't need spaces around operators 38 | selectors = [re.sub(r'(?<=[\[\(>+=])\s+|\s+(?=[=~^$*|>+\]\)])', 39 | r'', selector.strip()) for selector in rule[0].split(',')] 40 | # order is important, but we still want to discard repetitions 41 | properties = {} 42 | porder = [] 43 | for prop in re.findall(r'(.*?):(.*?)(;|$)', rule[1]): 44 | key = prop[0].strip().lower() 45 | if key not in porder: 46 | porder.append(key) 47 | properties[key] = prop[1].strip() 48 | # output rule if it contains any declarations 49 | if properties: 50 | result += "%s{%s}" % (','.join(selectors), ''.join( 51 | ['%s:%s;' % (key, properties[key]) for key in porder])[:-1]) 52 | 53 | return result 54 | # end def 55 | 
-------------------------------------------------------------------------------- /lncrawl/utils/uploader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Uploader for google drive""" 3 | import os 4 | import logging 5 | 6 | logger = logging.getLogger('UPLOADER') 7 | 8 | 9 | try: 10 | from pydrive.auth import GoogleAuth 11 | from pydrive.drive import GoogleDrive 12 | except Exception: 13 | logger.error('`pydrive` was not setup properly') 14 | # end try 15 | 16 | 17 | def upload(file_path, description=None): 18 | try: 19 | gauth = GoogleAuth() 20 | # gauth.LocalWebserverAuth() 21 | 22 | # Try to load saved client credentials 23 | credential_file = os.getenv('GOOGLE_DRIVE_CREDENTIAL_FILE') 24 | gauth.LoadCredentialsFile(credential_file) 25 | if gauth.credentials is None: 26 | # Authenticate if they're not there 27 | gauth.LocalWebserverAuth() 28 | elif gauth.access_token_expired: 29 | # Refresh them if expired 30 | gauth.Refresh() 31 | else: 32 | # Initialize the saved creds 33 | gauth.Authorize() 34 | # end if 35 | 36 | # Save the current credentials to a file 37 | gauth.SaveCredentialsFile(credential_file) 38 | 39 | drive = GoogleDrive(gauth) 40 | folder_id = os.getenv('GOOGLE_DRIVE_FOLDER_ID') 41 | filename_w_ext = os.path.basename(file_path) 42 | filename, file_extension = os.path.splitext(filename_w_ext) 43 | 44 | # Upload file to folder 45 | f = drive.CreateFile( 46 | {"parents": [{"kind": "drive#fileLink", "id": folder_id}]}) 47 | f['title'] = filename_w_ext 48 | 49 | # Make sure to add the path to the file to upload below. 50 | f.SetContentFile(file_path) 51 | f.Upload() 52 | 53 | logger.info(f['id']) 54 | return f['id'] 55 | except Exception: 56 | logger.exception('Failed to upload %s', file_path) 57 | # end try 58 | return None 59 | # end def 60 | -------------------------------------------------------------------------------- /lncrawl/binders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To bind into ebooks 4 | """ 5 | import logging 6 | 7 | from .epub import make_epubs 8 | from .web import make_webs 9 | from .text import make_texts 10 | from .calibre import make_calibres 11 | 12 | logger = logging.Logger('BINDERS') 13 | 14 | depends_on_none = [ 15 | 'epub', 16 | 'text', 17 | 'web', 18 | ] 19 | depends_on_epub = [ 20 | 'docx', 21 | 'mobi', 22 | 'pdf', 23 | 'rtf', 24 | 'txt', 25 | 'azw3', 26 | 'fb2', 27 | 'lit', 28 | 'lrf', 29 | 'oeb', 30 | 'pdb', 31 | 'rb', 32 | 'snb', 33 | 'tcr', 34 | # 'pml', 35 | # 'html', 36 | ] 37 | available_formats = depends_on_none + depends_on_epub 38 | 39 | 40 | def generate_books(app, data): 41 | out_formats = app.output_formats 42 | if not out_formats: 43 | out_formats = {} 44 | # end if 45 | out_formats = {x: out_formats.get(x, False) for x in available_formats} 46 | 47 | # Resolve formats to output maintaining dependencies 48 | after_epub = [x for x in depends_on_epub if out_formats[x]] 49 | need_epub = 'epub' if len(after_epub) else None 50 | after_any = [x for x in depends_on_none if out_formats[x] or x == need_epub] 51 | 52 | # Generate output files 53 | outputs = dict() 54 | for fmt in (after_any + after_epub): 55 | try: 56 | if fmt == 'text': 57 | outputs[fmt] = make_texts(app, data) 58 | elif fmt == 'web': 59 | outputs[fmt] = make_webs(app, data) 60 | elif fmt == 'epub': 61 | outputs[fmt] = make_epubs(app, data) 62 | else: 63 | outputs[fmt] = make_calibres(app, outputs['epub'], fmt) 64 | # 
end if 65 | except Exception as err: 66 | logger.exception('Failed to generate "%s": %s' % (fmt, err)) 67 | # end try 68 | # end for 69 | 70 | return outputs 71 | # end def 72 | -------------------------------------------------------------------------------- /lncrawl/bots/discord/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import logging.config 4 | from colorama import Fore 5 | from ...core.arguments import get_args 6 | 7 | # The special signal character for crawler commands 8 | signal = os.getenv('DISCORD_SIGNAL_CHAR') or '!' 9 | max_workers = os.getenv('DISCORD_MAX_WORKERS', 10) 10 | 11 | # The public ip and path of the server to put files in 12 | public_ip = os.getenv('PUBLIC_ADDRESS', None) 13 | public_path = os.getenv('PUBLIC_DATA_PATH', None) 14 | 15 | os.makedirs('logs', exist_ok=True) 16 | logging.config.dictConfig({ 17 | # 18 | # Configure logging 19 | # Docs: https://docs.python.org/3.5/library/logging.config.html#configuration-dictionary-schema 20 | # Example: https://stackoverflow.com/a/7507842/1583052 21 | # 22 | 'version': 1, 23 | 'disable_existing_loggers': True, 24 | 'formatters': { 25 | 'console': { 26 | 'format': Fore.CYAN+'%(asctime)s'+Fore.RESET + ' ' + Fore.GREEN + '%(levelname)-8s'+Fore.RESET+' %(message)s', 27 | 'datefmt': '%H:%M:%S', 28 | }, 29 | 'file': { 30 | 'format': '%(asctime)s [%(process)d] %(levelname)s\n%(name)s: %(message)s\n', 31 | 'datefmt': '%Y-%m-%d %H:%M:%S', 32 | }, 33 | }, 34 | 'handlers': { 35 | 'console': { 36 | 'formatter': 'console', 37 | 'class': 'logging.StreamHandler', 38 | 'stream': 'ext://sys.stdout', # default is stderr 39 | }, 40 | 'file': { 41 | 'formatter': 'file', 42 | 'class': 'logging.handlers.RotatingFileHandler', 43 | 'filename': 'logs/discord-bot_%s.log' % (get_args().shard_id), 44 | 'maxBytes': 10 * 1024 * 1024, # 10 MB 45 | 'backupCount': 5, 46 | 'encoding': 'utf-8', 47 | }, 48 | }, 49 | 'loggers': { 50 | '': { # root logger 51 | 'handlers': ['console', 'file'], 52 | 'level': logging.INFO, 53 | }, 54 | }, 55 | }) 56 | -------------------------------------------------------------------------------- /lncrawl/sources/anythingnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from concurrent import futures 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('ANYTHING_NOVEL') 8 | 9 | 10 | class AnythingNovelCrawler(Crawler): 11 | base_url = 'https://anythingnovel.com/' 12 | 13 | def read_novel_info(self): 14 | logger.debug('Visiting %s', self.novel_url) 15 | soup = self.get_soup(self.novel_url) 16 | 17 | self.novel_title = soup.select( 18 | '#wrap .breadcrumbs span')[-1].text.strip() 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = soup.select_one('#content a img')['src'] 22 | logger.info('Novel cover: %s', self.novel_cover) 23 | 24 | volumes = set([]) 25 | for a in reversed(soup.select('#content div li a')): 26 | title = a.text.strip() 27 | chapter_id = len(self.chapters) + 1 28 | volume_id = 1 + (chapter_id - 1) // 100 29 | volumes.add(volume_id) 30 | self.chapters.append({ 31 | 'id': chapter_id, 32 | 'volume': volume_id, 33 | 'title': title, 34 | 'url': a['href'], 35 | }) 36 | # end for 37 | 38 | self.chapters.sort(key=lambda x: x['id']) 39 | self.volumes = [{'id': x, 'title': ''} for x in volumes] 40 | # end def 41 | 42 | def download_chapter_body(self, chapter): 43 | 
logger.info('Downloading %s', chapter['url']) 44 | soup = self.get_soup(chapter['url']) 45 | content = soup.select_one('div#content') 46 | self.clean_contents(content) 47 | body = content.select('p') 48 | body = [str(p) for p in body if self.should_take(p)] 49 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 50 | # end def 51 | 52 | def should_take(self, p): 53 | txt = p.text.strip().lower() 54 | return txt and txt != 'advertisement' 55 | # end def 56 | # end class 57 | -------------------------------------------------------------------------------- /lncrawl/sources/chinesefantasy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from ..utils.crawler import Crawler 3 | import requests 4 | import re 5 | import logging 6 | import json 7 | 8 | logger = logging.getLogger('CHINESE_FANTASY_NOVELS') 9 | 10 | 11 | class ChineseFantasyNovels(Crawler): 12 | base_url = 'https://m.chinesefantasynovels.com/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | if not self.novel_url.endswith('/'): 17 | self.novel_url += '/' 18 | # end if 19 | logger.debug('Visiting %s', self.novel_url) 20 | soup = self.get_soup(self.novel_url) 21 | 22 | self.novel_title = soup.select_one('.btitle h1').text 23 | logger.info('Novel title: %s', self.novel_title) 24 | 25 | self.novel_author = soup.select_one('.bookinfo .status').text 26 | logger.info('%s', self.novel_author) 27 | 28 | volumes = set([]) 29 | for a in reversed(soup.select('dl.chapterlist a')): 30 | ch_title = a.text.strip() 31 | ch_id = [int(x) for x in re.findall(r'\d+', ch_title)] 32 | ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 33 | vol_id = 1 + len(self.chapters) // 100 34 | volumes.add(vol_id) 35 | self.chapters.append({ 36 | 'id': ch_id, 37 | 'volume': vol_id, 38 | 'title': ch_title, 39 | 'url': self.absolute_url(a['href']), 40 | }) 41 | # end def 42 | 43 | self.volumes = [{'id': x, 'title': ''} for x in volumes] 44 | # end def 45 | 46 | def download_chapter_body(self, chapter): 47 | '''Download body of a single chapter and return as clean html format.''' 48 | logger.info('Downloading %s', chapter['url']) 49 | soup = self.get_soup(chapter['url']) 50 | content = soup.select_one('#BookText') 51 | content.select_one('.link').decompose() 52 | body = self.extract_contents(content) 53 | return '
<p>' + '</p><p>
'.join(body) + ' 1 59 | ]) 60 | # end def 61 | # end class 62 | -------------------------------------------------------------------------------- /lncrawl/sources/webnovelonlinecom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import json 4 | import logging 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WEBNOVELONLINE_DOT_COM') 8 | 9 | 10 | class WebnovelOnlineDotComCrawler(Crawler): 11 | base_url = 'https://webnovelonline.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | url = self.novel_url 16 | logger.debug('Visiting %s', url) 17 | soup = self.get_soup(url) 18 | 19 | self.novel_title = soup.select_one('.novel-info .novel-desc h1').text 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = soup.select_one('meta[property="og:image"]')['content'] 23 | logger.info('Novel cover: %s', self.novel_title) 24 | 25 | volumes = set([]) 26 | for a in reversed(soup.select('.chapter-list .item a')): 27 | chap_id = len(self.chapters) + 1 28 | vol_id = 1 + len(self.chapters) // 100 29 | volumes.add(vol_id) 30 | self.chapters.append({ 31 | 'id': chap_id, 32 | 'volume': vol_id, 33 | 'title': a.text.strip(), 34 | 'url': self.absolute_url(a['href']), 35 | }) 36 | # end for 37 | 38 | self.volumes = [{'id': x, 'title': ''} for x in volumes] 39 | # end def 40 | 41 | def download_chapter_body(self, chapter): 42 | '''Download body of a single chapter and return as clean html format.''' 43 | logger.info('Visiting %s', chapter['url']) 44 | soup = self.get_soup(chapter['url']) 45 | 46 | for script in soup.select('script'): 47 | text = script.string 48 | if not text or not text.startswith('window._INITIAL_DATA_'): 49 | continue 50 | # end if 51 | content = re.findall(r',"chapter":(".+")},', text)[0] 52 | content = json.loads(content).strip() 53 | return '
<p>' + '</p><p>'.join(content.split('\n\n')) + '</p>
' 54 | # end for 55 | 56 | return '' 57 | # end def 58 | # end class 59 | -------------------------------------------------------------------------------- /lncrawl/sources/listnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from urllib.parse import urlparse 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('LIST_NOVEL') 9 | 10 | 11 | class ListNovelCrawler(Crawler): 12 | base_url = 'https://listnovel.com/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | possible_title = soup.select_one('.post-title h1') 20 | for span in possible_title.select('span'): 21 | span.extract() 22 | # end for 23 | self.novel_title = possible_title.text.strip() 24 | logger.info('Novel title: %s', self.novel_title) 25 | 26 | self.novel_cover = self.absolute_url(soup.select_one('.summary_image a img')['data-src']) 27 | logger.info('Novel cover: %s', self.novel_cover) 28 | 29 | self.novel_author = ' '.join([ 30 | a.text.strip() 31 | for a in soup.select('.author-content a[href*="manga-author"]') 32 | ]) 33 | logger.info('%s', self.novel_author) 34 | 35 | for a in reversed(soup.select('.main-col li.wp-manga-chapter a')): 36 | chap_id = len(self.chapters) + 1 37 | vol_id = 1 + len(self.chapters) // 100 38 | if chap_id % 100 == 1: 39 | self.volumes.append({'id': vol_id}) 40 | # end if 41 | self.chapters.append({ 42 | 'id': chap_id, 43 | 'volume': vol_id, 44 | 'title': a.text.strip(), 45 | 'url': self.absolute_url(a['href']), 46 | }) 47 | # end for 48 | # end def 49 | 50 | def download_chapter_body(self, chapter): 51 | '''Download body of a single chapter and return as clean html format.''' 52 | logger.info('Visiting %s', chapter['url']) 53 | soup = self.get_soup(chapter['url']) 54 | contents = soup.select('.reading-content p') 55 | return ''.join([str(p) for p in contents]) 56 | # end def 57 | # end class 58 | -------------------------------------------------------------------------------- /lncrawl/sources/novelringan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVELRINGAN') 8 | 9 | 10 | class NovelRinganCrawler(Crawler): 11 | base_url = 'https://novelringan.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select_one('h1.entry-title').text 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = self.absolute_url( 22 | soup.select_one('div.imgprop img')['src']) 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | self.novel_author = 'Translated by novelringan.com' 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | for a in reversed(soup.select('.bxcl ul li a')): 29 | chap_id = len(self.chapters) + 1 30 | if len(self.chapters) % 100 == 0: 31 | vol_id = chap_id//100 + 1 32 | vol_title = 'Volume ' + str(vol_id) 33 | self.volumes.append({ 34 | 'id': vol_id, 35 | 'title': vol_title, 36 | }) 37 | # end if 38 | self.chapters.append({ 39 | 'id': chap_id, 40 | 'volume': vol_id, 41 | 'url': self.absolute_url(a['href']), 42 | 'title': a.text.strip() 
or ('Chapter %d' % chap_id), 43 | }) 44 | # end for 45 | # end def 46 | 47 | def download_chapter_body(self, chapter): 48 | '''Download body of a single chapter and return as clean html format.''' 49 | logger.info('Downloading %s', chapter['url']) 50 | soup = self.get_soup(chapter['url']) 51 | 52 | soup.select_one('#bacotan').extract() 53 | contents = soup.select('.entry-content p') 54 | 55 | body = [str(p) for p in contents if p.text.strip()] 56 | 57 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 58 | 59 | # end def 60 | # end class 61 | -------------------------------------------------------------------------------- /lncrawl/sources/ranobelibme.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | import re 5 | 6 | logger = logging.getLogger("RANOBE_LIB_ME") 7 | 8 | 9 | class RanobeLibCrawler(Crawler): 10 | base_url = 'https://ranobelib.me/' 11 | 12 | def read_novel_info(self): 13 | logger.info('Visiting %s', self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.select_one('.manga-title h1').text 17 | logger.info('Novel title: %s', self.novel_title) 18 | 19 | self.novel_cover = self.absolute_url( 20 | soup.select_one('.manga__image img')['src']) 21 | logger.info('Novel cover: %s', self.novel_cover) 22 | 23 | novel_link = soup.select_one("a[href*=author]") 24 | if novel_link: 25 | self.novel_author = novel_link.text.strip().title() 26 | # end if 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | chapters = soup.select('.chapter-item') 30 | chapters.reverse() 31 | 32 | volumes = set() 33 | for a in chapters: 34 | chap_id = len(self.chapters) + 1 35 | 36 | vol_id = int(a['data-volume']) 37 | volumes.add(vol_id) 38 | 39 | link = a.select_one('a') 40 | chapter_title = re.sub(r'\s+', ' ', link.text).strip() 41 | if not chapter_title: 42 | chapter_title = 'Том %d. Глава %d' % (int(vol_id), int(a['data-number'])) 43 | # end if 44 | 45 | self.chapters.append({ 46 | 'id': chap_id, 47 | 'volume': vol_id, 48 | 'url': self.absolute_url(link['href']), 49 | 'title': chapter_title, 50 | }) 51 | # end for 52 | 53 | self.volumes = [{'id': x} for x in volumes] 54 | # end def 55 | 56 | def download_chapter_body(self, chapter): 57 | logger.info('Downloading %s', chapter['url']) 58 | soup = self.get_soup(chapter['url']) 59 | 60 | div = soup.select_one('.reader-container') 61 | 62 | body = self.extract_contents(div) 63 | 64 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 65 | # end def 66 | # end class 67 | -------------------------------------------------------------------------------- /lncrawl/sources/webnovelonline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('WEBNOVEL_ONLINE') 7 | 8 | 9 | class WebnovelOnlineCrawler(Crawler): 10 | base_url = 'https://webnovel.online/' 11 | 12 | def read_novel_info(self): 13 | '''Get novel title, autor, cover etc''' 14 | url = self.novel_url 15 | logger.debug('Visiting %s', url) 16 | soup = self.get_soup(url) 17 | 18 | img = soup.select_one('main img.cover') 19 | self.novel_title = img['title'].strip() 20 | self.novel_cover = self.absolute_url(img['src']) 21 | 22 | span = soup.select_one('header span.send-author-event') 23 | if span: 24 | self.novel_author = span.text.strip() 25 | # end if 26 | 27 | chap_id = 0 28 | for a in soup.select('#info a.on-navigate-part'): 29 | vol_id = chap_id // 100 + 1 30 | if vol_id > len(self.volumes): 31 | self.volumes.append({ 32 | 'id': vol_id, 33 | 'title': 'Volume %d' % vol_id 34 | }) 35 | # end if 36 | 37 | chap_id += 1 38 | self.chapters.append({ 39 | 'id': chap_id, 40 | 'volume': vol_id, 41 | 'title': a.text.strip(), 42 | 'url': self.absolute_url(a['href']), 43 | }) 44 | # end for 45 | # end def 46 | 47 | def download_chapter_body(self, chapter): 48 | '''Download body of a single chapter and return as clean html format.''' 49 | logger.info('Visiting %s', chapter['url']) 50 | soup = self.get_soup(chapter['url']) 51 | 52 | strong = soup.select_one('#story-content strong') 53 | if strong and re.search(r'Chapter \d+', strong.text): 54 | chapter['title'] = strong.text.strip() 55 | logger.info('Updated title: %s', chapter['title']) 56 | # end if 57 | 58 | self.bad_tags += ['h1', 'h3', 'hr'] 59 | contents = soup.select_one('#story-content') 60 | body = self.extract_contents(contents) 61 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 62 | # end def 63 | # end class 64 | -------------------------------------------------------------------------------- /lncrawl/sources/flyinglines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from urllib.parse import urlparse 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('FLYING LINES') 9 | 10 | chapter_body_url = 'https://www.flying-lines.com/h5/novel/%s/%s?accessToken=&isFirstEnter=1&webdriver=0' 11 | 12 | 13 | class FlyingLinesCrawler(Crawler): 14 | base_url = 'https://www.flying-lines.com/' 15 | 16 | def read_novel_info(self): 17 | '''Get novel title, autor, cover etc''' 18 | logger.debug('Visiting %s', self.novel_url) 19 | soup = self.get_soup(self.novel_url) 20 | 21 | self.novel_title = soup.select_one('.novel-info .title h2').text 22 | logger.info('Novel title: %s', self.novel_title) 23 | 24 | self.novel_cover = self.absolute_url( 25 | soup.select_one('.novel .novel-thumb img')['data-src']) 26 | logger.info('Novel cover: %s', self.novel_cover) 27 | 28 | authors = [x.text.strip() 29 | for x in soup.select('.novel-info ul.profile li')] 30 | self.novel_author = ', '.join(authors) 31 | logger.info('%s', self.novel_author) 32 | 33 | self.novel_id = urlparse(self.novel_url).path.split('/')[2] 34 | logger.info("Novel id: %s", self.novel_id) 35 | 36 | for a in soup.select('ul.volume-chapters li a'): 37 | chap_id = int(a['data-chapter-number']) 38 | vol_id = 1 + (chap_id - 1) // 100 39 | if len(self.chapters) % 100 == 0: 40 | self.volumes.append({'id': vol_id}) 41 | # end if 42 | self.chapters.append({ 43 | 'id': chap_id, 44 | 'volume': vol_id, 45 | 'title': a.text.strip(), 46 | 'url': self.absolute_url(a['href']), 47 | }) 48 | # end for 49 | # end def 50 | 51 | def download_chapter_body(self, chapter): 52 | '''Download body of a single chapter and return as clean html format.''' 53 | url = chapter_body_url % (self.novel_id, chapter['id']) 54 | logger.info('Downloading %s', url) 55 | response = self.submit_form(url) 56 | data = response.json() 57 | print(data) 58 | return data['data']['content'] 59 | # end def 60 | # end class 61 | -------------------------------------------------------------------------------- /lncrawl/sources/wuxialeague.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('WUXIA_LEAGUE') 7 | 8 | 9 | class WuxiaLeagueCrawler(Crawler): 10 | base_url = 'https://www.wuxialeague.com/' 11 | 12 | def read_novel_info(self): 13 | logger.debug('Visiting %s', self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.select_one('#bookinfo .d_title h1').text 17 | logger.info('Novel title: %s', self.novel_title) 18 | 19 | self.novel_cover = self.absolute_url(soup.select_one('#bookimg img')['src']) 20 | logger.info('Novel cover: %s', self.novel_cover) 21 | 22 | possible_authors = [a.text for a in soup.select('#bookinfo a[href*="/author/"]')] 23 | self.novel_author = ', '.join(possible_authors) 24 | logger.info('Novel author: %s', self.novel_author) 25 | 26 | for a in soup.select('#chapterList li a'): 27 | chap_id = 1 + len(self.chapters) 28 | vol_id = 1 + len(self.chapters) // 100 29 | if chap_id % 100 == 1: 30 | self.volumes.append({'id': vol_id}) 31 | # end if 32 | self.chapters.append({ 33 | 'id': chap_id, 34 | 'volume': vol_id, 35 | 
'title': a.text.strip(), 36 | 'url': self.absolute_url(a['href']), 37 | }) 38 | # end for 39 | # end def 40 | 41 | def download_chapter_body(self, chapter): 42 | '''Download body of a single chapter and return as clean html format''' 43 | logger.info('Downloading %s', chapter['url']) 44 | soup = self.get_soup(chapter['url']) 45 | 46 | body = '' 47 | title_found = False 48 | for p in soup.select('#TextContent > p'): 49 | if not p.text.strip(): 50 | continue 51 | # end if 52 | clean_first = ''.join(re.findall(r'([a-z0-9]+)', p.text.lower())) 53 | clean_title = ''.join(re.findall(r'([a-z0-9]+)', chapter['title'].lower())) 54 | if clean_first == clean_title: 55 | continue 56 | # end if 57 | body += str(p).strip() 58 | # end for 59 | 60 | return body 61 | # end def 62 | # end class 63 | -------------------------------------------------------------------------------- /lncrawl/core/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Interactive application to take user inputs 4 | """ 5 | import logging 6 | import os 7 | 8 | import colorama 9 | import requests 10 | import win_unicode_console 11 | from colorama import Fore 12 | 13 | from ..assets.version import get_value as get_version 14 | from ..bots import run_bot 15 | from ..utils.update_checker import check_updates 16 | from .arguments import get_args 17 | from .display import (cancel_method, debug_mode, description, epilog, 18 | error_message, input_suppression) 19 | 20 | logger = logging.Logger('CORE') 21 | 22 | 23 | def init(): 24 | os.environ['version'] = get_version() 25 | 26 | win_unicode_console.enable() 27 | colorama.init() 28 | description() 29 | 30 | args = get_args() 31 | 32 | levels = ['NOTSET', 'WARN', 'INFO', 'DEBUG'] 33 | level = os.getenv('LOG_LEVEL') 34 | if not level: 35 | level = levels[args.log] if args.log else 'NOTSET' 36 | # end if 37 | if level != 'NOTSET': 38 | os.environ['debug_mode'] = 'yes' 39 | logging.basicConfig( 40 | level=logging.getLevelName(level), 41 | format=Fore.CYAN + '%(asctime)s ' 42 | + Fore.RED + '[%(levelname)s] ' 43 | + Fore.YELLOW + '(%(name)s)\n' 44 | + Fore.WHITE + '%(message)s' + Fore.RESET, 45 | ) 46 | debug_mode(level) 47 | # end if 48 | 49 | if args.suppress: 50 | input_suppression() 51 | print(args) 52 | # end if 53 | 54 | if args.bot: 55 | os.environ['BOT'] = args.bot 56 | # end if 57 | 58 | for key, val in args.extra.items(): 59 | os.environ[key] = val[0] 60 | # end for 61 | 62 | # requests.urllib3.disable_warnings( 63 | # requests.urllib3.exceptions.InsecureRequestWarning) 64 | # # end if 65 | # end def 66 | 67 | 68 | def start_app(): 69 | init() 70 | 71 | check_updates() 72 | cancel_method() 73 | 74 | try: 75 | bot = os.getenv('BOT', '').lower() 76 | run_bot(bot) 77 | except Exception as err: 78 | if os.getenv('debug_mode') == 'yes': 79 | raise err 80 | else: 81 | error_message(err) 82 | # end if 83 | # end try 84 | 85 | epilog() 86 | 87 | # if Icons.isWindows and get_args().suppress is False: 88 | # input('Press ENTER to exit...') 89 | # # end if 90 | # end def 91 | -------------------------------------------------------------------------------- /lncrawl/sources/fullnovellive.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | 5 | logger = logging.getLogger('FULLNOVEL_LIVE') 6 | 7 | NOVEL_SEARCH = 'http://fullnovel.live/search/%s' 8 | 9 | 10 | class FullnovelLiveCrawler(Crawler): 11 | 
base_url = 'http://fullnovel.live/' 12 | 13 | def search_novel(self, query): 14 | '''Gets a list of (title, url) matching the given query''' 15 | results = [] 16 | soup = self.get_soup(NOVEL_SEARCH % query) 17 | for grid in soup.select('.grid .v-grid'): 18 | a = grid.select_one('h4 a') 19 | info = grid.select_one('.info-line a').text 20 | results.append({ 21 | 'title': (a['title'] or a.text).strip(), 22 | 'url': self.absolute_url(a['href']), 23 | 'info': info 24 | }) 25 | # end for 26 | return results 27 | # end def 28 | 29 | def read_novel_info(self): 30 | '''Get novel title, autor, cover etc''' 31 | soup = self.get_soup(self.novel_url) 32 | self.novel_title = soup.select_one('.info h1.title a').text.strip() 33 | self.novel_cover = self.absolute_url( 34 | soup.select_one('.info .image img')['src']) 35 | 36 | chapters = soup.select('.scroll-eps a') 37 | chapters.reverse() 38 | 39 | for x in chapters: 40 | chap_id = len(self.chapters) + 1 41 | if len(self.chapters) % 100 == 0: 42 | vol_id = chap_id//100 + 1 43 | vol_title = 'Volume ' + str(vol_id) 44 | self.volumes.append({ 45 | 'id': vol_id, 46 | 'title': vol_title, 47 | }) 48 | # end if 49 | self.chapters.append({ 50 | 'id': chap_id, 51 | 'volume': vol_id, 52 | 'url': self.absolute_url(x['href']), 53 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 54 | }) 55 | # end for 56 | # end def 57 | 58 | def download_chapter_body(self, chapter): 59 | '''Download body of a single chapter and return as clean html format.''' 60 | soup = self.get_soup(chapter['url']) 61 | contents = soup.select_one('.page .divContent') 62 | body = self.extract_contents(contents) 63 | return '
<p>' + '</p><p>'.join(body) + '</p>' p'): 51 | for strong in p.select('strong'): 52 | strong.name = 'span' 53 | # end for 54 | if p.text.strip(): 55 | body += str(p).strip() 56 | # end if 57 | # end for 58 | 59 | body += '<p>*******</p>
' 60 | for p in soup.select('#authors_note > p'): 61 | if p.text.strip(): 62 | body += str(p).strip() 63 | # end if 64 | # end for 65 | 66 | return body 67 | # end def 68 | # end class 69 | -------------------------------------------------------------------------------- /README.pip: -------------------------------------------------------------------------------- 1 | Lightnovel Crawler 2 | ----------------------- 3 | 4 | Download lightnovels from various online sources and generate output in different formats, e.g. epub, mobi, json, html, text, docx and pdf. 5 | 6 | Supported sources: 7 | - http://boxnovel.org 8 | - http://liberspark.com 9 | - http://novelfull.com 10 | - http://tiknovel.com 11 | - http://www.machinenoveltranslation. 12 | - http://www.tiknovel.com 13 | - http://zenithnovels.com 14 | - https://4scanlation.xyz 15 | - https://9kqw.com 16 | - https://anythingnovel.com 17 | - https://babelnovel.com 18 | - https://bestlightnovel.com 19 | - https://book.qidian.com 20 | - https://boxnovel.com 21 | - https://creativenovels.com 22 | - https://crescentmoon.blog 23 | - https://es.mtlnovel.com 24 | - https://fr.mtlnovel.com 25 | - https://id.mtlnovel.com 26 | - https://kiss-novel.com 27 | - https://kisslightnovels.info 28 | - https://light-novel.online 29 | - https://listnovel.com 30 | - https://litnet.com 31 | - https://lnmtl.com 32 | - https://m.chinesefantasynovels.com 33 | - https://m.novelspread.com 34 | - https://m.romanticlovebooks.com 35 | - https://m.wuxiaworld.co 36 | - https://meionovel.com 37 | - https://myoniyonitranslations.com 38 | - https://novelfull.com 39 | - https://novelonlinefull.com 40 | - https://novelraw.blogspot.com 41 | - https://novelsrock.com 42 | - https://ranobelib.me 43 | - https://rewayat.club 44 | - https://tomotranslations.com 45 | - https://volarenovels.com 46 | - https://webnovel.online 47 | - https://webnovelindonesia.com 48 | - https://webnovelonline.com 49 | - https://wordexcerpt.com 50 | - https://wuxiaworld.online 51 | - https://www.aixdzs.com 52 | - https://www.asianhobbyist.com 53 | - https://www.idqidian.us 54 | - https://www.machine-translation.org 55 | - https://www.mtlnovel.com 56 | - https://www.novelall.com 57 | - https://www.novelringan.com 58 | - https://www.novelspread.com 59 | - https://www.qidian.com 60 | - https://www.readlightnovel.org 61 | - https://www.readnovelfull.com 62 | - https://www.romanticlovebooks.com 63 | - https://www.royalroad.com 64 | - https://www.scribblehub.com 65 | - https://www.shinsori.com 66 | - https://www.tapread.com 67 | - https://www.translateindo.com 68 | - https://www.wattpad.com 69 | - https://www.webnovel.com 70 | - https://www.worldnovel.online 71 | - https://www.wuxialeague.com 72 | - https://www.wuxiaworld.co 73 | - https://www.wuxiaworld.com 74 | - https://www.wuxiaworld.site 75 | 76 | Visit https://github.com/dipu-bd/lightnovel-crawler for more details. 
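Quick programmatic start (a minimal sketch, not a canonical command line): the entry point start_app() defined in lncrawl/core/__init__.py reads the BOT and LOG_LEVEL environment variables before dispatching to a bot through run_bot(). The 'console' bot name below is an assumption; substitute any available bot.

    import os
    from lncrawl.core import start_app

    os.environ['BOT'] = 'console'     # assumed bot name; read back by start_app() via os.getenv('BOT')
    os.environ['LOG_LEVEL'] = 'INFO'  # any level other than NOTSET enables the logging config in init()
    start_app()                       # runs init(), check_updates(), then run_bot(BOT)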
77 | -------------------------------------------------------------------------------- /lncrawl/core/novel_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To get the novel info 4 | """ 5 | import re 6 | import os 7 | import json 8 | from ..utils.crawler import Crawler 9 | 10 | 11 | def format_novel(crawler: Crawler): 12 | crawler.novel_title = crawler.novel_title.strip() 13 | crawler.novel_author = crawler.novel_author.strip() 14 | # crawler.novel_title = crawler.cleanup_text(crawler.novel_title) 15 | # crawler.novel_author = crawler.cleanup_text(crawler.novel_author) 16 | format_volumes(crawler) 17 | format_chapters(crawler) 18 | # end def 19 | 20 | 21 | def format_volumes(crawler: Crawler): 22 | for vol in crawler.volumes: 23 | vol['chapter_count'] = 0 24 | vol['final_chapter'] = 0 25 | vol['start_chapter'] = 1e8 26 | title = 'Volume %d' % vol['id'] 27 | if not ('title' in vol and vol['title']): 28 | vol['title'] = title 29 | # end if 30 | # end for 31 | # end def 32 | 33 | 34 | def format_chapters(crawler: Crawler): 35 | for item in crawler.chapters: 36 | title = '#%d' % item['id'] 37 | if not ('title' in item and item['title']): 38 | item['title'] = title 39 | # end if 40 | 41 | volume = [x for x in crawler.volumes if x['id'] == item['volume']] 42 | if len(volume) == 0: 43 | raise Exception('Unknown volume %s for chapter %s' % (item['volume'], item['id'])) 44 | else: 45 | volume = volume[0] 46 | # end if 47 | 48 | item['volume_title'] = volume['title'] 49 | 50 | volume['chapter_count'] += 1 51 | volume['final_chapter'] = item['id'] if volume['final_chapter'] < item['id'] else volume['final_chapter'] 52 | volume['start_chapter'] = item['id'] if volume['start_chapter'] > item['id'] else volume['start_chapter'] 53 | # end for 54 | # end def 55 | 56 | 57 | def save_metadata(crawler, output_path): 58 | data = { 59 | 'url': crawler.novel_url, 60 | 'title': crawler.novel_title, 61 | 'author': crawler.novel_author, 62 | 'cover': crawler.novel_cover, 63 | 'volumes': crawler.volumes, 64 | 'chapters': crawler.chapters, 65 | 'rtl': crawler.is_rtl, 66 | } 67 | file_name = os.path.join(output_path, 'json', 'meta.json') 68 | os.makedirs(os.path.dirname(file_name), exist_ok=True) 69 | with open(file_name, 'w', encoding="utf-8") as file: 70 | json.dump(data, file, indent=2) 71 | # end with 72 | # end def 73 | -------------------------------------------------------------------------------- /lncrawl/sources/aixdzs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from urllib.parse import urlparse 6 | 7 | import requests 8 | 9 | from ..utils.crawler import Crawler 10 | 11 | logger = logging.getLogger('AIXDZS_CRAWLER') 12 | 13 | chapter_list_url = 'https://read.aixdzs.com/%s' 14 | 15 | 16 | class AixdzsCrawler(Crawler): 17 | base_url = 'https://www.aixdzs.com' 18 | 19 | def read_novel_info(self): 20 | '''Get novel title, autor, cover etc''' 21 | if not self.novel_url.endswith('/'): 22 | self.novel_url += '/' 23 | # end if 24 | logger.debug('Visiting %s', self.novel_url) 25 | soup = self.get_soup(self.novel_url) 26 | 27 | self.novel_cover = soup.select_one('meta[property="og:image"]')['content'] 28 | logger.info('Novel cover: %s', self.novel_cover) 29 | 30 | self.novel_title = soup.select_one('meta[property="og:novel:book_name"]')['content'] 31 | logger.info('Novel title: %s', self.novel_title) 32 | 33 | 
self.novel_author = soup.select_one('meta[property="og:novel:author"]')['content'] 34 | logger.info('%s', self.novel_author) 35 | 36 | parsed_url = urlparse(self.novel_url) 37 | parsed_path = parsed_url.path.strip('/').split('/') 38 | chapter_url = chapter_list_url % ('/'.join(parsed_path[1:])) 39 | logger.debug('Visiting %s', chapter_url) 40 | soup = self.get_soup(chapter_url) 41 | 42 | volumes = set([]) 43 | for a in reversed(soup.select('div.catalog li a')): 44 | ch_id = len(self.chapters) + 1 45 | vol_id = 1 + len(self.chapters) // 100 46 | volumes.add(vol_id) 47 | self.chapters.append({ 48 | 'id': ch_id, 49 | 'volume': vol_id, 50 | 'title': a.text, 51 | 'url': self.absolute_url(a['href'], page_url=chapter_url), 52 | }) 53 | # end def 54 | 55 | self.volumes = [{'id': x, 'title': ''} for x in volumes] 56 | # end def 57 | 58 | def download_chapter_body(self, chapter): 59 | '''Download body of a single chapter and return as clean html format.''' 60 | logger.info('Downloading %s', chapter['url']) 61 | soup = self.get_soup(chapter['url']) 62 | chapter['body_lock'] = True 63 | contents = soup.select('.content > p') 64 | contents = [str(p) for p in contents if p.text.strip()] 65 | return ''.join(contents) 66 | # end def 67 | # end class 68 | -------------------------------------------------------------------------------- /lncrawl/sources/tapread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from urllib.parse import urlparse 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('TAPREAD') 7 | 8 | chapter_list_url = 'https://www.tapread.com/book/contents?bookId=%s' 9 | chapter_url = 'https://www.tapread.com/book/chapter?bookId=%s&chapterId=%s' 10 | 11 | 12 | class TapreadCrawler(Crawler): 13 | base_url = 'https://www.tapread.com/' 14 | 15 | def read_novel_info(self): 16 | '''Get novel title, autor, cover etc''' 17 | logger.debug('Visiting %s', self.novel_url) 18 | soup = self.get_soup(self.novel_url) 19 | 20 | self.novel_title = soup.select_one('.book-name').text.strip() 21 | logger.info('Novel title: %s', self.novel_title) 22 | 23 | try: 24 | self.novel_cover = self.absolute_url( 25 | soup.select_one('img.bg-img, img.cover-img, .book-img img')['src']) 26 | except Exception: 27 | pass 28 | # end try 29 | logger.info('Novel cover: %s', self.novel_cover) 30 | 31 | try: 32 | possible_authors = [] 33 | for div in soup.select('.author, .translator'): 34 | possible_authors.append( 35 | ': '.join([x.strip() for x in div.text.split(':')])) 36 | # end for 37 | self.novel_author = ', '.join(possible_authors) 38 | except Exception: 39 | pass 40 | # end try 41 | logger.info(self.novel_author) 42 | 43 | path = urlparse(self.novel_url).path 44 | book_id = path.split('/')[3] 45 | data = self.get_json(chapter_list_url % book_id) 46 | 47 | volumes = set() 48 | for chap in data['result']['chapterList']: 49 | chap_id = chap['chapterNo'] 50 | vol_id = (chap_id - 1) // 100 + 1 51 | volumes.add(vol_id) 52 | self.chapters.append({ 53 | 'id': chap_id, 54 | 'volume': vol_id, 55 | 'title': chap['chapterName'], 56 | 'url': chapter_url % (chap['bookId'], chap['chapterId']), 57 | }) 58 | # end for 59 | 60 | self.volumes = [{'id': x} for x in volumes] 61 | # end def 62 | 63 | def download_chapter_body(self, chapter): 64 | '''Download body of a single chapter and return as clean html format''' 65 | logger.info('Downloading %s', chapter['url']) 66 | data = self.get_json(chapter['url']) 67 | return 
data['result']['content'] 68 | # end def 69 | # end class 70 | -------------------------------------------------------------------------------- /lncrawl/sources/tomotrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('TOMO_TRANSLATIONS') 8 | 9 | 10 | class TomoTransCrawler(Crawler): 11 | base_url = 'https://tomotranslations.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select_one('article h1.title').text 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = self.absolute_url( 22 | soup.select_one('article figure.wp-block-image img')['data-orig-file']) 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | author = 'Tomo Translations' 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | volumes = set() 29 | for a in soup.select('article section.entry a[href^="%s"]' % self.home_url): 30 | chap_id = len(self.chapters) + 1 31 | chap_url = self.absolute_url(a['href']) 32 | possible_vol = re.findall(r'-volume-(\d+)-', chap_url) 33 | if not len(possible_vol): 34 | continue 35 | # end if 36 | vol_id = int(possible_vol[0]) 37 | volumes.add(vol_id) 38 | self.chapters.append({ 39 | 'id': chap_id, 40 | 'volume': vol_id, 41 | 'url': chap_url, 42 | 'title': a.text.strip(), 43 | }) 44 | # end for 45 | 46 | self.volumes = [{'id': x} for x in volumes] 47 | # end def 48 | 49 | def download_chapter_body(self, chapter): 50 | '''Download body of a single chapter and return as clean html format.''' 51 | logger.info('Downloading %s', chapter['url']) 52 | soup = self.get_soup(chapter['url']) 53 | 54 | body = '' 55 | for tag in soup.select('article section.entry > *'): 56 | if tag.name == 'hr' and tag.has_attr("class") and 'is-style-dots' in tag.get('class'): 57 | body += '
<p>—————–</p>
' 58 | elif tag.name == 'p': 59 | if tag.find('strong'): 60 | chapter['title'] = tag.text.strip() 61 | elif tag.find('a') and re.match(r'Previous|Next', tag.find('a').text): 62 | pass 63 | else: 64 | body += str(tag) 65 | # end if 66 | # end if 67 | # end for 68 | 69 | return body 70 | # end def 71 | # end class 72 | -------------------------------------------------------------------------------- /lncrawl/sources/wattpad.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WATTPAD') 8 | 9 | 10 | class WattpadCrawler(Crawler): 11 | base_url = 'https://www.wattpad.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select('h1')[0].get_text().strip() 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = self.absolute_url( 22 | soup.select('div.cover.cover-lg img')[0]['src']) 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | self.novel_author = soup.select('div.author-info strong a')[0].get_text() 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | description = soup.select('h2.description')[0].get_text() 29 | 30 | chapters = soup.select('ul.table-of-contents a') 31 | # chapters.reverse() 32 | 33 | for a in chapters: 34 | chap_id = len(self.chapters) + 1 35 | vol_id = chap_id//100 + 1 36 | if len(self.chapters) % 100 == 0: 37 | vol_title = 'Volume ' + str(vol_id) 38 | self.volumes.append({ 39 | 'id': vol_id, 40 | 'title': vol_title, 41 | }) 42 | # end if 43 | self.chapters.append({ 44 | 'id': chap_id, 45 | 'volume': vol_id, 46 | 'url': self.absolute_url(a['href']), 47 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 48 | }) 49 | # end for 50 | # end def 51 | 52 | def download_chapter_body(self, chapter): 53 | '''Download body of a single chapter and return as clean html format.''' 54 | logger.info('Downloading %s', chapter['url']) 55 | 56 | soup = self.get_soup(chapter['url']) 57 | pages = int(re.search('[1-9]', re.search('("pages":)([1-9])', str(soup)).group(0)).group(0)) 58 | chapter['title'] = soup.select('h2')[0].get_text().strip() 59 | contents = [] 60 | for i in range(1, pages+1): 61 | page_url = chapter['url'] + "/page/" + str(i) 62 | logger.info('Get body text from %s', page_url) 63 | soup_page = self.get_soup(page_url) 64 | for p in soup_page.select('pre p'): 65 | contents.append(p.text) 66 | 67 | return '
<p>' + '</p><p>'.join(contents) + '</p>
' 68 | # end def 69 | # end class 70 | -------------------------------------------------------------------------------- /lncrawl/sources/jpmtl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import ast 6 | import requests 7 | from ..utils.crawler import Crawler 8 | 9 | logger = logging.getLogger('JPMTL') 10 | 11 | book_url = 'https://jpmtl.com/books/%s' 12 | 13 | class JpmtlCrawler(Crawler): 14 | base_url = 'https://jpmtl.com/' 15 | 16 | def initialize(self): 17 | self.home_url = 'https://jpmtl.com' 18 | # end def 19 | 20 | def read_novel_info(self): 21 | '''Get novel title, autor, cover etc''' 22 | self.novel_id = self.novel_url.split('/')[-1] 23 | logger.info('Novel Id: %s', self.novel_id) 24 | 25 | self.novel_url = book_url % self.novel_id 26 | logger.debug('Visiting %s', self.novel_url) 27 | soup = self.get_soup(self.novel_url) 28 | 29 | self.novel_title =soup.select_one('h1.book-sidebar__title').text.strip() 30 | logger.info('Novel title: %s', self.novel_title) 31 | 32 | try: 33 | self.novel_cover = self.absolute_url( 34 | soup.select_one('.book-sidebar__img img')['src']) 35 | logger.info('Novel cover: %s', self.novel_cover) 36 | except Exception: 37 | logger.debug('Failed to get cover: %s', self.novel_url) 38 | # end try 39 | 40 | self.novel_author = soup.select_one('.book-sidebar__author .book-sidebar__info').text.strip() 41 | logger.info('Novel author: %s', self.novel_author) 42 | 43 | for a in soup.select('ol.book-volume__list li a'): 44 | chap_id = len(self.chapters) + 1 45 | if len(self.chapters) % 100 == 0: 46 | vol_id = chap_id//100 + 1 47 | vol_title = 'Volume ' + str(vol_id) 48 | self.volumes.append({ 49 | 'id': vol_id, 50 | 'title': vol_title, 51 | }) 52 | # end if 53 | self.chapters.append({ 54 | 'id': chap_id, 55 | 'volume': vol_id, 56 | 'url': self.absolute_url(a['href']), 57 | 'title': a.select_one('.book-ccontent__title').text.strip() or ('Chapter %d' % chap_id), 58 | }) 59 | # end for 60 | # end def 61 | 62 | def download_chapter_body(self, chapter): 63 | '''Download body of a single chapter and return as clean html format''' 64 | logger.info('Downloading %s', chapter['url']) 65 | soup = self.get_soup(chapter['url']) 66 | 67 | contents = soup.select('.chapter-content__content p') 68 | 69 | body = [str(p) for p in contents if p.text.strip()] 70 | 71 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 72 | # end def 73 | # end class 74 | -------------------------------------------------------------------------------- /lncrawl/sources/tiknovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import parse_qsl, urlparse 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('TIKNOVEL') 9 | 10 | chapter_details_url = 'https://tiknovel.com/book/ajaxchap' 11 | 12 | 13 | class TikNovelCrawler(Crawler): 14 | base_url = [ 15 | 'http://tiknovel.com/', 16 | 'https://tiknovel.com/', 17 | ] 18 | 19 | def read_novel_info(self): 20 | logger.debug('Visiting %s', self.novel_url) 21 | soup = self.get_soup(self.novel_url) 22 | 23 | self.novel_title = soup.select_one('#content .detail-wrap h1.detail-tit').text 24 | logger.info('Novel title: %s', self.novel_title) 25 | 26 | possible_authors = soup.select('#content table.detail-profile td') 27 | for td in possible_authors: 28 | if '作者' in td.find('strong').text: 29 | td.find('strong').extract() 30 | self.novel_author = td.text.strip() 31 | break 32 | # end if 33 | # end for 34 | logger.info('Novel author: %s', self.novel_author) 35 | 36 | self.novel_cover = self.absolute_url( 37 | soup.select_one('#content .detail-thumb-box img')['data-echo']) 38 | logger.info('Novel cover: %s', self.novel_cover) 39 | 40 | volumes = set() 41 | for a in soup.select('#content .contents-lst li a'): 42 | ch_id = int(a.find('span').text.strip()) 43 | vol_id = 1 + (ch_id - 1) // 100 44 | volumes.add(vol_id) 45 | self.chapters.append({ 46 | 'id': ch_id, 47 | 'volume': vol_id, 48 | 'title': a['title'], 49 | 'url': self.absolute_url(a['href']), 50 | }) 51 | # end for 52 | 53 | self.volumes = [{'id': x} for x in volumes] 54 | # end def 55 | 56 | def download_chapter_body(self, chapter): 57 | '''Download body of a single chapter and return as clean html format.''' 58 | chapter['body_lock'] = True 59 | query_str = urlparse(chapter['url']).query 60 | data_params = {x[0]: int(x[1]) for x in parse_qsl(query_str)} 61 | logging.debug("Requesting body with: %s", data_params) 62 | response = self.submit_form(chapter_details_url, data=data_params) 63 | data = response.json() 64 | chap_desc = data['data']['chap']['desc'] 65 | chap_desc = re.sub(r'(()|\n)+', '\n\n', chap_desc, flags=re.I) 66 | contents = chap_desc.split('\n\n') 67 | contents = [p for p in contents if p and p.strip()] 68 | return '
<p>' + '</p><p>'.join(contents) + '</p>
' 69 | # end def 70 | # end class 71 | -------------------------------------------------------------------------------- /lncrawl/sources/qidiancom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | 5 | logger = logging.getLogger('QIDIAN_COM') 6 | 7 | chapter_list_url = 'https://book.qidian.com/ajax/book/category?_csrfToken=%s&bookId=%s' 8 | chapter_details_url = 'https://read.qidian.com/chapter/%s' 9 | 10 | 11 | class QidianComCrawler(Crawler): 12 | base_url = [ 13 | 'https://book.qidian.com/', 14 | # 'https://www.qidian.com/', 15 | ] 16 | 17 | def initialize(self): 18 | self.home_url = 'https://www.qidian.com/' 19 | # end def 20 | 21 | def read_novel_info(self): 22 | '''Get novel title, autor, cover etc''' 23 | logger.debug('Visiting %s', self.novel_url) 24 | soup = self.get_soup(self.novel_url) 25 | 26 | self.novel_title = soup.select_one('.book-info h1 em').text 27 | logger.info('Novel title: %s', self.novel_title) 28 | 29 | self.novel_author = soup.select_one('.book-info h1 a.writer').text 30 | logger.info('Novel author: %s', self.novel_author) 31 | 32 | book_img = soup.select_one('#bookImg') 33 | self.novel_cover = self.absolute_url(book_img.find('img')['src']) 34 | self.novel_cover = '/'.join(self.novel_cover.split('/')[:-1]) 35 | logger.info('Novel cover: %s', self.novel_cover) 36 | 37 | self.book_id = book_img['data-bid'] 38 | logger.debug('Book Id: %s', self.book_id) 39 | 40 | self.csrf = self.cookies['_csrfToken'] 41 | logger.debug('CSRF Token: %s', self.csrf) 42 | 43 | volume_url = chapter_list_url % (self.csrf, self.book_id) 44 | logger.debug('Visiting %s', volume_url) 45 | data = self.get_json(volume_url) 46 | 47 | for volume in data['data']['vs']: 48 | vol_id = len(self.volumes) + 1 49 | self.volumes.append({ 50 | 'id': vol_id, 51 | 'title': volume['vN'], 52 | }) 53 | for chapter in volume['cs']: 54 | ch_id = len(self.chapters) + 1 55 | self.chapters.append({ 56 | 'id': ch_id, 57 | 'volume': vol_id, 58 | 'title': chapter['cN'], 59 | 'url': chapter_details_url % chapter['cU'], 60 | }) 61 | # end for 62 | # end for 63 | # end def 64 | 65 | def download_chapter_body(self, chapter): 66 | '''Download body of a single chapter and return as clean html format''' 67 | logger.info('Downloading %s', chapter['url']) 68 | soup = self.get_soup(chapter['url']) 69 | chapter['body_lock'] = True 70 | chapter['title'] = soup.select_one('h3.j_chapterName').text.strip() 71 | return soup.select_one('div.j_readContent').extract() 72 | # end def 73 | # end class 74 | -------------------------------------------------------------------------------- /lncrawl/assets/html_style.css: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css?family=Merriweather:400,400i,700,700i'); 2 | 3 | html, 4 | body { 5 | margin: 0; 6 | padding: 0; 7 | width: 100%; 8 | height: 100%; 9 | position: relative; 10 | background-color: #323235; 11 | -webkit-font-smoothing: antialiased; 12 | } 13 | 14 | #content { 15 | padding: 10px 20px; 16 | max-width: 850px; 17 | margin: 10px auto; 18 | font-size: 16px; 19 | font-family: 'Merriweather', Georgia, serif; 20 | text-align: justify; 21 | line-height: 1.8; 22 | border-radius: 5px; 23 | box-shadow: 0 0 10px #000, 0 0 0 1px #000; 24 | background-color: #fffff0; 25 | } 26 | 27 | @media (max-width: 925px) { 28 | #content { 29 | margin: 5px; 30 | max-width: auto; 31 | } 32 | } 33 | 34 | 
main { 35 | min-height: 500px; 36 | padding: 0 10px; 37 | } 38 | 39 | h1, 40 | h2, 41 | h3, 42 | h4, 43 | h5, 44 | h6 { 45 | color: #555; 46 | padding: 10px; 47 | margin: 0; 48 | text-align: center; 49 | line-height: normal; 50 | } 51 | 52 | h1 { 53 | color: #333336; 54 | font-weight: 300; 55 | margin-bottom: 15px; 56 | } 57 | 58 | h1:after { 59 | content: '-'; 60 | margin: 10px 30px; 61 | height: 2px; 62 | border-radius: 50%; 63 | background: #444; 64 | display: block; 65 | color: transparent; 66 | } 67 | 68 | .link-group { 69 | padding: 10px; 70 | margin: 15px 0; 71 | display: flex; 72 | align-items: center; 73 | justify-content: space-between; 74 | background: #dde; 75 | border: 1px solid #dde; 76 | } 77 | 78 | .link-group a { 79 | color: #39f; 80 | text-decoration: none; 81 | } 82 | 83 | .link-group .btn { 84 | color: #333; 85 | font-family: sans-serif; 86 | font-size: 18px; 87 | font-weight: Arial, 600; 88 | display: inline-block; 89 | width: 145px; 90 | padding: 5px; 91 | text-align: center; 92 | background: #f5f5f5; 93 | box-shadow: 1px 1px 2px #aac, 0 0 0 1px #ccc; 94 | border-radius: 5px; 95 | } 96 | 97 | .link-group .btn:hover { 98 | background: #ececef; 99 | } 100 | 101 | .link-group .btn:active { 102 | box-shadow: 1px 1px 2px #cce inset, 0 0 0 1px #ccc; 103 | } 104 | 105 | div#readpos { 106 | border-radius: 10px; 107 | font-weight: bold; 108 | font-family: monospace; 109 | color: #770; 110 | font-size: 14px; 111 | padding: 5px 10px; 112 | background: white; 113 | box-shadow: 0 0 10px #333, 0 0 0 1px #dde; 114 | position: fixed; 115 | bottom: 10px; 116 | right: 10px; 117 | user-select: none; 118 | -webkit-user-drag: none; 119 | /* min-width: 100px; 120 | text-align: right; */ 121 | } 122 | 123 | @media print { 124 | #content { 125 | margin: 0; 126 | max-width: none; 127 | box-shadow: none; 128 | } 129 | .link-group { 130 | display: none; 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /lncrawl/sources/9kqw.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import parse_qsl, urlparse 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('9KQW') 9 | 10 | chapter_details_url = 'https://9kqw.com/book/ajaxchap' 11 | 12 | 13 | class TikNovelCrawler(Crawler): 14 | base_url = [ 15 | 'https://9kqw.com/', 16 | 'http://www.tiknovel.com/', 17 | 'https://www.tiknovel.com/', 18 | ] 19 | 20 | def initialize(self): 21 | self.base_url = 'https://9kqw.com/' 22 | # end def 23 | 24 | def read_novel_info(self): 25 | logger.debug('Visiting %s', self.novel_url) 26 | soup = self.get_soup(self.novel_url) 27 | 28 | self.novel_title = soup.select_one('#content .detail-wrap h1.detail-tit').text 29 | logger.info('Novel title: %s', self.novel_title) 30 | 31 | possible_authors = soup.select('#content table.detail-profile td') 32 | for td in possible_authors: 33 | if '作者' in td.find('strong').text: 34 | td.find('strong').extract() 35 | self.novel_author = td.text.strip() 36 | break 37 | # end if 38 | # end for 39 | logger.info('Novel author: %s', self.novel_author) 40 | 41 | self.novel_cover = self.absolute_url( 42 | soup.select_one('#content .detail-thumb-box img')['data-echo']) 43 | logger.info('Novel cover: %s', self.novel_cover) 44 | 45 | volumes = set() 46 | for a in soup.select('#content .contents-lst li a'): 47 | ch_id = int(a.find('span').text.strip()) 48 | vol_id = 1 + (ch_id - 1) // 100 49 | 
volumes.add(vol_id) 50 | self.chapters.append({ 51 | 'id': ch_id, 52 | 'volume': vol_id, 53 | 'title': a['title'], 54 | 'url': self.absolute_url(a['href']), 55 | }) 56 | # end for 57 | 58 | self.volumes = [{'id': x} for x in volumes] 59 | # end def 60 | 61 | def download_chapter_body(self, chapter): 62 | '''Download body of a single chapter and return as clean html format.''' 63 | chapter['body_lock'] = True 64 | query_str = urlparse(chapter['url']).query 65 | data_params = {x[0]: int(x[1]) for x in parse_qsl(query_str)} 66 | logging.debug("Requesting body with: %s", data_params) 67 | response = self.submit_form(chapter_details_url, data=data_params) 68 | data = response.json() 69 | chap_desc = data['data']['chap']['desc'] 70 | chap_desc = re.sub(r'(()|\n)+', '\n\n', chap_desc, flags=re.I) 71 | contents = chap_desc.split('\n\n') 72 | contents = [p for p in contents if p and p.strip()] 73 | return '
<p>' + '</p><p>'.join(contents) + '</p>
' 74 | # end def 75 | # end class 76 | -------------------------------------------------------------------------------- /lncrawl/sources/novelspread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import hashlib 3 | import json 4 | import logging 5 | import re 6 | from concurrent.futures import ThreadPoolExecutor 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('NOVEL_SPREAD') 11 | 12 | book_info_url = 'https://api.novelspread.com/api/novel/path/%s' 13 | chapter_list_url = 'https://api.novelspread.com/api/novel/%s/chapter/menu' 14 | chapter_body_url = 'https://api.novelspread.com/api/novel/%s/chapter/%d/content?fingerprint=' 15 | 16 | 17 | class NovelSpreadCrawler(Crawler): 18 | base_url = 'https://www.novelspread.com/' 19 | 20 | def make_cover_url(self, image): 21 | a = '360' 22 | b = '512' 23 | c = '1' 24 | d = '90' 25 | r = a + b + c + d + image 26 | for i in range(2): 27 | m = hashlib.md5() 28 | m.update(r.encode()) 29 | r = m.hexdigest() 30 | # end for 31 | url = 'https://www.novelspread.com/image/' \ 32 | '%sx%s/%s/%s/%s/%s' % (a, b, d, c, r[:16], image) 33 | return url 34 | # end def 35 | 36 | def read_novel_info(self): 37 | self.novel_id = self.novel_url.strip('/').split('/')[-1] 38 | logger.info('Novel id: %s' % self.novel_id) 39 | data = self.get_json(book_info_url % self.novel_id) 40 | 41 | self.novel_title = data['data']['name'] 42 | logger.info('Title: %s' % self.novel_title) 43 | 44 | self.novel_author = 'Author: %s, Translator: %s' % ( 45 | data['data']['author'], data['data']['translator']) 46 | logger.info(self.novel_author) 47 | 48 | self.novel_cover = self.make_cover_url(data['data']['img']) 49 | logger.info('Novel cover: %s', self.novel_cover) 50 | 51 | logger.info('Getting chapters...') 52 | data = self.get_json(chapter_list_url % self.novel_id) 53 | 54 | volumes = set([]) 55 | for chap in data['data']: 56 | volumes.add(chap['volume']) 57 | self.chapters.append({ 58 | 'id': chap['chapter_number'], 59 | 'volume': chap['volume'], 60 | 'title': chap['title'], 61 | 'url': self.absolute_url(chap['link']) 62 | }) 63 | # end for 64 | 65 | self.volumes = [ 66 | {'id': x, 'title': ''} 67 | for x in volumes 68 | ] 69 | 70 | logger.debug('%d chapters and %d volumes found', 71 | len(self.chapters), len(self.volumes)) 72 | # end def 73 | 74 | def download_chapter_body(self, chapter): 75 | url = chapter_body_url % (self.novel_id, chapter['id']) 76 | logger.info('Getting chapter... 
%s [%s]', chapter['title'], url) 77 | data = self.get_json(url) 78 | return data['data']['chapter_content'] 79 | # end def 80 | # end class 81 | -------------------------------------------------------------------------------- /lncrawl/sources/novelv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from concurrent import futures 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVELV') 8 | 9 | 10 | class NovelvCrawler(Crawler): 11 | base_url = 'https://www.novelv.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select_one( 19 | '.panel-default .info .info2 h1').text.strip() 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.select_one('.panel-default .info .info1 img')['src']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | authors = [] 27 | for a in soup.select('.panel-default .info .info2 h3 a'): 28 | if a['href'].startswith('/author/'): 29 | authors.append(a.text.strip()) 30 | # end if 31 | # end for 32 | self.novel_author = ', '.join(authors) 33 | logger.info('Novel author: %s', self.novel_author) 34 | 35 | volumes = set([]) 36 | for a in soup.select('.panel-default ul.list-charts li a'): 37 | possible_url = self.absolute_url(a['href'].lower()) 38 | if not possible_url.startswith(self.novel_url): 39 | continue 40 | # end if 41 | 42 | chapter_id = len(self.chapters) + 1 43 | volume_id = (chapter_id - 1) // 100 + 1 44 | volumes.add(volume_id) 45 | 46 | self.chapters.append({ 47 | 'id': chapter_id, 48 | 'title': a.text.strip(), 49 | 'url': possible_url, 50 | 'volume': volume_id, 51 | }) 52 | # end for 53 | 54 | self.volumes = [ 55 | {'id': x, 'title': ''} 56 | for x in list(volumes) 57 | ] 58 | # end def 59 | 60 | def download_chapter_body(self, chapter): 61 | '''Download body of a single chapter and return as clean html format.''' 62 | chapter['title'] = self.clean_text(chapter['title']) 63 | 64 | logger.info('Downloading %s', chapter['url']) 65 | soup = self.get_soup(chapter['url']) 66 | content = soup.select_one('.panel-body.content-body') 67 | body = self.extract_contents(content) 68 | body = '
<p>%s</p>' % '</p><p>
'.join(body) 69 | return self.clean_text(body) 70 | # end def 71 | 72 | def clean_text(self, text): 73 | text = re.sub(r'\ufffd\ufffd\ufffd+', '**', text) 74 | text = re.sub(r'\ufffd\ufffd', '"', text) 75 | text = re.sub(r'\u00a0\u00a0', '–', text) 76 | text = re.sub(r'\ufffdC', '', text) 77 | return text 78 | # end def 79 | # end class 80 | -------------------------------------------------------------------------------- /lncrawl/sources/machinetrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('MACHINE_NOVEL_TRANSLATION') 8 | 9 | search_url = 'http://www.machinenoveltranslation.com/search/autocomplete' 10 | 11 | 12 | class MachineNovelTrans(Crawler): 13 | base_url = 'http://www.machinenoveltranslation.com/' 14 | 15 | def read_novel_info(self): 16 | '''Get novel title, autor, cover etc''' 17 | logger.debug('Visiting %s', self.novel_url) 18 | soup = self.get_soup(self.novel_url) 19 | 20 | self.novel_title = soup.select_one('.desc h5').text 21 | logger.info('Novel title: %s', self.novel_title) 22 | 23 | self.novel_cover = self.absolute_url( 24 | soup.select_one('.about-author .row img')['src']) 25 | logger.info('Novel cover: %s', self.novel_cover) 26 | 27 | for div in soup.select('#chapters #accordion .panel'): 28 | vol_title = div.select_one('h4.panel-title a').text 29 | vol_id = [int(x) for x in re.findall(r'\d+', vol_title)] 30 | vol_id = vol_id[0] if len(vol_id) else len(self.volumes) + 1 31 | self.volumes.append({ 32 | 'id': vol_id, 33 | 'title': vol_title, 34 | }) 35 | 36 | for a in div.select('ul.navigate-page li a'): 37 | ch_title = a.text 38 | ch_id = [int(x) for x in re.findall(r'\d+', ch_title)] 39 | ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 40 | self.chapters.append({ 41 | 'id': ch_id, 42 | 'volume': vol_id, 43 | 'title': ch_title, 44 | 'url': self.absolute_url(a['href']), 45 | }) 46 | # end for 47 | # end for 48 | 49 | logger.debug('%d chapters and %d volumes found', 50 | len(self.chapters), len(self.volumes)) 51 | # end def 52 | 53 | def download_chapter_body(self, chapter): 54 | '''Download body of a single chapter and return as clean html format.''' 55 | logger.info('Visiting %s', chapter['url']) 56 | soup = self.get_soup(chapter['url']) 57 | 58 | body = soup.select('.about-author .desc .translated') 59 | body = [self.format_text(x.text) for x in body if x] 60 | body = '\n'.join(['
<p>%s</p>
' % (x) for x in body if len(x)]) 61 | return body.strip() 62 | # end def 63 | 64 | def format_text(self, text): 65 | '''formats the text and remove bad characters''' 66 | text = re.sub(r'\u00ad', '', text, flags=re.UNICODE) 67 | text = re.sub(r'\u201e[, ]*', '“', text, flags=re.UNICODE) 68 | text = re.sub(r'\u201d[, ]*', '”', text, flags=re.UNICODE) 69 | text = re.sub(r'[ ]*,[ ]+', ', ', text, flags=re.UNICODE) 70 | return text.strip() 71 | # end def 72 | # end class 73 | -------------------------------------------------------------------------------- /lncrawl/bots/test/test_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from ...core.app import App 5 | from ...binders import available_formats 6 | 7 | 8 | def test_crawler(self, link, user_input): 9 | app = App() 10 | print('App instance: OK') 11 | 12 | app.initialize() 13 | print('App initialize: DONE') 14 | 15 | app.user_input = user_input 16 | app.init_search() 17 | print('Init search: DONE') 18 | 19 | if not app.crawler: 20 | if link not in app.crawler_links: 21 | print('Search is not supported for', link) 22 | return 23 | # end if 24 | 25 | print(len(app.crawler_links), 'available crawlers to search') 26 | app.crawler_links = [link] 27 | print('Selected crawler:', link) 28 | 29 | app.search_novel() 30 | print('Search: %d results found' % len(app.search_results)) 31 | 32 | source = app.search_results[0] 33 | print('Top result: %s with %d sources' % 34 | (source['title'], len(source['novels']))) 35 | 36 | novel_url = source['novels'][0]['url'] 37 | print('Top novel:', novel_url) 38 | 39 | app.init_crawler(novel_url) 40 | print('Init crawler: DONE') 41 | 42 | app.get_novel_info() 43 | print('Novel info: DONE') 44 | if not app.crawler.novel_title: 45 | raise Exception('No novel title') 46 | # end if 47 | return 48 | # end if 49 | 50 | if not app.crawler: 51 | raise Exception('No crawler initialized') 52 | # end if 53 | 54 | if app.can_do('login'): 55 | print('Login: enabled') 56 | # end if 57 | 58 | app.get_novel_info() 59 | print('Title:', app.crawler.novel_title) 60 | print('Cover:', app.crawler.novel_cover) 61 | print('Author:', app.crawler.novel_author) 62 | 63 | if not app.crawler.novel_title: 64 | raise Exception('No novel title') 65 | # end if 66 | 67 | print('Novel info: DONE') 68 | 69 | os.makedirs(app.output_path, exist_ok=True) 70 | print('Output path:', app.output_path) 71 | 72 | if len(app.crawler.volumes) == 0: 73 | raise Exception('Empty volume list') 74 | # end if 75 | 76 | if len(app.crawler.chapters) == 0: 77 | raise Exception('Empty chapter list') 78 | # end if 79 | 80 | app.chapters = app.crawler.chapters[:2] 81 | app.output_formats = {x: False for x in available_formats} 82 | app.output_formats['pdf'] = True 83 | app.pack_by_volume = False 84 | 85 | app.start_download() 86 | print('Download: DONE') 87 | 88 | if len(app.chapters[0]['body']) < 50: 89 | raise Exception('Empty body') 90 | # end if 91 | 92 | app.bind_books() 93 | print('Bindings: DONE') 94 | 95 | app.destroy() 96 | print('Destroy: DONE') 97 | 98 | print('-' * 6, 'Test Passed', '-' * 6) 99 | # end def 100 | -------------------------------------------------------------------------------- /lncrawl/sources/readln.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('READLIGHTNOVEL') 8 | 
search_url = 'https://www.readlightnovel.org/search/autocomplete' 9 | 10 | 11 | class ReadLightNovelCrawler(Crawler): 12 | base_url = 'https://www.readlightnovel.org/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.select_one('.block-title h1').text 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.find('img', {'alt': self.novel_title})['src']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | author_link = soup.select_one("a[href*=author]") 27 | if author_link: 28 | self.novel_author = author_link.text.strip().title() 29 | # end if 30 | logger.info('Novel author: %s', self.novel_author) 31 | 32 | volume_ids = set() 33 | for a in soup.select('.chapters .chapter-chs li a'): 34 | chap_id = len(self.chapters) + 1 35 | vol_id = (chap_id - 1) // 100 + 1 36 | volume_ids.add(vol_id) 37 | self.chapters.append({ 38 | 'id': chap_id, 39 | 'volume': vol_id, 40 | 'url': self.absolute_url(a['href']), 41 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 42 | }) 43 | # end for 44 | 45 | self.volumes = [{'id': i} for i in volume_ids] 46 | # end def 47 | 48 | def download_chapter_body(self, chapter): 49 | '''Download body of a single chapter and return as clean html format.''' 50 | logger.info('Downloading %s', chapter['url']) 51 | soup = self.get_soup(chapter['url']) 52 | 53 | div = soup.select_one('.chapter-content3 .desc') 54 | 55 | bad_selectors = [ 56 | '.trinity-player-iframe-wrapper' 57 | '.hidden', 58 | '.ads-title', 59 | 'script', 60 | 'center', 61 | 'interaction', 62 | 'a[href*=remove-ads]', 63 | 'a[target=_blank]', 64 | 'hr', 65 | 'br' 66 | ] 67 | for hidden in div.select(', '.join(bad_selectors)): 68 | hidden.decompose() 69 | # end if 70 | 71 | body = self.extract_contents(div) 72 | if re.search(r'c?hapter .?\d+', body[0], re.IGNORECASE): 73 | title = body[0].replace('', '').replace('', '').strip() 74 | title = ('C' if title.startswith('hapter') else '') + title 75 | chapter['title'] = title.strip() 76 | body = body[1:] 77 | # end if 78 | 79 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 80 | # end def 81 | # end class 82 | -------------------------------------------------------------------------------- /lncrawl/sources/idqidian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('IDQIDIAN') 8 | 9 | 10 | class IdqidianCrawler(Crawler): 11 | base_url = 'https://www.idqidian.us/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.find_all( 19 | 'span', {"typeof": "v:Breadcrumb"})[-1].text 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = "https://www.idqidian.us/images/noavailable.jpg" 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | author = soup.select('p')[3].text 26 | self.novel_author = author[20:len(author)-22] 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | chapters = soup.find('div', { 30 | 'style': '-moz-border-radius: 5px 5px 5px 5px; border: 1px solid #333; color: black; height: 400px; margin: 5px; overflow: auto; padding: 5px; width: 96%;'}).findAll( 31 | 'a') 32 | chapters.reverse() 33 | 34 | for a in chapters: 35 | chap_id = len(self.chapters) + 1 36 | if len(self.chapters) % 100 == 0: 37 | vol_id = chap_id//100 + 1 38 | vol_title = 'Volume ' + str(vol_id) 39 | self.volumes.append({ 40 | 'id': vol_id, 41 | 'title': vol_title, 42 | }) 43 | # end if 44 | self.chapters.append({ 45 | 'id': chap_id, 46 | 'volume': vol_id, 47 | 'url': self.absolute_url(a['href']), 48 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 49 | }) 50 | # end for 51 | # end def 52 | 53 | def download_chapter_body(self, chapter): 54 | '''Download body of a single chapter and return as clean html format.''' 55 | logger.info('Downloading %s', chapter['url']) 56 | soup = self.get_soup(chapter['url']) 57 | 58 | for a in soup.find_all('a'): 59 | a.decompose() 60 | 61 | body_parts = soup.select('p') 62 | body_parts = ''.join([str(p.extract()) for p in body_parts if 63 | p.text.strip() and not 'Advertisement' in p.text and not 'JavaScript!' in p.text]) 64 | if body_parts == '': 65 | texts = [str.strip(x) for x in soup.strings if str.strip(x) != ''] 66 | unwanted_text = [str.strip(x.text) for x in soup.find_all()] 67 | my_texts = set(texts).difference(unwanted_text) 68 | body_parts = ''.join( 69 | [str(p) for p in my_texts if p.strip() and not 'Advertisement' in p and not 'JavaScript!' 
in p]) 70 | # end if 71 | 72 | return body_parts 73 | # end def 74 | # end class 75 | -------------------------------------------------------------------------------- /lncrawl/sources/yukinovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from bs4 import Comment 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('YUKI_NOVEL') 11 | 12 | 13 | class YukiNovelCrawler(Crawler): 14 | base_url = 'https://yukinovel.id/' 15 | 16 | def initialize(self): 17 | self.home_url = 'https://yukinovel.id/' 18 | # end def 19 | 20 | def read_novel_info(self): 21 | '''Get novel title, autor, cover etc''' 22 | url = self.novel_url.replace('https://yukinovel.me', 'https://yukinovel.id') 23 | logger.debug('Visiting %s', self.novel_url) 24 | soup = self.get_soup(self.novel_url) 25 | 26 | self.novel_title = soup.select_one('h1.entry-title').text 27 | logger.info('Novel title: %s', self.novel_title) 28 | 29 | self.novel_author = "Translated by Yukinovel" 30 | logger.info('Novel author: %s', self.novel_author) 31 | 32 | self.novel_cover = self.absolute_url( 33 | soup.select_one('div.lightnovel-thumb img')['src']) 34 | logger.info('Novel cover: %s', self.novel_cover) 35 | 36 | # Extract volume-wise chapter entries 37 | chapters = soup.select('div.lightnovel-episode ul li a') 38 | 39 | chapters.reverse() 40 | 41 | for a in chapters: 42 | chap_id = len(self.chapters) + 1 43 | if len(self.chapters) % 100 == 0: 44 | vol_id = chap_id//100 + 1 45 | vol_title = 'Volume ' + str(vol_id) 46 | self.volumes.append({ 47 | 'id': vol_id, 48 | 'title': vol_title, 49 | }) 50 | # end if 51 | self.chapters.append({ 52 | 'id': chap_id, 53 | 'volume': vol_id, 54 | 'url': self.absolute_url(a['href']), 55 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 56 | }) 57 | # end for 58 | # end def 59 | 60 | def download_chapter_body(self, chapter): 61 | '''Download body of a single chapter and return as clean html format.''' 62 | logger.info('Downloading %s', chapter['url']) 63 | soup = self.get_soup(chapter['url']) 64 | 65 | contents = soup.select_one('div.entry-content.cl') 66 | 67 | for d in contents.findAll('div'): 68 | d.decompose() 69 | # end for 70 | 71 | for comment in contents.find_all(string=lambda text: isinstance(text, Comment)): 72 | comment.extract() 73 | # end for 74 | 75 | if contents.findAll('p')[0].text.strip().startswith('Bab'): 76 | chapter['title'] = contents.findAll('p')[0].text.strip() 77 | contents.findAll('p')[0].extract() 78 | else: 79 | chapter['title'] = chapter['title'] 80 | # end if 81 | 82 | logger.debug(chapter['title']) 83 | 84 | return str(contents) 85 | # end def 86 | # end class 87 | -------------------------------------------------------------------------------- /lncrawl/sources/fourscanlation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import urlparse 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('4SCANLATION') 8 | novel_page = 'https://4scanlation.com/%s' 9 | 10 | 11 | class FourScanlationCrawler(Crawler): 12 | base_url = 'https://4scanlation.com/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | path_fragments = urlparse(self.novel_url).path.split('/') 17 | novel_hash = path_fragments[1] 18 | if novel_hash == 'category': 19 | novel_hash = path_fragments[2] 20 | # 
end if 21 | self.novel_url = novel_page % novel_hash 22 | 23 | logger.debug('Visiting %s', self.novel_url) 24 | soup = self.get_soup(self.novel_url) 25 | 26 | self.novel_title = soup.select_one(', '.join([ 27 | 'header h1', 28 | '.header-post-title-class', 29 | ])).text.strip() 30 | logger.info('Novel title: %s', self.novel_title) 31 | 32 | self.novel_author = "Source: 4scanlation" 33 | logger.info('Novel author: %s', self.novel_author) 34 | 35 | possible_image = soup.select_one('#primary article img.wp-post-image') 36 | if possible_image: 37 | self.novel_cover = self.absolute_url(possible_image['src']) 38 | # end if 39 | logger.info('Novel cover: %s', self.novel_cover) 40 | 41 | # Extract volume-wise chapter entries 42 | volumes = set() 43 | for a in soup.select('article.page p a'): 44 | possible_url = self.absolute_url(a['href']) 45 | if not self.is_relative_url(possible_url): 46 | continue 47 | # end if 48 | chap_id = 1 + len(self.chapters) 49 | vol_id = 1 + len(self.chapters) // 100 50 | volumes.add(vol_id) 51 | self.chapters.append({ 52 | 'id': chap_id, 53 | 'volume': vol_id, 54 | 'url': possible_url, 55 | 'title': a.text.strip(), 56 | }) 57 | # end for 58 | 59 | self.volumes = [{'id': x} for x in volumes] 60 | # end def 61 | 62 | def download_chapter_body(self, chapter): 63 | '''Download body of a single chapter and return as clean html format.''' 64 | logger.info('Downloading %s', chapter['url']) 65 | soup = self.get_soup(chapter['url']) 66 | 67 | contents = soup.select_one('article div.entry-content') 68 | if not contents: 69 | return '' 70 | # end if 71 | 72 | for d in contents.findAll('div'): 73 | d.extract() 74 | # end for 75 | 76 | try: 77 | chapter['title'] = soup.select_one('header h1').text 78 | logger.debug(chapter['title']) 79 | except Exception: 80 | pass 81 | # end try 82 | 83 | return str(contents or '') 84 | # end def 85 | # end class 86 | -------------------------------------------------------------------------------- /lncrawl/sources/novelgo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import cssutils 6 | import urllib.parse 7 | 8 | from bs4 import BeautifulSoup 9 | 10 | from ..utils.crawler import Crawler 11 | 12 | logger = logging.getLogger('NOVEL_GO') 13 | 14 | 15 | class NovelGoCrawler(Crawler): 16 | base_url = 'https://novelgo.id/' 17 | 18 | def read_novel_info(self): 19 | '''Get novel title, autor, cover etc''' 20 | logger.debug('Visiting %s', self.novel_url) 21 | soup = self.get_soup(self.novel_url) 22 | 23 | self.novel_title = soup.find( 24 | 'h2', {'class': 'novel-title'}).text.strip() 25 | logger.info('Novel title: %s', self.novel_title) 26 | 27 | self.novel_author = soup.select_one( 28 | 'div.noveils-current-author a').text.strip() 29 | logger.info('Novel author: %s', self.novel_author) 30 | 31 | thumbnail = soup.find("div", {"class": "novel-thumbnail"})['style'] 32 | style = cssutils.parseStyle(thumbnail) 33 | url = style['background-image'] 34 | 35 | self.novel_cover = self.absolute_url( 36 | url.replace('url(', '').replace(')', '')) 37 | logger.info('Novel cover: %s', self.novel_cover) 38 | 39 | path = urllib.parse.urlsplit(self.novel_url)[2] 40 | book_id = path.split('/')[2] 41 | chapter_list = js = self.scraper.post( 42 | 'https://novelgo.id/wp-admin/admin-ajax.php?action=LoadChapter&post=%s' % book_id).content 43 | soup_chapter = BeautifulSoup(chapter_list, 'lxml') 44 | 45 | chapters = soup_chapter.select('ul li a') 46 | 47 | for x 
in chapters: 48 | chap_id = len(self.chapters) + 1 49 | if len(self.chapters) % 100 == 0: 50 | vol_id = chap_id//100 + 1 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': chap_id, 59 | 'volume': vol_id, 60 | 'url': self.absolute_url(x['href']), 61 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 62 | }) 63 | # end for 64 | 65 | logger.debug(self.chapters) 66 | # end def 67 | 68 | def download_chapter_body(self, chapter): 69 | '''Download body of a single chapter and return as clean html format.''' 70 | logger.info('Downloading %s', chapter['url']) 71 | soup = self.get_soup(chapter['url']) 72 | 73 | self.blacklist_patterns = [ 74 | r'^translat(ed by|or)', 75 | r'(volume|chapter) .?\d+', 76 | ] 77 | 78 | contents = soup.find( 79 | 'div', {'id': 'chapter-post-content'}).findAll('p') 80 | body = [str(p) for p in contents if p.text.strip()] 81 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 82 | # end def 83 | # end class 84 | -------------------------------------------------------------------------------- /lncrawl/sources/gravitytales.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('GRAVITY_TALES') 7 | 8 | cover_image_url = 'https://cdn.gravitytales.com/images/covers/%s.jpg' 9 | novel_toc_url = 'http://gravitytales.com/novel/%s' 10 | chapter_list_url = 'http://gravitytales.com/novel/%s/chapters' 11 | 12 | 13 | class GravityTalesCrawler(Crawler): 14 | base_url = 'http://gravitytales.com/' 15 | 16 | def read_novel_info(self): 17 | self.novel_id = re.split(r'\/(novel|post)\/', self.novel_url)[2] 18 | self.novel_id = self.novel_id.split('/')[0] 19 | logger.info('Novel id: %s' % self.novel_id) 20 | 21 | self.novel_url = novel_toc_url % self.novel_id 22 | logger.debug('Visiting %s' % self.novel_url) 23 | soup = self.get_soup(self.novel_url) 24 | 25 | for tag in soup.select('.main-content h3 > *'): 26 | tag.extract() 27 | self.novel_title = soup.select_one('.main-content h3').text.strip() 28 | logger.info('Novel title: %s' % self.novel_title) 29 | 30 | self.novel_cover = cover_image_url % self.novel_id 31 | logger.info('Novel cover: %s' % self.novel_cover) 32 | 33 | self.novel_author = soup.select_one('.main-content h4').text.strip() 34 | logger.info(self.novel_author) 35 | 36 | self.get_chapter_list() 37 | # end def 38 | 39 | def get_chapter_list(self): 40 | url = chapter_list_url % self.novel_id 41 | logger.info('Visiting %s' % url) 42 | soup = self.get_soup(url) 43 | 44 | # For each tabs... 45 | for a in soup.select('#chaptergroups li a'): 46 | vol_id = len(self.volumes) + 1 47 | self.volumes.append({ 48 | 'id': vol_id, 49 | 'title': a.text.strip(), 50 | '_tid': (a['href']), 51 | }) 52 | 53 | # ...get every chapters 54 | for a in soup.select_one(a['href']).select('table td a'): 55 | chap_id = len(self.chapters) + 1 56 | self.chapters.append({ 57 | 'id': chap_id, 58 | 'volume': vol_id, 59 | 'title': a.text.strip(), 60 | 'url': self.absolute_url(a['href']), 61 | }) 62 | # end for 63 | 64 | logger.info('%d chapters and %d volumes found', 65 | len(self.chapters), len(self.volumes)) 66 | # end def 67 | 68 | def download_chapter_body(self, chapter): 69 | '''Download body of a single chapter and return as clean html format.''' 70 | logger.info('Downloading %s' % chapter['url']) 71 | soup = self.get_soup(chapter['url']) 72 | body = soup.select_one('#chapterContent') 73 | for tag in body.contents: 74 | if hasattr(tag, 'attrs'): 75 | setattr(tag, 'attrs', {}) # clear attributes 76 | # end if 77 | # end for 78 | return str(body) 79 | # end def 80 | # end class 81 | -------------------------------------------------------------------------------- /lncrawl/sources/machinetransorg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import quote 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('MACHINE_NOVEL_TRANSLATION') 8 | 9 | search_url = 'https://www.machine-translation.org/novel/search/?keywords=%s' 10 | 11 | 12 | class MachineTransOrg(Crawler): 13 | base_url = 'https://www.machine-translation.org/' 14 | 15 | def search_novel(self, query): 16 | url = search_url % quote(query.lower()) 17 | logger.debug('Visiting: %s', url) 18 | soup = self.get_soup(url) 19 | 20 | results = [] 21 
| for li in soup.select('.book-list-info > ul > li'): 22 | results.append({ 23 | 'title': li.select_one('a h4 b').text.strip(), 24 | 'url': self.absolute_url(li.select_one('.book-img a')['href']), 25 | 'info': li.select_one('.update-info').text.strip(), 26 | }) 27 | # end for 28 | return results 29 | # end def 30 | 31 | def read_novel_info(self): 32 | '''Get novel title, autor, cover etc''' 33 | logger.debug('Visiting %s', self.novel_url) 34 | soup = self.get_soup(self.novel_url) 35 | 36 | self.novel_title = soup.select_one('div.title h3 b').text 37 | logger.info('Novel title: %s', self.novel_title) 38 | 39 | self.novel_author = soup.select_one('div.title h3 span').text 40 | logger.info('Novel author: %s', self.novel_author) 41 | 42 | self.novel_cover = self.absolute_url( 43 | soup.select_one('.book-img img')['src']) 44 | logger.info('Novel cover: %s', self.novel_cover) 45 | 46 | for a in reversed(soup.select('div.slide-item a')): 47 | ch_title = a.text.strip() 48 | ch_id = len(self.chapters) + 1 49 | if len(self.chapters) % 100 == 0: 50 | vol_id = ch_id//100 + 1 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': ch_id, 59 | 'volume': vol_id, 60 | 'title': ch_title, 61 | 'url': self.absolute_url(a['href']), 62 | }) 63 | # end for 64 | 65 | logger.debug('%d chapters and %d volumes found', 66 | len(self.chapters), len(self.volumes)) 67 | # end def 68 | 69 | def download_chapter_body(self, chapter): 70 | '''Download body of a single chapter and return as clean html format''' 71 | logger.info('Visiting %s', chapter['url']) 72 | soup = self.get_soup(chapter['url']) 73 | body = soup.select_one('.read-main .read-context') 74 | 75 | self.blacklist_patterns = [ 76 | r'^Refresh time: \d+-\d+-\d+$' 77 | ] 78 | self.clean_contents(body) 79 | 80 | return str(body) 81 | # end def 82 | # end class 83 | -------------------------------------------------------------------------------- /lncrawl/bots/_sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..core.app import App 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | # TODO: It is recommended to implemented all methods. But you can skip those 8 | # Which return values by default. 9 | 10 | 11 | class SampleBot: 12 | def start(self): 13 | # TODO: must be implemented 14 | # Start processing using this bot. It should use self methods to take 15 | # inputs and self.app methods to process them. 16 | # 17 | self.app = App() 18 | self.app.initialize() 19 | # 20 | # Checkout console.py for a sample implementation 21 | # end def 22 | 23 | def get_novel_url(self): 24 | # Returns a novel page url or a query 25 | pass 26 | # end def 27 | 28 | def get_crawlers_to_search(self): 29 | # Returns user choice to search the choosen sites for a novel 30 | pass 31 | # end def 32 | 33 | def choose_a_novel(self): 34 | # The search_results is an array of (novel_title, novel_url). 35 | # This method should return a single novel_url only 36 | # 37 | # By default, returns the first search_results. Implemented it to 38 | # handle multiple search_results 39 | pass 40 | # end def 41 | 42 | def get_login_info(self): 43 | # By default, returns None to skip login 44 | pass 45 | # end if 46 | 47 | def get_output_path(self): 48 | # You should return a valid absolute path. The parameter suggested_path 49 | # is valid but not gurranteed to exists. 
50 | # 51 | # NOTE: If you do not want to use any pre-downloaded files, remove all 52 | # contents inside of your selected output directory. 53 | # 54 | # By default, returns a valid existing path from suggested_path 55 | pass 56 | # end def 57 | 58 | def get_output_formats(self): 59 | # The keys should be from from `self.output_formats`. Each value 60 | # corresponding a key defines whether create output in that format. 61 | # 62 | # By default, it returns all True to all of the output formats. 63 | pass 64 | # end def 65 | 66 | def should_pack_by_volume(self): 67 | # By default, returns False to generate a single file 68 | pass 69 | # end def 70 | 71 | def get_range_selection(self): 72 | # Should return a key from `self.selections` array 73 | pass 74 | # end def 75 | 76 | def get_range_using_urls(self): 77 | # Should return a list of chapters to download 78 | pass 79 | # end def 80 | 81 | def get_range_using_index(self): 82 | # Should return a list of chapters to download 83 | pass 84 | # end def 85 | 86 | def get_range_from_volumes(self): 87 | # Should return a list of chapters to download 88 | pass 89 | # end def 90 | 91 | def get_range_from_chapters(self): 92 | # Should return a list of chapters to download 93 | pass 94 | # end def 95 | # end class 96 | -------------------------------------------------------------------------------- /lncrawl/sources/mangatoon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import ast 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('MANGATOON_MOBI') 9 | 10 | book_url = 'https://mangatoon.mobi/%s/detail/%s/episodes' 11 | search_url = 'https://mangatoon.mobi/%s/search?word=%s' 12 | 13 | 14 | class MangatoonMobiCrawler(Crawler): 15 | base_url = 'https://mangatoon.mobi/' 16 | 17 | def initialize(self): 18 | self.home_url = 'https://mangatoon.mobi' 19 | # end def 20 | 21 | def read_novel_info(self): 22 | '''Get novel title, autor, cover etc''' 23 | self.novel_id = self.novel_url.split('/')[5] 24 | logger.info('Novel Id: %s', self.novel_id) 25 | 26 | novel_region = self.novel_url.split('/')[3] 27 | 28 | self.novel_url = book_url % (novel_region,self.novel_id) 29 | logger.debug('Visiting %s', self.novel_url) 30 | soup = self.get_soup(self.novel_url) 31 | 32 | self.novel_title =soup.select_one('h1.comics-title').text 33 | logger.info('Novel title: %s', self.novel_title) 34 | 35 | try: 36 | self.novel_cover = self.absolute_url( 37 | soup.select_one('.detail-top-right img')['src']) 38 | logger.info('Novel cover: %s', self.novel_cover) 39 | except Exception: 40 | logger.debug('Failed to get cover: %s', self.novel_url) 41 | # end try 42 | 43 | self.novel_author = soup.select_one('.created-by').text 44 | logger.info('Novel author: %s', self.novel_author) 45 | 46 | for a in soup.select('a.episode-item'): 47 | chap_id = len(self.chapters) + 1 48 | if len(self.chapters) % 100 == 0: 49 | vol_id = chap_id//100 + 1 50 | vol_title = 'Volume ' + str(vol_id) 51 | self.volumes.append({ 52 | 'id': vol_id, 53 | 'title': vol_title, 54 | }) 55 | # end if 56 | self.chapters.append({ 57 | 'id': chap_id, 58 | 'volume': vol_id, 59 | 'url': self.absolute_url(a['href']), 60 | 'title': a.select_one('.episode-title').text.strip() or ('Chapter %d' % chap_id), 61 | }) 62 | # end for 63 | # end def 64 | 65 | def download_chapter_body(self, chapter): 66 | '''Download body of a single chapter and return as clean html format''' 67 | 
logger.info('Downloading %s', chapter['url']) 68 | soup = self.get_soup(chapter['url']) 69 | 70 | script = soup.find("script", text=re.compile("initialValue\s+=")) 71 | initialValue = re.search('var initialValue = (?P<value>.*);', script.string) 72 | content = initialValue.group('value') 73 | chapter_content = ast.literal_eval(content) 74 | chapter_content = [p.replace('\-', '-') for p in chapter_content] 75 | 76 | 77 | text = '<p>' + '</p><p>'.join(chapter_content) + '</p>
' 78 | # end if 79 | return text.strip() 80 | # end def 81 | # end class 82 | -------------------------------------------------------------------------------- /lncrawl/sources/rewayatclub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from concurrent import futures 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('REWAYAT_CLUB') 9 | 10 | 11 | class RewayatClubCrawler(Crawler): 12 | base_url = 'https://rewayat.club/' 13 | 14 | def read_novel_info(self): 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.is_rtl = True 19 | 20 | self.novel_title = soup.select_one('h1.card-header').text.strip() 21 | logger.info('Novel title: %s', self.novel_title) 22 | 23 | self.novel_cover = self.absolute_url( 24 | soup.select_one('.card-body .align-middle img')['src']) 25 | logger.info('Novel cover: %s', self.novel_cover) 26 | 27 | self.novel_author = soup.select_one( 28 | '.card-body table td a[href*="/user/"]').text.strip() 29 | logger.info('Novel author: %s', self.novel_author) 30 | 31 | page_count = len(soup.select( 32 | '.card-footer select.custom-select option')) 33 | logger.info('Total pages: %d', page_count) 34 | 35 | logger.info('Getting chapters...') 36 | futures_to_check = { 37 | self.executor.submit(self.download_chapter_list, i + 1): str(i) 38 | for i in range(page_count) 39 | } 40 | temp_chapters = dict() 41 | for future in futures.as_completed(futures_to_check): 42 | page = int(futures_to_check[future]) 43 | temp_chapters[page] = future.result() 44 | # end for 45 | 46 | logger.info('Building sorted chapter list...') 47 | volumes = set() 48 | for page in sorted(temp_chapters.keys()): 49 | for chap in temp_chapters[page]: 50 | chap['id'] = 1 + len(self.chapters) 51 | chap['volume'] = 1 + len(self.chapters) // 100 52 | volumes.add(chap['volume']) 53 | self.chapters.append(chap) 54 | # end for 55 | # end for 56 | 57 | self.volumes = [{'id': x} for x in volumes] 58 | # end def 59 | 60 | def download_chapter_list(self, page_no): 61 | chapter_url = self.novel_url + ('?page=%d' % page_no) 62 | logger.info('Visiting %s', chapter_url) 63 | soup = self.get_soup(chapter_url) 64 | 65 | chapters = [] 66 | for a in soup.select('.card a[href*="/novel/"]'): 67 | chapters.append({ 68 | 'url': self.absolute_url(a['href']), 69 | 'title': a.select_one('div p').text.strip(), 70 | }) 71 | # end for 72 | return chapters 73 | # end def 74 | 75 | def download_chapter_body(self, chapter): 76 | '''Download body of a single chapter and return as clean html format.''' 77 | logger.info('Downloading %s', chapter['url']) 78 | soup = self.get_soup(chapter['url']) 79 | paras = soup.select('.card .card-body p') 80 | paras = [str(p) for p in paras if p.text.strip()] 81 | return ''.join(paras) 82 | # end def 83 | # end class 84 | -------------------------------------------------------------------------------- /lncrawl/sources/shinsori.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('SHINSORI') 8 | 9 | 10 | class ShinsoriCrawler(Crawler): 11 | base_url = 'https://www.shinsori.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | 
self.novel_title = soup.select_one('span.the-section-title').text.strip() 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = None 22 | logger.info('Novel cover: %s', self.novel_cover) 23 | 24 | self.novel_author = 'Author : %s, Translator: Shinsori' % soup.select( 25 | 'div.entry.clearfix p strong')[1].next_sibling.strip() 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | # get pagination range 29 | p_range = int(soup.select('ul.lcp_paginator li')[-2].text) 30 | 31 | chapters = [] 32 | # get chapter list by looping pagination range 33 | for x in range(p_range): 34 | p_url = '%s?lcp_page0=%d#lcp_instance_0 x+1' % (self.novel_url, x+1) 35 | p_soup = self.get_soup(p_url) 36 | chapters.extend(p_soup.select('ul.lcp_catlist')[1].select('li a')) 37 | # end for 38 | 39 | for x in chapters: 40 | chap_id = len(self.chapters) + 1 41 | vol_id = len(self.chapters)//100 + 1 42 | self.chapters.append({ 43 | 'id': chap_id, 44 | 'volume': vol_id, 45 | 'url': self.absolute_url(x['href']), 46 | 'title': x['title'] or ('Chapter %d' % chap_id), 47 | }) 48 | # end for 49 | 50 | self.volumes = [ 51 | {'id': x + 1} 52 | for x in range(len(self.chapters) // 100 + 1) 53 | ] 54 | # end def 55 | 56 | def download_chapter_body(self, chapter): 57 | '''Download body of a single chapter and return as clean html format.''' 58 | logger.info('Downloading %s', chapter['url']) 59 | soup = self.get_soup(chapter['url']) 60 | 61 | logger.debug(soup.title.string) 62 | 63 | content = soup.select_one('div.entry-content') 64 | 65 | # remove div with no class 66 | for item in content.findAll('div', attrs={'class': None}): 67 | item.decompose() 68 | 69 | # remove style 70 | for item in content.findAll('style'): 71 | item.decompose() 72 | 73 | subs = 'tab' 74 | # remove all div that has class but not relevant 75 | for item in content.findAll('div'): 76 | res = [x for x in item['class'] if re.search(subs, x)] 77 | if len(res) == 0: 78 | item.extract() 79 | 80 | # remove p with attribute style 81 | for item in content.findAll('p'): 82 | if item.has_attr('style'): 83 | item.decompose() 84 | 85 | return str(content) 86 | # end def 87 | # end class 88 | -------------------------------------------------------------------------------- /lncrawl/sources/wuxiaonline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WUXIA_ONLINE') 8 | search_url = 'https://wuxiaworld.online/search.ajax?type=&query=%s' 9 | 10 | 11 | class WuxiaOnlineCrawler(Crawler): 12 | base_url = 'https://wuxiaworld.online/' 13 | 14 | # DISABLING DUE TO CLOUDEFLARE CAPTCHA CHALLENGE 15 | # def search_novel(self, query): 16 | # '''Gets a list of {title, url} matching the given query''' 17 | # soup = self.get_soup(search_url % query) 18 | 19 | # results = [] 20 | # for novel in soup.select('li'): 21 | # a = novel.select_one('.resultname a') 22 | # info = novel.select_one('a:nth-of-type(2)') 23 | # info = info.text.strip() if info else '' 24 | # results.append({ 25 | # 'title': a.text.strip(), 26 | # 'url': self.absolute_url(a['href']), 27 | # 'info': 'Latest: %s' % info, 28 | # }) 29 | # # end for 30 | 31 | # return results 32 | # # end def 33 | 34 | def read_novel_info(self): 35 | '''Get novel title, autor, cover etc''' 36 | url = self.novel_url 37 | logger.debug('Visiting %s', url) 38 | soup = self.get_soup(url) 39 | self.novel_title = 
soup.select_one('h1.entry-title').text 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | # self.novel_author = soup.select_one('#maininfo p').text.strip() 43 | # self.novel_author = re.sub(r'^Author[^\w]+', '', self.novel_author).strip() 44 | # logger.info('Novel author: %s', self.novel_author) 45 | 46 | self.novel_cover = self.absolute_url( 47 | soup.select_one('.info_image img')['src']) 48 | logger.info('Novel cover: %s', self.novel_cover) 49 | 50 | last_vol = -1 51 | for a in reversed(soup.select('.chapter-list .row span a')): 52 | chap_id = len(self.chapters) + 1 53 | vol_id = 1 + (chap_id - 1) // 100 54 | volume = {'id': vol_id, 'title': ''} 55 | if last_vol != vol_id: 56 | self.volumes.append(volume) 57 | last_vol = vol_id 58 | # end if 59 | self.chapters.append({ 60 | 'id': chap_id, 61 | 'volume': vol_id, 62 | 'title': a['title'], 63 | 'url': self.absolute_url(a['href']), 64 | }) 65 | # end for 66 | 67 | logger.info('%d chapters and %d volumes found', 68 | len(self.chapters), len(self.volumes)) 69 | # end def 70 | 71 | def download_chapter_body(self, chapter): 72 | '''Download body of a single chapter and return as clean html format.''' 73 | logger.info('Downloading %s', chapter['url']) 74 | soup = self.get_soup(chapter['url']) 75 | 76 | parts = soup.select_one('#list_chapter .content-area') 77 | body = self.extract_contents(parts) 78 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 79 | # end def 80 | # end class 81 | -------------------------------------------------------------------------------- /lncrawl/sources/crescentmoon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('CRESCENTMOON') 9 | 10 | 11 | class CrescentMoonCrawler(Crawler): 12 | base_url = 'https://crescentmoon.blog/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.select_one('div.entry-content p a')['href']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | self.novel_author = soup.select('div.entry-content p')[2].text.strip() 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | a = soup.select('div.entry-content p') 30 | for idx, item in enumerate(a): 31 | if "table of contents" in item.text.strip().lower(): 32 | toc = a[idx+1] 33 | 34 | chapters = toc.findAll('a') 35 | 36 | for x in chapters: 37 | chap_id = len(self.chapters) + 1 38 | if len(self.chapters) % 100 == 0: 39 | vol_id = chap_id//100 + 1 40 | vol_title = 'Volume ' + str(vol_id) 41 | self.volumes.append({ 42 | 'id': vol_id, 43 | 'title': vol_title, 44 | }) 45 | # end if 46 | self.chapters.append({ 47 | 'id': chap_id, 48 | 'volume': vol_id, 49 | 'url': self.absolute_url(x['href']), 50 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 51 | }) 52 | # end for 53 | # end def 54 | 55 | def download_chapter_body(self, chapter): 56 | '''Download body of a single chapter and return as clean html format.''' 57 | logger.info('Downloading %s', chapter['url']) 58 | soup = self.get_soup(chapter['url']) 59 | 60 | logger.debug(soup.title.string) 61 | 62 | # if soup.find("h1", {"class": "entry-title"}).text.strip(): 63 | # chapter['title'] = soup.find("h1", {"class": "entry-title"}).text.strip() 64 | # else: 65 | # chapter['title'] = chapter['title'] 66 | # end if 67 | 68 | #contents = soup.select('div.entry-content p') 69 | #contents = contents[:-1] 70 | #body = self.extract_contents(contents) 71 | # return '

<p>' + '</p><p>'.join(body) + '</p>

' 72 | # return str(contents) 73 | 74 | body = [] 75 | contents = soup.select('div.entry-content p') 76 | contents = contents[:-1] 77 | for p in contents: 78 | para = ' '.join(self.extract_contents(p)) 79 | if len(para): 80 | body.append(para) 81 | # end if 82 | # end for 83 | 84 | return '

<p>%s</p>' % '</p><p>

'.join(body) 85 | # end def 86 | # end class 87 | -------------------------------------------------------------------------------- /lncrawl/sources/meionovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('MEIONOVEL') 9 | 10 | 11 | class MeionovelCrawler(Crawler): 12 | base_url = 'https://meionovel.id/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = ' '.join([ 20 | str(x) 21 | for x in soup.select_one('.post-title h3').contents 22 | if not x.name 23 | ]).strip() 24 | logger.info('Novel title: %s', self.novel_title) 25 | 26 | self.novel_cover = self.absolute_url( 27 | soup.select_one('.summary_image img')['data-src']) 28 | logger.info('Novel cover: %s', self.novel_cover) 29 | 30 | author = soup.find('div', {'class': 'author-content'}).findAll('a') 31 | if len(author) == 2: 32 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 33 | else: 34 | self.novel_author = author[0].text 35 | logger.info('Novel author: %s', self.novel_author) 36 | 37 | 38 | content_area = soup.select_one(' .page-content-listing') 39 | 40 | for span in content_area.findAll('span'): 41 | span.decompose() 42 | 43 | chapters = content_area.select('ul.main li.wp-manga-chapter a') 44 | 45 | chapters.reverse() 46 | 47 | for a in chapters: 48 | chap_id = len(self.chapters) + 1 49 | vol_id = chap_id//100 + 1 50 | if len(self.chapters) % 100 == 0: 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': chap_id, 59 | 'volume': vol_id, 60 | 'url': self.absolute_url(a['href']), 61 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 62 | }) 63 | # end for 64 | # end def 65 | 66 | def download_chapter_body(self, chapter): 67 | '''Download body of a single chapter and return as clean html format.''' 68 | logger.info('Downloading %s', chapter['url']) 69 | soup = self.get_soup(chapter['url']) 70 | 71 | contents = soup.select_one('div.text-left') 72 | 73 | for img in contents.findAll('img'): 74 | if img.has_attr('data-lazy-src'): 75 | src_url = img['data-lazy-src'] 76 | parent = img.parent 77 | img.decompose() 78 | new_tag = soup.new_tag("img", src=src_url) 79 | parent.append(new_tag) 80 | 81 | if contents.h3: 82 | contents.h3.decompose() 83 | 84 | for codeblock in contents.findAll('div', {'class': 'code-block'}): 85 | codeblock.decompose() 86 | 87 | return str(contents) 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/utils/kindlegen_download.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import platform 4 | import tarfile 5 | import tempfile 6 | from io import BytesIO, FileIO 7 | from logging import Logger 8 | from shutil import rmtree 9 | from zipfile import ZipFile 10 | import requests 11 | 12 | logger = Logger('KINDLEGEN') 13 | 14 | WINDOWS_URL = 'http://kindlegen.s3.amazonaws.com/kindlegen_win32_v2_9.zip' 15 | MACOS_URL = 'http://kindlegen.s3.amazonaws.com/KindleGen_Mac_i386_v2_9.zip' 16 | LINUX_URL = 'http://kindlegen.s3.amazonaws.com/kindlegen_linux_2.6_i386_v2_9.tar.gz' 17 | 18 | 19 | def 
get_url_by_platform(): 20 | if platform.system() == 'Linux': 21 | return LINUX_URL 22 | elif platform.system() == 'Darwin': 23 | return MACOS_URL 24 | elif platform.system() == 'Windows': 25 | return WINDOWS_URL 26 | else: 27 | raise Exception('Unrecognized platform') 28 | # end if 29 | # end def 30 | 31 | 32 | def extract_kindlegen_file(extractor, file_list): 33 | logger.debug(file_list) 34 | home = os.path.expanduser('~') 35 | if file_list.count('kindlegen') == 1: 36 | extractor('kindlegen', path=home) 37 | logger.info('Extracted kindlegen to %s', home) 38 | elif file_list.count('kindlegen.exe') == 1: 39 | extractor('kindlegen.exe', path=home) 40 | logger.info('Extracted kindlegen.exe to %s', home) 41 | os.rename(os.path.join(home, 'kindlegen.exe'), 42 | os.path.join(home, 'kindlegen')) 43 | logger.info('Renamed kindlegen.exe to kindlegen') 44 | else: 45 | raise Exception('Kindlegen executable was not found.') 46 | # end if 47 | # end def 48 | 49 | 50 | def download_kindlegen(): 51 | # Download the file 52 | url = get_url_by_platform() 53 | print('Downloading kindlegen...') 54 | byte_array = requests.get(url).content 55 | 56 | # Extract contents 57 | print('Extracting kindlegen...') 58 | if url.endswith('.zip'): 59 | with BytesIO(byte_array) as byte_stream: 60 | with ZipFile(byte_stream) as file: 61 | extract_kindlegen_file(file.extract, file.namelist()) 62 | # end with 63 | # end with 64 | elif url.endswith('.tar.gz'): 65 | temp_file = tempfile.mktemp('.tar.gz') 66 | try: 67 | logger.info('Writing content to %s', temp_file) 68 | with FileIO(temp_file, 'w') as file: 69 | file.write(byte_array) 70 | # end with 71 | logger.info('Opening %s as archive', temp_file) 72 | with tarfile.open(temp_file) as file: 73 | extract_kindlegen_file(file.extract, file.getnames()) 74 | # end with 75 | finally: 76 | os.remove(temp_file) 77 | logger.info('%s removed.', temp_file) 78 | # end finally 79 | # end if 80 | # end def 81 | 82 | 83 | def retrieve_kindlegen(): 84 | # Check kindlegen availability 85 | home = os.path.expanduser('~') 86 | kindlegen_file = os.path.join(home, 'kindlegen') 87 | if os.path.exists(kindlegen_file): 88 | return kindlegen_file 89 | # end if 90 | return None 91 | # end def 92 | -------------------------------------------------------------------------------- /lncrawl/binders/web.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | 5 | from ..assets.html_style import get_value as get_css_style 6 | 7 | logger = logging.getLogger('WEB_BINDER') 8 | 9 | 10 | def bind_html_chapter(chapter, prev_chapter, next_chapter, direction='ltr'): 11 | prev_button = '%s.html' % ( 12 | str(prev_chapter['id']).rjust(5, '0')) if prev_chapter else '#' 13 | next_button = '%s.html' % str(next_chapter['id']).rjust( 14 | 5, '0') if next_chapter else '#' 15 | button_group = '

' 20 | 21 | script = ''' 22 | window.addEventListener('scroll', function(e) { 23 | try { 24 | var scroll = window.scrollY; 25 | var height = document.body.scrollHeight - window.innerHeight + 10; 26 | var percent = Math.round(100.0 * scroll / height); 27 | document.getElementById('readpos').innerText = percent + '%'; 28 | } catch (err) { 29 | // ignore 30 | } 31 | }) 32 | ''' 33 | 34 | main_body = chapter['body'] 35 | if not main_body: 36 | main_body = '

%s

No contents

' % chapter['title'] 37 | # end if 38 | 39 | html = '\n' 40 | html += '' % direction 41 | html += '' 42 | html += '' 43 | html += '%s' % chapter['title'] 44 | html += '' % get_css_style() 45 | html += '' % script 46 | html += '
' 47 | html += button_group 48 | html += '
%s
' % main_body 49 | html += button_group 50 | html += '
' 51 | html += '
<div id="readpos">0%</div>
' 52 | html += '' 53 | 54 | file_name = '%s.html' % str(chapter['id']).rjust(5, '0') 55 | return html, file_name 56 | # end def 57 | 58 | 59 | def make_webs(app, data): 60 | web_files = [] 61 | for vol in data: 62 | dir_name = os.path.join(app.output_path, 'web', vol) 63 | os.makedirs(dir_name, exist_ok=True) 64 | for i in range(len(data[vol])): 65 | chapter = data[vol][i] 66 | prev_chapter = data[vol][i - 1] if i > 0 else None 67 | next_chapter = data[vol][i + 1] if i + 1 < len(data[vol]) else None 68 | direction = 'rtl' if app.crawler.is_rtl else 'ltr' 69 | html, file_name = bind_html_chapter( 70 | chapter, prev_chapter, next_chapter, direction) 71 | 72 | file_name = os.path.join(dir_name, file_name) 73 | with open(file_name, 'w', encoding='utf-8') as file: 74 | file.write(html) 75 | # end with 76 | web_files.append(file_name) 77 | # end for 78 | # end for 79 | print('Created: %d web files' % len(web_files)) 80 | return web_files 81 | # end def 82 | -------------------------------------------------------------------------------- /lncrawl/core/novel_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To search for novels in selected sources 4 | """ 5 | import os 6 | import logging 7 | from concurrent import futures 8 | 9 | from slugify import slugify 10 | from progress.bar import IncrementalBar 11 | 12 | from ..sources import crawler_list 13 | 14 | logger = logging.getLogger('SEARCH_NOVEL') 15 | 16 | 17 | def get_search_result(user_input, link): 18 | try: 19 | crawler = crawler_list[link] 20 | instance = crawler() 21 | instance.home_url = link.strip('/') 22 | results = instance.search_novel(user_input) 23 | logger.debug(results) 24 | logger.info('%d results from %s', len(results), link) 25 | return results 26 | except Exception: 27 | import traceback 28 | logger.debug(traceback.format_exc()) 29 | # end try 30 | return [] 31 | # end def 32 | 33 | 34 | def process_results(results): 35 | combined = dict() 36 | for result in results: 37 | key = slugify(result['title']) 38 | if len(key) <= 1: 39 | continue 40 | elif key not in combined: 41 | combined[key] = [] 42 | # end if 43 | combined[key].append(result) 44 | # end for 45 | 46 | processed = [] 47 | for key, value in combined.items(): 48 | value.sort(key=lambda x: x['url']) 49 | processed.append({ 50 | 'id': key, 51 | 'title': value[0]['title'], 52 | 'novels': value 53 | }) 54 | # end for 55 | 56 | processed.sort(key=lambda x: -len(x['novels'])) 57 | 58 | return processed[:15] # Control the number of results 59 | # end def 60 | 61 | 62 | def search_novels(app): 63 | executor = futures.ThreadPoolExecutor(10) 64 | 65 | # Add future tasks 66 | checked = {} 67 | futures_to_check = {} 68 | for link in app.crawler_links: 69 | crawler = crawler_list[link] 70 | if crawler in checked: 71 | logger.info('A crawler for "%s" already exists', link) 72 | continue 73 | # end if 74 | checked[crawler] = True 75 | futures_to_check[ 76 | executor.submit( 77 | get_search_result, 78 | app.user_input, 79 | link 80 | ) 81 | ] = str(crawler) 82 | # end for 83 | 84 | bar = IncrementalBar('Searching', max=len(futures_to_check.keys())) 85 | bar.start() 86 | 87 | if os.getenv('debug_mode') == 'yes': 88 | bar.next = lambda: None # Hide in debug mode 89 | # end if 90 | 91 | # Resolve future tasks 92 | app.progress = 0 93 | combined_results = [] 94 | for future in futures.as_completed(futures_to_check): 95 | combined_results += future.result() 96 | app.progress += 1 97 | bar.next() 98 | # end 
for 99 | 100 | # Process combined search results 101 | app.search_results = process_results(combined_results) 102 | bar.clearln() 103 | bar.finish() 104 | print('Found %d results' % len(app.search_results)) 105 | 106 | executor.shutdown() 107 | # end def 108 | -------------------------------------------------------------------------------- /lncrawl/sources/kissnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('KISS-NOVEL') 8 | 9 | 10 | class KissNovelCrawler(Crawler): 11 | base_url = 'https://kiss-novel.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = ' '.join([ 19 | str(x) 20 | for x in soup.select_one('.post-title h1').contents 21 | if not x.name 22 | ]).strip() 23 | logger.info('Novel title: %s', self.novel_title) 24 | 25 | self.novel_cover = self.absolute_url( 26 | soup.select_one('.summary_image img')['src']) 27 | logger.info('Novel cover: %s', self.novel_cover) 28 | 29 | author = soup.find('div', {'class': 'author-content'}).findAll('a') 30 | if len(author) == 2: 31 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 32 | else: 33 | self.novel_author = author[0].text 34 | logger.info('Novel author: %s', self.novel_author) 35 | 36 | latest_chapter = soup.select('div.post-content_item ul li a')[0].text 37 | chapter_count = [int(i) for i in latest_chapter.split() if i.isdigit()] 38 | page_count = (chapter_count)[0]//10+1 39 | chapters_page_url = '%s/%s#chapter-section' 40 | 41 | chapters = [] 42 | 43 | for i in range(page_count): 44 | url = chapters_page_url % (self.novel_url, str(i+1)) 45 | logger.debug('Visiting %s', url) 46 | soup = self.get_soup(url) 47 | chapters.extend(soup.select('ul.main li.wp-manga-chapter a')) 48 | # end for 49 | chapters.reverse() 50 | 51 | for a in chapters: 52 | chap_id = len(self.chapters) + 1 53 | vol_id = chap_id//100 + 1 54 | if len(self.chapters) % 100 == 0: 55 | vol_title = 'Volume ' + str(vol_id) 56 | self.volumes.append({ 57 | 'id': vol_id, 58 | 'title': vol_title, 59 | }) 60 | # end if 61 | self.chapters.append({ 62 | 'id': chap_id, 63 | 'volume': vol_id, 64 | 'url': self.absolute_url(a['href']), 65 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 66 | }) 67 | # end for 68 | # end def 69 | 70 | def download_chapter_body(self, chapter): 71 | '''Download body of a single chapter and return as clean html format.''' 72 | logger.info('Downloading %s', chapter['url']) 73 | soup = self.get_soup(chapter['url']) 74 | 75 | contents = soup.select('div.reading-content p') 76 | 77 | body = [str(p) for p in contents if p.text.strip()] 78 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 79 | 80 | # if contents.h3: 81 | # contents.h3.decompose() 82 | 83 | # for codeblock in contents.findAll('div', {'class': 'code-block'}): 84 | # codeblock.decompose() 85 | 86 | # return str(contents) 87 | # end def 88 | # end class 89 | -------------------------------------------------------------------------------- /lncrawl/sources/bestlightnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from bs4 import BeautifulSoup 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVEL_ONLINE_FREE') 8 | search_url = 'https://bestlightnovel.com/getsearchstory' 9 | novel_page_url = 'https://bestlightnovel.com/novel/%s' 10 | 11 | 12 | class BestLightNovel(Crawler): 13 | base_url = 'https://bestlightnovel.com/' 14 | 15 | def search_novel(self, query): 16 | response = self.submit_form(search_url, { 17 | 'searchword': query 18 | }) 19 | data = response.json() 20 | 21 | results = [] 22 | for novel in data: 23 | titleSoup = BeautifulSoup(novel['name'], 'lxml') 24 | results.append({ 25 | 'title': titleSoup.body.text.title(), 26 | 'url': novel_page_url % novel['nameunsigned'], 27 | 'info': 'Latest: %s' % novel['lastchapter'], 28 | }) 29 | # end for 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | # self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | self.novel_title = soup.select_one('div.entry-header h1').text.strip() 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | try: 43 | novel_data = self.submit_form(search_url, { 44 | 'searchword': self.novel_title 45 | }).json() 46 | self.novel_cover = novel_data[0]['image'] 47 | self.novel_author = novel_data[0]['author'] 48 | except Exception: 49 | logger.debug('Failed getting novel info.\n%s', Exception) 50 | # end try 51 | 52 | for a in reversed(soup.select('#list_chapter .chapter-list a')): 53 | chap_id = len(self.chapters) + 1 54 | vol_id = len(self.chapters) // 100 + 1 55 | if len(self.chapters) % 100 == 0: 56 | self.volumes.append({'id': vol_id}) 57 | # end if 58 | self.chapters.append({ 59 | 'id': chap_id, 60 | 'volume': vol_id, 61 | 'title': a.text.strip(), 62 | 'url': self.absolute_url(a['href']), 63 | }) 64 | # end for 65 | # end def 66 | 67 | def download_chapter_body(self, chapter): 68 | '''Download body of a single chapter and return as clean html format.''' 69 | logger.info('Downloading %s', chapter['url']) 70 | soup = self.get_soup(chapter['url']) 71 | 72 | logger.debug(soup.title.string) 73 | 74 | if 'Chapter' in soup.select_one('h1').text: 75 | chapter['title'] = soup.select_one('h1').text 76 | else: 77 | chapter['title'] = chapter['title'] 78 | # end if 79 | 80 | self.blacklist_patterns = [ 81 | r'^translat(ed by|or)', 82 | r'(volume|chapter) .?\d+', 83 | ] 84 | 85 | contents = soup.select_one('#vung_doc') 86 | body = self.extract_contents(contents) 87 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/sources/novelonlinefull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from bs4 import BeautifulSoup 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVEL_ONLINE_FULL') 8 | search_url = 'https://novelonlinefull.com/getsearchstory' 9 | novel_page_url = 'https://novelonlinefull.com/novel/%s' 10 | 11 | 12 | class NovelOnlineFullCrawler(Crawler): 13 | base_url = 'https://novelonlinefull.com/' 14 | 15 | def search_novel(self, query): 16 | response = self.submit_form(search_url, { 17 | 'searchword': query 18 | }) 19 | data = response.json() 20 | 21 | results = [] 22 | for novel in data: 23 | titleSoup = BeautifulSoup(novel['name'], 'lxml') 24 | results.append({ 25 | 'title': titleSoup.body.text.title(), 26 | 'url': novel_page_url % novel['nameunsigned'], 27 | 'info': 'Latest: %s' % novel['lastchapter'], 28 | }) 29 | # end for 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | # self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | self.novel_title = soup.select_one('div.entry-header h1').text.strip() 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | try: 43 | novel_data = self.submit_form(search_url, { 44 | 'searchword': self.novel_title 45 | }).json() 46 | self.novel_cover = novel_data[0]['image'] 47 | self.novel_author = novel_data[0]['author'] 48 | except Exception: 49 | logger.debug('Failed getting novel info.\n%s', Exception) 50 | # end try 51 | 52 | for a in reversed(soup.select('#list_chapter .chapter-list a')): 53 | chap_id = len(self.chapters) + 1 54 | vol_id = len(self.chapters) // 100 + 1 55 | if len(self.chapters) % 100 == 0: 56 | self.volumes.append({'id': vol_id}) 57 | # end if 58 | self.chapters.append({ 59 | 'id': chap_id, 60 | 'volume': vol_id, 61 | 'title': a.text.strip(), 62 | 'url': self.absolute_url(a['href']), 63 | }) 64 | # end for 65 | # end def 66 | 67 | def download_chapter_body(self, chapter): 68 | '''Download body of a single chapter and return as clean html format.''' 69 | logger.info('Downloading %s', chapter['url']) 70 | soup = self.get_soup(chapter['url']) 71 | 72 | logger.debug(soup.title.string) 73 | 74 | if 'Chapter' in soup.select_one('h1').text: 75 | chapter['title'] = soup.select_one('h1').text 76 | else: 77 | chapter['title'] = chapter['title'] 78 | # end if 79 | 80 | self.blacklist_patterns = [ 81 | r'^translat(ed by|or)', 82 | r'(volume|chapter) .?\d+', 83 | ] 84 | 85 | contents = soup.select_one('#vung_doc') 86 | body = self.extract_contents(contents) 87 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/sources/boxnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('BOXNOVEL') 8 | search_url = 'https://boxnovel.com/?s=%s&post_type=wp-manga&author=&artist=&release=' 9 | 10 | 11 | class BoxNovelCrawler(Crawler): 12 | base_url = 'https://boxnovel.com/' 13 | 14 | def search_novel(self, query): 15 | query = query.lower().replace(' ', '+') 16 | soup = self.get_soup(search_url % query) 17 | 18 | results = [] 19 | for tab in soup.select('.c-tabs-item__content'): 20 | a = tab.select_one('.post-title h4 a') 21 | latest = tab.select_one('.latest-chap .chapter a').text 22 | votes = tab.select_one('.rating .total_votes').text 23 | results.append({ 24 | 'title': a.text.strip(), 25 | 'url': self.absolute_url(a['href']), 26 | 'info': '%s | Rating: %s' % (latest, votes), 27 | }) 28 | # end for 29 | 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | self.novel_title = ' '.join([ 39 | str(x) 40 | for x in soup.select_one('.post-title h3').contents 41 | if not x.name 42 | ]).strip() 43 | logger.info('Novel title: %s', self.novel_title) 44 | 45 | probable_img = soup.select_one('.summary_image img') 46 | if probable_img: 47 | self.novel_cover = self.absolute_url(probable_img['src']) 48 | logger.info('Novel cover: %s', self.novel_cover) 49 | 50 | author = soup.select('.author-content a') 51 | if len(author) == 2: 52 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 53 | else: 54 | self.novel_author = author[0].text 55 | logger.info('Novel author: %s', self.novel_author) 56 | 57 | chapters = soup.select('ul.main li.wp-manga-chapter a') 58 | for a in reversed(chapters): 59 | chap_id = len(self.chapters) + 1 60 | vol_id = chap_id//100 + 1 61 | if len(self.chapters) % 100 == 0: 62 | vol_title = 'Volume ' + str(vol_id) 63 | self.volumes.append({ 64 | 'id': vol_id, 65 | 'title': vol_title, 66 | }) 67 | # end if 68 | self.chapters.append({ 69 | 'id': chap_id, 70 | 'volume': vol_id, 71 | 'url': self.absolute_url(a['href']), 72 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 73 | }) 74 | # end for 75 | # end def 76 | 77 | def download_chapter_body(self, chapter): 78 | '''Download body of a single chapter and return as clean html format.''' 79 | logger.info('Downloading %s', chapter['url']) 80 | soup = self.get_soup(chapter['url']) 81 | 82 | contents = soup.select_one('div.text-left') 83 | for bad in contents.select('h3, .code-block, script, .adsbygoogle'): 84 | bad.decompose() 85 | 86 | return str(contents) 87 | # end def 88 | # end class 89 | -------------------------------------------------------------------------------- /lncrawl/sources/webnovelindonesia.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from concurrent import futures 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WEBNOVEL_INDONESIA') 8 | 9 | chapter_list_url = 'https://webnovelindonesia.com/wp-json/writerist/v1/chapters?category=%s&perpage=100&order=ASC&paged=%s' 10 | 11 | 12 | class WebnovelIndonesia(Crawler): 13 | 
base_url = 'https://webnovelindonesia.com/' 14 | 15 | def read_novel_info(self): 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.select_one('.breadcrumb .breadcrumb-item.active').text.strip() 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.select_one('.section-novel img[class*="lazy"]')['data-src']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | self.novel_author = soup.select_one('.section-novel li a[href*="/aut/"]').text.strip() 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | possible_chapter_pages = soup.select('#js-chpater-jump > div.jump-to') 30 | 31 | if not len(possible_chapter_pages): 32 | possible_chapter_pages = [{'data-paged': '1'}] 33 | # end if 34 | 35 | novel_id = soup.select_one('#sortable-table')['data-category'] 36 | 37 | logger.info('Downloading chapters...') 38 | futures_to_check = dict() 39 | for div in possible_chapter_pages: 40 | page = div['data-paged'] 41 | url = chapter_list_url % (novel_id, page) 42 | task = self.executor.submit(self.extract_chapter_list, url) 43 | futures_to_check[task] = page 44 | # end for 45 | 46 | temp_chapters = dict() 47 | for future in futures.as_completed(futures_to_check): 48 | page = int(futures_to_check[future]) 49 | temp_chapters[page] = future.result() 50 | # end for 51 | 52 | logger.info('Building sorted chapter list...') 53 | for page in sorted(temp_chapters.keys()): 54 | self.volumes.append({'id': page}) 55 | for chap in temp_chapters[page]: 56 | chap['volume'] = page 57 | chap['id'] = 1 + len(self.chapters) 58 | self.chapters.append(chap) 59 | # end for 60 | # end for 61 | # end def 62 | 63 | def extract_chapter_list(self, url): 64 | temp_list = [] 65 | logger.debug('Visiting: %s', url) 66 | data = self.get_json(url) 67 | for item in data: 68 | temp_list.append({ 69 | 'title': item['post_title'], 70 | 'url': self.absolute_url(item['permalink']), 71 | }) 72 | # end for 73 | return temp_list 74 | # end def 75 | 76 | def download_chapter_body(self, chapter): 77 | '''Download body of a single chapter and return as clean html format''' 78 | logger.info('Downloading %s', chapter['url']) 79 | soup = self.get_soup(chapter['url']) 80 | 81 | body = '' 82 | for p in soup.select('#content > p'): 83 | if p.text.strip(): 84 | body += str(p).strip() 85 | # end if 86 | # end for 87 | 88 | return body 89 | # end def 90 | # end class 91 | -------------------------------------------------------------------------------- /lncrawl/sources/translateindo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import quote, urlparse 5 | import urllib.parse 6 | from bs4 import BeautifulSoup 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('TRANSLATEINDO') 11 | 12 | #search_url = 'https://www.worldnovel.online/wp-json/writerist/v1/novel/search?keyword=%s' 13 | #chapter_list_url = "https://www.worldnovel.online/wp-json/writerist/v1/chapters?category=%s&perpage=4000&order=ASC&paged=1" 14 | 15 | 16 | class TranslateIndoCrawler(Crawler): 17 | base_url = 'https://www.translateindo.com/' 18 | 19 | # def search_novel(self, query): 20 | # data = self.get_json(search_url % quote(query)) 21 | 22 | # results = [] 23 | # for item in data: 24 | # results.append({ 25 | # 'url': item['permalink'], 26 | # 'title': item['post_title'], 27 | # }) 28 | # # end 
for 29 | 30 | # return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | logger.info('Novel title: %s', self.novel_title) 40 | 41 | possible_cover = soup.select_one('div.entry-content img')['src'] 42 | if possible_cover: 43 | self.novel_cover = self.absolute_url(possible_cover) 44 | # end if 45 | logger.info('Novel cover: %s', self.novel_cover) 46 | 47 | for span in soup.select('div.entry-content p span'): 48 | possible_author = re.sub(r'[\(\s\n\)]+', ' ', span.text, re.M).strip() 49 | if possible_author.startswith('Author:'): 50 | possible_author = re.sub('Author:', '', possible_author) 51 | self.novel_author = possible_author.strip() 52 | break 53 | # end if 54 | # end for 55 | logger.info('Novel author: %s', self.novel_author) 56 | 57 | for div in soup.select('.cl-lists .cl-block'): 58 | possible_vol = div.select_one('.cl-header') 59 | if not possible_vol: 60 | continue 61 | 62 | vol_title = possible_vol.text.strip() 63 | vol_id = len(self.volumes) + 1 64 | self.volumes.append({ 65 | 'id': vol_id, 66 | 'title': vol_title, 67 | }) 68 | 69 | for a in div.select('ol.cl-body li a'): 70 | chap_id = len(self.chapters) + 1 71 | self.chapters.append({ 72 | 'id': chap_id, 73 | 'volume': vol_id, 74 | 'url': self.absolute_url(a['href']), 75 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 76 | }) 77 | # end for 78 | # end for 79 | # end def 80 | 81 | def download_chapter_body(self, chapter): 82 | '''Download body of a single chapter and return as clean html format''' 83 | logger.info('Downloading %s', chapter['url']) 84 | soup = self.get_soup(chapter['url']) 85 | 86 | contents = soup.select('div.entry-content p') 87 | 88 | body = [str(p) for p in contents if p.text.strip()] 89 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 90 | # end def 91 | # end class 92 | -------------------------------------------------------------------------------- /setup_pyi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import platform 5 | import re 6 | import shlex 7 | import shutil 8 | import sys 9 | from pathlib import Path 10 | 11 | from PyInstaller import __main__ as pyi 12 | from setuptools.config import read_configuration 13 | 14 | ROOT = Path(__file__).parent 15 | unix_root = '/'.join(str(ROOT).split(os.sep)) 16 | site_packages = list(ROOT.glob('venv/**/site-packages'))[0] 17 | unix_site_packages = '/'.join(str(site_packages).split(os.sep)) 18 | 19 | 20 | def package(): 21 | output = str(ROOT / 'windows') 22 | shutil.rmtree(output, ignore_errors=True) 23 | os.makedirs(output, exist_ok=True) 24 | setup_command() 25 | pyi.run() 26 | shutil.rmtree(output, ignore_errors=True) 27 | # end def 28 | 29 | 30 | def setup_command(): 31 | command = 'pyinstaller ' 32 | command += '--onefile ' # onefile 33 | command += '--clean ' 34 | command += '--noconfirm ' 35 | command += '--name "lncrawl" ' 36 | command += '--icon "%s/res/lncrawl.ico" ' % unix_root 37 | command += '--distpath "%s" ' % str(ROOT / 'dist') 38 | command += '--specpath "%s" ' % str(ROOT / 'windows') 39 | command += '--workpath "%s" ' % str(ROOT / 'windows' / 'build') 40 | 41 | command += gather_data_files() 42 | command += gather_hidden_imports() 43 | command += '"%s/__main__.py" ' % unix_root 44 | 45 | print(command) 46 | print() 47 | 48 | sys.argv = shlex.split(command) 49 | # end def 50 | 51 | 52 | def gather_data_files(): 53 | command = '' 54 | 55 | # add data files of this project 56 | for f in (ROOT / 'lncrawl').glob('**/*.*'): 57 | src = str(f) 58 | src = '/'.join(src.split(os.sep)) 59 | dst = str(f.parent.relative_to(ROOT)) 60 | dst = '/'.join(dst.split(os.sep)) 61 | command += '--add-data "%s%s%s" ' % (src, os.pathsep, dst) 62 | # end for 63 | command += '--add-data "%s/lncrawl/VERSION%slncrawl" ' % (unix_root, os.pathsep) 64 | 65 | # add data files of other dependencies 66 | command += '--add-data "%s/cairosvg/VERSION%s." 
' % ( 67 | unix_site_packages, os.pathsep) 68 | command += '--add-data "%s/cairocffi/VERSION%scairocffi" ' % ( 69 | unix_site_packages, os.pathsep) 70 | command += '--add-data "%s/tinycss2/VERSION%stinycss2" ' % ( 71 | unix_site_packages, os.pathsep) 72 | command += '--add-data "%s/text_unidecode/data.bin%stext_unidecode" ' % ( 73 | unix_site_packages, os.pathsep) 74 | command += '--add-data "%s/cloudscraper%scloudscraper" ' % ( 75 | unix_site_packages, os.pathsep) 76 | command += '--add-data "%s/wcwidth/version.json%swcwidth" ' % ( 77 | unix_site_packages, os.pathsep) 78 | 79 | return command 80 | # end def 81 | 82 | 83 | def gather_hidden_imports(): 84 | command = '' 85 | 86 | # add hidden imports of this project 87 | for f in (ROOT / 'lncrawl' / 'sources').glob('*.py'): 88 | if os.path.isfile(f) and re.match(r'^([^_.][^.]+).py$', f.name): 89 | module_name = f.name[:-3] 90 | command += '--hidden-import "lncrawl.sources.%s" ' % module_name 91 | # end if 92 | # end for 93 | command += '--hidden-import "pkg_resources.py2_warn" ' 94 | 95 | return command 96 | # end def 97 | 98 | 99 | if __name__ == '__main__': 100 | package() 101 | # end if 102 | -------------------------------------------------------------------------------- /lncrawl/binders/calibre.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | import subprocess 5 | 6 | logger = logging.getLogger('CALIBRE_BINDER') 7 | 8 | EBOOK_CONVERT = 'ebook-convert' 9 | CALIBRE_LINK = 'https://calibre-ebook.com/download' 10 | 11 | 12 | def run_ebook_convert(*args): 13 | ''' 14 | Calls `ebook-convert` with given args 15 | Visit https://manual.calibre-ebook.com/generated/en/ebook-convert.html for argument list. 16 | ''' 17 | try: 18 | isdebug = os.getenv('debug_mode') == 'yes' 19 | with open(os.devnull, 'w') as dumper: 20 | subprocess.call( 21 | [EBOOK_CONVERT] + list(args), 22 | stdout=None if isdebug else dumper, 23 | stderr=None if isdebug else dumper, 24 | ) 25 | # end with 26 | return True 27 | except Exception: 28 | import traceback 29 | logger.debug(traceback.format_exc()) 30 | return False 31 | # end try 32 | # end def 33 | 34 | 35 | def epub_to_calibre(app, epub_file, out_fmt): 36 | if not os.path.exists(epub_file): 37 | return None 38 | # end if 39 | 40 | epub_path = os.path.dirname(epub_file) 41 | epub_file_name = os.path.basename(epub_file) 42 | file_name_without_ext = epub_file_name.replace('.epub', '') 43 | 44 | work_path = os.path.dirname(epub_path) 45 | out_path = os.path.join(work_path, out_fmt) 46 | out_file_name = file_name_without_ext + '.' + out_fmt 47 | out_file = os.path.join(out_path, out_file_name) 48 | 49 | os.makedirs(out_path, exist_ok=True) 50 | 51 | logger.debug('Converting "%s" to "%s"', epub_file, out_file) 52 | 53 | args = [ 54 | epub_file, 55 | out_file, 56 | '--unsmarten-punctuation', 57 | '--no-chapters-in-toc', 58 | '--title', file_name_without_ext, 59 | '--authors', app.crawler.novel_author, 60 | '--series', app.crawler.novel_title, 61 | '--publisher', app.crawler.home_url, 62 | '--book-producer', 'Lightnovel Crawler', 63 | '--enable-heuristics', '--disable-renumber-headings', 64 | ] 65 | if app.book_cover: 66 | args += ['--cover', app.book_cover] 67 | if out_fmt == 'pdf': 68 | args += [ 69 | '--paper-size', 'a4', 70 | '--pdf-page-numbers', 71 | '--pdf-hyphenate', 72 | '--pdf-header-template', '

⦗ _TITLE_ — _SECTION_ ⦘

', 73 | ] 74 | # end if 75 | 76 | run_ebook_convert(*args) 77 | 78 | if os.path.exists(out_file): 79 | print('Created: %s' % out_file_name) 80 | return out_file 81 | else: 82 | logger.error('[%s] conversion failed: %s', out_fmt, epub_file_name) 83 | return None 84 | # end if 85 | # end def 86 | 87 | 88 | def make_calibres(app, epubs, out_fmt): 89 | if out_fmt == 'epub' or not epubs: 90 | return epubs 91 | # end if 92 | 93 | if not run_ebook_convert('--version'): 94 | logger.error('Install Calibre to generate %s: %s', 95 | out_fmt, CALIBRE_LINK), 96 | return 97 | # end if 98 | 99 | out_files = [] 100 | for epub in epubs: 101 | out = epub_to_calibre(app, epub, out_fmt) 102 | out_files += [out] 103 | # end for 104 | 105 | return out_files 106 | # end def 107 | -------------------------------------------------------------------------------- /lncrawl/sources/zenithnovels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | import requests 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('ZENITH_NOVELS') 11 | 12 | novel_url = 'http://zenithnovels.com/%s/' 13 | 14 | 15 | class ZenithNovelsCrawler(Crawler): 16 | base_url = 'http://zenithnovels.com/' 17 | 18 | def read_novel_info(self): 19 | '''Get novel title, autor, cover etc''' 20 | self.novel_id = re.search( 21 | r'(?<=zenithnovels.com/)[^/]+', self.novel_url).group(0) 22 | logger.info('Novel id: %s', self.novel_id) 23 | 24 | url = novel_url % self.novel_id 25 | logger.debug('Visiting %s', url) 26 | soup = self.get_soup(url) 27 | 28 | self.novel_title = soup.select_one('article#the-post h1.name').text 29 | logger.info('Novel title: %s', self.novel_title) 30 | 31 | self.novel_cover = self.absolute_url(soup.select_one( 32 | 'article#the-post .entry img')['src']) 33 | logger.info('Novel cover: %s', self.novel_cover) 34 | 35 | while True: 36 | self.parse_chapter_list(soup) 37 | 38 | next_link = soup.select_one('ul.lcp_paginator a.lcp_nextlink') 39 | if next_link: 40 | soup = self.get_soup(next_link['href']) 41 | else: 42 | break 43 | # end if 44 | # end if 45 | 46 | self.chapters.sort(key=lambda x: x['volume'] * 1e6 + x['id']) 47 | self.volumes = [{'id': x, 'title': ''} for x in set(self.volumes)] 48 | # end def 49 | 50 | def parse_chapter_list(self, soup): 51 | for a in soup.select('ul.lcp_catlist li a'): 52 | ch_title = a['title'] 53 | ch_id = [int(''.join(x).strip()) for x in re.findall( 54 | r'((?<=ch) \d+)|((?<=chapter) \d+)', ch_title, re.IGNORECASE)] 55 | ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 56 | vol_id = [int(''.join(x).strip()) for x in re.findall( 57 | r'((?<=book) \d+)|((?<=volume) \d+)', ch_title, re.IGNORECASE)] 58 | vol_id = vol_id[0] if len(vol_id) else 1 + (ch_id - 1) // 100 59 | 60 | self.volumes.append(vol_id) 61 | self.chapters.append({ 62 | 'id': ch_id, 63 | 'volume': vol_id, 64 | 'title': ch_title, 65 | 'url': self.absolute_url(a['href']), 66 | }) 67 | # end for 68 | # end def 69 | 70 | def download_chapter_body(self, chapter): 71 | '''Download body of a single chapter and return as clean html format.''' 72 | logger.info('Downloading %s', chapter['url']) 73 | soup = self.get_soup(chapter['url']) 74 | 75 | entry = soup.select_one('article#the-post .entry') 76 | 77 | try: 78 | self.clean_contents(entry) 79 | for note in entry.select('.footnote'): 80 | note.decompose() 81 | # end for 82 | except Exception: 83 | pass 84 | # end try 85 | 86 | body = '' 87 | for tag 
in entry.children: 88 | if tag.name == 'p' and len(tag.text.strip()): 89 | p = ' '.join(self.extract_contents(tag)) 90 | if len(p.strip()): 91 | body += '<p>%s</p>
' % p 92 | # end if 93 | # end if 94 | # end for 95 | 96 | return body 97 | # end def 98 | # end class 99 | -------------------------------------------------------------------------------- /lncrawl/sources/litnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | 5 | logger = logging.getLogger('LITNET') 6 | search_url = 'https://litnet.com/en/search?q=%s' 7 | 8 | 9 | class LitnetCrawler(Crawler): 10 | base_url = 'https://litnet.com/' 11 | 12 | def search_novel(self, query): 13 | query = query.lower().replace(' ', '+') 14 | soup = self.get_soup(search_url % query) 15 | 16 | results = [] 17 | for a in soup.select('div.l-container ul a'): 18 | results.append({ 19 | 'title': a.text.strip(), 20 | 'url': self.absolute_url(a['href']), 21 | }) 22 | # end for 23 | 24 | return results 25 | # end def 26 | 27 | def read_novel_info(self): 28 | '''Get novel title, autor, cover etc''' 29 | logger.debug('Visiting %s', self.novel_url) 30 | soup = self.get_soup(self.novel_url) 31 | 32 | self.novel_title = soup.select_one('h1').text.strip() 33 | logger.info('Novel title: %s', self.novel_title) 34 | 35 | img_src = soup.select_one('div.book-view-cover img') 36 | if not img_src: 37 | img_src = soup.select_one('div.book-cover img') 38 | # end if 39 | if img_src: 40 | self.novel_cover = self.absolute_url(img_src['src']) 41 | # end if 42 | logger.info('Novel cover: %s', self.novel_cover) 43 | 44 | author = soup.select_one('div.book-view-info a.author') 45 | if not author: 46 | author = soup.select_one('div.book-head-content a.book-autor') 47 | # end if 48 | if author: 49 | self.novel_author = author.text.strip() 50 | # end if 51 | logger.info('Novel author: %s', self.novel_author) 52 | 53 | chapters = soup.find('select', {'name': 'chapter'}) 54 | if chapters is None: 55 | chapters = soup.select('div.collapsible-body a.collection-item') 56 | else: 57 | chapters = chapters.find_all('option') 58 | chapters = [c for c in chapters if c.attrs['value']] 59 | # end if 60 | 61 | for a in chapters: 62 | chap_id = len(self.chapters) + 1 63 | if len(self.chapters) % 100 == 0: 64 | vol_id = chap_id//100 + 1 65 | vol_title = 'Volume ' + str(vol_id) 66 | self.volumes.append({ 67 | 'id': vol_id, 68 | 'title': vol_title, 69 | }) 70 | # end if 71 | 72 | abs_url = self.last_visited_url.replace('book', 'reader') 73 | chap_url = abs_url + \ 74 | ('?c=%s' % a.attrs['value']) if a.has_attr( 75 | 'value') else self.home_url + a['href'] 76 | self.chapters.append({ 77 | 'id': chap_id, 78 | 'volume': 1, 79 | 'url': chap_url, 80 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 81 | }) 82 | # end for 83 | # end def 84 | 85 | def download_chapter_body(self, chapter): 86 | '''Download body of a single chapter and return as clean html format.''' 87 | logger.info('Downloading %s', chapter['url']) 88 | soup = self.get_soup(chapter['url']) 89 | 90 | contents = soup.select_one('div.reader-text') 91 | if contents is None: 92 | contents = soup.select_one('div.demo-txt') 93 | return str(contents) 94 | # end def 95 | # end class 96 | -------------------------------------------------------------------------------- /lncrawl/bots/console/get_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from PyInquirer import prompt 5 | 6 | from ...core import display 7 | from ...core.arguments import get_args 8 | from ...sources import 
rejected_sources 9 | 10 | 11 | def get_novel_url(self): 12 | '''Returns a novel page url or a query''' 13 | args = get_args() 14 | if args.query and len(args.query) > 1: 15 | return args.query 16 | # end if 17 | 18 | url = args.novel_page 19 | if url: 20 | if re.match(r'^https?://.+\..+$', url): 21 | return url 22 | else: 23 | raise Exception('Invalid URL of novel page') 24 | # end if 25 | # end if 26 | 27 | try: 28 | if args.suppress: 29 | raise Exception() 30 | # end if 31 | 32 | answer = prompt([ 33 | { 34 | 'type': 'input', 35 | 'name': 'novel', 36 | 'message': 'Enter novel page url or query novel:', 37 | 'validate': lambda val: 'Input should not be empty' 38 | if len(val) == 0 else True, 39 | }, 40 | ]) 41 | return answer['novel'].strip() 42 | except Exception: 43 | raise Exception('Novel page url or query was not given') 44 | # end try 45 | # end def 46 | 47 | 48 | def get_crawlers_to_search(self): 49 | '''Returns user choice to search the choosen sites for a novel''' 50 | links = self.app.crawler_links 51 | if not links: 52 | return None 53 | # end if 54 | 55 | args = get_args() 56 | if args.suppress or not args.sources: 57 | return links 58 | # end if 59 | 60 | answer = prompt([ 61 | { 62 | 'type': 'checkbox', 63 | 'name': 'sites', 64 | 'message': 'Where to search?', 65 | 'choices': [{'name': x} for x in sorted(links)], 66 | } 67 | ]) 68 | 69 | selected = answer['sites'] 70 | return selected if len(selected) > 0 else links 71 | # end def 72 | 73 | 74 | def choose_a_novel(self): 75 | '''Choose a single novel url from the search result''' 76 | args = get_args() 77 | 78 | # Choose a novel title 79 | choices = self.app.search_results 80 | selected_choice = self.app.search_results[0] 81 | if len(choices) > 1 and not args.suppress: 82 | answer = prompt([ 83 | { 84 | 'type': 'list', 85 | 'name': 'novel', 86 | 'message': 'Which one is your novel?', 87 | 'choices': display.format_novel_choices(choices), 88 | } 89 | ]) 90 | 91 | index = int(answer['novel'].split('.')[0]) 92 | selected_choice = self.app.search_results[index - 1] 93 | # end if 94 | 95 | # Choose the novel source 96 | novels = selected_choice['novels'] 97 | selected_novel = novels[0] 98 | if len(novels) > 1 and not args.suppress: 99 | answer = prompt([ 100 | { 101 | 'type': 'list', 102 | 'name': 'novel', 103 | 'message': 'Choose a source to download?', 104 | 'choices': ['0. Back'] + display.format_source_choices(novels), 105 | } 106 | ]) 107 | 108 | index = int(answer['novel'].split('.')[0]) 109 | if index == 0: 110 | return self.choose_a_novel() 111 | # end if 112 | selected_novel = novels[index - 1] 113 | # end if 114 | 115 | return selected_novel['url'] 116 | # end def 117 | -------------------------------------------------------------------------------- /lncrawl/sources/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Auto imports all crawlers from the current package directory. 4 | To be recognized, your crawler file should meet following conditions: 5 | - file does not starts with an underscore 6 | - file ends with .py extension 7 | - file contains a class that extends `lncrawl.utils.crawler.Crawler` 8 | - the class extending `lncrawl.utils.crawler.Crawler` has a global variable `base_url` 9 | - `base_url` contains a valid url or a list of urls supported by the crawler 10 | 11 | For example, see any of the files inside this directory. 
12 | """ 13 | 14 | import importlib 15 | import os 16 | import re 17 | import sys 18 | from urllib.parse import urlparse 19 | 20 | from ..utils.crawler import Crawler 21 | 22 | rejected_sources = { 23 | 'https://novelplanet.com/': 'Site is closed', 24 | 'http://gravitytales.com/': 'Redirects to webnovel.com', 25 | 'http://fullnovel.live/': "403 - Forbidden: Access is denied", 26 | 'http://moonbunnycafe.com/': "Does not follow uniform format", 27 | 'https://anythingnovel.com/': 'Site broken', 28 | 'https://indomtl.com/': "Does not like to be crawled", 29 | 'https://lnindo.org/': "Does not like to be crawled", 30 | 'https://myoniyonitranslations.com/': "522 - Connection timed out", 31 | 'https://novelgo.id/': "Removed by owner", 32 | 'https://www.flying-lines.com/': 'Obfuscated content', 33 | 'https://www.jieruihao.cn/': "Unavailable", 34 | 'https://www.noveluniverse.com/': "Site is down", 35 | 'https://www.novelupdates.com/': "Does not host any novels", 36 | 'https://www.novelv.com/': "Site is down", 37 | 'https://yukinovel.id/': "Removed by owner", 38 | 'https://www.rebirth.online/': 'Site moved', 39 | 'https://mtled-novels.com/': 'Domain is expired', 40 | } 41 | 42 | # this list will be auto-generated 43 | crawler_list = {} 44 | 45 | # auto-import all submodules in the current directory 46 | __module_regex = re.compile(r'^([^_.][^.]+).py[c]?$', re.I) 47 | __url_regex = re.compile(r'^^(https?|ftp)://[^\s/$.?#].[^\s]*$', re.I) 48 | 49 | for entry in os.listdir(__path__[0]): 50 | file_path = os.path.join(__path__[0], entry) 51 | if not os.path.isfile(file_path): 52 | continue 53 | # end if 54 | 55 | regex_result = __module_regex.findall(entry) 56 | if len(regex_result) != 1: # does not contains a module 57 | continue 58 | # end if 59 | 60 | module_name = regex_result[0] 61 | module = importlib.import_module('.' 
+ module_name, package=__package__) 62 | 63 | for key in dir(module): 64 | item = getattr(module, key) 65 | if type(item) != type(Crawler) or item.__base__ != Crawler: 66 | continue 67 | # end if 68 | 69 | if not hasattr(item, 'base_url'): 70 | raise Exception('No `base_url` for `%s`' % key) 71 | # end if 72 | 73 | base_url = getattr(item, 'base_url') 74 | if isinstance(base_url, str): 75 | base_url = [base_url] 76 | # end if 77 | 78 | if not isinstance(base_url, list): 79 | raise Exception('Unexpected `base_url` type in `%s`' % key) 80 | # end if 81 | 82 | for url in base_url: 83 | if not __url_regex.match(url): 84 | raise Exception('Invalid `base_url` in `%s`: %s' % (key, url)) 85 | # end if 86 | if not url.endswith('/'): 87 | url += '/' 88 | # end if 89 | if url in rejected_sources: 90 | continue 91 | # end if 92 | crawler_list[url] = item 93 | # end for 94 | # end for 95 | # end for 96 | -------------------------------------------------------------------------------- /lncrawl/sources/royalroad.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('ROYALROAD') 8 | search_url = 'https://www.royalroad.com/fictions/search?keyword=%s' 9 | 10 | 11 | class RoyalRoadCrawler(Crawler): 12 | base_url = 'https://www.royalroad.com/' 13 | 14 | def search_novel(self, query): 15 | query = query.lower().replace(' ', '+') 16 | soup = self.get_soup(search_url % query) 17 | 18 | results = [] 19 | for a in soup.select('h2.fiction-title a')[:5]: 20 | url = self.absolute_url(a['href']) 21 | results.append({ 22 | 'url': url, 23 | 'title': a.text.strip(), 24 | 'info': self.search_novel_info(url), 25 | }) 26 | # end for 27 | 28 | return results 29 | # end def 30 | 31 | def search_novel_info(self, url): 32 | '''Get novel title, autor, cover etc''' 33 | logger.debug('Visiting %s', url) 34 | soup = self.get_soup(url) 35 | 36 | score = soup.select_one('span.star')['data-content'] 37 | chapters = len(soup.find('tbody').findAll('a', href=True)) 38 | latest = soup.find('tbody').findAll('a', href=True)[-1].text.strip() 39 | info = 'Score: %s, Chapter count %s, Latest: %s' % ( 40 | score, chapters, latest) 41 | 42 | return info 43 | # end def 44 | 45 | def read_novel_info(self): 46 | '''Get novel title, autor, cover etc''' 47 | logger.debug('Visiting %s', self.novel_url) 48 | soup = self.get_soup(self.novel_url) 49 | 50 | self.novel_title = soup.find("h1", {"property": "name"}).text.strip() 51 | logger.info('Novel title: %s', self.novel_title) 52 | 53 | self.novel_cover = self.absolute_url( 54 | soup.find("img", {"class": "img-offset thumbnail inline-block"})['src']) 55 | logger.info('Novel cover: %s', self.novel_cover) 56 | 57 | self.novel_author = soup.find( 58 | "span", {"property": "name"}).text.strip() 59 | logger.info('Novel author: %s', self.novel_author) 60 | 61 | chapters = soup.find('tbody').findAll('a', href=True) 62 | 63 | for x in chapters: 64 | chap_id = len(self.chapters) + 1 65 | if len(self.chapters) % 100 == 0: 66 | vol_id = chap_id//100 + 1 67 | vol_title = 'Volume ' + str(vol_id) 68 | self.volumes.append({ 69 | 'id': vol_id, 70 | 'title': vol_title, 71 | }) 72 | # end if 73 | self.chapters.append({ 74 | 'id': chap_id, 75 | 'volume': vol_id, 76 | 'url': self.absolute_url(x['href']), 77 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 78 | }) 79 | # end for 80 | # end def 81 | 82 | def download_chapter_body(self, 
chapter): 83 | '''Download body of a single chapter and return as clean html format.''' 84 | logger.info('Downloading %s', chapter['url']) 85 | soup = self.get_soup(chapter['url']) 86 | 87 | logger.debug(soup.title.string) 88 | 89 | if 'Chapter' in soup.select_one('h2').text: 90 | chapter['title'] = soup.select_one('h2').text 91 | else: 92 | chapter['title'] = chapter['title'] 93 | # end if 94 | 95 | contents = soup.find("div", {"class": "chapter-content"}) 96 | 97 | self.clean_contents(contents) 98 | return str(contents) 99 | # end def 100 | # end class 101 | -------------------------------------------------------------------------------- /lncrawl/sources/wuxiasite.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WUXIA-SITE') 8 | search_url = 'https://wuxiaworld.site/?s=%s&post_type=wp-manga' 9 | 10 | 11 | class WuxiaSiteCrawler(Crawler): 12 | base_url = 'https://wuxiaworld.site/' 13 | 14 | # TODO: disabled due to cloudflare issue 15 | # def search_novel(self, query): 16 | # query = query.lower().replace(' ', '+') 17 | # soup = self.get_soup(search_url % query) 18 | 19 | # results = [] 20 | # for tab in soup.select('.c-tabs-item__content'): 21 | # a = tab.select_one('.post-title h4 a') 22 | # latest = tab.select_one('.latest-chap .chapter a').text 23 | # votes = tab.select_one('.rating .total_votes').text 24 | # results.append({ 25 | # 'title': a.text.strip(), 26 | # 'url': self.absolute_url(a['href']), 27 | # 'info': '%s | Rating: %s' % (latest, votes), 28 | # }) 29 | # # end for 30 | 31 | # return results 32 | # # end def 33 | 34 | def read_novel_info(self): 35 | '''Get novel title, autor, cover etc''' 36 | logger.debug('Visiting %s', self.novel_url) 37 | soup = self.get_soup(self.novel_url) 38 | 39 | self.novel_title = ' '.join([ 40 | str(x) 41 | for x in soup.select_one('.post-title h3').contents 42 | if not x.name 43 | ]).strip() 44 | logger.info('Novel title: %s', self.novel_title) 45 | 46 | possible_img = soup.select_one('.summary_image img') 47 | if possible_img: 48 | if possible_img.has_attr('data-src'): 49 | self.novel_cover = self.absolute_url(possible_img['data-src']) 50 | elif possible_img.has_attr('srcset'): 51 | self.novel_cover = self.absolute_url(possible_img['srcset'].split(',')[0]) 52 | elif possible_img.has_attr('src'): 53 | self.novel_cover = self.absolute_url(possible_img['src']) 54 | logger.info('Novel cover: %s', self.novel_cover) 55 | 56 | author = soup.select('.author-content a') 57 | if len(author) == 2: 58 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 59 | else: 60 | self.novel_author = author[0].text 61 | logger.info('Novel author: %s', self.novel_author) 62 | 63 | chapters = soup.select('ul.main li.wp-manga-chapter a') 64 | chapters.reverse() 65 | 66 | for a in chapters: 67 | chap_id = len(self.chapters) + 1 68 | vol_id = chap_id//100 + 1 69 | if len(self.chapters) % 100 == 0: 70 | vol_title = 'Volume ' + str(vol_id) 71 | self.volumes.append({ 72 | 'id': vol_id, 73 | 'title': vol_title, 74 | }) 75 | # end if 76 | self.chapters.append({ 77 | 'id': chap_id, 78 | 'volume': vol_id, 79 | 'url': self.absolute_url(a['href']), 80 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 81 | }) 82 | # end for 83 | # end def 84 | 85 | def download_chapter_body(self, chapter): 86 | '''Download body of a single chapter and return as clean html format.''' 87 | 
logger.info('Downloading %s', chapter['url']) 88 | soup = self.get_soup(chapter['url']) 89 | contents = soup.select('.text-left p, .cha-words p') 90 | body = [str(p) for p in contents if p.text.strip()] 91 | return '<p>' + '</p><p>'.join(body) + '</p>
' 92 | # end def 93 | # end class 94 | -------------------------------------------------------------------------------- /lncrawl/bots/test/post_github.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import os 5 | import platform 6 | import sys 7 | from datetime import datetime 8 | from urllib.parse import urlencode 9 | 10 | import requests 11 | 12 | from ...assets.user_agents import user_agents 13 | 14 | logger = logging.getLogger('MAKE_GITHUB_ISSUE') 15 | 16 | # Authentication for user filing issue 17 | USERNAME = os.getenv('GITHUB_USERNAME') 18 | # PASSWORD = os.getenv('GITHUB_PASSWORD') # deprecated 19 | TOKEN = os.getenv('GITHUB_TOKEN') # must have read/write access to repo 20 | 21 | # The repository to add this issue to 22 | REPO_OWNER = 'dipu-bd' 23 | REPO_NAME = 'lightnovel-crawler' 24 | 25 | # Headers 26 | headers = { 27 | "User-Agent": user_agents[0], 28 | "Authorization": "token %s" % TOKEN, 29 | "Accept": "application/vnd.github.golden-comet-preview+json" 30 | } 31 | 32 | 33 | def find_issues(labels=None): 34 | '''Returns list of issues by query''' 35 | # Url to get issues via GET 36 | url = 'https://api.github.com/repos/%s/%s/issues' % (REPO_OWNER, REPO_NAME) 37 | 38 | # Create a session without authentication 39 | session = requests.Session() 40 | 41 | # Create our issue 42 | data = { 43 | 'labels': labels, 44 | } 45 | 46 | # Get issues 47 | r = session.get(url + '?' + urlencode(data), headers=headers) 48 | if r.ok: 49 | logger.info('Successfully retrieved issues') 50 | return r.json() 51 | else: 52 | logger.info('Failed to get issues: %s' % url) 53 | logger.debug('Response:\n%s\n' % r.content) 54 | return [] 55 | # end if 56 | # end def 57 | 58 | 59 | def post_issue(title, body=None, labels=None): 60 | '''Create an issue on github.com using the given parameters.''' 61 | # Our url to create issues via POST 62 | url = 'https://api.github.com/repos/%s/%s/import/issues' % (REPO_OWNER, REPO_NAME) 63 | 64 | # Create an authenticated session to create the issue 65 | session = requests.Session() 66 | # session.auth = (USERNAME, PASSWORD) 67 | 68 | # Create our issue 69 | payload = json.dumps({ 70 | 'issue': { 71 | 'title': title, 72 | 'body': body, 73 | 'labels': labels, 74 | } 75 | }) 76 | 77 | # Add the issue to our repository 78 | r = session.post(url, data=payload, headers=headers) 79 | if r.ok: 80 | logger.info('Successfully created Issue %s' % title) 81 | else: 82 | logger.info('Could not create Issue %s' % title) 83 | logger.debug('Response:\n%s\n' % r.content) 84 | raise Exception('Failed to create issue') 85 | # end if 86 | # end def 87 | 88 | 89 | def post_on_github(self, message): 90 | if sys.version_info.minor != 6: 91 | print('Not Python 3.6... skipping.') 92 | return 93 | # end if 94 | 95 | # Check if there is already an issue younger than a week 96 | issues = find_issues('bot-report') 97 | if len(issues): 98 | time = int(issues[0]['title'].split('~')[-1].strip()) 99 | diff = datetime.utcnow().timestamp() - time 100 | if diff < 7 * 24 * 3600: 101 | print('Detected an open issue younger than a week... 
skipping.') 102 | return 103 | # end if 104 | # end if 105 | 106 | # Create new issue with appropriate label 107 | title = '[Test Bot][Python %d.%d][%s] Report ~ %s' % ( 108 | sys.version_info.major, 109 | sys.version_info.minor, 110 | platform.system(), 111 | str(int(datetime.utcnow().timestamp())) 112 | ) 113 | post_issue(title, message, ['bot-report']) 114 | # end def 115 | --------------------------------------------------------------------------------
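The module docstring in lncrawl/sources/__init__.py above lists what a crawler file must provide to be auto-imported: a non-underscore .py file containing a subclass of lncrawl.utils.crawler.Crawler with a valid base_url. A minimal sketch of such a source file is shown below; the site URL, class name, and CSS selectors are hypothetical placeholders, and it relies only on Crawler helpers already used by the crawlers in this package (get_soup, absolute_url, and the chapters/volumes lists).

# -*- coding: utf-8 -*-
import logging

from ..utils.crawler import Crawler

logger = logging.getLogger('EXAMPLE_SITE')


class ExampleSiteCrawler(Crawler):
    # `base_url` is required by the auto-importer; this URL is a placeholder.
    base_url = 'https://example-novel-site.com/'

    def read_novel_info(self):
        '''Get novel title, author, cover and chapter list'''
        soup = self.get_soup(self.novel_url)

        self.novel_title = soup.select_one('h1.title').text.strip()
        self.novel_author = soup.select_one('.author').text.strip()
        self.novel_cover = self.absolute_url(
            soup.select_one('.cover img')['src'])
        logger.info('Novel title: %s', self.novel_title)

        for a in soup.select('ul.chapters li a'):
            chap_id = len(self.chapters) + 1
            vol_id = chap_id // 100 + 1
            if len(self.chapters) % 100 == 0:
                # start a new volume every 100 chapters, as other crawlers do
                self.volumes.append({'id': vol_id, 'title': 'Volume %d' % vol_id})
            # end if
            self.chapters.append({
                'id': chap_id,
                'volume': vol_id,
                'url': self.absolute_url(a['href']),
                'title': a.text.strip() or ('Chapter %d' % chap_id),
            })
        # end for
    # end def

    def download_chapter_body(self, chapter):
        '''Download body of a single chapter and return as clean html format.'''
        soup = self.get_soup(chapter['url'])
        contents = soup.select('.chapter-content p')
        body = [str(p) for p in contents if p.text.strip()]
        return '<p>' + '</p><p>'.join(body) + '</p>'
    # end def
# end class

Given such a file, the loader in lncrawl/sources/__init__.py imports the module, validates base_url, and registers the class in crawler_list under its normalized URL, so no manual registration step is needed.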