├── Procfile ├── lncrawl ├── VERSION ├── assets │ ├── __init__.py │ ├── fonts │ │ ├── Horta.ttf │ │ ├── Sofia.otf │ │ ├── Bellota.otf │ │ ├── Caladea.ttf │ │ ├── Crimson.otf │ │ ├── Gidole.ttf │ │ ├── Orkney.ttf │ │ ├── Unique.ttf │ │ ├── Bradley Gratis.ttf │ │ ├── Liberation Serif.ttf │ │ ├── Libre Baskerville.ttf │ │ └── Glacial Indifference.otf │ ├── version.py │ ├── colors.txt │ ├── icons.py │ ├── templates │ │ ├── Simple.svg │ │ ├── Simple Dark.svg │ │ ├── Blocks.svg │ │ ├── Column.svg │ │ ├── Window.svg │ │ ├── Cross.svg │ │ ├── Tiles.svg │ │ ├── Gradient.svg │ │ └── Rings.svg │ ├── html_style.py │ └── html_style.css ├── utils │ ├── __init__.py │ ├── update_checker.py │ ├── uploader.py │ └── kindlegen_download.py ├── bots │ ├── discord │ │ ├── __init__.py │ │ └── config.py │ ├── __init__.py │ ├── console │ │ ├── __init__.py │ │ ├── login_info.py │ │ └── get_crawler.py │ ├── test │ │ ├── test_crawler.py │ │ └── post_github.py │ └── _sample.py ├── __init__.py ├── binders │ ├── text.py │ ├── __init__.py │ ├── web.py │ └── calibre.py ├── sources │ ├── anythingnovel.py │ ├── chinesefantasy.py │ ├── asianhobbyist.py │ ├── webnovelonlinecom.py │ ├── listnovel.py │ ├── novelringan.py │ ├── ranobelibme.py │ ├── webnovelonline.py │ ├── flyinglines.py │ ├── wuxialeague.py │ ├── fullnovellive.py │ ├── liberspark.py │ ├── aixdzs.py │ ├── tapread.py │ ├── tomotrans.py │ ├── wattpad.py │ ├── jpmtl.py │ ├── tiknovel.py │ ├── qidiancom.py │ ├── 9kqw.py │ ├── novelspread.py │ ├── novelv.py │ ├── machinetrans.py │ ├── readln.py │ ├── idqidian.py │ ├── yukinovel.py │ ├── fourscanlation.py │ ├── novelgo.py │ ├── gravitytales.py │ ├── machinetransorg.py │ ├── mangatoon.py │ ├── rewayatclub.py │ ├── shinsori.py │ ├── wuxiaonline.py │ ├── crescentmoon.py │ ├── meionovel.py │ ├── kissnovel.py │ ├── bestlightnovel.py │ ├── novelonlinefull.py │ ├── boxnovel.py │ ├── webnovelindonesia.py │ ├── translateindo.py │ ├── zenithnovels.py │ ├── litnet.py │ ├── __init__.py │ ├── royalroad.py │ └── wuxiasite.py └── core │ ├── __init__.py │ ├── novel_info.py │ └── novel_search.py ├── runtime.txt ├── MANIFEST.in ├── dev-requirements.txt ├── res ├── lncrawl.ico ├── lncrawl-icon.png └── lncrawl-web.png ├── __main__.py ├── .github ├── ISSUE_TEMPLATE │ ├── general.md │ ├── new-source.md │ ├── bug_report.md │ └── remove-source.md └── workflows │ └── pythonpackage.yml ├── package.json ├── scripts ├── publish.sh ├── publish.bat ├── build.sh └── build.bat ├── .gitignore ├── requirements.txt ├── .appveyor.yml ├── .env.example ├── .travis.yml ├── setup.py ├── app.json ├── setup.cfg ├── README.pip └── setup_pyi.py /Procfile: -------------------------------------------------------------------------------- 1 | bot: python . 
2 | -------------------------------------------------------------------------------- /lncrawl/VERSION: -------------------------------------------------------------------------------- 1 | 2.22.1 2 | -------------------------------------------------------------------------------- /lncrawl/assets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lncrawl/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.6.9 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include lncrawl/VERSION 2 | recursive-include lncrawl *.* 3 | -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | Js2Py 2 | PyInstaller 3 | cairosvg 4 | setuptools 5 | wheel 6 | -------------------------------------------------------------------------------- /res/lncrawl.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl.ico -------------------------------------------------------------------------------- /lncrawl/bots/discord/__init__.py: -------------------------------------------------------------------------------- 1 | from . import config 2 | from .discord_bot import DiscordBot 3 | -------------------------------------------------------------------------------- /res/lncrawl-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl-icon.png -------------------------------------------------------------------------------- /res/lncrawl-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl-web.png -------------------------------------------------------------------------------- /__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | from lncrawl import main 4 | main() 5 | -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Horta.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Horta.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Sofia.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Sofia.otf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Bellota.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Bellota.otf 
-------------------------------------------------------------------------------- /lncrawl/assets/fonts/Caladea.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Caladea.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Crimson.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Crimson.otf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Gidole.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Gidole.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Orkney.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Orkney.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Unique.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Unique.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Bradley Gratis.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Bradley Gratis.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Liberation Serif.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Liberation Serif.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Libre Baskerville.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Libre Baskerville.ttf -------------------------------------------------------------------------------- /lncrawl/assets/fonts/Glacial Indifference.otf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Glacial Indifference.otf -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General 3 | about: If you want to create a general issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/new-source.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: New source 3 | about: Suggest a new source to add 4 | title: Enter your desired sources here 5 | labels: source 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- 
/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lightnovel-crawler", 3 | "description": "Downloads lightnovels from various online sources and generates ebooks in many formats.", 4 | "version": "2.16.2", 5 | "engines": { 6 | "node": "12.x" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /lncrawl/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | try: 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | except Exception: 7 | pass 8 | # end try 9 | 10 | 11 | def main(): 12 | from .core import start_app 13 | start_app() 14 | # end def 15 | -------------------------------------------------------------------------------- /lncrawl/assets/version.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from pathlib import Path 3 | 4 | ROOT = Path(__file__).parent.parent 5 | 6 | with open(str(ROOT / 'VERSION'), 'r') as f: 7 | version = f.read().strip() 8 | # end with 9 | 10 | 11 | def get_value(): 12 | return version 13 | # end def 14 | -------------------------------------------------------------------------------- /scripts/publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=$(head -n 1 lncrawl/VERSION) 4 | 5 | PY="python3" 6 | PIP="$PY -m pip --disable-pip-version-check" 7 | 8 | # . scripts/build.sh 9 | 10 | $PIP install twine 11 | $PY -m twine upload "dist/lightnovel_crawler-$VERSION-py3-none-any.whl" 12 | 13 | # FINISHED 14 | -------------------------------------------------------------------------------- /scripts/publish.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | SET /P VERSION==1.2.40 16 | lxml==4.5.1 17 | 18 | # Bot requirements 19 | discord.py==1.3.3 20 | python-telegram-bot==11.1.0 21 | PyDrive==1.3.1 22 | -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VERSION=$(head -n 1 lncrawl/VERSION) 4 | 5 | PY="python3" 6 | PIP="$PY -m pip --disable-pip-version-check" 7 | 8 | rm -rf venv build dist *.egg-info 9 | 10 | $PY -m venv venv 11 | . 
venv/bin/activate 12 | 13 | $PIP install -U pip==20.0.2 14 | $PIP install -r requirements.txt 15 | $PIP install -r dev-requirements.txt 16 | 17 | $PY setup.py clean bdist_wheel sdist package 18 | 19 | deactivate 20 | rm -rf venv build *.egg-info 21 | 22 | # FINISHED 23 | -------------------------------------------------------------------------------- /lncrawl/assets/colors.txt: -------------------------------------------------------------------------------- 1 | #d3dcf2 #829fe4 #6692c3 #4878a4 #00305a 2 | #e8d9ac #c7b07b #ffe28c #d8ab22 #382d1a 3 | #d8edb5 #abc8a4 #b1d17b #90a868 #183128 4 | #e6f1f5 #aab3b6 #a1bac4 #6a7275 #3b3e40 5 | #eaa8d3 #996185 #c964a6 #d897c1 #49223b 6 | #d3c0b8 #917569 #bc8b74 #72391e #332923 7 | #fffcfc #892323 #c42121 #2d2727 #020000 8 | #fcb0b0 #d67e7e #f7a0a0 #773535 #0a0505 9 | #2ab7ca #fed766 #cfffb3 #fe4a49 #330c2f 10 | #fde8e9 #e3bac6 #bc9ec1 #596475 #1f2232 11 | #ffffff #f9e316 #faa916 #96031a #000000 12 | #452103 #690500 #210f04 #934b00 #bb6b00 -------------------------------------------------------------------------------- /.appveyor.yml: -------------------------------------------------------------------------------- 1 | environment: 2 | matrix: 3 | - PYTHON: C:\Python38-x64 4 | - PYTHON: C:\Python37-x64 5 | - PYTHON: C:\Python36-x64 6 | - PYTHON: C:\Python35-x64 7 | 8 | init: 9 | - SET PATH=%PYTHON%;%PYTHON%\Scripts;%PATH% 10 | - SET PYTHONIOENCODING=utf-8 11 | 12 | install: 13 | - python -m pip install --no-cache-dir -r requirements.txt 14 | 15 | build: false 16 | 17 | test_script: 18 | - python __main__.py --bot test -lll 19 | 20 | branches: 21 | only: 22 | - master 23 | 24 | cache: 25 | - '%LOCALAPPDATA%\pip\Cache' 26 | -------------------------------------------------------------------------------- /scripts/build.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | SET /P VERSION=' 28 | # end def 29 | -------------------------------------------------------------------------------- /lncrawl/bots/console/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | 5 | class ConsoleBot: 6 | log = logging.getLogger('CONSOLE_BOT') 7 | 8 | from .start import start 9 | from .start import open_folder 10 | from .start import process_chapter_range 11 | 12 | from .get_crawler import get_novel_url 13 | from .get_crawler import get_crawlers_to_search 14 | from .get_crawler import choose_a_novel 15 | 16 | from .login_info import get_login_info 17 | 18 | from .output_style import get_output_path 19 | from .output_style import force_replace_old 20 | from .output_style import get_output_formats 21 | from .output_style import should_pack_by_volume 22 | 23 | from .range_selection import get_range_selection 24 | from .range_selection import get_range_using_urls 25 | from .range_selection import get_range_using_index 26 | from .range_selection import get_range_from_volumes 27 | from .range_selection import get_range_from_chapters 28 | # end class 29 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Simple.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 16 | 17 | 18 | {%- for author in authors %} {{ author }} {%- endfor %} 19 | 20 | 21 | -------------------------------------------------------------------------------- 
/lncrawl/assets/templates/Simple Dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 16 | 17 | 18 | {%- for author in authors %} {{ author }} {%- endfor %} 19 | 20 | 21 | -------------------------------------------------------------------------------- /lncrawl/binders/text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | import re 5 | from bs4 import BeautifulSoup 6 | 7 | logger = logging.getLogger('TEXT_BINDER') 8 | 9 | 10 | def make_texts(app, data): 11 | text_files = [] 12 | for vol in data: 13 | dir_name = os.path.join(app.output_path, 'text', vol) 14 | os.makedirs(dir_name, exist_ok=True) 15 | for chap in data[vol]: 16 | file_name = '%s.txt' % str(chap['id']).rjust(5, '0') 17 | file_name = os.path.join(dir_name, file_name) 18 | with open(file_name, 'w', encoding='utf-8') as file: 19 | body = chap['body'].replace('

\n 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 17 | 18 | 19 | {%- for author in authors %} {{ author }} {%- endfor %} 20 | 21 | 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | os: linux 2 | dist: xenial 3 | language: python 4 | python: 5 | - "3.8" 6 | - "3.7" 7 | - "3.6" 8 | - "3.5" 9 | - nightly 10 | 11 | matrix: 12 | allow_failures: 13 | - python: nightly 14 | - os: osx 15 | fast_finish: true 16 | 17 | before_install: 18 | - | 19 | if [[ $TRAVIS_OS_NAME == 'osx' ]]; then 20 | brew upgrade python 21 | export PATH="/usr/local/opt/python/libexec/bin:${PATH}" 22 | fi 23 | install: 24 | - pip install -r requirements.txt 25 | - pip install flake8 26 | 27 | before_script: 28 | # stop the build if there are Python syntax errors or undefined names 29 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics 30 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 31 | - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 32 | 33 | script: 34 | - python __main__.py --bot test -lll 35 | 36 | cache: 37 | directories: 38 | - $HOME/.cache/pip 39 | - $HOME/.cache/pre-commit 40 | 41 | branches: 42 | only: 43 | - master 44 | 45 | notifications: 46 | email: false 47 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Column.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | {%- for line in title|wrap(10) %} {{ line }} {%- endfor %} 18 | 19 | 20 | {%- for author in authors %} {{ author }} {%- endfor %} 21 | 22 | 23 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - master 7 | push: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: [3.5, 3.6, 3.7, 3.8] 18 | 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v1 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt 29 | - name: Lint with flake8 30 | run: | 31 | pip install flake8 32 | # stop the build if there are Python syntax errors or undefined names 33 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 34 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 35 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 36 | # - name: Test with pytest 37 | # run: | 38 | # pip install pytest 39 | # pytest 40 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Window.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | {%- for line in title|wrap(16) %} {{ line }} {%- endfor %} 18 | 19 | 20 | {%- for author in authors %} {{ author }} {%- endfor %} 21 | 22 | 23 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Cross.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 18 | 19 | 20 | {%- for author in authors %} {{ author }} {%- endfor %} 21 | 22 | 23 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Tiles.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 19 | 20 | 21 | {%- for author in authors %} {{ author }} {%- endfor %} 22 | 23 | 24 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | 4 | if sys.version_info[:2] < (3, 5): 5 | raise RuntimeError( 6 | 'Lightnovel crawler only supports Python 3.5 and later.') 7 | else: 8 | run_pyi = 'package' in sys.argv 9 | if run_pyi: 10 | sys.argv.remove('package') 11 | # end if 12 | if len(sys.argv) == 1: 13 | sys.argv += ['build'] 14 | # end if 15 | 16 | # import required packages 17 | from pathlib import Path 18 | from setuptools import config, setup 19 | 20 | def parse_version(filename): 21 | with open(filename, 'r') as f: 22 | return f.read().strip() 23 | # end def 24 | 25 | def parse_requirements(filename): 26 | with open(filename, 'r', encoding='utf-8') as f: 27 | requirements = f.read().strip().split('\n') 28 | requirements = [ 29 | r.strip() for r in requirements 30 | if r.strip() and not r.startswith('#') 31 | ] 32 | return requirements 33 | # end def 34 | 35 | config.read_configuration('setup.cfg') 36 | 37 | setup( 38 | version=parse_version(Path('lncrawl') / 'VERSION'), 39 | install_requires=parse_requirements('requirements.txt'), 40 | ) 41 | 42 | if run_pyi: 43 | from setup_pyi import package 44 | package() 45 | # end if 46 | # end if 47 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Gradient.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 21 | 22 | 23 | {%- for author in authors %} {{ author }} {%- endfor %} 24 | 25 | 26 | -------------------------------------------------------------------------------- /lncrawl/bots/console/login_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from PyInquirer import prompt 3 | from ...core.arguments import get_args 4 | 5 | 6 | def get_login_info(self): 7 | '''Returns the (email, password) pair for login''' 8 | args = get_args() 9 | 10 | if args.login: 11 | return 
args.login 12 | # end if 13 | 14 | if args.suppress: 15 | return False 16 | # end if 17 | 18 | answer = prompt([ 19 | { 20 | 'type': 'confirm', 21 | 'name': 'login', 22 | 'message': 'Do you want to log in?', 23 | 'default': False 24 | }, 25 | ]) 26 | 27 | if answer['login']: 28 | answer = prompt([ 29 | { 30 | 'type': 'input', 31 | 'name': 'email', 32 | 'message': 'Username/Email:', 33 | 'validate': lambda val: True if len(val) 34 | else 'Email address should be not be empty' 35 | }, 36 | { 37 | 'type': 'password', 38 | 'name': 'password', 39 | 'message': 'Password:', 40 | 'validate': lambda val: True if len(val) 41 | else 'Password should be not be empty' 42 | }, 43 | ]) 44 | return answer['email'], answer['password'] 45 | # end if 46 | 47 | return None 48 | # end if 49 | -------------------------------------------------------------------------------- /lncrawl/assets/templates/Rings.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | {%- for line in title|wrap(20) %} {{ line }} {%- endfor %} 25 | 26 | 27 | {%- for author in authors %} {{ author }} {%- endfor %} 28 | 29 | 30 | -------------------------------------------------------------------------------- /app.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "lightnovel crawler", 3 | "description": "Downloads lightnovels from various online sources and generates ebooks in many formats.", 4 | "keywords": [ 5 | "discord", 6 | "bot", 7 | "telegram", 8 | "novel", 9 | "lightnovel", 10 | "crawler" 11 | ], 12 | "website": "https://github.com/dipu-bd/lightnovel-crawler", 13 | "logo": "https://github.com/dipu-bd/lightnovel-crawler/raw/master/res/lncrawl-icon.png", 14 | "env": { 15 | "LOG_LEVEL": { 16 | "description": "Available levels: NOTSET, WARN, INFO, DEBUG, FATAL, ERROR", 17 | "value": "INFO", 18 | "required": true 19 | }, 20 | "BOT": { 21 | "description": "available: console, discord, telegram", 22 | "value": "discord", 23 | "required": true 24 | }, 25 | "TELEGRAM_TOKEN": { 26 | "description": "Telegram token, only required if BOT is set to telegram", 27 | "required": false 28 | }, 29 | "DISCORD_TOKEN": { 30 | "description": "Discord token, only required if BOT is set to discord", 31 | "required": false 32 | }, 33 | "DISCORD_SIGNAL_CHAR": { 34 | "description": "Discord command prefix, only required if BOT is set to discord", 35 | "required": false, 36 | "value": "!" 
37 | } 38 | }, 39 | "buildpacks": [ 40 | { 41 | "url": "https://github.com/heroku/heroku-buildpack-nodejs" 42 | }, 43 | { 44 | "url": "https://github.com/heroku/heroku-buildpack-python" 45 | }, 46 | { 47 | "url": "https://github.com/nntin/heroku-buildpack-calibre" 48 | } 49 | ] 50 | } 51 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/remove-source.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Remove source 3 | about: If you are owner of a source added here and want to remove it 4 | title: 'Request to remove a site: ' 5 | labels: removal 6 | assignees: dipu-bd 7 | --- 8 | 9 | ## Please check all the fields that applies to you 10 | 11 | Transform `[ ]` to `[x]` to check (you can also check it after submitting the issue): 12 | 13 | - [ ] I am a translator/author 14 | - [ ] I only publish my original contents/translations 15 | - [ ] I have permission from the author to translate their contents 16 | - [ ] I do not copy contents from others or do not use machine translations 17 | - [ ] Some people are using this program to steal my translations 18 | - [ ] _I do not blame a blacksmith or the sword he made if it is used by a someone to kill people_ 19 | - [ ] _I do not blame the lockpicks if it is used by a someone to steal from my house_ 20 | - [ ] I do not blame the developer if the program he wrote is used by others to steal from my site 21 | 22 | ## Why do you translate/write novels and post them on your site? 23 | 24 | ``` 25 | 26 | ``` 27 | 28 | ## Explain why you do want to prevent people from scraping your site? 29 | 30 | ``` 31 | 32 | ``` 33 | 34 | ## Can you prove your identity as a site owner? 35 | 36 | ``` 37 | - Add a file named `lncrawl.txt` with content `Please remove this source` to your site. 38 | - Paste the link of the file here 39 | ``` 40 | 41 | ## Have some links or evidences that people are using this app to steal contents from you? 42 | 43 | ``` 44 | 45 | ``` 46 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = lightnovel-crawler 3 | url = https://github.com/dipu-bd/lightnovel-crawler 4 | license = Apache 2.0 5 | license_file = LICENSE 6 | description = An app to download novels from online sources and generate e-books. 
7 | long_description = file: README.pip 8 | long_description_content_type = text/markdown 9 | author = Sudipto Chandra 10 | author_email = dipu.sudipta@gmail.com 11 | platforms = 12 | Linux 13 | macOS 14 | Windows 15 | keywords = 16 | lightnovel 17 | crawler 18 | lncrawl 19 | ebook 20 | novel 21 | pdf 22 | epub 23 | mobi 24 | scraper 25 | web-scraper 26 | classifiers = 27 | Development Status :: 5 - Production/Stable 28 | Environment :: Console 29 | Natural Language :: English 30 | License :: OSI Approved :: Apache Software License 31 | Intended Audience :: Developers 32 | Intended Audience :: End Users/Desktop 33 | Programming Language :: Python :: 3 :: Only 34 | Programming Language :: Python :: 3.5 35 | Programming Language :: Python :: 3.6 36 | Programming Language :: Python :: 3.7 37 | Programming Language :: Python :: 3.8 38 | Topic :: Games/Entertainment 39 | Topic :: Internet :: WWW/HTTP 40 | Topic :: Multimedia :: Graphics 41 | Topic :: Printing 42 | Topic :: Text Processing :: Markup :: HTML 43 | project_urls = 44 | Source Code = https://github.com/dipu-bd/lightnovel-crawler 45 | Issue tracker = https://github.com/dipu-bd/lightnovel-crawler/issues 46 | Documentation = https://github.com/dipu-bd/lightnovel-crawler/blob/master/README.md 47 | Say Thanks! = https://saythanks.io/to/dipu-bd 48 | 49 | [options] 50 | python_requires = >= 3.5 51 | include_package_data = True 52 | packages = lncrawl 53 | package_dir = 54 | lncrawl = lncrawl 55 | 56 | [options.entry_points] 57 | console_scripts = 58 | lncrawl = lncrawl:main 59 | lightnovel-crawler = lncrawl:main 60 | -------------------------------------------------------------------------------- /lncrawl/assets/html_style.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | from pathlib import Path 4 | 5 | ROOT = Path(__file__).parent 6 | 7 | with open(str(ROOT / 'html_style.css'), 'r') as f: 8 | style = f.read() 9 | # end with 10 | 11 | 12 | def get_value(): 13 | return _minify(style) 14 | # end def 15 | 16 | 17 | def _minify(css): 18 | result = '' 19 | 20 | # remove comments - this will break IE<6 comment hacks 21 | css = re.sub(r'/\*[\s\S]*?\*/', "", css) 22 | 23 | # url() doesn't need quotes 24 | #css = re.sub(r'url\((["\'])([^)]*)\1\)', r'url(\2)', css) 25 | 26 | # spaces may be safely collapsed as generated content will collapse them anyway 27 | css = re.sub(r'\s+', ' ', css) 28 | 29 | # shorten collapsable colors: #aabbcc to #abc 30 | css = re.sub( 31 | r'#([0-9a-f])\1([0-9a-f])\2([0-9a-f])\3(\s|;)', r'#\1\2\3\4', css) 32 | 33 | # fragment values can loose zeros 34 | css = re.sub(r':\s*0(\.\d+([cm]m|e[mx]|in|p[ctx]))\s*;', r':\1;', css) 35 | 36 | for rule in re.findall(r'([^{]+){([^}]*)}', css): 37 | # we don't need spaces around operators 38 | selectors = [re.sub(r'(?<=[\[\(>+=])\s+|\s+(?=[=~^$*|>+\]\)])', 39 | r'', selector.strip()) for selector in rule[0].split(',')] 40 | # order is important, but we still want to discard repetitions 41 | properties = {} 42 | porder = [] 43 | for prop in re.findall(r'(.*?):(.*?)(;|$)', rule[1]): 44 | key = prop[0].strip().lower() 45 | if key not in porder: 46 | porder.append(key) 47 | properties[key] = prop[1].strip() 48 | # output rule if it contains any declarations 49 | if properties: 50 | result += "%s{%s}" % (','.join(selectors), ''.join( 51 | ['%s:%s;' % (key, properties[key]) for key in porder])[:-1]) 52 | 53 | return result 54 | # end def 55 | 
-------------------------------------------------------------------------------- /lncrawl/utils/uploader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """Uploader for google drive""" 3 | import os 4 | import logging 5 | 6 | logger = logging.getLogger('UPLOADER') 7 | 8 | 9 | try: 10 | from pydrive.auth import GoogleAuth 11 | from pydrive.drive import GoogleDrive 12 | except Exception: 13 | logger.error('`pydrive` was not setup properly') 14 | # end try 15 | 16 | 17 | def upload(file_path, description=None): 18 | try: 19 | gauth = GoogleAuth() 20 | # gauth.LocalWebserverAuth() 21 | 22 | # Try to load saved client credentials 23 | credential_file = os.getenv('GOOGLE_DRIVE_CREDENTIAL_FILE') 24 | gauth.LoadCredentialsFile(credential_file) 25 | if gauth.credentials is None: 26 | # Authenticate if they're not there 27 | gauth.LocalWebserverAuth() 28 | elif gauth.access_token_expired: 29 | # Refresh them if expired 30 | gauth.Refresh() 31 | else: 32 | # Initialize the saved creds 33 | gauth.Authorize() 34 | # end if 35 | 36 | # Save the current credentials to a file 37 | gauth.SaveCredentialsFile(credential_file) 38 | 39 | drive = GoogleDrive(gauth) 40 | folder_id = os.getenv('GOOGLE_DRIVE_FOLDER_ID') 41 | filename_w_ext = os.path.basename(file_path) 42 | filename, file_extension = os.path.splitext(filename_w_ext) 43 | 44 | # Upload file to folder 45 | f = drive.CreateFile( 46 | {"parents": [{"kind": "drive#fileLink", "id": folder_id}]}) 47 | f['title'] = filename_w_ext 48 | 49 | # Make sure to add the path to the file to upload below. 50 | f.SetContentFile(file_path) 51 | f.Upload() 52 | 53 | logger.info(f['id']) 54 | return f['id'] 55 | except Exception: 56 | logger.exception('Failed to upload %s', file_path) 57 | # end try 58 | return None 59 | # end def 60 | -------------------------------------------------------------------------------- /lncrawl/binders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To bind into ebooks 4 | """ 5 | import logging 6 | 7 | from .epub import make_epubs 8 | from .web import make_webs 9 | from .text import make_texts 10 | from .calibre import make_calibres 11 | 12 | logger = logging.Logger('BINDERS') 13 | 14 | depends_on_none = [ 15 | 'epub', 16 | 'text', 17 | 'web', 18 | ] 19 | depends_on_epub = [ 20 | 'docx', 21 | 'mobi', 22 | 'pdf', 23 | 'rtf', 24 | 'txt', 25 | 'azw3', 26 | 'fb2', 27 | 'lit', 28 | 'lrf', 29 | 'oeb', 30 | 'pdb', 31 | 'rb', 32 | 'snb', 33 | 'tcr', 34 | # 'pml', 35 | # 'html', 36 | ] 37 | available_formats = depends_on_none + depends_on_epub 38 | 39 | 40 | def generate_books(app, data): 41 | out_formats = app.output_formats 42 | if not out_formats: 43 | out_formats = {} 44 | # end if 45 | out_formats = {x: out_formats.get(x, False) for x in available_formats} 46 | 47 | # Resolve formats to output maintaining dependencies 48 | after_epub = [x for x in depends_on_epub if out_formats[x]] 49 | need_epub = 'epub' if len(after_epub) else None 50 | after_any = [x for x in depends_on_none if out_formats[x] or x == need_epub] 51 | 52 | # Generate output files 53 | outputs = dict() 54 | for fmt in (after_any + after_epub): 55 | try: 56 | if fmt == 'text': 57 | outputs[fmt] = make_texts(app, data) 58 | elif fmt == 'web': 59 | outputs[fmt] = make_webs(app, data) 60 | elif fmt == 'epub': 61 | outputs[fmt] = make_epubs(app, data) 62 | else: 63 | outputs[fmt] = make_calibres(app, outputs['epub'], fmt) 64 | # 
end if 65 | except Exception as err: 66 | logger.exception('Failed to generate "%s": %s' % (fmt, err)) 67 | # end try 68 | # end for 69 | 70 | return outputs 71 | # end def 72 | -------------------------------------------------------------------------------- /lncrawl/bots/discord/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import logging.config 4 | from colorama import Fore 5 | from ...core.arguments import get_args 6 | 7 | # The special signal character for crawler commands 8 | signal = os.getenv('DISCORD_SIGNAL_CHAR') or '!' 9 | max_workers = os.getenv('DISCORD_MAX_WORKERS', 10) 10 | 11 | # The public ip and path of the server to put files in 12 | public_ip = os.getenv('PUBLIC_ADDRESS', None) 13 | public_path = os.getenv('PUBLIC_DATA_PATH', None) 14 | 15 | os.makedirs('logs', exist_ok=True) 16 | logging.config.dictConfig({ 17 | # 18 | # Configure logging 19 | # Docs: https://docs.python.org/3.5/library/logging.config.html#configuration-dictionary-schema 20 | # Example: https://stackoverflow.com/a/7507842/1583052 21 | # 22 | 'version': 1, 23 | 'disable_existing_loggers': True, 24 | 'formatters': { 25 | 'console': { 26 | 'format': Fore.CYAN+'%(asctime)s'+Fore.RESET + ' ' + Fore.GREEN + '%(levelname)-8s'+Fore.RESET+' %(message)s', 27 | 'datefmt': '%H:%M:%S', 28 | }, 29 | 'file': { 30 | 'format': '%(asctime)s [%(process)d] %(levelname)s\n%(name)s: %(message)s\n', 31 | 'datefmt': '%Y-%m-%d %H:%M:%S', 32 | }, 33 | }, 34 | 'handlers': { 35 | 'console': { 36 | 'formatter': 'console', 37 | 'class': 'logging.StreamHandler', 38 | 'stream': 'ext://sys.stdout', # default is stderr 39 | }, 40 | 'file': { 41 | 'formatter': 'file', 42 | 'class': 'logging.handlers.RotatingFileHandler', 43 | 'filename': 'logs/discord-bot_%s.log' % (get_args().shard_id), 44 | 'maxBytes': 10 * 1024 * 1024, # 10 MB 45 | 'backupCount': 5, 46 | 'encoding': 'utf-8', 47 | }, 48 | }, 49 | 'loggers': { 50 | '': { # root logger 51 | 'handlers': ['console', 'file'], 52 | 'level': logging.INFO, 53 | }, 54 | }, 55 | }) 56 | -------------------------------------------------------------------------------- /lncrawl/sources/anythingnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from concurrent import futures 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('ANYTHING_NOVEL') 8 | 9 | 10 | class AnythingNovelCrawler(Crawler): 11 | base_url = 'https://anythingnovel.com/' 12 | 13 | def read_novel_info(self): 14 | logger.debug('Visiting %s', self.novel_url) 15 | soup = self.get_soup(self.novel_url) 16 | 17 | self.novel_title = soup.select( 18 | '#wrap .breadcrumbs span')[-1].text.strip() 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = soup.select_one('#content a img')['src'] 22 | logger.info('Novel cover: %s', self.novel_cover) 23 | 24 | volumes = set([]) 25 | for a in reversed(soup.select('#content div li a')): 26 | title = a.text.strip() 27 | chapter_id = len(self.chapters) + 1 28 | volume_id = 1 + (chapter_id - 1) // 100 29 | volumes.add(volume_id) 30 | self.chapters.append({ 31 | 'id': chapter_id, 32 | 'volume': volume_id, 33 | 'title': title, 34 | 'url': a['href'], 35 | }) 36 | # end for 37 | 38 | self.chapters.sort(key=lambda x: x['id']) 39 | self.volumes = [{'id': x, 'title': ''} for x in volumes] 40 | # end def 41 | 42 | def download_chapter_body(self, chapter): 43 | 
logger.info('Downloading %s', chapter['url']) 44 | soup = self.get_soup(chapter['url']) 45 | content = soup.select_one('div#content') 46 | self.clean_contents(content) 47 | body = content.select('p') 48 | body = [str(p) for p in body if self.should_take(p)] 49 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 50 | # end def 51 | 52 | def should_take(self, p): 53 | txt = p.text.strip().lower() 54 | return txt and txt != 'advertisement' 55 | # end def 56 | # end class 57 | -------------------------------------------------------------------------------- /lncrawl/sources/chinesefantasy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from ..utils.crawler import Crawler 3 | import requests 4 | import re 5 | import logging 6 | import json 7 | 8 | logger = logging.getLogger('CHINESE_FANTASY_NOVELS') 9 | 10 | 11 | class ChineseFantasyNovels(Crawler): 12 | base_url = 'https://m.chinesefantasynovels.com/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | if not self.novel_url.endswith('/'): 17 | self.novel_url += '/' 18 | # end if 19 | logger.debug('Visiting %s', self.novel_url) 20 | soup = self.get_soup(self.novel_url) 21 | 22 | self.novel_title = soup.select_one('.btitle h1').text 23 | logger.info('Novel title: %s', self.novel_title) 24 | 25 | self.novel_author = soup.select_one('.bookinfo .status').text 26 | logger.info('%s', self.novel_author) 27 | 28 | volumes = set([]) 29 | for a in reversed(soup.select('dl.chapterlist a')): 30 | ch_title = a.text.strip() 31 | ch_id = [int(x) for x in re.findall(r'\d+', ch_title)] 32 | ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 33 | vol_id = 1 + len(self.chapters) // 100 34 | volumes.add(vol_id) 35 | self.chapters.append({ 36 | 'id': ch_id, 37 | 'volume': vol_id, 38 | 'title': ch_title, 39 | 'url': self.absolute_url(a['href']), 40 | }) 41 | # end def 42 | 43 | self.volumes = [{'id': x, 'title': ''} for x in volumes] 44 | # end def 45 | 46 | def download_chapter_body(self, chapter): 47 | '''Download body of a single chapter and return as clean html format.''' 48 | logger.info('Downloading %s', chapter['url']) 49 | soup = self.get_soup(chapter['url']) 50 | content = soup.select_one('#BookText') 51 | content.select_one('.link').decompose() 52 | body = self.extract_contents(content) 53 | return '
<p>' + '</p><p>
'.join(body) + ' 1 59 | ]) 60 | # end def 61 | # end class 62 | -------------------------------------------------------------------------------- /lncrawl/sources/webnovelonlinecom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import json 4 | import logging 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WEBNOVELONLINE_DOT_COM') 8 | 9 | 10 | class WebnovelOnlineDotComCrawler(Crawler): 11 | base_url = 'https://webnovelonline.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | url = self.novel_url 16 | logger.debug('Visiting %s', url) 17 | soup = self.get_soup(url) 18 | 19 | self.novel_title = soup.select_one('.novel-info .novel-desc h1').text 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = soup.select_one('meta[property="og:image"]')['content'] 23 | logger.info('Novel cover: %s', self.novel_title) 24 | 25 | volumes = set([]) 26 | for a in reversed(soup.select('.chapter-list .item a')): 27 | chap_id = len(self.chapters) + 1 28 | vol_id = 1 + len(self.chapters) // 100 29 | volumes.add(vol_id) 30 | self.chapters.append({ 31 | 'id': chap_id, 32 | 'volume': vol_id, 33 | 'title': a.text.strip(), 34 | 'url': self.absolute_url(a['href']), 35 | }) 36 | # end for 37 | 38 | self.volumes = [{'id': x, 'title': ''} for x in volumes] 39 | # end def 40 | 41 | def download_chapter_body(self, chapter): 42 | '''Download body of a single chapter and return as clean html format.''' 43 | logger.info('Visiting %s', chapter['url']) 44 | soup = self.get_soup(chapter['url']) 45 | 46 | for script in soup.select('script'): 47 | text = script.string 48 | if not text or not text.startswith('window._INITIAL_DATA_'): 49 | continue 50 | # end if 51 | content = re.findall(r',"chapter":(".+")},', text)[0] 52 | content = json.loads(content).strip() 53 | return '
<p>' + '</p><p>'.join(content.split('\n\n')) + '</p>
' 54 | # end for 55 | 56 | return '' 57 | # end def 58 | # end class 59 | -------------------------------------------------------------------------------- /lncrawl/sources/listnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from urllib.parse import urlparse 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('LIST_NOVEL') 9 | 10 | 11 | class ListNovelCrawler(Crawler): 12 | base_url = 'https://listnovel.com/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | possible_title = soup.select_one('.post-title h1') 20 | for span in possible_title.select('span'): 21 | span.extract() 22 | # end for 23 | self.novel_title = possible_title.text.strip() 24 | logger.info('Novel title: %s', self.novel_title) 25 | 26 | self.novel_cover = self.absolute_url(soup.select_one('.summary_image a img')['data-src']) 27 | logger.info('Novel cover: %s', self.novel_cover) 28 | 29 | self.novel_author = ' '.join([ 30 | a.text.strip() 31 | for a in soup.select('.author-content a[href*="manga-author"]') 32 | ]) 33 | logger.info('%s', self.novel_author) 34 | 35 | for a in reversed(soup.select('.main-col li.wp-manga-chapter a')): 36 | chap_id = len(self.chapters) + 1 37 | vol_id = 1 + len(self.chapters) // 100 38 | if chap_id % 100 == 1: 39 | self.volumes.append({'id': vol_id}) 40 | # end if 41 | self.chapters.append({ 42 | 'id': chap_id, 43 | 'volume': vol_id, 44 | 'title': a.text.strip(), 45 | 'url': self.absolute_url(a['href']), 46 | }) 47 | # end for 48 | # end def 49 | 50 | def download_chapter_body(self, chapter): 51 | '''Download body of a single chapter and return as clean html format.''' 52 | logger.info('Visiting %s', chapter['url']) 53 | soup = self.get_soup(chapter['url']) 54 | contents = soup.select('.reading-content p') 55 | return ''.join([str(p) for p in contents]) 56 | # end def 57 | # end class 58 | -------------------------------------------------------------------------------- /lncrawl/sources/novelringan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVELRINGAN') 8 | 9 | 10 | class NovelRinganCrawler(Crawler): 11 | base_url = 'https://novelringan.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select_one('h1.entry-title').text 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = self.absolute_url( 22 | soup.select_one('div.imgprop img')['src']) 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | self.novel_author = 'Translated by novelringan.com' 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | for a in reversed(soup.select('.bxcl ul li a')): 29 | chap_id = len(self.chapters) + 1 30 | if len(self.chapters) % 100 == 0: 31 | vol_id = chap_id//100 + 1 32 | vol_title = 'Volume ' + str(vol_id) 33 | self.volumes.append({ 34 | 'id': vol_id, 35 | 'title': vol_title, 36 | }) 37 | # end if 38 | self.chapters.append({ 39 | 'id': chap_id, 40 | 'volume': vol_id, 41 | 'url': self.absolute_url(a['href']), 42 | 'title': a.text.strip() 
or ('Chapter %d' % chap_id), 43 | }) 44 | # end for 45 | # end def 46 | 47 | def download_chapter_body(self, chapter): 48 | '''Download body of a single chapter and return as clean html format.''' 49 | logger.info('Downloading %s', chapter['url']) 50 | soup = self.get_soup(chapter['url']) 51 | 52 | soup.select_one('#bacotan').extract() 53 | contents = soup.select('.entry-content p') 54 | 55 | body = [str(p) for p in contents if p.text.strip()] 56 | 57 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 58 | 59 | # end def 60 | # end class 61 | -------------------------------------------------------------------------------- /lncrawl/sources/ranobelibme.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | import re 5 | 6 | logger = logging.getLogger("RANOBE_LIB_ME") 7 | 8 | 9 | class RanobeLibCrawler(Crawler): 10 | base_url = 'https://ranobelib.me/' 11 | 12 | def read_novel_info(self): 13 | logger.info('Visiting %s', self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.select_one('.manga-title h1').text 17 | logger.info('Novel title: %s', self.novel_title) 18 | 19 | self.novel_cover = self.absolute_url( 20 | soup.select_one('.manga__image img')['src']) 21 | logger.info('Novel cover: %s', self.novel_cover) 22 | 23 | novel_link = soup.select_one("a[href*=author]") 24 | if novel_link: 25 | self.novel_author = novel_link.text.strip().title() 26 | # end if 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | chapters = soup.select('.chapter-item') 30 | chapters.reverse() 31 | 32 | volumes = set() 33 | for a in chapters: 34 | chap_id = len(self.chapters) + 1 35 | 36 | vol_id = int(a['data-volume']) 37 | volumes.add(vol_id) 38 | 39 | link = a.select_one('a') 40 | chapter_title = re.sub(r'\s+', ' ', link.text).strip() 41 | if not chapter_title: 42 | chapter_title = 'Том %d. Глава %d' % (int(vol_id), int(a['data-number'])) 43 | # end if 44 | 45 | self.chapters.append({ 46 | 'id': chap_id, 47 | 'volume': vol_id, 48 | 'url': self.absolute_url(link['href']), 49 | 'title': chapter_title, 50 | }) 51 | # end for 52 | 53 | self.volumes = [{'id': x} for x in volumes] 54 | # end def 55 | 56 | def download_chapter_body(self, chapter): 57 | logger.info('Downloading %s', chapter['url']) 58 | soup = self.get_soup(chapter['url']) 59 | 60 | div = soup.select_one('.reader-container') 61 | 62 | body = self.extract_contents(div) 63 | 64 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 65 | # end def 66 | # end class 67 | -------------------------------------------------------------------------------- /lncrawl/sources/webnovelonline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('WEBNOVEL_ONLINE') 7 | 8 | 9 | class WebnovelOnlineCrawler(Crawler): 10 | base_url = 'https://webnovel.online/' 11 | 12 | def read_novel_info(self): 13 | '''Get novel title, autor, cover etc''' 14 | url = self.novel_url 15 | logger.debug('Visiting %s', url) 16 | soup = self.get_soup(url) 17 | 18 | img = soup.select_one('main img.cover') 19 | self.novel_title = img['title'].strip() 20 | self.novel_cover = self.absolute_url(img['src']) 21 | 22 | span = soup.select_one('header span.send-author-event') 23 | if span: 24 | self.novel_author = span.text.strip() 25 | # end if 26 | 27 | chap_id = 0 28 | for a in soup.select('#info a.on-navigate-part'): 29 | vol_id = chap_id // 100 + 1 30 | if vol_id > len(self.volumes): 31 | self.volumes.append({ 32 | 'id': vol_id, 33 | 'title': 'Volume %d' % vol_id 34 | }) 35 | # end if 36 | 37 | chap_id += 1 38 | self.chapters.append({ 39 | 'id': chap_id, 40 | 'volume': vol_id, 41 | 'title': a.text.strip(), 42 | 'url': self.absolute_url(a['href']), 43 | }) 44 | # end for 45 | # end def 46 | 47 | def download_chapter_body(self, chapter): 48 | '''Download body of a single chapter and return as clean html format.''' 49 | logger.info('Visiting %s', chapter['url']) 50 | soup = self.get_soup(chapter['url']) 51 | 52 | strong = soup.select_one('#story-content strong') 53 | if strong and re.search(r'Chapter \d+', strong.text): 54 | chapter['title'] = strong.text.strip() 55 | logger.info('Updated title: %s', chapter['title']) 56 | # end if 57 | 58 | self.bad_tags += ['h1', 'h3', 'hr'] 59 | contents = soup.select_one('#story-content') 60 | body = self.extract_contents(contents) 61 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 62 | # end def 63 | # end class 64 | -------------------------------------------------------------------------------- /lncrawl/sources/flyinglines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from urllib.parse import urlparse 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('FLYING LINES') 9 | 10 | chapter_body_url = 'https://www.flying-lines.com/h5/novel/%s/%s?accessToken=&isFirstEnter=1&webdriver=0' 11 | 12 | 13 | class FlyingLinesCrawler(Crawler): 14 | base_url = 'https://www.flying-lines.com/' 15 | 16 | def read_novel_info(self): 17 | '''Get novel title, autor, cover etc''' 18 | logger.debug('Visiting %s', self.novel_url) 19 | soup = self.get_soup(self.novel_url) 20 | 21 | self.novel_title = soup.select_one('.novel-info .title h2').text 22 | logger.info('Novel title: %s', self.novel_title) 23 | 24 | self.novel_cover = self.absolute_url( 25 | soup.select_one('.novel .novel-thumb img')['data-src']) 26 | logger.info('Novel cover: %s', self.novel_cover) 27 | 28 | authors = [x.text.strip() 29 | for x in soup.select('.novel-info ul.profile li')] 30 | self.novel_author = ', '.join(authors) 31 | logger.info('%s', self.novel_author) 32 | 33 | self.novel_id = urlparse(self.novel_url).path.split('/')[2] 34 | logger.info("Novel id: %s", self.novel_id) 35 | 36 | for a in soup.select('ul.volume-chapters li a'): 37 | chap_id = int(a['data-chapter-number']) 38 | vol_id = 1 + (chap_id - 1) // 100 39 | if len(self.chapters) % 100 == 0: 40 | self.volumes.append({'id': vol_id}) 41 | # end if 42 | self.chapters.append({ 43 | 'id': chap_id, 44 | 'volume': vol_id, 45 | 'title': a.text.strip(), 46 | 'url': self.absolute_url(a['href']), 47 | }) 48 | # end for 49 | # end def 50 | 51 | def download_chapter_body(self, chapter): 52 | '''Download body of a single chapter and return as clean html format.''' 53 | url = chapter_body_url % (self.novel_id, chapter['id']) 54 | logger.info('Downloading %s', url) 55 | response = self.submit_form(url) 56 | data = response.json() 57 | print(data) 58 | return data['data']['content'] 59 | # end def 60 | # end class 61 | -------------------------------------------------------------------------------- /lncrawl/sources/wuxialeague.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('WUXIA_LEAGUE') 7 | 8 | 9 | class WuxiaLeagueCrawler(Crawler): 10 | base_url = 'https://www.wuxialeague.com/' 11 | 12 | def read_novel_info(self): 13 | logger.debug('Visiting %s', self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.select_one('#bookinfo .d_title h1').text 17 | logger.info('Novel title: %s', self.novel_title) 18 | 19 | self.novel_cover = self.absolute_url(soup.select_one('#bookimg img')['src']) 20 | logger.info('Novel cover: %s', self.novel_cover) 21 | 22 | possible_authors = [a.text for a in soup.select('#bookinfo a[href*="/author/"]')] 23 | self.novel_author = ', '.join(possible_authors) 24 | logger.info('Novel author: %s', self.novel_author) 25 | 26 | for a in soup.select('#chapterList li a'): 27 | chap_id = 1 + len(self.chapters) 28 | vol_id = 1 + len(self.chapters) // 100 29 | if chap_id % 100 == 1: 30 | self.volumes.append({'id': vol_id}) 31 | # end if 32 | self.chapters.append({ 33 | 'id': chap_id, 34 | 'volume': vol_id, 35 | 
'title': a.text.strip(), 36 | 'url': self.absolute_url(a['href']), 37 | }) 38 | # end for 39 | # end def 40 | 41 | def download_chapter_body(self, chapter): 42 | '''Download body of a single chapter and return as clean html format''' 43 | logger.info('Downloading %s', chapter['url']) 44 | soup = self.get_soup(chapter['url']) 45 | 46 | body = '' 47 | title_found = False 48 | for p in soup.select('#TextContent > p'): 49 | if not p.text.strip(): 50 | continue 51 | # end if 52 | clean_first = ''.join(re.findall(r'([a-z0-9]+)', p.text.lower())) 53 | clean_title = ''.join(re.findall(r'([a-z0-9]+)', chapter['title'].lower())) 54 | if clean_first == clean_title: 55 | continue 56 | # end if 57 | body += str(p).strip() 58 | # end for 59 | 60 | return body 61 | # end def 62 | # end class 63 | -------------------------------------------------------------------------------- /lncrawl/core/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Interactive application to take user inputs 4 | """ 5 | import logging 6 | import os 7 | 8 | import colorama 9 | import requests 10 | import win_unicode_console 11 | from colorama import Fore 12 | 13 | from ..assets.version import get_value as get_version 14 | from ..bots import run_bot 15 | from ..utils.update_checker import check_updates 16 | from .arguments import get_args 17 | from .display import (cancel_method, debug_mode, description, epilog, 18 | error_message, input_suppression) 19 | 20 | logger = logging.Logger('CORE') 21 | 22 | 23 | def init(): 24 | os.environ['version'] = get_version() 25 | 26 | win_unicode_console.enable() 27 | colorama.init() 28 | description() 29 | 30 | args = get_args() 31 | 32 | levels = ['NOTSET', 'WARN', 'INFO', 'DEBUG'] 33 | level = os.getenv('LOG_LEVEL') 34 | if not level: 35 | level = levels[args.log] if args.log else 'NOTSET' 36 | # end if 37 | if level != 'NOTSET': 38 | os.environ['debug_mode'] = 'yes' 39 | logging.basicConfig( 40 | level=logging.getLevelName(level), 41 | format=Fore.CYAN + '%(asctime)s ' 42 | + Fore.RED + '[%(levelname)s] ' 43 | + Fore.YELLOW + '(%(name)s)\n' 44 | + Fore.WHITE + '%(message)s' + Fore.RESET, 45 | ) 46 | debug_mode(level) 47 | # end if 48 | 49 | if args.suppress: 50 | input_suppression() 51 | print(args) 52 | # end if 53 | 54 | if args.bot: 55 | os.environ['BOT'] = args.bot 56 | # end if 57 | 58 | for key, val in args.extra.items(): 59 | os.environ[key] = val[0] 60 | # end for 61 | 62 | # requests.urllib3.disable_warnings( 63 | # requests.urllib3.exceptions.InsecureRequestWarning) 64 | # # end if 65 | # end def 66 | 67 | 68 | def start_app(): 69 | init() 70 | 71 | check_updates() 72 | cancel_method() 73 | 74 | try: 75 | bot = os.getenv('BOT', '').lower() 76 | run_bot(bot) 77 | except Exception as err: 78 | if os.getenv('debug_mode') == 'yes': 79 | raise err 80 | else: 81 | error_message(err) 82 | # end if 83 | # end try 84 | 85 | epilog() 86 | 87 | # if Icons.isWindows and get_args().suppress is False: 88 | # input('Press ENTER to exit...') 89 | # # end if 90 | # end def 91 | -------------------------------------------------------------------------------- /lncrawl/sources/fullnovellive.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | 5 | logger = logging.getLogger('FULLNOVEL_LIVE') 6 | 7 | NOVEL_SEARCH = 'http://fullnovel.live/search/%s' 8 | 9 | 10 | class FullnovelLiveCrawler(Crawler): 11 | 
base_url = 'http://fullnovel.live/' 12 | 13 | def search_novel(self, query): 14 | '''Gets a list of (title, url) matching the given query''' 15 | results = [] 16 | soup = self.get_soup(NOVEL_SEARCH % query) 17 | for grid in soup.select('.grid .v-grid'): 18 | a = grid.select_one('h4 a') 19 | info = grid.select_one('.info-line a').text 20 | results.append({ 21 | 'title': (a['title'] or a.text).strip(), 22 | 'url': self.absolute_url(a['href']), 23 | 'info': info 24 | }) 25 | # end for 26 | return results 27 | # end def 28 | 29 | def read_novel_info(self): 30 | '''Get novel title, autor, cover etc''' 31 | soup = self.get_soup(self.novel_url) 32 | self.novel_title = soup.select_one('.info h1.title a').text.strip() 33 | self.novel_cover = self.absolute_url( 34 | soup.select_one('.info .image img')['src']) 35 | 36 | chapters = soup.select('.scroll-eps a') 37 | chapters.reverse() 38 | 39 | for x in chapters: 40 | chap_id = len(self.chapters) + 1 41 | if len(self.chapters) % 100 == 0: 42 | vol_id = chap_id//100 + 1 43 | vol_title = 'Volume ' + str(vol_id) 44 | self.volumes.append({ 45 | 'id': vol_id, 46 | 'title': vol_title, 47 | }) 48 | # end if 49 | self.chapters.append({ 50 | 'id': chap_id, 51 | 'volume': vol_id, 52 | 'url': self.absolute_url(x['href']), 53 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 54 | }) 55 | # end for 56 | # end def 57 | 58 | def download_chapter_body(self, chapter): 59 | '''Download body of a single chapter and return as clean html format.''' 60 | soup = self.get_soup(chapter['url']) 61 | contents = soup.select_one('.page .divContent') 62 | body = self.extract_contents(contents) 63 | return '
<p>' + '</p><p>'.join(body) + '</p>' p'): 51 | for strong in p.select('strong'): 52 | strong.name = 'span' 53 | # end for 54 | if p.text.strip(): 55 | body += str(p).strip() 56 | # end if 57 | # end for 58 | 59 | body += '<p>*******</p>
' 60 | for p in soup.select('#authors_note > p'): 61 | if p.text.strip(): 62 | body += str(p).strip() 63 | # end if 64 | # end for 65 | 66 | return body 67 | # end def 68 | # end class 69 | -------------------------------------------------------------------------------- /README.pip: -------------------------------------------------------------------------------- 1 | Lightnovel Crawler 2 | ----------------------- 3 | 4 | Download lightnovels from various online sources and generate output in different formats, e.g. epub, mobi, json, html, text, docx and pdf. 5 | 6 | Supported sources: 7 | - http://boxnovel.org 8 | - http://liberspark.com 9 | - http://novelfull.com 10 | - http://tiknovel.com 11 | - http://www.machinenoveltranslation. 12 | - http://www.tiknovel.com 13 | - http://zenithnovels.com 14 | - https://4scanlation.xyz 15 | - https://9kqw.com 16 | - https://anythingnovel.com 17 | - https://babelnovel.com 18 | - https://bestlightnovel.com 19 | - https://book.qidian.com 20 | - https://boxnovel.com 21 | - https://creativenovels.com 22 | - https://crescentmoon.blog 23 | - https://es.mtlnovel.com 24 | - https://fr.mtlnovel.com 25 | - https://id.mtlnovel.com 26 | - https://kiss-novel.com 27 | - https://kisslightnovels.info 28 | - https://light-novel.online 29 | - https://listnovel.com 30 | - https://litnet.com 31 | - https://lnmtl.com 32 | - https://m.chinesefantasynovels.com 33 | - https://m.novelspread.com 34 | - https://m.romanticlovebooks.com 35 | - https://m.wuxiaworld.co 36 | - https://meionovel.com 37 | - https://myoniyonitranslations.com 38 | - https://novelfull.com 39 | - https://novelonlinefull.com 40 | - https://novelraw.blogspot.com 41 | - https://novelsrock.com 42 | - https://ranobelib.me 43 | - https://rewayat.club 44 | - https://tomotranslations.com 45 | - https://volarenovels.com 46 | - https://webnovel.online 47 | - https://webnovelindonesia.com 48 | - https://webnovelonline.com 49 | - https://wordexcerpt.com 50 | - https://wuxiaworld.online 51 | - https://www.aixdzs.com 52 | - https://www.asianhobbyist.com 53 | - https://www.idqidian.us 54 | - https://www.machine-translation.org 55 | - https://www.mtlnovel.com 56 | - https://www.novelall.com 57 | - https://www.novelringan.com 58 | - https://www.novelspread.com 59 | - https://www.qidian.com 60 | - https://www.readlightnovel.org 61 | - https://www.readnovelfull.com 62 | - https://www.romanticlovebooks.com 63 | - https://www.royalroad.com 64 | - https://www.scribblehub.com 65 | - https://www.shinsori.com 66 | - https://www.tapread.com 67 | - https://www.translateindo.com 68 | - https://www.wattpad.com 69 | - https://www.webnovel.com 70 | - https://www.worldnovel.online 71 | - https://www.wuxialeague.com 72 | - https://www.wuxiaworld.co 73 | - https://www.wuxiaworld.com 74 | - https://www.wuxiaworld.site 75 | 76 | Visit https://github.com/dipu-bd/lightnovel-crawler for more details. 
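Quick programmatic start (a minimal sketch, not a canonical command line): the entry point start_app() defined in lncrawl/core/__init__.py reads the BOT and LOG_LEVEL environment variables before dispatching to a bot through run_bot(). The 'console' bot name below is an assumption; substitute any available bot.

    import os
    from lncrawl.core import start_app

    os.environ['BOT'] = 'console'     # assumed bot name; read back by start_app() via os.getenv('BOT')
    os.environ['LOG_LEVEL'] = 'INFO'  # any level other than NOTSET enables the logging config in init()
    start_app()                       # runs init(), check_updates(), then run_bot(BOT)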
77 | -------------------------------------------------------------------------------- /lncrawl/core/novel_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To get the novel info 4 | """ 5 | import re 6 | import os 7 | import json 8 | from ..utils.crawler import Crawler 9 | 10 | 11 | def format_novel(crawler: Crawler): 12 | crawler.novel_title = crawler.novel_title.strip() 13 | crawler.novel_author = crawler.novel_author.strip() 14 | # crawler.novel_title = crawler.cleanup_text(crawler.novel_title) 15 | # crawler.novel_author = crawler.cleanup_text(crawler.novel_author) 16 | format_volumes(crawler) 17 | format_chapters(crawler) 18 | # end def 19 | 20 | 21 | def format_volumes(crawler: Crawler): 22 | for vol in crawler.volumes: 23 | vol['chapter_count'] = 0 24 | vol['final_chapter'] = 0 25 | vol['start_chapter'] = 1e8 26 | title = 'Volume %d' % vol['id'] 27 | if not ('title' in vol and vol['title']): 28 | vol['title'] = title 29 | # end if 30 | # end for 31 | # end def 32 | 33 | 34 | def format_chapters(crawler: Crawler): 35 | for item in crawler.chapters: 36 | title = '#%d' % item['id'] 37 | if not ('title' in item and item['title']): 38 | item['title'] = title 39 | # end if 40 | 41 | volume = [x for x in crawler.volumes if x['id'] == item['volume']] 42 | if len(volume) == 0: 43 | raise Exception('Unknown volume %s for chapter %s' % (item['volume'], item['id'])) 44 | else: 45 | volume = volume[0] 46 | # end if 47 | 48 | item['volume_title'] = volume['title'] 49 | 50 | volume['chapter_count'] += 1 51 | volume['final_chapter'] = item['id'] if volume['final_chapter'] < item['id'] else volume['final_chapter'] 52 | volume['start_chapter'] = item['id'] if volume['start_chapter'] > item['id'] else volume['start_chapter'] 53 | # end for 54 | # end def 55 | 56 | 57 | def save_metadata(crawler, output_path): 58 | data = { 59 | 'url': crawler.novel_url, 60 | 'title': crawler.novel_title, 61 | 'author': crawler.novel_author, 62 | 'cover': crawler.novel_cover, 63 | 'volumes': crawler.volumes, 64 | 'chapters': crawler.chapters, 65 | 'rtl': crawler.is_rtl, 66 | } 67 | file_name = os.path.join(output_path, 'json', 'meta.json') 68 | os.makedirs(os.path.dirname(file_name), exist_ok=True) 69 | with open(file_name, 'w', encoding="utf-8") as file: 70 | json.dump(data, file, indent=2) 71 | # end with 72 | # end def 73 | -------------------------------------------------------------------------------- /lncrawl/sources/aixdzs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from urllib.parse import urlparse 6 | 7 | import requests 8 | 9 | from ..utils.crawler import Crawler 10 | 11 | logger = logging.getLogger('AIXDZS_CRAWLER') 12 | 13 | chapter_list_url = 'https://read.aixdzs.com/%s' 14 | 15 | 16 | class AixdzsCrawler(Crawler): 17 | base_url = 'https://www.aixdzs.com' 18 | 19 | def read_novel_info(self): 20 | '''Get novel title, autor, cover etc''' 21 | if not self.novel_url.endswith('/'): 22 | self.novel_url += '/' 23 | # end if 24 | logger.debug('Visiting %s', self.novel_url) 25 | soup = self.get_soup(self.novel_url) 26 | 27 | self.novel_cover = soup.select_one('meta[property="og:image"]')['content'] 28 | logger.info('Novel cover: %s', self.novel_cover) 29 | 30 | self.novel_title = soup.select_one('meta[property="og:novel:book_name"]')['content'] 31 | logger.info('Novel title: %s', self.novel_title) 32 | 33 | 
self.novel_author = soup.select_one('meta[property="og:novel:author"]')['content'] 34 | logger.info('%s', self.novel_author) 35 | 36 | parsed_url = urlparse(self.novel_url) 37 | parsed_path = parsed_url.path.strip('/').split('/') 38 | chapter_url = chapter_list_url % ('/'.join(parsed_path[1:])) 39 | logger.debug('Visiting %s', chapter_url) 40 | soup = self.get_soup(chapter_url) 41 | 42 | volumes = set([]) 43 | for a in reversed(soup.select('div.catalog li a')): 44 | ch_id = len(self.chapters) + 1 45 | vol_id = 1 + len(self.chapters) // 100 46 | volumes.add(vol_id) 47 | self.chapters.append({ 48 | 'id': ch_id, 49 | 'volume': vol_id, 50 | 'title': a.text, 51 | 'url': self.absolute_url(a['href'], page_url=chapter_url), 52 | }) 53 | # end def 54 | 55 | self.volumes = [{'id': x, 'title': ''} for x in volumes] 56 | # end def 57 | 58 | def download_chapter_body(self, chapter): 59 | '''Download body of a single chapter and return as clean html format.''' 60 | logger.info('Downloading %s', chapter['url']) 61 | soup = self.get_soup(chapter['url']) 62 | chapter['body_lock'] = True 63 | contents = soup.select('.content > p') 64 | contents = [str(p) for p in contents if p.text.strip()] 65 | return ''.join(contents) 66 | # end def 67 | # end class 68 | -------------------------------------------------------------------------------- /lncrawl/sources/tapread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from urllib.parse import urlparse 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('TAPREAD') 7 | 8 | chapter_list_url = 'https://www.tapread.com/book/contents?bookId=%s' 9 | chapter_url = 'https://www.tapread.com/book/chapter?bookId=%s&chapterId=%s' 10 | 11 | 12 | class TapreadCrawler(Crawler): 13 | base_url = 'https://www.tapread.com/' 14 | 15 | def read_novel_info(self): 16 | '''Get novel title, autor, cover etc''' 17 | logger.debug('Visiting %s', self.novel_url) 18 | soup = self.get_soup(self.novel_url) 19 | 20 | self.novel_title = soup.select_one('.book-name').text.strip() 21 | logger.info('Novel title: %s', self.novel_title) 22 | 23 | try: 24 | self.novel_cover = self.absolute_url( 25 | soup.select_one('img.bg-img, img.cover-img, .book-img img')['src']) 26 | except Exception: 27 | pass 28 | # end try 29 | logger.info('Novel cover: %s', self.novel_cover) 30 | 31 | try: 32 | possible_authors = [] 33 | for div in soup.select('.author, .translator'): 34 | possible_authors.append( 35 | ': '.join([x.strip() for x in div.text.split(':')])) 36 | # end for 37 | self.novel_author = ', '.join(possible_authors) 38 | except Exception: 39 | pass 40 | # end try 41 | logger.info(self.novel_author) 42 | 43 | path = urlparse(self.novel_url).path 44 | book_id = path.split('/')[3] 45 | data = self.get_json(chapter_list_url % book_id) 46 | 47 | volumes = set() 48 | for chap in data['result']['chapterList']: 49 | chap_id = chap['chapterNo'] 50 | vol_id = (chap_id - 1) // 100 + 1 51 | volumes.add(vol_id) 52 | self.chapters.append({ 53 | 'id': chap_id, 54 | 'volume': vol_id, 55 | 'title': chap['chapterName'], 56 | 'url': chapter_url % (chap['bookId'], chap['chapterId']), 57 | }) 58 | # end for 59 | 60 | self.volumes = [{'id': x} for x in volumes] 61 | # end def 62 | 63 | def download_chapter_body(self, chapter): 64 | '''Download body of a single chapter and return as clean html format''' 65 | logger.info('Downloading %s', chapter['url']) 66 | data = self.get_json(chapter['url']) 67 | return 
data['result']['content'] 68 | # end def 69 | # end class 70 | -------------------------------------------------------------------------------- /lncrawl/sources/tomotrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('TOMO_TRANSLATIONS') 8 | 9 | 10 | class TomoTransCrawler(Crawler): 11 | base_url = 'https://tomotranslations.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select_one('article h1.title').text 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = self.absolute_url( 22 | soup.select_one('article figure.wp-block-image img')['data-orig-file']) 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | author = 'Tomo Translations' 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | volumes = set() 29 | for a in soup.select('article section.entry a[href^="%s"]' % self.home_url): 30 | chap_id = len(self.chapters) + 1 31 | chap_url = self.absolute_url(a['href']) 32 | possible_vol = re.findall(r'-volume-(\d+)-', chap_url) 33 | if not len(possible_vol): 34 | continue 35 | # end if 36 | vol_id = int(possible_vol[0]) 37 | volumes.add(vol_id) 38 | self.chapters.append({ 39 | 'id': chap_id, 40 | 'volume': vol_id, 41 | 'url': chap_url, 42 | 'title': a.text.strip(), 43 | }) 44 | # end for 45 | 46 | self.volumes = [{'id': x} for x in volumes] 47 | # end def 48 | 49 | def download_chapter_body(self, chapter): 50 | '''Download body of a single chapter and return as clean html format.''' 51 | logger.info('Downloading %s', chapter['url']) 52 | soup = self.get_soup(chapter['url']) 53 | 54 | body = '' 55 | for tag in soup.select('article section.entry > *'): 56 | if tag.name == 'hr' and tag.has_attr("class") and 'is-style-dots' in tag.get('class'): 57 | body += '
<p>—————–</p>
' 58 | elif tag.name == 'p': 59 | if tag.find('strong'): 60 | chapter['title'] = tag.text.strip() 61 | elif tag.find('a') and re.match(r'Previous|Next', tag.find('a').text): 62 | pass 63 | else: 64 | body += str(tag) 65 | # end if 66 | # end if 67 | # end for 68 | 69 | return body 70 | # end def 71 | # end class 72 | -------------------------------------------------------------------------------- /lncrawl/sources/wattpad.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WATTPAD') 8 | 9 | 10 | class WattpadCrawler(Crawler): 11 | base_url = 'https://www.wattpad.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select('h1')[0].get_text().strip() 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = self.absolute_url( 22 | soup.select('div.cover.cover-lg img')[0]['src']) 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | self.novel_author = soup.select('div.author-info strong a')[0].get_text() 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | description = soup.select('h2.description')[0].get_text() 29 | 30 | chapters = soup.select('ul.table-of-contents a') 31 | # chapters.reverse() 32 | 33 | for a in chapters: 34 | chap_id = len(self.chapters) + 1 35 | vol_id = chap_id//100 + 1 36 | if len(self.chapters) % 100 == 0: 37 | vol_title = 'Volume ' + str(vol_id) 38 | self.volumes.append({ 39 | 'id': vol_id, 40 | 'title': vol_title, 41 | }) 42 | # end if 43 | self.chapters.append({ 44 | 'id': chap_id, 45 | 'volume': vol_id, 46 | 'url': self.absolute_url(a['href']), 47 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 48 | }) 49 | # end for 50 | # end def 51 | 52 | def download_chapter_body(self, chapter): 53 | '''Download body of a single chapter and return as clean html format.''' 54 | logger.info('Downloading %s', chapter['url']) 55 | 56 | soup = self.get_soup(chapter['url']) 57 | pages = int(re.search('[1-9]', re.search('("pages":)([1-9])', str(soup)).group(0)).group(0)) 58 | chapter['title'] = soup.select('h2')[0].get_text().strip() 59 | contents = [] 60 | for i in range(1, pages+1): 61 | page_url = chapter['url'] + "/page/" + str(i) 62 | logger.info('Get body text from %s', page_url) 63 | soup_page = self.get_soup(page_url) 64 | for p in soup_page.select('pre p'): 65 | contents.append(p.text) 66 | 67 | return '
<p>' + '</p><p>'.join(contents) + '</p>
' 68 | # end def 69 | # end class 70 | -------------------------------------------------------------------------------- /lncrawl/sources/jpmtl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import ast 6 | import requests 7 | from ..utils.crawler import Crawler 8 | 9 | logger = logging.getLogger('JPMTL') 10 | 11 | book_url = 'https://jpmtl.com/books/%s' 12 | 13 | class JpmtlCrawler(Crawler): 14 | base_url = 'https://jpmtl.com/' 15 | 16 | def initialize(self): 17 | self.home_url = 'https://jpmtl.com' 18 | # end def 19 | 20 | def read_novel_info(self): 21 | '''Get novel title, autor, cover etc''' 22 | self.novel_id = self.novel_url.split('/')[-1] 23 | logger.info('Novel Id: %s', self.novel_id) 24 | 25 | self.novel_url = book_url % self.novel_id 26 | logger.debug('Visiting %s', self.novel_url) 27 | soup = self.get_soup(self.novel_url) 28 | 29 | self.novel_title =soup.select_one('h1.book-sidebar__title').text.strip() 30 | logger.info('Novel title: %s', self.novel_title) 31 | 32 | try: 33 | self.novel_cover = self.absolute_url( 34 | soup.select_one('.book-sidebar__img img')['src']) 35 | logger.info('Novel cover: %s', self.novel_cover) 36 | except Exception: 37 | logger.debug('Failed to get cover: %s', self.novel_url) 38 | # end try 39 | 40 | self.novel_author = soup.select_one('.book-sidebar__author .book-sidebar__info').text.strip() 41 | logger.info('Novel author: %s', self.novel_author) 42 | 43 | for a in soup.select('ol.book-volume__list li a'): 44 | chap_id = len(self.chapters) + 1 45 | if len(self.chapters) % 100 == 0: 46 | vol_id = chap_id//100 + 1 47 | vol_title = 'Volume ' + str(vol_id) 48 | self.volumes.append({ 49 | 'id': vol_id, 50 | 'title': vol_title, 51 | }) 52 | # end if 53 | self.chapters.append({ 54 | 'id': chap_id, 55 | 'volume': vol_id, 56 | 'url': self.absolute_url(a['href']), 57 | 'title': a.select_one('.book-ccontent__title').text.strip() or ('Chapter %d' % chap_id), 58 | }) 59 | # end for 60 | # end def 61 | 62 | def download_chapter_body(self, chapter): 63 | '''Download body of a single chapter and return as clean html format''' 64 | logger.info('Downloading %s', chapter['url']) 65 | soup = self.get_soup(chapter['url']) 66 | 67 | contents = soup.select('.chapter-content__content p') 68 | 69 | body = [str(p) for p in contents if p.text.strip()] 70 | 71 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 72 | # end def 73 | # end class 74 | -------------------------------------------------------------------------------- /lncrawl/sources/tiknovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import parse_qsl, urlparse 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('TIKNOVEL') 9 | 10 | chapter_details_url = 'https://tiknovel.com/book/ajaxchap' 11 | 12 | 13 | class TikNovelCrawler(Crawler): 14 | base_url = [ 15 | 'http://tiknovel.com/', 16 | 'https://tiknovel.com/', 17 | ] 18 | 19 | def read_novel_info(self): 20 | logger.debug('Visiting %s', self.novel_url) 21 | soup = self.get_soup(self.novel_url) 22 | 23 | self.novel_title = soup.select_one('#content .detail-wrap h1.detail-tit').text 24 | logger.info('Novel title: %s', self.novel_title) 25 | 26 | possible_authors = soup.select('#content table.detail-profile td') 27 | for td in possible_authors: 28 | if '作者' in td.find('strong').text: 29 | td.find('strong').extract() 30 | self.novel_author = td.text.strip() 31 | break 32 | # end if 33 | # end for 34 | logger.info('Novel author: %s', self.novel_author) 35 | 36 | self.novel_cover = self.absolute_url( 37 | soup.select_one('#content .detail-thumb-box img')['data-echo']) 38 | logger.info('Novel cover: %s', self.novel_cover) 39 | 40 | volumes = set() 41 | for a in soup.select('#content .contents-lst li a'): 42 | ch_id = int(a.find('span').text.strip()) 43 | vol_id = 1 + (ch_id - 1) // 100 44 | volumes.add(vol_id) 45 | self.chapters.append({ 46 | 'id': ch_id, 47 | 'volume': vol_id, 48 | 'title': a['title'], 49 | 'url': self.absolute_url(a['href']), 50 | }) 51 | # end for 52 | 53 | self.volumes = [{'id': x} for x in volumes] 54 | # end def 55 | 56 | def download_chapter_body(self, chapter): 57 | '''Download body of a single chapter and return as clean html format.''' 58 | chapter['body_lock'] = True 59 | query_str = urlparse(chapter['url']).query 60 | data_params = {x[0]: int(x[1]) for x in parse_qsl(query_str)} 61 | logging.debug("Requesting body with: %s", data_params) 62 | response = self.submit_form(chapter_details_url, data=data_params) 63 | data = response.json() 64 | chap_desc = data['data']['chap']['desc'] 65 | chap_desc = re.sub(r'(()|\n)+', '\n\n', chap_desc, flags=re.I) 66 | contents = chap_desc.split('\n\n') 67 | contents = [p for p in contents if p and p.strip()] 68 | return '
<p>' + '</p><p>'.join(contents) + '</p>
' 69 | # end def 70 | # end class 71 | -------------------------------------------------------------------------------- /lncrawl/sources/qidiancom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | 5 | logger = logging.getLogger('QIDIAN_COM') 6 | 7 | chapter_list_url = 'https://book.qidian.com/ajax/book/category?_csrfToken=%s&bookId=%s' 8 | chapter_details_url = 'https://read.qidian.com/chapter/%s' 9 | 10 | 11 | class QidianComCrawler(Crawler): 12 | base_url = [ 13 | 'https://book.qidian.com/', 14 | # 'https://www.qidian.com/', 15 | ] 16 | 17 | def initialize(self): 18 | self.home_url = 'https://www.qidian.com/' 19 | # end def 20 | 21 | def read_novel_info(self): 22 | '''Get novel title, autor, cover etc''' 23 | logger.debug('Visiting %s', self.novel_url) 24 | soup = self.get_soup(self.novel_url) 25 | 26 | self.novel_title = soup.select_one('.book-info h1 em').text 27 | logger.info('Novel title: %s', self.novel_title) 28 | 29 | self.novel_author = soup.select_one('.book-info h1 a.writer').text 30 | logger.info('Novel author: %s', self.novel_author) 31 | 32 | book_img = soup.select_one('#bookImg') 33 | self.novel_cover = self.absolute_url(book_img.find('img')['src']) 34 | self.novel_cover = '/'.join(self.novel_cover.split('/')[:-1]) 35 | logger.info('Novel cover: %s', self.novel_cover) 36 | 37 | self.book_id = book_img['data-bid'] 38 | logger.debug('Book Id: %s', self.book_id) 39 | 40 | self.csrf = self.cookies['_csrfToken'] 41 | logger.debug('CSRF Token: %s', self.csrf) 42 | 43 | volume_url = chapter_list_url % (self.csrf, self.book_id) 44 | logger.debug('Visiting %s', volume_url) 45 | data = self.get_json(volume_url) 46 | 47 | for volume in data['data']['vs']: 48 | vol_id = len(self.volumes) + 1 49 | self.volumes.append({ 50 | 'id': vol_id, 51 | 'title': volume['vN'], 52 | }) 53 | for chapter in volume['cs']: 54 | ch_id = len(self.chapters) + 1 55 | self.chapters.append({ 56 | 'id': ch_id, 57 | 'volume': vol_id, 58 | 'title': chapter['cN'], 59 | 'url': chapter_details_url % chapter['cU'], 60 | }) 61 | # end for 62 | # end for 63 | # end def 64 | 65 | def download_chapter_body(self, chapter): 66 | '''Download body of a single chapter and return as clean html format''' 67 | logger.info('Downloading %s', chapter['url']) 68 | soup = self.get_soup(chapter['url']) 69 | chapter['body_lock'] = True 70 | chapter['title'] = soup.select_one('h3.j_chapterName').text.strip() 71 | return soup.select_one('div.j_readContent').extract() 72 | # end def 73 | # end class 74 | -------------------------------------------------------------------------------- /lncrawl/assets/html_style.css: -------------------------------------------------------------------------------- 1 | @import url('https://fonts.googleapis.com/css?family=Merriweather:400,400i,700,700i'); 2 | 3 | html, 4 | body { 5 | margin: 0; 6 | padding: 0; 7 | width: 100%; 8 | height: 100%; 9 | position: relative; 10 | background-color: #323235; 11 | -webkit-font-smoothing: antialiased; 12 | } 13 | 14 | #content { 15 | padding: 10px 20px; 16 | max-width: 850px; 17 | margin: 10px auto; 18 | font-size: 16px; 19 | font-family: 'Merriweather', Georgia, serif; 20 | text-align: justify; 21 | line-height: 1.8; 22 | border-radius: 5px; 23 | box-shadow: 0 0 10px #000, 0 0 0 1px #000; 24 | background-color: #fffff0; 25 | } 26 | 27 | @media (max-width: 925px) { 28 | #content { 29 | margin: 5px; 30 | max-width: auto; 31 | } 32 | } 33 | 34 | 
main { 35 | min-height: 500px; 36 | padding: 0 10px; 37 | } 38 | 39 | h1, 40 | h2, 41 | h3, 42 | h4, 43 | h5, 44 | h6 { 45 | color: #555; 46 | padding: 10px; 47 | margin: 0; 48 | text-align: center; 49 | line-height: normal; 50 | } 51 | 52 | h1 { 53 | color: #333336; 54 | font-weight: 300; 55 | margin-bottom: 15px; 56 | } 57 | 58 | h1:after { 59 | content: '-'; 60 | margin: 10px 30px; 61 | height: 2px; 62 | border-radius: 50%; 63 | background: #444; 64 | display: block; 65 | color: transparent; 66 | } 67 | 68 | .link-group { 69 | padding: 10px; 70 | margin: 15px 0; 71 | display: flex; 72 | align-items: center; 73 | justify-content: space-between; 74 | background: #dde; 75 | border: 1px solid #dde; 76 | } 77 | 78 | .link-group a { 79 | color: #39f; 80 | text-decoration: none; 81 | } 82 | 83 | .link-group .btn { 84 | color: #333; 85 | font-family: sans-serif; 86 | font-size: 18px; 87 | font-weight: Arial, 600; 88 | display: inline-block; 89 | width: 145px; 90 | padding: 5px; 91 | text-align: center; 92 | background: #f5f5f5; 93 | box-shadow: 1px 1px 2px #aac, 0 0 0 1px #ccc; 94 | border-radius: 5px; 95 | } 96 | 97 | .link-group .btn:hover { 98 | background: #ececef; 99 | } 100 | 101 | .link-group .btn:active { 102 | box-shadow: 1px 1px 2px #cce inset, 0 0 0 1px #ccc; 103 | } 104 | 105 | div#readpos { 106 | border-radius: 10px; 107 | font-weight: bold; 108 | font-family: monospace; 109 | color: #770; 110 | font-size: 14px; 111 | padding: 5px 10px; 112 | background: white; 113 | box-shadow: 0 0 10px #333, 0 0 0 1px #dde; 114 | position: fixed; 115 | bottom: 10px; 116 | right: 10px; 117 | user-select: none; 118 | -webkit-user-drag: none; 119 | /* min-width: 100px; 120 | text-align: right; */ 121 | } 122 | 123 | @media print { 124 | #content { 125 | margin: 0; 126 | max-width: none; 127 | box-shadow: none; 128 | } 129 | .link-group { 130 | display: none; 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /lncrawl/sources/9kqw.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import parse_qsl, urlparse 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('9KQW') 9 | 10 | chapter_details_url = 'https://9kqw.com/book/ajaxchap' 11 | 12 | 13 | class TikNovelCrawler(Crawler): 14 | base_url = [ 15 | 'https://9kqw.com/', 16 | 'http://www.tiknovel.com/', 17 | 'https://www.tiknovel.com/', 18 | ] 19 | 20 | def initialize(self): 21 | self.base_url = 'https://9kqw.com/' 22 | # end def 23 | 24 | def read_novel_info(self): 25 | logger.debug('Visiting %s', self.novel_url) 26 | soup = self.get_soup(self.novel_url) 27 | 28 | self.novel_title = soup.select_one('#content .detail-wrap h1.detail-tit').text 29 | logger.info('Novel title: %s', self.novel_title) 30 | 31 | possible_authors = soup.select('#content table.detail-profile td') 32 | for td in possible_authors: 33 | if '作者' in td.find('strong').text: 34 | td.find('strong').extract() 35 | self.novel_author = td.text.strip() 36 | break 37 | # end if 38 | # end for 39 | logger.info('Novel author: %s', self.novel_author) 40 | 41 | self.novel_cover = self.absolute_url( 42 | soup.select_one('#content .detail-thumb-box img')['data-echo']) 43 | logger.info('Novel cover: %s', self.novel_cover) 44 | 45 | volumes = set() 46 | for a in soup.select('#content .contents-lst li a'): 47 | ch_id = int(a.find('span').text.strip()) 48 | vol_id = 1 + (ch_id - 1) // 100 49 | 
volumes.add(vol_id) 50 | self.chapters.append({ 51 | 'id': ch_id, 52 | 'volume': vol_id, 53 | 'title': a['title'], 54 | 'url': self.absolute_url(a['href']), 55 | }) 56 | # end for 57 | 58 | self.volumes = [{'id': x} for x in volumes] 59 | # end def 60 | 61 | def download_chapter_body(self, chapter): 62 | '''Download body of a single chapter and return as clean html format.''' 63 | chapter['body_lock'] = True 64 | query_str = urlparse(chapter['url']).query 65 | data_params = {x[0]: int(x[1]) for x in parse_qsl(query_str)} 66 | logging.debug("Requesting body with: %s", data_params) 67 | response = self.submit_form(chapter_details_url, data=data_params) 68 | data = response.json() 69 | chap_desc = data['data']['chap']['desc'] 70 | chap_desc = re.sub(r'(()|\n)+', '\n\n', chap_desc, flags=re.I) 71 | contents = chap_desc.split('\n\n') 72 | contents = [p for p in contents if p and p.strip()] 73 | return '
<p>' + '</p><p>'.join(contents) + '</p>
' 74 | # end def 75 | # end class 76 | -------------------------------------------------------------------------------- /lncrawl/sources/novelspread.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import hashlib 3 | import json 4 | import logging 5 | import re 6 | from concurrent.futures import ThreadPoolExecutor 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('NOVEL_SPREAD') 11 | 12 | book_info_url = 'https://api.novelspread.com/api/novel/path/%s' 13 | chapter_list_url = 'https://api.novelspread.com/api/novel/%s/chapter/menu' 14 | chapter_body_url = 'https://api.novelspread.com/api/novel/%s/chapter/%d/content?fingerprint=' 15 | 16 | 17 | class NovelSpreadCrawler(Crawler): 18 | base_url = 'https://www.novelspread.com/' 19 | 20 | def make_cover_url(self, image): 21 | a = '360' 22 | b = '512' 23 | c = '1' 24 | d = '90' 25 | r = a + b + c + d + image 26 | for i in range(2): 27 | m = hashlib.md5() 28 | m.update(r.encode()) 29 | r = m.hexdigest() 30 | # end for 31 | url = 'https://www.novelspread.com/image/' \ 32 | '%sx%s/%s/%s/%s/%s' % (a, b, d, c, r[:16], image) 33 | return url 34 | # end def 35 | 36 | def read_novel_info(self): 37 | self.novel_id = self.novel_url.strip('/').split('/')[-1] 38 | logger.info('Novel id: %s' % self.novel_id) 39 | data = self.get_json(book_info_url % self.novel_id) 40 | 41 | self.novel_title = data['data']['name'] 42 | logger.info('Title: %s' % self.novel_title) 43 | 44 | self.novel_author = 'Author: %s, Translator: %s' % ( 45 | data['data']['author'], data['data']['translator']) 46 | logger.info(self.novel_author) 47 | 48 | self.novel_cover = self.make_cover_url(data['data']['img']) 49 | logger.info('Novel cover: %s', self.novel_cover) 50 | 51 | logger.info('Getting chapters...') 52 | data = self.get_json(chapter_list_url % self.novel_id) 53 | 54 | volumes = set([]) 55 | for chap in data['data']: 56 | volumes.add(chap['volume']) 57 | self.chapters.append({ 58 | 'id': chap['chapter_number'], 59 | 'volume': chap['volume'], 60 | 'title': chap['title'], 61 | 'url': self.absolute_url(chap['link']) 62 | }) 63 | # end for 64 | 65 | self.volumes = [ 66 | {'id': x, 'title': ''} 67 | for x in volumes 68 | ] 69 | 70 | logger.debug('%d chapters and %d volumes found', 71 | len(self.chapters), len(self.volumes)) 72 | # end def 73 | 74 | def download_chapter_body(self, chapter): 75 | url = chapter_body_url % (self.novel_id, chapter['id']) 76 | logger.info('Getting chapter... 
%s [%s]', chapter['title'], url) 77 | data = self.get_json(url) 78 | return data['data']['chapter_content'] 79 | # end def 80 | # end class 81 | -------------------------------------------------------------------------------- /lncrawl/sources/novelv.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from concurrent import futures 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVELV') 8 | 9 | 10 | class NovelvCrawler(Crawler): 11 | base_url = 'https://www.novelv.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select_one( 19 | '.panel-default .info .info2 h1').text.strip() 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.select_one('.panel-default .info .info1 img')['src']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | authors = [] 27 | for a in soup.select('.panel-default .info .info2 h3 a'): 28 | if a['href'].startswith('/author/'): 29 | authors.append(a.text.strip()) 30 | # end if 31 | # end for 32 | self.novel_author = ', '.join(authors) 33 | logger.info('Novel author: %s', self.novel_author) 34 | 35 | volumes = set([]) 36 | for a in soup.select('.panel-default ul.list-charts li a'): 37 | possible_url = self.absolute_url(a['href'].lower()) 38 | if not possible_url.startswith(self.novel_url): 39 | continue 40 | # end if 41 | 42 | chapter_id = len(self.chapters) + 1 43 | volume_id = (chapter_id - 1) // 100 + 1 44 | volumes.add(volume_id) 45 | 46 | self.chapters.append({ 47 | 'id': chapter_id, 48 | 'title': a.text.strip(), 49 | 'url': possible_url, 50 | 'volume': volume_id, 51 | }) 52 | # end for 53 | 54 | self.volumes = [ 55 | {'id': x, 'title': ''} 56 | for x in list(volumes) 57 | ] 58 | # end def 59 | 60 | def download_chapter_body(self, chapter): 61 | '''Download body of a single chapter and return as clean html format.''' 62 | chapter['title'] = self.clean_text(chapter['title']) 63 | 64 | logger.info('Downloading %s', chapter['url']) 65 | soup = self.get_soup(chapter['url']) 66 | content = soup.select_one('.panel-body.content-body') 67 | body = self.extract_contents(content) 68 | body = '
<p>%s</p>' % '</p><p>
'.join(body) 69 | return self.clean_text(body) 70 | # end def 71 | 72 | def clean_text(self, text): 73 | text = re.sub(r'\ufffd\ufffd\ufffd+', '**', text) 74 | text = re.sub(r'\ufffd\ufffd', '"', text) 75 | text = re.sub(r'\u00a0\u00a0', '–', text) 76 | text = re.sub(r'\ufffdC', '', text) 77 | return text 78 | # end def 79 | # end class 80 | -------------------------------------------------------------------------------- /lncrawl/sources/machinetrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('MACHINE_NOVEL_TRANSLATION') 8 | 9 | search_url = 'http://www.machinenoveltranslation.com/search/autocomplete' 10 | 11 | 12 | class MachineNovelTrans(Crawler): 13 | base_url = 'http://www.machinenoveltranslation.com/' 14 | 15 | def read_novel_info(self): 16 | '''Get novel title, autor, cover etc''' 17 | logger.debug('Visiting %s', self.novel_url) 18 | soup = self.get_soup(self.novel_url) 19 | 20 | self.novel_title = soup.select_one('.desc h5').text 21 | logger.info('Novel title: %s', self.novel_title) 22 | 23 | self.novel_cover = self.absolute_url( 24 | soup.select_one('.about-author .row img')['src']) 25 | logger.info('Novel cover: %s', self.novel_cover) 26 | 27 | for div in soup.select('#chapters #accordion .panel'): 28 | vol_title = div.select_one('h4.panel-title a').text 29 | vol_id = [int(x) for x in re.findall(r'\d+', vol_title)] 30 | vol_id = vol_id[0] if len(vol_id) else len(self.volumes) + 1 31 | self.volumes.append({ 32 | 'id': vol_id, 33 | 'title': vol_title, 34 | }) 35 | 36 | for a in div.select('ul.navigate-page li a'): 37 | ch_title = a.text 38 | ch_id = [int(x) for x in re.findall(r'\d+', ch_title)] 39 | ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 40 | self.chapters.append({ 41 | 'id': ch_id, 42 | 'volume': vol_id, 43 | 'title': ch_title, 44 | 'url': self.absolute_url(a['href']), 45 | }) 46 | # end for 47 | # end for 48 | 49 | logger.debug('%d chapters and %d volumes found', 50 | len(self.chapters), len(self.volumes)) 51 | # end def 52 | 53 | def download_chapter_body(self, chapter): 54 | '''Download body of a single chapter and return as clean html format.''' 55 | logger.info('Visiting %s', chapter['url']) 56 | soup = self.get_soup(chapter['url']) 57 | 58 | body = soup.select('.about-author .desc .translated') 59 | body = [self.format_text(x.text) for x in body if x] 60 | body = '\n'.join(['
<p>%s</p>
' % (x) for x in body if len(x)]) 61 | return body.strip() 62 | # end def 63 | 64 | def format_text(self, text): 65 | '''formats the text and remove bad characters''' 66 | text = re.sub(r'\u00ad', '', text, flags=re.UNICODE) 67 | text = re.sub(r'\u201e[, ]*', '“', text, flags=re.UNICODE) 68 | text = re.sub(r'\u201d[, ]*', '”', text, flags=re.UNICODE) 69 | text = re.sub(r'[ ]*,[ ]+', ', ', text, flags=re.UNICODE) 70 | return text.strip() 71 | # end def 72 | # end class 73 | -------------------------------------------------------------------------------- /lncrawl/bots/test/test_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from ...core.app import App 5 | from ...binders import available_formats 6 | 7 | 8 | def test_crawler(self, link, user_input): 9 | app = App() 10 | print('App instance: OK') 11 | 12 | app.initialize() 13 | print('App initialize: DONE') 14 | 15 | app.user_input = user_input 16 | app.init_search() 17 | print('Init search: DONE') 18 | 19 | if not app.crawler: 20 | if link not in app.crawler_links: 21 | print('Search is not supported for', link) 22 | return 23 | # end if 24 | 25 | print(len(app.crawler_links), 'available crawlers to search') 26 | app.crawler_links = [link] 27 | print('Selected crawler:', link) 28 | 29 | app.search_novel() 30 | print('Search: %d results found' % len(app.search_results)) 31 | 32 | source = app.search_results[0] 33 | print('Top result: %s with %d sources' % 34 | (source['title'], len(source['novels']))) 35 | 36 | novel_url = source['novels'][0]['url'] 37 | print('Top novel:', novel_url) 38 | 39 | app.init_crawler(novel_url) 40 | print('Init crawler: DONE') 41 | 42 | app.get_novel_info() 43 | print('Novel info: DONE') 44 | if not app.crawler.novel_title: 45 | raise Exception('No novel title') 46 | # end if 47 | return 48 | # end if 49 | 50 | if not app.crawler: 51 | raise Exception('No crawler initialized') 52 | # end if 53 | 54 | if app.can_do('login'): 55 | print('Login: enabled') 56 | # end if 57 | 58 | app.get_novel_info() 59 | print('Title:', app.crawler.novel_title) 60 | print('Cover:', app.crawler.novel_cover) 61 | print('Author:', app.crawler.novel_author) 62 | 63 | if not app.crawler.novel_title: 64 | raise Exception('No novel title') 65 | # end if 66 | 67 | print('Novel info: DONE') 68 | 69 | os.makedirs(app.output_path, exist_ok=True) 70 | print('Output path:', app.output_path) 71 | 72 | if len(app.crawler.volumes) == 0: 73 | raise Exception('Empty volume list') 74 | # end if 75 | 76 | if len(app.crawler.chapters) == 0: 77 | raise Exception('Empty chapter list') 78 | # end if 79 | 80 | app.chapters = app.crawler.chapters[:2] 81 | app.output_formats = {x: False for x in available_formats} 82 | app.output_formats['pdf'] = True 83 | app.pack_by_volume = False 84 | 85 | app.start_download() 86 | print('Download: DONE') 87 | 88 | if len(app.chapters[0]['body']) < 50: 89 | raise Exception('Empty body') 90 | # end if 91 | 92 | app.bind_books() 93 | print('Bindings: DONE') 94 | 95 | app.destroy() 96 | print('Destroy: DONE') 97 | 98 | print('-' * 6, 'Test Passed', '-' * 6) 99 | # end def 100 | -------------------------------------------------------------------------------- /lncrawl/sources/readln.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('READLIGHTNOVEL') 8 | 
search_url = 'https://www.readlightnovel.org/search/autocomplete' 9 | 10 | 11 | class ReadLightNovelCrawler(Crawler): 12 | base_url = 'https://www.readlightnovel.org/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.select_one('.block-title h1').text 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.find('img', {'alt': self.novel_title})['src']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | author_link = soup.select_one("a[href*=author]") 27 | if author_link: 28 | self.novel_author = author_link.text.strip().title() 29 | # end if 30 | logger.info('Novel author: %s', self.novel_author) 31 | 32 | volume_ids = set() 33 | for a in soup.select('.chapters .chapter-chs li a'): 34 | chap_id = len(self.chapters) + 1 35 | vol_id = (chap_id - 1) // 100 + 1 36 | volume_ids.add(vol_id) 37 | self.chapters.append({ 38 | 'id': chap_id, 39 | 'volume': vol_id, 40 | 'url': self.absolute_url(a['href']), 41 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 42 | }) 43 | # end for 44 | 45 | self.volumes = [{'id': i} for i in volume_ids] 46 | # end def 47 | 48 | def download_chapter_body(self, chapter): 49 | '''Download body of a single chapter and return as clean html format.''' 50 | logger.info('Downloading %s', chapter['url']) 51 | soup = self.get_soup(chapter['url']) 52 | 53 | div = soup.select_one('.chapter-content3 .desc') 54 | 55 | bad_selectors = [ 56 | '.trinity-player-iframe-wrapper' 57 | '.hidden', 58 | '.ads-title', 59 | 'script', 60 | 'center', 61 | 'interaction', 62 | 'a[href*=remove-ads]', 63 | 'a[target=_blank]', 64 | 'hr', 65 | 'br' 66 | ] 67 | for hidden in div.select(', '.join(bad_selectors)): 68 | hidden.decompose() 69 | # end if 70 | 71 | body = self.extract_contents(div) 72 | if re.search(r'c?hapter .?\d+', body[0], re.IGNORECASE): 73 | title = body[0].replace('', '').replace('', '').strip() 74 | title = ('C' if title.startswith('hapter') else '') + title 75 | chapter['title'] = title.strip() 76 | body = body[1:] 77 | # end if 78 | 79 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 80 | # end def 81 | # end class 82 | -------------------------------------------------------------------------------- /lncrawl/sources/idqidian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('IDQIDIAN') 8 | 9 | 10 | class IdqidianCrawler(Crawler): 11 | base_url = 'https://www.idqidian.us/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.find_all( 19 | 'span', {"typeof": "v:Breadcrumb"})[-1].text 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = "https://www.idqidian.us/images/noavailable.jpg" 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | author = soup.select('p')[3].text 26 | self.novel_author = author[20:len(author)-22] 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | chapters = soup.find('div', { 30 | 'style': '-moz-border-radius: 5px 5px 5px 5px; border: 1px solid #333; color: black; height: 400px; margin: 5px; overflow: auto; padding: 5px; width: 96%;'}).findAll( 31 | 'a') 32 | chapters.reverse() 33 | 34 | for a in chapters: 35 | chap_id = len(self.chapters) + 1 36 | if len(self.chapters) % 100 == 0: 37 | vol_id = chap_id//100 + 1 38 | vol_title = 'Volume ' + str(vol_id) 39 | self.volumes.append({ 40 | 'id': vol_id, 41 | 'title': vol_title, 42 | }) 43 | # end if 44 | self.chapters.append({ 45 | 'id': chap_id, 46 | 'volume': vol_id, 47 | 'url': self.absolute_url(a['href']), 48 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 49 | }) 50 | # end for 51 | # end def 52 | 53 | def download_chapter_body(self, chapter): 54 | '''Download body of a single chapter and return as clean html format.''' 55 | logger.info('Downloading %s', chapter['url']) 56 | soup = self.get_soup(chapter['url']) 57 | 58 | for a in soup.find_all('a'): 59 | a.decompose() 60 | 61 | body_parts = soup.select('p') 62 | body_parts = ''.join([str(p.extract()) for p in body_parts if 63 | p.text.strip() and not 'Advertisement' in p.text and not 'JavaScript!' in p.text]) 64 | if body_parts == '': 65 | texts = [str.strip(x) for x in soup.strings if str.strip(x) != ''] 66 | unwanted_text = [str.strip(x.text) for x in soup.find_all()] 67 | my_texts = set(texts).difference(unwanted_text) 68 | body_parts = ''.join( 69 | [str(p) for p in my_texts if p.strip() and not 'Advertisement' in p and not 'JavaScript!' 
in p]) 70 | # end if 71 | 72 | return body_parts 73 | # end def 74 | # end class 75 | -------------------------------------------------------------------------------- /lncrawl/sources/yukinovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from bs4 import Comment 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('YUKI_NOVEL') 11 | 12 | 13 | class YukiNovelCrawler(Crawler): 14 | base_url = 'https://yukinovel.id/' 15 | 16 | def initialize(self): 17 | self.home_url = 'https://yukinovel.id/' 18 | # end def 19 | 20 | def read_novel_info(self): 21 | '''Get novel title, autor, cover etc''' 22 | url = self.novel_url.replace('https://yukinovel.me', 'https://yukinovel.id') 23 | logger.debug('Visiting %s', self.novel_url) 24 | soup = self.get_soup(self.novel_url) 25 | 26 | self.novel_title = soup.select_one('h1.entry-title').text 27 | logger.info('Novel title: %s', self.novel_title) 28 | 29 | self.novel_author = "Translated by Yukinovel" 30 | logger.info('Novel author: %s', self.novel_author) 31 | 32 | self.novel_cover = self.absolute_url( 33 | soup.select_one('div.lightnovel-thumb img')['src']) 34 | logger.info('Novel cover: %s', self.novel_cover) 35 | 36 | # Extract volume-wise chapter entries 37 | chapters = soup.select('div.lightnovel-episode ul li a') 38 | 39 | chapters.reverse() 40 | 41 | for a in chapters: 42 | chap_id = len(self.chapters) + 1 43 | if len(self.chapters) % 100 == 0: 44 | vol_id = chap_id//100 + 1 45 | vol_title = 'Volume ' + str(vol_id) 46 | self.volumes.append({ 47 | 'id': vol_id, 48 | 'title': vol_title, 49 | }) 50 | # end if 51 | self.chapters.append({ 52 | 'id': chap_id, 53 | 'volume': vol_id, 54 | 'url': self.absolute_url(a['href']), 55 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 56 | }) 57 | # end for 58 | # end def 59 | 60 | def download_chapter_body(self, chapter): 61 | '''Download body of a single chapter and return as clean html format.''' 62 | logger.info('Downloading %s', chapter['url']) 63 | soup = self.get_soup(chapter['url']) 64 | 65 | contents = soup.select_one('div.entry-content.cl') 66 | 67 | for d in contents.findAll('div'): 68 | d.decompose() 69 | # end for 70 | 71 | for comment in contents.find_all(string=lambda text: isinstance(text, Comment)): 72 | comment.extract() 73 | # end for 74 | 75 | if contents.findAll('p')[0].text.strip().startswith('Bab'): 76 | chapter['title'] = contents.findAll('p')[0].text.strip() 77 | contents.findAll('p')[0].extract() 78 | else: 79 | chapter['title'] = chapter['title'] 80 | # end if 81 | 82 | logger.debug(chapter['title']) 83 | 84 | return str(contents) 85 | # end def 86 | # end class 87 | -------------------------------------------------------------------------------- /lncrawl/sources/fourscanlation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import urlparse 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('4SCANLATION') 8 | novel_page = 'https://4scanlation.com/%s' 9 | 10 | 11 | class FourScanlationCrawler(Crawler): 12 | base_url = 'https://4scanlation.com/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | path_fragments = urlparse(self.novel_url).path.split('/') 17 | novel_hash = path_fragments[1] 18 | if novel_hash == 'category': 19 | novel_hash = path_fragments[2] 20 | # 
end if 21 | self.novel_url = novel_page % novel_hash 22 | 23 | logger.debug('Visiting %s', self.novel_url) 24 | soup = self.get_soup(self.novel_url) 25 | 26 | self.novel_title = soup.select_one(', '.join([ 27 | 'header h1', 28 | '.header-post-title-class', 29 | ])).text.strip() 30 | logger.info('Novel title: %s', self.novel_title) 31 | 32 | self.novel_author = "Source: 4scanlation" 33 | logger.info('Novel author: %s', self.novel_author) 34 | 35 | possible_image = soup.select_one('#primary article img.wp-post-image') 36 | if possible_image: 37 | self.novel_cover = self.absolute_url(possible_image['src']) 38 | # end if 39 | logger.info('Novel cover: %s', self.novel_cover) 40 | 41 | # Extract volume-wise chapter entries 42 | volumes = set() 43 | for a in soup.select('article.page p a'): 44 | possible_url = self.absolute_url(a['href']) 45 | if not self.is_relative_url(possible_url): 46 | continue 47 | # end if 48 | chap_id = 1 + len(self.chapters) 49 | vol_id = 1 + len(self.chapters) // 100 50 | volumes.add(vol_id) 51 | self.chapters.append({ 52 | 'id': chap_id, 53 | 'volume': vol_id, 54 | 'url': possible_url, 55 | 'title': a.text.strip(), 56 | }) 57 | # end for 58 | 59 | self.volumes = [{'id': x} for x in volumes] 60 | # end def 61 | 62 | def download_chapter_body(self, chapter): 63 | '''Download body of a single chapter and return as clean html format.''' 64 | logger.info('Downloading %s', chapter['url']) 65 | soup = self.get_soup(chapter['url']) 66 | 67 | contents = soup.select_one('article div.entry-content') 68 | if not contents: 69 | return '' 70 | # end if 71 | 72 | for d in contents.findAll('div'): 73 | d.extract() 74 | # end for 75 | 76 | try: 77 | chapter['title'] = soup.select_one('header h1').text 78 | logger.debug(chapter['title']) 79 | except Exception: 80 | pass 81 | # end try 82 | 83 | return str(contents or '') 84 | # end def 85 | # end class 86 | -------------------------------------------------------------------------------- /lncrawl/sources/novelgo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import cssutils 6 | import urllib.parse 7 | 8 | from bs4 import BeautifulSoup 9 | 10 | from ..utils.crawler import Crawler 11 | 12 | logger = logging.getLogger('NOVEL_GO') 13 | 14 | 15 | class NovelGoCrawler(Crawler): 16 | base_url = 'https://novelgo.id/' 17 | 18 | def read_novel_info(self): 19 | '''Get novel title, autor, cover etc''' 20 | logger.debug('Visiting %s', self.novel_url) 21 | soup = self.get_soup(self.novel_url) 22 | 23 | self.novel_title = soup.find( 24 | 'h2', {'class': 'novel-title'}).text.strip() 25 | logger.info('Novel title: %s', self.novel_title) 26 | 27 | self.novel_author = soup.select_one( 28 | 'div.noveils-current-author a').text.strip() 29 | logger.info('Novel author: %s', self.novel_author) 30 | 31 | thumbnail = soup.find("div", {"class": "novel-thumbnail"})['style'] 32 | style = cssutils.parseStyle(thumbnail) 33 | url = style['background-image'] 34 | 35 | self.novel_cover = self.absolute_url( 36 | url.replace('url(', '').replace(')', '')) 37 | logger.info('Novel cover: %s', self.novel_cover) 38 | 39 | path = urllib.parse.urlsplit(self.novel_url)[2] 40 | book_id = path.split('/')[2] 41 | chapter_list = js = self.scraper.post( 42 | 'https://novelgo.id/wp-admin/admin-ajax.php?action=LoadChapter&post=%s' % book_id).content 43 | soup_chapter = BeautifulSoup(chapter_list, 'lxml') 44 | 45 | chapters = soup_chapter.select('ul li a') 46 | 47 | for x 
in chapters: 48 | chap_id = len(self.chapters) + 1 49 | if len(self.chapters) % 100 == 0: 50 | vol_id = chap_id//100 + 1 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': chap_id, 59 | 'volume': vol_id, 60 | 'url': self.absolute_url(x['href']), 61 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 62 | }) 63 | # end for 64 | 65 | logger.debug(self.chapters) 66 | # end def 67 | 68 | def download_chapter_body(self, chapter): 69 | '''Download body of a single chapter and return as clean html format.''' 70 | logger.info('Downloading %s', chapter['url']) 71 | soup = self.get_soup(chapter['url']) 72 | 73 | self.blacklist_patterns = [ 74 | r'^translat(ed by|or)', 75 | r'(volume|chapter) .?\d+', 76 | ] 77 | 78 | contents = soup.find( 79 | 'div', {'id': 'chapter-post-content'}).findAll('p') 80 | body = [str(p) for p in contents if p.text.strip()] 81 | return '
<p>' + '</p><p>'.join(body) + '</p>
' 82 | # end def 83 | # end class 84 | -------------------------------------------------------------------------------- /lncrawl/sources/gravitytales.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('GRAVITY_TALES') 7 | 8 | cover_image_url = 'https://cdn.gravitytales.com/images/covers/%s.jpg' 9 | novel_toc_url = 'http://gravitytales.com/novel/%s' 10 | chapter_list_url = 'http://gravitytales.com/novel/%s/chapters' 11 | 12 | 13 | class GravityTalesCrawler(Crawler): 14 | base_url = 'http://gravitytales.com/' 15 | 16 | def read_novel_info(self): 17 | self.novel_id = re.split(r'\/(novel|post)\/', self.novel_url)[2] 18 | self.novel_id = self.novel_id.split('/')[0] 19 | logger.info('Novel id: %s' % self.novel_id) 20 | 21 | self.novel_url = novel_toc_url % self.novel_id 22 | logger.debug('Visiting %s' % self.novel_url) 23 | soup = self.get_soup(self.novel_url) 24 | 25 | for tag in soup.select('.main-content h3 > *'): 26 | tag.extract() 27 | self.novel_title = soup.select_one('.main-content h3').text.strip() 28 | logger.info('Novel title: %s' % self.novel_title) 29 | 30 | self.novel_cover = cover_image_url % self.novel_id 31 | logger.info('Novel cover: %s' % self.novel_cover) 32 | 33 | self.novel_author = soup.select_one('.main-content h4').text.strip() 34 | logger.info(self.novel_author) 35 | 36 | self.get_chapter_list() 37 | # end def 38 | 39 | def get_chapter_list(self): 40 | url = chapter_list_url % self.novel_id 41 | logger.info('Visiting %s' % url) 42 | soup = self.get_soup(url) 43 | 44 | # For each tabs... 45 | for a in soup.select('#chaptergroups li a'): 46 | vol_id = len(self.volumes) + 1 47 | self.volumes.append({ 48 | 'id': vol_id, 49 | 'title': a.text.strip(), 50 | '_tid': (a['href']), 51 | }) 52 | 53 | # ...get every chapters 54 | for a in soup.select_one(a['href']).select('table td a'): 55 | chap_id = len(self.chapters) + 1 56 | self.chapters.append({ 57 | 'id': chap_id, 58 | 'volume': vol_id, 59 | 'title': a.text.strip(), 60 | 'url': self.absolute_url(a['href']), 61 | }) 62 | # end for 63 | 64 | logger.info('%d chapters and %d volumes found', 65 | len(self.chapters), len(self.volumes)) 66 | # end def 67 | 68 | def download_chapter_body(self, chapter): 69 | '''Download body of a single chapter and return as clean html format.''' 70 | logger.info('Downloading %s' % chapter['url']) 71 | soup = self.get_soup(chapter['url']) 72 | body = soup.select_one('#chapterContent') 73 | for tag in body.contents: 74 | if hasattr(tag, 'attrs'): 75 | setattr(tag, 'attrs', {}) # clear attributes 76 | # end if 77 | # end for 78 | return str(body) 79 | # end def 80 | # end class 81 | -------------------------------------------------------------------------------- /lncrawl/sources/machinetransorg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import quote 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('MACHINE_NOVEL_TRANSLATION') 8 | 9 | search_url = 'https://www.machine-translation.org/novel/search/?keywords=%s' 10 | 11 | 12 | class MachineTransOrg(Crawler): 13 | base_url = 'https://www.machine-translation.org/' 14 | 15 | def search_novel(self, query): 16 | url = search_url % quote(query.lower()) 17 | logger.debug('Visiting: %s', url) 18 | soup = self.get_soup(url) 19 | 20 | results = [] 21 
| for li in soup.select('.book-list-info > ul > li'): 22 | results.append({ 23 | 'title': li.select_one('a h4 b').text.strip(), 24 | 'url': self.absolute_url(li.select_one('.book-img a')['href']), 25 | 'info': li.select_one('.update-info').text.strip(), 26 | }) 27 | # end for 28 | return results 29 | # end def 30 | 31 | def read_novel_info(self): 32 | '''Get novel title, autor, cover etc''' 33 | logger.debug('Visiting %s', self.novel_url) 34 | soup = self.get_soup(self.novel_url) 35 | 36 | self.novel_title = soup.select_one('div.title h3 b').text 37 | logger.info('Novel title: %s', self.novel_title) 38 | 39 | self.novel_author = soup.select_one('div.title h3 span').text 40 | logger.info('Novel author: %s', self.novel_author) 41 | 42 | self.novel_cover = self.absolute_url( 43 | soup.select_one('.book-img img')['src']) 44 | logger.info('Novel cover: %s', self.novel_cover) 45 | 46 | for a in reversed(soup.select('div.slide-item a')): 47 | ch_title = a.text.strip() 48 | ch_id = len(self.chapters) + 1 49 | if len(self.chapters) % 100 == 0: 50 | vol_id = ch_id//100 + 1 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': ch_id, 59 | 'volume': vol_id, 60 | 'title': ch_title, 61 | 'url': self.absolute_url(a['href']), 62 | }) 63 | # end for 64 | 65 | logger.debug('%d chapters and %d volumes found', 66 | len(self.chapters), len(self.volumes)) 67 | # end def 68 | 69 | def download_chapter_body(self, chapter): 70 | '''Download body of a single chapter and return as clean html format''' 71 | logger.info('Visiting %s', chapter['url']) 72 | soup = self.get_soup(chapter['url']) 73 | body = soup.select_one('.read-main .read-context') 74 | 75 | self.blacklist_patterns = [ 76 | r'^Refresh time: \d+-\d+-\d+$' 77 | ] 78 | self.clean_contents(body) 79 | 80 | return str(body) 81 | # end def 82 | # end class 83 | -------------------------------------------------------------------------------- /lncrawl/bots/_sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..core.app import App 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | # TODO: It is recommended to implemented all methods. But you can skip those 8 | # Which return values by default. 9 | 10 | 11 | class SampleBot: 12 | def start(self): 13 | # TODO: must be implemented 14 | # Start processing using this bot. It should use self methods to take 15 | # inputs and self.app methods to process them. 16 | # 17 | self.app = App() 18 | self.app.initialize() 19 | # 20 | # Checkout console.py for a sample implementation 21 | # end def 22 | 23 | def get_novel_url(self): 24 | # Returns a novel page url or a query 25 | pass 26 | # end def 27 | 28 | def get_crawlers_to_search(self): 29 | # Returns user choice to search the choosen sites for a novel 30 | pass 31 | # end def 32 | 33 | def choose_a_novel(self): 34 | # The search_results is an array of (novel_title, novel_url). 35 | # This method should return a single novel_url only 36 | # 37 | # By default, returns the first search_results. Implemented it to 38 | # handle multiple search_results 39 | pass 40 | # end def 41 | 42 | def get_login_info(self): 43 | # By default, returns None to skip login 44 | pass 45 | # end if 46 | 47 | def get_output_path(self): 48 | # You should return a valid absolute path. The parameter suggested_path 49 | # is valid but not gurranteed to exists. 
50 | # 51 | # NOTE: If you do not want to use any pre-downloaded files, remove all 52 | # contents inside of your selected output directory. 53 | # 54 | # By default, returns a valid existing path from suggested_path 55 | pass 56 | # end def 57 | 58 | def get_output_formats(self): 59 | # The keys should be from from `self.output_formats`. Each value 60 | # corresponding a key defines whether create output in that format. 61 | # 62 | # By default, it returns all True to all of the output formats. 63 | pass 64 | # end def 65 | 66 | def should_pack_by_volume(self): 67 | # By default, returns False to generate a single file 68 | pass 69 | # end def 70 | 71 | def get_range_selection(self): 72 | # Should return a key from `self.selections` array 73 | pass 74 | # end def 75 | 76 | def get_range_using_urls(self): 77 | # Should return a list of chapters to download 78 | pass 79 | # end def 80 | 81 | def get_range_using_index(self): 82 | # Should return a list of chapters to download 83 | pass 84 | # end def 85 | 86 | def get_range_from_volumes(self): 87 | # Should return a list of chapters to download 88 | pass 89 | # end def 90 | 91 | def get_range_from_chapters(self): 92 | # Should return a list of chapters to download 93 | pass 94 | # end def 95 | # end class 96 | -------------------------------------------------------------------------------- /lncrawl/sources/mangatoon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import ast 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('MANGATOON_MOBI') 9 | 10 | book_url = 'https://mangatoon.mobi/%s/detail/%s/episodes' 11 | search_url = 'https://mangatoon.mobi/%s/search?word=%s' 12 | 13 | 14 | class MangatoonMobiCrawler(Crawler): 15 | base_url = 'https://mangatoon.mobi/' 16 | 17 | def initialize(self): 18 | self.home_url = 'https://mangatoon.mobi' 19 | # end def 20 | 21 | def read_novel_info(self): 22 | '''Get novel title, autor, cover etc''' 23 | self.novel_id = self.novel_url.split('/')[5] 24 | logger.info('Novel Id: %s', self.novel_id) 25 | 26 | novel_region = self.novel_url.split('/')[3] 27 | 28 | self.novel_url = book_url % (novel_region,self.novel_id) 29 | logger.debug('Visiting %s', self.novel_url) 30 | soup = self.get_soup(self.novel_url) 31 | 32 | self.novel_title =soup.select_one('h1.comics-title').text 33 | logger.info('Novel title: %s', self.novel_title) 34 | 35 | try: 36 | self.novel_cover = self.absolute_url( 37 | soup.select_one('.detail-top-right img')['src']) 38 | logger.info('Novel cover: %s', self.novel_cover) 39 | except Exception: 40 | logger.debug('Failed to get cover: %s', self.novel_url) 41 | # end try 42 | 43 | self.novel_author = soup.select_one('.created-by').text 44 | logger.info('Novel author: %s', self.novel_author) 45 | 46 | for a in soup.select('a.episode-item'): 47 | chap_id = len(self.chapters) + 1 48 | if len(self.chapters) % 100 == 0: 49 | vol_id = chap_id//100 + 1 50 | vol_title = 'Volume ' + str(vol_id) 51 | self.volumes.append({ 52 | 'id': vol_id, 53 | 'title': vol_title, 54 | }) 55 | # end if 56 | self.chapters.append({ 57 | 'id': chap_id, 58 | 'volume': vol_id, 59 | 'url': self.absolute_url(a['href']), 60 | 'title': a.select_one('.episode-title').text.strip() or ('Chapter %d' % chap_id), 61 | }) 62 | # end for 63 | # end def 64 | 65 | def download_chapter_body(self, chapter): 66 | '''Download body of a single chapter and return as clean html format''' 67 | 
logger.info('Downloading %s', chapter['url']) 68 | soup = self.get_soup(chapter['url']) 69 | 70 | script = soup.find("script", text=re.compile("initialValue\s+=")) 71 | initialValue = re.search('var initialValue = (?P<value>.*);', script.string) 72 | content = initialValue.group('value') 73 | chapter_content = ast.literal_eval(content) 74 | chapter_content = [p.replace('\-', '-') for p in chapter_content] 75 | 76 | 77 | text = '<p>' + '</p><p>'.join(chapter_content) + '</p>
' 78 | # end if 79 | return text.strip() 80 | # end def 81 | # end class 82 | -------------------------------------------------------------------------------- /lncrawl/sources/rewayatclub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from concurrent import futures 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('REWAYAT_CLUB') 9 | 10 | 11 | class RewayatClubCrawler(Crawler): 12 | base_url = 'https://rewayat.club/' 13 | 14 | def read_novel_info(self): 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.is_rtl = True 19 | 20 | self.novel_title = soup.select_one('h1.card-header').text.strip() 21 | logger.info('Novel title: %s', self.novel_title) 22 | 23 | self.novel_cover = self.absolute_url( 24 | soup.select_one('.card-body .align-middle img')['src']) 25 | logger.info('Novel cover: %s', self.novel_cover) 26 | 27 | self.novel_author = soup.select_one( 28 | '.card-body table td a[href*="/user/"]').text.strip() 29 | logger.info('Novel author: %s', self.novel_author) 30 | 31 | page_count = len(soup.select( 32 | '.card-footer select.custom-select option')) 33 | logger.info('Total pages: %d', page_count) 34 | 35 | logger.info('Getting chapters...') 36 | futures_to_check = { 37 | self.executor.submit(self.download_chapter_list, i + 1): str(i) 38 | for i in range(page_count) 39 | } 40 | temp_chapters = dict() 41 | for future in futures.as_completed(futures_to_check): 42 | page = int(futures_to_check[future]) 43 | temp_chapters[page] = future.result() 44 | # end for 45 | 46 | logger.info('Building sorted chapter list...') 47 | volumes = set() 48 | for page in sorted(temp_chapters.keys()): 49 | for chap in temp_chapters[page]: 50 | chap['id'] = 1 + len(self.chapters) 51 | chap['volume'] = 1 + len(self.chapters) // 100 52 | volumes.add(chap['volume']) 53 | self.chapters.append(chap) 54 | # end for 55 | # end for 56 | 57 | self.volumes = [{'id': x} for x in volumes] 58 | # end def 59 | 60 | def download_chapter_list(self, page_no): 61 | chapter_url = self.novel_url + ('?page=%d' % page_no) 62 | logger.info('Visiting %s', chapter_url) 63 | soup = self.get_soup(chapter_url) 64 | 65 | chapters = [] 66 | for a in soup.select('.card a[href*="/novel/"]'): 67 | chapters.append({ 68 | 'url': self.absolute_url(a['href']), 69 | 'title': a.select_one('div p').text.strip(), 70 | }) 71 | # end for 72 | return chapters 73 | # end def 74 | 75 | def download_chapter_body(self, chapter): 76 | '''Download body of a single chapter and return as clean html format.''' 77 | logger.info('Downloading %s', chapter['url']) 78 | soup = self.get_soup(chapter['url']) 79 | paras = soup.select('.card .card-body p') 80 | paras = [str(p) for p in paras if p.text.strip()] 81 | return ''.join(paras) 82 | # end def 83 | # end class 84 | -------------------------------------------------------------------------------- /lncrawl/sources/shinsori.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('SHINSORI') 8 | 9 | 10 | class ShinsoriCrawler(Crawler): 11 | base_url = 'https://www.shinsori.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | 
self.novel_title = soup.select_one('span.the-section-title').text.strip() 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = None 22 | logger.info('Novel cover: %s', self.novel_cover) 23 | 24 | self.novel_author = 'Author : %s, Translator: Shinsori' % soup.select( 25 | 'div.entry.clearfix p strong')[1].next_sibling.strip() 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | # get pagination range 29 | p_range = int(soup.select('ul.lcp_paginator li')[-2].text) 30 | 31 | chapters = [] 32 | # get chapter list by looping pagination range 33 | for x in range(p_range): 34 | p_url = '%s?lcp_page0=%d#lcp_instance_0 x+1' % (self.novel_url, x+1) 35 | p_soup = self.get_soup(p_url) 36 | chapters.extend(p_soup.select('ul.lcp_catlist')[1].select('li a')) 37 | # end for 38 | 39 | for x in chapters: 40 | chap_id = len(self.chapters) + 1 41 | vol_id = len(self.chapters)//100 + 1 42 | self.chapters.append({ 43 | 'id': chap_id, 44 | 'volume': vol_id, 45 | 'url': self.absolute_url(x['href']), 46 | 'title': x['title'] or ('Chapter %d' % chap_id), 47 | }) 48 | # end for 49 | 50 | self.volumes = [ 51 | {'id': x + 1} 52 | for x in range(len(self.chapters) // 100 + 1) 53 | ] 54 | # end def 55 | 56 | def download_chapter_body(self, chapter): 57 | '''Download body of a single chapter and return as clean html format.''' 58 | logger.info('Downloading %s', chapter['url']) 59 | soup = self.get_soup(chapter['url']) 60 | 61 | logger.debug(soup.title.string) 62 | 63 | content = soup.select_one('div.entry-content') 64 | 65 | # remove div with no class 66 | for item in content.findAll('div', attrs={'class': None}): 67 | item.decompose() 68 | 69 | # remove style 70 | for item in content.findAll('style'): 71 | item.decompose() 72 | 73 | subs = 'tab' 74 | # remove all div that has class but not relevant 75 | for item in content.findAll('div'): 76 | res = [x for x in item['class'] if re.search(subs, x)] 77 | if len(res) == 0: 78 | item.extract() 79 | 80 | # remove p with attribute style 81 | for item in content.findAll('p'): 82 | if item.has_attr('style'): 83 | item.decompose() 84 | 85 | return str(content) 86 | # end def 87 | # end class 88 | -------------------------------------------------------------------------------- /lncrawl/sources/wuxiaonline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WUXIA_ONLINE') 8 | search_url = 'https://wuxiaworld.online/search.ajax?type=&query=%s' 9 | 10 | 11 | class WuxiaOnlineCrawler(Crawler): 12 | base_url = 'https://wuxiaworld.online/' 13 | 14 | # DISABLING DUE TO CLOUDEFLARE CAPTCHA CHALLENGE 15 | # def search_novel(self, query): 16 | # '''Gets a list of {title, url} matching the given query''' 17 | # soup = self.get_soup(search_url % query) 18 | 19 | # results = [] 20 | # for novel in soup.select('li'): 21 | # a = novel.select_one('.resultname a') 22 | # info = novel.select_one('a:nth-of-type(2)') 23 | # info = info.text.strip() if info else '' 24 | # results.append({ 25 | # 'title': a.text.strip(), 26 | # 'url': self.absolute_url(a['href']), 27 | # 'info': 'Latest: %s' % info, 28 | # }) 29 | # # end for 30 | 31 | # return results 32 | # # end def 33 | 34 | def read_novel_info(self): 35 | '''Get novel title, autor, cover etc''' 36 | url = self.novel_url 37 | logger.debug('Visiting %s', url) 38 | soup = self.get_soup(url) 39 | self.novel_title = 
soup.select_one('h1.entry-title').text 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | # self.novel_author = soup.select_one('#maininfo p').text.strip() 43 | # self.novel_author = re.sub(r'^Author[^\w]+', '', self.novel_author).strip() 44 | # logger.info('Novel author: %s', self.novel_author) 45 | 46 | self.novel_cover = self.absolute_url( 47 | soup.select_one('.info_image img')['src']) 48 | logger.info('Novel cover: %s', self.novel_cover) 49 | 50 | last_vol = -1 51 | for a in reversed(soup.select('.chapter-list .row span a')): 52 | chap_id = len(self.chapters) + 1 53 | vol_id = 1 + (chap_id - 1) // 100 54 | volume = {'id': vol_id, 'title': ''} 55 | if last_vol != vol_id: 56 | self.volumes.append(volume) 57 | last_vol = vol_id 58 | # end if 59 | self.chapters.append({ 60 | 'id': chap_id, 61 | 'volume': vol_id, 62 | 'title': a['title'], 63 | 'url': self.absolute_url(a['href']), 64 | }) 65 | # end for 66 | 67 | logger.info('%d chapters and %d volumes found', 68 | len(self.chapters), len(self.volumes)) 69 | # end def 70 | 71 | def download_chapter_body(self, chapter): 72 | '''Download body of a single chapter and return as clean html format.''' 73 | logger.info('Downloading %s', chapter['url']) 74 | soup = self.get_soup(chapter['url']) 75 | 76 | parts = soup.select_one('#list_chapter .content-area') 77 | body = self.extract_contents(parts) 78 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 79 | # end def 80 | # end class 81 | -------------------------------------------------------------------------------- /lncrawl/sources/crescentmoon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('CRESCENTMOON') 9 | 10 | 11 | class CrescentMoonCrawler(Crawler): 12 | base_url = 'https://crescentmoon.blog/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.select_one('div.entry-content p a')['href']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | self.novel_author = soup.select('div.entry-content p')[2].text.strip() 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | a = soup.select('div.entry-content p') 30 | for idx, item in enumerate(a): 31 | if "table of contents" in item.text.strip().lower(): 32 | toc = a[idx+1] 33 | 34 | chapters = toc.findAll('a') 35 | 36 | for x in chapters: 37 | chap_id = len(self.chapters) + 1 38 | if len(self.chapters) % 100 == 0: 39 | vol_id = chap_id//100 + 1 40 | vol_title = 'Volume ' + str(vol_id) 41 | self.volumes.append({ 42 | 'id': vol_id, 43 | 'title': vol_title, 44 | }) 45 | # end if 46 | self.chapters.append({ 47 | 'id': chap_id, 48 | 'volume': vol_id, 49 | 'url': self.absolute_url(x['href']), 50 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 51 | }) 52 | # end for 53 | # end def 54 | 55 | def download_chapter_body(self, chapter): 56 | '''Download body of a single chapter and return as clean html format.''' 57 | logger.info('Downloading %s', chapter['url']) 58 | soup = self.get_soup(chapter['url']) 59 | 60 | logger.debug(soup.title.string) 61 | 62 | # if soup.find("h1", {"class": "entry-title"}).text.strip(): 63 | # chapter['title'] = soup.find("h1", {"class": "entry-title"}).text.strip() 64 | # else: 65 | # chapter['title'] = chapter['title'] 66 | # end if 67 | 68 | #contents = soup.select('div.entry-content p') 69 | #contents = contents[:-1] 70 | #body = self.extract_contents(contents) 71 | # return '

<p>' + '</p><p>'.join(body) + '</p>

' 72 | # return str(contents) 73 | 74 | body = [] 75 | contents = soup.select('div.entry-content p') 76 | contents = contents[:-1] 77 | for p in contents: 78 | para = ' '.join(self.extract_contents(p)) 79 | if len(para): 80 | body.append(para) 81 | # end if 82 | # end for 83 | 84 | return '

<p>%s</p>' % '</p><p>

'.join(body) 85 | # end def 86 | # end class 87 | -------------------------------------------------------------------------------- /lncrawl/sources/meionovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('MEIONOVEL') 9 | 10 | 11 | class MeionovelCrawler(Crawler): 12 | base_url = 'https://meionovel.id/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = ' '.join([ 20 | str(x) 21 | for x in soup.select_one('.post-title h3').contents 22 | if not x.name 23 | ]).strip() 24 | logger.info('Novel title: %s', self.novel_title) 25 | 26 | self.novel_cover = self.absolute_url( 27 | soup.select_one('.summary_image img')['data-src']) 28 | logger.info('Novel cover: %s', self.novel_cover) 29 | 30 | author = soup.find('div', {'class': 'author-content'}).findAll('a') 31 | if len(author) == 2: 32 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 33 | else: 34 | self.novel_author = author[0].text 35 | logger.info('Novel author: %s', self.novel_author) 36 | 37 | 38 | content_area = soup.select_one(' .page-content-listing') 39 | 40 | for span in content_area.findAll('span'): 41 | span.decompose() 42 | 43 | chapters = content_area.select('ul.main li.wp-manga-chapter a') 44 | 45 | chapters.reverse() 46 | 47 | for a in chapters: 48 | chap_id = len(self.chapters) + 1 49 | vol_id = chap_id//100 + 1 50 | if len(self.chapters) % 100 == 0: 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': chap_id, 59 | 'volume': vol_id, 60 | 'url': self.absolute_url(a['href']), 61 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 62 | }) 63 | # end for 64 | # end def 65 | 66 | def download_chapter_body(self, chapter): 67 | '''Download body of a single chapter and return as clean html format.''' 68 | logger.info('Downloading %s', chapter['url']) 69 | soup = self.get_soup(chapter['url']) 70 | 71 | contents = soup.select_one('div.text-left') 72 | 73 | for img in contents.findAll('img'): 74 | if img.has_attr('data-lazy-src'): 75 | src_url = img['data-lazy-src'] 76 | parent = img.parent 77 | img.decompose() 78 | new_tag = soup.new_tag("img", src=src_url) 79 | parent.append(new_tag) 80 | 81 | if contents.h3: 82 | contents.h3.decompose() 83 | 84 | for codeblock in contents.findAll('div', {'class': 'code-block'}): 85 | codeblock.decompose() 86 | 87 | return str(contents) 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/utils/kindlegen_download.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import platform 4 | import tarfile 5 | import tempfile 6 | from io import BytesIO, FileIO 7 | from logging import Logger 8 | from shutil import rmtree 9 | from zipfile import ZipFile 10 | import requests 11 | 12 | logger = Logger('KINDLEGEN') 13 | 14 | WINDOWS_URL = 'http://kindlegen.s3.amazonaws.com/kindlegen_win32_v2_9.zip' 15 | MACOS_URL = 'http://kindlegen.s3.amazonaws.com/KindleGen_Mac_i386_v2_9.zip' 16 | LINUX_URL = 'http://kindlegen.s3.amazonaws.com/kindlegen_linux_2.6_i386_v2_9.tar.gz' 17 | 18 | 19 | def 
get_url_by_platform(): 20 | if platform.system() == 'Linux': 21 | return LINUX_URL 22 | elif platform.system() == 'Darwin': 23 | return MACOS_URL 24 | elif platform.system() == 'Windows': 25 | return WINDOWS_URL 26 | else: 27 | raise Exception('Unrecognized platform') 28 | # end if 29 | # end def 30 | 31 | 32 | def extract_kindlegen_file(extractor, file_list): 33 | logger.debug(file_list) 34 | home = os.path.expanduser('~') 35 | if file_list.count('kindlegen') == 1: 36 | extractor('kindlegen', path=home) 37 | logger.info('Extracted kindlegen to %s', home) 38 | elif file_list.count('kindlegen.exe') == 1: 39 | extractor('kindlegen.exe', path=home) 40 | logger.info('Extracted kindlegen.exe to %s', home) 41 | os.rename(os.path.join(home, 'kindlegen.exe'), 42 | os.path.join(home, 'kindlegen')) 43 | logger.info('Renamed kindlegen.exe to kindlegen') 44 | else: 45 | raise Exception('Kindlegen executable was not found.') 46 | # end if 47 | # end def 48 | 49 | 50 | def download_kindlegen(): 51 | # Download the file 52 | url = get_url_by_platform() 53 | print('Downloading kindlegen...') 54 | byte_array = requests.get(url).content 55 | 56 | # Extract contents 57 | print('Extracting kindlegen...') 58 | if url.endswith('.zip'): 59 | with BytesIO(byte_array) as byte_stream: 60 | with ZipFile(byte_stream) as file: 61 | extract_kindlegen_file(file.extract, file.namelist()) 62 | # end with 63 | # end with 64 | elif url.endswith('.tar.gz'): 65 | temp_file = tempfile.mktemp('.tar.gz') 66 | try: 67 | logger.info('Writing content to %s', temp_file) 68 | with FileIO(temp_file, 'w') as file: 69 | file.write(byte_array) 70 | # end with 71 | logger.info('Opening %s as archive', temp_file) 72 | with tarfile.open(temp_file) as file: 73 | extract_kindlegen_file(file.extract, file.getnames()) 74 | # end with 75 | finally: 76 | os.remove(temp_file) 77 | logger.info('%s removed.', temp_file) 78 | # end finally 79 | # end if 80 | # end def 81 | 82 | 83 | def retrieve_kindlegen(): 84 | # Check kindlegen availability 85 | home = os.path.expanduser('~') 86 | kindlegen_file = os.path.join(home, 'kindlegen') 87 | if os.path.exists(kindlegen_file): 88 | return kindlegen_file 89 | # end if 90 | return None 91 | # end def 92 | -------------------------------------------------------------------------------- /lncrawl/binders/web.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | 5 | from ..assets.html_style import get_value as get_css_style 6 | 7 | logger = logging.getLogger('WEB_BINDER') 8 | 9 | 10 | def bind_html_chapter(chapter, prev_chapter, next_chapter, direction='ltr'): 11 | prev_button = '%s.html' % ( 12 | str(prev_chapter['id']).rjust(5, '0')) if prev_chapter else '#' 13 | next_button = '%s.html' % str(next_chapter['id']).rjust( 14 | 5, '0') if next_chapter else '#' 15 | button_group = '

' 20 | 21 | script = ''' 22 | window.addEventListener('scroll', function(e) { 23 | try { 24 | var scroll = window.scrollY; 25 | var height = document.body.scrollHeight - window.innerHeight + 10; 26 | var percent = Math.round(100.0 * scroll / height); 27 | document.getElementById('readpos').innerText = percent + '%'; 28 | } catch (err) { 29 | // ignore 30 | } 31 | }) 32 | ''' 33 | 34 | main_body = chapter['body'] 35 | if not main_body: 36 | main_body = '

%s

No contents

' % chapter['title'] 37 | # end if 38 | 39 | html = '\n' 40 | html += '' % direction 41 | html += '' 42 | html += '' 43 | html += '%s' % chapter['title'] 44 | html += '' % get_css_style() 45 | html += '' % script 46 | html += '
' 47 | html += button_group 48 | html += '
%s
' % main_body 49 | html += button_group 50 | html += '
' 51 | html += '
<div id="readpos">0%</div>
' 52 | html += '' 53 | 54 | file_name = '%s.html' % str(chapter['id']).rjust(5, '0') 55 | return html, file_name 56 | # end def 57 | 58 | 59 | def make_webs(app, data): 60 | web_files = [] 61 | for vol in data: 62 | dir_name = os.path.join(app.output_path, 'web', vol) 63 | os.makedirs(dir_name, exist_ok=True) 64 | for i in range(len(data[vol])): 65 | chapter = data[vol][i] 66 | prev_chapter = data[vol][i - 1] if i > 0 else None 67 | next_chapter = data[vol][i + 1] if i + 1 < len(data[vol]) else None 68 | direction = 'rtl' if app.crawler.is_rtl else 'ltr' 69 | html, file_name = bind_html_chapter( 70 | chapter, prev_chapter, next_chapter, direction) 71 | 72 | file_name = os.path.join(dir_name, file_name) 73 | with open(file_name, 'w', encoding='utf-8') as file: 74 | file.write(html) 75 | # end with 76 | web_files.append(file_name) 77 | # end for 78 | # end for 79 | print('Created: %d web files' % len(web_files)) 80 | return web_files 81 | # end def 82 | -------------------------------------------------------------------------------- /lncrawl/core/novel_search.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | To search for novels in selected sources 4 | """ 5 | import os 6 | import logging 7 | from concurrent import futures 8 | 9 | from slugify import slugify 10 | from progress.bar import IncrementalBar 11 | 12 | from ..sources import crawler_list 13 | 14 | logger = logging.getLogger('SEARCH_NOVEL') 15 | 16 | 17 | def get_search_result(user_input, link): 18 | try: 19 | crawler = crawler_list[link] 20 | instance = crawler() 21 | instance.home_url = link.strip('/') 22 | results = instance.search_novel(user_input) 23 | logger.debug(results) 24 | logger.info('%d results from %s', len(results), link) 25 | return results 26 | except Exception: 27 | import traceback 28 | logger.debug(traceback.format_exc()) 29 | # end try 30 | return [] 31 | # end def 32 | 33 | 34 | def process_results(results): 35 | combined = dict() 36 | for result in results: 37 | key = slugify(result['title']) 38 | if len(key) <= 1: 39 | continue 40 | elif key not in combined: 41 | combined[key] = [] 42 | # end if 43 | combined[key].append(result) 44 | # end for 45 | 46 | processed = [] 47 | for key, value in combined.items(): 48 | value.sort(key=lambda x: x['url']) 49 | processed.append({ 50 | 'id': key, 51 | 'title': value[0]['title'], 52 | 'novels': value 53 | }) 54 | # end for 55 | 56 | processed.sort(key=lambda x: -len(x['novels'])) 57 | 58 | return processed[:15] # Control the number of results 59 | # end def 60 | 61 | 62 | def search_novels(app): 63 | executor = futures.ThreadPoolExecutor(10) 64 | 65 | # Add future tasks 66 | checked = {} 67 | futures_to_check = {} 68 | for link in app.crawler_links: 69 | crawler = crawler_list[link] 70 | if crawler in checked: 71 | logger.info('A crawler for "%s" already exists', link) 72 | continue 73 | # end if 74 | checked[crawler] = True 75 | futures_to_check[ 76 | executor.submit( 77 | get_search_result, 78 | app.user_input, 79 | link 80 | ) 81 | ] = str(crawler) 82 | # end for 83 | 84 | bar = IncrementalBar('Searching', max=len(futures_to_check.keys())) 85 | bar.start() 86 | 87 | if os.getenv('debug_mode') == 'yes': 88 | bar.next = lambda: None # Hide in debug mode 89 | # end if 90 | 91 | # Resolve future tasks 92 | app.progress = 0 93 | combined_results = [] 94 | for future in futures.as_completed(futures_to_check): 95 | combined_results += future.result() 96 | app.progress += 1 97 | bar.next() 98 | # end 
for 99 | 100 | # Process combined search results 101 | app.search_results = process_results(combined_results) 102 | bar.clearln() 103 | bar.finish() 104 | print('Found %d results' % len(app.search_results)) 105 | 106 | executor.shutdown() 107 | # end def 108 | -------------------------------------------------------------------------------- /lncrawl/sources/kissnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('KISS-NOVEL') 8 | 9 | 10 | class KissNovelCrawler(Crawler): 11 | base_url = 'https://kiss-novel.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = ' '.join([ 19 | str(x) 20 | for x in soup.select_one('.post-title h1').contents 21 | if not x.name 22 | ]).strip() 23 | logger.info('Novel title: %s', self.novel_title) 24 | 25 | self.novel_cover = self.absolute_url( 26 | soup.select_one('.summary_image img')['src']) 27 | logger.info('Novel cover: %s', self.novel_cover) 28 | 29 | author = soup.find('div', {'class': 'author-content'}).findAll('a') 30 | if len(author) == 2: 31 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 32 | else: 33 | self.novel_author = author[0].text 34 | logger.info('Novel author: %s', self.novel_author) 35 | 36 | latest_chapter = soup.select('div.post-content_item ul li a')[0].text 37 | chapter_count = [int(i) for i in latest_chapter.split() if i.isdigit()] 38 | page_count = (chapter_count)[0]//10+1 39 | chapters_page_url = '%s/%s#chapter-section' 40 | 41 | chapters = [] 42 | 43 | for i in range(page_count): 44 | url = chapters_page_url % (self.novel_url, str(i+1)) 45 | logger.debug('Visiting %s', url) 46 | soup = self.get_soup(url) 47 | chapters.extend(soup.select('ul.main li.wp-manga-chapter a')) 48 | # end for 49 | chapters.reverse() 50 | 51 | for a in chapters: 52 | chap_id = len(self.chapters) + 1 53 | vol_id = chap_id//100 + 1 54 | if len(self.chapters) % 100 == 0: 55 | vol_title = 'Volume ' + str(vol_id) 56 | self.volumes.append({ 57 | 'id': vol_id, 58 | 'title': vol_title, 59 | }) 60 | # end if 61 | self.chapters.append({ 62 | 'id': chap_id, 63 | 'volume': vol_id, 64 | 'url': self.absolute_url(a['href']), 65 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 66 | }) 67 | # end for 68 | # end def 69 | 70 | def download_chapter_body(self, chapter): 71 | '''Download body of a single chapter and return as clean html format.''' 72 | logger.info('Downloading %s', chapter['url']) 73 | soup = self.get_soup(chapter['url']) 74 | 75 | contents = soup.select('div.reading-content p') 76 | 77 | body = [str(p) for p in contents if p.text.strip()] 78 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 79 | 80 | # if contents.h3: 81 | # contents.h3.decompose() 82 | 83 | # for codeblock in contents.findAll('div', {'class': 'code-block'}): 84 | # codeblock.decompose() 85 | 86 | # return str(contents) 87 | # end def 88 | # end class 89 | -------------------------------------------------------------------------------- /lncrawl/sources/bestlightnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from bs4 import BeautifulSoup 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVEL_ONLINE_FREE') 8 | search_url = 'https://bestlightnovel.com/getsearchstory' 9 | novel_page_url = 'https://bestlightnovel.com/novel/%s' 10 | 11 | 12 | class BestLightNovel(Crawler): 13 | base_url = 'https://bestlightnovel.com/' 14 | 15 | def search_novel(self, query): 16 | response = self.submit_form(search_url, { 17 | 'searchword': query 18 | }) 19 | data = response.json() 20 | 21 | results = [] 22 | for novel in data: 23 | titleSoup = BeautifulSoup(novel['name'], 'lxml') 24 | results.append({ 25 | 'title': titleSoup.body.text.title(), 26 | 'url': novel_page_url % novel['nameunsigned'], 27 | 'info': 'Latest: %s' % novel['lastchapter'], 28 | }) 29 | # end for 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | # self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | self.novel_title = soup.select_one('div.entry-header h1').text.strip() 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | try: 43 | novel_data = self.submit_form(search_url, { 44 | 'searchword': self.novel_title 45 | }).json() 46 | self.novel_cover = novel_data[0]['image'] 47 | self.novel_author = novel_data[0]['author'] 48 | except Exception: 49 | logger.debug('Failed getting novel info.\n%s', Exception) 50 | # end try 51 | 52 | for a in reversed(soup.select('#list_chapter .chapter-list a')): 53 | chap_id = len(self.chapters) + 1 54 | vol_id = len(self.chapters) // 100 + 1 55 | if len(self.chapters) % 100 == 0: 56 | self.volumes.append({'id': vol_id}) 57 | # end if 58 | self.chapters.append({ 59 | 'id': chap_id, 60 | 'volume': vol_id, 61 | 'title': a.text.strip(), 62 | 'url': self.absolute_url(a['href']), 63 | }) 64 | # end for 65 | # end def 66 | 67 | def download_chapter_body(self, chapter): 68 | '''Download body of a single chapter and return as clean html format.''' 69 | logger.info('Downloading %s', chapter['url']) 70 | soup = self.get_soup(chapter['url']) 71 | 72 | logger.debug(soup.title.string) 73 | 74 | if 'Chapter' in soup.select_one('h1').text: 75 | chapter['title'] = soup.select_one('h1').text 76 | else: 77 | chapter['title'] = chapter['title'] 78 | # end if 79 | 80 | self.blacklist_patterns = [ 81 | r'^translat(ed by|or)', 82 | r'(volume|chapter) .?\d+', 83 | ] 84 | 85 | contents = soup.select_one('#vung_doc') 86 | body = self.extract_contents(contents) 87 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/sources/novelonlinefull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from bs4 import BeautifulSoup 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVEL_ONLINE_FULL') 8 | search_url = 'https://novelonlinefull.com/getsearchstory' 9 | novel_page_url = 'https://novelonlinefull.com/novel/%s' 10 | 11 | 12 | class NovelOnlineFullCrawler(Crawler): 13 | base_url = 'https://novelonlinefull.com/' 14 | 15 | def search_novel(self, query): 16 | response = self.submit_form(search_url, { 17 | 'searchword': query 18 | }) 19 | data = response.json() 20 | 21 | results = [] 22 | for novel in data: 23 | titleSoup = BeautifulSoup(novel['name'], 'lxml') 24 | results.append({ 25 | 'title': titleSoup.body.text.title(), 26 | 'url': novel_page_url % novel['nameunsigned'], 27 | 'info': 'Latest: %s' % novel['lastchapter'], 28 | }) 29 | # end for 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | # self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | self.novel_title = soup.select_one('div.entry-header h1').text.strip() 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | try: 43 | novel_data = self.submit_form(search_url, { 44 | 'searchword': self.novel_title 45 | }).json() 46 | self.novel_cover = novel_data[0]['image'] 47 | self.novel_author = novel_data[0]['author'] 48 | except Exception: 49 | logger.debug('Failed getting novel info.\n%s', Exception) 50 | # end try 51 | 52 | for a in reversed(soup.select('#list_chapter .chapter-list a')): 53 | chap_id = len(self.chapters) + 1 54 | vol_id = len(self.chapters) // 100 + 1 55 | if len(self.chapters) % 100 == 0: 56 | self.volumes.append({'id': vol_id}) 57 | # end if 58 | self.chapters.append({ 59 | 'id': chap_id, 60 | 'volume': vol_id, 61 | 'title': a.text.strip(), 62 | 'url': self.absolute_url(a['href']), 63 | }) 64 | # end for 65 | # end def 66 | 67 | def download_chapter_body(self, chapter): 68 | '''Download body of a single chapter and return as clean html format.''' 69 | logger.info('Downloading %s', chapter['url']) 70 | soup = self.get_soup(chapter['url']) 71 | 72 | logger.debug(soup.title.string) 73 | 74 | if 'Chapter' in soup.select_one('h1').text: 75 | chapter['title'] = soup.select_one('h1').text 76 | else: 77 | chapter['title'] = chapter['title'] 78 | # end if 79 | 80 | self.blacklist_patterns = [ 81 | r'^translat(ed by|or)', 82 | r'(volume|chapter) .?\d+', 83 | ] 84 | 85 | contents = soup.select_one('#vung_doc') 86 | body = self.extract_contents(contents) 87 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/sources/boxnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('BOXNOVEL') 8 | search_url = 'https://boxnovel.com/?s=%s&post_type=wp-manga&author=&artist=&release=' 9 | 10 | 11 | class BoxNovelCrawler(Crawler): 12 | base_url = 'https://boxnovel.com/' 13 | 14 | def search_novel(self, query): 15 | query = query.lower().replace(' ', '+') 16 | soup = self.get_soup(search_url % query) 17 | 18 | results = [] 19 | for tab in soup.select('.c-tabs-item__content'): 20 | a = tab.select_one('.post-title h4 a') 21 | latest = tab.select_one('.latest-chap .chapter a').text 22 | votes = tab.select_one('.rating .total_votes').text 23 | results.append({ 24 | 'title': a.text.strip(), 25 | 'url': self.absolute_url(a['href']), 26 | 'info': '%s | Rating: %s' % (latest, votes), 27 | }) 28 | # end for 29 | 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | self.novel_title = ' '.join([ 39 | str(x) 40 | for x in soup.select_one('.post-title h3').contents 41 | if not x.name 42 | ]).strip() 43 | logger.info('Novel title: %s', self.novel_title) 44 | 45 | probable_img = soup.select_one('.summary_image img') 46 | if probable_img: 47 | self.novel_cover = self.absolute_url(probable_img['src']) 48 | logger.info('Novel cover: %s', self.novel_cover) 49 | 50 | author = soup.select('.author-content a') 51 | if len(author) == 2: 52 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 53 | else: 54 | self.novel_author = author[0].text 55 | logger.info('Novel author: %s', self.novel_author) 56 | 57 | chapters = soup.select('ul.main li.wp-manga-chapter a') 58 | for a in reversed(chapters): 59 | chap_id = len(self.chapters) + 1 60 | vol_id = chap_id//100 + 1 61 | if len(self.chapters) % 100 == 0: 62 | vol_title = 'Volume ' + str(vol_id) 63 | self.volumes.append({ 64 | 'id': vol_id, 65 | 'title': vol_title, 66 | }) 67 | # end if 68 | self.chapters.append({ 69 | 'id': chap_id, 70 | 'volume': vol_id, 71 | 'url': self.absolute_url(a['href']), 72 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 73 | }) 74 | # end for 75 | # end def 76 | 77 | def download_chapter_body(self, chapter): 78 | '''Download body of a single chapter and return as clean html format.''' 79 | logger.info('Downloading %s', chapter['url']) 80 | soup = self.get_soup(chapter['url']) 81 | 82 | contents = soup.select_one('div.text-left') 83 | for bad in contents.select('h3, .code-block, script, .adsbygoogle'): 84 | bad.decompose() 85 | 86 | return str(contents) 87 | # end def 88 | # end class 89 | -------------------------------------------------------------------------------- /lncrawl/sources/webnovelindonesia.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from concurrent import futures 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WEBNOVEL_INDONESIA') 8 | 9 | chapter_list_url = 'https://webnovelindonesia.com/wp-json/writerist/v1/chapters?category=%s&perpage=100&order=ASC&paged=%s' 10 | 11 | 12 | class WebnovelIndonesia(Crawler): 13 | 
base_url = 'https://webnovelindonesia.com/' 14 | 15 | def read_novel_info(self): 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.select_one('.breadcrumb .breadcrumb-item.active').text.strip() 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.select_one('.section-novel img[class*="lazy"]')['data-src']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | self.novel_author = soup.select_one('.section-novel li a[href*="/aut/"]').text.strip() 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | possible_chapter_pages = soup.select('#js-chpater-jump > div.jump-to') 30 | 31 | if not len(possible_chapter_pages): 32 | possible_chapter_pages = [{'data-paged': '1'}] 33 | # end if 34 | 35 | novel_id = soup.select_one('#sortable-table')['data-category'] 36 | 37 | logger.info('Downloading chapters...') 38 | futures_to_check = dict() 39 | for div in possible_chapter_pages: 40 | page = div['data-paged'] 41 | url = chapter_list_url % (novel_id, page) 42 | task = self.executor.submit(self.extract_chapter_list, url) 43 | futures_to_check[task] = page 44 | # end for 45 | 46 | temp_chapters = dict() 47 | for future in futures.as_completed(futures_to_check): 48 | page = int(futures_to_check[future]) 49 | temp_chapters[page] = future.result() 50 | # end for 51 | 52 | logger.info('Building sorted chapter list...') 53 | for page in sorted(temp_chapters.keys()): 54 | self.volumes.append({'id': page}) 55 | for chap in temp_chapters[page]: 56 | chap['volume'] = page 57 | chap['id'] = 1 + len(self.chapters) 58 | self.chapters.append(chap) 59 | # end for 60 | # end for 61 | # end def 62 | 63 | def extract_chapter_list(self, url): 64 | temp_list = [] 65 | logger.debug('Visiting: %s', url) 66 | data = self.get_json(url) 67 | for item in data: 68 | temp_list.append({ 69 | 'title': item['post_title'], 70 | 'url': self.absolute_url(item['permalink']), 71 | }) 72 | # end for 73 | return temp_list 74 | # end def 75 | 76 | def download_chapter_body(self, chapter): 77 | '''Download body of a single chapter and return as clean html format''' 78 | logger.info('Downloading %s', chapter['url']) 79 | soup = self.get_soup(chapter['url']) 80 | 81 | body = '' 82 | for p in soup.select('#content > p'): 83 | if p.text.strip(): 84 | body += str(p).strip() 85 | # end if 86 | # end for 87 | 88 | return body 89 | # end def 90 | # end class 91 | -------------------------------------------------------------------------------- /lncrawl/sources/translateindo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import quote, urlparse 5 | import urllib.parse 6 | from bs4 import BeautifulSoup 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('TRANSLATEINDO') 11 | 12 | #search_url = 'https://www.worldnovel.online/wp-json/writerist/v1/novel/search?keyword=%s' 13 | #chapter_list_url = "https://www.worldnovel.online/wp-json/writerist/v1/chapters?category=%s&perpage=4000&order=ASC&paged=1" 14 | 15 | 16 | class TranslateIndoCrawler(Crawler): 17 | base_url = 'https://www.translateindo.com/' 18 | 19 | # def search_novel(self, query): 20 | # data = self.get_json(search_url % quote(query)) 21 | 22 | # results = [] 23 | # for item in data: 24 | # results.append({ 25 | # 'url': item['permalink'], 26 | # 'title': item['post_title'], 27 | # }) 28 | # # end 
for 29 | 30 | # return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | logger.info('Novel title: %s', self.novel_title) 40 | 41 | possible_cover = soup.select_one('div.entry-content img')['src'] 42 | if possible_cover: 43 | self.novel_cover = self.absolute_url(possible_cover) 44 | # end if 45 | logger.info('Novel cover: %s', self.novel_cover) 46 | 47 | for span in soup.select('div.entry-content p span'): 48 | possible_author = re.sub(r'[\(\s\n\)]+', ' ', span.text, re.M).strip() 49 | if possible_author.startswith('Author:'): 50 | possible_author = re.sub('Author:', '', possible_author) 51 | self.novel_author = possible_author.strip() 52 | break 53 | # end if 54 | # end for 55 | logger.info('Novel author: %s', self.novel_author) 56 | 57 | for div in soup.select('.cl-lists .cl-block'): 58 | possible_vol = div.select_one('.cl-header') 59 | if not possible_vol: 60 | continue 61 | 62 | vol_title = possible_vol.text.strip() 63 | vol_id = len(self.volumes) + 1 64 | self.volumes.append({ 65 | 'id': vol_id, 66 | 'title': vol_title, 67 | }) 68 | 69 | for a in div.select('ol.cl-body li a'): 70 | chap_id = len(self.chapters) + 1 71 | self.chapters.append({ 72 | 'id': chap_id, 73 | 'volume': vol_id, 74 | 'url': self.absolute_url(a['href']), 75 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 76 | }) 77 | # end for 78 | # end for 79 | # end def 80 | 81 | def download_chapter_body(self, chapter): 82 | '''Download body of a single chapter and return as clean html format''' 83 | logger.info('Downloading %s', chapter['url']) 84 | soup = self.get_soup(chapter['url']) 85 | 86 | contents = soup.select('div.entry-content p') 87 | 88 | body = [str(p) for p in contents if p.text.strip()] 89 | return '

<p>' + '</p><p>'.join(body) + '</p>

' 90 | # end def 91 | # end class 92 | -------------------------------------------------------------------------------- /setup_pyi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import platform 5 | import re 6 | import shlex 7 | import shutil 8 | import sys 9 | from pathlib import Path 10 | 11 | from PyInstaller import __main__ as pyi 12 | from setuptools.config import read_configuration 13 | 14 | ROOT = Path(__file__).parent 15 | unix_root = '/'.join(str(ROOT).split(os.sep)) 16 | site_packages = list(ROOT.glob('venv/**/site-packages'))[0] 17 | unix_site_packages = '/'.join(str(site_packages).split(os.sep)) 18 | 19 | 20 | def package(): 21 | output = str(ROOT / 'windows') 22 | shutil.rmtree(output, ignore_errors=True) 23 | os.makedirs(output, exist_ok=True) 24 | setup_command() 25 | pyi.run() 26 | shutil.rmtree(output, ignore_errors=True) 27 | # end def 28 | 29 | 30 | def setup_command(): 31 | command = 'pyinstaller ' 32 | command += '--onefile ' # onefile 33 | command += '--clean ' 34 | command += '--noconfirm ' 35 | command += '--name "lncrawl" ' 36 | command += '--icon "%s/res/lncrawl.ico" ' % unix_root 37 | command += '--distpath "%s" ' % str(ROOT / 'dist') 38 | command += '--specpath "%s" ' % str(ROOT / 'windows') 39 | command += '--workpath "%s" ' % str(ROOT / 'windows' / 'build') 40 | 41 | command += gather_data_files() 42 | command += gather_hidden_imports() 43 | command += '"%s/__main__.py" ' % unix_root 44 | 45 | print(command) 46 | print() 47 | 48 | sys.argv = shlex.split(command) 49 | # end def 50 | 51 | 52 | def gather_data_files(): 53 | command = '' 54 | 55 | # add data files of this project 56 | for f in (ROOT / 'lncrawl').glob('**/*.*'): 57 | src = str(f) 58 | src = '/'.join(src.split(os.sep)) 59 | dst = str(f.parent.relative_to(ROOT)) 60 | dst = '/'.join(dst.split(os.sep)) 61 | command += '--add-data "%s%s%s" ' % (src, os.pathsep, dst) 62 | # end for 63 | command += '--add-data "%s/lncrawl/VERSION%slncrawl" ' % (unix_root, os.pathsep) 64 | 65 | # add data files of other dependencies 66 | command += '--add-data "%s/cairosvg/VERSION%s." 
' % ( 67 | unix_site_packages, os.pathsep) 68 | command += '--add-data "%s/cairocffi/VERSION%scairocffi" ' % ( 69 | unix_site_packages, os.pathsep) 70 | command += '--add-data "%s/tinycss2/VERSION%stinycss2" ' % ( 71 | unix_site_packages, os.pathsep) 72 | command += '--add-data "%s/text_unidecode/data.bin%stext_unidecode" ' % ( 73 | unix_site_packages, os.pathsep) 74 | command += '--add-data "%s/cloudscraper%scloudscraper" ' % ( 75 | unix_site_packages, os.pathsep) 76 | command += '--add-data "%s/wcwidth/version.json%swcwidth" ' % ( 77 | unix_site_packages, os.pathsep) 78 | 79 | return command 80 | # end def 81 | 82 | 83 | def gather_hidden_imports(): 84 | command = '' 85 | 86 | # add hidden imports of this project 87 | for f in (ROOT / 'lncrawl' / 'sources').glob('*.py'): 88 | if os.path.isfile(f) and re.match(r'^([^_.][^.]+).py$', f.name): 89 | module_name = f.name[:-3] 90 | command += '--hidden-import "lncrawl.sources.%s" ' % module_name 91 | # end if 92 | # end for 93 | command += '--hidden-import "pkg_resources.py2_warn" ' 94 | 95 | return command 96 | # end def 97 | 98 | 99 | if __name__ == '__main__': 100 | package() 101 | # end if 102 | -------------------------------------------------------------------------------- /lncrawl/binders/calibre.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | import subprocess 5 | 6 | logger = logging.getLogger('CALIBRE_BINDER') 7 | 8 | EBOOK_CONVERT = 'ebook-convert' 9 | CALIBRE_LINK = 'https://calibre-ebook.com/download' 10 | 11 | 12 | def run_ebook_convert(*args): 13 | ''' 14 | Calls `ebook-convert` with given args 15 | Visit https://manual.calibre-ebook.com/generated/en/ebook-convert.html for argument list. 16 | ''' 17 | try: 18 | isdebug = os.getenv('debug_mode') == 'yes' 19 | with open(os.devnull, 'w') as dumper: 20 | subprocess.call( 21 | [EBOOK_CONVERT] + list(args), 22 | stdout=None if isdebug else dumper, 23 | stderr=None if isdebug else dumper, 24 | ) 25 | # end with 26 | return True 27 | except Exception: 28 | import traceback 29 | logger.debug(traceback.format_exc()) 30 | return False 31 | # end try 32 | # end def 33 | 34 | 35 | def epub_to_calibre(app, epub_file, out_fmt): 36 | if not os.path.exists(epub_file): 37 | return None 38 | # end if 39 | 40 | epub_path = os.path.dirname(epub_file) 41 | epub_file_name = os.path.basename(epub_file) 42 | file_name_without_ext = epub_file_name.replace('.epub', '') 43 | 44 | work_path = os.path.dirname(epub_path) 45 | out_path = os.path.join(work_path, out_fmt) 46 | out_file_name = file_name_without_ext + '.' + out_fmt 47 | out_file = os.path.join(out_path, out_file_name) 48 | 49 | os.makedirs(out_path, exist_ok=True) 50 | 51 | logger.debug('Converting "%s" to "%s"', epub_file, out_file) 52 | 53 | args = [ 54 | epub_file, 55 | out_file, 56 | '--unsmarten-punctuation', 57 | '--no-chapters-in-toc', 58 | '--title', file_name_without_ext, 59 | '--authors', app.crawler.novel_author, 60 | '--series', app.crawler.novel_title, 61 | '--publisher', app.crawler.home_url, 62 | '--book-producer', 'Lightnovel Crawler', 63 | '--enable-heuristics', '--disable-renumber-headings', 64 | ] 65 | if app.book_cover: 66 | args += ['--cover', app.book_cover] 67 | if out_fmt == 'pdf': 68 | args += [ 69 | '--paper-size', 'a4', 70 | '--pdf-page-numbers', 71 | '--pdf-hyphenate', 72 | '--pdf-header-template', '

⦗ _TITLE_ — _SECTION_ ⦘

', 73 | ] 74 | # end if 75 | 76 | run_ebook_convert(*args) 77 | 78 | if os.path.exists(out_file): 79 | print('Created: %s' % out_file_name) 80 | return out_file 81 | else: 82 | logger.error('[%s] conversion failed: %s', out_fmt, epub_file_name) 83 | return None 84 | # end if 85 | # end def 86 | 87 | 88 | def make_calibres(app, epubs, out_fmt): 89 | if out_fmt == 'epub' or not epubs: 90 | return epubs 91 | # end if 92 | 93 | if not run_ebook_convert('--version'): 94 | logger.error('Install Calibre to generate %s: %s', 95 | out_fmt, CALIBRE_LINK), 96 | return 97 | # end if 98 | 99 | out_files = [] 100 | for epub in epubs: 101 | out = epub_to_calibre(app, epub, out_fmt) 102 | out_files += [out] 103 | # end for 104 | 105 | return out_files 106 | # end def 107 | -------------------------------------------------------------------------------- /lncrawl/sources/zenithnovels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | import requests 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('ZENITH_NOVELS') 11 | 12 | novel_url = 'http://zenithnovels.com/%s/' 13 | 14 | 15 | class ZenithNovelsCrawler(Crawler): 16 | base_url = 'http://zenithnovels.com/' 17 | 18 | def read_novel_info(self): 19 | '''Get novel title, autor, cover etc''' 20 | self.novel_id = re.search( 21 | r'(?<=zenithnovels.com/)[^/]+', self.novel_url).group(0) 22 | logger.info('Novel id: %s', self.novel_id) 23 | 24 | url = novel_url % self.novel_id 25 | logger.debug('Visiting %s', url) 26 | soup = self.get_soup(url) 27 | 28 | self.novel_title = soup.select_one('article#the-post h1.name').text 29 | logger.info('Novel title: %s', self.novel_title) 30 | 31 | self.novel_cover = self.absolute_url(soup.select_one( 32 | 'article#the-post .entry img')['src']) 33 | logger.info('Novel cover: %s', self.novel_cover) 34 | 35 | while True: 36 | self.parse_chapter_list(soup) 37 | 38 | next_link = soup.select_one('ul.lcp_paginator a.lcp_nextlink') 39 | if next_link: 40 | soup = self.get_soup(next_link['href']) 41 | else: 42 | break 43 | # end if 44 | # end if 45 | 46 | self.chapters.sort(key=lambda x: x['volume'] * 1e6 + x['id']) 47 | self.volumes = [{'id': x, 'title': ''} for x in set(self.volumes)] 48 | # end def 49 | 50 | def parse_chapter_list(self, soup): 51 | for a in soup.select('ul.lcp_catlist li a'): 52 | ch_title = a['title'] 53 | ch_id = [int(''.join(x).strip()) for x in re.findall( 54 | r'((?<=ch) \d+)|((?<=chapter) \d+)', ch_title, re.IGNORECASE)] 55 | ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 56 | vol_id = [int(''.join(x).strip()) for x in re.findall( 57 | r'((?<=book) \d+)|((?<=volume) \d+)', ch_title, re.IGNORECASE)] 58 | vol_id = vol_id[0] if len(vol_id) else 1 + (ch_id - 1) // 100 59 | 60 | self.volumes.append(vol_id) 61 | self.chapters.append({ 62 | 'id': ch_id, 63 | 'volume': vol_id, 64 | 'title': ch_title, 65 | 'url': self.absolute_url(a['href']), 66 | }) 67 | # end for 68 | # end def 69 | 70 | def download_chapter_body(self, chapter): 71 | '''Download body of a single chapter and return as clean html format.''' 72 | logger.info('Downloading %s', chapter['url']) 73 | soup = self.get_soup(chapter['url']) 74 | 75 | entry = soup.select_one('article#the-post .entry') 76 | 77 | try: 78 | self.clean_contents(entry) 79 | for note in entry.select('.footnote'): 80 | note.decompose() 81 | # end for 82 | except Exception: 83 | pass 84 | # end try 85 | 86 | body = '' 87 | for tag 
in entry.children: 88 | if tag.name == 'p' and len(tag.text.strip()): 89 | p = ' '.join(self.extract_contents(tag)) 90 | if len(p.strip()): 91 | body += '<p>%s</p>
' % p 92 | # end if 93 | # end if 94 | # end for 95 | 96 | return body 97 | # end def 98 | # end class 99 | -------------------------------------------------------------------------------- /lncrawl/sources/litnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | 5 | logger = logging.getLogger('LITNET') 6 | search_url = 'https://litnet.com/en/search?q=%s' 7 | 8 | 9 | class LitnetCrawler(Crawler): 10 | base_url = 'https://litnet.com/' 11 | 12 | def search_novel(self, query): 13 | query = query.lower().replace(' ', '+') 14 | soup = self.get_soup(search_url % query) 15 | 16 | results = [] 17 | for a in soup.select('div.l-container ul a'): 18 | results.append({ 19 | 'title': a.text.strip(), 20 | 'url': self.absolute_url(a['href']), 21 | }) 22 | # end for 23 | 24 | return results 25 | # end def 26 | 27 | def read_novel_info(self): 28 | '''Get novel title, autor, cover etc''' 29 | logger.debug('Visiting %s', self.novel_url) 30 | soup = self.get_soup(self.novel_url) 31 | 32 | self.novel_title = soup.select_one('h1').text.strip() 33 | logger.info('Novel title: %s', self.novel_title) 34 | 35 | img_src = soup.select_one('div.book-view-cover img') 36 | if not img_src: 37 | img_src = soup.select_one('div.book-cover img') 38 | # end if 39 | if img_src: 40 | self.novel_cover = self.absolute_url(img_src['src']) 41 | # end if 42 | logger.info('Novel cover: %s', self.novel_cover) 43 | 44 | author = soup.select_one('div.book-view-info a.author') 45 | if not author: 46 | author = soup.select_one('div.book-head-content a.book-autor') 47 | # end if 48 | if author: 49 | self.novel_author = author.text.strip() 50 | # end if 51 | logger.info('Novel author: %s', self.novel_author) 52 | 53 | chapters = soup.find('select', {'name': 'chapter'}) 54 | if chapters is None: 55 | chapters = soup.select('div.collapsible-body a.collection-item') 56 | else: 57 | chapters = chapters.find_all('option') 58 | chapters = [c for c in chapters if c.attrs['value']] 59 | # end if 60 | 61 | for a in chapters: 62 | chap_id = len(self.chapters) + 1 63 | if len(self.chapters) % 100 == 0: 64 | vol_id = chap_id//100 + 1 65 | vol_title = 'Volume ' + str(vol_id) 66 | self.volumes.append({ 67 | 'id': vol_id, 68 | 'title': vol_title, 69 | }) 70 | # end if 71 | 72 | abs_url = self.last_visited_url.replace('book', 'reader') 73 | chap_url = abs_url + \ 74 | ('?c=%s' % a.attrs['value']) if a.has_attr( 75 | 'value') else self.home_url + a['href'] 76 | self.chapters.append({ 77 | 'id': chap_id, 78 | 'volume': 1, 79 | 'url': chap_url, 80 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 81 | }) 82 | # end for 83 | # end def 84 | 85 | def download_chapter_body(self, chapter): 86 | '''Download body of a single chapter and return as clean html format.''' 87 | logger.info('Downloading %s', chapter['url']) 88 | soup = self.get_soup(chapter['url']) 89 | 90 | contents = soup.select_one('div.reader-text') 91 | if contents is None: 92 | contents = soup.select_one('div.demo-txt') 93 | return str(contents) 94 | # end def 95 | # end class 96 | -------------------------------------------------------------------------------- /lncrawl/bots/console/get_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from PyInquirer import prompt 5 | 6 | from ...core import display 7 | from ...core.arguments import get_args 8 | from ...sources import 
rejected_sources 9 | 10 | 11 | def get_novel_url(self): 12 | '''Returns a novel page url or a query''' 13 | args = get_args() 14 | if args.query and len(args.query) > 1: 15 | return args.query 16 | # end if 17 | 18 | url = args.novel_page 19 | if url: 20 | if re.match(r'^https?://.+\..+$', url): 21 | return url 22 | else: 23 | raise Exception('Invalid URL of novel page') 24 | # end if 25 | # end if 26 | 27 | try: 28 | if args.suppress: 29 | raise Exception() 30 | # end if 31 | 32 | answer = prompt([ 33 | { 34 | 'type': 'input', 35 | 'name': 'novel', 36 | 'message': 'Enter novel page url or query novel:', 37 | 'validate': lambda val: 'Input should not be empty' 38 | if len(val) == 0 else True, 39 | }, 40 | ]) 41 | return answer['novel'].strip() 42 | except Exception: 43 | raise Exception('Novel page url or query was not given') 44 | # end try 45 | # end def 46 | 47 | 48 | def get_crawlers_to_search(self): 49 | '''Returns user choice to search the choosen sites for a novel''' 50 | links = self.app.crawler_links 51 | if not links: 52 | return None 53 | # end if 54 | 55 | args = get_args() 56 | if args.suppress or not args.sources: 57 | return links 58 | # end if 59 | 60 | answer = prompt([ 61 | { 62 | 'type': 'checkbox', 63 | 'name': 'sites', 64 | 'message': 'Where to search?', 65 | 'choices': [{'name': x} for x in sorted(links)], 66 | } 67 | ]) 68 | 69 | selected = answer['sites'] 70 | return selected if len(selected) > 0 else links 71 | # end def 72 | 73 | 74 | def choose_a_novel(self): 75 | '''Choose a single novel url from the search result''' 76 | args = get_args() 77 | 78 | # Choose a novel title 79 | choices = self.app.search_results 80 | selected_choice = self.app.search_results[0] 81 | if len(choices) > 1 and not args.suppress: 82 | answer = prompt([ 83 | { 84 | 'type': 'list', 85 | 'name': 'novel', 86 | 'message': 'Which one is your novel?', 87 | 'choices': display.format_novel_choices(choices), 88 | } 89 | ]) 90 | 91 | index = int(answer['novel'].split('.')[0]) 92 | selected_choice = self.app.search_results[index - 1] 93 | # end if 94 | 95 | # Choose the novel source 96 | novels = selected_choice['novels'] 97 | selected_novel = novels[0] 98 | if len(novels) > 1 and not args.suppress: 99 | answer = prompt([ 100 | { 101 | 'type': 'list', 102 | 'name': 'novel', 103 | 'message': 'Choose a source to download?', 104 | 'choices': ['0. Back'] + display.format_source_choices(novels), 105 | } 106 | ]) 107 | 108 | index = int(answer['novel'].split('.')[0]) 109 | if index == 0: 110 | return self.choose_a_novel() 111 | # end if 112 | selected_novel = novels[index - 1] 113 | # end if 114 | 115 | return selected_novel['url'] 116 | # end def 117 | -------------------------------------------------------------------------------- /lncrawl/sources/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Auto imports all crawlers from the current package directory. 4 | To be recognized, your crawler file should meet following conditions: 5 | - file does not starts with an underscore 6 | - file ends with .py extension 7 | - file contains a class that extends `lncrawl.utils.crawler.Crawler` 8 | - the class extending `lncrawl.utils.crawler.Crawler` has a global variable `base_url` 9 | - `base_url` contains a valid url or a list of urls supported by the crawler 10 | 11 | For example, see any of the files inside this directory. 
12 | """ 13 | 14 | import importlib 15 | import os 16 | import re 17 | import sys 18 | from urllib.parse import urlparse 19 | 20 | from ..utils.crawler import Crawler 21 | 22 | rejected_sources = { 23 | 'https://novelplanet.com/': 'Site is closed', 24 | 'http://gravitytales.com/': 'Redirects to webnovel.com', 25 | 'http://fullnovel.live/': "403 - Forbidden: Access is denied", 26 | 'http://moonbunnycafe.com/': "Does not follow uniform format", 27 | 'https://anythingnovel.com/': 'Site broken', 28 | 'https://indomtl.com/': "Does not like to be crawled", 29 | 'https://lnindo.org/': "Does not like to be crawled", 30 | 'https://myoniyonitranslations.com/': "522 - Connection timed out", 31 | 'https://novelgo.id/': "Removed by owner", 32 | 'https://www.flying-lines.com/': 'Obfuscated content', 33 | 'https://www.jieruihao.cn/': "Unavailable", 34 | 'https://www.noveluniverse.com/': "Site is down", 35 | 'https://www.novelupdates.com/': "Does not host any novels", 36 | 'https://www.novelv.com/': "Site is down", 37 | 'https://yukinovel.id/': "Removed by owner", 38 | 'https://www.rebirth.online/': 'Site moved', 39 | 'https://mtled-novels.com/': 'Domain is expired', 40 | } 41 | 42 | # this list will be auto-generated 43 | crawler_list = {} 44 | 45 | # auto-import all submodules in the current directory 46 | __module_regex = re.compile(r'^([^_.][^.]+).py[c]?$', re.I) 47 | __url_regex = re.compile(r'^^(https?|ftp)://[^\s/$.?#].[^\s]*$', re.I) 48 | 49 | for entry in os.listdir(__path__[0]): 50 | file_path = os.path.join(__path__[0], entry) 51 | if not os.path.isfile(file_path): 52 | continue 53 | # end if 54 | 55 | regex_result = __module_regex.findall(entry) 56 | if len(regex_result) != 1: # does not contains a module 57 | continue 58 | # end if 59 | 60 | module_name = regex_result[0] 61 | module = importlib.import_module('.' 
+ module_name, package=__package__) 62 | 63 | for key in dir(module): 64 | item = getattr(module, key) 65 | if type(item) != type(Crawler) or item.__base__ != Crawler: 66 | continue 67 | # end if 68 | 69 | if not hasattr(item, 'base_url'): 70 | raise Exception('No `base_url` for `%s`' % key) 71 | # end if 72 | 73 | base_url = getattr(item, 'base_url') 74 | if isinstance(base_url, str): 75 | base_url = [base_url] 76 | # end if 77 | 78 | if not isinstance(base_url, list): 79 | raise Exception('Unexpected `base_url` type in `%s`' % key) 80 | # end if 81 | 82 | for url in base_url: 83 | if not __url_regex.match(url): 84 | raise Exception('Invalid `base_url` in `%s`: %s' % (key, url)) 85 | # end if 86 | if not url.endswith('/'): 87 | url += '/' 88 | # end if 89 | if url in rejected_sources: 90 | continue 91 | # end if 92 | crawler_list[url] = item 93 | # end for 94 | # end for 95 | # end for 96 | -------------------------------------------------------------------------------- /lncrawl/sources/royalroad.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('ROYALROAD') 8 | search_url = 'https://www.royalroad.com/fictions/search?keyword=%s' 9 | 10 | 11 | class RoyalRoadCrawler(Crawler): 12 | base_url = 'https://www.royalroad.com/' 13 | 14 | def search_novel(self, query): 15 | query = query.lower().replace(' ', '+') 16 | soup = self.get_soup(search_url % query) 17 | 18 | results = [] 19 | for a in soup.select('h2.fiction-title a')[:5]: 20 | url = self.absolute_url(a['href']) 21 | results.append({ 22 | 'url': url, 23 | 'title': a.text.strip(), 24 | 'info': self.search_novel_info(url), 25 | }) 26 | # end for 27 | 28 | return results 29 | # end def 30 | 31 | def search_novel_info(self, url): 32 | '''Get novel title, autor, cover etc''' 33 | logger.debug('Visiting %s', url) 34 | soup = self.get_soup(url) 35 | 36 | score = soup.select_one('span.star')['data-content'] 37 | chapters = len(soup.find('tbody').findAll('a', href=True)) 38 | latest = soup.find('tbody').findAll('a', href=True)[-1].text.strip() 39 | info = 'Score: %s, Chapter count %s, Latest: %s' % ( 40 | score, chapters, latest) 41 | 42 | return info 43 | # end def 44 | 45 | def read_novel_info(self): 46 | '''Get novel title, autor, cover etc''' 47 | logger.debug('Visiting %s', self.novel_url) 48 | soup = self.get_soup(self.novel_url) 49 | 50 | self.novel_title = soup.find("h1", {"property": "name"}).text.strip() 51 | logger.info('Novel title: %s', self.novel_title) 52 | 53 | self.novel_cover = self.absolute_url( 54 | soup.find("img", {"class": "img-offset thumbnail inline-block"})['src']) 55 | logger.info('Novel cover: %s', self.novel_cover) 56 | 57 | self.novel_author = soup.find( 58 | "span", {"property": "name"}).text.strip() 59 | logger.info('Novel author: %s', self.novel_author) 60 | 61 | chapters = soup.find('tbody').findAll('a', href=True) 62 | 63 | for x in chapters: 64 | chap_id = len(self.chapters) + 1 65 | if len(self.chapters) % 100 == 0: 66 | vol_id = chap_id//100 + 1 67 | vol_title = 'Volume ' + str(vol_id) 68 | self.volumes.append({ 69 | 'id': vol_id, 70 | 'title': vol_title, 71 | }) 72 | # end if 73 | self.chapters.append({ 74 | 'id': chap_id, 75 | 'volume': vol_id, 76 | 'url': self.absolute_url(x['href']), 77 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 78 | }) 79 | # end for 80 | # end def 81 | 82 | def download_chapter_body(self, 
chapter): 83 | '''Download body of a single chapter and return as clean html format.''' 84 | logger.info('Downloading %s', chapter['url']) 85 | soup = self.get_soup(chapter['url']) 86 | 87 | logger.debug(soup.title.string) 88 | 89 | if 'Chapter' in soup.select_one('h2').text: 90 | chapter['title'] = soup.select_one('h2').text 91 | else: 92 | chapter['title'] = chapter['title'] 93 | # end if 94 | 95 | contents = soup.find("div", {"class": "chapter-content"}) 96 | 97 | self.clean_contents(contents) 98 | return str(contents) 99 | # end def 100 | # end class 101 | -------------------------------------------------------------------------------- /lncrawl/sources/wuxiasite.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WUXIA-SITE') 8 | search_url = 'https://wuxiaworld.site/?s=%s&post_type=wp-manga' 9 | 10 | 11 | class WuxiaSiteCrawler(Crawler): 12 | base_url = 'https://wuxiaworld.site/' 13 | 14 | # TODO: disabled due to cloudflare issue 15 | # def search_novel(self, query): 16 | # query = query.lower().replace(' ', '+') 17 | # soup = self.get_soup(search_url % query) 18 | 19 | # results = [] 20 | # for tab in soup.select('.c-tabs-item__content'): 21 | # a = tab.select_one('.post-title h4 a') 22 | # latest = tab.select_one('.latest-chap .chapter a').text 23 | # votes = tab.select_one('.rating .total_votes').text 24 | # results.append({ 25 | # 'title': a.text.strip(), 26 | # 'url': self.absolute_url(a['href']), 27 | # 'info': '%s | Rating: %s' % (latest, votes), 28 | # }) 29 | # # end for 30 | 31 | # return results 32 | # # end def 33 | 34 | def read_novel_info(self): 35 | '''Get novel title, autor, cover etc''' 36 | logger.debug('Visiting %s', self.novel_url) 37 | soup = self.get_soup(self.novel_url) 38 | 39 | self.novel_title = ' '.join([ 40 | str(x) 41 | for x in soup.select_one('.post-title h3').contents 42 | if not x.name 43 | ]).strip() 44 | logger.info('Novel title: %s', self.novel_title) 45 | 46 | possible_img = soup.select_one('.summary_image img') 47 | if possible_img: 48 | if possible_img.has_attr('data-src'): 49 | self.novel_cover = self.absolute_url(possible_img['data-src']) 50 | elif possible_img.has_attr('srcset'): 51 | self.novel_cover = self.absolute_url(possible_img['srcset'].split(',')[0]) 52 | elif possible_img.has_attr('src'): 53 | self.novel_cover = self.absolute_url(possible_img['src']) 54 | logger.info('Novel cover: %s', self.novel_cover) 55 | 56 | author = soup.select('.author-content a') 57 | if len(author) == 2: 58 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 59 | else: 60 | self.novel_author = author[0].text 61 | logger.info('Novel author: %s', self.novel_author) 62 | 63 | chapters = soup.select('ul.main li.wp-manga-chapter a') 64 | chapters.reverse() 65 | 66 | for a in chapters: 67 | chap_id = len(self.chapters) + 1 68 | vol_id = chap_id//100 + 1 69 | if len(self.chapters) % 100 == 0: 70 | vol_title = 'Volume ' + str(vol_id) 71 | self.volumes.append({ 72 | 'id': vol_id, 73 | 'title': vol_title, 74 | }) 75 | # end if 76 | self.chapters.append({ 77 | 'id': chap_id, 78 | 'volume': vol_id, 79 | 'url': self.absolute_url(a['href']), 80 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 81 | }) 82 | # end for 83 | # end def 84 | 85 | def download_chapter_body(self, chapter): 86 | '''Download body of a single chapter and return as clean html format.''' 87 | 
logger.info('Downloading %s', chapter['url']) 88 | soup = self.get_soup(chapter['url']) 89 | contents = soup.select('.text-left p, .cha-words p') 90 | body = [str(p) for p in contents if p.text.strip()] 91 | return '<p>' + '</p><p>'.join(body) + '</p>
' 92 | # end def 93 | # end class 94 | -------------------------------------------------------------------------------- /lncrawl/bots/test/post_github.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import os 5 | import platform 6 | import sys 7 | from datetime import datetime 8 | from urllib.parse import urlencode 9 | 10 | import requests 11 | 12 | from ...assets.user_agents import user_agents 13 | 14 | logger = logging.getLogger('MAKE_GITHUB_ISSUE') 15 | 16 | # Authentication for user filing issue 17 | USERNAME = os.getenv('GITHUB_USERNAME') 18 | # PASSWORD = os.getenv('GITHUB_PASSWORD') # deprecated 19 | TOKEN = os.getenv('GITHUB_TOKEN') # must have read/write access to repo 20 | 21 | # The repository to add this issue to 22 | REPO_OWNER = 'dipu-bd' 23 | REPO_NAME = 'lightnovel-crawler' 24 | 25 | # Headers 26 | headers = { 27 | "User-Agent": user_agents[0], 28 | "Authorization": "token %s" % TOKEN, 29 | "Accept": "application/vnd.github.golden-comet-preview+json" 30 | } 31 | 32 | 33 | def find_issues(labels=None): 34 | '''Returns list of issues by query''' 35 | # Url to get issues via GET 36 | url = 'https://api.github.com/repos/%s/%s/issues' % (REPO_OWNER, REPO_NAME) 37 | 38 | # Create a session without authentication 39 | session = requests.Session() 40 | 41 | # Create our issue 42 | data = { 43 | 'labels': labels, 44 | } 45 | 46 | # Get issues 47 | r = session.get(url + '?' + urlencode(data), headers=headers) 48 | if r.ok: 49 | logger.info('Successfully retrieved issues') 50 | return r.json() 51 | else: 52 | logger.info('Failed to get issues: %s' % url) 53 | logger.debug('Response:\n%s\n' % r.content) 54 | return [] 55 | # end if 56 | # end def 57 | 58 | 59 | def post_issue(title, body=None, labels=None): 60 | '''Create an issue on github.com using the given parameters.''' 61 | # Our url to create issues via POST 62 | url = 'https://api.github.com/repos/%s/%s/import/issues' % (REPO_OWNER, REPO_NAME) 63 | 64 | # Create an authenticated session to create the issue 65 | session = requests.Session() 66 | # session.auth = (USERNAME, PASSWORD) 67 | 68 | # Create our issue 69 | payload = json.dumps({ 70 | 'issue': { 71 | 'title': title, 72 | 'body': body, 73 | 'labels': labels, 74 | } 75 | }) 76 | 77 | # Add the issue to our repository 78 | r = session.post(url, data=payload, headers=headers) 79 | if r.ok: 80 | logger.info('Successfully created Issue %s' % title) 81 | else: 82 | logger.info('Could not create Issue %s' % title) 83 | logger.debug('Response:\n%s\n' % r.content) 84 | raise Exception('Failed to create issue') 85 | # end if 86 | # end def 87 | 88 | 89 | def post_on_github(self, message): 90 | if sys.version_info.minor != 6: 91 | print('Not Python 3.6... skipping.') 92 | return 93 | # end if 94 | 95 | # Check if there is already an issue younger than a week 96 | issues = find_issues('bot-report') 97 | if len(issues): 98 | time = int(issues[0]['title'].split('~')[-1].strip()) 99 | diff = datetime.utcnow().timestamp() - time 100 | if diff < 7 * 24 * 3600: 101 | print('Detected an open issue younger than a week... 
skipping.') 102 | return 103 | # end if 104 | # end if 105 | 106 | # Create new issue with appropriate label 107 | title = '[Test Bot][Python %d.%d][%s] Report ~ %s' % ( 108 | sys.version_info.major, 109 | sys.version_info.minor, 110 | platform.system(), 111 | str(int(datetime.utcnow().timestamp())) 112 | ) 113 | post_issue(title, message, ['bot-report']) 114 | # end def 115 | --------------------------------------------------------------------------------
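The module docstring in lncrawl/sources/__init__.py above lists what a crawler file must provide to be auto-imported: a non-underscore .py file containing a subclass of lncrawl.utils.crawler.Crawler with a valid base_url. A minimal sketch of such a source file is shown below; the site URL, class name, and CSS selectors are hypothetical placeholders, and it relies only on Crawler helpers already used by the crawlers in this package (get_soup, absolute_url, and the chapters/volumes lists).

# -*- coding: utf-8 -*-
import logging

from ..utils.crawler import Crawler

logger = logging.getLogger('EXAMPLE_SITE')


class ExampleSiteCrawler(Crawler):
    # `base_url` is required by the auto-importer; this URL is a placeholder.
    base_url = 'https://example-novel-site.com/'

    def read_novel_info(self):
        '''Get novel title, author, cover and chapter list'''
        soup = self.get_soup(self.novel_url)

        self.novel_title = soup.select_one('h1.title').text.strip()
        self.novel_author = soup.select_one('.author').text.strip()
        self.novel_cover = self.absolute_url(
            soup.select_one('.cover img')['src'])
        logger.info('Novel title: %s', self.novel_title)

        for a in soup.select('ul.chapters li a'):
            chap_id = len(self.chapters) + 1
            vol_id = chap_id // 100 + 1
            if len(self.chapters) % 100 == 0:
                # start a new volume every 100 chapters, as other crawlers do
                self.volumes.append({'id': vol_id, 'title': 'Volume %d' % vol_id})
            # end if
            self.chapters.append({
                'id': chap_id,
                'volume': vol_id,
                'url': self.absolute_url(a['href']),
                'title': a.text.strip() or ('Chapter %d' % chap_id),
            })
        # end for
    # end def

    def download_chapter_body(self, chapter):
        '''Download body of a single chapter and return as clean html format.'''
        soup = self.get_soup(chapter['url'])
        contents = soup.select('.chapter-content p')
        body = [str(p) for p in contents if p.text.strip()]
        return '<p>' + '</p><p>'.join(body) + '</p>'
    # end def
# end class

Given such a file, the loader in lncrawl/sources/__init__.py imports the module, validates base_url, and registers the class in crawler_list under its normalized URL, so no manual registration step is needed.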