├── Procfile
├── lncrawl
│ ├── VERSION
│ ├── assets
│ │ ├── __init__.py
│ │ ├── fonts
│ │ │ ├── Horta.ttf
│ │ │ ├── Sofia.otf
│ │ │ ├── Bellota.otf
│ │ │ ├── Caladea.ttf
│ │ │ ├── Crimson.otf
│ │ │ ├── Gidole.ttf
│ │ │ ├── Orkney.ttf
│ │ │ ├── Unique.ttf
│ │ │ ├── Bradley Gratis.ttf
│ │ │ ├── Liberation Serif.ttf
│ │ │ ├── Libre Baskerville.ttf
│ │ │ └── Glacial Indifference.otf
│ │ ├── version.py
│ │ ├── colors.txt
│ │ ├── icons.py
│ │ ├── templates
│ │ │ ├── Simple.svg
│ │ │ ├── Simple Dark.svg
│ │ │ ├── Blocks.svg
│ │ │ ├── Column.svg
│ │ │ ├── Window.svg
│ │ │ ├── Cross.svg
│ │ │ ├── Tiles.svg
│ │ │ ├── Gradient.svg
│ │ │ └── Rings.svg
│ │ ├── html_style.py
│ │ └── html_style.css
│ ├── utils
│ │ ├── __init__.py
│ │ ├── update_checker.py
│ │ ├── uploader.py
│ │ └── kindlegen_download.py
│ ├── bots
│ │ ├── discord
│ │ │ ├── __init__.py
│ │ │ └── config.py
│ │ ├── __init__.py
│ │ ├── console
│ │ │ ├── __init__.py
│ │ │ ├── login_info.py
│ │ │ └── get_crawler.py
│ │ ├── test
│ │ │ ├── test_crawler.py
│ │ │ └── post_github.py
│ │ └── _sample.py
│ ├── __init__.py
│ ├── binders
│ │ ├── text.py
│ │ ├── __init__.py
│ │ ├── web.py
│ │ └── calibre.py
│ ├── sources
│ │ ├── anythingnovel.py
│ │ ├── chinesefantasy.py
│ │ ├── asianhobbyist.py
│ │ ├── webnovelonlinecom.py
│ │ ├── listnovel.py
│ │ ├── novelringan.py
│ │ ├── ranobelibme.py
│ │ ├── webnovelonline.py
│ │ ├── flyinglines.py
│ │ ├── wuxialeague.py
│ │ ├── fullnovellive.py
│ │ ├── liberspark.py
│ │ ├── aixdzs.py
│ │ ├── tapread.py
│ │ ├── tomotrans.py
│ │ ├── wattpad.py
│ │ ├── jpmtl.py
│ │ ├── tiknovel.py
│ │ ├── qidiancom.py
│ │ ├── 9kqw.py
│ │ ├── novelspread.py
│ │ ├── novelv.py
│ │ ├── machinetrans.py
│ │ ├── readln.py
│ │ ├── idqidian.py
│ │ ├── yukinovel.py
│ │ ├── fourscanlation.py
│ │ ├── novelgo.py
│ │ ├── gravitytales.py
│ │ ├── machinetransorg.py
│ │ ├── mangatoon.py
│ │ ├── rewayatclub.py
│ │ ├── shinsori.py
│ │ ├── wuxiaonline.py
│ │ ├── crescentmoon.py
│ │ ├── meionovel.py
│ │ ├── kissnovel.py
│ │ ├── bestlightnovel.py
│ │ ├── novelonlinefull.py
│ │ ├── boxnovel.py
│ │ ├── webnovelindonesia.py
│ │ ├── translateindo.py
│ │ ├── zenithnovels.py
│ │ ├── litnet.py
│ │ ├── __init__.py
│ │ ├── royalroad.py
│ │ └── wuxiasite.py
│ └── core
│   ├── __init__.py
│   ├── novel_info.py
│   └── novel_search.py
├── runtime.txt
├── MANIFEST.in
├── dev-requirements.txt
├── res
│ ├── lncrawl.ico
│ ├── lncrawl-icon.png
│ └── lncrawl-web.png
├── __main__.py
├── .github
│ ├── ISSUE_TEMPLATE
│ │ ├── general.md
│ │ ├── new-source.md
│ │ ├── bug_report.md
│ │ └── remove-source.md
│ └── workflows
│   └── pythonpackage.yml
├── package.json
├── scripts
│ ├── publish.sh
│ ├── publish.bat
│ ├── build.sh
│ └── build.bat
├── .gitignore
├── requirements.txt
├── .appveyor.yml
├── .env.example
├── .travis.yml
├── setup.py
├── app.json
├── setup.cfg
├── README.pip
└── setup_pyi.py
/Procfile:
--------------------------------------------------------------------------------
1 | bot: python .
2 |
--------------------------------------------------------------------------------
/lncrawl/VERSION:
--------------------------------------------------------------------------------
1 | 2.22.1
2 |
--------------------------------------------------------------------------------
/lncrawl/assets/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lncrawl/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/runtime.txt:
--------------------------------------------------------------------------------
1 | python-3.6.9
2 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include lncrawl/VERSION
2 | recursive-include lncrawl *.*
3 |
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | Js2Py
2 | PyInstaller
3 | cairosvg
4 | setuptools
5 | wheel
6 |
--------------------------------------------------------------------------------
/res/lncrawl.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl.ico
--------------------------------------------------------------------------------
/lncrawl/bots/discord/__init__.py:
--------------------------------------------------------------------------------
1 | from . import config
2 | from .discord_bot import DiscordBot
3 |
--------------------------------------------------------------------------------
/res/lncrawl-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl-icon.png
--------------------------------------------------------------------------------
/res/lncrawl-web.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/res/lncrawl-web.png
--------------------------------------------------------------------------------
/__main__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | from lncrawl import main
4 | main()
5 |
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Horta.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Horta.ttf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Sofia.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Sofia.otf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Bellota.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Bellota.otf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Caladea.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Caladea.ttf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Crimson.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Crimson.otf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Gidole.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Gidole.ttf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Orkney.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Orkney.ttf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Unique.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Unique.ttf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Bradley Gratis.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Bradley Gratis.ttf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Liberation Serif.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Liberation Serif.ttf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Libre Baskerville.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Libre Baskerville.ttf
--------------------------------------------------------------------------------
/lncrawl/assets/fonts/Glacial Indifference.otf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/skyme5/lightnovel-crawler/master/lncrawl/assets/fonts/Glacial Indifference.otf
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/general.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: General
3 | about: If you want to create a general issue
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/new-source.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: New source
3 | about: Suggest a new source to add
4 | title: Enter your desired sources here
5 | labels: source
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "lightnovel-crawler",
3 | "description": "Downloads lightnovels from various online sources and generates ebooks in many formats.",
4 | "version": "2.16.2",
5 | "engines": {
6 | "node": "12.x"
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/lncrawl/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | try:
4 | from dotenv import load_dotenv
5 | load_dotenv()
6 | except Exception:
7 | pass
8 | # end try
9 |
10 |
11 | def main():
12 | from .core import start_app
13 | start_app()
14 | # end def
15 |
--------------------------------------------------------------------------------
/lncrawl/assets/version.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from pathlib import Path
3 |
4 | ROOT = Path(__file__).parent.parent
5 |
6 | with open(str(ROOT / 'VERSION'), 'r') as f:
7 | version = f.read().strip()
8 | # end with
9 |
10 |
11 | def get_value():
12 | return version
13 | # end def
14 |
--------------------------------------------------------------------------------
/scripts/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | VERSION=$(head -n 1 lncrawl/VERSION)
4 |
5 | PY="python3"
6 | PIP="$PY -m pip --disable-pip-version-check"
7 |
8 | # . scripts/build.sh
9 |
10 | $PIP install twine
11 | $PY -m twine upload "dist/lightnovel_crawler-$VERSION-py3-none-any.whl"
12 |
13 | # FINISHED
14 |
--------------------------------------------------------------------------------
/scripts/publish.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | SET /P VERSION=<lncrawl/VERSION
22 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | os: linux
2 | dist: xenial
3 | language: python
4 | python:
5 | - "3.8"
6 | - "3.7"
7 | - "3.6"
8 | - "3.5"
9 | - nightly
10 |
11 | matrix:
12 | allow_failures:
13 | - python: nightly
14 | - os: osx
15 | fast_finish: true
16 |
17 | before_install:
18 | - |
19 | if [[ $TRAVIS_OS_NAME == 'osx' ]]; then
20 | brew upgrade python
21 | export PATH="/usr/local/opt/python/libexec/bin:${PATH}"
22 | fi
23 | install:
24 | - pip install -r requirements.txt
25 | - pip install flake8
26 |
27 | before_script:
28 | # stop the build if there are Python syntax errors or undefined names
29 | - flake8 . --count --select=E901,E999,F821,F822,F823 --show-source --statistics
30 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
31 | - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
32 |
33 | script:
34 | - python __main__.py --bot test -lll
35 |
36 | cache:
37 | directories:
38 | - $HOME/.cache/pip
39 | - $HOME/.cache/pre-commit
40 |
41 | branches:
42 | only:
43 | - master
44 |
45 | notifications:
46 | email: false
47 |
--------------------------------------------------------------------------------
/lncrawl/assets/templates/Column.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.github/workflows/pythonpackage.yml:
--------------------------------------------------------------------------------
1 | name: Python package
2 |
3 | on:
4 | pull_request:
5 | branches:
6 | - master
7 | push:
8 | branches:
9 | - master
10 |
11 | jobs:
12 | build:
13 |
14 | runs-on: ubuntu-latest
15 | strategy:
16 | matrix:
17 | python-version: [3.5, 3.6, 3.7, 3.8]
18 |
19 | steps:
20 | - uses: actions/checkout@v2
21 | - name: Set up Python ${{ matrix.python-version }}
22 | uses: actions/setup-python@v1
23 | with:
24 | python-version: ${{ matrix.python-version }}
25 | - name: Install dependencies
26 | run: |
27 | python -m pip install --upgrade pip
28 | pip install -r requirements.txt
29 | - name: Lint with flake8
30 | run: |
31 | pip install flake8
32 | # stop the build if there are Python syntax errors or undefined names
33 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
34 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
35 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
36 | # - name: Test with pytest
37 | # run: |
38 | # pip install pytest
39 | # pytest
40 |
--------------------------------------------------------------------------------
/lncrawl/assets/templates/Window.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/lncrawl/assets/templates/Cross.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/lncrawl/assets/templates/Tiles.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 |
4 | if sys.version_info[:2] < (3, 5):
5 | raise RuntimeError(
6 | 'Lightnovel crawler only supports Python 3.5 and later.')
7 | else:
8 | run_pyi = 'package' in sys.argv
9 | if run_pyi:
10 | sys.argv.remove('package')
11 | # end if
12 | if len(sys.argv) == 1:
13 | sys.argv += ['build']
14 | # end if
15 |
16 | # import required packages
17 | from pathlib import Path
18 | from setuptools import config, setup
19 |
20 | def parse_version(filename):
21 | with open(filename, 'r') as f:
22 | return f.read().strip()
23 | # end def
24 |
25 | def parse_requirements(filename):
26 | with open(filename, 'r', encoding='utf-8') as f:
27 | requirements = f.read().strip().split('\n')
28 | requirements = [
29 | r.strip() for r in requirements
30 | if r.strip() and not r.startswith('#')
31 | ]
32 | return requirements
33 | # end def
34 |
35 | config.read_configuration('setup.cfg')
36 |
37 | setup(
38 | version=parse_version(Path('lncrawl') / 'VERSION'),
39 | install_requires=parse_requirements('requirements.txt'),
40 | )
41 |
42 | if run_pyi:
43 | from setup_pyi import package
44 | package()
45 | # end if
46 | # end if
47 |
--------------------------------------------------------------------------------
/lncrawl/assets/templates/Gradient.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/lncrawl/bots/console/login_info.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from PyInquirer import prompt
3 | from ...core.arguments import get_args
4 |
5 |
6 | def get_login_info(self):
7 | '''Returns the (email, password) pair for login'''
8 | args = get_args()
9 |
10 | if args.login:
11 | return args.login
12 | # end if
13 |
14 | if args.suppress:
15 | return False
16 | # end if
17 |
18 | answer = prompt([
19 | {
20 | 'type': 'confirm',
21 | 'name': 'login',
22 | 'message': 'Do you want to log in?',
23 | 'default': False
24 | },
25 | ])
26 |
27 | if answer['login']:
28 | answer = prompt([
29 | {
30 | 'type': 'input',
31 | 'name': 'email',
32 | 'message': 'Username/Email:',
33 | 'validate': lambda val: True if len(val)
34 | else 'Email address should not be empty'
35 | },
36 | {
37 | 'type': 'password',
38 | 'name': 'password',
39 | 'message': 'Password:',
40 | 'validate': lambda val: True if len(val)
41 | else 'Password should not be empty'
42 | },
43 | ])
44 | return answer['email'], answer['password']
45 | # end if
46 |
47 | return None
48 | # end def
49 |
--------------------------------------------------------------------------------
/lncrawl/assets/templates/Rings.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/app.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "lightnovel crawler",
3 | "description": "Downloads lightnovels from various online sources and generates ebooks in many formats.",
4 | "keywords": [
5 | "discord",
6 | "bot",
7 | "telegram",
8 | "novel",
9 | "lightnovel",
10 | "crawler"
11 | ],
12 | "website": "https://github.com/dipu-bd/lightnovel-crawler",
13 | "logo": "https://github.com/dipu-bd/lightnovel-crawler/raw/master/res/lncrawl-icon.png",
14 | "env": {
15 | "LOG_LEVEL": {
16 | "description": "Available levels: NOTSET, WARN, INFO, DEBUG, FATAL, ERROR",
17 | "value": "INFO",
18 | "required": true
19 | },
20 | "BOT": {
21 | "description": "available: console, discord, telegram",
22 | "value": "discord",
23 | "required": true
24 | },
25 | "TELEGRAM_TOKEN": {
26 | "description": "Telegram token, only required if BOT is set to telegram",
27 | "required": false
28 | },
29 | "DISCORD_TOKEN": {
30 | "description": "Discord token, only required if BOT is set to discord",
31 | "required": false
32 | },
33 | "DISCORD_SIGNAL_CHAR": {
34 | "description": "Discord command prefix, only required if BOT is set to discord",
35 | "required": false,
36 | "value": "!"
37 | }
38 | },
39 | "buildpacks": [
40 | {
41 | "url": "https://github.com/heroku/heroku-buildpack-nodejs"
42 | },
43 | {
44 | "url": "https://github.com/heroku/heroku-buildpack-python"
45 | },
46 | {
47 | "url": "https://github.com/nntin/heroku-buildpack-calibre"
48 | }
49 | ]
50 | }
51 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/remove-source.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Remove source
3 | about: If you are owner of a source added here and want to remove it
4 | title: 'Request to remove a site: '
5 | labels: source
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
'.join(body) 69 | return self.clean_text(body) 70 | # end def 71 | 72 | def clean_text(self, text): 73 | text = re.sub(r'\ufffd\ufffd\ufffd+', '**', text) 74 | text = re.sub(r'\ufffd\ufffd', '"', text) 75 | text = re.sub(r'\u00a0\u00a0', '–', text) 76 | text = re.sub(r'\ufffdC', '', text) 77 | return text 78 | # end def 79 | # end class 80 | -------------------------------------------------------------------------------- /lncrawl/sources/machinetrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('MACHINE_NOVEL_TRANSLATION') 8 | 9 | search_url = 'http://www.machinenoveltranslation.com/search/autocomplete' 10 | 11 | 12 | class MachineNovelTrans(Crawler): 13 | base_url = 'http://www.machinenoveltranslation.com/' 14 | 15 | def read_novel_info(self): 16 | '''Get novel title, autor, cover etc''' 17 | logger.debug('Visiting %s', self.novel_url) 18 | soup = self.get_soup(self.novel_url) 19 | 20 | self.novel_title = soup.select_one('.desc h5').text 21 | logger.info('Novel title: %s', self.novel_title) 22 | 23 | self.novel_cover = self.absolute_url( 24 | soup.select_one('.about-author .row img')['src']) 25 | logger.info('Novel cover: %s', self.novel_cover) 26 | 27 | for div in soup.select('#chapters #accordion .panel'): 28 | vol_title = div.select_one('h4.panel-title a').text 29 | vol_id = [int(x) for x in re.findall(r'\d+', vol_title)] 30 | vol_id = vol_id[0] if len(vol_id) else len(self.volumes) + 1 31 | self.volumes.append({ 32 | 'id': vol_id, 33 | 'title': vol_title, 34 | }) 35 | 36 | for a in div.select('ul.navigate-page li a'): 37 | ch_title = a.text 38 | ch_id = [int(x) for x in re.findall(r'\d+', ch_title)] 39 | ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 40 | self.chapters.append({ 41 | 'id': ch_id, 42 | 'volume': vol_id, 43 | 'title': ch_title, 44 | 'url': self.absolute_url(a['href']), 45 | }) 46 | # end for 47 | # end for 48 | 49 | logger.debug('%d chapters and %d volumes found', 50 | len(self.chapters), len(self.volumes)) 51 | # end def 52 | 53 | def download_chapter_body(self, chapter): 54 | '''Download body of a single chapter and return as clean html format.''' 55 | logger.info('Visiting %s', chapter['url']) 56 | soup = self.get_soup(chapter['url']) 57 | 58 | body = soup.select('.about-author .desc .translated') 59 | body = [self.format_text(x.text) for x in body if x] 60 | body = '\n'.join(['
%s
' % (x) for x in body if len(x)]) 61 | return body.strip() 62 | # end def 63 | 64 | def format_text(self, text): 65 | '''formats the text and remove bad characters''' 66 | text = re.sub(r'\u00ad', '', text, flags=re.UNICODE) 67 | text = re.sub(r'\u201e[, ]*', '“', text, flags=re.UNICODE) 68 | text = re.sub(r'\u201d[, ]*', '”', text, flags=re.UNICODE) 69 | text = re.sub(r'[ ]*,[ ]+', ', ', text, flags=re.UNICODE) 70 | return text.strip() 71 | # end def 72 | # end class 73 | -------------------------------------------------------------------------------- /lncrawl/bots/test/test_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | 4 | from ...core.app import App 5 | from ...binders import available_formats 6 | 7 | 8 | def test_crawler(self, link, user_input): 9 | app = App() 10 | print('App instance: OK') 11 | 12 | app.initialize() 13 | print('App initialize: DONE') 14 | 15 | app.user_input = user_input 16 | app.init_search() 17 | print('Init search: DONE') 18 | 19 | if not app.crawler: 20 | if link not in app.crawler_links: 21 | print('Search is not supported for', link) 22 | return 23 | # end if 24 | 25 | print(len(app.crawler_links), 'available crawlers to search') 26 | app.crawler_links = [link] 27 | print('Selected crawler:', link) 28 | 29 | app.search_novel() 30 | print('Search: %d results found' % len(app.search_results)) 31 | 32 | source = app.search_results[0] 33 | print('Top result: %s with %d sources' % 34 | (source['title'], len(source['novels']))) 35 | 36 | novel_url = source['novels'][0]['url'] 37 | print('Top novel:', novel_url) 38 | 39 | app.init_crawler(novel_url) 40 | print('Init crawler: DONE') 41 | 42 | app.get_novel_info() 43 | print('Novel info: DONE') 44 | if not app.crawler.novel_title: 45 | raise Exception('No novel title') 46 | # end if 47 | return 48 | # end if 49 | 50 | if not app.crawler: 51 | raise Exception('No crawler initialized') 52 | # end if 53 | 54 | if app.can_do('login'): 55 | print('Login: enabled') 56 | # end if 57 | 58 | app.get_novel_info() 59 | print('Title:', app.crawler.novel_title) 60 | print('Cover:', app.crawler.novel_cover) 61 | print('Author:', app.crawler.novel_author) 62 | 63 | if not app.crawler.novel_title: 64 | raise Exception('No novel title') 65 | # end if 66 | 67 | print('Novel info: DONE') 68 | 69 | os.makedirs(app.output_path, exist_ok=True) 70 | print('Output path:', app.output_path) 71 | 72 | if len(app.crawler.volumes) == 0: 73 | raise Exception('Empty volume list') 74 | # end if 75 | 76 | if len(app.crawler.chapters) == 0: 77 | raise Exception('Empty chapter list') 78 | # end if 79 | 80 | app.chapters = app.crawler.chapters[:2] 81 | app.output_formats = {x: False for x in available_formats} 82 | app.output_formats['pdf'] = True 83 | app.pack_by_volume = False 84 | 85 | app.start_download() 86 | print('Download: DONE') 87 | 88 | if len(app.chapters[0]['body']) < 50: 89 | raise Exception('Empty body') 90 | # end if 91 | 92 | app.bind_books() 93 | print('Bindings: DONE') 94 | 95 | app.destroy() 96 | print('Destroy: DONE') 97 | 98 | print('-' * 6, 'Test Passed', '-' * 6) 99 | # end def 100 | -------------------------------------------------------------------------------- /lncrawl/sources/readln.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('READLIGHTNOVEL') 8 | 
search_url = 'https://www.readlightnovel.org/search/autocomplete' 9 | 10 | 11 | class ReadLightNovelCrawler(Crawler): 12 | base_url = 'https://www.readlightnovel.org/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.select_one('.block-title h1').text 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.find('img', {'alt': self.novel_title})['src']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | author_link = soup.select_one("a[href*=author]") 27 | if author_link: 28 | self.novel_author = author_link.text.strip().title() 29 | # end if 30 | logger.info('Novel author: %s', self.novel_author) 31 | 32 | volume_ids = set() 33 | for a in soup.select('.chapters .chapter-chs li a'): 34 | chap_id = len(self.chapters) + 1 35 | vol_id = (chap_id - 1) // 100 + 1 36 | volume_ids.add(vol_id) 37 | self.chapters.append({ 38 | 'id': chap_id, 39 | 'volume': vol_id, 40 | 'url': self.absolute_url(a['href']), 41 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 42 | }) 43 | # end for 44 | 45 | self.volumes = [{'id': i} for i in volume_ids] 46 | # end def 47 | 48 | def download_chapter_body(self, chapter): 49 | '''Download body of a single chapter and return as clean html format.''' 50 | logger.info('Downloading %s', chapter['url']) 51 | soup = self.get_soup(chapter['url']) 52 | 53 | div = soup.select_one('.chapter-content3 .desc') 54 | 55 | bad_selectors = [ 56 | '.trinity-player-iframe-wrapper' 57 | '.hidden', 58 | '.ads-title', 59 | 'script', 60 | 'center', 61 | 'interaction', 62 | 'a[href*=remove-ads]', 63 | 'a[target=_blank]', 64 | 'hr', 65 | 'br' 66 | ] 67 | for hidden in div.select(', '.join(bad_selectors)): 68 | hidden.decompose() 69 | # end if 70 | 71 | body = self.extract_contents(div) 72 | if re.search(r'c?hapter .?\d+', body[0], re.IGNORECASE): 73 | title = body[0].replace('', '').replace('', '').strip() 74 | title = ('C' if title.startswith('hapter') else '') + title 75 | chapter['title'] = title.strip() 76 | body = body[1:] 77 | # end if 78 | 79 | return '' + '
'.join(body) + '
' 80 | # end def 81 | # end class 82 | -------------------------------------------------------------------------------- /lncrawl/sources/idqidian.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('IDQIDIAN') 8 | 9 | 10 | class IdqidianCrawler(Crawler): 11 | base_url = 'https://www.idqidian.us/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.find_all( 19 | 'span', {"typeof": "v:Breadcrumb"})[-1].text 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = "https://www.idqidian.us/images/noavailable.jpg" 23 | logger.info('Novel cover: %s', self.novel_cover) 24 | 25 | author = soup.select('p')[3].text 26 | self.novel_author = author[20:len(author)-22] 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | chapters = soup.find('div', { 30 | 'style': '-moz-border-radius: 5px 5px 5px 5px; border: 1px solid #333; color: black; height: 400px; margin: 5px; overflow: auto; padding: 5px; width: 96%;'}).findAll( 31 | 'a') 32 | chapters.reverse() 33 | 34 | for a in chapters: 35 | chap_id = len(self.chapters) + 1 36 | if len(self.chapters) % 100 == 0: 37 | vol_id = chap_id//100 + 1 38 | vol_title = 'Volume ' + str(vol_id) 39 | self.volumes.append({ 40 | 'id': vol_id, 41 | 'title': vol_title, 42 | }) 43 | # end if 44 | self.chapters.append({ 45 | 'id': chap_id, 46 | 'volume': vol_id, 47 | 'url': self.absolute_url(a['href']), 48 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 49 | }) 50 | # end for 51 | # end def 52 | 53 | def download_chapter_body(self, chapter): 54 | '''Download body of a single chapter and return as clean html format.''' 55 | logger.info('Downloading %s', chapter['url']) 56 | soup = self.get_soup(chapter['url']) 57 | 58 | for a in soup.find_all('a'): 59 | a.decompose() 60 | 61 | body_parts = soup.select('p') 62 | body_parts = ''.join([str(p.extract()) for p in body_parts if 63 | p.text.strip() and not 'Advertisement' in p.text and not 'JavaScript!' in p.text]) 64 | if body_parts == '': 65 | texts = [str.strip(x) for x in soup.strings if str.strip(x) != ''] 66 | unwanted_text = [str.strip(x.text) for x in soup.find_all()] 67 | my_texts = set(texts).difference(unwanted_text) 68 | body_parts = ''.join( 69 | [str(p) for p in my_texts if p.strip() and not 'Advertisement' in p and not 'JavaScript!' 
in p]) 70 | # end if 71 | 72 | return body_parts 73 | # end def 74 | # end class 75 | -------------------------------------------------------------------------------- /lncrawl/sources/yukinovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from bs4 import Comment 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('YUKI_NOVEL') 11 | 12 | 13 | class YukiNovelCrawler(Crawler): 14 | base_url = 'https://yukinovel.id/' 15 | 16 | def initialize(self): 17 | self.home_url = 'https://yukinovel.id/' 18 | # end def 19 | 20 | def read_novel_info(self): 21 | '''Get novel title, autor, cover etc''' 22 | url = self.novel_url.replace('https://yukinovel.me', 'https://yukinovel.id') 23 | logger.debug('Visiting %s', self.novel_url) 24 | soup = self.get_soup(self.novel_url) 25 | 26 | self.novel_title = soup.select_one('h1.entry-title').text 27 | logger.info('Novel title: %s', self.novel_title) 28 | 29 | self.novel_author = "Translated by Yukinovel" 30 | logger.info('Novel author: %s', self.novel_author) 31 | 32 | self.novel_cover = self.absolute_url( 33 | soup.select_one('div.lightnovel-thumb img')['src']) 34 | logger.info('Novel cover: %s', self.novel_cover) 35 | 36 | # Extract volume-wise chapter entries 37 | chapters = soup.select('div.lightnovel-episode ul li a') 38 | 39 | chapters.reverse() 40 | 41 | for a in chapters: 42 | chap_id = len(self.chapters) + 1 43 | if len(self.chapters) % 100 == 0: 44 | vol_id = chap_id//100 + 1 45 | vol_title = 'Volume ' + str(vol_id) 46 | self.volumes.append({ 47 | 'id': vol_id, 48 | 'title': vol_title, 49 | }) 50 | # end if 51 | self.chapters.append({ 52 | 'id': chap_id, 53 | 'volume': vol_id, 54 | 'url': self.absolute_url(a['href']), 55 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 56 | }) 57 | # end for 58 | # end def 59 | 60 | def download_chapter_body(self, chapter): 61 | '''Download body of a single chapter and return as clean html format.''' 62 | logger.info('Downloading %s', chapter['url']) 63 | soup = self.get_soup(chapter['url']) 64 | 65 | contents = soup.select_one('div.entry-content.cl') 66 | 67 | for d in contents.findAll('div'): 68 | d.decompose() 69 | # end for 70 | 71 | for comment in contents.find_all(string=lambda text: isinstance(text, Comment)): 72 | comment.extract() 73 | # end for 74 | 75 | if contents.findAll('p')[0].text.strip().startswith('Bab'): 76 | chapter['title'] = contents.findAll('p')[0].text.strip() 77 | contents.findAll('p')[0].extract() 78 | else: 79 | chapter['title'] = chapter['title'] 80 | # end if 81 | 82 | logger.debug(chapter['title']) 83 | 84 | return str(contents) 85 | # end def 86 | # end class 87 | -------------------------------------------------------------------------------- /lncrawl/sources/fourscanlation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import urlparse 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('4SCANLATION') 8 | novel_page = 'https://4scanlation.com/%s' 9 | 10 | 11 | class FourScanlationCrawler(Crawler): 12 | base_url = 'https://4scanlation.com/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | path_fragments = urlparse(self.novel_url).path.split('/') 17 | novel_hash = path_fragments[1] 18 | if novel_hash == 'category': 19 | novel_hash = path_fragments[2] 20 | # 
end if 21 | self.novel_url = novel_page % novel_hash 22 | 23 | logger.debug('Visiting %s', self.novel_url) 24 | soup = self.get_soup(self.novel_url) 25 | 26 | self.novel_title = soup.select_one(', '.join([ 27 | 'header h1', 28 | '.header-post-title-class', 29 | ])).text.strip() 30 | logger.info('Novel title: %s', self.novel_title) 31 | 32 | self.novel_author = "Source: 4scanlation" 33 | logger.info('Novel author: %s', self.novel_author) 34 | 35 | possible_image = soup.select_one('#primary article img.wp-post-image') 36 | if possible_image: 37 | self.novel_cover = self.absolute_url(possible_image['src']) 38 | # end if 39 | logger.info('Novel cover: %s', self.novel_cover) 40 | 41 | # Extract volume-wise chapter entries 42 | volumes = set() 43 | for a in soup.select('article.page p a'): 44 | possible_url = self.absolute_url(a['href']) 45 | if not self.is_relative_url(possible_url): 46 | continue 47 | # end if 48 | chap_id = 1 + len(self.chapters) 49 | vol_id = 1 + len(self.chapters) // 100 50 | volumes.add(vol_id) 51 | self.chapters.append({ 52 | 'id': chap_id, 53 | 'volume': vol_id, 54 | 'url': possible_url, 55 | 'title': a.text.strip(), 56 | }) 57 | # end for 58 | 59 | self.volumes = [{'id': x} for x in volumes] 60 | # end def 61 | 62 | def download_chapter_body(self, chapter): 63 | '''Download body of a single chapter and return as clean html format.''' 64 | logger.info('Downloading %s', chapter['url']) 65 | soup = self.get_soup(chapter['url']) 66 | 67 | contents = soup.select_one('article div.entry-content') 68 | if not contents: 69 | return '' 70 | # end if 71 | 72 | for d in contents.findAll('div'): 73 | d.extract() 74 | # end for 75 | 76 | try: 77 | chapter['title'] = soup.select_one('header h1').text 78 | logger.debug(chapter['title']) 79 | except Exception: 80 | pass 81 | # end try 82 | 83 | return str(contents or '') 84 | # end def 85 | # end class 86 | -------------------------------------------------------------------------------- /lncrawl/sources/novelgo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import cssutils 6 | import urllib.parse 7 | 8 | from bs4 import BeautifulSoup 9 | 10 | from ..utils.crawler import Crawler 11 | 12 | logger = logging.getLogger('NOVEL_GO') 13 | 14 | 15 | class NovelGoCrawler(Crawler): 16 | base_url = 'https://novelgo.id/' 17 | 18 | def read_novel_info(self): 19 | '''Get novel title, autor, cover etc''' 20 | logger.debug('Visiting %s', self.novel_url) 21 | soup = self.get_soup(self.novel_url) 22 | 23 | self.novel_title = soup.find( 24 | 'h2', {'class': 'novel-title'}).text.strip() 25 | logger.info('Novel title: %s', self.novel_title) 26 | 27 | self.novel_author = soup.select_one( 28 | 'div.noveils-current-author a').text.strip() 29 | logger.info('Novel author: %s', self.novel_author) 30 | 31 | thumbnail = soup.find("div", {"class": "novel-thumbnail"})['style'] 32 | style = cssutils.parseStyle(thumbnail) 33 | url = style['background-image'] 34 | 35 | self.novel_cover = self.absolute_url( 36 | url.replace('url(', '').replace(')', '')) 37 | logger.info('Novel cover: %s', self.novel_cover) 38 | 39 | path = urllib.parse.urlsplit(self.novel_url)[2] 40 | book_id = path.split('/')[2] 41 | chapter_list = js = self.scraper.post( 42 | 'https://novelgo.id/wp-admin/admin-ajax.php?action=LoadChapter&post=%s' % book_id).content 43 | soup_chapter = BeautifulSoup(chapter_list, 'lxml') 44 | 45 | chapters = soup_chapter.select('ul li a') 46 | 47 | for x 
in chapters: 48 | chap_id = len(self.chapters) + 1 49 | if len(self.chapters) % 100 == 0: 50 | vol_id = chap_id//100 + 1 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': chap_id, 59 | 'volume': vol_id, 60 | 'url': self.absolute_url(x['href']), 61 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 62 | }) 63 | # end for 64 | 65 | logger.debug(self.chapters) 66 | # end def 67 | 68 | def download_chapter_body(self, chapter): 69 | '''Download body of a single chapter and return as clean html format.''' 70 | logger.info('Downloading %s', chapter['url']) 71 | soup = self.get_soup(chapter['url']) 72 | 73 | self.blacklist_patterns = [ 74 | r'^translat(ed by|or)', 75 | r'(volume|chapter) .?\d+', 76 | ] 77 | 78 | contents = soup.find( 79 | 'div', {'id': 'chapter-post-content'}).findAll('p') 80 | body = [str(p) for p in contents if p.text.strip()] 81 | return '' + '
'.join(body) + '
' 82 | # end def 83 | # end class 84 | -------------------------------------------------------------------------------- /lncrawl/sources/gravitytales.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import logging 4 | from ..utils.crawler import Crawler 5 | 6 | logger = logging.getLogger('GRAVITY_TALES') 7 | 8 | cover_image_url = 'https://cdn.gravitytales.com/images/covers/%s.jpg' 9 | novel_toc_url = 'http://gravitytales.com/novel/%s' 10 | chapter_list_url = 'http://gravitytales.com/novel/%s/chapters' 11 | 12 | 13 | class GravityTalesCrawler(Crawler): 14 | base_url = 'http://gravitytales.com/' 15 | 16 | def read_novel_info(self): 17 | self.novel_id = re.split(r'\/(novel|post)\/', self.novel_url)[2] 18 | self.novel_id = self.novel_id.split('/')[0] 19 | logger.info('Novel id: %s' % self.novel_id) 20 | 21 | self.novel_url = novel_toc_url % self.novel_id 22 | logger.debug('Visiting %s' % self.novel_url) 23 | soup = self.get_soup(self.novel_url) 24 | 25 | for tag in soup.select('.main-content h3 > *'): 26 | tag.extract() 27 | self.novel_title = soup.select_one('.main-content h3').text.strip() 28 | logger.info('Novel title: %s' % self.novel_title) 29 | 30 | self.novel_cover = cover_image_url % self.novel_id 31 | logger.info('Novel cover: %s' % self.novel_cover) 32 | 33 | self.novel_author = soup.select_one('.main-content h4').text.strip() 34 | logger.info(self.novel_author) 35 | 36 | self.get_chapter_list() 37 | # end def 38 | 39 | def get_chapter_list(self): 40 | url = chapter_list_url % self.novel_id 41 | logger.info('Visiting %s' % url) 42 | soup = self.get_soup(url) 43 | 44 | # For each tabs... 45 | for a in soup.select('#chaptergroups li a'): 46 | vol_id = len(self.volumes) + 1 47 | self.volumes.append({ 48 | 'id': vol_id, 49 | 'title': a.text.strip(), 50 | '_tid': (a['href']), 51 | }) 52 | 53 | # ...get every chapters 54 | for a in soup.select_one(a['href']).select('table td a'): 55 | chap_id = len(self.chapters) + 1 56 | self.chapters.append({ 57 | 'id': chap_id, 58 | 'volume': vol_id, 59 | 'title': a.text.strip(), 60 | 'url': self.absolute_url(a['href']), 61 | }) 62 | # end for 63 | 64 | logger.info('%d chapters and %d volumes found', 65 | len(self.chapters), len(self.volumes)) 66 | # end def 67 | 68 | def download_chapter_body(self, chapter): 69 | '''Download body of a single chapter and return as clean html format.''' 70 | logger.info('Downloading %s' % chapter['url']) 71 | soup = self.get_soup(chapter['url']) 72 | body = soup.select_one('#chapterContent') 73 | for tag in body.contents: 74 | if hasattr(tag, 'attrs'): 75 | setattr(tag, 'attrs', {}) # clear attributes 76 | # end if 77 | # end for 78 | return str(body) 79 | # end def 80 | # end class 81 | -------------------------------------------------------------------------------- /lncrawl/sources/machinetransorg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import quote 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('MACHINE_NOVEL_TRANSLATION') 8 | 9 | search_url = 'https://www.machine-translation.org/novel/search/?keywords=%s' 10 | 11 | 12 | class MachineTransOrg(Crawler): 13 | base_url = 'https://www.machine-translation.org/' 14 | 15 | def search_novel(self, query): 16 | url = search_url % quote(query.lower()) 17 | logger.debug('Visiting: %s', url) 18 | soup = self.get_soup(url) 19 | 20 | results = [] 21 
| for li in soup.select('.book-list-info > ul > li'): 22 | results.append({ 23 | 'title': li.select_one('a h4 b').text.strip(), 24 | 'url': self.absolute_url(li.select_one('.book-img a')['href']), 25 | 'info': li.select_one('.update-info').text.strip(), 26 | }) 27 | # end for 28 | return results 29 | # end def 30 | 31 | def read_novel_info(self): 32 | '''Get novel title, autor, cover etc''' 33 | logger.debug('Visiting %s', self.novel_url) 34 | soup = self.get_soup(self.novel_url) 35 | 36 | self.novel_title = soup.select_one('div.title h3 b').text 37 | logger.info('Novel title: %s', self.novel_title) 38 | 39 | self.novel_author = soup.select_one('div.title h3 span').text 40 | logger.info('Novel author: %s', self.novel_author) 41 | 42 | self.novel_cover = self.absolute_url( 43 | soup.select_one('.book-img img')['src']) 44 | logger.info('Novel cover: %s', self.novel_cover) 45 | 46 | for a in reversed(soup.select('div.slide-item a')): 47 | ch_title = a.text.strip() 48 | ch_id = len(self.chapters) + 1 49 | if len(self.chapters) % 100 == 0: 50 | vol_id = ch_id//100 + 1 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': ch_id, 59 | 'volume': vol_id, 60 | 'title': ch_title, 61 | 'url': self.absolute_url(a['href']), 62 | }) 63 | # end for 64 | 65 | logger.debug('%d chapters and %d volumes found', 66 | len(self.chapters), len(self.volumes)) 67 | # end def 68 | 69 | def download_chapter_body(self, chapter): 70 | '''Download body of a single chapter and return as clean html format''' 71 | logger.info('Visiting %s', chapter['url']) 72 | soup = self.get_soup(chapter['url']) 73 | body = soup.select_one('.read-main .read-context') 74 | 75 | self.blacklist_patterns = [ 76 | r'^Refresh time: \d+-\d+-\d+$' 77 | ] 78 | self.clean_contents(body) 79 | 80 | return str(body) 81 | # end def 82 | # end class 83 | -------------------------------------------------------------------------------- /lncrawl/bots/_sample.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..core.app import App 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | # TODO: It is recommended to implemented all methods. But you can skip those 8 | # Which return values by default. 9 | 10 | 11 | class SampleBot: 12 | def start(self): 13 | # TODO: must be implemented 14 | # Start processing using this bot. It should use self methods to take 15 | # inputs and self.app methods to process them. 16 | # 17 | self.app = App() 18 | self.app.initialize() 19 | # 20 | # Checkout console.py for a sample implementation 21 | # end def 22 | 23 | def get_novel_url(self): 24 | # Returns a novel page url or a query 25 | pass 26 | # end def 27 | 28 | def get_crawlers_to_search(self): 29 | # Returns user choice to search the choosen sites for a novel 30 | pass 31 | # end def 32 | 33 | def choose_a_novel(self): 34 | # The search_results is an array of (novel_title, novel_url). 35 | # This method should return a single novel_url only 36 | # 37 | # By default, returns the first search_results. Implemented it to 38 | # handle multiple search_results 39 | pass 40 | # end def 41 | 42 | def get_login_info(self): 43 | # By default, returns None to skip login 44 | pass 45 | # end if 46 | 47 | def get_output_path(self): 48 | # You should return a valid absolute path. The parameter suggested_path 49 | # is valid but not gurranteed to exists. 
50 | # 51 | # NOTE: If you do not want to use any pre-downloaded files, remove all 52 | # contents inside of your selected output directory. 53 | # 54 | # By default, returns a valid existing path from suggested_path 55 | pass 56 | # end def 57 | 58 | def get_output_formats(self): 59 | # The keys should be from from `self.output_formats`. Each value 60 | # corresponding a key defines whether create output in that format. 61 | # 62 | # By default, it returns all True to all of the output formats. 63 | pass 64 | # end def 65 | 66 | def should_pack_by_volume(self): 67 | # By default, returns False to generate a single file 68 | pass 69 | # end def 70 | 71 | def get_range_selection(self): 72 | # Should return a key from `self.selections` array 73 | pass 74 | # end def 75 | 76 | def get_range_using_urls(self): 77 | # Should return a list of chapters to download 78 | pass 79 | # end def 80 | 81 | def get_range_using_index(self): 82 | # Should return a list of chapters to download 83 | pass 84 | # end def 85 | 86 | def get_range_from_volumes(self): 87 | # Should return a list of chapters to download 88 | pass 89 | # end def 90 | 91 | def get_range_from_chapters(self): 92 | # Should return a list of chapters to download 93 | pass 94 | # end def 95 | # end class 96 | -------------------------------------------------------------------------------- /lncrawl/sources/mangatoon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | import ast 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('MANGATOON_MOBI') 9 | 10 | book_url = 'https://mangatoon.mobi/%s/detail/%s/episodes' 11 | search_url = 'https://mangatoon.mobi/%s/search?word=%s' 12 | 13 | 14 | class MangatoonMobiCrawler(Crawler): 15 | base_url = 'https://mangatoon.mobi/' 16 | 17 | def initialize(self): 18 | self.home_url = 'https://mangatoon.mobi' 19 | # end def 20 | 21 | def read_novel_info(self): 22 | '''Get novel title, autor, cover etc''' 23 | self.novel_id = self.novel_url.split('/')[5] 24 | logger.info('Novel Id: %s', self.novel_id) 25 | 26 | novel_region = self.novel_url.split('/')[3] 27 | 28 | self.novel_url = book_url % (novel_region,self.novel_id) 29 | logger.debug('Visiting %s', self.novel_url) 30 | soup = self.get_soup(self.novel_url) 31 | 32 | self.novel_title =soup.select_one('h1.comics-title').text 33 | logger.info('Novel title: %s', self.novel_title) 34 | 35 | try: 36 | self.novel_cover = self.absolute_url( 37 | soup.select_one('.detail-top-right img')['src']) 38 | logger.info('Novel cover: %s', self.novel_cover) 39 | except Exception: 40 | logger.debug('Failed to get cover: %s', self.novel_url) 41 | # end try 42 | 43 | self.novel_author = soup.select_one('.created-by').text 44 | logger.info('Novel author: %s', self.novel_author) 45 | 46 | for a in soup.select('a.episode-item'): 47 | chap_id = len(self.chapters) + 1 48 | if len(self.chapters) % 100 == 0: 49 | vol_id = chap_id//100 + 1 50 | vol_title = 'Volume ' + str(vol_id) 51 | self.volumes.append({ 52 | 'id': vol_id, 53 | 'title': vol_title, 54 | }) 55 | # end if 56 | self.chapters.append({ 57 | 'id': chap_id, 58 | 'volume': vol_id, 59 | 'url': self.absolute_url(a['href']), 60 | 'title': a.select_one('.episode-title').text.strip() or ('Chapter %d' % chap_id), 61 | }) 62 | # end for 63 | # end def 64 | 65 | def download_chapter_body(self, chapter): 66 | '''Download body of a single chapter and return as clean html format''' 67 | 
logger.info('Downloading %s', chapter['url']) 68 | soup = self.get_soup(chapter['url']) 69 | 70 | script = soup.find("script", text=re.compile("initialValue\s+=")) 71 | initialValue = re.search('var initialValue = (?P' + '
'.join(chapter_content) + '
' 78 | # end if 79 | return text.strip() 80 | # end def 81 | # end class 82 | -------------------------------------------------------------------------------- /lncrawl/sources/rewayatclub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from concurrent import futures 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('REWAYAT_CLUB') 9 | 10 | 11 | class RewayatClubCrawler(Crawler): 12 | base_url = 'https://rewayat.club/' 13 | 14 | def read_novel_info(self): 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.is_rtl = True 19 | 20 | self.novel_title = soup.select_one('h1.card-header').text.strip() 21 | logger.info('Novel title: %s', self.novel_title) 22 | 23 | self.novel_cover = self.absolute_url( 24 | soup.select_one('.card-body .align-middle img')['src']) 25 | logger.info('Novel cover: %s', self.novel_cover) 26 | 27 | self.novel_author = soup.select_one( 28 | '.card-body table td a[href*="/user/"]').text.strip() 29 | logger.info('Novel author: %s', self.novel_author) 30 | 31 | page_count = len(soup.select( 32 | '.card-footer select.custom-select option')) 33 | logger.info('Total pages: %d', page_count) 34 | 35 | logger.info('Getting chapters...') 36 | futures_to_check = { 37 | self.executor.submit(self.download_chapter_list, i + 1): str(i) 38 | for i in range(page_count) 39 | } 40 | temp_chapters = dict() 41 | for future in futures.as_completed(futures_to_check): 42 | page = int(futures_to_check[future]) 43 | temp_chapters[page] = future.result() 44 | # end for 45 | 46 | logger.info('Building sorted chapter list...') 47 | volumes = set() 48 | for page in sorted(temp_chapters.keys()): 49 | for chap in temp_chapters[page]: 50 | chap['id'] = 1 + len(self.chapters) 51 | chap['volume'] = 1 + len(self.chapters) // 100 52 | volumes.add(chap['volume']) 53 | self.chapters.append(chap) 54 | # end for 55 | # end for 56 | 57 | self.volumes = [{'id': x} for x in volumes] 58 | # end def 59 | 60 | def download_chapter_list(self, page_no): 61 | chapter_url = self.novel_url + ('?page=%d' % page_no) 62 | logger.info('Visiting %s', chapter_url) 63 | soup = self.get_soup(chapter_url) 64 | 65 | chapters = [] 66 | for a in soup.select('.card a[href*="/novel/"]'): 67 | chapters.append({ 68 | 'url': self.absolute_url(a['href']), 69 | 'title': a.select_one('div p').text.strip(), 70 | }) 71 | # end for 72 | return chapters 73 | # end def 74 | 75 | def download_chapter_body(self, chapter): 76 | '''Download body of a single chapter and return as clean html format.''' 77 | logger.info('Downloading %s', chapter['url']) 78 | soup = self.get_soup(chapter['url']) 79 | paras = soup.select('.card .card-body p') 80 | paras = [str(p) for p in paras if p.text.strip()] 81 | return ''.join(paras) 82 | # end def 83 | # end class 84 | -------------------------------------------------------------------------------- /lncrawl/sources/shinsori.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('SHINSORI') 8 | 9 | 10 | class ShinsoriCrawler(Crawler): 11 | base_url = 'https://www.shinsori.com/' 12 | 13 | def read_novel_info(self): 14 | '''Get novel title, autor, cover etc''' 15 | logger.debug('Visiting %s', self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | 
self.novel_title = soup.select_one('span.the-section-title').text.strip() 19 | logger.info('Novel title: %s', self.novel_title) 20 | 21 | self.novel_cover = None 22 | logger.info('Novel cover: %s', self.novel_cover) 23 | 24 | self.novel_author = 'Author : %s, Translator: Shinsori' % soup.select( 25 | 'div.entry.clearfix p strong')[1].next_sibling.strip() 26 | logger.info('Novel author: %s', self.novel_author) 27 | 28 | # get pagination range 29 | p_range = int(soup.select('ul.lcp_paginator li')[-2].text) 30 | 31 | chapters = [] 32 | # get chapter list by looping pagination range 33 | for x in range(p_range): 34 | p_url = '%s?lcp_page0=%d#lcp_instance_0 x+1' % (self.novel_url, x+1) 35 | p_soup = self.get_soup(p_url) 36 | chapters.extend(p_soup.select('ul.lcp_catlist')[1].select('li a')) 37 | # end for 38 | 39 | for x in chapters: 40 | chap_id = len(self.chapters) + 1 41 | vol_id = len(self.chapters)//100 + 1 42 | self.chapters.append({ 43 | 'id': chap_id, 44 | 'volume': vol_id, 45 | 'url': self.absolute_url(x['href']), 46 | 'title': x['title'] or ('Chapter %d' % chap_id), 47 | }) 48 | # end for 49 | 50 | self.volumes = [ 51 | {'id': x + 1} 52 | for x in range(len(self.chapters) // 100 + 1) 53 | ] 54 | # end def 55 | 56 | def download_chapter_body(self, chapter): 57 | '''Download body of a single chapter and return as clean html format.''' 58 | logger.info('Downloading %s', chapter['url']) 59 | soup = self.get_soup(chapter['url']) 60 | 61 | logger.debug(soup.title.string) 62 | 63 | content = soup.select_one('div.entry-content') 64 | 65 | # remove div with no class 66 | for item in content.findAll('div', attrs={'class': None}): 67 | item.decompose() 68 | 69 | # remove style 70 | for item in content.findAll('style'): 71 | item.decompose() 72 | 73 | subs = 'tab' 74 | # remove all div that has class but not relevant 75 | for item in content.findAll('div'): 76 | res = [x for x in item['class'] if re.search(subs, x)] 77 | if len(res) == 0: 78 | item.extract() 79 | 80 | # remove p with attribute style 81 | for item in content.findAll('p'): 82 | if item.has_attr('style'): 83 | item.decompose() 84 | 85 | return str(content) 86 | # end def 87 | # end class 88 | -------------------------------------------------------------------------------- /lncrawl/sources/wuxiaonline.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WUXIA_ONLINE') 8 | search_url = 'https://wuxiaworld.online/search.ajax?type=&query=%s' 9 | 10 | 11 | class WuxiaOnlineCrawler(Crawler): 12 | base_url = 'https://wuxiaworld.online/' 13 | 14 | # DISABLING DUE TO CLOUDEFLARE CAPTCHA CHALLENGE 15 | # def search_novel(self, query): 16 | # '''Gets a list of {title, url} matching the given query''' 17 | # soup = self.get_soup(search_url % query) 18 | 19 | # results = [] 20 | # for novel in soup.select('li'): 21 | # a = novel.select_one('.resultname a') 22 | # info = novel.select_one('a:nth-of-type(2)') 23 | # info = info.text.strip() if info else '' 24 | # results.append({ 25 | # 'title': a.text.strip(), 26 | # 'url': self.absolute_url(a['href']), 27 | # 'info': 'Latest: %s' % info, 28 | # }) 29 | # # end for 30 | 31 | # return results 32 | # # end def 33 | 34 | def read_novel_info(self): 35 | '''Get novel title, autor, cover etc''' 36 | url = self.novel_url 37 | logger.debug('Visiting %s', url) 38 | soup = self.get_soup(url) 39 | self.novel_title = 
soup.select_one('h1.entry-title').text 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | # self.novel_author = soup.select_one('#maininfo p').text.strip() 43 | # self.novel_author = re.sub(r'^Author[^\w]+', '', self.novel_author).strip() 44 | # logger.info('Novel author: %s', self.novel_author) 45 | 46 | self.novel_cover = self.absolute_url( 47 | soup.select_one('.info_image img')['src']) 48 | logger.info('Novel cover: %s', self.novel_cover) 49 | 50 | last_vol = -1 51 | for a in reversed(soup.select('.chapter-list .row span a')): 52 | chap_id = len(self.chapters) + 1 53 | vol_id = 1 + (chap_id - 1) // 100 54 | volume = {'id': vol_id, 'title': ''} 55 | if last_vol != vol_id: 56 | self.volumes.append(volume) 57 | last_vol = vol_id 58 | # end if 59 | self.chapters.append({ 60 | 'id': chap_id, 61 | 'volume': vol_id, 62 | 'title': a['title'], 63 | 'url': self.absolute_url(a['href']), 64 | }) 65 | # end for 66 | 67 | logger.info('%d chapters and %d volumes found', 68 | len(self.chapters), len(self.volumes)) 69 | # end def 70 | 71 | def download_chapter_body(self, chapter): 72 | '''Download body of a single chapter and return as clean html format.''' 73 | logger.info('Downloading %s', chapter['url']) 74 | soup = self.get_soup(chapter['url']) 75 | 76 | parts = soup.select_one('#list_chapter .content-area') 77 | body = self.extract_contents(parts) 78 | return '' + '
</p><p>'.join(body) + '</p>
' 79 | # end def 80 | # end class 81 | -------------------------------------------------------------------------------- /lncrawl/sources/crescentmoon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('CRESCENTMOON') 9 | 10 | 11 | class CrescentMoonCrawler(Crawler): 12 | base_url = 'https://crescentmoon.blog/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.select_one('div.entry-content p a')['href']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | self.novel_author = soup.select('div.entry-content p')[2].text.strip() 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | a = soup.select('div.entry-content p') 30 | for idx, item in enumerate(a): 31 | if "table of contents" in item.text.strip().lower(): 32 | toc = a[idx+1] 33 | 34 | chapters = toc.findAll('a') 35 | 36 | for x in chapters: 37 | chap_id = len(self.chapters) + 1 38 | if len(self.chapters) % 100 == 0: 39 | vol_id = chap_id//100 + 1 40 | vol_title = 'Volume ' + str(vol_id) 41 | self.volumes.append({ 42 | 'id': vol_id, 43 | 'title': vol_title, 44 | }) 45 | # end if 46 | self.chapters.append({ 47 | 'id': chap_id, 48 | 'volume': vol_id, 49 | 'url': self.absolute_url(x['href']), 50 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 51 | }) 52 | # end for 53 | # end def 54 | 55 | def download_chapter_body(self, chapter): 56 | '''Download body of a single chapter and return as clean html format.''' 57 | logger.info('Downloading %s', chapter['url']) 58 | soup = self.get_soup(chapter['url']) 59 | 60 | logger.debug(soup.title.string) 61 | 62 | # if soup.find("h1", {"class": "entry-title"}).text.strip(): 63 | # chapter['title'] = soup.find("h1", {"class": "entry-title"}).text.strip() 64 | # else: 65 | # chapter['title'] = chapter['title'] 66 | # end if 67 | 68 | #contents = soup.select('div.entry-content p') 69 | #contents = contents[:-1] 70 | #body = self.extract_contents(contents) 71 | # return '' + '
</p><p>'.join(body) + '</p>
' 72 | # return str(contents) 73 | 74 | body = [] 75 | contents = soup.select('div.entry-content p') 76 | contents = contents[:-1] 77 | for p in contents: 78 | para = ' '.join(self.extract_contents(p)) 79 | if len(para): 80 | body.append(para) 81 | # end if 82 | # end for 83 | 84 | return '%s
' % ''.join(body) 85 | # end def 86 | # end class 87 | -------------------------------------------------------------------------------- /lncrawl/sources/meionovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | from ..utils.crawler import Crawler 7 | 8 | logger = logging.getLogger('MEIONOVEL') 9 | 10 | 11 | class MeionovelCrawler(Crawler): 12 | base_url = 'https://meionovel.id/' 13 | 14 | def read_novel_info(self): 15 | '''Get novel title, autor, cover etc''' 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = ' '.join([ 20 | str(x) 21 | for x in soup.select_one('.post-title h3').contents 22 | if not x.name 23 | ]).strip() 24 | logger.info('Novel title: %s', self.novel_title) 25 | 26 | self.novel_cover = self.absolute_url( 27 | soup.select_one('.summary_image img')['data-src']) 28 | logger.info('Novel cover: %s', self.novel_cover) 29 | 30 | author = soup.find('div', {'class': 'author-content'}).findAll('a') 31 | if len(author) == 2: 32 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 33 | else: 34 | self.novel_author = author[0].text 35 | logger.info('Novel author: %s', self.novel_author) 36 | 37 | 38 | content_area = soup.select_one(' .page-content-listing') 39 | 40 | for span in content_area.findAll('span'): 41 | span.decompose() 42 | 43 | chapters = content_area.select('ul.main li.wp-manga-chapter a') 44 | 45 | chapters.reverse() 46 | 47 | for a in chapters: 48 | chap_id = len(self.chapters) + 1 49 | vol_id = chap_id//100 + 1 50 | if len(self.chapters) % 100 == 0: 51 | vol_title = 'Volume ' + str(vol_id) 52 | self.volumes.append({ 53 | 'id': vol_id, 54 | 'title': vol_title, 55 | }) 56 | # end if 57 | self.chapters.append({ 58 | 'id': chap_id, 59 | 'volume': vol_id, 60 | 'url': self.absolute_url(a['href']), 61 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 62 | }) 63 | # end for 64 | # end def 65 | 66 | def download_chapter_body(self, chapter): 67 | '''Download body of a single chapter and return as clean html format.''' 68 | logger.info('Downloading %s', chapter['url']) 69 | soup = self.get_soup(chapter['url']) 70 | 71 | contents = soup.select_one('div.text-left') 72 | 73 | for img in contents.findAll('img'): 74 | if img.has_attr('data-lazy-src'): 75 | src_url = img['data-lazy-src'] 76 | parent = img.parent 77 | img.decompose() 78 | new_tag = soup.new_tag("img", src=src_url) 79 | parent.append(new_tag) 80 | 81 | if contents.h3: 82 | contents.h3.decompose() 83 | 84 | for codeblock in contents.findAll('div', {'class': 'code-block'}): 85 | codeblock.decompose() 86 | 87 | return str(contents) 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/utils/kindlegen_download.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import platform 4 | import tarfile 5 | import tempfile 6 | from io import BytesIO, FileIO 7 | from logging import Logger 8 | from shutil import rmtree 9 | from zipfile import ZipFile 10 | import requests 11 | 12 | logger = Logger('KINDLEGEN') 13 | 14 | WINDOWS_URL = 'http://kindlegen.s3.amazonaws.com/kindlegen_win32_v2_9.zip' 15 | MACOS_URL = 'http://kindlegen.s3.amazonaws.com/KindleGen_Mac_i386_v2_9.zip' 16 | LINUX_URL = 'http://kindlegen.s3.amazonaws.com/kindlegen_linux_2.6_i386_v2_9.tar.gz' 17 | 18 | 19 | def 
get_url_by_platform(): 20 | if platform.system() == 'Linux': 21 | return LINUX_URL 22 | elif platform.system() == 'Darwin': 23 | return MACOS_URL 24 | elif platform.system() == 'Windows': 25 | return WINDOWS_URL 26 | else: 27 | raise Exception('Unrecognized platform') 28 | # end if 29 | # end def 30 | 31 | 32 | def extract_kindlegen_file(extractor, file_list): 33 | logger.debug(file_list) 34 | home = os.path.expanduser('~') 35 | if file_list.count('kindlegen') == 1: 36 | extractor('kindlegen', path=home) 37 | logger.info('Extracted kindlegen to %s', home) 38 | elif file_list.count('kindlegen.exe') == 1: 39 | extractor('kindlegen.exe', path=home) 40 | logger.info('Extracted kindlegen.exe to %s', home) 41 | os.rename(os.path.join(home, 'kindlegen.exe'), 42 | os.path.join(home, 'kindlegen')) 43 | logger.info('Renamed kindlegen.exe to kindlegen') 44 | else: 45 | raise Exception('Kindlegen executable was not found.') 46 | # end if 47 | # end def 48 | 49 | 50 | def download_kindlegen(): 51 | # Download the file 52 | url = get_url_by_platform() 53 | print('Downloading kindlegen...') 54 | byte_array = requests.get(url).content 55 | 56 | # Extract contents 57 | print('Extracting kindlegen...') 58 | if url.endswith('.zip'): 59 | with BytesIO(byte_array) as byte_stream: 60 | with ZipFile(byte_stream) as file: 61 | extract_kindlegen_file(file.extract, file.namelist()) 62 | # end with 63 | # end with 64 | elif url.endswith('.tar.gz'): 65 | temp_file = tempfile.mktemp('.tar.gz') 66 | try: 67 | logger.info('Writing content to %s', temp_file) 68 | with FileIO(temp_file, 'w') as file: 69 | file.write(byte_array) 70 | # end with 71 | logger.info('Opening %s as archive', temp_file) 72 | with tarfile.open(temp_file) as file: 73 | extract_kindlegen_file(file.extract, file.getnames()) 74 | # end with 75 | finally: 76 | os.remove(temp_file) 77 | logger.info('%s removed.', temp_file) 78 | # end finally 79 | # end if 80 | # end def 81 | 82 | 83 | def retrieve_kindlegen(): 84 | # Check kindlegen availability 85 | home = os.path.expanduser('~') 86 | kindlegen_file = os.path.join(home, 'kindlegen') 87 | if os.path.exists(kindlegen_file): 88 | return kindlegen_file 89 | # end if 90 | return None 91 | # end def 92 | -------------------------------------------------------------------------------- /lncrawl/binders/web.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | 5 | from ..assets.html_style import get_value as get_css_style 6 | 7 | logger = logging.getLogger('WEB_BINDER') 8 | 9 | 10 | def bind_html_chapter(chapter, prev_chapter, next_chapter, direction='ltr'): 11 | prev_button = '%s.html' % ( 12 | str(prev_chapter['id']).rjust(5, '0')) if prev_chapter else '#' 13 | next_button = '%s.html' % str(next_chapter['id']).rjust( 14 | 5, '0') if next_chapter else '#' 15 | button_group = '
No contents
' % chapter['title'] 37 | # end if 38 | 39 | html = '\n' 40 | html += '' % direction 41 | html += '' 42 | html += '' 43 | html += '' + '
</p><p>'.join(body) + '</p>
' 79 | 80 | # if contents.h3: 81 | # contents.h3.decompose() 82 | 83 | # for codeblock in contents.findAll('div', {'class': 'code-block'}): 84 | # codeblock.decompose() 85 | 86 | # return str(contents) 87 | # end def 88 | # end class 89 | -------------------------------------------------------------------------------- /lncrawl/sources/bestlightnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from bs4 import BeautifulSoup 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVEL_ONLINE_FREE') 8 | search_url = 'https://bestlightnovel.com/getsearchstory' 9 | novel_page_url = 'https://bestlightnovel.com/novel/%s' 10 | 11 | 12 | class BestLightNovel(Crawler): 13 | base_url = 'https://bestlightnovel.com/' 14 | 15 | def search_novel(self, query): 16 | response = self.submit_form(search_url, { 17 | 'searchword': query 18 | }) 19 | data = response.json() 20 | 21 | results = [] 22 | for novel in data: 23 | titleSoup = BeautifulSoup(novel['name'], 'lxml') 24 | results.append({ 25 | 'title': titleSoup.body.text.title(), 26 | 'url': novel_page_url % novel['nameunsigned'], 27 | 'info': 'Latest: %s' % novel['lastchapter'], 28 | }) 29 | # end for 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | # self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | self.novel_title = soup.select_one('div.entry-header h1').text.strip() 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | try: 43 | novel_data = self.submit_form(search_url, { 44 | 'searchword': self.novel_title 45 | }).json() 46 | self.novel_cover = novel_data[0]['image'] 47 | self.novel_author = novel_data[0]['author'] 48 | except Exception: 49 | logger.debug('Failed getting novel info.\n%s', Exception) 50 | # end try 51 | 52 | for a in reversed(soup.select('#list_chapter .chapter-list a')): 53 | chap_id = len(self.chapters) + 1 54 | vol_id = len(self.chapters) // 100 + 1 55 | if len(self.chapters) % 100 == 0: 56 | self.volumes.append({'id': vol_id}) 57 | # end if 58 | self.chapters.append({ 59 | 'id': chap_id, 60 | 'volume': vol_id, 61 | 'title': a.text.strip(), 62 | 'url': self.absolute_url(a['href']), 63 | }) 64 | # end for 65 | # end def 66 | 67 | def download_chapter_body(self, chapter): 68 | '''Download body of a single chapter and return as clean html format.''' 69 | logger.info('Downloading %s', chapter['url']) 70 | soup = self.get_soup(chapter['url']) 71 | 72 | logger.debug(soup.title.string) 73 | 74 | if 'Chapter' in soup.select_one('h1').text: 75 | chapter['title'] = soup.select_one('h1').text 76 | else: 77 | chapter['title'] = chapter['title'] 78 | # end if 79 | 80 | self.blacklist_patterns = [ 81 | r'^translat(ed by|or)', 82 | r'(volume|chapter) .?\d+', 83 | ] 84 | 85 | contents = soup.select_one('#vung_doc') 86 | body = self.extract_contents(contents) 87 | return '' + '
</p><p>'.join(body) + '</p>
' 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/sources/novelonlinefull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from bs4 import BeautifulSoup 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('NOVEL_ONLINE_FULL') 8 | search_url = 'https://novelonlinefull.com/getsearchstory' 9 | novel_page_url = 'https://novelonlinefull.com/novel/%s' 10 | 11 | 12 | class NovelOnlineFullCrawler(Crawler): 13 | base_url = 'https://novelonlinefull.com/' 14 | 15 | def search_novel(self, query): 16 | response = self.submit_form(search_url, { 17 | 'searchword': query 18 | }) 19 | data = response.json() 20 | 21 | results = [] 22 | for novel in data: 23 | titleSoup = BeautifulSoup(novel['name'], 'lxml') 24 | results.append({ 25 | 'title': titleSoup.body.text.title(), 26 | 'url': novel_page_url % novel['nameunsigned'], 27 | 'info': 'Latest: %s' % novel['lastchapter'], 28 | }) 29 | # end for 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | # self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | self.novel_title = soup.select_one('div.entry-header h1').text.strip() 40 | logger.info('Novel title: %s', self.novel_title) 41 | 42 | try: 43 | novel_data = self.submit_form(search_url, { 44 | 'searchword': self.novel_title 45 | }).json() 46 | self.novel_cover = novel_data[0]['image'] 47 | self.novel_author = novel_data[0]['author'] 48 | except Exception: 49 | logger.debug('Failed getting novel info.\n%s', Exception) 50 | # end try 51 | 52 | for a in reversed(soup.select('#list_chapter .chapter-list a')): 53 | chap_id = len(self.chapters) + 1 54 | vol_id = len(self.chapters) // 100 + 1 55 | if len(self.chapters) % 100 == 0: 56 | self.volumes.append({'id': vol_id}) 57 | # end if 58 | self.chapters.append({ 59 | 'id': chap_id, 60 | 'volume': vol_id, 61 | 'title': a.text.strip(), 62 | 'url': self.absolute_url(a['href']), 63 | }) 64 | # end for 65 | # end def 66 | 67 | def download_chapter_body(self, chapter): 68 | '''Download body of a single chapter and return as clean html format.''' 69 | logger.info('Downloading %s', chapter['url']) 70 | soup = self.get_soup(chapter['url']) 71 | 72 | logger.debug(soup.title.string) 73 | 74 | if 'Chapter' in soup.select_one('h1').text: 75 | chapter['title'] = soup.select_one('h1').text 76 | else: 77 | chapter['title'] = chapter['title'] 78 | # end if 79 | 80 | self.blacklist_patterns = [ 81 | r'^translat(ed by|or)', 82 | r'(volume|chapter) .?\d+', 83 | ] 84 | 85 | contents = soup.select_one('#vung_doc') 86 | body = self.extract_contents(contents) 87 | return '' + '
</p><p>'.join(body) + '</p>
' 88 | # end def 89 | # end class 90 | -------------------------------------------------------------------------------- /lncrawl/sources/boxnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('BOXNOVEL') 8 | search_url = 'https://boxnovel.com/?s=%s&post_type=wp-manga&author=&artist=&release=' 9 | 10 | 11 | class BoxNovelCrawler(Crawler): 12 | base_url = 'https://boxnovel.com/' 13 | 14 | def search_novel(self, query): 15 | query = query.lower().replace(' ', '+') 16 | soup = self.get_soup(search_url % query) 17 | 18 | results = [] 19 | for tab in soup.select('.c-tabs-item__content'): 20 | a = tab.select_one('.post-title h4 a') 21 | latest = tab.select_one('.latest-chap .chapter a').text 22 | votes = tab.select_one('.rating .total_votes').text 23 | results.append({ 24 | 'title': a.text.strip(), 25 | 'url': self.absolute_url(a['href']), 26 | 'info': '%s | Rating: %s' % (latest, votes), 27 | }) 28 | # end for 29 | 30 | return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | self.novel_title = ' '.join([ 39 | str(x) 40 | for x in soup.select_one('.post-title h3').contents 41 | if not x.name 42 | ]).strip() 43 | logger.info('Novel title: %s', self.novel_title) 44 | 45 | probable_img = soup.select_one('.summary_image img') 46 | if probable_img: 47 | self.novel_cover = self.absolute_url(probable_img['src']) 48 | logger.info('Novel cover: %s', self.novel_cover) 49 | 50 | author = soup.select('.author-content a') 51 | if len(author) == 2: 52 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 53 | else: 54 | self.novel_author = author[0].text 55 | logger.info('Novel author: %s', self.novel_author) 56 | 57 | chapters = soup.select('ul.main li.wp-manga-chapter a') 58 | for a in reversed(chapters): 59 | chap_id = len(self.chapters) + 1 60 | vol_id = chap_id//100 + 1 61 | if len(self.chapters) % 100 == 0: 62 | vol_title = 'Volume ' + str(vol_id) 63 | self.volumes.append({ 64 | 'id': vol_id, 65 | 'title': vol_title, 66 | }) 67 | # end if 68 | self.chapters.append({ 69 | 'id': chap_id, 70 | 'volume': vol_id, 71 | 'url': self.absolute_url(a['href']), 72 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 73 | }) 74 | # end for 75 | # end def 76 | 77 | def download_chapter_body(self, chapter): 78 | '''Download body of a single chapter and return as clean html format.''' 79 | logger.info('Downloading %s', chapter['url']) 80 | soup = self.get_soup(chapter['url']) 81 | 82 | contents = soup.select_one('div.text-left') 83 | for bad in contents.select('h3, .code-block, script, .adsbygoogle'): 84 | bad.decompose() 85 | 86 | return str(contents) 87 | # end def 88 | # end class 89 | -------------------------------------------------------------------------------- /lncrawl/sources/webnovelindonesia.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from concurrent import futures 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WEBNOVEL_INDONESIA') 8 | 9 | chapter_list_url = 'https://webnovelindonesia.com/wp-json/writerist/v1/chapters?category=%s&perpage=100&order=ASC&paged=%s' 10 | 11 | 12 | class WebnovelIndonesia(Crawler): 13 | 
base_url = 'https://webnovelindonesia.com/' 14 | 15 | def read_novel_info(self): 16 | logger.debug('Visiting %s', self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | self.novel_title = soup.select_one('.breadcrumb .breadcrumb-item.active').text.strip() 20 | logger.info('Novel title: %s', self.novel_title) 21 | 22 | self.novel_cover = self.absolute_url( 23 | soup.select_one('.section-novel img[class*="lazy"]')['data-src']) 24 | logger.info('Novel cover: %s', self.novel_cover) 25 | 26 | self.novel_author = soup.select_one('.section-novel li a[href*="/aut/"]').text.strip() 27 | logger.info('Novel author: %s', self.novel_author) 28 | 29 | possible_chapter_pages = soup.select('#js-chpater-jump > div.jump-to') 30 | 31 | if not len(possible_chapter_pages): 32 | possible_chapter_pages = [{'data-paged': '1'}] 33 | # end if 34 | 35 | novel_id = soup.select_one('#sortable-table')['data-category'] 36 | 37 | logger.info('Downloading chapters...') 38 | futures_to_check = dict() 39 | for div in possible_chapter_pages: 40 | page = div['data-paged'] 41 | url = chapter_list_url % (novel_id, page) 42 | task = self.executor.submit(self.extract_chapter_list, url) 43 | futures_to_check[task] = page 44 | # end for 45 | 46 | temp_chapters = dict() 47 | for future in futures.as_completed(futures_to_check): 48 | page = int(futures_to_check[future]) 49 | temp_chapters[page] = future.result() 50 | # end for 51 | 52 | logger.info('Building sorted chapter list...') 53 | for page in sorted(temp_chapters.keys()): 54 | self.volumes.append({'id': page}) 55 | for chap in temp_chapters[page]: 56 | chap['volume'] = page 57 | chap['id'] = 1 + len(self.chapters) 58 | self.chapters.append(chap) 59 | # end for 60 | # end for 61 | # end def 62 | 63 | def extract_chapter_list(self, url): 64 | temp_list = [] 65 | logger.debug('Visiting: %s', url) 66 | data = self.get_json(url) 67 | for item in data: 68 | temp_list.append({ 69 | 'title': item['post_title'], 70 | 'url': self.absolute_url(item['permalink']), 71 | }) 72 | # end for 73 | return temp_list 74 | # end def 75 | 76 | def download_chapter_body(self, chapter): 77 | '''Download body of a single chapter and return as clean html format''' 78 | logger.info('Downloading %s', chapter['url']) 79 | soup = self.get_soup(chapter['url']) 80 | 81 | body = '' 82 | for p in soup.select('#content > p'): 83 | if p.text.strip(): 84 | body += str(p).strip() 85 | # end if 86 | # end for 87 | 88 | return body 89 | # end def 90 | # end class 91 | -------------------------------------------------------------------------------- /lncrawl/sources/translateindo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | from urllib.parse import quote, urlparse 5 | import urllib.parse 6 | from bs4 import BeautifulSoup 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('TRANSLATEINDO') 11 | 12 | #search_url = 'https://www.worldnovel.online/wp-json/writerist/v1/novel/search?keyword=%s' 13 | #chapter_list_url = "https://www.worldnovel.online/wp-json/writerist/v1/chapters?category=%s&perpage=4000&order=ASC&paged=1" 14 | 15 | 16 | class TranslateIndoCrawler(Crawler): 17 | base_url = 'https://www.translateindo.com/' 18 | 19 | # def search_novel(self, query): 20 | # data = self.get_json(search_url % quote(query)) 21 | 22 | # results = [] 23 | # for item in data: 24 | # results.append({ 25 | # 'url': item['permalink'], 26 | # 'title': item['post_title'], 27 | # }) 28 | # # end 
for 29 | 30 | # return results 31 | # end def 32 | 33 | def read_novel_info(self): 34 | '''Get novel title, autor, cover etc''' 35 | logger.debug('Visiting %s', self.novel_url) 36 | soup = self.get_soup(self.novel_url) 37 | 38 | self.novel_title = soup.select_one('h1.entry-title').text.strip() 39 | logger.info('Novel title: %s', self.novel_title) 40 | 41 | possible_cover = soup.select_one('div.entry-content img')['src'] 42 | if possible_cover: 43 | self.novel_cover = self.absolute_url(possible_cover) 44 | # end if 45 | logger.info('Novel cover: %s', self.novel_cover) 46 | 47 | for span in soup.select('div.entry-content p span'): 48 | possible_author = re.sub(r'[\(\s\n\)]+', ' ', span.text, re.M).strip() 49 | if possible_author.startswith('Author:'): 50 | possible_author = re.sub('Author:', '', possible_author) 51 | self.novel_author = possible_author.strip() 52 | break 53 | # end if 54 | # end for 55 | logger.info('Novel author: %s', self.novel_author) 56 | 57 | for div in soup.select('.cl-lists .cl-block'): 58 | possible_vol = div.select_one('.cl-header') 59 | if not possible_vol: 60 | continue 61 | 62 | vol_title = possible_vol.text.strip() 63 | vol_id = len(self.volumes) + 1 64 | self.volumes.append({ 65 | 'id': vol_id, 66 | 'title': vol_title, 67 | }) 68 | 69 | for a in div.select('ol.cl-body li a'): 70 | chap_id = len(self.chapters) + 1 71 | self.chapters.append({ 72 | 'id': chap_id, 73 | 'volume': vol_id, 74 | 'url': self.absolute_url(a['href']), 75 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 76 | }) 77 | # end for 78 | # end for 79 | # end def 80 | 81 | def download_chapter_body(self, chapter): 82 | '''Download body of a single chapter and return as clean html format''' 83 | logger.info('Downloading %s', chapter['url']) 84 | soup = self.get_soup(chapter['url']) 85 | 86 | contents = soup.select('div.entry-content p') 87 | 88 | body = [str(p) for p in contents if p.text.strip()] 89 | return '' + '
</p><p>'.join(body) + '</p>
' 90 | # end def 91 | # end class 92 | -------------------------------------------------------------------------------- /setup_pyi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import platform 5 | import re 6 | import shlex 7 | import shutil 8 | import sys 9 | from pathlib import Path 10 | 11 | from PyInstaller import __main__ as pyi 12 | from setuptools.config import read_configuration 13 | 14 | ROOT = Path(__file__).parent 15 | unix_root = '/'.join(str(ROOT).split(os.sep)) 16 | site_packages = list(ROOT.glob('venv/**/site-packages'))[0] 17 | unix_site_packages = '/'.join(str(site_packages).split(os.sep)) 18 | 19 | 20 | def package(): 21 | output = str(ROOT / 'windows') 22 | shutil.rmtree(output, ignore_errors=True) 23 | os.makedirs(output, exist_ok=True) 24 | setup_command() 25 | pyi.run() 26 | shutil.rmtree(output, ignore_errors=True) 27 | # end def 28 | 29 | 30 | def setup_command(): 31 | command = 'pyinstaller ' 32 | command += '--onefile ' # onefile 33 | command += '--clean ' 34 | command += '--noconfirm ' 35 | command += '--name "lncrawl" ' 36 | command += '--icon "%s/res/lncrawl.ico" ' % unix_root 37 | command += '--distpath "%s" ' % str(ROOT / 'dist') 38 | command += '--specpath "%s" ' % str(ROOT / 'windows') 39 | command += '--workpath "%s" ' % str(ROOT / 'windows' / 'build') 40 | 41 | command += gather_data_files() 42 | command += gather_hidden_imports() 43 | command += '"%s/__main__.py" ' % unix_root 44 | 45 | print(command) 46 | print() 47 | 48 | sys.argv = shlex.split(command) 49 | # end def 50 | 51 | 52 | def gather_data_files(): 53 | command = '' 54 | 55 | # add data files of this project 56 | for f in (ROOT / 'lncrawl').glob('**/*.*'): 57 | src = str(f) 58 | src = '/'.join(src.split(os.sep)) 59 | dst = str(f.parent.relative_to(ROOT)) 60 | dst = '/'.join(dst.split(os.sep)) 61 | command += '--add-data "%s%s%s" ' % (src, os.pathsep, dst) 62 | # end for 63 | command += '--add-data "%s/lncrawl/VERSION%slncrawl" ' % (unix_root, os.pathsep) 64 | 65 | # add data files of other dependencies 66 | command += '--add-data "%s/cairosvg/VERSION%s." 
' % ( 67 | unix_site_packages, os.pathsep) 68 | command += '--add-data "%s/cairocffi/VERSION%scairocffi" ' % ( 69 | unix_site_packages, os.pathsep) 70 | command += '--add-data "%s/tinycss2/VERSION%stinycss2" ' % ( 71 | unix_site_packages, os.pathsep) 72 | command += '--add-data "%s/text_unidecode/data.bin%stext_unidecode" ' % ( 73 | unix_site_packages, os.pathsep) 74 | command += '--add-data "%s/cloudscraper%scloudscraper" ' % ( 75 | unix_site_packages, os.pathsep) 76 | command += '--add-data "%s/wcwidth/version.json%swcwidth" ' % ( 77 | unix_site_packages, os.pathsep) 78 | 79 | return command 80 | # end def 81 | 82 | 83 | def gather_hidden_imports(): 84 | command = '' 85 | 86 | # add hidden imports of this project 87 | for f in (ROOT / 'lncrawl' / 'sources').glob('*.py'): 88 | if os.path.isfile(f) and re.match(r'^([^_.][^.]+).py$', f.name): 89 | module_name = f.name[:-3] 90 | command += '--hidden-import "lncrawl.sources.%s" ' % module_name 91 | # end if 92 | # end for 93 | command += '--hidden-import "pkg_resources.py2_warn" ' 94 | 95 | return command 96 | # end def 97 | 98 | 99 | if __name__ == '__main__': 100 | package() 101 | # end if 102 | -------------------------------------------------------------------------------- /lncrawl/binders/calibre.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import os 4 | import subprocess 5 | 6 | logger = logging.getLogger('CALIBRE_BINDER') 7 | 8 | EBOOK_CONVERT = 'ebook-convert' 9 | CALIBRE_LINK = 'https://calibre-ebook.com/download' 10 | 11 | 12 | def run_ebook_convert(*args): 13 | ''' 14 | Calls `ebook-convert` with given args 15 | Visit https://manual.calibre-ebook.com/generated/en/ebook-convert.html for argument list. 16 | ''' 17 | try: 18 | isdebug = os.getenv('debug_mode') == 'yes' 19 | with open(os.devnull, 'w') as dumper: 20 | subprocess.call( 21 | [EBOOK_CONVERT] + list(args), 22 | stdout=None if isdebug else dumper, 23 | stderr=None if isdebug else dumper, 24 | ) 25 | # end with 26 | return True 27 | except Exception: 28 | import traceback 29 | logger.debug(traceback.format_exc()) 30 | return False 31 | # end try 32 | # end def 33 | 34 | 35 | def epub_to_calibre(app, epub_file, out_fmt): 36 | if not os.path.exists(epub_file): 37 | return None 38 | # end if 39 | 40 | epub_path = os.path.dirname(epub_file) 41 | epub_file_name = os.path.basename(epub_file) 42 | file_name_without_ext = epub_file_name.replace('.epub', '') 43 | 44 | work_path = os.path.dirname(epub_path) 45 | out_path = os.path.join(work_path, out_fmt) 46 | out_file_name = file_name_without_ext + '.' + out_fmt 47 | out_file = os.path.join(out_path, out_file_name) 48 | 49 | os.makedirs(out_path, exist_ok=True) 50 | 51 | logger.debug('Converting "%s" to "%s"', epub_file, out_file) 52 | 53 | args = [ 54 | epub_file, 55 | out_file, 56 | '--unsmarten-punctuation', 57 | '--no-chapters-in-toc', 58 | '--title', file_name_without_ext, 59 | '--authors', app.crawler.novel_author, 60 | '--series', app.crawler.novel_title, 61 | '--publisher', app.crawler.home_url, 62 | '--book-producer', 'Lightnovel Crawler', 63 | '--enable-heuristics', '--disable-renumber-headings', 64 | ] 65 | if app.book_cover: 66 | args += ['--cover', app.book_cover] 67 | if out_fmt == 'pdf': 68 | args += [ 69 | '--paper-size', 'a4', 70 | '--pdf-page-numbers', 71 | '--pdf-hyphenate', 72 | '--pdf-header-template', '⦗ _TITLE_ — _SECTION_ ⦘
', 73 | ] 74 | # end if 75 | 76 | run_ebook_convert(*args) 77 | 78 | if os.path.exists(out_file): 79 | print('Created: %s' % out_file_name) 80 | return out_file 81 | else: 82 | logger.error('[%s] conversion failed: %s', out_fmt, epub_file_name) 83 | return None 84 | # end if 85 | # end def 86 | 87 | 88 | def make_calibres(app, epubs, out_fmt): 89 | if out_fmt == 'epub' or not epubs: 90 | return epubs 91 | # end if 92 | 93 | if not run_ebook_convert('--version'): 94 | logger.error('Install Calibre to generate %s: %s', 95 | out_fmt, CALIBRE_LINK), 96 | return 97 | # end if 98 | 99 | out_files = [] 100 | for epub in epubs: 101 | out = epub_to_calibre(app, epub, out_fmt) 102 | out_files += [out] 103 | # end for 104 | 105 | return out_files 106 | # end def 107 | -------------------------------------------------------------------------------- /lncrawl/sources/zenithnovels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | 6 | import requests 7 | 8 | from ..utils.crawler import Crawler 9 | 10 | logger = logging.getLogger('ZENITH_NOVELS') 11 | 12 | novel_url = 'http://zenithnovels.com/%s/' 13 | 14 | 15 | class ZenithNovelsCrawler(Crawler): 16 | base_url = 'http://zenithnovels.com/' 17 | 18 | def read_novel_info(self): 19 | '''Get novel title, autor, cover etc''' 20 | self.novel_id = re.search( 21 | r'(?<=zenithnovels.com/)[^/]+', self.novel_url).group(0) 22 | logger.info('Novel id: %s', self.novel_id) 23 | 24 | url = novel_url % self.novel_id 25 | logger.debug('Visiting %s', url) 26 | soup = self.get_soup(url) 27 | 28 | self.novel_title = soup.select_one('article#the-post h1.name').text 29 | logger.info('Novel title: %s', self.novel_title) 30 | 31 | self.novel_cover = self.absolute_url(soup.select_one( 32 | 'article#the-post .entry img')['src']) 33 | logger.info('Novel cover: %s', self.novel_cover) 34 | 35 | while True: 36 | self.parse_chapter_list(soup) 37 | 38 | next_link = soup.select_one('ul.lcp_paginator a.lcp_nextlink') 39 | if next_link: 40 | soup = self.get_soup(next_link['href']) 41 | else: 42 | break 43 | # end if 44 | # end if 45 | 46 | self.chapters.sort(key=lambda x: x['volume'] * 1e6 + x['id']) 47 | self.volumes = [{'id': x, 'title': ''} for x in set(self.volumes)] 48 | # end def 49 | 50 | def parse_chapter_list(self, soup): 51 | for a in soup.select('ul.lcp_catlist li a'): 52 | ch_title = a['title'] 53 | ch_id = [int(''.join(x).strip()) for x in re.findall( 54 | r'((?<=ch) \d+)|((?<=chapter) \d+)', ch_title, re.IGNORECASE)] 55 | ch_id = ch_id[0] if len(ch_id) else len(self.chapters) + 1 56 | vol_id = [int(''.join(x).strip()) for x in re.findall( 57 | r'((?<=book) \d+)|((?<=volume) \d+)', ch_title, re.IGNORECASE)] 58 | vol_id = vol_id[0] if len(vol_id) else 1 + (ch_id - 1) // 100 59 | 60 | self.volumes.append(vol_id) 61 | self.chapters.append({ 62 | 'id': ch_id, 63 | 'volume': vol_id, 64 | 'title': ch_title, 65 | 'url': self.absolute_url(a['href']), 66 | }) 67 | # end for 68 | # end def 69 | 70 | def download_chapter_body(self, chapter): 71 | '''Download body of a single chapter and return as clean html format.''' 72 | logger.info('Downloading %s', chapter['url']) 73 | soup = self.get_soup(chapter['url']) 74 | 75 | entry = soup.select_one('article#the-post .entry') 76 | 77 | try: 78 | self.clean_contents(entry) 79 | for note in entry.select('.footnote'): 80 | note.decompose() 81 | # end for 82 | except Exception: 83 | pass 84 | # end try 85 | 86 | body = '' 87 | for tag 
in entry.children: 88 | if tag.name == 'p' and len(tag.text.strip()): 89 | p = ' '.join(self.extract_contents(tag)) 90 | if len(p.strip()): 91 | body += '%s
' % p 92 | # end if 93 | # end if 94 | # end for 95 | 96 | return body 97 | # end def 98 | # end class 99 | -------------------------------------------------------------------------------- /lncrawl/sources/litnet.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from ..utils.crawler import Crawler 4 | 5 | logger = logging.getLogger('LITNET') 6 | search_url = 'https://litnet.com/en/search?q=%s' 7 | 8 | 9 | class LitnetCrawler(Crawler): 10 | base_url = 'https://litnet.com/' 11 | 12 | def search_novel(self, query): 13 | query = query.lower().replace(' ', '+') 14 | soup = self.get_soup(search_url % query) 15 | 16 | results = [] 17 | for a in soup.select('div.l-container ul a'): 18 | results.append({ 19 | 'title': a.text.strip(), 20 | 'url': self.absolute_url(a['href']), 21 | }) 22 | # end for 23 | 24 | return results 25 | # end def 26 | 27 | def read_novel_info(self): 28 | '''Get novel title, autor, cover etc''' 29 | logger.debug('Visiting %s', self.novel_url) 30 | soup = self.get_soup(self.novel_url) 31 | 32 | self.novel_title = soup.select_one('h1').text.strip() 33 | logger.info('Novel title: %s', self.novel_title) 34 | 35 | img_src = soup.select_one('div.book-view-cover img') 36 | if not img_src: 37 | img_src = soup.select_one('div.book-cover img') 38 | # end if 39 | if img_src: 40 | self.novel_cover = self.absolute_url(img_src['src']) 41 | # end if 42 | logger.info('Novel cover: %s', self.novel_cover) 43 | 44 | author = soup.select_one('div.book-view-info a.author') 45 | if not author: 46 | author = soup.select_one('div.book-head-content a.book-autor') 47 | # end if 48 | if author: 49 | self.novel_author = author.text.strip() 50 | # end if 51 | logger.info('Novel author: %s', self.novel_author) 52 | 53 | chapters = soup.find('select', {'name': 'chapter'}) 54 | if chapters is None: 55 | chapters = soup.select('div.collapsible-body a.collection-item') 56 | else: 57 | chapters = chapters.find_all('option') 58 | chapters = [c for c in chapters if c.attrs['value']] 59 | # end if 60 | 61 | for a in chapters: 62 | chap_id = len(self.chapters) + 1 63 | if len(self.chapters) % 100 == 0: 64 | vol_id = chap_id//100 + 1 65 | vol_title = 'Volume ' + str(vol_id) 66 | self.volumes.append({ 67 | 'id': vol_id, 68 | 'title': vol_title, 69 | }) 70 | # end if 71 | 72 | abs_url = self.last_visited_url.replace('book', 'reader') 73 | chap_url = abs_url + \ 74 | ('?c=%s' % a.attrs['value']) if a.has_attr( 75 | 'value') else self.home_url + a['href'] 76 | self.chapters.append({ 77 | 'id': chap_id, 78 | 'volume': 1, 79 | 'url': chap_url, 80 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 81 | }) 82 | # end for 83 | # end def 84 | 85 | def download_chapter_body(self, chapter): 86 | '''Download body of a single chapter and return as clean html format.''' 87 | logger.info('Downloading %s', chapter['url']) 88 | soup = self.get_soup(chapter['url']) 89 | 90 | contents = soup.select_one('div.reader-text') 91 | if contents is None: 92 | contents = soup.select_one('div.demo-txt') 93 | return str(contents) 94 | # end def 95 | # end class 96 | -------------------------------------------------------------------------------- /lncrawl/bots/console/get_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | 4 | from PyInquirer import prompt 5 | 6 | from ...core import display 7 | from ...core.arguments import get_args 8 | from ...sources import 
rejected_sources 9 | 10 | 11 | def get_novel_url(self): 12 | '''Returns a novel page url or a query''' 13 | args = get_args() 14 | if args.query and len(args.query) > 1: 15 | return args.query 16 | # end if 17 | 18 | url = args.novel_page 19 | if url: 20 | if re.match(r'^https?://.+\..+$', url): 21 | return url 22 | else: 23 | raise Exception('Invalid URL of novel page') 24 | # end if 25 | # end if 26 | 27 | try: 28 | if args.suppress: 29 | raise Exception() 30 | # end if 31 | 32 | answer = prompt([ 33 | { 34 | 'type': 'input', 35 | 'name': 'novel', 36 | 'message': 'Enter novel page url or query novel:', 37 | 'validate': lambda val: 'Input should not be empty' 38 | if len(val) == 0 else True, 39 | }, 40 | ]) 41 | return answer['novel'].strip() 42 | except Exception: 43 | raise Exception('Novel page url or query was not given') 44 | # end try 45 | # end def 46 | 47 | 48 | def get_crawlers_to_search(self): 49 | '''Returns user choice to search the choosen sites for a novel''' 50 | links = self.app.crawler_links 51 | if not links: 52 | return None 53 | # end if 54 | 55 | args = get_args() 56 | if args.suppress or not args.sources: 57 | return links 58 | # end if 59 | 60 | answer = prompt([ 61 | { 62 | 'type': 'checkbox', 63 | 'name': 'sites', 64 | 'message': 'Where to search?', 65 | 'choices': [{'name': x} for x in sorted(links)], 66 | } 67 | ]) 68 | 69 | selected = answer['sites'] 70 | return selected if len(selected) > 0 else links 71 | # end def 72 | 73 | 74 | def choose_a_novel(self): 75 | '''Choose a single novel url from the search result''' 76 | args = get_args() 77 | 78 | # Choose a novel title 79 | choices = self.app.search_results 80 | selected_choice = self.app.search_results[0] 81 | if len(choices) > 1 and not args.suppress: 82 | answer = prompt([ 83 | { 84 | 'type': 'list', 85 | 'name': 'novel', 86 | 'message': 'Which one is your novel?', 87 | 'choices': display.format_novel_choices(choices), 88 | } 89 | ]) 90 | 91 | index = int(answer['novel'].split('.')[0]) 92 | selected_choice = self.app.search_results[index - 1] 93 | # end if 94 | 95 | # Choose the novel source 96 | novels = selected_choice['novels'] 97 | selected_novel = novels[0] 98 | if len(novels) > 1 and not args.suppress: 99 | answer = prompt([ 100 | { 101 | 'type': 'list', 102 | 'name': 'novel', 103 | 'message': 'Choose a source to download?', 104 | 'choices': ['0. Back'] + display.format_source_choices(novels), 105 | } 106 | ]) 107 | 108 | index = int(answer['novel'].split('.')[0]) 109 | if index == 0: 110 | return self.choose_a_novel() 111 | # end if 112 | selected_novel = novels[index - 1] 113 | # end if 114 | 115 | return selected_novel['url'] 116 | # end def 117 | -------------------------------------------------------------------------------- /lncrawl/sources/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Auto imports all crawlers from the current package directory. 4 | To be recognized, your crawler file should meet following conditions: 5 | - file does not starts with an underscore 6 | - file ends with .py extension 7 | - file contains a class that extends `lncrawl.utils.crawler.Crawler` 8 | - the class extending `lncrawl.utils.crawler.Crawler` has a global variable `base_url` 9 | - `base_url` contains a valid url or a list of urls supported by the crawler 10 | 11 | For example, see any of the files inside this directory. 
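A minimal sketch of a conforming crawler (the class name, URL and CSS selectors below are hypothetical and only illustrate the expected shape; real implementations follow the same pattern as the files in this directory):

    from ..utils.crawler import Crawler

    class ExampleCrawler(Crawler):
        base_url = 'https://example.com/'

        def read_novel_info(self):
            # fetch the novel page and fill in title, volumes and chapters
            soup = self.get_soup(self.novel_url)
            self.novel_title = soup.select_one('h1').text.strip()
            self.volumes.append({'id': 1})
            for a in soup.select('.chapter-list a'):
                self.chapters.append({
                    'id': len(self.chapters) + 1,
                    'volume': 1,
                    'url': self.absolute_url(a['href']),
                    'title': a.text.strip(),
                })

        def download_chapter_body(self, chapter):
            # fetch a single chapter page and return its cleaned html body
            soup = self.get_soup(chapter['url'])
            return str(soup.select_one('.chapter-content'))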
12 | """ 13 | 14 | import importlib 15 | import os 16 | import re 17 | import sys 18 | from urllib.parse import urlparse 19 | 20 | from ..utils.crawler import Crawler 21 | 22 | rejected_sources = { 23 | 'https://novelplanet.com/': 'Site is closed', 24 | 'http://gravitytales.com/': 'Redirects to webnovel.com', 25 | 'http://fullnovel.live/': "403 - Forbidden: Access is denied", 26 | 'http://moonbunnycafe.com/': "Does not follow uniform format", 27 | 'https://anythingnovel.com/': 'Site broken', 28 | 'https://indomtl.com/': "Does not like to be crawled", 29 | 'https://lnindo.org/': "Does not like to be crawled", 30 | 'https://myoniyonitranslations.com/': "522 - Connection timed out", 31 | 'https://novelgo.id/': "Removed by owner", 32 | 'https://www.flying-lines.com/': 'Obfuscated content', 33 | 'https://www.jieruihao.cn/': "Unavailable", 34 | 'https://www.noveluniverse.com/': "Site is down", 35 | 'https://www.novelupdates.com/': "Does not host any novels", 36 | 'https://www.novelv.com/': "Site is down", 37 | 'https://yukinovel.id/': "Removed by owner", 38 | 'https://www.rebirth.online/': 'Site moved', 39 | 'https://mtled-novels.com/': 'Domain is expired', 40 | } 41 | 42 | # this list will be auto-generated 43 | crawler_list = {} 44 | 45 | # auto-import all submodules in the current directory 46 | __module_regex = re.compile(r'^([^_.][^.]+).py[c]?$', re.I) 47 | __url_regex = re.compile(r'^^(https?|ftp)://[^\s/$.?#].[^\s]*$', re.I) 48 | 49 | for entry in os.listdir(__path__[0]): 50 | file_path = os.path.join(__path__[0], entry) 51 | if not os.path.isfile(file_path): 52 | continue 53 | # end if 54 | 55 | regex_result = __module_regex.findall(entry) 56 | if len(regex_result) != 1: # does not contains a module 57 | continue 58 | # end if 59 | 60 | module_name = regex_result[0] 61 | module = importlib.import_module('.' 
+ module_name, package=__package__) 62 | 63 | for key in dir(module): 64 | item = getattr(module, key) 65 | if type(item) != type(Crawler) or item.__base__ != Crawler: 66 | continue 67 | # end if 68 | 69 | if not hasattr(item, 'base_url'): 70 | raise Exception('No `base_url` for `%s`' % key) 71 | # end if 72 | 73 | base_url = getattr(item, 'base_url') 74 | if isinstance(base_url, str): 75 | base_url = [base_url] 76 | # end if 77 | 78 | if not isinstance(base_url, list): 79 | raise Exception('Unexpected `base_url` type in `%s`' % key) 80 | # end if 81 | 82 | for url in base_url: 83 | if not __url_regex.match(url): 84 | raise Exception('Invalid `base_url` in `%s`: %s' % (key, url)) 85 | # end if 86 | if not url.endswith('/'): 87 | url += '/' 88 | # end if 89 | if url in rejected_sources: 90 | continue 91 | # end if 92 | crawler_list[url] = item 93 | # end for 94 | # end for 95 | # end for 96 | -------------------------------------------------------------------------------- /lncrawl/sources/royalroad.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('ROYALROAD') 8 | search_url = 'https://www.royalroad.com/fictions/search?keyword=%s' 9 | 10 | 11 | class RoyalRoadCrawler(Crawler): 12 | base_url = 'https://www.royalroad.com/' 13 | 14 | def search_novel(self, query): 15 | query = query.lower().replace(' ', '+') 16 | soup = self.get_soup(search_url % query) 17 | 18 | results = [] 19 | for a in soup.select('h2.fiction-title a')[:5]: 20 | url = self.absolute_url(a['href']) 21 | results.append({ 22 | 'url': url, 23 | 'title': a.text.strip(), 24 | 'info': self.search_novel_info(url), 25 | }) 26 | # end for 27 | 28 | return results 29 | # end def 30 | 31 | def search_novel_info(self, url): 32 | '''Get novel title, autor, cover etc''' 33 | logger.debug('Visiting %s', url) 34 | soup = self.get_soup(url) 35 | 36 | score = soup.select_one('span.star')['data-content'] 37 | chapters = len(soup.find('tbody').findAll('a', href=True)) 38 | latest = soup.find('tbody').findAll('a', href=True)[-1].text.strip() 39 | info = 'Score: %s, Chapter count %s, Latest: %s' % ( 40 | score, chapters, latest) 41 | 42 | return info 43 | # end def 44 | 45 | def read_novel_info(self): 46 | '''Get novel title, autor, cover etc''' 47 | logger.debug('Visiting %s', self.novel_url) 48 | soup = self.get_soup(self.novel_url) 49 | 50 | self.novel_title = soup.find("h1", {"property": "name"}).text.strip() 51 | logger.info('Novel title: %s', self.novel_title) 52 | 53 | self.novel_cover = self.absolute_url( 54 | soup.find("img", {"class": "img-offset thumbnail inline-block"})['src']) 55 | logger.info('Novel cover: %s', self.novel_cover) 56 | 57 | self.novel_author = soup.find( 58 | "span", {"property": "name"}).text.strip() 59 | logger.info('Novel author: %s', self.novel_author) 60 | 61 | chapters = soup.find('tbody').findAll('a', href=True) 62 | 63 | for x in chapters: 64 | chap_id = len(self.chapters) + 1 65 | if len(self.chapters) % 100 == 0: 66 | vol_id = chap_id//100 + 1 67 | vol_title = 'Volume ' + str(vol_id) 68 | self.volumes.append({ 69 | 'id': vol_id, 70 | 'title': vol_title, 71 | }) 72 | # end if 73 | self.chapters.append({ 74 | 'id': chap_id, 75 | 'volume': vol_id, 76 | 'url': self.absolute_url(x['href']), 77 | 'title': x.text.strip() or ('Chapter %d' % chap_id), 78 | }) 79 | # end for 80 | # end def 81 | 82 | def download_chapter_body(self, 
chapter): 83 | '''Download body of a single chapter and return as clean html format.''' 84 | logger.info('Downloading %s', chapter['url']) 85 | soup = self.get_soup(chapter['url']) 86 | 87 | logger.debug(soup.title.string) 88 | 89 | if 'Chapter' in soup.select_one('h2').text: 90 | chapter['title'] = soup.select_one('h2').text 91 | else: 92 | chapter['title'] = chapter['title'] 93 | # end if 94 | 95 | contents = soup.find("div", {"class": "chapter-content"}) 96 | 97 | self.clean_contents(contents) 98 | return str(contents) 99 | # end def 100 | # end class 101 | -------------------------------------------------------------------------------- /lncrawl/sources/wuxiasite.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import re 5 | from ..utils.crawler import Crawler 6 | 7 | logger = logging.getLogger('WUXIA-SITE') 8 | search_url = 'https://wuxiaworld.site/?s=%s&post_type=wp-manga' 9 | 10 | 11 | class WuxiaSiteCrawler(Crawler): 12 | base_url = 'https://wuxiaworld.site/' 13 | 14 | # TODO: disabled due to cloudflare issue 15 | # def search_novel(self, query): 16 | # query = query.lower().replace(' ', '+') 17 | # soup = self.get_soup(search_url % query) 18 | 19 | # results = [] 20 | # for tab in soup.select('.c-tabs-item__content'): 21 | # a = tab.select_one('.post-title h4 a') 22 | # latest = tab.select_one('.latest-chap .chapter a').text 23 | # votes = tab.select_one('.rating .total_votes').text 24 | # results.append({ 25 | # 'title': a.text.strip(), 26 | # 'url': self.absolute_url(a['href']), 27 | # 'info': '%s | Rating: %s' % (latest, votes), 28 | # }) 29 | # # end for 30 | 31 | # return results 32 | # # end def 33 | 34 | def read_novel_info(self): 35 | '''Get novel title, autor, cover etc''' 36 | logger.debug('Visiting %s', self.novel_url) 37 | soup = self.get_soup(self.novel_url) 38 | 39 | self.novel_title = ' '.join([ 40 | str(x) 41 | for x in soup.select_one('.post-title h3').contents 42 | if not x.name 43 | ]).strip() 44 | logger.info('Novel title: %s', self.novel_title) 45 | 46 | possible_img = soup.select_one('.summary_image img') 47 | if possible_img: 48 | if possible_img.has_attr('data-src'): 49 | self.novel_cover = self.absolute_url(possible_img['data-src']) 50 | elif possible_img.has_attr('srcset'): 51 | self.novel_cover = self.absolute_url(possible_img['srcset'].split(',')[0]) 52 | elif possible_img.has_attr('src'): 53 | self.novel_cover = self.absolute_url(possible_img['src']) 54 | logger.info('Novel cover: %s', self.novel_cover) 55 | 56 | author = soup.select('.author-content a') 57 | if len(author) == 2: 58 | self.novel_author = author[0].text + ' (' + author[1].text + ')' 59 | else: 60 | self.novel_author = author[0].text 61 | logger.info('Novel author: %s', self.novel_author) 62 | 63 | chapters = soup.select('ul.main li.wp-manga-chapter a') 64 | chapters.reverse() 65 | 66 | for a in chapters: 67 | chap_id = len(self.chapters) + 1 68 | vol_id = chap_id//100 + 1 69 | if len(self.chapters) % 100 == 0: 70 | vol_title = 'Volume ' + str(vol_id) 71 | self.volumes.append({ 72 | 'id': vol_id, 73 | 'title': vol_title, 74 | }) 75 | # end if 76 | self.chapters.append({ 77 | 'id': chap_id, 78 | 'volume': vol_id, 79 | 'url': self.absolute_url(a['href']), 80 | 'title': a.text.strip() or ('Chapter %d' % chap_id), 81 | }) 82 | # end for 83 | # end def 84 | 85 | def download_chapter_body(self, chapter): 86 | '''Download body of a single chapter and return as clean html format.''' 87 | 
logger.info('Downloading %s', chapter['url']) 88 | soup = self.get_soup(chapter['url']) 89 | contents = soup.select('.text-left p, .cha-words p') 90 | body = [str(p) for p in contents if p.text.strip()] 91 | return '' + '
</p><p>'.join(body) + '</p>
' 92 | # end def 93 | # end class 94 | -------------------------------------------------------------------------------- /lncrawl/bots/test/post_github.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | import os 5 | import platform 6 | import sys 7 | from datetime import datetime 8 | from urllib.parse import urlencode 9 | 10 | import requests 11 | 12 | from ...assets.user_agents import user_agents 13 | 14 | logger = logging.getLogger('MAKE_GITHUB_ISSUE') 15 | 16 | # Authentication for user filing issue 17 | USERNAME = os.getenv('GITHUB_USERNAME') 18 | # PASSWORD = os.getenv('GITHUB_PASSWORD') # deprecated 19 | TOKEN = os.getenv('GITHUB_TOKEN') # must have read/write access to repo 20 | 21 | # The repository to add this issue to 22 | REPO_OWNER = 'dipu-bd' 23 | REPO_NAME = 'lightnovel-crawler' 24 | 25 | # Headers 26 | headers = { 27 | "User-Agent": user_agents[0], 28 | "Authorization": "token %s" % TOKEN, 29 | "Accept": "application/vnd.github.golden-comet-preview+json" 30 | } 31 | 32 | 33 | def find_issues(labels=None): 34 | '''Returns list of issues by query''' 35 | # Url to get issues via GET 36 | url = 'https://api.github.com/repos/%s/%s/issues' % (REPO_OWNER, REPO_NAME) 37 | 38 | # Create a session without authentication 39 | session = requests.Session() 40 | 41 | # Create our issue 42 | data = { 43 | 'labels': labels, 44 | } 45 | 46 | # Get issues 47 | r = session.get(url + '?' + urlencode(data), headers=headers) 48 | if r.ok: 49 | logger.info('Successfully retrieved issues') 50 | return r.json() 51 | else: 52 | logger.info('Failed to get issues: %s' % url) 53 | logger.debug('Response:\n%s\n' % r.content) 54 | return [] 55 | # end if 56 | # end def 57 | 58 | 59 | def post_issue(title, body=None, labels=None): 60 | '''Create an issue on github.com using the given parameters.''' 61 | # Our url to create issues via POST 62 | url = 'https://api.github.com/repos/%s/%s/import/issues' % (REPO_OWNER, REPO_NAME) 63 | 64 | # Create an authenticated session to create the issue 65 | session = requests.Session() 66 | # session.auth = (USERNAME, PASSWORD) 67 | 68 | # Create our issue 69 | payload = json.dumps({ 70 | 'issue': { 71 | 'title': title, 72 | 'body': body, 73 | 'labels': labels, 74 | } 75 | }) 76 | 77 | # Add the issue to our repository 78 | r = session.post(url, data=payload, headers=headers) 79 | if r.ok: 80 | logger.info('Successfully created Issue %s' % title) 81 | else: 82 | logger.info('Could not create Issue %s' % title) 83 | logger.debug('Response:\n%s\n' % r.content) 84 | raise Exception('Failed to create issue') 85 | # end if 86 | # end def 87 | 88 | 89 | def post_on_github(self, message): 90 | if sys.version_info.minor != 6: 91 | print('Not Python 3.6... skipping.') 92 | return 93 | # end if 94 | 95 | # Check if there is already an issue younger than a week 96 | issues = find_issues('bot-report') 97 | if len(issues): 98 | time = int(issues[0]['title'].split('~')[-1].strip()) 99 | diff = datetime.utcnow().timestamp() - time 100 | if diff < 7 * 24 * 3600: 101 | print('Detected an open issue younger than a week... 
skipping.') 102 | return 103 | # end if 104 | # end if 105 | 106 | # Create new issue with appropriate label 107 | title = '[Test Bot][Python %d.%d][%s] Report ~ %s' % ( 108 | sys.version_info.major, 109 | sys.version_info.minor, 110 | platform.system(), 111 | datetime.utcnow().strftime('%s') 112 | ) 113 | post_issue(title, message, ['bot-report']) 114 | # end def 115 | --------------------------------------------------------------------------------
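A rough usage sketch for the issue-reporting helpers in post_github.py above, assuming the GITHUB_TOKEN environment variable holds a token with write access to the repository; the title, body and label values here are only illustrative:

    from lncrawl.bots.test.post_github import find_issues, post_issue

    # skip posting if a 'bot-report' issue is already open
    open_reports = find_issues(labels='bot-report')
    if not open_reports:
        post_issue(
            title='[Test Bot] Report',
            body='All source tests passed.',
            labels=['bot-report'],
        )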