├── .editorconfig
├── .env.example
├── .gitattributes
├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ ├── 1_general.md
│ ├── 2_new-source.md
│ ├── 3_source-issue.md
│ └── 4_bug_report.md
├── contribs.json
├── dependabot.yml
└── workflows
│ ├── index-gen.yml
│ ├── lint.yml
│ └── release.yml
├── .gitignore
├── .python-version
├── Aptfile
├── LICENSE
├── Procfile
├── README.md
├── README.pip
├── app.json
├── compose.yml
├── etc
└── wuxiaworld.com
│ ├── help.md
│ ├── wuxia.proto
│ └── wuxia.proto.json
├── lncrawl
├── VERSION
├── __init__.py
├── __main__.py
├── assets
│ ├── __init__.py
│ ├── banner.py
│ ├── chars.py
│ ├── colors.py
│ ├── epub
│ │ ├── __init__.py
│ │ ├── chapter.xhtml
│ │ ├── cover.xhtml
│ │ └── style.css
│ ├── languages.py
│ ├── user_agents.py
│ ├── version.py
│ └── web
│ │ ├── __init__.py
│ │ ├── script.js
│ │ └── style.css
├── binders
│ ├── __init__.py
│ ├── calibre.py
│ ├── epub.py
│ ├── json.py
│ ├── text.py
│ └── web.py
├── bots
│ ├── __init__.py
│ ├── _sample.py
│ ├── console
│ │ ├── __init__.py
│ │ ├── get_crawler.py
│ │ ├── integration.py
│ │ ├── login_info.py
│ │ ├── open_folder_prompt.py
│ │ ├── output_style.py
│ │ ├── range_selection.py
│ │ └── resume_download.py
│ ├── discord
│ │ ├── __init__.py
│ │ ├── config.py
│ │ ├── discord_bot.py
│ │ └── message_handler.py
│ ├── lookup
│ │ ├── __init__.py
│ │ ├── analyze.py
│ │ ├── generator.py
│ │ └── prompts.py
│ ├── server
│ │ ├── __init__.py
│ │ ├── api
│ │ │ ├── __init__.py
│ │ │ ├── artifacts.py
│ │ │ ├── auth.py
│ │ │ ├── jobs.py
│ │ │ ├── novels.py
│ │ │ ├── runner.py
│ │ │ └── users.py
│ │ ├── app.py
│ │ ├── config.py
│ │ ├── context.py
│ │ ├── db.py
│ │ ├── exceptions.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── _base.py
│ │ │ ├── job.py
│ │ │ ├── pagination.py
│ │ │ └── user.py
│ │ ├── security.py
│ │ ├── services
│ │ │ ├── __init__.py
│ │ │ ├── artifacts.py
│ │ │ ├── jobs.py
│ │ │ ├── novels.py
│ │ │ ├── runner.py
│ │ │ ├── scheduler.py
│ │ │ ├── tier.py
│ │ │ └── users.py
│ │ ├── ui
│ │ │ └── __index__.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ ├── aborter.py
│ │ │ ├── decorators.py
│ │ │ ├── json_tools.py
│ │ │ ├── text_tools.py
│ │ │ └── time_utils.py
│ └── telegram
│ │ └── __init__.py
├── constants.py
├── core
│ ├── __init__.py
│ ├── app.py
│ ├── arguments.py
│ ├── browser.py
│ ├── cleaner.py
│ ├── crawler.py
│ ├── display.py
│ ├── download_chapters.py
│ ├── download_images.py
│ ├── exeptions.py
│ ├── logconfig.py
│ ├── metadata.py
│ ├── novel_info.py
│ ├── novel_search.py
│ ├── proxy.py
│ ├── scraper.py
│ ├── soup.py
│ ├── sources.py
│ └── taskman.py
├── models
│ ├── __init__.py
│ ├── chapter.py
│ ├── formats.py
│ ├── meta.py
│ ├── novel.py
│ ├── search_result.py
│ ├── session.py
│ └── volume.py
├── templates
│ ├── __init__.py
│ ├── browser
│ │ ├── __init__.py
│ │ ├── basic.py
│ │ ├── chapter_only.py
│ │ ├── general.py
│ │ ├── login.py
│ │ ├── optional_volume.py
│ │ ├── searchable.py
│ │ └── with_volume.py
│ ├── madara.py
│ ├── mangastream.py
│ ├── novelfull.py
│ ├── novelmtl.py
│ ├── novelpub.py
│ └── soup
│ │ ├── __init__.py
│ │ ├── chapter_only.py
│ │ ├── general.py
│ │ ├── optional_volume.py
│ │ ├── searchable.py
│ │ └── with_volume.py
├── utils
│ ├── __init__.py
│ ├── common.py
│ ├── imgen.py
│ ├── kindlegen_download.py
│ ├── material_colors.py
│ ├── pbincli.py
│ ├── platforms.py
│ ├── ratelimit.py
│ ├── sockets.py
│ ├── ssl_no_verify.py
│ ├── tilings.py
│ └── uploader
│ │ ├── __init__.py
│ │ ├── anonfiles.py
│ │ ├── gofile.py
│ │ └── google_drive.py
└── webdriver
│ ├── __init__.py
│ ├── elements.py
│ ├── job_queue.py
│ ├── local.py
│ ├── remote.py
│ └── scripts.py
├── requirements-app.txt
├── requirements-bot.txt
├── requirements-dev.txt
├── requirements.txt
├── res
├── lncrawl-icon.png
├── lncrawl-web.png
└── lncrawl.ico
├── scripts
├── Dockerfile
├── bitanon.sh
├── build.bat
├── build.sh
├── check_sources.py
├── entry_point.sh
├── index_gen.py
├── lint.bat
├── lint.sh
├── lncrawl.service
├── publish.bat
├── publish.sh
├── push_tag.bat
├── push_tag.sh
├── push_tag_force.bat
├── push_tag_force.sh
├── rebrandly.sh
├── start.sh
└── stop.sh
├── setup.cfg
├── setup.py
├── setup_pyi.py
└── sources
├── __init__.py
├── _examples
├── _00_basic.py
├── _01_general_soup.py
├── _02_searchable_soup.py
├── _03_chapter_only_soup.py
├── _04_searchable_chapter_only_soup.py
├── _05_with_volume_soup.py
├── _06_searchable_with_volume_soup.py
├── _07_optional_volume_soup.py
├── _08_searchable_optional_volume_soup.py
├── _09_basic_browser.py
├── _10_general_browser.py
├── _11_searchable_browser.py
├── _12_chapter_only_browser.py
├── _13_searchable_chapter_only_browser.py
├── _14_with_volume_browser.py
├── _15_searchable_with_volume_browser.py
├── _16_optional_volume_browser.py
└── _17_searchable_optional_volume_browser.py
├── _index.json
├── _rejected.json
├── ar
├── arnovel.py
├── kolnovel.py
└── rewayatclub.py
├── en
├── 1
│ └── 1stkissnovel.py
├── 4
│ └── 4scanlation.py
├── 8
│ ├── 888novel.py
│ └── 88tang.py
├── a
│ ├── allnovel.py
│ ├── allnovelfull.py
│ ├── americanfaux.py
│ ├── amnesiactl.py
│ ├── ancientheartloss.py
│ ├── anythingnovel.py
│ ├── aquamanga.py
│ ├── arangscans.py
│ ├── arcanetranslations.py
│ ├── asadatrans.py
│ ├── asianhobbyist.py
│ ├── asianovel.py
│ ├── asianovel_net.py
│ └── automtl.py
├── b
│ ├── babelnovel.py
│ ├── bakapervert.py
│ ├── bato.py
│ ├── beautymanga.py
│ ├── bestlightnovel.py
│ ├── blackboxtl.py
│ ├── bonnovel.py
│ ├── booknet.py
│ ├── boxnovel.py
│ ├── boxnovelcom.py
│ ├── boxnovelonline.py
│ ├── boxnovelorg.py
│ └── bronovel.py
├── c
│ ├── centinni.py
│ ├── chereads.py
│ ├── chickengege.py
│ ├── chrysanthemumgarden.py
│ ├── ckandawrites.online.py
│ ├── clicknovel.py
│ ├── coffeemanga.py
│ ├── creativenovels.py
│ ├── crescentmoon.py
│ └── fu_kemao.py
├── d
│ ├── daonovel.py
│ ├── daotranslate.py
│ ├── demontrans.py
│ ├── divinedaolibrary.py
│ ├── dmtrans.py
│ ├── dobelyuwai.py
│ ├── dragon_tea.py
│ ├── dsrealmtrans.py
│ └── dummynovels.py
├── e
│ ├── ebotnovel.py
│ ├── engnovel.py
│ └── exiledrebels.py
├── f
│ ├── fanfiction.py
│ ├── fanmtl.py
│ ├── fanstrans.py
│ ├── fantasyworldonline.py
│ ├── faqwiki.py
│ ├── fenrirealm.py
│ ├── fenrirtranslations.py
│ ├── fictionpress.py
│ ├── flyinglines.py
│ ├── foxteller.py
│ ├── freefullnovel.py
│ ├── freelightnovel.py
│ ├── freemanga.py
│ ├── freewebnovel.py
│ ├── fringecapybara.py
│ ├── fsapk.py
│ ├── fujitrans.py
│ ├── fullnovellive.py
│ └── fuyuneko.py
├── g
│ ├── genesistls.py
│ └── gravitytales.py
├── h
│ ├── hanyunovels.py
│ ├── harimanga.py
│ ├── hostednovel.py
│ ├── hotnovelfull.py
│ └── hui3r.py
├── i
│ ├── imperfectcomic.py
│ ├── inadequatetrans.py
│ ├── infinitetrans.py
│ ├── inkitt.py
│ ├── instadoses.py
│ ├── isekaiscan.py
│ ├── isekaiscaneu.py
│ ├── isotls.py
│ └── snowycodex.py
├── j
│ ├── jpmtl.py
│ └── justatrans.py
├── k
│ ├── katreadingcafe.py
│ ├── kingmanga.py
│ ├── kissmanga.py
│ ├── kissnovel.py
│ ├── kitenovel.py
│ ├── kolnovelnewsite.py
│ └── koreanmtl.py
├── l
│ ├── ladybirdtrans.py
│ ├── latestnovel.py
│ ├── lazygirltranslations.py
│ ├── leafstudio.py
│ ├── lemontree.py
│ ├── librarynovel.py
│ ├── lightnovelbastion.py
│ ├── lightnovelheaven.py
│ ├── lightnovelkiss.py
│ ├── lightnovelme.py
│ ├── lightnovelmeta.py
│ ├── lightnovelonline.py
│ ├── lightnovelpub.py
│ ├── lightnovelreader.py
│ ├── lightnovelshub.py
│ ├── lightnovelsonl.py
│ ├── lightnoveltv.py
│ ├── lightnovelworld.com.py
│ ├── lightnovelworld.py
│ ├── lightnovetrans.py
│ ├── listnovel.py
│ ├── literotica.py
│ ├── lnmtl.py
│ ├── ltnovel.py
│ ├── luminarynovels.py
│ └── lunarletters.py
├── m
│ ├── machinetransorg.py
│ ├── manga-tx.py
│ ├── mangabuddy.py
│ ├── mangachilllove.py
│ ├── mangaread.py
│ ├── mangarockteam.py
│ ├── mangarosie.py
│ ├── mangastic.py
│ ├── mangatoon.py
│ ├── mangatx.py
│ ├── mangaweebs.py
│ ├── manhuaplus.py
│ ├── manhwachill.py
│ ├── meownovel.py
│ ├── miraslation.py
│ ├── mixednovel.py
│ ├── mltnovels.py
│ ├── mostnovel.py
│ ├── mtlednovels.py
│ ├── mtlnation.py
│ ├── mtlreader.py
│ ├── myboxnovel.py
│ ├── mydramanovel.py
│ ├── myoniyonitrans.py
│ └── mysticalmerries.py
├── n
│ ├── neosekaitranslations.py
│ ├── newnovelorg.py
│ ├── newsnovel.py
│ ├── noblemtl.py
│ ├── noobchan.py
│ ├── novel-bin.net.py
│ ├── novel-bin.py
│ ├── novel27.py
│ ├── novel35.py
│ ├── novelall.py
│ ├── novelbin.net.py
│ ├── novelbin.py
│ ├── novelcake.py
│ ├── novelcool.py
│ ├── novelcrush.py
│ ├── novelfull.py
│ ├── novelfullme.py
│ ├── novelfullplus.py
│ ├── novelgate.py
│ ├── novelhall.py
│ ├── novelhard.py
│ ├── novelhi.py
│ ├── novelhulk.py
│ ├── novelhunters.py
│ ├── novelight.py
│ ├── novelmao.py
│ ├── novelmic.py
│ ├── novelmt.py
│ ├── novelmtl.py
│ ├── novelmultiverse.py
│ ├── novelnext.py
│ ├── novelnextz.py
│ ├── novelonlinefree.py
│ ├── novelonlinefull.py
│ ├── novelpassion.py
│ ├── novelplanet.py
│ ├── novelpub.py
│ ├── novelrare.py
│ ├── novelraw.py
│ ├── novelsala.py
│ ├── novelsemperor.py
│ ├── novelsite.py
│ ├── novelsonline.py
│ ├── novelspl.py
│ ├── novelspread.py
│ ├── novelsrock.py
│ ├── noveltranslate.py
│ ├── noveluniverse.py
│ ├── novelupdatescc.py
│ ├── novelv.py
│ ├── novelww.py
│ ├── novelzec.py
│ ├── novlove.py
│ └── nyxtranslation.py
├── o
│ ├── omgnovels.py
│ ├── oppatrans.py
│ ├── oppatranslations.py
│ ├── ornovel.py
│ └── overabook.py
├── p
│ ├── pandamanga.py
│ ├── pandanovelco.py
│ ├── pandanovelorg.py
│ ├── peryinfo.py
│ ├── pianmanga.py
│ └── puretl.py
├── q
│ └── qidianunderground.py
├── r
│ ├── raeitranslations.py
│ ├── randomnovel.py
│ ├── ranobes.py
│ ├── readlightnovelcc.py
│ ├── readlightnovelorg.py
│ ├── readlightnovelsnet.py
│ ├── readmanganato.py
│ ├── readmtl.py
│ ├── readnovelfull.py
│ ├── readnovelz.py
│ ├── readonlinenovels.py
│ ├── readwebnovels.py
│ ├── readwn.py
│ ├── reaperscans.py
│ ├── rebirthonline.py
│ ├── reincarnationpalace.py
│ ├── relibrary.py
│ ├── royalroad.py
│ └── rpgnovels.py
├── s
│ ├── scribblehub.py
│ ├── secondlifetranslations.py
│ ├── shalvation.py
│ ├── shanghaifantasy.py
│ ├── shinsori.py
│ ├── skydemonorder.py
│ ├── skynovel.py
│ ├── sleepytrans.py
│ ├── smnovels.py
│ ├── sonicmtl.py
│ ├── steambun.py
│ ├── supernovel.py
│ └── systemtranslation.py
├── t
│ ├── tamagotl.py
│ ├── tapread.py
│ ├── teanovel.py
│ ├── tigertranslations.py
│ ├── tipnovel.py
│ ├── tomotrans.py
│ ├── toonily.py
│ ├── topmanhua.py
│ ├── totallytranslations.py
│ ├── translateindo.py
│ ├── travistranslations.py
│ └── tunovelaligera.py
├── u
│ └── usefulnovel.py
├── v
│ ├── veratales.py
│ ├── viewnovel.py
│ ├── vipnovel.py
│ ├── virlyce.py
│ ├── vistrans.py
│ └── volarenovels.py
├── w
│ ├── wanderinginn.py
│ ├── webnovel.py
│ ├── webnovelonlinecom.py
│ ├── webnovelonlinenet.py
│ ├── webnovelpub.py
│ ├── webtoon.py
│ ├── whatsawhizzerwebnovels.py
│ ├── whitemoonlightnovels.py
│ ├── wnmtl.py
│ ├── wondernovels.py
│ ├── woopread.py
│ ├── wordexcerpt.py
│ ├── wordrain.py
│ ├── writerupdates.py
│ ├── wspadancewichita.py
│ ├── wujizun.py
│ ├── wuxiablog.py
│ ├── wuxiabox.py
│ ├── wuxiacity.py
│ ├── wuxiaclick.py
│ ├── wuxiaco.py
│ ├── wuxiacom.py
│ ├── wuxiahub.py
│ ├── wuxialeague.py
│ ├── wuxiamtl.py
│ ├── wuxianovelhub.py
│ ├── wuxiaonline.py
│ ├── wuxiapub.py
│ ├── wuxiar.py
│ ├── wuxiasite.py
│ ├── wuxiaspot.py
│ ├── wuxiau.py
│ ├── wuxiav.py
│ ├── wuxiaworldio.py
│ ├── wuxiaworldlive.py
│ ├── wuxiaworldsite.py
│ ├── wuxiax.py
│ └── wuxiaz.py
├── x
│ └── xiainovel.py
└── z
│ ├── zenithnovels.py
│ ├── zetrotranslation.py
│ ├── zinmanga.py
│ └── zinnovel.py
├── es
├── domentranslations.py
└── novelasligeras.py
├── fr
├── animesama.py
├── chireads.py
├── lightnovelfr.py
├── lnmtlfr.py
├── noveldeglace.py
└── xiaowaz.py
├── id
├── darktrans.py
├── grensia_blogspot.py
├── idqidian.py
├── indomtl.py
├── indowebnovel.py
├── meionovel.py
├── morenovel.py
├── novelgo.py
├── novelku.py
├── novelringan.py
├── noveltoon.py
├── wbnovel.py
├── webnovelindonesia.py
├── webnovelover.py
├── worldnovelonline.py
├── yukinovel.py
└── zhiend.py
├── jp
└── s
│ └── syosetu.py
├── multi
├── foxaholic.py
├── mtlnovel.py
├── novelupdates.py
├── quotev.py
├── wattpad.py
├── webfic.py
└── wtrlab.py
├── pt
├── blnovels.py
├── centralnovel.py
└── ceunovel.py
├── ru
├── bestmanga.py
├── ifreedom.py
├── jaomix.py
├── litnet.py
├── ranobelib.py
├── ranobenovel.py
├── renovels.py
└── rulate.py
├── tr
└── fenrirscan.py
├── vi
├── lnhakone.py
└── truenfull.py
└── zh
├── 27k.py
├── 69shuba.cx.py
├── 69shuba.py
├── daocaorenshuwu.py
├── ddxsss.py
├── ixdzs.py
├── novel543.py
├── piaotian.py
├── powanjuan.py
├── shw5.py
├── soxs.py
├── trxs.py
├── uukanshu.py
├── uukanshu_sj.py
├── xbanxia.py
└── xnunu.py
/.editorconfig:
--------------------------------------------------------------------------------
1 | # EditorConfig is awesome: https://editorconfig.org/
2 |
3 | # top-most EditorConfig file
4 | root = true
5 |
6 | # Unix-style newlines with a newline ending every file
7 | [*]
8 | end_of_line = lf
9 | insert_final_newline = true
10 |
11 | # Set default charset
12 | [*.{js,py}]
13 | charset = utf-8
14 |
15 | [*.py]
16 | indent_style = space
17 | indent_size = 4
18 |
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # Values should be the module names inside the `lncrawl/bots` folder (e.g. console, discord, telegram).
2 | # By default, the console bot will be chosen if this is left empty or invalid.
3 | BOT=console
4 |
5 | # Available levels: NOTSET, WARN, INFO, DEBUG, FATAL, ERROR
6 | # If this variable is unset or NONE, logging will not be configured.
7 | LOG_LEVEL=INFO
8 |
9 | # Configs for bots
10 | TELEGRAM_TOKEN=
11 | DISCORD_TOKEN=
12 | DISCORD_DISABLE_SEARCH=false
13 | DISCORD_SIGNAL_CHAR=!
14 |
15 | # Cloud drives for upload | Options: [ANONFILES, GOFILE, GOOGLE_DRIVE] | Default: ANONFILES
16 | CLOUD_DRIVE=ANONFILES
17 |
18 | # Google Drive Config
19 | GOOGLE_DRIVE_CREDENTIAL_FILE=mycreds.txt
20 | GOOGLE_DRIVE_FOLDER_ID=118iN1jzavVV-9flrLPZo7DOi0cuxrQ5F
21 |
22 | # Password for VNC server
23 | VNC_PASSWORD=secret
24 |
25 | # Server Config
26 | SERVER_SECRET=
27 | SERVER_ADMIN_EMAIL=
28 | SERVER_ADMIN_PASSWORD=
29 | RUNNER_INTERVAL_IN_SECOND=10
30 | DATABASE_URL=sqlite:///.server/sqlite.db
31 |
--------------------------------------------------------------------------------
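Note: a minimal sketch of how the BOT value above is picked up at startup; it mirrors the lookup in lncrawl/core/__init__.py (included later in this dump), where an empty or unknown value falls back to the console bot.

import os

# Sketch only: the real dispatch happens in lncrawl/core/__init__.py via run_bot()
bot = os.getenv("BOT", "").lower()
# run_bot(bot)  # empty or unknown values fall back to the console bot
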
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto eol=lf
2 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 |
3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4 | patreon: # Replace with a single Patreon username
5 | open_collective: # Replace with a single Open Collective username
6 | ko_fi: # Replace with a single Ko-fi username
7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
13 | custom: ['https://paypal.me/sd1pu']
14 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/1_general.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: General
3 | about: Create a general issue
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/2_new-source.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Request New Source
3 | about: Want to request a new source that is not yet listed in the README.md?
4 | title: Replace this with a URL
5 | labels: source
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | - Language:
19 | - Supports Searching: Yes/No
20 | - Contains Machine Translations: Yes/No
21 | - Contains Manga/Manhua/Manhwa: Yes/No
22 | - Has CloudFlare Protection: Yes/No
23 |
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/3_source-issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Source Not Working
3 | about: Having trouble with a specific source? (e.g. failing to crawl or missing chapters or content)
4 | title: Fix this source
5 | labels: source-issue
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
12 | ## Let us know
13 |
14 |
15 |
16 | **Novel URL**:
17 |
18 |
19 |
--------------------------------------------------------------------------------
/lncrawl/assets/epub/style.css:
--------------------------------------------------------------------------------
1 | img {
2 | width: 100%;
3 | object-fit: scale-down;
4 | object-position: center;
5 | }
6 |
7 | p + br {
8 | display: none;
9 | }
10 |
11 | #intro {
12 | width: 100vw;
13 | height: calc(100% - 30px);
14 | text-align: center;
15 | position: relative;
16 | display: flex;
17 | flex-direction: column;
18 | align-items: center;
19 | justify-content: space-between;
20 | text-align: center;
21 |   letter-spacing: 0.25px;
22 | }
23 |
24 | #intro .header {
25 | height: 200px;
26 | }
27 |
28 | #intro h1 {
29 | opacity: 1;
30 | }
31 | #intro h3 {
32 | opacity: 0.6;
33 | }
34 |
35 | #intro img {
36 | width: 100%;
37 | height: calc(100% - 300px);
38 | object-fit: contain;
39 | object-position: center;
40 | }
41 |
42 | #intro .footer {
43 | height: 50px;
44 | line-height: 24px;
45 | opacity: 0.8;
46 | }
47 |
48 | #cover {
49 | object-fit: cover;
50 | }
51 |
52 | #volume {
53 | width: 100%;
54 | height: 100%;
55 | display: flex;
56 | text-align: center;
57 | align-items: center;
58 | justify-content: center;
59 | }
60 |
--------------------------------------------------------------------------------
/lncrawl/assets/version.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | ROOT = Path(__file__).parent.parent
4 |
5 | with open(str(ROOT / "VERSION"), "r", encoding="utf8") as f:
6 | version = f.read().strip()
7 |
8 |
9 | def get_version():
10 | return version
11 |
--------------------------------------------------------------------------------
/lncrawl/assets/web/__init__.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | ROOT = Path(__file__).parent
4 |
5 |
6 | def get_js_script():
7 | with open(str(ROOT / "script.js"), "r", encoding="utf8") as f:
8 | script = f.read()
9 | return script
10 |
11 |
12 | def get_css_style():
13 | with open(str(ROOT / "style.css"), "r", encoding="utf8") as f:
14 | style = f.read()
15 | return style
16 |
--------------------------------------------------------------------------------
/lncrawl/binders/json.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from pathlib import Path
3 | from typing import Generator
4 |
5 | logger = logging.getLogger(__name__)
6 |
7 |
8 | def make_jsons(app, data) -> Generator[str, None, None]:
9 | root_path = Path(app.output_path)
10 | yield str(root_path / 'meta.json')
11 | for vol in data:
12 | for chap in data[vol]:
13 | file_name = "%s.json" % str(chap["id"]).rjust(5, "0")
14 | file_path = root_path / "json" / file_name
15 | if file_path.is_file():
16 | yield str(file_path)
17 |
--------------------------------------------------------------------------------
/lncrawl/binders/text.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import re
4 | from typing import Generator
5 |
6 | from bs4 import BeautifulSoup
7 |
8 | from ..assets.chars import Chars
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def make_texts(app, data) -> Generator[str, None, None]:
14 | for vol in data:
15 | dir_name = os.path.join(app.output_path, "text", vol)
16 | os.makedirs(dir_name, exist_ok=True)
17 | for chap in data[vol]:
18 | if not chap.get("body"):
19 | continue
20 | file_name = "%s.txt" % str(chap["id"]).rjust(5, "0")
21 | file_name = os.path.join(dir_name, file_name)
22 | with open(file_name, "w", encoding="utf8") as file:
23 |                 body = chap["body"].replace("
--------------------------------------------------------------------------------
/lncrawl/bots/console/__init__.py:
--------------------------------------------------------------------------------
10 |     def __init__(self) -> None:
11 | self.app: Optional[App] = None
12 | self.search_mode = False
13 |
14 | from .get_crawler import (
15 | choose_a_novel,
16 | confirm_guessed_novel,
17 | confirm_retry,
18 | get_crawlers_to_search,
19 | get_novel_url,
20 | )
21 | from .integration import process_chapter_range, start
22 | from .login_info import get_login_info
23 | from .output_style import (
24 | force_replace_old,
25 | get_output_formats,
26 | get_output_path,
27 | should_pack_by_volume,
28 | )
29 | from .range_selection import (
30 | get_range_from_chapters,
31 | get_range_from_volumes,
32 | get_range_selection,
33 | get_range_using_index,
34 | get_range_using_urls,
35 | )
36 |
--------------------------------------------------------------------------------
/lncrawl/bots/console/login_info.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Tuple
2 |
3 | from questionary import prompt
4 |
5 | from ...core.arguments import get_args
6 |
7 |
8 | def get_login_info(self) -> Optional[Tuple[str, str]]:
9 | """Returns the (email, password) pair for login"""
10 | args = get_args()
11 |
12 | if args.login:
13 | return args.login
14 |
15 | if args.suppress:
16 | return None
17 |
18 | answer = prompt(
19 | [
20 | {
21 | "type": "confirm",
22 | "name": "login",
23 | "message": "Do you want to log in?",
24 | "default": False,
25 | },
26 | ]
27 | )
28 |
29 | if answer["login"]:
30 | answer = prompt(
31 | [
32 | {
33 | "type": "input",
34 | "name": "email",
35 | "message": "User/Email:",
36 | "validate": lambda a: True
37 | if a
38 |                     else "User/Email should not be empty",
39 | },
40 | {
41 | "type": "password",
42 | "name": "password",
43 | "message": "Password:",
44 | "validate": lambda a: True
45 | if a
46 |                     else "Password should not be empty",
47 | },
48 | ]
49 | )
50 | return answer["email"], answer["password"]
51 |
52 | return None
53 |
--------------------------------------------------------------------------------
/lncrawl/bots/console/open_folder_prompt.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from questionary import prompt
4 |
5 | from ...utils.platforms import Platform
6 | from ...core.arguments import get_args
7 |
8 |
9 | def display_open_folder(folder_path: str):
10 | args = get_args()
11 |
12 | if args.suppress:
13 | return
14 | if Platform.java or Platform.docker:
15 | return
16 |
17 | answer = prompt(
18 | [
19 | {
20 | "type": "confirm",
21 | "name": "exit",
22 | "message": "Open the output folder?",
23 | "default": True,
24 | },
25 | ]
26 | )
27 |
28 | if not answer["exit"]:
29 | return
30 |
31 | if Platform.windows:
32 | os.system(f'explorer.exe "{folder_path}"')
33 | elif Platform.wsl:
34 | os.system(f'cd "{folder_path}" && explorer.exe .')
35 | elif Platform.linux:
36 | os.system(f'xdg-open "{folder_path}"')
37 | elif Platform.mac:
38 | os.system(f'open "{folder_path}"')
39 | else:
40 | print(f"Output Folder: {folder_path}")
41 |
--------------------------------------------------------------------------------
/lncrawl/bots/discord/__init__.py:
--------------------------------------------------------------------------------
1 | from . import config
2 | from .discord_bot import DiscordBot
3 |
4 | __all__ = ["config", "DiscordBot"]
5 |
--------------------------------------------------------------------------------
/lncrawl/bots/lookup/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import re
3 | from urllib.parse import urlparse
4 |
5 | from slugify import slugify
6 |
7 | from ...core.sources import sources_path
8 | from .analyze import analyze_url
9 | from .generator import generate_crawler
10 | from .prompts import get_features, get_novel_url
11 |
12 |
13 | class LookupBot:
14 | log = logging.getLogger(__name__)
15 |
16 | def __init__(self) -> None:
17 | pass
18 |
19 | def start(self) -> None:
20 | novel_url = get_novel_url()
21 |
22 | _parsed = urlparse(novel_url)
23 | base_url = "%s://%s/" % (_parsed.scheme, _parsed.hostname)
24 | name = re.sub(r"(^www\.)|(\.com$)", "", _parsed.hostname)
25 |
26 | template = analyze_url(base_url, novel_url)
27 |
28 | features = get_features()
29 | language = features["language"] or "multi"
30 | has_manga = features["has_manga"]
31 | has_mtl = features["has_mtl"]
32 |
33 | filename = name + ".py"
34 | classname = slugify(
35 | name,
36 | max_length=20,
37 | separator="_",
38 | lowercase=True,
39 | word_boundary=True,
40 | ).title()
41 |
42 | folder = sources_path / language
43 | if language == "en":
44 | folder = folder / filename[0]
45 | filename = str(folder / filename)
46 |
47 | generate_crawler(
48 | template,
49 | output_file=filename,
50 | classname=classname,
51 | base_url=base_url,
52 | has_manga=has_manga,
53 | has_mtl=has_mtl,
54 | )
55 |
--------------------------------------------------------------------------------
/lncrawl/bots/lookup/generator.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Type
3 |
4 | from colorama import Style
5 |
6 | from ...assets.chars import Chars
7 | from ...core.crawler import Crawler
8 | from ...core.exeptions import LNException
9 |
10 |
11 | def generate_crawler(
12 | template: Type[Crawler],
13 | output_file: str,
14 | classname: str,
15 | base_url: str,
16 | has_manga: bool,
17 | has_mtl: bool,
18 | ):
19 | if os.path.exists(output_file):
20 | raise LNException(f"File exists: {output_file}")
21 |
22 | lines = [
23 | "import logging",
24 | "",
25 | f"from {template.__module__} import {template.__name__}",
26 | "",
27 | "logger = logging.getLogger(__name__)",
28 | "",
29 | "",
30 | f"class {classname}({template.__name__}):",
31 | f" has_mtl = {bool(has_mtl)}",
32 | f" has_manga = {bool(has_manga)}",
33 | f' base_url = ["{base_url}"]',
34 | "",
35 | ]
36 | with open(output_file, "w", encoding="utf-8") as f:
37 | f.write("\n".join(lines))
38 |
39 | print()
40 | print(
41 | Style.BRIGHT + Chars.PARTY,
42 | "Generated source file",
43 | Chars.PARTY + Style.RESET_ALL,
44 | )
45 | print(Chars.RIGHT_ARROW, output_file)
46 | print()
47 |
--------------------------------------------------------------------------------
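For illustration, a file produced by generate_crawler() looks roughly like the sketch below; the template module, class name, and URL are hypothetical placeholders, not a real source.

import logging

# Hypothetical template import; the actual module/class depends on analyze_url()'s result
from lncrawl.templates.soup.searchable import SearchableSoupTemplate

logger = logging.getLogger(__name__)


class Examplenovel(SearchableSoupTemplate):
    has_mtl = False
    has_manga = False
    base_url = ["https://examplenovel.com/"]
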
/lncrawl/bots/lookup/prompts.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from questionary import prompt
4 |
5 | from ...assets.languages import language_codes
6 | from ...core.arguments import get_args
7 | from ...core.exeptions import LNException
8 |
9 |
10 | def get_novel_url():
11 | """Return a novel page url"""
12 | args = get_args()
13 | url = args.novel_page
14 |
15 | if url:
16 | if re.match(r"^https?://.+\..+$", url):
17 | return url
18 | else:
19 | raise LNException("Invalid URL of novel page")
20 |
21 | try:
22 | answer = prompt(
23 | [
24 | {
25 | "type": "input",
26 | "name": "novel",
27 | "message": "Enter novel page url:",
28 | "validate": lambda x: (
29 | True
30 | if re.match(r"^https?://.+\..+$", x)
31 | else "Invalid URL of novel page"
32 | ),
33 | },
34 | ]
35 | )
36 | return answer["novel"].strip()
37 | except Exception:
38 | raise LNException("Novel page url or query was not given")
39 |
40 |
41 | def get_features():
42 | """Return the feature list for the crawler"""
43 | answer = prompt(
44 | [
45 | {
46 | "type": "autocomplete",
47 | "name": "language",
48 | "message": "Enter language:",
49 | "choices": list(sorted(language_codes.keys())),
50 | },
51 | {
52 | "type": "confirm",
53 | "name": "has_manga",
54 | "message": "Does it contain Manga/Manhua/Manhwa?",
55 | "default": False,
56 | },
57 | {
58 | "type": "confirm",
59 | "name": "has_mtl",
60 | "message": "Does it contain Machine Translations?",
61 | "default": False,
62 | },
63 | ]
64 | )
65 | return answer
66 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | import uvicorn
4 |
5 | from ...core.arguments import get_args
6 | from .app import app
7 | from .context import ServerContext
8 |
9 |
10 | class ServerBot:
11 | log = logging.getLogger(__name__)
12 |
13 | def start(self):
14 | args = get_args()
15 |
16 | ctx = ServerContext()
17 | ctx.db.prepare()
18 | ctx.users.prepare()
19 | ctx.scheduler.start()
20 |
21 | uvicorn.run(
22 | app,
23 | log_level=logging.DEBUG,
24 | port=args.server_port or 8080,
25 | host=args.server_host or '0.0.0.0',
26 | )
27 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/api/__init__.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, Depends
2 |
3 | from ..security import ensure_admin, ensure_login
4 | from .artifacts import router as artifact
5 | from .auth import router as auth
6 | from .jobs import router as job
7 | from .novels import router as novel
8 | from .runner import router as runner
9 | from .users import router as user
10 |
11 | router = APIRouter()
12 |
13 | router.include_router(
14 | auth,
15 | prefix='/auth',
16 | tags=['Auth'],
17 | )
18 |
19 | router.include_router(
20 | user,
21 | prefix='/user',
22 | tags=['Users'],
23 | dependencies=[Depends(ensure_admin)],
24 | )
25 |
26 | router.include_router(
27 | job,
28 | prefix='/job',
29 | tags=['Jobs'],
30 | dependencies=[Depends(ensure_login)],
31 | )
32 |
33 | router.include_router(
34 | novel,
35 | prefix='/novel',
36 | tags=['Novels'],
37 | dependencies=[Depends(ensure_login)],
38 | )
39 |
40 | router.include_router(
41 | artifact,
42 | prefix='/artifact',
43 | tags=['Artifacts'],
44 | dependencies=[Depends(ensure_login)],
45 | )
46 |
47 | router.include_router(
48 | runner,
49 | prefix='/runner',
50 | tags=['Runner'],
51 | dependencies=[Depends(ensure_admin)],
52 | )
53 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/api/artifacts.py:
--------------------------------------------------------------------------------
1 | import mimetypes
2 | import os
3 | from typing import Optional
4 |
5 | from fastapi import APIRouter, Depends, Path, Query
6 | from fastapi.responses import FileResponse
7 |
8 | from ..context import ServerContext
9 | from ..exceptions import AppErrors
10 |
11 | # The root router
12 | router = APIRouter()
13 |
14 |
15 | @router.get("s", summary='Returns a list of artifacts')  # path "s" resolves to /artifacts under the /artifact prefix
16 | def list_artifacts(
17 | ctx: ServerContext = Depends(),
18 | offset: int = Query(default=0),
19 | limit: int = Query(default=20, le=100),
20 | novel_id: Optional[str] = Query(default=None),
21 | ):
22 | return ctx.artifacts.list(
23 | limit=limit,
24 | offset=offset,
25 | novel_id=novel_id,
26 | )
27 |
28 |
29 | @router.get("/{artifact_id}", summary='Returns an artifact')
30 | def get_artifact(
31 | artifact_id: str = Path(),
32 | ctx: ServerContext = Depends(),
33 | ):
34 | return ctx.artifacts.get(artifact_id)
35 |
36 |
37 | @router.get("/{artifact_id}/download", summary='Download artifact file')
38 | def download_artifact(
39 | artifact_id: str = Path(),
40 | ctx: ServerContext = Depends(),
41 | ):
42 | artifact = ctx.artifacts.get(artifact_id)
43 | file_path = artifact.output_file
44 | if not file_path:
45 | raise AppErrors.no_artifact_file
46 |
47 | media_type, _ = mimetypes.guess_type(file_path)
48 | return FileResponse(
49 | path=file_path,
50 | filename=os.path.basename(file_path),
51 | media_type=media_type or "application/octet-stream",
52 | )
53 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/api/auth.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, Body, Depends
2 |
3 | from ..context import ServerContext
4 | from ..models.user import (CreateRequest, LoginRequest, LoginResponse,
5 | SignupRequest, UpdateRequest, User)
6 | from ..security import ensure_user
7 |
8 | # The root router
9 | router = APIRouter()
10 |
11 |
12 | @router.post("/login", summary="Login with username or email and password")
13 | def login(
14 | ctx: ServerContext = Depends(),
15 | credentials: LoginRequest = Body(
16 | default=...,
17 | description='The login credentials',
18 | ),
19 | ):
20 | user = ctx.users.verify(credentials)
21 | token = ctx.users.generate_token(user.id)
22 | return LoginResponse(token=token, user=user)
23 |
24 |
25 | @router.post('/signup', summary='Signup as a new user')
26 | def signup(
27 | ctx: ServerContext = Depends(),
28 | body: SignupRequest = Body(
29 | default=...,
30 | description='The signup request',
31 | ),
32 | ):
33 | request = CreateRequest(
34 | password=body.password,
35 | email=body.email,
36 | name=body.name,
37 | )
38 | user = ctx.users.create(request)
39 | token = ctx.users.generate_token(user.id)
40 | return LoginResponse(token=token, user=user)
41 |
42 |
43 | @router.get('/me', summary='Get current user details')
44 | def me(
45 | user: User = Depends(ensure_user),
46 | ):
47 | return user
48 |
49 |
50 | @router.put('/me/update', summary='Update current user details')
51 | def self_update(
52 | ctx: ServerContext = Depends(),
53 | user: User = Depends(ensure_user),
54 | body: UpdateRequest = Body(
55 | default=...,
56 |         description='The update request',
57 | ),
58 | ):
59 | body.role = None
60 | body.tier = None
61 | body.is_active = None
62 | return ctx.users.update(user.id, body)
63 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/api/novels.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, Depends, Path, Query
2 |
3 | from ..context import ServerContext
4 |
5 | # The root router
6 | router = APIRouter()
7 |
8 |
9 | @router.get("s", summary='Returns a list of novels')  # path "s" resolves to /novels under the /novel prefix
10 | def list_novels(
11 | ctx: ServerContext = Depends(),
12 | offset: int = Query(default=0),
13 | limit: int = Query(default=20, le=100),
14 | with_orphans: bool = Query(default=False),
15 | ):
16 | return ctx.novels.list(
17 | limit=limit,
18 | offset=offset,
19 | with_orphans=with_orphans,
20 | )
21 |
22 |
23 | @router.get("/{novel_id}", summary='Returns a novel')
24 | def get_novel(
25 | novel_id: str = Path(),
26 | ctx: ServerContext = Depends(),
27 | ):
28 | return ctx.novels.get(novel_id)
29 |
30 |
31 | @router.get("/{novel_id}/artifacts", summary='Returns cached artifacts')
32 | def get_novel_artifacts(
33 | novel_id: str = Path(),
34 | ctx: ServerContext = Depends(),
35 | ):
36 | return ctx.novels.get_artifacts(novel_id)
37 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/api/runner.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, Depends
2 |
3 | from ..context import ServerContext
4 |
5 | # The root router
6 | router = APIRouter()
7 |
8 |
9 | @router.get("/start", summary='Start the runner')
10 | def start(ctx: ServerContext = Depends()):
11 | ctx.scheduler.start()
12 |
13 |
14 | @router.get("/stop", summary='Stops the runner')
15 | def stop(ctx: ServerContext = Depends()):
16 | ctx.scheduler.close()
17 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/api/users.py:
--------------------------------------------------------------------------------
1 | from fastapi import APIRouter, Body, Depends, Path, Query
2 |
3 | from ..context import ServerContext
4 | from ..exceptions import AppErrors
5 | from ..models.user import CreateRequest, UpdateRequest, User
6 | from ..security import ensure_user
7 |
8 | # The root router
9 | router = APIRouter()
10 |
11 |
12 | @router.get('s', summary='Get list of all users')  # path "s" resolves to /users under the /user prefix
13 | def all_users(
14 | ctx: ServerContext = Depends(),
15 | offset: int = Query(default=0),
16 | limit: int = Query(default=20, le=100),
17 | ):
18 | return ctx.users.list(offset, limit)
19 |
20 |
21 | @router.post('', summary='Create a user')
22 | def create_user(
23 | ctx: ServerContext = Depends(),
24 | body: CreateRequest = Body(
25 | default=...,
26 |         description='The create request',
27 | ),
28 | ):
29 | return ctx.users.create(body)
30 |
31 |
32 | @router.get('/{user_id}', summary='Get the user')
33 | def get_user(
34 | ctx: ServerContext = Depends(),
35 | user_id: str = Path(),
36 | ):
37 | return ctx.users.get(user_id)
38 |
39 |
40 | @router.put('/{user_id}', summary='Update the user')
41 | def update_user(
42 | ctx: ServerContext = Depends(),
43 | user: User = Depends(ensure_user),
44 | body: UpdateRequest = Body(
45 | default=...,
46 |         description='The update request',
47 | ),
48 | user_id: str = Path(),
49 | ):
50 | if user_id == user.id:
51 | body.role = None
52 | body.is_active = None
53 | return ctx.users.update(user_id, body)
54 |
55 |
56 | @router.delete('/{user_id}', summary='Delete the user')
57 | def delete_user(
58 | user: User = Depends(ensure_user),
59 | ctx: ServerContext = Depends(),
60 | user_id: str = Path(),
61 | ):
62 | if user.id == user_id:
63 | raise AppErrors.can_not_delete_self
64 | return ctx.users.remove(user_id)
65 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/app.py:
--------------------------------------------------------------------------------
1 | import traceback
2 |
3 | from fastapi import FastAPI
4 | from fastapi.middleware.cors import CORSMiddleware
5 | from fastapi.middleware.gzip import GZipMiddleware
6 |
7 | from ...assets.version import get_version
8 |
9 | app = FastAPI(
10 | version=get_version(),
11 | title="Lightnovel Crawler",
12 | description="Download novels from online sources and generate e-books",
13 | )
14 |
15 | app.add_middleware(
16 | CORSMiddleware,
17 | allow_credentials=True,
18 | allow_origins=["*"],
19 | allow_methods=["*"],
20 | allow_headers=["*"],
21 | )
22 |
23 | app.add_middleware(
24 | GZipMiddleware,
25 | minimum_size=1000,
26 | )
27 |
28 | try:
29 | from .api import router as api
30 | app.include_router(api, prefix='/api')
31 | except ImportError:
32 | traceback.print_exc()
33 |
--------------------------------------------------------------------------------
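A sketch of exercising this app with FastAPI's TestClient; it assumes the database, admin user, and scheduler were prepared beforehand (as ServerBot.start does), and the credentials shown are placeholders.

from fastapi.testclient import TestClient

from lncrawl.bots.server.app import app

client = TestClient(app)

# Placeholder credentials; real ones come from SERVER_ADMIN_EMAIL / SERVER_ADMIN_PASSWORD
res = client.post("/api/auth/login", json={"email": "admin@example.com", "password": "secret"})
token = res.json()["token"]

# Authenticated call; ensure_login reads the "Authorization" header and strips a "Bearer " prefix
res = client.get("/api/novels", headers={"Authorization": f"Bearer {token}"})
print(res.status_code, res.json())
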
/lncrawl/bots/server/context.py:
--------------------------------------------------------------------------------
1 | from functools import cached_property
2 | from typing import Optional
3 |
4 | from .utils.decorators import autoclose
5 |
6 | _cache: Optional['ServerContext'] = None
7 |
8 |
9 | class ServerContext:
10 | def __new__(cls):
11 | global _cache
12 | if _cache is None:
13 | _cache = super().__new__(cls)
14 | return _cache
15 |
16 | @cached_property
17 | def config(self):
18 | from .config import Config
19 | return Config()
20 |
21 | @cached_property
22 | @autoclose
23 | def db(self):
24 | from .db import DB
25 | return DB(self)
26 |
27 | @cached_property
28 | def users(self):
29 | from .services.users import UserService
30 | return UserService(self)
31 |
32 | @cached_property
33 | def jobs(self):
34 | from .services.jobs import JobService
35 | return JobService(self)
36 |
37 | @cached_property
38 | def novels(self):
39 | from .services.novels import NovelService
40 | return NovelService(self)
41 |
42 | @cached_property
43 | def artifacts(self):
44 | from .services.artifacts import ArtifactService
45 | return ArtifactService(self)
46 |
47 | @cached_property
48 | @autoclose
49 | def scheduler(self):
50 | from .services.scheduler import JobScheduler
51 | return JobScheduler(self)
52 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/db.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from sqlmodel import Session, SQLModel, create_engine
4 |
5 | from .context import ServerContext
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
10 | class DB:
11 | def __init__(self, ctx: ServerContext) -> None:
12 | self.engine = create_engine(
13 | ctx.config.server.database_url,
14 | echo=logger.isEnabledFor(logging.DEBUG),
15 | )
16 |
17 | def close(self):
18 | self.engine.dispose()
19 |
20 | def prepare(self):
21 | logger.info('Creating tables')
22 | SQLModel.metadata.create_all(self.engine)
23 |
24 | def session(
25 | self, *,
26 | future: bool = True,
27 | autoflush: bool = True,
28 | autocommit: bool = False,
29 | expire_on_commit: bool = True,
30 | enable_baked_queries: bool = True,
31 | ):
32 | return Session(
33 | self.engine,
34 | future=future, # type:ignore
35 | autoflush=autoflush,
36 | autocommit=autocommit, # type:ignore
37 | expire_on_commit=expire_on_commit,
38 | enable_baked_queries=enable_baked_queries,
39 | )
40 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/bots/server/models/__init__.py
--------------------------------------------------------------------------------
/lncrawl/bots/server/models/_base.py:
--------------------------------------------------------------------------------
1 | import uuid
2 |
3 | from sqlalchemy import event
4 | from sqlmodel import Field, SQLModel
5 |
6 | from ..utils.time_utils import current_timestamp
7 |
8 |
9 | def generate_uuid():
10 | return uuid.uuid4().hex
11 |
12 |
13 | class BaseModel(SQLModel):
14 | id: str = Field(
15 | default_factory=generate_uuid,
16 | primary_key=True,
17 | description="ID"
18 | )
19 | created_at: int = Field(
20 | index=True,
21 | default_factory=current_timestamp,
22 | description="Create timestamp (ms)"
23 | )
24 | updated_at: int = Field(
25 | default_factory=current_timestamp,
26 | description="Update timestamp (ms)"
27 | )
28 |
29 |
30 | @event.listens_for(BaseModel, "before_update", propagate=True)
31 | def auto_update_timestamp(mapper, connection, target: BaseModel):
32 | target.updated_at = current_timestamp()
33 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/models/pagination.py:
--------------------------------------------------------------------------------
1 | from typing import Generic, List, TypeVar
2 |
3 | from pydantic import BaseModel
4 |
5 | T = TypeVar("T")
6 |
7 |
8 | class Paginated(BaseModel, Generic[T]):
9 | total: int
10 | offset: int
11 | limit: int
12 | items: List[T]
13 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/models/user.py:
--------------------------------------------------------------------------------
1 | from enum import Enum, IntEnum
2 | from typing import Optional
3 |
4 | from pydantic import EmailStr
5 | from sqlmodel import Field, SQLModel
6 |
7 | from ._base import BaseModel
8 |
9 |
10 | class UserRole(str, Enum):
11 | USER = "user"
12 | ADMIN = "admin"
13 |
14 |
15 | class UserTier(IntEnum):
16 | BASIC = 0
17 | PREMIUM = 1
18 | VIP = 2
19 |
20 |
21 | class User(BaseModel, table=True):
22 | password: str = Field(description="Hashed password", exclude=True)
23 | email: str = Field(unique=True, index=True, description="User Email")
24 | role: UserRole = Field(default=UserRole.USER, description="User role")
25 | is_active: bool = Field(default=True, description="Active status")
26 | name: Optional[str] = Field(default=None, description="Full name")
27 | tier: UserTier = Field(default=UserTier.BASIC, description="User tier")
28 |
29 |
30 | class LoginRequest(SQLModel):
31 | email: str = Field(description="User email")
32 | password: str = Field(description="User password")
33 |
34 |
35 | class LoginResponse(SQLModel):
36 | token: str = Field(description="The authorization token")
37 | user: User = Field(description="The user")
38 |
39 |
40 | class SignupRequest(SQLModel):
41 | email: EmailStr = Field(description="User Email")
42 | password: str = Field(description="User password")
43 | name: Optional[str] = Field(default=None, description="Full name")
44 |
45 |
46 | class CreateRequest(SignupRequest):
47 | role: UserRole = Field(default=UserRole.USER, description="User role")
48 | tier: UserTier = Field(default=UserTier.BASIC, description="User tier")
49 |
50 |
51 | class UpdateRequest(SQLModel):
52 | password: Optional[str] = Field(default=None, description="User password")
53 | name: Optional[str] = Field(default=None, description="Full name")
54 | role: Optional[UserRole] = Field(default=None, description="User role")
55 | is_active: Optional[bool] = Field(default=None, description="Active status")
56 | tier: Optional[UserTier] = Field(default=None, description="User tier")
57 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/security.py:
--------------------------------------------------------------------------------
1 | from fastapi import Depends
2 | from fastapi.security import APIKeyHeader
3 | from jose import jwt
4 |
5 | from .context import ServerContext
6 | from .exceptions import AppErrors
7 | from .models.user import User, UserRole
8 |
9 | header_scheme = APIKeyHeader(
10 | name='Authorization',
11 | scheme_name='Bearer Token',
12 | )
13 |
14 |
15 | def ensure_login(
16 | ctx: ServerContext = Depends(),
17 | token: str = Depends(header_scheme),
18 | ) -> dict:
19 | try:
20 | key = ctx.config.server.token_secret
21 | algo = ctx.config.server.token_algo
22 | if token.startswith('Bearer '):
23 | token = token[len('Bearer '):]
24 | return jwt.decode(token, key, algorithms=[algo])
25 | except Exception as e:
26 | raise AppErrors.unauthorized from e
27 |
28 |
29 | def ensure_user(
30 | ctx: ServerContext = Depends(),
31 | payload: dict = Depends(ensure_login),
32 | ) -> User:
33 | user_id = payload.get('uid')
34 | if not user_id:
35 | raise AppErrors.unauthorized
36 | user = ctx.users.get(user_id)
37 | if not user.is_active:
38 | raise AppErrors.inactive_user
39 | return user
40 |
41 |
42 | def ensure_admin(user: User = Depends(ensure_user)) -> User:
43 | if user.role != UserRole.ADMIN:
44 | raise AppErrors.forbidden
45 | return user
46 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/services/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/bots/server/services/__init__.py
--------------------------------------------------------------------------------
/lncrawl/bots/server/services/artifacts.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from sqlmodel import desc, func, select
4 |
5 | from ..context import ServerContext
6 | from ..exceptions import AppErrors
7 | from ..models.job import Artifact
8 | from ..models.pagination import Paginated
9 | from ..models.user import User, UserRole
10 |
11 |
12 | class ArtifactService:
13 | def __init__(self, ctx: ServerContext) -> None:
14 | self._ctx = ctx
15 | self._db = ctx.db
16 |
17 | def list(
18 | self,
19 | offset: int = 0,
20 | limit: int = 20,
21 | novel_id: Optional[str] = None,
22 | ) -> Paginated[Artifact]:
23 | with self._db.session() as sess:
24 | stmt = select(Artifact)
25 |
26 | # Apply filters
27 |             if novel_id:
28 | stmt = stmt.where(Artifact.novel_id == novel_id)
29 |
30 | # Apply sorting
31 |             stmt = stmt.order_by(desc(Artifact.created_at))
32 |
33 | total = sess.exec(select(func.count()).select_from(Artifact)).one()
34 | items = sess.exec(stmt.offset(offset).limit(limit)).all()
35 |
36 | return Paginated(
37 | total=total,
38 | offset=offset,
39 | limit=limit,
40 | items=list(items),
41 | )
42 |
43 | def get(self, artifact_id: str) -> Artifact:
44 | with self._db.session() as sess:
45 | artifact = sess.get(Artifact, artifact_id)
46 | if not artifact:
47 | raise AppErrors.no_such_artifact
48 | return artifact
49 |
50 | def delete(self, artifact_id: str, user: User) -> bool:
51 | if user.role != UserRole.ADMIN:
52 | raise AppErrors.forbidden
53 | with self._db.session() as sess:
54 | artifact = sess.get(Artifact, artifact_id)
55 | if not artifact:
56 | raise AppErrors.no_such_artifact
57 | sess.delete(artifact)
58 | sess.commit()
59 | return True
60 |
--------------------------------------------------------------------------------
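A small usage sketch of this service through the shared ServerContext singleton; it assumes the configured database is reachable (ctx.db.prepare() creates missing tables).

from lncrawl.bots.server.context import ServerContext

ctx = ServerContext()  # singleton: every instantiation returns the same object
ctx.db.prepare()       # create tables if they do not exist yet

page = ctx.artifacts.list(offset=0, limit=10)
print(page.total, [a.id for a in page.items])
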
/lncrawl/bots/server/services/tier.py:
--------------------------------------------------------------------------------
1 | from lncrawl.models import OutputFormat
2 |
3 | from ..models.job import JobPriority
4 | from ..models.user import UserTier
5 |
6 | ##
7 | # For Job creation
8 | ##
9 |
10 | JOB_PRIORITY_LEVEL = {
11 | UserTier.BASIC: JobPriority.LOW,
12 | UserTier.PREMIUM: JobPriority.NORMAL,
13 | UserTier.VIP: JobPriority.HIGH,
14 | }
15 |
16 | ##
17 | # For JobRunner service
18 | ##
19 | ENABLED_FORMATS = {
20 | UserTier.BASIC: [
21 | OutputFormat.json,
22 | OutputFormat.epub,
23 | ],
24 | UserTier.PREMIUM: [
25 | OutputFormat.json,
26 | OutputFormat.epub,
27 | OutputFormat.text,
28 | OutputFormat.web,
29 | OutputFormat.pdf,
30 | ],
31 | UserTier.VIP: list(OutputFormat),
32 | }
33 |
34 | BATCH_DOWNLOAD_LIMIT = {
35 | UserTier.BASIC: 10,
36 | UserTier.PREMIUM: 100,
37 | UserTier.VIP: 10000,
38 | }
39 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/ui/__index__.py:
--------------------------------------------------------------------------------
1 | import reflex as rx
2 |
3 |
4 | class State(rx.State):
5 | count: int = 0
6 |
7 | def increment(self):
8 | self.count += 1
9 |
10 | def decrement(self):
11 | self.count -= 1
12 |
13 |
14 | def index():
15 | return rx.hstack(
16 | rx.button(
17 | "Decrement",
18 | color_scheme="ruby",
19 | on_click=State.decrement,
20 | ),
21 | rx.heading(State.count, font_size="2em"),
22 | rx.button(
23 | "Increment",
24 | color_scheme="grass",
25 | on_click=State.increment,
26 | ),
27 | spacing="4",
28 | )
29 |
30 |
31 | app = rx.App()
32 | app.add_page(index)
33 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/bots/server/utils/__init__.py
--------------------------------------------------------------------------------
/lncrawl/bots/server/utils/aborter.py:
--------------------------------------------------------------------------------
1 | from threading import Event
2 |
3 |
4 | class Aborter:
5 | def __init__(self) -> None:
6 | self._event = Event()
7 |
8 | @property
9 | def aborted(self):
10 | return self._event.is_set()
11 |
12 | def abort(self):
13 | self._event.set()
14 |
15 | def wait(self, timeout: float):
16 | if timeout <= 0:
17 | return
18 | self._event.wait(timeout)
19 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/utils/decorators.py:
--------------------------------------------------------------------------------
1 | import atexit
2 |
3 |
4 | def autoclose(func):
5 | def inner(*args, **kwargs):
6 | val = func(*args, **kwargs)
7 | if hasattr(val, 'close') and callable(val.close):
8 | atexit.register(val.close)
9 | return val
10 | return inner
11 |
--------------------------------------------------------------------------------
/lncrawl/bots/server/utils/json_tools.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | from typing import Any, TypeVar
4 |
5 | _log = logging.getLogger(__name__)
6 |
7 | T = TypeVar('T')
8 |
9 |
10 | def json_encode(data: Any, encoding: str = "utf-8") -> bytes:
11 | try:
12 | output = json.dumps(
13 | data,
14 | allow_nan=True,
15 | ensure_ascii=False,
16 | check_circular=True,
17 | separators=(',', ':'),
18 | )
19 | return output.encode(encoding)
20 | except Exception as err:
21 |         _log.debug('Failed encoding: %s', err)
22 | return b''
23 |
24 |
25 | def json_decode(data: str | bytes | bytearray | None, _default: T) -> T:
26 | try:
27 | if isinstance(data, bytearray):
28 | data = bytes(data)
29 | if isinstance(data, bytes):
30 | data = data.decode()
31 | if not isinstance(data, str):
32 | return _default
33 | return json.loads(data)
34 | except Exception as err:
35 |         _log.debug('Failed decoding: %s', err)
36 | return _default
37 |
--------------------------------------------------------------------------------
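A quick round trip with the helpers above; on bad input, json_decode returns the supplied default instead of raising.

from lncrawl.bots.server.utils.json_tools import json_decode, json_encode

raw = json_encode({"id": 1, "title": "Chapter 1"})  # b'{"id":1,"title":"Chapter 1"}'
data = json_decode(raw, {})                         # {'id': 1, 'title': 'Chapter 1'}
bad = json_decode(b"not json", {})                  # {} -- the fallback default
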
/lncrawl/bots/server/utils/text_tools.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import hashlib
3 | import lzma
4 |
5 | from cryptography.fernet import Fernet
6 |
7 | __key_cache = {}
8 |
9 |
10 | def text_compress(plain: bytes) -> bytes:
11 | lzc = lzma.LZMACompressor()
12 | output = lzc.compress(plain)
13 | output += lzc.flush()
14 | return output
15 |
16 |
17 | def text_decompress(compressed: bytes) -> bytes:
18 | lzd = lzma.LZMADecompressor()
19 | return lzd.decompress(compressed)
20 |
21 |
22 | def text_encrypt(plain: bytes, secret: str | bytes) -> bytes:
23 | fernet = Fernet(generate_key(secret))
24 | result = fernet.encrypt(plain)
25 | return base64.urlsafe_b64decode(result)
26 |
27 |
28 | def text_decrypt(cipher: bytes, secret: str | bytes) -> bytes:
29 | fernet = Fernet(generate_key(secret))
30 | cipher = base64.urlsafe_b64encode(cipher)
31 | return fernet.decrypt(cipher)
32 |
33 |
34 | def text_compress_encrypt(plain: bytes, secret: str | bytes) -> bytes:
35 | return text_encrypt(text_compress(plain), secret)
36 |
37 |
38 | def text_decrypt_decompress(cipher: bytes, secret: str | bytes) -> bytes:
39 | return text_decompress(text_decrypt(cipher, secret))
40 |
41 |
42 | def generate_md5(*texts) -> str:
43 | md5 = hashlib.md5()
44 | for text in texts:
45 | md5.update(str(text or '').encode())
46 | return md5.hexdigest()
47 |
48 |
49 | def generate_key(secret: str | bytes) -> bytes:
50 | if isinstance(secret, str):
51 | secret = secret.encode()
52 | if secret not in __key_cache:
53 | hash = hashlib.sha3_256(secret).digest()
54 | key = base64.urlsafe_b64encode(hash)
55 | __key_cache[secret] = key
56 | return __key_cache[secret]
57 |
--------------------------------------------------------------------------------
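A round-trip sketch for the compression and encryption helpers above; the secret is a placeholder.

from lncrawl.bots.server.utils.text_tools import (
    text_compress_encrypt,
    text_decrypt_decompress,
)

secret = "change-me"              # placeholder secret
payload = b"chapter body " * 100  # repetitive data compresses well

cipher = text_compress_encrypt(payload, secret)  # LZMA compress, then Fernet encrypt
assert text_decrypt_decompress(cipher, secret) == payload
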
/lncrawl/bots/server/utils/time_utils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from typing import Any
3 |
4 | from dateutil import parser
5 | from dateutil.relativedelta import relativedelta
6 | from dateutil.tz import tzutc
7 |
8 |
9 | def current_timestamp():
10 | '''Current UNIX timestamp in milliseconds'''
11 | return round(1000 * datetime.now().timestamp())
12 |
13 |
14 | def as_unix_time(time: Any) -> int | None:
15 | try:
16 | if isinstance(time, int):
17 | return time
18 | if isinstance(time, str):
19 | time = parser.parse(time)
20 | if isinstance(time, datetime):
21 | return round(1000 * time.timestamp())
22 | except Exception:
23 | pass
24 | return None
25 |
26 |
27 | def time_from_now(
28 | years=0, months=0, days=0, weeks=0,
29 | hours=0, minutes=0, seconds=0
30 | ) -> datetime:
31 | delta = relativedelta(
32 | years=years, months=months, days=days, weeks=weeks,
33 | hours=hours, minutes=minutes, seconds=seconds
34 | )
35 | return datetime.now(tzutc()).replace(microsecond=0) + delta
36 |
--------------------------------------------------------------------------------
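A few example calls for the time helpers above; the values in the comments are illustrative.

from lncrawl.bots.server.utils.time_utils import (
    as_unix_time,
    current_timestamp,
    time_from_now,
)

now_ms = current_timestamp()                   # e.g. 1718000000000 (milliseconds)
parsed = as_unix_time("2024-06-10T08:00:00Z")  # ISO string -> milliseconds, or None on failure
expiry = time_from_now(days=7)                 # timezone-aware datetime one week ahead
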
/lncrawl/constants.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | DEFAULT_OUTPUT_PATH = os.getenv('OUTPUT_PATH') or os.path.abspath("Lightnovels")
4 | META_FILE_NAME = "meta.json"
5 |
--------------------------------------------------------------------------------
/lncrawl/core/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Interactive application to take user inputs
3 | """
4 |
5 | import logging
6 | import os
7 | import sys
8 |
9 | import colorama # type:ignore
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
14 | def init():
15 | from ..assets.version import get_version
16 | from .arguments import get_args
17 | from .display import description, input_suppression
18 | from .logconfig import configure_logging
19 |
20 | os.environ["version"] = get_version()
21 |
22 | colorama.init(wrap=True)
23 | description()
24 |
25 | configure_logging()
26 |
27 | args = get_args()
28 | logger.debug("Arguments: %s", args)
29 |
30 | if args.suppress:
31 | input_suppression()
32 | print(args)
33 |
34 | if args.bot:
35 | os.environ["BOT"] = args.bot
36 |
37 | for key, val in args.extra.items():
38 | os.environ[key] = val[0]
39 |
40 |
41 | def start_app():
42 | from ..bots import run_bot
43 | from .arguments import get_args
44 | from .display import cancel_method, error_message
45 | from .proxy import load_proxies, start_proxy_fetcher, stop_proxy_fetcher
46 | from .sources import load_sources
47 |
48 | init()
49 |
50 | load_sources()
51 | cancel_method()
52 |
53 | args = get_args()
54 | if args.proxy_file:
55 | os.environ["use_proxy"] = "file"
56 | load_proxies(args.proxy_file)
57 |
58 | if args.auto_proxy:
59 | os.environ["use_proxy"] = "auto"
60 | start_proxy_fetcher()
61 |
62 | try:
63 | bot = os.getenv("BOT", "").lower()
64 | run_bot(bot)
65 | except KeyboardInterrupt:
66 | pass
67 | except Exception:
68 | error_message(*sys.exc_info())
69 |
70 | if args.auto_proxy:
71 | stop_proxy_fetcher()
72 |
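For orientation, a minimal launcher might look like the sketch below (assumed here; the package's actual __main__.py is not shown in this section). start_app() runs init(), loads sources and proxies, and dispatches to the bot named by the BOT environment variable.

# Hypothetical equivalent of `python -m lncrawl`
from lncrawl.core import start_app

if __name__ == "__main__":
    start_app()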
--------------------------------------------------------------------------------
/lncrawl/core/exeptions.py:
--------------------------------------------------------------------------------
1 | from urllib.error import URLError
2 |
3 | from cloudscraper.exceptions import CloudflareException
4 | from PIL import UnidentifiedImageError
5 | from requests.exceptions import RequestException
6 | from urllib3.exceptions import HTTPError
7 |
8 |
9 | class LNException(Exception):
10 | pass
11 |
12 |
13 | class FallbackToBrowser(Exception):
14 | pass
15 |
16 |
17 | ScraperErrorGroup = (
18 | URLError,
19 | HTTPError,
20 | CloudflareException,
21 | RequestException,
22 | FallbackToBrowser,
23 | UnidentifiedImageError,
24 | )
25 |
26 | RetryErrorGroup = (
27 | URLError,
28 | HTTPError,
29 | CloudflareException,
30 | RequestException,
31 | UnidentifiedImageError,
32 | )
33 |
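A hedged sketch of how these exception groups are intended to be used (illustrative only; the fetch helpers below are stand-ins, not project functions): scraper-level failures are caught as one tuple, and FallbackToBrowser lets a scraper path hand control over to a browser path.

def fetch_with_scraper(url: str) -> str:
    raise FallbackToBrowser()            # stand-in: pretend the scraper path gave up

def fetch_with_browser(url: str) -> str:
    return "<html>...</html>"            # stand-in for a browser-rendered page

def fetch(url: str) -> str:
    try:
        return fetch_with_scraper(url)
    except ScraperErrorGroup:
        # any scraper failure, including an explicit FallbackToBrowser,
        # falls through to the browser-based path
        return fetch_with_browser(url)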
--------------------------------------------------------------------------------
/lncrawl/core/soup.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from abc import ABC
3 | from typing import Optional, Union
4 |
5 | from bs4 import BeautifulSoup, Tag
6 | from requests import Response
7 |
8 | from .exeptions import LNException
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | DEFAULT_PARSER = "lxml"
14 |
15 |
16 | class SoupMaker(ABC):
17 | def __init__(
18 | self,
19 | parser: Optional[str] = None,
20 | ) -> None:
21 | """This is a helper for Beautiful Soup. It is being used as a superclass of the Crawler.
22 |
23 | Args:
24 | - parser (Optional[str], optional): Desirable features of the parser. This can be the name of a specific parser
25 | ("lxml", "lxml-xml", "html.parser", or "html5lib") or it may be the type of markup to be used ("html", "html5", "xml").
26 | """
27 | self._parser = parser or DEFAULT_PARSER
28 |
29 | def close(self) -> None:
30 | pass
31 |
32 | def make_soup(
33 | self,
34 | data: Union[Response, bytes, str],
35 | encoding: Optional[str] = None,
36 | ) -> BeautifulSoup:
37 | if isinstance(data, Response):
38 | return self.make_soup(data.content, encoding)
39 | elif isinstance(data, bytes):
40 | html = data.decode(encoding or "utf8", "ignore")
41 | elif isinstance(data, str):
42 | html = data
43 | else:
44 | raise LNException("Could not parse response")
45 | return BeautifulSoup(html, features=self._parser)
46 |
47 | def make_tag(
48 | self,
49 | data: Union[Response, bytes, str],
50 | encoding: Optional[str] = None,
51 | ) -> Tag:
52 | soup = self.make_soup(data, encoding)
53 | return next(soup.find("body").children)
54 |
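Usage sketch (illustrative, not part of the file above), assuming requests is installed: make_soup accepts a Response, raw bytes, or a string and always returns a BeautifulSoup built with the configured parser.

import requests

maker = SoupMaker(parser="html.parser")
resp = requests.get("https://example.com")            # example URL
soup = maker.make_soup(resp)                           # Response -> BeautifulSoup
print(soup.title.string if soup.title else "no title")

tag = maker.make_tag("<html><body><div>hi</div></body></html>")  # first child of <body>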
--------------------------------------------------------------------------------
/lncrawl/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .chapter import Chapter
2 | from .formats import OutputFormat
3 | from .meta import MetaInfo
4 | from .novel import Novel
5 | from .search_result import CombinedSearchResult, SearchResult
6 | from .session import Session
7 | from .volume import Volume
8 |
9 | __all__ = [
10 | "Chapter",
11 | "CombinedSearchResult",
12 | "SearchResult",
13 | "OutputFormat",
14 | "Novel",
15 | "MetaInfo",
16 | "Session",
17 | "Volume",
18 | ]
19 |
--------------------------------------------------------------------------------
/lncrawl/models/chapter.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, Optional
2 |
3 | from box import Box
4 |
5 |
6 | class Chapter(Box):
7 | def __init__(
8 | self,
9 | id: int,
10 | url: str = "",
11 | title: str = "",
12 | volume: Optional[int] = None,
13 | volume_title: Optional[str] = None,
14 | body: Optional[str] = None,
15 | images: Dict[str, str] = dict(),
16 | success: bool = False,
17 | **kwargs,
18 | ) -> None:
19 | self.id = id
20 | self.url = url
21 | self.title = title
22 | self.volume = volume
23 | self.volume_title = volume_title
24 | self.body = body
25 | self.images = images
26 | self.success = success
27 | self.update(kwargs)
28 |
29 | @staticmethod
30 | def without_body(item: "Chapter") -> "Chapter":
31 | result = item.copy()
32 | result.body = None
33 | return result
34 |
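Brief illustration (not part of the file above): Chapter is a Box, so fields are reachable both as attributes and as keys, and without_body returns a copy that is safe to dump into a chapter list without the full text.

chapter = Chapter(id=1, url="https://example.com/c1", title="Chapter 1")
chapter.body = "<p>...</p>"
assert chapter["title"] == chapter.title     # key and attribute access are equivalent

slim = Chapter.without_body(chapter)         # copy with the body stripped
assert slim.body is None and chapter.body is not None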
--------------------------------------------------------------------------------
/lncrawl/models/formats.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class OutputFormat(str, Enum):
5 | json = "json"
6 | epub = "epub"
7 | text = "text"
8 | web = "web"
9 | docx = "docx"
10 | mobi = "mobi"
11 | pdf = "pdf"
12 | rtf = "rtf"
13 | txt = "txt"
14 | azw3 = "azw3"
15 | fb2 = "fb2"
16 | lit = "lit"
17 | lrf = "lrf"
18 | oeb = "oeb"
19 | pdb = "pdb"
20 | rb = "rb"
21 | snb = "snb"
22 | tcr = "tcr"
23 |
24 | def __str__(self) -> str:
25 | return self.value
26 |
--------------------------------------------------------------------------------
/lncrawl/models/meta.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from box import Box
4 |
5 | from .novel import Novel
6 | from .session import Session
7 |
8 |
9 | class MetaInfo(Box):
10 | def __init__(
11 | self,
12 | session: Optional[Session] = None,
13 | novel: Optional[Novel] = None,
14 | **kwargs,
15 | ) -> None:
16 | self.session = session
17 | self.novel = novel
18 | self.update(kwargs)
19 |
--------------------------------------------------------------------------------
/lncrawl/models/novel.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 |
3 | from box import Box
4 |
5 | from ..assets.languages import find_code
6 | from .chapter import Chapter
7 | from .volume import Volume
8 |
9 |
10 | class Novel(Box):
11 | def __init__(
12 | self,
13 | url: str,
14 | title: str,
15 | authors: List[str] = [],
16 | cover_url: Optional[str] = None,
17 | chapters: List[Chapter] = [],
18 | volumes: List[Volume] = [],
19 | is_rtl: bool = False,
20 | synopsis: str = "",
21 | language: Optional[str] = None,
22 | tags: List[str] = [],
23 | has_manga: Optional[bool] = None,
24 | has_mtl: Optional[bool] = None,
25 | **kwargs,
26 | ) -> None:
27 | self.url = url
28 | self.title = title
29 | self.authors = authors
30 | self.cover_url = cover_url
31 | self.chapters = chapters
32 | self.volumes = volumes
33 | self.is_rtl = is_rtl
34 | self.synopsis = synopsis
35 | self.has_manga = has_manga
36 | self.has_mtl = has_mtl
37 | self.language = find_code(language)
38 | self.tags = tags
39 | self.update(kwargs)
40 |
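Illustrative construction (not part of the file above), assuming Chapter and Volume from this package are in scope; the URL and title are examples, and the language string is normalized to a code by find_code().

novel = Novel(
    url="https://example.com/my-novel",
    title="My Novel",
    authors=["Anonymous"],
    language="English",                      # passed through find_code()
    volumes=[Volume(id=1)],
    chapters=[Chapter(id=1, title="Chapter 1", volume=1)],
)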
--------------------------------------------------------------------------------
/lncrawl/models/search_result.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from box import Box
4 |
5 |
6 | class SearchResult(Box):
7 | def __init__(
8 | self,
9 | title: str,
10 | url: str,
11 | info: str = "",
12 | **kwargs,
13 | ) -> None:
14 | self.title = str(title)
15 | self.url = str(url)
16 | self.info = str(info)
17 | self.update(kwargs)
18 |
19 |
20 | class CombinedSearchResult(Box):
21 | def __init__(
22 | self,
23 | id: str,
24 | title: str,
25 | novels: List[SearchResult] = [],
26 | **kwargs,
27 | ) -> None:
28 | self.id = id
29 | self.title = str(title)
30 | self.novels = novels
31 | self.update(kwargs)
32 |
--------------------------------------------------------------------------------
/lncrawl/models/volume.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from box import Box
4 |
5 |
6 | class Volume(Box):
7 | def __init__(
8 | self,
9 | id: int,
10 | title: str = "",
11 | start_chapter: Optional[int] = None,
12 | final_chapter: Optional[int] = None,
13 | chapter_count: Optional[int] = None,
14 | **kwargs,
15 | ) -> None:
16 | self.id = id
17 | self.title = title
18 | self.start_chapter = start_chapter
19 | self.final_chapter = final_chapter
20 | self.chapter_count = chapter_count
21 | self.update(kwargs)
22 |
--------------------------------------------------------------------------------
/lncrawl/templates/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/templates/__init__.py
--------------------------------------------------------------------------------
/lncrawl/templates/browser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/templates/browser/__init__.py
--------------------------------------------------------------------------------
/lncrawl/templates/browser/chapter_only.py:
--------------------------------------------------------------------------------
1 | from typing import Generator
2 |
3 | from bs4 import Tag
4 |
5 | from ...models import Chapter
6 | from ..soup.chapter_only import ChapterOnlySoupTemplate
7 | from .general import GeneralBrowserTemplate
8 |
9 |
10 | class ChapterOnlyBrowserTemplate(GeneralBrowserTemplate, ChapterOnlySoupTemplate):
11 | """Attempts to crawl using cloudscraper first, if failed use the browser."""
12 |
13 | def parse_chapter_list_in_browser(self) -> Generator[Chapter, None, None]:
14 | chap_id = 0
15 | for tag in self.select_chapter_tags_in_browser():
16 | if not isinstance(tag, Tag):
17 | continue
18 | chap_id += 1
19 | yield self.parse_chapter_item(tag, chap_id)
20 |
21 | def select_chapter_tags_in_browser(self) -> Generator[Tag, None, None]:
22 | """Select chapter list item tags from the browser"""
23 | yield from self.select_chapter_tags(self.browser.soup)
24 |
--------------------------------------------------------------------------------
/lncrawl/templates/browser/login.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from abc import abstractmethod
3 |
4 | from ...core.exeptions import FallbackToBrowser, ScraperErrorGroup
5 | from .general import GeneralBrowserTemplate
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
10 | class LoginBrowserTemplate(GeneralBrowserTemplate):
11 | """Attempts to crawl using cloudscraper first, if failed use the browser."""
12 |
13 | def login(self, email: str, password: str) -> None:
14 | try:
15 | return self.login_in_soup(email, password)
16 | except ScraperErrorGroup:
17 | return self.login_in_browser(email, password)
18 |
19 | def login_in_soup(self, email: str, password: str) -> None:
20 | """Login to the website using the scraper"""
21 | raise FallbackToBrowser()
22 |
23 | @abstractmethod
24 | def login_in_browser(self, email: str, password: str) -> None:
25 | """Login to the website using the browser"""
26 | raise NotImplementedError()
27 |
--------------------------------------------------------------------------------
/lncrawl/templates/browser/searchable.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Generator, List
3 |
4 | from bs4 import Tag
5 |
6 | from ...core.exeptions import FallbackToBrowser
7 | from ...models import SearchResult
8 | from ..soup.searchable import SearchableSoupTemplate
9 | from .general import GeneralBrowserTemplate
10 |
11 |
12 | class SearchableBrowserTemplate(GeneralBrowserTemplate, SearchableSoupTemplate):
13 | """Attempts to crawl using cloudscraper first, if failed use the browser."""
14 |
15 | def search_novel_in_soup(self, query: str) -> List[SearchResult]:
16 | tags = self.select_search_items(query)
17 | return list(self.process_search_results(tags))
18 |
19 | def search_novel_in_browser(self, query: str) -> List[SearchResult]:
20 | tags = self.select_search_items_in_browser(query)
21 | return list(self.process_search_results_in_browser(tags))
22 |
23 | def process_search_results_in_browser(
24 | self, tags: Generator[Tag, None, None]
25 | ) -> Generator[SearchResult, None, None]:
26 | """Process novel item tags and generate search results from the browser"""
27 | count = 0
28 | for tag in tags:
29 | if not isinstance(tag, Tag):
30 | continue
31 | count += 1
32 | if count == 10:
33 | break
34 | yield self.parse_search_item_in_browser(tag)
35 |
36 | @abstractmethod
37 | def select_search_items(self, query: str) -> Generator[Tag, None, None]:
38 | raise FallbackToBrowser()
39 |
40 | def select_search_items_in_browser(self, query: str) -> Generator[Tag, None, None]:
41 | """Select novel items found by the query using the browser"""
42 | yield from self.select_search_items(self.browser.soup)
43 |
44 | def parse_search_item_in_browser(self, tag: Tag) -> SearchResult:
45 | """Parse a tag and return single search result"""
46 | return self.parse_search_item(tag)
47 |
--------------------------------------------------------------------------------
/lncrawl/templates/browser/with_volume.py:
--------------------------------------------------------------------------------
1 | from typing import Generator, Union
2 |
3 | from bs4 import Tag
4 |
5 | from ...models import Chapter, Volume
6 | from ..soup.with_volume import ChapterWithVolumeSoupTemplate
7 | from .general import GeneralBrowserTemplate
8 |
9 |
10 | class ChapterWithVolumeBrowserTemplate(
11 | GeneralBrowserTemplate, ChapterWithVolumeSoupTemplate
12 | ):
13 | """Attempts to crawl using cloudscraper first, if failed use the browser."""
14 |
15 | def parse_chapter_list_in_browser(
16 | self,
17 | ) -> Generator[Union[Chapter, Volume], None, None]:
18 | vol_id = 0
19 | chap_id = 0
20 | for vol in self.select_volume_tags_in_browser():
21 | if not isinstance(vol, Tag):
22 | continue
23 | vol_id += 1
24 | vol_item = self.parse_volume_item_in_browser(vol, vol_id)
25 | yield vol_item
26 | for tag in self.select_chapter_tags_in_browser(vol, vol_item):
27 | if not isinstance(tag, Tag):
28 | continue
29 | chap_id += 1
30 | item = self.parse_chapter_item_in_browser(tag, chap_id, vol_item)
31 | item.volume = vol_id
32 | yield item
33 |
34 | def select_volume_tags_in_browser(self) -> Generator[Tag, None, None]:
35 | """Select volume list item tags from the browser"""
36 | return self.select_volume_tags(self.browser.soup)
37 |
38 | def parse_volume_item_in_browser(self, tag: Tag, id: int) -> Volume:
39 | """Parse a single volume from volume list item tag from the browser"""
40 | return self.parse_volume_item(tag, id)
41 |
42 | def select_chapter_tags_in_browser(
43 | self, tag: Tag, vol: Volume
44 | ) -> Generator[Tag, None, None]:
45 | """Select chapter list item tags from volume tag from the browser"""
46 | return self.select_chapter_tags(tag, vol)
47 |
48 | def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter:
49 | """Parse a single chapter from chapter list item tag from the browser"""
50 | return self.parse_chapter_item(tag, id, vol)
51 |
--------------------------------------------------------------------------------
/lncrawl/templates/soup/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/templates/soup/__init__.py
--------------------------------------------------------------------------------
/lncrawl/templates/soup/chapter_only.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Generator
3 |
4 | from bs4 import BeautifulSoup, Tag
5 |
6 | from ...models import Chapter
7 | from .general import GeneralSoupTemplate
8 |
9 |
10 | class ChapterOnlySoupTemplate(GeneralSoupTemplate):
11 | def parse_chapter_list(self, soup: BeautifulSoup) -> Generator[Chapter, None, None]:
12 | chap_id = 0
13 | for tag in self.select_chapter_tags(soup):
14 | if not isinstance(tag, Tag):
15 | continue
16 | chap_id += 1
17 | yield self.parse_chapter_item(tag, chap_id)
18 |
19 | @abstractmethod
20 | def select_chapter_tags(self, soup: BeautifulSoup) -> Generator[Tag, None, None]:
21 | """Select chapter list item tags from the page soup"""
22 | raise NotImplementedError()
23 |
24 | @abstractmethod
25 | def parse_chapter_item(self, tag: Tag, id: int) -> Chapter:
26 | """Parse a single chapter from chapter list item tag"""
27 | raise NotImplementedError()
28 |
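A hedged sketch of a concrete subclass (the site, URL, and CSS selectors are hypothetical, and only the two hooks defined above are shown; a real source also implements the remaining hooks of the general template):

class ExampleChapterOnlyCrawler(ChapterOnlySoupTemplate):
    base_url = ["https://example-novel-site.com/"]      # hypothetical source

    def select_chapter_tags(self, soup):
        # hypothetical selector for the chapter list items
        yield from soup.select("ul.chapter-list li a")

    def parse_chapter_item(self, tag, id):
        return Chapter(
            id=id,
            url=self.absolute_url(tag["href"]),
            title=tag.text.strip(),
        )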
--------------------------------------------------------------------------------
/lncrawl/templates/soup/optional_volume.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Generator, Union
3 |
4 | from bs4 import BeautifulSoup, Tag
5 |
6 | from ...models import Chapter, Volume
7 | from .general import GeneralSoupTemplate
8 |
9 |
10 | class OptionalVolumeSoupTemplate(GeneralSoupTemplate):
11 | def parse_chapter_list(
12 | self, soup: BeautifulSoup
13 | ) -> Generator[Union[Chapter, Volume], None, None]:
14 | vol_id = 0
15 | chap_id = 0
16 | for vol in self.select_volume_tags(soup):
17 | if not isinstance(vol, Tag):
18 | continue
19 | vol_id += 1
20 | vol_item = self.parse_volume_item(vol, vol_id)
21 | yield vol_item
22 | for tag in self.select_chapter_tags(vol):
23 | if not isinstance(tag, Tag):
24 | continue
25 | chap_id += 1
26 | item = self.parse_chapter_item(tag, chap_id, vol_item)
27 | item.volume = vol_id
28 | yield item
29 |
30 | if chap_id > 0:
31 | return
32 |
33 | vol_id = 0
34 | chap_id = 0
35 | parent = soup.select_one("html")
36 | for tag in self.select_chapter_tags(parent):
37 | if not isinstance(tag, Tag):
38 | continue
39 | if chap_id % 100 == 0:
40 | vol_id = chap_id // 100 + 1
41 | vol_item = self.parse_volume_item(parent, vol_id)
42 | yield vol_item
43 | chap_id += 1
44 | item = self.parse_chapter_item(tag, chap_id, vol_item)
45 | item.volume = vol_id
46 | yield item
47 |
48 | def select_volume_tags(self, soup: BeautifulSoup):
49 | return []
50 |
51 | def parse_volume_item(self, tag: Tag, id: int) -> Volume:
52 | return Volume(id=id)
53 |
54 | @abstractmethod
55 | def select_chapter_tags(self, parent: Tag) -> Generator[Tag, None, None]:
56 | raise NotImplementedError()
57 |
58 | @abstractmethod
59 | def parse_chapter_item(self, tag: Tag, id: int, vol: Volume) -> Chapter:
60 | raise NotImplementedError()
61 |
--------------------------------------------------------------------------------
/lncrawl/templates/soup/searchable.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Generator, List
3 |
4 | from bs4 import Tag
5 |
6 | from ...models import SearchResult
7 | from .general import GeneralSoupTemplate
8 |
9 |
10 | class SearchableSoupTemplate(GeneralSoupTemplate):
11 | def search_novel(self, query) -> List[SearchResult]:
12 | tags = self.select_search_items(query)
13 | return list(self.process_search_results(tags))
14 |
15 | def process_search_results(
16 | self, tags: Generator[Tag, None, None]
17 | ) -> Generator[SearchResult, None, None]:
18 | """Process novel item tags and generate search results"""
19 | count = 0
20 | for tag in tags:
21 | if not isinstance(tag, Tag):
22 | continue
23 | count += 1
24 | if count == 10:
25 | break
26 | yield self.parse_search_item(tag)
27 |
28 | @abstractmethod
29 | def select_search_items(self, query: str) -> Generator[Tag, None, None]:
30 | """Select novel items found on the search page by the query"""
31 | raise NotImplementedError()
32 |
33 | @abstractmethod
34 | def parse_search_item(self, tag: Tag) -> SearchResult:
35 | """Parse a tag and return single search result"""
36 | raise NotImplementedError()
37 |
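A hedged sketch of a searchable source built on this template (the endpoint and selectors are hypothetical; other abstract hooks of the general template are omitted): select_search_items performs the query and yields result tags, and parse_search_item turns each tag into a SearchResult.

class ExampleSearchableCrawler(SearchableSoupTemplate):
    base_url = ["https://example-novel-site.com/"]       # hypothetical source

    def select_search_items(self, query):
        soup = self.get_soup(f"{self.home_url}search?q={query}")   # hypothetical endpoint
        yield from soup.select(".search-results .novel-item a")

    def parse_search_item(self, tag):
        return SearchResult(
            title=tag.text.strip(),
            url=self.absolute_url(tag["href"]),
        )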
--------------------------------------------------------------------------------
/lncrawl/templates/soup/with_volume.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Generator, Union
3 |
4 | from bs4 import BeautifulSoup, Tag
5 |
6 | from ...models import Chapter, Volume
7 | from .general import GeneralSoupTemplate
8 |
9 |
10 | class ChapterWithVolumeSoupTemplate(GeneralSoupTemplate):
11 | def parse_chapter_list(
12 | self, soup: BeautifulSoup
13 | ) -> Generator[Union[Chapter, Volume], None, None]:
14 | vol_id = 0
15 | chap_id = 0
16 | for vol in self.select_volume_tags(soup):
17 | if not isinstance(vol, Tag):
18 | continue
19 | vol_id += 1
20 | vol_item = self.parse_volume_item(vol, vol_id)
21 | yield vol_item
22 | for tag in self.select_chapter_tags(vol, vol_item):
23 | if not isinstance(tag, Tag):
24 | continue
25 | chap_id += 1
26 | item = self.parse_chapter_item(tag, chap_id, vol_item)
27 | item.volume = vol_id
28 | yield item
29 |
30 | @abstractmethod
31 | def select_volume_tags(self, soup: BeautifulSoup) -> Generator[Tag, None, None]:
32 | """Select volume list item tags from the page soup"""
33 | raise NotImplementedError()
34 |
35 | @abstractmethod
36 | def parse_volume_item(self, tag: Tag, id: int) -> Volume:
37 | """Parse a single volume from volume list item tag"""
38 | raise NotImplementedError()
39 |
40 | @abstractmethod
41 | def select_chapter_tags(self, tag: Tag, vol: Volume) -> Generator[Tag, None, None]:
42 | """Select chapter list item tags from volume tag"""
43 | raise NotImplementedError()
44 |
45 | @abstractmethod
46 | def parse_chapter_item(self, tag: Tag, id: int, vol: Volume) -> Chapter:
47 | """Parse a single chapter from chapter list item tag"""
48 | raise NotImplementedError()
49 |
--------------------------------------------------------------------------------
/lncrawl/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/utils/__init__.py
--------------------------------------------------------------------------------
/lncrawl/utils/common.py:
--------------------------------------------------------------------------------
1 | from typing import TypeVar, Generic, Callable, Type
2 |
3 | T = TypeVar('T')
4 |
5 |
6 | class static_cached_property(Generic[T]):
7 | def __init__(self, func: Callable[..., T]):
8 | self._initialized = False
9 | if isinstance(func, staticmethod):
10 | self.func = func.__func__
11 | else:
12 | self.func = func
13 |
14 | def __get__(self, instance: None, owner: Type) -> T:
15 | if not self._initialized:
16 | self._value = self.func()
17 | self._initialized = True
18 | return self._value
19 |
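Usage sketch (illustrative, not part of the file above): static_cached_property calls the wrapped zero-argument callable once, on first access through the class, and then serves the cached value.

class Config:
    @static_cached_property
    def sources_index() -> dict:
        print("loading index...")        # runs only once
        return {"count": 0}

a = Config.sources_index                 # triggers the load and caches it
b = Config.sources_index                 # served from the cache
assert a is b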
--------------------------------------------------------------------------------
/lncrawl/utils/imgen.py:
--------------------------------------------------------------------------------
1 | # https://github.com/alexwlchan/specktre
2 |
3 | import random
4 | from typing import List, Optional
5 |
6 | from PIL import Image, ImageDraw
7 |
8 | from .material_colors import ColorName, ColorWeight, generate_colors
9 | from .tilings import TileGenerator, generate_tiles
10 |
11 |
12 | def generate_image(
13 | filename: Optional[str] = None,
14 | width: int = 512,
15 | height: int = 512,
16 | color_names: List[ColorName] = [],
17 | color_weights: List[ColorWeight] = [],
18 | generator: Optional[TileGenerator] = None,
19 | side_length: int = 50,
20 | ) -> Image:
21 | tiles = generate_tiles(
22 | generator,
23 | width,
24 | height,
25 | side_length,
26 | )
27 | colors = generate_colors(
28 | color_names,
29 | color_weights,
30 | )
31 | im = Image.new(
32 | mode="RGB",
33 | size=(width, height),
34 | )
35 | for tile, color in zip(tiles, colors):
36 | ImageDraw.Draw(im).polygon(tile, fill=color)
37 |
38 | if filename:
39 | im.save(filename)
40 |
41 | return im
42 |
43 |
44 | good_color_names = set(ColorName).difference(
45 | [
46 | ColorName.black,
47 | ColorName.white,
48 | ColorName.light_blue,
49 | ColorName.light_green,
50 | ]
51 | )
52 | good_color_weights = set(ColorWeight).difference(
53 | [
54 | ColorWeight.main,
55 | ColorWeight.w50,
56 | ColorWeight.w100,
57 | ColorWeight.w200,
58 | ColorWeight.w800,
59 | ColorWeight.w900,
60 | ColorWeight.a100,
61 | ColorWeight.a200,
62 | ]
63 | )
64 |
65 |
66 | def generate_cover_image(
67 | filename: Optional[str] = None,
68 | width: int = 800,
69 | height: int = 1032,
70 | ) -> Image:
71 | return generate_image(
72 | filename=filename,
73 | width=width,
74 | height=height,
75 | color_names=good_color_names,
76 | color_weights=good_color_weights,
77 | side_length=random.randint(300, 750),
78 | )
79 |
--------------------------------------------------------------------------------
/lncrawl/utils/ratelimit.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 |
4 | logger = logging.getLogger(__name__)
5 |
6 |
7 | class RateLimiter(object):
8 | """A helper class for a controlling number of requests per seconds.
9 | It is being used along with the TaskManager class.
10 |
11 | Args:
12 | - ratelimit (float, optional): Number of requests per seconds.
13 | """
14 |
15 | def __init__(self, ratelimit: float):
16 | if ratelimit <= 0:
17 | raise ValueError("ratelimit should be a non-zero positive number")
18 | self.period = 1 / ratelimit
19 | self._closed = False
20 |
21 | def _now(self):
22 | if hasattr(time, "monotonic"):
23 | return time.monotonic()
24 | return time.time()
25 |
26 | def __enter__(self):
27 | self._time = self._now()
28 |
29 | def __exit__(self, type, value, traceback):
30 | if self._closed:
31 | return
32 | d = (self._time + self.period) - self._now()
33 | self._time = self._now()
34 | if d > 0:
35 | time.sleep(d)
36 |
37 | def shutdown(self):
38 | self._closed = True
39 |
40 | def wrap(self, fn):
41 | def inner(*args, **kwargs):
42 | with self:
43 | return fn(*args, **kwargs)
44 |
45 | return inner
46 |
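Usage sketch (illustrative; fetch below is a stand-in, not a project function): entering the limiter records a start time and exiting sleeps for whatever remains of the per-request period, so the loop issues at most about two requests per second.

def fetch(url: str) -> None:
    print("fetching", url)               # stand-in for a real request

limiter = RateLimiter(ratelimit=2.0)     # ~2 requests per second
for url in ["https://example.com/a", "https://example.com/b", "https://example.com/c"]:
    with limiter:
        fetch(url)

limited_fetch = limiter.wrap(fetch)      # or wrap a callable once and reuse it
limiter.shutdown()                       # disables further sleeping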
--------------------------------------------------------------------------------
/lncrawl/utils/sockets.py:
--------------------------------------------------------------------------------
1 | import socket
2 |
3 |
4 | def free_port(host="127.0.0.1") -> int:
5 | """
6 | Determines a free port using sockets.
7 | """
8 | free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
9 | free_socket.bind((host, 0))
10 | free_socket.listen(5)
11 | port: int = free_socket.getsockname()[1]
12 | free_socket.close()
13 | return port
14 |
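Quick illustration (not part of the file above): the returned port was bound briefly and released, so it is very likely, though not guaranteed, to still be free when the caller binds it.

port = free_port()
print(f"starting a local preview server on 127.0.0.1:{port}")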
--------------------------------------------------------------------------------
/lncrawl/utils/ssl_no_verify.py:
--------------------------------------------------------------------------------
1 | """
2 | https://stackoverflow.com/a/15445989/1583052
3 | """
4 | import warnings
5 | import contextlib
6 |
7 | import requests
8 | from urllib3.exceptions import InsecureRequestWarning
9 |
10 |
11 | old_merge_environment_settings = requests.Session.merge_environment_settings
12 |
13 |
14 | @contextlib.contextmanager
15 | def no_ssl_verification():
16 | opened_adapters = set()
17 |
18 | def merge_environment_settings(self, url, proxies, stream, verify, cert):
19 | # Verification happens only once per connection so we need to close
20 | # all the opened adapters once we're done. Otherwise, the effects of
21 | # verify=False persist beyond the end of this context manager.
22 | opened_adapters.add(self.get_adapter(url))
23 |
24 | settings = old_merge_environment_settings(
25 | self, url, proxies, stream, verify, cert
26 | )
27 | settings["verify"] = False
28 |
29 | return settings
30 |
31 | requests.Session.merge_environment_settings = merge_environment_settings
32 |
33 | try:
34 | with warnings.catch_warnings():
35 | warnings.simplefilter("ignore", InsecureRequestWarning)
36 | yield
37 | finally:
38 | requests.Session.merge_environment_settings = old_merge_environment_settings
39 |
40 | for adapter in opened_adapters:
41 | try:
42 | adapter.close()
43 | except Exception:
44 | pass
45 |
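Usage sketch (illustrative; the host is hypothetical): inside the context manager every requests session skips certificate verification and InsecureRequestWarning is silenced; the original behaviour is restored on exit.

import requests

with no_ssl_verification():
    resp = requests.get("https://self-signed.example.invalid/data")   # hypothetical host

requests.get("https://example.com")      # verification is back to normal here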
--------------------------------------------------------------------------------
/lncrawl/utils/uploader/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | cloud_drive = os.getenv("CLOUD_DRIVE", "ANONFILES")
4 |
5 |
6 | def upload(file_path, description=None):
7 | if cloud_drive == "GOOGLE_DRIVE":
8 | from .google_drive import upload
9 |
10 | return upload(file_path, description)
11 | elif cloud_drive == "GOFILE":
12 | from .gofile import upload
13 |
14 | return upload(file_path, description)
15 | else:
16 | from .anonfiles import upload
17 |
18 | return upload(file_path, description)
19 |
--------------------------------------------------------------------------------
/lncrawl/utils/uploader/anonfiles.py:
--------------------------------------------------------------------------------
1 | from requests import Session
2 |
3 |
4 | # API Docs: https://anonfiles.com/docs/api
5 | def upload(file_path, description):
6 | with Session() as sess:
7 | with open(file_path, "rb") as fp:
8 | response = sess.post(
9 | "https://api.anonfiles.com/upload",
10 | files={"file": fp},
11 | stream=True,
12 | )
13 | response.raise_for_status()
14 | return response.json()["data"]["file"]["url"]["full"]
15 |
--------------------------------------------------------------------------------
/lncrawl/utils/uploader/gofile.py:
--------------------------------------------------------------------------------
1 | from requests import Session
2 |
3 |
4 | # API Docs: https://gofile.io/api
5 | def upload(file_path, description=""):
6 | with Session() as sess:
7 | response = sess.get("https://api.gofile.io/getServer")
8 | response.raise_for_status()
9 | server_name = response.json()["data"]["server"]
10 |
11 | with open(file_path, "rb") as fp:
12 | response = sess.post(
13 | f"https://{server_name}.gofile.io/uploadFile",
14 | files={"file": fp},
15 | stream=True,
16 | )
17 | response.raise_for_status()
18 | return response.json()["data"]["downloadPage"]
19 |
--------------------------------------------------------------------------------
/lncrawl/utils/uploader/google_drive.py:
--------------------------------------------------------------------------------
1 | """[DEPRECATED] Uploader for google drive"""
2 | import logging
3 | import os
4 |
5 | logger = logging.getLogger(__name__)
6 |
7 |
8 | try:
9 | from pydrive.auth import GoogleAuth
10 | from pydrive.drive import GoogleDrive
11 | except Exception:
12 | logger.error("`pydrive` was not setup properly")
13 |
14 |
15 | def upload(file_path, description=None) -> str:
16 | gauth = GoogleAuth()
17 | # gauth.LocalWebserverAuth()
18 |
19 | # Try to load saved client credentials
20 | credential_file = os.getenv("GOOGLE_DRIVE_CREDENTIAL_FILE")
21 | gauth.LoadCredentialsFile(credential_file)
22 | if gauth.credentials is None:
23 | # Authenticate if they're not there
24 | gauth.LocalWebserverAuth()
25 | elif gauth.access_token_expired:
26 | # Refresh them if expired
27 | gauth.Refresh()
28 | else:
29 | # Initialize the saved creds
30 | gauth.Authorize()
31 |
32 | # Save the current credentials to a file
33 | gauth.SaveCredentialsFile(credential_file)
34 |
35 | drive = GoogleDrive(gauth)
36 | folder_id = os.getenv("GOOGLE_DRIVE_FOLDER_ID")
37 | filename_w_ext = os.path.basename(file_path)
38 | filename, file_extension = os.path.splitext(filename_w_ext)
39 |
40 | # Upload file to folder
41 | f = drive.CreateFile({"parents": [{"kind": "drive#fileLink", "id": folder_id}]})
42 | f["title"] = filename_w_ext
43 |
44 | # Make sure to add the path to the file to upload below.
45 | f.SetContentFile(file_path)
46 | f.Upload()
47 |
48 | logger.info("Uploaded file id: {}", f["id"])
49 | return "https://drive.google.com/open?id=" + f["id"]
50 |
--------------------------------------------------------------------------------
/lncrawl/webdriver/__init__.py:
--------------------------------------------------------------------------------
1 | # https://cloudbytes.dev/snippets/run-selenium-and-chrome-on-wsl2
2 | # https://github.com/ultrafunkamsterdam/undetected-chromedriver
3 |
4 | import logging
5 | from typing import Optional
6 |
7 | from selenium.webdriver import ChromeOptions
8 | from selenium.webdriver.remote.webdriver import WebDriver
9 |
10 | from ..core.arguments import get_args
11 | from ..core.soup import SoupMaker
12 | from .local import create_local
13 | from .remote import create_remote
14 |
15 | logger = logging.getLogger(__name__)
16 |
17 |
18 | def create_new(
19 | options: Optional["ChromeOptions"] = None,
20 | timeout: Optional[float] = None,
21 | user_data_dir: Optional[str] = None,
22 | soup_maker: Optional[SoupMaker] = None,
23 | headless: bool = False,
24 | **kwargs,
25 | ) -> WebDriver:
26 | args = get_args()
27 | if args.selenium_grid:
28 | return create_remote(
29 | address=args.selenium_grid,
30 | options=options,
31 | timeout=timeout,
32 | soup_maker=soup_maker,
33 | )
34 | else:
35 | return create_local(
36 | options=options,
37 | timeout=timeout,
38 | soup_maker=soup_maker,
39 | user_data_dir=user_data_dir,
40 | headless=headless,
41 | )
42 |
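Illustrative call (not part of the file above): create_new returns a remote driver when --selenium-grid is configured and a local one otherwise; note that in the code above the headless flag is only forwarded to the local driver.

driver = create_new(timeout=30, headless=True)    # ChromeOptions may also be passed
try:
    driver.get("https://example.com")
    html = driver.page_source
finally:
    driver.quit()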
--------------------------------------------------------------------------------
/lncrawl/webdriver/job_queue.py:
--------------------------------------------------------------------------------
1 | import atexit
2 | import logging
3 | from threading import Semaphore, Thread
4 | from typing import List, Optional
5 |
6 | from selenium.webdriver.remote.webdriver import WebDriver
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 | MAX_BROWSER_INSTANCES = 8
11 |
12 | __open_browsers: List[WebDriver] = []
13 | __semaphore = Semaphore(MAX_BROWSER_INSTANCES)
14 |
15 |
16 | def __override_quit(driver: WebDriver):
17 | __open_browsers.append(driver)
18 | original = Thread(target=driver.quit, daemon=True)
19 |
20 | def override():
21 | if driver in __open_browsers:
22 | __semaphore.release()
23 | __open_browsers.remove(driver)
24 | logger.info("Destroyed instance: %s", driver.session_id)
25 | if not original._started.is_set(): # type:ignore
26 | original.start()
27 |
28 | driver.quit = override # type:ignore
29 |
30 |
31 | def _acquire_queue(timeout: Optional[float] = None):
32 | acquired = __semaphore.acquire(True, timeout)
33 | if not acquired:
34 | raise TimeoutError("Failed to acquire semaphore")
35 |
36 |
37 | def _release_queue(driver: WebDriver):
38 | __override_quit(driver)
39 |
40 |
41 | def check_active(driver: WebDriver) -> bool:
42 | if not isinstance(driver, WebDriver):
43 | return False
44 | return driver in __open_browsers
45 |
46 |
47 | def cleanup_drivers():
48 | for driver in list(__open_browsers):  # iterate a copy; the overridden quit() removes entries
49 | driver.close()
50 | driver.quit()
51 |
52 |
53 | atexit.register(cleanup_drivers)
54 |
--------------------------------------------------------------------------------
/requirements-app.txt:
--------------------------------------------------------------------------------
1 | # app requirements
2 | typer
3 | ascii
4 | regex
5 | packaging
6 | lxml[html-clean]
7 | pyease-grpc>=1.6.0
8 | python-dotenv>=0.15.0,<2.0.0
9 | beautifulsoup4>=4.8.0,<5.0.0
10 | requests>=2.20.0,<2.33.0
11 | python-slugify>=4.0.0,<9.0.0
12 | colorama>=0.4.0,<0.5.0
13 | tqdm>=4.60,<5.0
14 | PyExecJS>=1.5.1,<2.0.0
15 | ebooklib>=0.17.0,<1.0.0
16 | pillow>=6.0.0
17 | cloudscraper>=1.2.71
18 | readability-lxml>=0.8.0,<1.0.0
19 | questionary>=1.6.0
20 | prompt-toolkit~=3.0
21 | html5lib~=1.1
22 | base58~=2.1.1
23 | python-box>=6.0.0,<8.0.0
24 | pycryptodome>=3.0.0,<4.0.0
25 | selenium>=3.141.0
26 | tenacity>=9.0.0
27 |
--------------------------------------------------------------------------------
/requirements-bot.txt:
--------------------------------------------------------------------------------
1 | # bot requirements
2 | discord.py>=2.0.0
3 | python-telegram-bot[job-queue]~=20.0
4 | # pydrive>=1.3.1,<2.0.0
5 |
6 | uvicorn
7 | fastapi[standard]
8 | cachetools
9 | sqlmodel
10 | passlib[argon2]
11 | python-jose[cryptography]
12 | python-dateutil
13 | reflex
14 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # dev requirements
2 | wheel
3 | black
4 | flake8
5 | setuptools
6 | pyinstaller
7 | pycryptodome>=3.0.0,<4.0.0
8 |
9 | types-tqdm
10 | types-colorama
11 | types-cachetools
12 | types-python-dateutil
13 | types-passlib
14 | types-python-jose
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # app requirements
2 | typer
3 | ascii
4 | regex
5 | packaging
6 | lxml[html-clean]
7 | pyease-grpc>=1.6.0
8 | python-dotenv>=0.15.0,<2.0.0
9 | beautifulsoup4>=4.8.0,<5.0.0
10 | requests>=2.20.0,<2.33.0
11 | python-slugify>=4.0.0,<9.0.0
12 | colorama>=0.4.0,<0.5.0
13 | tqdm>=4.60,<5.0
14 | PyExecJS>=1.5.1,<2.0.0
15 | ebooklib>=0.17.0,<1.0.0
16 | pillow>=6.0.0
17 | cloudscraper>=1.2.71
18 | readability-lxml>=0.8.0,<1.0.0
19 | questionary>=1.6.0
20 | prompt-toolkit~=3.0
21 | html5lib~=1.1
22 | base58~=2.1.1
23 | python-box>=6.0.0,<8.0.0
24 | pycryptodome>=3.0.0,<4.0.0
25 | selenium>=3.141.0
26 | tenacity>=9.0.0
27 |
28 | # bot requirements
29 | discord.py>=2.0.0
30 | python-telegram-bot[job-queue]~=20.0
31 | uvicorn
32 | fastapi[standard]
33 | cachetools
34 | sqlmodel
35 | passlib[argon2]
36 | python-jose[cryptography]
37 | python-dateutil
38 | reflex
39 |
40 | # dev requirements
41 | wheel
42 | black
43 | flake8
44 | tk-tools
45 | setuptools
46 | pyinstaller
47 | types-tqdm
48 | types-colorama
49 | types-cachetools
50 | types-python-dateutil
51 | types-passlib
52 | types-python-jose
--------------------------------------------------------------------------------
/res/lncrawl-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/res/lncrawl-icon.png
--------------------------------------------------------------------------------
/res/lncrawl-web.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/res/lncrawl-web.png
--------------------------------------------------------------------------------
/res/lncrawl.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/res/lncrawl.ico
--------------------------------------------------------------------------------
/scripts/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.10-slim-bookworm
2 |
3 | USER root
4 | # Install general dependencies
5 | RUN apt-get update -yq \
6 | && apt-get install -yq \
7 | wget tar xz-utils make cmake g++ libffi-dev libegl1 libopengl0 libxcb-cursor0 \
8 | libnss3 libgl1-mesa-glx libxcomposite1 libxrandr2 libxi6 fontconfig \
9 | libxkbcommon-x11-0 libxtst6 libxkbfile1 libxcomposite-dev libxdamage-dev \
10 | && rm -rf /var/lib/apt/lists/* \
11 | && apt-get clean autoclean \
12 | && apt-get autoremove -yq
13 |
14 | # Install calibre
15 | RUN wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sh /dev/stdin \
16 | && ln -s /opt/calibre/ebook-convert /usr/local/bin/ebook-convert
17 |
18 | # Add app user
19 | RUN useradd -ms /bin/bash lncrawl
20 | USER lncrawl
21 |
22 | # Install global requirements
23 | RUN alias python=python3
24 | RUN alias pip=pip3
25 | RUN export PATH="/home/lncrawl/.local/bin:$PATH"
26 | RUN pip install -U pip wheel
27 |
28 | WORKDIR /app
29 |
30 | # Install app requirements
31 | COPY --chown=lncrawl:lncrawl requirements.txt .
32 | RUN pip install -r requirements.txt
33 |
34 | COPY .env .env
35 | COPY lncrawl lncrawl
36 | COPY sources sources
37 |
38 | ENV OUTPUT_PATH=/home/lncrawl/output
39 | RUN mkdir -p $OUTPUT_PATH
40 |
41 | ENTRYPOINT [ "python", "-m", "lncrawl" ]
42 |
--------------------------------------------------------------------------------
/scripts/bitanon.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env sh
2 |
3 | VERSION=$(head -n 1 lncrawl/VERSION)
4 |
5 | # SHLINK_API_KEY=
6 |
7 |
8 | EXE_LINK="https://github.com/dipu-bd/lightnovel-crawler/releases/download/v$VERSION/lncrawl.exe"
9 | EXE_TITLE="Lightnovel Crawler v$VERSION (Windows)"
10 |
11 | LINUX_LINK="https://github.com/dipu-bd/lightnovel-crawler/releases/download/v$VERSION/lncrawl-linux"
12 | LINUX_TITLE="Lightnovel Crawler v$VERSION (Linux)"
13 |
14 | MAC_LINK="https://github.com/dipu-bd/lightnovel-crawler/releases/download/v$VERSION/lncrawl-mac"
15 | MAC_TITLE="Lightnovel Crawler v$VERSION (Mac)"
16 |
17 | set -ex
18 |
19 | curl -X 'PATCH' \
20 | 'https://go.bitanon.dev/rest/v3/short-urls/lncrawl-windows' \
21 | -H 'accept: application/json' \
22 | -H 'Content-Type: application/json' \
23 | -H "X-Api-Key: $SHLINK_API_KEY" \
24 | -d '{"title": "'"$EXE_TITLE"'","longUrl": "'"$EXE_LINK"'"}'
25 |
26 | curl -X 'PATCH' \
27 | 'https://go.bitanon.dev/rest/v3/short-urls/lncrawl-linux' \
28 | -H 'accept: application/json' \
29 | -H 'Content-Type: application/json' \
30 | -H "X-Api-Key: $SHLINK_API_KEY" \
31 | -d '{"title": "'"$LINUX_TITLE"'","longUrl": "'"$LINUX_LINK"'"}'
32 |
33 | curl -X 'PATCH' \
34 | 'https://go.bitanon.dev/rest/v3/short-urls/lncrawl-mac' \
35 | -H 'accept: application/json' \
36 | -H 'Content-Type: application/json' \
37 | -H "X-Api-Key: $SHLINK_API_KEY" \
38 | -d '{"title": "'"$MAC_TITLE"'","longUrl": "'"$MAC_LINK"'"}'
39 |
--------------------------------------------------------------------------------
/scripts/build.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | SET /P VERSION= " + " ".join(body) + " %s
".join(body) 57 | -------------------------------------------------------------------------------- /sources/en/o/ornovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class OrNovel(Crawler): 9 | base_url = "https://www.ornovel.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | self.novel_title = " ".join( 16 | [str(x) for x in soup.select_one(".title h1").contents if not x.name] 17 | ).strip() 18 | logger.info("Novel title: %s", self.novel_title) 19 | 20 | probable_img = soup.select_one(".intro-left img.book-image") 21 | if probable_img: 22 | self.novel_cover = self.absolute_url(probable_img["src"]) 23 | logger.info("Novel cover: %s", self.novel_cover) 24 | 25 | self.novel_author = " ".join( 26 | [a.text.strip() for a in soup.select(".author-container")] 27 | ) 28 | logger.info("%s", self.novel_author) 29 | 30 | volumes = set() 31 | chapters = soup.select("ul.chapters-all li.chapters-item a") 32 | for a in chapters: 33 | chap_id = len(self.chapters) + 1 34 | vol_id = (chap_id - 1) // 100 + 1 35 | volumes.add(vol_id) 36 | self.chapters.append( 37 | { 38 | "id": chap_id, 39 | "volume": vol_id, 40 | "url": self.absolute_url(a["href"]), 41 | "title": a.text.strip() or ("Chapter %d" % chap_id), 42 | } 43 | ) 44 | 45 | self.volumes = [{"id": x} for x in volumes] 46 | 47 | def download_chapter_body(self, chapter): 48 | soup = self.get_soup(chapter["url"]) 49 | 50 | contents = soup.select_one("div.chapter-detail") 51 | for bad in contents.select( 52 | "h2, ins, .chapter-header .code-block, script, .adsbygoogle" 53 | ): 54 | bad.extract() 55 | 56 | return str(contents) 57 | -------------------------------------------------------------------------------- /sources/en/p/pandamanga.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class PandaMangaxyzCrawler(MangaStreamTemplate): 10 | base_url = ["https://www.pandamanga.xyz/"] 11 | -------------------------------------------------------------------------------- /sources/en/p/pandanovelco.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Generator 4 | from bs4 import BeautifulSoup, Tag 5 | from lncrawl.templates.novelpub import NovelPubTemplate 6 | 7 | 8 | class PandaNovelCo(NovelPubTemplate): 9 | base_url = [ 10 | "https://pandanovel.co/", 11 | ] 12 | 13 | # We override because we do not have a request token like other novel pub 14 | # (without that wrong error is raised and browser search isn't triggered) 15 | def select_search_items(self, query: str) -> Generator[Tag, None, None]: 16 | self.submit_form( 17 | f"{self.home_url}lnsearchlive", 18 | data={"inputContent": query}, 19 | headers={ 20 | "referer": f"{self.home_url}search", 21 | }, 22 | ) 23 | 24 | # override this because somehow novel_url is always missing trailing / 25 | def select_chapter_tags_in_browser(self): 26 | next_link = f"{self.novel_url}/chapters" 27 | while next_link: 28 | self.browser.visit(next_link) 29 | self.browser.wait("ul.chapter-list li") 30 | chapter_list = self.browser.find("ul.chapter-list") 31 | yield 
from chapter_list.as_tag().select("li a") 32 | try: 33 | next_link = self.browser.find('.PagedList-skipToNext a[rel="next"]') 34 | next_link = next_link.get_attribute("href") 35 | except Exception: 36 | next_link = False 37 | 38 | # .chapter-content -> #content 39 | def select_chapter_body(self, soup: BeautifulSoup) -> Tag: 40 | self.browser.wait("#content") 41 | return soup.select_one("#content") 42 | -------------------------------------------------------------------------------- /sources/en/p/pandanovelorg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelfull import NovelFullTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class Pandanovelorg(NovelFullTemplate): 8 | has_mtl = False 9 | has_manga = False 10 | base_url = ["https://pandanovel.org/"] 11 | -------------------------------------------------------------------------------- /sources/en/r/readmtl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.madara import MadaraTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Readmtl(MadaraTemplate): 9 | has_mtl = True 10 | has_manga = False 11 | base_url = ["https://readmtl.com/"] 12 | -------------------------------------------------------------------------------- /sources/en/r/readnovelfull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ReadNovelFullCrawler(NovelFullTemplate): 10 | base_url = "https://readnovelfull.com/" 11 | -------------------------------------------------------------------------------- /sources/en/r/readwn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class ReadWNCrawler(NovelMTLTemplate): 8 | has_mtl = True 9 | base_url = [ 10 | "https://www.readwn.com/", 11 | "https://www.wuxiap.com/" 12 | ] 13 | -------------------------------------------------------------------------------- /sources/en/s/sleepytrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class SleepyTranslations(Crawler): 9 | base_url = "https://sleepytranslations.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | possible_title = soup.select_one(".post-title h1") 16 | for span in possible_title.select("span"): 17 | span.extract() 18 | self.novel_title = possible_title.text.strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | possible_image = soup.select_one(".summary_image a img") 22 | if possible_image: 23 | self.novel_cover = self.absolute_url(possible_image["src"]) 24 | logger.info("Novel cover: %s", self.novel_cover) 25 | 26 | self.novel_author = " ".join( 27 | [a.text.strip() for a in soup.select('.author-content a[href*="author"]')] 28 | ) 29 | logger.info("%s", self.novel_author) 30 | 31 | self.novel_id = soup.select_one("#manga-chapters-holder")["data-id"] 32 | logger.info("Novel id: %s", self.novel_id) 33 | 34 | 
response = self.submit_form(self.novel_url.strip("/") + "/ajax/chapters") 35 | soup = self.make_soup(response) 36 | for a in reversed(soup.select(".wp-manga-chapter a")): 37 | chap_id = len(self.chapters) + 1 38 | vol_id = 1 + len(self.chapters) // 100 39 | if chap_id % 100 == 1: 40 | self.volumes.append({"id": vol_id}) 41 | self.chapters.append( 42 | { 43 | "id": chap_id, 44 | "volume": vol_id, 45 | "title": a.text.strip(), 46 | "url": self.absolute_url(a["href"]), 47 | } 48 | ) 49 | 50 | def download_chapter_body(self, chapter): 51 | soup = self.get_soup(chapter["url"]) 52 | contents = soup.select(".reading-content p") 53 | return "".join([str(p) for p in contents]) 54 | -------------------------------------------------------------------------------- /sources/en/s/smnovels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class SMNovelsCrawler(Crawler): 9 | base_url = "https://smnovels.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | # Site has no author name or novel covers. 16 | possible_title = soup.select_one("h1.entry-title") 17 | assert possible_title, "No novel title" 18 | self.novel_title = possible_title.text.strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | for a in soup.select(".all-chapters-list a"): 22 | chap_id = len(self.chapters) + 1 23 | vol_id = len(self.chapters) // 100 + 1 24 | if len(self.chapters) % 100 == 0: 25 | self.volumes.append({"id": vol_id}) 26 | self.chapters.append( 27 | { 28 | "id": chap_id, 29 | "volume": vol_id, 30 | "title": a.text.strip(), 31 | "url": self.absolute_url(a["href"]), 32 | } 33 | ) 34 | 35 | def download_chapter_body(self, chapter): 36 | soup = self.get_soup(chapter["url"]) 37 | 38 | contents = soup.select_one(".entry-content") 39 | for bad in contents.select("br"): 40 | bad.extract() 41 | return self.cleaner.extract_contents(contents) 42 | -------------------------------------------------------------------------------- /sources/en/s/sonicmtl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from bs4 import BeautifulSoup, Tag 3 | from lncrawl.templates.madara import MadaraTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class SonicMTLCrawler(MadaraTemplate): 9 | has_mtl = True 10 | base_url = [ 11 | "https://sonicmtl.com", 12 | "https://www.sonicmtl.com/", 13 | ] 14 | 15 | def initialize(self): 16 | super().initialize() 17 | self.cleaner.bad_css.update( 18 | { 19 | ".ad", 20 | ".c-ads", 21 | ".custom-code", 22 | ".body-top-ads", 23 | ".before-content-ad", 24 | ".autors-widget", 25 | } 26 | ) 27 | 28 | def select_chapter_body(self, soup: BeautifulSoup) -> Tag: 29 | return soup.select_one(".reading-content .text-left") 30 | -------------------------------------------------------------------------------- /sources/en/s/steambun.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class SteambunCrawler(Crawler): 11 | base_url = "https://steambunlightnovel.com/" 12 | 13 | def read_novel_info(self): 14 | logger.debug("Visiting %s", self.novel_url) 15 | soup = self.get_soup(self.novel_url) 
16 | 17 | possible_title = soup.select_one("h1.entry-title") 18 | assert possible_title, "No novel title" 19 | self.novel_title = possible_title.text 20 | logger.info("Novel title: %s", self.novel_title) 21 | 22 | self.novel_author = "by SteamBun Translations" 23 | logger.info("Novel author: %s", self.novel_author) 24 | 25 | # Site does not list covers. 26 | 27 | volumes = set([]) 28 | for a in reversed( 29 | soup.select('div.w4pl-inner li a[href*="steambunlightnovel.com"]') 30 | ): 31 | title = a.text.strip() 32 | chapter_id = len(self.chapters) + 1 33 | volume_id = 1 + (chapter_id - 1) // 100 34 | volumes.add(volume_id) 35 | self.chapters.append( 36 | { 37 | "id": chapter_id, 38 | "volume": volume_id, 39 | "title": title, 40 | "url": a["href"], 41 | } 42 | ) 43 | 44 | self.chapters.sort(key=lambda x: x["id"]) 45 | self.volumes = [{"id": x, "title": ""} for x in volumes] 46 | 47 | def download_chapter_body(self, chapter): 48 | soup = self.get_soup(chapter["url"]) 49 | content = soup.select_one("div.entry-content") 50 | assert content, "No chapter content" 51 | self.cleaner.clean_contents(content) 52 | body = content.select("p") 53 | body = [str(p) for p in body if self.should_take(p)] 54 | return "
" + "
".join(body) + "
" 55 | 56 | def should_take(self, p): 57 | txt = p.text.strip().lower() 58 | return txt and txt != "advertisement" 59 | -------------------------------------------------------------------------------- /sources/en/s/systemtranslation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class SystemTranslationCrawler(MangaStreamTemplate): 10 | base_url = ["https://systemtranslation.com/"] 11 | -------------------------------------------------------------------------------- /sources/en/t/tamagotl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class TamagoTlCrawler(MangaStreamTemplate): 10 | base_url = ["https://tamagotl.com/"] 11 | has_mtl = True 12 | -------------------------------------------------------------------------------- /sources/en/t/teanovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | 5 | from bs4 import Tag 6 | 7 | from lncrawl.core.crawler import Crawler 8 | from lncrawl.core.exeptions import LNException 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class TeaNovelCrawler(Crawler): 14 | base_url = "https://www.teanovel.com" 15 | 16 | def initialize(self): 17 | self.init_executor( 18 | workers=4 19 | ) 20 | 21 | def read_novel_info(self): 22 | soup = self.get_soup(self.novel_url) 23 | 24 | script_tag = soup.select_one("script#__NEXT_DATA__") 25 | if not isinstance(script_tag, Tag): 26 | raise LNException("No script data found") 27 | 28 | next_data = json.loads(script_tag.get_text()) 29 | 30 | novel_data = next_data["props"]["pageProps"]["novel"] 31 | 32 | self.novel_title = novel_data["name"] 33 | self.novel_author = novel_data["author"] 34 | 35 | img_tag = soup.select_one("main img[src*='_next/']") 36 | if isinstance(img_tag, Tag): 37 | self.novel_cover = self.absolute_url(img_tag["src"]) 38 | 39 | chapters = self.get_soup(self.novel_url + "/chapter-list").select("a.border-b") 40 | for chapter in chapters: 41 | chapter_id = len(self.chapters) + 1 42 | self.chapters.append( 43 | { 44 | "id": chapter_id, 45 | "title": chapter.select_one("p").get_text(strip=True), 46 | "url": self.absolute_url(chapter["href"]), 47 | } 48 | ) 49 | 50 | def download_chapter_body(self, chapter): 51 | chapter = self.get_soup(chapter["url"]) 52 | return self.cleaner.extract_contents(chapter.select_one("div.prose")) 53 | -------------------------------------------------------------------------------- /sources/en/t/totallytranslations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from requests.sessions import Session 5 | 6 | from lncrawl.core.crawler import Crawler 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class TotallyTranslations(Crawler): 12 | base_url = "https://totallytranslations.com/" 13 | 14 | def initialize(self): 15 | self.scraper = Session() 16 | 17 | def read_novel_info(self): 18 | logger.debug("Visiting %s", self.novel_url) 19 | soup = self.get_soup(self.novel_url) 20 | 21 | possible_title = soup.select_one(".entry-title") 22 | assert possible_title, "No 
novel title" 23 | self.novel_title = possible_title.text 24 | logger.info("Novel title: %s", self.novel_title) 25 | 26 | possible_image = soup.select_one(".novel-image img") 27 | if possible_image: 28 | self.novel_cover = self.absolute_url(possible_image["src"]) 29 | logger.info("Novel cover: %s", self.novel_cover) 30 | 31 | for p in soup.select(".chapters-list .chapters-title"): 32 | vol_title = p.text.strip() 33 | vol_id = len(self.volumes) + 1 34 | self.volumes.append( 35 | { 36 | "id": vol_id, 37 | "title": vol_title, 38 | } 39 | ) 40 | 41 | ul = p.find_next("ul") 42 | for a in ul.select("a"): 43 | chap_id = len(self.chapters) + 1 44 | self.chapters.append( 45 | { 46 | "id": chap_id, 47 | "volume": vol_id, 48 | "title": a.text.strip(), 49 | "url": self.absolute_url(a["href"]), 50 | } 51 | ) 52 | 53 | def download_chapter_body(self, chapter): 54 | soup = self.get_soup(chapter["url"]) 55 | paras = soup.select(".post-content p") 56 | return "\n".join([str(p) for p in paras if p.text.strip()]) 57 | -------------------------------------------------------------------------------- /sources/en/v/veratales.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class VeraTales(Crawler): 10 | base_url = "https://veratales.com/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.find("h1").text.strip() 17 | logger.info("Novel title: %s", self.novel_title) 18 | 19 | # self.novel_author= soup.find("div",{"class":"novel-author-info"}).find("h4").text.strip() 20 | self.novel_author = "" 21 | logger.info("%s", self.novel_author) 22 | 23 | possible_image = soup.select_one("div.card-header a img") 24 | if possible_image: 25 | self.novel_cover = self.absolute_url(possible_image["src"]) 26 | logger.info("Novel cover: %s", self.novel_cover) 27 | 28 | chapters = soup.select("table td a") 29 | for a in reversed(chapters): 30 | chap_id = len(self.chapters) + 1 31 | vol_id = 1 + len(self.chapters) // 100 32 | if len(self.volumes) < vol_id: 33 | self.volumes.append({"id": vol_id}) 34 | self.chapters.append( 35 | { 36 | "id": chap_id, 37 | "volume": vol_id, 38 | "url": self.absolute_url(a["href"]), 39 | "title": a.text.strip() or ("Chapter %d" % chap_id), 40 | } 41 | ) 42 | 43 | def download_chapter_body(self, chapter): 44 | soup = self.get_soup(chapter["url"]) 45 | contents = soup.select_one("div.reader-content") 46 | return self.cleaner.extract_contents(contents) 47 | -------------------------------------------------------------------------------- /sources/en/w/webnovelonlinecom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import json 4 | import logging 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class WebnovelOnlineDotComCrawler(Crawler): 11 | base_url = "https://webnovelonline.com/" 12 | 13 | def read_novel_info(self): 14 | url = self.novel_url 15 | soup = self.get_soup(url) 16 | 17 | possible_title = soup.select_one(".novel-info .novel-desc h1") 18 | assert possible_title, "No novel title" 19 | self.novel_title = possible_title.text 20 | logger.info("Novel title: %s", self.novel_title) 21 | 22 | possible_novel_cover = soup.select_one('meta[property="og:image"]') 23 | if 
possible_novel_cover: 24 | self.novel_cover = self.absolute_url(possible_novel_cover["content"]) 25 | logger.info("Novel cover: %s", self.novel_cover) 26 | 27 | volumes = set([]) 28 | for a in reversed(soup.select(".chapter-list .item a")): 29 | chap_id = len(self.chapters) + 1 30 | vol_id = 1 + len(self.chapters) // 100 31 | volumes.add(vol_id) 32 | self.chapters.append( 33 | { 34 | "id": chap_id, 35 | "volume": vol_id, 36 | "title": a.text.strip(), 37 | "url": self.absolute_url(a["href"]), 38 | } 39 | ) 40 | 41 | self.volumes = [{"id": x, "title": ""} for x in volumes] 42 | 43 | def download_chapter_body(self, chapter): 44 | soup = self.get_soup(chapter["url"]) 45 | 46 | for script in soup.select("script"): 47 | text = script.string 48 | if not text or not text.startswith("window._INITIAL_DATA_"): 49 | continue 50 | content = re.findall(r',"chapter":(".+")},', text)[0] 51 | content = json.loads(content).strip() 52 | return "<p>" + "</p><p>".join(content.split("\n\n")) + "</p>
" 53 | 54 | return "" 55 | -------------------------------------------------------------------------------- /sources/en/w/webnovelpub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lncrawl.templates.novelpub import NovelPubTemplate 4 | 5 | 6 | class WebnovelpubCrawler(NovelPubTemplate): 7 | base_url = [ 8 | "https://www.webnovelpub.com/", 9 | "https://www.webnovelpub.pro/", 10 | ] 11 | -------------------------------------------------------------------------------- /sources/en/w/whatsawhizzerwebnovels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Tag 5 | 6 | from lncrawl.core.crawler import Crawler 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class WhatsAWhizzerCrawler(Crawler): 12 | base_url = ["https://whatsawhizzerwebnovels.com/"] 13 | 14 | def read_novel_info(self): 15 | logger.debug("Visiting %s", self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select_one(".page-header-title").text.strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | cover_tag = soup.select_one('meta[property="og:image"]') 22 | 23 | if isinstance(cover_tag, Tag): 24 | self.novel_cover = cover_tag["content"] 25 | 26 | logger.info("Novel cover: %s", self.novel_cover) 27 | 28 | for a in soup.select(".entry > p > a"): 29 | self.chapters.append( 30 | { 31 | "id": len(self.chapters) + 1, 32 | "url": self.absolute_url(a["href"]), 33 | "title": a.text.strip(), 34 | } 35 | ) 36 | 37 | def download_chapter_body(self, chapter): 38 | soup = self.get_soup(chapter["url"]) 39 | contents = soup.select_one("article > div") 40 | 41 | nav_tags = contents.find_all("a", string="Table of Contents") 42 | for nav in nav_tags: 43 | nav.parent.extract() 44 | 45 | self.cleaner.clean_contents(contents) 46 | 47 | return str(contents) 48 | -------------------------------------------------------------------------------- /sources/en/w/wuxiabox.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelmtl import NovelMTLTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Wuxiabox(NovelMTLTemplate): 9 | has_mtl = True 10 | has_manga = False 11 | base_url = ["https://www.wuxiabox.com/"] 12 | -------------------------------------------------------------------------------- /sources/en/w/wuxiahub.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaHubCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiahub.com" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiamtl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaMTLCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiamtl.com" 9 | has_mtl = True 10 | -------------------------------------------------------------------------------- /sources/en/w/wuxianovelhub.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelmtl import NovelMTLTemplate 
4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class WuxiaNHCrawler(NovelMTLTemplate): 9 | base_url = "https://www.wuxianovelhub.com/" 10 | -------------------------------------------------------------------------------- /sources/en/w/wuxiapub.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaPubCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiapub.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiar.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaRCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiar.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiaspot.py: -------------------------------------------------------------------------------- 1 | from lncrawl.templates.novelmtl import NovelMTLTemplate 2 | 3 | 4 | class WuxiaSpotCrawler(NovelMTLTemplate): 5 | has_mtl = False 6 | has_manga = False 7 | base_url = "https://www.wuxiaspot.com/" 8 | -------------------------------------------------------------------------------- /sources/en/w/wuxiau.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaUCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiau.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiav.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaVCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiav.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiax.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaXCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiax.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiaz.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaZCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiaz.com/" 9 | -------------------------------------------------------------------------------- /sources/en/x/xiainovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Comment 5 | 6 | from lncrawl.core.crawler import Crawler 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class YukiNovelCrawler(Crawler): 12 | base_url = "https://www.xiainovel.com/" 13 | 14 | def read_novel_info(self): 15 | logger.debug("Visiting %s", self.novel_url) 16 | soup = 
self.get_soup(self.novel_url) 17 | 18 | possible_title = soup.select_one("div.page-header h1") 19 | assert possible_title, "No novel title" 20 | self.novel_title = possible_title.text 21 | logger.info("Novel title: %s", self.novel_title) 22 | 23 | self.novel_author = "Translated by XiaiNovel" 24 | logger.info("Novel author: %s", self.novel_author) 25 | 26 | # NOTE: Can't fetch cover url, as it's listed a base64 code. 27 | # self.novel_cover = self.absolute_url( 28 | # soup.select_one('div.col-md-6 img') 29 | # logger.info('Novel cover: %s', self.novel_cover) 30 | 31 | # Extract volume-wise chapter entries 32 | chapters = soup.select("ul.list-group li a") 33 | 34 | chapters.reverse() 35 | 36 | for a in chapters: 37 | chap_id = len(self.chapters) + 1 38 | vol_id = 1 + len(self.chapters) // 100 39 | if len(self.volumes) < vol_id: 40 | self.volumes.append({"id": vol_id}) 41 | self.chapters.append( 42 | { 43 | "id": chap_id, 44 | "volume": vol_id, 45 | "url": self.absolute_url(a["href"]), 46 | "title": a.text.strip() or ("Chapter %d" % chap_id), 47 | } 48 | ) 49 | 50 | def download_chapter_body(self, chapter): 51 | soup = self.get_soup(chapter["url"]) 52 | 53 | contents = soup.select_one("section#StoryContent") 54 | 55 | for d in contents.findAll("div"): 56 | d.extract() 57 | 58 | for comment in contents.find_all(string=lambda text: isinstance(text, Comment)): 59 | comment.extract() 60 | 61 | return str(contents) 62 | -------------------------------------------------------------------------------- /sources/fr/lightnovelfr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class LightnovelFrCrawler(MangaStreamTemplate): 10 | base_url = ["https://lightnovelfr.com/"] 11 | -------------------------------------------------------------------------------- /sources/fr/xiaowaz.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Tag 5 | 6 | from lncrawl.core.crawler import Crawler 7 | from lncrawl.core.exeptions import LNException 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class XiaowazCrawler(Crawler): 13 | base_url = ["https://xiaowaz.fr/"] 14 | 15 | def initialize(self) -> None: 16 | self.cleaner.bad_css.update( 17 | [".abh_box_business", ".footnote_container_prepare"] 18 | ) 19 | 20 | def read_novel_info(self): 21 | soup = self.get_soup(self.novel_url) 22 | 23 | title_tag = soup.select_one("h1.card_title") 24 | if not isinstance(title_tag, Tag): 25 | raise LNException("No title found") 26 | 27 | self.novel_title = title_tag.text.strip() 28 | 29 | image_tag = soup.select_one(".entry-content img") 30 | if isinstance(image_tag, Tag): 31 | self.novel_cover = self.absolute_url(image_tag["src"]) 32 | 33 | logger.info("Novel cover: %s", self.novel_cover) 34 | 35 | for a in soup.select(".entry-content a[href*='/articles/']"): 36 | self.chapters.append( 37 | { 38 | "id": len(self.chapters) + 1, 39 | "title": a.text.strip(), 40 | "url": self.absolute_url(a["href"]), 41 | } 42 | ) 43 | 44 | def download_chapter_body(self, chapter): 45 | soup = self.get_soup(chapter["url"]) 46 | contents = soup.select_one(".entry-content") 47 | self.cleaner.clean_contents(contents) 48 | 49 | return str(contents) 50 | -------------------------------------------------------------------------------- 
/sources/id/darktrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class DarkTranslation(Crawler): 10 | base_url = "https://darktranslation.com/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() 17 | logger.info("Novel title: %s", self.novel_title) 18 | 19 | # FIXME: Problem getting cover image, tried multiple ways and keep getting error. 20 | # self.novel_cover = self.absolute_url( 21 | # soup.select_one('div.entry-content p img') 22 | # logger.info('Novel cover: %s', self.novel_cover) 23 | 24 | self.novel_author = "by Dark Translation" 25 | logger.info("Novel author: %s", self.novel_author) 26 | 27 | # Extract volume-wise chapter entries 28 | # Stops external links being selected as chapters 29 | chapters = soup.select("div.entry-content p a") 30 | 31 | for a in chapters: 32 | chap_id = len(self.chapters) + 1 33 | vol_id = 1 + len(self.chapters) // 100 34 | if len(self.volumes) < vol_id: 35 | self.volumes.append({"id": vol_id}) 36 | self.chapters.append( 37 | { 38 | "id": chap_id, 39 | "volume": vol_id, 40 | "url": self.absolute_url(a["href"]), 41 | "title": a.text.strip() or ("Chapter %d" % chap_id), 42 | } 43 | ) 44 | 45 | def download_chapter_body(self, chapter): 46 | soup = self.get_soup(chapter["url"]) 47 | contents = soup.select("div.entry-content") 48 | return self.cleaner.extract_contents(contents) 49 | -------------------------------------------------------------------------------- /sources/id/novelringan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class NovelRinganCrawler(Crawler): 9 | base_url = "https://novelringan.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | possible_title = soup.select_one("h1.entry-title") 16 | assert possible_title, "No novel title" 17 | self.novel_title = possible_title.text 18 | logger.info("Novel title: %s", self.novel_title) 19 | 20 | possible_image = soup.select_one("div.imgprop img") 21 | if possible_image: 22 | self.novel_cover = self.absolute_url(possible_image["src"]) 23 | logger.info("Novel cover: %s", self.novel_cover) 24 | 25 | self.novel_author = " ".join( 26 | [a.text.strip() for a in soup.select('.entry-author a[href*="/author/"]')] 27 | ) 28 | logger.info("%s", self.novel_author) 29 | 30 | for a in reversed(soup.select(".bxcl ul li a")): 31 | chap_id = len(self.chapters) + 1 32 | vol_id = 1 + len(self.chapters) // 100 33 | if len(self.volumes) < vol_id: 34 | self.volumes.append({"id": vol_id}) 35 | self.chapters.append( 36 | { 37 | "id": chap_id, 38 | "volume": vol_id, 39 | "url": self.absolute_url(a["href"]), 40 | "title": a.text.strip() or ("Chapter %d" % chap_id), 41 | } 42 | ) 43 | 44 | def download_chapter_body(self, chapter): 45 | soup = self.get_soup(chapter["url"]) 46 | contents = soup.select(".entry-content p") 47 | 48 | body = [str(p) for p in contents if p.text.strip()] 49 | 50 | return "" + "
".join(body) + "
" 51 | -------------------------------------------------------------------------------- /sources/id/zhiend.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ZhiEnd(Crawler): 10 | base_url = ["http://zhi-end.blogspot.com/", "http://zhi-end.blogspot.co.id/"] 11 | 12 | def initialize(self): 13 | self.home_url = "http://zhi-end.blogspot.com/" 14 | 15 | def read_novel_info(self): 16 | logger.debug("Visiting %s", self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | possible_title = soup.select_one("h1.entry-title") 20 | assert possible_title, "No novel title" 21 | self.novel_title = possible_title.text.strip() 22 | logger.info("Novel title: %s", self.novel_title) 23 | 24 | possible_image = soup.select_one("div.entry-content div a img") 25 | if possible_image: 26 | self.novel_cover = self.absolute_url(possible_image["src"]) 27 | logger.info("Novel cover: %s", self.novel_cover) 28 | 29 | self.novel_author = "Translated by Zhi End" 30 | logger.info("Novel author: %s", self.novel_author) 31 | 32 | # Extract volume-wise chapter entries 33 | chapters = soup.select('div.entry-content div [href*="zhi-end.blogspot"]') 34 | 35 | for a in chapters: 36 | chap_id = len(self.chapters) + 1 37 | vol_id = 1 + len(self.chapters) // 100 38 | if len(self.volumes) < vol_id: 39 | self.volumes.append({"id": vol_id}) 40 | self.chapters.append( 41 | { 42 | "id": chap_id, 43 | "volume": vol_id, 44 | "url": self.absolute_url(a["href"]), 45 | "title": a.text.strip() or ("Chapter %d" % chap_id), 46 | } 47 | ) 48 | 49 | def download_chapter_body(self, chapter): 50 | soup = self.get_soup(chapter["url"]) 51 | 52 | body_parts = soup.select_one("div.post-body") 53 | 54 | return self.cleaner.extract_contents(body_parts) 55 | -------------------------------------------------------------------------------- /sources/multi/quotev.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Tag 5 | 6 | from lncrawl.core.crawler import Crawler 7 | from lncrawl.core.exeptions import LNException 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class QuotevCrawler(Crawler): 13 | base_url = ["https://www.quotev.com/"] 14 | 15 | def initialize(self) -> None: 16 | self.cleaner.bad_css.update([".nosel"]) 17 | 18 | def read_novel_info(self): 19 | soup = self.get_soup(self.novel_url) 20 | 21 | title_tag = soup.select_one("#quizHeaderTitle h1") 22 | if not isinstance(title_tag, Tag): 23 | raise LNException("No title found") 24 | 25 | self.novel_title = title_tag.text.strip() 26 | 27 | image_tag = soup.select_one("meta[property='og:image']") 28 | if isinstance(image_tag, Tag): 29 | self.novel_cover = self.absolute_url(image_tag["content"]) 30 | 31 | logger.info("Novel cover: %s", self.novel_cover) 32 | 33 | for a in soup.select("#rselectList a"): 34 | self.chapters.append( 35 | { 36 | "id": len(self.chapters) + 1, 37 | "title": a.text.strip(), 38 | "url": self.absolute_url(a["href"]), 39 | } 40 | ) 41 | 42 | def download_chapter_body(self, chapter): 43 | soup = self.get_soup(chapter["url"]) 44 | contents = soup.select_one("#rescontent") 45 | self.cleaner.clean_contents(contents) 46 | 47 | return str(contents) 48 | -------------------------------------------------------------------------------- /sources/pt/centralnovel.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class CentralNovelCrawler(MangaStreamTemplate): 10 | base_url = ["https://centralnovel.com/"] 11 | 12 | def initialize(self) -> None: 13 | self.init_executor(ratelimit=2.99) 14 | -------------------------------------------------------------------------------- /sources/ru/bestmanga.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | from lncrawl.templates.madara import MadaraTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class BestMangaCrawler(MadaraTemplate): 10 | has_manga = True 11 | base_url = ["https://bestmanga.club/"] 12 | -------------------------------------------------------------------------------- /sources/ru/ifreedom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class IfreedomCrawler(Crawler): 10 | base_url = [ 11 | "https://ifreedom.su/", 12 | "https://bookhamster.ru/" 13 | ] 14 | 15 | def read_novel_info(self): 16 | soup = self.get_soup(self.novel_url) 17 | 18 | possible_title = soup.select_one("h1.entry-title") 19 | if possible_title: 20 | self.novel_title = possible_title.get_text() 21 | 22 | logger.info("Novel title: %s", self.novel_title) 23 | 24 | possible_author = soup.select_one("span.dashicons-admin-users").next\ 25 | .next\ 26 | .next 27 | if "Не указан" not in str(possible_author): 28 | self.novel_author = possible_author.get_text() 29 | logger.info("Novel author: %s", self.novel_author) 30 | 31 | possible_full_synopsis = soup.select_one("span.open-desc") 32 | if possible_full_synopsis: 33 | possible_full_synopsis = possible_full_synopsis["onclick"] 34 | self.novel_synopsis = possible_full_synopsis.split("= '")[1].strip("';") 35 | else: 36 | self.novel_synopsis = soup.select_one("div.descr-ranobe").get_text() 37 | 38 | img_src = soup.select_one("div.img-ranobe img") 39 | if img_src: 40 | self.novel_cover = self.absolute_url(img_src["src"]) 41 | 42 | for a in reversed(soup.select(".menu-ranobe a")): 43 | chap_id = 1 + (len(self.chapters)) 44 | 45 | self.chapters.append( 46 | { 47 | "id": chap_id, 48 | "title": a.text.strip(), 49 | "url": self.absolute_url(a['href']) 50 | } 51 | ) 52 | 53 | def download_chapter_body(self, chapter): 54 | soup = self.get_soup(chapter["url"]) 55 | content = soup.select_one("div.entry-content") 56 | return self.cleaner.extract_contents(content) 57 | -------------------------------------------------------------------------------- /sources/zh/daocaorenshuwu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Daocaorenshuwu(Crawler): 10 | base_url = [ 11 | "https://www.daocaorenshuwu.com/", 12 | ] 13 | 14 | def read_novel_info(self): 15 | logger.debug("Visiting %s", self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | possible_title = soup.select_one(".container .book-info h1.book-name") 19 | assert possible_title, "No novel title" 20 | self.novel_title = possible_title.text 21 | 
logger.info("Novel title: %s", self.novel_title) 22 | 23 | self.novel_author = soup.select(".container .media-body .row div")[ 24 | 0 25 | ].text.strip() 26 | logger.info("Novel author: %s", self.novel_author) 27 | 28 | possible_image = soup.select_one(".container .media-left a img") 29 | if possible_image: 30 | self.novel_cover = self.absolute_url(possible_image["src"]) 31 | logger.info("Novel cover: %s", self.novel_cover) 32 | 33 | # Extract volume-wise ch 34 | # apter entries 35 | chapters = soup.select("#all-chapter a") 36 | 37 | for a in chapters: 38 | chap_id = len(self.chapters) + 1 39 | vol_id = 1 + len(self.chapters) // 100 40 | if len(self.volumes) < vol_id: 41 | self.volumes.append({"id": vol_id}) 42 | self.chapters.append( 43 | { 44 | "id": chap_id, 45 | "volume": vol_id, 46 | "url": self.absolute_url(a["href"]), 47 | "title": a.text.strip() or ("Chapter %d" % chap_id), 48 | } 49 | ) 50 | 51 | def download_chapter_body(self, chapter): 52 | soup = self.get_soup(chapter["url"]) 53 | contents = soup.select(".cont-text > p") 54 | contents = [str(p) for p in contents if p.text.strip()] 55 | return "".join(contents) 56 | -------------------------------------------------------------------------------- /sources/zh/powanjuan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class PowanjuanCrawler(Crawler): 10 | base_url = "https://www.powanjuan.cc/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url, encoding='gb2312') 15 | 16 | possible_title = soup.select_one(".desc h1") 17 | assert possible_title, "No novel title" 18 | self.novel_title = possible_title.text.split('(')[0].strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | possible_novel_author = soup.select_one('.descTip span') 22 | if possible_novel_author: 23 | self.novel_author = possible_novel_author.text.replace('作者:', '').strip() 24 | logger.info("Novel author: %s", self.novel_author) 25 | 26 | possible_synopsis = soup.select_one('.descInfo p') 27 | if possible_synopsis: 28 | self.novel_synopsis = possible_synopsis.text 29 | logger.info("Novel synopsis: %s", self.novel_synopsis) 30 | 31 | volumes = set([]) 32 | for a in soup.select(".catalog ul.clearfix li a"): 33 | ch_id = len(self.chapters) + 1 34 | vol_id = 1 + len(self.chapters) // 100 35 | volumes.add(vol_id) 36 | self.chapters.append( 37 | { 38 | "id": ch_id, 39 | "volume": vol_id, 40 | "title": a.text.strip(), 41 | "url": self.absolute_url(a["href"]), 42 | } 43 | ) 44 | 45 | self.volumes = [{"id": x, "title": ""} for x in volumes] 46 | 47 | def download_chapter_body(self, chapter): 48 | soup = self.get_soup(chapter["url"], encoding='gb2312') 49 | contents = soup.select_one("#mycontent") 50 | return self.cleaner.extract_contents(contents) 51 | -------------------------------------------------------------------------------- /sources/zh/soxs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Soxc(Crawler): 10 | base_url = ["https://www.soxs.cc/"] 11 | 12 | def read_novel_info(self): 13 | self.novel_url = self.novel_url.replace("/book/", "/") 14 | self.novel_url = self.novel_url.replace(".html", "/") 15 | 
soup = self.get_soup(self.novel_url) 16 | 17 | possible_title = soup.select_one(".xiaoshuo h1") 18 | assert possible_title, "No novel title" 19 | self.novel_title = possible_title.get_text() 20 | logger.info(f"Novel title: {self.novel_title}") 21 | 22 | self.novel_author = soup.select_one(".xiaoshuo h6").get_text() 23 | logger.info(f"Novel Author: {self.novel_author}") 24 | 25 | possible_novel_cover = soup.select_one(".book_cover img") 26 | if possible_novel_cover: 27 | self.novel_cover = self.absolute_url(possible_novel_cover["src"]) 28 | logger.info(f"Novel Cover: {self.novel_cover}") 29 | 30 | logger.info("Getting chapters...") 31 | for chapter in soup.select(".novel_list dd a"): 32 | url = self.absolute_url(chapter["href"]) 33 | chap_id = len(self.chapters) + 1 34 | if len(self.chapters) % 100 == 0: 35 | vol_id = len(self.chapters) // 100 + 1 36 | self.volumes.append({"id": vol_id}) 37 | 38 | self.chapters.append( 39 | { 40 | "id": chap_id, 41 | "url": url, 42 | "volume": vol_id, 43 | } 44 | ) 45 | 46 | def download_chapter_body(self, chapter): 47 | soup = self.get_soup(chapter["url"]) 48 | title = soup.select_one(".read_title h1").text.strip() 49 | chapter["title"] = title 50 | 51 | content = soup.select(".content") 52 | content = "\n".join(str(p) for p in content) 53 | content = content.replace(self.novel_url, "") 54 | content = content.replace("soxscc", "mtlrealm.com ") 55 | return content 56 | -------------------------------------------------------------------------------- /sources/zh/trxs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class TrxsCrawler(Crawler): 10 | base_url = "https://trxs.cc/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url, encoding='gb2312') 15 | 16 | possible_title = soup.select_one(".book_info h1") 17 | assert possible_title, "No novel title" 18 | self.novel_title = possible_title.text 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | possible_novel_cover = soup.select_one('.book_info img') 22 | if possible_novel_cover: 23 | self.novel_cover = self.absolute_url(possible_novel_cover["src"]) 24 | logger.info("Novel cover: %s", self.novel_cover) 25 | 26 | possible_synopsis = soup.select_one('.book_info p') 27 | if possible_synopsis: 28 | self.novel_synopsis = possible_synopsis.text 29 | logger.info("Novel synopsis %s", self.novel_synopsis) 30 | 31 | possible_novel_author = soup.select_one('.book_info a') 32 | if possible_novel_author: 33 | self.novel_author = possible_novel_author.text 34 | logger.info("Novel author: %s", self.novel_author) 35 | 36 | volumes = set([]) 37 | for a in soup.select(".book_list a"): 38 | ch_id = len(self.chapters) + 1 39 | vol_id = 1 + len(self.chapters) // 100 40 | volumes.add(vol_id) 41 | self.chapters.append( 42 | { 43 | "id": ch_id, 44 | "volume": vol_id, 45 | "title": a.text, 46 | "url": self.absolute_url(a["href"]), 47 | } 48 | ) 49 | 50 | self.volumes = [{"id": x, "title": ""} for x in volumes] 51 | 52 | def download_chapter_body(self, chapter): 53 | soup = self.get_soup(chapter["url"], encoding='gb2312') 54 | contents = soup.select_one(".read_chapterDetail") 55 | return self.cleaner.extract_contents(contents) 56 | --------------------------------------------------------------------------------