├── .editorconfig ├── .env.example ├── .gitattributes ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ ├── 1_general.md │ ├── 2_new-source.md │ ├── 3_source-issue.md │ └── 4_bug_report.md ├── contribs.json ├── dependabot.yml └── workflows │ ├── index-gen.yml │ ├── lint.yml │ └── release.yml ├── .gitignore ├── .python-version ├── Aptfile ├── LICENSE ├── Procfile ├── README.md ├── README.pip ├── app.json ├── compose.yml ├── etc └── wuxiaworld.com │ ├── help.md │ ├── wuxia.proto │ └── wuxia.proto.json ├── lncrawl ├── VERSION ├── __init__.py ├── __main__.py ├── assets │ ├── __init__.py │ ├── banner.py │ ├── chars.py │ ├── colors.py │ ├── epub │ │ ├── __init__.py │ │ ├── chapter.xhtml │ │ ├── cover.xhtml │ │ └── style.css │ ├── languages.py │ ├── user_agents.py │ ├── version.py │ └── web │ │ ├── __init__.py │ │ ├── script.js │ │ └── style.css ├── binders │ ├── __init__.py │ ├── calibre.py │ ├── epub.py │ ├── json.py │ ├── text.py │ └── web.py ├── bots │ ├── __init__.py │ ├── _sample.py │ ├── console │ │ ├── __init__.py │ │ ├── get_crawler.py │ │ ├── integration.py │ │ ├── login_info.py │ │ ├── open_folder_prompt.py │ │ ├── output_style.py │ │ ├── range_selection.py │ │ └── resume_download.py │ ├── discord │ │ ├── __init__.py │ │ ├── config.py │ │ ├── discord_bot.py │ │ └── message_handler.py │ ├── lookup │ │ ├── __init__.py │ │ ├── analyze.py │ │ ├── generator.py │ │ └── prompts.py │ ├── server │ │ ├── __init__.py │ │ ├── api │ │ │ ├── __init__.py │ │ │ ├── artifacts.py │ │ │ ├── auth.py │ │ │ ├── jobs.py │ │ │ ├── novels.py │ │ │ ├── runner.py │ │ │ └── users.py │ │ ├── app.py │ │ ├── config.py │ │ ├── context.py │ │ ├── db.py │ │ ├── exceptions.py │ │ ├── models │ │ │ ├── __init__.py │ │ │ ├── _base.py │ │ │ ├── job.py │ │ │ ├── pagination.py │ │ │ └── user.py │ │ ├── security.py │ │ ├── services │ │ │ ├── __init__.py │ │ │ ├── artifacts.py │ │ │ ├── jobs.py │ │ │ ├── novels.py │ │ │ ├── runner.py │ │ │ ├── scheduler.py │ │ │ ├── tier.py │ │ │ └── users.py │ │ ├── ui │ │ │ └── __index__.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── aborter.py │ │ │ ├── decorators.py │ │ │ ├── json_tools.py │ │ │ ├── text_tools.py │ │ │ └── time_utils.py │ └── telegram │ │ └── __init__.py ├── constants.py ├── core │ ├── __init__.py │ ├── app.py │ ├── arguments.py │ ├── browser.py │ ├── cleaner.py │ ├── crawler.py │ ├── display.py │ ├── download_chapters.py │ ├── download_images.py │ ├── exeptions.py │ ├── logconfig.py │ ├── metadata.py │ ├── novel_info.py │ ├── novel_search.py │ ├── proxy.py │ ├── scraper.py │ ├── soup.py │ ├── sources.py │ └── taskman.py ├── models │ ├── __init__.py │ ├── chapter.py │ ├── formats.py │ ├── meta.py │ ├── novel.py │ ├── search_result.py │ ├── session.py │ └── volume.py ├── templates │ ├── __init__.py │ ├── browser │ │ ├── __init__.py │ │ ├── basic.py │ │ ├── chapter_only.py │ │ ├── general.py │ │ ├── login.py │ │ ├── optional_volume.py │ │ ├── searchable.py │ │ └── with_volume.py │ ├── madara.py │ ├── mangastream.py │ ├── novelfull.py │ ├── novelmtl.py │ ├── novelpub.py │ └── soup │ │ ├── __init__.py │ │ ├── chapter_only.py │ │ ├── general.py │ │ ├── optional_volume.py │ │ ├── searchable.py │ │ └── with_volume.py ├── utils │ ├── __init__.py │ ├── common.py │ ├── imgen.py │ ├── kindlegen_download.py │ ├── material_colors.py │ ├── pbincli.py │ ├── platforms.py │ ├── ratelimit.py │ ├── sockets.py │ ├── ssl_no_verify.py │ ├── tilings.py │ └── uploader │ │ ├── __init__.py │ │ ├── anonfiles.py │ │ ├── gofile.py │ │ └── google_drive.py └── webdriver │ ├── __init__.py │ ├── 
elements.py │ ├── job_queue.py │ ├── local.py │ ├── remote.py │ └── scripts.py ├── requirements-app.txt ├── requirements-bot.txt ├── requirements-dev.txt ├── requirements.txt ├── res ├── lncrawl-icon.png ├── lncrawl-web.png └── lncrawl.ico ├── scripts ├── Dockerfile ├── bitanon.sh ├── build.bat ├── build.sh ├── check_sources.py ├── entry_point.sh ├── index_gen.py ├── lint.bat ├── lint.sh ├── lncrawl.service ├── publish.bat ├── publish.sh ├── push_tag.bat ├── push_tag.sh ├── push_tag_force.bat ├── push_tag_force.sh ├── rebrandly.sh ├── start.sh └── stop.sh ├── setup.cfg ├── setup.py ├── setup_pyi.py └── sources ├── __init__.py ├── _examples ├── _00_basic.py ├── _01_general_soup.py ├── _02_searchable_soup.py ├── _03_chapter_only_soup.py ├── _04_searchable_chapter_only_soup.py ├── _05_with_volume_soup.py ├── _06_searchable_with_volume_soup.py ├── _07_optional_volume_soup.py ├── _08_searchable_optional_volume_soup.py ├── _09_basic_browser.py ├── _10_general_browser.py ├── _11_searchable_browser.py ├── _12_chapter_only_browser.py ├── _13_searchable_chapter_only_browser.py ├── _14_with_volume_browser.py ├── _15_searchable_with_volume_browser.py ├── _16_optional_volume_browser.py └── _17_searchable_optional_volume_browser.py ├── _index.json ├── _rejected.json ├── ar ├── arnovel.py ├── kolnovel.py └── rewayatclub.py ├── en ├── 1 │ └── 1stkissnovel.py ├── 4 │ └── 4scanlation.py ├── 8 │ ├── 888novel.py │ └── 88tang.py ├── a │ ├── allnovel.py │ ├── allnovelfull.py │ ├── americanfaux.py │ ├── amnesiactl.py │ ├── ancientheartloss.py │ ├── anythingnovel.py │ ├── aquamanga.py │ ├── arangscans.py │ ├── arcanetranslations.py │ ├── asadatrans.py │ ├── asianhobbyist.py │ ├── asianovel.py │ ├── asianovel_net.py │ └── automtl.py ├── b │ ├── babelnovel.py │ ├── bakapervert.py │ ├── bato.py │ ├── beautymanga.py │ ├── bestlightnovel.py │ ├── blackboxtl.py │ ├── bonnovel.py │ ├── booknet.py │ ├── boxnovel.py │ ├── boxnovelcom.py │ ├── boxnovelonline.py │ ├── boxnovelorg.py │ └── bronovel.py ├── c │ ├── centinni.py │ ├── chereads.py │ ├── chickengege.py │ ├── chrysanthemumgarden.py │ ├── ckandawrites.online.py │ ├── clicknovel.py │ ├── coffeemanga.py │ ├── creativenovels.py │ ├── crescentmoon.py │ └── fu_kemao.py ├── d │ ├── daonovel.py │ ├── daotranslate.py │ ├── demontrans.py │ ├── divinedaolibrary.py │ ├── dmtrans.py │ ├── dobelyuwai.py │ ├── dragon_tea.py │ ├── dsrealmtrans.py │ └── dummynovels.py ├── e │ ├── ebotnovel.py │ ├── engnovel.py │ └── exiledrebels.py ├── f │ ├── fanfiction.py │ ├── fanmtl.py │ ├── fanstrans.py │ ├── fantasyworldonline.py │ ├── faqwiki.py │ ├── fenrirealm.py │ ├── fenrirtranslations.py │ ├── fictionpress.py │ ├── flyinglines.py │ ├── foxteller.py │ ├── freefullnovel.py │ ├── freelightnovel.py │ ├── freemanga.py │ ├── freewebnovel.py │ ├── fringecapybara.py │ ├── fsapk.py │ ├── fujitrans.py │ ├── fullnovellive.py │ └── fuyuneko.py ├── g │ ├── genesistls.py │ └── gravitytales.py ├── h │ ├── hanyunovels.py │ ├── harimanga.py │ ├── hostednovel.py │ ├── hotnovelfull.py │ └── hui3r.py ├── i │ ├── imperfectcomic.py │ ├── inadequatetrans.py │ ├── infinitetrans.py │ ├── inkitt.py │ ├── instadoses.py │ ├── isekaiscan.py │ ├── isekaiscaneu.py │ ├── isotls.py │ └── snowycodex.py ├── j │ ├── jpmtl.py │ └── justatrans.py ├── k │ ├── katreadingcafe.py │ ├── kingmanga.py │ ├── kissmanga.py │ ├── kissnovel.py │ ├── kitenovel.py │ ├── kolnovelnewsite.py │ └── koreanmtl.py ├── l │ ├── ladybirdtrans.py │ ├── latestnovel.py │ ├── lazygirltranslations.py │ ├── leafstudio.py │ ├── lemontree.py │ ├── 
librarynovel.py │ ├── lightnovelbastion.py │ ├── lightnovelheaven.py │ ├── lightnovelkiss.py │ ├── lightnovelme.py │ ├── lightnovelmeta.py │ ├── lightnovelonline.py │ ├── lightnovelpub.py │ ├── lightnovelreader.py │ ├── lightnovelshub.py │ ├── lightnovelsonl.py │ ├── lightnoveltv.py │ ├── lightnovelworld.com.py │ ├── lightnovelworld.py │ ├── lightnovetrans.py │ ├── listnovel.py │ ├── literotica.py │ ├── lnmtl.py │ ├── ltnovel.py │ ├── luminarynovels.py │ └── lunarletters.py ├── m │ ├── machinetransorg.py │ ├── manga-tx.py │ ├── mangabuddy.py │ ├── mangachilllove.py │ ├── mangaread.py │ ├── mangarockteam.py │ ├── mangarosie.py │ ├── mangastic.py │ ├── mangatoon.py │ ├── mangatx.py │ ├── mangaweebs.py │ ├── manhuaplus.py │ ├── manhwachill.py │ ├── meownovel.py │ ├── miraslation.py │ ├── mixednovel.py │ ├── mltnovels.py │ ├── mostnovel.py │ ├── mtlednovels.py │ ├── mtlnation.py │ ├── mtlreader.py │ ├── myboxnovel.py │ ├── mydramanovel.py │ ├── myoniyonitrans.py │ └── mysticalmerries.py ├── n │ ├── neosekaitranslations.py │ ├── newnovelorg.py │ ├── newsnovel.py │ ├── noblemtl.py │ ├── noobchan.py │ ├── novel-bin.net.py │ ├── novel-bin.py │ ├── novel27.py │ ├── novel35.py │ ├── novelall.py │ ├── novelbin.net.py │ ├── novelbin.py │ ├── novelcake.py │ ├── novelcool.py │ ├── novelcrush.py │ ├── novelfull.py │ ├── novelfullme.py │ ├── novelfullplus.py │ ├── novelgate.py │ ├── novelhall.py │ ├── novelhard.py │ ├── novelhi.py │ ├── novelhulk.py │ ├── novelhunters.py │ ├── novelight.py │ ├── novelmao.py │ ├── novelmic.py │ ├── novelmt.py │ ├── novelmtl.py │ ├── novelmultiverse.py │ ├── novelnext.py │ ├── novelnextz.py │ ├── novelonlinefree.py │ ├── novelonlinefull.py │ ├── novelpassion.py │ ├── novelplanet.py │ ├── novelpub.py │ ├── novelrare.py │ ├── novelraw.py │ ├── novelsala.py │ ├── novelsemperor.py │ ├── novelsite.py │ ├── novelsonline.py │ ├── novelspl.py │ ├── novelspread.py │ ├── novelsrock.py │ ├── noveltranslate.py │ ├── noveluniverse.py │ ├── novelupdatescc.py │ ├── novelv.py │ ├── novelww.py │ ├── novelzec.py │ ├── novlove.py │ └── nyxtranslation.py ├── o │ ├── omgnovels.py │ ├── oppatrans.py │ ├── oppatranslations.py │ ├── ornovel.py │ └── overabook.py ├── p │ ├── pandamanga.py │ ├── pandanovelco.py │ ├── pandanovelorg.py │ ├── peryinfo.py │ ├── pianmanga.py │ └── puretl.py ├── q │ └── qidianunderground.py ├── r │ ├── raeitranslations.py │ ├── randomnovel.py │ ├── ranobes.py │ ├── readlightnovelcc.py │ ├── readlightnovelorg.py │ ├── readlightnovelsnet.py │ ├── readmanganato.py │ ├── readmtl.py │ ├── readnovelfull.py │ ├── readnovelz.py │ ├── readonlinenovels.py │ ├── readwebnovels.py │ ├── readwn.py │ ├── reaperscans.py │ ├── rebirthonline.py │ ├── reincarnationpalace.py │ ├── relibrary.py │ ├── royalroad.py │ └── rpgnovels.py ├── s │ ├── scribblehub.py │ ├── secondlifetranslations.py │ ├── shalvation.py │ ├── shanghaifantasy.py │ ├── shinsori.py │ ├── skydemonorder.py │ ├── skynovel.py │ ├── sleepytrans.py │ ├── smnovels.py │ ├── sonicmtl.py │ ├── steambun.py │ ├── supernovel.py │ └── systemtranslation.py ├── t │ ├── tamagotl.py │ ├── tapread.py │ ├── teanovel.py │ ├── tigertranslations.py │ ├── tipnovel.py │ ├── tomotrans.py │ ├── toonily.py │ ├── topmanhua.py │ ├── totallytranslations.py │ ├── translateindo.py │ ├── travistranslations.py │ └── tunovelaligera.py ├── u │ └── usefulnovel.py ├── v │ ├── veratales.py │ ├── viewnovel.py │ ├── vipnovel.py │ ├── virlyce.py │ ├── vistrans.py │ └── volarenovels.py ├── w │ ├── wanderinginn.py │ ├── webnovel.py │ ├── webnovelonlinecom.py │ ├── 
webnovelonlinenet.py │ ├── webnovelpub.py │ ├── webtoon.py │ ├── whatsawhizzerwebnovels.py │ ├── whitemoonlightnovels.py │ ├── wnmtl.py │ ├── wondernovels.py │ ├── woopread.py │ ├── wordexcerpt.py │ ├── wordrain.py │ ├── writerupdates.py │ ├── wspadancewichita.py │ ├── wujizun.py │ ├── wuxiablog.py │ ├── wuxiabox.py │ ├── wuxiacity.py │ ├── wuxiaclick.py │ ├── wuxiaco.py │ ├── wuxiacom.py │ ├── wuxiahub.py │ ├── wuxialeague.py │ ├── wuxiamtl.py │ ├── wuxianovelhub.py │ ├── wuxiaonline.py │ ├── wuxiapub.py │ ├── wuxiar.py │ ├── wuxiasite.py │ ├── wuxiaspot.py │ ├── wuxiau.py │ ├── wuxiav.py │ ├── wuxiaworldio.py │ ├── wuxiaworldlive.py │ ├── wuxiaworldsite.py │ ├── wuxiax.py │ └── wuxiaz.py ├── x │ └── xiainovel.py └── z │ ├── zenithnovels.py │ ├── zetrotranslation.py │ ├── zinmanga.py │ └── zinnovel.py ├── es ├── domentranslations.py └── novelasligeras.py ├── fr ├── animesama.py ├── chireads.py ├── lightnovelfr.py ├── lnmtlfr.py ├── noveldeglace.py └── xiaowaz.py ├── id ├── darktrans.py ├── grensia_blogspot.py ├── idqidian.py ├── indomtl.py ├── indowebnovel.py ├── meionovel.py ├── morenovel.py ├── novelgo.py ├── novelku.py ├── novelringan.py ├── noveltoon.py ├── wbnovel.py ├── webnovelindonesia.py ├── webnovelover.py ├── worldnovelonline.py ├── yukinovel.py └── zhiend.py ├── jp └── s │ └── syosetu.py ├── multi ├── foxaholic.py ├── mtlnovel.py ├── novelupdates.py ├── quotev.py ├── wattpad.py ├── webfic.py └── wtrlab.py ├── pt ├── blnovels.py ├── centralnovel.py └── ceunovel.py ├── ru ├── bestmanga.py ├── ifreedom.py ├── jaomix.py ├── litnet.py ├── ranobelib.py ├── ranobenovel.py ├── renovels.py └── rulate.py ├── tr └── fenrirscan.py ├── vi ├── lnhakone.py └── truenfull.py └── zh ├── 27k.py ├── 69shuba.cx.py ├── 69shuba.py ├── daocaorenshuwu.py ├── ddxsss.py ├── ixdzs.py ├── novel543.py ├── piaotian.py ├── powanjuan.py ├── shw5.py ├── soxs.py ├── trxs.py ├── uukanshu.py ├── uukanshu_sj.py ├── xbanxia.py └── xnunu.py /.editorconfig: -------------------------------------------------------------------------------- 1 | # EditorConfig is awesome: https://editorconfig.org/ 2 | 3 | # top-most EditorConfig file 4 | root = true 5 | 6 | # Unix-style newlines with a newline ending every file 7 | [*] 8 | end_of_line = lf 9 | insert_final_newline = true 10 | 11 | # Set default charset 12 | [*.{js,py}] 13 | charset = utf-8 14 | 15 | [*.py] 16 | indent_style = space 17 | indent_size = 4 18 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Value should be the name of a bot package inside the `lncrawl/bots` folder (e.g. console, discord, telegram). 2 | # By default the console bot will be chosen if this is left empty or invalid. 3 | BOT=console 4 | 5 | # Available levels: NOTSET, WARN, INFO, DEBUG, FATAL, ERROR 6 | # If this variable is unset or NONE, logging will not be configured. 7 | LOG_LEVEL=INFO 8 | 9 | # Configs for bots 10 | TELEGRAM_TOKEN= 11 | DISCORD_TOKEN= 12 | DISCORD_DISABLE_SEARCH=false 13 | DISCORD_SIGNAL_CHAR=!
14 | 15 | # Cloud drives for upload | Options: [ANONFILES, GOFILE, GOOGLE_DRIVE] | Default: ANONFILES 16 | CLOUD_DRIVE=ANONFILES 17 | 18 | # Google Drive Config 19 | GOOGLE_DRIVE_CREDENTIAL_FILE=mycreds.txt 20 | GOOGLE_DRIVE_FOLDER_ID=118iN1jzavVV-9flrLPZo7DOi0cuxrQ5F 21 | 22 | # Password for VNC server 23 | VNC_PASSWORD=secret 24 | 25 | # Server Config 26 | SERVER_SECRET= 27 | SERVER_ADMIN_EMAIL= 28 | SERVER_ADMIN_PASSWORD= 29 | RUNNER_INTERVAL_IN_SECOND=10 30 | DATABASE_URL=sqlite:///.server/sqlite.db 31 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 13 | custom: ['https://paypal.me/sd1pu'] 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/1_general.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General 3 | about: Create a general issue 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/2_new-source.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request New Source 3 | about: Want to request a new source that is not yet listed in the README.md? 4 | title: Replace this with an url 5 | labels: source 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | - Language: 19 | - Supports Searching: Yes/No 20 | - Contains Machine Translations: Yes/No 21 | - Contains Manga/Manhua/Manhwa: Yes/No 22 | - Has CloudFlare Protection: Yes/No 23 | 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/3_source-issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Source Not Working 3 | about: Having trouble with a specific source? (e.g. 
failing to crawl or missing chapters or content) 4 | title: Fix this source 5 | labels: source-issue 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | 12 | ## Let us know 13 | 14 | 15 | 16 | **Novel URL**: 17 | **App Location**: PIP | EXE | Discord | Telegram 18 | 19 | **App Version**: x.y.z 20 | 21 | ## Describe this issue 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/4_bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug 3 | about: Having a general issue with the app? 4 | title: Fix this bug 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the bug 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | ## Let us know 19 | 20 | 21 | 22 | **App source**: PIP | EXE | Discord | Telegram 23 | **App version**: x.y.z 24 | **Your OS**: Linux | Windows | Mac 25 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "daily" 12 | target-branch: "dev" 13 | labels: 14 | - "pip dependencies" 15 | -------------------------------------------------------------------------------- /.github/workflows/index-gen.yml: -------------------------------------------------------------------------------- 1 | name: Generate source index 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | paths: 8 | - "sources/**" 9 | - "!sources/_index.json" 10 | - "scripts/index_gen.py" 11 | - "scripts/check_sources.py" 12 | - "lncrawl/VERSION" 13 | 14 | concurrency: 15 | group: ${{ github.workflow }}-${{ github.ref }} 16 | cancel-in-progress: true 17 | 18 | jobs: 19 | index_gen: 20 | if: github.repository == 'dipu-bd/lightnovel-crawler' 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v4 24 | with: 25 | fetch-depth: 0 26 | 27 | - name: Set up Python 3.11 28 | uses: actions/setup-python@v5 29 | with: 30 | python-version: "3.11" 31 | 32 | - name: Install dependencies 33 | run: pip install -r requirements-app.txt 34 | 35 | - name: Generate index 36 | run: python ./scripts/index_gen.py 37 | 38 | - name: Commit changes 39 | uses: stefanzweifel/git-auto-commit-action@v5 40 | with: 41 | commit_message: Generate source index 42 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - dev 8 | pull_request: 9 | branches: 10 | - master 11 | - dev 12 | 13 | concurrency: 14 | group: ${{ github.workflow }}-${{ github.ref }} 15 | cancel-in-progress: true 16 | 17 | jobs: 18 | lint_test: 19 | if: github.repository == 'dipu-bd/lightnovel-crawler' 20 | name: Lint & Test 21 | runs-on: ubuntu-latest 22 | strategy: 23 | matrix: 24 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 25 | 26 | steps: 27 | - uses: actions/checkout@v4 28 | 
- name: Set up Python ${{ matrix.python-version }} 29 | uses: actions/setup-python@v5 30 | with: 31 | python-version: ${{ matrix.python-version }} 32 | 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install -r requirements.txt 37 | 38 | - name: Lint with flake8 39 | run: flake8 -v --count --show-source --statistics 40 | 41 | - name: Build wheel 42 | run: python setup.py clean bdist_wheel 43 | 44 | - name: Install and test the wheel package 45 | run: | 46 | pip install dist/lightnovel_crawler*.whl 47 | lncrawl --list-sources 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | _novel 3 | _book 4 | geckodriver.log 5 | build 6 | dist 7 | *.egg-info 8 | Test Novel 9 | .vscode 10 | bundle 11 | env 12 | .env 13 | Lightnovels/ 14 | windows/ 15 | .pyi/ 16 | .telegram_bot_output/ 17 | .discord_bot_output/ 18 | mycreds.txt 19 | /venv*/ 20 | *.log 21 | save_pid.txt 22 | /.tox 23 | /logs 24 | client_secrets.json 25 | /lightnovel-crawler-* 26 | __pycache__/ 27 | .idea/ 28 | /test.py 29 | .venv/ 30 | .DS_Store 31 | /sqlite.db 32 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.10 2 | -------------------------------------------------------------------------------- /Aptfile: -------------------------------------------------------------------------------- 1 | libnss3 libgl1-mesa-glx libxcomposite1 libxrandr2 libxi6 2 | -------------------------------------------------------------------------------- /Procfile: -------------------------------------------------------------------------------- 1 | bot: python lncrawl --shard-id 0 --shard-count 1 2 | -------------------------------------------------------------------------------- /README.pip: -------------------------------------------------------------------------------- 1 | Lightnovel Crawler 2 | ----------------------- 3 | 4 | Download lightnovels from various online sources and generate output in different formats, e.g. epub, mobi, json, html, text, docx and pdf. 5 | 6 | Visit https://github.com/dipu-bd/lightnovel-crawler for more details. 7 | -------------------------------------------------------------------------------- /compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | chrome: 3 | platform: linux/amd64 4 | image: selenium/standalone-chrome:latest 5 | shm_size: 6gb 6 | restart: unless-stopped 7 | ports: 8 | - "7900:7900" 9 | - "4444:4444" 10 | environment: 11 | SE_VNC_VIEW_ONLY: "1" 12 | SE_EVENT_BUS_PUBLISH_PORT: "4442" 13 | SE_EVENT_BUS_SUBSCRIBE_PORT: "4443" 14 | SE_NODE_MAX_SESSIONS: "4" 15 | SE_NODE_OVERRIDE_MAX_SESSIONS: "true" 16 | SE_NO_VNC_PORT: "7900" 17 | SE_SCREEN_WIDTH: "1920" 18 | SE_SCREEN_HEIGHT: "1080" 19 | SE_NODE_GRID_URL: "false" 20 | 21 | # discord-bot: 22 | # platform: linux/amd64 23 | # image: lncrawl 24 | # build: 25 | # context: . 
26 | # dockerfile: ./scripts/Dockerfile 27 | # restart: unless-stopped 28 | # environment: 29 | # CLOUD_DRIVE: "GOFILE" 30 | # DISCORD_TOKEN: "${DISCORD_TOKEN}" 31 | # DISCORD_SIGNAL_CHAR: "${DISCORD_SIGNAL_CHAR}" 32 | # DISCORD_DISABLE_SEARCH: "${DISCORD_DISABLE_SEARCH}" 33 | # command: python -m lncrawl --suppress --bot discord --shard-id 0 --shard-count 1 --selenium-grid "http://chrome:4444" 34 | # depends_on: 35 | # - chrome 36 | 37 | server: 38 | platform: linux/amd64 39 | build: 40 | context: . 41 | dockerfile: ./scripts/Dockerfile 42 | restart: unless-stopped 43 | ports: 44 | - "23457:8000" 45 | command: python -m lncrawl -ll --suppress --bot server --port 8000 --selenium-grid "http://chrome:4444" 46 | environment: 47 | - PYTHONUNBUFFERED=1 48 | - OUTPUT_PATH=/home/lncrawl/output 49 | - DATABASE_URL=sqlite:////home/lncrawl/output/sqlite.db 50 | volumes: 51 | - output_path:/home/lncrawl/output 52 | depends_on: 53 | - chrome 54 | 55 | volumes: 56 | output_path: 57 | -------------------------------------------------------------------------------- /etc/wuxiaworld.com/help.md: -------------------------------------------------------------------------------- 1 | Install the packages required: 2 | 3 | ``` 4 | pip install grpcio-tools 5 | ``` 6 | 7 | To generate the wuxia.proto.json from wuxia.proto, run this command: 8 | 9 | ``` 10 | pyease-grpc -I . wuxia.proto > wuxia.proto.json 11 | ``` 12 | -------------------------------------------------------------------------------- /etc/wuxiaworld.com/wuxia.proto.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/etc/wuxiaworld.com/wuxia.proto.json -------------------------------------------------------------------------------- /lncrawl/VERSION: -------------------------------------------------------------------------------- 1 | 3.9.4 2 | -------------------------------------------------------------------------------- /lncrawl/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from __future__ import annotations 3 | 4 | import multiprocessing 5 | 6 | 7 | def main(): 8 | multiprocessing.freeze_support() 9 | multiprocessing.set_start_method("spawn") 10 | 11 | try: 12 | from dotenv import load_dotenv 13 | load_dotenv() 14 | except Exception: 15 | pass 16 | 17 | from .core import start_app 18 | start_app() 19 | 20 | 21 | if __name__ == "__main__": 22 | main() 23 | -------------------------------------------------------------------------------- /lncrawl/__main__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sys 4 | 5 | if not __package__ and not hasattr(sys, "frozen"): 6 | import os.path 7 | path = os.path.realpath(os.path.abspath(__file__)) 8 | sys.path.insert(0, os.path.dirname(os.path.dirname(path))) 9 | 10 | 11 | if __name__ == "__main__": 12 | from lncrawl import main 13 | main() 14 | -------------------------------------------------------------------------------- /lncrawl/assets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/assets/__init__.py -------------------------------------------------------------------------------- /lncrawl/assets/chars.py: 
-------------------------------------------------------------------------------- 1 | from ..utils.common import static_cached_property 2 | from ..utils.platforms import Platform 3 | 4 | 5 | class Chars: 6 | @static_cached_property 7 | @staticmethod 8 | def __supported(): 9 | return Platform.linux or Platform.mac 10 | 11 | # --------------------------------------- # 12 | 13 | @static_cached_property 14 | @staticmethod 15 | def EOL() -> str: 16 | return "\r\n" if Platform.windows else "\n" 17 | 18 | @static_cached_property 19 | @staticmethod 20 | def EMPTY(): 21 | return " " 22 | 23 | @static_cached_property 24 | @staticmethod 25 | def BOOK(): 26 | return "📒" if Chars.__supported else "[#]" 27 | 28 | @static_cached_property 29 | @staticmethod 30 | def CLOVER(): 31 | return "🍀" if Chars.__supported else "*" 32 | 33 | @static_cached_property 34 | @staticmethod 35 | def LINK(): 36 | return "🔗" if Chars.__supported else "-" 37 | 38 | @static_cached_property 39 | @staticmethod 40 | def HANDS(): 41 | return "🙏" if Chars.__supported else "-" 42 | 43 | @static_cached_property 44 | @staticmethod 45 | def ERROR(): 46 | return "❗" if Chars.__supported else "!" 47 | 48 | @static_cached_property 49 | @staticmethod 50 | def PARTY(): 51 | return "📦" if Chars.__supported else "$" 52 | 53 | @static_cached_property 54 | @staticmethod 55 | def SOUND(): 56 | return "🔊" if Chars.__supported else "<<" 57 | 58 | @static_cached_property 59 | @staticmethod 60 | def SPARKLE(): 61 | return "✨" if Chars.__supported else "*" 62 | 63 | @static_cached_property 64 | @staticmethod 65 | def INFO(): 66 | return "💁" if Chars.__supported else ">" 67 | 68 | @static_cached_property 69 | @staticmethod 70 | def RIGHT_ARROW(): 71 | return "➡" if Chars.__supported else "->" 72 | -------------------------------------------------------------------------------- /lncrawl/assets/epub/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | ROOT = Path(__file__).parent 4 | 5 | 6 | def epub_style_css() -> bytes: 7 | return (ROOT / "style.css").read_bytes() 8 | 9 | 10 | def epub_cover_xhtml() -> bytes: 11 | return (ROOT / "cover.xhtml").read_bytes() 12 | 13 | 14 | def epub_chapter_xhtml() -> bytes: 15 | return (ROOT / "chapter.xhtml").read_bytes() 16 | -------------------------------------------------------------------------------- /lncrawl/assets/epub/chapter.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 10 | -------------------------------------------------------------------------------- /lncrawl/assets/epub/cover.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /lncrawl/assets/epub/style.css: -------------------------------------------------------------------------------- 1 | img { 2 | width: 100%; 3 | object-fit: scale-down; 4 | object-position: center; 5 | } 6 | 7 | p + br { 8 | display: none; 9 | } 10 | 11 | #intro { 12 | width: 100vw; 13 | height: calc(100% - 30px); 14 | text-align: center; 15 | position: relative; 16 | display: flex; 17 | flex-direction: column; 18 | align-items: center; 19 | justify-content: space-between; 20 | text-align: center; 21 | letter-spacing: 0.25; 22 | } 23 | 24 | #intro .header { 25 | height: 200px; 26 | } 27 | 28 | #intro h1 { 29 | opacity: 1; 30 | } 31 | #intro h3 { 32 | opacity: 0.6; 33 | } 34 | 35 | #intro 
img { 36 | width: 100%; 37 | height: calc(100% - 300px); 38 | object-fit: contain; 39 | object-position: center; 40 | } 41 | 42 | #intro .footer { 43 | height: 50px; 44 | line-height: 24px; 45 | opacity: 0.8; 46 | } 47 | 48 | #cover { 49 | object-fit: cover; 50 | } 51 | 52 | #volume { 53 | width: 100%; 54 | height: 100%; 55 | display: flex; 56 | text-align: center; 57 | align-items: center; 58 | justify-content: center; 59 | } 60 | -------------------------------------------------------------------------------- /lncrawl/assets/version.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | ROOT = Path(__file__).parent.parent 4 | 5 | with open(str(ROOT / "VERSION"), "r", encoding="utf8") as f: 6 | version = f.read().strip() 7 | 8 | 9 | def get_version(): 10 | return version 11 | -------------------------------------------------------------------------------- /lncrawl/assets/web/__init__.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | ROOT = Path(__file__).parent 4 | 5 | 6 | def get_js_script(): 7 | with open(str(ROOT / "script.js"), "r", encoding="utf8") as f: 8 | script = f.read() 9 | return script 10 | 11 | 12 | def get_css_style(): 13 | with open(str(ROOT / "style.css"), "r", encoding="utf8") as f: 14 | style = f.read() 15 | return style 16 | -------------------------------------------------------------------------------- /lncrawl/binders/json.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import Generator 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def make_jsons(app, data) -> Generator[str, None, None]: 9 | root_path = Path(app.output_path) 10 | yield str(root_path / 'meta.json') 11 | for vol in data: 12 | for chap in data[vol]: 13 | file_name = "%s.json" % str(chap["id"]).rjust(5, "0") 14 | file_path = root_path / "json" / file_name 15 | if file_path.is_file(): 16 | yield str(file_path) 17 | -------------------------------------------------------------------------------- /lncrawl/binders/text.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import re 4 | from typing import Generator 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | from ..assets.chars import Chars 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def make_texts(app, data) -> Generator[str, None, None]: 14 | for vol in data: 15 | dir_name = os.path.join(app.output_path, "text", vol) 16 | os.makedirs(dir_name, exist_ok=True) 17 | for chap in data[vol]: 18 | if not chap.get("body"): 19 | continue 20 | file_name = "%s.txt" % str(chap["id"]).rjust(5, "0") 21 | file_name = os.path.join(dir_name, file_name) 22 | with open(file_name, "w", encoding="utf8") as file: 23 | body = chap["body"].replace("

\n None: 11 | self.app: Optional[App] = None 12 | self.search_mode = False 13 | 14 | from .get_crawler import ( 15 | choose_a_novel, 16 | confirm_guessed_novel, 17 | confirm_retry, 18 | get_crawlers_to_search, 19 | get_novel_url, 20 | ) 21 | from .integration import process_chapter_range, start 22 | from .login_info import get_login_info 23 | from .output_style import ( 24 | force_replace_old, 25 | get_output_formats, 26 | get_output_path, 27 | should_pack_by_volume, 28 | ) 29 | from .range_selection import ( 30 | get_range_from_chapters, 31 | get_range_from_volumes, 32 | get_range_selection, 33 | get_range_using_index, 34 | get_range_using_urls, 35 | ) 36 | -------------------------------------------------------------------------------- /lncrawl/bots/console/login_info.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | from questionary import prompt 4 | 5 | from ...core.arguments import get_args 6 | 7 | 8 | def get_login_info(self) -> Optional[Tuple[str, str]]: 9 | """Returns the (email, password) pair for login""" 10 | args = get_args() 11 | 12 | if args.login: 13 | return args.login 14 | 15 | if args.suppress: 16 | return None 17 | 18 | answer = prompt( 19 | [ 20 | { 21 | "type": "confirm", 22 | "name": "login", 23 | "message": "Do you want to log in?", 24 | "default": False, 25 | }, 26 | ] 27 | ) 28 | 29 | if answer["login"]: 30 | answer = prompt( 31 | [ 32 | { 33 | "type": "input", 34 | "name": "email", 35 | "message": "User/Email:", 36 | "validate": lambda a: True 37 | if a 38 | else "User/Email should not be empty", 39 | }, 40 | { 41 | "type": "password", 42 | "name": "password", 43 | "message": "Password:", 44 | "validate": lambda a: True 45 | if a 46 | else "Password should not be empty", 47 | }, 48 | ] 49 | ) 50 | return answer["email"], answer["password"] 51 | 52 | return None 53 | -------------------------------------------------------------------------------- /lncrawl/bots/console/open_folder_prompt.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from questionary import prompt 4 | 5 | from ...utils.platforms import Platform 6 | from ...core.arguments import get_args 7 | 8 | 9 | def display_open_folder(folder_path: str): 10 | args = get_args() 11 | 12 | if args.suppress: 13 | return 14 | if Platform.java or Platform.docker: 15 | return 16 | 17 | answer = prompt( 18 | [ 19 | { 20 | "type": "confirm", 21 | "name": "exit", 22 | "message": "Open the output folder?", 23 | "default": True, 24 | }, 25 | ] 26 | ) 27 | 28 | if not answer["exit"]: 29 | return 30 | 31 | if Platform.windows: 32 | os.system(f'explorer.exe "{folder_path}"') 33 | elif Platform.wsl: 34 | os.system(f'cd "{folder_path}" && explorer.exe .') 35 | elif Platform.linux: 36 | os.system(f'xdg-open "{folder_path}"') 37 | elif Platform.mac: 38 | os.system(f'open "{folder_path}"') 39 | else: 40 | print(f"Output Folder: {folder_path}") 41 | -------------------------------------------------------------------------------- /lncrawl/bots/discord/__init__.py: -------------------------------------------------------------------------------- 1 | from . 
import config 2 | from .discord_bot import DiscordBot 3 | 4 | __all__ = ["config", "DiscordBot"] 5 | -------------------------------------------------------------------------------- /lncrawl/bots/lookup/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | from urllib.parse import urlparse 4 | 5 | from slugify import slugify 6 | 7 | from ...core.sources import sources_path 8 | from .analyze import analyze_url 9 | from .generator import generate_crawler 10 | from .prompts import get_features, get_novel_url 11 | 12 | 13 | class LookupBot: 14 | log = logging.getLogger(__name__) 15 | 16 | def __init__(self) -> None: 17 | pass 18 | 19 | def start(self) -> None: 20 | novel_url = get_novel_url() 21 | 22 | _parsed = urlparse(novel_url) 23 | base_url = "%s://%s/" % (_parsed.scheme, _parsed.hostname) 24 | name = re.sub(r"(^www\.)|(\.com$)", "", _parsed.hostname) 25 | 26 | template = analyze_url(base_url, novel_url) 27 | 28 | features = get_features() 29 | language = features["language"] or "multi" 30 | has_manga = features["has_manga"] 31 | has_mtl = features["has_mtl"] 32 | 33 | filename = name + ".py" 34 | classname = slugify( 35 | name, 36 | max_length=20, 37 | separator="_", 38 | lowercase=True, 39 | word_boundary=True, 40 | ).title() 41 | 42 | folder = sources_path / language 43 | if language == "en": 44 | folder = folder / filename[0] 45 | filename = str(folder / filename) 46 | 47 | generate_crawler( 48 | template, 49 | output_file=filename, 50 | classname=classname, 51 | base_url=base_url, 52 | has_manga=has_manga, 53 | has_mtl=has_mtl, 54 | ) 55 | -------------------------------------------------------------------------------- /lncrawl/bots/lookup/generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Type 3 | 4 | from colorama import Style 5 | 6 | from ...assets.chars import Chars 7 | from ...core.crawler import Crawler 8 | from ...core.exeptions import LNException 9 | 10 | 11 | def generate_crawler( 12 | template: Type[Crawler], 13 | output_file: str, 14 | classname: str, 15 | base_url: str, 16 | has_manga: bool, 17 | has_mtl: bool, 18 | ): 19 | if os.path.exists(output_file): 20 | raise LNException(f"File exists: {output_file}") 21 | 22 | lines = [ 23 | "import logging", 24 | "", 25 | f"from {template.__module__} import {template.__name__}", 26 | "", 27 | "logger = logging.getLogger(__name__)", 28 | "", 29 | "", 30 | f"class {classname}({template.__name__}):", 31 | f" has_mtl = {bool(has_mtl)}", 32 | f" has_manga = {bool(has_manga)}", 33 | f' base_url = ["{base_url}"]', 34 | "", 35 | ] 36 | with open(output_file, "w", encoding="utf-8") as f: 37 | f.write("\n".join(lines)) 38 | 39 | print() 40 | print( 41 | Style.BRIGHT + Chars.PARTY, 42 | "Generated source file", 43 | Chars.PARTY + Style.RESET_ALL, 44 | ) 45 | print(Chars.RIGHT_ARROW, output_file) 46 | print() 47 | -------------------------------------------------------------------------------- /lncrawl/bots/lookup/prompts.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from questionary import prompt 4 | 5 | from ...assets.languages import language_codes 6 | from ...core.arguments import get_args 7 | from ...core.exeptions import LNException 8 | 9 | 10 | def get_novel_url(): 11 | """Return a novel page url""" 12 | args = get_args() 13 | url = args.novel_page 14 | 15 | if url: 16 | if re.match(r"^https?://.+\..+$", url): 17 | 
return url 18 | else: 19 | raise LNException("Invalid URL of novel page") 20 | 21 | try: 22 | answer = prompt( 23 | [ 24 | { 25 | "type": "input", 26 | "name": "novel", 27 | "message": "Enter novel page url:", 28 | "validate": lambda x: ( 29 | True 30 | if re.match(r"^https?://.+\..+$", x) 31 | else "Invalid URL of novel page" 32 | ), 33 | }, 34 | ] 35 | ) 36 | return answer["novel"].strip() 37 | except Exception: 38 | raise LNException("Novel page url or query was not given") 39 | 40 | 41 | def get_features(): 42 | """Return the feature list for the crawler""" 43 | answer = prompt( 44 | [ 45 | { 46 | "type": "autocomplete", 47 | "name": "language", 48 | "message": "Enter language:", 49 | "choices": list(sorted(language_codes.keys())), 50 | }, 51 | { 52 | "type": "confirm", 53 | "name": "has_manga", 54 | "message": "Does it contain Manga/Manhua/Manhwa?", 55 | "default": False, 56 | }, 57 | { 58 | "type": "confirm", 59 | "name": "has_mtl", 60 | "message": "Does it contain Machine Translations?", 61 | "default": False, 62 | }, 63 | ] 64 | ) 65 | return answer 66 | -------------------------------------------------------------------------------- /lncrawl/bots/server/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import uvicorn 4 | 5 | from ...core.arguments import get_args 6 | from .app import app 7 | from .context import ServerContext 8 | 9 | 10 | class ServerBot: 11 | log = logging.getLogger(__name__) 12 | 13 | def start(self): 14 | args = get_args() 15 | 16 | ctx = ServerContext() 17 | ctx.db.prepare() 18 | ctx.users.prepare() 19 | ctx.scheduler.start() 20 | 21 | uvicorn.run( 22 | app, 23 | log_level=logging.DEBUG, 24 | port=args.server_port or 8080, 25 | host=args.server_host or '0.0.0.0', 26 | ) 27 | -------------------------------------------------------------------------------- /lncrawl/bots/server/api/__init__.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends 2 | 3 | from ..security import ensure_admin, ensure_login 4 | from .artifacts import router as artifact 5 | from .auth import router as auth 6 | from .jobs import router as job 7 | from .novels import router as novel 8 | from .runner import router as runner 9 | from .users import router as user 10 | 11 | router = APIRouter() 12 | 13 | router.include_router( 14 | auth, 15 | prefix='/auth', 16 | tags=['Auth'], 17 | ) 18 | 19 | router.include_router( 20 | user, 21 | prefix='/user', 22 | tags=['Users'], 23 | dependencies=[Depends(ensure_admin)], 24 | ) 25 | 26 | router.include_router( 27 | job, 28 | prefix='/job', 29 | tags=['Jobs'], 30 | dependencies=[Depends(ensure_login)], 31 | ) 32 | 33 | router.include_router( 34 | novel, 35 | prefix='/novel', 36 | tags=['Novels'], 37 | dependencies=[Depends(ensure_login)], 38 | ) 39 | 40 | router.include_router( 41 | artifact, 42 | prefix='/artifact', 43 | tags=['Artifacts'], 44 | dependencies=[Depends(ensure_login)], 45 | ) 46 | 47 | router.include_router( 48 | runner, 49 | prefix='/runner', 50 | tags=['Runner'], 51 | dependencies=[Depends(ensure_admin)], 52 | ) 53 | -------------------------------------------------------------------------------- /lncrawl/bots/server/api/artifacts.py: -------------------------------------------------------------------------------- 1 | import mimetypes 2 | import os 3 | from typing import Optional 4 | 5 | from fastapi import APIRouter, Depends, Path, Query 6 | from fastapi.responses import FileResponse 7 | 8 | 
from ..context import ServerContext 9 | from ..exceptions import AppErrors 10 | 11 | # The root router 12 | router = APIRouter() 13 | 14 | 15 | @router.get("s", summary='Returns a list of artifacts') 16 | def list_artifacts( 17 | ctx: ServerContext = Depends(), 18 | offset: int = Query(default=0), 19 | limit: int = Query(default=20, le=100), 20 | novel_id: Optional[str] = Query(default=None), 21 | ): 22 | return ctx.artifacts.list( 23 | limit=limit, 24 | offset=offset, 25 | novel_id=novel_id, 26 | ) 27 | 28 | 29 | @router.get("/{artifact_id}", summary='Returns a artifact') 30 | def get_novel( 31 | artifact_id: str = Path(), 32 | ctx: ServerContext = Depends(), 33 | ): 34 | return ctx.artifacts.get(artifact_id) 35 | 36 | 37 | @router.get("/{artifact_id}/download", summary='Download artifact file') 38 | def get_novel_artifacts( 39 | artifact_id: str = Path(), 40 | ctx: ServerContext = Depends(), 41 | ): 42 | artifact = ctx.artifacts.get(artifact_id) 43 | file_path = artifact.output_file 44 | if not file_path: 45 | raise AppErrors.no_artifact_file 46 | 47 | media_type, _ = mimetypes.guess_type(file_path) 48 | return FileResponse( 49 | path=file_path, 50 | filename=os.path.basename(file_path), 51 | media_type=media_type or "application/octet-stream", 52 | ) 53 | -------------------------------------------------------------------------------- /lncrawl/bots/server/api/auth.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Body, Depends 2 | 3 | from ..context import ServerContext 4 | from ..models.user import (CreateRequest, LoginRequest, LoginResponse, 5 | SignupRequest, UpdateRequest, User) 6 | from ..security import ensure_user 7 | 8 | # The root router 9 | router = APIRouter() 10 | 11 | 12 | @router.post("/login", summary="Login with username or email and password") 13 | def login( 14 | ctx: ServerContext = Depends(), 15 | credentials: LoginRequest = Body( 16 | default=..., 17 | description='The login credentials', 18 | ), 19 | ): 20 | user = ctx.users.verify(credentials) 21 | token = ctx.users.generate_token(user.id) 22 | return LoginResponse(token=token, user=user) 23 | 24 | 25 | @router.post('/signup', summary='Signup as a new user') 26 | def signup( 27 | ctx: ServerContext = Depends(), 28 | body: SignupRequest = Body( 29 | default=..., 30 | description='The signup request', 31 | ), 32 | ): 33 | request = CreateRequest( 34 | password=body.password, 35 | email=body.email, 36 | name=body.name, 37 | ) 38 | user = ctx.users.create(request) 39 | token = ctx.users.generate_token(user.id) 40 | return LoginResponse(token=token, user=user) 41 | 42 | 43 | @router.get('/me', summary='Get current user details') 44 | def me( 45 | user: User = Depends(ensure_user), 46 | ): 47 | return user 48 | 49 | 50 | @router.put('/me/update', summary='Update current user details') 51 | def self_update( 52 | ctx: ServerContext = Depends(), 53 | user: User = Depends(ensure_user), 54 | body: UpdateRequest = Body( 55 | default=..., 56 | description='The signup request', 57 | ), 58 | ): 59 | body.role = None 60 | body.tier = None 61 | body.is_active = None 62 | return ctx.users.update(user.id, body) 63 | -------------------------------------------------------------------------------- /lncrawl/bots/server/api/novels.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends, Path, Query 2 | 3 | from ..context import ServerContext 4 | 5 | # The root router 6 | router = APIRouter() 7 | 8 
| 9 | @router.get("s", summary='Returns a list of novels') 10 | def list_novels( 11 | ctx: ServerContext = Depends(), 12 | offset: int = Query(default=0), 13 | limit: int = Query(default=20, le=100), 14 | with_orphans: bool = Query(default=False), 15 | ): 16 | return ctx.novels.list( 17 | limit=limit, 18 | offset=offset, 19 | with_orphans=with_orphans, 20 | ) 21 | 22 | 23 | @router.get("/{novel_id}", summary='Returns a novel') 24 | def get_novel( 25 | novel_id: str = Path(), 26 | ctx: ServerContext = Depends(), 27 | ): 28 | return ctx.novels.get(novel_id) 29 | 30 | 31 | @router.get("/{novel_id}/artifacts", summary='Returns cached artifacts') 32 | def get_novel_artifacts( 33 | novel_id: str = Path(), 34 | ctx: ServerContext = Depends(), 35 | ): 36 | return ctx.novels.get_artifacts(novel_id) 37 | -------------------------------------------------------------------------------- /lncrawl/bots/server/api/runner.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Depends 2 | 3 | from ..context import ServerContext 4 | 5 | # The root router 6 | router = APIRouter() 7 | 8 | 9 | @router.get("/start", summary='Start the runner') 10 | def start(ctx: ServerContext = Depends()): 11 | ctx.scheduler.start() 12 | 13 | 14 | @router.get("/stop", summary='Stops the runner') 15 | def stop(ctx: ServerContext = Depends()): 16 | ctx.scheduler.close() 17 | -------------------------------------------------------------------------------- /lncrawl/bots/server/api/users.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Body, Depends, Path, Query 2 | 3 | from ..context import ServerContext 4 | from ..exceptions import AppErrors 5 | from ..models.user import CreateRequest, UpdateRequest, User 6 | from ..security import ensure_user 7 | 8 | # The root router 9 | router = APIRouter() 10 | 11 | 12 | @router.get('s', summary='Get list of all users') 13 | def all_users( 14 | ctx: ServerContext = Depends(), 15 | offset: int = Query(default=0), 16 | limit: int = Query(default=20, le=100), 17 | ): 18 | return ctx.users.list(offset, limit) 19 | 20 | 21 | @router.post('', summary='Create an user') 22 | def create_user( 23 | ctx: ServerContext = Depends(), 24 | body: CreateRequest = Body( 25 | default=..., 26 | description='The signup request', 27 | ), 28 | ): 29 | return ctx.users.create(body) 30 | 31 | 32 | @router.get('/{user_id}', summary='Get the user') 33 | def get_user( 34 | ctx: ServerContext = Depends(), 35 | user_id: str = Path(), 36 | ): 37 | return ctx.users.get(user_id) 38 | 39 | 40 | @router.put('/{user_id}', summary='Update the user') 41 | def update_user( 42 | ctx: ServerContext = Depends(), 43 | user: User = Depends(ensure_user), 44 | body: UpdateRequest = Body( 45 | default=..., 46 | description='The signup request', 47 | ), 48 | user_id: str = Path(), 49 | ): 50 | if user_id == user.id: 51 | body.role = None 52 | body.is_active = None 53 | return ctx.users.update(user_id, body) 54 | 55 | 56 | @router.delete('/{user_id}', summary='Delete the user') 57 | def delete_user( 58 | user: User = Depends(ensure_user), 59 | ctx: ServerContext = Depends(), 60 | user_id: str = Path(), 61 | ): 62 | if user.id == user_id: 63 | raise AppErrors.can_not_delete_self 64 | return ctx.users.remove(user_id) 65 | -------------------------------------------------------------------------------- /lncrawl/bots/server/app.py: -------------------------------------------------------------------------------- 
1 | import traceback 2 | 3 | from fastapi import FastAPI 4 | from fastapi.middleware.cors import CORSMiddleware 5 | from fastapi.middleware.gzip import GZipMiddleware 6 | 7 | from ...assets.version import get_version 8 | 9 | app = FastAPI( 10 | version=get_version(), 11 | title="Lightnovel Crawler", 12 | description="Download novels from online sources and generate e-books", 13 | ) 14 | 15 | app.add_middleware( 16 | CORSMiddleware, 17 | allow_credentials=True, 18 | allow_origins=["*"], 19 | allow_methods=["*"], 20 | allow_headers=["*"], 21 | ) 22 | 23 | app.add_middleware( 24 | GZipMiddleware, 25 | minimum_size=1000, 26 | ) 27 | 28 | try: 29 | from .api import router as api 30 | app.include_router(api, prefix='/api') 31 | except ImportError: 32 | traceback.print_exc() 33 | -------------------------------------------------------------------------------- /lncrawl/bots/server/context.py: -------------------------------------------------------------------------------- 1 | from functools import cached_property 2 | from typing import Optional 3 | 4 | from .utils.decorators import autoclose 5 | 6 | _cache: Optional['ServerContext'] = None 7 | 8 | 9 | class ServerContext: 10 | def __new__(cls): 11 | global _cache 12 | if _cache is None: 13 | _cache = super().__new__(cls) 14 | return _cache 15 | 16 | @cached_property 17 | def config(self): 18 | from .config import Config 19 | return Config() 20 | 21 | @cached_property 22 | @autoclose 23 | def db(self): 24 | from .db import DB 25 | return DB(self) 26 | 27 | @cached_property 28 | def users(self): 29 | from .services.users import UserService 30 | return UserService(self) 31 | 32 | @cached_property 33 | def jobs(self): 34 | from .services.jobs import JobService 35 | return JobService(self) 36 | 37 | @cached_property 38 | def novels(self): 39 | from .services.novels import NovelService 40 | return NovelService(self) 41 | 42 | @cached_property 43 | def artifacts(self): 44 | from .services.artifacts import ArtifactService 45 | return ArtifactService(self) 46 | 47 | @cached_property 48 | @autoclose 49 | def scheduler(self): 50 | from .services.scheduler import JobScheduler 51 | return JobScheduler(self) 52 | -------------------------------------------------------------------------------- /lncrawl/bots/server/db.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from sqlmodel import Session, SQLModel, create_engine 4 | 5 | from .context import ServerContext 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class DB: 11 | def __init__(self, ctx: ServerContext) -> None: 12 | self.engine = create_engine( 13 | ctx.config.server.database_url, 14 | echo=logger.isEnabledFor(logging.DEBUG), 15 | ) 16 | 17 | def close(self): 18 | self.engine.dispose() 19 | 20 | def prepare(self): 21 | logger.info('Creating tables') 22 | SQLModel.metadata.create_all(self.engine) 23 | 24 | def session( 25 | self, *, 26 | future: bool = True, 27 | autoflush: bool = True, 28 | autocommit: bool = False, 29 | expire_on_commit: bool = True, 30 | enable_baked_queries: bool = True, 31 | ): 32 | return Session( 33 | self.engine, 34 | future=future, # type:ignore 35 | autoflush=autoflush, 36 | autocommit=autocommit, # type:ignore 37 | expire_on_commit=expire_on_commit, 38 | enable_baked_queries=enable_baked_queries, 39 | ) 40 | -------------------------------------------------------------------------------- /lncrawl/bots/server/models/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/bots/server/models/__init__.py -------------------------------------------------------------------------------- /lncrawl/bots/server/models/_base.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | from sqlalchemy import event 4 | from sqlmodel import Field, SQLModel 5 | 6 | from ..utils.time_utils import current_timestamp 7 | 8 | 9 | def generate_uuid(): 10 | return uuid.uuid4().hex 11 | 12 | 13 | class BaseModel(SQLModel): 14 | id: str = Field( 15 | default_factory=generate_uuid, 16 | primary_key=True, 17 | description="ID" 18 | ) 19 | created_at: int = Field( 20 | index=True, 21 | default_factory=current_timestamp, 22 | description="Create timestamp (ms)" 23 | ) 24 | updated_at: int = Field( 25 | default_factory=current_timestamp, 26 | description="Update timestamp (ms)" 27 | ) 28 | 29 | 30 | @event.listens_for(BaseModel, "before_update", propagate=True) 31 | def auto_update_timestamp(mapper, connection, target: BaseModel): 32 | target.updated_at = current_timestamp() 33 | -------------------------------------------------------------------------------- /lncrawl/bots/server/models/pagination.py: -------------------------------------------------------------------------------- 1 | from typing import Generic, List, TypeVar 2 | 3 | from pydantic import BaseModel 4 | 5 | T = TypeVar("T") 6 | 7 | 8 | class Paginated(BaseModel, Generic[T]): 9 | total: int 10 | offset: int 11 | limit: int 12 | items: List[T] 13 | -------------------------------------------------------------------------------- /lncrawl/bots/server/models/user.py: -------------------------------------------------------------------------------- 1 | from enum import Enum, IntEnum 2 | from typing import Optional 3 | 4 | from pydantic import EmailStr 5 | from sqlmodel import Field, SQLModel 6 | 7 | from ._base import BaseModel 8 | 9 | 10 | class UserRole(str, Enum): 11 | USER = "user" 12 | ADMIN = "admin" 13 | 14 | 15 | class UserTier(IntEnum): 16 | BASIC = 0 17 | PREMIUM = 1 18 | VIP = 2 19 | 20 | 21 | class User(BaseModel, table=True): 22 | password: str = Field(description="Hashed password", exclude=True) 23 | email: str = Field(unique=True, index=True, description="User Email") 24 | role: UserRole = Field(default=UserRole.USER, description="User role") 25 | is_active: bool = Field(default=True, description="Active status") 26 | name: Optional[str] = Field(default=None, description="Full name") 27 | tier: UserTier = Field(default=UserTier.BASIC, description="User tier") 28 | 29 | 30 | class LoginRequest(SQLModel): 31 | email: str = Field(description="User email") 32 | password: str = Field(description="User password") 33 | 34 | 35 | class LoginResponse(SQLModel): 36 | token: str = Field(description="The authorization token") 37 | user: User = Field(description="The user") 38 | 39 | 40 | class SignupRequest(SQLModel): 41 | email: EmailStr = Field(description="User Email") 42 | password: str = Field(description="User password") 43 | name: Optional[str] = Field(default=None, description="Full name") 44 | 45 | 46 | class CreateRequest(SignupRequest): 47 | role: UserRole = Field(default=UserRole.USER, description="User role") 48 | tier: UserTier = Field(default=UserTier.BASIC, description="User tier") 49 | 50 | 51 | class UpdateRequest(SQLModel): 52 | password: Optional[str] = 
Field(default=None, description="User password") 53 | name: Optional[str] = Field(default=None, description="Full name") 54 | role: Optional[UserRole] = Field(default=None, description="User role") 55 | is_active: Optional[bool] = Field(default=None, description="Active status") 56 | tier: Optional[UserTier] = Field(default=None, description="User tier") 57 | -------------------------------------------------------------------------------- /lncrawl/bots/server/security.py: -------------------------------------------------------------------------------- 1 | from fastapi import Depends 2 | from fastapi.security import APIKeyHeader 3 | from jose import jwt 4 | 5 | from .context import ServerContext 6 | from .exceptions import AppErrors 7 | from .models.user import User, UserRole 8 | 9 | header_scheme = APIKeyHeader( 10 | name='Authorization', 11 | scheme_name='Bearer Token', 12 | ) 13 | 14 | 15 | def ensure_login( 16 | ctx: ServerContext = Depends(), 17 | token: str = Depends(header_scheme), 18 | ) -> dict: 19 | try: 20 | key = ctx.config.server.token_secret 21 | algo = ctx.config.server.token_algo 22 | if token.startswith('Bearer '): 23 | token = token[len('Bearer '):] 24 | return jwt.decode(token, key, algorithms=[algo]) 25 | except Exception as e: 26 | raise AppErrors.unauthorized from e 27 | 28 | 29 | def ensure_user( 30 | ctx: ServerContext = Depends(), 31 | payload: dict = Depends(ensure_login), 32 | ) -> User: 33 | user_id = payload.get('uid') 34 | if not user_id: 35 | raise AppErrors.unauthorized 36 | user = ctx.users.get(user_id) 37 | if not user.is_active: 38 | raise AppErrors.inactive_user 39 | return user 40 | 41 | 42 | def ensure_admin(user: User = Depends(ensure_user)) -> User: 43 | if user.role != UserRole.ADMIN: 44 | raise AppErrors.forbidden 45 | return user 46 | -------------------------------------------------------------------------------- /lncrawl/bots/server/services/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/bots/server/services/__init__.py -------------------------------------------------------------------------------- /lncrawl/bots/server/services/artifacts.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from sqlmodel import desc, func, select 4 | 5 | from ..context import ServerContext 6 | from ..exceptions import AppErrors 7 | from ..models.job import Artifact 8 | from ..models.pagination import Paginated 9 | from ..models.user import User, UserRole 10 | 11 | 12 | class ArtifactService: 13 | def __init__(self, ctx: ServerContext) -> None: 14 | self._ctx = ctx 15 | self._db = ctx.db 16 | 17 | def list( 18 | self, 19 | offset: int = 0, 20 | limit: int = 20, 21 | novel_id: Optional[str] = None, 22 | ) -> Paginated[Artifact]: 23 | with self._db.session() as sess: 24 | stmt = select(Artifact) 25 | 26 | # Apply filters 27 | if novel_id: 28 | stmt = stmt.where(Artifact.novel_id == novel_id) 29 | 30 | # Apply sorting 31 | stmt = stmt.order_by(desc(Artifact.created_at)) 32 | 33 | total = sess.exec(select(func.count()).select_from(stmt.subquery())).one() 34 | items = sess.exec(stmt.offset(offset).limit(limit)).all() 35 | 36 | return Paginated( 37 | total=total, 38 | offset=offset, 39 | limit=limit, 40 | items=list(items), 41 | ) 42 | 43 | def get(self, artifact_id: str) -> Artifact: 44 | with self._db.session() as sess: 45 | artifact =
sess.get(Artifact, artifact_id) 46 | if not artifact: 47 | raise AppErrors.no_such_artifact 48 | return artifact 49 | 50 | def delete(self, artifact_id: str, user: User) -> bool: 51 | if user.role != UserRole.ADMIN: 52 | raise AppErrors.forbidden 53 | with self._db.session() as sess: 54 | artifact = sess.get(Artifact, artifact_id) 55 | if not artifact: 56 | raise AppErrors.no_such_artifact 57 | sess.delete(artifact) 58 | sess.commit() 59 | return True 60 | -------------------------------------------------------------------------------- /lncrawl/bots/server/services/tier.py: -------------------------------------------------------------------------------- 1 | from lncrawl.models import OutputFormat 2 | 3 | from ..models.job import JobPriority 4 | from ..models.user import UserTier 5 | 6 | ## 7 | # For Job creation 8 | ## 9 | 10 | JOB_PRIORITY_LEVEL = { 11 | UserTier.BASIC: JobPriority.LOW, 12 | UserTier.PREMIUM: JobPriority.NORMAL, 13 | UserTier.VIP: JobPriority.HIGH, 14 | } 15 | 16 | ## 17 | # For JobRunner service 18 | ## 19 | ENABLED_FORMATS = { 20 | UserTier.BASIC: [ 21 | OutputFormat.json, 22 | OutputFormat.epub, 23 | ], 24 | UserTier.PREMIUM: [ 25 | OutputFormat.json, 26 | OutputFormat.epub, 27 | OutputFormat.text, 28 | OutputFormat.web, 29 | OutputFormat.pdf, 30 | ], 31 | UserTier.VIP: list(OutputFormat), 32 | } 33 | 34 | BATCH_DOWNLOAD_LIMIT = { 35 | UserTier.BASIC: 10, 36 | UserTier.PREMIUM: 100, 37 | UserTier.VIP: 10000, 38 | } 39 | -------------------------------------------------------------------------------- /lncrawl/bots/server/ui/__index__.py: -------------------------------------------------------------------------------- 1 | import reflex as rx 2 | 3 | 4 | class State(rx.State): 5 | count: int = 0 6 | 7 | def increment(self): 8 | self.count += 1 9 | 10 | def decrement(self): 11 | self.count -= 1 12 | 13 | 14 | def index(): 15 | return rx.hstack( 16 | rx.button( 17 | "Decrement", 18 | color_scheme="ruby", 19 | on_click=State.decrement, 20 | ), 21 | rx.heading(State.count, font_size="2em"), 22 | rx.button( 23 | "Increment", 24 | color_scheme="grass", 25 | on_click=State.increment, 26 | ), 27 | spacing="4", 28 | ) 29 | 30 | 31 | app = rx.App() 32 | app.add_page(index) 33 | -------------------------------------------------------------------------------- /lncrawl/bots/server/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/bots/server/utils/__init__.py -------------------------------------------------------------------------------- /lncrawl/bots/server/utils/aborter.py: -------------------------------------------------------------------------------- 1 | from threading import Event 2 | 3 | 4 | class Aborter: 5 | def __init__(self) -> None: 6 | self._event = Event() 7 | 8 | @property 9 | def aborted(self): 10 | return self._event.is_set() 11 | 12 | def abort(self): 13 | self._event.set() 14 | 15 | def wait(self, timeout: float): 16 | if timeout <= 0: 17 | return 18 | self._event.wait(timeout) 19 | -------------------------------------------------------------------------------- /lncrawl/bots/server/utils/decorators.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | 3 | 4 | def autoclose(func): 5 | def inner(*args, **kwargs): 6 | val = func(*args, **kwargs) 7 | if hasattr(val, 'close') and callable(val.close): 8 | atexit.register(val.close) 9 | return val 10 | return 
inner 11 | -------------------------------------------------------------------------------- /lncrawl/bots/server/utils/json_tools.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Any, TypeVar 4 | 5 | _log = logging.getLogger(__name__) 6 | 7 | T = TypeVar('T') 8 | 9 | 10 | def json_encode(data: Any, encoding: str = "utf-8") -> bytes: 11 | try: 12 | output = json.dumps( 13 | data, 14 | allow_nan=True, 15 | ensure_ascii=False, 16 | check_circular=True, 17 | separators=(',', ':'), 18 | ) 19 | return output.encode(encoding) 20 | except Exception as err: 21 | _log.debug('Failed encoding: %s', err) 22 | return b'' 23 | 24 | 25 | def json_decode(data: str | bytes | bytearray | None, _default: T) -> T: 26 | try: 27 | if isinstance(data, bytearray): 28 | data = bytes(data) 29 | if isinstance(data, bytes): 30 | data = data.decode() 31 | if not isinstance(data, str): 32 | return _default 33 | return json.loads(data) 34 | except Exception as err: 35 | _log.debug('Failed decoding: %s', err) 36 | return _default 37 | -------------------------------------------------------------------------------- /lncrawl/bots/server/utils/text_tools.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hashlib 3 | import lzma 4 | 5 | from cryptography.fernet import Fernet 6 | 7 | __key_cache = {} 8 | 9 | 10 | def text_compress(plain: bytes) -> bytes: 11 | lzc = lzma.LZMACompressor() 12 | output = lzc.compress(plain) 13 | output += lzc.flush() 14 | return output 15 | 16 | 17 | def text_decompress(compressed: bytes) -> bytes: 18 | lzd = lzma.LZMADecompressor() 19 | return lzd.decompress(compressed) 20 | 21 | 22 | def text_encrypt(plain: bytes, secret: str | bytes) -> bytes: 23 | fernet = Fernet(generate_key(secret)) 24 | result = fernet.encrypt(plain) 25 | return base64.urlsafe_b64decode(result) 26 | 27 | 28 | def text_decrypt(cipher: bytes, secret: str | bytes) -> bytes: 29 | fernet = Fernet(generate_key(secret)) 30 | cipher = base64.urlsafe_b64encode(cipher) 31 | return fernet.decrypt(cipher) 32 | 33 | 34 | def text_compress_encrypt(plain: bytes, secret: str | bytes) -> bytes: 35 | return text_encrypt(text_compress(plain), secret) 36 | 37 | 38 | def text_decrypt_decompress(cipher: bytes, secret: str | bytes) -> bytes: 39 | return text_decompress(text_decrypt(cipher, secret)) 40 | 41 | 42 | def generate_md5(*texts) -> str: 43 | md5 = hashlib.md5() 44 | for text in texts: 45 | md5.update(str(text or '').encode()) 46 | return md5.hexdigest() 47 | 48 | 49 | def generate_key(secret: str | bytes) -> bytes: 50 | if isinstance(secret, str): 51 | secret = secret.encode() 52 | if secret not in __key_cache: 53 | hash = hashlib.sha3_256(secret).digest() 54 | key = base64.urlsafe_b64encode(hash) 55 | __key_cache[secret] = key 56 | return __key_cache[secret] 57 | -------------------------------------------------------------------------------- /lncrawl/bots/server/utils/time_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Any 3 | 4 | from dateutil import parser 5 | from dateutil.relativedelta import relativedelta 6 | from dateutil.tz import tzutc 7 | 8 | 9 | def current_timestamp(): 10 | '''Current UNIX timestamp in milliseconds''' 11 | return round(1000 * datetime.now().timestamp()) 12 | 13 | 14 | def as_unix_time(time: Any) -> int | None: 15 | try: 16 | if isinstance(time, int): 17 | return
time 18 | if isinstance(time, str): 19 | time = parser.parse(time) 20 | if isinstance(time, datetime): 21 | return round(1000 * time.timestamp()) 22 | except Exception: 23 | pass 24 | return None 25 | 26 | 27 | def time_from_now( 28 | years=0, months=0, days=0, weeks=0, 29 | hours=0, minutes=0, seconds=0 30 | ) -> datetime: 31 | delta = relativedelta( 32 | years=years, months=months, days=days, weeks=weeks, 33 | hours=hours, minutes=minutes, seconds=seconds 34 | ) 35 | return datetime.now(tzutc()).replace(microsecond=0) + delta 36 | -------------------------------------------------------------------------------- /lncrawl/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DEFAULT_OUTPUT_PATH = os.getenv('OUTPUT_PATH') or os.path.abspath("Lightnovels") 4 | META_FILE_NAME = "meta.json" 5 | -------------------------------------------------------------------------------- /lncrawl/core/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interactive application to take user inputs 3 | """ 4 | 5 | import logging 6 | import os 7 | import sys 8 | 9 | import colorama # type:ignore 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | def init(): 15 | from ..assets.version import get_version 16 | from .arguments import get_args 17 | from .display import description, input_suppression 18 | from .logconfig import configure_logging 19 | 20 | os.environ["version"] = get_version() 21 | 22 | colorama.init(wrap=True) 23 | description() 24 | 25 | configure_logging() 26 | 27 | args = get_args() 28 | logger.debug("Arguments: %s", args) 29 | 30 | if args.suppress: 31 | input_suppression() 32 | print(args) 33 | 34 | if args.bot: 35 | os.environ["BOT"] = args.bot 36 | 37 | for key, val in args.extra.items(): 38 | os.environ[key] = val[0] 39 | 40 | 41 | def start_app(): 42 | from ..bots import run_bot 43 | from .arguments import get_args 44 | from .display import cancel_method, error_message 45 | from .proxy import load_proxies, start_proxy_fetcher, stop_proxy_fetcher 46 | from .sources import load_sources 47 | 48 | init() 49 | 50 | load_sources() 51 | cancel_method() 52 | 53 | args = get_args() 54 | if args.proxy_file: 55 | os.environ["use_proxy"] = "file" 56 | load_proxies(args.proxy_file) 57 | 58 | if args.auto_proxy: 59 | os.environ["use_proxy"] = "auto" 60 | start_proxy_fetcher() 61 | 62 | try: 63 | bot = os.getenv("BOT", "").lower() 64 | run_bot(bot) 65 | except KeyboardInterrupt: 66 | pass 67 | except Exception: 68 | error_message(*sys.exc_info()) 69 | 70 | if args.auto_proxy: 71 | stop_proxy_fetcher() 72 | -------------------------------------------------------------------------------- /lncrawl/core/exeptions.py: -------------------------------------------------------------------------------- 1 | from urllib.error import URLError 2 | 3 | from cloudscraper.exceptions import CloudflareException 4 | from PIL import UnidentifiedImageError 5 | from requests.exceptions import RequestException 6 | from urllib3.exceptions import HTTPError 7 | 8 | 9 | class LNException(Exception): 10 | pass 11 | 12 | 13 | class FallbackToBrowser(Exception): 14 | pass 15 | 16 | 17 | ScraperErrorGroup = ( 18 | URLError, 19 | HTTPError, 20 | CloudflareException, 21 | RequestException, 22 | FallbackToBrowser, 23 | UnidentifiedImageError, 24 | ) 25 | 26 | RetryErrorGroup = ( 27 | URLError, 28 | HTTPError, 29 | CloudflareException, 30 | RequestException, 31 | UnidentifiedImageError, 32 | ) 33 | 
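A minimal usage sketch, not part of the repository sources: the error groups above are what the crawler templates catch when deciding whether to retry a request or fall back to a browser-based fetch (compare lncrawl/templates/browser/login.py). The fetch_with_scraper and fetch_with_browser callables below are hypothetical stand-ins for those two code paths.

from lncrawl.core.exeptions import ScraperErrorGroup

def fetch_with_fallback(fetch_with_scraper, fetch_with_browser, url):
    # Try the lightweight cloudscraper path first; fall back to the browser
    # only for the error types grouped as recoverable in ScraperErrorGroup.
    try:
        return fetch_with_scraper(url)
    except ScraperErrorGroup:
        return fetch_with_browser(url)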
-------------------------------------------------------------------------------- /lncrawl/core/soup.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC 3 | from typing import Optional, Union 4 | 5 | from bs4 import BeautifulSoup, Tag 6 | from requests import Response 7 | 8 | from .exeptions import LNException 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | DEFAULT_PARSER = "lxml" 14 | 15 | 16 | class SoupMaker(ABC): 17 | def __init__( 18 | self, 19 | parser: Optional[str] = None, 20 | ) -> None: 21 | """This is a helper for Beautiful Soup. It is being used as a superclass of the Crawler. 22 | 23 | Args: 24 | - parser (Optional[str], optional): Desirable features of the parser. This can be the name of a specific parser 25 | ("lxml", "lxml-xml", "html.parser", or "html5lib") or it may be the type of markup to be used ("html", "html5", "xml"). 26 | """ 27 | self._parser = parser or DEFAULT_PARSER 28 | 29 | def close(self) -> None: 30 | pass 31 | 32 | def make_soup( 33 | self, 34 | data: Union[Response, bytes, str], 35 | encoding: Optional[str] = None, 36 | ) -> BeautifulSoup: 37 | if isinstance(data, Response): 38 | return self.make_soup(data.content, encoding) 39 | elif isinstance(data, bytes): 40 | html = data.decode(encoding or "utf8", "ignore") 41 | elif isinstance(data, str): 42 | html = data 43 | else: 44 | raise LNException("Could not parse response") 45 | return BeautifulSoup(html, features=self._parser) 46 | 47 | def make_tag( 48 | self, 49 | data: Union[Response, bytes, str], 50 | encoding: Optional[str] = None, 51 | ) -> Tag: 52 | soup = self.make_soup(data, encoding) 53 | return next(soup.find("body").children) 54 | -------------------------------------------------------------------------------- /lncrawl/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .chapter import Chapter 2 | from .formats import OutputFormat 3 | from .meta import MetaInfo 4 | from .novel import Novel 5 | from .search_result import CombinedSearchResult, SearchResult 6 | from .session import Session 7 | from .volume import Volume 8 | 9 | __all__ = [ 10 | "Chapter", 11 | "CombinedSearchResult", 12 | "SearchResult", 13 | "OutputFormat", 14 | "Novel", 15 | "MetaInfo", 16 | "Session", 17 | "Volume", 18 | ] 19 | -------------------------------------------------------------------------------- /lncrawl/models/chapter.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional 2 | 3 | from box import Box 4 | 5 | 6 | class Chapter(Box): 7 | def __init__( 8 | self, 9 | id: int, 10 | url: str = "", 11 | title: str = "", 12 | volume: Optional[int] = None, 13 | volume_title: Optional[str] = None, 14 | body: Optional[str] = None, 15 | images: Dict[str, str] = dict(), 16 | success: bool = False, 17 | **kwargs, 18 | ) -> None: 19 | self.id = id 20 | self.url = url 21 | self.title = title 22 | self.volume = volume 23 | self.volume_title = volume_title 24 | self.body = body 25 | self.images = images 26 | self.success = success 27 | self.update(kwargs) 28 | 29 | @staticmethod 30 | def without_body(item: "Chapter") -> "Chapter": 31 | result = item.copy() 32 | result.body = None 33 | return result 34 | -------------------------------------------------------------------------------- /lncrawl/models/formats.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 
4 | class OutputFormat(str, Enum): 5 | json = "json" 6 | epub = "epub" 7 | text = "text" 8 | web = "web" 9 | docx = "docx" 10 | mobi = "mobi" 11 | pdf = "pdf" 12 | rtf = "rtf" 13 | txt = "txt" 14 | azw3 = "azw3" 15 | fb2 = "fb2" 16 | lit = "lit" 17 | lrf = "lrf" 18 | oeb = "oeb" 19 | pdb = "pdb" 20 | rb = "rb" 21 | snb = "snb" 22 | tcr = "tcr" 23 | 24 | def __str__(self) -> str: 25 | return self.value 26 | -------------------------------------------------------------------------------- /lncrawl/models/meta.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from box import Box 4 | 5 | from .novel import Novel 6 | from .session import Session 7 | 8 | 9 | class MetaInfo(Box): 10 | def __init__( 11 | self, 12 | session: Optional[Session] = None, 13 | novel: Optional[Novel] = None, 14 | **kwargs, 15 | ) -> None: 16 | self.session = session 17 | self.novel = novel 18 | self.update(kwargs) 19 | -------------------------------------------------------------------------------- /lncrawl/models/novel.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from box import Box 4 | 5 | from ..assets.languages import find_code 6 | from .chapter import Chapter 7 | from .volume import Volume 8 | 9 | 10 | class Novel(Box): 11 | def __init__( 12 | self, 13 | url: str, 14 | title: str, 15 | authors: List[str] = [], 16 | cover_url: Optional[str] = None, 17 | chapters: List[Chapter] = [], 18 | volumes: List[Volume] = [], 19 | is_rtl: bool = False, 20 | synopsis: str = "", 21 | language: Optional[str] = None, 22 | tags: List[str] = [], 23 | has_manga: Optional[bool] = None, 24 | has_mtl: Optional[bool] = None, 25 | **kwargs, 26 | ) -> None: 27 | self.url = url 28 | self.title = title 29 | self.authors = authors 30 | self.cover_url = cover_url 31 | self.chapters = chapters 32 | self.volumes = volumes 33 | self.is_rtl = is_rtl 34 | self.synopsis = synopsis 35 | self.has_manga = has_manga 36 | self.has_mtl = has_mtl 37 | self.language = find_code(language) 38 | self.tags = tags 39 | self.update(kwargs) 40 | -------------------------------------------------------------------------------- /lncrawl/models/search_result.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from box import Box 4 | 5 | 6 | class SearchResult(Box): 7 | def __init__( 8 | self, 9 | title: str, 10 | url: str, 11 | info: str = "", 12 | **kwargs, 13 | ) -> None: 14 | self.title = str(title) 15 | self.url = str(url) 16 | self.info = str(info) 17 | self.update(kwargs) 18 | 19 | 20 | class CombinedSearchResult(Box): 21 | def __init__( 22 | self, 23 | id: str, 24 | title: str, 25 | novels: List[SearchResult] = [], 26 | **kwargs, 27 | ) -> None: 28 | self.id = id 29 | self.title = str(title) 30 | self.novels = novels 31 | self.update(kwargs) 32 | -------------------------------------------------------------------------------- /lncrawl/models/volume.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from box import Box 4 | 5 | 6 | class Volume(Box): 7 | def __init__( 8 | self, 9 | id: int, 10 | title: str = "", 11 | start_chapter: Optional[int] = None, 12 | final_chapter: Optional[int] = None, 13 | chapter_count: Optional[int] = None, 14 | **kwargs, 15 | ) -> None: 16 | self.id = id 17 | self.title = title 18 | self.start_chapter = start_chapter 19 | 
self.final_chapter = final_chapter 20 | self.chapter_count = chapter_count 21 | self.update(kwargs) 22 | -------------------------------------------------------------------------------- /lncrawl/templates/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/templates/__init__.py -------------------------------------------------------------------------------- /lncrawl/templates/browser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/templates/browser/__init__.py -------------------------------------------------------------------------------- /lncrawl/templates/browser/chapter_only.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | from bs4 import Tag 4 | 5 | from ...models import Chapter 6 | from ..soup.chapter_only import ChapterOnlySoupTemplate 7 | from .general import GeneralBrowserTemplate 8 | 9 | 10 | class ChapterOnlyBrowserTemplate(GeneralBrowserTemplate, ChapterOnlySoupTemplate): 11 | """Attempts to crawl using cloudscraper first, if failed use the browser.""" 12 | 13 | def parse_chapter_list_in_browser(self) -> Generator[Chapter, None, None]: 14 | chap_id = 0 15 | for tag in self.select_chapter_tags_in_browser(): 16 | if not isinstance(tag, Tag): 17 | continue 18 | chap_id += 1 19 | yield self.parse_chapter_item(tag, chap_id) 20 | 21 | def select_chapter_tags_in_browser(self) -> Generator[Tag, None, None]: 22 | """Select chapter list item tags from the browser""" 23 | yield from self.select_chapter_tags(self.browser.soup) 24 | -------------------------------------------------------------------------------- /lncrawl/templates/browser/login.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import abstractmethod 3 | 4 | from ...core.exeptions import FallbackToBrowser, ScraperErrorGroup 5 | from .general import GeneralBrowserTemplate 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class LoginBrowserTemplate(GeneralBrowserTemplate): 11 | """Attempts to crawl using cloudscraper first, if failed use the browser.""" 12 | 13 | def login(self, email: str, password: str) -> None: 14 | try: 15 | return self.login_in_soup(email, password) 16 | except ScraperErrorGroup: 17 | return self.login_in_browser(email, password) 18 | 19 | def login_in_soup(self, email: str, password: str) -> None: 20 | """Login to the website using the scraper""" 21 | raise FallbackToBrowser() 22 | 23 | @abstractmethod 24 | def login_in_browser(self, email: str, password: str) -> None: 25 | """Login to the website using the browser""" 26 | raise NotImplementedError() 27 | -------------------------------------------------------------------------------- /lncrawl/templates/browser/searchable.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Generator, List 3 | 4 | from bs4 import Tag 5 | 6 | from ...core.exeptions import FallbackToBrowser 7 | from ...models import SearchResult 8 | from ..soup.searchable import SearchableSoupTemplate 9 | from .general import GeneralBrowserTemplate 10 | 11 | 12 | class SearchableBrowserTemplate(GeneralBrowserTemplate, SearchableSoupTemplate): 13 | 
"""Attempts to crawl using cloudscraper first, if failed use the browser.""" 14 | 15 | def search_novel_in_soup(self, query: str) -> List[SearchResult]: 16 | tags = self.select_search_items(query) 17 | return list(self.process_search_results(tags)) 18 | 19 | def search_novel_in_browser(self, query: str) -> List[SearchResult]: 20 | tags = self.select_search_items_in_browser(query) 21 | return list(self.process_search_results_in_browser(tags)) 22 | 23 | def process_search_results_in_browser( 24 | self, tags: Generator[Tag, None, None] 25 | ) -> Generator[Tag, None, None]: 26 | """Process novel item tag and generates search results from the browser""" 27 | count = 0 28 | for tag in tags: 29 | if not isinstance(tag, Tag): 30 | continue 31 | count += 1 32 | if count == 10: 33 | break 34 | yield self.parse_search_item_in_browser(tag) 35 | 36 | @abstractmethod 37 | def select_search_items(self, query: str) -> Generator[Tag, None, None]: 38 | raise FallbackToBrowser() 39 | 40 | def select_search_items_in_browser(self, query: str) -> Generator[Tag, None, None]: 41 | """Select novel items found by the query using the browser""" 42 | yield from self.select_search_items(self.browser.soup) 43 | 44 | def parse_search_item_in_browser(self, tag: Tag) -> SearchResult: 45 | """Parse a tag and return single search result""" 46 | return self.parse_search_item(tag) 47 | -------------------------------------------------------------------------------- /lncrawl/templates/browser/with_volume.py: -------------------------------------------------------------------------------- 1 | from typing import Generator, Union 2 | 3 | from bs4 import Tag 4 | 5 | from ...models import Chapter, Volume 6 | from ..soup.with_volume import ChapterWithVolumeSoupTemplate 7 | from .general import GeneralBrowserTemplate 8 | 9 | 10 | class ChapterWithVolumeBrowserTemplate( 11 | GeneralBrowserTemplate, ChapterWithVolumeSoupTemplate 12 | ): 13 | """Attempts to crawl using cloudscraper first, if failed use the browser.""" 14 | 15 | def parse_chapter_list_in_browser( 16 | self, 17 | ) -> Generator[Union[Chapter, Volume], None, None]: 18 | vol_id = 0 19 | chap_id = 0 20 | for vol in self.select_volume_tags_in_browser(): 21 | if not isinstance(vol, Tag): 22 | continue 23 | vol_id += 1 24 | vol_item = self.parse_volume_item_in_browser(vol, vol_id) 25 | yield vol_item 26 | for tag in self.select_chapter_tags_in_browser(vol, vol_item): 27 | if not isinstance(tag, Tag): 28 | continue 29 | chap_id += 1 30 | item = self.parse_chapter_item_in_browser(tag, chap_id, vol_item) 31 | item.volume = vol_id 32 | yield item 33 | 34 | def select_volume_tags_in_browser(self) -> Generator[Tag, None, None]: 35 | """Select volume list item tags from the browser""" 36 | return self.select_volume_tags(self.browser.soup) 37 | 38 | def parse_volume_item_in_browser(self, tag: Tag, id: int) -> Volume: 39 | """Parse a single volume from volume list item tag from the browser""" 40 | return self.parse_volume_item(tag, id) 41 | 42 | def select_chapter_tags_in_browser( 43 | self, tag: Tag, vol: Volume 44 | ) -> Generator[Tag, None, None]: 45 | """Select chapter list item tags from volume tag from the browser""" 46 | return self.select_chapter_tags(tag, vol) 47 | 48 | def parse_chapter_item_in_browser(self, tag: Tag, id: int, vol: Volume) -> Chapter: 49 | """Parse a single chapter from chapter list item tag from the browser""" 50 | return self.parse_chapter_item(tag, id, vol) 51 | -------------------------------------------------------------------------------- 
/lncrawl/templates/soup/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/templates/soup/__init__.py -------------------------------------------------------------------------------- /lncrawl/templates/soup/chapter_only.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Generator 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | 6 | from ...models import Chapter 7 | from .general import GeneralSoupTemplate 8 | 9 | 10 | class ChapterOnlySoupTemplate(GeneralSoupTemplate): 11 | def parse_chapter_list(self, soup: BeautifulSoup) -> Generator[Chapter, None, None]: 12 | chap_id = 0 13 | for tag in self.select_chapter_tags(soup): 14 | if not isinstance(tag, Tag): 15 | continue 16 | chap_id += 1 17 | yield self.parse_chapter_item(tag, chap_id) 18 | 19 | @abstractmethod 20 | def select_chapter_tags(self, soup: BeautifulSoup) -> Generator[Tag, None, None]: 21 | """Select chapter list item tags from the page soup""" 22 | raise NotImplementedError() 23 | 24 | @abstractmethod 25 | def parse_chapter_item(self, tag: Tag, id: int) -> Chapter: 26 | """Parse a single chapter from chapter list item tag""" 27 | raise NotImplementedError() 28 | -------------------------------------------------------------------------------- /lncrawl/templates/soup/optional_volume.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Generator, Union 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | 6 | from ...models import Chapter, Volume 7 | from .general import GeneralSoupTemplate 8 | 9 | 10 | class OptionalVolumeSoupTemplate(GeneralSoupTemplate): 11 | def parse_chapter_list( 12 | self, soup: BeautifulSoup 13 | ) -> Generator[Union[Chapter, Volume], None, None]: 14 | vol_id = 0 15 | chap_id = 0 16 | for vol in self.select_volume_tags(soup): 17 | if not isinstance(vol, Tag): 18 | continue 19 | vol_id += 1 20 | vol_item = self.parse_volume_item(vol, vol_id) 21 | yield vol_item 22 | for tag in self.select_chapter_tags(vol): 23 | if not isinstance(tag, Tag): 24 | continue 25 | chap_id += 1 26 | item = self.parse_chapter_item(tag, chap_id, vol_item) 27 | item.volume = vol_id 28 | yield item 29 | 30 | if chap_id > 0: 31 | return 32 | 33 | vol_id = 0 34 | chap_id = 0 35 | parent = soup.select_one("html") 36 | for tag in self.select_chapter_tags(parent): 37 | if not isinstance(tag, Tag): 38 | continue 39 | if chap_id % 100 == 0: 40 | vol_id = chap_id // 100 + 1 41 | vol_item = self.parse_volume_item(parent, vol_id) 42 | yield vol_item 43 | chap_id += 1 44 | item = self.parse_chapter_item(tag, chap_id, vol_item) 45 | item.volume = vol_id 46 | yield item 47 | 48 | def select_volume_tags(self, soup: BeautifulSoup): 49 | return [] 50 | 51 | def parse_volume_item(self, tag: Tag, id: int) -> Volume: 52 | return Volume(id=id) 53 | 54 | @abstractmethod 55 | def select_chapter_tags(self, parent: Tag) -> Generator[Tag, None, None]: 56 | raise NotImplementedError() 57 | 58 | @abstractmethod 59 | def parse_chapter_item(self, tag: Tag, id: int, vol: Volume) -> Chapter: 60 | raise NotImplementedError() 61 | -------------------------------------------------------------------------------- /lncrawl/templates/soup/searchable.py: -------------------------------------------------------------------------------- 1 | from abc import 
abstractmethod 2 | from typing import Generator, List 3 | 4 | from bs4 import Tag 5 | 6 | from ...models import SearchResult 7 | from .general import GeneralSoupTemplate 8 | 9 | 10 | class SearchableSoupTemplate(GeneralSoupTemplate): 11 | def search_novel(self, query) -> List[SearchResult]: 12 | tags = self.select_search_items(query) 13 | return list(self.process_search_results(tags)) 14 | 15 | def process_search_results( 16 | self, tags: Generator[Tag, None, None] 17 | ) -> Generator[Tag, None, None]: 18 | """Process novel item tag and generates search results""" 19 | count = 0 20 | for tag in tags: 21 | if not isinstance(tag, Tag): 22 | continue 23 | count += 1 24 | if count == 10: 25 | break 26 | yield self.parse_search_item(tag) 27 | 28 | @abstractmethod 29 | def select_search_items(self, query: str) -> Generator[Tag, None, None]: 30 | """Select novel items found on the search page by the query""" 31 | raise NotImplementedError() 32 | 33 | @abstractmethod 34 | def parse_search_item(self, tag: Tag) -> SearchResult: 35 | """Parse a tag and return single search result""" 36 | raise NotImplementedError() 37 | -------------------------------------------------------------------------------- /lncrawl/templates/soup/with_volume.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Generator, Union 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | 6 | from ...models import Chapter, Volume 7 | from .general import GeneralSoupTemplate 8 | 9 | 10 | class ChapterWithVolumeSoupTemplate(GeneralSoupTemplate): 11 | def parse_chapter_list( 12 | self, soup: BeautifulSoup 13 | ) -> Generator[Union[Chapter, Volume], None, None]: 14 | vol_id = 0 15 | chap_id = 0 16 | for vol in self.select_volume_tags(soup): 17 | if not isinstance(vol, Tag): 18 | continue 19 | vol_id += 1 20 | vol_item = self.parse_volume_item(vol, vol_id) 21 | yield vol_item 22 | for tag in self.select_chapter_tags(vol, vol_item): 23 | if not isinstance(tag, Tag): 24 | continue 25 | chap_id += 1 26 | item = self.parse_chapter_item(tag, chap_id, vol_item) 27 | item.volume = vol_id 28 | yield item 29 | 30 | @abstractmethod 31 | def select_volume_tags(self, soup: BeautifulSoup) -> Generator[Tag, None, None]: 32 | """Select volume list item tags from the page soup""" 33 | raise NotImplementedError() 34 | 35 | @abstractmethod 36 | def parse_volume_item(self, tag: Tag, id: int) -> Volume: 37 | """Parse a single volume from volume list item tag""" 38 | raise NotImplementedError() 39 | 40 | @abstractmethod 41 | def select_chapter_tags(self, tag: Tag, vol: Volume) -> Generator[Tag, None, None]: 42 | """Select chapter list item tags from volume tag""" 43 | raise NotImplementedError() 44 | 45 | @abstractmethod 46 | def parse_chapter_item(self, tag: Tag, id: int, vol: Volume) -> Chapter: 47 | """Parse a single chapter from chapter list item tag""" 48 | raise NotImplementedError() 49 | -------------------------------------------------------------------------------- /lncrawl/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/lncrawl/utils/__init__.py -------------------------------------------------------------------------------- /lncrawl/utils/common.py: -------------------------------------------------------------------------------- 1 | from typing import TypeVar, Generic, Callable, Type 2 | 3 | T = TypeVar('T') 4 | 5 
| 6 | class static_cached_property(Generic[T]): 7 | def __init__(self, func: Callable[..., T]): 8 | self._initialized = False 9 | if isinstance(func, staticmethod): 10 | self.func = func.__func__ 11 | else: 12 | self.func = func 13 | 14 | def __get__(self, instance: None, owner: Type) -> T: 15 | if not self._initialized: 16 | self._value = self.func() 17 | self._initialized = True 18 | return self._value 19 | -------------------------------------------------------------------------------- /lncrawl/utils/imgen.py: -------------------------------------------------------------------------------- 1 | # https://github.com/alexwlchan/specktre 2 | 3 | import random 4 | from typing import List, Optional 5 | 6 | from PIL import Image, ImageDraw 7 | 8 | from .material_colors import ColorName, ColorWeight, generate_colors 9 | from .tilings import TileGenerator, generate_tiles 10 | 11 | 12 | def generate_image( 13 | filename: Optional[str] = None, 14 | width: int = 512, 15 | height: int = 512, 16 | color_names: List[ColorName] = [], 17 | color_weights: List[ColorWeight] = [], 18 | generator: Optional[TileGenerator] = None, 19 | side_length: int = 50, 20 | ) -> Image: 21 | tiles = generate_tiles( 22 | generator, 23 | width, 24 | height, 25 | side_length, 26 | ) 27 | colors = generate_colors( 28 | color_names, 29 | color_weights, 30 | ) 31 | im = Image.new( 32 | mode="RGB", 33 | size=(width, height), 34 | ) 35 | for tile, color in zip(tiles, colors): 36 | ImageDraw.Draw(im).polygon(tile, fill=color) 37 | 38 | if filename: 39 | im.save(filename) 40 | 41 | return im 42 | 43 | 44 | good_color_names = set(ColorName).difference( 45 | [ 46 | ColorName.black, 47 | ColorName.white, 48 | ColorName.light_blue, 49 | ColorName.light_green, 50 | ] 51 | ) 52 | good_color_weights = set(ColorWeight).difference( 53 | [ 54 | ColorWeight.main, 55 | ColorWeight.w50, 56 | ColorWeight.w100, 57 | ColorWeight.w200, 58 | ColorWeight.w800, 59 | ColorWeight.w900, 60 | ColorWeight.a100, 61 | ColorWeight.a200, 62 | ] 63 | ) 64 | 65 | 66 | def generate_cover_image( 67 | filename: Optional[str] = None, 68 | width: int = 800, 69 | height: int = 1032, 70 | ) -> Image: 71 | return generate_image( 72 | filename=filename, 73 | width=width, 74 | height=height, 75 | color_names=good_color_names, 76 | color_weights=good_color_weights, 77 | side_length=random.randint(300, 750), 78 | ) 79 | -------------------------------------------------------------------------------- /lncrawl/utils/ratelimit.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class RateLimiter(object): 8 | """A helper class for controlling the number of requests per second. 9 | It is used along with the TaskManager class. 10 | 11 | Args: 12 | - ratelimit (float, optional): Number of requests per second.
13 | """ 14 | 15 | def __init__(self, ratelimit: float): 16 | if ratelimit <= 0: 17 | raise ValueError("ratelimit should be a non-zero positive number") 18 | self.period = 1 / ratelimit 19 | self._closed = False 20 | 21 | def _now(self): 22 | if hasattr(time, "monotonic"): 23 | return time.monotonic() 24 | return time.time() 25 | 26 | def __enter__(self): 27 | self._time = self._now() 28 | 29 | def __exit__(self, type, value, traceback): 30 | if self._closed: 31 | return 32 | d = (self._time + self.period) - self._now() 33 | self._time = self._now() 34 | if d > 0: 35 | time.sleep(d) 36 | 37 | def shutdown(self): 38 | self._closed = True 39 | 40 | def wrap(self, fn): 41 | def inner(*args, **kwargs): 42 | with self: 43 | return fn(*args, **kwargs) 44 | 45 | return inner 46 | -------------------------------------------------------------------------------- /lncrawl/utils/sockets.py: -------------------------------------------------------------------------------- 1 | import socket 2 | 3 | 4 | def free_port(host="127.0.0.1") -> int: 5 | """ 6 | Determines a free port using sockets. 7 | """ 8 | free_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 9 | free_socket.bind((host, 0)) 10 | free_socket.listen(5) 11 | port: int = free_socket.getsockname()[1] 12 | free_socket.close() 13 | return port 14 | -------------------------------------------------------------------------------- /lncrawl/utils/ssl_no_verify.py: -------------------------------------------------------------------------------- 1 | """ 2 | https://stackoverflow.com/a/15445989/1583052 3 | """ 4 | import warnings 5 | import contextlib 6 | 7 | import requests 8 | from urllib3.exceptions import InsecureRequestWarning 9 | 10 | 11 | old_merge_environment_settings = requests.Session.merge_environment_settings 12 | 13 | 14 | @contextlib.contextmanager 15 | def no_ssl_verification(): 16 | opened_adapters = set() 17 | 18 | def merge_environment_settings(self, url, proxies, stream, verify, cert): 19 | # Verification happens only once per connection so we need to close 20 | # all the opened adapters once we're done. Otherwise, the effects of 21 | # verify=False persist beyond the end of this context manager. 
22 | opened_adapters.add(self.get_adapter(url)) 23 | 24 | settings = old_merge_environment_settings( 25 | self, url, proxies, stream, verify, cert 26 | ) 27 | settings["verify"] = False 28 | 29 | return settings 30 | 31 | requests.Session.merge_environment_settings = merge_environment_settings 32 | 33 | try: 34 | with warnings.catch_warnings(): 35 | warnings.simplefilter("ignore", InsecureRequestWarning) 36 | yield 37 | finally: 38 | requests.Session.merge_environment_settings = old_merge_environment_settings 39 | 40 | for adapter in opened_adapters: 41 | try: 42 | adapter.close() 43 | except Exception: 44 | pass 45 | -------------------------------------------------------------------------------- /lncrawl/utils/uploader/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | cloud_drive = os.getenv("CLOUD_DRIVE", "ANONFILES") 4 | 5 | 6 | def upload(file_path, description=None): 7 | if cloud_drive == "GOOGLE_DRIVE": 8 | from .google_drive import upload 9 | 10 | return upload(file_path, description) 11 | elif cloud_drive == "GOFILE": 12 | from .gofile import upload 13 | 14 | return upload(file_path, description) 15 | else: 16 | from .anonfiles import upload 17 | 18 | return upload(file_path, description) 19 | -------------------------------------------------------------------------------- /lncrawl/utils/uploader/anonfiles.py: -------------------------------------------------------------------------------- 1 | from requests import Session 2 | 3 | 4 | # API Docs: https://anonfiles.com/docs/api 5 | def upload(file_path, description): 6 | with Session() as sess: 7 | with open(file_path, "rb") as fp: 8 | response = sess.post( 9 | "https://api.anonfiles.com/upload", 10 | files={"file": fp}, 11 | stream=True, 12 | ) 13 | response.raise_for_status() 14 | return response.json()["data"]["file"]["url"]["full"] 15 | -------------------------------------------------------------------------------- /lncrawl/utils/uploader/gofile.py: -------------------------------------------------------------------------------- 1 | from requests import Session 2 | 3 | 4 | # API Docs: https://gofile.io/api 5 | def upload(file_path, description=""): 6 | with Session() as sess: 7 | response = sess.get("https://api.gofile.io/getServer") 8 | response.raise_for_status() 9 | server_name = response.json()["data"]["server"] 10 | 11 | with open(file_path, "rb") as fp: 12 | response = sess.post( 13 | f"https://{server_name}.gofile.io/uploadFile", 14 | files={"file": fp}, 15 | stream=True, 16 | ) 17 | response.raise_for_status() 18 | return response.json()["data"]["downloadPage"] 19 | -------------------------------------------------------------------------------- /lncrawl/utils/uploader/google_drive.py: -------------------------------------------------------------------------------- 1 | """[DEPRECATED] Uploader for google drive""" 2 | import logging 3 | import os 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | try: 9 | from pydrive.auth import GoogleAuth 10 | from pydrive.drive import GoogleDrive 11 | except Exception: 12 | logger.error("`pydrive` was not setup properly") 13 | 14 | 15 | def upload(file_path, description=None) -> str: 16 | gauth = GoogleAuth() 17 | # gauth.LocalWebserverAuth() 18 | 19 | # Try to load saved client credentials 20 | credential_file = os.getenv("GOOGLE_DRIVE_CREDENTIAL_FILE") 21 | gauth.LoadCredentialsFile(credential_file) 22 | if gauth.credentials is None: 23 | # Authenticate if they're not there 24 | gauth.LocalWebserverAuth() 25 | 
elif gauth.access_token_expired: 26 | # Refresh them if expired 27 | gauth.Refresh() 28 | else: 29 | # Initialize the saved creds 30 | gauth.Authorize() 31 | 32 | # Save the current credentials to a file 33 | gauth.SaveCredentialsFile(credential_file) 34 | 35 | drive = GoogleDrive(gauth) 36 | folder_id = os.getenv("GOOGLE_DRIVE_FOLDER_ID") 37 | filename_w_ext = os.path.basename(file_path) 38 | filename, file_extension = os.path.splitext(filename_w_ext) 39 | 40 | # Upload file to folder 41 | f = drive.CreateFile({"parents": [{"kind": "drive#fileLink", "id": folder_id}]}) 42 | f["title"] = filename_w_ext 43 | 44 | # Make sure to add the path to the file to upload below. 45 | f.SetContentFile(file_path) 46 | f.Upload() 47 | 48 | logger.info("Uploaded file id: {}", f["id"]) 49 | return "https://drive.google.com/open?id=" + f["id"] 50 | -------------------------------------------------------------------------------- /lncrawl/webdriver/__init__.py: -------------------------------------------------------------------------------- 1 | # https://cloudbytes.dev/snippets/run-selenium-and-chrome-on-wsl2 2 | # https://github.com/ultrafunkamsterdam/undetected-chromedriver 3 | 4 | import logging 5 | from typing import Optional 6 | 7 | from selenium.webdriver import ChromeOptions 8 | from selenium.webdriver.remote.webdriver import WebDriver 9 | 10 | from ..core.arguments import get_args 11 | from ..core.soup import SoupMaker 12 | from .local import create_local 13 | from .remote import create_remote 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def create_new( 19 | options: Optional["ChromeOptions"] = None, 20 | timeout: Optional[float] = None, 21 | user_data_dir: Optional[str] = None, 22 | soup_maker: Optional[SoupMaker] = None, 23 | headless: bool = False, 24 | **kwargs, 25 | ) -> WebDriver: 26 | args = get_args() 27 | if args.selenium_grid: 28 | return create_remote( 29 | address=args.selenium_grid, 30 | options=options, 31 | timeout=timeout, 32 | soup_maker=soup_maker, 33 | ) 34 | else: 35 | return create_local( 36 | options=options, 37 | timeout=timeout, 38 | soup_maker=soup_maker, 39 | user_data_dir=user_data_dir, 40 | headless=headless, 41 | ) 42 | -------------------------------------------------------------------------------- /lncrawl/webdriver/job_queue.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import logging 3 | from threading import Semaphore, Thread 4 | from typing import List, Optional 5 | 6 | from selenium.webdriver.remote.webdriver import WebDriver 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | MAX_BROWSER_INSTANCES = 8 11 | 12 | __open_browsers: List[WebDriver] = [] 13 | __semaphore = Semaphore(MAX_BROWSER_INSTANCES) 14 | 15 | 16 | def __override_quit(driver: WebDriver): 17 | __open_browsers.append(driver) 18 | original = Thread(target=driver.quit, daemon=True) 19 | 20 | def override(): 21 | if driver in __open_browsers: 22 | __semaphore.release() 23 | __open_browsers.remove(driver) 24 | logger.info("Destroyed instance: %s", driver.session_id) 25 | if not original._started.is_set(): # type:ignore 26 | original.start() 27 | 28 | driver.quit = override # type:ignore 29 | 30 | 31 | def _acquire_queue(timeout: Optional[float] = None): 32 | acquired = __semaphore.acquire(True, timeout) 33 | if not acquired: 34 | raise TimeoutError("Failed to acquire semaphore") 35 | 36 | 37 | def _release_queue(driver: WebDriver): 38 | __override_quit(driver) 39 | 40 | 41 | def check_active(driver: WebDriver) -> bool: 42 | 
if not isinstance(driver, WebDriver): 43 | return False 44 | return driver in __open_browsers 45 | 46 | 47 | def cleanup_drivers(): 48 | for driver in __open_browsers: 49 | driver.close() 50 | driver.quit() 51 | 52 | 53 | atexit.register(cleanup_drivers) 54 | -------------------------------------------------------------------------------- /requirements-app.txt: -------------------------------------------------------------------------------- 1 | # app requirements 2 | typer 3 | ascii 4 | regex 5 | packaging 6 | lxml[html-clean] 7 | pyease-grpc>=1.6.0 8 | python-dotenv>=0.15.0,<2.0.0 9 | beautifulsoup4>=4.8.0,<5.0.0 10 | requests>=2.20.0,<2.33.0 11 | python-slugify>=4.0.0,<9.0.0 12 | colorama>=0.4.0,<0.5.0 13 | tqdm>=4.60,<5.0 14 | PyExecJS>=1.5.1,<2.0.0 15 | ebooklib>=0.17.0,<1.0.0 16 | pillow>=6.0.0 17 | cloudscraper>=1.2.71 18 | readability-lxml>=0.8.0,<1.0.0 19 | questionary>=1.6.0 20 | prompt-toolkit~=3.0 21 | html5lib~=1.1 22 | base58~=2.1.1 23 | python-box>=6.0.0,<8.0.0 24 | pycryptodome>=3.0.0,<4.0.0 25 | selenium>=3.141.0 26 | tenacity>=9.0.0 27 | -------------------------------------------------------------------------------- /requirements-bot.txt: -------------------------------------------------------------------------------- 1 | # bot requirements 2 | discord.py>=2.0.0 3 | python-telegram-bot[job-queue]~=20.0 4 | # pydrive>=1.3.1,<2.0.0 5 | 6 | uvicorn 7 | fastapi[standard] 8 | cachetools 9 | sqlmodel 10 | passlib[argon2] 11 | python-jose[cryptography] 12 | python-dateutil 13 | reflex 14 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # dev requirements 2 | wheel 3 | black 4 | flake8 5 | setuptools 6 | pyinstaller 7 | pycryptodome>=3.0.0,<4.0.0 8 | 9 | types-tqdm 10 | types-colorama 11 | types-cachetools 12 | types-python-dateutil 13 | types-passlib 14 | types-python-jose -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # app requirements 2 | typer 3 | ascii 4 | regex 5 | packaging 6 | lxml[html-clean] 7 | pyease-grpc>=1.6.0 8 | python-dotenv>=0.15.0,<2.0.0 9 | beautifulsoup4>=4.8.0,<5.0.0 10 | requests>=2.20.0,<2.33.0 11 | python-slugify>=4.0.0,<9.0.0 12 | colorama>=0.4.0,<0.5.0 13 | tqdm>=4.60,<5.0 14 | PyExecJS>=1.5.1,<2.0.0 15 | ebooklib>=0.17.0,<1.0.0 16 | pillow>=6.0.0 17 | cloudscraper>=1.2.71 18 | readability-lxml>=0.8.0,<1.0.0 19 | questionary>=1.6.0 20 | prompt-toolkit~=3.0 21 | html5lib~=1.1 22 | base58~=2.1.1 23 | python-box>=6.0.0,<8.0.0 24 | pycryptodome>=3.0.0,<4.0.0 25 | selenium>=3.141.0 26 | tenacity>=9.0.0 27 | 28 | # bot requirements 29 | discord.py>=2.0.0 30 | python-telegram-bot[job-queue]~=20.0 31 | uvicorn 32 | fastapi[standard] 33 | cachetools 34 | sqlmodel 35 | passlib[argon2] 36 | python-jose[cryptography] 37 | python-dateutil 38 | reflex 39 | 40 | # dev requirements 41 | wheel 42 | black 43 | flake8 44 | tk-tools 45 | setuptools 46 | pyinstaller 47 | types-tqdm 48 | types-colorama 49 | types-cachetools 50 | types-python-dateutil 51 | types-passlib 52 | types-python-jose -------------------------------------------------------------------------------- /res/lncrawl-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/res/lncrawl-icon.png 
-------------------------------------------------------------------------------- /res/lncrawl-web.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/res/lncrawl-web.png -------------------------------------------------------------------------------- /res/lncrawl.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/res/lncrawl.ico -------------------------------------------------------------------------------- /scripts/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim-bookworm 2 | 3 | USER root 4 | # Install general dependencies 5 | RUN apt-get update -yq \ 6 | && apt-get install -yq \ 7 | wget tar xz-utils make cmake g++ libffi-dev libegl1 libopengl0 libxcb-cursor0 \ 8 | libnss3 libgl1-mesa-glx libxcomposite1 libxrandr2 libxi6 fontconfig \ 9 | libxkbcommon-x11-0 libxtst6 libxkbfile1 libxcomposite-dev libxdamage-dev \ 10 | && rm -rf /var/lib/apt/lists/* \ 11 | && apt-get clean autoclean \ 12 | && apt-get autoremove -yq 13 | 14 | # Install calibre 15 | RUN wget -nv -O- https://download.calibre-ebook.com/linux-installer.sh | sh /dev/stdin \ 16 | && ln -s /opt/calibre/ebook-convert /usr/local/bin/ebook-convert 17 | 18 | # Add app user 19 | RUN useradd -ms /bin/bash lncrawl 20 | USER lncrawl 21 | 22 | # Install global requirements 23 | RUN alias python=python3 24 | RUN alias pip=pip3 25 | RUN export PATH="/home/lncrawl/.local/bin:$PATH" 26 | RUN pip install -U pip wheel 27 | 28 | WORKDIR /app 29 | 30 | # Install app requirements 31 | COPY --chown=lncrawl:lncrawl requirements.txt . 
32 | RUN pip install -r requirements.txt 33 | 34 | COPY .env .env 35 | COPY lncrawl lncrawl 36 | COPY sources sources 37 | 38 | ENV OUTPUT_PATH=/home/lncrawl/output 39 | RUN mkdir -p $OUTPUT_PATH 40 | 41 | ENTRYPOINT [ "python", "-m", "lncrawl" ] 42 | -------------------------------------------------------------------------------- /scripts/bitanon.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | VERSION=$(head -n 1 lncrawl/VERSION) 4 | 5 | # SHLINK_API_KEY= 6 | 7 | 8 | EXE_LINK="https://github.com/dipu-bd/lightnovel-crawler/releases/download/v$VERSION/lncrawl.exe" 9 | EXE_TITLE="Lightnovel Crawler v$VERSION (Windows)" 10 | 11 | LINUX_LINK="https://github.com/dipu-bd/lightnovel-crawler/releases/download/v$VERSION/lncrawl-linux" 12 | LINUX_TITLE="Lightnovel Crawler v$VERSION (Linux)" 13 | 14 | MAC_LINK="https://github.com/dipu-bd/lightnovel-crawler/releases/download/v$VERSION/lncrawl-mac" 15 | MAC_TITLE="Lightnovel Crawler v$VERSION (Mac)" 16 | 17 | set -ex 18 | 19 | curl -X 'PATCH' \ 20 | 'https://go.bitanon.dev/rest/v3/short-urls/lncrawl-windows' \ 21 | -H 'accept: application/json' \ 22 | -H 'Content-Type: application/json' \ 23 | -H "X-Api-Key: $SHLINK_API_KEY" \ 24 | -d '{"title": "'"$EXE_TITLE"'","longUrl": "'"$EXE_LINK"'"}' 25 | 26 | curl -X 'PATCH' \ 27 | 'https://go.bitanon.dev/rest/v3/short-urls/lncrawl-linux' \ 28 | -H 'accept: application/json' \ 29 | -H 'Content-Type: application/json' \ 30 | -H "X-Api-Key: $SHLINK_API_KEY" \ 31 | -d '{"title": "'"$LINUX_TITLE"'","longUrl": "'"$LINUX_LINK"'"}' 32 | 33 | curl -X 'PATCH' \ 34 | 'https://go.bitanon.dev/rest/v3/short-urls/lncrawl-mac' \ 35 | -H 'accept: application/json' \ 36 | -H 'Content-Type: application/json' \ 37 | -H "X-Api-Key: $SHLINK_API_KEY" \ 38 | -d '{"title": "'"$MAC_TITLE"'","longUrl": "'"$MAC_LINK"'"}' 39 | -------------------------------------------------------------------------------- /scripts/build.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | SET /P VERSION= /dev/null; then 9 | if [ -w /etc/passwd ]; then 10 | echo "${USER_NAME:-default}:x:$(id -u):0:${USER_NAME:-default} user:${HOME}:/sbin/nologin" >> /etc/passwd 11 | fi 12 | fi 13 | 14 | /usr/bin/supervisord --configuration /etc/supervisord.conf & 15 | SUPERVISOR_PID=$! 16 | 17 | 18 | function shutdown { 19 | echo "Trapped SIGTERM/SIGINT/x so shutting down supervisord..." 
20 | kill -s SIGTERM ${SUPERVISOR_PID} 21 | wait ${SUPERVISOR_PID} 22 | echo "Shutdown complete" 23 | } 24 | 25 | trap shutdown SIGTERM SIGINT 26 | 27 | sleep 30 28 | exec /usr/bin/python3 -m lncrawl --suppress $@ & 29 | wait ${SUPERVISOR_PID} 30 | -------------------------------------------------------------------------------- /scripts/lint.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | flake8 -v --count --show-source --statistics -------------------------------------------------------------------------------- /scripts/lint.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | flake8 -v --count --show-source --statistics 4 | -------------------------------------------------------------------------------- /scripts/lncrawl.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Lightnovel Crawler 3 | After=network-online.target 4 | Wants=network-online.target 5 | 6 | [Service] 7 | User=dipu 8 | WorkingDirectory=/home/dipu/projects/lightnovel-crawler 9 | RestartSec=2s 10 | Restart=always 11 | ExecStart=/bin/bash ./scripts/start.sh 12 | ExecStop=/bin/bash ./scripts/stop.sh 13 | 14 | [Install] 15 | WantedBy=multi-user.target 16 | Alias=lncrawl.service 17 | -------------------------------------------------------------------------------- /scripts/publish.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | SET /P VERSION=/dev/null 2>&1 4 | pgrep python -a | grep "discord" | awk '{print $1}' | xargs kill -9 >/dev/null 2>&1 5 | echo "Stopped all discord bots." 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # https://setuptools.readthedocs.io/en/latest/userguide/declarative_config.html 2 | 3 | [metadata] 4 | name = lightnovel-crawler 5 | version = file: lncrawl/VERSION 6 | author = Sudipto Chandra 7 | author_email = dipu.sudipta@gmail.com 8 | url = https://github.com/dipu-bd/lightnovel-crawler 9 | description = An app to download novels from online sources and generate e-books. 
10 | long_description = file: README.pip 11 | long_description_content_type = text/markdown 12 | license = Apache 2.0 13 | license_file = LICENSE 14 | platforms = any 15 | keywords = lightnovel, crawler, lncrawl, novel, pdf, epub, mobi, scraper 16 | classifiers = 17 | Development Status :: 5 - Production/Stable 18 | License :: OSI Approved :: Apache Software License 19 | Natural Language :: English 20 | Intended Audience :: End Users/Desktop 21 | Programming Language :: Python :: 3 :: Only 22 | Programming Language :: Python :: 3.8 23 | Programming Language :: Python :: 3.9 24 | Programming Language :: Python :: 3.10 25 | Programming Language :: Python :: 3.11 26 | Programming Language :: Python :: 3.12 27 | Topic :: Games/Entertainment 28 | Environment :: Console 29 | project_urls = 30 | "Source Code" = https://github.com/dipu-bd/lightnovel-crawler 31 | "Issue Tracker" = https://github.com/dipu-bd/lightnovel-crawler/issues 32 | "Documentation" = https://github.com/dipu-bd/lightnovel-crawler/blob/master/README.md 33 | 34 | [options] 35 | zip_safe = False 36 | python_requires = >= 3.8 37 | include_package_data = True 38 | 39 | [options.entry_points] 40 | console_scripts = 41 | lncrawl = lncrawl:main 42 | lightnovel_crawler = lncrawl:main 43 | lightnovel-crawler = lncrawl:main 44 | 45 | [flake8] 46 | #select=E9,F63,F7,F82 47 | ignore=E203,E265,E265,W503 48 | indent-size=4 49 | max-line-length=150 50 | exclude = .git, .eggs, __pycache__, tests/, docs/, build/, dist/, res/, venv/, venv-win/, venv36/, venv-linux/ 51 | # max-complexity=10 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | from glob import glob 5 | 6 | if sys.version_info[:2] < (3, 8): 7 | raise RuntimeError("Lightnovel crawler only supports Python 3.8 and later.") 8 | 9 | try: 10 | from setuptools import config, setup 11 | except ImportError: 12 | print("Run `pip install setuptools`") 13 | exit(1) 14 | 15 | 16 | def parse_requirements(filename): 17 | with open(filename, "r", encoding="utf8") as f: 18 | requirements = f.read().strip().split("\n") 19 | requirements = [ 20 | r.strip() for r in requirements if r.strip() and not r.startswith("#") 21 | ] 22 | return requirements 23 | 24 | 25 | def is_ignored(fname: str): 26 | try: 27 | status = os.popen(f"git check-ignore {fname}").read() 28 | return bool(status.strip()) 29 | except Exception: 30 | return False 31 | 32 | 33 | run_pyi = "package" in sys.argv 34 | if run_pyi: 35 | sys.argv.remove("package") 36 | 37 | if len(sys.argv) == 1: 38 | sys.argv += ["build"] 39 | 40 | lncrawl_files = [] 41 | lncrawl_packages = ["lncrawl"] 42 | for fname in glob("lncrawl/**/*", recursive=True): 43 | if os.path.isdir(fname) and not is_ignored(fname): 44 | lncrawl_packages.append(".".join(fname.split(os.sep))) 45 | if os.path.isfile(fname) and not is_ignored(fname): 46 | lncrawl_files.append("/".join(fname.split(os.sep)[1:])) 47 | 48 | sources_files = [] 49 | sources_packages = ["sources"] 50 | for fname in glob("sources/**/*", recursive=True): 51 | if os.path.isdir(fname) and not is_ignored(fname): 52 | sources_packages.append(".".join(fname.split(os.sep))) 53 | if os.path.isfile(fname) and not is_ignored(fname): 54 | sources_files.append("/".join(fname.split(os.sep)[1:])) 55 | 56 | config.read_configuration("setup.cfg") 57 | 58 | setup( 59 | install_requires=parse_requirements("requirements-app.txt"), 60 | 
packages=lncrawl_packages + sources_packages, 61 | package_data={ 62 | "lncrawl": lncrawl_files, 63 | "sources": sources_files, 64 | }, 65 | ) 66 | 67 | if run_pyi: 68 | from setup_pyi import package 69 | 70 | package() 71 | -------------------------------------------------------------------------------- /sources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipu-bd/lightnovel-crawler/1ed455e6436ec7d9d6c7a497f621c6ba58f1a1b7/sources/__init__.py -------------------------------------------------------------------------------- /sources/ar/kolnovel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.mangastream import MangaStreamTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Kolnovel(MangaStreamTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://kolnovel.com/"] 12 | -------------------------------------------------------------------------------- /sources/en/8/88tang.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class TangEatDrinkRead(Crawler): 11 | base_url = "https://88tangeatdrinkread.wordpress.com/" 12 | 13 | def read_novel_info(self): 14 | logger.debug("Visiting %s", self.novel_url) 15 | soup = self.get_soup(self.novel_url) 16 | 17 | title = soup.select_one("h1.entry-title").text 18 | self.novel_title = title.rsplit("~", 1)[0].strip() 19 | logger.debug("Novel title = %s", self.novel_title) 20 | 21 | self.novel_author = "by 88 Tang" 22 | logger.info("Novel author: %s", self.novel_author) 23 | 24 | # Removes none TOC links. 25 | toc_parts = soup.select_one(".entry-content") 26 | for notoc in toc_parts.select(".sharedaddy, .code-block, script, .adsbygoogle"): 27 | notoc.extract() 28 | 29 | # Extract volume-wise chapter entries 30 | # TODO: Chapter title are url links, it's the way translator formatted website. 
31 | chapters = soup.select( 32 | '.entry-content a[href*="88tangeatdrinkread.wordpress.com"]' 33 | ) 34 | 35 | for a in chapters: 36 | chap_id = len(self.chapters) + 1 37 | vol_id = 1 + len(self.chapters) // 100 38 | if len(self.volumes) < vol_id: 39 | self.volumes.append({"id": vol_id}) 40 | self.chapters.append( 41 | { 42 | "id": chap_id, 43 | "volume": vol_id, 44 | "url": self.absolute_url(a["href"]), 45 | "title": a.text.strip() or ("Chapter %d" % chap_id), 46 | } 47 | ) 48 | 49 | def download_chapter_body(self, chapter): 50 | soup = self.get_soup(chapter["url"]) 51 | contents = soup.select_one("div.entry-content") 52 | return self.cleaner.extract_contents(contents) 53 | -------------------------------------------------------------------------------- /sources/en/a/allnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class AllNovelCrawler(NovelFullTemplate): 10 | base_url = [ 11 | "https://allnovel.org/", 12 | "https://www.allnovel.org/", 13 | "https://allnovelxo.com/" 14 | ] 15 | 16 | def initialize(self) -> None: 17 | self.cleaner.bad_tags.update(["h3"]) 18 | -------------------------------------------------------------------------------- /sources/en/a/allnovelfull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class AllNovelFullCrawler(NovelFullTemplate): 10 | base_url = [ 11 | "https://allnovelfull.com/", 12 | "https://allnovelfull.net/" 13 | ] 14 | -------------------------------------------------------------------------------- /sources/en/a/anythingnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class AnythingNovelCrawler(Crawler): 9 | base_url = "https://anythingnovel.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | self.novel_title = soup.select("#wrap .breadcrumbs span")[-1].text.strip() 16 | logger.info("Novel title: %s", self.novel_title) 17 | 18 | possible_novel_cover = soup.select_one("#content a img") 19 | if possible_novel_cover: 20 | self.novel_cover = self.absolute_url(possible_novel_cover["src"]) 21 | logger.info("Novel cover: %s", self.novel_cover) 22 | 23 | volumes = set([]) 24 | for a in reversed(soup.select("#content div li a")): 25 | title = a.text.strip() 26 | chapter_id = len(self.chapters) + 1 27 | volume_id = 1 + (chapter_id - 1) // 100 28 | volumes.add(volume_id) 29 | self.chapters.append( 30 | { 31 | "id": chapter_id, 32 | "volume": volume_id, 33 | "title": title, 34 | "url": a["href"], 35 | } 36 | ) 37 | 38 | self.chapters.sort(key=lambda x: x["id"]) 39 | self.volumes = [{"id": x, "title": ""} for x in volumes] 40 | 41 | def download_chapter_body(self, chapter): 42 | soup = self.get_soup(chapter["url"]) 43 | content = soup.select_one("div#content") 44 | self.cleaner.clean_contents(content) 45 | body = content.select("p") 46 | body = [str(p) for p in body if self.should_take(p)] 47 | return "

" + "

".join(body) + "

" 48 | 49 | def should_take(self, p): 50 | txt = p.text.strip().lower() 51 | return txt and txt != "advertisement" 52 | -------------------------------------------------------------------------------- /sources/en/a/arcanetranslations.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.mangastream import MangaStreamTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Arcanetranslations(MangaStreamTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://arcanetranslations.com/"] 12 | 13 | def select_chapter_body(self, tag): 14 | result = super().select_chapter_body(tag) 15 | if "Login to buy access to this content" in result.text: 16 | raise Exception( 17 | "This content is behind a paywall. Please login to access it." 18 | ) 19 | return result 20 | -------------------------------------------------------------------------------- /sources/en/a/asianhobbyist.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class AsianHobbyistCrawler(Crawler): 10 | base_url = "https://www.asianhobbyist.com/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | possible_title = soup.select_one("h1.entry-title") 17 | assert possible_title, "No novel title" 18 | self.novel_title = possible_title.text 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | possible_image = soup.select_one(".main-wrap .background img[data-lazy-src]") 22 | if possible_image: 23 | self.novel_cover = self.absolute_url(possible_image["data-lazy-src"]) 24 | logger.info("Novel cover: %s", self.novel_cover) 25 | 26 | for a in soup.select(".divTable .tableBody div.fn a"): 27 | title = a.text.strip() 28 | chap_id = len(self.chapters) + 1 29 | self.chapters.append( 30 | { 31 | "id": chap_id, 32 | "title": title, 33 | "url": self.absolute_url(a["href"]), 34 | } 35 | ) 36 | 37 | def download_chapter_body(self, chapter): 38 | soup = self.get_soup(chapter["url"]) 39 | content = soup.select_one(".entry-content") 40 | self.cleaner.extract_contents(content) 41 | return content.decode_contents() 42 | -------------------------------------------------------------------------------- /sources/en/a/asianovel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import re 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | # Created using AsianHobbyist as a template. 
10 | class AsianNovelCrawler(Crawler): 11 | base_url = "https://read.asianovel.com/" 12 | 13 | def read_novel_info(self): 14 | logger.debug("Visiting %s", self.novel_url) 15 | novel_toc_url = self.novel_url + "/table-of-contents" 16 | soup = self.get_soup(novel_toc_url) 17 | 18 | possible_title = soup.select_one(".novel-description-full") 19 | assert possible_title, "No novel title" 20 | 21 | self.novel_title = possible_title.get_text() 22 | logger.info("Novel title: %s", self.novel_title) 23 | 24 | possible_image = soup.select_one("article:first-of-type .row img") 25 | if possible_image: 26 | self.novel_cover = self.absolute_url(possible_image["src"]) 27 | 28 | logger.info("Novel cover: %s", self.novel_cover) 29 | 30 | self.volumes.append({"id": 1}) 31 | for a in soup.select("#toc > div a"): 32 | title = a.select_one("div:first-of-type").get_text().strip() 33 | 34 | chap_id = len(self.chapters) + 1 35 | match = re.findall(r"ch(apter)? (\d+)", title, re.IGNORECASE) 36 | if len(match) == 1: 37 | chap_id = int(match[0][1]) 38 | 39 | self.chapters.append( 40 | { 41 | "volume": 1, 42 | "id": chap_id, 43 | "title": title, 44 | "url": self.absolute_url(a["href"]), 45 | } 46 | ) 47 | 48 | def download_chapter_body(self, chapter): 49 | logger.debug("Visiting %s", chapter["url"]) 50 | soup = self.get_soup(chapter["url"]) 51 | 52 | content = soup.select_one("#story") 53 | self.cleaner.clean_contents(content) 54 | 55 | return "".join([str(p) for p in content.select("p") if len(p.text.strip()) > 1]) 56 | -------------------------------------------------------------------------------- /sources/en/b/boxnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.madara import MadaraTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class BoxNovelCrawler(MadaraTemplate): 10 | base_url = ["https://boxnovel.com/"] 11 | 12 | def initialize(self) -> None: 13 | self.cleaner.bad_css.update( 14 | [ 15 | ".para-comment", 16 | ".j_open_para_comment", 17 | ".j_para_comment_count", 18 | ".para-comment-num", 19 | "#wp-manga-current-chap", 20 | ".cha-tit", 21 | ".subtitle ", 22 | ] 23 | ) 24 | self.cleaner.bad_tag_text_pairs.update( 25 | { 26 | "p": r"Thank you for reading on myboxnovel.com" 27 | } 28 | ) 29 | -------------------------------------------------------------------------------- /sources/en/c/chickengege.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Tag 5 | 6 | from lncrawl.core.crawler import Crawler 7 | from lncrawl.core.exeptions import LNException 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class ChickenGegeCrawler(Crawler): 13 | base_url = ["https://www.chickengege.org/"] 14 | 15 | def initialize(self) -> None: 16 | self.cleaner.bad_css.update([".m-a-box", ".m-a-box-container"]) 17 | 18 | def read_novel_info(self): 19 | soup = self.get_soup(self.novel_url) 20 | 21 | title_tag = soup.select_one("h1.entry-title") 22 | if not isinstance(title_tag, Tag): 23 | raise LNException("No title found") 24 | 25 | self.novel_title = title_tag.text.strip() 26 | 27 | image_tag = soup.select_one("img.novelist-cover-image") 28 | if isinstance(image_tag, Tag): 29 | self.novel_cover = self.absolute_url(image_tag["src"]) 30 | 31 | logger.info("Novel cover: %s", self.novel_cover) 32 | 33 | for a in soup.select("ul#novelList a, ul#extraList a, table#novelList a"): 34 | 
self.chapters.append( 35 | { 36 | "id": len(self.chapters) + 1, 37 | "title": a.text.strip(), 38 | "url": self.absolute_url(a["href"]), 39 | } 40 | ) 41 | 42 | def download_chapter_body(self, chapter): 43 | soup = self.get_soup(chapter["url"]) 44 | contents = soup.select_one("article div.entry-content") 45 | self.cleaner.clean_contents(contents) 46 | 47 | return str(contents) 48 | -------------------------------------------------------------------------------- /sources/en/c/ckandawrites.online.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.mangastream import MangaStreamTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class CkandawritesOnline(MangaStreamTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://ckandawrites.online/"] 12 | -------------------------------------------------------------------------------- /sources/en/c/crescentmoon.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class CrescentMoonCrawler(Crawler): 11 | base_url = "https://crescentmoon.blog/" 12 | 13 | def read_novel_info(self): 14 | logger.debug("Visiting %s", self.novel_url) 15 | soup = self.get_soup(self.novel_url) 16 | 17 | self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() 18 | logger.info("Novel title: %s", self.novel_title) 19 | 20 | self.novel_cover = self.absolute_url( 21 | soup.select_one("div.entry-content p a")["href"] 22 | ) 23 | logger.info("Novel cover: %s", self.novel_cover) 24 | 25 | self.novel_author = soup.select("div.entry-content p")[2].text.strip() 26 | logger.info("Novel author: %s", self.novel_author) 27 | 28 | toc = None 29 | a = soup.select("div.entry-content p") 30 | for idx, item in enumerate(a): 31 | if "table of contents" in item.text.strip().lower(): 32 | toc = a[idx + 1] 33 | assert toc, "No table of contents" 34 | 35 | for x in toc.find_all("a"): 36 | chap_id = len(self.chapters) + 1 37 | vol_id = 1 + len(self.chapters) // 100 38 | if len(self.volumes) < vol_id: 39 | self.volumes.append({"id": vol_id}) 40 | self.chapters.append( 41 | { 42 | "id": chap_id, 43 | "volume": vol_id, 44 | "url": self.absolute_url(x["href"]), 45 | "title": x.text.strip() or ("Chapter %d" % chap_id), 46 | } 47 | ) 48 | 49 | def download_chapter_body(self, chapter): 50 | soup = self.get_soup(chapter["url"]) 51 | contents = soup.select("div.entry-content") 52 | return self.cleaner.extract_contents(contents) 53 | -------------------------------------------------------------------------------- /sources/en/d/dmtrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class DMTranslations(Crawler): 11 | base_url = [ 12 | "https://dmtranslationscn.com/", 13 | ] 14 | 15 | def read_novel_info(self): 16 | logger.debug("Visiting %s", self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | possible_title = soup.select_one(".entry-title") 20 | assert possible_title, "No novel title" 21 | self.novel_title = possible_title.text.strip() 22 | logger.info("Novel title: %s", self.novel_title) 23 | 24 | possible_image = soup.select_one("div.entry-content p img") 25 | if possible_image: 
26 | self.novel_cover = self.absolute_url(possible_image["src"]) 27 | logger.info("Novel cover: %s", self.novel_cover) 28 | 29 | self.novel_author = "Translated by DM Translations" 30 | logger.info("Novel author: %s", self.novel_author) 31 | 32 | # Extract volume-wise chapter entries 33 | chapters = soup.find("div", {"class": "entry-content"}).findAll("a") 34 | 35 | for a in chapters: 36 | chap_id = len(self.chapters) + 1 37 | vol_id = 1 + len(self.chapters) // 100 38 | if len(self.volumes) < vol_id: 39 | self.volumes.append({"id": vol_id}) 40 | self.chapters.append( 41 | { 42 | "id": chap_id, 43 | "volume": vol_id, 44 | "url": self.absolute_url(a["href"]), 45 | "title": a.text.strip() or ("Chapter %d" % chap_id), 46 | } 47 | ) 48 | 49 | def download_chapter_body(self, chapter): 50 | soup = self.get_soup(chapter["url"]) 51 | 52 | body_parts = soup.select_one("div.entry-content") 53 | 54 | for content in body_parts.select("p"): 55 | for bad in ["Translator- DM", "Previous Chapter", "Next Chapter"]: 56 | if bad in content.text: 57 | content.extract() 58 | 59 | return self.cleaner.extract_contents(body_parts) 60 | -------------------------------------------------------------------------------- /sources/en/e/ebotnovel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.mangastream import MangaStreamTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Ebotnovel(MangaStreamTemplate): 9 | has_mtl = True 10 | has_manga = False 11 | base_url = ["https://ebotnovel.com/"] 12 | -------------------------------------------------------------------------------- /sources/en/e/exiledrebels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class ExiledRebelsScanlations(Crawler): 11 | base_url = "https://exiledrebelsscanlations.com/" 12 | 13 | def read_novel_info(self): 14 | logger.debug("Visiting %s", self.novel_url) 15 | soup = self.get_soup(self.novel_url) 16 | 17 | self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() 18 | logger.info("Novel title: %s", self.novel_title) 19 | 20 | possible_image = soup.select_one(".post-thumbnail img") 21 | if possible_image: 22 | self.novel_cover = self.absolute_url(possible_image["src"]) 23 | logger.info("Novel cover: %s", self.novel_cover) 24 | 25 | self.novel_author = "Translated by ExR" 26 | logger.info("Novel author: %s", self.novel_author) 27 | 28 | # Extract volume-wise chapter entries 29 | # Stops external links being selected as chapters 30 | chapters = soup.select( 31 | 'div.lcp_catlist p [href*="exiledrebelsscanlations.com/"]' 32 | ) 33 | 34 | for a in chapters: 35 | chap_id = len(self.chapters) + 1 36 | vol_id = 1 + len(self.chapters) // 100 37 | if len(self.volumes) < vol_id: 38 | self.volumes.append({"id": vol_id}) 39 | self.chapters.append( 40 | { 41 | "id": chap_id, 42 | "volume": vol_id, 43 | "url": self.absolute_url(a["href"]), 44 | "title": a.text.strip() or ("Chapter %d" % chap_id), 45 | } 46 | ) 47 | 48 | def download_chapter_body(self, chapter): 49 | soup = self.get_soup(chapter["url"]) 50 | contents = soup.select_one("div#wtr-content") 51 | return self.cleaner.extract_contents(contents) 52 | -------------------------------------------------------------------------------- /sources/en/f/fenrirtranslations.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import BeautifulSoup 5 | 6 | from lncrawl.templates.madara import MadaraTemplate 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class FenrirTranslationsCrawler(MadaraTemplate): 11 | base_url = ["https://fenrirtranslations.com/"] 12 | 13 | def initialize(self) -> None: 14 | self.cleaner.bad_css.update( 15 | [ 16 | "div.chapter-warning", 17 | "div.code-block" 18 | ] 19 | ) 20 | 21 | def parse_authors(self, soup: BeautifulSoup): 22 | for a in soup.select('.manga-authors a[href*="author"]'): 23 | yield a.text.strip() 24 | 25 | def parse_summary(self, soup): 26 | possible_summary = soup.select_one(".manga-summary") 27 | return self.cleaner.extract_contents(possible_summary) 28 | 29 | def select_chapter_tags(self, soup: BeautifulSoup): 30 | try: 31 | clean_novel_url = self.novel_url.split("?")[0].strip("/") 32 | response = self.submit_form(f"{clean_novel_url}/ajax/chapters/") 33 | soup = self.make_soup(response) 34 | chapters = soup.select(".free ul.main .wp-manga-chapter a") 35 | yield from reversed(chapters) 36 | except Exception as e: 37 | logger.debug("Failed to fetch chapters using ajax", e) 38 | -------------------------------------------------------------------------------- /sources/en/f/freelightnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re 4 | 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class FreeLightNovel(Crawler): 11 | base_url = "https://www.freelightnovel.com/" 12 | 13 | def read_novel_info(self): 14 | logger.debug("Visiting %s", self.novel_url) 15 | soup = self.get_soup(self.novel_url) 16 | 17 | possible_title = soup.select_one("h1.page-header") 18 | assert possible_title, "No novel title" 19 | self.novel_title = possible_title.text 20 | logger.info("Novel title: %s", self.novel_title) 21 | 22 | possible_image = soup.select_one(".content img.img-responsive") 23 | if possible_image: 24 | self.novel_cover = self.absolute_url(possible_image["src"]) 25 | logger.info("Novel cover: %s", self.novel_cover) 26 | 27 | self.volumes.append({"id": 1}) 28 | for a in soup.select(".book-toc .dropdown-menu li.leaf a"): 29 | title = a.text.strip() 30 | 31 | chap_id = len(self.chapters) + 1 32 | match = re.findall(r"ch(apter)? 
(\d+)", title, re.IGNORECASE) 33 | if len(match) == 1: 34 | chap_id = int(match[0][1]) 35 | 36 | self.chapters.append( 37 | { 38 | "volume": 1, 39 | "id": chap_id, 40 | "title": title, 41 | "url": self.absolute_url(a["href"]), 42 | } 43 | ) 44 | 45 | def download_chapter_body(self, chapter): 46 | logger.debug("Visiting %s", chapter["url"]) 47 | soup = self.get_soup(chapter["url"]) 48 | 49 | content = soup.select_one(".content") 50 | self.cleaner.clean_contents(content) 51 | 52 | return "".join([str(p) for p in content.select("p") if len(p.text.strip()) > 1]) 53 | -------------------------------------------------------------------------------- /sources/en/h/hotnovelfull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class HotNovelFullCrawler(NovelFullTemplate): 10 | base_url = ["https://hotnovelfull.com/"] 11 | 12 | def initialize(self) -> None: 13 | self.cleaner.bad_tag_text_pairs.update( 14 | { 15 | "h4": [ 16 | r"Chapter \d+", 17 | r"^\s*(Translator|Editor):.*$", 18 | ], 19 | "strong": r"This chapter upload first at NovelNext\.com", 20 | } 21 | ) 22 | -------------------------------------------------------------------------------- /sources/en/i/imperfectcomic.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ImperfectComicCrawler(MangaStreamTemplate): 10 | base_url = ["https://imperfectcomic.org/"] 11 | has_manga = True 12 | -------------------------------------------------------------------------------- /sources/en/i/isotls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class IsotlsCrawler(Crawler): 9 | base_url = [ 10 | 'https://isotls.com/', 11 | 'https://www.isotls.com/', 12 | ] 13 | 14 | def read_novel_info(self): 15 | soup = self.get_soup(self.novel_url) 16 | 17 | possible_cover = soup.select_one('meta[property="og:image"]') 18 | if possible_cover: 19 | self.novel_cover = self.absolute_url(possible_cover['content']) 20 | 21 | possible_title = soup.select_one('meta[property="og:title"]') 22 | assert possible_title, 'No novel title' 23 | self.novel_title = possible_title['content'] 24 | 25 | possible_novel_author = soup.select_one('meta[name="twitter:data1"]') 26 | if possible_novel_author: 27 | self.novel_author = possible_novel_author['content'] 28 | 29 | for a in soup.select('main section:nth-child(3) nav ul li a'): 30 | chap_id = len(self.chapters) + 1 31 | vol_id = len(self.chapters) // 100 + 1 32 | if len(self.chapters) % 100 == 0: 33 | self.volumes.append({'id': vol_id}) 34 | 35 | self.chapters.append({ 36 | 'id': chap_id, 37 | 'volume': vol_id, 38 | 'title': a.text.strip(), 39 | 'url': self.absolute_url(a['href']), 40 | }) 41 | 42 | def download_chapter_body(self, chapter): 43 | soup = self.get_soup(chapter['url']) 44 | contents = soup.select_one("div.content") 45 | return self.cleaner.extract_contents(contents) 46 | -------------------------------------------------------------------------------- /sources/en/i/snowycodex.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from bs4 import BeautifulSoup, Tag 6 | 7 | from lncrawl.models import Chapter 8 | from lncrawl.templates.browser.chapter_only import ChapterOnlyBrowserTemplate 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class SnowyCodexCrawler(ChapterOnlyBrowserTemplate): 14 | base_url = "https://snowycodex.com/" 15 | 16 | def initialize(self) -> None: 17 | self.cleaner.bad_css.update( 18 | { 19 | ".wpulike", 20 | ".sharedaddy", 21 | ".wpulike-default", 22 | '[style="text-align:center;"]', 23 | } 24 | ) 25 | self.cleaner.bad_tag_text_pairs.update( 26 | { 27 | "p": r"[\u4E00-\u9FFF]+", 28 | } 29 | ) 30 | 31 | def parse_title(self, soup: BeautifulSoup) -> str: 32 | tag = soup.select_one(".entry-content h2") 33 | assert isinstance(tag, Tag) 34 | return tag.text.strip() 35 | 36 | def parse_cover(self, soup: BeautifulSoup) -> str: 37 | tag = soup.select_one(".entry-content img") 38 | assert isinstance(tag, Tag) 39 | if tag.has_attr("data-src"): 40 | return self.absolute_url(tag["data-src"]) 41 | elif tag.has_attr("src"): 42 | return self.absolute_url(tag["src"]) 43 | 44 | def parse_authors(self, soup: BeautifulSoup): 45 | tag = soup.find("strong", string="Author:") 46 | assert isinstance(tag, Tag) 47 | yield tag.next_sibling.text.strip() 48 | 49 | def select_chapter_tags(self, soup: BeautifulSoup): 50 | yield from soup.select(".entry-content a[href*='/chapter']") 51 | 52 | def parse_chapter_item(self, tag: Tag, id: int) -> Chapter: 53 | return Chapter( 54 | id=id, 55 | title=tag.text.strip(), 56 | url=self.absolute_url(tag["href"]), 57 | ) 58 | 59 | def select_chapter_body(self, soup: BeautifulSoup) -> Tag: 60 | return soup.select_one(".entry-content") 61 | -------------------------------------------------------------------------------- /sources/en/l/leafstudio.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from typing import List 4 | 5 | from lncrawl.core.crawler import Crawler 6 | from lncrawl.models import Chapter, SearchResult 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class LiteroticaCrawler(Crawler): 12 | base_url = ["https://leafstudio.site/"] 13 | 14 | def initialize(self) -> None: 15 | self.init_executor(ratelimit=2) 16 | 17 | def search_novel(self, query) -> List[SearchResult]: 18 | soup = self.get_soup( 19 | f"{self.home_url}novels?search={query}&type=&language=&status=&sort=" 20 | ) 21 | results = [] 22 | for item in soup.select("a.novel-item"): 23 | results.append( 24 | SearchResult( 25 | title=item.select_one("p.novel-item-title").text.strip(), 26 | url=item["href"], 27 | ) 28 | ) 29 | return results 30 | 31 | def read_novel_info(self) -> None: 32 | soup = self.get_soup(self.novel_url) 33 | self.novel_title = soup.select_one(".title").text 34 | self.novel_author = "LeafStudio" 35 | self.novel_synopsis = soup.select_one(".desc_div > p:nth-child(2)").text or None 36 | self.novel_tags = [item.text for item in soup.select("a.novel_genre")] or None 37 | self.novel_cover = soup.select_one("#novel_cover")["src"] or None 38 | for item in soup.select("a.free_chap").__reversed__(): 39 | self.chapters.append( 40 | dict(id=len(self.chapters) + 1, title=item.text, url=item["href"]) 41 | ) 42 | 43 | def download_chapter_body(self, chapter: Chapter) -> str: 44 | soup = self.get_soup(chapter["url"]) 45 | chapterText = "" 46 | for item in 
soup.select("p.chapter_content"): 47 | chapterText += self.cleaner.extract_contents(item) 48 | return chapterText.replace("Login to buy access to this Chapter.", "") 49 | -------------------------------------------------------------------------------- /sources/en/l/lightnovelmeta.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelmtl import NovelMTLTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class LightNovelMetaCrawler(NovelMTLTemplate): 9 | base_url = "https://www.lightnovelmeta.com" 10 | -------------------------------------------------------------------------------- /sources/en/l/lightnovelpub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lncrawl.templates.novelpub import NovelPubTemplate 4 | 5 | 6 | class LightnovelpubCrawler(NovelPubTemplate): 7 | base_url = [ 8 | "https://www.lightnovelpub.com/", 9 | ] 10 | -------------------------------------------------------------------------------- /sources/en/l/lightnovelworld.com.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lncrawl.templates.novelpub import NovelPubTemplate 4 | 5 | 6 | class LightnovelworldComCrawler(NovelPubTemplate): 7 | base_url = [ 8 | "https://www.lightnovelworld.com/", 9 | ] 10 | -------------------------------------------------------------------------------- /sources/en/l/lightnovelworld.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class LightNovelWorldCrawler(Crawler): 9 | base_url = "https://lightnovel.world/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | self.novel_author = soup.select_one("span.textC999").text.strip() 16 | logger.info("Novel author: %s", self.novel_author) 17 | 18 | possible_title = soup.select_one("li.text1") 19 | for span in possible_title.select("span"): 20 | span.extract() 21 | self.novel_title = possible_title.text.strip() 22 | logger.info("Novel title: %s", self.novel_title) 23 | 24 | possible_image = soup.select_one(".book_info_l img") 25 | if possible_image: 26 | self.novel_cover = self.absolute_url(possible_image["src"]) 27 | logger.info("Novel cover: %s", self.novel_cover) 28 | 29 | volumes = set([]) 30 | for a in soup.select("div#chapter_content ul li a"): 31 | chap_id = 1 + len(self.chapters) 32 | vol_id = 1 + len(self.chapters) // 100 33 | volumes.add(vol_id) 34 | self.chapters.append( 35 | { 36 | "id": chap_id, 37 | "volume": vol_id, 38 | "url": self.absolute_url(a["href"]), 39 | "title": a.text.strip() or ("Chapter %d" % chap_id), 40 | } 41 | ) 42 | 43 | self.volumes = [{"id": x} for x in volumes] 44 | 45 | def download_chapter_body(self, chapter): 46 | soup = self.get_soup(chapter["url"]) 47 | 48 | contents = soup.select_one("div#content_detail") 49 | for ads in contents.select("div"): 50 | ads.extract() 51 | 52 | return str(contents) 53 | -------------------------------------------------------------------------------- /sources/en/l/lightnovetrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | from typing import Generator, Union 5 | 6 | from bs4 
import BeautifulSoup, Tag 7 | 8 | from lncrawl.models import Chapter, Volume 9 | from lncrawl.templates.soup.general import GeneralSoupTemplate 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class LNTCrawler(GeneralSoupTemplate): 15 | base_url = ["https://lightnovelstranslations.com/"] 16 | 17 | has_manga = False 18 | has_mtl = False 19 | 20 | def get_novel_soup(self) -> BeautifulSoup: 21 | return self.get_soup(f"{self.novel_url}/?tab=table_contents") 22 | 23 | def parse_title(self, soup: BeautifulSoup) -> str: 24 | tag = soup.select_one(".novel_title") 25 | assert tag 26 | return tag.text.strip() 27 | 28 | def parse_cover(self, soup: BeautifulSoup) -> str: 29 | tag = soup.select_one(".novel-image img") 30 | assert tag 31 | if tag.has_attr("data-src"): 32 | return self.absolute_url(tag["data-src"]) 33 | if tag.has_attr("src"): 34 | return self.absolute_url(tag["src"]) 35 | 36 | def parse_authors(self, soup: BeautifulSoup) -> Generator[str, None, None]: 37 | for p in soup.select(".entry-content > p"): 38 | if "Author" in p.text: 39 | yield p.text.replace("Author:", "").strip() 40 | 41 | def parse_chapter_list( 42 | self, soup: BeautifulSoup 43 | ) -> Generator[Union[Chapter, Volume], None, None]: 44 | _id = 0 45 | for a in soup.select(".novel_list_chapter_content li.unlock a"): 46 | _id += 1 47 | yield Chapter( 48 | id=_id, url=self.absolute_url(a["href"]), title=a.text.strip() 49 | ) 50 | 51 | def select_chapter_body(self, soup: BeautifulSoup) -> Tag: 52 | return soup.select_one(".text_story") 53 | -------------------------------------------------------------------------------- /sources/en/l/ltnovel.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class LtNovel(NovelMTLTemplate): 8 | base_url = "https://www.ltnovel.com/" 9 | -------------------------------------------------------------------------------- /sources/en/l/luminarynovels.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bs4 import BeautifulSoup, Tag 4 | 5 | from lncrawl.templates.madara import MadaraTemplate 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class Luminarynovels(MadaraTemplate): 11 | has_mtl = False 12 | has_manga = False 13 | base_url = ["https://luminarynovels.com/"] 14 | 15 | def initialize(self) -> None: 16 | # contains self-promo and discord link 17 | self.cleaner.bad_css.add("div.chapter-warning.alert.alert-warning") 18 | 19 | def select_chapter_tags(self, soup: BeautifulSoup): 20 | try: 21 | clean_novel_url = self.novel_url.split("?")[0].strip("/") 22 | response = self.submit_form(f"{clean_novel_url}/ajax/chapters/", max_retries=0) 23 | soup = self.make_soup(response) 24 | chapters = soup.select(" div.page-content-listing.single-page > div > ul > li > a") 25 | if not chapters: 26 | raise Exception("No chapters on first URL") 27 | except Exception: 28 | nl_id = soup.select_one("#manga-chapters-holder[data-id]") 29 | assert isinstance(nl_id, Tag) 30 | response = self.submit_form( 31 | f"{self.home_url}wp-admin/admin-ajax.php", 32 | data={ 33 | "action": "manga_get_chapters", 34 | "manga": nl_id["data-id"], 35 | }, 36 | ) 37 | soup = self.make_soup(response) 38 | chapters = soup.select("ul.main .wp-manga-chapter a") 39 | 40 | yield from reversed(chapters) 41 | -------------------------------------------------------------------------------- 
/sources/en/l/lunarletters.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class LunarLetters(Crawler): 9 | base_url = "https://lunarletters.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | possible_title = soup.select_one('meta[property="og:title"]') 16 | assert possible_title, "No novel title" 17 | self.novel_title = possible_title["content"] 18 | logger.info("Novel title: %s", self.novel_title) 19 | 20 | possible_novel_cover = soup.select_one('meta[property="og:image"]') 21 | if possible_novel_cover: 22 | self.novel_cover = self.absolute_url(possible_novel_cover["content"]) 23 | logger.info("Novel cover: %s", self.novel_cover) 24 | 25 | self.novel_author = " ".join( 26 | [ 27 | a.text.strip() 28 | for a in soup.select('.author-content a[href*="series-author"]') 29 | ] 30 | ) 31 | logger.info("%s", self.novel_author) 32 | 33 | volumes = set() 34 | chapters = soup.select("ul.main li.wp-manga-chapter a") 35 | for a in reversed(chapters): 36 | chap_id = len(self.chapters) + 1 37 | vol_id = (chap_id - 1) // 100 + 1 38 | volumes.add(vol_id) 39 | self.chapters.append( 40 | { 41 | "id": chap_id, 42 | "volume": vol_id, 43 | "url": self.absolute_url(a["href"]), 44 | "title": a.text.strip() or ("Chapter %d" % chap_id), 45 | } 46 | ) 47 | 48 | self.volumes = [{"id": x} for x in volumes] 49 | 50 | def download_chapter_body(self, chapter): 51 | soup = self.get_soup(chapter["url"]) 52 | contents = soup.select(".reading-content p") 53 | return "".join([str(p) for p in contents]) 54 | -------------------------------------------------------------------------------- /sources/en/m/mangarockteam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | from lncrawl.templates.madara import MadaraTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class MangaRockTeamCrawler(MadaraTemplate): 10 | has_manga = True 11 | base_url = ["https://mangarockteam.com/"] 12 | -------------------------------------------------------------------------------- /sources/en/m/mltnovels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class MltNovelsCrawler(MangaStreamTemplate): 10 | base_url = ["https://mltnovels.com/"] 11 | has_mtl = True 12 | -------------------------------------------------------------------------------- /sources/en/m/myboxnovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.madara import MadaraTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class MyBoxNovelCrawler(MadaraTemplate): 10 | base_url = ["https://myboxnovel.com/"] 11 | 12 | def initialize(self) -> None: 13 | self.cleaner.bad_css.update( 14 | [ 15 | ".para-comment", 16 | ".j_open_para_comment", 17 | ".j_para_comment_count", 18 | ".para-comment-num", 19 | "#wp-manga-current-chap", 20 | ".cha-tit", 21 | ".subtitle ", 22 | ] 23 | ) 24 | # self.cleaner.bad_tag_text_pairs.update( 25 | # { 26 | # "div": r"Visit our comic site Webnovel\.live", 27 | 
# } 28 | # ) 29 | -------------------------------------------------------------------------------- /sources/en/n/newnovelorg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.madara import MadaraTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class NewNovelOrg(MadaraTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://newnovel.org/"] 12 | 13 | def initialize(self) -> None: 14 | self.cleaner.bad_css.update( 15 | [ 16 | ".para-comment", 17 | ".j_open_para_comment", 18 | ".j_para_comment_count", 19 | ".para-comment-num", 20 | "#wp-manga-current-chap", 21 | ".cha-tit", 22 | ".subtitle ", 23 | ] 24 | ) 25 | -------------------------------------------------------------------------------- /sources/en/n/noblemtl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class NobleMtlCrawler(MangaStreamTemplate): 10 | base_url = ["https://noblemtl.com/"] 11 | -------------------------------------------------------------------------------- /sources/en/n/novel-bin.net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelfull import NovelFullTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Novel_Bin_Net(NovelFullTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://novel-bin.net/"] 12 | 13 | def initialize(self) -> None: 14 | self.init_executor(ratelimit=1) 15 | -------------------------------------------------------------------------------- /sources/en/n/novel-bin.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelfull import NovelFullTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Novel_Bin(NovelFullTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://novel-bin.com/", "https://novelbin.me"] 12 | 13 | def initialize(self) -> None: 14 | self.init_executor(ratelimit=1) 15 | -------------------------------------------------------------------------------- /sources/en/n/novelbin.net.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelfull import NovelFullTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Novelbin_Net(NovelFullTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://novelbin.net/"] 12 | -------------------------------------------------------------------------------- /sources/en/n/novelbin.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class NovelbinCrawler(NovelFullTemplate): 10 | base_url = ["https://novelbin.com/"] 11 | 12 | def initialize(self) -> None: 13 | self.init_executor(ratelimit=0.99) 14 | -------------------------------------------------------------------------------- /sources/en/n/novelfull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from 
lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class NovelFullCrawler(NovelFullTemplate): 10 | base_url = [ 11 | "http://novelfull.com/", 12 | "https://novelfull.com/", 13 | "https://novelfull.net/", 14 | ] 15 | 16 | def initialize(self) -> None: 17 | self.cleaner.bad_css.update( 18 | [ 19 | 'div[align="left"]', 20 | 'img[src*="proxy?container=focus"]', 21 | ] 22 | ) 23 | -------------------------------------------------------------------------------- /sources/en/n/novelfullplus.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class NovelFullPlus(NovelFullTemplate): 10 | base_url = ["https://novelfullplus.com/"] 11 | 12 | def initialize(self) -> None: 13 | self.cleaner.bad_tags.update(["h1", "h2", "h3", "h4"]) 14 | -------------------------------------------------------------------------------- /sources/en/n/novelhulk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class NovelHulkCrawler(NovelFullTemplate): 10 | base_url = ["https://novelhulk.com/"] 11 | -------------------------------------------------------------------------------- /sources/en/n/novelmt.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class NovelMtCrawler(NovelMTLTemplate): 8 | has_mtl = True 9 | base_url = "https://www.novelmt.com/" 10 | -------------------------------------------------------------------------------- /sources/en/n/novelmtl.py: -------------------------------------------------------------------------------- 1 | from lncrawl.templates.novelmtl import NovelMTLTemplate 2 | 3 | 4 | class NovelMTLCrawler(NovelMTLTemplate): 5 | has_mtl = False 6 | has_manga = False 7 | base_url = "https://www.novelmtl.com/" 8 | -------------------------------------------------------------------------------- /sources/en/n/novelnext.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import BeautifulSoup, Tag 5 | 6 | from lncrawl.templates.novelfull import NovelFullTemplate 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class NovelNextCrawler(NovelFullTemplate): 12 | base_url = ["https://novelnext.com/", "https://novelnext.dramanovels.io/"] 13 | 14 | def initialize(self) -> None: 15 | self.init_executor(ratelimit=0.2) 16 | self.cleaner.bad_tag_text_pairs.update( 17 | { 18 | "h4": [ 19 | r"Chapter \d+", 20 | r"^\s*(Translator|Editor):.*$", 21 | ], 22 | "p": [ 23 | r"^\s*(Translator|Editor):.*$", 24 | r"Bookmark this website \( ", 25 | r"\) to update the latest novels\.", 26 | ], 27 | "strong": r"NovelNext\.com", 28 | } 29 | ) 30 | 31 | def select_chapter_body(self, soup: BeautifulSoup) -> Tag: 32 | return soup.select_one("#chr-content, #chapter-content") 33 | -------------------------------------------------------------------------------- /sources/en/n/novelnextz.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from 
lncrawl.templates.novelfull import NovelFullTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Novelnextz(NovelFullTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://novelnextz.com/"] 12 | 13 | def initialize(self) -> None: 14 | self.cleaner.bad_tag_text_pairs.update( 15 | { 16 | "h4": [ 17 | r"Chapter \d+", 18 | r"^\s*(Translator|Editor):.*$", 19 | ], 20 | "p": [ 21 | r"^\s*(Translator|Editor):.*$", 22 | r"Bookmark this website \( ", 23 | r"\) to update the latest novels\.", 24 | ], 25 | "strong": r"NovelNext\.com", 26 | } 27 | ) 28 | -------------------------------------------------------------------------------- /sources/en/n/novelpub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lncrawl.templates.novelpub import NovelPubTemplate 4 | 5 | 6 | class NovelpubCrawler(NovelPubTemplate): 7 | base_url = [ 8 | "https://www.novelpub.com/", 9 | ] 10 | -------------------------------------------------------------------------------- /sources/en/n/novelrare.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class NovelrareCrawler(Crawler): 10 | base_url = "https://novelrare.com/" 11 | 12 | def read_novel_info(self): 13 | soup = self.get_soup(self.novel_url) 14 | 15 | possible_title = soup.select_one("#manga-title h1") 16 | if possible_title: 17 | self.novel_title = possible_title.get_text(strip=True) 18 | 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | possible_synopsis = soup.select_one("div[aria-labelledby='manga-info'] p") 22 | if possible_synopsis: 23 | self.novel_synopsis = possible_synopsis.get_text() 24 | 25 | img_src = soup.select_one("div.summary_image img") 26 | if img_src: 27 | self.novel_cover = self.absolute_url(img_src["src"]) 28 | 29 | chapters_table = soup.select_one("div.listing-chapters_wrap") 30 | for a in reversed( 31 | chapters_table.find_all("a", class_=lambda x: x != "c-new-tag") 32 | ): 33 | chap_id = 1 + (len(self.chapters)) 34 | 35 | self.chapters.append( 36 | { 37 | "id": chap_id, 38 | "title": a.text.strip(), 39 | "url": self.absolute_url(a['href']) 40 | } 41 | ) 42 | 43 | def download_chapter_body(self, chapter): 44 | soup = self.get_soup(chapter["url"]) 45 | content = soup.select_one("div.text-left") 46 | return self.cleaner.extract_contents(content) 47 | -------------------------------------------------------------------------------- /sources/en/n/novelzec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class NovelZec(Crawler): 10 | base_url = "https://novelzec.com/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | possible_title = soup.select_one(".entry-header h1") 17 | assert possible_title, "No novel title" 18 | self.novel_title = possible_title.text.strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | authors = soup.select('.entry-header span a[href*="/author/"]') 22 | self.novel_author = ", ".join([a.text.strip() for a in authors]) 23 | logger.info("Novel author: %s", self.novel_author) 24 | 25 | possible_image = soup.select_one(".entry-header 
img") 26 | if possible_image: 27 | self.novel_cover = self.absolute_url(possible_image["src"]) 28 | logger.info("Novel cover: %s", self.novel_cover) 29 | 30 | for a in reversed(soup.select("#chap-list li a")): 31 | chap_id = len(self.chapters) + 1 32 | vol_id = len(self.chapters) // 100 + 1 33 | if len(self.chapters) % 100 == 0: 34 | self.volumes.append({"id": vol_id}) 35 | self.chapters.append( 36 | { 37 | "id": chap_id, 38 | "volume": vol_id, 39 | "title": a.text.strip(), 40 | "url": self.absolute_url(a["href"]), 41 | } 42 | ) 43 | 44 | def download_chapter_body(self, chapter): 45 | soup = self.get_soup(chapter["url"]) 46 | contents = soup.select_one(".content-story") 47 | return self.cleaner.extract_contents(contents) 48 | -------------------------------------------------------------------------------- /sources/en/n/novlove.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelfull import NovelFullTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Novlove(NovelFullTemplate): 9 | has_mtl = False 10 | has_manga = False 11 | base_url = ["https://novlove.com/"] 12 | -------------------------------------------------------------------------------- /sources/en/o/oppatrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class OppaTranslations(Crawler): 10 | base_url = "https://www.oppatranslations.com/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() 17 | logger.info("Novel title: %s", self.novel_title) 18 | 19 | possible_image = soup.select_one("div.entry-content p img") 20 | if possible_image: 21 | self.novel_cover = self.absolute_url(possible_image["src"]) 22 | logger.info("Novel cover: %s", self.novel_cover) 23 | 24 | self.novel_author = soup.select("div.entry-content p")[8].text.strip() 25 | logger.info("Novel author: %s", self.novel_author) 26 | 27 | # Extract volume-wise chapter entries 28 | # Stops external links being selected as chapters 29 | chapters = soup.select('div.entry-content p [href*="oppatranslations.com/"]') 30 | 31 | for a in chapters: 32 | chap_id = len(self.chapters) + 1 33 | vol_id = 1 + len(self.chapters) // 100 34 | if len(self.volumes) < vol_id: 35 | self.volumes.append({"id": vol_id}) 36 | self.chapters.append( 37 | { 38 | "id": chap_id, 39 | "volume": vol_id, 40 | "url": self.absolute_url(a["href"]), 41 | "title": a.text.strip() or ("Chapter %d" % chap_id), 42 | } 43 | ) 44 | 45 | def download_chapter_body(self, chapter): 46 | soup = self.get_soup(chapter["url"]) 47 | 48 | body = [] 49 | contents = soup.select("div.entry-content p") 50 | contents = contents[:-1] 51 | for p in contents: 52 | para = " ".join(self.cleaner.extract_contents(p)) 53 | if len(para): 54 | body.append(para) 55 | 56 | return "

<p>%s</p>" % "</p><p>
".join(body) 57 | -------------------------------------------------------------------------------- /sources/en/o/ornovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class OrNovel(Crawler): 9 | base_url = "https://www.ornovel.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | self.novel_title = " ".join( 16 | [str(x) for x in soup.select_one(".title h1").contents if not x.name] 17 | ).strip() 18 | logger.info("Novel title: %s", self.novel_title) 19 | 20 | probable_img = soup.select_one(".intro-left img.book-image") 21 | if probable_img: 22 | self.novel_cover = self.absolute_url(probable_img["src"]) 23 | logger.info("Novel cover: %s", self.novel_cover) 24 | 25 | self.novel_author = " ".join( 26 | [a.text.strip() for a in soup.select(".author-container")] 27 | ) 28 | logger.info("%s", self.novel_author) 29 | 30 | volumes = set() 31 | chapters = soup.select("ul.chapters-all li.chapters-item a") 32 | for a in chapters: 33 | chap_id = len(self.chapters) + 1 34 | vol_id = (chap_id - 1) // 100 + 1 35 | volumes.add(vol_id) 36 | self.chapters.append( 37 | { 38 | "id": chap_id, 39 | "volume": vol_id, 40 | "url": self.absolute_url(a["href"]), 41 | "title": a.text.strip() or ("Chapter %d" % chap_id), 42 | } 43 | ) 44 | 45 | self.volumes = [{"id": x} for x in volumes] 46 | 47 | def download_chapter_body(self, chapter): 48 | soup = self.get_soup(chapter["url"]) 49 | 50 | contents = soup.select_one("div.chapter-detail") 51 | for bad in contents.select( 52 | "h2, ins, .chapter-header .code-block, script, .adsbygoogle" 53 | ): 54 | bad.extract() 55 | 56 | return str(contents) 57 | -------------------------------------------------------------------------------- /sources/en/p/pandamanga.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class PandaMangaxyzCrawler(MangaStreamTemplate): 10 | base_url = ["https://www.pandamanga.xyz/"] 11 | -------------------------------------------------------------------------------- /sources/en/p/pandanovelco.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from typing import Generator 4 | from bs4 import BeautifulSoup, Tag 5 | from lncrawl.templates.novelpub import NovelPubTemplate 6 | 7 | 8 | class PandaNovelCo(NovelPubTemplate): 9 | base_url = [ 10 | "https://pandanovel.co/", 11 | ] 12 | 13 | # We override because we do not have a request token like other novel pub 14 | # (without that wrong error is raised and browser search isn't triggered) 15 | def select_search_items(self, query: str) -> Generator[Tag, None, None]: 16 | self.submit_form( 17 | f"{self.home_url}lnsearchlive", 18 | data={"inputContent": query}, 19 | headers={ 20 | "referer": f"{self.home_url}search", 21 | }, 22 | ) 23 | 24 | # override this because somehow novel_url is always missing trailing / 25 | def select_chapter_tags_in_browser(self): 26 | next_link = f"{self.novel_url}/chapters" 27 | while next_link: 28 | self.browser.visit(next_link) 29 | self.browser.wait("ul.chapter-list li") 30 | chapter_list = self.browser.find("ul.chapter-list") 31 | yield 
from chapter_list.as_tag().select("li a") 32 | try: 33 | next_link = self.browser.find('.PagedList-skipToNext a[rel="next"]') 34 | next_link = next_link.get_attribute("href") 35 | except Exception: 36 | next_link = False 37 | 38 | # .chapter-content -> #content 39 | def select_chapter_body(self, soup: BeautifulSoup) -> Tag: 40 | self.browser.wait("#content") 41 | return soup.select_one("#content") 42 | -------------------------------------------------------------------------------- /sources/en/p/pandanovelorg.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelfull import NovelFullTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class Pandanovelorg(NovelFullTemplate): 8 | has_mtl = False 9 | has_manga = False 10 | base_url = ["https://pandanovel.org/"] 11 | -------------------------------------------------------------------------------- /sources/en/r/readmtl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.madara import MadaraTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Readmtl(MadaraTemplate): 9 | has_mtl = True 10 | has_manga = False 11 | base_url = ["https://readmtl.com/"] 12 | -------------------------------------------------------------------------------- /sources/en/r/readnovelfull.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.novelfull import NovelFullTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ReadNovelFullCrawler(NovelFullTemplate): 10 | base_url = "https://readnovelfull.com/" 11 | -------------------------------------------------------------------------------- /sources/en/r/readwn.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class ReadWNCrawler(NovelMTLTemplate): 8 | has_mtl = True 9 | base_url = [ 10 | "https://www.readwn.com/", 11 | "https://www.wuxiap.com/" 12 | ] 13 | -------------------------------------------------------------------------------- /sources/en/s/sleepytrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class SleepyTranslations(Crawler): 9 | base_url = "https://sleepytranslations.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | possible_title = soup.select_one(".post-title h1") 16 | for span in possible_title.select("span"): 17 | span.extract() 18 | self.novel_title = possible_title.text.strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | possible_image = soup.select_one(".summary_image a img") 22 | if possible_image: 23 | self.novel_cover = self.absolute_url(possible_image["src"]) 24 | logger.info("Novel cover: %s", self.novel_cover) 25 | 26 | self.novel_author = " ".join( 27 | [a.text.strip() for a in soup.select('.author-content a[href*="author"]')] 28 | ) 29 | logger.info("%s", self.novel_author) 30 | 31 | self.novel_id = soup.select_one("#manga-chapters-holder")["data-id"] 32 | logger.info("Novel id: %s", self.novel_id) 33 | 34 | 
response = self.submit_form(self.novel_url.strip("/") + "/ajax/chapters") 35 | soup = self.make_soup(response) 36 | for a in reversed(soup.select(".wp-manga-chapter a")): 37 | chap_id = len(self.chapters) + 1 38 | vol_id = 1 + len(self.chapters) // 100 39 | if chap_id % 100 == 1: 40 | self.volumes.append({"id": vol_id}) 41 | self.chapters.append( 42 | { 43 | "id": chap_id, 44 | "volume": vol_id, 45 | "title": a.text.strip(), 46 | "url": self.absolute_url(a["href"]), 47 | } 48 | ) 49 | 50 | def download_chapter_body(self, chapter): 51 | soup = self.get_soup(chapter["url"]) 52 | contents = soup.select(".reading-content p") 53 | return "".join([str(p) for p in contents]) 54 | -------------------------------------------------------------------------------- /sources/en/s/smnovels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class SMNovelsCrawler(Crawler): 9 | base_url = "https://smnovels.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | # Site has no author name or novel covers. 16 | possible_title = soup.select_one("h1.entry-title") 17 | assert possible_title, "No novel title" 18 | self.novel_title = possible_title.text.strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | for a in soup.select(".all-chapters-list a"): 22 | chap_id = len(self.chapters) + 1 23 | vol_id = len(self.chapters) // 100 + 1 24 | if len(self.chapters) % 100 == 0: 25 | self.volumes.append({"id": vol_id}) 26 | self.chapters.append( 27 | { 28 | "id": chap_id, 29 | "volume": vol_id, 30 | "title": a.text.strip(), 31 | "url": self.absolute_url(a["href"]), 32 | } 33 | ) 34 | 35 | def download_chapter_body(self, chapter): 36 | soup = self.get_soup(chapter["url"]) 37 | 38 | contents = soup.select_one(".entry-content") 39 | for bad in contents.select("br"): 40 | bad.extract() 41 | return self.cleaner.extract_contents(contents) 42 | -------------------------------------------------------------------------------- /sources/en/s/sonicmtl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from bs4 import BeautifulSoup, Tag 3 | from lncrawl.templates.madara import MadaraTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class SonicMTLCrawler(MadaraTemplate): 9 | has_mtl = True 10 | base_url = [ 11 | "https://sonicmtl.com", 12 | "https://www.sonicmtl.com/", 13 | ] 14 | 15 | def initialize(self): 16 | super().initialize() 17 | self.cleaner.bad_css.update( 18 | { 19 | ".ad", 20 | ".c-ads", 21 | ".custom-code", 22 | ".body-top-ads", 23 | ".before-content-ad", 24 | ".autors-widget", 25 | } 26 | ) 27 | 28 | def select_chapter_body(self, soup: BeautifulSoup) -> Tag: 29 | return soup.select_one(".reading-content .text-left") 30 | -------------------------------------------------------------------------------- /sources/en/s/steambun.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class SteambunCrawler(Crawler): 11 | base_url = "https://steambunlightnovel.com/" 12 | 13 | def read_novel_info(self): 14 | logger.debug("Visiting %s", self.novel_url) 15 | soup = self.get_soup(self.novel_url) 
16 | 17 | possible_title = soup.select_one("h1.entry-title") 18 | assert possible_title, "No novel title" 19 | self.novel_title = possible_title.text 20 | logger.info("Novel title: %s", self.novel_title) 21 | 22 | self.novel_author = "by SteamBun Translations" 23 | logger.info("Novel author: %s", self.novel_author) 24 | 25 | # Site does not list covers. 26 | 27 | volumes = set([]) 28 | for a in reversed( 29 | soup.select('div.w4pl-inner li a[href*="steambunlightnovel.com"]') 30 | ): 31 | title = a.text.strip() 32 | chapter_id = len(self.chapters) + 1 33 | volume_id = 1 + (chapter_id - 1) // 100 34 | volumes.add(volume_id) 35 | self.chapters.append( 36 | { 37 | "id": chapter_id, 38 | "volume": volume_id, 39 | "title": title, 40 | "url": a["href"], 41 | } 42 | ) 43 | 44 | self.chapters.sort(key=lambda x: x["id"]) 45 | self.volumes = [{"id": x, "title": ""} for x in volumes] 46 | 47 | def download_chapter_body(self, chapter): 48 | soup = self.get_soup(chapter["url"]) 49 | content = soup.select_one("div.entry-content") 50 | assert content, "No chapter content" 51 | self.cleaner.clean_contents(content) 52 | body = content.select("p") 53 | body = [str(p) for p in body if self.should_take(p)] 54 | return "
<p>" + "</p><p>".join(body) + "</p>
" 55 | 56 | def should_take(self, p): 57 | txt = p.text.strip().lower() 58 | return txt and txt != "advertisement" 59 | -------------------------------------------------------------------------------- /sources/en/s/systemtranslation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class SystemTranslationCrawler(MangaStreamTemplate): 10 | base_url = ["https://systemtranslation.com/"] 11 | -------------------------------------------------------------------------------- /sources/en/t/tamagotl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class TamagoTlCrawler(MangaStreamTemplate): 10 | base_url = ["https://tamagotl.com/"] 11 | has_mtl = True 12 | -------------------------------------------------------------------------------- /sources/en/t/teanovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import logging 4 | 5 | from bs4 import Tag 6 | 7 | from lncrawl.core.crawler import Crawler 8 | from lncrawl.core.exeptions import LNException 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class TeaNovelCrawler(Crawler): 14 | base_url = "https://www.teanovel.com" 15 | 16 | def initialize(self): 17 | self.init_executor( 18 | workers=4 19 | ) 20 | 21 | def read_novel_info(self): 22 | soup = self.get_soup(self.novel_url) 23 | 24 | script_tag = soup.select_one("script#__NEXT_DATA__") 25 | if not isinstance(script_tag, Tag): 26 | raise LNException("No script data found") 27 | 28 | next_data = json.loads(script_tag.get_text()) 29 | 30 | novel_data = next_data["props"]["pageProps"]["novel"] 31 | 32 | self.novel_title = novel_data["name"] 33 | self.novel_author = novel_data["author"] 34 | 35 | img_tag = soup.select_one("main img[src*='_next/']") 36 | if isinstance(img_tag, Tag): 37 | self.novel_cover = self.absolute_url(img_tag["src"]) 38 | 39 | chapters = self.get_soup(self.novel_url + "/chapter-list").select("a.border-b") 40 | for chapter in chapters: 41 | chapter_id = len(self.chapters) + 1 42 | self.chapters.append( 43 | { 44 | "id": chapter_id, 45 | "title": chapter.select_one("p").get_text(strip=True), 46 | "url": self.absolute_url(chapter["href"]), 47 | } 48 | ) 49 | 50 | def download_chapter_body(self, chapter): 51 | chapter = self.get_soup(chapter["url"]) 52 | return self.cleaner.extract_contents(chapter.select_one("div.prose")) 53 | -------------------------------------------------------------------------------- /sources/en/t/totallytranslations.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from requests.sessions import Session 5 | 6 | from lncrawl.core.crawler import Crawler 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class TotallyTranslations(Crawler): 12 | base_url = "https://totallytranslations.com/" 13 | 14 | def initialize(self): 15 | self.scraper = Session() 16 | 17 | def read_novel_info(self): 18 | logger.debug("Visiting %s", self.novel_url) 19 | soup = self.get_soup(self.novel_url) 20 | 21 | possible_title = soup.select_one(".entry-title") 22 | assert possible_title, "No 
novel title" 23 | self.novel_title = possible_title.text 24 | logger.info("Novel title: %s", self.novel_title) 25 | 26 | possible_image = soup.select_one(".novel-image img") 27 | if possible_image: 28 | self.novel_cover = self.absolute_url(possible_image["src"]) 29 | logger.info("Novel cover: %s", self.novel_cover) 30 | 31 | for p in soup.select(".chapters-list .chapters-title"): 32 | vol_title = p.text.strip() 33 | vol_id = len(self.volumes) + 1 34 | self.volumes.append( 35 | { 36 | "id": vol_id, 37 | "title": vol_title, 38 | } 39 | ) 40 | 41 | ul = p.find_next("ul") 42 | for a in ul.select("a"): 43 | chap_id = len(self.chapters) + 1 44 | self.chapters.append( 45 | { 46 | "id": chap_id, 47 | "volume": vol_id, 48 | "title": a.text.strip(), 49 | "url": self.absolute_url(a["href"]), 50 | } 51 | ) 52 | 53 | def download_chapter_body(self, chapter): 54 | soup = self.get_soup(chapter["url"]) 55 | paras = soup.select(".post-content p") 56 | return "\n".join([str(p) for p in paras if p.text.strip()]) 57 | -------------------------------------------------------------------------------- /sources/en/v/veratales.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class VeraTales(Crawler): 10 | base_url = "https://veratales.com/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.find("h1").text.strip() 17 | logger.info("Novel title: %s", self.novel_title) 18 | 19 | # self.novel_author= soup.find("div",{"class":"novel-author-info"}).find("h4").text.strip() 20 | self.novel_author = "" 21 | logger.info("%s", self.novel_author) 22 | 23 | possible_image = soup.select_one("div.card-header a img") 24 | if possible_image: 25 | self.novel_cover = self.absolute_url(possible_image["src"]) 26 | logger.info("Novel cover: %s", self.novel_cover) 27 | 28 | chapters = soup.select("table td a") 29 | for a in reversed(chapters): 30 | chap_id = len(self.chapters) + 1 31 | vol_id = 1 + len(self.chapters) // 100 32 | if len(self.volumes) < vol_id: 33 | self.volumes.append({"id": vol_id}) 34 | self.chapters.append( 35 | { 36 | "id": chap_id, 37 | "volume": vol_id, 38 | "url": self.absolute_url(a["href"]), 39 | "title": a.text.strip() or ("Chapter %d" % chap_id), 40 | } 41 | ) 42 | 43 | def download_chapter_body(self, chapter): 44 | soup = self.get_soup(chapter["url"]) 45 | contents = soup.select_one("div.reader-content") 46 | return self.cleaner.extract_contents(contents) 47 | -------------------------------------------------------------------------------- /sources/en/w/webnovelonlinecom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import json 4 | import logging 5 | from lncrawl.core.crawler import Crawler 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class WebnovelOnlineDotComCrawler(Crawler): 11 | base_url = "https://webnovelonline.com/" 12 | 13 | def read_novel_info(self): 14 | url = self.novel_url 15 | soup = self.get_soup(url) 16 | 17 | possible_title = soup.select_one(".novel-info .novel-desc h1") 18 | assert possible_title, "No novel title" 19 | self.novel_title = possible_title.text 20 | logger.info("Novel title: %s", self.novel_title) 21 | 22 | possible_novel_cover = soup.select_one('meta[property="og:image"]') 23 | if 
possible_novel_cover: 24 | self.novel_cover = self.absolute_url(possible_novel_cover["content"]) 25 | logger.info("Novel cover: %s", self.novel_title) 26 | 27 | volumes = set([]) 28 | for a in reversed(soup.select(".chapter-list .item a")): 29 | chap_id = len(self.chapters) + 1 30 | vol_id = 1 + len(self.chapters) // 100 31 | volumes.add(vol_id) 32 | self.chapters.append( 33 | { 34 | "id": chap_id, 35 | "volume": vol_id, 36 | "title": a.text.strip(), 37 | "url": self.absolute_url(a["href"]), 38 | } 39 | ) 40 | 41 | self.volumes = [{"id": x, "title": ""} for x in volumes] 42 | 43 | def download_chapter_body(self, chapter): 44 | soup = self.get_soup(chapter["url"]) 45 | 46 | for script in soup.select("script"): 47 | text = script.string 48 | if not text or not text.startswith("window._INITIAL_DATA_"): 49 | continue 50 | content = re.findall(r',"chapter":(".+")},', text)[0] 51 | content = json.loads(content).strip() 52 | return "
<p>" + "</p><p>".join(content.split("\n\n")) + "</p>
" 53 | 54 | return "" 55 | -------------------------------------------------------------------------------- /sources/en/w/webnovelpub.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lncrawl.templates.novelpub import NovelPubTemplate 4 | 5 | 6 | class WebnovelpubCrawler(NovelPubTemplate): 7 | base_url = [ 8 | "https://www.webnovelpub.com/", 9 | "https://www.webnovelpub.pro/", 10 | ] 11 | -------------------------------------------------------------------------------- /sources/en/w/whatsawhizzerwebnovels.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Tag 5 | 6 | from lncrawl.core.crawler import Crawler 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class WhatsAWhizzerCrawler(Crawler): 12 | base_url = ["https://whatsawhizzerwebnovels.com/"] 13 | 14 | def read_novel_info(self): 15 | logger.debug("Visiting %s", self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | self.novel_title = soup.select_one(".page-header-title").text.strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | cover_tag = soup.select_one('meta[property="og:image"]') 22 | 23 | if isinstance(cover_tag, Tag): 24 | self.novel_cover = cover_tag["content"] 25 | 26 | logger.info("Novel cover: %s", self.novel_cover) 27 | 28 | for a in soup.select(".entry > p > a"): 29 | self.chapters.append( 30 | { 31 | "id": len(self.chapters) + 1, 32 | "url": self.absolute_url(a["href"]), 33 | "title": a.text.strip(), 34 | } 35 | ) 36 | 37 | def download_chapter_body(self, chapter): 38 | soup = self.get_soup(chapter["url"]) 39 | contents = soup.select_one("article > div") 40 | 41 | nav_tags = contents.find_all("a", string="Table of Contents") 42 | for nav in nav_tags: 43 | nav.parent.extract() 44 | 45 | self.cleaner.clean_contents(contents) 46 | 47 | return str(contents) 48 | -------------------------------------------------------------------------------- /sources/en/w/wuxiabox.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelmtl import NovelMTLTemplate 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class Wuxiabox(NovelMTLTemplate): 9 | has_mtl = True 10 | has_manga = False 11 | base_url = ["https://www.wuxiabox.com/"] 12 | -------------------------------------------------------------------------------- /sources/en/w/wuxiahub.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaHubCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiahub.com" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiamtl.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaMTLCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiamtl.com" 9 | has_mtl = True 10 | -------------------------------------------------------------------------------- /sources/en/w/wuxianovelhub.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from lncrawl.templates.novelmtl import NovelMTLTemplate 
4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class WuxiaNHCrawler(NovelMTLTemplate): 9 | base_url = "https://www.wuxianovelhub.com/" 10 | -------------------------------------------------------------------------------- /sources/en/w/wuxiapub.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaPubCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiapub.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiar.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaRCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiar.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiaspot.py: -------------------------------------------------------------------------------- 1 | from lncrawl.templates.novelmtl import NovelMTLTemplate 2 | 3 | 4 | class WuxiaSpotCrawler(NovelMTLTemplate): 5 | has_mtl = False 6 | has_manga = False 7 | base_url = "https://www.wuxiaspot.com/" 8 | -------------------------------------------------------------------------------- /sources/en/w/wuxiau.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaUCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiau.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiav.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaVCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiav.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiax.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaXCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiax.com/" 9 | -------------------------------------------------------------------------------- /sources/en/w/wuxiaz.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from lncrawl.templates.novelmtl import NovelMTLTemplate 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | 7 | class WuxiaZCrawler(NovelMTLTemplate): 8 | base_url = "https://www.wuxiaz.com/" 9 | -------------------------------------------------------------------------------- /sources/en/x/xiainovel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Comment 5 | 6 | from lncrawl.core.crawler import Crawler 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class YukiNovelCrawler(Crawler): 12 | base_url = "https://www.xiainovel.com/" 13 | 14 | def read_novel_info(self): 15 | logger.debug("Visiting %s", self.novel_url) 16 | soup = 
self.get_soup(self.novel_url) 17 | 18 | possible_title = soup.select_one("div.page-header h1") 19 | assert possible_title, "No novel title" 20 | self.novel_title = possible_title.text 21 | logger.info("Novel title: %s", self.novel_title) 22 | 23 | self.novel_author = "Translated by XiaiNovel" 24 | logger.info("Novel author: %s", self.novel_author) 25 | 26 | # NOTE: Can't fetch cover url, as it's listed a base64 code. 27 | # self.novel_cover = self.absolute_url( 28 | # soup.select_one('div.col-md-6 img') 29 | # logger.info('Novel cover: %s', self.novel_cover) 30 | 31 | # Extract volume-wise chapter entries 32 | chapters = soup.select("ul.list-group li a") 33 | 34 | chapters.reverse() 35 | 36 | for a in chapters: 37 | chap_id = len(self.chapters) + 1 38 | vol_id = 1 + len(self.chapters) // 100 39 | if len(self.volumes) < vol_id: 40 | self.volumes.append({"id": vol_id}) 41 | self.chapters.append( 42 | { 43 | "id": chap_id, 44 | "volume": vol_id, 45 | "url": self.absolute_url(a["href"]), 46 | "title": a.text.strip() or ("Chapter %d" % chap_id), 47 | } 48 | ) 49 | 50 | def download_chapter_body(self, chapter): 51 | soup = self.get_soup(chapter["url"]) 52 | 53 | contents = soup.select_one("section#StoryContent") 54 | 55 | for d in contents.findAll("div"): 56 | d.extract() 57 | 58 | for comment in contents.find_all(string=lambda text: isinstance(text, Comment)): 59 | comment.extract() 60 | 61 | return str(contents) 62 | -------------------------------------------------------------------------------- /sources/fr/lightnovelfr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class LightnovelFrCrawler(MangaStreamTemplate): 10 | base_url = ["https://lightnovelfr.com/"] 11 | -------------------------------------------------------------------------------- /sources/fr/xiaowaz.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Tag 5 | 6 | from lncrawl.core.crawler import Crawler 7 | from lncrawl.core.exeptions import LNException 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class XiaowazCrawler(Crawler): 13 | base_url = ["https://xiaowaz.fr/"] 14 | 15 | def initialize(self) -> None: 16 | self.cleaner.bad_css.update( 17 | [".abh_box_business", ".footnote_container_prepare"] 18 | ) 19 | 20 | def read_novel_info(self): 21 | soup = self.get_soup(self.novel_url) 22 | 23 | title_tag = soup.select_one("h1.card_title") 24 | if not isinstance(title_tag, Tag): 25 | raise LNException("No title found") 26 | 27 | self.novel_title = title_tag.text.strip() 28 | 29 | image_tag = soup.select_one(".entry-content img") 30 | if isinstance(image_tag, Tag): 31 | self.novel_cover = self.absolute_url(image_tag["src"]) 32 | 33 | logger.info("Novel cover: %s", self.novel_cover) 34 | 35 | for a in soup.select(".entry-content a[href*='/articles/']"): 36 | self.chapters.append( 37 | { 38 | "id": len(self.chapters) + 1, 39 | "title": a.text.strip(), 40 | "url": self.absolute_url(a["href"]), 41 | } 42 | ) 43 | 44 | def download_chapter_body(self, chapter): 45 | soup = self.get_soup(chapter["url"]) 46 | contents = soup.select_one(".entry-content") 47 | self.cleaner.clean_contents(contents) 48 | 49 | return str(contents) 50 | -------------------------------------------------------------------------------- 
/sources/id/darktrans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class DarkTranslation(Crawler): 10 | base_url = "https://darktranslation.com/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url) 15 | 16 | self.novel_title = soup.find("h1", {"class": "entry-title"}).text.strip() 17 | logger.info("Novel title: %s", self.novel_title) 18 | 19 | # FIXME: Problem getting cover image, tried multiple ways and keep getting error. 20 | # self.novel_cover = self.absolute_url( 21 | # soup.select_one('div.entry-content p img') 22 | # logger.info('Novel cover: %s', self.novel_cover) 23 | 24 | self.novel_author = "by Dark Translation" 25 | logger.info("Novel author: %s", self.novel_author) 26 | 27 | # Extract volume-wise chapter entries 28 | # Stops external links being selected as chapters 29 | chapters = soup.select("div.entry-content p a") 30 | 31 | for a in chapters: 32 | chap_id = len(self.chapters) + 1 33 | vol_id = 1 + len(self.chapters) // 100 34 | if len(self.volumes) < vol_id: 35 | self.volumes.append({"id": vol_id}) 36 | self.chapters.append( 37 | { 38 | "id": chap_id, 39 | "volume": vol_id, 40 | "url": self.absolute_url(a["href"]), 41 | "title": a.text.strip() or ("Chapter %d" % chap_id), 42 | } 43 | ) 44 | 45 | def download_chapter_body(self, chapter): 46 | soup = self.get_soup(chapter["url"]) 47 | contents = soup.select("div.entry-content") 48 | return self.cleaner.extract_contents(contents) 49 | -------------------------------------------------------------------------------- /sources/id/novelringan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | class NovelRinganCrawler(Crawler): 9 | base_url = "https://novelringan.com/" 10 | 11 | def read_novel_info(self): 12 | logger.debug("Visiting %s", self.novel_url) 13 | soup = self.get_soup(self.novel_url) 14 | 15 | possible_title = soup.select_one("h1.entry-title") 16 | assert possible_title, "No novel title" 17 | self.novel_title = possible_title.text 18 | logger.info("Novel title: %s", self.novel_title) 19 | 20 | possible_image = soup.select_one("div.imgprop img") 21 | if possible_image: 22 | self.novel_cover = self.absolute_url(possible_image["src"]) 23 | logger.info("Novel cover: %s", self.novel_cover) 24 | 25 | self.novel_author = " ".join( 26 | [a.text.strip() for a in soup.select('.entry-author a[href*="/author/"]')] 27 | ) 28 | logger.info("%s", self.novel_author) 29 | 30 | for a in reversed(soup.select(".bxcl ul li a")): 31 | chap_id = len(self.chapters) + 1 32 | vol_id = 1 + len(self.chapters) // 100 33 | if len(self.volumes) < vol_id: 34 | self.volumes.append({"id": vol_id}) 35 | self.chapters.append( 36 | { 37 | "id": chap_id, 38 | "volume": vol_id, 39 | "url": self.absolute_url(a["href"]), 40 | "title": a.text.strip() or ("Chapter %d" % chap_id), 41 | } 42 | ) 43 | 44 | def download_chapter_body(self, chapter): 45 | soup = self.get_soup(chapter["url"]) 46 | contents = soup.select(".entry-content p") 47 | 48 | body = [str(p) for p in contents if p.text.strip()] 49 | 50 | return "
<p>" + "</p><p>".join(body) + "</p>
" 51 | -------------------------------------------------------------------------------- /sources/id/zhiend.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class ZhiEnd(Crawler): 10 | base_url = ["http://zhi-end.blogspot.com/", "http://zhi-end.blogspot.co.id/"] 11 | 12 | def initialize(self): 13 | self.home_url = "http://zhi-end.blogspot.com/" 14 | 15 | def read_novel_info(self): 16 | logger.debug("Visiting %s", self.novel_url) 17 | soup = self.get_soup(self.novel_url) 18 | 19 | possible_title = soup.select_one("h1.entry-title") 20 | assert possible_title, "No novel title" 21 | self.novel_title = possible_title.text.strip() 22 | logger.info("Novel title: %s", self.novel_title) 23 | 24 | possible_image = soup.select_one("div.entry-content div a img") 25 | if possible_image: 26 | self.novel_cover = self.absolute_url(possible_image["src"]) 27 | logger.info("Novel cover: %s", self.novel_cover) 28 | 29 | self.novel_author = "Translated by Zhi End" 30 | logger.info("Novel author: %s", self.novel_author) 31 | 32 | # Extract volume-wise chapter entries 33 | chapters = soup.select('div.entry-content div [href*="zhi-end.blogspot"]') 34 | 35 | for a in chapters: 36 | chap_id = len(self.chapters) + 1 37 | vol_id = 1 + len(self.chapters) // 100 38 | if len(self.volumes) < vol_id: 39 | self.volumes.append({"id": vol_id}) 40 | self.chapters.append( 41 | { 42 | "id": chap_id, 43 | "volume": vol_id, 44 | "url": self.absolute_url(a["href"]), 45 | "title": a.text.strip() or ("Chapter %d" % chap_id), 46 | } 47 | ) 48 | 49 | def download_chapter_body(self, chapter): 50 | soup = self.get_soup(chapter["url"]) 51 | 52 | body_parts = soup.select_one("div.post-body") 53 | 54 | return self.cleaner.extract_contents(body_parts) 55 | -------------------------------------------------------------------------------- /sources/multi/quotev.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from bs4 import Tag 5 | 6 | from lncrawl.core.crawler import Crawler 7 | from lncrawl.core.exeptions import LNException 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class QuotevCrawler(Crawler): 13 | base_url = ["https://www.quotev.com/"] 14 | 15 | def initialize(self) -> None: 16 | self.cleaner.bad_css.update([".nosel"]) 17 | 18 | def read_novel_info(self): 19 | soup = self.get_soup(self.novel_url) 20 | 21 | title_tag = soup.select_one("#quizHeaderTitle h1") 22 | if not isinstance(title_tag, Tag): 23 | raise LNException("No title found") 24 | 25 | self.novel_title = title_tag.text.strip() 26 | 27 | image_tag = soup.select_one("meta[property='og:image']") 28 | if isinstance(image_tag, Tag): 29 | self.novel_cover = self.absolute_url(image_tag["content"]) 30 | 31 | logger.info("Novel cover: %s", self.novel_cover) 32 | 33 | for a in soup.select("#rselectList a"): 34 | self.chapters.append( 35 | { 36 | "id": len(self.chapters) + 1, 37 | "title": a.text.strip(), 38 | "url": self.absolute_url(a["href"]), 39 | } 40 | ) 41 | 42 | def download_chapter_body(self, chapter): 43 | soup = self.get_soup(chapter["url"]) 44 | contents = soup.select_one("#rescontent") 45 | self.cleaner.clean_contents(contents) 46 | 47 | return str(contents) 48 | -------------------------------------------------------------------------------- /sources/pt/centralnovel.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.templates.mangastream import MangaStreamTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class CentralNovelCrawler(MangaStreamTemplate): 10 | base_url = ["https://centralnovel.com/"] 11 | 12 | def initialize(self) -> None: 13 | self.init_executor(ratelimit=2.99) 14 | -------------------------------------------------------------------------------- /sources/ru/bestmanga.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import logging 4 | from lncrawl.templates.madara import MadaraTemplate 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class BestMangaCrawler(MadaraTemplate): 10 | has_manga = True 11 | base_url = ["https://bestmanga.club/"] 12 | -------------------------------------------------------------------------------- /sources/ru/ifreedom.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class IfreedomCrawler(Crawler): 10 | base_url = [ 11 | "https://ifreedom.su/", 12 | "https://bookhamster.ru/" 13 | ] 14 | 15 | def read_novel_info(self): 16 | soup = self.get_soup(self.novel_url) 17 | 18 | possible_title = soup.select_one("h1.entry-title") 19 | if possible_title: 20 | self.novel_title = possible_title.get_text() 21 | 22 | logger.info("Novel title: %s", self.novel_title) 23 | 24 | possible_author = soup.select_one("span.dashicons-admin-users").next\ 25 | .next\ 26 | .next 27 | if "Не указан" not in str(possible_author): 28 | self.novel_author = possible_author.get_text() 29 | logger.info("Novel author: %s", self.novel_author) 30 | 31 | possible_full_synopsis = soup.select_one("span.open-desc") 32 | if possible_full_synopsis: 33 | possible_full_synopsis = possible_full_synopsis["onclick"] 34 | self.novel_synopsis = possible_full_synopsis.split("= '")[1].strip("';") 35 | else: 36 | self.novel_synopsis = soup.select_one("div.descr-ranobe").get_text() 37 | 38 | img_src = soup.select_one("div.img-ranobe img") 39 | if img_src: 40 | self.novel_cover = self.absolute_url(img_src["src"]) 41 | 42 | for a in reversed(soup.select(".menu-ranobe a")): 43 | chap_id = 1 + (len(self.chapters)) 44 | 45 | self.chapters.append( 46 | { 47 | "id": chap_id, 48 | "title": a.text.strip(), 49 | "url": self.absolute_url(a['href']) 50 | } 51 | ) 52 | 53 | def download_chapter_body(self, chapter): 54 | soup = self.get_soup(chapter["url"]) 55 | content = soup.select_one("div.entry-content") 56 | return self.cleaner.extract_contents(content) 57 | -------------------------------------------------------------------------------- /sources/zh/daocaorenshuwu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Daocaorenshuwu(Crawler): 10 | base_url = [ 11 | "https://www.daocaorenshuwu.com/", 12 | ] 13 | 14 | def read_novel_info(self): 15 | logger.debug("Visiting %s", self.novel_url) 16 | soup = self.get_soup(self.novel_url) 17 | 18 | possible_title = soup.select_one(".container .book-info h1.book-name") 19 | assert possible_title, "No novel title" 20 | self.novel_title = possible_title.text 21 | 
logger.info("Novel title: %s", self.novel_title) 22 | 23 | self.novel_author = soup.select(".container .media-body .row div")[ 24 | 0 25 | ].text.strip() 26 | logger.info("Novel author: %s", self.novel_author) 27 | 28 | possible_image = soup.select_one(".container .media-left a img") 29 | if possible_image: 30 | self.novel_cover = self.absolute_url(possible_image["src"]) 31 | logger.info("Novel cover: %s", self.novel_cover) 32 | 33 | # Extract volume-wise ch 34 | # apter entries 35 | chapters = soup.select("#all-chapter a") 36 | 37 | for a in chapters: 38 | chap_id = len(self.chapters) + 1 39 | vol_id = 1 + len(self.chapters) // 100 40 | if len(self.volumes) < vol_id: 41 | self.volumes.append({"id": vol_id}) 42 | self.chapters.append( 43 | { 44 | "id": chap_id, 45 | "volume": vol_id, 46 | "url": self.absolute_url(a["href"]), 47 | "title": a.text.strip() or ("Chapter %d" % chap_id), 48 | } 49 | ) 50 | 51 | def download_chapter_body(self, chapter): 52 | soup = self.get_soup(chapter["url"]) 53 | contents = soup.select(".cont-text > p") 54 | contents = [str(p) for p in contents if p.text.strip()] 55 | return "".join(contents) 56 | -------------------------------------------------------------------------------- /sources/zh/powanjuan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class PowanjuanCrawler(Crawler): 10 | base_url = "https://www.powanjuan.cc/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url, encoding='gb2312') 15 | 16 | possible_title = soup.select_one(".desc h1") 17 | assert possible_title, "No novel title" 18 | self.novel_title = possible_title.text.split('(')[0].strip() 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | possible_novel_author = soup.select_one('.descTip span') 22 | if possible_novel_author: 23 | self.novel_author = possible_novel_author.text.replace('作者:', '').strip() 24 | logger.info("Novel author: %s", self.novel_author) 25 | 26 | possible_synopsis = soup.select_one('.descInfo p') 27 | if possible_synopsis: 28 | self.novel_synopsis = possible_synopsis.text 29 | logger.info("Novel synopsis: %s", self.novel_synopsis) 30 | 31 | volumes = set([]) 32 | for a in soup.select(".catalog ul.clearfix li a"): 33 | ch_id = len(self.chapters) + 1 34 | vol_id = 1 + len(self.chapters) // 100 35 | volumes.add(vol_id) 36 | self.chapters.append( 37 | { 38 | "id": ch_id, 39 | "volume": vol_id, 40 | "title": a.text.strip(), 41 | "url": self.absolute_url(a["href"]), 42 | } 43 | ) 44 | 45 | self.volumes = [{"id": x, "title": ""} for x in volumes] 46 | 47 | def download_chapter_body(self, chapter): 48 | soup = self.get_soup(chapter["url"], encoding='gb2312') 49 | contents = soup.select_one("#mycontent") 50 | return self.cleaner.extract_contents(contents) 51 | -------------------------------------------------------------------------------- /sources/zh/soxs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | from lncrawl.core.crawler import Crawler 4 | 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class Soxc(Crawler): 10 | base_url = ["https://www.soxs.cc/"] 11 | 12 | def read_novel_info(self): 13 | self.novel_url = self.novel_url.replace("/book/", "/") 14 | self.novel_url = self.novel_url.replace(".html", "/") 15 | 
soup = self.get_soup(self.novel_url) 16 | 17 | possible_title = soup.select_one(".xiaoshuo h1") 18 | assert possible_title, "No novel title" 19 | self.novel_title = possible_title.get_text() 20 | logger.info(f"Novel title: {self.novel_title}") 21 | 22 | self.novel_author = soup.select_one(".xiaoshuo h6").get_text() 23 | logger.info(f"Novel Author: {self.novel_author}") 24 | 25 | possible_novel_cover = soup.select_one(".book_cover img") 26 | if possible_novel_cover: 27 | self.novel_cover = self.absolute_url(possible_novel_cover["src"]) 28 | logger.info(f"Novel Cover: {self.novel_cover}") 29 | 30 | logger.info("Getting chapters...") 31 | for chapter in soup.select(".novel_list dd a"): 32 | url = self.absolute_url(chapter["href"]) 33 | chap_id = len(self.chapters) + 1 34 | if len(self.chapters) % 100 == 0: 35 | vol_id = len(self.chapters) // 100 + 1 36 | self.volumes.append({"id": vol_id}) 37 | 38 | self.chapters.append( 39 | { 40 | "id": chap_id, 41 | "url": url, 42 | "volume": vol_id, 43 | } 44 | ) 45 | 46 | def download_chapter_body(self, chapter): 47 | soup = self.get_soup(chapter["url"]) 48 | title = soup.select_one(".read_title h1").text.strip() 49 | chapter["title"] = title 50 | 51 | content = soup.select(".content") 52 | content = "\n".join(str(p) for p in content) 53 | content = content.replace(self.novel_url, "") 54 | content = content.replace("soxscc", "mtlrealm.com ") 55 | return content 56 | -------------------------------------------------------------------------------- /sources/zh/trxs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | 4 | from lncrawl.core.crawler import Crawler 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class TrxsCrawler(Crawler): 10 | base_url = "https://trxs.cc/" 11 | 12 | def read_novel_info(self): 13 | logger.debug("Visiting %s", self.novel_url) 14 | soup = self.get_soup(self.novel_url, encoding='gb2312') 15 | 16 | possible_title = soup.select_one(".book_info h1") 17 | assert possible_title, "No novel title" 18 | self.novel_title = possible_title.text 19 | logger.info("Novel title: %s", self.novel_title) 20 | 21 | possible_novel_cover = soup.select_one('.book_info img') 22 | if possible_novel_cover: 23 | self.novel_cover = self.absolute_url(possible_novel_cover["src"]) 24 | logger.info("Novel cover: %s", self.novel_cover) 25 | 26 | possible_synopsis = soup.select_one('.book_info p') 27 | if possible_synopsis: 28 | self.novel_synopsis = possible_synopsis.text 29 | logger.info("Novel synopsis %s", self.novel_synopsis) 30 | 31 | possible_novel_author = soup.select_one('.book_info a') 32 | if possible_novel_author: 33 | self.novel_author = possible_novel_author.text 34 | logger.info("Novel author: %s", self.novel_author) 35 | 36 | volumes = set([]) 37 | for a in soup.select(".book_list a"): 38 | ch_id = len(self.chapters) + 1 39 | vol_id = 1 + len(self.chapters) // 100 40 | volumes.add(vol_id) 41 | self.chapters.append( 42 | { 43 | "id": ch_id, 44 | "volume": vol_id, 45 | "title": a.text, 46 | "url": self.absolute_url(a["href"]), 47 | } 48 | ) 49 | 50 | self.volumes = [{"id": x, "title": ""} for x in volumes] 51 | 52 | def download_chapter_body(self, chapter): 53 | soup = self.get_soup(chapter["url"], encoding='gb2312') 54 | contents = soup.select_one(".read_chapterDetail") 55 | return self.cleaner.extract_contents(contents) 56 | --------------------------------------------------------------------------------
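Most of the hand-written sources above share one shape: a Crawler subclass sets base_url, fills self.chapters (and usually self.volumes, grouped 100 chapters per volume) inside read_novel_info, and returns cleaned HTML from download_chapter_body. The snippet below is an illustrative composite of that shape only; it is not a file from this repository, the site URL and CSS selectors (example-novel.com, .novel-title, .chapter-list a, .chapter-content) are placeholders, and only the helpers it calls (get_soup, absolute_url, cleaner.extract_contents) are taken from the files above.

# -*- coding: utf-8 -*-
# Illustrative composite of the source-crawler pattern seen above (hypothetical site).
import logging

from lncrawl.core.crawler import Crawler

logger = logging.getLogger(__name__)


class ExampleNovelCrawler(Crawler):
    base_url = "https://example-novel.com/"  # placeholder, not a supported source

    def read_novel_info(self):
        logger.debug("Visiting %s", self.novel_url)
        soup = self.get_soup(self.novel_url)

        possible_title = soup.select_one(".novel-title")  # placeholder selector
        assert possible_title, "No novel title"
        self.novel_title = possible_title.text.strip()
        logger.info("Novel title: %s", self.novel_title)

        for a in soup.select(".chapter-list a"):  # placeholder selector
            chap_id = len(self.chapters) + 1
            vol_id = 1 + len(self.chapters) // 100
            if len(self.volumes) < vol_id:
                self.volumes.append({"id": vol_id})
            self.chapters.append(
                {
                    "id": chap_id,
                    "volume": vol_id,
                    "title": a.text.strip() or ("Chapter %d" % chap_id),
                    "url": self.absolute_url(a["href"]),
                }
            )

    def download_chapter_body(self, chapter):
        soup = self.get_soup(chapter["url"])
        contents = soup.select_one(".chapter-content")  # placeholder selector
        return self.cleaner.extract_contents(contents)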