├── .python-version
├── plugins
│   ├── omnimd-browser-reader
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   └── browser.py
│   ├── omnimd-toutiao-reader
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   └── toutiao.py
│   ├── omnimd-zhihu-reader
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   └── zhihu.py
│   └── omnimd-freedium-reader
│       ├── README.md
│       ├── pyproject.toml
│       └── freedium.py
├── data
│   ├── 1.gif
│   └── 1.jpg
├── .gitignore
├── tests
│   ├── conftest.py
│   ├── test_extractor.py
│   ├── test_parser.py
│   └── test_utils.py
├── Dockerfile
├── src
│   └── omni_article_markdown
│       ├── __init__.py
│       ├── plugins.py
│       ├── extractors
│       │   ├── hugo.py
│       │   ├── zhihu.py
│       │   ├── 163.py
│       │   ├── woshipm.py
│       │   ├── infoqcn.py
│       │   ├── aliyun_developer.py
│       │   ├── android_dev_blog.py
│       │   ├── cloudflare_blog.py
│       │   ├── oschina.py
│       │   ├── tencent_cloud.py
│       │   ├── anthropic.py
│       │   ├── medium.py
│       │   ├── infoq.py
│       │   ├── quantamagazine.py
│       │   ├── juejin.py
│       │   ├── sspai.py
│       │   ├── claude_doc.py
│       │   ├── microsoft_learn.py
│       │   ├── cnblog.py
│       │   ├── apple_developer.py
│       │   ├── baijiahao.py
│       │   ├── toutiao.py
│       │   ├── jetbrains_blog.py
│       │   ├── wechat_gzh.py
│       │   ├── jianshu.py
│       │   ├── towards_data_science.py
│       │   ├── freedium.py
│       │   └── yuque.py
│       ├── hookspecs.py
│       ├── store.py
│       ├── readers.py
│       ├── omni_article_md.py
│       ├── cli.py
│       ├── extractor.py
│       ├── utils.py
│       └── parser.py
├── .editorconfig
├── .github
│   ├── FUNDING.yml
│   └── workflows
│       ├── publish.yml
│       └── publish_plugin.yml
├── LICENSE
├── pyproject.toml
├── ruff.toml
├── README.md
└── uv.lock

/.python-version:
--------------------------------------------------------------------------------
1 | 3.13
2 |
--------------------------------------------------------------------------------
/plugins/omnimd-browser-reader/README.md:
--------------------------------------------------------------------------------
1 | # 墨探 (omni-article-markdown) 浏览器插件
2 |
--------------------------------------------------------------------------------
/plugins/omnimd-toutiao-reader/README.md:
--------------------------------------------------------------------------------
1 | # 墨探 (omni-article-markdown) 头条插件
2 |
--------------------------------------------------------------------------------
/plugins/omnimd-zhihu-reader/README.md:
--------------------------------------------------------------------------------
1 | # 墨探 (omni-article-markdown) 知乎插件
2 |
--------------------------------------------------------------------------------
/data/1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caol64/omni-article-markdown/HEAD/data/1.gif
--------------------------------------------------------------------------------
/data/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caol64/omni-article-markdown/HEAD/data/1.jpg
--------------------------------------------------------------------------------
/plugins/omnimd-freedium-reader/README.md:
--------------------------------------------------------------------------------
1 | # 墨探 (omni-article-markdown) Freedium插件
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_STORE
2 |
3 | # Python-generated files
4 | __pycache__/
5 | *.py[oc]
6 | build/
7 | dist/
8 | wheels/
9 | *.egg-info
10 |
11 | # Virtual environments
12 | .venv
13 | .env
14 |
15 | plugins/**/uv.lock
16 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from bs4 import BeautifulSoup
3 |
4 |
5 | @pytest.fixture
6 | def make_soup():
7 |     def _make_soup(html: str, parser: str = "html.parser"):
8 |         return BeautifulSoup(html, parser)
9 |     return _make_soup
10 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.13-alpine
2 |
3 | ARG PYPI_REGISTRY="https://pypi.org/simple/"
4 |
5 | WORKDIR /app
6 |
7 | RUN pip config set global.index-url "${PYPI_REGISTRY}"
8 | RUN pip install omni-article-markdown
9 |
10 | ENTRYPOINT ["mdcli"]
11 | CMD []
12 |
--------------------------------------------------------------------------------
/src/omni_article_markdown/__init__.py:
--------------------------------------------------------------------------------
1 | from .omni_article_md import OmniArticleMarkdown
2 |
3 | __all__ = ["OmniArticleMarkdown"]
4 |
5 | DEFAULT_PLUGINS = {
6 |     "zhihu": "omnimd-zhihu-reader",
7 |     "freedium": "omnimd-freedium-reader",
8 |     "toutiao": "omnimd-toutiao-reader",
9 |     "browser": "omnimd-browser-reader",
10 | }
11 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | root = true
2 |
3 | [*]
4 | indent_style = space
5 | indent_size = 4
6 | trim_trailing_whitespace = true
7 | charset = utf-8
8 | end_of_line = lf
9 | insert_final_newline = true
10 |
11 | [*.json]
12 | indent_size = 2
13 |
14 | [*.{yml,yaml}]
15 | indent_size = 2
16 |
17 | [Makefile]
18 | indent_style = tab
19 |
20 | [*.{md,mdx}]
21 | max_line_length = off
22 | trim_trailing_whitespace = false
23 |
--------------------------------------------------------------------------------
/src/omni_article_markdown/plugins.py:
--------------------------------------------------------------------------------
1 | import pluggy
2 |
3 | from .
import hookspecs 4 | 5 | pm = pluggy.PluginManager("mdcli") 6 | pm.add_hookspecs(hookspecs) 7 | 8 | _loaded_plugins = False 9 | 10 | def load_mdcli_plugins(): 11 | global _loaded_plugins 12 | if _loaded_plugins: 13 | return 14 | pm.load_setuptools_entrypoints("mdcli") 15 | _loaded_plugins = True 16 | 17 | # 在应用启动时调用 18 | load_mdcli_plugins() 19 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/hugo.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | 7 | 8 | class HugoExtractor(Extractor): 9 | """ 10 | Hugo博客 11 | """ 12 | 13 | @override 14 | def can_handle(self, soup: BeautifulSoup) -> bool: 15 | return False 16 | 17 | @override 18 | def article_container(self) -> tuple: 19 | return ("div", {"class": "post-content"}) 20 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/zhihu.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class ZhihuExtractor(Extractor): 10 | """ 11 | 知乎专栏 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return get_og_site_name(soup) == "知乎专栏" 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "Post-RichText"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/163.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class Netease163Extractor(Extractor): 10 | """ 11 | 163.com 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return is_matched_canonical("https://www.163.com", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "post_body"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/woshipm.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_url 7 | 8 | 9 | class WoShiPMExtractor(Extractor): 10 | """ 11 | 人人都是产品经理 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return get_og_url(soup).startswith("https://www.woshipm.com") 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "article--content"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/infoqcn.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class InfoQCNExtractor(Extractor): 10 | """ 11 | www.infoq.cn 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | 
return is_matched_canonical("https://www.infoq.cn", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "article-content-wrap"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/aliyun_developer.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class AliyunDeveloperExtractor(Extractor): 10 | """ 11 | developer.aliyun.com 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return is_matched_canonical("https://developer.aliyun.com", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "article-content"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/android_dev_blog.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class AndroidDevelopersBlogExtractor(Extractor): 10 | """ 11 | Android Developers Blog 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return get_og_site_name(soup) == "Android Developers Blog" 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "adb-detail__content"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/cloudflare_blog.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class CloudflareBlogExtractor(Extractor): 10 | """ 11 | blog.cloudflare.com 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return is_matched_canonical("https://blog.cloudflare.com", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("section", {"class": "post-full-content"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/hookspecs.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | from pluggy import HookimplMarker, HookspecMarker 4 | 5 | hookspec = HookspecMarker("mdcli") 6 | hookimpl = HookimplMarker("mdcli") 7 | 8 | 9 | class ReaderPlugin(Protocol): 10 | def can_handle(self, url: str) -> bool: ... 11 | 12 | def read(self, url: str) -> str: ... 13 | 14 | 15 | @hookspec(firstresult=True) 16 | def get_custom_reader(url: str) -> ReaderPlugin | None: 17 | """ 18 | Allows plugins to provide a custom reader for a given URL. 19 | The first plugin that returns a ReaderPlugin instance will be used. 20 | """ 21 | ... 
22 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/oschina.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | 7 | 8 | class OsChinaExtractor(Extractor): 9 | """ 10 | 开源中国 11 | """ 12 | 13 | @override 14 | def can_handle(self, soup: BeautifulSoup) -> bool: 15 | title_tag = soup.title 16 | title = title_tag.get_text(strip=True) if title_tag else None 17 | return title is not None and title.endswith(" - OSCHINA - 中文开源技术交流社区") 18 | 19 | @override 20 | def article_container(self) -> tuple: 21 | return ("div", {"class": "detail-box"}) 22 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/tencent_cloud.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | 7 | 8 | class TencentCloudExtractor(Extractor): 9 | """ 10 | 腾讯云开发者社区 11 | """ 12 | 13 | @override 14 | def can_handle(self, soup: BeautifulSoup) -> bool: 15 | title_tag = soup.title 16 | title = title_tag.get_text(strip=True) if title_tag else None 17 | return title is not None and title.endswith("-腾讯云开发者社区-腾讯云") 18 | 19 | @override 20 | def article_container(self) -> tuple: 21 | return ("div", {"class": "mod-content__markdown"}) 22 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/anthropic.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_title 7 | 8 | 9 | class ClaudeDocExtractor(Extractor): 10 | """ 11 | Anthropic 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return get_title(soup).endswith(" \\ Anthropic") 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("article", None) 21 | 22 | @override 23 | def extract_url(self, soup: BeautifulSoup) -> str: 24 | return "https://www.anthropic.com/" 25 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/medium.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class MediumExtractor(Extractor): 10 | """ 11 | Medium 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend([ 17 | lambda el: 'data-testid' in el.attrs, 18 | lambda el: 'class' in el.attrs and 'speechify-ignore' in el.attrs['class'], 19 | ]) 20 | 21 | @override 22 | def can_handle(self, soup: BeautifulSoup) -> bool: 23 | return get_og_site_name(soup) == "Medium" 24 | 25 | @override 26 | def article_container(self) -> tuple: 27 | return ("article", None) 28 | -------------------------------------------------------------------------------- /src/omni_article_markdown/store.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from typing import Any 4 | 5 | 6 | class Store: 7 | def __init__(self, base_dir_name: str = 
".ommimd"): 8 | self.path = Path.home() / base_dir_name 9 | 10 | def save(self, key: str, obj: Any): 11 | self.path.mkdir(parents=True, exist_ok=True) 12 | file_path = self.path / f"{key}.json" 13 | with open(file_path, "w", encoding="utf8") as f: 14 | json.dump(obj, f, indent=4, ensure_ascii=False) 15 | 16 | def load(self, key: str) -> Any | None: 17 | file_path = self.path / f"{key}.json" 18 | if not file_path.exists() or not file_path.is_file(): 19 | return None 20 | with open(file_path, encoding="utf8") as f: 21 | return json.load(f) 22 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/infoq.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class InfoQExtractor(Extractor): 10 | """ 11 | www.infoq.com 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "class" in el.attrs and "author-section-full" in el.attrs["class"], 19 | ] 20 | ) 21 | 22 | @override 23 | def can_handle(self, soup: BeautifulSoup) -> bool: 24 | return is_matched_canonical("https://www.infoq.com", soup) 25 | 26 | @override 27 | def article_container(self) -> tuple: 28 | return ("div", {"class": "article__data"}) 29 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/quantamagazine.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class QuantamagazineExtractor(Extractor): 10 | """ 11 | quantamagazine.org 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "class" in el.attrs and "post__title__title" in el.attrs["class"], 19 | ] 20 | ) 21 | 22 | @override 23 | def can_handle(self, soup: BeautifulSoup) -> bool: 24 | return get_og_site_name(soup) == "Quanta Magazine" 25 | 26 | @override 27 | def article_container(self) -> tuple: 28 | return ("div", {"id": "postBody"}) 29 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/juejin.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import filter_tag, is_matched_canonical 7 | 8 | 9 | class JuejinExtractor(Extractor): 10 | """ 11 | juejin.cn 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return is_matched_canonical("https://juejin.cn/", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"id": "article-root"}) 21 | 22 | @override 23 | def extract_title(self, soup: BeautifulSoup) -> str: 24 | title_tag = filter_tag(soup.find("h1", {"class": "article-title"})) 25 | return title_tag.get_text(strip=True) if title_tag else super().extract_title(soup) 26 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/sspai.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 
import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class SspaiExtractor(Extractor): 10 | """ 11 | 少数派 12 | """ 13 | def __init__(self): 14 | super().__init__() 15 | self.attrs_to_clean.extend( 16 | [ 17 | lambda el: "class" in el.attrs and "comment__list" in el.attrs["class"], 18 | lambda el: "class" in el.attrs and "comment__footer__wrapper" in el.attrs["class"], 19 | ] 20 | ) 21 | 22 | @override 23 | def can_handle(self, soup: BeautifulSoup) -> bool: 24 | return get_og_site_name(soup) == "少数派 - 高品质数字消费指南" 25 | 26 | @override 27 | def article_container(self) -> tuple: 28 | return ("div", {"class": "article__main__wrapper"}) 29 | -------------------------------------------------------------------------------- /plugins/omnimd-zhihu-reader/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "omnimd-zhihu-reader" 3 | version = "0.1.3" 4 | description = "A plugin for omni-article-markdown to read Zhihu content." 5 | authors = [ 6 | { name = "Lei", email = "caol64@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.13" 10 | license = "MIT" 11 | dependencies = [ 12 | "playwright", 13 | ] 14 | 15 | [project.entry-points.mdcli] 16 | zhihu = "zhihu" 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-zhihu-reader" 20 | 21 | [build-system] 22 | requires = ["hatchling"] 23 | build-backend = "hatchling.build" 24 | 25 | [tool.hatch.build] 26 | exclude = [ 27 | "/dist", 28 | ] 29 | 30 | [tool.hatch.build.targets.wheel] 31 | include = [ 32 | "/zhihu.py", 33 | ] 34 | 35 | [tool.hatch.build.targets.sdist] 36 | include = [ 37 | "/zhihu.py", 38 | "/README.md", 39 | "/pyproject.toml", 40 | ] 41 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry 12 | polar: # Replace with a single Polar username 13 | buy_me_a_coffee: # Replace with a single Buy Me a Coffee username 14 | thanks_dev: # Replace with a single thanks.dev username 15 | custom: ['https://yuzhi.tech/sponsor', 'https://paypal.me/caol64'] 16 | -------------------------------------------------------------------------------- /plugins/omnimd-toutiao-reader/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "omnimd-toutiao-reader" 3 | version = "0.1.3" 4 | description = "A plugin for omni-article-markdown to read Toutiao content." 
5 | authors = [ 6 | { name = "Lei", email = "caol64@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.13" 10 | license = "MIT" 11 | dependencies = [ 12 | "playwright", 13 | ] 14 | 15 | [project.entry-points.mdcli] 16 | toutiao = "toutiao" 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-toutiao-reader" 20 | 21 | [build-system] 22 | requires = ["hatchling"] 23 | build-backend = "hatchling.build" 24 | 25 | [tool.hatch.build] 26 | exclude = [ 27 | "/dist", 28 | ] 29 | 30 | [tool.hatch.build.targets.wheel] 31 | include = [ 32 | "/toutiao.py", 33 | ] 34 | 35 | [tool.hatch.build.targets.sdist] 36 | include = [ 37 | "/toutiao.py", 38 | "/README.md", 39 | "/pyproject.toml", 40 | ] 41 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/claude_doc.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_title 7 | 8 | 9 | class ClaudeDocExtractor(Extractor): 10 | """ 11 | docs.claude.com 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "data-component-part" in el.attrs and "code-block-header" in el.attrs["data-component-part"], 19 | lambda el: "data-component-part" in el.attrs and "code-group-tab-bar" in el.attrs["data-component-part"], 20 | ] 21 | ) 22 | 23 | @override 24 | def can_handle(self, soup: BeautifulSoup) -> bool: 25 | return get_og_title(soup).endswith(" - Claude Docs") 26 | 27 | @override 28 | def article_container(self) -> tuple: 29 | return ("div", {"class": "mdx-content"}) 30 | -------------------------------------------------------------------------------- /plugins/omnimd-freedium-reader/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "omnimd-freedium-reader" 3 | version = "0.1.3" 4 | description = "A plugin for omni-article-markdown to read Freedium content." 
5 | authors = [ 6 | { name = "Lei", email = "caol64@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.13" 10 | license = "MIT" 11 | dependencies = [ 12 | "playwright", 13 | ] 14 | 15 | [project.entry-points.mdcli] 16 | freedium = "freedium" 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-freedium-reader" 20 | 21 | [build-system] 22 | requires = ["hatchling"] 23 | build-backend = "hatchling.build" 24 | 25 | [tool.hatch.build] 26 | exclude = [ 27 | "/dist", 28 | ] 29 | 30 | [tool.hatch.build.targets.wheel] 31 | include = [ 32 | "/freedium.py", 33 | ] 34 | 35 | [tool.hatch.build.targets.sdist] 36 | include = [ 37 | "/freedium.py", 38 | "/README.md", 39 | "/pyproject.toml", 40 | ] 41 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Build and Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | omnimd-publish: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: '3.13' 19 | 20 | - name: Set up pip cache 21 | uses: actions/cache@v4 22 | with: 23 | path: ~/.cache/pip 24 | key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} 25 | 26 | - name: Install Hatch 27 | run: | 28 | pip install -U hatch hatchling 29 | 30 | - name: Build and publish with Hatch 31 | env: 32 | HATCH_INDEX_USER: __token__ 33 | HATCH_INDEX_AUTH: ${{ secrets.PYPI_TOKEN }} 34 | run: | 35 | hatch build --clean 36 | hatch publish --yes --no-prompt 37 | -------------------------------------------------------------------------------- /plugins/omnimd-browser-reader/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "omnimd-browser-reader" 3 | version = "0.1.2" 4 | description = "A plugin for omni-article-markdown to read content that needs enabling javascript." 
5 | authors = [ 6 | { name = "Lei", email = "caol64@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.13" 10 | license = "MIT" 11 | dependencies = [ 12 | "playwright", 13 | ] 14 | 15 | [project.entry-points.mdcli] 16 | browser = "browser" 17 | 18 | [project.urls] 19 | Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-browser-reader" 20 | 21 | [build-system] 22 | requires = ["hatchling"] 23 | build-backend = "hatchling.build" 24 | 25 | [tool.hatch.build] 26 | exclude = [ 27 | "/dist", 28 | ] 29 | 30 | [tool.hatch.build.targets.wheel] 31 | include = [ 32 | "/browser.py", 33 | ] 34 | 35 | [tool.hatch.build.targets.sdist] 36 | include = [ 37 | "/browser.py", 38 | "/README.md", 39 | "/pyproject.toml", 40 | ] 41 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/microsoft_learn.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_url 7 | 8 | 9 | class MicrosoftLearnExtractor(Extractor): 10 | """ 11 | 微软技术文档 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "id" in el.attrs and "article-header" in el.attrs["id"], 19 | lambda el: "id" in el.attrs and "article-metadata" in el.attrs["id"], 20 | lambda el: "id" in el.attrs and "site-user-feedback-footer" in el.attrs["id"], 21 | ] 22 | ) 23 | 24 | @override 25 | def can_handle(self, soup: BeautifulSoup) -> bool: 26 | return get_og_url(soup).startswith("https://learn.microsoft.com") 27 | 28 | @override 29 | def article_container(self) -> tuple: 30 | return ("main", {"id": "main"}) 31 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/cnblog.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class CnBlogsExtractor(Extractor): 10 | """ 11 | 博客园 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "id" in el.attrs and "blog_post_info_block" in el.attrs["id"], 19 | lambda el: "class" in el.attrs and "postDesc" in el.attrs["class"], 20 | ] 21 | ) 22 | 23 | @override 24 | def can_handle(self, soup: BeautifulSoup) -> bool: 25 | return is_matched_canonical("https://www.cnblogs.com", soup) 26 | 27 | @override 28 | def article_container(self) -> tuple: 29 | return ("div", {"class": "post"}) 30 | 31 | @override 32 | def extract_description(self, soup: BeautifulSoup) -> str: 33 | return "" 34 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/apple_developer.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class AppleDevelopExtractor(Extractor): 10 | """ 11 | Apple Developer Documentation 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "class" in el.attrs and "eyebrow" in el.attrs["class"], 19 | lambda el: "class" in el.attrs and 
"platform" in el.attrs["class"], 20 | lambda el: "class" in el.attrs and "title" in el.attrs["class"], 21 | ] 22 | ) 23 | 24 | @override 25 | def can_handle(self, soup: BeautifulSoup) -> bool: 26 | return get_og_site_name(soup) == "Apple Developer Documentation" 27 | 28 | @override 29 | def article_container(self) -> tuple: 30 | return ("main", {"class": "main"}) 31 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/baijiahao.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import filter_tag 7 | 8 | 9 | class Netease163Extractor(Extractor): 10 | """ 11 | 百家号 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | tag1 = filter_tag(soup.find("div", {"data-testid": "article"})) 17 | tag2 = filter_tag(soup.find("span", {"class": "bjh-p"})) 18 | return tag1 is not None and tag2 is not None 19 | 20 | @override 21 | def article_container(self) -> tuple: 22 | return ("div", {"data-testid": "article"}) 23 | 24 | @override 25 | def pre_handle_soup(self, soup: BeautifulSoup) -> BeautifulSoup: 26 | for tag in soup.find_all("span", {"class": "bjh-p"}): 27 | span_tag = filter_tag(tag) 28 | if span_tag: 29 | span_tag.name = "p" 30 | # for tag in soup.find_all("img"): 31 | # tag.wrap(soup.new_tag("p")) 32 | return soup 33 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/toutiao.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | from bs4.element import Tag 5 | 6 | from ..extractor import Extractor 7 | from ..utils import filter_tag, get_attr_text 8 | 9 | 10 | class ToutiaoExtractor(Extractor): 11 | """ 12 | 今日头条 13 | """ 14 | 15 | @override 16 | def can_handle(self, soup: BeautifulSoup) -> bool: 17 | title_tag = soup.title 18 | title = title_tag.get_text(strip=True) if title_tag else None 19 | return title is not None and title.endswith(" - 今日头条") 20 | 21 | @override 22 | def article_container(self) -> tuple: 23 | return ("div", {"class": "article-content"}) 24 | 25 | @override 26 | def extract_img(self, element: Tag) -> Tag: 27 | img_els = element.find_all("img") 28 | for img_el in img_els: 29 | img_tag = filter_tag(img_el) 30 | if img_tag: 31 | src = get_attr_text(img_tag.attrs.get("data-src")) 32 | if src: 33 | img_tag.attrs["src"] = src 34 | return element 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 caol64 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/jetbrains_blog.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class JetbrainsBlogExtractor(Extractor): 10 | """ 11 | blog.jetbrains.com 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "class" in el.attrs and "content__row" in el.attrs["class"], 19 | lambda el: "class" in el.attrs and "content__pagination" in el.attrs["class"], 20 | lambda el: "class" in el.attrs and "content__form" in el.attrs["class"], 21 | lambda el: "class" in el.attrs and "tag" in el.attrs["class"], 22 | lambda el: "class" in el.attrs and "author-post" in el.attrs["class"], 23 | ] 24 | ) 25 | 26 | @override 27 | def can_handle(self, soup: BeautifulSoup) -> bool: 28 | return get_og_site_name(soup) == "The JetBrains Blog" 29 | 30 | @override 31 | def article_container(self) -> tuple: 32 | return ("div", {"class": "content"}) 33 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/wechat_gzh.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | from bs4.element import Tag 5 | 6 | from ..extractor import Extractor 7 | from ..utils import filter_tag, get_attr_text, get_og_site_name 8 | 9 | 10 | class WechatGZHExtractor(Extractor): 11 | """ 12 | 微信公众号 13 | """ 14 | 15 | def __init__(self): 16 | super().__init__() 17 | self.attrs_to_clean.append(lambda el: 'id' in el.attrs and el.attrs['id'] == 'meta_content') 18 | 19 | @override 20 | def can_handle(self, soup: BeautifulSoup) -> bool: 21 | return get_og_site_name(soup) == "微信公众平台" 22 | 23 | @override 24 | def article_container(self) -> tuple: 25 | return ("div", {"class": "rich_media_content"}) 26 | 27 | @override 28 | def extract_img(self, element: Tag) -> Tag: 29 | img_els = element.find_all("img") 30 | for img_el in img_els: 31 | img_tag = filter_tag(img_el) 32 | if img_tag: 33 | src = get_attr_text(img_tag.attrs.get("data-src")) 34 | if src: 35 | img_tag.attrs["src"] = src 36 | return element 37 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/jianshu.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | from bs4.element import Tag 5 | 6 | from ..extractor import ARTICLE_CONTAINERS, Extractor 7 | from ..utils import filter_tag, get_attr_text, get_og_site_name 8 | 9 | 10 | class JianshuExtractor(Extractor): 11 | """ 12 | www.jianshu.com 13 | """ 14 | 15 | @override 16 | def can_handle(self, soup: BeautifulSoup) -> bool: 17 | return 
get_og_site_name(soup) == "简书" 18 | 19 | @override 20 | def article_container(self) -> tuple | list: 21 | return ARTICLE_CONTAINERS 22 | 23 | @override 24 | def extract_description(self, soup: BeautifulSoup) -> str: 25 | return "" 26 | 27 | @override 28 | def extract_url(self, soup: BeautifulSoup) -> str: 29 | return "https:" 30 | 31 | @override 32 | def extract_img(self, element: Tag) -> Tag: 33 | img_els = element.find_all("img") 34 | for img_el in img_els: 35 | img_tag = filter_tag(img_el) 36 | if img_tag: 37 | src = get_attr_text(img_tag.attrs.get("data-original-src")) 38 | if src: 39 | img_tag.attrs["src"] = src 40 | return element 41 | -------------------------------------------------------------------------------- /.github/workflows/publish_plugin.yml: -------------------------------------------------------------------------------- 1 | name: Build Plugins and Publish to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | package_path: 7 | description: 'Path to plugin directory (relative to repo root)' 8 | required: true 9 | default: 'plugins/omnimd-freedium-reader' 10 | 11 | jobs: 12 | omnimd-plugin-publish: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: '3.13' 23 | 24 | - name: Set up pip cache 25 | uses: actions/cache@v4 26 | with: 27 | path: ~/.cache/pip 28 | key: ${{ runner.os }}-${{ github.event.inputs.package_path }}-pip-${{ hashFiles('pyproject.toml') }} 29 | 30 | - name: Install Hatch 31 | run: | 32 | pip install -U hatch hatchling 33 | 34 | - name: Build and publish with Hatch 35 | env: 36 | HATCH_INDEX_USER: __token__ 37 | HATCH_INDEX_AUTH: ${{ secrets.PYPI_TOKEN }} 38 | run: | 39 | cd "${{ github.event.inputs.package_path }}" 40 | hatch build --clean 41 | hatch publish --yes --no-prompt 42 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/towards_data_science.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class TowardsDataScienceExtractor(Extractor): 10 | """ 11 | towardsdatascience.com 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend([ 17 | lambda el: 'class' in el.attrs and 'taxonomy-post_tag' in el.attrs['class'], 18 | lambda el: 'class' in el.attrs and 'tds-cta-box' in el.attrs['class'], 19 | lambda el: 'class' in el.attrs and 'wp-block-buttons' in el.attrs['class'], 20 | lambda el: 'class' in el.attrs and 'wp-block-outermost-social-sharing' in el.attrs['class'], 21 | lambda el: 'class' in el.attrs and 'wp-block-tenup-post-time-to-read' in el.attrs['class'], 22 | ]) 23 | self.tags_to_clean.extend([ 24 | lambda el: el.name == 'time', 25 | ]) 26 | 27 | @override 28 | def can_handle(self, soup: BeautifulSoup) -> bool: 29 | return get_og_site_name(soup) == "Towards Data Science" 30 | 31 | @override 32 | def article_container(self) -> tuple | list: 33 | return ("main", None) 34 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/freedium.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import 
Extractor 6 | from ..utils import filter_tag 7 | 8 | 9 | class FreediumExtractor(Extractor): 10 | """ 11 | freedium.cfd 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | title_tag = soup.title 17 | title = title_tag.get_text(strip=True) if title_tag else None 18 | return title is not None and title.endswith(" - Freedium") 19 | 20 | @override 21 | def article_container(self) -> tuple: 22 | return ("div", {"class": "main-content"}) 23 | 24 | @override 25 | def extract_title(self, soup: BeautifulSoup) -> str: 26 | title_tag = filter_tag(soup.find("h1")) 27 | if title_tag: 28 | title = title_tag.get_text(strip=True) 29 | title_tag.decompose() 30 | return title 31 | return super().extract_title(soup) 32 | 33 | @override 34 | def extract_description(self, soup: BeautifulSoup) -> str: 35 | description_tag = soup.find("h2") 36 | if description_tag: 37 | description = description_tag.get_text(strip=True) 38 | description_tag.decompose() 39 | return description 40 | return super().extract_description(soup) 41 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "omni-article-markdown" 3 | version = "0.1.10" 4 | description = "Easily convert web articles (blogs, news, documents, etc.) into Markdown format." 5 | authors = [ 6 | { name = "Lei", email = "caol64@gmail.com" } 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.13" 10 | license = "MIT" 11 | classifiers = [ 12 | "Development Status :: 5 - Production/Stable", 13 | "Environment :: Console", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: End Users/Desktop", 16 | "License :: OSI Approved :: MIT License", 17 | "Natural Language :: English", 18 | "Operating System :: OS Independent", 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.13", 21 | "Topic :: Text Processing :: Markup :: Markdown", 22 | "Topic :: Utilities", 23 | ] 24 | dependencies = [ 25 | "requests>=2.32.3", 26 | "beautifulsoup4>=4.13.4", 27 | "html5lib>=1.1", 28 | "click>=8.2.0", 29 | "pluggy>=1.6.0", 30 | "click-default-group>=1.2.4", 31 | "pip", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | dev = [ 36 | "pytest", 37 | ] 38 | 39 | [project.scripts] 40 | mdcli = "omni_article_markdown.cli:cli" 41 | 42 | [project.urls] 43 | Homepage = "https://github.com/caol64/omni-article-markdown" 44 | 45 | [build-system] 46 | requires = ["hatchling"] 47 | build-backend = "hatchling.build" 48 | 49 | [tool.hatch.build] 50 | exclude = [ 51 | "/data", 52 | "/plugins", 53 | "/dist", 54 | ] 55 | 56 | [tool.hatch.build.targets.sdist] 57 | include = [ 58 | "/src/omni_article_markdown", 59 | "/README.md", 60 | "/pyproject.toml", 61 | ] 62 | -------------------------------------------------------------------------------- /src/omni_article_markdown/readers.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | import requests 4 | 5 | from .extractor import Extractor 6 | from .hookspecs import ReaderPlugin 7 | from .plugins import pm 8 | from .utils import REQUEST_HEADERS 9 | 10 | 11 | class Reader(ABC): 12 | @abstractmethod 13 | def read(self) -> str: ... 
14 | 15 | def extractor(self) -> Extractor | None: 16 | return None 17 | 18 | 19 | class ReaderFactory: 20 | @staticmethod 21 | def create(url_or_path: str) -> Reader: 22 | custom_plugin_reader = pm.hook.get_custom_reader(url=url_or_path) 23 | if custom_plugin_reader: 24 | 25 | class PluginReaderAdapter(Reader): 26 | def __init__(self, plugin: ReaderPlugin, url: str): 27 | self.plugin = plugin 28 | self.url = url 29 | 30 | def read(self) -> str: 31 | return self.plugin.read(self.url) 32 | 33 | return PluginReaderAdapter(custom_plugin_reader, url_or_path) 34 | if url_or_path.startswith("http"): 35 | return HtmlReader(url_or_path) 36 | return FileReader(url_or_path) 37 | 38 | 39 | class HtmlReader(Reader): 40 | def __init__(self, url_or_path: str): 41 | self.url_or_path = url_or_path 42 | 43 | def read(self) -> str: 44 | response = requests.get(self.url_or_path, headers=REQUEST_HEADERS) 45 | response.encoding = "utf-8" 46 | return response.text 47 | 48 | 49 | class FileReader(Reader): 50 | def __init__(self, url_or_path: str): 51 | self.url_or_path = url_or_path 52 | 53 | def read(self) -> str: 54 | with open(self.url_or_path, encoding="utf8") as f: 55 | return f.read() 56 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/yuque.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import override 4 | 5 | import requests 6 | from bs4 import BeautifulSoup 7 | 8 | from ..extractor import Article, Extractor 9 | from ..utils import REQUEST_HEADERS, filter_tag, get_og_url 10 | 11 | 12 | class YuqueExtractor(Extractor): 13 | """ 14 | 语雀 15 | """ 16 | 17 | @override 18 | def can_handle(self, soup: BeautifulSoup) -> bool: 19 | return get_og_url(soup).startswith("https://www.yuque.com") 20 | 21 | @override 22 | def article_container(self) -> tuple: 23 | return ("", {}) 24 | 25 | @override 26 | def extract_article(self, soup: BeautifulSoup) -> Article: 27 | script_tag = filter_tag(soup.find("script", string=re.compile(r"decodeURIComponent"))) 28 | if script_tag: 29 | raw_js = script_tag.string 30 | if raw_js: 31 | match = re.search(r'decodeURIComponent\s*\(\s*"([^"]+)"\s*\)', raw_js) 32 | if match: 33 | encoded_str = match.group(1) 34 | 35 | from urllib.parse import unquote 36 | 37 | decoded_str = unquote(encoded_str) 38 | decoded_json = json.loads(decoded_str) 39 | # print(decoded_json) 40 | doc = decoded_json["doc"] 41 | if doc and doc["book_id"]: 42 | book_id = str(doc["book_id"]) 43 | slug = str(doc["slug"]) 44 | response = requests.get(f"https://www.yuque.com/api/docs/{slug}?book_id={book_id}&mode=markdown", headers=REQUEST_HEADERS) 45 | response.encoding = "utf-8" 46 | resp = response.json() 47 | # print(resp) 48 | return Article(str(resp["data"]["title"]), None, None, str(resp["data"]["sourcecode"])) 49 | return Article("", None, None, "") 50 | -------------------------------------------------------------------------------- /plugins/omnimd-browser-reader/browser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from runpy import run_module 3 | from typing import override 4 | 5 | from playwright.sync_api import Browser, Playwright, sync_playwright 6 | 7 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl 8 | from omni_article_markdown.utils import BROWSER_TARGET_HOSTS, REQUEST_HEADERS 9 | 10 | 11 | class BrowserPlugin(ReaderPlugin): 12 | @override 13 | def can_handle(self, url: 
str) -> bool: 14 | return any(host in url for host in BROWSER_TARGET_HOSTS) 15 | 16 | @override 17 | def read(self, url: str) -> str: 18 | def try_launch_browser(p: Playwright) -> Browser: 19 | try: 20 | return p.chromium.launch(headless=True) 21 | except Exception as e: 22 | # Playwright not installed or browser missing 23 | if "Executable doesn't exist" in str(e) or "playwright install" in str(e): 24 | print("[INFO] Chromium not installed, installing with 'playwright install chromium'...") 25 | original_argv = sys.argv 26 | args = ["playwright", "install", "chromium"] 27 | sys.argv = args 28 | run_module("playwright", run_name="__main__") 29 | sys.argv = original_argv 30 | # Try again 31 | return p.chromium.launch(headless=True) 32 | raise # re-raise other exceptions 33 | 34 | with sync_playwright() as p: 35 | browser = try_launch_browser(p) 36 | context = browser.new_context( 37 | user_agent=REQUEST_HEADERS["User-Agent"], 38 | java_script_enabled=True, 39 | extra_http_headers=REQUEST_HEADERS, 40 | ) 41 | page = context.new_page() 42 | page.goto(url, wait_until="networkidle") 43 | html = page.content() 44 | page.close() 45 | context.close() 46 | browser.close() 47 | return html 48 | 49 | 50 | @hookimpl 51 | def get_custom_reader(url: str) -> ReaderPlugin | None: 52 | plugin_instance = BrowserPlugin() 53 | if plugin_instance.can_handle(url): 54 | return plugin_instance 55 | return None 56 | -------------------------------------------------------------------------------- /plugins/omnimd-freedium-reader/freedium.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from importlib import resources 3 | from runpy import run_module 4 | from typing import override 5 | 6 | from playwright.sync_api import Browser, Playwright, sync_playwright 7 | 8 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl 9 | from omni_article_markdown.utils import REQUEST_HEADERS 10 | 11 | 12 | class FreediumPlugin(ReaderPlugin): 13 | @override 14 | def can_handle(self, url: str) -> bool: 15 | return "freedium.cfd" in url 16 | 17 | @override 18 | def read(self, url: str) -> str: 19 | def try_launch_browser(p: Playwright) -> Browser: 20 | try: 21 | return p.chromium.launch(headless=True) 22 | except Exception as e: 23 | # Playwright not installed or browser missing 24 | if "Executable doesn't exist" in str(e) or "playwright install" in str(e): 25 | print("[INFO] Chromium not installed, installing with 'playwright install chromium'...") 26 | original_argv = sys.argv 27 | args = ["playwright", "install", "chromium"] 28 | sys.argv = args 29 | run_module("playwright", run_name="__main__") 30 | sys.argv = original_argv 31 | # Try again 32 | return p.chromium.launch(headless=True) 33 | raise # re-raise other exceptions 34 | 35 | with sync_playwright() as p: 36 | browser = try_launch_browser(p) 37 | context = browser.new_context( 38 | user_agent=REQUEST_HEADERS["User-Agent"], 39 | java_script_enabled=True, 40 | extra_http_headers=REQUEST_HEADERS, 41 | ) 42 | with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path: 43 | context.add_init_script(path=str(js_path)) 44 | page = context.new_page() 45 | page.goto(url, wait_until="networkidle") 46 | html = page.content() 47 | page.close() 48 | context.close() 49 | browser.close() 50 | return html 51 | 52 | 53 | @hookimpl 54 | def get_custom_reader(url: str) -> ReaderPlugin | None: 55 | plugin_instance = FreediumPlugin() 56 | if plugin_instance.can_handle(url): 57 | return plugin_instance 58 | 
return None 59 | -------------------------------------------------------------------------------- /src/omni_article_markdown/omni_article_md.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import pkgutil 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | 6 | from bs4 import BeautifulSoup 7 | 8 | from .extractor import Article, DefaultExtractor, Extractor 9 | from .parser import HtmlMarkdownParser 10 | from .readers import ReaderFactory 11 | from .utils import to_snake_case 12 | 13 | 14 | @dataclass 15 | class ReaderContext: 16 | raw_html: str 17 | 18 | 19 | @dataclass 20 | class ExtractorContext: 21 | article: Article 22 | 23 | 24 | @dataclass 25 | class ParserContext: 26 | title: str 27 | markdown: str 28 | 29 | 30 | class OmniArticleMarkdown: 31 | DEFAULT_SAVE_PATH = "./" 32 | 33 | def __init__(self, url_or_path: str): 34 | self.url_or_path = url_or_path 35 | 36 | def parse(self) -> ParserContext: 37 | reader_ctx = self._read_html(self.url_or_path) 38 | extractor_ctx = self._extract_article(reader_ctx) 39 | parser_ctx = self._parse_html(extractor_ctx) 40 | return parser_ctx 41 | 42 | def save(self, ctx: ParserContext, save_path: str = "") -> str: 43 | save_path = save_path or self.DEFAULT_SAVE_PATH 44 | file_path = Path(save_path) 45 | if file_path.is_dir(): 46 | filename = f"{to_snake_case(ctx.title)}.md" 47 | file_path = file_path / filename 48 | with file_path.open("w", encoding="utf-8") as f: 49 | f.write(ctx.markdown) 50 | return str(file_path.resolve()) 51 | 52 | def _read_html(self, url_or_path: str) -> ReaderContext: 53 | reader = ReaderFactory.create(url_or_path) 54 | raw_html = reader.read() 55 | return ReaderContext(raw_html) 56 | 57 | def _extract_article(self, ctx: ReaderContext) -> ExtractorContext: 58 | soup = BeautifulSoup(ctx.raw_html, "html5lib") 59 | for extract in load_extractors(): 60 | article = extract.extract(soup) 61 | if article: 62 | break 63 | else: 64 | article = DefaultExtractor().extract(soup) 65 | if not article: 66 | raise ValueError("Failed to extract article content.") 67 | return ExtractorContext(article) 68 | 69 | def _parse_html(self, ctx: ExtractorContext) -> ParserContext: 70 | parser = HtmlMarkdownParser(ctx.article) 71 | result = parser.parse() 72 | return ParserContext(title=result[0], markdown=result[1]) 73 | 74 | 75 | def load_extractors(package_name="extractors") -> list[Extractor]: 76 | extractors_package = Path(__file__).parent / package_name 77 | extractors = [] 78 | for _loader, module_name, _is_pkg in pkgutil.iter_modules([extractors_package.resolve()]): 79 | module = importlib.import_module(f"omni_article_markdown.{package_name}.{module_name}") 80 | for attr in dir(module): 81 | cls = getattr(module, attr) 82 | if isinstance(cls, type) and issubclass(cls, Extractor) and cls is not Extractor: 83 | extractors.append(cls()) 84 | return extractors 85 | -------------------------------------------------------------------------------- /ruff.toml: -------------------------------------------------------------------------------- 1 | # Ruff configuration for the entire project 2 | # This ensures consistent formatting across all Python code 3 | 4 | # Use 120 character line length to prevent splitting Reflex lambdas 5 | line-length = 120 6 | 7 | # Target Python 3.13+ 8 | target-version = "py313" 9 | 10 | # Exclude generated and build directories 11 | extend-exclude = [ 12 | ".venv", 13 | "venv", 14 | "__pycache__", 15 | "*.pyc", 16 | "*.yaml", 17 | "node_modules", 18 | 
"webview", 19 | "bin", 20 | "build", 21 | "dist", 22 | ] 23 | 24 | [format] 25 | # Use double quotes for strings 26 | quote-style = "double" 27 | 28 | # Use 4 spaces for indentation 29 | indent-style = "space" 30 | 31 | # Respect magic trailing commas 32 | skip-magic-trailing-comma = false 33 | 34 | # Use Unix line endings 35 | line-ending = "auto" 36 | 37 | [lint] 38 | # Enable specific rule sets 39 | select = [ 40 | "E", # pycodestyle errors 41 | "W", # pycodestyle warnings (includes W292 for newline at EOF) 42 | "F", # Pyflakes 43 | "I", # isort 44 | "N", # pep8-naming 45 | "UP", # pyupgrade 46 | "B", # flake8-bugbear 47 | "C4", # flake8-comprehensions 48 | "DTZ", # flake8-datetimez 49 | "T10", # flake8-debugger 50 | "RET", # flake8-return 51 | "SIM", # flake8-simplify 52 | "TID", # flake8-tidy-imports 53 | ] 54 | 55 | # Ignore specific rules 56 | ignore = [ 57 | "E501", # Line too long (handled by formatter) 58 | "E712", # Comparison to True/False (needed for SQLAlchemy) 59 | "B008", # Do not perform function calls in argument defaults 60 | "B904", # Within except clause, use raise from (not always needed) 61 | "UP007", # Use X | Y for type unions (keep Optional for clarity) 62 | "SIM108", # Use ternary operator (sometimes if/else is clearer) 63 | "DTZ005", # datetime.now() without tz (okay for timestamps) 64 | "N999", # Invalid module name (web-bff is valid) 65 | "TID252", # Relative imports from parent (used in package structure) 66 | "RET504", # Unnecessary assignment before return (sometimes clearer) 67 | ] 68 | 69 | # Allow unused variables when prefixed with underscore 70 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 71 | 72 | [lint.per-file-ignores] 73 | # Ignore import violations in __init__ files 74 | "__init__.py" = ["E402", "F401", "F403"] 75 | 76 | # Ignore missing docstrings in tests 77 | "test_*.py" = ["D100", "D101", "D102", "D103", "D104"] 78 | "tests/*" = ["D100", "D101", "D102", "D103", "D104"] 79 | 80 | # Allow dynamic imports in recipe files 81 | "recipes/*" = ["F401", "F403"] 82 | 83 | [lint.isort] 84 | # Combine as imports 85 | combine-as-imports = true 86 | 87 | # Force single line imports 88 | force-single-line = false 89 | 90 | # Order imports by type 91 | section-order = [ 92 | "future", 93 | "standard-library", 94 | "third-party", 95 | "first-party", 96 | "local-folder", 97 | ] 98 | 99 | [lint.pydocstyle] 100 | # Use Google docstring convention 101 | convention = "google" 102 | -------------------------------------------------------------------------------- /plugins/omnimd-zhihu-reader/zhihu.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from importlib import resources 3 | from runpy import run_module 4 | from typing import override 5 | 6 | import requests 7 | from playwright.sync_api import Browser, Cookie, Playwright, sync_playwright 8 | 9 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl 10 | from omni_article_markdown.store import Store 11 | from omni_article_markdown.utils import REQUEST_HEADERS 12 | 13 | 14 | class ZhihuPlugin(ReaderPlugin): 15 | @override 16 | def can_handle(self, url: str) -> bool: 17 | return "zhihu.com" in url 18 | 19 | @override 20 | def read(self, url: str) -> str: 21 | store = Store() 22 | cookies_raw = store.load("zhihu_cookies") 23 | 24 | if not cookies_raw: 25 | print("未找到知乎登录信息,尝试模拟登录...") 26 | cookies_raw = self._get_zhihu_cookies(url) 27 | if not cookies_raw: 28 | raise Exception("无法获取知乎登录信息") 29 | 30 | cookies = 
self._convert_playwright_cookies_to_requests_dict(cookies_raw) 31 | response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies) 32 | 33 | # 如果初始请求失败,则尝试重新获取 cookie 并重试 34 | if response.status_code == 403: 35 | print("Cookie 失效,重新模拟登录知乎...") 36 | cookies_raw = self._get_zhihu_cookies(url) 37 | if not cookies_raw: 38 | raise Exception("重新模拟登录失败,无法访问知乎内容") 39 | cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw) 40 | response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies) 41 | 42 | response.encoding = "utf-8" 43 | return response.text 44 | 45 | def _get_zhihu_cookies(self, url: str) -> list[Cookie]: 46 | def try_launch_browser(p: Playwright) -> Browser: 47 | try: 48 | return p.chromium.launch(headless=True) 49 | except Exception as e: 50 | # Playwright not installed or browser missing 51 | if "Executable doesn't exist" in str(e) or "playwright install" in str(e): 52 | print("[INFO] Chromium not installed, installing with 'playwright install chromium'...") 53 | original_argv = sys.argv 54 | args = ["playwright", "install", "chromium"] 55 | sys.argv = args 56 | run_module("playwright", run_name="__main__") 57 | sys.argv = original_argv 58 | # Try again 59 | return p.chromium.launch(headless=True) 60 | raise # re-raise other exceptions 61 | 62 | with sync_playwright() as p: 63 | browser = try_launch_browser(p) 64 | context = browser.new_context( 65 | user_agent=REQUEST_HEADERS["User-Agent"], 66 | java_script_enabled=True, 67 | extra_http_headers=REQUEST_HEADERS, 68 | ) 69 | with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path: 70 | context.add_init_script(path=str(js_path)) 71 | page = context.new_page() 72 | page.goto(url, wait_until="networkidle") 73 | cookies = context.cookies() 74 | store = Store() 75 | store.save("zhihu_cookies", cookies) 76 | page.close() 77 | context.close() 78 | browser.close() 79 | return cookies 80 | 81 | def _convert_playwright_cookies_to_requests_dict(self, playwright_cookies: list[Cookie]) -> dict[str, str]: 82 | requests_cookies = {} 83 | for cookie in playwright_cookies: 84 | requests_cookies[cookie.get("name")] = cookie.get("value") 85 | return requests_cookies 86 | 87 | 88 | @hookimpl 89 | def get_custom_reader(url: str) -> ReaderPlugin | None: 90 | plugin_instance = ZhihuPlugin() 91 | if plugin_instance.can_handle(url): 92 | return plugin_instance 93 | return None 94 | -------------------------------------------------------------------------------- /plugins/omnimd-toutiao-reader/toutiao.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from importlib import resources 3 | from runpy import run_module 4 | from typing import override 5 | 6 | import requests 7 | from playwright.sync_api import Browser, Cookie, Playwright, sync_playwright 8 | 9 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl 10 | from omni_article_markdown.store import Store 11 | from omni_article_markdown.utils import REQUEST_HEADERS 12 | 13 | 14 | class ToutiaoPlugin(ReaderPlugin): 15 | @override 16 | def can_handle(self, url: str) -> bool: 17 | return "toutiao.com" in url 18 | 19 | @override 20 | def read(self, url: str) -> str: 21 | store = Store() 22 | cookies_raw = store.load("toutiao_cookies") 23 | 24 | if not cookies_raw: 25 | print("未找到头条登录信息,尝试模拟登录...") 26 | cookies_raw = self._get_toutiao_cookies(url) 27 | if not cookies_raw: 28 | raise Exception("无法获取头条登录信息") 29 | 30 | cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw) 
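        # Same overall flow as the zhihu reader above: try the request with the stored
        # cookies first; if Toutiao returns its "您需要允许该网站执行 JavaScript" placeholder
        # page instead of the article, refresh the cookies via headless Playwright and
        # retry the request once.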
31 | response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies) 32 | response.encoding = "utf-8" 33 | html = response.text 34 | 35 | # 如果初始请求失败,则尝试重新获取 cookie 并重试 36 | if "您需要允许该网站执行 JavaScript" in html: 37 | print("Cookie 失效,重新模拟登录头条...") 38 | cookies_raw = self._get_toutiao_cookies(url) 39 | if not cookies_raw: 40 | raise Exception("重新模拟登录失败,无法访问头条内容") 41 | cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw) 42 | response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies) 43 | 44 | response.encoding = "utf-8" 45 | return response.text 46 | 47 | def _get_toutiao_cookies(self, url: str) -> list[Cookie]: 48 | def try_launch_browser(p: Playwright) -> Browser: 49 | try: 50 | return p.chromium.launch(headless=True) 51 | except Exception as e: 52 | # Playwright not installed or browser missing 53 | if "Executable doesn't exist" in str(e) or "playwright install" in str(e): 54 | print("[INFO] Chromium not installed, installing with 'playwright install chromium'...") 55 | original_argv = sys.argv 56 | args = ["playwright", "install", "chromium"] 57 | sys.argv = args 58 | run_module("playwright", run_name="__main__") 59 | sys.argv = original_argv 60 | # Try again 61 | return p.chromium.launch(headless=True) 62 | raise # re-raise other exceptions 63 | 64 | with sync_playwright() as p: 65 | browser = try_launch_browser(p) 66 | context = browser.new_context( 67 | user_agent=REQUEST_HEADERS["User-Agent"], 68 | java_script_enabled=True, 69 | extra_http_headers=REQUEST_HEADERS, 70 | ) 71 | with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path: 72 | context.add_init_script(path=str(js_path)) 73 | page = context.new_page() 74 | page.goto(url, wait_until="networkidle") 75 | cookies = context.cookies() 76 | store = Store() 77 | store.save("toutiao_cookies", cookies) 78 | page.close() 79 | context.close() 80 | browser.close() 81 | return cookies 82 | 83 | def _convert_playwright_cookies_to_requests_dict(self, playwright_cookies: list[Cookie]) -> dict[str, str]: 84 | requests_cookies = {} 85 | for cookie in playwright_cookies: 86 | requests_cookies[cookie.get("name")] = cookie.get("value") 87 | return requests_cookies 88 | 89 | 90 | @hookimpl 91 | def get_custom_reader(url: str) -> ReaderPlugin | None: 92 | plugin_instance = ToutiaoPlugin() 93 | if plugin_instance.can_handle(url): 94 | return plugin_instance 95 | return None 96 | -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | from bs4.element import Tag 2 | 3 | from omni_article_markdown.extractor import ( 4 | Article, 5 | DefaultExtractor, 6 | extract_article_from_soup, 7 | remove_duplicate_titles, 8 | ) 9 | 10 | # ---- mock utils ---- 11 | 12 | def make_html(content: str, title="Page Title", description="Desc", url="https://example.com") -> str: 13 | return f""" 14 | 15 | 16 | {title} 17 | 18 | 19 | 20 | 21 | {content} 22 | 23 | 24 | """ 25 | 26 | 27 | def test_extract_article_from_soup_basic(make_soup): 28 | html = "
<article><p>Hello</p></article>
" 29 | soup = make_soup(html) 30 | tag = extract_article_from_soup(soup, ("article", None)) 31 | assert tag is not None 32 | assert tag.name == "article" 33 | assert "Hello" in tag.text 34 | 35 | 36 | def test_default_extractor_basic_behavior(make_soup): 37 | extractor = DefaultExtractor() 38 | html = make_html("
<article><p>Hello World</p></article>
") 39 | soup = make_soup(html) 40 | 41 | assert extractor.can_handle(soup) is True 42 | assert isinstance(extractor.article_container(), list | tuple) 43 | 44 | 45 | def test_cleaning_tags_and_attrs(make_soup): 46 | html = make_html(""" 47 |
<article>
48 |             <p>Visible</p>
49 |             <p hidden>Hidden</p>
50 |             <span style="display:none">Invisible</span>
51 |             <style>p { color:red }</style>
52 |             <!-- comment -->
53 |         </article>
54 | """) 55 | extractor = DefaultExtractor() 56 | article = extractor.extract(make_soup(html)) 57 | assert article is not None 58 | assert isinstance(article.body, Tag) 59 | text = article.body.get_text() 60 | # 不应包含隐藏元素、style、注释内容 61 | assert "Visible" in text 62 | assert "Hidden" not in text 63 | assert "Invisible" not in text 64 | assert "color:red" not in text 65 | 66 | 67 | def test_extract_metadata(make_soup): 68 | html = make_html("
<article><p>Body</p></article>
", title="Hello", description="A test desc", url="https://abc.com") 69 | soup = make_soup(html) 70 | extractor = DefaultExtractor() 71 | 72 | assert extractor.extract_title(soup) == "Hello" 73 | assert extractor.extract_description(soup) == "A test desc" 74 | assert extractor.extract_url(soup) == "https://abc.com" 75 | 76 | 77 | def test_remove_duplicate_titles(make_soup): 78 | html = "
<article><h1>Same Title</h1><p>Body text</p></article>
" 79 | soup = make_soup(html) 80 | article = Article(title="Same Title", url=None, description=None, body=soup.article) 81 | remove_duplicate_titles(article) 82 | 83 | # 标题应保持一致 84 | assert article.title == "Same Title" 85 | # H1 应被删除 86 | assert article.body.find("h1") is None 87 | 88 | 89 | def test_remove_duplicate_titles_different(make_soup): 90 | html = "
<article><h1>Other Title</h1><p>Body text</p></article>
" 91 | soup = make_soup(html) 92 | article = Article(title="Main Page", url=None, description=None, body=soup.article) 93 | remove_duplicate_titles(article) 94 | 95 | # 原标题不变,H1 保留 96 | assert article.title == "Main Page" 97 | assert article.body.find("h1") is not None 98 | 99 | 100 | class CustomExtractor(DefaultExtractor): 101 | def can_handle(self, soup): 102 | title = soup.title.text.strip() if soup.title else "" 103 | return "Special" in title 104 | 105 | def article_container(self): 106 | return ("body", None) 107 | 108 | 109 | def test_custom_extractor_can_handle(make_soup): 110 | html = make_html("
<p>Hello</p>
", title="Special Page") 111 | extractor = CustomExtractor() 112 | soup = make_soup(html) 113 | assert extractor.can_handle(soup) is True 114 | 115 | article = extractor.extract(soup) 116 | assert article is not None 117 | assert isinstance(article.body, Tag) 118 | assert "Hello" in article.body.text 119 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | from omni_article_markdown.extractor import Article 2 | from omni_article_markdown.parser import HtmlMarkdownParser 3 | 4 | 5 | def test_basic_paragraph(make_soup): 6 | html = "
<p>Hello world</p>
" 7 | article = Article("Test", "", "", make_soup(html)) 8 | parser = HtmlMarkdownParser(article) 9 | title, md = parser.parse() 10 | assert "# Test" in md 11 | assert "Hello world" in md 12 | 13 | 14 | def test_heading_and_strong(make_soup): 15 | html = "
<h2>Subtitle</h2><p><strong>bold</strong> and <em>italic</em></p>
" 16 | article = Article("Title", "", "", make_soup(html)) 17 | parser = HtmlMarkdownParser(article) 18 | _, md = parser.parse() 19 | assert "## Subtitle" in md 20 | assert "**bold**" in md 21 | assert "*italic*" in md 22 | 23 | 24 | def test_link_parsing(make_soup): 25 | html = '
<a href="https://example.com">Example</a>
' 26 | article = Article("Title", "", "", make_soup(html)) 27 | parser = HtmlMarkdownParser(article) 28 | _, md = parser.parse() 29 | assert "[Example](https://example.com)" in md 30 | 31 | 32 | def test_unordered_list(make_soup): 33 | html = "<ul><li>Apple</li><li>Banana</li></ul>" 34 | article = Article("Fruits", "", "", make_soup(html)) 35 | parser = HtmlMarkdownParser(article) 36 | _, md = parser.parse() 37 | assert "- Apple" in md 38 | assert "- Banana" in md 39 | 40 | 41 | def test_ordered_list(make_soup): 42 | html = "
<ol><li>One</li><li>Two</li></ol>
" 43 | article = Article("Numbers", "", "", make_soup(html)) 44 | parser = HtmlMarkdownParser(article) 45 | _, md = parser.parse() 46 | assert "1. One" in md 47 | assert "2. Two" in md 48 | 49 | 50 | def test_blockquote(make_soup): 51 | html = "
<blockquote><p>Quote me</p></blockquote>
" 52 | article = Article("Quote", "", "", make_soup(html)) 53 | parser = HtmlMarkdownParser(article) 54 | _, md = parser.parse() 55 | assert "> Quote me" in md 56 | 57 | 58 | def test_codeblock(make_soup): 59 | html = "
<pre><code>print('Hello')</code></pre>
" 60 | article = Article("Code", "", "", make_soup(html)) 61 | parser = HtmlMarkdownParser(article) 62 | _, md = parser.parse() 63 | assert "```" in md 64 | assert "print('Hello')" in md 65 | 66 | 67 | def test_inline_code(make_soup): 68 | html = "
<p>Run <code>ls -al</code> command.</p>
" 69 | article = Article("Cmd", "", "", make_soup(html)) 70 | parser = HtmlMarkdownParser(article) 71 | _, md = parser.parse() 72 | assert "`ls -al`" in md 73 | 74 | 75 | def test_image_absolute_url(make_soup): 76 | html = 'demo' 77 | article = Article("Img", "", "", make_soup(html)) 78 | parser = HtmlMarkdownParser(article) 79 | _, md = parser.parse() 80 | assert "![demo](https://example.com/image.png)" in md 81 | 82 | 83 | def test_image_relative_url(make_soup): 84 | html = 'demo' 85 | article = Article("Img", "https://site.com/docs/page.html", "", make_soup(html)) 86 | parser = HtmlMarkdownParser(article) 87 | _, md = parser.parse() 88 | assert "![demo](https://site.com/images/demo.png)" in md 89 | 90 | 91 | def test_table_parsing(make_soup): 92 | html = """ 93 | 94 | 95 | 96 | 97 |
        <table>
            <tr><th>Name</th><th>Age</th></tr>
            <tr><td>Alice</td><td>18</td></tr>
            <tr><td>Bob</td><td>20</td></tr>
        </table>
98 | """ 99 | article = Article("Table", "", "", make_soup(html)) 100 | parser = HtmlMarkdownParser(article) 101 | _, md = parser.parse() 102 | assert "| Name | Age |" in md 103 | assert "| Alice | 18 |" in md 104 | 105 | 106 | def test_mathjax_equations(make_soup): 107 | html = "<math><semantics><annotation encoding='application/x-tex'>E=mc^2</annotation></semantics></math>" 108 | article = Article("Math", "", "", make_soup(html)) 109 | parser = HtmlMarkdownParser(article) 110 | _, md = parser.parse() 111 | assert "$$ E=mc^2 $$" in md 112 | 113 | 114 | def test_post_handler_math(make_soup): 115 | html = "
<p>\\(x+y\\) and \\[E=mc^2\\]</p>
" 116 | article = Article("Math", "", "", make_soup(html)) 117 | parser = HtmlMarkdownParser(article) 118 | _, md = parser.parse() 119 | assert "$x+y$" in md 120 | assert "$$E=mc^2$$" in md 121 | -------------------------------------------------------------------------------- /src/omni_article_markdown/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from runpy import run_module 3 | 4 | import click 5 | from click_default_group import DefaultGroup 6 | 7 | from . import DEFAULT_PLUGINS 8 | from .omni_article_md import OmniArticleMarkdown 9 | 10 | 11 | @click.group(cls=DefaultGroup, default="parse", default_if_no_args=True) 12 | def cli(): 13 | """ 14 | A CLI tool to parse articles and save them as Markdown. 15 | It also supports installing plugins. 16 | """ 17 | ... 18 | 19 | 20 | @cli.command(name="parse") 21 | @click.argument("url_or_path") 22 | @click.option( 23 | "-s", 24 | "--save", 25 | help="Save result (default: ./). Provide a path to save elsewhere.", 26 | type=click.Path(dir_okay=True, writable=True), 27 | ) 28 | def parse_article(url_or_path: str, save: str | None): 29 | """ 30 | Parses an article from a URL or local path and outputs/saves it as Markdown. 31 | """ 32 | handler = OmniArticleMarkdown(url_or_path) 33 | parser_ctx = handler.parse() 34 | 35 | if save is None: 36 | click.echo(parser_ctx.markdown) 37 | else: 38 | save_path = handler.save(parser_ctx, save) 39 | click.echo(f"Article saved to: {save_path}") 40 | 41 | 42 | @cli.command() 43 | @click.argument("plugin_name") 44 | @click.option("-U", "--upgrade", is_flag=True, help="Upgrade the plugin if already installed.", default=False) 45 | @click.option( 46 | "-e", 47 | "--editable", 48 | is_flag=True, 49 | help="Install the editable package based on the provided local file path", 50 | default=False, 51 | ) 52 | def install(plugin_name: str, upgrade: bool, editable: bool): 53 | """ 54 | Installs a plugin for this application. 55 | For example, to install the 'zhihu' plugin: mdcli install zhihu 56 | """ 57 | actual_package_name = ( 58 | plugin_name if editable or plugin_name not in DEFAULT_PLUGINS else DEFAULT_PLUGINS[plugin_name] 59 | ) 60 | 61 | click.echo(f"Attempting to install plugin: {actual_package_name}...") 62 | args = ["pip", "install"] 63 | if upgrade: 64 | args.append("--upgrade") 65 | args.append(actual_package_name) 66 | 67 | original_argv = sys.argv 68 | try: 69 | sys.argv = args 70 | run_module("pip", run_name="__main__") 71 | click.echo(f"Plugin '{actual_package_name}' processed by pip.") 72 | click.echo("If the plugin provides new functionality, it should now be available.") 73 | click.echo( 74 | "You might need to restart the application for changes to take full effect if it involves runtime loading during startup." 75 | ) 76 | except Exception as e: 77 | click.echo(f"Failed to process plugin '{actual_package_name}' with pip: {e}", err=True) 78 | click.echo("Please ensure pip is installed and the package name is correct.", err=True) 79 | finally: 80 | sys.argv = original_argv 81 | 82 | 83 | @cli.command() 84 | @click.argument("plugin_name") 85 | @click.option("-y", "--yes", is_flag=True, help="Don't ask for confirmation before uninstalling.", default=False) 86 | def uninstall(plugin_name: str, yes: bool): 87 | """ 88 | Uninstalls a plugin for this application. 
89 | For example, to uninstall the 'zhihu' plugin: mdcli uninstall zhihu 90 | """ 91 | actual_package_name = DEFAULT_PLUGINS.get(plugin_name, plugin_name) 92 | 93 | click.echo(f"Attempting to uninstall plugin: {actual_package_name}...") 94 | args = ["pip", "uninstall"] 95 | if yes: 96 | args.append("-y") 97 | args.append(actual_package_name) 98 | 99 | original_argv = sys.argv 100 | try: 101 | sys.argv = args 102 | run_module("pip", run_name="__main__") 103 | click.echo(f"Plugin '{actual_package_name}' uninstallation processed by pip.") 104 | click.echo( 105 | "The plugin's functionality should no longer be available after the next application start (or if dynamically unloaded)." 106 | ) 107 | except Exception as e: 108 | click.echo(f"Failed to process uninstallation of plugin '{actual_package_name}' with pip: {e}", err=True) 109 | click.echo("Please ensure pip is installed and the package name is correct.", err=True) 110 | finally: 111 | sys.argv = original_argv 112 | 113 | 114 | if __name__ == "__main__": 115 | cli() 116 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from abc import ABC, abstractmethod 3 | from collections.abc import Callable 4 | from dataclasses import dataclass 5 | from typing import override 6 | 7 | from bs4 import BeautifulSoup 8 | from bs4.element import Comment, Tag 9 | 10 | from .utils import filter_tag, get_attr_text, get_canonical_url, get_og_description, get_og_title, get_og_url, get_title 11 | 12 | TAGS_TO_CLEAN: list[Callable[[Tag], bool]] = [ 13 | lambda el: el.name in ("style", "link", "button", "footer", "header"), 14 | lambda el: el.name == "script" and "src" not in el.attrs, 15 | lambda el: el.name == "script" 16 | and el.has_attr("src") 17 | and not get_attr_text(el.attrs["src"]).startswith("https://gist.github.com"), 18 | ] 19 | 20 | ATTRS_TO_CLEAN: list[Callable[[Tag], bool]] = [ 21 | lambda el: "style" in el.attrs 22 | and re.search(r"display\s*:\s*none", get_attr_text(el.attrs.get("style")), re.IGNORECASE) is not None, 23 | lambda el: "hidden" in el.attrs, 24 | lambda el: "class" in el.attrs and "katex-html" in el.attrs["class"], # katex 25 | ] 26 | 27 | ARTICLE_CONTAINERS = [("article", None), ("main", None), ("body", None)] 28 | 29 | 30 | @dataclass 31 | class Article: 32 | title: str 33 | url: str | None 34 | description: str | None 35 | body: Tag | str 36 | 37 | 38 | class Extractor(ABC): 39 | def __init__(self): 40 | self.tags_to_clean = TAGS_TO_CLEAN 41 | self.attrs_to_clean = ATTRS_TO_CLEAN 42 | 43 | def extract(self, soup: BeautifulSoup) -> Article | None: 44 | if self.can_handle(soup): 45 | # print(f"Using extractor: {self.__class__.__name__}") 46 | soup = self.pre_handle_soup(soup) 47 | article_container = self.article_container() 48 | if isinstance(article_container, tuple): 49 | article_container = [article_container] 50 | for container in article_container: 51 | article = self.extract_article(soup) 52 | if article: 53 | return article 54 | article_tag = extract_article_from_soup(soup, container) 55 | if article_tag: 56 | for el in article_tag.find_all(): 57 | tag = filter_tag(el) 58 | if tag: 59 | if any(cond(tag) for cond in self.tags_to_clean): 60 | tag.decompose() 61 | continue 62 | if tag.attrs and any(cond(tag) for cond in self.attrs_to_clean): 63 | tag.decompose() 64 | for comment in article_tag.find_all(string=lambda text: isinstance(text, Comment)): 65 | 
comment.extract() 66 | self.extract_img(article_tag) 67 | title = self.extract_title(soup) 68 | description = self.extract_description(soup) 69 | url = self.extract_url(soup) 70 | article = Article(title=title, url=url, description=description, body=article_tag) 71 | remove_duplicate_titles(article) 72 | return article 73 | return None 74 | 75 | @abstractmethod 76 | def can_handle(self, soup: BeautifulSoup) -> bool: ... 77 | 78 | @abstractmethod 79 | def article_container(self) -> tuple | list: ... 80 | 81 | def extract_title(self, soup: BeautifulSoup) -> str: 82 | return get_og_title(soup) or get_title(soup) 83 | 84 | def extract_description(self, soup: BeautifulSoup) -> str: 85 | return get_og_description(soup) 86 | 87 | def extract_url(self, soup: BeautifulSoup) -> str: 88 | return get_og_url(soup) or get_canonical_url(soup) 89 | 90 | def extract_img(self, element: Tag) -> Tag: 91 | return element 92 | 93 | def extract_article(self, soup: BeautifulSoup) -> Article | None: 94 | return None 95 | 96 | def pre_handle_soup(self, soup: BeautifulSoup) -> BeautifulSoup: 97 | return soup 98 | 99 | 100 | class DefaultExtractor(Extractor): 101 | @override 102 | def can_handle(self, soup: BeautifulSoup) -> bool: 103 | return True 104 | 105 | @override 106 | def article_container(self) -> tuple | list: 107 | return ARTICLE_CONTAINERS 108 | 109 | 110 | def extract_article_from_soup(soup: BeautifulSoup, template: tuple) -> Tag | None: 111 | if template[1] is not None: 112 | result = soup.find(template[0], attrs=template[1]) 113 | else: 114 | result = soup.find(template[0]) 115 | return filter_tag(result) 116 | 117 | 118 | def remove_duplicate_titles(article: Article): 119 | if article.body and isinstance(article.body, Tag): 120 | first_h1 = article.body.find("h1") 121 | if first_h1: 122 | h1_text = first_h1.get_text(strip=True) 123 | if h1_text.lower() in article.title.lower(): 124 | article.title = h1_text 125 | first_h1.decompose() 126 | -------------------------------------------------------------------------------- /src/omni_article_markdown/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlparse 3 | 4 | from bs4 import BeautifulSoup 5 | from bs4.element import AttributeValueList, NavigableString, PageElement, Tag 6 | 7 | REQUEST_HEADERS = { 8 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0", 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", 12 | "Priority": "u=0, i", 13 | "Sec-Ch-Ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"', 14 | "Sec-Ch-Ua-Mobile": "?0", 15 | "Sec-Ch-Ua-Platform": '"macOS"', 16 | "Sec-Fetch-Dest": "document", 17 | "Sec-Fetch-Mode": "navigate", 18 | "Sec-Fetch-Site": "none", 19 | "Sec-Fetch-User": "?1", 20 | "Upgrade-Insecure-Requests": "1", 21 | } 22 | 23 | BROWSER_TARGET_HOSTS = [ 24 | "developer.apple.com/documentation/", 25 | "www.infoq.cn/", 26 | "pcsx2.net/", 27 | "baijiahao.baidu.com/", 28 | ] 29 | 30 | def is_sequentially_increasing(code: str) -> bool: 31 | try: 32 | # 解码并按换行符拆分 33 | numbers = [int(line.strip()) for line in code.split("\n") if line.strip()] 34 | # 检查是否递增 35 | return all(numbers[i] + 1 == numbers[i + 1] for i in 
range(len(numbers) - 1)) 36 | except ValueError: 37 | return False # 处理非数字情况 38 | 39 | 40 | def move_spaces(input_string: str, suffix: str) -> str: 41 | # 使用正则表达式匹配以指定的suffix结尾,且suffix之前有空格的情况 42 | escaped_suffix = re.escape(suffix) # 处理正则中的特殊字符 43 | pattern = rf"(.*?)\s+({escaped_suffix})$" 44 | match = re.search(pattern, input_string) 45 | if match: 46 | # 获取字符串的主体部分(不含空格)和尾部的 '**' 47 | main_part = match.group(1) 48 | stars = match.group(2) 49 | # 计算空格的数量并将空格移动到 '**' 后 50 | space_count = len(input_string) - len(main_part) - len(stars) 51 | return f"{main_part}{stars}{' ' * space_count}" 52 | return input_string 53 | 54 | 55 | def to_snake_case(input_string: str) -> str: 56 | input_string = "".join(char if char.isalnum() else " " for char in input_string) 57 | snake_case_string = "_".join(word.lower() for word in input_string.split()) 58 | return snake_case_string 59 | 60 | 61 | def collapse_spaces(text) -> str: 62 | """ 63 | 将多个连续空格(包括换行和 Tab)折叠成一个空格。 64 | """ 65 | return re.sub(r"\s+", " ", text) 66 | 67 | 68 | def extract_domain(url: str) -> str | None: 69 | """ 70 | 从URL中提取域名(包含协议)。 71 | 72 | Args: 73 | url (str): 要提取域名的URL。 74 | 75 | Returns: 76 | str | None: 提取出的域名(包含协议),如果解析失败或协议不支持则返回 None。 77 | """ 78 | try: 79 | parsed_url = urlparse(url) 80 | if parsed_url.scheme in {"http", "https"} and parsed_url.netloc: 81 | return f"{parsed_url.scheme}://{parsed_url.netloc}".rstrip("/") 82 | return None # 返回 None 表示 URL 格式不符合要求或协议不支持 83 | 84 | except ValueError: 85 | return None # 如果 URL 格式无效,则返回 None 86 | 87 | 88 | def detect_language(file_name: str | None, code: str) -> str: 89 | # TODO: 添加语言检测逻辑 90 | return "" 91 | 92 | 93 | def filter_tag(el: Tag | PageElement | NavigableString | None) -> Tag | None: 94 | if el is None or not isinstance(el, Tag): 95 | return None 96 | return el 97 | 98 | 99 | def get_attr_text(el: str | AttributeValueList | None) -> str: 100 | if el is None: 101 | return "" 102 | if isinstance(el, str): 103 | return el.strip() 104 | return " ".join(el).strip() 105 | 106 | 107 | def get_og_url(soup: BeautifulSoup) -> str: 108 | og_tag = filter_tag(soup.find("meta", {"property": "og:url"})) 109 | return get_tag_text(og_tag, "content") 110 | 111 | 112 | def get_og_site_name(soup: BeautifulSoup) -> str: 113 | og_tag = filter_tag(soup.find("meta", {"property": "og:site_name"})) 114 | return get_tag_text(og_tag, "content") 115 | 116 | 117 | def get_og_description(soup: BeautifulSoup) -> str: 118 | og_tag = filter_tag(soup.find("meta", {"property": "og:description"})) 119 | return get_tag_text(og_tag, "content") 120 | 121 | 122 | def get_canonical_url(soup: BeautifulSoup) -> str: 123 | canonical_tag = filter_tag(soup.find("link", {"rel": "canonical"})) 124 | return get_tag_text(canonical_tag, "href") 125 | 126 | 127 | def is_matched_canonical(url: str, soup: BeautifulSoup) -> bool: 128 | canonical = get_canonical_url(soup) 129 | if not canonical: 130 | return False 131 | return canonical.startswith(url) 132 | 133 | 134 | def get_og_title(soup: BeautifulSoup) -> str: 135 | og_tag = filter_tag(soup.find("meta", {"property": "og:title"})) 136 | return get_tag_text(og_tag, "content") 137 | 138 | 139 | def get_tag_text(tag: Tag | None, attr: str) -> str: 140 | if tag is not None and tag.has_attr(attr): 141 | el = tag[attr] 142 | return get_attr_text(el) 143 | return "" 144 | 145 | 146 | def get_title(soup: BeautifulSoup) -> str: 147 | title_tag = soup.title 148 | return title_tag.get_text(strip=True) if title_tag else "" 149 | 
-------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from bs4.element import AttributeValueList, NavigableString 3 | 4 | from omni_article_markdown.utils import ( 5 | collapse_spaces, 6 | detect_language, 7 | extract_domain, 8 | filter_tag, 9 | get_attr_text, 10 | get_canonical_url, 11 | get_og_description, 12 | get_og_site_name, 13 | get_og_title, 14 | get_og_url, 15 | get_tag_text, 16 | get_title, 17 | is_matched_canonical, 18 | is_sequentially_increasing, 19 | move_spaces, 20 | to_snake_case, 21 | ) 22 | 23 | 24 | # -------------------------- 25 | # 测试 is_sequentially_increasing 26 | # -------------------------- 27 | def test_is_sequentially_increasing_true(): 28 | code = "1\n2\n3\n4" 29 | assert is_sequentially_increasing(code) is True 30 | 31 | 32 | def test_is_sequentially_increasing_false(): 33 | code = "1\n3\n5" 34 | assert is_sequentially_increasing(code) is False 35 | 36 | 37 | def test_is_sequentially_increasing_non_numeric(): 38 | code = "a\nb\nc" 39 | assert is_sequentially_increasing(code) is False 40 | 41 | 42 | # -------------------------- 43 | # move_spaces 44 | # -------------------------- 45 | def test_move_spaces(): 46 | assert move_spaces("**hello **", "**") == "**hello** " 47 | assert move_spaces("**hello **", "**") == "**hello** " 48 | assert move_spaces("**hello world**", "**") == "**hello world**" 49 | 50 | 51 | # -------------------------- 52 | # to_snake_case 53 | # -------------------------- 54 | def test_to_snake_case(): 55 | assert to_snake_case("HelloWorld") == "helloworld" 56 | assert to_snake_case("Hello World!") == "hello_world" 57 | assert to_snake_case("Already_snake_case") == "already_snake_case" 58 | 59 | 60 | # -------------------------- 61 | # collapse_spaces 62 | # -------------------------- 63 | def test_collapse_spaces(): 64 | assert collapse_spaces("a b\tc\nd") == "a b c d" 65 | 66 | 67 | # -------------------------- 68 | # extract_domain 69 | # -------------------------- 70 | def test_extract_domain(): 71 | assert extract_domain("https://example.com/path?q=1") == "https://example.com" 72 | assert extract_domain("http://abc.xyz") == "http://abc.xyz" 73 | assert extract_domain("ftp://example.com") is None 74 | assert extract_domain("not_a_url") is None 75 | 76 | 77 | # -------------------------- 78 | # detect_language 79 | # -------------------------- 80 | def test_detect_language_placeholder(): 81 | assert detect_language("file.py", "print('hi')") == "" 82 | 83 | 84 | # -------------------------- 85 | # filter_tag 86 | # -------------------------- 87 | def test_filter_tag_with_tag(make_soup): 88 | soup = make_soup("
") 89 | el = soup.div 90 | assert filter_tag(el) == el 91 | 92 | 93 | def test_filter_tag_with_none_or_text(): 94 | text_node = NavigableString("text") 95 | assert filter_tag(None) is None 96 | assert filter_tag(text_node) is None 97 | 98 | 99 | # -------------------------- 100 | # get_attr_text 101 | # -------------------------- 102 | def test_get_attr_text(): 103 | assert get_attr_text(" hello ") == "hello" 104 | assert get_attr_text(AttributeValueList(["a", "b", "c"])) == "a b c" 105 | assert get_attr_text(None) == "" 106 | 107 | 108 | # -------------------------- 109 | # meta/og tag 相关 110 | # -------------------------- 111 | HTML_DOC = """ 112 | 113 | 114 | Example Title 115 | 116 | 117 | 118 | 119 | 120 | 121 |
<article><p>Hello</p></article>
122 | 123 | """ 124 | 125 | 126 | @pytest.fixture 127 | def soup(make_soup): 128 | return make_soup(HTML_DOC) 129 | 130 | 131 | def test_get_og_url(soup): 132 | assert get_og_url(soup) == "https://example.com/page" 133 | 134 | 135 | def test_get_og_site_name(soup): 136 | assert get_og_site_name(soup) == "Example Site" 137 | 138 | 139 | def test_get_og_description(soup): 140 | assert get_og_description(soup) == "This is a description." 141 | 142 | 143 | def test_get_og_title(soup): 144 | assert get_og_title(soup) == "OG Title" 145 | 146 | 147 | def test_get_canonical_url(soup): 148 | assert get_canonical_url(soup) == "https://example.com/page" 149 | 150 | 151 | def test_is_matched_canonical_true(soup): 152 | assert is_matched_canonical("https://example.com", soup) is True 153 | 154 | 155 | def test_is_matched_canonical_false(soup): 156 | assert is_matched_canonical("https://other.com", soup) is False 157 | 158 | 159 | # -------------------------- 160 | # get_tag_text 161 | # -------------------------- 162 | def test_get_tag_text(make_soup): 163 | soup = make_soup('') 164 | tag = filter_tag(soup.find("meta")) 165 | assert get_tag_text(tag, "content") == "abc" 166 | assert get_tag_text(tag, "missing") == "" 167 | assert get_tag_text(None, "content") == "" 168 | 169 | 170 | # -------------------------- 171 | # get_title 172 | # -------------------------- 173 | def test_get_title(soup): 174 | assert get_title(soup) == "Example Title" 175 | 176 | 177 | def test_get_title_no_title(make_soup): 178 | soup = make_soup("") 179 | assert get_title(soup) == "" 180 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 墨探 (omni-article-markdown) 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/omni-article-markdown)](https://pypi.org/project/omni-article-markdown/) 4 | ![Python](https://img.shields.io/pypi/pyversions/omni-article-markdown) 5 | [![License](https://img.shields.io/github/license/caol64/omni-article-markdown)](LICENSE) 6 | [![PyPI Downloads](https://static.pepy.tech/personalized-badge/omni-article-markdown?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/omni-article-markdown) 7 | [![Stars](https://img.shields.io/github/stars/caol64/omni-article-markdown?style=social)](https://github.com/caol64/omni-article-markdown) 8 | 9 | 轻松将网页文章(博客、新闻、文档等)转换为 `Markdown` 格式。 10 | 11 | ![](data/1.gif) 12 | 13 | --- 14 | 15 | ## 项目简介 16 | 墨探的开发初衷,是为了解决一个问题:如何将来自互联网上各种不同网站的文章内容,精准且高效地转换成统一的Markdown格式。 17 | 18 | 众所周知,万维网上的网站设计风格迥异,其HTML结构也呈现出千差万别的特点。这种多样性给自动化内容提取和格式转换带来了巨大的困难。要实现一个能够适应各种复杂HTML结构的通用解决方案,并非易事。 19 | 20 | 我的想法是:从特定的网站开始适配,以点到面,逐步抽取出通用的解决方案,最后尽可能多的覆盖更多网站。 21 | 22 | --- 23 | 24 | ## 功能介绍 25 | 26 | - 支持大部分 html 元素转换 27 | - 部分页面支持katex公式转换(示例:[https://quantum.country/qcvc](https://quantum.country/qcvc)) 28 | - 部分页面支持github gist(示例:[https://towardsdatascience.com/hands-on-multi-agent-llm-restaurant-simulation-with-python-and-openai](https://towardsdatascience.com/hands-on-multi-agent-llm-restaurant-simulation-with-python-and-openai)) 29 | - 支持保存成文件或输出至`stdout` 30 | - 支持突破某些网站的防爬虫策略(需安装插件) 31 | 32 | 以下是一些网站示例,大家可以自己测试下效果。 33 | 34 | |站点|链接|备注| 35 | --|--|-- 36 | |Medium|[link](https://medium.com/perry-street-software-engineering/architectural-linting-for-swift-made-easy-75d7f9f569cd)|| 37 | |csdn|[link](https://blog.csdn.net/weixin_41705306/article/details/148787220)|| 38 | 
|掘金|[link](https://juejin.cn/post/7405845617282449462)|| 39 | |知乎专栏|[link](https://zhuanlan.zhihu.com/p/1915735485801828475)|需安装zhihu插件| 40 | |公众号|[link](https://mp.weixin.qq.com/s/imHIKy7dqMmpm032eIhIJg)|| 41 | |今日头条|[link](https://www.toutiao.com/article/7283050053155947062/)|需安装toutiao插件| 42 | |网易|[link](https://www.163.com/dy/article/K2SPPGSK0514R9KE.html)|| 43 | |简书|[link](https://www.jianshu.com/p/20bd2e9b1f03)|| 44 | |Freedium|[link](https://freedium.cfd/https://medium.com/@devlink/ai-killed-my-coding-brain-but-im-rebuilding-it-8de7e1618bca)|需安装freedium插件| 45 | |Towards Data Science|[link](https://towardsdatascience.com/hands-on-multi-agent-llm-restaurant-simulation-with-python-and-openai/)|| 46 | |Quantamagazine|[link](https://www.quantamagazine.org/matter-vs-force-why-there-are-exactly-two-types-of-particles-20250623/)|| 47 | |苹果开发者文档|[link](https://developer.apple.com/documentation/technologyoverviews/adopting-liquid-glass)|需安装browser插件| 48 | |Cloudflare博客|[link](https://blog.cloudflare.com/20-percent-internet-upgrade/)|| 49 | |阿里云开发者社区|[link](https://developer.aliyun.com/article/791514)|| 50 | |微软技术文档|[link](https://learn.microsoft.com/en-us/dotnet/ai/get-started-app-chat-template)|| 51 | |InfoQ|[link](https://www.infoq.com/articles/ai-ml-data-engineering-trends-2025/)|| 52 | |博客园|[link](https://www.cnblogs.com/hez2010/p/19097937/runtime-async)|| 53 | |思否|[link](https://segmentfault.com/a/1190000047273730)|| 54 | |开源中国|[link](https://my.oschina.net/SeaTunnel/blog/18694930)|| 55 | |Forbes|[link](https://www.forbes.com/sites/danalexander/2025/10/10/trump-is-now-one-of-americas-biggest-bitcoin-investors/)|| 56 | |少数派|[link](https://sspai.com/post/102861)|| 57 | |语雀|[link](https://www.yuque.com/yuque/ng1qth/about)|| 58 | |腾讯云开发者社区|[link](https://cloud.tencent.com/developer/article/2571935)|| 59 | |百家号|[link](https://baijiahao.baidu.com/s?id=1846135703319246634)|需安装browser插件| 60 | |人人都是产品经理|[link](https://www.woshipm.com/data-analysis/6276761.html)|| 61 | |Jetbrains博客|[link](https://blog.jetbrains.com/teamcity/2025/10/the-state-of-cicd/)|| 62 | |Claude文档|[link](https://docs.claude.com/en/docs/build-with-claude/prompt-caching)|| 63 | |Anthropic|[link](https://www.anthropic.com/news/claude-sonnet-4-5)|| 64 | |Meta博客|[link](https://engineering.fb.com/2025/10/06/developer-tools/openzl-open-source-format-aware-compression-framework/)|| 65 | |Android Developers Blog|[link](https://android-developers.googleblog.com/2025/11/jetpack-navigation-3-is-stable.html)|| 66 | 67 | --- 68 | 69 | ## 快速开始 70 | 71 | 1. 安装 72 | 73 | ```sh 74 | pip install omni-article-markdown 75 | ``` 76 | 77 | 2. 
运行说明 78 | 79 | **仅转换** 80 | 81 | ```sh 82 | mdcli https://example.com 83 | ``` 84 | 85 | **保存到当前目录** 86 | 87 | ```sh 88 | mdcli https://example.com -s 89 | ``` 90 | 91 | **保存到指定路径** 92 | 93 | ```sh 94 | mdcli https://example.com -s /home/user/ 95 | ``` 96 | 97 | --- 98 | 99 | ## 插件机制 100 | 101 | [「墨探」是如何使用插件机制构建可扩展架构的](https://babyno.top/posts/2025/06/a-deep-dive-into-the-extensible-architecture-of-omni-article-markdown/) 102 | 103 | **安装插件** 104 | 105 | 安装插件和`pip`命令格式相同: 106 | 107 | ```sh 108 | mdcli install [-U] [-e] 109 | ``` 110 | 111 | **示例:安装知乎解析插件** 112 | 113 | ```sh 114 | mdcli install zhihu 115 | ``` 116 | 117 | 或者,你可以使用 `-e` 参数安装本地可编辑的包。 118 | 119 | ```sh 120 | mdcli install -e "./plugins/omnimd-zhihu-reader" 121 | ``` 122 | 123 | **升级插件** 124 | 125 | ```sh 126 | mdcli install zhihu -U 127 | ``` 128 | 129 | **卸载插件** 130 | 131 | 如果你想移除一个已安装的插件,可以使用 `mdcli` 提供的 `uninstall` 命令。 132 | 133 | ```sh 134 | mdcli uninstall zhihu 135 | ``` 136 | 137 | 或者,使用插件的全称删除 138 | 139 | ```sh 140 | mdcli uninstall omnimd-zhihu-reader 141 | ``` 142 | 143 | **已支持的插件** 144 | 145 | 目前已发布4个插件,你可以按需安装: 146 | 147 | | 命令 | 说明 | 148 | |----------------------------------|----------------------------------------------------------| 149 | | `mdcli install zhihu` | 知乎专栏 | 150 | | `mdcli install toutiao` | 今日头条 | 151 | | `mdcli install freedium` | Freedium | 152 | | `mdcli install browser` | 需要启用浏览器的JS功能才能访问的页面(如Apple Developer Documentation) | 153 | 154 | **开发自己的插件** 155 | 156 | 文档编写中。 157 | 158 | --- 159 | 160 | ## 使用Docker镜像 161 | 162 | **直接下载** 163 | 164 | 暂未提供 165 | 166 | **自行构建** 167 | 168 | ``` 169 | docker build --build-arg PYPI_REGISTRY=https://pypi.tuna.tsinghua.edu.cn/simple -t omni-article-markdown . 170 | # 可以指定PYPI镜像源,默认使用官方源 171 | ``` 172 | 173 | 现在你可以在`docker`容器中使用墨探了: 174 | 175 | ``` 176 | docker run --rm omni-article-markdown /path/to/your/url 177 | ``` 178 | 179 | ## 架构说明 180 | 181 | ![](data/1.jpg) 182 | 183 | 墨探主要分为三个模块: 184 | 185 | - **Reader** 模块的功能是读取整个网页内容 186 | - **Extractor** 模块的功能是提取正文内容,清理无用数据 187 | - **Parser** 模块的功能是将 HTML 转换为 Markdown 188 | 189 | --- 190 | 191 | ## 贡献与反馈 192 | - 发现解析问题?欢迎提交 [Issue](https://github.com/caol64/omni-article-markdown/issues) 193 | - 改进解析?欢迎贡献 [Pull Request](https://github.com/caol64/omni-article-markdown/pulls) 194 | - 开发插件?文档正在筹备中 195 | 196 | --- 197 | 198 | ## 赞助 199 | 200 | 如果您觉得不错,可以给我家猫咪买点罐头吃。[喂猫❤️](https://yuzhi.tech/sponsor) 201 | 202 | --- 203 | 204 | ## License 205 | 206 | MIT License 207 | -------------------------------------------------------------------------------- /src/omni_article_markdown/parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from collections.abc import Callable 3 | from urllib.parse import urljoin 4 | 5 | import requests 6 | from bs4.element import NavigableString, Tag 7 | 8 | from .extractor import Article 9 | from .utils import ( 10 | collapse_spaces, 11 | detect_language, 12 | filter_tag, 13 | get_attr_text, 14 | is_sequentially_increasing, 15 | move_spaces, 16 | ) 17 | 18 | LB_SYMBOL = "[|lb_bl|]" 19 | 20 | POST_HANDLERS: list[Callable[[str], str]] = [ 21 | # 添加换行使文章更美观 22 | lambda el: re.sub(f"(?:{re.escape(LB_SYMBOL)})+", LB_SYMBOL, el).replace(LB_SYMBOL, "\n\n").strip(), 23 | # 纠正不规范格式 `**code**` 替换为 **`code`** 24 | lambda el: re.sub(r"`\*\*(.*?)\*\*`", r"**`\1`**", el), 25 | # 纠正不规范格式 `*code*` 替换为 *`code`* 26 | lambda el: re.sub(r"`\*(.*?)\*`", r"*`\1`*", el), 27 | # 纠正不规范格式 `[code](url)` 替换为 [`code`](url) 28 | lambda el: 
re.sub(r"`\s*\[([^\]]+)\]\(([^)]+)\)\s*`", r"[`\1`](\2)", el), 29 | # 将 \( ... \) 替换为 $ ... $ 30 | lambda el: re.sub(r"\\\((.+?)\\\)", r"$\1$", el), 31 | # 将 \[ ... \] 替换为 $$ ... $$ 32 | lambda el: re.sub(r"\\\[(.+?)\\\]", r"$$\1$$", el), 33 | ] 34 | 35 | INLINE_ELEMENTS = ["span", "code", "li", "a", "strong", "em", "b", "i"] 36 | 37 | BLOCK_ELEMENTS = [ 38 | "p", 39 | "h1", 40 | "h2", 41 | "h3", 42 | "h4", 43 | "h5", 44 | "h6", 45 | "ul", 46 | "ol", 47 | "blockquote", 48 | "pre", 49 | "img", 50 | "picture", 51 | "hr", 52 | "figcaption", 53 | "table", 54 | "section", 55 | ] 56 | 57 | TRUSTED_ELEMENTS = INLINE_ELEMENTS + BLOCK_ELEMENTS 58 | 59 | 60 | class HtmlMarkdownParser: 61 | def __init__(self, article: Article): 62 | self.article = article 63 | 64 | def parse(self) -> tuple[str, str]: 65 | if isinstance(self.article.body, str): 66 | markdown = self.article.body 67 | else: 68 | markdown = self._process_children(self.article.body) 69 | for handler in POST_HANDLERS: 70 | markdown = handler(markdown) 71 | if not self.article.description or self.article.description in markdown: 72 | description = "" 73 | else: 74 | description = f"> {self.article.description}\n\n" 75 | result = f"# {self.article.title}\n\n{description}{markdown}" 76 | # print(result) 77 | return (self.article.title, result) 78 | 79 | def _process_element(self, element: Tag, level: int = 0, is_pre: bool = False) -> str: 80 | parts = [] 81 | tag = element.name 82 | match tag: 83 | case "br": 84 | parts.append(LB_SYMBOL) 85 | case "hr": 86 | parts.append("---") 87 | case "h1" | "h2" | "h3" | "h4" | "h5" | "h6": 88 | heading = self._process_children(element, level, is_pre=is_pre) 89 | parts.append(f"{'#' * int(element.name[1])} {heading}") 90 | case "a": 91 | link = self._process_children(element, level, is_pre=is_pre).replace(LB_SYMBOL, "") 92 | if link: 93 | parts.append(f"[{link}]({element.get('href')})") 94 | case "strong" | "b": 95 | s = self._process_children(element, level, is_pre=is_pre).replace(LB_SYMBOL, "") 96 | if s: 97 | parts.append(move_spaces(f"**{s}**", "**")) 98 | case "em" | "i": 99 | s = self._process_children(element, level, is_pre=is_pre).replace(LB_SYMBOL, "") 100 | if s: 101 | parts.append(move_spaces(f"*{s}*", "*")) 102 | case "ul" | "ol": 103 | parts.append(self._process_list(element, level)) 104 | case "img": 105 | parts.append(self._process_image(element, None)) 106 | case "blockquote": 107 | blockquote = self._process_children(element, level, is_pre=is_pre) 108 | if blockquote.startswith(LB_SYMBOL): 109 | blockquote = blockquote.removeprefix(LB_SYMBOL) 110 | if blockquote.endswith(LB_SYMBOL): 111 | blockquote = blockquote.removesuffix(LB_SYMBOL) 112 | parts.append("\n".join(f"> {line}" for line in blockquote.split(LB_SYMBOL))) 113 | case "pre": 114 | parts.append(self._process_codeblock(element, level)) 115 | case "code": # inner code 116 | code = self._process_children(element, level, is_pre=is_pre) 117 | if LB_SYMBOL not in code: 118 | parts.append(f"`{code}`") 119 | else: 120 | parts.append(code) 121 | case "picture": 122 | source_elements = element.find_all("source") 123 | img_element = filter_tag(element.find("img")) 124 | if img_element and source_elements: 125 | el = source_elements[0] 126 | src_el = filter_tag(el) 127 | if src_el: 128 | parts.append(self._process_image(img_element, src_el)) 129 | elif img_element: 130 | parts.append(self._process_image(img_element, None)) 131 | case "figcaption": 132 | figcaption = self._process_children(element, level, is_pre=is_pre).replace(LB_SYMBOL, 
"\n").strip() 133 | figcaptions = figcaption.replace("\n\n", "\n").split("\n") 134 | parts.append("\n".join([f"*{caption}*" for caption in figcaptions])) 135 | case "table": 136 | parts.append(self._process_table(element, level)) 137 | case "math": # 处理latex公式 138 | semantics = filter_tag(element.find("semantics")) 139 | if semantics: 140 | tex = filter_tag(semantics.find(attrs={"encoding": "application/x-tex"})) 141 | if tex: 142 | parts.append(f"$$ {tex.text} $$") 143 | case "script": # 处理github gist 144 | parts.append(self._process_gist(element)) 145 | case _: 146 | parts.append(self._process_children(element, level, is_pre=is_pre)) 147 | result = "".join(parts) 148 | if result and is_block_element(element.name) and (not element.children or not is_pure_block_children(element)): 149 | result = f"{LB_SYMBOL}{result}{LB_SYMBOL}" 150 | return result 151 | 152 | def _process_children(self, element: Tag, level: int = 0, is_pre: bool = False) -> str: 153 | parts = [] 154 | if element.children: 155 | # new_level = level + 1 if element.name in HtmlMarkdownParser.TRUSTED_ELEMENTS else level 156 | for child in element.children: 157 | if isinstance(child, NavigableString): 158 | if is_pre: 159 | parts.append(child) 160 | else: 161 | result = collapse_spaces(child).replace("<", "<").replace(">", ">") 162 | if result.strip(): 163 | parts.append(result) 164 | # print(element.name, level, result) 165 | elif isinstance(child, Tag): 166 | result = self._process_element(child, level, is_pre=is_pre) 167 | if is_pre or result.strip(): 168 | parts.append(result) 169 | return "".join(parts) if is_pre or level > 0 else "".join(parts) 170 | 171 | def _process_list(self, element: Tag, level: int) -> str: 172 | indent = " " * level 173 | child_list = element.find_all(recursive=False) 174 | is_ol = element.name == "ol" 175 | parts = [] 176 | for i, child in enumerate(child_list): 177 | child = filter_tag(child) 178 | if child: 179 | if child.name == "li": 180 | content = self._process_children(child, level).replace(LB_SYMBOL, "\n").strip() 181 | if content: # 忽略空内容 182 | prefix = f"{i + 1}." 
if is_ol else "-" 183 | parts.append(f"{indent}{prefix} {content}") 184 | elif child.name == "ul" or child.name == "ol": 185 | content = self._process_element(child, level + 1) 186 | if content: # 忽略空内容 187 | parts.append(f"{content.replace(LB_SYMBOL, '\n')}") 188 | if not parts: 189 | return "" # 所有内容都为空则返回空字符串 190 | return "\n".join(parts) 191 | 192 | def _process_codeblock(self, element: Tag, level: int) -> str: 193 | # 找出所有 code 标签(可能为 0 个、1 个或多个) 194 | code_elements = element.find_all("code") or [element] 195 | 196 | # 处理每一个 code 标签并拼接 197 | code_parts = [ 198 | self._process_children(code_el, level, is_pre=True).replace(LB_SYMBOL, "\n") 199 | for code_el in code_elements 200 | if isinstance(code_el, Tag) 201 | ] 202 | code = "\n".join(code_parts).strip() 203 | 204 | if is_sequentially_increasing(code): 205 | return "" # 忽略行号 206 | 207 | # 尝试提取语言:从第一个 code 标签的 class 中提取 language 208 | first_code_el = code_elements[0] 209 | language = ( 210 | next((cls.split("-")[1] for cls in (first_code_el.get("class") or []) if cls.startswith("language-")), "") 211 | if isinstance(first_code_el, Tag) 212 | else "" 213 | ) 214 | if not language: 215 | language = detect_language(None, code) 216 | return f"```{language}\n{code}\n```" if language else f"```\n{code}\n```" 217 | 218 | def _process_table(self, element: Tag, level: int) -> str: 219 | if element.find("pre"): 220 | return self._process_children(element, level) 221 | # 获取所有行,包括 thead 和 tbody 222 | rows = element.find_all("tr") 223 | if not rows: 224 | return "" 225 | # 解析表头(如果有) 226 | headers = [] 227 | first_row = filter_tag(rows.pop(0)) 228 | if first_row and first_row.find("th"): 229 | headers = [th.get_text(strip=True) for th in first_row.find_all("th")] 230 | # 解析表身 231 | body = [[td.get_text(strip=True) for td in row.find_all("td")] for row in rows if isinstance(row, Tag)] 232 | # 处理缺失的表头 233 | if not headers and body: 234 | headers = body.pop(0) 235 | # 统一列数 236 | col_count = max(len(headers), max((len(row) for row in body), default=0)) 237 | headers += [""] * (col_count - len(headers)) 238 | for row in body: 239 | row += [""] * (col_count - len(row)) 240 | # 生成 Markdown 表格 241 | markdown_table = [] 242 | markdown_table.append("| " + " | ".join(headers) + " |") 243 | markdown_table.append("|-" + "-|-".join(["-" * len(h) for h in headers]) + "-|") 244 | for row in body: 245 | markdown_table.append("| " + " | ".join(row) + " |") 246 | return "\n".join(markdown_table) 247 | 248 | def _process_image(self, element: Tag, source: Tag | None) -> str: 249 | src = ( 250 | get_attr_text(element.attrs.get("src")) 251 | if source is None 252 | else get_attr_text(source.attrs.get("srcset")).split()[0] 253 | ) 254 | alt = get_attr_text(element.attrs.get("alt")) 255 | if src: 256 | if not src.startswith("http") and self.article.url: 257 | src = urljoin(self.article.url, src) 258 | return f"![{alt}]({src})" 259 | return "" 260 | 261 | def _process_gist(self, element: Tag) -> str: 262 | src = get_attr_text(element.attrs.get("src")) 263 | pattern = r"/([0-9a-f]+)(?:\.js)?$" 264 | match = re.search(pattern, src) 265 | if match: 266 | gist_id = match.group(1) 267 | url = f"https://api.github.com/gists/{gist_id}" 268 | response = requests.get(url) 269 | response.encoding = "utf-8" 270 | if response.status_code == 200: 271 | data = response.json() 272 | gists = [] 273 | for filename, info in data["files"].items(): 274 | code = info["content"] 275 | language = detect_language(filename, code) 276 | gists.append(f"```{language}\n{code}\n```") 277 | return 
"\n\n".join(gists) 278 | print(f"Fetch gist error: {response.status_code}") 279 | return "" 280 | 281 | 282 | def is_block_element(element_name: str) -> bool: 283 | return element_name in BLOCK_ELEMENTS 284 | 285 | 286 | def is_pure_block_children(element: Tag) -> bool: 287 | for child in element.children: 288 | if isinstance(child, NavigableString): 289 | if child.strip(): # 有非空文本 290 | return False 291 | elif isinstance(child, Tag) and not is_block_element(child.name): 292 | return False 293 | return True 294 | -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | revision = 2 3 | requires-python = ">=3.13" 4 | 5 | [[package]] 6 | name = "beautifulsoup4" 7 | version = "4.13.4" 8 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 9 | dependencies = [ 10 | { name = "soupsieve" }, 11 | { name = "typing-extensions" }, 12 | ] 13 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/e4/0c4c39e18fd76d6a628d4dd8da40543d136ce2d1752bd6eeeab0791f4d6b/beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195", size = 621067, upload_time = "2025-04-15T17:05:13.836Z" } 14 | wheels = [ 15 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload_time = "2025-04-15T17:05:12.221Z" }, 16 | ] 17 | 18 | [[package]] 19 | name = "certifi" 20 | version = "2025.4.26" 21 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 22 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/9e/c05b3920a3b7d20d3d3310465f50348e5b3694f4f88c6daf736eef3024c4/certifi-2025.4.26.tar.gz", hash = "sha256:0a816057ea3cdefcef70270d2c515e4506bbc954f417fa5ade2021213bb8f0c6", size = 160705, upload_time = "2025-04-26T02:12:29.51Z" } 23 | wheels = [ 24 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4a/7e/3db2bd1b1f9e95f7cddca6d6e75e2f2bd9f51b1246e546d88addca0106bd/certifi-2025.4.26-py3-none-any.whl", hash = "sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3", size = 159618, upload_time = "2025-04-26T02:12:27.662Z" }, 25 | ] 26 | 27 | [[package]] 28 | name = "charset-normalizer" 29 | version = "3.4.2" 30 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 31 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload_time = "2025-05-02T08:34:42.01Z" } 32 | wheels = [ 33 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload_time = "2025-05-02T08:32:56.363Z" }, 34 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload_time = "2025-05-02T08:32:58.551Z" }, 35 | { 
url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload_time = "2025-05-02T08:33:00.342Z" }, 36 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload_time = "2025-05-02T08:33:02.081Z" }, 37 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload_time = "2025-05-02T08:33:04.063Z" }, 38 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload_time = "2025-05-02T08:33:06.418Z" }, 39 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload_time = "2025-05-02T08:33:08.183Z" }, 40 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload_time = "2025-05-02T08:33:09.986Z" }, 41 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload_time = "2025-05-02T08:33:11.814Z" }, 42 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload_time = "2025-05-02T08:33:13.707Z" }, 43 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload_time = "2025-05-02T08:33:15.458Z" }, 44 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload_time = "2025-05-02T08:33:17.06Z" }, 45 | { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload_time = "2025-05-02T08:33:18.753Z" }, 46 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload_time = "2025-05-02T08:34:40.053Z" }, 47 | ] 48 | 49 | [[package]] 50 | name = "click" 51 | version = "8.2.1" 52 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 53 | dependencies = [ 54 | { name = "colorama", marker = "sys_platform == 'win32'" }, 55 | ] 56 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload_time = "2025-05-20T23:19:49.832Z" } 57 | wheels = [ 58 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload_time = "2025-05-20T23:19:47.796Z" }, 59 | ] 60 | 61 | [[package]] 62 | name = "click-default-group" 63 | version = "1.2.4" 64 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 65 | dependencies = [ 66 | { name = "click" }, 67 | ] 68 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/ce/edb087fb53de63dad3b36408ca30368f438738098e668b78c87f93cd41df/click_default_group-1.2.4.tar.gz", hash = "sha256:eb3f3c99ec0d456ca6cd2a7f08f7d4e91771bef51b01bdd9580cc6450fe1251e" } 69 | wheels = [ 70 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/1a/aff8bb287a4b1400f69e09a53bd65de96aa5cee5691925b38731c67fc695/click_default_group-1.2.4-py2.py3-none-any.whl", hash = "sha256:9b60486923720e7fc61731bdb32b617039aba820e22e1c88766b1125592eaa5f" }, 71 | ] 72 | 73 | [[package]] 74 | name = "colorama" 75 | version = "0.4.6" 76 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 77 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload_time = "2022-10-25T02:36:22.414Z" } 78 | wheels = [ 79 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload_time = "2022-10-25T02:36:20.889Z" }, 80 | ] 81 | 82 | [[package]] 83 | name = "html5lib" 84 | version = "1.1" 85 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 86 | dependencies = [ 87 | { name = "six" }, 88 | { name = "webencodings" }, 89 | ] 90 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/b6/b55c3f49042f1df3dcd422b7f224f939892ee94f22abcf503a9b7339eaf2/html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f", size = 272215, upload_time = "2020-06-22T23:32:38.834Z" } 91 | wheels = [ 92 | { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/6c/dd/a834df6482147d48e225a49515aabc28974ad5a4ca3215c18a882565b028/html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d", size = 112173, upload_time = "2020-06-22T23:32:36.781Z" }, 93 | ] 94 | 95 | [[package]] 96 | name = "idna" 97 | version = "3.10" 98 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 99 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload_time = "2024-09-15T18:07:39.745Z" } 100 | wheels = [ 101 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload_time = "2024-09-15T18:07:37.964Z" }, 102 | ] 103 | 104 | [[package]] 105 | name = "iniconfig" 106 | version = "2.1.0" 107 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 108 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload_time = "2025-03-19T20:09:59.721Z" } 109 | wheels = [ 110 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload_time = "2025-03-19T20:10:01.071Z" }, 111 | ] 112 | 113 | [[package]] 114 | name = "omni-article-markdown" 115 | version = "0.1.10" 116 | source = { editable = "." 
} 117 | dependencies = [ 118 | { name = "beautifulsoup4" }, 119 | { name = "click" }, 120 | { name = "click-default-group" }, 121 | { name = "html5lib" }, 122 | { name = "pip" }, 123 | { name = "pluggy" }, 124 | { name = "requests" }, 125 | ] 126 | 127 | [package.optional-dependencies] 128 | dev = [ 129 | { name = "pytest" }, 130 | ] 131 | 132 | [package.metadata] 133 | requires-dist = [ 134 | { name = "beautifulsoup4", specifier = ">=4.13.4" }, 135 | { name = "click", specifier = ">=8.2.0" }, 136 | { name = "click-default-group", specifier = ">=1.2.4" }, 137 | { name = "html5lib", specifier = ">=1.1" }, 138 | { name = "pip" }, 139 | { name = "pluggy", specifier = ">=1.6.0" }, 140 | { name = "pytest", marker = "extra == 'dev'" }, 141 | { name = "requests", specifier = ">=2.32.3" }, 142 | ] 143 | provides-extras = ["dev"] 144 | 145 | [[package]] 146 | name = "packaging" 147 | version = "25.0" 148 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 149 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload_time = "2025-04-19T11:48:59.673Z" } 150 | wheels = [ 151 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload_time = "2025-04-19T11:48:57.875Z" }, 152 | ] 153 | 154 | [[package]] 155 | name = "pip" 156 | version = "25.1.1" 157 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 158 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/59/de/241caa0ca606f2ec5fe0c1f4261b0465df78d786a38da693864a116c37f4/pip-25.1.1.tar.gz", hash = "sha256:3de45d411d308d5054c2168185d8da7f9a2cd753dbac8acbfa88a8909ecd9077", size = 1940155, upload_time = "2025-05-02T15:14:02.057Z" } 159 | wheels = [ 160 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/29/a2/d40fb2460e883eca5199c62cfc2463fd261f760556ae6290f88488c362c0/pip-25.1.1-py3-none-any.whl", hash = "sha256:2913a38a2abf4ea6b64ab507bd9e967f3b53dc1ede74b01b0931e1ce548751af", size = 1825227, upload_time = "2025-05-02T15:13:59.102Z" }, 161 | ] 162 | 163 | [[package]] 164 | name = "pluggy" 165 | version = "1.6.0" 166 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 167 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload_time = "2025-05-15T12:30:07.975Z" } 168 | wheels = [ 169 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload_time = "2025-05-15T12:30:06.134Z" }, 170 | ] 171 | 172 | [[package]] 173 | name = "pygments" 174 | version = "2.19.2" 175 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 176 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload_time = 
"2025-06-21T13:39:12.283Z" } 177 | wheels = [ 178 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload_time = "2025-06-21T13:39:07.939Z" }, 179 | ] 180 | 181 | [[package]] 182 | name = "pytest" 183 | version = "8.4.2" 184 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 185 | dependencies = [ 186 | { name = "colorama", marker = "sys_platform == 'win32'" }, 187 | { name = "iniconfig" }, 188 | { name = "packaging" }, 189 | { name = "pluggy" }, 190 | { name = "pygments" }, 191 | ] 192 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload_time = "2025-09-04T14:34:22.711Z" } 193 | wheels = [ 194 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload_time = "2025-09-04T14:34:20.226Z" }, 195 | ] 196 | 197 | [[package]] 198 | name = "requests" 199 | version = "2.32.3" 200 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 201 | dependencies = [ 202 | { name = "certifi" }, 203 | { name = "charset-normalizer" }, 204 | { name = "idna" }, 205 | { name = "urllib3" }, 206 | ] 207 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218, upload_time = "2024-05-29T15:37:49.536Z" } 208 | wheels = [ 209 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928, upload_time = "2024-05-29T15:37:47.027Z" }, 210 | ] 211 | 212 | [[package]] 213 | name = "six" 214 | version = "1.17.0" 215 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 216 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload_time = "2024-12-04T17:35:28.174Z" } 217 | wheels = [ 218 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload_time = "2024-12-04T17:35:26.475Z" }, 219 | ] 220 | 221 | [[package]] 222 | name = "soupsieve" 223 | version = "2.7" 224 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 225 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3f/f4/4a80cd6ef364b2e8b65b15816a843c0980f7a5a2b4dc701fc574952aa19f/soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a", size = 103418, upload_time = "2025-04-20T18:50:08.518Z" } 226 | wheels = [ 227 | { url = 
"https://pypi.tuna.tsinghua.edu.cn/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload_time = "2025-04-20T18:50:07.196Z" }, 228 | ] 229 | 230 | [[package]] 231 | name = "typing-extensions" 232 | version = "4.13.2" 233 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 234 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967, upload_time = "2025-04-10T14:19:05.416Z" } 235 | wheels = [ 236 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload_time = "2025-04-10T14:19:03.967Z" }, 237 | ] 238 | 239 | [[package]] 240 | name = "urllib3" 241 | version = "2.4.0" 242 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 243 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8a/78/16493d9c386d8e60e442a35feac5e00f0913c0f4b7c217c11e8ec2ff53e0/urllib3-2.4.0.tar.gz", hash = "sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466", size = 390672, upload_time = "2025-04-10T15:23:39.232Z" } 244 | wheels = [ 245 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680, upload_time = "2025-04-10T15:23:37.377Z" }, 246 | ] 247 | 248 | [[package]] 249 | name = "webencodings" 250 | version = "0.5.1" 251 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } 252 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/02/ae6ceac1baeda530866a85075641cec12989bd8d31af6d5ab4a3e8c92f47/webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923", size = 9721, upload_time = "2017-04-05T20:21:34.189Z" } 253 | wheels = [ 254 | { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774, upload_time = "2017-04-05T20:21:32.581Z" }, 255 | ] 256 | --------------------------------------------------------------------------------