├── .python-version
├── plugins
    ├── omnimd-browser-reader
    │   ├── README.md
    │   ├── pyproject.toml
    │   └── browser.py
    ├── omnimd-toutiao-reader
    │   ├── README.md
    │   ├── pyproject.toml
    │   └── toutiao.py
    ├── omnimd-zhihu-reader
    │   ├── README.md
    │   ├── pyproject.toml
    │   └── zhihu.py
    └── omnimd-freedium-reader
    │   ├── README.md
    │   ├── pyproject.toml
    │   └── freedium.py
├── data
    ├── 1.gif
    └── 1.jpg
├── .gitignore
├── tests
    ├── conftest.py
    ├── test_extractor.py
    ├── test_parser.py
    └── test_utils.py
├── Dockerfile
├── src
    └── omni_article_markdown
    │   ├── __init__.py
    │   ├── plugins.py
    │   ├── extractors
    │       ├── hugo.py
    │       ├── zhihu.py
    │       ├── 163.py
    │       ├── woshipm.py
    │       ├── infoqcn.py
    │       ├── aliyun_developer.py
    │       ├── android_dev_blog.py
    │       ├── cloudflare_blog.py
    │       ├── oschina.py
    │       ├── tencent_cloud.py
    │       ├── anthropic.py
    │       ├── medium.py
    │       ├── infoq.py
    │       ├── quantamagazine.py
    │       ├── juejin.py
    │       ├── sspai.py
    │       ├── claude_doc.py
    │       ├── microsoft_learn.py
    │       ├── cnblog.py
    │       ├── apple_developer.py
    │       ├── baijiahao.py
    │       ├── toutiao.py
    │       ├── jetbrains_blog.py
    │       ├── wechat_gzh.py
    │       ├── jianshu.py
    │       ├── towards_data_science.py
    │       ├── freedium.py
    │       └── yuque.py
    │   ├── hookspecs.py
    │   ├── store.py
    │   ├── readers.py
    │   ├── omni_article_md.py
    │   ├── cli.py
    │   ├── extractor.py
    │   ├── utils.py
    │   └── parser.py
├── .editorconfig
├── .github
    ├── FUNDING.yml
    └── workflows
    │   ├── publish.yml
    │   └── publish_plugin.yml
├── LICENSE
├── pyproject.toml
├── ruff.toml
├── README.md
└── uv.lock


/.python-version:
--------------------------------------------------------------------------------
1 | 3.13
2 | 


--------------------------------------------------------------------------------
/plugins/omnimd-browser-reader/README.md:
--------------------------------------------------------------------------------
1 | # 墨探 (omni-article-markdown) 浏览器插件
2 | 


--------------------------------------------------------------------------------
/plugins/omnimd-toutiao-reader/README.md:
--------------------------------------------------------------------------------
1 | # 墨探 (omni-article-markdown) 头条插件
2 | 


--------------------------------------------------------------------------------
/plugins/omnimd-zhihu-reader/README.md:
--------------------------------------------------------------------------------
1 | # 墨探 (omni-article-markdown) 知乎插件
2 | 


--------------------------------------------------------------------------------
/data/1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caol64/omni-article-markdown/HEAD/data/1.gif


--------------------------------------------------------------------------------
/data/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caol64/omni-article-markdown/HEAD/data/1.jpg


--------------------------------------------------------------------------------
/plugins/omnimd-freedium-reader/README.md:
--------------------------------------------------------------------------------
1 | # 墨探 (omni-article-markdown) Freedium插件
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_STORE
 2 | 
 3 | # Python-generated files
 4 | __pycache__/
 5 | *.py[oc]
 6 | build/
 7 | dist/
 8 | wheels/
 9 | *.egg-info
10 | 
11 | # Virtual environments
12 | .venv
13 | .env
14 | 
15 | plugins/**/uv.lock
16 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from bs4 import BeautifulSoup
 3 | 
 4 | 
 5 | @pytest.fixture
 6 | def make_soup():
 7 |     def _make_soup(html: str, parser: str = "html.parser"):
 8 |         return BeautifulSoup(html, parser)
 9 |     return _make_soup
10 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.13-alpine
 2 | 
 3 | ARG PYPI_REGISTRY="https://pypi.org/simple/"
 4 | 
 5 | WORKDIR /app
 6 | 
 7 | RUN pip config set global.index-url "${PYPI_REGISTRY}"
 8 | RUN pip install omni-article-markdown
 9 | 
10 | ENTRYPOINT ["mdcli"]
11 | CMD []
12 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/__init__.py:
--------------------------------------------------------------------------------
 1 | from .omni_article_md import OmniArticleMarkdown
 2 | 
 3 | __all__ = ["OmniArticleMarkdown"]
 4 | 
 5 | DEFAULT_PLUGINS = {
 6 |     "zhihu": "omnimd-zhihu-reader",
 7 |     "freedium": "omnimd-freedium-reader",
 8 |     "toutiao": "omnimd-toutiao-reader",
 9 |     "browser": "omnimd-browser-reader",
10 | }
11 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | root = true
 2 | 
 3 | [*]
 4 | indent_style = space
 5 | indent_size = 4
 6 | trim_trailing_whitespace = true
 7 | charset = utf-8
 8 | end_of_line = lf
 9 | insert_final_newline = true
10 | 
11 | [*.json]
12 | indent_size = 2
13 | 
14 | [*.{yml,yaml}]
15 | indent_size = 2
16 | 
17 | [Makefile]
18 | indent_style = tab
19 | 
20 | [*.{md,mdx}]
21 | max_line_length = off
22 | trim_trailing_whitespace = false
23 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/plugins.py:
--------------------------------------------------------------------------------
 1 | import pluggy
 2 | 
 3 | from . import hookspecs
 4 | 
 5 | pm = pluggy.PluginManager("mdcli")
 6 | pm.add_hookspecs(hookspecs)
 7 | 
 8 | _loaded_plugins = False
 9 | 
10 | def load_mdcli_plugins():
11 |     global _loaded_plugins
12 |     if _loaded_plugins:
13 |         return
14 |     pm.load_setuptools_entrypoints("mdcli")
15 |     _loaded_plugins = True
16 | 
17 | # 在应用启动时调用
18 | load_mdcli_plugins()
19 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/hugo.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | 
 7 | 
 8 | class HugoExtractor(Extractor):
 9 |     """
10 |     Hugo博客
11 |     """
12 | 
13 |     @override
14 |     def can_handle(self, soup: BeautifulSoup) -> bool:
15 |         return False
16 | 
17 |     @override
18 |     def article_container(self) -> tuple:
19 |         return ("div", {"class": "post-content"})
20 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/zhihu.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_site_name
 7 | 
 8 | 
 9 | class ZhihuExtractor(Extractor):
10 |     """
11 |     知乎专栏
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return get_og_site_name(soup) == "知乎专栏"
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("div", {"class": "Post-RichText"})
21 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/163.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import is_matched_canonical
 7 | 
 8 | 
 9 | class Netease163Extractor(Extractor):
10 |     """
11 |     163.com
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return is_matched_canonical("https://www.163.com", soup)
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("div", {"class": "post_body"})
21 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/woshipm.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_url
 7 | 
 8 | 
 9 | class WoShiPMExtractor(Extractor):
10 |     """
11 |     人人都是产品经理
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return get_og_url(soup).startswith("https://www.woshipm.com")
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("div", {"class": "article--content"})
21 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/infoqcn.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import is_matched_canonical
 7 | 
 8 | 
 9 | class InfoQCNExtractor(Extractor):
10 |     """
11 |     www.infoq.cn
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return is_matched_canonical("https://www.infoq.cn", soup)
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("div", {"class": "article-content-wrap"})
21 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/aliyun_developer.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import is_matched_canonical
 7 | 
 8 | 
 9 | class AliyunDeveloperExtractor(Extractor):
10 |     """
11 |     developer.aliyun.com
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return is_matched_canonical("https://developer.aliyun.com", soup)
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("div", {"class": "article-content"})
21 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/android_dev_blog.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_site_name
 7 | 
 8 | 
 9 | class AndroidDevelopersBlogExtractor(Extractor):
10 |     """
11 |     Android Developers Blog
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return get_og_site_name(soup) == "Android Developers Blog"
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("div", {"class": "adb-detail__content"})
21 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/cloudflare_blog.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import is_matched_canonical
 7 | 
 8 | 
 9 | class CloudflareBlogExtractor(Extractor):
10 |     """
11 |     blog.cloudflare.com
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return is_matched_canonical("https://blog.cloudflare.com", soup)
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("section", {"class": "post-full-content"})
21 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/hookspecs.py:
--------------------------------------------------------------------------------
 1 | from typing import Protocol
 2 | 
 3 | from pluggy import HookimplMarker, HookspecMarker
 4 | 
 5 | hookspec = HookspecMarker("mdcli")
 6 | hookimpl = HookimplMarker("mdcli")
 7 | 
 8 | 
 9 | class ReaderPlugin(Protocol):
10 |     def can_handle(self, url: str) -> bool: ...
11 | 
12 |     def read(self, url: str) -> str: ...
13 | 
14 | 
15 | @hookspec(firstresult=True)
16 | def get_custom_reader(url: str) -> ReaderPlugin | None:
17 |     """
18 |     Allows plugins to provide a custom reader for a given URL.
19 |     The first plugin that returns a ReaderPlugin instance will be used.
20 |     """
21 |     ...
22 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/oschina.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | 
 7 | 
 8 | class OsChinaExtractor(Extractor):
 9 |     """
10 |     开源中国
11 |     """
12 | 
13 |     @override
14 |     def can_handle(self, soup: BeautifulSoup) -> bool:
15 |         title_tag = soup.title
16 |         title = title_tag.get_text(strip=True) if title_tag else None
17 |         return title is not None and title.endswith(" - OSCHINA - 中文开源技术交流社区")
18 | 
19 |     @override
20 |     def article_container(self) -> tuple:
21 |         return ("div", {"class": "detail-box"})
22 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/tencent_cloud.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | 
 7 | 
 8 | class TencentCloudExtractor(Extractor):
 9 |     """
10 |     腾讯云开发者社区
11 |     """
12 | 
13 |     @override
14 |     def can_handle(self, soup: BeautifulSoup) -> bool:
15 |         title_tag = soup.title
16 |         title = title_tag.get_text(strip=True) if title_tag else None
17 |         return title is not None and title.endswith("-腾讯云开发者社区-腾讯云")
18 | 
19 |     @override
20 |     def article_container(self) -> tuple:
21 |         return ("div", {"class": "mod-content__markdown"})
22 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/anthropic.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_title
 7 | 
 8 | 
 9 | class ClaudeDocExtractor(Extractor):
10 |     """
11 |     Anthropic
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return get_title(soup).endswith(" \\ Anthropic")
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("article", None)
21 | 
22 |     @override
23 |     def extract_url(self, soup: BeautifulSoup) -> str:
24 |         return "https://www.anthropic.com/"
25 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/medium.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_site_name
 7 | 
 8 | 
 9 | class MediumExtractor(Extractor):
10 |     """
11 |     Medium
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend([
17 |             lambda el: 'data-testid' in el.attrs,
18 |             lambda el: 'class' in el.attrs and 'speechify-ignore' in el.attrs['class'],
19 |         ])
20 | 
21 |     @override
22 |     def can_handle(self, soup: BeautifulSoup) -> bool:
23 |         return get_og_site_name(soup) == "Medium"
24 | 
25 |     @override
26 |     def article_container(self) -> tuple:
27 |         return ("article", None)
28 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/store.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from pathlib import Path
 3 | from typing import Any
 4 | 
 5 | 
 6 | class Store:
 7 |     def __init__(self, base_dir_name: str = ".ommimd"):
 8 |         self.path = Path.home() / base_dir_name
 9 | 
10 |     def save(self, key: str, obj: Any):
11 |         self.path.mkdir(parents=True, exist_ok=True)
12 |         file_path = self.path / f"{key}.json"
13 |         with open(file_path, "w", encoding="utf8") as f:
14 |             json.dump(obj, f, indent=4, ensure_ascii=False)
15 | 
16 |     def load(self, key: str) -> Any | None:
17 |         file_path = self.path / f"{key}.json"
18 |         if not file_path.exists() or not file_path.is_file():
19 |             return None
20 |         with open(file_path, encoding="utf8") as f:
21 |             return json.load(f)
22 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/infoq.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import is_matched_canonical
 7 | 
 8 | 
 9 | class InfoQExtractor(Extractor):
10 |     """
11 |     www.infoq.com
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend(
17 |             [
18 |                 lambda el: "class" in el.attrs and "author-section-full" in el.attrs["class"],
19 |             ]
20 |         )
21 | 
22 |     @override
23 |     def can_handle(self, soup: BeautifulSoup) -> bool:
24 |         return is_matched_canonical("https://www.infoq.com", soup)
25 | 
26 |     @override
27 |     def article_container(self) -> tuple:
28 |         return ("div", {"class": "article__data"})
29 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/quantamagazine.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_site_name
 7 | 
 8 | 
 9 | class QuantamagazineExtractor(Extractor):
10 |     """
11 |     quantamagazine.org
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend(
17 |             [
18 |                 lambda el: "class" in el.attrs and "post__title__title" in el.attrs["class"],
19 |             ]
20 |         )
21 | 
22 |     @override
23 |     def can_handle(self, soup: BeautifulSoup) -> bool:
24 |         return get_og_site_name(soup) == "Quanta Magazine"
25 | 
26 |     @override
27 |     def article_container(self) -> tuple:
28 |         return ("div", {"id": "postBody"})
29 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/juejin.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import filter_tag, is_matched_canonical
 7 | 
 8 | 
 9 | class JuejinExtractor(Extractor):
10 |     """
11 |     juejin.cn
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         return is_matched_canonical("https://juejin.cn/", soup)
17 | 
18 |     @override
19 |     def article_container(self) -> tuple:
20 |         return ("div", {"id": "article-root"})
21 | 
22 |     @override
23 |     def extract_title(self, soup: BeautifulSoup) -> str:
24 |         title_tag = filter_tag(soup.find("h1", {"class": "article-title"}))
25 |         return title_tag.get_text(strip=True) if title_tag else super().extract_title(soup)
26 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/sspai.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_site_name
 7 | 
 8 | 
 9 | class SspaiExtractor(Extractor):
10 |     """
11 |     少数派
12 |     """
13 |     def __init__(self):
14 |         super().__init__()
15 |         self.attrs_to_clean.extend(
16 |             [
17 |                 lambda el: "class" in el.attrs and "comment__list" in el.attrs["class"],
18 |                 lambda el: "class" in el.attrs and "comment__footer__wrapper" in el.attrs["class"],
19 |             ]
20 |         )
21 | 
22 |     @override
23 |     def can_handle(self, soup: BeautifulSoup) -> bool:
24 |         return get_og_site_name(soup) == "少数派 - 高品质数字消费指南"
25 | 
26 |     @override
27 |     def article_container(self) -> tuple:
28 |         return ("div", {"class": "article__main__wrapper"})
29 | 


--------------------------------------------------------------------------------
/plugins/omnimd-zhihu-reader/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "omnimd-zhihu-reader"
 3 | version = "0.1.3"
 4 | description = "A plugin for omni-article-markdown to read Zhihu content."
 5 | authors = [
 6 |     { name = "Lei", email = "caol64@gmail.com" }
 7 | ]
 8 | readme = "README.md"
 9 | requires-python = ">=3.13"
10 | license = "MIT"
11 | dependencies = [
12 |     "playwright",
13 | ]
14 | 
15 | [project.entry-points.mdcli]
16 | zhihu = "zhihu"
17 | 
18 | [project.urls]
19 | Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-zhihu-reader"
20 | 
21 | [build-system]
22 | requires = ["hatchling"]
23 | build-backend = "hatchling.build"
24 | 
25 | [tool.hatch.build]
26 | exclude = [
27 |     "/dist",
28 | ]
29 | 
30 | [tool.hatch.build.targets.wheel]
31 | include = [
32 |     "/zhihu.py",
33 | ]
34 | 
35 | [tool.hatch.build.targets.sdist]
36 | include = [
37 |     "/zhihu.py",
38 |     "/README.md",
39 |     "/pyproject.toml",
40 | ]
41 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
12 | polar: # Replace with a single Polar username
13 | buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
14 | thanks_dev: # Replace with a single thanks.dev username
15 | custom: ['https://yuzhi.tech/sponsor', 'https://paypal.me/caol64']
16 | 


--------------------------------------------------------------------------------
/plugins/omnimd-toutiao-reader/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "omnimd-toutiao-reader"
 3 | version = "0.1.3"
 4 | description = "A plugin for omni-article-markdown to read Toutiao content."
 5 | authors = [
 6 |     { name = "Lei", email = "caol64@gmail.com" }
 7 | ]
 8 | readme = "README.md"
 9 | requires-python = ">=3.13"
10 | license = "MIT"
11 | dependencies = [
12 |     "playwright",
13 | ]
14 | 
15 | [project.entry-points.mdcli]
16 | toutiao = "toutiao"
17 | 
18 | [project.urls]
19 | Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-toutiao-reader"
20 | 
21 | [build-system]
22 | requires = ["hatchling"]
23 | build-backend = "hatchling.build"
24 | 
25 | [tool.hatch.build]
26 | exclude = [
27 |     "/dist",
28 | ]
29 | 
30 | [tool.hatch.build.targets.wheel]
31 | include = [
32 |     "/toutiao.py",
33 | ]
34 | 
35 | [tool.hatch.build.targets.sdist]
36 | include = [
37 |     "/toutiao.py",
38 |     "/README.md",
39 |     "/pyproject.toml",
40 | ]
41 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/claude_doc.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_title
 7 | 
 8 | 
 9 | class ClaudeDocExtractor(Extractor):
10 |     """
11 |     docs.claude.com
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend(
17 |             [
18 |                 lambda el: "data-component-part" in el.attrs and "code-block-header" in el.attrs["data-component-part"],
19 |                 lambda el: "data-component-part" in el.attrs and "code-group-tab-bar" in el.attrs["data-component-part"],
20 |             ]
21 |         )
22 | 
23 |     @override
24 |     def can_handle(self, soup: BeautifulSoup) -> bool:
25 |         return get_og_title(soup).endswith(" - Claude Docs")
26 | 
27 |     @override
28 |     def article_container(self) -> tuple:
29 |         return ("div", {"class": "mdx-content"})
30 | 


--------------------------------------------------------------------------------
/plugins/omnimd-freedium-reader/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "omnimd-freedium-reader"
 3 | version = "0.1.3"
 4 | description = "A plugin for omni-article-markdown to read Freedium content."
 5 | authors = [
 6 |     { name = "Lei", email = "caol64@gmail.com" }
 7 | ]
 8 | readme = "README.md"
 9 | requires-python = ">=3.13"
10 | license = "MIT"
11 | dependencies = [
12 |     "playwright",
13 | ]
14 | 
15 | [project.entry-points.mdcli]
16 | freedium = "freedium"
17 | 
18 | [project.urls]
19 | Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-freedium-reader"
20 | 
21 | [build-system]
22 | requires = ["hatchling"]
23 | build-backend = "hatchling.build"
24 | 
25 | [tool.hatch.build]
26 | exclude = [
27 |     "/dist",
28 | ]
29 | 
30 | [tool.hatch.build.targets.wheel]
31 | include = [
32 |     "/freedium.py",
33 | ]
34 | 
35 | [tool.hatch.build.targets.sdist]
36 | include = [
37 |     "/freedium.py",
38 |     "/README.md",
39 |     "/pyproject.toml",
40 | ]
41 | 


--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: Build and Publish to PyPI
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 | 
 7 | jobs:
 8 |   omnimd-publish:
 9 |     runs-on: ubuntu-latest
10 | 
11 |     steps:
12 |       - name: Checkout repository
13 |         uses: actions/checkout@v4
14 | 
15 |       - name: Set up Python
16 |         uses: actions/setup-python@v5
17 |         with:
18 |           python-version: '3.13'
19 | 
20 |       - name: Set up pip cache
21 |         uses: actions/cache@v4
22 |         with:
23 |           path: ~/.cache/pip
24 |           key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
25 | 
26 |       - name: Install Hatch
27 |         run: |
28 |           pip install -U hatch hatchling
29 | 
30 |       - name: Build and publish with Hatch
31 |         env:
32 |           HATCH_INDEX_USER: __token__
33 |           HATCH_INDEX_AUTH: ${{ secrets.PYPI_TOKEN }}
34 |         run: |
35 |           hatch build --clean
36 |           hatch publish --yes --no-prompt
37 | 


--------------------------------------------------------------------------------
/plugins/omnimd-browser-reader/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "omnimd-browser-reader"
 3 | version = "0.1.2"
 4 | description = "A plugin for omni-article-markdown to read content that needs enabling javascript."
 5 | authors = [
 6 |     { name = "Lei", email = "caol64@gmail.com" }
 7 | ]
 8 | readme = "README.md"
 9 | requires-python = ">=3.13"
10 | license = "MIT"
11 | dependencies = [
12 |     "playwright",
13 | ]
14 | 
15 | [project.entry-points.mdcli]
16 | browser = "browser"
17 | 
18 | [project.urls]
19 | Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-browser-reader"
20 | 
21 | [build-system]
22 | requires = ["hatchling"]
23 | build-backend = "hatchling.build"
24 | 
25 | [tool.hatch.build]
26 | exclude = [
27 |     "/dist",
28 | ]
29 | 
30 | [tool.hatch.build.targets.wheel]
31 | include = [
32 |     "/browser.py",
33 | ]
34 | 
35 | [tool.hatch.build.targets.sdist]
36 | include = [
37 |     "/browser.py",
38 |     "/README.md",
39 |     "/pyproject.toml",
40 | ]
41 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/microsoft_learn.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_url
 7 | 
 8 | 
 9 | class MicrosoftLearnExtractor(Extractor):
10 |     """
11 |     微软技术文档
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend(
17 |             [
18 |                 lambda el: "id" in el.attrs and "article-header" in el.attrs["id"],
19 |                 lambda el: "id" in el.attrs and "article-metadata" in el.attrs["id"],
20 |                 lambda el: "id" in el.attrs and "site-user-feedback-footer" in el.attrs["id"],
21 |             ]
22 |         )
23 | 
24 |     @override
25 |     def can_handle(self, soup: BeautifulSoup) -> bool:
26 |         return get_og_url(soup).startswith("https://learn.microsoft.com")
27 | 
28 |     @override
29 |     def article_container(self) -> tuple:
30 |         return ("main", {"id": "main"})
31 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/cnblog.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import is_matched_canonical
 7 | 
 8 | 
 9 | class CnBlogsExtractor(Extractor):
10 |     """
11 |     博客园
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend(
17 |             [
18 |                 lambda el: "id" in el.attrs and "blog_post_info_block" in el.attrs["id"],
19 |                 lambda el: "class" in el.attrs and "postDesc" in el.attrs["class"],
20 |             ]
21 |         )
22 | 
23 |     @override
24 |     def can_handle(self, soup: BeautifulSoup) -> bool:
25 |         return is_matched_canonical("https://www.cnblogs.com", soup)
26 | 
27 |     @override
28 |     def article_container(self) -> tuple:
29 |         return ("div", {"class": "post"})
30 | 
31 |     @override
32 |     def extract_description(self, soup: BeautifulSoup) -> str:
33 |         return ""
34 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/apple_developer.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_site_name
 7 | 
 8 | 
 9 | class AppleDevelopExtractor(Extractor):
10 |     """
11 |     Apple Developer Documentation
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend(
17 |             [
18 |                 lambda el: "class" in el.attrs and "eyebrow" in el.attrs["class"],
19 |                 lambda el: "class" in el.attrs and "platform" in el.attrs["class"],
20 |                 lambda el: "class" in el.attrs and "title" in el.attrs["class"],
21 |             ]
22 |         )
23 | 
24 |     @override
25 |     def can_handle(self, soup: BeautifulSoup) -> bool:
26 |         return get_og_site_name(soup) == "Apple Developer Documentation"
27 | 
28 |     @override
29 |     def article_container(self) -> tuple:
30 |         return ("main", {"class": "main"})
31 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/baijiahao.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import filter_tag
 7 | 
 8 | 
 9 | class Netease163Extractor(Extractor):
10 |     """
11 |     百家号
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         tag1 = filter_tag(soup.find("div", {"data-testid": "article"}))
17 |         tag2 = filter_tag(soup.find("span", {"class": "bjh-p"}))
18 |         return tag1 is not None and tag2 is not None
19 | 
20 |     @override
21 |     def article_container(self) -> tuple:
22 |         return ("div", {"data-testid": "article"})
23 | 
24 |     @override
25 |     def pre_handle_soup(self, soup: BeautifulSoup) -> BeautifulSoup:
26 |         for tag in soup.find_all("span", {"class": "bjh-p"}):
27 |             span_tag = filter_tag(tag)
28 |             if span_tag:
29 |                 span_tag.name = "p"
30 |         # for tag in soup.find_all("img"):
31 |         #     tag.wrap(soup.new_tag("p"))
32 |         return soup
33 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/toutiao.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | from bs4.element import Tag
 5 | 
 6 | from ..extractor import Extractor
 7 | from ..utils import filter_tag, get_attr_text
 8 | 
 9 | 
10 | class ToutiaoExtractor(Extractor):
11 |     """
12 |     今日头条
13 |     """
14 | 
15 |     @override
16 |     def can_handle(self, soup: BeautifulSoup) -> bool:
17 |         title_tag = soup.title
18 |         title = title_tag.get_text(strip=True) if title_tag else None
19 |         return title is not None and title.endswith(" - 今日头条")
20 | 
21 |     @override
22 |     def article_container(self) -> tuple:
23 |         return ("div", {"class": "article-content"})
24 | 
25 |     @override
26 |     def extract_img(self, element: Tag) -> Tag:
27 |         img_els = element.find_all("img")
28 |         for img_el in img_els:
29 |             img_tag = filter_tag(img_el)
30 |             if img_tag:
31 |                 src = get_attr_text(img_tag.attrs.get("data-src"))
32 |                 if src:
33 |                     img_tag.attrs["src"] = src
34 |         return element
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 caol64
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/jetbrains_blog.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_site_name
 7 | 
 8 | 
 9 | class JetbrainsBlogExtractor(Extractor):
10 |     """
11 |     blog.jetbrains.com
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend(
17 |             [
18 |                 lambda el: "class" in el.attrs and "content__row" in el.attrs["class"],
19 |                 lambda el: "class" in el.attrs and "content__pagination" in el.attrs["class"],
20 |                 lambda el: "class" in el.attrs and "content__form" in el.attrs["class"],
21 |                 lambda el: "class" in el.attrs and "tag" in el.attrs["class"],
22 |                 lambda el: "class" in el.attrs and "author-post" in el.attrs["class"],
23 |             ]
24 |         )
25 | 
26 |     @override
27 |     def can_handle(self, soup: BeautifulSoup) -> bool:
28 |         return get_og_site_name(soup) == "The JetBrains Blog"
29 | 
30 |     @override
31 |     def article_container(self) -> tuple:
32 |         return ("div", {"class": "content"})
33 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/wechat_gzh.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | from bs4.element import Tag
 5 | 
 6 | from ..extractor import Extractor
 7 | from ..utils import filter_tag, get_attr_text, get_og_site_name
 8 | 
 9 | 
10 | class WechatGZHExtractor(Extractor):
11 |     """
12 |     微信公众号
13 |     """
14 | 
15 |     def __init__(self):
16 |         super().__init__()
17 |         self.attrs_to_clean.append(lambda el: 'id' in el.attrs and el.attrs['id'] == 'meta_content')
18 | 
19 |     @override
20 |     def can_handle(self, soup: BeautifulSoup) -> bool:
21 |         return get_og_site_name(soup) == "微信公众平台"
22 | 
23 |     @override
24 |     def article_container(self) -> tuple:
25 |         return ("div", {"class": "rich_media_content"})
26 | 
27 |     @override
28 |     def extract_img(self, element: Tag) -> Tag:
29 |         img_els = element.find_all("img")
30 |         for img_el in img_els:
31 |             img_tag = filter_tag(img_el)
32 |             if img_tag:
33 |                 src = get_attr_text(img_tag.attrs.get("data-src"))
34 |                 if src:
35 |                     img_tag.attrs["src"] = src
36 |         return element
37 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/jianshu.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | from bs4.element import Tag
 5 | 
 6 | from ..extractor import ARTICLE_CONTAINERS, Extractor
 7 | from ..utils import filter_tag, get_attr_text, get_og_site_name
 8 | 
 9 | 
10 | class JianshuExtractor(Extractor):
11 |     """
12 |     www.jianshu.com
13 |     """
14 | 
15 |     @override
16 |     def can_handle(self, soup: BeautifulSoup) -> bool:
17 |         return get_og_site_name(soup) == "简书"
18 | 
19 |     @override
20 |     def article_container(self) -> tuple | list:
21 |         return ARTICLE_CONTAINERS
22 | 
23 |     @override
24 |     def extract_description(self, soup: BeautifulSoup) -> str:
25 |         return ""
26 | 
27 |     @override
28 |     def extract_url(self, soup: BeautifulSoup) -> str:
29 |         return "https:"
30 | 
31 |     @override
32 |     def extract_img(self, element: Tag) -> Tag:
33 |         img_els = element.find_all("img")
34 |         for img_el in img_els:
35 |             img_tag = filter_tag(img_el)
36 |             if img_tag:
37 |                 src = get_attr_text(img_tag.attrs.get("data-original-src"))
38 |                 if src:
39 |                     img_tag.attrs["src"] = src
40 |         return element
41 | 


--------------------------------------------------------------------------------
/.github/workflows/publish_plugin.yml:
--------------------------------------------------------------------------------
 1 | name: Build Plugins and Publish to PyPI
 2 | 
 3 | on:
 4 |   workflow_dispatch:
 5 |     inputs:
 6 |       package_path:
 7 |         description: 'Path to plugin directory (relative to repo root)'
 8 |         required: true
 9 |         default: 'plugins/omnimd-freedium-reader'
10 | 
11 | jobs:
12 |   omnimd-plugin-publish:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |       - name: Checkout repository
17 |         uses: actions/checkout@v4
18 | 
19 |       - name: Set up Python
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: '3.13'
23 | 
24 |       - name: Set up pip cache
25 |         uses: actions/cache@v4
26 |         with:
27 |           path: ~/.cache/pip
28 |           key: ${{ runner.os }}-${{ github.event.inputs.package_path }}-pip-${{ hashFiles('pyproject.toml') }}
29 | 
30 |       - name: Install Hatch
31 |         run: |
32 |           pip install -U hatch hatchling
33 | 
34 |       - name: Build and publish with Hatch
35 |         env:
36 |           HATCH_INDEX_USER: __token__
37 |           HATCH_INDEX_AUTH: ${{ secrets.PYPI_TOKEN }}
38 |         run: |
39 |           cd "${{ github.event.inputs.package_path }}"
40 |           hatch build --clean
41 |           hatch publish --yes --no-prompt
42 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/towards_data_science.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import get_og_site_name
 7 | 
 8 | 
 9 | class TowardsDataScienceExtractor(Extractor):
10 |     """
11 |     towardsdatascience.com
12 |     """
13 | 
14 |     def __init__(self):
15 |         super().__init__()
16 |         self.attrs_to_clean.extend([
17 |             lambda el: 'class' in el.attrs and 'taxonomy-post_tag' in el.attrs['class'],
18 |             lambda el: 'class' in el.attrs and 'tds-cta-box' in el.attrs['class'],
19 |             lambda el: 'class' in el.attrs and 'wp-block-buttons' in el.attrs['class'],
20 |             lambda el: 'class' in el.attrs and 'wp-block-outermost-social-sharing' in el.attrs['class'],
21 |             lambda el: 'class' in el.attrs and 'wp-block-tenup-post-time-to-read' in el.attrs['class'],
22 |         ])
23 |         self.tags_to_clean.extend([
24 |             lambda el: el.name == 'time',
25 |         ])
26 | 
27 |     @override
28 |     def can_handle(self, soup: BeautifulSoup) -> bool:
29 |         return get_og_site_name(soup) == "Towards Data Science"
30 | 
31 |     @override
32 |     def article_container(self) -> tuple | list:
33 |         return ("main", None)
34 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/freedium.py:
--------------------------------------------------------------------------------
 1 | from typing import override
 2 | 
 3 | from bs4 import BeautifulSoup
 4 | 
 5 | from ..extractor import Extractor
 6 | from ..utils import filter_tag
 7 | 
 8 | 
 9 | class FreediumExtractor(Extractor):
10 |     """
11 |     freedium.cfd
12 |     """
13 | 
14 |     @override
15 |     def can_handle(self, soup: BeautifulSoup) -> bool:
16 |         title_tag = soup.title
17 |         title = title_tag.get_text(strip=True) if title_tag else None
18 |         return title is not None and title.endswith(" - Freedium")
19 | 
20 |     @override
21 |     def article_container(self) -> tuple:
22 |         return ("div", {"class": "main-content"})
23 | 
24 |     @override
25 |     def extract_title(self, soup: BeautifulSoup) -> str:
26 |         title_tag = filter_tag(soup.find("h1"))
27 |         if title_tag:
28 |             title = title_tag.get_text(strip=True)
29 |             title_tag.decompose()
30 |             return title
31 |         return super().extract_title(soup)
32 | 
33 |     @override
34 |     def extract_description(self, soup: BeautifulSoup) -> str:
35 |         description_tag = soup.find("h2")
36 |         if description_tag:
37 |             description = description_tag.get_text(strip=True)
38 |             description_tag.decompose()
39 |             return description
40 |         return super().extract_description(soup)
41 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "omni-article-markdown"
 3 | version = "0.1.10"
 4 | description = "Easily convert web articles (blogs, news, documents, etc.) into Markdown format."
 5 | authors = [
 6 |     { name = "Lei", email = "caol64@gmail.com" }
 7 | ]
 8 | readme = "README.md"
 9 | requires-python = ">=3.13"
10 | license = "MIT"
11 | classifiers = [
12 |     "Development Status :: 5 - Production/Stable",
13 |     "Environment :: Console",
14 |     "Intended Audience :: Developers",
15 |     "Intended Audience :: End Users/Desktop",
16 |     "License :: OSI Approved :: MIT License",
17 |     "Natural Language :: English",
18 |     "Operating System :: OS Independent",
19 |     "Programming Language :: Python :: 3",
20 |     "Programming Language :: Python :: 3.13",
21 |     "Topic :: Text Processing :: Markup :: Markdown",
22 |     "Topic :: Utilities",
23 | ]
24 | dependencies = [
25 |     "requests>=2.32.3",
26 |     "beautifulsoup4>=4.13.4",
27 |     "html5lib>=1.1",
28 |     "click>=8.2.0",
29 |     "pluggy>=1.6.0",
30 |     "click-default-group>=1.2.4",
31 |     "pip",
32 | ]
33 | 
34 | [project.optional-dependencies]
35 | dev = [
36 |     "pytest",
37 | ]
38 | 
39 | [project.scripts]
40 | mdcli = "omni_article_markdown.cli:cli"
41 | 
42 | [project.urls]
43 | Homepage = "https://github.com/caol64/omni-article-markdown"
44 | 
45 | [build-system]
46 | requires = ["hatchling"]
47 | build-backend = "hatchling.build"
48 | 
49 | [tool.hatch.build]
50 | exclude = [
51 |     "/data",
52 |     "/plugins",
53 |     "/dist",
54 | ]
55 | 
56 | [tool.hatch.build.targets.sdist]
57 | include = [
58 |     "/src/omni_article_markdown",
59 |     "/README.md",
60 |     "/pyproject.toml",
61 | ]
62 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/readers.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | 
 3 | import requests
 4 | 
 5 | from .extractor import Extractor
 6 | from .hookspecs import ReaderPlugin
 7 | from .plugins import pm
 8 | from .utils import REQUEST_HEADERS
 9 | 
10 | 
11 | class Reader(ABC):
12 |     @abstractmethod
13 |     def read(self) -> str: ...
14 | 
15 |     def extractor(self) -> Extractor | None:
16 |         return None
17 | 
18 | 
19 | class ReaderFactory:
20 |     @staticmethod
21 |     def create(url_or_path: str) -> Reader:
22 |         custom_plugin_reader = pm.hook.get_custom_reader(url=url_or_path)
23 |         if custom_plugin_reader:
24 | 
25 |             class PluginReaderAdapter(Reader):
26 |                 def __init__(self, plugin: ReaderPlugin, url: str):
27 |                     self.plugin = plugin
28 |                     self.url = url
29 | 
30 |                 def read(self) -> str:
31 |                     return self.plugin.read(self.url)
32 | 
33 |             return PluginReaderAdapter(custom_plugin_reader, url_or_path)
34 |         if url_or_path.startswith("http"):
35 |             return HtmlReader(url_or_path)
36 |         return FileReader(url_or_path)
37 | 
38 | 
39 | class HtmlReader(Reader):
40 |     def __init__(self, url_or_path: str):
41 |         self.url_or_path = url_or_path
42 | 
43 |     def read(self) -> str:
44 |         response = requests.get(self.url_or_path, headers=REQUEST_HEADERS)
45 |         response.encoding = "utf-8"
46 |         return response.text
47 | 
48 | 
49 | class FileReader(Reader):
50 |     def __init__(self, url_or_path: str):
51 |         self.url_or_path = url_or_path
52 | 
53 |     def read(self) -> str:
54 |         with open(self.url_or_path, encoding="utf8") as f:
55 |             return f.read()
56 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/yuque.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import re
 3 | from typing import override
 4 | 
 5 | import requests
 6 | from bs4 import BeautifulSoup
 7 | 
 8 | from ..extractor import Article, Extractor
 9 | from ..utils import REQUEST_HEADERS, filter_tag, get_og_url
10 | 
11 | 
12 | class YuqueExtractor(Extractor):
13 |     """
14 |     语雀
15 |     """
16 | 
17 |     @override
18 |     def can_handle(self, soup: BeautifulSoup) -> bool:
19 |         return get_og_url(soup).startswith("https://www.yuque.com")
20 | 
21 |     @override
22 |     def article_container(self) -> tuple:
23 |         return ("", {})
24 | 
25 |     @override
26 |     def extract_article(self, soup: BeautifulSoup) -> Article:
27 |         script_tag = filter_tag(soup.find("script", string=re.compile(r"decodeURIComponent")))
28 |         if script_tag:
29 |             raw_js = script_tag.string
30 |             if raw_js:
31 |                 match = re.search(r'decodeURIComponent\s*\(\s*"([^"]+)"\s*\)', raw_js)
32 |                 if match:
33 |                     encoded_str = match.group(1)
34 | 
35 |                     from urllib.parse import unquote
36 | 
37 |                     decoded_str = unquote(encoded_str)
38 |                     decoded_json = json.loads(decoded_str)
39 |                     # print(decoded_json)
40 |                     doc = decoded_json["doc"]
41 |                     if doc and doc["book_id"]:
42 |                         book_id = str(doc["book_id"])
43 |                         slug = str(doc["slug"])
44 |                         response = requests.get(f"https://www.yuque.com/api/docs/{slug}?book_id={book_id}&mode=markdown", headers=REQUEST_HEADERS)
45 |                         response.encoding = "utf-8"
46 |                         resp = response.json()
47 |                         # print(resp)
48 |                         return Article(str(resp["data"]["title"]), None, None, str(resp["data"]["sourcecode"]))
49 |         return Article("", None, None, "")
50 | 


--------------------------------------------------------------------------------
/plugins/omnimd-browser-reader/browser.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from runpy import run_module
 3 | from typing import override
 4 | 
 5 | from playwright.sync_api import Browser, Playwright, sync_playwright
 6 | 
 7 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl
 8 | from omni_article_markdown.utils import BROWSER_TARGET_HOSTS, REQUEST_HEADERS
 9 | 
10 | 
11 | class BrowserPlugin(ReaderPlugin):
12 |     @override
13 |     def can_handle(self, url: str) -> bool:
14 |         return any(host in url for host in BROWSER_TARGET_HOSTS)
15 | 
16 |     @override
17 |     def read(self, url: str) -> str:
18 |         def try_launch_browser(p: Playwright) -> Browser:
19 |             try:
20 |                 return p.chromium.launch(headless=True)
21 |             except Exception as e:
22 |                 # Playwright not installed or browser missing
23 |                 if "Executable doesn't exist" in str(e) or "playwright install" in str(e):
24 |                     print("[INFO] Chromium not installed, installing with 'playwright install chromium'...")
25 |                     original_argv = sys.argv
26 |                     args = ["playwright", "install", "chromium"]
27 |                     sys.argv = args
28 |                     run_module("playwright", run_name="__main__")
29 |                     sys.argv = original_argv
30 |                     # Try again
31 |                     return p.chromium.launch(headless=True)
32 |                 raise  # re-raise other exceptions
33 | 
34 |         with sync_playwright() as p:
35 |             browser = try_launch_browser(p)
36 |             context = browser.new_context(
37 |                 user_agent=REQUEST_HEADERS["User-Agent"],
38 |                 java_script_enabled=True,
39 |                 extra_http_headers=REQUEST_HEADERS,
40 |             )
41 |             page = context.new_page()
42 |             page.goto(url, wait_until="networkidle")
43 |             html = page.content()
44 |             page.close()
45 |             context.close()
46 |             browser.close()
47 |         return html
48 | 
49 | 
50 | @hookimpl
51 | def get_custom_reader(url: str) -> ReaderPlugin | None:
52 |     plugin_instance = BrowserPlugin()
53 |     if plugin_instance.can_handle(url):
54 |         return plugin_instance
55 |     return None
56 | 


--------------------------------------------------------------------------------
/plugins/omnimd-freedium-reader/freedium.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from importlib import resources
 3 | from runpy import run_module
 4 | from typing import override
 5 | 
 6 | from playwright.sync_api import Browser, Playwright, sync_playwright
 7 | 
 8 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl
 9 | from omni_article_markdown.utils import REQUEST_HEADERS
10 | 
11 | 
12 | class FreediumPlugin(ReaderPlugin):
13 |     @override
14 |     def can_handle(self, url: str) -> bool:
15 |         return "freedium.cfd" in url
16 | 
17 |     @override
18 |     def read(self, url: str) -> str:
19 |         def try_launch_browser(p: Playwright) -> Browser:
20 |             try:
21 |                 return p.chromium.launch(headless=True)
22 |             except Exception as e:
23 |                 # Playwright not installed or browser missing
24 |                 if "Executable doesn't exist" in str(e) or "playwright install" in str(e):
25 |                     print("[INFO] Chromium not installed, installing with 'playwright install chromium'...")
26 |                     original_argv = sys.argv
27 |                     args = ["playwright", "install", "chromium"]
28 |                     sys.argv = args
29 |                     run_module("playwright", run_name="__main__")
30 |                     sys.argv = original_argv
31 |                     # Try again
32 |                     return p.chromium.launch(headless=True)
33 |                 raise  # re-raise other exceptions
34 | 
35 |         with sync_playwright() as p:
36 |             browser = try_launch_browser(p)
37 |             context = browser.new_context(
38 |                 user_agent=REQUEST_HEADERS["User-Agent"],
39 |                 java_script_enabled=True,
40 |                 extra_http_headers=REQUEST_HEADERS,
41 |             )
42 |             with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path:
43 |                 context.add_init_script(path=str(js_path))
44 |             page = context.new_page()
45 |             page.goto(url, wait_until="networkidle")
46 |             html = page.content()
47 |             page.close()
48 |             context.close()
49 |             browser.close()
50 |         return html
51 | 
52 | 
53 | @hookimpl
54 | def get_custom_reader(url: str) -> ReaderPlugin | None:
55 |     plugin_instance = FreediumPlugin()
56 |     if plugin_instance.can_handle(url):
57 |         return plugin_instance
58 |     return None
59 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/omni_article_md.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | import pkgutil
 3 | from dataclasses import dataclass
 4 | from pathlib import Path
 5 | 
 6 | from bs4 import BeautifulSoup
 7 | 
 8 | from .extractor import Article, DefaultExtractor, Extractor
 9 | from .parser import HtmlMarkdownParser
10 | from .readers import ReaderFactory
11 | from .utils import to_snake_case
12 | 
13 | 
14 | @dataclass
15 | class ReaderContext:
16 |     raw_html: str
17 | 
18 | 
19 | @dataclass
20 | class ExtractorContext:
21 |     article: Article
22 | 
23 | 
24 | @dataclass
25 | class ParserContext:
26 |     title: str
27 |     markdown: str
28 | 
29 | 
30 | class OmniArticleMarkdown:
31 |     DEFAULT_SAVE_PATH = "./"
32 | 
33 |     def __init__(self, url_or_path: str):
34 |         self.url_or_path = url_or_path
35 | 
36 |     def parse(self) -> ParserContext:
37 |         reader_ctx = self._read_html(self.url_or_path)
38 |         extractor_ctx = self._extract_article(reader_ctx)
39 |         parser_ctx = self._parse_html(extractor_ctx)
40 |         return parser_ctx
41 | 
42 |     def save(self, ctx: ParserContext, save_path: str = "") -> str:
43 |         save_path = save_path or self.DEFAULT_SAVE_PATH
44 |         file_path = Path(save_path)
45 |         if file_path.is_dir():
46 |             filename = f"{to_snake_case(ctx.title)}.md"
47 |             file_path = file_path / filename
48 |         with file_path.open("w", encoding="utf-8") as f:
49 |             f.write(ctx.markdown)
50 |         return str(file_path.resolve())
51 | 
52 |     def _read_html(self, url_or_path: str) -> ReaderContext:
53 |         reader = ReaderFactory.create(url_or_path)
54 |         raw_html = reader.read()
55 |         return ReaderContext(raw_html)
56 | 
57 |     def _extract_article(self, ctx: ReaderContext) -> ExtractorContext:
58 |         soup = BeautifulSoup(ctx.raw_html, "html5lib")
59 |         for extract in load_extractors():
60 |             article = extract.extract(soup)
61 |             if article:
62 |                 break
63 |         else:
64 |             article = DefaultExtractor().extract(soup)
65 |         if not article:
66 |             raise ValueError("Failed to extract article content.")
67 |         return ExtractorContext(article)
68 | 
69 |     def _parse_html(self, ctx: ExtractorContext) -> ParserContext:
70 |         parser = HtmlMarkdownParser(ctx.article)
71 |         result = parser.parse()
72 |         return ParserContext(title=result[0], markdown=result[1])
73 | 
74 | 
75 | def load_extractors(package_name="extractors") -> list[Extractor]:
76 |     extractors_package = Path(__file__).parent / package_name
77 |     extractors = []
78 |     for _loader, module_name, _is_pkg in pkgutil.iter_modules([extractors_package.resolve()]):
79 |         module = importlib.import_module(f"omni_article_markdown.{package_name}.{module_name}")
80 |         for attr in dir(module):
81 |             cls = getattr(module, attr)
82 |             if isinstance(cls, type) and issubclass(cls, Extractor) and cls is not Extractor:
83 |                 extractors.append(cls())
84 |     return extractors
85 | 


--------------------------------------------------------------------------------
/ruff.toml:
--------------------------------------------------------------------------------
  1 | # Ruff configuration for the entire project
  2 | # This ensures consistent formatting across all Python code
  3 | 
  4 | # Use 120 character line length to prevent splitting Reflex lambdas
  5 | line-length = 120
  6 | 
  7 | # Target Python 3.13+
  8 | target-version = "py313"
  9 | 
 10 | # Exclude generated and build directories
 11 | extend-exclude = [
 12 |     ".venv",
 13 |     "venv",
 14 |     "__pycache__",
 15 |     "*.pyc",
 16 |     "*.yaml",
 17 |     "node_modules",
 18 |     "webview",
 19 |     "bin",
 20 |     "build",
 21 |     "dist",
 22 | ]
 23 | 
 24 | [format]
 25 | # Use double quotes for strings
 26 | quote-style = "double"
 27 | 
 28 | # Use 4 spaces for indentation
 29 | indent-style = "space"
 30 | 
 31 | # Respect magic trailing commas
 32 | skip-magic-trailing-comma = false
 33 | 
 34 | # Use Unix line endings
 35 | line-ending = "auto"
 36 | 
 37 | [lint]
 38 | # Enable specific rule sets
 39 | select = [
 40 |     "E",   # pycodestyle errors
 41 |     "W",   # pycodestyle warnings (includes W292 for newline at EOF)
 42 |     "F",   # Pyflakes
 43 |     "I",   # isort
 44 |     "N",   # pep8-naming
 45 |     "UP",  # pyupgrade
 46 |     "B",   # flake8-bugbear
 47 |     "C4",  # flake8-comprehensions
 48 |     "DTZ", # flake8-datetimez
 49 |     "T10", # flake8-debugger
 50 |     "RET", # flake8-return
 51 |     "SIM", # flake8-simplify
 52 |     "TID", # flake8-tidy-imports
 53 | ]
 54 | 
 55 | # Ignore specific rules
 56 | ignore = [
 57 |     "E501",   # Line too long (handled by formatter)
 58 |     "E712",   # Comparison to True/False (needed for SQLAlchemy)
 59 |     "B008",   # Do not perform function calls in argument defaults
 60 |     "B904",   # Within except clause, use raise from (not always needed)
 61 |     "UP007",  # Use X | Y for type unions (keep Optional for clarity)
 62 |     "SIM108", # Use ternary operator (sometimes if/else is clearer)
 63 |     "DTZ005", # datetime.now() without tz (okay for timestamps)
 64 |     "N999",   # Invalid module name (web-bff is valid)
 65 |     "TID252", # Relative imports from parent (used in package structure)
 66 |     "RET504", # Unnecessary assignment before return (sometimes clearer)
 67 | ]
 68 | 
 69 | # Allow unused variables when prefixed with underscore
 70 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$"
 71 | 
 72 | [lint.per-file-ignores]
 73 | # Ignore import violations in __init__ files
 74 | "__init__.py" = ["E402", "F401", "F403"]
 75 | 
 76 | # Ignore missing docstrings in tests
 77 | "test_*.py" = ["D100", "D101", "D102", "D103", "D104"]
 78 | "tests/*" = ["D100", "D101", "D102", "D103", "D104"]
 79 | 
 80 | # Allow dynamic imports in recipe files
 81 | "recipes/*" = ["F401", "F403"]
 82 | 
 83 | [lint.isort]
 84 | # Combine as imports
 85 | combine-as-imports = true
 86 | 
 87 | # Force single line imports
 88 | force-single-line = false
 89 | 
 90 | # Order imports by type
 91 | section-order = [
 92 |     "future",
 93 |     "standard-library",
 94 |     "third-party",
 95 |     "first-party",
 96 |     "local-folder",
 97 | ]
 98 | 
 99 | [lint.pydocstyle]
100 | # Use Google docstring convention
101 | convention = "google"
102 | 


--------------------------------------------------------------------------------
/plugins/omnimd-zhihu-reader/zhihu.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from importlib import resources
 3 | from runpy import run_module
 4 | from typing import override
 5 | 
 6 | import requests
 7 | from playwright.sync_api import Browser, Cookie, Playwright, sync_playwright
 8 | 
 9 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl
10 | from omni_article_markdown.store import Store
11 | from omni_article_markdown.utils import REQUEST_HEADERS
12 | 
13 | 
14 | class ZhihuPlugin(ReaderPlugin):
15 |     @override
16 |     def can_handle(self, url: str) -> bool:
17 |         return "zhihu.com" in url
18 | 
19 |     @override
20 |     def read(self, url: str) -> str:
21 |         store = Store()
22 |         cookies_raw = store.load("zhihu_cookies")
23 | 
24 |         if not cookies_raw:
25 |             print("未找到知乎登录信息，尝试模拟登录...")
26 |             cookies_raw = self._get_zhihu_cookies(url)
27 |             if not cookies_raw:
28 |                 raise Exception("无法获取知乎登录信息")
29 | 
30 |         cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw)
31 |         response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies)
32 | 
33 |         # 如果初始请求失败，则尝试重新获取 cookie 并重试
34 |         if response.status_code == 403:
35 |             print("Cookie 失效，重新模拟登录知乎...")
36 |             cookies_raw = self._get_zhihu_cookies(url)
37 |             if not cookies_raw:
38 |                 raise Exception("重新模拟登录失败，无法访问知乎内容")
39 |             cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw)
40 |             response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies)
41 | 
42 |         response.encoding = "utf-8"
43 |         return response.text
44 | 
45 |     def _get_zhihu_cookies(self, url: str) -> list[Cookie]:
46 |         def try_launch_browser(p: Playwright) -> Browser:
47 |             try:
48 |                 return p.chromium.launch(headless=True)
49 |             except Exception as e:
50 |                 # Playwright not installed or browser missing
51 |                 if "Executable doesn't exist" in str(e) or "playwright install" in str(e):
52 |                     print("[INFO] Chromium not installed, installing with 'playwright install chromium'...")
53 |                     original_argv = sys.argv
54 |                     args = ["playwright", "install", "chromium"]
55 |                     sys.argv = args
56 |                     run_module("playwright", run_name="__main__")
57 |                     sys.argv = original_argv
58 |                     # Try again
59 |                     return p.chromium.launch(headless=True)
60 |                 raise  # re-raise other exceptions
61 | 
62 |         with sync_playwright() as p:
63 |             browser = try_launch_browser(p)
64 |             context = browser.new_context(
65 |                 user_agent=REQUEST_HEADERS["User-Agent"],
66 |                 java_script_enabled=True,
67 |                 extra_http_headers=REQUEST_HEADERS,
68 |             )
69 |             with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path:
70 |                 context.add_init_script(path=str(js_path))
71 |             page = context.new_page()
72 |             page.goto(url, wait_until="networkidle")
73 |             cookies = context.cookies()
74 |             store = Store()
75 |             store.save("zhihu_cookies", cookies)
76 |             page.close()
77 |             context.close()
78 |             browser.close()
79 |             return cookies
80 | 
81 |     def _convert_playwright_cookies_to_requests_dict(self, playwright_cookies: list[Cookie]) -> dict[str, str]:
82 |         requests_cookies = {}
83 |         for cookie in playwright_cookies:
84 |             requests_cookies[cookie.get("name")] = cookie.get("value")
85 |         return requests_cookies
86 | 
87 | 
88 | @hookimpl
89 | def get_custom_reader(url: str) -> ReaderPlugin | None:
90 |     plugin_instance = ZhihuPlugin()
91 |     if plugin_instance.can_handle(url):
92 |         return plugin_instance
93 |     return None
94 | 


--------------------------------------------------------------------------------
/plugins/omnimd-toutiao-reader/toutiao.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from importlib import resources
 3 | from runpy import run_module
 4 | from typing import override
 5 | 
 6 | import requests
 7 | from playwright.sync_api import Browser, Cookie, Playwright, sync_playwright
 8 | 
 9 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl
10 | from omni_article_markdown.store import Store
11 | from omni_article_markdown.utils import REQUEST_HEADERS
12 | 
13 | 
14 | class ToutiaoPlugin(ReaderPlugin):
15 |     @override
16 |     def can_handle(self, url: str) -> bool:
17 |         return "toutiao.com" in url
18 | 
19 |     @override
20 |     def read(self, url: str) -> str:
21 |         store = Store()
22 |         cookies_raw = store.load("toutiao_cookies")
23 | 
24 |         if not cookies_raw:
25 |             print("未找到头条登录信息，尝试模拟登录...")
26 |             cookies_raw = self._get_toutiao_cookies(url)
27 |             if not cookies_raw:
28 |                 raise Exception("无法获取头条登录信息")
29 | 
30 |         cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw)
31 |         response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies)
32 |         response.encoding = "utf-8"
33 |         html = response.text
34 | 
35 |         # 如果初始请求失败，则尝试重新获取 cookie 并重试
36 |         if "您需要允许该网站执行 JavaScript" in html:
37 |             print("Cookie 失效，重新模拟登录头条...")
38 |             cookies_raw = self._get_toutiao_cookies(url)
39 |             if not cookies_raw:
40 |                 raise Exception("重新模拟登录失败，无法访问头条内容")
41 |             cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw)
42 |             response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies)
43 | 
44 |         response.encoding = "utf-8"
45 |         return response.text
46 | 
47 |     def _get_toutiao_cookies(self, url: str) -> list[Cookie]:
48 |         def try_launch_browser(p: Playwright) -> Browser:
49 |             try:
50 |                 return p.chromium.launch(headless=True)
51 |             except Exception as e:
52 |                 # Playwright not installed or browser missing
53 |                 if "Executable doesn't exist" in str(e) or "playwright install" in str(e):
54 |                     print("[INFO] Chromium not installed, installing with 'playwright install chromium'...")
55 |                     original_argv = sys.argv
56 |                     args = ["playwright", "install", "chromium"]
57 |                     sys.argv = args
58 |                     run_module("playwright", run_name="__main__")
59 |                     sys.argv = original_argv
60 |                     # Try again
61 |                     return p.chromium.launch(headless=True)
62 |                 raise  # re-raise other exceptions
63 | 
64 |         with sync_playwright() as p:
65 |             browser = try_launch_browser(p)
66 |             context = browser.new_context(
67 |                 user_agent=REQUEST_HEADERS["User-Agent"],
68 |                 java_script_enabled=True,
69 |                 extra_http_headers=REQUEST_HEADERS,
70 |             )
71 |             with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path:
72 |                 context.add_init_script(path=str(js_path))
73 |             page = context.new_page()
74 |             page.goto(url, wait_until="networkidle")
75 |             cookies = context.cookies()
76 |             store = Store()
77 |             store.save("toutiao_cookies", cookies)
78 |             page.close()
79 |             context.close()
80 |             browser.close()
81 |             return cookies
82 | 
83 |     def _convert_playwright_cookies_to_requests_dict(self, playwright_cookies: list[Cookie]) -> dict[str, str]:
84 |         requests_cookies = {}
85 |         for cookie in playwright_cookies:
86 |             requests_cookies[cookie.get("name")] = cookie.get("value")
87 |         return requests_cookies
88 | 
89 | 
90 | @hookimpl
91 | def get_custom_reader(url: str) -> ReaderPlugin | None:
92 |     plugin_instance = ToutiaoPlugin()
93 |     if plugin_instance.can_handle(url):
94 |         return plugin_instance
95 |     return None
96 | 


--------------------------------------------------------------------------------
/tests/test_extractor.py:
--------------------------------------------------------------------------------
  1 | from bs4.element import Tag
  2 | 
  3 | from omni_article_markdown.extractor import (
  4 |     Article,
  5 |     DefaultExtractor,
  6 |     extract_article_from_soup,
  7 |     remove_duplicate_titles,
  8 | )
  9 | 
 10 | # ---- mock utils ----
 11 | 
 12 | def make_html(content: str, title="Page Title", description="Desc", url="https://example.com") -> str:
 13 |     return f"""
 14 |     <html>
 15 |         <head>
 16 |             <title>{title}</title>
 17 |             <meta property="og:description" content="{description}">
 18 |             <meta property="og:url" content="{url}">
 19 |         </head>
 20 |         <body>
 21 |             {content}
 22 |         </body>
 23 |     </html>
 24 |     """
 25 | 
 26 | 
 27 | def test_extract_article_from_soup_basic(make_soup):
 28 |     html = "<html><body><article><p>Hello</p></article></body></html>"
 29 |     soup = make_soup(html)
 30 |     tag = extract_article_from_soup(soup, ("article", None))
 31 |     assert tag is not None
 32 |     assert tag.name == "article"
 33 |     assert "Hello" in tag.text
 34 | 
 35 | 
 36 | def test_default_extractor_basic_behavior(make_soup):
 37 |     extractor = DefaultExtractor()
 38 |     html = make_html("<article><p>Hello World</p></article>")
 39 |     soup = make_soup(html)
 40 | 
 41 |     assert extractor.can_handle(soup) is True
 42 |     assert isinstance(extractor.article_container(), list | tuple)
 43 | 
 44 | 
 45 | def test_cleaning_tags_and_attrs(make_soup):
 46 |     html = make_html("""
 47 |         <article>
 48 |             <p>Visible</p>
 49 |             <style>p{color:red}</style>
 50 |             <p style="display:none">Hidden</p>
 51 |             <div hidden>Invisible</div>
 52 |             <!-- comment -->
 53 |         </article>
 54 |     """)
 55 |     extractor = DefaultExtractor()
 56 |     article = extractor.extract(make_soup(html))
 57 |     assert article is not None
 58 |     assert isinstance(article.body, Tag)
 59 |     text = article.body.get_text()
 60 |     # 不应包含隐藏元素、style、注释内容
 61 |     assert "Visible" in text
 62 |     assert "Hidden" not in text
 63 |     assert "Invisible" not in text
 64 |     assert "color:red" not in text
 65 | 
 66 | 
 67 | def test_extract_metadata(make_soup):
 68 |     html = make_html("<article><p>Body</p></article>", title="Hello", description="A test desc", url="https://abc.com")
 69 |     soup = make_soup(html)
 70 |     extractor = DefaultExtractor()
 71 | 
 72 |     assert extractor.extract_title(soup) == "Hello"
 73 |     assert extractor.extract_description(soup) == "A test desc"
 74 |     assert extractor.extract_url(soup) == "https://abc.com"
 75 | 
 76 | 
 77 | def test_remove_duplicate_titles(make_soup):
 78 |     html = "<article><h1>Same Title</h1><p>Body text</p></article>"
 79 |     soup = make_soup(html)
 80 |     article = Article(title="Same Title", url=None, description=None, body=soup.article)
 81 |     remove_duplicate_titles(article)
 82 | 
 83 |     # 标题应保持一致
 84 |     assert article.title == "Same Title"
 85 |     # H1 应被删除
 86 |     assert article.body.find("h1") is None
 87 | 
 88 | 
 89 | def test_remove_duplicate_titles_different(make_soup):
 90 |     html = "<article><h1>Other Title</h1><p>Body text</p></article>"
 91 |     soup = make_soup(html)
 92 |     article = Article(title="Main Page", url=None, description=None, body=soup.article)
 93 |     remove_duplicate_titles(article)
 94 | 
 95 |     # 原标题不变，H1 保留
 96 |     assert article.title == "Main Page"
 97 |     assert article.body.find("h1") is not None
 98 | 
 99 | 
100 | class CustomExtractor(DefaultExtractor):
101 |     def can_handle(self, soup):
102 |         title = soup.title.text.strip() if soup.title else ""
103 |         return "Special" in title
104 | 
105 |     def article_container(self):
106 |         return ("body", None)
107 | 
108 | 
109 | def test_custom_extractor_can_handle(make_soup):
110 |     html = make_html("<p>Hello</p>", title="Special Page")
111 |     extractor = CustomExtractor()
112 |     soup = make_soup(html)
113 |     assert extractor.can_handle(soup) is True
114 | 
115 |     article = extractor.extract(soup)
116 |     assert article is not None
117 |     assert isinstance(article.body, Tag)
118 |     assert "Hello" in article.body.text
119 | 


--------------------------------------------------------------------------------
/tests/test_parser.py:
--------------------------------------------------------------------------------
  1 | from omni_article_markdown.extractor import Article
  2 | from omni_article_markdown.parser import HtmlMarkdownParser
  3 | 
  4 | 
  5 | def test_basic_paragraph(make_soup):
  6 |     html = "<p>Hello world</p>"
  7 |     article = Article("Test", "", "", make_soup(html))
  8 |     parser = HtmlMarkdownParser(article)
  9 |     title, md = parser.parse()
 10 |     assert "# Test" in md
 11 |     assert "Hello world" in md
 12 | 
 13 | 
 14 | def test_heading_and_strong(make_soup):
 15 |     html = "<h2>Subtitle</h2><p><strong>bold</strong> and <em>italic</em></p>"
 16 |     article = Article("Title", "", "", make_soup(html))
 17 |     parser = HtmlMarkdownParser(article)
 18 |     _, md = parser.parse()
 19 |     assert "## Subtitle" in md
 20 |     assert "**bold**" in md
 21 |     assert "*italic*" in md
 22 | 
 23 | 
 24 | def test_link_parsing(make_soup):
 25 |     html = '<p><a href="https://example.com">Example</a></p>'
 26 |     article = Article("Title", "", "", make_soup(html))
 27 |     parser = HtmlMarkdownParser(article)
 28 |     _, md = parser.parse()
 29 |     assert "[Example](https://example.com)" in md
 30 | 
 31 | 
 32 | def test_unordered_list(make_soup):
 33 |     html = "<ul><li>Apple</li><li>Banana</li></ul>"
 34 |     article = Article("Fruits", "", "", make_soup(html))
 35 |     parser = HtmlMarkdownParser(article)
 36 |     _, md = parser.parse()
 37 |     assert "- Apple" in md
 38 |     assert "- Banana" in md
 39 | 
 40 | 
 41 | def test_ordered_list(make_soup):
 42 |     html = "<ol><li>One</li><li>Two</li></ol>"
 43 |     article = Article("Numbers", "", "", make_soup(html))
 44 |     parser = HtmlMarkdownParser(article)
 45 |     _, md = parser.parse()
 46 |     assert "1. One" in md
 47 |     assert "2. Two" in md
 48 | 
 49 | 
 50 | def test_blockquote(make_soup):
 51 |     html = "<blockquote><p>Quote me</p></blockquote>"
 52 |     article = Article("Quote", "", "", make_soup(html))
 53 |     parser = HtmlMarkdownParser(article)
 54 |     _, md = parser.parse()
 55 |     assert "> Quote me" in md
 56 | 
 57 | 
 58 | def test_codeblock(make_soup):
 59 |     html = "<pre><code>print('Hello')</code></pre>"
 60 |     article = Article("Code", "", "", make_soup(html))
 61 |     parser = HtmlMarkdownParser(article)
 62 |     _, md = parser.parse()
 63 |     assert "```" in md
 64 |     assert "print('Hello')" in md
 65 | 
 66 | 
 67 | def test_inline_code(make_soup):
 68 |     html = "<p>Run <code>ls -al</code> command.</p>"
 69 |     article = Article("Cmd", "", "", make_soup(html))
 70 |     parser = HtmlMarkdownParser(article)
 71 |     _, md = parser.parse()
 72 |     assert "`ls -al`" in md
 73 | 
 74 | 
 75 | def test_image_absolute_url(make_soup):
 76 |     html = '<img src="https://example.com/image.png" alt="demo">'
 77 |     article = Article("Img", "", "", make_soup(html))
 78 |     parser = HtmlMarkdownParser(article)
 79 |     _, md = parser.parse()
 80 |     assert "![demo](https://example.com/image.png)" in md
 81 | 
 82 | 
 83 | def test_image_relative_url(make_soup):
 84 |     html = '<img src="../images/demo.png" alt="demo">'
 85 |     article = Article("Img", "https://site.com/docs/page.html", "", make_soup(html))
 86 |     parser = HtmlMarkdownParser(article)
 87 |     _, md = parser.parse()
 88 |     assert "![demo](https://site.com/images/demo.png)" in md
 89 | 
 90 | 
 91 | def test_table_parsing(make_soup):
 92 |     html = """
 93 |     <table>
 94 |         <tr><th>Name</th><th>Age</th></tr>
 95 |         <tr><td>Alice</td><td>18</td></tr>
 96 |         <tr><td>Bob</td><td>20</td></tr>
 97 |     </table>
 98 |     """
 99 |     article = Article("Table", "", "", make_soup(html))
100 |     parser = HtmlMarkdownParser(article)
101 |     _, md = parser.parse()
102 |     assert "| Name | Age |" in md
103 |     assert "| Alice | 18 |" in md
104 | 
105 | 
106 | def test_mathjax_equations(make_soup):
107 |     html = "<math><semantics><annotation encoding='application/x-tex'>E=mc^2</annotation></semantics></math>"
108 |     article = Article("Math", "", "", make_soup(html))
109 |     parser = HtmlMarkdownParser(article)
110 |     _, md = parser.parse()
111 |     assert "$$ E=mc^2 $$" in md
112 | 
113 | 
114 | def test_post_handler_math(make_soup):
115 |     html = "<p>\\(x+y\\) and \\[E=mc^2\\]</p>"
116 |     article = Article("Math", "", "", make_soup(html))
117 |     parser = HtmlMarkdownParser(article)
118 |     _, md = parser.parse()
119 |     assert "$x+y$" in md
120 |     assert "$$E=mc^2$$" in md
121 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/cli.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | from runpy import run_module
  3 | 
  4 | import click
  5 | from click_default_group import DefaultGroup
  6 | 
  7 | from . import DEFAULT_PLUGINS
  8 | from .omni_article_md import OmniArticleMarkdown
  9 | 
 10 | 
 11 | @click.group(cls=DefaultGroup, default="parse", default_if_no_args=True)
 12 | def cli():
 13 |     """
 14 |     A CLI tool to parse articles and save them as Markdown.
 15 |     It also supports installing plugins.
 16 |     """
 17 |     ...
 18 | 
 19 | 
 20 | @cli.command(name="parse")
 21 | @click.argument("url_or_path")
 22 | @click.option(
 23 |     "-s",
 24 |     "--save",
 25 |     help="Save result (default: ./). Provide a path to save elsewhere.",
 26 |     type=click.Path(dir_okay=True, writable=True),
 27 | )
 28 | def parse_article(url_or_path: str, save: str | None):
 29 |     """
 30 |     Parses an article from a URL or local path and outputs/saves it as Markdown.
 31 |     """
 32 |     handler = OmniArticleMarkdown(url_or_path)
 33 |     parser_ctx = handler.parse()
 34 | 
 35 |     if save is None:
 36 |         click.echo(parser_ctx.markdown)
 37 |     else:
 38 |         save_path = handler.save(parser_ctx, save)
 39 |         click.echo(f"Article saved to: {save_path}")
 40 | 
 41 | 
 42 | @cli.command()
 43 | @click.argument("plugin_name")
 44 | @click.option("-U", "--upgrade", is_flag=True, help="Upgrade the plugin if already installed.", default=False)
 45 | @click.option(
 46 |     "-e",
 47 |     "--editable",
 48 |     is_flag=True,
 49 |     help="Install the editable package based on the provided local file path",
 50 |     default=False,
 51 | )
 52 | def install(plugin_name: str, upgrade: bool, editable: bool):
 53 |     """
 54 |     Installs a plugin for this application.
 55 |     For example, to install the 'zhihu' plugin: mdcli install zhihu
 56 |     """
 57 |     actual_package_name = (
 58 |         plugin_name if editable or plugin_name not in DEFAULT_PLUGINS else DEFAULT_PLUGINS[plugin_name]
 59 |     )
 60 | 
 61 |     click.echo(f"Attempting to install plugin: {actual_package_name}...")
 62 |     args = ["pip", "install"]
 63 |     if upgrade:
 64 |         args.append("--upgrade")
 65 |     args.append(actual_package_name)
 66 | 
 67 |     original_argv = sys.argv
 68 |     try:
 69 |         sys.argv = args
 70 |         run_module("pip", run_name="__main__")
 71 |         click.echo(f"Plugin '{actual_package_name}' processed by pip.")
 72 |         click.echo("If the plugin provides new functionality, it should now be available.")
 73 |         click.echo(
 74 |             "You might need to restart the application for changes to take full effect if it involves runtime loading during startup."
 75 |         )
 76 |     except Exception as e:
 77 |         click.echo(f"Failed to process plugin '{actual_package_name}' with pip: {e}", err=True)
 78 |         click.echo("Please ensure pip is installed and the package name is correct.", err=True)
 79 |     finally:
 80 |         sys.argv = original_argv
 81 | 
 82 | 
 83 | @cli.command()
 84 | @click.argument("plugin_name")
 85 | @click.option("-y", "--yes", is_flag=True, help="Don't ask for confirmation before uninstalling.", default=False)
 86 | def uninstall(plugin_name: str, yes: bool):
 87 |     """
 88 |     Uninstalls a plugin for this application.
 89 |     For example, to uninstall the 'zhihu' plugin: mdcli uninstall zhihu
 90 |     """
 91 |     actual_package_name = DEFAULT_PLUGINS.get(plugin_name, plugin_name)
 92 | 
 93 |     click.echo(f"Attempting to uninstall plugin: {actual_package_name}...")
 94 |     args = ["pip", "uninstall"]
 95 |     if yes:
 96 |         args.append("-y")
 97 |     args.append(actual_package_name)
 98 | 
 99 |     original_argv = sys.argv
100 |     try:
101 |         sys.argv = args
102 |         run_module("pip", run_name="__main__")
103 |         click.echo(f"Plugin '{actual_package_name}' uninstallation processed by pip.")
104 |         click.echo(
105 |             "The plugin's functionality should no longer be available after the next application start (or if dynamically unloaded)."
106 |         )
107 |     except Exception as e:
108 |         click.echo(f"Failed to process uninstallation of plugin '{actual_package_name}' with pip: {e}", err=True)
109 |         click.echo("Please ensure pip is installed and the package name is correct.", err=True)
110 |     finally:
111 |         sys.argv = original_argv
112 | 
113 | 
114 | if __name__ == "__main__":
115 |     cli()
116 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/extractor.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from abc import ABC, abstractmethod
  3 | from collections.abc import Callable
  4 | from dataclasses import dataclass
  5 | from typing import override
  6 | 
  7 | from bs4 import BeautifulSoup
  8 | from bs4.element import Comment, Tag
  9 | 
 10 | from .utils import filter_tag, get_attr_text, get_canonical_url, get_og_description, get_og_title, get_og_url, get_title
 11 | 
 12 | TAGS_TO_CLEAN: list[Callable[[Tag], bool]] = [
 13 |     lambda el: el.name in ("style", "link", "button", "footer", "header"),
 14 |     lambda el: el.name == "script" and "src" not in el.attrs,
 15 |     lambda el: el.name == "script"
 16 |         and el.has_attr("src")
 17 |         and not get_attr_text(el.attrs["src"]).startswith("https://gist.github.com"),
 18 | ]
 19 | 
 20 | ATTRS_TO_CLEAN: list[Callable[[Tag], bool]] = [
 21 |     lambda el: "style" in el.attrs
 22 |         and re.search(r"display\s*:\s*none", get_attr_text(el.attrs.get("style")), re.IGNORECASE) is not None,
 23 |     lambda el: "hidden" in el.attrs,
 24 |     lambda el: "class" in el.attrs and "katex-html" in el.attrs["class"],  # katex
 25 | ]
 26 | 
 27 | ARTICLE_CONTAINERS = [("article", None), ("main", None), ("body", None)]
 28 | 
 29 | 
 30 | @dataclass
 31 | class Article:
 32 |     title: str
 33 |     url: str | None
 34 |     description: str | None
 35 |     body: Tag | str
 36 | 
 37 | 
 38 | class Extractor(ABC):
 39 |     def __init__(self):
 40 |         self.tags_to_clean = TAGS_TO_CLEAN
 41 |         self.attrs_to_clean = ATTRS_TO_CLEAN
 42 | 
 43 |     def extract(self, soup: BeautifulSoup) -> Article | None:
 44 |         if self.can_handle(soup):
 45 |             # print(f"Using extractor: {self.__class__.__name__}")
 46 |             soup = self.pre_handle_soup(soup)
 47 |             article_container = self.article_container()
 48 |             if isinstance(article_container, tuple):
 49 |                 article_container = [article_container]
 50 |             for container in article_container:
 51 |                 article = self.extract_article(soup)
 52 |                 if article:
 53 |                     return article
 54 |                 article_tag = extract_article_from_soup(soup, container)
 55 |                 if article_tag:
 56 |                     for el in article_tag.find_all():
 57 |                         tag = filter_tag(el)
 58 |                         if tag:
 59 |                             if any(cond(tag) for cond in self.tags_to_clean):
 60 |                                 tag.decompose()
 61 |                                 continue
 62 |                             if tag.attrs and any(cond(tag) for cond in self.attrs_to_clean):
 63 |                                 tag.decompose()
 64 |                     for comment in article_tag.find_all(string=lambda text: isinstance(text, Comment)):
 65 |                         comment.extract()
 66 |                     self.extract_img(article_tag)
 67 |                     title = self.extract_title(soup)
 68 |                     description = self.extract_description(soup)
 69 |                     url = self.extract_url(soup)
 70 |                     article = Article(title=title, url=url, description=description, body=article_tag)
 71 |                     remove_duplicate_titles(article)
 72 |                     return article
 73 |         return None
 74 | 
 75 |     @abstractmethod
 76 |     def can_handle(self, soup: BeautifulSoup) -> bool: ...
 77 | 
 78 |     @abstractmethod
 79 |     def article_container(self) -> tuple | list: ...
 80 | 
 81 |     def extract_title(self, soup: BeautifulSoup) -> str:
 82 |         return get_og_title(soup) or get_title(soup)
 83 | 
 84 |     def extract_description(self, soup: BeautifulSoup) -> str:
 85 |         return get_og_description(soup)
 86 | 
 87 |     def extract_url(self, soup: BeautifulSoup) -> str:
 88 |         return get_og_url(soup) or get_canonical_url(soup)
 89 | 
 90 |     def extract_img(self, element: Tag) -> Tag:
 91 |         return element
 92 | 
 93 |     def extract_article(self, soup: BeautifulSoup) -> Article | None:
 94 |         return None
 95 | 
 96 |     def pre_handle_soup(self, soup: BeautifulSoup) -> BeautifulSoup:
 97 |         return soup
 98 | 
 99 | 
100 | class DefaultExtractor(Extractor):
101 |     @override
102 |     def can_handle(self, soup: BeautifulSoup) -> bool:
103 |         return True
104 | 
105 |     @override
106 |     def article_container(self) -> tuple | list:
107 |         return ARTICLE_CONTAINERS
108 | 
109 | 
110 | def extract_article_from_soup(soup: BeautifulSoup, template: tuple) -> Tag | None:
111 |     if template[1] is not None:
112 |         result = soup.find(template[0], attrs=template[1])
113 |     else:
114 |         result = soup.find(template[0])
115 |     return filter_tag(result)
116 | 
117 | 
118 | def remove_duplicate_titles(article: Article):
119 |     if article.body and isinstance(article.body, Tag):
120 |         first_h1 = article.body.find("h1")
121 |         if first_h1:
122 |             h1_text = first_h1.get_text(strip=True)
123 |             if h1_text.lower() in article.title.lower():
124 |                 article.title = h1_text
125 |                 first_h1.decompose()
126 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from urllib.parse import urlparse
  3 | 
  4 | from bs4 import BeautifulSoup
  5 | from bs4.element import AttributeValueList, NavigableString, PageElement, Tag
  6 | 
  7 | REQUEST_HEADERS = {
  8 |     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0",
  9 |     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
 10 |     "Accept-Encoding": "gzip, deflate",
 11 |     "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
 12 |     "Priority": "u=0, i",
 13 |     "Sec-Ch-Ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"',
 14 |     "Sec-Ch-Ua-Mobile": "?0",
 15 |     "Sec-Ch-Ua-Platform": '"macOS"',
 16 |     "Sec-Fetch-Dest": "document",
 17 |     "Sec-Fetch-Mode": "navigate",
 18 |     "Sec-Fetch-Site": "none",
 19 |     "Sec-Fetch-User": "?1",
 20 |     "Upgrade-Insecure-Requests": "1",
 21 | }
 22 | 
 23 | BROWSER_TARGET_HOSTS = [
 24 |     "developer.apple.com/documentation/",
 25 |     "www.infoq.cn/",
 26 |     "pcsx2.net/",
 27 |     "baijiahao.baidu.com/",
 28 | ]
 29 | 
 30 | def is_sequentially_increasing(code: str) -> bool:
 31 |     try:
 32 |         # 解码并按换行符拆分
 33 |         numbers = [int(line.strip()) for line in code.split("\n") if line.strip()]
 34 |         # 检查是否递增
 35 |         return all(numbers[i] + 1 == numbers[i + 1] for i in range(len(numbers) - 1))
 36 |     except ValueError:
 37 |         return False  # 处理非数字情况
 38 | 
 39 | 
 40 | def move_spaces(input_string: str, suffix: str) -> str:
 41 |     # 使用正则表达式匹配以指定的suffix结尾，且suffix之前有空格的情况
 42 |     escaped_suffix = re.escape(suffix)  # 处理正则中的特殊字符
 43 |     pattern = rf"(.*?)\s+({escaped_suffix})$"
 44 |     match = re.search(pattern, input_string)
 45 |     if match:
 46 |         # 获取字符串的主体部分（不含空格）和尾部的 '**'
 47 |         main_part = match.group(1)
 48 |         stars = match.group(2)
 49 |         # 计算空格的数量并将空格移动到 '**' 后
 50 |         space_count = len(input_string) - len(main_part) - len(stars)
 51 |         return f"{main_part}{stars}{' ' * space_count}"
 52 |     return input_string
 53 | 
 54 | 
 55 | def to_snake_case(input_string: str) -> str:
 56 |     input_string = "".join(char if char.isalnum() else " " for char in input_string)
 57 |     snake_case_string = "_".join(word.lower() for word in input_string.split())
 58 |     return snake_case_string
 59 | 
 60 | 
 61 | def collapse_spaces(text) -> str:
 62 |     """
 63 |     将多个连续空格（包括换行和 Tab）折叠成一个空格。
 64 |     """
 65 |     return re.sub(r"\s+", " ", text)
 66 | 
 67 | 
 68 | def extract_domain(url: str) -> str | None:
 69 |     """
 70 |     从URL中提取域名（包含协议）。
 71 | 
 72 |     Args:
 73 |         url (str): 要提取域名的URL。
 74 | 
 75 |     Returns:
 76 |         str | None: 提取出的域名（包含协议），如果解析失败或协议不支持则返回 None。
 77 |     """
 78 |     try:
 79 |         parsed_url = urlparse(url)
 80 |         if parsed_url.scheme in {"http", "https"} and parsed_url.netloc:
 81 |             return f"{parsed_url.scheme}://{parsed_url.netloc}".rstrip("/")
 82 |         return None  # 返回 None 表示 URL 格式不符合要求或协议不支持
 83 | 
 84 |     except ValueError:
 85 |         return None  # 如果 URL 格式无效，则返回 None
 86 | 
 87 | 
 88 | def detect_language(file_name: str | None, code: str) -> str:
 89 |     # TODO: 添加语言检测逻辑
 90 |     return ""
 91 | 
 92 | 
 93 | def filter_tag(el: Tag | PageElement | NavigableString | None) -> Tag | None:
 94 |     if el is None or not isinstance(el, Tag):
 95 |         return None
 96 |     return el
 97 | 
 98 | 
 99 | def get_attr_text(el: str | AttributeValueList | None) -> str:
100 |     if el is None:
101 |         return ""
102 |     if isinstance(el, str):
103 |         return el.strip()
104 |     return " ".join(el).strip()
105 | 
106 | 
107 | def get_og_url(soup: BeautifulSoup) -> str:
108 |     og_tag = filter_tag(soup.find("meta", {"property": "og:url"}))
109 |     return get_tag_text(og_tag, "content")
110 | 
111 | 
112 | def get_og_site_name(soup: BeautifulSoup) -> str:
113 |     og_tag = filter_tag(soup.find("meta", {"property": "og:site_name"}))
114 |     return get_tag_text(og_tag, "content")
115 | 
116 | 
117 | def get_og_description(soup: BeautifulSoup) -> str:
118 |     og_tag = filter_tag(soup.find("meta", {"property": "og:description"}))
119 |     return get_tag_text(og_tag, "content")
120 | 
121 | 
122 | def get_canonical_url(soup: BeautifulSoup) -> str:
123 |     canonical_tag = filter_tag(soup.find("link", {"rel": "canonical"}))
124 |     return get_tag_text(canonical_tag, "href")
125 | 
126 | 
127 | def is_matched_canonical(url: str, soup: BeautifulSoup) -> bool:
128 |     canonical = get_canonical_url(soup)
129 |     if not canonical:
130 |         return False
131 |     return canonical.startswith(url)
132 | 
133 | 
134 | def get_og_title(soup: BeautifulSoup) -> str:
135 |     og_tag = filter_tag(soup.find("meta", {"property": "og:title"}))
136 |     return get_tag_text(og_tag, "content")
137 | 
138 | 
139 | def get_tag_text(tag: Tag | None, attr: str) -> str:
140 |     if tag is not None and tag.has_attr(attr):
141 |         el = tag[attr]
142 |         return get_attr_text(el)
143 |     return ""
144 | 
145 | 
146 | def get_title(soup: BeautifulSoup) -> str:
147 |     title_tag = soup.title
148 |     return title_tag.get_text(strip=True) if title_tag else ""
149 | 


--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from bs4.element import AttributeValueList, NavigableString
  3 | 
  4 | from omni_article_markdown.utils import (
  5 |     collapse_spaces,
  6 |     detect_language,
  7 |     extract_domain,
  8 |     filter_tag,
  9 |     get_attr_text,
 10 |     get_canonical_url,
 11 |     get_og_description,
 12 |     get_og_site_name,
 13 |     get_og_title,
 14 |     get_og_url,
 15 |     get_tag_text,
 16 |     get_title,
 17 |     is_matched_canonical,
 18 |     is_sequentially_increasing,
 19 |     move_spaces,
 20 |     to_snake_case,
 21 | )
 22 | 
 23 | 
 24 | # --------------------------
 25 | # 测试 is_sequentially_increasing
 26 | # --------------------------
 27 | def test_is_sequentially_increasing_true():
 28 |     code = "1\n2\n3\n4"
 29 |     assert is_sequentially_increasing(code) is True
 30 | 
 31 | 
 32 | def test_is_sequentially_increasing_false():
 33 |     code = "1\n3\n5"
 34 |     assert is_sequentially_increasing(code) is False
 35 | 
 36 | 
 37 | def test_is_sequentially_increasing_non_numeric():
 38 |     code = "a\nb\nc"
 39 |     assert is_sequentially_increasing(code) is False
 40 | 
 41 | 
 42 | # --------------------------
 43 | # move_spaces
 44 | # --------------------------
 45 | def test_move_spaces():
 46 |     assert move_spaces("**hello  **", "**") == "**hello**  "
 47 |     assert move_spaces("**hello **", "**") == "**hello** "
 48 |     assert move_spaces("**hello world**", "**") == "**hello world**"
 49 | 
 50 | 
 51 | # --------------------------
 52 | # to_snake_case
 53 | # --------------------------
 54 | def test_to_snake_case():
 55 |     assert to_snake_case("HelloWorld") == "helloworld"
 56 |     assert to_snake_case("Hello World!") == "hello_world"
 57 |     assert to_snake_case("Already_snake_case") == "already_snake_case"
 58 | 
 59 | 
 60 | # --------------------------
 61 | # collapse_spaces
 62 | # --------------------------
 63 | def test_collapse_spaces():
 64 |     assert collapse_spaces("a   b\tc\nd") == "a b c d"
 65 | 
 66 | 
 67 | # --------------------------
 68 | # extract_domain
 69 | # --------------------------
 70 | def test_extract_domain():
 71 |     assert extract_domain("https://example.com/path?q=1") == "https://example.com"
 72 |     assert extract_domain("http://abc.xyz") == "http://abc.xyz"
 73 |     assert extract_domain("ftp://example.com") is None
 74 |     assert extract_domain("not_a_url") is None
 75 | 
 76 | 
 77 | # --------------------------
 78 | # detect_language
 79 | # --------------------------
 80 | def test_detect_language_placeholder():
 81 |     assert detect_language("file.py", "print('hi')") == ""
 82 | 
 83 | 
 84 | # --------------------------
 85 | # filter_tag
 86 | # --------------------------
 87 | def test_filter_tag_with_tag(make_soup):
 88 |     soup = make_soup("<div></div>")
 89 |     el = soup.div
 90 |     assert filter_tag(el) == el
 91 | 
 92 | 
 93 | def test_filter_tag_with_none_or_text():
 94 |     text_node = NavigableString("text")
 95 |     assert filter_tag(None) is None
 96 |     assert filter_tag(text_node) is None
 97 | 
 98 | 
 99 | # --------------------------
100 | # get_attr_text
101 | # --------------------------
102 | def test_get_attr_text():
103 |     assert get_attr_text(" hello ") == "hello"
104 |     assert get_attr_text(AttributeValueList(["a", "b", "c"])) == "a b c"
105 |     assert get_attr_text(None) == ""
106 | 
107 | 
108 | # --------------------------
109 | # meta/og tag 相关
110 | # --------------------------
111 | HTML_DOC = """
112 | <html>
113 | <head>
114 |     <title>Example Title</title>
115 |     <meta property="og:url" content="https://example.com/page" />
116 |     <meta property="og:site_name" content="Example Site" />
117 |     <meta property="og:description" content="This is a description." />
118 |     <meta property="og:title" content="OG Title" />
119 |     <link rel="canonical" href="https://example.com/page" />
120 | </head>
121 | <body><h1>Hello</h1></body>
122 | </html>
123 | """
124 | 
125 | 
126 | @pytest.fixture
127 | def soup(make_soup):
128 |     return make_soup(HTML_DOC)
129 | 
130 | 
131 | def test_get_og_url(soup):
132 |     assert get_og_url(soup) == "https://example.com/page"
133 | 
134 | 
135 | def test_get_og_site_name(soup):
136 |     assert get_og_site_name(soup) == "Example Site"
137 | 
138 | 
139 | def test_get_og_description(soup):
140 |     assert get_og_description(soup) == "This is a description."
141 | 
142 | 
143 | def test_get_og_title(soup):
144 |     assert get_og_title(soup) == "OG Title"
145 | 
146 | 
147 | def test_get_canonical_url(soup):
148 |     assert get_canonical_url(soup) == "https://example.com/page"
149 | 
150 | 
151 | def test_is_matched_canonical_true(soup):
152 |     assert is_matched_canonical("https://example.com", soup) is True
153 | 
154 | 
155 | def test_is_matched_canonical_false(soup):
156 |     assert is_matched_canonical("https://other.com", soup) is False
157 | 
158 | 
159 | # --------------------------
160 | # get_tag_text
161 | # --------------------------
162 | def test_get_tag_text(make_soup):
163 |     soup = make_soup('<meta name="description" content="abc" />')
164 |     tag = filter_tag(soup.find("meta"))
165 |     assert get_tag_text(tag, "content") == "abc"
166 |     assert get_tag_text(tag, "missing") == ""
167 |     assert get_tag_text(None, "content") == ""
168 | 
169 | 
170 | # --------------------------
171 | # get_title
172 | # --------------------------
173 | def test_get_title(soup):
174 |     assert get_title(soup) == "Example Title"
175 | 
176 | 
177 | def test_get_title_no_title(make_soup):
178 |     soup = make_soup("<html><head></head></html>")
179 |     assert get_title(soup) == ""
180 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 墨探 (omni-article-markdown)
  2 | 
  3 | [![PyPI](https://img.shields.io/pypi/v/omni-article-markdown)](https://pypi.org/project/omni-article-markdown/)
  4 | ![Python](https://img.shields.io/pypi/pyversions/omni-article-markdown)
  5 | [![License](https://img.shields.io/github/license/caol64/omni-article-markdown)](LICENSE)
  6 | [![PyPI Downloads](https://static.pepy.tech/personalized-badge/omni-article-markdown?period=total&units=INTERNATIONAL_SYSTEM&left_color=GRAY&right_color=GREEN&left_text=downloads)](https://pepy.tech/projects/omni-article-markdown)
  7 | [![Stars](https://img.shields.io/github/stars/caol64/omni-article-markdown?style=social)](https://github.com/caol64/omni-article-markdown)
  8 | 
  9 | 轻松将网页文章（博客、新闻、文档等）转换为 `Markdown` 格式。
 10 | 
 11 | ![](data/1.gif)
 12 | 
 13 | ---
 14 | 
 15 | ## 项目简介
 16 | 墨探的开发初衷，是为了解决一个问题：如何将来自互联网上各种不同网站的文章内容，精准且高效地转换成统一的Markdown格式。
 17 | 
 18 | 众所周知，万维网上的网站设计风格迥异，其HTML结构也呈现出千差万别的特点。这种多样性给自动化内容提取和格式转换带来了巨大的困难。要实现一个能够适应各种复杂HTML结构的通用解决方案，并非易事。
 19 | 
 20 | 我的想法是：从特定的网站开始适配，以点到面，逐步抽取出通用的解决方案，最后尽可能多的覆盖更多网站。
 21 | 
 22 | ---
 23 | 
 24 | ## 功能介绍
 25 | 
 26 | - 支持大部分 html 元素转换
 27 | - 部分页面支持katex公式转换（示例：[https://quantum.country/qcvc](https://quantum.country/qcvc)）
 28 | - 部分页面支持github gist（示例：[https://towardsdatascience.com/hands-on-multi-agent-llm-restaurant-simulation-with-python-and-openai](https://towardsdatascience.com/hands-on-multi-agent-llm-restaurant-simulation-with-python-and-openai)）
 29 | - 支持保存成文件或输出至`stdout`
 30 | - 支持突破某些网站的防爬虫策略（需安装插件）
 31 | 
 32 | 以下是一些网站示例，大家可以自己测试下效果。
 33 | 
 34 | |站点|链接|备注|
 35 | --|--|--
 36 | |Medium|[link](https://medium.com/perry-street-software-engineering/architectural-linting-for-swift-made-easy-75d7f9f569cd)||
 37 | |csdn|[link](https://blog.csdn.net/weixin_41705306/article/details/148787220)||
 38 | |掘金|[link](https://juejin.cn/post/7405845617282449462)||
 39 | |知乎专栏|[link](https://zhuanlan.zhihu.com/p/1915735485801828475)|需安装zhihu插件|
 40 | |公众号|[link](https://mp.weixin.qq.com/s/imHIKy7dqMmpm032eIhIJg)||
 41 | |今日头条|[link](https://www.toutiao.com/article/7283050053155947062/)|需安装toutiao插件|
 42 | |网易|[link](https://www.163.com/dy/article/K2SPPGSK0514R9KE.html)||
 43 | |简书|[link](https://www.jianshu.com/p/20bd2e9b1f03)||
 44 | |Freedium|[link](https://freedium.cfd/https://medium.com/@devlink/ai-killed-my-coding-brain-but-im-rebuilding-it-8de7e1618bca)|需安装freedium插件|
 45 | |Towards Data Science|[link](https://towardsdatascience.com/hands-on-multi-agent-llm-restaurant-simulation-with-python-and-openai/)||
 46 | |Quantamagazine|[link](https://www.quantamagazine.org/matter-vs-force-why-there-are-exactly-two-types-of-particles-20250623/)||
 47 | |苹果开发者文档|[link](https://developer.apple.com/documentation/technologyoverviews/adopting-liquid-glass)|需安装browser插件|
 48 | |Cloudflare博客|[link](https://blog.cloudflare.com/20-percent-internet-upgrade/)||
 49 | |阿里云开发者社区|[link](https://developer.aliyun.com/article/791514)||
 50 | |微软技术文档|[link](https://learn.microsoft.com/en-us/dotnet/ai/get-started-app-chat-template)||
 51 | |InfoQ|[link](https://www.infoq.com/articles/ai-ml-data-engineering-trends-2025/)||
 52 | |博客园|[link](https://www.cnblogs.com/hez2010/p/19097937/runtime-async)||
 53 | |思否|[link](https://segmentfault.com/a/1190000047273730)||
 54 | |开源中国|[link](https://my.oschina.net/SeaTunnel/blog/18694930)||
 55 | |Forbes|[link](https://www.forbes.com/sites/danalexander/2025/10/10/trump-is-now-one-of-americas-biggest-bitcoin-investors/)||
 56 | |少数派|[link](https://sspai.com/post/102861)||
 57 | |语雀|[link](https://www.yuque.com/yuque/ng1qth/about)||
 58 | |腾讯云开发者社区|[link](https://cloud.tencent.com/developer/article/2571935)||
 59 | |百家号|[link](https://baijiahao.baidu.com/s?id=1846135703319246634)|需安装browser插件|
 60 | |人人都是产品经理|[link](https://www.woshipm.com/data-analysis/6276761.html)||
 61 | |Jetbrains博客|[link](https://blog.jetbrains.com/teamcity/2025/10/the-state-of-cicd/)||
 62 | |Claude文档|[link](https://docs.claude.com/en/docs/build-with-claude/prompt-caching)||
 63 | |Anthropic|[link](https://www.anthropic.com/news/claude-sonnet-4-5)||
 64 | |Meta博客|[link](https://engineering.fb.com/2025/10/06/developer-tools/openzl-open-source-format-aware-compression-framework/)||
 65 | |Android Developers Blog|[link](https://android-developers.googleblog.com/2025/11/jetpack-navigation-3-is-stable.html)||
 66 | 
 67 | ---
 68 | 
 69 | ## 快速开始
 70 | 
 71 | 1. 安装
 72 | 
 73 | ```sh
 74 | pip install omni-article-markdown
 75 | ```
 76 | 
 77 | 2. 运行说明
 78 | 
 79 | **仅转换**
 80 | 
 81 | ```sh
 82 | mdcli https://example.com
 83 | ```
 84 | 
 85 | **保存到当前目录**
 86 | 
 87 | ```sh
 88 | mdcli https://example.com -s
 89 | ```
 90 | 
 91 | **保存到指定路径**
 92 | 
 93 | ```sh
 94 | mdcli https://example.com -s /home/user/
 95 | ```
 96 | 
 97 | ---
 98 | 
 99 | ## 插件机制
100 | 
101 | [「墨探」是如何使用插件机制构建可扩展架构的](https://babyno.top/posts/2025/06/a-deep-dive-into-the-extensible-architecture-of-omni-article-markdown/)
102 | 
103 | **安装插件**
104 | 
105 | 安装插件和`pip`命令格式相同：
106 | 
107 | ```sh
108 | mdcli install <PLUGIN_NAME_OR_PACKAGE_NAME> [-U] [-e]
109 | ```
110 | 
111 | **示例：安装知乎解析插件**
112 | 
113 | ```sh
114 | mdcli install zhihu
115 | ```
116 | 
117 | 或者，你可以使用 `-e` 参数安装本地可编辑的包。
118 | 
119 | ```sh
120 | mdcli install -e "./plugins/omnimd-zhihu-reader"
121 | ```
122 | 
123 | **升级插件**
124 | 
125 | ```sh
126 | mdcli install zhihu -U
127 | ```
128 | 
129 | **卸载插件**
130 | 
131 | 如果你想移除一个已安装的插件，可以使用 `mdcli` 提供的 `uninstall` 命令。
132 | 
133 | ```sh
134 | mdcli uninstall zhihu
135 | ```
136 | 
137 | 或者，使用插件的全称删除
138 | 
139 | ```sh
140 | mdcli uninstall omnimd-zhihu-reader
141 | ```
142 | 
143 | **已支持的插件**
144 | 
145 | 目前已发布4个插件，你可以按需安装：
146 | 
147 | | 命令                             | 说明                                                     |
148 | |----------------------------------|----------------------------------------------------------|
149 | | `mdcli install zhihu`              | 知乎专栏 |
150 | | `mdcli install toutiao`            | 今日头条                         |
151 | | `mdcli install freedium`           | Freedium                         |
152 | | `mdcli install browser`           | 需要启用浏览器的JS功能才能访问的页面（如Apple Developer Documentation）                         |
153 | 
154 | **开发自己的插件**
155 | 
156 | 文档编写中。
157 | 
158 | ---
159 | 
160 | ## 使用Docker镜像
161 | 
162 | **直接下载**
163 | 
164 | 暂未提供
165 | 
166 | **自行构建**
167 | 
168 | ```
169 | docker build --build-arg PYPI_REGISTRY=https://pypi.tuna.tsinghua.edu.cn/simple -t omni-article-markdown .
170 | # 可以指定PYPI镜像源，默认使用官方源
171 | ```
172 | 
173 | 现在你可以在`docker`容器中使用墨探了：
174 | 
175 | ```
176 | docker run --rm omni-article-markdown /path/to/your/url
177 | ```
178 | 
179 | ## 架构说明
180 | 
181 | ![](data/1.jpg)
182 | 
183 | 墨探主要分为三个模块：
184 | 
185 | - **Reader** 模块的功能是读取整个网页内容
186 | - **Extractor** 模块的功能是提取正文内容，清理无用数据
187 | - **Parser** 模块的功能是将 HTML 转换为 Markdown
188 | 
189 | ---
190 | 
191 | ## 贡献与反馈
192 | - 发现解析问题？欢迎提交 [Issue](https://github.com/caol64/omni-article-markdown/issues)
193 | - 改进解析？欢迎贡献 [Pull Request](https://github.com/caol64/omni-article-markdown/pulls)
194 | - 开发插件？文档正在筹备中
195 | 
196 | ---
197 | 
198 | ## 赞助
199 | 
200 | 如果您觉得不错，可以给我家猫咪买点罐头吃。[喂猫❤️](https://yuzhi.tech/sponsor)
201 | 
202 | ---
203 | 
204 | ## License
205 | 
206 | MIT License
207 | 


--------------------------------------------------------------------------------
/src/omni_article_markdown/parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from collections.abc import Callable
  3 | from urllib.parse import urljoin
  4 | 
  5 | import requests
  6 | from bs4.element import NavigableString, Tag
  7 | 
  8 | from .extractor import Article
  9 | from .utils import (
 10 |     collapse_spaces,
 11 |     detect_language,
 12 |     filter_tag,
 13 |     get_attr_text,
 14 |     is_sequentially_increasing,
 15 |     move_spaces,
 16 | )
 17 | 
 18 | LB_SYMBOL = "[|lb_bl|]"
 19 | 
 20 | POST_HANDLERS: list[Callable[[str], str]] = [
 21 |     # 添加换行使文章更美观
 22 |     lambda el: re.sub(f"(?:{re.escape(LB_SYMBOL)})+", LB_SYMBOL, el).replace(LB_SYMBOL, "\n\n").strip(),
 23 |     # 纠正不规范格式 `**code**` 替换为 **`code`**
 24 |     lambda el: re.sub(r"`\*\*(.*?)\*\*`", r"**`\1`**", el),
 25 |     # 纠正不规范格式 `*code*` 替换为 *`code`*
 26 |     lambda el: re.sub(r"`\*(.*?)\*`", r"*`\1`*", el),
 27 |     # 纠正不规范格式 `[code](url)` 替换为 [`code`](url)
 28 |     lambda el: re.sub(r"`\s*\[([^\]]+)\]\(([^)]+)\)\s*`", r"[`\1`](\2)", el),
 29 |     # 将 \( ... \) 替换为 $ ... $
 30 |     lambda el: re.sub(r"\\\((.+?)\\\)", r"$\1$", el),
 31 |     # 将 \[ ... \] 替换为 $$ ... $$
 32 |     lambda el: re.sub(r"\\\[(.+?)\\\]", r"$$\1$$", el),
 33 | ]
 34 | 
 35 | INLINE_ELEMENTS = ["span", "code", "li", "a", "strong", "em", "b", "i"]
 36 | 
 37 | BLOCK_ELEMENTS = [
 38 |     "p",
 39 |     "h1",
 40 |     "h2",
 41 |     "h3",
 42 |     "h4",
 43 |     "h5",
 44 |     "h6",
 45 |     "ul",
 46 |     "ol",
 47 |     "blockquote",
 48 |     "pre",
 49 |     "img",
 50 |     "picture",
 51 |     "hr",
 52 |     "figcaption",
 53 |     "table",
 54 |     "section",
 55 | ]
 56 | 
 57 | TRUSTED_ELEMENTS = INLINE_ELEMENTS + BLOCK_ELEMENTS
 58 | 
 59 | 
 60 | class HtmlMarkdownParser:
 61 |     def __init__(self, article: Article):
 62 |         self.article = article
 63 | 
 64 |     def parse(self) -> tuple[str, str]:
 65 |         if isinstance(self.article.body, str):
 66 |             markdown = self.article.body
 67 |         else:
 68 |             markdown = self._process_children(self.article.body)
 69 |         for handler in POST_HANDLERS:
 70 |             markdown = handler(markdown)
 71 |         if not self.article.description or self.article.description in markdown:
 72 |             description = ""
 73 |         else:
 74 |             description = f"> {self.article.description}\n\n"
 75 |         result = f"# {self.article.title}\n\n{description}{markdown}"
 76 |         # print(result)
 77 |         return (self.article.title, result)
 78 | 
 79 |     def _process_element(self, element: Tag, level: int = 0, is_pre: bool = False) -> str:
 80 |         parts = []
 81 |         tag = element.name
 82 |         match tag:
 83 |             case "br":
 84 |                 parts.append(LB_SYMBOL)
 85 |             case "hr":
 86 |                 parts.append("---")
 87 |             case "h1" | "h2" | "h3" | "h4" | "h5" | "h6":
 88 |                 heading = self._process_children(element, level, is_pre=is_pre)
 89 |                 parts.append(f"{'#' * int(element.name[1])} {heading}")
 90 |             case "a":
 91 |                 link = self._process_children(element, level, is_pre=is_pre).replace(LB_SYMBOL, "")
 92 |                 if link:
 93 |                     parts.append(f"[{link}]({element.get('href')})")
 94 |             case "strong" | "b":
 95 |                 s = self._process_children(element, level, is_pre=is_pre).replace(LB_SYMBOL, "")
 96 |                 if s:
 97 |                     parts.append(move_spaces(f"**{s}**", "**"))
 98 |             case "em" | "i":
 99 |                 s = self._process_children(element, level, is_pre=is_pre).replace(LB_SYMBOL, "")
100 |                 if s:
101 |                     parts.append(move_spaces(f"*{s}*", "*"))
102 |             case "ul" | "ol":
103 |                 parts.append(self._process_list(element, level))
104 |             case "img":
105 |                 parts.append(self._process_image(element, None))
106 |             case "blockquote":
107 |                 blockquote = self._process_children(element, level, is_pre=is_pre)
108 |                 if blockquote.startswith(LB_SYMBOL):
109 |                     blockquote = blockquote.removeprefix(LB_SYMBOL)
110 |                 if blockquote.endswith(LB_SYMBOL):
111 |                     blockquote = blockquote.removesuffix(LB_SYMBOL)
112 |                 parts.append("\n".join(f"> {line}" for line in blockquote.split(LB_SYMBOL)))
113 |             case "pre":
114 |                 parts.append(self._process_codeblock(element, level))
115 |             case "code":  # inner code
116 |                 code = self._process_children(element, level, is_pre=is_pre)
117 |                 if LB_SYMBOL not in code:
118 |                     parts.append(f"`{code}`")
119 |                 else:
120 |                     parts.append(code)
121 |             case "picture":
122 |                 source_elements = element.find_all("source")
123 |                 img_element = filter_tag(element.find("img"))
124 |                 if img_element and source_elements:
125 |                     el = source_elements[0]
126 |                     src_el = filter_tag(el)
127 |                     if src_el:
128 |                         parts.append(self._process_image(img_element, src_el))
129 |                 elif img_element:
130 |                     parts.append(self._process_image(img_element, None))
131 |             case "figcaption":
132 |                 figcaption = self._process_children(element, level, is_pre=is_pre).replace(LB_SYMBOL, "\n").strip()
133 |                 figcaptions = figcaption.replace("\n\n", "\n").split("\n")
134 |                 parts.append("\n".join([f"*{caption}*" for caption in figcaptions]))
135 |             case "table":
136 |                 parts.append(self._process_table(element, level))
137 |             case "math":  # 处理latex公式
138 |                 semantics = filter_tag(element.find("semantics"))
139 |                 if semantics:
140 |                     tex = filter_tag(semantics.find(attrs={"encoding": "application/x-tex"}))
141 |                     if tex:
142 |                         parts.append(f"$$ {tex.text} $$")
143 |             case "script":  # 处理github gist
144 |                 parts.append(self._process_gist(element))
145 |             case _:
146 |                 parts.append(self._process_children(element, level, is_pre=is_pre))
147 |         result = "".join(parts)
148 |         if result and is_block_element(element.name) and (not element.children or not is_pure_block_children(element)):
149 |             result = f"{LB_SYMBOL}{result}{LB_SYMBOL}"
150 |         return result
151 | 
152 |     def _process_children(self, element: Tag, level: int = 0, is_pre: bool = False) -> str:
153 |         parts = []
154 |         if element.children:
155 |             # new_level = level + 1 if element.name in HtmlMarkdownParser.TRUSTED_ELEMENTS else level
156 |             for child in element.children:
157 |                 if isinstance(child, NavigableString):
158 |                     if is_pre:
159 |                         parts.append(child)
160 |                     else:
161 |                         result = collapse_spaces(child).replace("<", "&lt;").replace(">", "&gt;")
162 |                         if result.strip():
163 |                             parts.append(result)
164 |                         # print(element.name, level, result)
165 |                 elif isinstance(child, Tag):
166 |                     result = self._process_element(child, level, is_pre=is_pre)
167 |                     if is_pre or result.strip():
168 |                         parts.append(result)
169 |         return "".join(parts) if is_pre or level > 0 else "".join(parts)
170 | 
171 |     def _process_list(self, element: Tag, level: int) -> str:
172 |         indent = "  " * level
173 |         child_list = element.find_all(recursive=False)
174 |         is_ol = element.name == "ol"
175 |         parts = []
176 |         for i, child in enumerate(child_list):
177 |             child = filter_tag(child)
178 |             if child:
179 |                 if child.name == "li":
180 |                     content = self._process_children(child, level).replace(LB_SYMBOL, "\n").strip()
181 |                     if content:  # 忽略空内容
182 |                         prefix = f"{i + 1}." if is_ol else "-"
183 |                         parts.append(f"{indent}{prefix} {content}")
184 |                 elif child.name == "ul" or child.name == "ol":
185 |                     content = self._process_element(child, level + 1)
186 |                     if content:  # 忽略空内容
187 |                         parts.append(f"{content.replace(LB_SYMBOL, '\n')}")
188 |         if not parts:
189 |             return ""  # 所有内容都为空则返回空字符串
190 |         return "\n".join(parts)
191 | 
192 |     def _process_codeblock(self, element: Tag, level: int) -> str:
193 |         # 找出所有 code 标签（可能为 0 个、1 个或多个）
194 |         code_elements = element.find_all("code") or [element]
195 | 
196 |         # 处理每一个 code 标签并拼接
197 |         code_parts = [
198 |             self._process_children(code_el, level, is_pre=True).replace(LB_SYMBOL, "\n")
199 |             for code_el in code_elements
200 |             if isinstance(code_el, Tag)
201 |         ]
202 |         code = "\n".join(code_parts).strip()
203 | 
204 |         if is_sequentially_increasing(code):
205 |             return ""  # 忽略行号
206 | 
207 |         # 尝试提取语言：从第一个 code 标签的 class 中提取 language
208 |         first_code_el = code_elements[0]
209 |         language = (
210 |             next((cls.split("-")[1] for cls in (first_code_el.get("class") or []) if cls.startswith("language-")), "")
211 |             if isinstance(first_code_el, Tag)
212 |             else ""
213 |         )
214 |         if not language:
215 |             language = detect_language(None, code)
216 |         return f"```{language}\n{code}\n```" if language else f"```\n{code}\n```"
217 | 
218 |     def _process_table(self, element: Tag, level: int) -> str:
219 |         if element.find("pre"):
220 |             return self._process_children(element, level)
221 |         # 获取所有行，包括 thead 和 tbody
222 |         rows = element.find_all("tr")
223 |         if not rows:
224 |             return ""
225 |         # 解析表头（如果有）
226 |         headers = []
227 |         first_row = filter_tag(rows.pop(0))
228 |         if first_row and first_row.find("th"):
229 |             headers = [th.get_text(strip=True) for th in first_row.find_all("th")]
230 |         # 解析表身
231 |         body = [[td.get_text(strip=True) for td in row.find_all("td")] for row in rows if isinstance(row, Tag)]
232 |         # 处理缺失的表头
233 |         if not headers and body:
234 |             headers = body.pop(0)
235 |         # 统一列数
236 |         col_count = max(len(headers), max((len(row) for row in body), default=0))
237 |         headers += [""] * (col_count - len(headers))
238 |         for row in body:
239 |             row += [""] * (col_count - len(row))
240 |         # 生成 Markdown 表格
241 |         markdown_table = []
242 |         markdown_table.append("| " + " | ".join(headers) + " |")
243 |         markdown_table.append("|-" + "-|-".join(["-" * len(h) for h in headers]) + "-|")
244 |         for row in body:
245 |             markdown_table.append("| " + " | ".join(row) + " |")
246 |         return "\n".join(markdown_table)
247 | 
248 |     def _process_image(self, element: Tag, source: Tag | None) -> str:
249 |         src = (
250 |             get_attr_text(element.attrs.get("src"))
251 |             if source is None
252 |             else get_attr_text(source.attrs.get("srcset")).split()[0]
253 |         )
254 |         alt = get_attr_text(element.attrs.get("alt"))
255 |         if src:
256 |             if not src.startswith("http") and self.article.url:
257 |                 src = urljoin(self.article.url, src)
258 |             return f"![{alt}]({src})"
259 |         return ""
260 | 
261 |     def _process_gist(self, element: Tag) -> str:
262 |         src = get_attr_text(element.attrs.get("src"))
263 |         pattern = r"/([0-9a-f]+)(?:\.js)?$"
264 |         match = re.search(pattern, src)
265 |         if match:
266 |             gist_id = match.group(1)
267 |             url = f"https://api.github.com/gists/{gist_id}"
268 |             response = requests.get(url)
269 |             response.encoding = "utf-8"
270 |             if response.status_code == 200:
271 |                 data = response.json()
272 |                 gists = []
273 |                 for filename, info in data["files"].items():
274 |                     code = info["content"]
275 |                     language = detect_language(filename, code)
276 |                     gists.append(f"```{language}\n{code}\n```")
277 |                 return "\n\n".join(gists)
278 |             print(f"Fetch gist error: {response.status_code}")
279 |         return ""
280 | 
281 | 
282 | def is_block_element(element_name: str) -> bool:
283 |     return element_name in BLOCK_ELEMENTS
284 | 
285 | 
286 | def is_pure_block_children(element: Tag) -> bool:
287 |     for child in element.children:
288 |         if isinstance(child, NavigableString):
289 |             if child.strip():  # 有非空文本
290 |                 return False
291 |         elif isinstance(child, Tag) and not is_block_element(child.name):
292 |             return False
293 |     return True
294 | 


--------------------------------------------------------------------------------
/uv.lock:
--------------------------------------------------------------------------------
  1 | version = 1
  2 | revision = 2
  3 | requires-python = ">=3.13"
  4 | 
  5 | [[package]]
  6 | name = "beautifulsoup4"
  7 | version = "4.13.4"
  8 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
  9 | dependencies = [
 10 |     { name = "soupsieve" },
 11 |     { name = "typing-extensions" },
 12 | ]
 13 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/e4/0c4c39e18fd76d6a628d4dd8da40543d136ce2d1752bd6eeeab0791f4d6b/beautifulsoup4-4.13.4.tar.gz", hash = "sha256:dbb3c4e1ceae6aefebdaf2423247260cd062430a410e38c66f2baa50a8437195", size = 621067, upload_time = "2025-04-15T17:05:13.836Z" }
 14 | wheels = [
 15 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/cd/30110dc0ffcf3b131156077b90e9f60ed75711223f306da4db08eff8403b/beautifulsoup4-4.13.4-py3-none-any.whl", hash = "sha256:9bbbb14bfde9d79f38b8cd5f8c7c85f4b8f2523190ebed90e950a8dea4cb1c4b", size = 187285, upload_time = "2025-04-15T17:05:12.221Z" },
 16 | ]
 17 | 
 18 | [[package]]
 19 | name = "certifi"
 20 | version = "2025.4.26"
 21 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 22 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e8/9e/c05b3920a3b7d20d3d3310465f50348e5b3694f4f88c6daf736eef3024c4/certifi-2025.4.26.tar.gz", hash = "sha256:0a816057ea3cdefcef70270d2c515e4506bbc954f417fa5ade2021213bb8f0c6", size = 160705, upload_time = "2025-04-26T02:12:29.51Z" }
 23 | wheels = [
 24 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/4a/7e/3db2bd1b1f9e95f7cddca6d6e75e2f2bd9f51b1246e546d88addca0106bd/certifi-2025.4.26-py3-none-any.whl", hash = "sha256:30350364dfe371162649852c63336a15c70c6510c2ad5015b21c2345311805f3", size = 159618, upload_time = "2025-04-26T02:12:27.662Z" },
 25 | ]
 26 | 
 27 | [[package]]
 28 | name = "charset-normalizer"
 29 | version = "3.4.2"
 30 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 31 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e4/33/89c2ced2b67d1c2a61c19c6751aa8902d46ce3dacb23600a283619f5a12d/charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63", size = 126367, upload_time = "2025-05-02T08:34:42.01Z" }
 32 | wheels = [
 33 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/12/a93df3366ed32db1d907d7593a94f1fe6293903e3e92967bebd6950ed12c/charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0", size = 199622, upload_time = "2025-05-02T08:32:56.363Z" },
 34 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/04/93/bf204e6f344c39d9937d3c13c8cd5bbfc266472e51fc8c07cb7f64fcd2de/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf", size = 143435, upload_time = "2025-05-02T08:32:58.551Z" },
 35 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/22/2a/ea8a2095b0bafa6c5b5a55ffdc2f924455233ee7b91c69b7edfcc9e02284/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e", size = 153653, upload_time = "2025-05-02T08:33:00.342Z" },
 36 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b6/57/1b090ff183d13cef485dfbe272e2fe57622a76694061353c59da52c9a659/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1", size = 146231, upload_time = "2025-05-02T08:33:02.081Z" },
 37 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e2/28/ffc026b26f441fc67bd21ab7f03b313ab3fe46714a14b516f931abe1a2d8/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c", size = 148243, upload_time = "2025-05-02T08:33:04.063Z" },
 38 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c0/0f/9abe9bd191629c33e69e47c6ef45ef99773320e9ad8e9cb08b8ab4a8d4cb/charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691", size = 150442, upload_time = "2025-05-02T08:33:06.418Z" },
 39 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/67/7c/a123bbcedca91d5916c056407f89a7f5e8fdfce12ba825d7d6b9954a1a3c/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0", size = 145147, upload_time = "2025-05-02T08:33:08.183Z" },
 40 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ec/fe/1ac556fa4899d967b83e9893788e86b6af4d83e4726511eaaad035e36595/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b", size = 153057, upload_time = "2025-05-02T08:33:09.986Z" },
 41 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2b/ff/acfc0b0a70b19e3e54febdd5301a98b72fa07635e56f24f60502e954c461/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff", size = 156454, upload_time = "2025-05-02T08:33:11.814Z" },
 42 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/92/08/95b458ce9c740d0645feb0e96cea1f5ec946ea9c580a94adfe0b617f3573/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b", size = 154174, upload_time = "2025-05-02T08:33:13.707Z" },
 43 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/78/be/8392efc43487ac051eee6c36d5fbd63032d78f7728cb37aebcc98191f1ff/charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148", size = 149166, upload_time = "2025-05-02T08:33:15.458Z" },
 44 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/44/96/392abd49b094d30b91d9fbda6a69519e95802250b777841cf3bda8fe136c/charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7", size = 98064, upload_time = "2025-05-02T08:33:17.06Z" },
 45 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e9/b0/0200da600134e001d91851ddc797809e2fe0ea72de90e09bec5a2fbdaccb/charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980", size = 105641, upload_time = "2025-05-02T08:33:18.753Z" },
 46 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/94/c5790835a017658cbfabd07f3bfb549140c3ac458cfc196323996b10095a/charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0", size = 52626, upload_time = "2025-05-02T08:34:40.053Z" },
 47 | ]
 48 | 
 49 | [[package]]
 50 | name = "click"
 51 | version = "8.2.1"
 52 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 53 | dependencies = [
 54 |     { name = "colorama", marker = "sys_platform == 'win32'" },
 55 | ]
 56 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/60/6c/8ca2efa64cf75a977a0d7fac081354553ebe483345c734fb6b6515d96bbc/click-8.2.1.tar.gz", hash = "sha256:27c491cc05d968d271d5a1db13e3b5a184636d9d930f148c50b038f0d0646202", size = 286342, upload_time = "2025-05-20T23:19:49.832Z" }
 57 | wheels = [
 58 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload_time = "2025-05-20T23:19:47.796Z" },
 59 | ]
 60 | 
 61 | [[package]]
 62 | name = "click-default-group"
 63 | version = "1.2.4"
 64 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 65 | dependencies = [
 66 |     { name = "click" },
 67 | ]
 68 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/ce/edb087fb53de63dad3b36408ca30368f438738098e668b78c87f93cd41df/click_default_group-1.2.4.tar.gz", hash = "sha256:eb3f3c99ec0d456ca6cd2a7f08f7d4e91771bef51b01bdd9580cc6450fe1251e" }
 69 | wheels = [
 70 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/1a/aff8bb287a4b1400f69e09a53bd65de96aa5cee5691925b38731c67fc695/click_default_group-1.2.4-py2.py3-none-any.whl", hash = "sha256:9b60486923720e7fc61731bdb32b617039aba820e22e1c88766b1125592eaa5f" },
 71 | ]
 72 | 
 73 | [[package]]
 74 | name = "colorama"
 75 | version = "0.4.6"
 76 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 77 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload_time = "2022-10-25T02:36:22.414Z" }
 78 | wheels = [
 79 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload_time = "2022-10-25T02:36:20.889Z" },
 80 | ]
 81 | 
 82 | [[package]]
 83 | name = "html5lib"
 84 | version = "1.1"
 85 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 86 | dependencies = [
 87 |     { name = "six" },
 88 |     { name = "webencodings" },
 89 | ]
 90 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ac/b6/b55c3f49042f1df3dcd422b7f224f939892ee94f22abcf503a9b7339eaf2/html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f", size = 272215, upload_time = "2020-06-22T23:32:38.834Z" }
 91 | wheels = [
 92 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6c/dd/a834df6482147d48e225a49515aabc28974ad5a4ca3215c18a882565b028/html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d", size = 112173, upload_time = "2020-06-22T23:32:36.781Z" },
 93 | ]
 94 | 
 95 | [[package]]
 96 | name = "idna"
 97 | version = "3.10"
 98 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
 99 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490, upload_time = "2024-09-15T18:07:39.745Z" }
100 | wheels = [
101 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442, upload_time = "2024-09-15T18:07:37.964Z" },
102 | ]
103 | 
104 | [[package]]
105 | name = "iniconfig"
106 | version = "2.1.0"
107 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
108 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload_time = "2025-03-19T20:09:59.721Z" }
109 | wheels = [
110 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload_time = "2025-03-19T20:10:01.071Z" },
111 | ]
112 | 
113 | [[package]]
114 | name = "omni-article-markdown"
115 | version = "0.1.10"
116 | source = { editable = "." }
117 | dependencies = [
118 |     { name = "beautifulsoup4" },
119 |     { name = "click" },
120 |     { name = "click-default-group" },
121 |     { name = "html5lib" },
122 |     { name = "pip" },
123 |     { name = "pluggy" },
124 |     { name = "requests" },
125 | ]
126 | 
127 | [package.optional-dependencies]
128 | dev = [
129 |     { name = "pytest" },
130 | ]
131 | 
132 | [package.metadata]
133 | requires-dist = [
134 |     { name = "beautifulsoup4", specifier = ">=4.13.4" },
135 |     { name = "click", specifier = ">=8.2.0" },
136 |     { name = "click-default-group", specifier = ">=1.2.4" },
137 |     { name = "html5lib", specifier = ">=1.1" },
138 |     { name = "pip" },
139 |     { name = "pluggy", specifier = ">=1.6.0" },
140 |     { name = "pytest", marker = "extra == 'dev'" },
141 |     { name = "requests", specifier = ">=2.32.3" },
142 | ]
143 | provides-extras = ["dev"]
144 | 
145 | [[package]]
146 | name = "packaging"
147 | version = "25.0"
148 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
149 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload_time = "2025-04-19T11:48:59.673Z" }
150 | wheels = [
151 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload_time = "2025-04-19T11:48:57.875Z" },
152 | ]
153 | 
154 | [[package]]
155 | name = "pip"
156 | version = "25.1.1"
157 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
158 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/59/de/241caa0ca606f2ec5fe0c1f4261b0465df78d786a38da693864a116c37f4/pip-25.1.1.tar.gz", hash = "sha256:3de45d411d308d5054c2168185d8da7f9a2cd753dbac8acbfa88a8909ecd9077", size = 1940155, upload_time = "2025-05-02T15:14:02.057Z" }
159 | wheels = [
160 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/29/a2/d40fb2460e883eca5199c62cfc2463fd261f760556ae6290f88488c362c0/pip-25.1.1-py3-none-any.whl", hash = "sha256:2913a38a2abf4ea6b64ab507bd9e967f3b53dc1ede74b01b0931e1ce548751af", size = 1825227, upload_time = "2025-05-02T15:13:59.102Z" },
161 | ]
162 | 
163 | [[package]]
164 | name = "pluggy"
165 | version = "1.6.0"
166 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
167 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload_time = "2025-05-15T12:30:07.975Z" }
168 | wheels = [
169 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload_time = "2025-05-15T12:30:06.134Z" },
170 | ]
171 | 
172 | [[package]]
173 | name = "pygments"
174 | version = "2.19.2"
175 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
176 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload_time = "2025-06-21T13:39:12.283Z" }
177 | wheels = [
178 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload_time = "2025-06-21T13:39:07.939Z" },
179 | ]
180 | 
181 | [[package]]
182 | name = "pytest"
183 | version = "8.4.2"
184 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
185 | dependencies = [
186 |     { name = "colorama", marker = "sys_platform == 'win32'" },
187 |     { name = "iniconfig" },
188 |     { name = "packaging" },
189 |     { name = "pluggy" },
190 |     { name = "pygments" },
191 | ]
192 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload_time = "2025-09-04T14:34:22.711Z" }
193 | wheels = [
194 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload_time = "2025-09-04T14:34:20.226Z" },
195 | ]
196 | 
197 | [[package]]
198 | name = "requests"
199 | version = "2.32.3"
200 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
201 | dependencies = [
202 |     { name = "certifi" },
203 |     { name = "charset-normalizer" },
204 |     { name = "idna" },
205 |     { name = "urllib3" },
206 | ]
207 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/63/70/2bf7780ad2d390a8d301ad0b550f1581eadbd9a20f896afe06353c2a2913/requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760", size = 131218, upload_time = "2024-05-29T15:37:49.536Z" }
208 | wheels = [
209 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928, upload_time = "2024-05-29T15:37:47.027Z" },
210 | ]
211 | 
212 | [[package]]
213 | name = "six"
214 | version = "1.17.0"
215 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
216 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload_time = "2024-12-04T17:35:28.174Z" }
217 | wheels = [
218 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload_time = "2024-12-04T17:35:26.475Z" },
219 | ]
220 | 
221 | [[package]]
222 | name = "soupsieve"
223 | version = "2.7"
224 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
225 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3f/f4/4a80cd6ef364b2e8b65b15816a843c0980f7a5a2b4dc701fc574952aa19f/soupsieve-2.7.tar.gz", hash = "sha256:ad282f9b6926286d2ead4750552c8a6142bc4c783fd66b0293547c8fe6ae126a", size = 103418, upload_time = "2025-04-20T18:50:08.518Z" }
226 | wheels = [
227 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/e7/9c/0e6afc12c269578be5c0c1c9f4b49a8d32770a080260c333ac04cc1c832d/soupsieve-2.7-py3-none-any.whl", hash = "sha256:6e60cc5c1ffaf1cebcc12e8188320b72071e922c2e897f737cadce79ad5d30c4", size = 36677, upload_time = "2025-04-20T18:50:07.196Z" },
228 | ]
229 | 
230 | [[package]]
231 | name = "typing-extensions"
232 | version = "4.13.2"
233 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
234 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967, upload_time = "2025-04-10T14:19:05.416Z" }
235 | wheels = [
236 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806, upload_time = "2025-04-10T14:19:03.967Z" },
237 | ]
238 | 
239 | [[package]]
240 | name = "urllib3"
241 | version = "2.4.0"
242 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
243 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8a/78/16493d9c386d8e60e442a35feac5e00f0913c0f4b7c217c11e8ec2ff53e0/urllib3-2.4.0.tar.gz", hash = "sha256:414bc6535b787febd7567804cc015fee39daab8ad86268f1310a9250697de466", size = 390672, upload_time = "2025-04-10T15:23:39.232Z" }
244 | wheels = [
245 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/6b/11/cc635220681e93a0183390e26485430ca2c7b5f9d33b15c74c2861cb8091/urllib3-2.4.0-py3-none-any.whl", hash = "sha256:4e16665048960a0900c702d4a66415956a584919c03361cac9f1df5c5dd7e813", size = 128680, upload_time = "2025-04-10T15:23:37.377Z" },
246 | ]
247 | 
248 | [[package]]
249 | name = "webencodings"
250 | version = "0.5.1"
251 | source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
252 | sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/0b/02/ae6ceac1baeda530866a85075641cec12989bd8d31af6d5ab4a3e8c92f47/webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923", size = 9721, upload_time = "2017-04-05T20:21:34.189Z" }
253 | wheels = [
254 |     { url = "https://pypi.tuna.tsinghua.edu.cn/packages/f4/24/2a3e3df732393fed8b3ebf2ec078f05546de641fe1b667ee316ec1dcf3b7/webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", size = 11774, upload_time = "2017-04-05T20:21:32.581Z" },
255 | ]
256 | 


--------------------------------------------------------------------------------