omni-article-markdown/
├── .python-version
├── plugins
│   ├── omnimd-browser-reader
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   └── browser.py
│   ├── omnimd-toutiao-reader
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   └── toutiao.py
│   ├── omnimd-zhihu-reader
│   │   ├── README.md
│   │   ├── pyproject.toml
│   │   └── zhihu.py
│   └── omnimd-freedium-reader
│       ├── README.md
│       ├── pyproject.toml
│       └── freedium.py
├── data
│   ├── 1.gif
│   └── 1.jpg
├── .gitignore
├── tests
│   ├── conftest.py
│   ├── test_extractor.py
│   ├── test_parser.py
│   └── test_utils.py
├── Dockerfile
├── src
│   └── omni_article_markdown
│       ├── __init__.py
│       ├── plugins.py
│       ├── extractors
│       │   ├── hugo.py
│       │   ├── zhihu.py
│       │   ├── 163.py
│       │   ├── woshipm.py
│       │   ├── infoqcn.py
│       │   ├── aliyun_developer.py
│       │   ├── android_dev_blog.py
│       │   ├── cloudflare_blog.py
│       │   ├── oschina.py
│       │   ├── tencent_cloud.py
│       │   ├── anthropic.py
│       │   ├── medium.py
│       │   ├── infoq.py
│       │   ├── quantamagazine.py
│       │   ├── juejin.py
│       │   ├── sspai.py
│       │   ├── claude_doc.py
│       │   ├── microsoft_learn.py
│       │   ├── cnblog.py
│       │   ├── apple_developer.py
│       │   ├── baijiahao.py
│       │   ├── toutiao.py
│       │   ├── jetbrains_blog.py
│       │   ├── wechat_gzh.py
│       │   ├── jianshu.py
│       │   ├── towards_data_science.py
│       │   ├── freedium.py
│       │   └── yuque.py
│       ├── hookspecs.py
│       ├── store.py
│       ├── readers.py
│       ├── omni_article_md.py
│       ├── cli.py
│       ├── extractor.py
│       ├── utils.py
│       └── parser.py
├── .editorconfig
├── .github
│   ├── FUNDING.yml
│   └── workflows
│       ├── publish.yml
│       └── publish_plugin.yml
├── LICENSE
├── pyproject.toml
├── ruff.toml
├── README.md
└── uv.lock

/.python-version:
--------------------------------------------------------------------------------
3.13
--------------------------------------------------------------------------------

/plugins/omnimd-browser-reader/README.md:
--------------------------------------------------------------------------------
# 墨探 (omni-article-markdown) browser plugin
--------------------------------------------------------------------------------

/plugins/omnimd-toutiao-reader/README.md:
--------------------------------------------------------------------------------
# 墨探 (omni-article-markdown) Toutiao plugin
--------------------------------------------------------------------------------

/plugins/omnimd-zhihu-reader/README.md:
--------------------------------------------------------------------------------
# 墨探 (omni-article-markdown) Zhihu plugin
--------------------------------------------------------------------------------

/data/1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caol64/omni-article-markdown/HEAD/data/1.gif
--------------------------------------------------------------------------------

/data/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/caol64/omni-article-markdown/HEAD/data/1.jpg
--------------------------------------------------------------------------------

/plugins/omnimd-freedium-reader/README.md:
--------------------------------------------------------------------------------
# 墨探 (omni-article-markdown) Freedium plugin
--------------------------------------------------------------------------------

/.gitignore:
--------------------------------------------------------------------------------
.DS_Store

# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# Virtual environments
.venv
.env

plugins/**/uv.lock
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
import pytest
from bs4 import BeautifulSoup


@pytest.fixture
def make_soup():
    def _make_soup(html: str, parser: str = "html.parser"):
        return BeautifulSoup(html, parser)
    return _make_soup
--------------------------------------------------------------------------------

/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.13-alpine

ARG PYPI_REGISTRY="https://pypi.org/simple/"

WORKDIR /app

RUN pip config set global.index-url "${PYPI_REGISTRY}"
RUN pip install omni-article-markdown

ENTRYPOINT ["mdcli"]
CMD []
--------------------------------------------------------------------------------

/src/omni_article_markdown/__init__.py:
--------------------------------------------------------------------------------
from .omni_article_md import OmniArticleMarkdown

__all__ = ["OmniArticleMarkdown"]

DEFAULT_PLUGINS = {
    "zhihu": "omnimd-zhihu-reader",
    "freedium": "omnimd-freedium-reader",
    "toutiao": "omnimd-toutiao-reader",
    "browser": "omnimd-browser-reader",
}
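
# A minimal end-to-end sketch of the public API re-exported above; the URL is
# a hypothetical placeholder, and save() defaults to ./<snake_cased_title>.md:
if __name__ == "__main__":
    md = OmniArticleMarkdown("https://example.com/post")  # placeholder URL
    ctx = md.parse()     # raw HTML -> extracted Article -> Markdown
    print(md.save(ctx))  # prints the absolute path of the written .md file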
--------------------------------------------------------------------------------

/.editorconfig:
--------------------------------------------------------------------------------
root = true

[*]
indent_style = space
indent_size = 4
trim_trailing_whitespace = true
charset = utf-8
end_of_line = lf
insert_final_newline = true

[*.json]
indent_size = 2

[*.{yml,yaml}]
indent_size = 2

[Makefile]
indent_style = tab

[*.{md,mdx}]
max_line_length = off
trim_trailing_whitespace = false
--------------------------------------------------------------------------------

/src/omni_article_markdown/plugins.py:
--------------------------------------------------------------------------------
import pluggy

from . import hookspecs

pm = pluggy.PluginManager("mdcli")
pm.add_hookspecs(hookspecs)

_loaded_plugins = False

def load_mdcli_plugins():
    global _loaded_plugins
    if _loaded_plugins:
        return
    pm.load_setuptools_entrypoints("mdcli")
    _loaded_plugins = True

# Invoked once at application startup (module import).
load_mdcli_plugins()
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/hugo.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor


class HugoExtractor(Extractor):
    """
    Hugo blogs
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        # Detection is disabled: generic Hugo sites expose no reliable marker.
        return False

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "post-content"})
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/zhihu.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_og_site_name


class ZhihuExtractor(Extractor):
    """
    Zhihu columns (知乎专栏)
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_site_name(soup) == "知乎专栏"

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "Post-RichText"})
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/163.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import is_matched_canonical


class Netease163Extractor(Extractor):
    """
    163.com
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return is_matched_canonical("https://www.163.com", soup)

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "post_body"})
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/woshipm.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_og_url


class WoShiPMExtractor(Extractor):
    """
    Woshipm (人人都是产品经理)
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_url(soup).startswith("https://www.woshipm.com")

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "article--content"})
--------------------------------------------------------------------------------
return is_matched_canonical("https://www.infoq.cn", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "article-content-wrap"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/aliyun_developer.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class AliyunDeveloperExtractor(Extractor): 10 | """ 11 | developer.aliyun.com 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return is_matched_canonical("https://developer.aliyun.com", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "article-content"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/android_dev_blog.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class AndroidDevelopersBlogExtractor(Extractor): 10 | """ 11 | Android Developers Blog 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return get_og_site_name(soup) == "Android Developers Blog" 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"class": "adb-detail__content"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/cloudflare_blog.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class CloudflareBlogExtractor(Extractor): 10 | """ 11 | blog.cloudflare.com 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return is_matched_canonical("https://blog.cloudflare.com", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("section", {"class": "post-full-content"}) 21 | -------------------------------------------------------------------------------- /src/omni_article_markdown/hookspecs.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | from pluggy import HookimplMarker, HookspecMarker 4 | 5 | hookspec = HookspecMarker("mdcli") 6 | hookimpl = HookimplMarker("mdcli") 7 | 8 | 9 | class ReaderPlugin(Protocol): 10 | def can_handle(self, url: str) -> bool: ... 11 | 12 | def read(self, url: str) -> str: ... 13 | 14 | 15 | @hookspec(firstresult=True) 16 | def get_custom_reader(url: str) -> ReaderPlugin | None: 17 | """ 18 | Allows plugins to provide a custom reader for a given URL. 19 | The first plugin that returns a ReaderPlugin instance will be used. 20 | """ 21 | ... 
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/oschina.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor


class OsChinaExtractor(Extractor):
    """
    OSChina (开源中国)
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag else None
        return title is not None and title.endswith(" - OSCHINA - 中文开源技术交流社区")

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "detail-box"})
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/tencent_cloud.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor


class TencentCloudExtractor(Extractor):
    """
    Tencent Cloud Developer Community (腾讯云开发者社区)
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag else None
        return title is not None and title.endswith("-腾讯云开发者社区-腾讯云")

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "mod-content__markdown"})
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/anthropic.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_title


class AnthropicExtractor(Extractor):
    """
    Anthropic
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_title(soup).endswith(" \\ Anthropic")

    @override
    def article_container(self) -> tuple:
        return ("article", None)

    @override
    def extract_url(self, soup: BeautifulSoup) -> str:
        return "https://www.anthropic.com/"
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/medium.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_og_site_name


class MediumExtractor(Extractor):
    """
    Medium
    """

    def __init__(self):
        super().__init__()
        self.attrs_to_clean.extend([
            lambda el: 'data-testid' in el.attrs,
            lambda el: 'class' in el.attrs and 'speechify-ignore' in el.attrs['class'],
        ])

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_site_name(soup) == "Medium"

    @override
    def article_container(self) -> tuple:
        return ("article", None)
--------------------------------------------------------------------------------

/src/omni_article_markdown/store.py:
--------------------------------------------------------------------------------
import json
from pathlib import Path
from typing import Any


class Store:
    def __init__(self, base_dir_name: str = ".ommimd"):
        self.path = Path.home() / base_dir_name

    def save(self, key: str, obj: Any):
        self.path.mkdir(parents=True, exist_ok=True)
        file_path = self.path / f"{key}.json"
        with open(file_path, "w", encoding="utf8") as f:
            json.dump(obj, f, indent=4, ensure_ascii=False)

    def load(self, key: str) -> Any | None:
        file_path = self.path / f"{key}.json"
        if not file_path.exists() or not file_path.is_file():
            return None
        with open(file_path, encoding="utf8") as f:
            return json.load(f)
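
# A short round-trip sketch; "example" is a placeholder key, and files land
# under <home>/.ommimd/ given the default base_dir_name above:
if __name__ == "__main__":
    store = Store()
    store.save("example", {"hello": "world"})  # writes ~/.ommimd/example.json
    print(store.load("example"))  # {'hello': 'world'}; load() returns None for a missing key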
".ommimd"): 8 | self.path = Path.home() / base_dir_name 9 | 10 | def save(self, key: str, obj: Any): 11 | self.path.mkdir(parents=True, exist_ok=True) 12 | file_path = self.path / f"{key}.json" 13 | with open(file_path, "w", encoding="utf8") as f: 14 | json.dump(obj, f, indent=4, ensure_ascii=False) 15 | 16 | def load(self, key: str) -> Any | None: 17 | file_path = self.path / f"{key}.json" 18 | if not file_path.exists() or not file_path.is_file(): 19 | return None 20 | with open(file_path, encoding="utf8") as f: 21 | return json.load(f) 22 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/infoq.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import is_matched_canonical 7 | 8 | 9 | class InfoQExtractor(Extractor): 10 | """ 11 | www.infoq.com 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "class" in el.attrs and "author-section-full" in el.attrs["class"], 19 | ] 20 | ) 21 | 22 | @override 23 | def can_handle(self, soup: BeautifulSoup) -> bool: 24 | return is_matched_canonical("https://www.infoq.com", soup) 25 | 26 | @override 27 | def article_container(self) -> tuple: 28 | return ("div", {"class": "article__data"}) 29 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/quantamagazine.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import get_og_site_name 7 | 8 | 9 | class QuantamagazineExtractor(Extractor): 10 | """ 11 | quantamagazine.org 12 | """ 13 | 14 | def __init__(self): 15 | super().__init__() 16 | self.attrs_to_clean.extend( 17 | [ 18 | lambda el: "class" in el.attrs and "post__title__title" in el.attrs["class"], 19 | ] 20 | ) 21 | 22 | @override 23 | def can_handle(self, soup: BeautifulSoup) -> bool: 24 | return get_og_site_name(soup) == "Quanta Magazine" 25 | 26 | @override 27 | def article_container(self) -> tuple: 28 | return ("div", {"id": "postBody"}) 29 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/juejin.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import filter_tag, is_matched_canonical 7 | 8 | 9 | class JuejinExtractor(Extractor): 10 | """ 11 | juejin.cn 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | return is_matched_canonical("https://juejin.cn/", soup) 17 | 18 | @override 19 | def article_container(self) -> tuple: 20 | return ("div", {"id": "article-root"}) 21 | 22 | @override 23 | def extract_title(self, soup: BeautifulSoup) -> str: 24 | title_tag = filter_tag(soup.find("h1", {"class": "article-title"})) 25 | return title_tag.get_text(strip=True) if title_tag else super().extract_title(soup) 26 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/sspai.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 
/src/omni_article_markdown/extractors/sspai.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_og_site_name


class SspaiExtractor(Extractor):
    """
    Sspai (少数派)
    """
    def __init__(self):
        super().__init__()
        self.attrs_to_clean.extend(
            [
                lambda el: "class" in el.attrs and "comment__list" in el.attrs["class"],
                lambda el: "class" in el.attrs and "comment__footer__wrapper" in el.attrs["class"],
            ]
        )

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_site_name(soup) == "少数派 - 高品质数字消费指南"

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "article__main__wrapper"})
--------------------------------------------------------------------------------

/plugins/omnimd-zhihu-reader/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "omnimd-zhihu-reader"
version = "0.1.3"
description = "A plugin for omni-article-markdown to read Zhihu content."
authors = [
    { name = "Lei", email = "caol64@gmail.com" }
]
readme = "README.md"
requires-python = ">=3.13"
license = "MIT"
dependencies = [
    "playwright",
]

[project.entry-points.mdcli]
zhihu = "zhihu"

[project.urls]
Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-zhihu-reader"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
exclude = [
    "/dist",
]

[tool.hatch.build.targets.wheel]
include = [
    "/zhihu.py",
]

[tool.hatch.build.targets.sdist]
include = [
    "/zhihu.py",
    "/README.md",
    "/pyproject.toml",
]
--------------------------------------------------------------------------------

/.github/FUNDING.yml:
--------------------------------------------------------------------------------
# These are supported funding model platforms

github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
polar: # Replace with a single Polar username
buy_me_a_coffee: # Replace with a single Buy Me a Coffee username
thanks_dev: # Replace with a single thanks.dev username
custom: ['https://yuzhi.tech/sponsor', 'https://paypal.me/caol64']
--------------------------------------------------------------------------------
/plugins/omnimd-toutiao-reader/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "omnimd-toutiao-reader"
version = "0.1.3"
description = "A plugin for omni-article-markdown to read Toutiao content."
authors = [
    { name = "Lei", email = "caol64@gmail.com" }
]
readme = "README.md"
requires-python = ">=3.13"
license = "MIT"
dependencies = [
    "playwright",
]

[project.entry-points.mdcli]
toutiao = "toutiao"

[project.urls]
Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-toutiao-reader"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
exclude = [
    "/dist",
]

[tool.hatch.build.targets.wheel]
include = [
    "/toutiao.py",
]

[tool.hatch.build.targets.sdist]
include = [
    "/toutiao.py",
    "/README.md",
    "/pyproject.toml",
]
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/claude_doc.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_og_title


class ClaudeDocExtractor(Extractor):
    """
    docs.claude.com
    """

    def __init__(self):
        super().__init__()
        self.attrs_to_clean.extend(
            [
                lambda el: "data-component-part" in el.attrs and "code-block-header" in el.attrs["data-component-part"],
                lambda el: "data-component-part" in el.attrs and "code-group-tab-bar" in el.attrs["data-component-part"],
            ]
        )

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_title(soup).endswith(" - Claude Docs")

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "mdx-content"})
--------------------------------------------------------------------------------
/plugins/omnimd-freedium-reader/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "omnimd-freedium-reader"
version = "0.1.3"
description = "A plugin for omni-article-markdown to read Freedium content."
authors = [
    { name = "Lei", email = "caol64@gmail.com" }
]
readme = "README.md"
requires-python = ">=3.13"
license = "MIT"
dependencies = [
    "playwright",
]

[project.entry-points.mdcli]
freedium = "freedium"

[project.urls]
Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-freedium-reader"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
exclude = [
    "/dist",
]

[tool.hatch.build.targets.wheel]
include = [
    "/freedium.py",
]

[tool.hatch.build.targets.sdist]
include = [
    "/freedium.py",
    "/README.md",
    "/pyproject.toml",
]
--------------------------------------------------------------------------------

/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
name: Build and Publish to PyPI

on:
  release:
    types: [created]

jobs:
  omnimd-publish:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'

      - name: Set up pip cache
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}

      - name: Install Hatch
        run: |
          pip install -U hatch hatchling

      - name: Build and publish with Hatch
        env:
          HATCH_INDEX_USER: __token__
          HATCH_INDEX_AUTH: ${{ secrets.PYPI_TOKEN }}
        run: |
          hatch build --clean
          hatch publish --yes --no-prompt
--------------------------------------------------------------------------------
/plugins/omnimd-browser-reader/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "omnimd-browser-reader"
version = "0.1.2"
description = "A plugin for omni-article-markdown to read content that requires JavaScript."
authors = [
    { name = "Lei", email = "caol64@gmail.com" }
]
readme = "README.md"
requires-python = ">=3.13"
license = "MIT"
dependencies = [
    "playwright",
]

[project.entry-points.mdcli]
browser = "browser"

[project.urls]
Homepage = "https://github.com/caol64/omni-article-markdown/tree/main/plugins/omnimd-browser-reader"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
exclude = [
    "/dist",
]

[tool.hatch.build.targets.wheel]
include = [
    "/browser.py",
]

[tool.hatch.build.targets.sdist]
include = [
    "/browser.py",
    "/README.md",
    "/pyproject.toml",
]
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/microsoft_learn.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_og_url


class MicrosoftLearnExtractor(Extractor):
    """
    Microsoft Learn documentation (微软技术文档)
    """

    def __init__(self):
        super().__init__()
        self.attrs_to_clean.extend(
            [
                lambda el: "id" in el.attrs and "article-header" in el.attrs["id"],
                lambda el: "id" in el.attrs and "article-metadata" in el.attrs["id"],
                lambda el: "id" in el.attrs and "site-user-feedback-footer" in el.attrs["id"],
            ]
        )

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_url(soup).startswith("https://learn.microsoft.com")

    @override
    def article_container(self) -> tuple:
        return ("main", {"id": "main"})
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/cnblog.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import is_matched_canonical


class CnBlogsExtractor(Extractor):
    """
    Cnblogs (博客园)
    """

    def __init__(self):
        super().__init__()
        self.attrs_to_clean.extend(
            [
                lambda el: "id" in el.attrs and "blog_post_info_block" in el.attrs["id"],
                lambda el: "class" in el.attrs and "postDesc" in el.attrs["class"],
            ]
        )

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return is_matched_canonical("https://www.cnblogs.com", soup)

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "post"})

    @override
    def extract_description(self, soup: BeautifulSoup) -> str:
        return ""
--------------------------------------------------------------------------------
"platform" in el.attrs["class"], 20 | lambda el: "class" in el.attrs and "title" in el.attrs["class"], 21 | ] 22 | ) 23 | 24 | @override 25 | def can_handle(self, soup: BeautifulSoup) -> bool: 26 | return get_og_site_name(soup) == "Apple Developer Documentation" 27 | 28 | @override 29 | def article_container(self) -> tuple: 30 | return ("main", {"class": "main"}) 31 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/baijiahao.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | 5 | from ..extractor import Extractor 6 | from ..utils import filter_tag 7 | 8 | 9 | class Netease163Extractor(Extractor): 10 | """ 11 | 百家号 12 | """ 13 | 14 | @override 15 | def can_handle(self, soup: BeautifulSoup) -> bool: 16 | tag1 = filter_tag(soup.find("div", {"data-testid": "article"})) 17 | tag2 = filter_tag(soup.find("span", {"class": "bjh-p"})) 18 | return tag1 is not None and tag2 is not None 19 | 20 | @override 21 | def article_container(self) -> tuple: 22 | return ("div", {"data-testid": "article"}) 23 | 24 | @override 25 | def pre_handle_soup(self, soup: BeautifulSoup) -> BeautifulSoup: 26 | for tag in soup.find_all("span", {"class": "bjh-p"}): 27 | span_tag = filter_tag(tag) 28 | if span_tag: 29 | span_tag.name = "p" 30 | # for tag in soup.find_all("img"): 31 | # tag.wrap(soup.new_tag("p")) 32 | return soup 33 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractors/toutiao.py: -------------------------------------------------------------------------------- 1 | from typing import override 2 | 3 | from bs4 import BeautifulSoup 4 | from bs4.element import Tag 5 | 6 | from ..extractor import Extractor 7 | from ..utils import filter_tag, get_attr_text 8 | 9 | 10 | class ToutiaoExtractor(Extractor): 11 | """ 12 | 今日头条 13 | """ 14 | 15 | @override 16 | def can_handle(self, soup: BeautifulSoup) -> bool: 17 | title_tag = soup.title 18 | title = title_tag.get_text(strip=True) if title_tag else None 19 | return title is not None and title.endswith(" - 今日头条") 20 | 21 | @override 22 | def article_container(self) -> tuple: 23 | return ("div", {"class": "article-content"}) 24 | 25 | @override 26 | def extract_img(self, element: Tag) -> Tag: 27 | img_els = element.find_all("img") 28 | for img_el in img_els: 29 | img_tag = filter_tag(img_el) 30 | if img_tag: 31 | src = get_attr_text(img_tag.attrs.get("data-src")) 32 | if src: 33 | img_tag.attrs["src"] = src 34 | return element 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 caol64 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 caol64

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/jetbrains_blog.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_og_site_name


class JetbrainsBlogExtractor(Extractor):
    """
    blog.jetbrains.com
    """

    def __init__(self):
        super().__init__()
        self.attrs_to_clean.extend(
            [
                lambda el: "class" in el.attrs and "content__row" in el.attrs["class"],
                lambda el: "class" in el.attrs and "content__pagination" in el.attrs["class"],
                lambda el: "class" in el.attrs and "content__form" in el.attrs["class"],
                lambda el: "class" in el.attrs and "tag" in el.attrs["class"],
                lambda el: "class" in el.attrs and "author-post" in el.attrs["class"],
            ]
        )

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_site_name(soup) == "The JetBrains Blog"

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "content"})
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/wechat_gzh.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup
from bs4.element import Tag

from ..extractor import Extractor
from ..utils import filter_tag, get_attr_text, get_og_site_name


class WechatGZHExtractor(Extractor):
    """
    WeChat Official Accounts (微信公众号)
    """

    def __init__(self):
        super().__init__()
        self.attrs_to_clean.append(lambda el: 'id' in el.attrs and el.attrs['id'] == 'meta_content')

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_site_name(soup) == "微信公众平台"

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "rich_media_content"})

    @override
    def extract_img(self, element: Tag) -> Tag:
        img_els = element.find_all("img")
        for img_el in img_els:
            img_tag = filter_tag(img_el)
            if img_tag:
                src = get_attr_text(img_tag.attrs.get("data-src"))
                if src:
                    img_tag.attrs["src"] = src
        return element
--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/jianshu.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup
from bs4.element import Tag

from ..extractor import ARTICLE_CONTAINERS, Extractor
from ..utils import filter_tag, get_attr_text, get_og_site_name


class JianshuExtractor(Extractor):
    """
    www.jianshu.com
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_site_name(soup) == "简书"

    @override
    def article_container(self) -> tuple | list:
        return ARTICLE_CONTAINERS

    @override
    def extract_description(self, soup: BeautifulSoup) -> str:
        return ""

    @override
    def extract_url(self, soup: BeautifulSoup) -> str:
        # Bare scheme, likely so Jianshu's protocol-relative "//" links resolve as https.
        return "https:"

    @override
    def extract_img(self, element: Tag) -> Tag:
        img_els = element.find_all("img")
        for img_el in img_els:
            img_tag = filter_tag(img_el)
            if img_tag:
                src = get_attr_text(img_tag.attrs.get("data-original-src"))
                if src:
                    img_tag.attrs["src"] = src
        return element
--------------------------------------------------------------------------------

/.github/workflows/publish_plugin.yml:
--------------------------------------------------------------------------------
name: Build Plugins and Publish to PyPI

on:
  workflow_dispatch:
    inputs:
      package_path:
        description: 'Path to plugin directory (relative to repo root)'
        required: true
        default: 'plugins/omnimd-freedium-reader'

jobs:
  omnimd-plugin-publish:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.13'

      - name: Set up pip cache
        uses: actions/cache@v4
        with:
          path: ~/.cache/pip
          key: ${{ runner.os }}-${{ github.event.inputs.package_path }}-pip-${{ hashFiles('pyproject.toml') }}

      - name: Install Hatch
        run: |
          pip install -U hatch hatchling

      - name: Build and publish with Hatch
        env:
          HATCH_INDEX_USER: __token__
          HATCH_INDEX_AUTH: ${{ secrets.PYPI_TOKEN }}
        run: |
          cd "${{ github.event.inputs.package_path }}"
          hatch build --clean
          hatch publish --yes --no-prompt
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/towards_data_science.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import get_og_site_name


class TowardsDataScienceExtractor(Extractor):
    """
    towardsdatascience.com
    """

    def __init__(self):
        super().__init__()
        self.attrs_to_clean.extend([
            lambda el: 'class' in el.attrs and 'taxonomy-post_tag' in el.attrs['class'],
            lambda el: 'class' in el.attrs and 'tds-cta-box' in el.attrs['class'],
            lambda el: 'class' in el.attrs and 'wp-block-buttons' in el.attrs['class'],
            lambda el: 'class' in el.attrs and 'wp-block-outermost-social-sharing' in el.attrs['class'],
            lambda el: 'class' in el.attrs and 'wp-block-tenup-post-time-to-read' in el.attrs['class'],
        ])
        self.tags_to_clean.extend([
            lambda el: el.name == 'time',
        ])

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_site_name(soup) == "Towards Data Science"

    @override
    def article_container(self) -> tuple | list:
        return ("main", None)
--------------------------------------------------------------------------------
/src/omni_article_markdown/extractors/freedium.py:
--------------------------------------------------------------------------------
from typing import override

from bs4 import BeautifulSoup

from ..extractor import Extractor
from ..utils import filter_tag


class FreediumExtractor(Extractor):
    """
    freedium.cfd
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        title_tag = soup.title
        title = title_tag.get_text(strip=True) if title_tag else None
        return title is not None and title.endswith(" - Freedium")

    @override
    def article_container(self) -> tuple:
        return ("div", {"class": "main-content"})

    @override
    def extract_title(self, soup: BeautifulSoup) -> str:
        title_tag = filter_tag(soup.find("h1"))
        if title_tag:
            title = title_tag.get_text(strip=True)
            title_tag.decompose()
            return title
        return super().extract_title(soup)

    @override
    def extract_description(self, soup: BeautifulSoup) -> str:
        description_tag = soup.find("h2")
        if description_tag:
            description = description_tag.get_text(strip=True)
            description_tag.decompose()
            return description
        return super().extract_description(soup)
--------------------------------------------------------------------------------

/pyproject.toml:
--------------------------------------------------------------------------------
[project]
name = "omni-article-markdown"
version = "0.1.10"
description = "Easily convert web articles (blogs, news, documents, etc.) into Markdown format."
authors = [
    { name = "Lei", email = "caol64@gmail.com" }
]
readme = "README.md"
requires-python = ">=3.13"
license = "MIT"
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Environment :: Console",
    "Intended Audience :: Developers",
    "Intended Audience :: End Users/Desktop",
    "License :: OSI Approved :: MIT License",
    "Natural Language :: English",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.13",
    "Topic :: Text Processing :: Markup :: Markdown",
    "Topic :: Utilities",
]
dependencies = [
    "requests>=2.32.3",
    "beautifulsoup4>=4.13.4",
    "html5lib>=1.1",
    "click>=8.2.0",
    "pluggy>=1.6.0",
    "click-default-group>=1.2.4",
    "pip",
]

[project.optional-dependencies]
dev = [
    "pytest",
]

[project.scripts]
mdcli = "omni_article_markdown.cli:cli"

[project.urls]
Homepage = "https://github.com/caol64/omni-article-markdown"

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build]
exclude = [
    "/data",
    "/plugins",
    "/dist",
]

[tool.hatch.build.targets.sdist]
include = [
    "/src/omni_article_markdown",
    "/README.md",
    "/pyproject.toml",
]
--------------------------------------------------------------------------------

/src/omni_article_markdown/readers.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod

import requests

from .extractor import Extractor
from .hookspecs import ReaderPlugin
from .plugins import pm
from .utils import REQUEST_HEADERS


class Reader(ABC):
    @abstractmethod
    def read(self) -> str: ...

    def extractor(self) -> Extractor | None:
        return None


class ReaderFactory:
    @staticmethod
    def create(url_or_path: str) -> Reader:
        custom_plugin_reader = pm.hook.get_custom_reader(url=url_or_path)
        if custom_plugin_reader:

            class PluginReaderAdapter(Reader):
                def __init__(self, plugin: ReaderPlugin, url: str):
                    self.plugin = plugin
                    self.url = url

                def read(self) -> str:
                    return self.plugin.read(self.url)

            return PluginReaderAdapter(custom_plugin_reader, url_or_path)
        if url_or_path.startswith("http"):
            return HtmlReader(url_or_path)
        return FileReader(url_or_path)


class HtmlReader(Reader):
    def __init__(self, url_or_path: str):
        self.url_or_path = url_or_path

    def read(self) -> str:
        response = requests.get(self.url_or_path, headers=REQUEST_HEADERS)
        response.encoding = "utf-8"
        return response.text


class FileReader(Reader):
    def __init__(self, url_or_path: str):
        self.url_or_path = url_or_path

    def read(self) -> str:
        with open(self.url_or_path, encoding="utf8") as f:
            return f.read()
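
# A small factory sketch; the URL is a placeholder, and which Reader comes
# back depends on the installed plugins and on the input's scheme:
if __name__ == "__main__":
    reader = ReaderFactory.create("https://example.com/post")  # hypothetical input
    print(type(reader).__name__)  # PluginReaderAdapter, HtmlReader, or FileReader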
--------------------------------------------------------------------------------

/src/omni_article_markdown/extractors/yuque.py:
--------------------------------------------------------------------------------
import json
import re
from typing import override
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup

from ..extractor import Article, Extractor
from ..utils import REQUEST_HEADERS, filter_tag, get_og_url


class YuqueExtractor(Extractor):
    """
    Yuque (语雀)
    """

    @override
    def can_handle(self, soup: BeautifulSoup) -> bool:
        return get_og_url(soup).startswith("https://www.yuque.com")

    @override
    def article_container(self) -> tuple:
        return ("", {})

    @override
    def extract_article(self, soup: BeautifulSoup) -> Article:
        script_tag = filter_tag(soup.find("script", string=re.compile(r"decodeURIComponent")))
        if script_tag:
            raw_js = script_tag.string
            if raw_js:
                match = re.search(r'decodeURIComponent\s*\(\s*"([^"]+)"\s*\)', raw_js)
                if match:
                    encoded_str = match.group(1)
                    decoded_str = unquote(encoded_str)
                    decoded_json = json.loads(decoded_str)
                    # print(decoded_json)
                    doc = decoded_json["doc"]
                    if doc and doc["book_id"]:
                        book_id = str(doc["book_id"])
                        slug = str(doc["slug"])
                        response = requests.get(f"https://www.yuque.com/api/docs/{slug}?book_id={book_id}&mode=markdown", headers=REQUEST_HEADERS)
                        response.encoding = "utf-8"
                        resp = response.json()
                        # print(resp)
                        return Article(str(resp["data"]["title"]), None, None, str(resp["data"]["sourcecode"]))
        return Article("", None, None, "")
--------------------------------------------------------------------------------
/plugins/omnimd-browser-reader/browser.py:
--------------------------------------------------------------------------------
import sys
from runpy import run_module
from typing import override

from playwright.sync_api import Browser, Playwright, sync_playwright

from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl
from omni_article_markdown.utils import BROWSER_TARGET_HOSTS, REQUEST_HEADERS


class BrowserPlugin(ReaderPlugin):
    @override
    def can_handle(self, url: str) -> bool:
        return any(host in url for host in BROWSER_TARGET_HOSTS)

    @override
    def read(self, url: str) -> str:
        def try_launch_browser(p: Playwright) -> Browser:
            try:
                return p.chromium.launch(headless=True)
            except Exception as e:
                # Playwright not installed or browser missing
                if "Executable doesn't exist" in str(e) or "playwright install" in str(e):
                    print("[INFO] Chromium not installed, installing with 'playwright install chromium'...")
                    original_argv = sys.argv
                    args = ["playwright", "install", "chromium"]
                    sys.argv = args
                    run_module("playwright", run_name="__main__")
                    sys.argv = original_argv
                    # Try again
                    return p.chromium.launch(headless=True)
                raise  # re-raise other exceptions

        with sync_playwright() as p:
            browser = try_launch_browser(p)
            context = browser.new_context(
                user_agent=REQUEST_HEADERS["User-Agent"],
                java_script_enabled=True,
                extra_http_headers=REQUEST_HEADERS,
            )
            page = context.new_page()
            page.goto(url, wait_until="networkidle")
            html = page.content()
            page.close()
            context.close()
            browser.close()
            return html


@hookimpl
def get_custom_reader(url: str) -> ReaderPlugin | None:
    plugin_instance = BrowserPlugin()
    if plugin_instance.can_handle(url):
        return plugin_instance
    return None
--------------------------------------------------------------------------------

/plugins/omnimd-freedium-reader/freedium.py:
--------------------------------------------------------------------------------
import sys
from importlib import resources
from runpy import run_module
from typing import override

from playwright.sync_api import Browser, Playwright, sync_playwright

from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl
from omni_article_markdown.utils import REQUEST_HEADERS


class FreediumPlugin(ReaderPlugin):
    @override
    def can_handle(self, url: str) -> bool:
        return "freedium.cfd" in url

    @override
    def read(self, url: str) -> str:
        def try_launch_browser(p: Playwright) -> Browser:
            try:
                return p.chromium.launch(headless=True)
            except Exception as e:
                # Playwright not installed or browser missing
                if "Executable doesn't exist" in str(e) or "playwright install" in str(e):
                    print("[INFO] Chromium not installed, installing with 'playwright install chromium'...")
                    original_argv = sys.argv
                    args = ["playwright", "install", "chromium"]
                    sys.argv = args
                    run_module("playwright", run_name="__main__")
                    sys.argv = original_argv
                    # Try again
                    return p.chromium.launch(headless=True)
                raise  # re-raise other exceptions

        with sync_playwright() as p:
            browser = try_launch_browser(p)
            context = browser.new_context(
                user_agent=REQUEST_HEADERS["User-Agent"],
                java_script_enabled=True,
                extra_http_headers=REQUEST_HEADERS,
            )
            with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path:
                context.add_init_script(path=str(js_path))
            page = context.new_page()
            page.goto(url, wait_until="networkidle")
            html = page.content()
            page.close()
            context.close()
            browser.close()
            return html


@hookimpl
def get_custom_reader(url: str) -> ReaderPlugin | None:
    plugin_instance = FreediumPlugin()
    if plugin_instance.can_handle(url):
        return plugin_instance
    return None
--------------------------------------------------------------------------------
/src/omni_article_markdown/omni_article_md.py:
--------------------------------------------------------------------------------
import importlib
import pkgutil
from dataclasses import dataclass
from pathlib import Path

from bs4 import BeautifulSoup

from .extractor import Article, DefaultExtractor, Extractor
from .parser import HtmlMarkdownParser
from .readers import ReaderFactory
from .utils import to_snake_case


@dataclass
class ReaderContext:
    raw_html: str


@dataclass
class ExtractorContext:
    article: Article


@dataclass
class ParserContext:
    title: str
    markdown: str


class OmniArticleMarkdown:
    DEFAULT_SAVE_PATH = "./"

    def __init__(self, url_or_path: str):
        self.url_or_path = url_or_path

    def parse(self) -> ParserContext:
        reader_ctx = self._read_html(self.url_or_path)
        extractor_ctx = self._extract_article(reader_ctx)
        parser_ctx = self._parse_html(extractor_ctx)
        return parser_ctx

    def save(self, ctx: ParserContext, save_path: str = "") -> str:
        save_path = save_path or self.DEFAULT_SAVE_PATH
        file_path = Path(save_path)
        if file_path.is_dir():
            filename = f"{to_snake_case(ctx.title)}.md"
            file_path = file_path / filename
        with file_path.open("w", encoding="utf-8") as f:
            f.write(ctx.markdown)
        return str(file_path.resolve())

    def _read_html(self, url_or_path: str) -> ReaderContext:
        reader = ReaderFactory.create(url_or_path)
        raw_html = reader.read()
        return ReaderContext(raw_html)

    def _extract_article(self, ctx: ReaderContext) -> ExtractorContext:
        soup = BeautifulSoup(ctx.raw_html, "html5lib")
        for extractor in load_extractors():
            article = extractor.extract(soup)
            if article:
                break
        else:
            article = DefaultExtractor().extract(soup)
        if not article:
            raise ValueError("Failed to extract article content.")
        return ExtractorContext(article)

    def _parse_html(self, ctx: ExtractorContext) -> ParserContext:
        parser = HtmlMarkdownParser(ctx.article)
        result = parser.parse()
        return ParserContext(title=result[0], markdown=result[1])


def load_extractors(package_name="extractors") -> list[Extractor]:
    extractors_package = Path(__file__).parent / package_name
    extractors = []
    for _loader, module_name, _is_pkg in pkgutil.iter_modules([extractors_package.resolve()]):
        module = importlib.import_module(f"omni_article_markdown.{package_name}.{module_name}")
        for attr in dir(module):
            cls = getattr(module, attr)
            if isinstance(cls, type) and issubclass(cls, Extractor) and cls is not Extractor:
                extractors.append(cls())
    return extractors
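
# A sketch of a drop-in extractor that load_extractors() above would discover
# automatically if saved as extractors/example_blog.py; the module name and
# target site are hypothetical, and the overridden hooks mirror the extractors
# shown earlier in this dump:
#
#     from typing import override
#
#     from bs4 import BeautifulSoup
#
#     from ..extractor import Extractor
#     from ..utils import is_matched_canonical
#
#     class ExampleBlogExtractor(Extractor):
#         @override
#         def can_handle(self, soup: BeautifulSoup) -> bool:
#             return is_matched_canonical("https://blog.example.com", soup)
#
#         @override
#         def article_container(self) -> tuple:
#             return ("div", {"class": "post-content"})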
"webview", 19 | "bin", 20 | "build", 21 | "dist", 22 | ] 23 | 24 | [format] 25 | # Use double quotes for strings 26 | quote-style = "double" 27 | 28 | # Use 4 spaces for indentation 29 | indent-style = "space" 30 | 31 | # Respect magic trailing commas 32 | skip-magic-trailing-comma = false 33 | 34 | # Use Unix line endings 35 | line-ending = "auto" 36 | 37 | [lint] 38 | # Enable specific rule sets 39 | select = [ 40 | "E", # pycodestyle errors 41 | "W", # pycodestyle warnings (includes W292 for newline at EOF) 42 | "F", # Pyflakes 43 | "I", # isort 44 | "N", # pep8-naming 45 | "UP", # pyupgrade 46 | "B", # flake8-bugbear 47 | "C4", # flake8-comprehensions 48 | "DTZ", # flake8-datetimez 49 | "T10", # flake8-debugger 50 | "RET", # flake8-return 51 | "SIM", # flake8-simplify 52 | "TID", # flake8-tidy-imports 53 | ] 54 | 55 | # Ignore specific rules 56 | ignore = [ 57 | "E501", # Line too long (handled by formatter) 58 | "E712", # Comparison to True/False (needed for SQLAlchemy) 59 | "B008", # Do not perform function calls in argument defaults 60 | "B904", # Within except clause, use raise from (not always needed) 61 | "UP007", # Use X | Y for type unions (keep Optional for clarity) 62 | "SIM108", # Use ternary operator (sometimes if/else is clearer) 63 | "DTZ005", # datetime.now() without tz (okay for timestamps) 64 | "N999", # Invalid module name (web-bff is valid) 65 | "TID252", # Relative imports from parent (used in package structure) 66 | "RET504", # Unnecessary assignment before return (sometimes clearer) 67 | ] 68 | 69 | # Allow unused variables when prefixed with underscore 70 | dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" 71 | 72 | [lint.per-file-ignores] 73 | # Ignore import violations in __init__ files 74 | "__init__.py" = ["E402", "F401", "F403"] 75 | 76 | # Ignore missing docstrings in tests 77 | "test_*.py" = ["D100", "D101", "D102", "D103", "D104"] 78 | "tests/*" = ["D100", "D101", "D102", "D103", "D104"] 79 | 80 | # Allow dynamic imports in recipe files 81 | "recipes/*" = ["F401", "F403"] 82 | 83 | [lint.isort] 84 | # Combine as imports 85 | combine-as-imports = true 86 | 87 | # Force single line imports 88 | force-single-line = false 89 | 90 | # Order imports by type 91 | section-order = [ 92 | "future", 93 | "standard-library", 94 | "third-party", 95 | "first-party", 96 | "local-folder", 97 | ] 98 | 99 | [lint.pydocstyle] 100 | # Use Google docstring convention 101 | convention = "google" 102 | -------------------------------------------------------------------------------- /plugins/omnimd-zhihu-reader/zhihu.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from importlib import resources 3 | from runpy import run_module 4 | from typing import override 5 | 6 | import requests 7 | from playwright.sync_api import Browser, Cookie, Playwright, sync_playwright 8 | 9 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl 10 | from omni_article_markdown.store import Store 11 | from omni_article_markdown.utils import REQUEST_HEADERS 12 | 13 | 14 | class ZhihuPlugin(ReaderPlugin): 15 | @override 16 | def can_handle(self, url: str) -> bool: 17 | return "zhihu.com" in url 18 | 19 | @override 20 | def read(self, url: str) -> str: 21 | store = Store() 22 | cookies_raw = store.load("zhihu_cookies") 23 | 24 | if not cookies_raw: 25 | print("未找到知乎登录信息,尝试模拟登录...") 26 | cookies_raw = self._get_zhihu_cookies(url) 27 | if not cookies_raw: 28 | raise Exception("无法获取知乎登录信息") 29 | 30 | cookies = 
self._convert_playwright_cookies_to_requests_dict(cookies_raw) 31 | response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies) 32 | 33 | # If the initial request is rejected, refresh the cookies and retry 34 | if response.status_code == 403: 35 | print("Cookie 失效,重新模拟登录知乎...") 36 | cookies_raw = self._get_zhihu_cookies(url) 37 | if not cookies_raw: 38 | raise Exception("重新模拟登录失败,无法访问知乎内容") 39 | cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw) 40 | response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies) 41 | 42 | response.encoding = "utf-8" 43 | return response.text 44 | 45 | def _get_zhihu_cookies(self, url: str) -> list[Cookie]: 46 | def try_launch_browser(p: Playwright) -> Browser: 47 | try: 48 | return p.chromium.launch(headless=True) 49 | except Exception as e: 50 | # Playwright not installed or browser missing 51 | if "Executable doesn't exist" in str(e) or "playwright install" in str(e): 52 | print("[INFO] Chromium not installed, installing with 'playwright install chromium'...") 53 | original_argv = sys.argv 54 | args = ["playwright", "install", "chromium"] 55 | sys.argv = args 56 | run_module("playwright", run_name="__main__") 57 | sys.argv = original_argv 58 | # Try again 59 | return p.chromium.launch(headless=True) 60 | raise # re-raise other exceptions 61 | 62 | with sync_playwright() as p: 63 | browser = try_launch_browser(p) 64 | context = browser.new_context( 65 | user_agent=REQUEST_HEADERS["User-Agent"], 66 | java_script_enabled=True, 67 | extra_http_headers=REQUEST_HEADERS, 68 | ) 69 | with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path: 70 | context.add_init_script(path=str(js_path)) 71 | page = context.new_page() 72 | page.goto(url, wait_until="networkidle") 73 | cookies = context.cookies() 74 | store = Store() 75 | store.save("zhihu_cookies", cookies) 76 | page.close() 77 | context.close() 78 | browser.close() 79 | return cookies 80 | 81 | def _convert_playwright_cookies_to_requests_dict(self, playwright_cookies: list[Cookie]) -> dict[str, str]: 82 | requests_cookies = {} 83 | for cookie in playwright_cookies: 84 | requests_cookies[cookie.get("name")] = cookie.get("value") 85 | return requests_cookies 86 | 87 | 88 | @hookimpl 89 | def get_custom_reader(url: str) -> ReaderPlugin | None: 90 | plugin_instance = ZhihuPlugin() 91 | if plugin_instance.can_handle(url): 92 | return plugin_instance 93 | return None 94 | -------------------------------------------------------------------------------- /plugins/omnimd-toutiao-reader/toutiao.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from importlib import resources 3 | from runpy import run_module 4 | from typing import override 5 | 6 | import requests 7 | from playwright.sync_api import Browser, Cookie, Playwright, sync_playwright 8 | 9 | from omni_article_markdown.hookspecs import ReaderPlugin, hookimpl 10 | from omni_article_markdown.store import Store 11 | from omni_article_markdown.utils import REQUEST_HEADERS 12 | 13 | 14 | class ToutiaoPlugin(ReaderPlugin): 15 | @override 16 | def can_handle(self, url: str) -> bool: 17 | return "toutiao.com" in url 18 | 19 | @override 20 | def read(self, url: str) -> str: 21 | store = Store() 22 | cookies_raw = store.load("toutiao_cookies") 23 | 24 | if not cookies_raw: 25 | print("未找到头条登录信息,尝试模拟登录...") 26 | cookies_raw = self._get_toutiao_cookies(url) 27 | if not cookies_raw: 28 | raise Exception("无法获取头条登录信息") 29 | 30 | cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw)
31 | response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies) 32 | response.encoding = "utf-8" 33 | html = response.text 34 | 35 | # 如果初始请求失败,则尝试重新获取 cookie 并重试 36 | if "您需要允许该网站执行 JavaScript" in html: 37 | print("Cookie 失效,重新模拟登录头条...") 38 | cookies_raw = self._get_toutiao_cookies(url) 39 | if not cookies_raw: 40 | raise Exception("重新模拟登录失败,无法访问头条内容") 41 | cookies = self._convert_playwright_cookies_to_requests_dict(cookies_raw) 42 | response = requests.get(url, headers=REQUEST_HEADERS, cookies=cookies) 43 | 44 | response.encoding = "utf-8" 45 | return response.text 46 | 47 | def _get_toutiao_cookies(self, url: str) -> list[Cookie]: 48 | def try_launch_browser(p: Playwright) -> Browser: 49 | try: 50 | return p.chromium.launch(headless=True) 51 | except Exception as e: 52 | # Playwright not installed or browser missing 53 | if "Executable doesn't exist" in str(e) or "playwright install" in str(e): 54 | print("[INFO] Chromium not installed, installing with 'playwright install chromium'...") 55 | original_argv = sys.argv 56 | args = ["playwright", "install", "chromium"] 57 | sys.argv = args 58 | run_module("playwright", run_name="__main__") 59 | sys.argv = original_argv 60 | # Try again 61 | return p.chromium.launch(headless=True) 62 | raise # re-raise other exceptions 63 | 64 | with sync_playwright() as p: 65 | browser = try_launch_browser(p) 66 | context = browser.new_context( 67 | user_agent=REQUEST_HEADERS["User-Agent"], 68 | java_script_enabled=True, 69 | extra_http_headers=REQUEST_HEADERS, 70 | ) 71 | with resources.path("omni_article_markdown.libs", "stealth.min.js") as js_path: 72 | context.add_init_script(path=str(js_path)) 73 | page = context.new_page() 74 | page.goto(url, wait_until="networkidle") 75 | cookies = context.cookies() 76 | store = Store() 77 | store.save("toutiao_cookies", cookies) 78 | page.close() 79 | context.close() 80 | browser.close() 81 | return cookies 82 | 83 | def _convert_playwright_cookies_to_requests_dict(self, playwright_cookies: list[Cookie]) -> dict[str, str]: 84 | requests_cookies = {} 85 | for cookie in playwright_cookies: 86 | requests_cookies[cookie.get("name")] = cookie.get("value") 87 | return requests_cookies 88 | 89 | 90 | @hookimpl 91 | def get_custom_reader(url: str) -> ReaderPlugin | None: 92 | plugin_instance = ToutiaoPlugin() 93 | if plugin_instance.can_handle(url): 94 | return plugin_instance 95 | return None 96 | -------------------------------------------------------------------------------- /tests/test_extractor.py: -------------------------------------------------------------------------------- 1 | from bs4.element import Tag 2 | 3 | from omni_article_markdown.extractor import ( 4 | Article, 5 | DefaultExtractor, 6 | extract_article_from_soup, 7 | remove_duplicate_titles, 8 | ) 9 | 10 | # ---- mock utils ---- 11 | 12 | def make_html(content: str, title="Page Title", description="Desc", url="https://example.com") -> str: 13 | return f""" 14 | 15 |
16 |Hello
Hello World
Visible
49 | 50 | 51 |Body
Body text
Body text
Hello
", title="Special Page") 111 | extractor = CustomExtractor() 112 | soup = make_soup(html) 113 | assert extractor.can_handle(soup) is True 114 | 115 | article = extractor.extract(soup) 116 | assert article is not None 117 | assert isinstance(article.body, Tag) 118 | assert "Hello" in article.body.text 119 | -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | from omni_article_markdown.extractor import Article 2 | from omni_article_markdown.parser import HtmlMarkdownParser 3 | 4 | 5 | def test_basic_paragraph(make_soup): 6 | html = "Hello world
" 7 | article = Article("Test", "", "", make_soup(html)) 8 | parser = HtmlMarkdownParser(article) 9 | title, md = parser.parse() 10 | assert "# Test" in md 11 | assert "Hello world" in md 12 | 13 | 14 | def test_heading_and_strong(make_soup): 15 | html = "bold and italic
" 16 | article = Article("Title", "", "", make_soup(html)) 17 | parser = HtmlMarkdownParser(article) 18 | _, md = parser.parse() 19 | assert "## Subtitle" in md 20 | assert "**bold**" in md 21 | assert "*italic*" in md 22 | 23 | 24 | def test_link_parsing(make_soup): 25 | html = '' 26 | article = Article("Title", "", "", make_soup(html)) 27 | parser = HtmlMarkdownParser(article) 28 | _, md = parser.parse() 29 | assert "[Example](https://example.com)" in md 30 | 31 | 32 | def test_unordered_list(make_soup): 33 | html = "" 52 | article = Article("Quote", "", "", make_soup(html)) 53 | parser = HtmlMarkdownParser(article) 54 | _, md = parser.parse() 55 | assert "> Quote me" in md 56 | 57 | 58 | def test_codeblock(make_soup): 59 | html = "Quote me
print('Hello')"
60 | article = Article("Code", "", "", make_soup(html))
61 | parser = HtmlMarkdownParser(article)
62 | _, md = parser.parse()
63 | assert "```" in md
64 | assert "print('Hello')" in md
65 |
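The list coverage in this file stops at unordered lists and blockquotes; an ordered-list case would exercise the numbering path as well. A minimal companion sketch to test_unordered_list — assuming the parser renders `<ol>` items as `1.`/`2.` the way most HTML-to-Markdown converters do (the excerpt only confirms unordered-list handling):

```python
from omni_article_markdown.extractor import Article
from omni_article_markdown.parser import HtmlMarkdownParser


def test_ordered_list(make_soup):
    # Hypothetical companion to test_unordered_list; the "1."/"2." output
    # format is an assumption, not confirmed by this excerpt.
    html = "<ol><li>First</li><li>Second</li></ol>"
    article = Article("List", "", "", make_soup(html))
    parser = HtmlMarkdownParser(article)
    _, md = parser.parse()
    assert "1. First" in md
    assert "2. Second" in md
```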
66 |
67 | def test_inline_code(make_soup):
68 | html = "Run ls -al command.
'
77 | article = Article("Img", "", "", make_soup(html))
78 | parser = HtmlMarkdownParser(article)
79 | _, md = parser.parse()
80 | assert "" in md
81 |
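The absolute-URL image test above and the relative-URL test just below share the same build-parse-assert scaffold; since pytest is already in use (see conftest.py), the pair could be folded into one parametrized test. A sketch using only input/output pairs that the existing assertions confirm:

```python
import pytest

from omni_article_markdown.extractor import Article
from omni_article_markdown.parser import HtmlMarkdownParser


@pytest.mark.parametrize(
    ("html", "page_url", "expected"),
    [
        # Absolute src passes through unchanged; alt text becomes the label.
        ('<img src="https://example.com/pic.png" alt="Pic">', "", "![Pic](https://example.com/pic.png)"),
        # Root-relative src is resolved against the article's page URL.
        ('<img src="/img/pic.png">', "https://site.com/docs/page.html", "![](https://site.com/img/pic.png)"),
    ],
)
def test_image_cases(make_soup, html, page_url, expected):
    article = Article("Img", page_url, "", make_soup(html))
    _, md = HtmlMarkdownParser(article).parse()
    assert expected in md
```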
82 |
83 | def test_image_relative_url(make_soup):
84 | html = '
'
85 | article = Article("Img", "https://site.com/docs/page.html", "", make_soup(html))
86 | parser = HtmlMarkdownParser(article)
87 | _, md = parser.parse()
88 | assert "" in md
89 |
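The expected URL in the relative-path test above is exactly what standard URL resolution produces against the article's page URL, so the fixture values can be sanity-checked with the standard library — a quick sketch; whether the parser actually uses `urljoin` internally is not shown in this excerpt:

```python
from urllib.parse import urljoin

# The root-relative src from the test, resolved against the Article's url,
# reproduces the absolute URL the assertion expects.
assert urljoin("https://site.com/docs/page.html", "/img/pic.png") == "https://site.com/img/pic.png"
```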
90 |
91 | def test_table_parsing(make_soup):
92 | html = """
93 | | Name | Age |
|---|---|
| Alice | 18 |
| Bob | 20 |
\\(x+y\\) and \\[E=mc^2\\]
" 116 | article = Article("Math", "", "", make_soup(html)) 117 | parser = HtmlMarkdownParser(article) 118 | _, md = parser.parse() 119 | assert "$x+y$" in md 120 | assert "$$E=mc^2$$" in md 121 | -------------------------------------------------------------------------------- /src/omni_article_markdown/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from runpy import run_module 3 | 4 | import click 5 | from click_default_group import DefaultGroup 6 | 7 | from . import DEFAULT_PLUGINS 8 | from .omni_article_md import OmniArticleMarkdown 9 | 10 | 11 | @click.group(cls=DefaultGroup, default="parse", default_if_no_args=True) 12 | def cli(): 13 | """ 14 | A CLI tool to parse articles and save them as Markdown. 15 | It also supports installing plugins. 16 | """ 17 | ... 18 | 19 | 20 | @cli.command(name="parse") 21 | @click.argument("url_or_path") 22 | @click.option( 23 | "-s", 24 | "--save", 25 | help="Save result (default: ./). Provide a path to save elsewhere.", 26 | type=click.Path(dir_okay=True, writable=True), 27 | ) 28 | def parse_article(url_or_path: str, save: str | None): 29 | """ 30 | Parses an article from a URL or local path and outputs/saves it as Markdown. 31 | """ 32 | handler = OmniArticleMarkdown(url_or_path) 33 | parser_ctx = handler.parse() 34 | 35 | if save is None: 36 | click.echo(parser_ctx.markdown) 37 | else: 38 | save_path = handler.save(parser_ctx, save) 39 | click.echo(f"Article saved to: {save_path}") 40 | 41 | 42 | @cli.command() 43 | @click.argument("plugin_name") 44 | @click.option("-U", "--upgrade", is_flag=True, help="Upgrade the plugin if already installed.", default=False) 45 | @click.option( 46 | "-e", 47 | "--editable", 48 | is_flag=True, 49 | help="Install the editable package based on the provided local file path", 50 | default=False, 51 | ) 52 | def install(plugin_name: str, upgrade: bool, editable: bool): 53 | """ 54 | Installs a plugin for this application. 55 | For example, to install the 'zhihu' plugin: mdcli install zhihu 56 | """ 57 | actual_package_name = ( 58 | plugin_name if editable or plugin_name not in DEFAULT_PLUGINS else DEFAULT_PLUGINS[plugin_name] 59 | ) 60 | 61 | click.echo(f"Attempting to install plugin: {actual_package_name}...") 62 | args = ["pip", "install"] 63 | if upgrade: 64 | args.append("--upgrade") 65 | args.append(actual_package_name) 66 | 67 | original_argv = sys.argv 68 | try: 69 | sys.argv = args 70 | run_module("pip", run_name="__main__") 71 | click.echo(f"Plugin '{actual_package_name}' processed by pip.") 72 | click.echo("If the plugin provides new functionality, it should now be available.") 73 | click.echo( 74 | "You might need to restart the application for changes to take full effect if it involves runtime loading during startup." 75 | ) 76 | except Exception as e: 77 | click.echo(f"Failed to process plugin '{actual_package_name}' with pip: {e}", err=True) 78 | click.echo("Please ensure pip is installed and the package name is correct.", err=True) 79 | finally: 80 | sys.argv = original_argv 81 | 82 | 83 | @cli.command() 84 | @click.argument("plugin_name") 85 | @click.option("-y", "--yes", is_flag=True, help="Don't ask for confirmation before uninstalling.", default=False) 86 | def uninstall(plugin_name: str, yes: bool): 87 | """ 88 | Uninstalls a plugin for this application. 
89 | For example, to uninstall the 'zhihu' plugin: mdcli uninstall zhihu 90 | """ 91 | actual_package_name = DEFAULT_PLUGINS.get(plugin_name, plugin_name) 92 | 93 | click.echo(f"Attempting to uninstall plugin: {actual_package_name}...") 94 | args = ["pip", "uninstall"] 95 | if yes: 96 | args.append("-y") 97 | args.append(actual_package_name) 98 | 99 | original_argv = sys.argv 100 | try: 101 | sys.argv = args 102 | run_module("pip", run_name="__main__") 103 | click.echo(f"Plugin '{actual_package_name}' uninstallation processed by pip.") 104 | click.echo( 105 | "The plugin should no longer be available the next time the application starts." 106 | ) 107 | except Exception as e: 108 | click.echo(f"Failed to process uninstallation of plugin '{actual_package_name}' with pip: {e}", err=True) 109 | click.echo("Please ensure pip is installed and the package name is correct.", err=True) 110 | finally: 111 | sys.argv = original_argv 112 | 113 | 114 | if __name__ == "__main__": 115 | cli() 116 | -------------------------------------------------------------------------------- /src/omni_article_markdown/extractor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from abc import ABC, abstractmethod 3 | from collections.abc import Callable 4 | from dataclasses import dataclass 5 | from typing import override 6 | 7 | from bs4 import BeautifulSoup 8 | from bs4.element import Comment, Tag 9 | 10 | from .utils import filter_tag, get_attr_text, get_canonical_url, get_og_description, get_og_title, get_og_url, get_title 11 | 12 | TAGS_TO_CLEAN: list[Callable[[Tag], bool]] = [ 13 | lambda el: el.name in ("style", "link", "button", "footer", "header"), 14 | lambda el: el.name == "script" and "src" not in el.attrs, 15 | lambda el: el.name == "script" 16 | and el.has_attr("src") 17 | and not get_attr_text(el.attrs["src"]).startswith("https://gist.github.com"), 18 | ] 19 | 20 | ATTRS_TO_CLEAN: list[Callable[[Tag], bool]] = [ 21 | lambda el: "style" in el.attrs 22 | and re.search(r"display\s*:\s*none", get_attr_text(el.attrs.get("style")), re.IGNORECASE) is not None, 23 | lambda el: "hidden" in el.attrs, 24 | lambda el: "class" in el.attrs and "katex-html" in el.attrs["class"], # katex 25 | ] 26 | 27 | ARTICLE_CONTAINERS = [("article", None), ("main", None), ("body", None)] 28 | 29 | 30 | @dataclass 31 | class Article: 32 | title: str 33 | url: str | None 34 | description: str | None 35 | body: Tag | str 36 | 37 | 38 | class Extractor(ABC): 39 | def __init__(self): 40 | self.tags_to_clean = TAGS_TO_CLEAN 41 | self.attrs_to_clean = ATTRS_TO_CLEAN 42 | 43 | def extract(self, soup: BeautifulSoup) -> Article | None: 44 | if self.can_handle(soup): 45 | # print(f"Using extractor: {self.__class__.__name__}") 46 | soup = self.pre_handle_soup(soup) 47 | article_container = self.article_container() 48 | if isinstance(article_container, tuple): 49 | article_container = [article_container] 50 | for container in article_container: 51 | article = self.extract_article(soup) 52 | if article: 53 | return article 54 | article_tag = extract_article_from_soup(soup, container) 55 | if article_tag: 56 | for el in article_tag.find_all(): 57 | tag = filter_tag(el) 58 | if tag: 59 | if any(cond(tag) for cond in self.tags_to_clean): 60 | tag.decompose() 61 | continue 62 | if tag.attrs and any(cond(tag) for cond in self.attrs_to_clean): 63 | tag.decompose() 64 | for comment in article_tag.find_all(string=lambda text: isinstance(text, Comment)): 65 |
comment.extract() 66 | self.extract_img(article_tag) 67 | title = self.extract_title(soup) 68 | description = self.extract_description(soup) 69 | url = self.extract_url(soup) 70 | article = Article(title=title, url=url, description=description, body=article_tag) 71 | remove_duplicate_titles(article) 72 | return article 73 | return None 74 | 75 | @abstractmethod 76 | def can_handle(self, soup: BeautifulSoup) -> bool: ... 77 | 78 | @abstractmethod 79 | def article_container(self) -> tuple | list: ... 80 | 81 | def extract_title(self, soup: BeautifulSoup) -> str: 82 | return get_og_title(soup) or get_title(soup) 83 | 84 | def extract_description(self, soup: BeautifulSoup) -> str: 85 | return get_og_description(soup) 86 | 87 | def extract_url(self, soup: BeautifulSoup) -> str: 88 | return get_og_url(soup) or get_canonical_url(soup) 89 | 90 | def extract_img(self, element: Tag) -> Tag: 91 | return element 92 | 93 | def extract_article(self, soup: BeautifulSoup) -> Article | None: 94 | return None 95 | 96 | def pre_handle_soup(self, soup: BeautifulSoup) -> BeautifulSoup: 97 | return soup 98 | 99 | 100 | class DefaultExtractor(Extractor): 101 | @override 102 | def can_handle(self, soup: BeautifulSoup) -> bool: 103 | return True 104 | 105 | @override 106 | def article_container(self) -> tuple | list: 107 | return ARTICLE_CONTAINERS 108 | 109 | 110 | def extract_article_from_soup(soup: BeautifulSoup, template: tuple) -> Tag | None: 111 | if template[1] is not None: 112 | result = soup.find(template[0], attrs=template[1]) 113 | else: 114 | result = soup.find(template[0]) 115 | return filter_tag(result) 116 | 117 | 118 | def remove_duplicate_titles(article: Article): 119 | if article.body and isinstance(article.body, Tag): 120 | first_h1 = article.body.find("h1") 121 | if first_h1: 122 | h1_text = first_h1.get_text(strip=True) 123 | if h1_text.lower() in article.title.lower(): 124 | article.title = h1_text 125 | first_h1.decompose() 126 | -------------------------------------------------------------------------------- /src/omni_article_markdown/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlparse 3 | 4 | from bs4 import BeautifulSoup 5 | from bs4.element import AttributeValueList, NavigableString, PageElement, Tag 6 | 7 | REQUEST_HEADERS = { 8 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36 Edg/126.0.0.0", 9 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", 10 | "Accept-Encoding": "gzip, deflate", 11 | "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6", 12 | "Priority": "u=0, i", 13 | "Sec-Ch-Ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Microsoft Edge";v="126"', 14 | "Sec-Ch-Ua-Mobile": "?0", 15 | "Sec-Ch-Ua-Platform": '"macOS"', 16 | "Sec-Fetch-Dest": "document", 17 | "Sec-Fetch-Mode": "navigate", 18 | "Sec-Fetch-Site": "none", 19 | "Sec-Fetch-User": "?1", 20 | "Upgrade-Insecure-Requests": "1", 21 | } 22 | 23 | BROWSER_TARGET_HOSTS = [ 24 | "developer.apple.com/documentation/", 25 | "www.infoq.cn/", 26 | "pcsx2.net/", 27 | "baijiahao.baidu.com/", 28 | ] 29 | 30 | def is_sequentially_increasing(code: str) -> bool: 31 | try: 32 | # Split the text on newlines and parse each line as an integer 33 | numbers = [int(line.strip()) for line in code.split("\n") if line.strip()] 34 | # Check that each number is exactly one greater than the previous 35 | return all(numbers[i] + 1 == numbers[i + 1] for i in
range(len(numbers) - 1)) 36 | except ValueError: 37 | return False # Non-numeric content, so not a line-number sequence 38 | 39 | 40 | def move_spaces(input_string: str, suffix: str) -> str: 41 | # Match strings that end with the given suffix, with whitespace immediately before it 42 | escaped_suffix = re.escape(suffix) # Escape any regex special characters in the suffix 43 | pattern = rf"(.*?)\s+({escaped_suffix})$" 44 | match = re.search(pattern, input_string) 45 | if match: 46 | # Separate the main part (without the spaces) from the trailing suffix, e.g. '**' 47 | main_part = match.group(1) 48 | stars = match.group(2) 49 | # Count the spaces and move them to after the suffix 50 | space_count = len(input_string) - len(main_part) - len(stars) 51 | return f"{main_part}{stars}{' ' * space_count}" 52 | return input_string 53 | 54 | 55 | def to_snake_case(input_string: str) -> str: 56 | input_string = "".join(char if char.isalnum() else " " for char in input_string) 57 | snake_case_string = "_".join(word.lower() for word in input_string.split()) 58 | return snake_case_string 59 | 60 | 61 | def collapse_spaces(text: str) -> str: 62 | """ 63 | Collapse runs of consecutive whitespace (including newlines and tabs) into a single space. 64 | """ 65 | return re.sub(r"\s+", " ", text) 66 | 67 | 68 | def extract_domain(url: str) -> str | None: 69 | """ 70 | Extract the domain (including scheme) from a URL. 71 | 72 | Args: 73 | url (str): The URL to extract the domain from. 74 | 75 | Returns: 76 | str | None: The extracted domain (including scheme), or None if parsing fails or the scheme is unsupported. 77 | """ 78 | try: 79 | parsed_url = urlparse(url) 80 | if parsed_url.scheme in {"http", "https"} and parsed_url.netloc: 81 | return f"{parsed_url.scheme}://{parsed_url.netloc}".rstrip("/") 82 | return None # The URL is malformed or the scheme is unsupported 83 | 84 | except ValueError: 85 | return None # Invalid URL format 86 | 87 | 88 | def detect_language(file_name: str | None, code: str) -> str: 89 | # TODO: add language-detection logic 90 | return "" 91 | 92 | 93 | def filter_tag(el: Tag | PageElement | NavigableString | None) -> Tag | None: 94 | if el is None or not isinstance(el, Tag): 95 | return None 96 | return el 97 | 98 | 99 | def get_attr_text(el: str | AttributeValueList | None) -> str: 100 | if el is None: 101 | return "" 102 | if isinstance(el, str): 103 | return el.strip() 104 | return " ".join(el).strip() 105 | 106 | 107 | def get_og_url(soup: BeautifulSoup) -> str: 108 | og_tag = filter_tag(soup.find("meta", {"property": "og:url"})) 109 | return get_tag_text(og_tag, "content") 110 | 111 | 112 | def get_og_site_name(soup: BeautifulSoup) -> str: 113 | og_tag = filter_tag(soup.find("meta", {"property": "og:site_name"})) 114 | return get_tag_text(og_tag, "content") 115 | 116 | 117 | def get_og_description(soup: BeautifulSoup) -> str: 118 | og_tag = filter_tag(soup.find("meta", {"property": "og:description"})) 119 | return get_tag_text(og_tag, "content") 120 | 121 | 122 | def get_canonical_url(soup: BeautifulSoup) -> str: 123 | canonical_tag = filter_tag(soup.find("link", {"rel": "canonical"})) 124 | return get_tag_text(canonical_tag, "href") 125 | 126 | 127 | def is_matched_canonical(url: str, soup: BeautifulSoup) -> bool: 128 | canonical = get_canonical_url(soup) 129 | if not canonical: 130 | return False 131 | return canonical.startswith(url) 132 | 133 | 134 | def get_og_title(soup: BeautifulSoup) -> str: 135 | og_tag = filter_tag(soup.find("meta", {"property": "og:title"})) 136 | return get_tag_text(og_tag, "content") 137 | 138 | 139 | def get_tag_text(tag: Tag | None, attr: str) -> str: 140 | if tag is not None and tag.has_attr(attr): 141 | el = tag[attr] 142 | return get_attr_text(el) 143 | return "" 144 | 145 | 146 | def get_title(soup: BeautifulSoup) -> str: 147 | title_tag = soup.title 148 | return title_tag.get_text(strip=True) if title_tag else "" 149 |
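Most of these helpers are small pure functions, so their contracts read most clearly as examples. A usage sketch — the expected values below mirror the assertions in tests/test_utils.py, which follows:

```python
from omni_article_markdown.utils import (
    collapse_spaces,
    extract_domain,
    is_sequentially_increasing,
    to_snake_case,
)

# A line-number gutter scraped alongside a code block looks like "1\n2\n3\n4".
assert is_sequentially_increasing("1\n2\n3\n4") is True
assert extract_domain("https://example.com/path?q=1") == "https://example.com"
assert extract_domain("ftp://example.com") is None  # only http/https are accepted
assert to_snake_case("Hello World!") == "hello_world"
assert collapse_spaces("a  b\tc\nd") == "a b c d"
```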
-------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from bs4.element import AttributeValueList, NavigableString 3 | 4 | from omni_article_markdown.utils import ( 5 | collapse_spaces, 6 | detect_language, 7 | extract_domain, 8 | filter_tag, 9 | get_attr_text, 10 | get_canonical_url, 11 | get_og_description, 12 | get_og_site_name, 13 | get_og_title, 14 | get_og_url, 15 | get_tag_text, 16 | get_title, 17 | is_matched_canonical, 18 | is_sequentially_increasing, 19 | move_spaces, 20 | to_snake_case, 21 | ) 22 | 23 | 24 | # -------------------------- 25 | # 测试 is_sequentially_increasing 26 | # -------------------------- 27 | def test_is_sequentially_increasing_true(): 28 | code = "1\n2\n3\n4" 29 | assert is_sequentially_increasing(code) is True 30 | 31 | 32 | def test_is_sequentially_increasing_false(): 33 | code = "1\n3\n5" 34 | assert is_sequentially_increasing(code) is False 35 | 36 | 37 | def test_is_sequentially_increasing_non_numeric(): 38 | code = "a\nb\nc" 39 | assert is_sequentially_increasing(code) is False 40 | 41 | 42 | # -------------------------- 43 | # move_spaces 44 | # -------------------------- 45 | def test_move_spaces(): 46 | assert move_spaces("**hello **", "**") == "**hello** " 47 | assert move_spaces("**hello **", "**") == "**hello** " 48 | assert move_spaces("**hello world**", "**") == "**hello world**" 49 | 50 | 51 | # -------------------------- 52 | # to_snake_case 53 | # -------------------------- 54 | def test_to_snake_case(): 55 | assert to_snake_case("HelloWorld") == "helloworld" 56 | assert to_snake_case("Hello World!") == "hello_world" 57 | assert to_snake_case("Already_snake_case") == "already_snake_case" 58 | 59 | 60 | # -------------------------- 61 | # collapse_spaces 62 | # -------------------------- 63 | def test_collapse_spaces(): 64 | assert collapse_spaces("a b\tc\nd") == "a b c d" 65 | 66 | 67 | # -------------------------- 68 | # extract_domain 69 | # -------------------------- 70 | def test_extract_domain(): 71 | assert extract_domain("https://example.com/path?q=1") == "https://example.com" 72 | assert extract_domain("http://abc.xyz") == "http://abc.xyz" 73 | assert extract_domain("ftp://example.com") is None 74 | assert extract_domain("not_a_url") is None 75 | 76 | 77 | # -------------------------- 78 | # detect_language 79 | # -------------------------- 80 | def test_detect_language_placeholder(): 81 | assert detect_language("file.py", "print('hi')") == "" 82 | 83 | 84 | # -------------------------- 85 | # filter_tag 86 | # -------------------------- 87 | def test_filter_tag_with_tag(make_soup): 88 | soup = make_soup("") 89 | el = soup.div 90 | assert filter_tag(el) == el 91 | 92 | 93 | def test_filter_tag_with_none_or_text(): 94 | text_node = NavigableString("text") 95 | assert filter_tag(None) is None 96 | assert filter_tag(text_node) is None 97 | 98 | 99 | # -------------------------- 100 | # get_attr_text 101 | # -------------------------- 102 | def test_get_attr_text(): 103 | assert get_attr_text(" hello ") == "hello" 104 | assert get_attr_text(AttributeValueList(["a", "b", "c"])) == "a b c" 105 | assert get_attr_text(None) == "" 106 | 107 | 108 | # -------------------------- 109 | # meta/og tag 相关 110 | # -------------------------- 111 | HTML_DOC = """ 112 | 113 | 114 |