├── .coveragerc ├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── LICENSE ├── README.md ├── README_zh.md ├── config.json ├── demos ├── notion_templage.png ├── yinxiang_clipper.html ├── yinxiang_clipper.json ├── yinxiang_clipper.resources │ ├── svg_1.svg │ ├── svg_10.svg │ ├── svg_11.svg │ ├── svg_12.svg │ ├── svg_13.svg │ ├── svg_2.svg │ ├── svg_3.svg │ ├── svg_4.svg │ ├── svg_5.svg │ ├── svg_6.svg │ ├── svg_7.svg │ ├── svg_8.svg │ └── svg_9.svg ├── yinxiang_clipper_2.html ├── yinxiang_clipper_2.resources │ ├── svg_1.svg │ ├── svg_10.svg │ ├── svg_11.svg │ ├── svg_12.svg │ ├── svg_13.svg │ ├── svg_14.svg │ ├── svg_15.svg │ ├── svg_16.svg │ ├── svg_17.svg │ ├── svg_18.svg │ ├── svg_19.svg │ ├── svg_2.svg │ ├── svg_20.svg │ ├── svg_21.svg │ ├── svg_22.svg │ ├── svg_23.svg │ ├── svg_24.svg │ ├── svg_25.svg │ ├── svg_26.svg │ ├── svg_27.svg │ ├── svg_3.svg │ ├── svg_4.svg │ ├── svg_5.svg │ ├── svg_6.svg │ ├── svg_7.svg │ ├── svg_8.svg │ └── svg_9.svg ├── yinxiang_clipper_wx.html ├── yinxiang_clipper_wx.json ├── yinxiang_gbk.html ├── yinxiang_markdown.html ├── yinxiang_markdown.json ├── yinxiang_markdown.resources │ └── 5BB98FD9-8FA4-481F-AF4E-E3B1F2DD38BC.png ├── yinxiang_mobile.html ├── yinxiang_normal.html ├── yinxiang_normal.resources │ └── 7672861D-5C56-4A07-B0E6-256950F2775A.png ├── yinxiang_normal_format.html ├── yinxiang_notion.png ├── yinxiang_notion2.png └── yinxiang_supernote.html ├── examples ├── insert_divider.ipynb ├── insert_table.ipynb ├── insert_text.ipynb ├── insert_todo.ipynb ├── parse_code.ipynb ├── parse_tag.ipynb └── process_md.ipynb ├── html2notion ├── __init__.py ├── main.py ├── translate │ ├── __init__.py │ ├── batch_import.py │ ├── cos_uploader.py │ ├── html2json.py │ ├── html2json_base.py │ ├── html2json_clipper.py │ ├── html2json_default.py │ ├── html2json_markdown.py │ ├── html2json_yinxiang.py │ ├── import_stats.py │ ├── notion_export.py │ └── notion_import.py └── utils │ ├── __init__.py │ ├── load_config.py │ ├── log.py 
│ ├── timeutil.py │ └── url_process.py ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── conftest.py ├── test_batchimport.py ├── test_config.py ├── test_cosupload.py ├── test_demos.py ├── test_log.py ├── test_notionexport.py ├── test_reqlimit.py ├── test_util.py └── test_yinxiang.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */__init__.py 4 | main.py 5 | 6 | [report] 7 | exclude_lines = 8 | if __name__ == .__main__.: 9 | async def main(.*): 10 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8", "3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest pytest-asyncio pytest-cov 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | pip install -e . 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. 
The GitHub editor is 127 chars wide 38 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest --cov=html2notion --cov-config=.coveragerc 42 | env: 43 | notion_api_key: ${{ secrets.NOTION_API_KEY }} 44 | notion_db_id_1: ${{ secrets.NOTION_DATABASE_ID_1 }} 45 | notion_page_id_1: ${{ secrets.NOTION_PAGE_ID_1 }} 46 | cos_secret_id: ${{ secrets.COS_SECRET_ID }} 47 | cos_secret_key: ${{ secrets.COS_SECRET_KEY }} 48 | cos_region: ${{ secrets.COS_REGION }} 49 | cos_bucket: ${{ secrets.COS_BUCKET }} 50 | - name: Upload coverage reports to Codecov 51 | uses: codecov/codecov-action@v3 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.pyc 3 | notion_demo/* 4 | build/ 5 | html2notion.egg-info/ 6 | logs/* 7 | .config.json 8 | .DS_Store 9 | dist/* 10 | .coverage 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) SelfBoot 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
Row 1: Role, Read, Normal | Row 1: Bold Text. Undownline | Row 1: Link here: https://platform.openai.com/docs/guides/chat/introduction | Row 1: In classification problems:
Code here; |
Row 2: System | Row 2:You are a helpful assistant. | Row 2: none |
12 | 师徒们正行赏间,又见一山挡路。唐僧道:“徒弟们仔细,前遇山高,恐有虎狼阻挡。”行者道:“师父,出家人莫说在家话。你记得那乌巢和尚的《心经》云心无挂碍,无挂碍,方无恐怖,远离颠倒梦想之言?但只是扫除心上垢,洗净耳边尘。不受苦中苦,难为人上人。你莫生忧虑,但有老孙,就是塌下天来,可保无事。怕什么虎狼!”
13 | |
14 |
25 | 扫地恐伤蝼蚁命,爱惜飞蛾纱罩灯。
26 | |
27 |
35 | 早还是山野中无人查考,若到城市,倘有人一时冲撞了你,你也行凶,执着棍子,乱打伤人,我可做得白客,怎能脱身?
36 | |
37 |
92 | 那只虎蹲着身,伏在尘埃,动也不敢动动。却被他照头一棒,就打的脑浆迸万点桃红,牙齿喷几珠玉块,唬得那陈玄奘滚鞍落马,咬指道声:“天哪,天哪!刘太保前日打的斑斓虎,还与他斗了半日。今日孙悟空不用争持,把这虎一棒打得稀烂,正是强中更有强中手!”
93 | |
94 |
102 | 行者去解开包袱,在那包裹中间见有几个粗面烧饼,拿出来递与师父。又见那光艳艳的一领绵布直裰,一顶嵌金花帽,行者道:“这衣帽是东土带来的?”三藏就顺口儿答应道:“是我小时穿戴的。这帽子若戴了,不用教经,就会念经;这衣服若穿了,不用演礼,就会行礼。”行者道:“好师父,把与我穿戴了罢。”三藏道:“只怕长短不一,你若穿得,就穿了罢。”行者遂脱下旧白布直裰,将绵布直裰穿上,也就是比量着身体裁的一般,把帽儿戴上。三藏见他戴上帽子,就不吃干粮,却默默的念那紧箍咒一遍。行者叫道:“头痛,头痛!”那师父不住的又念了几遍,把个行者痛得打滚,抓破了嵌金的花帽。
103 | |
104 |
112 | 行者道:“贤弟,你起来。不是我去不成,既是妖精敢骂我,我就不能不降他,我和你去。老孙五百年前大闹天宫,普天的神将看见我,一个个控背躬身,口口称呼大圣。这妖怪无礼,他敢背前面后骂我!我这去,把他拿住,碎尸万段,以报骂我之仇!报毕,我即回来。”八戒道:“哥哥,正是,你只去拿了妖精,报了你仇,那时来与不来,任从尊意。” 那猴才跳下崖,撞入洞里,脱了妖衣,整一整锦直裰,束一束虎皮裙,执了铁棒,径出门来。慌得那群猴拦住道:“大圣爷爷,你往那里去?带挈我们耍子几年也好。”行者道:“小的们,你说那里话!我保唐僧的这桩事,天上地下,都晓得孙悟空是唐僧的徒弟。他倒不是赶我回来,倒是教我来家看看,送我来家自在耍子。如今只因这件事,你们却都要仔细看守家业,依时插柳栽松,毋得废坠,待我还去保唐僧,取经回东土。功成之后,仍回来与你们共乐天真。”众猴各各领命。
113 | |
114 |
122 | 长老现了原身,定性睁睛,才认得是行者,一把搀住道:“悟空!你从那里来也?”沙僧侍立左右,把那请行者降妖精,救公主,解虎气,并回朝上项事,备陈了一遍。三藏谢之不尽道:“贤徒,亏了你也,亏了你也!这一去,早诣西方,径回东土,奏唐王,你的功劳第一。”行者笑道:“莫说莫说!但不念那话儿,足感爱厚之情也。”国王闻此言,又劝谢了他四众,整治素筵,大开东阁。他师徒受了皇恩,辞王西去。
123 | |
124 |
132 | 行者闻言,把功曹叱退,切切在心,按云头,径来山上。只见长老与八戒、沙僧,簇拥前进,他却暗想:“我若把功曹的言语实实告诵师父,师父他不济事,必就哭了;假若不与他实说,梦着头,带着他走,常言道乍入芦圩,不知深浅。倘或被妖魔捞去,却不又要老孙费心?且等我照顾八戒一照顾,先着他出头与那怪打一仗看。若是打得过他,就算他一功;若是没手段,被怪拿去,等老孙再去救他不迟,却好显我本事出名。”
133 | |
134 |
142 | 呆子真个对行者说道:“哥哥,你教我做甚事?”行者道:“第一件是看师父,第二件是去巡山。”八戒道:“看师父是坐,巡山去是走。终不然教我坐一会又走,走一会又坐,两处怎么顾盼得来?”行者道:“不是教你两件齐干,只是领了一件便罢。”八戒又笑道:“这等也好计较。但不知看师父是怎样,巡山是怎样,你先与我讲讲,等我依个相应些儿的去干罢。”行者道:“看师父啊,师父去出恭,你伺候;师父要走路,你扶持;师父要吃斋,你化斋。若他饿了些儿,你该打;黄了些儿脸皮,你该打;瘦了些儿形骸,你该打。”
143 | |
144 |
The OpenAI API can be applied to virtually any task that involves understanding or generating natural language, code, or images. We offer a spectrum of models with different levels of power suitable for different tasks, as well as the ability to fine-tune
your own custom models. These models can be used for everything from content generation to semantic search and classification.
\n\n###\n\n
.Remember to also append this separator when you eventually make requests to your model.Row 1: Role, Read, Normal | Row 1: Bold Text. Undownline | Row 1: Link here: https://platform.openai.com/docs/guides/chat/introduction | Row 1: In classification problems:
Code here; |
Row 2: System | Row 2:You are a helpful assistant. | Row 2: none |
Animal | Names | Column | Column2 |
Cat | Captain | pading | agagin |
Dog | Ruff the Protector | null |
\n",
33 | " 1\n",
34 | "2\n",
35 | "3\n",
36 | "4\n",
37 | "5\n",
38 | "6\n",
39 | "7\n",
40 | "8\n",
41 | "9\n",
42 | "10\n",
43 | "11\n",
44 | "12\n",
45 | "
\n",
46 | "# Note: you need to be using OpenAI Python v0.27.0 for the code below to workimport openai\n",
48 | "\n",
49 | "openai.ChatCompletion.create(\n",
50 | " model=\"gpt-3.5-turbo\",\n",
51 | " messages=[\n",
52 | " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n",
53 | " {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n",
54 | " {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n",
55 | " {\"role\": \"user\", \"content\": \"Where was it played?\"}\n",
56 | " ]\n",
57 | ")
\n",
58 | "\"\"\"\n",
59 | "soup = BeautifulSoup(html_doc, 'html.parser')\n",
60 | "\n",
61 | "# 找到所有的标签\n", 62 | "pre_tags = soup.find_all('pre')\n", 63 | "\n", 64 | "for pre in pre_tags:\n", 65 | " # 在每个标签中找到标签\n", 66 | " code_tags = pre.find_all('code')\n", 67 | " \n", 68 | " for code in code_tags:\n", 69 | " # 检查
标签是否包含行号,这里假设行号是在标签中的数字\n", 70 | " span_tags = code.find_all('span')\n", 71 | " \n", 72 | " for span in span_tags:\n", 73 | " if span.string and span.string.strip().isdigit():\n", 74 | " # 如果是行号,则删除这个标签\n", 75 | " span.decompose()\n", 76 | "\n", 77 | "# 这时,soup中的HTML已经没有行号了\n", 78 | "print(soup.prettify())\n" 79 | ] 80 | } 81 | ], 82 | "metadata": { 83 | "kernelspec": { 84 | "display_name": "notion", 85 | "language": "python", 86 | "name": "python3" 87 | }, 88 | "language_info": { 89 | "codemirror_mode": { 90 | "name": "ipython", 91 | "version": 3 92 | }, 93 | "file_extension": ".py", 94 | "mimetype": "text/x-python", 95 | "name": "python", 96 | "nbconvert_exporter": "python", 97 | "pygments_lexer": "ipython3", 98 | "version": "3.11.2" 99 | }, 100 | "orig_nbformat": 4 101 | }, 102 | "nbformat": 4, 103 | "nbformat_minor": 2 104 | } 105 | -------------------------------------------------------------------------------- /examples/parse_tag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from bs4 import BeautifulSoup, NavigableString\n", 10 | "\n", 11 | "html = '''\n", 12 | "
\n", 13 | "\n", 36 | "'''\n", 37 | "\n", 38 | "def extract_text_and_parents(tag, parents=[]):\n", 39 | " results = []\n", 40 | " for child in tag.children:\n", 41 | " if isinstance(child, NavigableString):\n", 42 | " if child.strip():\n", 43 | " text = child.strip()\n", 44 | " parent_tags = [{\"name\": p.name, \"attrs\": p.attrs} for p in parents + [tag]]\n", 45 | " results.append({\"text\": text, \"parent_tags\": parent_tags})\n", 46 | " else:\n", 47 | " results.extend(extract_text_and_parents(child, parents + [tag]))\n", 48 | " return results\n", 49 | "\n", 50 | "soup = BeautifulSoup(html, 'html.parser')\n", 51 | "td_tags = soup.find_all('td')\n", 52 | "\n", 53 | "for i, td in enumerate(td_tags, 1):\n", 54 | " text_with_parents = extract_text_and_parents(td)\n", 55 | " print(f\"Text and parent tags in TD {i}:\")\n", 56 | " for item in text_with_parents:\n", 57 | " print(f\"Text: {item['text']}\")\n", 58 | " print(\"Parent tags:\")\n", 59 | " for parent in item[\"parent_tags\"]:\n", 60 | " print(f\" Tag: {parent['name']}, Attributes: {parent['attrs']}\")\n", 61 | " print()\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "from bs4 import BeautifulSoup\n", 71 | "from html2notion.translate.html2json_base import Html2JsonBase\n", 72 | "content = \"\"\"\n", 14 | "\n", 15 | " \n", 16 | "
\n", 34 | "\n", 17 | " \n", 27 | "\n", 18 | " \n", 20 | "Row 1: You are a helpful assistant. Remember it.\n", 19 | "\n", 21 | " \n", 22 | " \n", 23 | "\n", 24 | " \n", 26 | "Row 1: Import Content Read more.\n", 25 | "\n", 28 | " \n", 32 | " \n", 33 | "\n", 29 | " Row 2:\n", 30 | " Row 2:\n", 31 | " Row 2:\n", 35 | "web image:
\n", 73 | "\"\"\"\n", 74 | "tag = BeautifulSoup(content, 'html.parser').find('p')\n", 75 | "text_and_parents = Html2JsonBase.extract_text_and_parents(tag)\n", 76 | "for item in text_and_parents:\n", 77 | " print(f\"Text: {item[0]}, {item[1]}\")" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "notion", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.11.2" 98 | }, 99 | "orig_nbformat": 4 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 2 103 | } 104 | -------------------------------------------------------------------------------- /examples/process_md.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "md_text = \"\"\"\n", 10 | "# Header\n", 11 | "\n", 12 | "**bold**, _ite_, ~~other~~, more...\n", 13 | "`inline code` here.\n", 14 | "\n", 15 | "```python\n", 16 | "import os\n", 17 | "os.print('hello')\n", 18 | "```\n", 19 | "\n", 20 | "> Please work through this document in its entirety to better understand how OpenAI’s rate limit system works. We include code examples and possible solutions to handle common issues. 
It is recommended to **follow** this guidance before filling out the [Rate Limit Increase Request form](https://docs.google.com/forms/d/e/1FAIpQLSc6gSL3zfHFlL6gNIyUcjkEv29jModHGxg5_XGyr-PrE2LaHw/viewform) with details regarding how to fill it out in the last section.\n", 21 | "\n", 22 | "divider\n", 23 | "* * *\n", 24 | "\n", 25 | "### image\n", 26 | "local images:\n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "web image:\n", 31 | "\n", 32 | "\n", 33 | "[link](https://docs.microsoft.com/zh-tw/previous-versions/visualstudio/design-tools/expression-studio-2/cc294571(v=expression.10))\n", 34 | "\n", 35 | "### Table\n", 36 | "\n", 37 | "|header| column1 | column 2\n", 38 | "|-|-|-\n", 39 | "|row 1| row 1_1 | row 1_2\n", 40 | "|row 2| row 2_2 **bold**, _ite_, ~~other~~, more... | row 2_3\n", 41 | "\n", 42 | "### list\n", 43 | "\n", 44 | "[Why do we have rate limits?](https://platform.openai.com/docs/guides/rate-limits/overview)\n", 45 | "Rate limits are a common practice for APIs, and they're put in place for a few different reasons:\n", 46 | "\n", 47 | "- They help protect against abuse or misuse of the API. For example, a malicious actor could flood the API with requests in an attempt to overload it or cause disruptions in service. By setting rate limits, `OpenAI` can prevent this kind of activity.\n", 48 | "- Rate limits help ensure that everyone has fair access to the API. If one person or organization makes an excessive number of requests, it could bog down the API for everyone else. By throttling the number of requests that a single user can make, OpenAI ensures that the most number of people have an opportunity to use the API without experiencing slowdowns.\n", 49 | "- Rate limits can help OpenAI manage the aggregate load on its infrastructure. If requests to the API increase dramatically, it could tax the servers and cause performance issues. 
By setting rate limits, OpenAI can help maintain a smooth and consistent experience for all users.\n", 50 | "\n", 51 | "number list\n", 52 | "\n", 53 | "1. number list1\n", 54 | "2. numner list2\n", 55 | "\n", 56 | "## checkbox\n", 57 | "\n", 58 | "Three frogs\n", 59 | "* [x] The first frog\n", 60 | "* [ ] The second frog\n", 61 | "* [ ] The third frog\n", 62 | "\n", 63 | "# math and grapth\n", 64 | "\n", 65 | "Here is math\n", 66 | "```math\n", 67 | "e^{i\\pi} + 1 = 0\n", 68 | "```\n", 69 | "\n", 70 | "mermaid grapth:\n", 71 | "\n", 72 | "```mermaid\n", 73 | "graph TD\n", 74 | "A[Module A] -->|A1| B( Module B)\n", 75 | "B --> C{Confidition C}\n", 76 | "C -->|condition C1| D[Module D]\n", 77 | "C -->|condition C2| E[Module E]\n", 78 | "C -->|condition C3| F[Module F]\n", 79 | "```\n", 80 | "\n", 81 | "sequenceDiagram\n", 82 | "\n", 83 | "```mermaid\n", 84 | "sequenceDiagram\n", 85 | "A->>B: Have you received a message?\n", 86 | "B-->>A: Message received\n", 87 | "```\n", 88 | "\n", 89 | "gantt\n", 90 | "\n", 91 | "```mermaid\n", 92 | "gantt\n", 93 | "title Gantt chart\n", 94 | "dateFormat YYYY-MM-DD\n", 95 | "section Proj A\n", 96 | "Task 1 :a1, 2018-06-06, 30d\n", 97 | "Task 2 :after a1 , 20d\n", 98 | "section Proj B\n", 99 | "Task 3 :2018-06-12 , 12d\n", 100 | "Task 4 : 24d\n", 101 | "```\n", 102 | "\n", 103 | "### chart\n", 104 | "\n", 105 | "```chart\n", 106 | ", budget, income, expenses, debt\n", 107 | "June,5000,8000,4000,6000\n", 108 | "July,3000,1000,4000,3000\n", 109 | "Aug,5000,7000,6000,3000\n", 110 | "Sep,7000,2000,3000,1000\n", 111 | "Oct,6000,5000,4000,2000\n", 112 | "Nov,4000,3000,5000,\n", 113 | "\n", 114 | "type: pie\n", 115 | "title: 每月收益\n", 116 | "x.title: Amount\n", 117 | "y.title: Month\n", 118 | "y.suffix: $\n", 119 | "```\n", 120 | "\n", 121 | "```chart\n", 122 | ",Budget,Income,Expenses,Debt\n", 123 | "June,5000,8000,4000,6000\n", 124 | "July,3000,1000,4000,3000\n", 125 | "Aug,5000,7000,6000,3000\n", 126 | "Sep,7000,2000,3000,1000\n", 127 | 
"Oct,6000,5000,4000,2000\n", 128 | "Nov,4000,3000,5000,\n", 129 | "\n", 130 | "type: line\n", 131 | "title: Monthly Revenue\n", 132 | "x.title: Amount\n", 133 | "y.title: Month\n", 134 | "y.suffix: $\n", 135 | "```\n", 136 | "\"\"\"" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "import re\n", 146 | "\n", 147 | "def extract_code_blocks(md_text):\n", 148 | " code_pattern = re.compile(r'```(\\w+)?\\n(.*?)```', re.DOTALL)\n", 149 | " matches = code_pattern.findall(md_text)\n", 150 | " code_blocks = [{'language': match[0], 'code': match[1]} for match in matches]\n", 151 | " return code_blocks\n", 152 | "\n", 153 | "\n", 154 | "code_blocks = extract_code_blocks(md_text)\n", 155 | "\n", 156 | "for block in code_blocks:\n", 157 | " print(f\"Language: {block['language']}\")\n", 158 | " print(f\"Code: {block['code']}\\n\")\n" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "notion", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.11.2" 179 | }, 180 | "orig_nbformat": 4 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /html2notion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/html2notion/__init__.py -------------------------------------------------------------------------------- /html2notion/main.py: -------------------------------------------------------------------------------- 1 | import 
argparse 2 | import os 3 | import sys 4 | import json 5 | import asyncio 6 | from pathlib import Path 7 | from aiohttp import ClientSession 8 | from notion_client import AsyncClient 9 | from rich.console import Console 10 | from rich.table import Table 11 | from rich.text import Text 12 | from rich import box 13 | from .utils import setup_logger, read_config, logger, config 14 | from .translate.notion_import import NotionImporter 15 | from .translate.batch_import import BatchImport 16 | from .translate.import_stats import StatLevel 17 | console = Console() 18 | 19 | 20 | def prepare_args(): 21 | parser = argparse.ArgumentParser( 22 | description='Html2notion: Save HTML to your Notion notes quickly and easily, while keeping the original format as much as possible') 23 | parser.add_argument('--conf', type=str, help='conf file path', required=True) 24 | parser.add_argument('--log', type=str, help='log directory path') 25 | parser.add_argument('--batch', type=int, default=15, help='batch save concurrent limit') 26 | 27 | group = parser.add_mutually_exclusive_group(required=True) 28 | group.add_argument('--file', type=str, help='Save single html file to notion') 29 | group.add_argument('--dir', type=str, help='Save all html files in the dir to notion') 30 | return parser 31 | 32 | 33 | def print_single_stats(stat): 34 | if stat.get_level() == StatLevel.EXCEPTION.value: 35 | text = Text(f"Failed to import {stat.filename}", style="default") 36 | text.append(f"\nException: {stat.exception}", style="red") 37 | if 'body.parent.page_id should be defined' in str(stat.exception): 38 | text.append(f"\nHeadmeta : \n{json.dumps(stat.head_meta, indent=4)}", style="yellow") 39 | console.print(text) 40 | return 41 | 42 | title = f"{stat.filename}" if stat.filename else "Import Result (Loss filename)" 43 | style = "default" 44 | if stat.get_level() == StatLevel.LOSS.value: 45 | title += " (Loss some content)" 46 | style = "yellow" 47 | elif stat.get_level() == StatLevel.SUCC.value: 48 
| title += "(Import successfully)" 49 | style = "green" 50 | 51 | table = Table(title=title, title_style=style, expand=True, box=box.HEAVY_HEAD, show_lines=True) 52 | table.add_column("Item", justify="right", style="default") 53 | table.add_column("Html", style="default") 54 | table.add_column("Notion", justify="left", style="default") 55 | table.add_row("Text Len", str(stat.text_count), str(stat.notion_text_count)) 56 | table.add_row("Image Count", str(stat.image_count), str(stat.notion_image_count)) 57 | if stat.skip_tag: 58 | table.add_row("Skip Tag Count", "", 'Detail: [yellow]' + ";".join([repr(s) 59 | for s in stat.skip_tag])[:2000] + "[/yellow]") 60 | 61 | console.print(table) 62 | 63 | 64 | def print_batch_stats(batch_import): 65 | all_files = batch_import.all_files 66 | batch_stats = batch_import.batch_stats 67 | success_stats = [stat for stat in batch_stats if not stat.get_level() == StatLevel.SUCC.value] 68 | if len(success_stats) == len(all_files): 69 | console.print(f"All files migrated successfully and there is no data loss.", style="green") 70 | 71 | failed_stats = [stat for stat in batch_stats if stat.get_level() == StatLevel.EXCEPTION.value] 72 | if failed_stats: 73 | table = Table(title=f"\nImport Fail Exception Detail\nLog path: {config.get('log_path')}", expand=True, box=box.HEAVY_HEAD, show_lines=True) 74 | table.add_column("File Name", justify="left", style="default") 75 | table.add_column("Fail Reason", justify="left", style="default") 76 | 77 | for stat in failed_stats: 78 | table.add_row(str(stat.filename), str(stat)) 79 | console.print(table) 80 | 81 | less_stats = [stat for stat in batch_stats if stat.get_level() == StatLevel.LOSS.value] 82 | if less_stats: 83 | table = Table(title=f"\nImport Data Loss Detail (You can use --file to import single file for more info)\n", expand=True, box=box.HEAVY_HEAD, show_lines=True) 84 | table.add_column("File Name", justify="left", style="default") 85 | table.add_column("Loss Detail", justify="left", 
style="default") 86 | 87 | for stat in less_stats: 88 | table.add_row(str(stat.filename), str(stat)) 89 | console.print(table) 90 | 91 | 92 | 93 | def prepare_env(args: argparse.Namespace): 94 | log_path = Path(args.log) if args.log else Path.cwd() / 'logs/' 95 | if not log_path.is_dir(): 96 | log_path.mkdir(parents=True) 97 | 98 | conf_path = Path(args.conf) 99 | if not conf_path.is_file(): 100 | text = Text(f"Read conf {conf_path} failed.", style="red") 101 | console.print(text) 102 | sys.exit(1) 103 | 104 | setup_logger(log_path) 105 | read_config(conf_path) 106 | logger.info(f"Read log {log_path}, conf {conf_path}") 107 | 108 | 109 | async def import_single_file(file): 110 | notion_api_key = "" 111 | if 'GITHUB_ACTIONS' in os.environ: 112 | notion_api_key = os.environ['notion_api_key'] 113 | else: 114 | notion_api_key = config['notion']['api_key'] 115 | async with ClientSession() as session: 116 | async with AsyncClient(auth=notion_api_key) as notion_client: 117 | notion_importer = NotionImporter(session, notion_client) 118 | await notion_importer.process_file(file) 119 | return notion_importer.import_stats 120 | 121 | 122 | def main(): 123 | arg_parse = prepare_args() 124 | args = arg_parse.parse_args() 125 | prepare_env(args) 126 | 127 | text = Text("") 128 | file_path = Path(args.file) if args.file else None 129 | dir_path = Path(args.dir) if args.dir else None 130 | max_concurrency = args.batch 131 | if file_path and file_path.is_file(): 132 | stats = asyncio.run(import_single_file(file_path)) 133 | print_single_stats(stats) 134 | elif dir_path and dir_path.is_dir(): 135 | logger.info(f"Begin save all html files in the dir: {dir_path}.") 136 | batch_import = BatchImport(dir_path, max_concurrency) 137 | result = asyncio.run(batch_import.process_directory()) 138 | logger.info(f"Finish save all html files in the dir: {dir_path}.\n{result}") 139 | print_batch_stats(batch_import) 140 | else: 141 | text.append("The parameters provided are incorrect, please 
check.", style="red") 142 | text.append(f"\n{arg_parse.format_help()}", style="default") 143 | 144 | text.append("\nIf you need help, please submit an ", style="default") 145 | link = Text("issue", style="cyan underline link https://github.com/selfboot/html2notion/issues") 146 | text.append(link) 147 | text.append(" on gitHub.", style="default") 148 | console.print(text) 149 | return 150 | 151 | 152 | if __name__ == '__main__': 153 | main() 154 | -------------------------------------------------------------------------------- /html2notion/translate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/html2notion/translate/__init__.py -------------------------------------------------------------------------------- /html2notion/translate/batch_import.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aiohttp 3 | import os 4 | from pathlib import Path 5 | from asyncio import Lock 6 | from notion_client import AsyncClient 7 | from rich.progress import ( 8 | BarColumn, 9 | MofNCompleteColumn, 10 | Progress, 11 | TextColumn, 12 | TimeRemainingColumn, 13 | ) 14 | from ..translate.notion_import import NotionImporter 15 | from ..utils import logger, config 16 | 17 | 18 | class BatchImport: 19 | def __init__(self, directory: Path, concurrent_limit: int = 10): 20 | self.directory = directory 21 | self.concurrent_limit = concurrent_limit 22 | if 'GITHUB_ACTIONS' in os.environ: 23 | self.notion_api_key = os.environ['notion_api_key'] 24 | else: 25 | self.notion_api_key = config['notion']['api_key'] 26 | self.notion_client = AsyncClient(auth=self.notion_api_key) 27 | self.batch_stats = [] 28 | self.files_lock = Lock() 29 | 30 | @staticmethod 31 | async def process_file(session, notion_client, file_path, files_lock, batch_stats): 32 | logger.info(f"Begin file, file {file_path}") 33 | 
notion_import = NotionImporter(session, notion_client) 34 | response = await notion_import.process_file(file_path) 35 | logger.info(f"Finish file {file_path}, status {str(notion_import.import_stats)}") 36 | async with files_lock: 37 | batch_stats.append(notion_import.import_stats) 38 | return response 39 | 40 | async def process_directory(self): 41 | semaphore = asyncio.Semaphore(self.concurrent_limit) 42 | self.all_files = [file_path for file_path in self.directory.glob('*.html') if file_path.name != 'index.html'] 43 | files_len = len(self.all_files) 44 | 45 | with Progress( 46 | TextColumn("[progress.description]{task.description}", justify="right"), 47 | BarColumn(), 48 | MofNCompleteColumn(), 49 | TextColumn(" "), 50 | TimeRemainingColumn() 51 | ) as progress: 52 | # with Progress() as progress: 53 | progress.add_task("[cyan]Total", total=files_len, 54 | completed=files_len, update_period=0, style="cyan") 55 | success_task_id = progress.add_task( 56 | "[green]Success", total=files_len, style="green") 57 | failed_task_id = progress.add_task("[red]Failed", total=files_len, style="red") 58 | async def process_file_with_semaphore(session, notion_client, file_path): 59 | async with semaphore: 60 | result = await self.process_file(session, notion_client, file_path, self.files_lock, self.batch_stats) 61 | if result == "succ": 62 | progress.update(success_task_id, advance=1) 63 | else: 64 | progress.update(failed_task_id, advance=1) 65 | return result 66 | 67 | async with aiohttp.ClientSession() as session: 68 | tasks = [process_file_with_semaphore(session, self.notion_client, file_path) for file_path in self.all_files] 69 | results = await asyncio.gather(*tasks) 70 | await session.close() 71 | return results 72 | 73 | 74 | if __name__ == '__main__': 75 | from ..utils import test_prepare_conf 76 | test_prepare_conf() 77 | from tempfile import TemporaryDirectory 78 | with TemporaryDirectory() as temp_dir: 79 | temp_dir_path = Path(temp_dir) 80 | files = [] 81 | for i in 
# -------------------- html2notion/translate/cos_uploader.py --------------------
import asyncio
from qcloud_cos import CosConfig
from qcloud_cos import CosS3Client
from qcloud_cos.cos_exception import CosClientError
from functools import partial
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from ..utils import logger, test_prepare_conf, config


class TencentCosUploaderAsync:
    """Async facade over the blocking Tencent COS SDK.

    Each operation is shipped to the default thread-pool executor via
    ``loop.run_in_executor`` so coroutines never block the event loop, and is
    retried up to 5 times with exponential backoff on ``CosClientError``.
    """

    def __init__(self, secret_id, secret_key, region, bucket, timeout=60):
        self.config = CosConfig(Region=region, SecretId=secret_id, SecretKey=secret_key, Timeout=timeout)
        self.client = CosS3Client(self.config)
        self.bucket = bucket

    @retry(stop=stop_after_attempt(5),
           wait=wait_exponential(multiplier=1, min=3, max=30),
           retry=retry_if_exception_type(CosClientError))
    async def upload_file(self, loop, local_path, key):
        """Upload ``local_path`` to the bucket under ``key``; returns the SDK response.

        NOTE: the whole file is read into memory before upload, which is fine
        for note attachments but not for very large files.
        """
        with open(local_path, 'rb') as f:
            content = f.read()

        put_object_partial = partial(self.client.put_object, Bucket=self.bucket, Body=content, Key=key)
        response = await loop.run_in_executor(None, put_object_partial)
        return response

    @retry(stop=stop_after_attempt(5),
           wait=wait_exponential(multiplier=1, min=3, max=30),
           retry=retry_if_exception_type(CosClientError))
    async def check_file_exist(self, loop, key):
        """Return True if ``key`` exists in the bucket; False on non-COS errors."""
        try:
            return await loop.run_in_executor(None, self.client.object_exists, self.bucket, key)
        except CosClientError:
            # BUG FIX: the original caught bare Exception here, which also
            # swallowed CosClientError -- so the tenacity retry above could
            # never fire. Re-raise it so transient client errors are retried.
            raise
        except Exception as e:
            logger.error(e)
            return False

    @retry(stop=stop_after_attempt(5),
           wait=wait_exponential(multiplier=1, min=3, max=30),
           retry=retry_if_exception_type(CosClientError))
    async def delete_file(self, loop, key):
        """Delete ``key`` from the bucket; returns the SDK response."""
        response = await loop.run_in_executor(None, self.client.delete_object, self.bucket, key)
        return response


async def main():
    """Manual smoke test: upload a demo file and verify it exists in the bucket."""
    test_prepare_conf()

    try:
        secret_id = config["cos"]["secret_id"]
        secret_key = config["cos"]["secret_key"]
        region = config["cos"]["region"]
        bucket = config["cos"]["bucket"]
    except Exception:
        print("Please fill cos conf in the config file")
        return

    local_path = './demos/saul.webp'
    key = 'test/saul.webp'

    uploader = TencentCosUploaderAsync(secret_id, secret_key, region, bucket)
    # BUG FIX: asyncio.get_event_loop() inside a coroutine is deprecated
    # (3.10+); get_running_loop() is the correct call here.
    loop = asyncio.get_running_loop()

    upload_response = await uploader.upload_file(loop, local_path, key)
    print(f"Upload response: {upload_response}")

    if await uploader.check_file_exist(loop, key):
        print("Upload successful!")
    else:
        print("Upload failed!")


if __name__ == "__main__":
    asyncio.run(main())


# -------------------- html2notion/translate/html2json.py --------------------
import json
import chardet
import time
from functools import singledispatch
from pathlib import Path
from bs4 import BeautifulSoup, Tag
from ..utils import logger, test_prepare_conf
from ..translate.html2json_base import Html2JsonBase
from ..translate.html2json_default import Default_Type
from ..translate.html2json_yinxiang import YinXiang_Type
from ..translate.html2json_clipper import YinXiangClipper_Type
from ..translate.html2json_markdown import YinXiangMarkdown_Type
# Detection is driven by the <meta> tags Evernote/Yinxiang writes into <head>:
# "exporter-version", "source", "source-application" and "content-class".


def _is_yinxiang_export_html(html_soup, import_stat):
    """True if <meta name="source"> marks a Yinxiang desktop/mobile export."""
    meta_source = html_soup.select_one('html > head > meta[name="source"]')
    meta_source_content = meta_source.get('content', "") if isinstance(meta_source, Tag) else ""
    if not meta_source_content:
        return False
    yinxiang_source_content = ["yinxiang", "desktop", "mobile"]
    import_stat.head_meta["source"] = meta_source_content
    for prefix in yinxiang_source_content:
        if isinstance(meta_source_content, str) and meta_source_content.startswith(prefix):
            return True
    return False


def _is_yinxiang_clipper_html(html_soup, import_stat):
    """True if <meta name="source-application"> marks a web-clipper capture."""
    meta_source_application = html_soup.select_one('html > head > meta[name="source-application"]')
    source_application = meta_source_application.get('content', "") if isinstance(meta_source_application, Tag) else ""
    if not source_application:
        return False
    import_stat.head_meta["source-application"] = source_application
    if isinstance(source_application, str) and source_application.endswith("evernote"):
        return True
    if isinstance(source_application, str) and source_application in ["微信", ]:
        return True
    return False


def _is_yinxiang_markdown_html(html_soup, import_stat):
    """True if <meta name="content-class"> marks a markdown-sourced note."""
    meta_content_class = html_soup.select_one('html > head > meta[name="content-class"]')
    content_class = meta_content_class.get('content', "") if isinstance(meta_content_class, Tag) else ""
    if not content_class:
        return False
    import_stat.head_meta["content_class"] = content_class
    if isinstance(content_class, str) and content_class.endswith("markdown"):
        return True
    return False


def _infer_input_type(html_content, import_stat):
    """Classify the HTML as markdown/clipper/export Yinxiang or the default type."""
    soup = BeautifulSoup(html_content, 'html.parser')
    exporter_version_meta = soup.select_one('html > head > meta[name="exporter-version"]')
    exporter_version_content = exporter_version_meta.get('content', "") if isinstance(
        exporter_version_meta, Tag) else ""
    import_stat.head_meta["exporter-version"] = exporter_version_content
    exporter_version = exporter_version_content if isinstance(exporter_version_content, str) else ""
    if exporter_version.startswith("Evernote") or exporter_version.startswith("YXBJ"):
        # Most specific first: markdown beats clipper beats plain export.
        if _is_yinxiang_markdown_html(soup, import_stat):
            return YinXiangMarkdown_Type
        if _is_yinxiang_clipper_html(soup, import_stat):
            return YinXiangClipper_Type
        if _is_yinxiang_export_html(soup, import_stat):
            return YinXiang_Type
        return YinXiang_Type  # default for any Evernote/YXBJ export
    return Default_Type


def _get_converter(html_content, import_stat):
    """Instantiate the registered Html2JsonBase subclass for this content."""
    html_type = _infer_input_type(html_content, import_stat)
    import_stat.head_meta["parse_type"] = html_type
    logger.info(f"Input type: {html_type}")
    converter = Html2JsonBase.create(html_type, html_content, import_stat)
    return converter


@singledispatch
def html2json_process(html_content, import_stat):
    raise TypeError(f"Unsupported {type(html_content)}, {import_stat}")


@html2json_process.register
def _(html_content: str, import_stat):
    # String input: convert directly.
    converter = _get_converter(html_content, import_stat)
    result = converter.process()
    return converter.get_notion_data(), result


@html2json_process.register
def _(html_file: Path, import_stat):
    # Path input: sniff the encoding with chardet, then convert the decoded text.
    if not html_file.is_file():
        print(f"Load file: {html_file.resolve()} failed")
        raise FileNotFoundError

    html_content = ""
    with html_file.open('rb') as f:
        data = f.read()
        result = chardet.detect(data)
        encoding = result['encoding'] if result['encoding'] else 'utf-8'
        html_content = data.decode(encoding)

    if html_content == "main_hold":  # just for local debug
        time.sleep(1)
        return "main_hold"

    converter = _get_converter(html_content, import_stat)
    result = converter.process()
    return converter.get_notion_data(), result


if __name__ == "__main__":
    test_prepare_conf()

    class _DebugImportStat:
        """Minimal stand-in for the real import-stat object; records nothing.

        TODO(review): swap in the real class from translate.import_stats once
        its constructor is confirmed.
        """

        def __init__(self):
            self.head_meta = {}

        def __getattr__(self, name):
            # Any add_* / counter hook becomes a no-op.
            return lambda *args, **kwargs: None

    html_file = Path("./demos/Test Case D.html")
    # BUG FIX: the original called html2json_process(html_file) without the
    # required import_stat argument, which raised a TypeError.
    result, html_type = html2json_process(html_file, _DebugImportStat())
    print(html_type)
    print(json.dumps(result, indent=4, ensure_ascii=False))
    # NOTE(review): the inline HTML literal was stripped by the export dump;
    # reconstructed as a simple paragraph -- confirm against VCS.
    result2, html_type2 = html2json_process("<p>test</p>", _DebugImportStat())
    print(html_type2)
    print(json.dumps(result2, indent=4, ensure_ascii=False))
# -------------------- html2notion/translate/html2json_base.py --------------------
import re
import os
import copy
from collections import namedtuple
from bs4 import NavigableString, Tag, PageElement
from enum import Enum
from ..utils import logger, config, is_valid_url


class Block(Enum):
    """Notion block types the converters can emit; FAIL marks an element with
    no matching converter."""
    FAIL = "fail"
    PARAGRAPH = "paragraph"
    QUOTE = "quote"
    NUMBERED_LIST = "numbered_list_item"
    BULLETED_LIST = "bulleted_list_item"
    HEADING = "heading"
    CODE = "code"
    DIVIDER = "divider"
    TABLE = "table"
    TO_DO = "to_do"
    EQUATION = "equation"


class Html2JsonBase:
    """Base converter from exported HTML to a Notion create-page payload.

    Subclasses register themselves via ``Html2JsonBase.register`` and implement
    ``process``; the shared helpers here build rich-text/link/image objects and
    map inline CSS to Notion annotations.
    """

    # https://developers.notion.com/reference/request-limits
    URL_MAX_LENGTH = 2000
    TEXT_MAX_LENGTH = 2000
    EXPRESSION_MAX_LENGTH = 1000
    RICHTEXT_ARRAY_LENGTH = 100

    # input_type string -> converter subclass (filled by register()).
    _registry = {}
    # Allowed annotation keys and the type each value must have.
    _text_annotations = {
        "bold": bool,
        "italic": bool,
        "strikethrough": bool,
        "underline": bool,
        "code": bool,
        "color": str,
    }

    # Languages Notion accepts for code blocks.
    _language = {"abap", "agda", "arduino",
                 "assembly", "bash", "basic", "bnf", "c", "c#", "c++", "clojure", "coffeescript", "coq", "css",
                 "dart", "dhall", "diff", "docker", "ebnf", "elixir", "elm", "erlang", "f#", "flow", "fortran",
                 "gherkin", "glsl", "go", "graphql", "groovy", "haskell", "html", "idris", "java", "javascript",
                 "json", "julia", "kotlin", "latex", "less", "lisp", "livescript", "llvm ir", "lua", "makefile",
                 "markdown", "markup", "matlab", "mathematica", "mermaid", "nix", "objective-c", "ocaml", "pascal",
                 "perl", "php", "plain text", "powershell", "prolog", "protobuf", "purescript", "python", "r",
                 "racket", "reason", "ruby", "rust", "sass", "scala", "scheme", "scss", "shell", "solidity", "sql",
                 "swift", "toml", "typescript", "vb.net", "verilog", "vhdl", "visual basic", "webassembly", "xml",
                 "yaml", "java/c/c++/c#"}

    _color_tuple = namedtuple("Color", "name r g b")
    # Reference RGB values for Notion's named colors; arbitrary CSS colors are
    # snapped to the nearest of these by _closest_color.
    _notion_color = [
        _color_tuple("default", 0, 0, 0),
        _color_tuple("gray", 128, 128, 128),
        _color_tuple("brown", 165, 42, 42),
        _color_tuple("orange", 255, 165, 0),
        _color_tuple("yellow", 255, 255, 0),
        _color_tuple("green", 0, 128, 0),
        _color_tuple("blue", 0, 0, 255),
        _color_tuple("purple", 128, 0, 128),
        _color_tuple("pink", 255, 192, 203),
        _color_tuple("red", 255, 0, 0),
    ]

    # Page content should be: https://developers.notion.com/reference/post-page
    def __init__(self, html_content, import_stat):
        self.html_content = html_content
        self.children = []
        self.properties = {}
        self.parent = {}
        self.import_stat = import_stat
        if 'GITHUB_ACTIONS' in os.environ:
            # CI runs against a dedicated test database.
            notion_database_id = os.environ['notion_db_id_1']
        else:
            notion_database_id = config['notion']['database_id']
        self.parent = {"type": "database_id", "database_id": notion_database_id}

    def process(self):
        raise NotImplementedError("Subclasses must implement this method")

    def get_notion_data(self):
        """Return the page payload, omitting any empty sections."""
        return {
            key: value
            for key, value in {
                'children': self.children,
                'properties': self.properties,
                'parent': self.parent,
            }.items()
            if value
        }

    @staticmethod
    def extract_text_and_parents(tag: PageElement, parents=None):
        """Flatten ``tag`` into (text, ancestor-tags) pairs.

        Images yield their ``src`` with the <img> tag appended to the ancestry;
        <br> yields a line-break marker with no ancestry.
        """
        # BUG FIX: the mutable default `parents=[]` is the classic shared-state
        # trap; a None sentinel is safe (the list was never mutated, but the
        # idiom invites bugs).
        if parents is None:
            parents = []
        results = []
        # Filter empty content when tag is not img
        if isinstance(tag, NavigableString) and tag.strip():
            results.append((tag, parents))
            return results
        elif isinstance(tag, Tag):
            if tag.name == 'img':
                img_src = tag.get('src', '')
                parent_tags = [p for p in parents + [tag]]
                results.append((img_src, parent_tags))
            else:
                for child in tag.children:
                    if isinstance(child, NavigableString):
                        # NOTE: the original also tested tag.name != 'img' here,
                        # which is always true in this branch -- dropped.
                        if child.strip():
                            text = child.text
                            parent_tags = [p for p in parents + [tag]]
                            results.append((text, parent_tags))
                    elif isinstance(child, Tag) and child.name == 'br':
                        # NOTE(review): marker literal reconstructed as '<br>'
                        # (the dump stripped it); generate_inline_obj matches it.
                        results.append(('<br>', []))
                    else:
                        results.extend(Html2JsonBase.extract_text_and_parents(child, parents + [tag]))
        return results

    @staticmethod
    def parse_one_style(tag_soup: Tag, text_params: dict):
        """Fold one ancestor tag's styling into ``text_params`` (mutated in place)."""
        tag_name = tag_soup.name.lower()
        styles = Html2JsonBase.get_tag_style(tag_soup)
        if Html2JsonBase.is_bold(tag_name, styles):
            text_params["bold"] = True
        if Html2JsonBase.is_italic(tag_name, styles):
            text_params["italic"] = True
        if Html2JsonBase.is_strikethrough(tag_name, styles):
            text_params["strikethrough"] = True
        if Html2JsonBase.is_underline(tag_name, styles):
            text_params["underline"] = True
        if Html2JsonBase.is_code(tag_name, styles):
            text_params["code"] = True

        color = Html2JsonBase.get_color(styles, tag_soup.attrs if tag_name else {})
        if color != 'default':
            text_params["color"] = color

        if tag_name == 'a':
            href = tag_soup.get('href', "")
            if not href:
                logger.warning("Link href is empty")
            text_params["url"] = href
        elif tag_name == 'img':
            src = tag_soup.get('src', "")
            # only support external image here.
            if not src:
                logger.warning("Image src is empty")
            text_params["src"] = src
        return
": 162 | try: 163 | res_obj[-1]["text"]["content"] += "\n" 164 | res_obj[-1]["plain_text"] += "\n" 165 | except Exception as e: 166 | pass 167 | continue 168 | 169 | link_url = text_params.get("url", "") 170 | text_obj = {} 171 | if text_params.get("url", "") and is_valid_url(link_url): 172 | text_obj = self.generate_link(**text_params) 173 | # Here image is a independent block, split out in the outer layer 174 | elif text_params.get("src", ""): 175 | text_obj = self.generate_image(**text_params) 176 | else: 177 | if len(text) <= self.TEXT_MAX_LENGTH: 178 | text_obj = self.generate_text(**text_params) 179 | else: 180 | for chunk in [text[i:i+self.TEXT_MAX_LENGTH] for i in range(0, len(text), self.TEXT_MAX_LENGTH)]: 181 | text_params["plain_text"] = chunk 182 | text_obj = self.generate_text(**text_params) 183 | if text_obj: 184 | res_obj.append(text_obj) 185 | text_obj = None 186 | if text_obj: 187 | res_obj.append(text_obj) 188 | return res_obj 189 | 190 | def generate_link(self, **kwargs): 191 | link_url = kwargs.get("url", "") 192 | plain_text = kwargs.get("plain_text", "") 193 | if not plain_text or not is_valid_url(link_url): 194 | return 195 | 196 | link_url = link_url[:self.URL_MAX_LENGTH] 197 | self.import_stat.add_notion_text(plain_text) 198 | return { 199 | "href": link_url, 200 | "plain_text": plain_text, 201 | "text": { 202 | "link": {"url": link_url}, 203 | "content": plain_text 204 | }, 205 | "type": "text" 206 | } 207 | 208 | def generate_image(self, **kwargs): 209 | source = kwargs.get("src", "") 210 | if not source or not is_valid_url(source): 211 | return 212 | self.import_stat.add_notion_image(source) 213 | image_block = { 214 | "object": "block", 215 | "type": "image", 216 | "image": { 217 | "type": "external", 218 | "external": { 219 | "url": source 220 | } 221 | } 222 | } 223 | return image_block 224 | 225 | def generate_text(self, **kwargs): 226 | plain_text = kwargs.get("plain_text", "") 227 | if not plain_text: 228 | return 229 | annotations 
= { 230 | key: value 231 | for key, value in kwargs.items() 232 | if key in Html2JsonBase._text_annotations and isinstance(value, Html2JsonBase._text_annotations[key]) 233 | } 234 | stats_count = kwargs.get("stats_count", True) 235 | if stats_count: 236 | self.import_stat.add_notion_text(plain_text) 237 | text_obj = { 238 | "plain_text": plain_text, 239 | "text": {"content": plain_text}, 240 | "type": "text" 241 | } 242 | if annotations: 243 | text_obj["annotations"] = annotations 244 | 245 | return text_obj 246 | 247 | def generate_properties(self, **kwargs): 248 | title = kwargs.get("title", "") 249 | url = kwargs.get("url", "") 250 | tags = kwargs.get("tags", []) 251 | created_time = kwargs.get("created_time", "") 252 | 253 | property_map = { 254 | "Title": {"title": [{"text": {"content": title}}]} if title else None, 255 | "URL": {"url": url, "type": "url"} if url else None, 256 | "Tags": {"type": "multi_select", "multi_select": [{"name": tag} for tag in tags]} if tags else None, 257 | "Created": {"date": {"start": created_time}, "type": "date"} if created_time else None, 258 | } 259 | 260 | properties_obj = {key: value for key, value in property_map.items() if value is not None} 261 | 262 | logger.debug(f"properties: {properties_obj}") 263 | return properties_obj 264 | 265 | @staticmethod 266 | def is_same_annotations_text(text_one: dict, text_another: dict): 267 | if text_one["type"] != "text" or text_another["type"] != "text": 268 | return False 269 | attributes = ["annotations", "href"] 270 | 271 | # When merging, be careful not to let the text length exceed the limit 272 | total_size = len(text_one["text"]["content"]) + len(text_another["text"]["content"]) 273 | if total_size > Html2JsonBase.TEXT_MAX_LENGTH: 274 | return False 275 | 276 | return all(text_one.get(attr) == text_another.get(attr) for attr in attributes) 277 | 278 | @staticmethod 279 | def merge_rich_text(rich_text: list): 280 | if not rich_text: 281 | return [] 282 | merged_text = [] 283 | 
current_text = rich_text[0] 284 | for text in rich_text[1:]: 285 | if Html2JsonBase.is_same_annotations_text(current_text, text): 286 | text_content = current_text["text"]["content"] + text["text"]["content"] 287 | current_text["plain_text"] = text_content 288 | current_text["text"]["content"] = text_content 289 | else: 290 | merged_text.append(current_text) 291 | current_text = text 292 | if current_text: 293 | merged_text.append(current_text) 294 | 295 | return merged_text 296 | 297 | @staticmethod 298 | def is_bold(tag_name: str, styles: dict) -> bool: 299 | if tag_name in ('b', 'strong'): 300 | return True 301 | 302 | font_weight = styles.get('font-weight', None) 303 | if font_weight is None: 304 | return False 305 | elif font_weight == 'bold': 306 | return True 307 | elif font_weight.isdigit() and int(font_weight) >= 700: 308 | return True 309 | return False 310 | 311 | @staticmethod 312 | def is_strikethrough(tag_name: str, styles: dict) -> bool: 313 | if tag_name in ('s', 'strike', 'del'): 314 | return True 315 | text_decoration = styles.get("text-decoration", "") 316 | return "line-through" in text_decoration 317 | 318 | @staticmethod 319 | def is_italic(tag_name: str, styles: dict) -> bool: 320 | if tag_name in ('i', 'em'): 321 | return True 322 | font_style = styles.get('font-style', "") 323 | return "italic" in font_style 324 | 325 | @staticmethod 326 | def is_underline(tag_name: str, styles: dict) -> bool: 327 | # A tuple of a single element requires a comma after the element 328 | if tag_name in ('u',): 329 | return True 330 | text_decoration = styles.get('text-decoration', "") 331 | return 'underline' in text_decoration 332 | 333 | @staticmethod 334 | def is_code(tag_name: str, styles: dict): 335 | if tag_name in ('code',): 336 | return True 337 | 338 | # style="-en-code: true" 339 | if styles.get('-en-code', "false") == "true": 340 | return True 341 | 342 | # Check if the font-family is monospace 343 | font_family = styles.get('font-family', "") 344 
| monospace_fonts = {'courier', 'monospace'} 345 | if not font_family: 346 | return False 347 | for font in monospace_fonts: 348 | if font.lower() == font_family.lower(): 349 | return True 350 | 351 | @staticmethod 352 | def _closest_color(r, g, b): 353 | closest_distance = float("inf") 354 | closest_color = None 355 | 356 | for color in Html2JsonBase._notion_color: 357 | distance = ((r - color.r) ** 2 + (g - color.g) ** 2 + (b - color.b) ** 2) ** 0.5 358 | if distance < closest_distance: 359 | closest_distance = distance 360 | closest_color = color.name 361 | 362 | return closest_color 363 | 364 | @staticmethod 365 | def _hex_to_rgb(hex_color): 366 | hex_color = hex_color.lstrip("#") 367 | return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) 368 | 369 | @staticmethod 370 | def get_color(styles: dict, attrs): 371 | color = styles.get('color', "") 372 | if not color and 'color' in attrs: 373 | color = attrs['color'] 374 | if not color: 375 | return "default" 376 | # If the color_values have 4 items, then it is RGBA and the last value is alpha 377 | # rgba(174, 174, 188, 0.2) 378 | if color.startswith("rgb"): 379 | color_values = [int(x.strip()) for x in re.findall(r'\d+', color)] 380 | if len(color_values) >= 3: 381 | r, g, b = color_values[:3] 382 | return Html2JsonBase._closest_color(r, g, b) 383 | # Check if color is in hexadecimal format 384 | elif re.match(r'^#(?:[0-9a-fA-F]{3}){1,2}$', color): 385 | if len(color) == 4: # Short form like #abc -> #aabbcc 386 | color = '#' + ''.join([c*2 for c in color[1:]]) 387 | r, g, b = Html2JsonBase._hex_to_rgb(color) 388 | return Html2JsonBase._closest_color(r, g, b) 389 | 390 | return "default" 391 | 392 | def convert_paragraph(self, soup): 393 | json_obj = { 394 | "object": "block", 395 | "type": "paragraph", 396 | "paragraph": { 397 | "rich_text": [] 398 | } 399 | } 400 | rich_text = json_obj["paragraph"]["rich_text"] 401 | text_obj = self.generate_inline_obj(soup) 402 | if text_obj: 403 | 
rich_text.extend(text_obj) 404 | 405 | # Split out image into a independent blocks 406 | split_objs = Html2JsonBase.split_image_src(json_obj) 407 | return Html2JsonBase.ensure_array_len(split_objs) 408 | 409 | def convert_divider(self, soup): 410 | return { 411 | "object": "block", 412 | "type": "divider", 413 | "divider": {} 414 | } 415 | 416 | def convert_heading(self, soup): 417 | heading_map = {"h1": "heading_1", "h2": "heading_2", "h3": "heading_3", 418 | "h4": "heading_3", "h5": "heading_3", "h6": "heading_3"} 419 | 420 | heading_level = heading_map.get(soup.name, "heading_3") 421 | json_obj = { 422 | "object": "block", 423 | "type": heading_level, 424 | heading_level: { 425 | "rich_text": [] 426 | } 427 | } 428 | rich_text = json_obj[heading_level]["rich_text"] 429 | text_obj = self.generate_inline_obj(soup) 430 | if text_obj: 431 | rich_text.extend(text_obj) 432 | return json_obj 433 | return None 434 | 435 | #436 | def convert_numbered_list_item(self, soup): 437 | return self.convert_list_items(soup, 'numbered_list_item') 438 | 439 | #
first second third440 | def convert_bulleted_list_item(self, soup): 441 | return self.convert_list_items(soup, 'bulleted_list_item') 442 | 443 | def convert_list_items(self, soup, list_type): 444 | # Remove heading tags in li 445 | for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): 446 | heading.unwrap() 447 | 448 | items = soup.find_all('li', recursive=True) 449 | if not items: 450 | logger.warning("No list items found in {soup}") 451 | 452 | json_arr = [] 453 | for item in items: 454 | one_item = self._convert_one_list_item(item, list_type) 455 | if one_item: 456 | json_arr.append(one_item) 457 | else: 458 | logger.info(f'empty {item}') 459 | return json_arr 460 | 461 | def _convert_one_list_item(self, soup, list_type): 462 | if list_type not in {'numbered_list_item', 'bulleted_list_item'}: 463 | logger.warning(f'Not support list_type') 464 | 465 | json_obj = { 466 | "object": "block", 467 | list_type: { 468 | "rich_text": [] 469 | }, 470 | "type": list_type, 471 | } 472 | rich_text = json_obj[list_type]["rich_text"] 473 | text_obj = self.generate_inline_obj(soup) 474 | if text_obj: 475 | rich_text.extend(text_obj) 476 | 477 | return json_obj 478 | 479 | """ 480 |
    """
    Example input (reconstructed -- the dump stripped the HTML):
        <table>
          <tr><td>itemA</td><td>itemB</td><td>itemC</td></tr>
        </table>
    """
    # ../examples/insert_table.ipynb
    def convert_table(self, soup):
        # Build a Notion table block: one table_row child per <tr>, cells from
        # <td> (or <th>, which also marks the table as having a column header).
        table_rows = []
        tr_tags = soup.find_all('tr')
        if not tr_tags:
            logger.error(f"No tr found in {soup}")
            return

        # Start from the first row's <td> count; widened below if later rows
        # (or header rows) have more cells.
        table_width = len(tr_tags[0].find_all('td'))
        has_header = False
        for tr in tr_tags:
            td_tags = tr.find_all('td')
            if not td_tags:
                # A row with no <td> is assumed to be a header row of <th>.
                td_tags = tr.find_all('th')
                has_header = True
            table_width = max(table_width, len(td_tags))
            one_row = {
                "type": "table_row",
                "table_row": {
                    "cells": []
                }
            }
            for td in td_tags:
                col = self.generate_inline_obj(td)
                one_row["table_row"]["cells"].append(col)
            table_rows.append(one_row)

        table_obj = {
            "table": {
                "has_row_header": False,
                "has_column_header": has_header,
                "table_width": table_width,
                "children": table_rows,
            }
        }
        return table_obj

    @staticmethod
    def split_image_src(text_obj):
        # Notion cannot embed images inside a paragraph's rich text, so any
        # image block generate_inline_obj produced is hoisted out, splitting
        # the surrounding text into separate paragraph blocks around it.
        rich_text = text_obj["paragraph"]["rich_text"]
        need_split = any(text.get("object") == "block" for text in rich_text)
        if not need_split:
            return [text_obj]

        split_obj = []
        # Accumulator for the current run of plain text objects; deep-copied
        # on flush because its rich_text list is cleared and reused.
        cur_obj = {
            "object": "block",
            "type": "paragraph",
            "paragraph": {
                "rich_text": []
            }
        }
        for text in rich_text:
            if text.get("object") == "block":
                if len(cur_obj["paragraph"]["rich_text"]) > 0:
                    split_obj.append(copy.deepcopy(cur_obj))
                    cur_obj["paragraph"]["rich_text"].clear()
                split_obj.append(text)
                continue
            cur_obj["paragraph"]["rich_text"].append(text)
        if len(cur_obj["paragraph"]["rich_text"]) > 0:
            split_obj.append(cur_obj)
        return split_obj
551 | # Can't use this way like: background-image: url('data:image/png;base64...') 552 | @staticmethod 553 | def get_tag_style(tag_soup): 554 | styles = {} 555 | if not isinstance(tag_soup, Tag): 556 | return styles 557 | style = tag_soup.get('style', "") 558 | if str and isinstance(style, str): 559 | # style = ''.join(style.split()) 560 | styles = { 561 | rule.split(':')[0].strip(): rule.split(':')[1].strip().lower() 562 | for rule in style.split(';') 563 | if rule and len(rule.split(':')) > 1 564 | } 565 | return styles 566 | 567 | @staticmethod 568 | def get_valid_language(language): 569 | if language in Html2JsonBase._language: 570 | return language 571 | return "plain text" 572 | 573 | @staticmethod 574 | def ensure_array_len(blocks): 575 | final_objs = [] 576 | for obj in blocks: 577 | if "paragraph" not in obj or "rich_text" not in obj["paragraph"] or len( 578 | obj["paragraph"]["rich_text"]) <= Html2JsonBase.RICHTEXT_ARRAY_LENGTH: 579 | final_objs.append(obj) 580 | continue 581 | 582 | # If the length of rich_text is greater than RICHTEXT_ARRAY_LENGTH, we split it 583 | rich_text_arr = obj["paragraph"]["rich_text"] 584 | rich_texts = [rich_text_arr[i:i+Html2JsonBase.RICHTEXT_ARRAY_LENGTH] 585 | for i in range(0, len(rich_text_arr), Html2JsonBase.RICHTEXT_ARRAY_LENGTH)] 586 | for rich_text in rich_texts: 587 | new_json_obj = { 588 | "object": "block", 589 | "type": "paragraph", 590 | "paragraph": { 591 | "rich_text": rich_text 592 | } 593 | } 594 | final_objs.append(new_json_obj) 595 | return final_objs 596 | 597 | @classmethod 598 | def register(cls, input_type, subclass): 599 | cls._registry[input_type] = subclass 600 | 601 | @classmethod 602 | def create(cls, input_type, html_content, import_stat): 603 | subclass = cls._registry.get(input_type) 604 | if subclass is None: 605 | raise ValueError(f"noknown: {input_type}") 606 | return subclass(html_content, import_stat) 607 | -------------------------------------------------------------------------------- 
# -------------------- html2notion/translate/html2json_clipper.py --------------------
from bs4 import BeautifulSoup, NavigableString, Tag
from ..utils import logger, DateStrToISO8601
from ..translate.html2json_base import Html2JsonBase, Block

YinXiangClipper_Type = "clipper.yinxiang"


class Html2JsonClipper(Html2JsonBase):
    """Converter for notes captured with the Yinxiang/Evernote web clipper."""

    input_type = YinXiangClipper_Type

    def __init__(self, html_content, import_stat):
        super().__init__(html_content, import_stat)

    def process(self):
        # Parse the page, fill page properties from <head>, then convert the
        # body's elements into Notion blocks.
        soup = BeautifulSoup(self.html_content, 'html.parser')
        self.convert_properties(soup)

        content_tags = soup.body
        if not content_tags:
            logger.error("No content found")
            raise Exception("No content found")

        self.import_stat.add_text(content_tags.get_text())
        self.convert_children(content_tags)  # Assume there is only one body tag

        return YinXiangClipper_Type

    def convert_properties(self, soup):
        # Title from <title>; url/tags/created_time from the clipper's <meta>
        # tags, each with an optional converter applied to the raw content.
        properties = {"title": "Unknown"}
        title_tag = soup.select_one('head > title')
        if title_tag:
            properties["title"] = title_tag.text

        meta_tags = [
            ('head > meta[name="source-url"]', "url"),
            ('head > meta[name="keywords"]', "tags", lambda x: x.split(",")),
            ('head > meta[name="created"]', "created_time", DateStrToISO8601),
        ]

        for selector, key, *converter in meta_tags:
            tag = soup.select_one(selector)
            if tag and tag.get('content', None):
                content = tag['content']
                properties[key] = converter[0](content) if converter else content

        self.properties = self.generate_properties(**properties)
        return

    def get_block_type(self, element):
        # Map an HTML element to the Notion block type it should become;
        # FAIL means no converter exists for it.
        tag_name = element.name
        if tag_name == "p":
            return Block.PARAGRAPH.value
        elif tag_name == "table":
            return Block.TABLE.value
        elif tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            return Block.HEADING.value
        elif tag_name == 'hr':
            return Block.DIVIDER.value
        elif tag_name == 'ol':
            return Block.NUMBERED_LIST.value
        elif tag_name == 'ul':
            return Block.BULLETED_LIST.value
        elif element.name == 'pre' and element.code:
            return Block.CODE.value
        elif self._check_is_block(element):
            return Block.QUOTE.value

        return Block.FAIL.value

    def convert_children(self, soup):
        # First pass: walk every descendant, convert each element whose type
        # has a convert_<type> method, and remember it so its own descendants
        # are not converted twice.
        processed_tags = set()
        for element in soup.descendants:
            if isinstance(element, NavigableString):
                continue
            if any(id(ancestor) in processed_tags for ancestor in element.parents):
                logger.debug(f"Skip processed tag {element}")
                continue
            block_type = self.get_block_type(element)
            if hasattr(self, f"convert_{block_type}"):
                converter = getattr(self, f"convert_{block_type}")
                block = converter(element)
                if block:
                    self.children.extend([block] if not isinstance(block, list) else block)
                processed_tags.add(id(element))
        # Second pass: collect text nodes no converted ancestor covered, so
        # skipped content can be reported and counted.
        unprocessed_tags = set()
        for element in soup.descendants:
            if not isinstance(element, NavigableString) or id(element) in processed_tags:
                continue
            if any(id(ancestor) in processed_tags for ancestor in element.parents):
                continue
            unprocessed_tags.add(element)

        for unprocessed_tag in unprocessed_tags:
            logger.warning(f"Unknown tag {unprocessed_tag.name}, {self.get_block_type(unprocessed_tag)}")
            self.import_stat.add_skip_tag(unprocessed_tag.get_text())
        return

    # <pre><code>...</code></pre>
    def convert_code(self, soup):
        # Build a Notion code block from a <pre> element's <code> child.
        json_obj = {
            "object": "block",
            "type": "code",
            "code": {
                "rich_text": [],
                "language": "plain text",
            },
        }
        rich_text = json_obj["code"]["rich_text"]
        code_tag = soup.code
        if not code_tag:
            logger.error(f'No code tag found in {soup}')
            return
        children_list = list(code_tag.children) if isinstance(code_tag, Tag) else [code_tag]
        for child in children_list:
            # Nested <code> children hold line numbers in clipper output.
            if isinstance(child, Tag) and child.name == "code":
                logger.debug(f'Skip line number')
                continue
            text_obj = self.generate_inline_obj(child)
            if text_obj:
                rich_text.extend(text_obj)
        json_obj["code"]["rich_text"] = self.merge_rich_text(rich_text)
        return json_obj

    def convert_quote(self, soup):
        # Build a Notion quote block from a blockquote-like element.
        json_obj = {
            "object": "block",
            "type": "quote",
            "quote": {
                "rich_text": []
            }
        }
        rich_text = json_obj["quote"]["rich_text"]
        text_obj = self.generate_inline_obj(soup)
        if text_obj:
            rich_text.extend(text_obj)

        # Merge tags that have the same annotations
        return json_obj

    def _check_is_block(self, element):
        # Quote-like elements; the commented-out heuristics for styled <div>s
        # are intentionally disabled.
        quote_elements = {'blockquote', 'q', 'cite'}
        if element.name in quote_elements:
            return True

        if element.name != 'div':
            return False

        # if 'class' in element.attrs:
        #     if any('quote' in class_name.lower() for class_name in element.attrs['class']):
        #         return True

        # if 'style' in element.attrs:
        #     style_attrs = element.attrs['style'].lower()
        #     if 'border:' in style_attrs or 'padding:' in style_attrs:
        #         return True

        return False


Html2JsonBase.register(YinXiangClipper_Type, Html2JsonClipper)
# For notes that are clipped from web pages
# that are not written manually by Evernote and have rich text formatting,
# try to keep the format for conversion

from ..translate.html2json_base import Html2JsonBase

Default_Type = "default"


class Html2JsonDefault(Html2JsonBase):
    """Fallback converter used when no specific HTML flavour is detected."""

    input_type = Default_Type

    def __init__(self, html_content, import_stat):
        super().__init__(html_content, import_stat)

    # TODO: real conversion is not implemented yet; the type tag is returned
    # so callers can at least identify the input flavour.
    def process(self):
        return Default_Type


Html2JsonBase.register(Default_Type, Html2JsonDefault)


import re
from bs4 import BeautifulSoup, Tag
from urllib.parse import unquote
from ..utils import logger, is_valid_url, DateStrToISO8601
from ..translate.html2json_base import Html2JsonBase, Block

YinXiangMarkdown_Type = "markdown.yinxiang"

# Yinxiang markdown
# https://list.yinxiang.com/markdown/eef42447-db3f-48ee-827b-1bb34c03eb83.php


class Html2JsonMarkdown(Html2JsonBase):
    """Converter for Yinxiang notes authored in markdown mode."""

    input_type = YinXiangMarkdown_Type
    # Background image Yinxiang uses to render an *unchecked* checkbox; a
    # checklist <li> whose style lacks it is considered checked.
    undo_image = "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAOCAYAAAAfSC3RAAAAAXNSR0IArs4c6QAAADdJREFUKBVjvHv37n8GMgALSI+SkhJJWu/du8fARJIOJMWjGpECA505GjjoIYLEB6dVUNojFQAA/1MJUFWet/4AAAAASUVORK5CYII=')"

    def __init__(self, html_content, import_stat):
        super().__init__(html_content, import_stat)
        # Recovered original markdown source (URL-decoded from the <center>
        # tag); used to rebuild code blocks that the HTML rendered as images.
        self.markdown = ""

    def process(self):
        """Parse the note, fill self.properties/self.children, return the type tag."""
        soup = BeautifulSoup(self.html_content, 'html.parser')
        self.convert_properties(soup)

        content_tags = soup.body
        if not content_tags:
            logger.error("No content found")
            raise Exception("No content found")

        # The <center> records the contents of the original markdown file;
        # keep its decoded text for code-block recovery, then drop the tag.
        center_to_delete = content_tags.find('center')
        if isinstance(center_to_delete, Tag):
            md_encode = center_to_delete.get_text()
            self.markdown = unquote(md_encode)
            center_to_delete.decompose()

        # Special handling contains blocks of code,
        # because some chart blocks are converted into images and cannot be processed directly
        self._replace_pre_code(soup)
        self.import_stat.add_text(content_tags.get_text())
        img_tags = content_tags.find_all('img')
        for img in img_tags:
            img_src = img.get('src', '')
            if is_valid_url(img_src):
                self.import_stat.add_image(img_src)

        self.convert_children(content_tags)  # Assume only one body tag

        return YinXiangMarkdown_Type

    def convert_properties(self, soup):
        """Build page properties (title, url, tags, created_time) from <head>."""
        properties = {"title": "Unknown"}
        title_tag = soup.select_one('head > title')
        if title_tag:
            properties["title"] = title_tag.text

        # (selector, property key, optional value transformer)
        meta_tags = [
            ('head > meta[name="source-url"]', "url"),
            ('head > meta[name="keywords"]', "tags", lambda x: x.split(",")),
            ('head > meta[name="created"]', "created_time", DateStrToISO8601),
        ]

        for selector, key, *converter in meta_tags:
            tag = soup.select_one(selector)
            if tag and tag.get('content', None):
                content = tag['content']
                properties[key] = converter[0](content) if converter else content

        self.properties = self.generate_properties(**properties)
        return

    def get_block_type(self, element):
        """Map a top-level tag to a Block enum value (FAIL when unsupported)."""
        tag_name = element.name
        if tag_name == "p":
            return Block.PARAGRAPH.value
        elif tag_name == "table":
            return Block.TABLE.value
        elif tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            return Block.HEADING.value
        elif tag_name == 'hr':
            return Block.DIVIDER.value
        elif tag_name == 'ol':
            return Block.NUMBERED_LIST.value
        elif tag_name == 'ul':
            # A <ul> whose items all carry a checkbox image is a todo list.
            if self._is_checkbox(element):
                return Block.TO_DO.value
            return Block.BULLETED_LIST.value
        elif element.name == 'pre' and element.code:
            if self._is_math(element):
                return Block.EQUATION.value
            return Block.CODE.value
        elif element.name == "blockquote":
            return Block.QUOTE.value

        return Block.FAIL.value

    def convert_children(self, soup):
        """Translate every top-level child of the content <div> into blocks."""
        div_tag = soup.find('div')
        if not div_tag:
            logger.error(f'No div tag found in {soup}')
            return
        for child in div_tag.children:
            block_type = self.get_block_type(child)
            logger.debug(f'block_type: {block_type}, child: {child}')
            if hasattr(self, f"convert_{block_type}"):
                converter = getattr(self, f"convert_{block_type}")
                block = converter(child)
                if block:
                    self.children.extend([block] if not isinstance(block, list) else block)
            else:
                self.import_stat.add_skip_tag(child.get_text())
                logger.warning(f"Unknown tag : {child}")
        return

    def convert_code(self, soup):
        """Convert a <pre><code> (rewritten by _replace_pre_code) into a code block."""
        json_obj = {
            "object": "block",
            "type": "code",
            "code": {
                "rich_text": [],
                "language": "plain text",
            },
        }
        rich_text = json_obj["code"]["rich_text"]
        code_tag = soup.code
        if not code_tag:
            logger.error(f'No code tag found in {soup}')
            return
        children_list = list(code_tag.children) if isinstance(code_tag, Tag) else [code_tag]
        for child in children_list:
            text_obj = self.generate_inline_obj(child)
            if text_obj:
                rich_text.extend(text_obj)

        # _replace_pre_code stored the fenced language in the style attribute.
        css_dict = Html2JsonBase.get_tag_style(code_tag)
        language = css_dict.get('language', 'plain text')
        json_obj["code"]["language"] = Html2JsonBase.get_valid_language(language)
        json_obj["code"]["rich_text"] = self.merge_rich_text(rich_text)
        return json_obj

    def convert_quote(self, soup):
        """Convert a <blockquote> into a Notion quote block."""
        json_obj = {
            "object": "block",
            "type": "quote",
            "quote": {
                "rich_text": []
            }
        }
        rich_text = json_obj["quote"]["rich_text"]
        text_obj = self.generate_inline_obj(soup)
        if text_obj:
            rich_text.extend(text_obj)
        return json_obj

    def convert_equation(self, soup: Tag):
        """Convert a math code block into a paragraph holding one equation.

        The expression is truncated to Notion's maximum equation length.
        """
        json_obj = {
            "object": "block",
            "type": "paragraph",
            "paragraph": {
                "rich_text": []
            }
        }
        expression = soup.get_text()[:Html2JsonBase.EXPRESSION_MAX_LENGTH]
        equation = json_obj["paragraph"]["rich_text"]
        equation.append({
            "type": "equation",
            "equation": {
                "expression": expression
            }
        })
        return json_obj

    def convert_to_do(self, soup: Tag):
        """Convert a checklist <ul> into a list of to_do blocks, one per <li>."""
        li_tags = soup.find_all('li', recursive=True)
        childs = li_tags if li_tags else [soup]
        to_do_blocks = []
        for child in childs:
            json_obj = {
                "object": "block",
                "type": "to_do",
                "to_do": {
                    "rich_text": [],
                    "checked": False
                }
            }
            text = json_obj["to_do"]["rich_text"]
            text_obj = self.generate_inline_obj(child)
            if text_obj:
                text.extend(text_obj)

            # Items carrying the "undo" background image are unchecked;
            # everything else is considered checked.
            style = child.get('style', '')
            if isinstance(style, str) and Html2JsonMarkdown.undo_image not in style:
                json_obj["to_do"]["checked"] = True
            to_do_blocks.append(json_obj)
        return to_do_blocks
line number
... code content ...has a background-image, which is considered a check box 199 | def _is_checkbox(self, soup): 200 | for li in soup.find_all('li'): 201 | style = li.get('style', '') 202 | if not "background-image: url('data:image/png;" in style: 203 | return False 204 | return True 205 | 206 | def _extract_code_blocks(self): 207 | code_pattern = re.compile(r'```(\w+)?\n(.*?)```', re.DOTALL) 208 | matches = code_pattern.findall(self.markdown) 209 | code_blocks = [{'language': match[0], 'code': match[1].rstrip('\n')} for match in matches] 210 | return code_blocks 211 | 212 | def _replace_pre_code(self, soup): 213 | markdown_code_blocks = self._extract_code_blocks() 214 | count = sum(1 for pre_tag in soup.find_all('pre') if pre_tag.find('code')) 215 | 216 | if markdown_code_blocks and count != len(markdown_code_blocks): 217 | logger.warning(f'Code block count not match: {count} != {len(markdown_code_blocks)}') 218 | return 219 | 220 | pre_tags = soup.find_all('pre') 221 | idx = 0 222 | for pre in pre_tags: 223 | code = pre.find('code') 224 | if not code: 225 | continue 226 | new_tag = soup.new_tag('code') 227 | new_tag.string = markdown_code_blocks[idx]['code'] 228 | new_tag['style'] = 'language: ' + markdown_code_blocks[idx]['language'] 229 | idx += 1 230 | code.replace_with(new_tag) 231 | return soup 232 | 233 | def _is_math(self, soup): 234 | code_tag = soup.code 235 | if not code_tag: 236 | return False 237 | 238 | css_dict = Html2JsonBase.get_tag_style(code_tag) 239 | if 'language' in css_dict and css_dict['language'] == 'math': 240 | return True 241 | return False 242 | 243 | 244 | Html2JsonBase.register(YinXiangMarkdown_Type, Html2JsonMarkdown) 245 | -------------------------------------------------------------------------------- /html2notion/translate/html2json_yinxiang.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup, Tag 2 | from ..utils import logger, DateStrToISO8601 3 | from 
YinXiang_Type = "yinxiang"


class Html2JsonYinXiang(Html2JsonBase):
    """Converter for notes written natively in Yinxiang/Evernote."""

    input_type = YinXiang_Type

    def __init__(self, html_content, import_stat):
        super().__init__(html_content, import_stat)

    def process(self):
        """Parse the note, fill children and properties, return the type tag."""
        soup = BeautifulSoup(self.html_content, 'html.parser')
        self.convert_children(soup)
        self.convert_properties(soup)
        return YinXiang_Type

    def convert_properties(self, soup):
        """Build page properties (title, url, tags, created_time) from <head>."""
        properties = {"title": "Unknown"}
        title_tag = soup.select_one('head > title')
        if title_tag:
            properties["title"] = title_tag.text

        # (property key, selector, optional value transformer)
        meta_specs = (
            ("url", 'head > meta[name="source-url"]', None),
            ("tags", 'head > meta[name="keywords"]', lambda x: x.split(",")),
            ("created_time", 'head > meta[name="created"]', DateStrToISO8601),
        )
        for key, selector, transform in meta_specs:
            tag = soup.select_one(selector)
            content = tag.get('content', None) if tag else None
            if content:
                properties[key] = transform(content) if transform else content

        self.properties = self.generate_properties(**properties)
        return

    def convert_children(self, soup):
        """Translate every top-level child of <body> into Notion blocks."""
        bodies = soup.find_all('body', recursive=True)
        if not bodies:
            logger.warning("No content found")
            raise Exception("No content found")

        body = bodies[0]
        # Record the full text length so import stats can detect loss later.
        self.import_stat.add_text(body.get_text())
        for child in body.children:
            block_type = self.get_block_type(child)
            logger.debug(f'Support tag {child} with style {block_type}')
            converter = getattr(self, f"convert_{block_type}", None)
            if converter is None:
                self.import_stat.add_skip_tag(child.get_text())
                logger.warning(f"Unknown tag : {child}")
                continue
            block = converter(child)
            if block:
                self.children.extend(block if isinstance(block, list) else [block])
61 | def convert_code(self, soup): 62 | json_obj = { 63 | "object": "block", 64 | "type": "code", 65 | "code": { 66 | "rich_text": [], 67 | "language": "plain text", 68 | }, 69 | } 70 | rich_text = json_obj["code"]["rich_text"] 71 | 72 | children_list = list(soup.children) if isinstance(soup, Tag) else [soup] 73 | for index, child in enumerate(children_list): 74 | is_last_child = index == len(children_list) - 1 75 | text_obj = self.generate_inline_obj(child) 76 | if text_obj: 77 | rich_text.extend(text_obj) 78 | if not is_last_child: 79 | rich_text.append(self.generate_text(plain_text='\n', stats_count=False)) 80 | json_obj["code"]["rich_text"] = self.merge_rich_text(rich_text) 81 | css_dict = Html2JsonBase.get_tag_style(soup) 82 | language = css_dict.get('--en-codeblockLanguage', 'plain text') 83 | json_obj["code"]["language"] = language 84 | return json_obj 85 | 86 | def convert_quote(self, soup): 87 | json_obj = { 88 | "object": "block", 89 | "type": "quote", 90 | "quote": { 91 | "rich_text": [] 92 | } 93 | } 94 | rich_text = json_obj["quote"]["rich_text"] 95 | 96 | children_list = list(soup.children) 97 | for index, child in enumerate(children_list): 98 | is_last_child = index == len(children_list) - 1 99 | text_obj = self.generate_inline_obj(child) 100 | if text_obj: 101 | rich_text.extend(text_obj) 102 | if not is_last_child: 103 | rich_text.append(self.generate_text(plain_text='\n', stats_count=False)) 104 | 105 | # Merge tags has same anotions 106 | logger.debug(f'before merge: {rich_text}') 107 | json_obj["quote"]["rich_text"] = self.merge_rich_text(rich_text) 108 | return json_obj 109 | 110 | def convert_to_do(self, soup: Tag): 111 | # Compatible with the situation where input is under li tag(super note). 
112 | li_tags = soup.find_all('li', recursive=True) 113 | childs = li_tags if li_tags else [soup] 114 | to_do_blocks = [] 115 | for child in childs: 116 | json_obj = { 117 | "object": "block", 118 | "type": "to_do", 119 | "to_do": { 120 | "rich_text": [], 121 | "checked": False 122 | } 123 | } 124 | text = json_obj["to_do"]["rich_text"] 125 | text_obj = self.generate_inline_obj(child) 126 | if text_obj: 127 | text.extend(text_obj) 128 | input_tag = child.find('input') 129 | if input_tag and isinstance(input_tag, Tag) and input_tag.get('checked', 'false') == 'true': 130 | json_obj["to_do"]["checked"] = True 131 | to_do_blocks.append(json_obj) 132 | return to_do_blocks 133 | 134 | def get_block_type(self, single_tag): 135 | tag_name = single_tag.name 136 | style = single_tag.get('style') if tag_name else "" 137 | 138 | # There are priorities here. It is possible to hit multiple targets 139 | # at the same time, and the first one takes precedence. 140 | if self._check_is_todo(single_tag): 141 | return Block.TO_DO.value 142 | elif tag_name == 'hr': 143 | return Block.DIVIDER.value 144 | elif tag_name == 'ol': 145 | return Block.NUMBERED_LIST.value 146 | elif tag_name == 'ul': 147 | return Block.BULLETED_LIST.value 148 | elif tag_name == 'p': 149 | return Block.PARAGRAPH.value 150 | elif tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): 151 | return Block.HEADING.value 152 | elif tag_name == 'table' or self._check_is_table(single_tag): 153 | return Block.TABLE.value 154 | 155 | css_dict = Html2JsonBase.get_tag_style(single_tag) 156 | if css_dict.get('--en-blockquote', None) == 'true': 157 | return Block.QUOTE.value 158 | if css_dict.get('--en-codeblock', None) == 'true': 159 | return Block.CODE.value 160 | if css_dict.get('-en-codeblock', None) == 'true': 161 | return Block.CODE.value 162 | 163 | # Issue 5: 164 | if tag_name == 'div': 165 | return Block.PARAGRAPH.value 166 | return Block.FAIL.value 167 | 168 | #169 | def _check_is_table(self, tag): 170 | if tag.name == 
"div": 171 | children = list(filter(lambda x: x != '\n', tag.contents)) 172 | table_count = sum(1 for child in children if child.name == "table") 173 | return table_count >= 1 174 | return False 175 | 176 | def _check_is_todo(self, tag): 177 | if not isinstance(tag, Tag): 178 | return False 179 | input_tag = tag.find('input') 180 | if input_tag and isinstance(input_tag, Tag) and input_tag.get('type') == 'checkbox': 181 | return True 182 | return False 183 | 184 | Html2JsonBase.register(YinXiang_Type, Html2JsonYinXiang) 185 | -------------------------------------------------------------------------------- /html2notion/translate/import_stats.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from enum import Enum 3 | 4 | 5 | class StatLevel(Enum): 6 | EXCEPTION = "exception" 7 | LOSS = "loss" 8 | SUCC = "success" 9 | 10 | 11 | class ImportStats: 12 | def __init__(self): 13 | self.text_count = 0 14 | self.image_count = 0 15 | self.notion_text_count = 0 16 | self.notion_image_count = 0 17 | self.skip_tag = [] 18 | self.exception = None 19 | self.filename = "None" 20 | self.html_content = "" 21 | self.notion_content = "" 22 | self.html_image_src = [] 23 | self.notion_image_src = [] 24 | self.head_meta = {} 25 | 26 | def add_text(self, text: str): 27 | self.text_count += len(text) 28 | self.html_content += text 29 | 30 | def add_notion_text(self, text: str): 31 | self.notion_content += text 32 | self.notion_text_count += len(text) 33 | 34 | def add_image(self, src: str): 35 | self.html_image_src.append(src) 36 | self.image_count += 1 37 | 38 | def add_notion_image(self, src: str): 39 | self.notion_image_src.append(src) 40 | self.notion_image_count += 1 41 | 42 | def add_skip_tag(self, tag): 43 | self.skip_tag.append(tag) 44 | 45 | def set_filename(self, filename: Path): 46 | self.filename = filename 47 | 48 | def set_exception(self, exception: Exception): 49 | self.exception = exception 50 | 51 | def 
get_level(self): 52 | if self.exception: 53 | return StatLevel.EXCEPTION.value 54 | if self.notion_text_count < self.text_count: 55 | return StatLevel.LOSS.value 56 | return StatLevel.SUCC.value 57 | 58 | def __str__(self): 59 | msg = "" 60 | if self.get_level() == StatLevel.EXCEPTION.value: 61 | msg += f"[red]{str(self.exception)}[/red]" 62 | if 'body.parent.page_id should be defined' in str(self.exception): 63 | msg += f"\nHeadmeta: [yellow]{self.head_meta}[/yellow]" 64 | 65 | if self.get_level() == StatLevel.LOSS.value: 66 | if self.text_count != self.notion_text_count: 67 | msg += f"Text Len {self.text_count} -> {self.notion_text_count}, Loss [yellow]{self.text_count-self.notion_text_count}[/yellow]" 68 | 69 | msg += '\nDetail: [yellow]' + ";".join([repr(s) for s in self.skip_tag])[:500] + "[/yellow]" 70 | return msg 71 | 72 | def get_detail(self): 73 | return f"filename: {self.filename}, {self.text_count} text, {self.image_count} image\nNotion {self.notion_text_count} text, {self.notion_image_count} image\n{self.skip_tag}" 74 | 75 | 76 | if __name__ == '__main__': 77 | task_stats = ImportStats() 78 | task_stats.add_text(100) 79 | task_stats.add_image(20) 80 | task_stats.add_notion_text(80) 81 | task_stats.add_notion_image(15) 82 | task_stats.set_exception(Exception("Some error occurred")) 83 | 84 | print(task_stats) 85 | -------------------------------------------------------------------------------- /html2notion/translate/notion_export.py: -------------------------------------------------------------------------------- 1 | import json 2 | from notion_client import Client, errors as notion_errors 3 | from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type 4 | from ..utils import logger, test_prepare_conf, config 5 | 6 | class NotionExporter: 7 | # Remove keys which not used by add page 8 | delete_block = { 9 | "rich_text": [ 10 | { 11 | # "type": "text", 12 | "text": { 13 | # "content": "测试第一行", 14 | "link": None 15 | }, 16 | 
"annotations": { 17 | "bold": False, 18 | "italic": False, 19 | "strikethrough": False, 20 | "underline": False, 21 | "code": False, 22 | "color": "default" 23 | }, 24 | # "plain_text": "测试第一行", 25 | "href": None 26 | } 27 | ], 28 | "color": "default", 29 | "is_toggleable": False 30 | } 31 | 32 | delete_conf = { 33 | # "object": "block", 34 | "id": "__any__", 35 | "parent": "__any__", 36 | "created_time": "__any__", 37 | "last_edited_time": "__any__", 38 | "created_by": "__any__", 39 | "last_edited_by": "__any__", 40 | "has_children": False, 41 | "archived": False, 42 | # "type": "paragraph", 43 | "paragraph": delete_block, 44 | "quote": delete_block, 45 | "numbered_list_item": delete_block, 46 | "bulleted_list_item": delete_block, 47 | "heading_1": delete_block, 48 | "heading_2": delete_block, 49 | "heading_3": delete_block, 50 | } 51 | 52 | def __init__(self, api_key, page_id, page_size=2): 53 | self.notion = Client(auth=api_key, logger=logger) 54 | self.page_id = page_id 55 | self.page_size = page_size 56 | self.all_blocks = [] 57 | self.output_blocks = [] 58 | 59 | @staticmethod 60 | def get_delete_conf(key_path): 61 | result = NotionExporter.delete_conf.copy() 62 | for key in key_path: 63 | # Number in path is json array placeholder 64 | if isinstance(key, int): 65 | if isinstance(result, list) and len(result) > 0: 66 | result = result[0] # type: ignore 67 | else: 68 | result = None 69 | elif isinstance(result, dict) and key in result: 70 | # If prefix path has __any__ conf, then delete all children 71 | if result[key] == "__any__": 72 | return ["__any__"] 73 | else: 74 | result = result[key] 75 | else: 76 | result = None 77 | 78 | if (isinstance(result, list)): 79 | return result 80 | elif (isinstance(result, str) or isinstance(result, bool) or isinstance(result, int)): 81 | return [result] 82 | else: 83 | return [None] 84 | 85 | @staticmethod 86 | def check_is_delete(key_path: list, value): 87 | delete_values = NotionExporter.get_delete_conf(key_path) 88 | 
if value in delete_values or '__any__' in delete_values: 89 | return True 90 | # logger.debug(f"Check key: {key_path}, value: {value}, delete values: {delete_values}") 91 | return False 92 | 93 | @staticmethod 94 | def keep_dict_pathvalue(data, path, value): 95 | for i, key in enumerate(path): 96 | if isinstance(key, int): 97 | data = data[key] 98 | elif i == len(path) - 1: 99 | data[key] = value 100 | else: 101 | next_key = path[i+1] if i+1 < len(path) else None 102 | if key in data: 103 | if isinstance(next_key, int): 104 | if not isinstance(data[key], list): 105 | logger.error(f"Keep error: {i}, {path}, {data[key]}") 106 | return 107 | data[key].extend([{} for _ in range(next_key - len(data[key]) + 1)]) 108 | else: 109 | if not isinstance(data[key], dict): 110 | logger.error(f"Keep error: {i}, {path}, {data[key]}") 111 | return 112 | else: 113 | if isinstance(next_key, int): 114 | data[key] = [{} for _ in range(next_key + 1)] 115 | else: 116 | data[key] = {} 117 | 118 | data = data[key] 119 | return 120 | 121 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=30), retry=retry_if_exception_type(notion_errors.RequestTimeoutError)) 122 | def __get_children_blocks(self): 123 | children = self.notion.blocks.children.list(block_id=self.page_id, page_size=self.page_size) 124 | if not isinstance(children, dict): 125 | logger.error(f"Get children failed: {children}") 126 | return None 127 | 128 | loop_count = 1 129 | while isinstance(children, dict) and "has_more" in children and children["has_more"]: 130 | next_cursor = children["next_cursor"] 131 | self.all_blocks.extend(children["results"]) 132 | children = self.notion.blocks.children.list( 133 | block_id=self.page_id, page_size=self.page_size, start_cursor=next_cursor) 134 | loop_count += 1 135 | cur_content = json.dumps(children, indent=4, ensure_ascii=False) 136 | logger.debug(f'Get child, {loop_count}: {cur_content}') 137 | 138 | if isinstance(children, dict) and "has_more" in 
children and not children["has_more"]: 139 | self.all_blocks.extend(children["results"]) 140 | return children 141 | 142 | @staticmethod 143 | def export_dict(data): 144 | clean_block = {} 145 | stack = [(data, list())] 146 | while stack: 147 | cur, prefix = stack.pop() 148 | if isinstance(cur, dict): 149 | for k, v in cur.items(): 150 | prefix.append(k) 151 | # logger.debug(f"Export dict, prefix: {prefix}, value: {v}") 152 | stack.append((v, prefix[:])) 153 | prefix.pop() 154 | elif isinstance(cur, list): 155 | for i, v in enumerate(cur): 156 | # logger.debug(f"Export array, prefix: {prefix}, {i}, value: {v}") 157 | prefix.append(i) 158 | stack.append((v, prefix[:])) 159 | prefix.pop() 160 | else: 161 | if (not NotionExporter.check_is_delete(prefix[:], cur)): 162 | logger.debug(f"Keep {prefix}: {cur}") 163 | NotionExporter.keep_dict_pathvalue(clean_block, prefix, cur) 164 | return clean_block 165 | 166 | def export_blocks(self): 167 | self.__get_children_blocks() 168 | result = json.dumps(self.all_blocks, indent=4, ensure_ascii=False) 169 | logger.info(f"Before process, blocks {result}") 170 | 171 | if not self.all_blocks: 172 | logger.error("Get children empty") 173 | 174 | for block in self.all_blocks: 175 | output_block = self.export_dict(block) 176 | self.output_blocks.append(output_block) 177 | 178 | return self.output_blocks 179 | 180 | 181 | if __name__ == "__main__": 182 | test_prepare_conf() 183 | exporter = NotionExporter(api_key=config['notion']['api_key'], 184 | page_id=config['notion']['page_id'], 185 | page_size=10) 186 | exporter.export_blocks() 187 | print(json.dumps(exporter.output_blocks, indent=4, ensure_ascii=False)) 188 | -------------------------------------------------------------------------------- /html2notion/translate/notion_import.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import traceback 4 | from aiohttp import ClientSession 5 | from pathlib import Path 6 | 
from notion_client import AsyncClient
from notion_client.errors import RequestTimeoutError
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from ..utils import logger, test_prepare_conf, config, rate_limit
from ..translate.html2json import html2json_process
from ..translate.import_stats import ImportStats


class NotionImporter:
    """Imports one HTML note into Notion, recording stats along the way."""

    def __init__(self, session: ClientSession, notion_client):
        self.session = session
        self.notion_client = notion_client
        self.import_stats = ImportStats()

    async def process_file(self, file_path: Path):
        """Convert *file_path* and create the corresponding Notion page.

        Returns "succ" on success, "fail" when conversion or page creation
        raised; the exception is kept in import_stats for reporting.
        """
        self.import_stats.set_filename(file_path)
        try:
            notion_data, html_type = html2json_process(file_path, self.import_stats)
        except Exception as exc:
            self.import_stats.set_exception(exc)
            logger.error(f"Error processing {file_path}: {str(exc)}, {traceback.format_exc()}")
            return "fail"

        logger.info(f"Process path: {file_path}, html type: {html_type}, {self.import_stats.get_detail()}")
        try:
            create_result = await self.create_new_page(notion_data)
        except Exception as exc:
            self.import_stats.set_exception(exc)
            logger.error(f"Error create notion page {file_path}: {str(exc)}, {traceback.format_exc()}")
            return "fail"
        logger.info(f"Create notion page: {create_result}")
        return "succ"

    # https://developers.notion.com/reference/request-limits
    # The rate limit for incoming requests per integration is an average of three requests per second.
43 | # Doc of create page: https://developers.notion.com/reference/post-page 44 | @retry(stop=stop_after_attempt(5), 45 | wait=wait_exponential(multiplier=1, min=3, max=30), 46 | retry=retry_if_exception_type(RequestTimeoutError)) 47 | async def create_new_page(self, notion_data): 48 | # logger.debug(f'Create new page: {notion_data["parent"]}, {notion_data["properties"]}') 49 | # body.children.length should be ≤ `100`, 50 | blocks = notion_data.get("children", []) 51 | # logger.debug(f'Create new page: {notion_data["parent"]}, {notion_data["properties"]}, blocks: {blocks}') 52 | 53 | limit_size = 100 54 | chunks = [blocks[i: i + limit_size] for i in range(0, len(blocks), limit_size)] 55 | if blocks: 56 | notion_data.pop("children") 57 | first_chunk = chunks[0] if chunks else [] 58 | async with rate_limit: 59 | created_page = await self.notion_client.pages.create(**notion_data, children=first_chunk) 60 | page_id = created_page["id"] 61 | for chunk in chunks[1:]: 62 | await self.notion_client.blocks.children.append(page_id, children=chunk) 63 | return created_page 64 | 65 | 66 | async def main(file_path, notion_api_key): 67 | async with ClientSession() as session: 68 | async with AsyncClient(auth=notion_api_key) as notion_client: 69 | importer = NotionImporter(session, notion_client) 70 | result = await importer.process_file(file_path) 71 | logger.info(f"Import result: {result}") 72 | 73 | 74 | if __name__ == "__main__": 75 | test_prepare_conf() 76 | file = Path("./demos/Test Case E.html") 77 | notion_api_key = "" 78 | if 'GITHUB_ACTIONS' in os.environ: 79 | notion_api_key = os.environ['notion_api_key'] 80 | else: 81 | notion_api_key = config['notion']['api_key'] 82 | asyncio.run(main(file, notion_api_key)) 83 | -------------------------------------------------------------------------------- /html2notion/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .log import logger, setup_logger 2 | from .load_config 
import read_config, config, rate_limit 3 | from .url_process import is_valid_url 4 | from .timeutil import DateStrToISO8601 5 | from pathlib import Path 6 | 7 | 8 | def test_prepare_conf(): 9 | log_path = Path("./logs") 10 | conf_path = Path("./.config.json") 11 | setup_logger(log_path) 12 | read_config(conf_path) 13 | logger.info(f"test_prepare_conf, log path({log_path}), conf path({conf_path})") 14 | 15 | 16 | __all__ = ['logger', 'setup_logger', 'config', 'read_config', 'test_prepare_conf', 'rate_limit', 'is_valid_url', 'DateStrToISO8601'] 17 | -------------------------------------------------------------------------------- /html2notion/utils/load_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from aiolimiter import AsyncLimiter 4 | rate_limit = AsyncLimiter(3, 1) 5 | config = {} 6 | 7 | 8 | def read_config(file_path): 9 | """ 10 | { 11 | "notion": { 12 | "database_id": "xxxxx", 13 | "api_key": "xxxxx" 14 | } 15 | } 16 | """ 17 | if not file_path.is_file(): 18 | print(f"Load {file_path} fail") 19 | sys.exit(1) 20 | 21 | with open(file_path, "r") as f: 22 | json_conf = json.load(f) 23 | 24 | config.update(json_conf) 25 | if "notion" not in config: 26 | raise Exception("notion is not set in config.json") 27 | 28 | notion_conf = config["notion"] 29 | if "database_id" not in notion_conf: 30 | raise Exception("database_id is not set in config.json") 31 | if "api_key" not in notion_conf: 32 | raise Exception("api_key is not set in config.json") 33 | return 34 | -------------------------------------------------------------------------------- /html2notion/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from logging import handlers 4 | logger = logging.getLogger() 5 | 6 | 7 | class CustomFormatter(logging.Formatter): 8 | green = "\033[92m" 9 | normal = "\x1b[38;21m" 10 | yellow = "\x1b[33;21m" 11 | red = 
"\x1b[31;21m" 12 | bold_red = "\x1b[31;1m" 13 | reset = "\x1b[0m" 14 | format = "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s" # type: ignore 15 | 16 | FORMATS = { 17 | logging.DEBUG: green + format + reset, # type: ignore 18 | logging.INFO: normal + format + reset, # type: ignore 19 | logging.WARNING: yellow + format + reset, # type: ignore 20 | logging.ERROR: red + format + reset, # type: ignore 21 | logging.CRITICAL: bold_red + format + reset # type: ignore 22 | } 23 | 24 | def format(self, record): 25 | log_fmt = self.FORMATS.get(record.levelno) 26 | formatter = logging.Formatter(log_fmt) 27 | return formatter.format(record) 28 | 29 | 30 | def setup_logger(log_path): 31 | file_path = log_path.joinpath("html2notion_error.log") 32 | handler = handlers.TimedRotatingFileHandler( 33 | filename=file_path, when='midnight', backupCount=30, encoding='utf-8') 34 | handler.setLevel(logging.DEBUG) 35 | handler.setFormatter(CustomFormatter()) 36 | logger.addHandler(handler) 37 | logger.setLevel(logging.DEBUG) 38 | 39 | logger.debug('Logging debug message') 40 | logger.info('Logging info message') 41 | logger.warning('Logging warning message') 42 | logger.error('Logging error message') 43 | 44 | 45 | def log_only_local(content): 46 | if 'GITHUB_ACTIONS' in os.environ: 47 | return 48 | 49 | from html2notion.utils import logger 50 | logger.info(content) 51 | -------------------------------------------------------------------------------- /html2notion/utils/timeutil.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from . import logger 3 | 4 | 5 | def DateStrToISO8601(date_string: str) -> str: 6 | """Converts a date string to ISO 8601 format. 7 | 8 | Args: 9 | date_string (str): Date string to convert. 10 | 11 | Returns: 12 | str: ISO 8601 formatted date string. 
13 | """ 14 | 15 | date_format = "%Y-%m-%d %H:%M:%S %z" 16 | try: 17 | date_obj = datetime.strptime(date_string, date_format).astimezone() 18 | except ValueError: 19 | logger.warning(f"Invalid date string: {date_string}") 20 | return "" 21 | 22 | output_string = date_obj.isoformat() 23 | return output_string 24 | -------------------------------------------------------------------------------- /html2notion/utils/url_process.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | 4 | def is_valid_url(url): 5 | if not isinstance(url, str): 6 | return False 7 | try: 8 | result = urlparse(url) 9 | return all([result.scheme, result.netloc]) and is_valid_port(result.port) 10 | except ValueError: 11 | return False 12 | 13 | 14 | def is_valid_port(port): 15 | if port is None: 16 | return True 17 | return 0 <= port <= 65535 18 | 19 | 20 | if __name__ == '__main__': 21 | print(is_valid_url("https://www.google.com")) # Returns: True 22 | print(is_valid_url("google")) # Returns: False 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=54", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.11.2 2 | httpcore>=0.16.3 3 | httpx>=0.23.3 4 | notion-client>=2.0.0 5 | PyYAML>=6.0 6 | aiohttp>=3.8.4 7 | anyio>=3.6.2 8 | cos-python-sdk-v5>=1.9.23 9 | tenacity>=8.2.2 10 | rich>=13.3.4 11 | aiolimiter>=1.0.0 12 | chardet>=5.1.0 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | 
name = html2notion 3 | version = 0.2.0 4 | author = selfboot 5 | author_email = xuezaigds@gmail.com 6 | description = This tool can accurately convert HTML to Notion notes and is also useful for exporting Evernote notes to Notion. 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/selfboot/html2notion 10 | license_files = LICENSE 11 | classifiers = 12 | Programming Language :: Python :: 3 13 | License :: OSI Approved :: MIT License 14 | Operating System :: OS Independent 15 | 16 | [options] 17 | packages = find: 18 | install_requires = 19 | beautifulsoup4>=4.11.2 20 | httpcore>=0.16.3 21 | httpx>=0.23.3 22 | notion-client>=2.0.0 23 | PyYAML>=6.0 24 | aiohttp>=3.8.4 25 | anyio>=3.6.2 26 | cos-python-sdk-v5>=1.9.23 27 | tenacity>=8.2.2 28 | rich>=13.3.4 29 | aiolimiter>=1.0.0 30 | chardet>=5.1.0 31 | 32 | [options.entry_points] 33 | console_scripts = 34 | html2notion = html2notion.main:main 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from html2notion.utils import test_prepare_conf, logger 4 | 5 | 6 | @pytest.fixture(autouse=True, scope='module') 7 | def prepare_conf(): 8 | if 'GITHUB_ACTIONS' not in os.environ: 9 | test_prepare_conf() 10 | logger.info("prepare_conf_fixture") 11 | -------------------------------------------------------------------------------- /tests/test_batchimport.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pytest 3 | import time 4 | import os 5 | from pathlib import Path 6 | from unittest.mock import patch 7 | from 
tempfile import TemporaryDirectory 8 | from http import HTTPStatus 9 | from html2notion.translate.batch_import import BatchImport 10 | from html2notion.utils import rate_limit 11 | from html2notion.utils.log import log_only_local 12 | 13 | process_once_time = 0.5 14 | 15 | 16 | async def mock_notion_api_request(file_path, *args, **kwargs): 17 | class MockResponse: 18 | def __init__(self, status_code, file_content, elapsed_time): 19 | self.status_code = status_code 20 | self.file_content = file_content 21 | self.elapsed_time = elapsed_time 22 | 23 | def json(self): 24 | return {"result": "success", "file_content": self.file_content, "elapsed_time": self.elapsed_time} 25 | 26 | start_time = time.perf_counter() 27 | content = file_path.read_text() 28 | if 'GITHUB_ACTIONS' not in os.environ: 29 | from html2notion.utils import logger 30 | logger.debug(f"mock_notion_api_request: {file_path}") 31 | await asyncio.sleep(process_once_time) 32 | end_time = time.perf_counter() 33 | elapsed_time = end_time - start_time 34 | return MockResponse(HTTPStatus.OK, content, elapsed_time) 35 | 36 | 37 | async def mock_notion_create_page(notion_data, *args, **kwargs): 38 | async with rate_limit: 39 | await asyncio.sleep(0.01) 40 | log_only_local(f"mock_notion_create_page") 41 | return "succ" 42 | 43 | @pytest.fixture(params=[10, 20]) 44 | def temp_dir_fixture(request): 45 | num_files = request.param 46 | with TemporaryDirectory() as temp_dir: 47 | dir_path = Path(temp_dir) 48 | temp_files = [] 49 | for i in range(num_files): 50 | temp_file = dir_path / f"file{i}.html" 51 | temp_file.write_text(f"file{i}") 52 | temp_files.append(temp_file) 53 | 54 | yield dir_path 55 | 56 | 57 | @pytest.mark.parametrize("concurrent_limit", [2, 5, 10]) 58 | @pytest.mark.asyncio 59 | async def test_batch_process(temp_dir_fixture, concurrent_limit): 60 | dir_path = temp_dir_fixture 61 | start_time = time.perf_counter() 62 | with patch("html2notion.translate.notion_import.NotionImporter.process_file", 
side_effect=mock_notion_api_request): 63 | batch_processor = BatchImport( 64 | dir_path, concurrent_limit=concurrent_limit) 65 | responses = await batch_processor.process_directory() 66 | 67 | end_time = time.perf_counter() 68 | for file_path, response in zip( 69 | sorted(dir_path.iterdir()), 70 | sorted(responses, key=lambda x: x.json()["file_content"])): 71 | assert response.json()["file_content"] == f"{file_path.stem}" 72 | 73 | total_time = end_time-start_time 74 | sync_time = sum(res.json()["elapsed_time"] for res in responses) 75 | least_time = min(res.json()["elapsed_time"] for res in responses) 76 | log_only_local( 77 | f"total_time: {total_time}, sync_time: {sync_time}, least_time: {least_time}") 78 | assert total_time >= least_time 79 | assert total_time <= sync_time 80 | 81 | 82 | @pytest.mark.parametrize("concurrent_limit", [5, 10, 20]) 83 | @pytest.mark.asyncio 84 | async def test_reqlimit(temp_dir_fixture, concurrent_limit): 85 | dir_path = temp_dir_fixture 86 | start_time = time.perf_counter() 87 | with patch("html2notion.translate.notion_import.NotionImporter.create_new_page", side_effect=mock_notion_create_page): 88 | batch_processor = BatchImport(dir_path, concurrent_limit=concurrent_limit) 89 | responses = await batch_processor.process_directory() 90 | 91 | end_time = time.perf_counter() 92 | total_time = end_time-start_time 93 | num_files = len(list(dir_path.glob('*.html'))) 94 | log_only_local(f"file nums: {num_files}, concurrent {concurrent_limit}, total_time: {total_time}") 95 | # The time deviation within 1 second is acceptable here. 
96 | assert (total_time >= num_files / 3 - 1) -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pathlib import Path 3 | from unittest.mock import mock_open, patch 4 | from html2notion.utils import read_config, config 5 | import pytest 6 | 7 | 8 | def test_read_config(): 9 | mock_file_content = """{ 10 | "notion": { 11 | "database_id": "test_db_id", 12 | "api_key": "test_api_key" 13 | }, 14 | "log_path": "/test/log/path" 15 | } 16 | """ 17 | with patch("builtins.open", mock_open(read_data=mock_file_content)): 18 | with patch.object(Path, "is_file", return_value=True): 19 | read_config(Path("test_config.json")) 20 | assert "notion" in config 21 | assert "database_id" in config["notion"] 22 | assert "api_key" in config["notion"] 23 | assert config["notion"]["database_id"] == "test_db_id" 24 | assert config["notion"]["api_key"] == "test_api_key" 25 | config.clear() 26 | 27 | # Testing for missing database_id, notion, or api_key configurations throws an exception 28 | with patch("builtins.open", mock_open(read_data="{}")), patch.object(Path, "is_file", return_value=True), pytest.raises(Exception, match="notion is not set in config.json"): 29 | read_config(Path("test_config.json")) 30 | config.clear() 31 | 32 | with patch("builtins.open", mock_open(read_data="{\"notion\": {}}")), patch.object(Path, "is_file", return_value=True), pytest.raises(Exception, match="database_id is not set in config.json"): 33 | read_config(Path("test_config.json")) 34 | config.clear() 35 | 36 | with patch("builtins.open", mock_open(read_data="{\"notion\": {\"database_id\": \"test_db_id\"}}")), patch.object(Path, "is_file", return_value=True), pytest.raises(Exception, match="api_key is not set in config.json"): 37 | read_config(Path("test_config.json")) 38 | config.clear() 39 | 
-------------------------------------------------------------------------------- /tests/test_cosupload.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pytest 3 | import time 4 | import os 5 | import random 6 | import string 7 | from pathlib import Path 8 | from unittest.mock import patch 9 | from tempfile import TemporaryDirectory 10 | from html2notion.translate.batch_import import BatchImport 11 | from html2notion.translate.cos_uploader import TencentCosUploaderAsync 12 | from html2notion.utils.log import log_only_local 13 | 14 | 15 | async def mock_cos_upload_request(file_path, *args, **kwargs): 16 | if 'GITHUB_ACTIONS' not in os.environ: 17 | from html2notion.utils import config 18 | secret_id = config["cos"]["secret_id"] 19 | secret_key = config["cos"]["secret_key"] 20 | region = config["cos"]["region"] 21 | bucket = config["cos"]["bucket"] 22 | else: 23 | secret_id = os.environ['cos_secret_id'] 24 | secret_key = os.environ['cos_secret_key'] 25 | region = os.environ['cos_region'] 26 | bucket = os.environ['cos_bucket'] 27 | 28 | start_time = time.perf_counter() 29 | uploader = TencentCosUploaderAsync(secret_id, secret_key, region, bucket) 30 | loop = asyncio.get_event_loop() 31 | key = f"test_workflow/{file_path.name}" 32 | upload_response = await uploader.upload_file(loop, file_path, key) 33 | log_only_local(f"Upload response: {upload_response}") 34 | 35 | is_exist = await uploader.check_file_exist(loop, key) 36 | end_time = time.perf_counter() 37 | elapsed_time = end_time - start_time 38 | log_only_local(f"Upload elapsed time: {elapsed_time}") 39 | return (is_exist, elapsed_time) 40 | 41 | 42 | @pytest.fixture() 43 | def temp_dir_fixture(): 44 | with TemporaryDirectory() as temp_dir: 45 | dir_path = Path(temp_dir) 46 | temp_files = [] 47 | for i in range(20): 48 | file_size = random.randint(1 * 1024, 1 * 1024 * 1024) 49 | random_text = "".join(random.choices(string.ascii_letters + 
string.digits, k=file_size)) 50 | 51 | temp_file = dir_path / f"file_{i}.html" 52 | temp_file.write_text(random_text) 53 | temp_files.append(temp_file) 54 | 55 | yield dir_path 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_batch_cos_upload(temp_dir_fixture): 60 | concurrent_limit = 5 61 | dir_path = temp_dir_fixture 62 | 63 | start_time = time.perf_counter() 64 | with patch("html2notion.translate.notion_import.NotionImporter.process_file", side_effect=mock_cos_upload_request): 65 | batch_processor = BatchImport( 66 | dir_path, concurrent_limit=concurrent_limit) 67 | responses = await batch_processor.process_directory() 68 | end_time = time.perf_counter() 69 | 70 | for res in responses: 71 | assert (res[0]) 72 | 73 | total_time = end_time - start_time 74 | elapsed_times = sum([res[1] for res in responses]) 75 | least_tiems = min([res[1] for res in responses]) 76 | log_only_local(f"Time: sum: {elapsed_times}, min {least_tiems}, total: {total_time}") 77 | assert (total_time < elapsed_times) 78 | assert (total_time >= least_tiems) 79 | -------------------------------------------------------------------------------- /tests/test_demos.py: -------------------------------------------------------------------------------- 1 | # import glob 2 | import json 3 | import os 4 | from pathlib import Path 5 | from html2notion.translate.html2json import html2json_process 6 | from html2notion.translate.import_stats import ImportStats 7 | from html2notion.translate.html2json_markdown import YinXiangMarkdown_Type 8 | from html2notion.translate.html2json_clipper import YinXiangClipper_Type 9 | from html2notion.utils import logger, config 10 | 11 | 12 | def test_demo_files(): 13 | if 'GITHUB_ACTIONS' in os.environ: 14 | database_id = os.environ['notion_db_id_1'] 15 | else: 16 | database_id = config['notion']['database_id'] 17 | 18 | testcases = [ 19 | ["./demos/yinxiang_markdown.html", YinXiangMarkdown_Type, "./demos/yinxiang_markdown.json"], 20 | 
["./demos/yinxiang_clipper.html", YinXiangClipper_Type, "./demos/yinxiang_clipper.json"], 21 | ["./demos/yinxiang_clipper_wx.html", YinXiangClipper_Type, "./demos/yinxiang_clipper_wx.json"], 22 | ] 23 | 24 | for md_file, expect_type, expect_file in testcases: 25 | import_stats = ImportStats() 26 | notion_data, html_type = html2json_process(Path(md_file), import_stats) 27 | 28 | assert html_type == expect_type 29 | with open(expect_file, "r") as f: 30 | content = f.read() 31 | 32 | # Replace the placeholder 33 | content = content.replace("###database_id###", database_id) 34 | expect = json.loads(content) 35 | 36 | # The timezone causes the calculated time to be different, and the check here can be ignored 37 | try: 38 | del expect['properties']['Created']['date']['start'] 39 | del notion_data['properties']['Created']['date']['start'] 40 | except KeyError as e: 41 | pass 42 | 43 | # import dictdiffer 44 | # diff = dictdiffer.diff(notion_data, expect) 45 | # for d in diff: 46 | # logger.debug(f'Diff: {d}') 47 | # aa = json.dumps(notion_data, ensure_ascii=False) 48 | # logger.debug(f'notion_data: {aa}') 49 | assert notion_data ==expect 50 | 51 | -------------------------------------------------------------------------------- /tests/test_log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from unittest.mock import patch, MagicMock 3 | from pathlib import Path 4 | from html2notion.utils import setup_logger, logger 5 | from html2notion.utils.log import CustomFormatter 6 | 7 | 8 | class MockHandler(MagicMock): 9 | @property 10 | def level(self): 11 | return 0 12 | 13 | 14 | @patch('logging.handlers.TimedRotatingFileHandler', new_callable=MockHandler) 15 | def test_setup_logger(mock_handler): 16 | setup_logger(Path('/fake/path')) 17 | 18 | # Assert TimedRotatingFileHandler is called with the correct arguments 19 | mock_handler.assert_called_once_with( 20 | filename=Path('/fake/path', 'html2notion_error.log'), 21 | 
when='midnight', backupCount=30, encoding='utf-8' 22 | ) 23 | 24 | # Assert the mock handler instance is set with the correct level and formatter 25 | mock_handler.return_value.setLevel.assert_called_once_with(logging.DEBUG) 26 | assert isinstance(mock_handler.return_value.setFormatter.call_args[0][0], CustomFormatter) 27 | 28 | # Assert logger has the correct level 29 | assert logger.level == logging.DEBUG 30 | 31 | 32 | def test_custom_formatter(): 33 | formatter = CustomFormatter() 34 | 35 | for level, color in [(logging.DEBUG, "\033[92m"), (logging.INFO, "\x1b[38;21m"), 36 | (logging.WARNING, "\x1b[33;21m"), (logging.ERROR, "\x1b[31;21m"), 37 | (logging.CRITICAL, "\x1b[31;1m")]: 38 | record = logging.LogRecord( 39 | name="test", level=level, pathname='test_path', lineno=0, 40 | msg="test message", args=None, exc_info=None 41 | ) 42 | record.filename = "test.py" 43 | record.lineno = 1 44 | 45 | result = formatter.format(record) 46 | expected_format = f"{color}%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s\x1b[0m" 47 | expected_message = logging.Formatter(expected_format).format(record) 48 | 49 | assert result == expected_message 50 | -------------------------------------------------------------------------------- /tests/test_notionexport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from html2notion.translate.notion_export import NotionExporter 4 | from html2notion.utils import config 5 | 6 | 7 | def test_check_is_delete(): 8 | del_keyvalue = [ 9 | (["id"], "95948188-43cb-451f-b538-e0375368ca96"), 10 | (["parent", "type"], "page_id"), 11 | (["created_by", "object"], "user"), 12 | (["paragraph", "rich_text", 0, "text", "link"], None), 13 | (["paragraph", "rich_text", 0, "annotations", "code"], False), 14 | (["paragraph", "rich_text", 0, "annotations", "color"], "default"), 15 | (["quote", "rich_text", 0, "annotations", "color"], "default"), 16 | (["numbered_list_item", 
"rich_text", 0, "annotations", "color"], "default"), 17 | (["bulleted_list_item", "rich_text", 0, "annotations", "color"], "default"), 18 | ] 19 | 20 | for (path, value) in del_keyvalue: 21 | assert NotionExporter.check_is_delete(path, value) 22 | 23 | keep_keyvalue = [ 24 | (["type"], "paragraph"), 25 | (["type"], "image"), 26 | (["object"], "block"), 27 | (["paragraph", "rich_text", 0, "text", "link"], "https://selfboot.com"), 28 | (["paragraph", "rich_text", 0, "annotations", "code"], True), 29 | (["paragraph", "rich_text", 0, "annotations", "color"], "red"), 30 | (["quote", "rich_text", 0, "annotations", "color"], "red"), 31 | (["numbered_list_item", "rich_text", 0, "annotations", "color"], "red"), 32 | (["bulleted_list_item", "rich_text", 0, "annotations", "color"], "red"), 33 | (["bulleted_list_item", "rich_text", 0, "annotations", "code"], True), 34 | ] 35 | for (path, value) in keep_keyvalue: 36 | assert not NotionExporter.check_is_delete(path, value) 37 | 38 | 39 | def test_export_blocks(): 40 | if 'GITHUB_ACTIONS' in os.environ: 41 | api_key = os.environ['notion_api_key'] 42 | page_id = os.environ['notion_page_id_1'] 43 | else: 44 | api_key = config['notion']['api_key'] 45 | page_id = config['notion']['page_id'] 46 | 47 | names = locals() 48 | page_sizes = [1, 5, 10, 100] 49 | for i in page_sizes: 50 | names['exporter_' + str(i)] = NotionExporter( 51 | api_key=api_key, 52 | page_id=page_id, 53 | page_size=i) 54 | 55 | names['exporter_' + str(i)].export_blocks() 56 | names['page_json_'+str(i)] = json.dumps(names['exporter_' + str(i)].output_blocks, indent=4, ensure_ascii=False) 57 | 58 | for i in page_sizes[1:]: 59 | if names['page_json_' + str(i)] != names['page_json_' + str(page_sizes[0])]: 60 | assert False 61 | 62 | 63 | if __name__ == '__main__': 64 | if 'GITHUB_ACTIONS' not in os.environ: 65 | from html2notion.utils import config, test_prepare_conf 66 | test_prepare_conf() 67 | 68 | test_check_is_delete() 69 | test_export_blocks() 70 | 
-------------------------------------------------------------------------------- /tests/test_reqlimit.py: -------------------------------------------------------------------------------- 1 | import json 2 | from html2notion.translate.html2json_yinxiang import Html2JsonYinXiang 3 | from html2notion.translate.import_stats import ImportStats 4 | 5 | 6 | block_max_conent = "Some words" * 200 7 | one_text_obj = { 8 | "plain_text": block_max_conent, 9 | "text": { 10 | "content": block_max_conent 11 | }, 12 | "type": "text" 13 | } 14 | remain_text_obj = { 15 | "plain_text": " more words", 16 | "text": { 17 | "content": " more words" 18 | }, 19 | "type": "text" 20 | } 21 | 22 | 23 | def test_reqlimit(): 24 | paragram_rich_block = [ 25 | { 26 | "object": "block", 27 | "type": "paragraph", 28 | "paragraph": { 29 | "rich_text": [ 30 | one_text_obj, one_text_obj, remain_text_obj 31 | ] 32 | } 33 | } 34 | ] 35 | 36 | paragram_rich_content = f'
{block_max_conent * 2} more words' 37 | import_stats = ImportStats() 38 | yinxiang = Html2JsonYinXiang(paragram_rich_content, import_stats) 39 | yinxiang.process() 40 | json_obj = yinxiang.children 41 | # print(json.dumps(json_obj, indent=4)) 42 | assert json_obj == paragram_rich_block 43 | 44 | 45 | def test_code_reqlimit(): 46 | code_rich_content = f'{block_max_conent * 2} more words' 47 | import_stats = ImportStats() 48 | yinxiang = Html2JsonYinXiang(code_rich_content, import_stats) 49 | yinxiang.process() 50 | json_obj = yinxiang.children 51 | # print(json.dumps(json_obj, indent=4)) 52 | 53 | split_block_result = [ 54 | { 55 | "object": "block", 56 | "type": "code", 57 | "code": { 58 | "rich_text": [ 59 | one_text_obj, one_text_obj, remain_text_obj 60 | ], 61 | "language": "plain text" 62 | } 63 | } 64 | ] 65 | assert json_obj == split_block_result 66 | 67 | 68 | if __name__ == '__main__': 69 | # test_reqlimit() 70 | test_code_reqlimit() 71 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from html2notion.utils import DateStrToISO8601, is_valid_url 3 | 4 | 5 | def test_date_to_ios8601(): 6 | valid_date_pair = ["2018-09-20 10:30:36 +0000", "2023-05-12 03:49:56 +0000"] 7 | 8 | for date_string in valid_date_pair: 9 | expect = datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S %z").astimezone().isoformat() 10 | assert DateStrToISO8601(date_string) 11 | 12 | invalid_date_pair = ["2018-09-20 10:30", "2018-09-20 10:30:36", "2018-09-20 10:30:36+0800"] 13 | for date_string in invalid_date_pair: 14 | assert DateStrToISO8601(date_string) == "" 15 | 16 | 17 | def test_is_valid_url(): 18 | valid_urls = [ 19 | "http://www.example.com", 20 | "https://www.example.com", 21 | "ftp://www.example.com", 22 | "http://localhost", 23 | "http://127.0.0.1", 24 | "http://example.com/path?query#fragment", 25 | ] 26 | 
27 | invalid_urls = [ 28 | "example.com", 29 | "www.example.com", 30 | "http://", 31 | "http:///example.com", 32 | "http://example.com:80:80", # Two port numbers 33 | None, 34 | 123, # Non-string input 35 | "", 36 | ] 37 | 38 | for url in valid_urls: 39 | assert is_valid_url(url) == True, f"Expected {url} to be valid" 40 | 41 | for url in invalid_urls: 42 | assert is_valid_url(url) == False, f"Expected {url} to be invalid" 43 | 44 | 45 | if __name__ == '__main__': 46 | test_date_to_ios8601() 47 | --------------------------------------------------------------------------------