├── .coveragerc ├── .github └── workflows │ └── python-package.yml ├── .gitignore ├── LICENSE ├── README.md ├── README_zh.md ├── config.json ├── demos ├── notion_templage.png ├── yinxiang_clipper.html ├── yinxiang_clipper.json ├── yinxiang_clipper.resources │ ├── svg_1.svg │ ├── svg_10.svg │ ├── svg_11.svg │ ├── svg_12.svg │ ├── svg_13.svg │ ├── svg_2.svg │ ├── svg_3.svg │ ├── svg_4.svg │ ├── svg_5.svg │ ├── svg_6.svg │ ├── svg_7.svg │ ├── svg_8.svg │ └── svg_9.svg ├── yinxiang_clipper_2.html ├── yinxiang_clipper_2.resources │ ├── svg_1.svg │ ├── svg_10.svg │ ├── svg_11.svg │ ├── svg_12.svg │ ├── svg_13.svg │ ├── svg_14.svg │ ├── svg_15.svg │ ├── svg_16.svg │ ├── svg_17.svg │ ├── svg_18.svg │ ├── svg_19.svg │ ├── svg_2.svg │ ├── svg_20.svg │ ├── svg_21.svg │ ├── svg_22.svg │ ├── svg_23.svg │ ├── svg_24.svg │ ├── svg_25.svg │ ├── svg_26.svg │ ├── svg_27.svg │ ├── svg_3.svg │ ├── svg_4.svg │ ├── svg_5.svg │ ├── svg_6.svg │ ├── svg_7.svg │ ├── svg_8.svg │ └── svg_9.svg ├── yinxiang_clipper_wx.html ├── yinxiang_clipper_wx.json ├── yinxiang_gbk.html ├── yinxiang_markdown.html ├── yinxiang_markdown.json ├── yinxiang_markdown.resources │ └── 5BB98FD9-8FA4-481F-AF4E-E3B1F2DD38BC.png ├── yinxiang_mobile.html ├── yinxiang_normal.html ├── yinxiang_normal.resources │ └── 7672861D-5C56-4A07-B0E6-256950F2775A.png ├── yinxiang_normal_format.html ├── yinxiang_notion.png ├── yinxiang_notion2.png └── yinxiang_supernote.html ├── examples ├── insert_divider.ipynb ├── insert_table.ipynb ├── insert_text.ipynb ├── insert_todo.ipynb ├── parse_code.ipynb ├── parse_tag.ipynb └── process_md.ipynb ├── html2notion ├── __init__.py ├── main.py ├── translate │ ├── __init__.py │ ├── batch_import.py │ ├── cos_uploader.py │ ├── html2json.py │ ├── html2json_base.py │ ├── html2json_clipper.py │ ├── html2json_default.py │ ├── html2json_markdown.py │ ├── html2json_yinxiang.py │ ├── import_stats.py │ ├── notion_export.py │ └── notion_import.py └── utils │ ├── __init__.py │ ├── load_config.py │ ├── log.py │ ├── timeutil.py │ └── url_process.py ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── conftest.py ├── test_batchimport.py ├── test_config.py ├── test_cosupload.py ├── test_demos.py ├── test_log.py ├── test_notionexport.py ├── test_reqlimit.py ├── test_util.py └── test_yinxiang.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = 3 | */__init__.py 4 | main.py 5 | 6 | [report] 7 | exclude_lines = 8 | if __name__ == .__main__.: 9 | async def main(.*): 10 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8", "3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest pytest-asyncio pytest-cov 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | pip install -e . 33 | - name: Lint with flake8 34 | run: | 35 | # stop the build if there are Python syntax errors or undefined names 36 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 37 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 38 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 39 | - name: Test with pytest 40 | run: | 41 | pytest --cov=html2notion --cov-config=.coveragerc 42 | env: 43 | notion_api_key: ${{ secrets.NOTION_API_KEY }} 44 | notion_db_id_1: ${{ secrets.NOTION_DATABASE_ID_1 }} 45 | notion_page_id_1: ${{ secrets.NOTION_PAGE_ID_1 }} 46 | cos_secret_id: ${{ secrets.COS_SECRET_ID }} 47 | cos_secret_key: ${{ secrets.COS_SECRET_KEY }} 48 | cos_region: ${{ secrets.COS_REGION }} 49 | cos_bucket: ${{ secrets.COS_BUCKET }} 50 | - name: Upload coverage reports to Codecov 51 | uses: codecov/codecov-action@v3 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | *.pyc 3 | notion_demo/* 4 | build/ 5 | html2notion.egg-info/ 6 | logs/* 7 | .config.json 8 | .DS_Store 9 | dist/* 10 | .coverage 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) SelfBoot 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Html2notion 简体中文

2 |

3 | 4 | CI Test Status 5 | 6 | 7 | Test coverage 8 | 9 |

10 | 11 | html2notion is an incredibly useful tool written in Python, which allows you to import content from HTML documents into Notion notes, making it more convenient for you to organize information on the Notion platform. In addition, html2notion has been specifically optimized for the content of Evernote, and you can also use it to import notes from Evernote into Notion. 12 | 13 | html2notion has powerful features and supports converting various tags in HTML files into corresponding Blocks in Notion, such as rich text blocks, headings, images, code blocks, quotes, links, etc. Below are examples of converting notes from Evernote into Notion pages. 14 | 15 | ![yinxiang notion(simple demos)](https://raw.githubusercontent.com/selfboot/html2notion/master/demos/yinxiang_notion.png) 16 | 17 | ![yinxiang notion2(rich text)](https://raw.githubusercontent.com/selfboot/html2notion/master/demos/yinxiang_notion2.png) 18 | 19 | # Prepare 20 | 21 | You only need 3 steps to use htmlnotion to import HTML into Notion. 22 | 23 | ## Duplicate database 24 | 25 | Click the link [notion template](https://selfboot.notion.site/selfboot/130bb48c6cbd4abbbb713d4d8472481a?v=ddda20d3f46b4b44a055d06792c142f0). As shown in the image below, use the "Duplicate" button to copy a new database to your own Notion workspace. 26 | 27 | ![notion template](https://raw.githubusercontent.com/selfboot/html2notion/master/demos/notion_templage.png) 28 | 29 | ## Install html2notion 30 | 31 | Requires python>=3.8, install the html2notion library. You can use the pip command to install it: 32 | 33 | ``` 34 | pip install html2notion 35 | ``` 36 | 37 | ## Prepare Notion Configuration 38 | 39 | We need to use the `Notion API key` and `Database ID` to authorize html2notion to access the Notion database. Please follow these steps: 40 | 41 | 1. Create an integration; 42 | 2. Share a database with your integration; 43 | 3. Export the database ID; 44 | 45 | When sharing the database here, you need to choose the previously duplicated database because the import operation requires some preset [properties](https://developers.notion.com/reference/property-object) information in this database. 46 | 47 | For specific methods, please refer to the Notion official documentation [create an integration](https://developers.notion.com/docs/create-a-notion-integration). 48 | 49 | After the setup is complete, write your API Key and database ID into a configuration file config.json. 50 | 51 | ```shell 52 | { 53 | "notion": { 54 | "database_id": "<***demo***>", 55 | "api_key": "<***demo***>" 56 | } 57 | } 58 | ``` 59 | 60 | # Usage 61 | 62 | You can use `html2notion -h` to view detailed help documentation. 63 | 64 | ```shell 65 | usage: html2notion [-h] --conf CONF [--log LOG] [--batch BATCH] (--file FILE | --dir DIR) 66 | 67 | Html2notion: Save HTML to your Notion notes quickly and easily, while keeping the original format as much as possible 68 | 69 | options: 70 | -h, --help show this help message and exit 71 | --conf CONF conf file path 72 | --log LOG log direct path 73 | --batch BATCH batch save concurrent limit 74 | --file FILE Save single html file to notion 75 | --dir DIR Save all html files in the dir to notion 76 | ``` 77 | 78 | For example, if you want to import all html files in the `./demos` directory into Notion, you can use the following command: 79 | 80 | ```shell 81 | html2notion --conf config.json --dir ./demos --log ~/logs --batch 10 82 | ``` 83 | 84 | The above command will import all html files in the `./demos` directory into Notion, while outputting logs to the `~/logs` directory, with up to 10 concurrent tasks. 85 | 86 | # More information 87 | 88 | You can find more information and examples in the html2notion library's Issue: [html2notion](https://github.com/selfboot/html2notion/issues) 89 | 90 | ## Contribution 91 | 92 | If you find any errors or have any suggestions for improvement, please do not hesitate to submit a pull request or raise an issue, I am more than happy to accept your contributions and feedback! 93 | 94 | If you encounter import failures, you can submit the html file and log file together in the issue for easier problem identification. 95 | 96 | > If there are any private information in the files, please remove it first. 97 | 98 | 99 | ## License 100 | 101 | This project uses the MIT license. Please refer to the [LICENSE](./LICENSE) for details. 102 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 |

Html2notion English

2 |

3 | 4 | CI Test Status 5 | 6 | 7 | Test coverage 8 | 9 |

10 | 11 | html2notion 是一个非常有用的 Python 写的工具,它可以将 HTML 文档中的内容导入到 Notion 笔记中,从而使您能够更方便地将信息整理到 Notion 平台上。此外,html2notion 对印象笔记的内容进行了专门优化,还可以使用它来将印象笔记中的笔记导入到 Notion 中。 12 | 13 | html2notion 功能非常强大,它支持将 HTML 文件的各种标签转换为 Notion 中对应的 Block,比如富文本块、标题、图片、代码块、引用、链接等。下面是将印象笔记中的笔记转换为 notion page 中的示例。 14 | 15 | ![迁移notion(保留格式)](https://raw.githubusercontent.com/selfboot/html2notion/master/demos/yinxiang_notion.png) 16 | 17 | ![迁移notion2(保留格式)](https://raw.githubusercontent.com/selfboot/html2notion/master/demos/yinxiang_notion2.png) 18 | 19 | # 准备工作 20 | 21 | 只需要3步就可以使用 htmlnotion 来导入 html 到 notion 中。 22 | 23 | ## 复制 notion 数据库 24 | 25 | 点击链接 [notion template](https://selfboot.notion.site/selfboot/130bb48c6cbd4abbbb713d4d8472481a?v=ddda20d3f46b4b44a055d06792c142f0), 如下面的图所示,通过 "Duplicate" 按钮,复制一个新的数据库到自己的notion工作空间。 26 | 27 | ![notion template](https://raw.githubusercontent.com/selfboot/html2notion/master/demos/notion_templage.png) 28 | 29 | ## 安装 html2notion 30 | 需要 python>=3.8, 安装 html2notion 库。您可以使用 pip 命令来安装它: 31 | 32 | ``` 33 | pip install html2notion 34 | ``` 35 | 36 | ## 准备 Notion 配置 37 | 38 | 我们需要使用 Notion API 密钥和数据库 ID 来授权 html2notion 访问 Notion 数据库,请按照以下步骤操作: 39 | 40 | 1. 创建 Integration 41 | 2. 与 Integration 共享数据库 42 | 3. 获取数据库 ID 和 API Key 43 | 44 | 这里共享数据库的时候,要选择前面 Duplicate 的数据库,因为导入操作需要用到这个 database 里面的一些预设 [Properties](https://developers.notion.com/reference/property-object) 信息。 45 | 46 | 具体方法请参考 notion 官方文档 [Create an integration](https://developers.notion.com/docs/create-a-notion-integration)。 47 | 48 | 设置完成后,将自己的 API Key 和数据库 ID 写入到一个配置文件 `config.json`。 49 | 50 | ```shell 51 | { 52 | "notion": { 53 | "database_id": "<***demo***>", 54 | "api_key": "<***demo***>" 55 | } 56 | } 57 | ``` 58 | 59 | # 使用 60 | 61 | 可以使用 `html2notion -h` 查看详细的帮助文档; 62 | 63 | ``` 64 | usage: html2notion [-h] --conf CONF [--log LOG] [--batch BATCH] (--file FILE | --dir DIR) 65 | 66 | Html2notion: Save HTML to your Notion notes quickly and easily, while keeping the original format as much as possible 67 | 68 | options: 69 | -h, --help show this help message and exit 70 | --conf CONF conf file path 71 | --log LOG log direct path 72 | --batch BATCH batch save concurrent limit 73 | --file FILE Save single html file to notion 74 | --dir DIR Save all html files in the dir to notion 75 | ``` 76 | 77 | 比如要将路径 `./demos` 下的所有 html 文件导入到 notion 中,可以使用如下命令: 78 | 79 | ```shell 80 | html2notion --conf config.json --dir ./demos --log ~/logs --batch 10 81 | ``` 82 | 83 | 上面命令会将 `./demos` 目录下的所有 html 文件导入到 notion 中,同时会将日志输出到 `~/logs` 目录下,最多有 10 个并发任务。 84 | 85 | # 更多信息 86 | 87 | 您可以在 html2notion 库的 GitHub 存储库中找到更多的信息和示例:[html2notion](https://github.com/kevinzg/html2notion) 88 | 89 | ## 贡献 90 | 91 | 如果您发现了任何错误或有任何改进意见,请不要犹豫,提交一个 pull request 或提出一个 issue,我很乐意接受您的贡献和反馈! 92 | 93 | 如果遇到导入失败,可以将 html 文件和日志文件一起提交到 issue 中,方便定位问题。 94 | 95 | > 如果 html 文件中有隐私信息,请先删除。 96 | 97 | ## 许可证 98 | 99 | 此项目使用 MIT 许可证。详情请参阅 [LICENSE](./LICENSE) 文件。 100 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "notion": { 3 | "database_id": "********", 4 | "api_key": "********" 5 | }, 6 | 7 | "cos": { 8 | "secret_id": "********", 9 | "secret_key": "********", 10 | "region": "********", 11 | "bucket": "********" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /demos/notion_templage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/demos/notion_templage.png -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_1.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_10.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_11.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_12.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_13.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_2.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_3.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_4.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_5.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_6.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_7.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_8.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper.resources/svg_9.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_1.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_10.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_11.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_12.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_13.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_14.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_15.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_16.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_17.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_18.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_19.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_2.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_20.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_21.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_22.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_23.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_24.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_25.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_26.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_27.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_3.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_4.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_5.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_6.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_7.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_8.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_clipper_2.resources/svg_9.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /demos/yinxiang_gbk.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/demos/yinxiang_gbk.html -------------------------------------------------------------------------------- /demos/yinxiang_markdown.resources/5BB98FD9-8FA4-481F-AF4E-E3B1F2DD38BC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/demos/yinxiang_markdown.resources/5BB98FD9-8FA4-481F-AF4E-E3B1F2DD38BC.png -------------------------------------------------------------------------------- /demos/yinxiang_mobile.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | YinXiang Mobile
su

dd if=/sdcard/twrp.img of=/dev/block/platform/msm_sdcc.1/by-name/recovery

-------------------------------------------------------------------------------- /demos/yinxiang_normal.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | yinxiang_normal
Overview

The OpenAI API can be applied to virtually any task that involves understanding or generating natural language, code, or images. We offer a spectrum of models with different levels of power suitable for different tasks, as well as the ability to fine-tune your own custom models. These models can be used for everything from content generation to semantic search and classification.

The API is powered by a set of models with different capabilities and price points. GPT-4 is our latest and most powerful model. GPT-3.5-Turbo is the model that powers ChatGPT and is optimized for conversational formats. To learn more about these models and what else we offer, visit our models documentation.

This is Red text, Green text, Gray textPurple textOrange textYellow text. This is link content, underline text, bold text, strikethrough text, italic text.

This is Red text, Green text, Gray textPurple textOrange textYellow text. This is link content, underline text, bold text, strikethrough text, italic text. (Content by copy)

Quote test

Our models understand and process text by breaking it down into tokens. Tokens can be words or just chunks of characters. For example, the word “hamburger” gets broken up into the tokens “ham”, “bur” and “ger”, while a short and common word like “pear” is a single token. Many tokens start with a whitespace, for example “ hello” and “ bye”.


The number of tokens processed in a given API request depends on the length of both your inputs and outputs. As a rough rule of thumb, 1 token is approximately 4 characters or 0.75 words for English text. One limitation to keep in mind is that your text prompt and generated completion combined must be no more than the model's maximum context length (for most models this is 2048 tokens, or about 1500 words). Check out our tokenizer tool to learn more about how text translates to tokens.

Quote 1
2
3
Read 4
5
6. OpenAI’s mission is to create artificial intelligence systems that benefit everyone. To that end, we invest heavily in research and engineering to ensure our AI systems are safe and secure. However, as with any complex technology, we understand that vulnerabilities and flaws can emerge.

List test
  1. You are authorized to perform testing in compliance with this policy.
  2. Follow this policy and any other relevant agreements. In case of inconsistency, this policy takes precedence.
  3. Promptly report discovered vulnerabilities.

As part of this policy, we commit to:

Local Image


ToDo List (with divider)


 Choose classes that map to a single token. At inference time, specify max_tokens=1 since you only need the first token for classification.
 Use a separator at the end of the prompt, e.g. \n\n###\n\n.Remember to also append this separator when you eventually make requests to your model.
 Ensure that the prompt + completion doesn't exceed 2048 tokens, including the separator

Table test
Row 1: Role, Read, Normal
Row 1: Bold Text. Undownline
Row 1: In classification problems:
  • List 1.
  • List 2
Code here;
Row 2: System
Row 2:You are a helpful assistant.
Row 2: none



-------------------------------------------------------------------------------- /demos/yinxiang_normal.resources/7672861D-5C56-4A07-B0E6-256950F2775A.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/demos/yinxiang_normal.resources/7672861D-5C56-4A07-B0E6-256950F2775A.png -------------------------------------------------------------------------------- /demos/yinxiang_normal_format.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | yinxiang_normal_format 4 |
趣味篇
5 |
6 |

7 |
1. 唐僧其实是一个俗人。悟空才是天生有慧根。每当唐僧在路上唉声叹气取经苦,都是悟空引经据典地安慰他。比如说三十二回中:
8 |

9 | 10 | 11 | 14 | 15 |
12 |
师徒们正行赏间,又见一山挡路。唐僧道徒弟们仔细,前遇山高,恐有虎狼阻挡行者道:师父,出家人莫说在家话。你记得那乌巢和尚的《心经》云心无挂碍,无挂碍,方无恐怖,远离颠倒梦想之言?但只是扫除心上垢,洗净耳边尘。不受苦中苦,难为人上人。你莫生忧虑,但有老孙,就是塌下天来,可保无事。怕什么虎狼
13 |
16 |

17 |
其实唐僧最俗的地方还可以见诸很多细节,比如说,作为一个出家人,唐僧从来不在意别人杀生。当然这个生指的是动物。比如说刚刚收了孙悟空,路遇一大老虎,被孙悟空一棒打死,唐僧的关注点是:天啊!之前那个猎户这么厉害,还跟老虎斗了半天,这猴子竟然能一棒打死一只!
18 |

19 |
当然如果悟空杀人的话唐僧是会怒的,但是他的关注点仍然和正常的出家人不一样:
20 |
他会先吟两句诗以示出家人的慈悲为怀:
21 |

22 | 23 | 24 | 27 | 28 |
25 |
扫地恐伤蝼蚁命,爱惜飞蛾纱罩灯。
26 |
29 |

30 |
当然如前所述,如果孙悟空真的只是打杀了蝼蚁或飞蛾,他是不会管的。随后唐僧说的话就很有趣了:
31 |

32 | 33 | 34 | 37 | 38 |
35 |
早还是山野中无人查考,若到城市,倘有人一时冲撞了你,你也行凶,执着棍子,乱打伤人,我可做得白客,怎能脱身?
36 |
39 |

40 |
看见没,他关心的是你乱杀人连累了我怎么办?又道:此事若告到官,就是你老子做官,也说不过去 。
41 |
42 |

43 | 这就更有趣了,说的是就算你爸是李刚也没用。这样一套对话在西游记里出现了可不止一次,算是唐僧的经典演讲路线了。这思维哪里是圣僧,分明是一个封建小农啊。
44 |

45 |
2. 再说说唐僧什么时候念紧箍咒:不是在悟空犯错的时候,而是在悟空令他不爽的时候。比如说遇到一个喜欢收集袈裟的和尚,当时悟空入世未深,想要把师傅的袈裟拿出来让人家开开眼界,反倒是唐僧说对方似心怀险恶,不要露富。(顺便插一句,从这里就可以看出唐僧其实懂得人心险恶,根本没有大多数人以为的那样天真!那他为什么总是看不出孙悟空说的是真话?这又是另一个故事了。)结果孙悟空不听呀,必须要炫耀呀,最后衣服被偷了呀。唐僧一听自己的宝贝晚礼服丢了,那个气呀!当场把紧箍咒好好念了个几遍。
46 |

47 |
再比如说但凡猪八戒撺掇唐僧念咒,唐僧多半是会念的。
48 |

49 |
对比之下,孙悟空推倒人参果树,这错够大了吧?唐僧的反应竟然是:矮油,这是你的不对啦,赶快跟道长道个歉啦。孙悟空说:哼!于是唐僧只好带着徒弟拔腿就跑。
50 |

51 |
心酸篇
52 |
53 |
54 |

55 |
我看西游记的时候是很心疼孙悟空的,三个徒弟里他最爱师傅,师傅却永远最不相信他,简直是太虐了。 56 |

57 |
58 |
1. 唐僧很少撒谎,但在要孙悟空带上金箍的时候骗了他。
59 |
60 |

61 |
62 |
2. 孙悟空三打白骨精,被唐僧赶走了。很多人想必还记得他临走前要拜唐僧,唐僧不受,于是他变作四个围着唐僧拜了一拜吧?拜完之后,悟空独自一人返回了花果山。
63 |
64 |

65 |
66 |
你看他忍气别了师父,纵筋斗云,径回花果山水帘洞去了。独自个凄凄惨惨,忽闻得水声聒耳,大圣在那半空里看时,原来是东洋大海潮发的声响。一见了,又想起唐僧,止不住腮边泪坠,停云住步,良久方去。
67 |
68 |

69 | 3. 这次被赶走应是孙悟空人生中的一个转折点。在那之前,看见妖怪他只会直接一棒子打死,指着一坨人形肉泥说:师傅!这是妖怪!但再那之后,他明白了世事没有那么单纯,好好的一个少年从此就踏上了腹黑之路,唉。 70 |

71 |
72 |
4. 孙悟空被赶走没多久,唐僧遇险,剩下两徒弟加白龙马都无计可施,白龙马苦劝之下猪八戒回去找大师兄。孙悟空明知猪八戒来意,但是装作不知,只带他逛花果山,享受着猪八戒的心焦,直到猪八戒忍不住暴露来意,才顺水推舟地勉强答应。这时候的孙悟空已经和被赶走之前的不太一样了。
73 |

74 |
但是他一面跟八戒说,我就是看在妖怪太嚣张的份上帮你们一把,事了还回来做我的猴王。一面却跟花果山的小猴们说:天底下谁不知道我是唐僧的徒弟,我跟师傅感情好着呢,只是他看我想家,让我回来玩两天,现在我要回去陪他取经啦。(真是太傲娇了呀)
75 |

76 |
等救了唐僧出来,唐僧感谢他,假装不记得曾经赶走他的事,孙悟空也绝口不提,就仿佛什么都没发生过一样,师徒四人照常上路了。 77 |

78 | 5. 但是还是可以很明显地看到孙悟空的变化。下一次遇到妖怪,他不再冲锋陷阵了。他打算诳猪八戒先去试试深浅,反正就算被捉住了他也有自信救回来,这样还能显得他比较有本事。于是他首先问猪八戒:照顾师傅和探路你愿做哪一个?照顾师傅嘛,你要陪他上厕所、扶他走路、还要负责喂饱他,饿了瘦了的话你就给我等着。 79 |

80 |
81 |
猪八戒一听,当然是探路去啊!孙悟空料到猪八戒不会好好干活,变成虫鸟跟着他,并把他偷懒的劣迹回来一一汇报给唐僧,总算博得了唐僧的一次信任。当徒弟累成这样,也是不容易呀。
82 |

83 |

84 |
附录篇
85 |
86 |
87 |
趣味篇第一条见第十四回:
88 |

89 | 90 | 91 | 94 | 95 |
92 |
那只虎蹲着身,伏在尘埃,动也不敢动动。却被他照头一棒,就打的脑浆迸万点桃红,牙齿喷几珠玉块,唬得那陈玄奘滚鞍落马,咬指道声天哪,天哪!刘太保前日打的斑斓虎,还与他斗了半日。今日孙悟空不用争持,把这虎一棒打得稀烂,正是强中更有强中手
93 |
96 |

97 |
心酸篇第一条还是第十四回:
98 |

99 | 100 | 101 | 104 | 105 |
102 |
行者去解开包袱,在那包裹中间见有几个粗面烧饼,拿出来递与师父。又见那光艳艳的一领绵布直裰,一顶嵌金花帽,行者道这衣帽是东土带来的?三藏就顺口儿答应道是我小时穿戴的。这帽子若戴了,不用教经,就会念经;这衣服若穿了,不用演礼,就会行礼行者道好师父,把与我穿戴了罢三藏道只怕长短不一,你若穿得,就穿了罢行者遂脱下旧白布直裰,将绵布直裰穿上,也就是比量着身体裁的一般,把帽儿戴上。三藏见他戴上帽子,就不吃干粮,却默默的念那紧箍咒一遍。行者叫道头痛,头痛那师父不住的又念了几遍,把个行者痛得打滚,抓破了嵌金的花帽。
103 |
106 |

107 |
心酸篇第四条见第三十一回:
108 |

109 | 110 | 111 | 114 | 115 |
112 |
行者道贤弟,你起来。不是我去不成,既是妖精敢骂我,我就不能不降他,我和你去。老孙五百年前大闹天宫,普天的神将看见我,一个个控背躬身,口口称呼大圣。这妖怪无礼,他敢背前面后骂我!我这去,把他拿住,碎尸万段,以报骂我之仇!报毕,我即回来八戒道哥哥,正是,你只去拿了妖精,报了你仇,那时来与不来,任从尊意。 那猴才跳下崖,撞入洞里,脱了妖衣,整一整锦直裰,束一束虎皮裙,执了铁棒,径出门来。慌得那群猴拦住道大圣爷爷,你往那里去?带挈我们耍子几年也好行者道小的们,你说那里话!我保唐僧的这桩事,天上地下,都晓得孙悟空是唐僧的徒弟。他倒不是赶我回来,倒是教我来家看看,送我来家自在耍子。如今只因这件事,你们却都要仔细看守家业,依时插柳栽松,毋得废坠,待我还去保唐僧,取经回东土。功成之后,仍回来与你们共乐天真众猴各各领命。
113 |
116 |

117 |
118 |

119 | 120 | 121 | 124 | 125 |
122 |
长老现了原身,定性睁睛,才认得是行者,一把搀住道:悟空!你从那里来也?沙僧侍立左右,把那请行者降妖精,救公主,解虎气,并回朝上项事,备陈了一遍。三藏谢之不尽道贤徒,亏了你也,亏了你也!这一去,早诣西方,径回东土,奏唐王,你的功劳第一行者笑道莫说莫说!但不念那话儿,足感爱厚之情也国王闻此言,又劝谢了他四众,整治素筵,大开东阁。他师徒受了皇恩,辞王西去。
123 |
126 |

127 |
心酸篇第五条见第三十二回:
128 |

129 | 130 | 131 | 134 | 135 |
132 |
行者闻言,把功曹叱退,切切在心,按云头,径来山上。只见长老与八戒、沙僧,簇拥前进,他却暗想我若把功曹的言语实实告诵师父,师父他不济事,必就哭了;假若不与他实说,梦着头,带着他走,常言道乍入芦圩,不知深浅。倘或被妖魔捞去,却不又要老孙费心?且等我照顾八戒一照顾,先着他出头与那怪打一仗看。若是打得过他,就算他一功;若是没手段,被怪拿去,等老孙再去救他不迟,却好显我本事出名。
133 |
136 |

137 |
138 |

139 | 140 | 141 | 144 | 145 |
142 |
呆子真个对行者说道:哥哥,你教我做甚事?行者道第一件是看师父,第二件是去巡山。八戒道看师父是坐,巡山去是走。终不然教我坐一会又走,走一会又坐,两处怎么顾盼得来?行者道:不是教你两件齐干,只是领了一件便八戒又笑道:这等也好计较。但不知看师父是怎样,巡山是怎样,你先与我讲讲,等我依个相应些儿的去干罢行者道看师父啊,师父去出恭,你伺候;师父要走路,你扶持;师父要吃斋,你化斋。若他饿了些儿,你该打;黄了些儿脸皮,你该打;瘦了些儿形骸,你该打
143 |
146 |

147 | -------------------------------------------------------------------------------- /demos/yinxiang_notion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/demos/yinxiang_notion.png -------------------------------------------------------------------------------- /demos/yinxiang_notion2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/demos/yinxiang_notion2.png -------------------------------------------------------------------------------- /demos/yinxiang_supernote.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | yinxiang_supernote

Introduction


Overview

The OpenAI API can be applied to virtually any task that involves understanding or generating natural language, code, or images. We offer a spectrum of models with different levels of power suitable for different tasks, as well as the ability to fine-tune your own custom models. These models can be used for everything from content generation to semantic search and classification.


Key concepts

Quote from here:

We recommend completing our quickstart tutorial to get acquainted with key concepts through a hands-on, interactive example.

First, you’ll need a prompt that makes it clear what you want. Let’s start with an instruction. Submit this prompt to generate your first completion.
Prompts: Designing your prompt is essentially how you “program” the model, usually by providing some instructions or a few examples.

Next steps

Code

import os 4 | print("hello")

TODO List



Table
Row 1: Role, Read, Normal
Row 1: Bold Text. Undownline
Row 1: Link here: https://platform.openai.com/docs/guides/chat/introduction
Row 1: In classification problems:
  • List 1.
  • List 2
Code here;
Row 2: System
Row 2:You are a helpful assistant.
Row 2: none

Animal
Names
Column
Column2
Cat
Captain
pading
agagin
Dog
Ruff the Protector
null


-------------------------------------------------------------------------------- /examples/insert_divider.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "Initialize the API key and database ID of Notion." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import json\n", 18 | "config = {}\n", 19 | "with open('../.config.json', \"r\") as f:\n", 20 | " json_conf = json.load(f)\n", 21 | " config.update(json_conf)\n", 22 | "\n", 23 | "notion_api_key = config['notion']['api_key']\n", 24 | "database_id = config['notion']['database_id']\n", 25 | "\n", 26 | "print(notion_api_key, database_id)" 27 | ] 28 | }, 29 | { 30 | "attachments": {}, 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Ref\n", 35 | "https://developers.notion.com/reference/block#to-do" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from notion_client import Client\n", 45 | "notion = Client(auth=notion_api_key)\n", 46 | "parent = {\"type\": \"database_id\", \"database_id\": database_id}\n", 47 | "\n", 48 | "properties = {\"Title\": {\"title\": [{\"text\": {\"content\": \"Page with divider\"}}]}}\n", 49 | "\n", 50 | "divider_block = [\n", 51 | " {\n", 52 | " \"type\": \"divider\",\n", 53 | " \"divider\": {}\n", 54 | " }\n", 55 | "]\n", 56 | "\n", 57 | "created_page = notion.pages.create(\n", 58 | " parent=parent,\n", 59 | " properties=properties,\n", 60 | " children=divider_block\n", 61 | ")\n", 62 | "\n", 63 | "from typing import Dict, Any, cast\n", 64 | "created_page = cast(Dict[str, Any], created_page)\n", 65 | "print(f'page_id = {created_page[\"id\"]}')" 66 | ] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "openai", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.11.2" 86 | }, 87 | "orig_nbformat": 4 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 2 91 | } 92 | -------------------------------------------------------------------------------- /examples/insert_table.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "Initialize the API key and database ID of Notion." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import json\n", 18 | "config = {}\n", 19 | "with open('../.config.json', \"r\") as f:\n", 20 | " json_conf = json.load(f)\n", 21 | " config.update(json_conf)\n", 22 | "\n", 23 | "notion_api_key = config['notion']['api_key']\n", 24 | "database_id = config['notion']['database_id']\n", 25 | "\n", 26 | "print(notion_api_key, database_id)" 27 | ] 28 | }, 29 | { 30 | "attachments": {}, 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Create a table, note that when creating it, you can directly insert table rows. However, when reading it, you actually need to first obtain the blockid of the table in the page and then retrieve its children blocks.\n", 35 | "\n", 36 | "https://developers.notion.com/reference/block#table\n", 37 | "https://developers.notion.com/changelog/simple-table-support" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "from notion_client import Client\n", 47 | "notion = Client(auth=notion_api_key)\n", 48 | "parent = {\"type\": \"database_id\", \"database_id\": database_id}\n", 49 | "\n", 50 | "properties = {\"Title\": {\"title\": [{\"text\": {\"content\": \"Page with table\"}}]}}\n", 51 | "\n", 52 | "one_row = {\n", 53 | " \"type\": \"table_row\",\n", 54 | " \"table_row\": {\n", 55 | " \"cells\": [\n", 56 | " [\n", 57 | " {\n", 58 | " \"type\": \"text\",\n", 59 | " \"text\": {\n", 60 | " \"content\": \"column 1 content\",\n", 61 | " },\n", 62 | " \"plain_text\": \"column 1 content\",\n", 63 | " }\n", 64 | " ],\n", 65 | " [\n", 66 | " {\n", 67 | " \"type\": \"text\",\n", 68 | " \"text\": {\n", 69 | " \"content\": \"column 2 content\",\n", 70 | "\n", 71 | " },\n", 72 | " \"plain_text\": \"column 2 content\",\n", 73 | " }\n", 74 | " ]\n", 75 | " ]\n", 76 | " }\n", 77 | "}\n", 78 | "children = [\n", 79 | " {\n", 80 | " \"table\": {\n", 81 | " \"has_row_header\": False,\n", 82 | " \"has_column_header\": False,\n", 83 | " \"table_width\": 2,\n", 84 | " \"children\": [one_row],\n", 85 | " }\n", 86 | " }\n", 87 | "]\n", 88 | "\n", 89 | "created_page = notion.pages.create(\n", 90 | " parent=parent,\n", 91 | " properties=properties,\n", 92 | " children=children\n", 93 | ")\n", 94 | "\n", 95 | "from typing import Dict, Any, cast\n", 96 | "created_page = cast(Dict[str, Any], created_page)\n", 97 | "print(f'page_id = {created_page[\"id\"]}')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from notion_client import Client\n", 107 | "notion = Client(auth=notion_api_key)\n", 108 | "parent = {\"type\": \"database_id\", \"database_id\": database_id}\n", 109 | "\n", 110 | "properties = {\"Title\": {\"title\": [{\"text\": {\"content\": \"Page with table(has header)\"}}]}}\n", 111 | "\n", 112 | "children = [\n", 113 | " {\n", 114 | " \"table\": {\n", 115 | " \"has_row_header\": False,\n", 116 | " \"has_column_header\": True,\n", 117 | " \"table_width\": 2,\n", 118 | " \"children\": [one_row],\n", 119 | " }\n", 120 | " }\n", 121 | "]\n", 122 | "\n", 123 | "for i in range(1, 4):\n", 124 | " one_row = {\n", 125 | " \"type\": \"table_row\",\n", 126 | " \"table_row\": {\n", 127 | " \"cells\": [\n", 128 | " [\n", 129 | " {\n", 130 | " \"type\": \"text\",\n", 131 | " \"text\": {\n", 132 | " \"content\": f\"column 1 content {i}\",\n", 133 | " },\n", 134 | " \"plain_text\": f\"column 1 content {i}\",\n", 135 | " }\n", 136 | " ],\n", 137 | " [\n", 138 | " {\n", 139 | " \"type\": \"text\",\n", 140 | " \"text\": {\n", 141 | " \"content\": f\"column 2 content {i}\",\n", 142 | "\n", 143 | " },\n", 144 | " \"plain_text\": f\"column 2 content {i}\",\n", 145 | " }\n", 146 | " ]\n", 147 | " ]\n", 148 | " }\n", 149 | " }\n", 150 | " children[0][\"table\"][\"children\"].append(one_row)\n", 151 | "\n", 152 | "created_page = notion.pages.create(\n", 153 | " parent=parent,\n", 154 | " properties=properties,\n", 155 | " children=children\n", 156 | ")\n", 157 | "\n", 158 | "from typing import Dict, Any, cast\n", 159 | "created_page = cast(Dict[str, Any], created_page)\n", 160 | "print(f'page_id = {created_page[\"id\"]}')" 161 | ] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "openai", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.11.0" 181 | }, 182 | "orig_nbformat": 4 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /examples/insert_text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "Initialize the API key and database ID of Notion." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import json\n", 18 | "config = {}\n", 19 | "with open('../.config.json', \"r\") as f:\n", 20 | " json_conf = json.load(f)\n", 21 | " config.update(json_conf)\n", 22 | "\n", 23 | "notion_api_key = config['notion']['api_key']\n", 24 | "database_id = config['notion']['database_id']\n", 25 | "\n", 26 | "print(notion_api_key, database_id)" 27 | ] 28 | }, 29 | { 30 | "attachments": {}, 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Limits for property values\n", 35 | "Rich text object\ttext.content\t2000 characters\n", 36 | "\n", 37 | "https://developers.notion.com/reference/request-limits\n", 38 | "https://developers.notion.com/reference/rich-textupport" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from notion_client import Client\n", 48 | "notion = Client(auth=notion_api_key)\n", 49 | "parent = {\"type\": \"database_id\", \"database_id\": database_id}\n", 50 | "properties = {\"Title\": {\"title\": [{\"text\": {\"content\": \"Page with multi text\"}}]}}\n", 51 | "text_content = \"Some words\" \n", 52 | "text_block = {\n", 53 | " \"type\": \"text\",\n", 54 | " \"text\": {\n", 55 | " \"content\": text_content,\n", 56 | " },\n", 57 | " \"annotations\": {\n", 58 | " \"bold\": False,\n", 59 | " \"italic\": False,\n", 60 | " \"strikethrough\": False,\n", 61 | " \"underline\": False,\n", 62 | " \"code\": False,\n", 63 | " \"color\": \"default\"\n", 64 | " },\n", 65 | " \"plain_text\": text_content,\n", 66 | " \"href\": None\n", 67 | "}\n", 68 | "\n", 69 | "equation = {\n", 70 | " \"type\": \"equation\",\n", 71 | " \"equation\": {\n", 72 | " \"expression\": \"e=mc^2\"\n", 73 | " }\n", 74 | "}\n", 75 | "\n", 76 | "children = [{\n", 77 | " \"object\": \"block\",\n", 78 | " \"type\": \"paragraph\",\n", 79 | " \"paragraph\": {\n", 80 | " \"rich_text\": [text_block, equation]\n", 81 | " }\n", 82 | "}]\n", 83 | "\n", 84 | "created_page = notion.pages.create(\n", 85 | " parent=parent,\n", 86 | " properties=properties,\n", 87 | " children=children\n", 88 | ")\n", 89 | "\n", 90 | "from typing import Dict, Any, cast\n", 91 | "created_page = cast(Dict[str, Any], created_page)\n", 92 | "print(f'page_id = {created_page[\"id\"]}')" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "from notion_client import Client\n", 102 | "notion = Client(auth=notion_api_key)\n", 103 | "parent = {\"type\": \"database_id\", \"database_id\": database_id}\n", 104 | "properties = {\"Title\": {\"title\": [{\"text\": {\"content\": \"Page with image\"}}]}}\n", 105 | "image_block = {\n", 106 | " \"object\": \"block\",\n", 107 | " \"type\": \"image\",\n", 108 | " \"image\": {\n", 109 | " \"type\": \"external\",\n", 110 | " \"external\": {\n", 111 | " \"url\": \"https://raw.githubusercontent.com/selfboot/html2notion/master/demos/notion_templage.png\"\n", 112 | " }\n", 113 | " }\n", 114 | "}\n", 115 | "\n", 116 | "children = [image_block]\n", 117 | "\n", 118 | "created_page = notion.pages.create(\n", 119 | " parent=parent,\n", 120 | " properties=properties,\n", 121 | " children=children\n", 122 | ")\n", 123 | "\n", 124 | "from typing import Dict, Any, cast\n", 125 | "created_page = cast(Dict[str, Any], created_page)\n", 126 | "print(f'page_id = {created_page[\"id\"]}')\n" 127 | ] 128 | } 129 | ], 130 | "metadata": { 131 | "kernelspec": { 132 | "display_name": "openai", 133 | "language": "python", 134 | "name": "python3" 135 | }, 136 | "language_info": { 137 | "codemirror_mode": { 138 | "name": "ipython", 139 | "version": 3 140 | }, 141 | "file_extension": ".py", 142 | "mimetype": "text/x-python", 143 | "name": "python", 144 | "nbconvert_exporter": "python", 145 | "pygments_lexer": "ipython3", 146 | "version": "3.11.2" 147 | }, 148 | "orig_nbformat": 4 149 | }, 150 | "nbformat": 4, 151 | "nbformat_minor": 2 152 | } 153 | -------------------------------------------------------------------------------- /examples/insert_todo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": {}, 7 | "source": [ 8 | "Initialize the API key and database ID of Notion." 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import json\n", 18 | "config = {}\n", 19 | "with open('../.config.json', \"r\") as f:\n", 20 | " json_conf = json.load(f)\n", 21 | " config.update(json_conf)\n", 22 | "\n", 23 | "notion_api_key = config['notion']['api_key']\n", 24 | "database_id = config['notion']['database_id']\n", 25 | "\n", 26 | "print(notion_api_key, database_id)" 27 | ] 28 | }, 29 | { 30 | "attachments": {}, 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "Ref\n", 35 | "https://developers.notion.com/reference/block#to-do" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from notion_client import Client\n", 45 | "notion = Client(auth=notion_api_key)\n", 46 | "parent = {\"type\": \"database_id\", \"database_id\": database_id}\n", 47 | "\n", 48 | "properties = {\"Title\": {\"title\": [{\"text\": {\"content\": \"Page with todo\"}}]}}\n", 49 | "\n", 50 | "todo_block = [{\n", 51 | " \"type\": \"to_do\",\n", 52 | " \"to_do\": {\n", 53 | " \"rich_text\": [{\n", 54 | " \"type\": \"text\",\n", 55 | " \"text\": {\n", 56 | " \"content\": \"Finish Q3 goals\",\n", 57 | " }\n", 58 | " },\n", 59 | " {\n", 60 | " \"plain_text\": \"Goals detail:\\n\",\n", 61 | " \"text\": {\n", 62 | " \"content\": \"Goals detail:\\n\"\n", 63 | " },\n", 64 | " \"type\": \"text\"\n", 65 | " },\n", 66 | " {\n", 67 | " \"plain_text\": \"You are a helpful assistant.\",\n", 68 | " \"text\": {\n", 69 | " \"content\": \"You are a helpful assistant.\"\n", 70 | " },\n", 71 | " \"type\": \"text\",\n", 72 | " \"annotations\": {\n", 73 | " \"color\": \"green\"\n", 74 | " }\n", 75 | " }\n", 76 | " ],\n", 77 | " \"checked\": False,\n", 78 | " \"color\": \"default\"\n", 79 | " }\n", 80 | "}]\n", 81 | "\n", 82 | "created_page = notion.pages.create(\n", 83 | " parent=parent,\n", 84 | " properties=properties,\n", 85 | " children=todo_block\n", 86 | ")\n", 87 | "\n", 88 | "from typing import Dict, Any, cast\n", 89 | "created_page = cast(Dict[str, Any], created_page)\n", 90 | "print(f'page_id = {created_page[\"id\"]}')" 91 | ] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "openai", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.11.0" 111 | }, 112 | "orig_nbformat": 4 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 2 116 | } 117 | -------------------------------------------------------------------------------- /examples/parse_code.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from bs4 import BeautifulSoup, NavigableString\n", 10 | "\n", 11 | "html_doc = \"\"\"\n", 12 | " \n", 33 | " 1\n", 34 | "2\n", 35 | "3\n", 36 | "4\n", 37 | "5\n", 38 | "6\n", 39 | "7\n", 40 | "8\n", 41 | "9\n", 42 | "10\n", 43 | "11\n", 44 | "12\n", 45 | "\n", 46 | "# Note: you need to be using OpenAI Python v0.27.0 for the code below to workimport openai\n", 48 | "\n", 49 | "openai.ChatCompletion.create(\n", 50 | " model=\"gpt-3.5-turbo\",\n", 51 | " messages=[\n", 52 | " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", 53 | " {\"role\": \"user\", \"content\": \"Who won the world series in 2020?\"},\n", 54 | " {\"role\": \"assistant\", \"content\": \"The Los Angeles Dodgers won the World Series in 2020.\"},\n", 55 | " {\"role\": \"user\", \"content\": \"Where was it played?\"}\n", 56 | " ]\n", 57 | ")\n", 58 | "\"\"\"\n", 59 | "soup = BeautifulSoup(html_doc, 'html.parser')\n", 60 | "\n", 61 | "# 找到所有的
标签\n",
 62 |     "pre_tags = soup.find_all('pre')\n",
 63 |     "\n",
 64 |     "for pre in pre_tags:\n",
 65 |     "    # 在每个
标签中找到标签\n",
 66 |     "    code_tags = pre.find_all('code')\n",
 67 |     "    \n",
 68 |     "    for code in code_tags:\n",
 69 |     "        # 检查标签是否包含行号,这里假设行号是在标签中的数字\n",
 70 |     "        span_tags = code.find_all('span')\n",
 71 |     "        \n",
 72 |     "        for span in span_tags:\n",
 73 |     "            if span.string and span.string.strip().isdigit():\n",
 74 |     "                # 如果是行号,则删除这个标签\n",
 75 |     "                span.decompose()\n",
 76 |     "\n",
 77 |     "# 这时,soup中的HTML已经没有行号了\n",
 78 |     "print(soup.prettify())\n"
 79 |    ]
 80 |   }
 81 |  ],
 82 |  "metadata": {
 83 |   "kernelspec": {
 84 |    "display_name": "notion",
 85 |    "language": "python",
 86 |    "name": "python3"
 87 |   },
 88 |   "language_info": {
 89 |    "codemirror_mode": {
 90 |     "name": "ipython",
 91 |     "version": 3
 92 |    },
 93 |    "file_extension": ".py",
 94 |    "mimetype": "text/x-python",
 95 |    "name": "python",
 96 |    "nbconvert_exporter": "python",
 97 |    "pygments_lexer": "ipython3",
 98 |    "version": "3.11.2"
 99 |   },
100 |   "orig_nbformat": 4
101 |  },
102 |  "nbformat": 4,
103 |  "nbformat_minor": 2
104 | }
105 | 


--------------------------------------------------------------------------------
/examples/parse_tag.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from bs4 import BeautifulSoup, NavigableString\n",
 10 |     "\n",
 11 |     "html = '''\n",
 12 |     "
\n", 13 | "

\n", 14 | " \n", 15 | " \n", 16 | " \n", 17 | " \n", 20 | " \n", 23 | " \n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | "
\n", 18 | "
Row 1: You are a helpful assistant. Remember it.
\n", 19 | "
\n", 21 | " \n", 22 | " \n", 24 | "
Row 1: Import Content Read more.
\n", 25 | "
Row 2:
Row 2:
Row 2:
\n", 34 | "

\n", 35 | "
\n", 36 | "'''\n", 37 | "\n", 38 | "def extract_text_and_parents(tag, parents=[]):\n", 39 | " results = []\n", 40 | " for child in tag.children:\n", 41 | " if isinstance(child, NavigableString):\n", 42 | " if child.strip():\n", 43 | " text = child.strip()\n", 44 | " parent_tags = [{\"name\": p.name, \"attrs\": p.attrs} for p in parents + [tag]]\n", 45 | " results.append({\"text\": text, \"parent_tags\": parent_tags})\n", 46 | " else:\n", 47 | " results.extend(extract_text_and_parents(child, parents + [tag]))\n", 48 | " return results\n", 49 | "\n", 50 | "soup = BeautifulSoup(html, 'html.parser')\n", 51 | "td_tags = soup.find_all('td')\n", 52 | "\n", 53 | "for i, td in enumerate(td_tags, 1):\n", 54 | " text_with_parents = extract_text_and_parents(td)\n", 55 | " print(f\"Text and parent tags in TD {i}:\")\n", 56 | " for item in text_with_parents:\n", 57 | " print(f\"Text: {item['text']}\")\n", 58 | " print(\"Parent tags:\")\n", 59 | " for parent in item[\"parent_tags\"]:\n", 60 | " print(f\" Tag: {parent['name']}, Attributes: {parent['attrs']}\")\n", 61 | " print()\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "from bs4 import BeautifulSoup\n", 71 | "from html2notion.translate.html2json_base import Html2JsonBase\n", 72 | "content = \"\"\"

web image:

\n", 73 | "\"\"\"\n", 74 | "tag = BeautifulSoup(content, 'html.parser').find('p')\n", 75 | "text_and_parents = Html2JsonBase.extract_text_and_parents(tag)\n", 76 | "for item in text_and_parents:\n", 77 | " print(f\"Text: {item[0]}, {item[1]}\")" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "notion", 84 | "language": "python", 85 | "name": "python3" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.11.2" 98 | }, 99 | "orig_nbformat": 4 100 | }, 101 | "nbformat": 4, 102 | "nbformat_minor": 2 103 | } 104 | -------------------------------------------------------------------------------- /examples/process_md.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "md_text = \"\"\"\n", 10 | "# Header\n", 11 | "\n", 12 | "**bold**, _ite_, ~~other~~, more...\n", 13 | "`inline code` here.\n", 14 | "\n", 15 | "```python\n", 16 | "import os\n", 17 | "os.print('hello')\n", 18 | "```\n", 19 | "\n", 20 | "> Please work through this document in its entirety to better understand how OpenAI’s rate limit system works. We include code examples and possible solutions to handle common issues. It is recommended to **follow** this guidance before filling out the [Rate Limit Increase Request form](https://docs.google.com/forms/d/e/1FAIpQLSc6gSL3zfHFlL6gNIyUcjkEv29jModHGxg5_XGyr-PrE2LaHw/viewform) with details regarding how to fill it out in the last section.\n", 21 | "\n", 22 | "divider\n", 23 | "* * *\n", 24 | "\n", 25 | "### image\n", 26 | "local images:\n", 27 | "\n", 28 | "![846f62a6516227df1b4370aea3f63143.png](evernotecid://A2B91148-7880-4D85-A7CC-3A794B21D0F8/appyinxiangcom/186128/ENResource/p3511)\n", 29 | "\n", 30 | "web image:\n", 31 | "![pic](https://raw.githubusercontent.com/selfboot/html2notion/master/demos/notion_templage.png)\n", 32 | "\n", 33 | "[link](https://docs.microsoft.com/zh-tw/previous-versions/visualstudio/design-tools/expression-studio-2/cc294571(v=expression.10))\n", 34 | "\n", 35 | "### Table\n", 36 | "\n", 37 | "|header| column1 | column 2\n", 38 | "|-|-|-\n", 39 | "|row 1| row 1_1 | row 1_2\n", 40 | "|row 2| row 2_2 **bold**, _ite_, ~~other~~, more... | row 2_3\n", 41 | "\n", 42 | "### list\n", 43 | "\n", 44 | "[Why do we have rate limits?](https://platform.openai.com/docs/guides/rate-limits/overview)\n", 45 | "Rate limits are a common practice for APIs, and they're put in place for a few different reasons:\n", 46 | "\n", 47 | "- They help protect against abuse or misuse of the API. For example, a malicious actor could flood the API with requests in an attempt to overload it or cause disruptions in service. By setting rate limits, `OpenAI` can prevent this kind of activity.\n", 48 | "- Rate limits help ensure that everyone has fair access to the API. If one person or organization makes an excessive number of requests, it could bog down the API for everyone else. By throttling the number of requests that a single user can make, OpenAI ensures that the most number of people have an opportunity to use the API without experiencing slowdowns.\n", 49 | "- Rate limits can help OpenAI manage the aggregate load on its infrastructure. If requests to the API increase dramatically, it could tax the servers and cause performance issues. By setting rate limits, OpenAI can help maintain a smooth and consistent experience for all users.\n", 50 | "\n", 51 | "number list\n", 52 | "\n", 53 | "1. number list1\n", 54 | "2. numner list2\n", 55 | "\n", 56 | "## checkbox\n", 57 | "\n", 58 | "Three frogs\n", 59 | "* [x] The first frog\n", 60 | "* [ ] The second frog\n", 61 | "* [ ] The third frog\n", 62 | "\n", 63 | "# math and grapth\n", 64 | "\n", 65 | "Here is math\n", 66 | "```math\n", 67 | "e^{i\\pi} + 1 = 0\n", 68 | "```\n", 69 | "\n", 70 | "mermaid grapth:\n", 71 | "\n", 72 | "```mermaid\n", 73 | "graph TD\n", 74 | "A[Module A] -->|A1| B( Module B)\n", 75 | "B --> C{Confidition C}\n", 76 | "C -->|condition C1| D[Module D]\n", 77 | "C -->|condition C2| E[Module E]\n", 78 | "C -->|condition C3| F[Module F]\n", 79 | "```\n", 80 | "\n", 81 | "sequenceDiagram\n", 82 | "\n", 83 | "```mermaid\n", 84 | "sequenceDiagram\n", 85 | "A->>B: Have you received a message?\n", 86 | "B-->>A: Message received\n", 87 | "```\n", 88 | "\n", 89 | "gantt\n", 90 | "\n", 91 | "```mermaid\n", 92 | "gantt\n", 93 | "title Gantt chart\n", 94 | "dateFormat YYYY-MM-DD\n", 95 | "section Proj A\n", 96 | "Task 1 :a1, 2018-06-06, 30d\n", 97 | "Task 2 :after a1 , 20d\n", 98 | "section Proj B\n", 99 | "Task 3 :2018-06-12 , 12d\n", 100 | "Task 4 : 24d\n", 101 | "```\n", 102 | "\n", 103 | "### chart\n", 104 | "\n", 105 | "```chart\n", 106 | ", budget, income, expenses, debt\n", 107 | "June,5000,8000,4000,6000\n", 108 | "July,3000,1000,4000,3000\n", 109 | "Aug,5000,7000,6000,3000\n", 110 | "Sep,7000,2000,3000,1000\n", 111 | "Oct,6000,5000,4000,2000\n", 112 | "Nov,4000,3000,5000,\n", 113 | "\n", 114 | "type: pie\n", 115 | "title: 每月收益\n", 116 | "x.title: Amount\n", 117 | "y.title: Month\n", 118 | "y.suffix: $\n", 119 | "```\n", 120 | "\n", 121 | "```chart\n", 122 | ",Budget,Income,Expenses,Debt\n", 123 | "June,5000,8000,4000,6000\n", 124 | "July,3000,1000,4000,3000\n", 125 | "Aug,5000,7000,6000,3000\n", 126 | "Sep,7000,2000,3000,1000\n", 127 | "Oct,6000,5000,4000,2000\n", 128 | "Nov,4000,3000,5000,\n", 129 | "\n", 130 | "type: line\n", 131 | "title: Monthly Revenue\n", 132 | "x.title: Amount\n", 133 | "y.title: Month\n", 134 | "y.suffix: $\n", 135 | "```\n", 136 | "\"\"\"" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "import re\n", 146 | "\n", 147 | "def extract_code_blocks(md_text):\n", 148 | " code_pattern = re.compile(r'```(\\w+)?\\n(.*?)```', re.DOTALL)\n", 149 | " matches = code_pattern.findall(md_text)\n", 150 | " code_blocks = [{'language': match[0], 'code': match[1]} for match in matches]\n", 151 | " return code_blocks\n", 152 | "\n", 153 | "\n", 154 | "code_blocks = extract_code_blocks(md_text)\n", 155 | "\n", 156 | "for block in code_blocks:\n", 157 | " print(f\"Language: {block['language']}\")\n", 158 | " print(f\"Code: {block['code']}\\n\")\n" 159 | ] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "notion", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.11.2" 179 | }, 180 | "orig_nbformat": 4 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /html2notion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/html2notion/__init__.py -------------------------------------------------------------------------------- /html2notion/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | import json 5 | import asyncio 6 | from pathlib import Path 7 | from aiohttp import ClientSession 8 | from notion_client import AsyncClient 9 | from rich.console import Console 10 | from rich.table import Table 11 | from rich.text import Text 12 | from rich import box 13 | from .utils import setup_logger, read_config, logger, config 14 | from .translate.notion_import import NotionImporter 15 | from .translate.batch_import import BatchImport 16 | from .translate.import_stats import StatLevel 17 | console = Console() 18 | 19 | 20 | def prepare_args(): 21 | parser = argparse.ArgumentParser( 22 | description='Html2notion: Save HTML to your Notion notes quickly and easily, while keeping the original format as much as possible') 23 | parser.add_argument('--conf', type=str, help='conf file path', required=True) 24 | parser.add_argument('--log', type=str, help='log directory path') 25 | parser.add_argument('--batch', type=int, default=15, help='batch save concurrent limit') 26 | 27 | group = parser.add_mutually_exclusive_group(required=True) 28 | group.add_argument('--file', type=str, help='Save single html file to notion') 29 | group.add_argument('--dir', type=str, help='Save all html files in the dir to notion') 30 | return parser 31 | 32 | 33 | def print_single_stats(stat): 34 | if stat.get_level() == StatLevel.EXCEPTION.value: 35 | text = Text(f"Failed to import {stat.filename}", style="default") 36 | text.append(f"\nException: {stat.exception}", style="red") 37 | if 'body.parent.page_id should be defined' in str(stat.exception): 38 | text.append(f"\nHeadmeta : \n{json.dumps(stat.head_meta, indent=4)}", style="yellow") 39 | console.print(text) 40 | return 41 | 42 | title = f"{stat.filename}" if stat.filename else "Import Result (Loss filename)" 43 | style = "default" 44 | if stat.get_level() == StatLevel.LOSS.value: 45 | title += " (Loss some content)" 46 | style = "yellow" 47 | elif stat.get_level() == StatLevel.SUCC.value: 48 | title += "(Import successfully)" 49 | style = "green" 50 | 51 | table = Table(title=title, title_style=style, expand=True, box=box.HEAVY_HEAD, show_lines=True) 52 | table.add_column("Item", justify="right", style="default") 53 | table.add_column("Html", style="default") 54 | table.add_column("Notion", justify="left", style="default") 55 | table.add_row("Text Len", str(stat.text_count), str(stat.notion_text_count)) 56 | table.add_row("Image Count", str(stat.image_count), str(stat.notion_image_count)) 57 | if stat.skip_tag: 58 | table.add_row("Skip Tag Count", "", 'Detail: [yellow]' + ";".join([repr(s) 59 | for s in stat.skip_tag])[:2000] + "[/yellow]") 60 | 61 | console.print(table) 62 | 63 | 64 | def print_batch_stats(batch_import): 65 | all_files = batch_import.all_files 66 | batch_stats = batch_import.batch_stats 67 | success_stats = [stat for stat in batch_stats if not stat.get_level() == StatLevel.SUCC.value] 68 | if len(success_stats) == len(all_files): 69 | console.print(f"All files migrated successfully and there is no data loss.", style="green") 70 | 71 | failed_stats = [stat for stat in batch_stats if stat.get_level() == StatLevel.EXCEPTION.value] 72 | if failed_stats: 73 | table = Table(title=f"\nImport Fail Exception Detail\nLog path: {config.get('log_path')}", expand=True, box=box.HEAVY_HEAD, show_lines=True) 74 | table.add_column("File Name", justify="left", style="default") 75 | table.add_column("Fail Reason", justify="left", style="default") 76 | 77 | for stat in failed_stats: 78 | table.add_row(str(stat.filename), str(stat)) 79 | console.print(table) 80 | 81 | less_stats = [stat for stat in batch_stats if stat.get_level() == StatLevel.LOSS.value] 82 | if less_stats: 83 | table = Table(title=f"\nImport Data Loss Detail (You can use --file to import single file for more info)\n", expand=True, box=box.HEAVY_HEAD, show_lines=True) 84 | table.add_column("File Name", justify="left", style="default") 85 | table.add_column("Loss Detail", justify="left", style="default") 86 | 87 | for stat in less_stats: 88 | table.add_row(str(stat.filename), str(stat)) 89 | console.print(table) 90 | 91 | 92 | 93 | def prepare_env(args: argparse.Namespace): 94 | log_path = Path(args.log) if args.log else Path.cwd() / 'logs/' 95 | if not log_path.is_dir(): 96 | log_path.mkdir(parents=True) 97 | 98 | conf_path = Path(args.conf) 99 | if not conf_path.is_file(): 100 | text = Text(f"Read conf {conf_path} failed.", style="red") 101 | console.print(text) 102 | sys.exit(1) 103 | 104 | setup_logger(log_path) 105 | read_config(conf_path) 106 | logger.info(f"Read log {log_path}, conf {conf_path}") 107 | 108 | 109 | async def import_single_file(file): 110 | notion_api_key = "" 111 | if 'GITHUB_ACTIONS' in os.environ: 112 | notion_api_key = os.environ['notion_api_key'] 113 | else: 114 | notion_api_key = config['notion']['api_key'] 115 | async with ClientSession() as session: 116 | async with AsyncClient(auth=notion_api_key) as notion_client: 117 | notion_importer = NotionImporter(session, notion_client) 118 | await notion_importer.process_file(file) 119 | return notion_importer.import_stats 120 | 121 | 122 | def main(): 123 | arg_parse = prepare_args() 124 | args = arg_parse.parse_args() 125 | prepare_env(args) 126 | 127 | text = Text("") 128 | file_path = Path(args.file) if args.file else None 129 | dir_path = Path(args.dir) if args.dir else None 130 | max_concurrency = args.batch 131 | if file_path and file_path.is_file(): 132 | stats = asyncio.run(import_single_file(file_path)) 133 | print_single_stats(stats) 134 | elif dir_path and dir_path.is_dir(): 135 | logger.info(f"Begin save all html files in the dir: {dir_path}.") 136 | batch_import = BatchImport(dir_path, max_concurrency) 137 | result = asyncio.run(batch_import.process_directory()) 138 | logger.info(f"Finish save all html files in the dir: {dir_path}.\n{result}") 139 | print_batch_stats(batch_import) 140 | else: 141 | text.append("The parameters provided are incorrect, please check.", style="red") 142 | text.append(f"\n{arg_parse.format_help()}", style="default") 143 | 144 | text.append("\nIf you need help, please submit an ", style="default") 145 | link = Text("issue", style="cyan underline link https://github.com/selfboot/html2notion/issues") 146 | text.append(link) 147 | text.append(" on gitHub.", style="default") 148 | console.print(text) 149 | return 150 | 151 | 152 | if __name__ == '__main__': 153 | main() 154 | -------------------------------------------------------------------------------- /html2notion/translate/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/selfboot/html2notion/2f02e7a465fcdb5e3a80631f7d29fcaefb195339/html2notion/translate/__init__.py -------------------------------------------------------------------------------- /html2notion/translate/batch_import.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import aiohttp 3 | import os 4 | from pathlib import Path 5 | from asyncio import Lock 6 | from notion_client import AsyncClient 7 | from rich.progress import ( 8 | BarColumn, 9 | MofNCompleteColumn, 10 | Progress, 11 | TextColumn, 12 | TimeRemainingColumn, 13 | ) 14 | from ..translate.notion_import import NotionImporter 15 | from ..utils import logger, config 16 | 17 | 18 | class BatchImport: 19 | def __init__(self, directory: Path, concurrent_limit: int = 10): 20 | self.directory = directory 21 | self.concurrent_limit = concurrent_limit 22 | if 'GITHUB_ACTIONS' in os.environ: 23 | self.notion_api_key = os.environ['notion_api_key'] 24 | else: 25 | self.notion_api_key = config['notion']['api_key'] 26 | self.notion_client = AsyncClient(auth=self.notion_api_key) 27 | self.batch_stats = [] 28 | self.files_lock = Lock() 29 | 30 | @staticmethod 31 | async def process_file(session, notion_client, file_path, files_lock, batch_stats): 32 | logger.info(f"Begin file, file {file_path}") 33 | notion_import = NotionImporter(session, notion_client) 34 | response = await notion_import.process_file(file_path) 35 | logger.info(f"Finish file {file_path}, status {str(notion_import.import_stats)}") 36 | async with files_lock: 37 | batch_stats.append(notion_import.import_stats) 38 | return response 39 | 40 | async def process_directory(self): 41 | semaphore = asyncio.Semaphore(self.concurrent_limit) 42 | self.all_files = [file_path for file_path in self.directory.glob('*.html') if file_path.name != 'index.html'] 43 | files_len = len(self.all_files) 44 | 45 | with Progress( 46 | TextColumn("[progress.description]{task.description}", justify="right"), 47 | BarColumn(), 48 | MofNCompleteColumn(), 49 | TextColumn(" "), 50 | TimeRemainingColumn() 51 | ) as progress: 52 | # with Progress() as progress: 53 | progress.add_task("[cyan]Total", total=files_len, 54 | completed=files_len, update_period=0, style="cyan") 55 | success_task_id = progress.add_task( 56 | "[green]Success", total=files_len, style="green") 57 | failed_task_id = progress.add_task("[red]Failed", total=files_len, style="red") 58 | async def process_file_with_semaphore(session, notion_client, file_path): 59 | async with semaphore: 60 | result = await self.process_file(session, notion_client, file_path, self.files_lock, self.batch_stats) 61 | if result == "succ": 62 | progress.update(success_task_id, advance=1) 63 | else: 64 | progress.update(failed_task_id, advance=1) 65 | return result 66 | 67 | async with aiohttp.ClientSession() as session: 68 | tasks = [process_file_with_semaphore(session, self.notion_client, file_path) for file_path in self.all_files] 69 | results = await asyncio.gather(*tasks) 70 | await session.close() 71 | return results 72 | 73 | 74 | if __name__ == '__main__': 75 | from ..utils import test_prepare_conf 76 | test_prepare_conf() 77 | from tempfile import TemporaryDirectory 78 | with TemporaryDirectory() as temp_dir: 79 | temp_dir_path = Path(temp_dir) 80 | files = [] 81 | for i in range(100): 82 | temp_file = temp_dir_path / f"file{i}.txt" 83 | temp_file.write_text("main_hold") 84 | files.append(temp_file) 85 | 86 | max_concurrency = 2 87 | batch_import = BatchImport(temp_dir_path, max_concurrency) 88 | result = asyncio.run(batch_import.process_directory()) 89 | print(result) 90 | -------------------------------------------------------------------------------- /html2notion/translate/cos_uploader.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from qcloud_cos import CosConfig 3 | from qcloud_cos import CosS3Client 4 | from qcloud_cos.cos_exception import CosClientError 5 | from functools import partial 6 | from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type 7 | from ..utils import logger, test_prepare_conf, config 8 | 9 | class TencentCosUploaderAsync: 10 | def __init__(self, secret_id, secret_key, region, bucket, timeout=60): 11 | self.config = CosConfig(Region=region, SecretId=secret_id, SecretKey=secret_key, Timeout=timeout) 12 | self.client = CosS3Client(self.config) 13 | self.bucket = bucket 14 | 15 | @retry(stop=stop_after_attempt(5), 16 | wait=wait_exponential(multiplier=1, min=3, max=30), 17 | retry=retry_if_exception_type(CosClientError)) 18 | async def upload_file(self, loop, local_path, key): 19 | with open(local_path, 'rb') as f: 20 | content = f.read() 21 | 22 | executor = loop.run_in_executor 23 | put_object_partial = partial(self.client.put_object, Bucket=self.bucket, Body=content, Key=key) 24 | response = await executor(None, put_object_partial) 25 | return response 26 | 27 | @retry(stop=stop_after_attempt(5), 28 | wait=wait_exponential(multiplier=1, min=3, max=30), 29 | retry=retry_if_exception_type(CosClientError)) 30 | async def check_file_exist(self, loop, key): 31 | try: 32 | executor = loop.run_in_executor 33 | return await executor(None, self.client.object_exists, self.bucket, key) 34 | except Exception as e: 35 | logger.error(e) 36 | return False 37 | 38 | @retry(stop=stop_after_attempt(5), 39 | wait=wait_exponential(multiplier=1, min=3, max=30), 40 | retry=retry_if_exception_type(CosClientError)) 41 | async def delete_file(self, loop, key): 42 | executor = loop.run_in_executor 43 | response = await executor(None, self.client.delete_object, self.bucket, key) 44 | return response 45 | 46 | 47 | async def main(): 48 | test_prepare_conf() 49 | 50 | try: 51 | secret_id = config["cos"]["secret_id"] 52 | secret_key = config["cos"]["secret_key"] 53 | region = config["cos"]["region"] 54 | bucket = config["cos"]["bucket"] 55 | except Exception as e: 56 | print(f"Please fill cos conf in the config file") 57 | return 58 | 59 | local_path = './demos/saul.webp' 60 | key = 'test/saul.webp' 61 | 62 | uploader = TencentCosUploaderAsync(secret_id, secret_key, region, bucket) 63 | loop = asyncio.get_event_loop() 64 | 65 | upload_response = await uploader.upload_file(loop, local_path, key) 66 | print(f"Upload response: {upload_response}") 67 | 68 | if await uploader.check_file_exist(loop, key): 69 | print("Upload successful!") 70 | else: 71 | print("Upload failed!") 72 | 73 | 74 | if __name__ == "__main__": 75 | asyncio.run(main()) 76 | -------------------------------------------------------------------------------- /html2notion/translate/html2json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import chardet 3 | import time 4 | from functools import singledispatch 5 | from pathlib import Path 6 | from bs4 import BeautifulSoup, Tag 7 | from ..utils import logger, test_prepare_conf 8 | from ..translate.html2json_base import Html2JsonBase 9 | from ..translate.html2json_default import Default_Type 10 | from ..translate.html2json_yinxiang import YinXiang_Type 11 | from ..translate.html2json_clipper import YinXiangClipper_Type 12 | from ..translate.html2json_markdown import YinXiangMarkdown_Type 13 | 14 | 15 | """ 16 | 17 | 18 | 19 | """ 20 | def _is_yinxiang_export_html(html_soup, import_stat): 21 | meta_source = html_soup.select_one('html > head > meta[name="source"]') 22 | meta_source_content = meta_source.get('content', "") if isinstance(meta_source, Tag) else "" 23 | if not meta_source_content: 24 | return False 25 | yinxiang_source_content = ["yinxiang", "desktop", "mobile"] 26 | import_stat.head_meta["source"] = meta_source_content 27 | for prefix in yinxiang_source_content: 28 | if isinstance(meta_source_content, str) and meta_source_content.startswith(prefix): 29 | return True 30 | return False 31 | 32 | 33 | """ 34 | 35 | 36 | """ 37 | def _is_yinxiang_clipper_html(html_soup, import_stat): 38 | meta_source_application = html_soup.select_one('html > head > meta[name="source-application"]') 39 | source_application = meta_source_application.get('content', "") if isinstance(meta_source_application, Tag) else "" 40 | if not source_application: 41 | return False 42 | import_stat.head_meta["source-application"] = source_application 43 | if isinstance(source_application, str) and source_application.endswith("evernote"): 44 | return True 45 | if isinstance(source_application, str) and source_application in ["微信",]: 46 | return True 47 | return False 48 | 49 | 50 | """ 51 | 52 | """ 53 | def _is_yinxiang_markdown_html(html_soup, import_stat): 54 | meta_content_class = html_soup.select_one('html > head > meta[name="content-class"]') 55 | content_class = meta_content_class.get('content', "") if isinstance(meta_content_class, Tag) else "" 56 | if not content_class: 57 | return False 58 | import_stat.head_meta["content_class"] = content_class 59 | if isinstance(content_class, str) and content_class.endswith("markdown"): 60 | return True 61 | return False 62 | 63 | 64 | # 65 | # 66 | def _infer_input_type(html_content, import_stat): 67 | soup = BeautifulSoup(html_content, 'html.parser') 68 | exporter_version_meta = soup.select_one('html > head > meta[name="exporter-version"]') 69 | exporter_version_content = exporter_version_meta.get('content', "") if isinstance( exporter_version_meta, Tag) else "" 70 | import_stat.head_meta["exporter-version"] = exporter_version_content 71 | exporter_version = exporter_version_content if isinstance(exporter_version_content, str) else "" 72 | if exporter_version.startswith("Evernote") or exporter_version.startswith("YXBJ"): 73 | if _is_yinxiang_markdown_html(soup, import_stat): 74 | return YinXiangMarkdown_Type 75 | if _is_yinxiang_clipper_html(soup, import_stat): 76 | return YinXiangClipper_Type 77 | elif _is_yinxiang_export_html(soup, import_stat): 78 | return YinXiang_Type 79 | 80 | return YinXiang_Type # default 81 | 82 | return Default_Type 83 | 84 | 85 | def _get_converter(html_content, import_stat): 86 | html_type = _infer_input_type(html_content, import_stat) 87 | import_stat.head_meta["parse_type"] = html_type 88 | logger.info(f"Input type: {html_type}") 89 | converter = Html2JsonBase.create(html_type, html_content, import_stat) 90 | return converter 91 | 92 | 93 | @singledispatch 94 | def html2json_process(html_content, import_stat): 95 | raise TypeError(f"Unsupported {type(html_content)}, {import_stat}") 96 | 97 | 98 | @html2json_process.register 99 | def _(html_content: str, import_stat): 100 | converter = _get_converter(html_content, import_stat) 101 | result = converter.process() 102 | return converter.get_notion_data(), result 103 | 104 | 105 | @html2json_process.register 106 | def _(html_file: Path, import_stat): 107 | if not html_file.is_file(): 108 | print(f"Load file: {html_file.resolve()} failed") 109 | raise FileNotFoundError 110 | 111 | html_content = "" 112 | with html_file.open('rb') as f: 113 | data = f.read() 114 | result = chardet.detect(data) 115 | encoding = result['encoding'] if result['encoding'] else 'utf-8' 116 | html_content = data.decode(encoding) 117 | 118 | if html_content == "main_hold": # just for local debug 119 | time.sleep(1) 120 | return "main_hold" 121 | 122 | converter = _get_converter(html_content, import_stat) 123 | result = converter.process() 124 | return converter.get_notion_data(), result 125 | 126 | 127 | if __name__ == "__main__": 128 | test_prepare_conf() 129 | html_file = Path("./demos/Test Case D.html") 130 | result, html_type = html2json_process(html_file) 131 | print(html_type) 132 | print(json.dumps(result, indent=4, ensure_ascii=False)) 133 | result2, html_type2 = html2json_process( 134 | "
test
") 135 | print(html_type2) 136 | print(json.dumps(result2, indent=4, ensure_ascii=False)) 137 | -------------------------------------------------------------------------------- /html2notion/translate/html2json_base.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import copy 4 | from collections import namedtuple 5 | from bs4 import NavigableString, Tag, PageElement 6 | from enum import Enum 7 | from ..utils import logger, config, is_valid_url 8 | 9 | class Block(Enum): 10 | FAIL = "fail" 11 | PARAGRAPH = "paragraph" 12 | QUOTE = "quote" 13 | NUMBERED_LIST = "numbered_list_item" 14 | BULLETED_LIST = "bulleted_list_item" 15 | HEADING = "heading" 16 | CODE = "code" 17 | DIVIDER = "divider" 18 | TABLE = "table" 19 | TO_DO = "to_do" 20 | EQUATION = "equation" 21 | 22 | 23 | class Html2JsonBase: 24 | # https://developers.notion.com/reference/request-limits 25 | URL_MAX_LENGTH = 2000 26 | TEXT_MAX_LENGTH = 2000 27 | EXPRESSION_MAX_LENGTH = 1000 28 | RICHTEXT_ARRAY_LENGTH = 100 29 | 30 | _registry = {} 31 | _text_annotations = { 32 | "bold": bool, 33 | "italic": bool, 34 | "strikethrough": bool, 35 | "underline": bool, 36 | "code": bool, 37 | "color": str, 38 | } 39 | 40 | _language = {"abap", "agda", "arduino", 41 | "assembly", "bash", "basic", "bnf", "c", "c#", "c++", "clojure", "coffeescript", "coq", "css", 42 | "dart", "dhall", "diff", "docker", "ebnf", "elixir", "elm", "erlang", "f#", "flow", "fortran", 43 | "gherkin", "glsl", "go", "graphql", "groovy", "haskell", "html", "idris", "java", "javascript", 44 | "json", "julia", "kotlin", "latex", "less", "lisp", "livescript", "llvm ir", "lua", "makefile", 45 | "markdown", "markup", "matlab", "mathematica", "mermaid", "nix", "objective-c", "ocaml", "pascal", 46 | "perl", "php", "plain text", "powershell", "prolog", "protobuf", "purescript", "python", "r", 47 | "racket", "reason", "ruby", "rust", "sass", "scala", "scheme", "scss", "shell", "solidity", "sql", 48 | "swift", "toml", "typescript", "vb.net", "verilog", "vhdl", "visual basic", "webassembly", "xml", 49 | "yaml", "java/c/c++/c#"} 50 | 51 | _color_tuple = namedtuple("Color", "name r g b") 52 | _notion_color = [ 53 | _color_tuple("default", 0, 0, 0), 54 | _color_tuple("gray", 128, 128, 128), 55 | _color_tuple("brown", 165, 42, 42), 56 | _color_tuple("orange", 255, 165, 0), 57 | _color_tuple("yellow", 255, 255, 0), 58 | _color_tuple("green", 0, 128, 0), 59 | _color_tuple("blue", 0, 0, 255), 60 | _color_tuple("purple", 128, 0, 128), 61 | _color_tuple("pink", 255, 192, 203), 62 | _color_tuple("red", 255, 0, 0), 63 | ] 64 | 65 | # Page content should be: https://developers.notion.com/reference/post-page 66 | def __init__(self, html_content, import_stat): 67 | self.html_content = html_content 68 | self.children = [] 69 | self.properties = {} 70 | self.parent = {} 71 | self.import_stat = import_stat 72 | if 'GITHUB_ACTIONS' in os.environ: 73 | notion_database_id = os.environ['notion_db_id_1'] 74 | else: 75 | notion_database_id = config['notion']['database_id'] 76 | self.parent = {"type": "database_id", "database_id": notion_database_id} 77 | 78 | def process(self): 79 | raise NotImplementedError("Subclasses must implement this method") 80 | 81 | def get_notion_data(self): 82 | return { 83 | key: value 84 | for key, value in { 85 | 'children': self.children, 86 | 'properties': self.properties, 87 | 'parent': self.parent, 88 | }.items() 89 | if value 90 | } 91 | 92 | @staticmethod 93 | def extract_text_and_parents(tag: PageElement, parents=[]): 94 | results = [] 95 | # Filter empty content when tag is not img 96 | if isinstance(tag, NavigableString) and tag.strip(): 97 | results.append((tag, parents)) 98 | return results 99 | elif isinstance(tag, Tag): 100 | if tag.name == 'img': 101 | img_src = tag.get('src', '') 102 | parent_tags = [p for p in parents + [tag]] 103 | results.append((img_src, parent_tags)) 104 | else: 105 | for child in tag.children: 106 | if isinstance(child, NavigableString): 107 | if tag.name != 'img' and child.strip(): 108 | text = child.text 109 | parent_tags = [p for p in parents + [tag]] 110 | results.append((text, parent_tags)) 111 | elif isinstance(child, Tag) and child.name == 'br': 112 | results.append(('
', [])) 113 | else: 114 | results.extend(Html2JsonBase.extract_text_and_parents(child, parents + [tag])) 115 | return results 116 | 117 | @staticmethod 118 | def parse_one_style(tag_soup: Tag, text_params: dict): 119 | tag_name = tag_soup.name.lower() 120 | styles = Html2JsonBase.get_tag_style(tag_soup) 121 | if Html2JsonBase.is_bold(tag_name, styles): 122 | text_params["bold"] = True 123 | if Html2JsonBase.is_italic(tag_name, styles): 124 | text_params["italic"] = True 125 | if Html2JsonBase.is_strikethrough(tag_name, styles): 126 | text_params["strikethrough"] = True 127 | if Html2JsonBase.is_underline(tag_name, styles): 128 | text_params["underline"] = True 129 | if Html2JsonBase.is_code(tag_name, styles): 130 | text_params["code"] = True 131 | 132 | color = Html2JsonBase.get_color(styles, tag_soup.attrs if tag_name else {}) 133 | if color != 'default': 134 | text_params["color"] = color 135 | 136 | if tag_name == 'a': 137 | href = tag_soup.get('href', "") 138 | if not href: 139 | logger.warning("Link href is empty") 140 | text_params["url"] = href 141 | elif tag_name == 'img': 142 | src = tag_soup.get('src', "") 143 | # only support external image here. 144 | if not src: 145 | logger.warning("Image src is empty") 146 | text_params["src"] = src 147 | return 148 | 149 | # https://developers.notion.com/reference/request-limits 150 | # Process one tag and return a list of objects 151 | # unlineline and bold 152 | #
Red color4
153 | #
Code in super note
154 | def generate_inline_obj(self, tag: PageElement): 155 | res_obj = [] 156 | text_with_parents = Html2JsonBase.extract_text_and_parents(tag) 157 | for (text, parent_tags) in text_with_parents: 158 | text_params = {"plain_text": text} 159 | for parent in parent_tags: 160 | Html2JsonBase.parse_one_style(parent, text_params) 161 | if text == "
": 162 | try: 163 | res_obj[-1]["text"]["content"] += "\n" 164 | res_obj[-1]["plain_text"] += "\n" 165 | except Exception as e: 166 | pass 167 | continue 168 | 169 | link_url = text_params.get("url", "") 170 | text_obj = {} 171 | if text_params.get("url", "") and is_valid_url(link_url): 172 | text_obj = self.generate_link(**text_params) 173 | # Here image is a independent block, split out in the outer layer 174 | elif text_params.get("src", ""): 175 | text_obj = self.generate_image(**text_params) 176 | else: 177 | if len(text) <= self.TEXT_MAX_LENGTH: 178 | text_obj = self.generate_text(**text_params) 179 | else: 180 | for chunk in [text[i:i+self.TEXT_MAX_LENGTH] for i in range(0, len(text), self.TEXT_MAX_LENGTH)]: 181 | text_params["plain_text"] = chunk 182 | text_obj = self.generate_text(**text_params) 183 | if text_obj: 184 | res_obj.append(text_obj) 185 | text_obj = None 186 | if text_obj: 187 | res_obj.append(text_obj) 188 | return res_obj 189 | 190 | def generate_link(self, **kwargs): 191 | link_url = kwargs.get("url", "") 192 | plain_text = kwargs.get("plain_text", "") 193 | if not plain_text or not is_valid_url(link_url): 194 | return 195 | 196 | link_url = link_url[:self.URL_MAX_LENGTH] 197 | self.import_stat.add_notion_text(plain_text) 198 | return { 199 | "href": link_url, 200 | "plain_text": plain_text, 201 | "text": { 202 | "link": {"url": link_url}, 203 | "content": plain_text 204 | }, 205 | "type": "text" 206 | } 207 | 208 | def generate_image(self, **kwargs): 209 | source = kwargs.get("src", "") 210 | if not source or not is_valid_url(source): 211 | return 212 | self.import_stat.add_notion_image(source) 213 | image_block = { 214 | "object": "block", 215 | "type": "image", 216 | "image": { 217 | "type": "external", 218 | "external": { 219 | "url": source 220 | } 221 | } 222 | } 223 | return image_block 224 | 225 | def generate_text(self, **kwargs): 226 | plain_text = kwargs.get("plain_text", "") 227 | if not plain_text: 228 | return 229 | annotations = { 230 | key: value 231 | for key, value in kwargs.items() 232 | if key in Html2JsonBase._text_annotations and isinstance(value, Html2JsonBase._text_annotations[key]) 233 | } 234 | stats_count = kwargs.get("stats_count", True) 235 | if stats_count: 236 | self.import_stat.add_notion_text(plain_text) 237 | text_obj = { 238 | "plain_text": plain_text, 239 | "text": {"content": plain_text}, 240 | "type": "text" 241 | } 242 | if annotations: 243 | text_obj["annotations"] = annotations 244 | 245 | return text_obj 246 | 247 | def generate_properties(self, **kwargs): 248 | title = kwargs.get("title", "") 249 | url = kwargs.get("url", "") 250 | tags = kwargs.get("tags", []) 251 | created_time = kwargs.get("created_time", "") 252 | 253 | property_map = { 254 | "Title": {"title": [{"text": {"content": title}}]} if title else None, 255 | "URL": {"url": url, "type": "url"} if url else None, 256 | "Tags": {"type": "multi_select", "multi_select": [{"name": tag} for tag in tags]} if tags else None, 257 | "Created": {"date": {"start": created_time}, "type": "date"} if created_time else None, 258 | } 259 | 260 | properties_obj = {key: value for key, value in property_map.items() if value is not None} 261 | 262 | logger.debug(f"properties: {properties_obj}") 263 | return properties_obj 264 | 265 | @staticmethod 266 | def is_same_annotations_text(text_one: dict, text_another: dict): 267 | if text_one["type"] != "text" or text_another["type"] != "text": 268 | return False 269 | attributes = ["annotations", "href"] 270 | 271 | # When merging, be careful not to let the text length exceed the limit 272 | total_size = len(text_one["text"]["content"]) + len(text_another["text"]["content"]) 273 | if total_size > Html2JsonBase.TEXT_MAX_LENGTH: 274 | return False 275 | 276 | return all(text_one.get(attr) == text_another.get(attr) for attr in attributes) 277 | 278 | @staticmethod 279 | def merge_rich_text(rich_text: list): 280 | if not rich_text: 281 | return [] 282 | merged_text = [] 283 | current_text = rich_text[0] 284 | for text in rich_text[1:]: 285 | if Html2JsonBase.is_same_annotations_text(current_text, text): 286 | text_content = current_text["text"]["content"] + text["text"]["content"] 287 | current_text["plain_text"] = text_content 288 | current_text["text"]["content"] = text_content 289 | else: 290 | merged_text.append(current_text) 291 | current_text = text 292 | if current_text: 293 | merged_text.append(current_text) 294 | 295 | return merged_text 296 | 297 | @staticmethod 298 | def is_bold(tag_name: str, styles: dict) -> bool: 299 | if tag_name in ('b', 'strong'): 300 | return True 301 | 302 | font_weight = styles.get('font-weight', None) 303 | if font_weight is None: 304 | return False 305 | elif font_weight == 'bold': 306 | return True 307 | elif font_weight.isdigit() and int(font_weight) >= 700: 308 | return True 309 | return False 310 | 311 | @staticmethod 312 | def is_strikethrough(tag_name: str, styles: dict) -> bool: 313 | if tag_name in ('s', 'strike', 'del'): 314 | return True 315 | text_decoration = styles.get("text-decoration", "") 316 | return "line-through" in text_decoration 317 | 318 | @staticmethod 319 | def is_italic(tag_name: str, styles: dict) -> bool: 320 | if tag_name in ('i', 'em'): 321 | return True 322 | font_style = styles.get('font-style', "") 323 | return "italic" in font_style 324 | 325 | @staticmethod 326 | def is_underline(tag_name: str, styles: dict) -> bool: 327 | # A tuple of a single element requires a comma after the element 328 | if tag_name in ('u',): 329 | return True 330 | text_decoration = styles.get('text-decoration', "") 331 | return 'underline' in text_decoration 332 | 333 | @staticmethod 334 | def is_code(tag_name: str, styles: dict): 335 | if tag_name in ('code',): 336 | return True 337 | 338 | # style="-en-code: true" 339 | if styles.get('-en-code', "false") == "true": 340 | return True 341 | 342 | # Check if the font-family is monospace 343 | font_family = styles.get('font-family', "") 344 | monospace_fonts = {'courier', 'monospace'} 345 | if not font_family: 346 | return False 347 | for font in monospace_fonts: 348 | if font.lower() == font_family.lower(): 349 | return True 350 | 351 | @staticmethod 352 | def _closest_color(r, g, b): 353 | closest_distance = float("inf") 354 | closest_color = None 355 | 356 | for color in Html2JsonBase._notion_color: 357 | distance = ((r - color.r) ** 2 + (g - color.g) ** 2 + (b - color.b) ** 2) ** 0.5 358 | if distance < closest_distance: 359 | closest_distance = distance 360 | closest_color = color.name 361 | 362 | return closest_color 363 | 364 | @staticmethod 365 | def _hex_to_rgb(hex_color): 366 | hex_color = hex_color.lstrip("#") 367 | return tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4)) 368 | 369 | @staticmethod 370 | def get_color(styles: dict, attrs): 371 | color = styles.get('color', "") 372 | if not color and 'color' in attrs: 373 | color = attrs['color'] 374 | if not color: 375 | return "default" 376 | # If the color_values have 4 items, then it is RGBA and the last value is alpha 377 | # rgba(174, 174, 188, 0.2) 378 | if color.startswith("rgb"): 379 | color_values = [int(x.strip()) for x in re.findall(r'\d+', color)] 380 | if len(color_values) >= 3: 381 | r, g, b = color_values[:3] 382 | return Html2JsonBase._closest_color(r, g, b) 383 | # Check if color is in hexadecimal format 384 | elif re.match(r'^#(?:[0-9a-fA-F]{3}){1,2}$', color): 385 | if len(color) == 4: # Short form like #abc -> #aabbcc 386 | color = '#' + ''.join([c*2 for c in color[1:]]) 387 | r, g, b = Html2JsonBase._hex_to_rgb(color) 388 | return Html2JsonBase._closest_color(r, g, b) 389 | 390 | return "default" 391 | 392 | def convert_paragraph(self, soup): 393 | json_obj = { 394 | "object": "block", 395 | "type": "paragraph", 396 | "paragraph": { 397 | "rich_text": [] 398 | } 399 | } 400 | rich_text = json_obj["paragraph"]["rich_text"] 401 | text_obj = self.generate_inline_obj(soup) 402 | if text_obj: 403 | rich_text.extend(text_obj) 404 | 405 | # Split out image into a independent blocks 406 | split_objs = Html2JsonBase.split_image_src(json_obj) 407 | return Html2JsonBase.ensure_array_len(split_objs) 408 | 409 | def convert_divider(self, soup): 410 | return { 411 | "object": "block", 412 | "type": "divider", 413 | "divider": {} 414 | } 415 | 416 | def convert_heading(self, soup): 417 | heading_map = {"h1": "heading_1", "h2": "heading_2", "h3": "heading_3", 418 | "h4": "heading_3", "h5": "heading_3", "h6": "heading_3"} 419 | 420 | heading_level = heading_map.get(soup.name, "heading_3") 421 | json_obj = { 422 | "object": "block", 423 | "type": heading_level, 424 | heading_level: { 425 | "rich_text": [] 426 | } 427 | } 428 | rich_text = json_obj[heading_level]["rich_text"] 429 | text_obj = self.generate_inline_obj(soup) 430 | if text_obj: 431 | rich_text.extend(text_obj) 432 | return json_obj 433 | return None 434 | 435 | #
  1. first
  2. second
  3. third
436 | def convert_numbered_list_item(self, soup): 437 | return self.convert_list_items(soup, 'numbered_list_item') 438 | 439 | #
  • itemA
  • itemB
  • itemC
440 | def convert_bulleted_list_item(self, soup): 441 | return self.convert_list_items(soup, 'bulleted_list_item') 442 | 443 | def convert_list_items(self, soup, list_type): 444 | # Remove heading tags in li 445 | for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']): 446 | heading.unwrap() 447 | 448 | items = soup.find_all('li', recursive=True) 449 | if not items: 450 | logger.warning("No list items found in {soup}") 451 | 452 | json_arr = [] 453 | for item in items: 454 | one_item = self._convert_one_list_item(item, list_type) 455 | if one_item: 456 | json_arr.append(one_item) 457 | else: 458 | logger.info(f'empty {item}') 459 | return json_arr 460 | 461 | def _convert_one_list_item(self, soup, list_type): 462 | if list_type not in {'numbered_list_item', 'bulleted_list_item'}: 463 | logger.warning(f'Not support list_type') 464 | 465 | json_obj = { 466 | "object": "block", 467 | list_type: { 468 | "rich_text": [] 469 | }, 470 | "type": list_type, 471 | } 472 | rich_text = json_obj[list_type]["rich_text"] 473 | text_obj = self.generate_inline_obj(soup) 474 | if text_obj: 475 | rich_text.extend(text_obj) 476 | 477 | return json_obj 478 | 479 | """ 480 |
481 |

482 |
483 |

484 |
485 | """ 486 | # ../examples/insert_table.ipynb 487 | def convert_table(self, soup): 488 | table_rows = [] 489 | tr_tags = soup.find_all('tr') 490 | if not tr_tags: 491 | logger.error(f"No tr found in {soup}") 492 | return 493 | 494 | table_width = len(tr_tags[0].find_all('td')) 495 | has_header = False 496 | for tr in tr_tags: 497 | td_tags = tr.find_all('td') 498 | if not td_tags: 499 | td_tags = tr.find_all('th') 500 | has_header = True 501 | table_width = max(table_width, len(td_tags)) 502 | one_row = { 503 | "type": "table_row", 504 | "table_row": { 505 | "cells": [] 506 | } 507 | } 508 | for td in td_tags: 509 | col = self.generate_inline_obj(td) 510 | one_row["table_row"]["cells"].append(col) 511 | table_rows.append(one_row) 512 | 513 | table_obj = { 514 | "table": { 515 | "has_row_header": False, 516 | "has_column_header": has_header, 517 | "table_width": table_width, 518 | "children": table_rows, 519 | } 520 | } 521 | return table_obj 522 | 523 | @staticmethod 524 | def split_image_src(text_obj): 525 | rich_text = text_obj["paragraph"]["rich_text"] 526 | need_split = any(text.get("object") == "block" for text in rich_text) 527 | if not need_split: 528 | return [text_obj] 529 | 530 | split_obj = [] 531 | cur_obj = { 532 | "object": "block", 533 | "type": "paragraph", 534 | "paragraph": { 535 | "rich_text": [] 536 | } 537 | } 538 | for text in rich_text: 539 | if text.get("object") == "block": 540 | if len(cur_obj["paragraph"]["rich_text"]) > 0: 541 | split_obj.append(copy.deepcopy(cur_obj)) 542 | cur_obj["paragraph"]["rich_text"].clear() 543 | split_obj.append(text) 544 | continue 545 | cur_obj["paragraph"]["rich_text"].append(text) 546 | if len(cur_obj["paragraph"]["rich_text"]) > 0: 547 | split_obj.append(cur_obj) 548 | return split_obj 549 | 550 | # Only if there is no ";" in the value of the attribute, you can use this method to get all attributes. 551 | # Can't use this way like: background-image: url('data:image/png;base64...') 552 | @staticmethod 553 | def get_tag_style(tag_soup): 554 | styles = {} 555 | if not isinstance(tag_soup, Tag): 556 | return styles 557 | style = tag_soup.get('style', "") 558 | if str and isinstance(style, str): 559 | # style = ''.join(style.split()) 560 | styles = { 561 | rule.split(':')[0].strip(): rule.split(':')[1].strip().lower() 562 | for rule in style.split(';') 563 | if rule and len(rule.split(':')) > 1 564 | } 565 | return styles 566 | 567 | @staticmethod 568 | def get_valid_language(language): 569 | if language in Html2JsonBase._language: 570 | return language 571 | return "plain text" 572 | 573 | @staticmethod 574 | def ensure_array_len(blocks): 575 | final_objs = [] 576 | for obj in blocks: 577 | if "paragraph" not in obj or "rich_text" not in obj["paragraph"] or len( 578 | obj["paragraph"]["rich_text"]) <= Html2JsonBase.RICHTEXT_ARRAY_LENGTH: 579 | final_objs.append(obj) 580 | continue 581 | 582 | # If the length of rich_text is greater than RICHTEXT_ARRAY_LENGTH, we split it 583 | rich_text_arr = obj["paragraph"]["rich_text"] 584 | rich_texts = [rich_text_arr[i:i+Html2JsonBase.RICHTEXT_ARRAY_LENGTH] 585 | for i in range(0, len(rich_text_arr), Html2JsonBase.RICHTEXT_ARRAY_LENGTH)] 586 | for rich_text in rich_texts: 587 | new_json_obj = { 588 | "object": "block", 589 | "type": "paragraph", 590 | "paragraph": { 591 | "rich_text": rich_text 592 | } 593 | } 594 | final_objs.append(new_json_obj) 595 | return final_objs 596 | 597 | @classmethod 598 | def register(cls, input_type, subclass): 599 | cls._registry[input_type] = subclass 600 | 601 | @classmethod 602 | def create(cls, input_type, html_content, import_stat): 603 | subclass = cls._registry.get(input_type) 604 | if subclass is None: 605 | raise ValueError(f"noknown: {input_type}") 606 | return subclass(html_content, import_stat) 607 | -------------------------------------------------------------------------------- /html2notion/translate/html2json_clipper.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup, NavigableString, Tag 2 | from ..utils import logger, DateStrToISO8601 3 | from ..translate.html2json_base import Html2JsonBase, Block 4 | 5 | YinXiangClipper_Type = "clipper.yinxiang" 6 | 7 | 8 | class Html2JsonClipper(Html2JsonBase): 9 | input_type = YinXiangClipper_Type 10 | 11 | def __init__(self, html_content, import_stat): 12 | super().__init__(html_content, import_stat) 13 | 14 | def process(self): 15 | soup = BeautifulSoup(self.html_content, 'html.parser') 16 | self.convert_properties(soup) 17 | 18 | content_tags = soup.body 19 | if not content_tags: 20 | logger.error("No content found") 21 | raise Exception("No content found") 22 | 23 | self.import_stat.add_text(content_tags.get_text()) 24 | self.convert_children(content_tags) # Assesume only one body tag 25 | 26 | return YinXiangClipper_Type 27 | 28 | def convert_properties(self, soup): 29 | properties = {"title": "Unknown"} 30 | title_tag = soup.select_one('head > title') 31 | if title_tag: 32 | properties["title"] = title_tag.text 33 | 34 | meta_tags = [ 35 | ('head > meta[name="source-url"]', "url"), 36 | ('head > meta[name="keywords"]', "tags", lambda x: x.split(",")), 37 | ('head > meta[name="created"]', "created_time", DateStrToISO8601), 38 | ] 39 | 40 | for selector, key, *converter in meta_tags: 41 | tag = soup.select_one(selector) 42 | if tag and tag.get('content', None): 43 | content = tag['content'] 44 | properties[key] = converter[0](content) if converter else content 45 | 46 | self.properties = self.generate_properties(**properties) 47 | return 48 | 49 | def get_block_type(self, element): 50 | tag_name = element.name 51 | if tag_name == "p": 52 | return Block.PARAGRAPH.value 53 | elif tag_name == "table": 54 | return Block.TABLE.value 55 | elif tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): 56 | return Block.HEADING.value 57 | elif tag_name == 'hr': 58 | return Block.DIVIDER.value 59 | elif tag_name == 'ol': 60 | return Block.NUMBERED_LIST.value 61 | elif tag_name == 'ul': 62 | return Block.BULLETED_LIST.value 63 | elif element.name == 'pre' and element.code: 64 | return Block.CODE.value 65 | elif self._check_is_block(element): 66 | return Block.QUOTE.value 67 | 68 | return Block.FAIL.value 69 | 70 | def convert_children(self, soup): 71 | processed_tags = set() 72 | for element in soup.descendants: 73 | if isinstance(element, NavigableString): 74 | continue 75 | if any(id(ancestor) in processed_tags for ancestor in element.parents): 76 | logger.debug(f"Skip processed tag {element}") 77 | continue 78 | block_type = self.get_block_type(element) 79 | if hasattr(self, f"convert_{block_type}"): 80 | converter = getattr(self, f"convert_{block_type}") 81 | block = converter(element) 82 | if block: 83 | self.children.extend([block] if not isinstance(block, list) else block) 84 | processed_tags.add(id(element)) 85 | unprocessed_tags = set() 86 | for element in soup.descendants: 87 | if not isinstance(element, NavigableString) or id(element) in processed_tags: 88 | continue 89 | if any(id(ancestor) in processed_tags for ancestor in element.parents): 90 | continue 91 | unprocessed_tags.add(element) 92 | 93 | for unprocessed_tag in unprocessed_tags: 94 | logger.warning(f"Unknown tag {unprocessed_tag.name}, {self.get_block_type(unprocessed_tag)}") 95 | self.import_stat.add_skip_tag(unprocessed_tag.get_text()) 96 | return 97 | 98 | #
line number... code content ...
99 | def convert_code(self, soup): 100 | json_obj = { 101 | "object": "block", 102 | "type": "code", 103 | "code": { 104 | "rich_text": [], 105 | "language": "plain text", 106 | }, 107 | } 108 | rich_text = json_obj["code"]["rich_text"] 109 | code_tag = soup.code 110 | if not code_tag: 111 | logger.error(f'No code tag found in {soup}') 112 | return 113 | children_list = list(code_tag.children) if isinstance(code_tag, Tag) else [code_tag] 114 | for child in children_list: 115 | if isinstance(child, Tag) and child.name == "code": 116 | logger.debug(f'Skip line number') 117 | continue 118 | text_obj = self.generate_inline_obj(child) 119 | if text_obj: 120 | rich_text.extend(text_obj) 121 | json_obj["code"]["rich_text"] = self.merge_rich_text(rich_text) 122 | return json_obj 123 | 124 | def convert_quote(self, soup): 125 | json_obj = { 126 | "object": "block", 127 | "type": "quote", 128 | "quote": { 129 | "rich_text": [] 130 | } 131 | } 132 | rich_text = json_obj["quote"]["rich_text"] 133 | text_obj = self.generate_inline_obj(soup) 134 | if text_obj: 135 | rich_text.extend(text_obj) 136 | 137 | # Merge tags has same anotions 138 | return json_obj 139 | 140 | def _check_is_block(self, element): 141 | quote_elements = {'blockquote', 'q', 'cite'} 142 | if element.name in quote_elements: 143 | return True 144 | 145 | if element.name != 'div': 146 | return False 147 | 148 | # if 'class' in element.attrs: 149 | # if any('quote' in class_name.lower() for class_name in element.attrs['class']): 150 | # return True 151 | 152 | # if 'style' in element.attrs: 153 | # style_attrs = element.attrs['style'].lower() 154 | # if 'border:' in style_attrs or 'padding:' in style_attrs: 155 | # return True 156 | 157 | return False 158 | 159 | 160 | Html2JsonBase.register(YinXiangClipper_Type, Html2JsonClipper) 161 | -------------------------------------------------------------------------------- /html2notion/translate/html2json_default.py: -------------------------------------------------------------------------------- 1 | # For notes that are clipped from web pages 2 | # that are not written manually by Evernote and have rich text formatting, 3 | # try to keep the format for conversion 4 | 5 | from ..translate.html2json_base import Html2JsonBase 6 | 7 | Default_Type = "default" 8 | 9 | 10 | class Html2JsonDefault(Html2JsonBase): 11 | input_type = Default_Type 12 | 13 | def __init__(self, html_content, import_stat): 14 | super().__init__(html_content, import_stat) 15 | 16 | # todo 17 | def process(self): 18 | return Default_Type 19 | 20 | 21 | Html2JsonBase.register(Default_Type, Html2JsonDefault) 22 | -------------------------------------------------------------------------------- /html2notion/translate/html2json_markdown.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup, Tag 3 | from urllib.parse import unquote 4 | from ..utils import logger, is_valid_url, DateStrToISO8601 5 | from ..translate.html2json_base import Html2JsonBase, Block 6 | 7 | YinXiangMarkdown_Type = "markdown.yinxiang" 8 | 9 | # Yinxiang markdown 10 | # https://list.yinxiang.com/markdown/eef42447-db3f-48ee-827b-1bb34c03eb83.php 11 | 12 | 13 | class Html2JsonMarkdown(Html2JsonBase): 14 | input_type = YinXiangMarkdown_Type 15 | undo_image = "url('data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAA4AAAAOCAYAAAAfSC3RAAAAAXNSR0IArs4c6QAAADdJREFUKBVjvHv37n8GMgALSI+SkhJJWu/du8fARJIOJMWjGpECA505GjjoIYLEB6dVUNojFQAA/1MJUFWet/4AAAAASUVORK5CYII=')" 16 | 17 | def __init__(self, html_content, import_stat): 18 | super().__init__(html_content, import_stat) 19 | self.markdown = "" 20 | 21 | def process(self): 22 | soup = BeautifulSoup(self.html_content, 'html.parser') 23 | self.convert_properties(soup) 24 | 25 | content_tags = soup.body 26 | if not content_tags: 27 | logger.error("No content found") 28 | raise Exception("No content found") 29 | 30 | # The center records the contents of the original markdown file, which is useless 31 | center_to_delete = content_tags.find('center') 32 | if isinstance(center_to_delete, Tag): 33 | md_encode = center_to_delete.get_text() 34 | self.markdown = unquote(md_encode) 35 | if isinstance(center_to_delete, Tag): 36 | center_to_delete.decompose() 37 | 38 | # Special handling contains blocks of code, 39 | # because some chart blocks are converted into images and cannot be processed directly 40 | self._replace_pre_code(soup) 41 | self.import_stat.add_text(content_tags.get_text()) 42 | img_tags = content_tags.find_all('img') 43 | for img in img_tags: 44 | img_src = img.get('src', '') 45 | if is_valid_url(img_src): 46 | self.import_stat.add_image(img_src) 47 | 48 | self.convert_children(content_tags) # Assesume only one body tag 49 | 50 | return YinXiangMarkdown_Type 51 | 52 | def convert_properties(self, soup): 53 | properties = {"title": "Unknown"} 54 | title_tag = soup.select_one('head > title') 55 | if title_tag: 56 | properties["title"] = title_tag.text 57 | 58 | meta_tags = [ 59 | ('head > meta[name="source-url"]', "url"), 60 | ('head > meta[name="keywords"]', "tags", lambda x: x.split(",")), 61 | ('head > meta[name="created"]', "created_time", DateStrToISO8601), 62 | ] 63 | 64 | for selector, key, *converter in meta_tags: 65 | tag = soup.select_one(selector) 66 | if tag and tag.get('content', None): 67 | content = tag['content'] 68 | properties[key] = converter[0](content) if converter else content 69 | 70 | self.properties = self.generate_properties(**properties) 71 | return 72 | 73 | def get_block_type(self, element): 74 | tag_name = element.name 75 | if tag_name == "p": 76 | return Block.PARAGRAPH.value 77 | elif tag_name == "table": 78 | return Block.TABLE.value 79 | elif tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): 80 | return Block.HEADING.value 81 | elif tag_name == 'hr': 82 | return Block.DIVIDER.value 83 | elif tag_name == 'ol': 84 | return Block.NUMBERED_LIST.value 85 | elif tag_name == 'ul': 86 | if self._is_checkbox(element): 87 | return Block.TO_DO.value 88 | return Block.BULLETED_LIST.value 89 | elif element.name == 'pre' and element.code: 90 | if self._is_math(element): 91 | return Block.EQUATION.value 92 | return Block.CODE.value 93 | elif element.name == "blockquote": 94 | return Block.QUOTE.value 95 | 96 | return Block.FAIL.value 97 | 98 | def convert_children(self, soup): 99 | div_tag = soup.find('div') 100 | if not div_tag: 101 | logger.error(f'No div tag found in {soup}') 102 | return 103 | for child in div_tag.children: 104 | block_type = self.get_block_type(child) 105 | logger.debug(f'block_type: {block_type}, child: {child}') 106 | if hasattr(self, f"convert_{block_type}"): 107 | converter = getattr(self, f"convert_{block_type}") 108 | block = converter(child) 109 | if block: 110 | self.children.extend([block] if not isinstance(block, list) else block) 111 | else: 112 | self.import_stat.add_skip_tag(child.get_text()) 113 | logger.warning(f"Unknown tag : {child}") 114 | return 115 | 116 | def convert_code(self, soup): 117 | json_obj = { 118 | "object": "block", 119 | "type": "code", 120 | "code": { 121 | "rich_text": [], 122 | "language": "plain text", 123 | }, 124 | } 125 | rich_text = json_obj["code"]["rich_text"] 126 | code_tag = soup.code 127 | if not code_tag: 128 | logger.error(f'No code tag found in {soup}') 129 | return 130 | children_list = list(code_tag.children) if isinstance(code_tag, Tag) else [code_tag] 131 | for child in children_list: 132 | text_obj = self.generate_inline_obj(child) 133 | if text_obj: 134 | rich_text.extend(text_obj) 135 | 136 | css_dict = Html2JsonBase.get_tag_style(code_tag) 137 | language = css_dict.get('language', 'plain text') 138 | json_obj["code"]["language"] = Html2JsonBase.get_valid_language(language) 139 | json_obj["code"]["rich_text"] = self.merge_rich_text(rich_text) 140 | return json_obj 141 | 142 | def convert_quote(self, soup): 143 | json_obj = { 144 | "object": "block", 145 | "type": "quote", 146 | "quote": { 147 | "rich_text": [] 148 | } 149 | } 150 | rich_text = json_obj["quote"]["rich_text"] 151 | text_obj = self.generate_inline_obj(soup) 152 | if text_obj: 153 | rich_text.extend(text_obj) 154 | return json_obj 155 | 156 | def convert_equation(self, soup: Tag): 157 | json_obj = { 158 | "object": "block", 159 | "type": "paragraph", 160 | "paragraph": { 161 | "rich_text": [] 162 | } 163 | } 164 | expression = soup.get_text()[:Html2JsonBase.EXPRESSION_MAX_LENGTH] 165 | equation = json_obj["paragraph"]["rich_text"] 166 | equation.append({ 167 | "type": "equation", 168 | "equation": { 169 | "expression": expression 170 | } 171 | }) 172 | return json_obj 173 | 174 | def convert_to_do(self, soup: Tag): 175 | li_tags = soup.find_all('li', recursive=True) 176 | childs = li_tags if li_tags else [soup] 177 | to_do_blocks = [] 178 | for child in childs: 179 | json_obj = { 180 | "object": "block", 181 | "type": "to_do", 182 | "to_do": { 183 | "rich_text": [], 184 | "checked": False 185 | } 186 | } 187 | text = json_obj["to_do"]["rich_text"] 188 | text_obj = self.generate_inline_obj(child) 189 | if text_obj: 190 | text.extend(text_obj) 191 | 192 | style = child.get('style', '') 193 | if isinstance(style, str) and Html2JsonMarkdown.undo_image not in style: 194 | json_obj["to_do"]["checked"] = True 195 | to_do_blocks.append(json_obj) 196 | return to_do_blocks 197 | 198 | # Each style in
  • has a background-image, which is considered a check box 199 | def _is_checkbox(self, soup): 200 | for li in soup.find_all('li'): 201 | style = li.get('style', '') 202 | if not "background-image: url('data:image/png;" in style: 203 | return False 204 | return True 205 | 206 | def _extract_code_blocks(self): 207 | code_pattern = re.compile(r'```(\w+)?\n(.*?)```', re.DOTALL) 208 | matches = code_pattern.findall(self.markdown) 209 | code_blocks = [{'language': match[0], 'code': match[1].rstrip('\n')} for match in matches] 210 | return code_blocks 211 | 212 | def _replace_pre_code(self, soup): 213 | markdown_code_blocks = self._extract_code_blocks() 214 | count = sum(1 for pre_tag in soup.find_all('pre') if pre_tag.find('code')) 215 | 216 | if markdown_code_blocks and count != len(markdown_code_blocks): 217 | logger.warning(f'Code block count not match: {count} != {len(markdown_code_blocks)}') 218 | return 219 | 220 | pre_tags = soup.find_all('pre') 221 | idx = 0 222 | for pre in pre_tags: 223 | code = pre.find('code') 224 | if not code: 225 | continue 226 | new_tag = soup.new_tag('code') 227 | new_tag.string = markdown_code_blocks[idx]['code'] 228 | new_tag['style'] = 'language: ' + markdown_code_blocks[idx]['language'] 229 | idx += 1 230 | code.replace_with(new_tag) 231 | return soup 232 | 233 | def _is_math(self, soup): 234 | code_tag = soup.code 235 | if not code_tag: 236 | return False 237 | 238 | css_dict = Html2JsonBase.get_tag_style(code_tag) 239 | if 'language' in css_dict and css_dict['language'] == 'math': 240 | return True 241 | return False 242 | 243 | 244 | Html2JsonBase.register(YinXiangMarkdown_Type, Html2JsonMarkdown) 245 | -------------------------------------------------------------------------------- /html2notion/translate/html2json_yinxiang.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup, Tag 2 | from ..utils import logger, DateStrToISO8601 3 | from ..translate.html2json_base import Html2JsonBase, Block 4 | 5 | YinXiang_Type = "yinxiang" 6 | 7 | 8 | class Html2JsonYinXiang(Html2JsonBase): 9 | input_type = YinXiang_Type 10 | 11 | def __init__(self, html_content, import_stat): 12 | super().__init__(html_content, import_stat) 13 | 14 | def process(self): 15 | soup = BeautifulSoup(self.html_content, 'html.parser') 16 | self.convert_children(soup) 17 | self.convert_properties(soup) 18 | return YinXiang_Type 19 | 20 | def convert_properties(self, soup): 21 | properties = {"title": "Unknown"} 22 | title_tag = soup.select_one('head > title') 23 | if title_tag: 24 | properties["title"] = title_tag.text 25 | 26 | meta_tags = [ 27 | ('head > meta[name="source-url"]', "url"), 28 | ('head > meta[name="keywords"]', "tags", lambda x: x.split(",")), 29 | ('head > meta[name="created"]', "created_time", DateStrToISO8601), 30 | ] 31 | 32 | for selector, key, *converter in meta_tags: 33 | tag = soup.select_one(selector) 34 | if tag and tag.get('content', None): 35 | content = tag['content'] 36 | properties[key] = converter[0](content) if converter else content 37 | 38 | self.properties = self.generate_properties(**properties) 39 | return 40 | 41 | def convert_children(self, soup): 42 | content_tags = soup.find_all('body', recursive=True) 43 | if not content_tags: 44 | logger.warning("No content found") 45 | raise Exception("No content found") 46 | 47 | self.import_stat.add_text(content_tags[0].get_text()) 48 | for child in content_tags[0].children: 49 | block_type = self.get_block_type(child) 50 | # Computer all text len in html 51 | logger.debug(f'Support tag {child} with style {block_type}') 52 | if hasattr(self, f"convert_{block_type}"): 53 | converter = getattr(self, f"convert_{block_type}") 54 | block = converter(child) 55 | if block: 56 | self.children.extend([block] if not isinstance(block, list) else block) 57 | else: 58 | self.import_stat.add_skip_tag(child.get_text()) 59 | logger.warning(f"Unknown tag : {child}") 60 | 61 | def convert_code(self, soup): 62 | json_obj = { 63 | "object": "block", 64 | "type": "code", 65 | "code": { 66 | "rich_text": [], 67 | "language": "plain text", 68 | }, 69 | } 70 | rich_text = json_obj["code"]["rich_text"] 71 | 72 | children_list = list(soup.children) if isinstance(soup, Tag) else [soup] 73 | for index, child in enumerate(children_list): 74 | is_last_child = index == len(children_list) - 1 75 | text_obj = self.generate_inline_obj(child) 76 | if text_obj: 77 | rich_text.extend(text_obj) 78 | if not is_last_child: 79 | rich_text.append(self.generate_text(plain_text='\n', stats_count=False)) 80 | json_obj["code"]["rich_text"] = self.merge_rich_text(rich_text) 81 | css_dict = Html2JsonBase.get_tag_style(soup) 82 | language = css_dict.get('--en-codeblockLanguage', 'plain text') 83 | json_obj["code"]["language"] = language 84 | return json_obj 85 | 86 | def convert_quote(self, soup): 87 | json_obj = { 88 | "object": "block", 89 | "type": "quote", 90 | "quote": { 91 | "rich_text": [] 92 | } 93 | } 94 | rich_text = json_obj["quote"]["rich_text"] 95 | 96 | children_list = list(soup.children) 97 | for index, child in enumerate(children_list): 98 | is_last_child = index == len(children_list) - 1 99 | text_obj = self.generate_inline_obj(child) 100 | if text_obj: 101 | rich_text.extend(text_obj) 102 | if not is_last_child: 103 | rich_text.append(self.generate_text(plain_text='\n', stats_count=False)) 104 | 105 | # Merge tags has same anotions 106 | logger.debug(f'before merge: {rich_text}') 107 | json_obj["quote"]["rich_text"] = self.merge_rich_text(rich_text) 108 | return json_obj 109 | 110 | def convert_to_do(self, soup: Tag): 111 | # Compatible with the situation where input is under li tag(super note). 112 | li_tags = soup.find_all('li', recursive=True) 113 | childs = li_tags if li_tags else [soup] 114 | to_do_blocks = [] 115 | for child in childs: 116 | json_obj = { 117 | "object": "block", 118 | "type": "to_do", 119 | "to_do": { 120 | "rich_text": [], 121 | "checked": False 122 | } 123 | } 124 | text = json_obj["to_do"]["rich_text"] 125 | text_obj = self.generate_inline_obj(child) 126 | if text_obj: 127 | text.extend(text_obj) 128 | input_tag = child.find('input') 129 | if input_tag and isinstance(input_tag, Tag) and input_tag.get('checked', 'false') == 'true': 130 | json_obj["to_do"]["checked"] = True 131 | to_do_blocks.append(json_obj) 132 | return to_do_blocks 133 | 134 | def get_block_type(self, single_tag): 135 | tag_name = single_tag.name 136 | style = single_tag.get('style') if tag_name else "" 137 | 138 | # There are priorities here. It is possible to hit multiple targets 139 | # at the same time, and the first one takes precedence. 140 | if self._check_is_todo(single_tag): 141 | return Block.TO_DO.value 142 | elif tag_name == 'hr': 143 | return Block.DIVIDER.value 144 | elif tag_name == 'ol': 145 | return Block.NUMBERED_LIST.value 146 | elif tag_name == 'ul': 147 | return Block.BULLETED_LIST.value 148 | elif tag_name == 'p': 149 | return Block.PARAGRAPH.value 150 | elif tag_name in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): 151 | return Block.HEADING.value 152 | elif tag_name == 'table' or self._check_is_table(single_tag): 153 | return Block.TABLE.value 154 | 155 | css_dict = Html2JsonBase.get_tag_style(single_tag) 156 | if css_dict.get('--en-blockquote', None) == 'true': 157 | return Block.QUOTE.value 158 | if css_dict.get('--en-codeblock', None) == 'true': 159 | return Block.CODE.value 160 | if css_dict.get('-en-codeblock', None) == 'true': 161 | return Block.CODE.value 162 | 163 | # Issue 5:
    164 | if tag_name == 'div': 165 | return Block.PARAGRAPH.value 166 | return Block.FAIL.value 167 | 168 | #
    169 | def _check_is_table(self, tag): 170 | if tag.name == "div": 171 | children = list(filter(lambda x: x != '\n', tag.contents)) 172 | table_count = sum(1 for child in children if child.name == "table") 173 | return table_count >= 1 174 | return False 175 | 176 | def _check_is_todo(self, tag): 177 | if not isinstance(tag, Tag): 178 | return False 179 | input_tag = tag.find('input') 180 | if input_tag and isinstance(input_tag, Tag) and input_tag.get('type') == 'checkbox': 181 | return True 182 | return False 183 | 184 | Html2JsonBase.register(YinXiang_Type, Html2JsonYinXiang) 185 | -------------------------------------------------------------------------------- /html2notion/translate/import_stats.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from enum import Enum 3 | 4 | 5 | class StatLevel(Enum): 6 | EXCEPTION = "exception" 7 | LOSS = "loss" 8 | SUCC = "success" 9 | 10 | 11 | class ImportStats: 12 | def __init__(self): 13 | self.text_count = 0 14 | self.image_count = 0 15 | self.notion_text_count = 0 16 | self.notion_image_count = 0 17 | self.skip_tag = [] 18 | self.exception = None 19 | self.filename = "None" 20 | self.html_content = "" 21 | self.notion_content = "" 22 | self.html_image_src = [] 23 | self.notion_image_src = [] 24 | self.head_meta = {} 25 | 26 | def add_text(self, text: str): 27 | self.text_count += len(text) 28 | self.html_content += text 29 | 30 | def add_notion_text(self, text: str): 31 | self.notion_content += text 32 | self.notion_text_count += len(text) 33 | 34 | def add_image(self, src: str): 35 | self.html_image_src.append(src) 36 | self.image_count += 1 37 | 38 | def add_notion_image(self, src: str): 39 | self.notion_image_src.append(src) 40 | self.notion_image_count += 1 41 | 42 | def add_skip_tag(self, tag): 43 | self.skip_tag.append(tag) 44 | 45 | def set_filename(self, filename: Path): 46 | self.filename = filename 47 | 48 | def set_exception(self, exception: Exception): 49 | self.exception = exception 50 | 51 | def get_level(self): 52 | if self.exception: 53 | return StatLevel.EXCEPTION.value 54 | if self.notion_text_count < self.text_count: 55 | return StatLevel.LOSS.value 56 | return StatLevel.SUCC.value 57 | 58 | def __str__(self): 59 | msg = "" 60 | if self.get_level() == StatLevel.EXCEPTION.value: 61 | msg += f"[red]{str(self.exception)}[/red]" 62 | if 'body.parent.page_id should be defined' in str(self.exception): 63 | msg += f"\nHeadmeta: [yellow]{self.head_meta}[/yellow]" 64 | 65 | if self.get_level() == StatLevel.LOSS.value: 66 | if self.text_count != self.notion_text_count: 67 | msg += f"Text Len {self.text_count} -> {self.notion_text_count}, Loss [yellow]{self.text_count-self.notion_text_count}[/yellow]" 68 | 69 | msg += '\nDetail: [yellow]' + ";".join([repr(s) for s in self.skip_tag])[:500] + "[/yellow]" 70 | return msg 71 | 72 | def get_detail(self): 73 | return f"filename: {self.filename}, {self.text_count} text, {self.image_count} image\nNotion {self.notion_text_count} text, {self.notion_image_count} image\n{self.skip_tag}" 74 | 75 | 76 | if __name__ == '__main__': 77 | task_stats = ImportStats() 78 | task_stats.add_text(100) 79 | task_stats.add_image(20) 80 | task_stats.add_notion_text(80) 81 | task_stats.add_notion_image(15) 82 | task_stats.set_exception(Exception("Some error occurred")) 83 | 84 | print(task_stats) 85 | -------------------------------------------------------------------------------- /html2notion/translate/notion_export.py: -------------------------------------------------------------------------------- 1 | import json 2 | from notion_client import Client, errors as notion_errors 3 | from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type 4 | from ..utils import logger, test_prepare_conf, config 5 | 6 | class NotionExporter: 7 | # Remove keys which not used by add page 8 | delete_block = { 9 | "rich_text": [ 10 | { 11 | # "type": "text", 12 | "text": { 13 | # "content": "测试第一行", 14 | "link": None 15 | }, 16 | "annotations": { 17 | "bold": False, 18 | "italic": False, 19 | "strikethrough": False, 20 | "underline": False, 21 | "code": False, 22 | "color": "default" 23 | }, 24 | # "plain_text": "测试第一行", 25 | "href": None 26 | } 27 | ], 28 | "color": "default", 29 | "is_toggleable": False 30 | } 31 | 32 | delete_conf = { 33 | # "object": "block", 34 | "id": "__any__", 35 | "parent": "__any__", 36 | "created_time": "__any__", 37 | "last_edited_time": "__any__", 38 | "created_by": "__any__", 39 | "last_edited_by": "__any__", 40 | "has_children": False, 41 | "archived": False, 42 | # "type": "paragraph", 43 | "paragraph": delete_block, 44 | "quote": delete_block, 45 | "numbered_list_item": delete_block, 46 | "bulleted_list_item": delete_block, 47 | "heading_1": delete_block, 48 | "heading_2": delete_block, 49 | "heading_3": delete_block, 50 | } 51 | 52 | def __init__(self, api_key, page_id, page_size=2): 53 | self.notion = Client(auth=api_key, logger=logger) 54 | self.page_id = page_id 55 | self.page_size = page_size 56 | self.all_blocks = [] 57 | self.output_blocks = [] 58 | 59 | @staticmethod 60 | def get_delete_conf(key_path): 61 | result = NotionExporter.delete_conf.copy() 62 | for key in key_path: 63 | # Number in path is json array placeholder 64 | if isinstance(key, int): 65 | if isinstance(result, list) and len(result) > 0: 66 | result = result[0] # type: ignore 67 | else: 68 | result = None 69 | elif isinstance(result, dict) and key in result: 70 | # If prefix path has __any__ conf, then delete all children 71 | if result[key] == "__any__": 72 | return ["__any__"] 73 | else: 74 | result = result[key] 75 | else: 76 | result = None 77 | 78 | if (isinstance(result, list)): 79 | return result 80 | elif (isinstance(result, str) or isinstance(result, bool) or isinstance(result, int)): 81 | return [result] 82 | else: 83 | return [None] 84 | 85 | @staticmethod 86 | def check_is_delete(key_path: list, value): 87 | delete_values = NotionExporter.get_delete_conf(key_path) 88 | if value in delete_values or '__any__' in delete_values: 89 | return True 90 | # logger.debug(f"Check key: {key_path}, value: {value}, delete values: {delete_values}") 91 | return False 92 | 93 | @staticmethod 94 | def keep_dict_pathvalue(data, path, value): 95 | for i, key in enumerate(path): 96 | if isinstance(key, int): 97 | data = data[key] 98 | elif i == len(path) - 1: 99 | data[key] = value 100 | else: 101 | next_key = path[i+1] if i+1 < len(path) else None 102 | if key in data: 103 | if isinstance(next_key, int): 104 | if not isinstance(data[key], list): 105 | logger.error(f"Keep error: {i}, {path}, {data[key]}") 106 | return 107 | data[key].extend([{} for _ in range(next_key - len(data[key]) + 1)]) 108 | else: 109 | if not isinstance(data[key], dict): 110 | logger.error(f"Keep error: {i}, {path}, {data[key]}") 111 | return 112 | else: 113 | if isinstance(next_key, int): 114 | data[key] = [{} for _ in range(next_key + 1)] 115 | else: 116 | data[key] = {} 117 | 118 | data = data[key] 119 | return 120 | 121 | @retry(stop=stop_after_attempt(5), wait=wait_exponential(multiplier=1, min=3, max=30), retry=retry_if_exception_type(notion_errors.RequestTimeoutError)) 122 | def __get_children_blocks(self): 123 | children = self.notion.blocks.children.list(block_id=self.page_id, page_size=self.page_size) 124 | if not isinstance(children, dict): 125 | logger.error(f"Get children failed: {children}") 126 | return None 127 | 128 | loop_count = 1 129 | while isinstance(children, dict) and "has_more" in children and children["has_more"]: 130 | next_cursor = children["next_cursor"] 131 | self.all_blocks.extend(children["results"]) 132 | children = self.notion.blocks.children.list( 133 | block_id=self.page_id, page_size=self.page_size, start_cursor=next_cursor) 134 | loop_count += 1 135 | cur_content = json.dumps(children, indent=4, ensure_ascii=False) 136 | logger.debug(f'Get child, {loop_count}: {cur_content}') 137 | 138 | if isinstance(children, dict) and "has_more" in children and not children["has_more"]: 139 | self.all_blocks.extend(children["results"]) 140 | return children 141 | 142 | @staticmethod 143 | def export_dict(data): 144 | clean_block = {} 145 | stack = [(data, list())] 146 | while stack: 147 | cur, prefix = stack.pop() 148 | if isinstance(cur, dict): 149 | for k, v in cur.items(): 150 | prefix.append(k) 151 | # logger.debug(f"Export dict, prefix: {prefix}, value: {v}") 152 | stack.append((v, prefix[:])) 153 | prefix.pop() 154 | elif isinstance(cur, list): 155 | for i, v in enumerate(cur): 156 | # logger.debug(f"Export array, prefix: {prefix}, {i}, value: {v}") 157 | prefix.append(i) 158 | stack.append((v, prefix[:])) 159 | prefix.pop() 160 | else: 161 | if (not NotionExporter.check_is_delete(prefix[:], cur)): 162 | logger.debug(f"Keep {prefix}: {cur}") 163 | NotionExporter.keep_dict_pathvalue(clean_block, prefix, cur) 164 | return clean_block 165 | 166 | def export_blocks(self): 167 | self.__get_children_blocks() 168 | result = json.dumps(self.all_blocks, indent=4, ensure_ascii=False) 169 | logger.info(f"Before process, blocks {result}") 170 | 171 | if not self.all_blocks: 172 | logger.error("Get children empty") 173 | 174 | for block in self.all_blocks: 175 | output_block = self.export_dict(block) 176 | self.output_blocks.append(output_block) 177 | 178 | return self.output_blocks 179 | 180 | 181 | if __name__ == "__main__": 182 | test_prepare_conf() 183 | exporter = NotionExporter(api_key=config['notion']['api_key'], 184 | page_id=config['notion']['page_id'], 185 | page_size=10) 186 | exporter.export_blocks() 187 | print(json.dumps(exporter.output_blocks, indent=4, ensure_ascii=False)) 188 | -------------------------------------------------------------------------------- /html2notion/translate/notion_import.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import os 3 | import traceback 4 | from aiohttp import ClientSession 5 | from pathlib import Path 6 | from notion_client import AsyncClient 7 | from notion_client.errors import RequestTimeoutError 8 | from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type 9 | from ..utils import logger, test_prepare_conf, config, rate_limit 10 | from ..translate.html2json import html2json_process 11 | from ..translate.import_stats import ImportStats 12 | 13 | 14 | class NotionImporter: 15 | def __init__(self, session: ClientSession, notion_client): 16 | self.session = session 17 | self.notion_client = notion_client 18 | self.import_stats = ImportStats() 19 | 20 | async def process_file(self, file_path: Path): 21 | self.import_stats.set_filename(file_path) 22 | try: 23 | notion_data, html_type = html2json_process(file_path, self.import_stats) 24 | except Exception as e: 25 | error_message = traceback.format_exc() 26 | self.import_stats.set_exception(e) 27 | logger.error(f"Error processing {file_path}: {str(e)}, {error_message}") 28 | return "fail" 29 | 30 | logger.info(f"Process path: {file_path}, html type: {html_type}, {self.import_stats.get_detail()}") 31 | try: 32 | create_result = await self.create_new_page(notion_data) 33 | except Exception as e: 34 | error_message = traceback.format_exc() 35 | self.import_stats.set_exception(e) 36 | logger.error(f"Error create notion page {file_path}: {str(e)}, {error_message}") 37 | return "fail" 38 | logger.info(f"Create notion page: {create_result}") 39 | return "succ" 40 | 41 | # https://developers.notion.com/reference/request-limits 42 | # The rate limit for incoming requests per integration is an average of three requests per second. 43 | # Doc of create page: https://developers.notion.com/reference/post-page 44 | @retry(stop=stop_after_attempt(5), 45 | wait=wait_exponential(multiplier=1, min=3, max=30), 46 | retry=retry_if_exception_type(RequestTimeoutError)) 47 | async def create_new_page(self, notion_data): 48 | # logger.debug(f'Create new page: {notion_data["parent"]}, {notion_data["properties"]}') 49 | # body.children.length should be ≤ `100`, 50 | blocks = notion_data.get("children", []) 51 | # logger.debug(f'Create new page: {notion_data["parent"]}, {notion_data["properties"]}, blocks: {blocks}') 52 | 53 | limit_size = 100 54 | chunks = [blocks[i: i + limit_size] for i in range(0, len(blocks), limit_size)] 55 | if blocks: 56 | notion_data.pop("children") 57 | first_chunk = chunks[0] if chunks else [] 58 | async with rate_limit: 59 | created_page = await self.notion_client.pages.create(**notion_data, children=first_chunk) 60 | page_id = created_page["id"] 61 | for chunk in chunks[1:]: 62 | await self.notion_client.blocks.children.append(page_id, children=chunk) 63 | return created_page 64 | 65 | 66 | async def main(file_path, notion_api_key): 67 | async with ClientSession() as session: 68 | async with AsyncClient(auth=notion_api_key) as notion_client: 69 | importer = NotionImporter(session, notion_client) 70 | result = await importer.process_file(file_path) 71 | logger.info(f"Import result: {result}") 72 | 73 | 74 | if __name__ == "__main__": 75 | test_prepare_conf() 76 | file = Path("./demos/Test Case E.html") 77 | notion_api_key = "" 78 | if 'GITHUB_ACTIONS' in os.environ: 79 | notion_api_key = os.environ['notion_api_key'] 80 | else: 81 | notion_api_key = config['notion']['api_key'] 82 | asyncio.run(main(file, notion_api_key)) 83 | -------------------------------------------------------------------------------- /html2notion/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .log import logger, setup_logger 2 | from .load_config import read_config, config, rate_limit 3 | from .url_process import is_valid_url 4 | from .timeutil import DateStrToISO8601 5 | from pathlib import Path 6 | 7 | 8 | def test_prepare_conf(): 9 | log_path = Path("./logs") 10 | conf_path = Path("./.config.json") 11 | setup_logger(log_path) 12 | read_config(conf_path) 13 | logger.info(f"test_prepare_conf, log path({log_path}), conf path({conf_path})") 14 | 15 | 16 | __all__ = ['logger', 'setup_logger', 'config', 'read_config', 'test_prepare_conf', 'rate_limit', 'is_valid_url', 'DateStrToISO8601'] 17 | -------------------------------------------------------------------------------- /html2notion/utils/load_config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | from aiolimiter import AsyncLimiter 4 | rate_limit = AsyncLimiter(3, 1) 5 | config = {} 6 | 7 | 8 | def read_config(file_path): 9 | """ 10 | { 11 | "notion": { 12 | "database_id": "xxxxx", 13 | "api_key": "xxxxx" 14 | } 15 | } 16 | """ 17 | if not file_path.is_file(): 18 | print(f"Load {file_path} fail") 19 | sys.exit(1) 20 | 21 | with open(file_path, "r") as f: 22 | json_conf = json.load(f) 23 | 24 | config.update(json_conf) 25 | if "notion" not in config: 26 | raise Exception("notion is not set in config.json") 27 | 28 | notion_conf = config["notion"] 29 | if "database_id" not in notion_conf: 30 | raise Exception("database_id is not set in config.json") 31 | if "api_key" not in notion_conf: 32 | raise Exception("api_key is not set in config.json") 33 | return 34 | -------------------------------------------------------------------------------- /html2notion/utils/log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from logging import handlers 4 | logger = logging.getLogger() 5 | 6 | 7 | class CustomFormatter(logging.Formatter): 8 | green = "\033[92m" 9 | normal = "\x1b[38;21m" 10 | yellow = "\x1b[33;21m" 11 | red = "\x1b[31;21m" 12 | bold_red = "\x1b[31;1m" 13 | reset = "\x1b[0m" 14 | format = "%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s" # type: ignore 15 | 16 | FORMATS = { 17 | logging.DEBUG: green + format + reset, # type: ignore 18 | logging.INFO: normal + format + reset, # type: ignore 19 | logging.WARNING: yellow + format + reset, # type: ignore 20 | logging.ERROR: red + format + reset, # type: ignore 21 | logging.CRITICAL: bold_red + format + reset # type: ignore 22 | } 23 | 24 | def format(self, record): 25 | log_fmt = self.FORMATS.get(record.levelno) 26 | formatter = logging.Formatter(log_fmt) 27 | return formatter.format(record) 28 | 29 | 30 | def setup_logger(log_path): 31 | file_path = log_path.joinpath("html2notion_error.log") 32 | handler = handlers.TimedRotatingFileHandler( 33 | filename=file_path, when='midnight', backupCount=30, encoding='utf-8') 34 | handler.setLevel(logging.DEBUG) 35 | handler.setFormatter(CustomFormatter()) 36 | logger.addHandler(handler) 37 | logger.setLevel(logging.DEBUG) 38 | 39 | logger.debug('Logging debug message') 40 | logger.info('Logging info message') 41 | logger.warning('Logging warning message') 42 | logger.error('Logging error message') 43 | 44 | 45 | def log_only_local(content): 46 | if 'GITHUB_ACTIONS' in os.environ: 47 | return 48 | 49 | from html2notion.utils import logger 50 | logger.info(content) 51 | -------------------------------------------------------------------------------- /html2notion/utils/timeutil.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from . import logger 3 | 4 | 5 | def DateStrToISO8601(date_string: str) -> str: 6 | """Converts a date string to ISO 8601 format. 7 | 8 | Args: 9 | date_string (str): Date string to convert. 10 | 11 | Returns: 12 | str: ISO 8601 formatted date string. 13 | """ 14 | 15 | date_format = "%Y-%m-%d %H:%M:%S %z" 16 | try: 17 | date_obj = datetime.strptime(date_string, date_format).astimezone() 18 | except ValueError: 19 | logger.warning(f"Invalid date string: {date_string}") 20 | return "" 21 | 22 | output_string = date_obj.isoformat() 23 | return output_string 24 | -------------------------------------------------------------------------------- /html2notion/utils/url_process.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | 4 | def is_valid_url(url): 5 | if not isinstance(url, str): 6 | return False 7 | try: 8 | result = urlparse(url) 9 | return all([result.scheme, result.netloc]) and is_valid_port(result.port) 10 | except ValueError: 11 | return False 12 | 13 | 14 | def is_valid_port(port): 15 | if port is None: 16 | return True 17 | return 0 <= port <= 65535 18 | 19 | 20 | if __name__ == '__main__': 21 | print(is_valid_url("https://www.google.com")) # Returns: True 22 | print(is_valid_url("google")) # Returns: False 23 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=54", 4 | "wheel" 5 | ] 6 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.11.2 2 | httpcore>=0.16.3 3 | httpx>=0.23.3 4 | notion-client>=2.0.0 5 | PyYAML>=6.0 6 | aiohttp>=3.8.4 7 | anyio>=3.6.2 8 | cos-python-sdk-v5>=1.9.23 9 | tenacity>=8.2.2 10 | rich>=13.3.4 11 | aiolimiter>=1.0.0 12 | chardet>=5.1.0 13 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = html2notion 3 | version = 0.2.0 4 | author = selfboot 5 | author_email = xuezaigds@gmail.com 6 | description = This tool can accurately convert HTML to Notion notes and is also useful for exporting Evernote notes to Notion. 7 | long_description = file: README.md 8 | long_description_content_type = text/markdown 9 | url = https://github.com/selfboot/html2notion 10 | license_files = LICENSE 11 | classifiers = 12 | Programming Language :: Python :: 3 13 | License :: OSI Approved :: MIT License 14 | Operating System :: OS Independent 15 | 16 | [options] 17 | packages = find: 18 | install_requires = 19 | beautifulsoup4>=4.11.2 20 | httpcore>=0.16.3 21 | httpx>=0.23.3 22 | notion-client>=2.0.0 23 | PyYAML>=6.0 24 | aiohttp>=3.8.4 25 | anyio>=3.6.2 26 | cos-python-sdk-v5>=1.9.23 27 | tenacity>=8.2.2 28 | rich>=13.3.4 29 | aiolimiter>=1.0.0 30 | chardet>=5.1.0 31 | 32 | [options.entry_points] 33 | console_scripts = 34 | html2notion = html2notion.main:main 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() 4 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from html2notion.utils import test_prepare_conf, logger 4 | 5 | 6 | @pytest.fixture(autouse=True, scope='module') 7 | def prepare_conf(): 8 | if 'GITHUB_ACTIONS' not in os.environ: 9 | test_prepare_conf() 10 | logger.info("prepare_conf_fixture") 11 | -------------------------------------------------------------------------------- /tests/test_batchimport.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pytest 3 | import time 4 | import os 5 | from pathlib import Path 6 | from unittest.mock import patch 7 | from tempfile import TemporaryDirectory 8 | from http import HTTPStatus 9 | from html2notion.translate.batch_import import BatchImport 10 | from html2notion.utils import rate_limit 11 | from html2notion.utils.log import log_only_local 12 | 13 | process_once_time = 0.5 14 | 15 | 16 | async def mock_notion_api_request(file_path, *args, **kwargs): 17 | class MockResponse: 18 | def __init__(self, status_code, file_content, elapsed_time): 19 | self.status_code = status_code 20 | self.file_content = file_content 21 | self.elapsed_time = elapsed_time 22 | 23 | def json(self): 24 | return {"result": "success", "file_content": self.file_content, "elapsed_time": self.elapsed_time} 25 | 26 | start_time = time.perf_counter() 27 | content = file_path.read_text() 28 | if 'GITHUB_ACTIONS' not in os.environ: 29 | from html2notion.utils import logger 30 | logger.debug(f"mock_notion_api_request: {file_path}") 31 | await asyncio.sleep(process_once_time) 32 | end_time = time.perf_counter() 33 | elapsed_time = end_time - start_time 34 | return MockResponse(HTTPStatus.OK, content, elapsed_time) 35 | 36 | 37 | async def mock_notion_create_page(notion_data, *args, **kwargs): 38 | async with rate_limit: 39 | await asyncio.sleep(0.01) 40 | log_only_local(f"mock_notion_create_page") 41 | return "succ" 42 | 43 | @pytest.fixture(params=[10, 20]) 44 | def temp_dir_fixture(request): 45 | num_files = request.param 46 | with TemporaryDirectory() as temp_dir: 47 | dir_path = Path(temp_dir) 48 | temp_files = [] 49 | for i in range(num_files): 50 | temp_file = dir_path / f"file{i}.html" 51 | temp_file.write_text(f"file{i}") 52 | temp_files.append(temp_file) 53 | 54 | yield dir_path 55 | 56 | 57 | @pytest.mark.parametrize("concurrent_limit", [2, 5, 10]) 58 | @pytest.mark.asyncio 59 | async def test_batch_process(temp_dir_fixture, concurrent_limit): 60 | dir_path = temp_dir_fixture 61 | start_time = time.perf_counter() 62 | with patch("html2notion.translate.notion_import.NotionImporter.process_file", side_effect=mock_notion_api_request): 63 | batch_processor = BatchImport( 64 | dir_path, concurrent_limit=concurrent_limit) 65 | responses = await batch_processor.process_directory() 66 | 67 | end_time = time.perf_counter() 68 | for file_path, response in zip( 69 | sorted(dir_path.iterdir()), 70 | sorted(responses, key=lambda x: x.json()["file_content"])): 71 | assert response.json()["file_content"] == f"{file_path.stem}" 72 | 73 | total_time = end_time-start_time 74 | sync_time = sum(res.json()["elapsed_time"] for res in responses) 75 | least_time = min(res.json()["elapsed_time"] for res in responses) 76 | log_only_local( 77 | f"total_time: {total_time}, sync_time: {sync_time}, least_time: {least_time}") 78 | assert total_time >= least_time 79 | assert total_time <= sync_time 80 | 81 | 82 | @pytest.mark.parametrize("concurrent_limit", [5, 10, 20]) 83 | @pytest.mark.asyncio 84 | async def test_reqlimit(temp_dir_fixture, concurrent_limit): 85 | dir_path = temp_dir_fixture 86 | start_time = time.perf_counter() 87 | with patch("html2notion.translate.notion_import.NotionImporter.create_new_page", side_effect=mock_notion_create_page): 88 | batch_processor = BatchImport(dir_path, concurrent_limit=concurrent_limit) 89 | responses = await batch_processor.process_directory() 90 | 91 | end_time = time.perf_counter() 92 | total_time = end_time-start_time 93 | num_files = len(list(dir_path.glob('*.html'))) 94 | log_only_local(f"file nums: {num_files}, concurrent {concurrent_limit}, total_time: {total_time}") 95 | # The time deviation within 1 second is acceptable here. 96 | assert (total_time >= num_files / 3 - 1) -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pathlib import Path 3 | from unittest.mock import mock_open, patch 4 | from html2notion.utils import read_config, config 5 | import pytest 6 | 7 | 8 | def test_read_config(): 9 | mock_file_content = """{ 10 | "notion": { 11 | "database_id": "test_db_id", 12 | "api_key": "test_api_key" 13 | }, 14 | "log_path": "/test/log/path" 15 | } 16 | """ 17 | with patch("builtins.open", mock_open(read_data=mock_file_content)): 18 | with patch.object(Path, "is_file", return_value=True): 19 | read_config(Path("test_config.json")) 20 | assert "notion" in config 21 | assert "database_id" in config["notion"] 22 | assert "api_key" in config["notion"] 23 | assert config["notion"]["database_id"] == "test_db_id" 24 | assert config["notion"]["api_key"] == "test_api_key" 25 | config.clear() 26 | 27 | # Testing for missing database_id, notion, or api_key configurations throws an exception 28 | with patch("builtins.open", mock_open(read_data="{}")), patch.object(Path, "is_file", return_value=True), pytest.raises(Exception, match="notion is not set in config.json"): 29 | read_config(Path("test_config.json")) 30 | config.clear() 31 | 32 | with patch("builtins.open", mock_open(read_data="{\"notion\": {}}")), patch.object(Path, "is_file", return_value=True), pytest.raises(Exception, match="database_id is not set in config.json"): 33 | read_config(Path("test_config.json")) 34 | config.clear() 35 | 36 | with patch("builtins.open", mock_open(read_data="{\"notion\": {\"database_id\": \"test_db_id\"}}")), patch.object(Path, "is_file", return_value=True), pytest.raises(Exception, match="api_key is not set in config.json"): 37 | read_config(Path("test_config.json")) 38 | config.clear() 39 | -------------------------------------------------------------------------------- /tests/test_cosupload.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import pytest 3 | import time 4 | import os 5 | import random 6 | import string 7 | from pathlib import Path 8 | from unittest.mock import patch 9 | from tempfile import TemporaryDirectory 10 | from html2notion.translate.batch_import import BatchImport 11 | from html2notion.translate.cos_uploader import TencentCosUploaderAsync 12 | from html2notion.utils.log import log_only_local 13 | 14 | 15 | async def mock_cos_upload_request(file_path, *args, **kwargs): 16 | if 'GITHUB_ACTIONS' not in os.environ: 17 | from html2notion.utils import config 18 | secret_id = config["cos"]["secret_id"] 19 | secret_key = config["cos"]["secret_key"] 20 | region = config["cos"]["region"] 21 | bucket = config["cos"]["bucket"] 22 | else: 23 | secret_id = os.environ['cos_secret_id'] 24 | secret_key = os.environ['cos_secret_key'] 25 | region = os.environ['cos_region'] 26 | bucket = os.environ['cos_bucket'] 27 | 28 | start_time = time.perf_counter() 29 | uploader = TencentCosUploaderAsync(secret_id, secret_key, region, bucket) 30 | loop = asyncio.get_event_loop() 31 | key = f"test_workflow/{file_path.name}" 32 | upload_response = await uploader.upload_file(loop, file_path, key) 33 | log_only_local(f"Upload response: {upload_response}") 34 | 35 | is_exist = await uploader.check_file_exist(loop, key) 36 | end_time = time.perf_counter() 37 | elapsed_time = end_time - start_time 38 | log_only_local(f"Upload elapsed time: {elapsed_time}") 39 | return (is_exist, elapsed_time) 40 | 41 | 42 | @pytest.fixture() 43 | def temp_dir_fixture(): 44 | with TemporaryDirectory() as temp_dir: 45 | dir_path = Path(temp_dir) 46 | temp_files = [] 47 | for i in range(20): 48 | file_size = random.randint(1 * 1024, 1 * 1024 * 1024) 49 | random_text = "".join(random.choices(string.ascii_letters + string.digits, k=file_size)) 50 | 51 | temp_file = dir_path / f"file_{i}.html" 52 | temp_file.write_text(random_text) 53 | temp_files.append(temp_file) 54 | 55 | yield dir_path 56 | 57 | 58 | @pytest.mark.asyncio 59 | async def test_batch_cos_upload(temp_dir_fixture): 60 | concurrent_limit = 5 61 | dir_path = temp_dir_fixture 62 | 63 | start_time = time.perf_counter() 64 | with patch("html2notion.translate.notion_import.NotionImporter.process_file", side_effect=mock_cos_upload_request): 65 | batch_processor = BatchImport( 66 | dir_path, concurrent_limit=concurrent_limit) 67 | responses = await batch_processor.process_directory() 68 | end_time = time.perf_counter() 69 | 70 | for res in responses: 71 | assert (res[0]) 72 | 73 | total_time = end_time - start_time 74 | elapsed_times = sum([res[1] for res in responses]) 75 | least_tiems = min([res[1] for res in responses]) 76 | log_only_local(f"Time: sum: {elapsed_times}, min {least_tiems}, total: {total_time}") 77 | assert (total_time < elapsed_times) 78 | assert (total_time >= least_tiems) 79 | -------------------------------------------------------------------------------- /tests/test_demos.py: -------------------------------------------------------------------------------- 1 | # import glob 2 | import json 3 | import os 4 | from pathlib import Path 5 | from html2notion.translate.html2json import html2json_process 6 | from html2notion.translate.import_stats import ImportStats 7 | from html2notion.translate.html2json_markdown import YinXiangMarkdown_Type 8 | from html2notion.translate.html2json_clipper import YinXiangClipper_Type 9 | from html2notion.utils import logger, config 10 | 11 | 12 | def test_demo_files(): 13 | if 'GITHUB_ACTIONS' in os.environ: 14 | database_id = os.environ['notion_db_id_1'] 15 | else: 16 | database_id = config['notion']['database_id'] 17 | 18 | testcases = [ 19 | ["./demos/yinxiang_markdown.html", YinXiangMarkdown_Type, "./demos/yinxiang_markdown.json"], 20 | ["./demos/yinxiang_clipper.html", YinXiangClipper_Type, "./demos/yinxiang_clipper.json"], 21 | ["./demos/yinxiang_clipper_wx.html", YinXiangClipper_Type, "./demos/yinxiang_clipper_wx.json"], 22 | ] 23 | 24 | for md_file, expect_type, expect_file in testcases: 25 | import_stats = ImportStats() 26 | notion_data, html_type = html2json_process(Path(md_file), import_stats) 27 | 28 | assert html_type == expect_type 29 | with open(expect_file, "r") as f: 30 | content = f.read() 31 | 32 | # Replace the placeholder 33 | content = content.replace("###database_id###", database_id) 34 | expect = json.loads(content) 35 | 36 | # The timezone causes the calculated time to be different, and the check here can be ignored 37 | try: 38 | del expect['properties']['Created']['date']['start'] 39 | del notion_data['properties']['Created']['date']['start'] 40 | except KeyError as e: 41 | pass 42 | 43 | # import dictdiffer 44 | # diff = dictdiffer.diff(notion_data, expect) 45 | # for d in diff: 46 | # logger.debug(f'Diff: {d}') 47 | # aa = json.dumps(notion_data, ensure_ascii=False) 48 | # logger.debug(f'notion_data: {aa}') 49 | assert notion_data ==expect 50 | 51 | -------------------------------------------------------------------------------- /tests/test_log.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from unittest.mock import patch, MagicMock 3 | from pathlib import Path 4 | from html2notion.utils import setup_logger, logger 5 | from html2notion.utils.log import CustomFormatter 6 | 7 | 8 | class MockHandler(MagicMock): 9 | @property 10 | def level(self): 11 | return 0 12 | 13 | 14 | @patch('logging.handlers.TimedRotatingFileHandler', new_callable=MockHandler) 15 | def test_setup_logger(mock_handler): 16 | setup_logger(Path('/fake/path')) 17 | 18 | # Assert TimedRotatingFileHandler is called with the correct arguments 19 | mock_handler.assert_called_once_with( 20 | filename=Path('/fake/path', 'html2notion_error.log'), 21 | when='midnight', backupCount=30, encoding='utf-8' 22 | ) 23 | 24 | # Assert the mock handler instance is set with the correct level and formatter 25 | mock_handler.return_value.setLevel.assert_called_once_with(logging.DEBUG) 26 | assert isinstance(mock_handler.return_value.setFormatter.call_args[0][0], CustomFormatter) 27 | 28 | # Assert logger has the correct level 29 | assert logger.level == logging.DEBUG 30 | 31 | 32 | def test_custom_formatter(): 33 | formatter = CustomFormatter() 34 | 35 | for level, color in [(logging.DEBUG, "\033[92m"), (logging.INFO, "\x1b[38;21m"), 36 | (logging.WARNING, "\x1b[33;21m"), (logging.ERROR, "\x1b[31;21m"), 37 | (logging.CRITICAL, "\x1b[31;1m")]: 38 | record = logging.LogRecord( 39 | name="test", level=level, pathname='test_path', lineno=0, 40 | msg="test message", args=None, exc_info=None 41 | ) 42 | record.filename = "test.py" 43 | record.lineno = 1 44 | 45 | result = formatter.format(record) 46 | expected_format = f"{color}%(asctime)s - %(levelname)s - %(filename)s:%(lineno)d - %(message)s\x1b[0m" 47 | expected_message = logging.Formatter(expected_format).format(record) 48 | 49 | assert result == expected_message 50 | -------------------------------------------------------------------------------- /tests/test_notionexport.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from html2notion.translate.notion_export import NotionExporter 4 | from html2notion.utils import config 5 | 6 | 7 | def test_check_is_delete(): 8 | del_keyvalue = [ 9 | (["id"], "95948188-43cb-451f-b538-e0375368ca96"), 10 | (["parent", "type"], "page_id"), 11 | (["created_by", "object"], "user"), 12 | (["paragraph", "rich_text", 0, "text", "link"], None), 13 | (["paragraph", "rich_text", 0, "annotations", "code"], False), 14 | (["paragraph", "rich_text", 0, "annotations", "color"], "default"), 15 | (["quote", "rich_text", 0, "annotations", "color"], "default"), 16 | (["numbered_list_item", "rich_text", 0, "annotations", "color"], "default"), 17 | (["bulleted_list_item", "rich_text", 0, "annotations", "color"], "default"), 18 | ] 19 | 20 | for (path, value) in del_keyvalue: 21 | assert NotionExporter.check_is_delete(path, value) 22 | 23 | keep_keyvalue = [ 24 | (["type"], "paragraph"), 25 | (["type"], "image"), 26 | (["object"], "block"), 27 | (["paragraph", "rich_text", 0, "text", "link"], "https://selfboot.com"), 28 | (["paragraph", "rich_text", 0, "annotations", "code"], True), 29 | (["paragraph", "rich_text", 0, "annotations", "color"], "red"), 30 | (["quote", "rich_text", 0, "annotations", "color"], "red"), 31 | (["numbered_list_item", "rich_text", 0, "annotations", "color"], "red"), 32 | (["bulleted_list_item", "rich_text", 0, "annotations", "color"], "red"), 33 | (["bulleted_list_item", "rich_text", 0, "annotations", "code"], True), 34 | ] 35 | for (path, value) in keep_keyvalue: 36 | assert not NotionExporter.check_is_delete(path, value) 37 | 38 | 39 | def test_export_blocks(): 40 | if 'GITHUB_ACTIONS' in os.environ: 41 | api_key = os.environ['notion_api_key'] 42 | page_id = os.environ['notion_page_id_1'] 43 | else: 44 | api_key = config['notion']['api_key'] 45 | page_id = config['notion']['page_id'] 46 | 47 | names = locals() 48 | page_sizes = [1, 5, 10, 100] 49 | for i in page_sizes: 50 | names['exporter_' + str(i)] = NotionExporter( 51 | api_key=api_key, 52 | page_id=page_id, 53 | page_size=i) 54 | 55 | names['exporter_' + str(i)].export_blocks() 56 | names['page_json_'+str(i)] = json.dumps(names['exporter_' + str(i)].output_blocks, indent=4, ensure_ascii=False) 57 | 58 | for i in page_sizes[1:]: 59 | if names['page_json_' + str(i)] != names['page_json_' + str(page_sizes[0])]: 60 | assert False 61 | 62 | 63 | if __name__ == '__main__': 64 | if 'GITHUB_ACTIONS' not in os.environ: 65 | from html2notion.utils import config, test_prepare_conf 66 | test_prepare_conf() 67 | 68 | test_check_is_delete() 69 | test_export_blocks() 70 | -------------------------------------------------------------------------------- /tests/test_reqlimit.py: -------------------------------------------------------------------------------- 1 | import json 2 | from html2notion.translate.html2json_yinxiang import Html2JsonYinXiang 3 | from html2notion.translate.import_stats import ImportStats 4 | 5 | 6 | block_max_conent = "Some words" * 200 7 | one_text_obj = { 8 | "plain_text": block_max_conent, 9 | "text": { 10 | "content": block_max_conent 11 | }, 12 | "type": "text" 13 | } 14 | remain_text_obj = { 15 | "plain_text": " more words", 16 | "text": { 17 | "content": " more words" 18 | }, 19 | "type": "text" 20 | } 21 | 22 | 23 | def test_reqlimit(): 24 | paragram_rich_block = [ 25 | { 26 | "object": "block", 27 | "type": "paragraph", 28 | "paragraph": { 29 | "rich_text": [ 30 | one_text_obj, one_text_obj, remain_text_obj 31 | ] 32 | } 33 | } 34 | ] 35 | 36 | paragram_rich_content = f'
    {block_max_conent * 2} more words
    ' 37 | import_stats = ImportStats() 38 | yinxiang = Html2JsonYinXiang(paragram_rich_content, import_stats) 39 | yinxiang.process() 40 | json_obj = yinxiang.children 41 | # print(json.dumps(json_obj, indent=4)) 42 | assert json_obj == paragram_rich_block 43 | 44 | 45 | def test_code_reqlimit(): 46 | code_rich_content = f'
    {block_max_conent * 2} more words
    ' 47 | import_stats = ImportStats() 48 | yinxiang = Html2JsonYinXiang(code_rich_content, import_stats) 49 | yinxiang.process() 50 | json_obj = yinxiang.children 51 | # print(json.dumps(json_obj, indent=4)) 52 | 53 | split_block_result = [ 54 | { 55 | "object": "block", 56 | "type": "code", 57 | "code": { 58 | "rich_text": [ 59 | one_text_obj, one_text_obj, remain_text_obj 60 | ], 61 | "language": "plain text" 62 | } 63 | } 64 | ] 65 | assert json_obj == split_block_result 66 | 67 | 68 | if __name__ == '__main__': 69 | # test_reqlimit() 70 | test_code_reqlimit() 71 | -------------------------------------------------------------------------------- /tests/test_util.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from html2notion.utils import DateStrToISO8601, is_valid_url 3 | 4 | 5 | def test_date_to_ios8601(): 6 | valid_date_pair = ["2018-09-20 10:30:36 +0000", "2023-05-12 03:49:56 +0000"] 7 | 8 | for date_string in valid_date_pair: 9 | expect = datetime.strptime(date_string, "%Y-%m-%d %H:%M:%S %z").astimezone().isoformat() 10 | assert DateStrToISO8601(date_string) 11 | 12 | invalid_date_pair = ["2018-09-20 10:30", "2018-09-20 10:30:36", "2018-09-20 10:30:36+0800"] 13 | for date_string in invalid_date_pair: 14 | assert DateStrToISO8601(date_string) == "" 15 | 16 | 17 | def test_is_valid_url(): 18 | valid_urls = [ 19 | "http://www.example.com", 20 | "https://www.example.com", 21 | "ftp://www.example.com", 22 | "http://localhost", 23 | "http://127.0.0.1", 24 | "http://example.com/path?query#fragment", 25 | ] 26 | 27 | invalid_urls = [ 28 | "example.com", 29 | "www.example.com", 30 | "http://", 31 | "http:///example.com", 32 | "http://example.com:80:80", # Two port numbers 33 | None, 34 | 123, # Non-string input 35 | "", 36 | ] 37 | 38 | for url in valid_urls: 39 | assert is_valid_url(url) == True, f"Expected {url} to be valid" 40 | 41 | for url in invalid_urls: 42 | assert is_valid_url(url) == False, f"Expected {url} to be invalid" 43 | 44 | 45 | if __name__ == '__main__': 46 | test_date_to_ios8601() 47 | --------------------------------------------------------------------------------