├── .gitignore
├── Examples
    ├── block_dump.py
    ├── db_dump.py
    └── page_dump.py
├── LICENSE
├── MANIFEST.in
├── NotionDump
    ├── Dump
    │   ├── __init__.py
    │   ├── block.py
    │   ├── database.py
    │   ├── dump.py
    │   └── page.py
    ├── Notion
    │   ├── Buffer.py
    │   ├── Notion.py
    │   └── __init__.py
    ├── Parser
    │   ├── __init__.py
    │   ├── base_parser.py
    │   ├── block_parser.py
    │   ├── database_parser.py
    │   └── mix_parser.py
    ├── __init__.py
    └── utils
    │   ├── __init__.py
    │   ├── common_op.py
    │   ├── content_format.py
    │   └── internal_var.py
├── README.md
├── README_zh.md
├── img
    ├── get_data.png
    └── parser_structure.png
├── requirements.txt
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # pycharm
132 | .idea/
133 | 
134 | # tempfile
135 | Tests/.tmp/
136 | Examples/.tmp/
137 | 


--------------------------------------------------------------------------------
/Examples/block_dump.py:
--------------------------------------------------------------------------------
 1 | # author: delta1037
 2 | # Date: 2022/01/11
 3 | # mail:geniusrabbit@qq.com
 4 | 
 5 | import logging
 6 | 
 7 | import NotionDump
 8 | from NotionDump.Dump.dump import Dump
 9 | from NotionDump.Notion.Notion import NotionQuery
10 | from NotionDump.utils import common_op
11 | 
import os

# Notion integration token handed to NotionQuery(token=...).
# NOTE(review): a literal token is committed to source control here. Supply your own
# through the NOTION_TOKEN environment variable; the literal is kept only as a
# backward-compatible fallback and should be revoked/rotated.
TOKEN_TEST = os.environ.get("NOTION_TOKEN", "secret_WRLJ9xyEawNxzRhVHVWfciTl9FAyNCd29GMUvr2hQD4")
# Id dumped by test_get_table_block().
TABLE_ID = "13b914160ef740dcb64e55c5393762fa"
# Id dumped by test_get_rer_list().
RER_LIST_ID = "d32db4693409464b9981caec9ef11974"
15 | 
16 | 
# Page table example
def test_get_table_block(query, export_child=True):
    """Dump the block identified by TABLE_ID and save the dump result as JSON.

    Args:
        query: query handle, forwarded to ``Dump`` as ``query_handle``.
        export_child: forwarded to ``Dump`` as ``export_child_pages``.
    """
    dump_options = {
        "dump_id": TABLE_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_BLOCK,
    }
    # dump_to_file() writes the parsed content to a file; its return value is
    # what gets stored in the JSON file below.
    dump_result = Dump(**dump_options).dump_to_file()

    print("json output to block_table_parser_result")
    common_op.save_json_to_file(
        handle=dump_result,
        json_name=".tmp/block_table_parser_result.json",
    )
33 | 
34 | 
# Recursive (nested) list example
def test_get_rer_list(query, export_child=True):
    """Dump the block identified by RER_LIST_ID and save the dump result as JSON.

    Args:
        query: query handle, forwarded to ``Dump`` as ``query_handle``.
        export_child: forwarded to ``Dump`` as ``export_child_pages``.
    """
    dump_options = {
        "dump_id": RER_LIST_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_BLOCK,
    }
    # dump_to_file() writes the parsed content to a file; its return value is
    # what gets stored in the JSON file below.
    dump_result = Dump(**dump_options).dump_to_file()

    print("json output to block_list_parser_result")
    common_op.save_json_to_file(
        handle=dump_result,
        json_name=".tmp/block_list_parser_result.json",
    )
51 | 
52 | 
if __name__ == '__main__':
    query_handle = NotionQuery(token=TOKEN_TEST)
    # A constructor call never evaluates to None; NotionQuery reports a failed
    # initialisation by leaving its ``client`` attribute as None instead.
    if query_handle.client is None:
        # logging.error, not logging.exception: there is no active exception here.
        logging.error("query handle init error")
        raise SystemExit(-1)

    # Block parsing example (table block)
    # test_get_table_block(query_handle, export_child=False)
    test_get_table_block(query_handle, export_child=True)

    # Block parsing example (nested list)
    # test_get_rer_list(query_handle, export_child=False)
    test_get_rer_list(query_handle, export_child=True)
66 | 


--------------------------------------------------------------------------------
/Examples/db_dump.py:
--------------------------------------------------------------------------------
 1 | # author: delta1037
 2 | # Date: 2022/01/11
 3 | # mail:geniusrabbit@qq.com
 4 | 
 5 | import logging
 6 | 
 7 | import NotionDump
 8 | from NotionDump.Dump.database import Database
 9 | from NotionDump.Dump.dump import Dump
10 | from NotionDump.Notion.Notion import NotionQuery
11 | from NotionDump.utils import common_op
12 | 
import os

# Notion integration token handed to NotionQuery(token=...).
# NOTE(review): a literal token is committed to source control here. Supply your own
# through the NOTION_TOKEN environment variable; the literal is kept only as a
# backward-compatible fallback and should be revoked/rotated.
TOKEN_TEST = os.environ.get("NOTION_TOKEN", "secret_ALjbBRGaZcagEjPtL1c2F139steBXjr8Fc8uQso4YLV")
# Id of the database dumped by every example in this script.
DB_TABLE_INLINE_ID = "0b1f524ad42b420f889a2c6adb9b8c92"
# Switch the library-wide dump mode to debug for this script.
NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG
16 | 
17 | 
# Database example: dump through the Database class, then print it as a dictionary
def test_db_table_inline_parser_dic(query):
    """Dump DB_TABLE_INLINE_ID with ``Database`` directly and print ``dump_to_dic()``.

    Args:
        query: query handle, forwarded to ``Database`` as ``query_handle``.
    """
    database = Database(
        database_id=DB_TABLE_INLINE_ID,
        query_handle=query,
        export_child_pages=False,
    )
    # NOTE(review): Database.dump_to_file() returns the name of the dumped file
    # (or "" on failure), so that string is what is written to the JSON below;
    # confirm whether the page details were meant to be saved instead.
    dump_result = database.dump_to_file()

    print("json output to db_parser_result")
    common_op.save_json_to_file(
        handle=dump_result,
        json_name=".tmp/db_parser_result.json",
    )
    print(database.dump_to_dic())
34 | 
35 | 
# Database example: dump through Dump with the default database parser type
def test_db_table_inline_parser_csv(query, export_child=False):
    """Dump DB_TABLE_INLINE_ID as a database table and save the dump result as JSON.

    Args:
        query: query handle, forwarded to ``Dump`` as ``query_handle``.
        export_child: forwarded to ``Dump`` as ``export_child_pages``.
    """
    dump_options = {
        "dump_id": DB_TABLE_INLINE_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_DB_TABLE,
    }
    # dump_to_file() writes the parsed content to a file; its return value is
    # what gets stored in the JSON file below.
    dump_result = Dump(**dump_options).dump_to_file()

    print("json output to db_parser_result")
    common_op.save_json_to_file(
        handle=dump_result,
        json_name=".tmp/db_parser_result.json",
    )
52 | 
53 | 
def test_db_table_inline_parser_md(query, export_child=False):
    """Dump DB_TABLE_INLINE_ID with the Markdown database parser and save the result.

    Args:
        query: query handle, forwarded to ``Dump`` as ``query_handle``.
        export_child: forwarded to ``Dump`` as ``export_child_pages``.
    """
    dump_options = {
        "dump_id": DB_TABLE_INLINE_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_DB_TABLE,
        "db_parser_type": NotionDump.PARSER_TYPE_MD,
    }
    # dump_to_file() writes the parsed content to a file; its return value is
    # what gets stored in the JSON file below.
    dump_result = Dump(**dump_options).dump_to_file()

    print("json output to db_parser_result")
    common_op.save_json_to_file(
        handle=dump_result,
        json_name=".tmp/db_parser_result.json",
    )
70 | 
71 | 
if __name__ == '__main__':
    query_handle = NotionQuery(token=TOKEN_TEST)
    # A constructor call never evaluates to None; NotionQuery reports a failed
    # initialisation by leaving its ``client`` attribute as None instead.
    if query_handle.client is None:
        # logging.error, not logging.exception: there is no active exception here.
        logging.error("query handle init error")
        raise SystemExit(-1)

    # Store the database to a CSV file
    # test_db_table_inline_parser_csv(query_handle, True)

    # Store the database to a Markdown file
    test_db_table_inline_parser_md(query_handle, True)

    # Store the database to a dictionary
    # test_db_table_inline_parser_dic(query_handle)
86 | 
87 | 


--------------------------------------------------------------------------------
/Examples/page_dump.py:
--------------------------------------------------------------------------------
 1 | # author: delta1037
 2 | # Date: 2022/01/11
 3 | # mail:geniusrabbit@qq.com
 4 | 
 5 | import logging
 6 | 
 7 | import NotionDump
 8 | from NotionDump.Dump.dump import Dump
 9 | from NotionDump.Notion.Notion import NotionQuery
10 | from NotionDump.utils import common_op
11 | 
import os

# Notion integration token handed to NotionQuery(token=...).
# NOTE(review): a literal token is committed to source control here. Supply your own
# through the NOTION_TOKEN environment variable; the literal is kept only as a
# backward-compatible fallback and should be revoked/rotated.
TOKEN_TEST = os.environ.get("NOTION_TOKEN", "secret_ALjbBRGaZcagEjPtL1c2F139steBXjr8Fc8uQso4YLV")
# Id of the page dumped by test_page_parser().
PAGE_MIX_ID = "921e6b4ea44046c6935bcb2c69453196"
# Uncomment to switch the library-wide dump mode to debug.
# NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG
15 | 
16 | 
# Page example: dump the page PAGE_MIX_ID through Dump
def test_page_parser(query, export_child=False, db_parser_type=NotionDump.PARSER_TYPE_MD):
    """Dump the page PAGE_MIX_ID and save the dump result as JSON.

    Args:
        query: query handle, forwarded to ``Dump`` as ``query_handle``.
        export_child: forwarded to ``Dump`` as ``export_child_pages``.
        db_parser_type: forwarded to ``Dump`` unchanged.
    """
    dump_options = {
        "dump_id": PAGE_MIX_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_PAGE,
        "db_parser_type": db_parser_type,
    }
    # dump_to_file() writes the parsed content to a file; its return value is
    # what gets stored in the JSON file below.
    dump_result = Dump(**dump_options).dump_to_file()

    print("json output to page_parser_result")
    common_op.save_json_to_file(
        handle=dump_result,
        json_name=".tmp/page_parser_result.json",
    )
34 | 
35 | 
def test_page_retrieve(query: NotionQuery):
    """Fetch one hard-coded page through ``query.retrieve_page`` and print the result."""
    # Page-information retrieval example
    print(query.retrieve_page("0cee7c12f04c4157bcc025355adf2312"))
40 | 
41 | 
if __name__ == '__main__':
    query_handle = NotionQuery(token=TOKEN_TEST)
    # A constructor call never evaluates to None; NotionQuery reports a failed
    # initialisation by leaving its ``client`` attribute as None instead.
    if query_handle.client is None:
        # logging.error, not logging.exception: there is no active exception here.
        logging.error("query handle init error")
        raise SystemExit(-1)

    # Page-information retrieval example
    # test_page_retrieve(query_handle)

    # Page parsing example, recursive (child pages exported)
    test_page_parser(query_handle, True)

    # Page parsing example, non-recursive
    # test_page_parser(query_handle, False)
56 | 
57 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 delta1037
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | recursive-include examples *.txt *.py
3 | prune examples/sample?/build
4 | 


--------------------------------------------------------------------------------
/NotionDump/Dump/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Dump/__init__.py


--------------------------------------------------------------------------------
/NotionDump/Dump/block.py:
--------------------------------------------------------------------------------
 1 | # author: delta1037
 2 | # Date: 2022/01/08
 3 | # mail:geniusrabbit@qq.com
 4 | import NotionDump
 5 | from NotionDump.Dump.page import Page
 6 | from NotionDump.Notion.Notion import NotionQuery
 7 | from NotionDump.utils import internal_var
 8 | 
 9 | 
# Block content dump
class Block:
    """Dump a single block by delegating every operation to a ``Page`` handle.

    The (dash-stripped) block id is passed to ``Page`` as its ``page_id``; all
    ``dump_*`` methods simply forward to that handle.
    """

    def __init__(
            self,
            block_id,
            query_handle: NotionQuery,
            export_child_pages=False,
            page_parser_type=NotionDump.PARSER_TYPE_MD,
            db_parser_type=NotionDump.PARSER_TYPE_PLAIN
    ):
        """Store the options and build the underlying ``Page`` handle.

        Args:
            block_id: block id; any '-' characters are removed before use.
            query_handle: query object forwarded to ``Page``.
            export_child_pages: whether child pages are exported as well.
            page_parser_type: parser type forwarded to ``Page``.
            db_parser_type: parser type forwarded to ``Page``.
        """
        normalized_id = block_id.replace('-', '')

        self.block_id = normalized_id
        self.query_handle = query_handle
        self.export_child_page = export_child_pages
        self.page_parser_type = page_parser_type
        self.db_parser_type = db_parser_type

        # The Page handle does the real work.
        self.page_handle = Page(
            page_id=normalized_id,
            query_handle=query_handle,
            export_child_pages=export_child_pages,
            page_parser_type=page_parser_type,
            db_parser_type=db_parser_type,
        )

    @staticmethod
    def get_pages_detail():
        """Return the module-wide ``internal_var.PAGE_DIC`` object (not a copy)."""
        return internal_var.PAGE_DIC

    def dump_to_file(self, file_name=None):
        """Forward to ``Page.dump_to_file`` and return its result.

        When recursing, the first block is stored on its own as the main page.
        """
        return self.page_handle.dump_to_file(file_name=file_name)

    def dump_to_db(self):
        """Forward to ``Page.dump_to_db`` and return its result."""
        return self.page_handle.dump_to_db()

    def dump_to_json(self, json_name=None):
        """Forward to ``Page.dump_to_json`` (raw JSON output, a testing aid)."""
        return self.page_handle.dump_to_json(json_name=json_name)
54 | 


--------------------------------------------------------------------------------
/NotionDump/Dump/database.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/08
  3 | # mail:geniusrabbit@qq.com
  4 | 
  5 | import os
  6 | import shutil
  7 | 
  8 | import NotionDump
  9 | from NotionDump.Notion.Notion import NotionQuery
 10 | from NotionDump.Parser.mix_parser import MixParser
 11 | from NotionDump.utils import common_op, internal_var
 12 | 
 13 | 
 14 | class Database:
 15 |     # 初始化
 16 |     def __init__(
 17 |             self,
 18 |             database_id,
 19 |             query_handle: NotionQuery,
 20 |             export_child_pages=False,
 21 |             page_parser_type=NotionDump.PARSER_TYPE_MD,
 22 |             db_parser_type=NotionDump.PARSER_TYPE_PLAIN
 23 |     ):
 24 |         self.database_id = database_id.replace('-', '')
 25 |         self.query_handle = query_handle
 26 |         # 是否导出子页面
 27 |         self.export_child_page = export_child_pages
 28 |         self.page_parser_type = page_parser_type
 29 |         self.db_parser_type = db_parser_type
 30 | 
 31 |         # 构造解析器
 32 |         self.mix_parser = MixParser(
 33 |             mix_id=self.database_id,
 34 |             query_handle=self.query_handle,
 35 |             export_child_pages=self.export_child_page,
 36 |             page_parser_type=self.page_parser_type,
 37 |             db_parser_type=self.db_parser_type
 38 |         )
 39 | 
 40 |         # 创建临时文件夹
 41 |         self.tmp_dir = NotionDump.TMP_DIR
 42 |         if not os.path.exists(self.tmp_dir):
 43 |             os.mkdir(self.tmp_dir)
 44 | 
 45 |     # show_child_page
 46 |     @staticmethod
 47 |     def get_pages_detail():
 48 |         return internal_var.PAGE_DIC
 49 | 
 50 |     # 获取到所有的数据库数据(CSV格式)(数据库导出均是CSV)
 51 |     def dump_to_file(self, file_name=None, col_name_list=None):
 52 |         # 解析到临时文件中
 53 |         tmp_filename = self.mix_parser.mix_parser(root_id=self.database_id, id_type="database", col_name_list=col_name_list)
 54 |         if tmp_filename is None:
 55 |             common_op.debug_log("page parser fail, id=" + self.database_id, level=NotionDump.DUMP_MODE_DEFAULT)
 56 |             return ""
 57 | 
 58 |         if file_name is not None:
 59 |             shutil.copyfile(tmp_filename, file_name)
 60 |             common_op.debug_log("copy " + tmp_filename + " to " + file_name, level=NotionDump.DUMP_MODE_DEFAULT)
 61 |             return file_name
 62 | 
 63 |         return tmp_filename
 64 | 
 65 |     def dump_to_db(self, col_name_list=None, db_q_filter="{}", db_q_sorts="[]"):
 66 |         # 从配置文件中获取数据库配置，打开数据库，并将csv文件写入到数据库中
 67 |         db_json = self.query_handle.query_database(
 68 |             database_id=self.database_id,
 69 |             db_q_filter=db_q_filter,
 70 |             db_q_sorts=db_q_sorts)
 71 |         if db_json is None:
 72 |             return ""
 73 | 
 74 |         # TODO 将CSV文件写入到数据库；调用SQL中的notion2sql提供的接口
 75 |         return
 76 | 
 77 |     # 源文件，直接输出成json; 辅助测试使用
 78 |     def dump_to_json(self, json_name=None, db_q_filter="{}", db_q_sorts="[]"):
 79 |         db_json = self.query_handle.query_database(
 80 |             database_id=self.database_id,
 81 |             db_q_filter=db_q_filter,
 82 |             db_q_sorts=db_q_sorts)
 83 |         if db_json is None:
 84 |             return ""
 85 | 
 86 |         if json_name is None:
 87 |             json_name = self.tmp_dir + self.database_id + ".json"
 88 |         common_op.save_json_to_file(db_json, json_name)
 89 | 
 90 |     def dump_to_dic(self, col_name_list=None, db_q_filter="{}", db_q_sorts="[]"):
 91 |         # 由于数据库修改属性不会使数据库的修改时间改变，所以这里采用强制更新的方式
 92 |         db_json = self.query_handle.query_database(
 93 |             database_id=self.database_id,
 94 |             db_q_filter=db_q_filter,
 95 |             db_q_sorts=db_q_sorts,
 96 |             force_update=True
 97 |         )
 98 |         if db_json is None:
 99 |             common_op.debug_log("query database get nothing, id=" + self.database_id,
100 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
101 |             return ""
102 | 
103 |         return self.mix_parser.database_collection(
104 |             json_handle=db_json,
105 |             json_type="database",
106 |             col_name_list=col_name_list
107 |         )
108 | 


--------------------------------------------------------------------------------
/NotionDump/Dump/dump.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/08
  3 | # mail:geniusrabbit@qq.com
  4 | import copy
  5 | 
  6 | import NotionDump
  7 | from NotionDump.Dump.block import Block
  8 | from NotionDump.Dump.database import Database
  9 | from NotionDump.Dump.page import Page
 10 | from NotionDump.Notion.Notion import NotionQuery
 11 | from NotionDump.utils import internal_var, common_op
 12 | 
 13 | 
 14 | class Dump:
 15 |     def __init__(
 16 |             self,
 17 |             dump_id,
 18 |             query_handle: NotionQuery,
 19 |             export_child_pages=False,
 20 |             page_parser_type=NotionDump.PARSER_TYPE_MD,
 21 |             db_parser_type=NotionDump.PARSER_TYPE_PLAIN,
 22 |             dump_type=NotionDump.DUMP_TYPE_PAGE
 23 |     ):
 24 |         self.dump_id = dump_id.replace('-', '')
 25 |         self.query_handle = query_handle
 26 |         # 是否导出子页面
 27 |         self.export_child_page = export_child_pages
 28 |         self.page_parser_type = page_parser_type
 29 |         self.db_parser_type = db_parser_type
 30 |         self.dump_type = dump_type
 31 | 
 32 |         self.handle = None
 33 |         if dump_type == NotionDump.DUMP_TYPE_PAGE:
 34 |             self.handle = Page(
 35 |                 page_id=self.dump_id,
 36 |                 query_handle=self.query_handle,
 37 |                 export_child_pages=self.export_child_page,
 38 |                 page_parser_type=self.page_parser_type,
 39 |                 db_parser_type=self.db_parser_type
 40 |             )
 41 |         elif dump_type == NotionDump.DUMP_TYPE_BLOCK:
 42 |             self.handle = Block(
 43 |                 block_id=self.dump_id,
 44 |                 query_handle=self.query_handle,
 45 |                 export_child_pages=self.export_child_page,
 46 |                 page_parser_type=self.page_parser_type,
 47 |                 db_parser_type=self.db_parser_type
 48 |             )
 49 |         elif dump_type == NotionDump.DUMP_TYPE_DB_TABLE:
 50 |             self.handle = Database(
 51 |                 database_id=self.dump_id,
 52 |                 query_handle=self.query_handle,
 53 |                 export_child_pages=self.export_child_page,
 54 |                 page_parser_type=self.page_parser_type,
 55 |                 db_parser_type=self.db_parser_type
 56 |             )
 57 |         else:
 58 |             common_op.debug_log("unknown dump type:" + str(self.dump_type), level=NotionDump.DUMP_MODE_DEFAULT)
 59 | 
 60 |     # show_child_page
 61 |     @staticmethod
 62 |     def __get_pages_detail():
 63 |         return internal_var.PAGE_DIC
 64 | 
 65 |     # 获取到所有的BLOCK数据
 66 |     def dump_to_file(self, file_name=None):
 67 |         if self.handle is None:
 68 |             common_op.debug_log("dump init fail", level=NotionDump.DUMP_MODE_DEFAULT)
 69 |             return ""
 70 |         # 递归时第一个block单独作为一个main page存放
 71 |         self.handle.dump_to_file(file_name=file_name)
 72 |         self.query_handle.safe_save()
 73 | 
 74 |         pages_detail = copy.deepcopy(internal_var.PAGE_DIC)
 75 |         internal_var.PAGE_DIC = {}
 76 |         return pages_detail
 77 | 
 78 |     def dump_to_db(self):
 79 |         if self.handle is None:
 80 |             common_op.debug_log("dump init fail", level=NotionDump.DUMP_MODE_DEFAULT)
 81 |             return ""
 82 |         # 将内容导出到数据库
 83 |         self.handle.dump_to_db()
 84 |         self.query_handle.safe_save()
 85 | 
 86 |         pages_detail = copy.deepcopy(internal_var.PAGE_DIC)
 87 |         internal_var.PAGE_DIC = {}
 88 |         return pages_detail
 89 | 
 90 |     # 源文件，直接输出成json; 辅助测试使用
 91 |     def dump_to_json(self, json_name=None):
 92 |         if self.handle is None:
 93 |             common_op.debug_log("dump init fail", level=NotionDump.DUMP_MODE_DEFAULT)
 94 |             return ""
 95 | 
 96 |         self.handle.dump_to_json(json_name=json_name)
 97 |         self.query_handle.safe_save()
 98 | 
 99 |         pages_detail = copy.deepcopy(internal_var.PAGE_DIC)
100 |         internal_var.PAGE_DIC = {}
101 |         return pages_detail
102 | 


--------------------------------------------------------------------------------
/NotionDump/Dump/page.py:
--------------------------------------------------------------------------------
 1 | # author: delta1037
 2 | # Date: 2022/01/08
 3 | # mail:geniusrabbit@qq.com
 4 | import os
 5 | import shutil
 6 | 
 7 | import NotionDump
 8 | from NotionDump.Notion.Notion import NotionQuery
 9 | from NotionDump.Parser.mix_parser import MixParser
10 | from NotionDump.utils import common_op
11 | from NotionDump.utils import internal_var
12 | 
13 | 
class Page:
    """Dump one Notion page through a ``MixParser``.

    Construction also makes sure the temporary directory ``NotionDump.TMP_DIR``
    exists.
    """

    def __init__(
            self,
            page_id,
            query_handle: NotionQuery,
            export_child_pages=False,
            page_parser_type=NotionDump.PARSER_TYPE_MD,
            db_parser_type=NotionDump.PARSER_TYPE_PLAIN
    ):
        """Store the options, build the parser and create the temp directory.

        Args:
            page_id: page id; any '-' characters are removed before use.
            query_handle: query object used for all API access.
            export_child_pages: whether child pages are exported as well.
            page_parser_type: parser type forwarded to ``MixParser``.
            db_parser_type: parser type forwarded to ``MixParser``.
        """
        self.page_id = page_id.replace('-', '')
        self.query_handle = query_handle
        # Whether child pages are exported too.
        self.export_child_page = export_child_pages
        self.page_parser_type = page_parser_type
        self.db_parser_type = db_parser_type

        # Build the parser.
        self.mix_parser = MixParser(
            mix_id=self.page_id,
            query_handle=self.query_handle,
            export_child_pages=self.export_child_page,
            page_parser_type=self.page_parser_type,
            db_parser_type=self.db_parser_type
        )

        # Create the temp directory. makedirs(exist_ok=True) avoids the
        # check-then-create race and also creates missing parent directories.
        self.tmp_dir = NotionDump.TMP_DIR
        os.makedirs(self.tmp_dir, exist_ok=True)

    @staticmethod
    def get_pages_detail():
        """Return the module-wide ``internal_var.PAGE_DIC`` object (not a copy)."""
        return internal_var.PAGE_DIC

    def dump_to_file(self, file_name=None):
        """Parse the page into a file and return that file's name.

        Args:
            file_name: optional destination; when given, the parser's temporary
                file is copied there and ``file_name`` is returned.

        Returns:
            ``file_name`` if given, otherwise the temporary file name produced
            by the parser; ``""`` when the parser returned ``None``.
        """
        # Parse into a temporary file first.
        tmp_md_filename = self.mix_parser.mix_parser(root_id=self.page_id, id_type="block")
        if tmp_md_filename is None:
            common_op.debug_log("page parser fail, id="+self.page_id, level=NotionDump.DUMP_MODE_DEFAULT)
            return ""

        if file_name is None:
            return tmp_md_filename

        shutil.copyfile(tmp_md_filename, file_name)
        common_op.debug_log("copy " + tmp_md_filename + " to " + file_name, level=NotionDump.DUMP_MODE_DEFAULT)
        return file_name

    def dump_to_db(self):
        """Fetch the page's child blocks; writing them to a SQL database is not implemented.

        Returns:
            Always ``None``.
        """
        page_json = self.query_handle.retrieve_block_children(self.page_id)
        if page_json is None:
            return None

        # TODO: write the Markdown file into the database via the notion2sql interface.
        return

    def dump_to_json(self, json_name=None):
        """Save the page's raw child-block JSON to a file (a testing aid).

        Args:
            json_name: destination file; defaults to ``<page_id>.json`` inside
                the temp directory.

        Returns:
            Always ``None``; nothing is written when the query returned ``None``.
        """
        page_json = self.query_handle.retrieve_block_children(self.page_id)
        if page_json is None:
            return None

        if json_name is None:
            # os.path.join gives the same path as plain concatenation when the
            # temp directory ends with a separator, and a correct one when it
            # does not.
            json_name = os.path.join(self.tmp_dir, self.page_id + ".json")
        common_op.save_json_to_file(page_json, json_name)
        return
84 | 


--------------------------------------------------------------------------------
/NotionDump/Notion/Buffer.py:
--------------------------------------------------------------------------------
 1 | from time import strftime, localtime
 2 | 
 3 | import NotionDump
 4 | from NotionDump.utils import common_op
 5 | 
 6 | 
 7 | class Buffer:
 8 |     def __init__(self):
 9 |         self.base_time = strftime("%Y-%m-%d %H:%M:%S", localtime())
10 |         self.buffer_map = common_op.load_json_from_file(NotionDump.BUFFER_FILE)
11 |         if self.buffer_map is None:
12 |             self.buffer_map = {}
13 | 
14 |     def save_buffer(self):
15 |         common_op.debug_log("save buffer file")
16 |         common_op.save_json_to_file(self.buffer_map, NotionDump.BUFFER_FILE)
17 | 
18 |     def add_buffer(self, page_id, page_time, id_type="page"):
19 |         if page_id not in self.buffer_map:
20 |             common_op.debug_log("[BUFFER] add_buffer, new, id=" + page_id + ", type=" + id_type)
21 |             self.buffer_map[page_id] = {
22 |                 "type": id_type,
23 |                 # 页面上次编辑时间
24 |                 "last_edited_time": page_time,
25 |                 # 页面上次下载时间
26 |                 "update_time": None,
27 |                 # 页面脏标志
28 |                 "dirty": True
29 |             }
30 |         else:
31 |             if page_time != self.buffer_map[page_id]["last_edited_time"]:
32 |                 # 页面编辑过，需要重新下载
33 |                 common_op.debug_log("[BUFFER] add_buffer, update, id=" + page_id + ", type=" + id_type)
34 |                 self.buffer_map[page_id]["dirty"] = True
35 |                 self.buffer_map[page_id]["last_edited_time"] = page_time
36 | 
37 |     def update_buffer(self, page_id):
38 |         # 文件已重新下载，设置更新时间
39 |         if page_id in self.buffer_map:
40 |             common_op.debug_log("[BUFFER] update_buffer, id=" + page_id)
41 |             self.buffer_map[page_id]["update_time"] = strftime("%Y-%m-%d %H:%M:%S", localtime())
42 |             self.buffer_map[page_id]["dirty"] = False
43 | 
44 |     def select_buffer(self, page_id, is_child=False):
45 |         # 查看缓存中是否命中，命中返回True（说明缓存有效），没命中返回False（说明缓存文件无效，需要重新下载）
46 |         if page_id not in self.buffer_map:
47 |             common_op.debug_log("[BUFFER] select_buffer, id=" + page_id + ", not exist")
48 |             return True
49 |         else:
50 |             if is_child:
51 |                 if self.buffer_map[page_id]["update_time"] >= self.base_time:
52 |                     # 子块所在的页面刚更新过，子块也要随之更新
53 |                     common_op.debug_log("[BUFFER] select_buffer, child update, id=" + page_id)
54 |                     return True
55 |                 else:
56 |                     common_op.debug_log("[BUFFER] select_buffer, child old, id=" + page_id)
57 |                     return self.buffer_map[page_id]["dirty"]
58 |             else:
59 |                 common_op.debug_log("[BUFFER] select_buffer, main, id=" + page_id)
60 |                 return self.buffer_map[page_id]["dirty"]
61 | 


--------------------------------------------------------------------------------
/NotionDump/Notion/Notion.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/10
  3 | # mail:geniusrabbit@qq.com
  4 | import os
  5 | from time import sleep, time
  6 | import urllib.request
  7 | from time import time, sleep
  8 | from urllib.error import URLError
  9 | from urllib.parse import quote
 10 | from notion_client import Client, AsyncClient
 11 | from notion_client import APIErrorCode, APIResponseError
 12 | 
 13 | import NotionDump
 14 | from NotionDump.Notion.Buffer import Buffer
 15 | from NotionDump.utils import common_op, internal_var
 16 | 
 17 | 
 18 | class NotionQuery:
 19 |     def __init__(self, token, client_handle=None, async_api=False):
 20 |         self.token = token
 21 |         if client_handle is None and token is not None:
 22 |             # 有的token话就初始化一下
 23 |             if not async_api:
 24 |                 self.client = Client(auth=self.token)
 25 |             else:
 26 |                 self.client = AsyncClient(auth=self.token)
 27 |         else:
 28 |             # 没有token，传进来handle就用，没传就不用
 29 |             self.client = client_handle
 30 | 
 31 |         if self.client is None:
 32 |             common_op.debug_log("notion query init fail", level=NotionDump.DUMP_MODE_DEFAULT)
 33 | 
 34 |         # 创建临时文件夹
 35 |         self.tmp_dir = NotionDump.TMP_DIR
 36 |         if not os.path.exists(self.tmp_dir):
 37 |             os.mkdir(self.tmp_dir)
 38 | 
 39 |         self.last_call_time = None
 40 |         self.friendly_time = internal_var.FRIENDLY_USE_API
 41 | 
 42 |         self.buffer = Buffer()
 43 | 
 44 |     def safe_save(self):
 45 |         self.buffer.save_buffer()
 46 | 
 47 |     def __friendly_use_api(self):
 48 |         now_time = time()
 49 |         # 睡眠时间 = 间隔时间 - 函数执行时间
 50 |         if self.last_call_time is None:
 51 |             func_exec_ms = self.friendly_time
 52 |         else:
 53 |             func_exec_ms = int(round(now_time * 1000)) - int(round(self.last_call_time * 1000))
 54 |         sleep_ms = self.friendly_time - func_exec_ms
 55 |         while sleep_ms > 0:
 56 |             # 如果需要睡眠
 57 |             if sleep_ms > 100:
 58 |                 sleep(0.1)
 59 |             else:
 60 |                 sleep(sleep_ms / 1000.0)
 61 |             # 按照每次100ms累计
 62 |             common_op.debug_log("wait for server response..." + str(sleep_ms) + "ms", level=NotionDump.DUMP_MODE_DEFAULT)
 63 |             sleep_ms -= 100
 64 |         # 更新上次执行时间
 65 |         self.last_call_time = time()
 66 | 
 67 |     # 获取该块下所有的子块
 68 |     def retrieve_block_children(self, block_id, parent_id=None, page_size=100, force_update=False):
 69 |         # 添加缓存系统
 70 |         if not force_update and NotionDump.USE_BUFFER:
 71 |             if parent_id is not None:
 72 |                 dirty = self.buffer.select_buffer(parent_id, is_child=True)
 73 |             else:
 74 |                 dirty = self.buffer.select_buffer(block_id)
 75 |             if not dirty:
 76 |                 # 缓存命中，直接从缓存中加载数据
 77 |                 common_op.debug_log("[##CACHE] cached and load " + block_id + ";parent is " + str(parent_id), level=NotionDump.DUMP_MODE_DEFAULT)
 78 |                 load = self.__load_from_json(block_id, prefix="retrieve_block_")
 79 |                 if load is not None:
 80 |                     return load
 81 | 
 82 |         common_op.debug_log("[&&CACHE] no cached and load " + block_id + "; parent is " + str(parent_id), level=NotionDump.DUMP_MODE_DEFAULT)
 83 |         self.__friendly_use_api()
 84 |         query_post = {
 85 |             "block_id": block_id,
 86 |             "page_size": page_size
 87 |         }
 88 |         try:
 89 |             query_ret = self.client.blocks.children.list(
 90 |                 **query_post
 91 |             )
 92 | 
 93 |             # 大量数据一次未读完
 94 |             next_cur = query_ret["next_cursor"]
 95 |             while query_ret["has_more"]:
 96 |                 query_post["start_cursor"] = next_cur
 97 |                 common_op.debug_log(query_post, level=NotionDump.DUMP_MODE_DEFAULT)
 98 |                 db_query_ret = self.client.blocks.children.list(
 99 |                     **query_post
100 |                 )
101 |                 next_cur = db_query_ret["next_cursor"]
102 |                 query_ret["results"] += db_query_ret["results"]
103 |                 if next_cur is None:
104 |                     break
105 |             if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER:
106 |                 self.__save_to_json(query_ret, block_id, prefix="retrieve_block_")
107 |             if NotionDump.USE_BUFFER and parent_id is None:
108 |                 # 独立的page 更新页面状态
109 |                 self.buffer.update_buffer(block_id)
110 |             return query_ret
111 |         except APIResponseError as error:
112 |             if error.code == APIErrorCode.ObjectNotFound:
113 |                 common_op.debug_log("Block " + block_id + " Retrieve child is invalid",
114 |                                     level=NotionDump.DUMP_MODE_DEFAULT)
115 |             else:
116 |                 # Other error handling code
117 |                 common_op.debug_log(error)
118 |                 common_op.debug_log("Block " + block_id + " response error", level=NotionDump.DUMP_MODE_DEFAULT)
119 |         except Exception as e:
120 |             common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
121 |             common_op.debug_log("Block " + block_id + " Not found or no authority", level=NotionDump.DUMP_MODE_DEFAULT)
122 |         return None
123 | 
124 |     # 获取到所有的数据库数据(JSon格式)
125 |     def query_database(self, database_id, db_q_filter="{}", db_q_sorts="[]", force_update=False):
126 |         # 添加缓存系统
127 |         if not force_update and NotionDump.USE_BUFFER:
128 |             if not self.buffer.select_buffer(database_id):
129 |                 # 缓存命中，直接从缓存中加载数据
130 |                 common_op.debug_log("[##CACHE] cached and load " + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
131 |                 load = self.__load_from_json(database_id, prefix="query_db_")
132 |                 if load is not None:
133 |                     return load
134 |         common_op.debug_log("[&&CACHE] no cached and load " + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
135 | 
136 |         self.__friendly_use_api()
137 |         # 组合查询条件
138 |         query_post = {"database_id": database_id}
139 |         if db_q_sorts != "[]":
140 |             query_post["sorts"] = db_q_sorts
141 |         if db_q_filter != "{}":
142 |             query_post["filter"] = db_q_sorts
143 |         try:
144 |             query_ret = self.client.databases.query(
145 |                 **query_post
146 |             )
147 | 
148 |             # 大量数据一次未读完
149 |             next_cur = query_ret["next_cursor"]
150 |             while query_ret["has_more"]:
151 |                 query_post["start_cursor"] = next_cur
152 |                 common_op.debug_log(query_post, level=NotionDump.DUMP_MODE_DEFAULT)
153 |                 db_query_ret = self.client.databases.query(
154 |                     **query_post
155 |                 )
156 |                 next_cur = db_query_ret["next_cursor"]
157 |                 query_ret["results"] += db_query_ret["results"]
158 |                 if next_cur is None:
159 |                     break
160 | 
161 |             if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER:
162 |                 self.__save_to_json(query_ret, database_id, prefix="query_db_")
163 |             if NotionDump.USE_BUFFER:
164 |                 # 独立的page 更新页面状态
165 |                 self.buffer.update_buffer(database_id)
166 |             return query_ret
167 |         except APIResponseError as error:
168 |             if error.code == APIErrorCode.ObjectNotFound:
169 |                 common_op.debug_log("Database Query is invalid, id=" + database_id,
170 |                                     level=NotionDump.DUMP_MODE_DEFAULT)
171 |             else:
172 |                 # Other error handling code
173 |                 common_op.debug_log(error)
174 |                 common_op.debug_log("Database Query is invalid, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
175 |         except Exception as e:
176 |             common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
177 |             common_op.debug_log("Database Query Not found or no authority, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
178 |         return None
179 | 
180 |     # 获取数据库信息
181 |     def retrieve_database(self, database_id):
182 |         self.__friendly_use_api()
183 |         try:
184 |             retrieve_ret = self.client.databases.retrieve(database_id=database_id)
185 |             if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER:
186 |                 self.__save_to_json(retrieve_ret, database_id, prefix="retrieve_db_")
187 |             if NotionDump.USE_BUFFER and retrieve_ret is not None and "last_edited_time" in retrieve_ret:
188 |                 self.buffer.add_buffer(database_id, retrieve_ret["last_edited_time"], id_type="database")
189 |             return retrieve_ret
190 |         except APIResponseError as error:
191 |             if error.code == APIErrorCode.ObjectNotFound:
192 |                 common_op.debug_log("Database retrieve is invalid, id=" + database_id,
193 |                                     level=NotionDump.DUMP_MODE_DEFAULT)
194 |             else:
195 |                 # Other error handling code
196 |                 common_op.debug_log(error)
197 |                 common_op.debug_log("Database retrieve is invalid, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
198 |         except Exception as e:
199 |             common_op.debug_log(e)
200 |             common_op.debug_log("Database retrieve Not found or no authority, id=" + database_id,
201 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
202 |         return None
203 | 
204 |     # 获取Page的信息
205 |     def retrieve_page(self, page_id):
206 |         self.__friendly_use_api()
207 |         try:
208 |             retrieve_ret = self.client.pages.retrieve(page_id=page_id)
209 |             if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER:
210 |                 self.__save_to_json(retrieve_ret, page_id, prefix="retrieve_page_")
211 |             if NotionDump.USE_BUFFER and retrieve_ret is not None and "last_edited_time" in retrieve_ret:
212 |                 self.buffer.add_buffer(page_id, retrieve_ret["last_edited_time"])
213 |             return retrieve_ret
214 |         except APIResponseError as error:
215 |             if error.code == APIErrorCode.ObjectNotFound:
216 |                 common_op.debug_log("Page retrieve is invalid(api), id=" + page_id,
217 |                                     level=NotionDump.DUMP_MODE_DEFAULT)
218 |             else:
219 |                 # Other error handling code
220 |                 common_op.debug_log(error)
221 |                 common_op.debug_log("Page retrieve is invalid(other), id=" + page_id,
222 |                                     level=NotionDump.DUMP_MODE_DEFAULT)
223 |         except Exception as e:
224 |             common_op.debug_log(e)
225 |             common_op.debug_log("Page retrieve Not found or no authority, id=" + page_id,
226 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
227 |         return None
228 | 
229 |     def download_to_file(self, download_id, child_page_item):
230 |         # 设置文件链接嵌入时，只有存储在Notion的文件需要下载（不下载会由于时间问题导致链接失效）
231 |         if NotionDump.FILE_WITH_LINK and "secure.notion-static.com" not in child_page_item["link_src"]:
232 |             return ""
233 | 
234 |         # 解析文件后缀名
235 |         file_url = child_page_item["link_src"]
236 |         common_op.debug_log("download url is " + file_url, level=NotionDump.DUMP_MODE_DEBUG)
237 |         if file_url == "":
238 |             return ""
239 |         # 文件名在最后一个/和?之间
240 |         if file_url.find('?') != -1:
241 |             filename = file_url[file_url.rfind('/') + 1:file_url.find('?')]
242 |         else:
243 |             filename = file_url[file_url.rfind('/') + 1:]
244 |         file_suffix = filename[filename.find('.'):]
245 |         # 使用后缀和id生成可识别的文件
246 |         download_name = self.tmp_dir + download_id + file_suffix
247 |         common_op.debug_log("download name " + download_name, level=NotionDump.DUMP_MODE_DEBUG)
248 | 
249 |         if NotionDump.USE_BUFFER:
250 |             # 看文件是否需要重新下载
251 |             if not self.buffer.select_buffer(download_id) and os.path.exists(download_name):
252 |                 return download_name
253 | 
254 |             # 新增记录（注意这里与上面select不属于同一个执行分支）
255 |             self.buffer.add_buffer(download_id, "", id_type="file")
256 | 
257 |         if os.path.exists(download_name):
258 |             common_op.debug_log("[WARN] file " + download_name + " was covered", level=NotionDump.DUMP_MODE_DEFAULT)
259 |         # 下载文件
260 |         self.__friendly_use_api()
261 |         try:
262 |             file_url = quote(file_url, safe='/:?=&%')
263 |             urllib.request.urlretrieve(file_url, download_name)
264 |             if NotionDump.USE_BUFFER:
265 |                 self.buffer.update_buffer(download_id)
266 |             return download_name
267 |         except urllib.error.HTTPError as e:
268 |             common_op.debug_log("download name " + download_name + " get error:HTTPError",
269 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
270 |             common_op.debug_log("download url " + file_url + " get error:HTTPError",
271 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
272 |             common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
273 |         except urllib.error.ContentTooShortError as e:
274 |             common_op.debug_log("download name " + download_name + " get error:ContentTooShortError",
275 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
276 |             common_op.debug_log("download url " + file_url + " get error:ContentTooShortError",
277 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
278 |             common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
279 |         except urllib.error.URLError as e:
280 |             common_op.debug_log("download name " + download_name + " get error:URLError",
281 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
282 |             common_op.debug_log("download url " + file_url + " get error:URLError",
283 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
284 |             common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
285 |         except TimeoutError as e:
286 |             common_op.debug_log("download name " + download_name + " get error:TimeoutError",
287 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
288 |             common_op.debug_log("download url " + file_url + " get error:TimeoutError",
289 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
290 |             common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
291 |         except Exception as e:
292 |             common_op.debug_log("download name " + download_name + " get error:Exception",
293 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
294 |             common_op.debug_log("download url " + file_url + " get error:Exception",
295 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
296 |             common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
297 |         return ""
298 | 
299 |     # 源文件，直接输出成json; 辅助测试使用
300 |     def __save_to_json(self, page_json, json_id, json_name=None, prefix=None):
301 |         if json_name is None:
302 |             if prefix is not None:
303 |                 json_name = self.tmp_dir + prefix + json_id + ".json"
304 |             else:
305 |                 json_name = self.tmp_dir + json_id + ".json"
306 |         common_op.save_json_to_file(page_json, json_name)
307 | 
308 |     def __load_from_json(self, json_id, json_name=None, prefix=None):
309 |         if json_name is None:
310 |             if prefix is not None:
311 |                 json_name = self.tmp_dir + prefix + json_id + ".json"
312 |             else:
313 |                 json_name = self.tmp_dir + json_id + ".json"
314 |         return common_op.load_json_from_file(json_name)
315 | 


--------------------------------------------------------------------------------
/NotionDump/Notion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Notion/__init__.py


--------------------------------------------------------------------------------
/NotionDump/Parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Parser/__init__.py


--------------------------------------------------------------------------------
/NotionDump/Parser/base_parser.py:
--------------------------------------------------------------------------------
   1 | # author: delta1037
   2 | # Date: 2022/01/08
   3 | # mail:geniusrabbit@qq.com
   4 | import copy
   5 | 
   6 | import NotionDump
   7 | from NotionDump.utils import content_format, common_op
   8 | from NotionDump.utils import internal_var
   9 | from urllib.parse import unquote
  10 | from NotionDump.utils.content_format import color_transformer, color_transformer_db, format_date_or_time
  11 | 
  12 | 
  13 | class BaseParser:
  14 |     def __init__(self, base_id, export_child=False):
  15 |         self.base_id = base_id.replace('-', '')
  16 |         self.export_child = export_child
  17 | 
  18 |         # 设置变量存放子page 字典
  19 |         self.child_pages = {}
  20 | 
  21 |     def set_new_id(self, parent_id):
  22 |         self.base_id = parent_id
  23 | 
  24 |     # 获取子页面字典，只返回一次，离台概不负责
  25 |     def get_child_pages_dic(self):
  26 |         child_pages = copy.deepcopy(self.child_pages)
  27 |         self.child_pages.clear()  # 清空已有的内容
  28 |         return child_pages
  29 | 
  30 |     # 文本的格式生成
  31 |     @staticmethod
  32 |     def __annotations_parser(block_handle, str_plain):
  33 |         if str_plain is None or str_plain == "":
  34 |             return ""
  35 |         last_char = str_plain[-1:]
  36 |         if last_char == "\n" or last_char == "\t":
  37 |             str_ret = str_plain[0:-1]
  38 |         else:
  39 |             str_ret = str_plain
  40 |         if block_handle["code"]:
  41 |             str_ret = "`" + str_ret + "`"
  42 |         if block_handle["underline"]:
  43 |             str_ret = "<u>" + str_ret + "</u>"
  44 |         if block_handle["bold"]:
  45 |             str_ret = "**" + str_ret + "**"
  46 |         if block_handle["italic"]:
  47 |             str_ret = "*" + str_ret + "*"
  48 |         if block_handle["color"] != "default":
  49 |             # 添加颜色，区分背景色和前景色
  50 |             if NotionDump.S_THEME_TYPE == "markdown":
  51 |                 # 使用markdown默认的高亮来渲染所有的颜色类型
  52 |                 str_ret = NotionDump.MD_HIGHLIGHT + str_ret + NotionDump.MD_HIGHLIGHT
  53 |             else:
  54 |                 if block_handle["color"].find("_background") != -1:
  55 |                     bg_color = block_handle["color"][0:block_handle["color"].rfind('_')]
  56 |                     str_ret = "<span style=\"background-color:" + color_transformer(bg_color, background=True) + "\">" + str_ret + "</span>"
  57 |                 else:
  58 |                     str_ret = "<font color=\"" + color_transformer(block_handle["color"], background=False) + "\">" + str_ret + "</font>"
  59 |         if block_handle["strikethrough"]:
  60 |             str_ret = "~~" + str_ret + "~~"
  61 |         if last_char == "\n" or last_char == "\t":
  62 |             str_ret += last_char
  63 |         return str_ret
  64 | 
  65 |     def __text_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False):
  66 |         if block_handle["type"] != "text":
  67 |             common_op.debug_log(
  68 |                 "text type error! id=" + self.base_id + " not type " + block_handle["type"],
  69 |                 level=NotionDump.DUMP_MODE_DEFAULT)
  70 |             return ""
  71 | 
  72 |         text_str = ""
  73 |         if "plain_text" in block_handle:
  74 |             text_str = block_handle["plain_text"]
  75 |         if text_str is None:
  76 |             text_str = ""
  77 |         # 如果有链接
  78 |         text_url = block_handle["href"]
  79 |         if text_url is not None and parser_type == NotionDump.PARSER_TYPE_MD and not is_db_title:  # 数据库标题越过链接解析
  80 |             # 文字有链接内容，分为网络链接和本地链接
  81 |             if text_url.startswith("http") or not text_url.startswith("/"):
  82 |                 # 网络链接，直接一步到位
  83 |                 text_str = content_format.get_url_format(text_url, text_str)
  84 |             else:
  85 |                 # Page或者数据库类型，等待重定位
  86 |                 if text_url.find("=") != -1:
  87 |                     id_type = "database"
  88 |                     page_id = text_url[text_url.rfind("/") + 1:text_url.rfind("?")].replace('-', '')
  89 |                 else:
  90 |                     id_type = "page"
  91 |                     page_id = text_url[text_url.rfind("/") + 1:].replace('-', '')
  92 |                 if len(page_id) == NotionDump.ID_LEN:
  93 |                     common_op.debug_log("### page id " + page_id + " is " + id_type)
  94 |                     common_op.add_new_child_page(
  95 |                         self.child_pages,
  96 |                         key_id=page_id + "_" + text_str,
  97 |                         link_id=page_id,
  98 |                         link_src=text_url,
  99 |                         page_type=id_type,
 100 |                         page_name=text_str
 101 |                     )
 102 |                     # 将页面保存，等待进一步递归操作
 103 |                     # 保存子页面信息
 104 |                     common_op.debug_log("child_page_parser add page id = " + page_id + "_" + text_str, level=NotionDump.DUMP_MODE_DEFAULT)
 105 |                     text_str = content_format.get_page_format_md(page_id + "_" + text_str, text_str,
 106 |                                                                  export_child=self.export_child)
 107 |                 else:
 108 |                     text_str = content_format.get_url_format("", text_str)
 109 | 
 110 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 111 |             # 解析annotations部分，为text_str添加格式
 112 |             return self.__annotations_parser(block_handle["annotations"], text_str)
 113 |         else:
 114 |             return text_str
 115 | 
 116 |     def __text_block_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False):
 117 |         paragraph_ret = ""
 118 |         if block_handle["type"] == "text":
 119 |             paragraph_ret = self.__text_parser(block_handle, parser_type)
 120 |         elif block_handle["type"] == "equation":
 121 |             paragraph_ret = self.__equation_inline_parser(block_handle)
 122 |         elif block_handle["type"] == "mention":
 123 |             paragraph_ret = self.__mention_parser(block_handle, parser_type, is_db_title=is_db_title)
 124 |         else:
 125 |             common_op.debug_log(
 126 |                 "text type " + block_handle["type"] + " error! parent_id= " + self.base_id,
 127 |                 level=NotionDump.DUMP_MODE_DEFAULT)
 128 |         return paragraph_ret
 129 | 
 130 |     def __text_list_parser(self, text_list, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db=False, is_db_title=False):
 131 |         plain_text = ""
 132 |         if text_list is not None:
 133 |             for text_block in text_list:
 134 |                 plain_text += self.__text_block_parser(text_block, parser_type, is_db_title=is_db_title)
 135 |         if is_db:
 136 |             # 数据库内容特殊字符校对
 137 |             return plain_text.replace("|", "\\|")
 138 |         else:
 139 |             return plain_text
 140 | 
 141 |     # TODO : people只获取了名字和ID，后续可以做深度解析用户相关内容
 142 |     def __people_parser(self, block_handle):
 143 |         if block_handle["object"] != "user":
 144 |             common_op.debug_log("people type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
 145 |             return ""
 146 |         # 优先获取名字
 147 |         if "name" in block_handle.keys():
 148 |             return block_handle["name"]
 149 |         # 如果无法获取名字则返回id
 150 |         return block_handle["id"].replace('-', '')
 151 | 
 152 |     def __user_parser(self, block_handle):
 153 |         if block_handle["type"] != "user":
 154 |             common_op.debug_log("user type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
 155 |             return ""
 156 |         user_body = block_handle["user"]
 157 |         return self.__people_parser(user_body)
 158 | 
 159 |     def __db_file_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 160 |         if block_handle["type"] != "file":
 161 |             common_op.debug_log("file type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
 162 |             return ""
 163 |         filename = block_handle["name"]
 164 |         file_url = block_handle["file"]["url"]
 165 | 
 166 |         # 解析文件的ID
 167 |         url_prefix = file_url[0:file_url.rfind("/")]
 168 |         file_id = url_prefix[url_prefix.rfind("/") + 1:].replace('-', '')
 169 |         common_op.debug_log("file id is : " + file_id)
 170 | 
 171 |         if filename == "":
 172 |             # 如果文件没有名字使用id作为默认名字
 173 |             filename = file_id
 174 |         common_op.add_new_child_page(
 175 |             self.child_pages,
 176 |             key_id=file_id,
 177 |             link_src=file_url,
 178 |             page_type="file",
 179 |             page_name=filename
 180 |         )
 181 |         common_op.debug_log(
 182 |             "file_parser add page id = " + file_id + " name : " + filename, level=NotionDump.DUMP_MODE_DEFAULT)
 183 |         common_op.debug_log(internal_var.PAGE_DIC)
 184 |         common_op.debug_log("#############")
 185 |         common_op.debug_log(self.child_pages)
 186 | 
 187 |         # 格式处理简单格式（也可以转换成markdown格式[]()）
 188 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 189 |             # file转换成文件链接的形式
 190 |             return content_format.get_file_format_md(filename, file_url, file_id, self.export_child)
 191 |         else:
 192 |             return content_format.get_file_format_plain(filename, file_url)
 193 | 
 194 |     # "$ equation_inline $"
 195 |     def __equation_inline_parser(self, block_handle):
 196 |         if block_handle["type"] != "equation":
 197 |             common_op.debug_log("equation inline type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
 198 |             return ""
 199 |         # 公式删除富文本格式
 200 |         # return content_format.get_equation_inline(
 201 |         #     self.__annotations_parser(block_handle["annotations"], block_handle["plain_text"])
 202 |         # )
 203 |         return content_format.get_equation_inline(block_handle["plain_text"])
 204 | 
 205 |     # "$$ equation_block $$"
 206 |     def __equation_block_parser(self, block_handle):
 207 |         if block_handle["expression"] is None:
 208 |             common_op.debug_log("equation block no expression! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
 209 |             return ""
 210 |         return content_format.get_equation_block(block_handle["expression"])
 211 | 
 212 |     # Attention!!! 关于链接到其它的Page可能需要递归处理
 213 |     def __page_parser(self, block_handle):
 214 |         if block_handle["type"] != "page":
 215 |             common_op.debug_log("page type error! parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
 216 |             return ""
 217 | 
 218 |         page_body = block_handle["page"]
 219 |         return page_body["id"].replace('-', '')
 220 | 
 221 |     # 提及到其它页面，日期，用户
 222 |     def __mention_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False):
 223 |         if block_handle["type"] != "mention":
 224 |             common_op.debug_log("mention type error! parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
 225 |             return ""
 226 | 
 227 |         mention_body = block_handle["mention"]
 228 |         mention_plain = ""
 229 |         if mention_body["type"] == "date":
 230 |             mention_plain = self.date_parser(mention_body)
 231 |         elif mention_body["type"] == "user":
 232 |             mention_plain = self.__user_parser(mention_body)
 233 |         elif mention_body["type"] == "link_preview" and "url" in mention_body["link_preview"].keys():
 234 |             mention_plain = mention_body["link_preview"]["url"]
 235 |         elif mention_body["type"] == "database":
 236 |             database_id = mention_body["database"]["id"].replace('-', '')
 237 |             key_id = database_id + "_mention"
 238 |             common_op.debug_log("__mention_parser add database id = " + database_id)
 239 |             # 获取页面的名字
 240 |             database_name = block_handle["plain_text"]
 241 |             database_link = block_handle["href"]
 242 |             if is_db_title:
 243 |                 mention_plain = database_name
 244 |             else:
 245 |                 common_op.add_new_child_page(
 246 |                     self.child_pages,
 247 |                     key_id=key_id,
 248 |                     link_id=database_id,
 249 |                     link_src=database_link,
 250 |                     page_type="database",
 251 |                     page_name=database_name
 252 |                 )
 253 |                 common_op.debug_log(
 254 |                     "file_parser add page id = " + key_id + " name : " + database_name, level=NotionDump.DUMP_MODE_DEFAULT)
 255 |                 common_op.debug_log(internal_var.PAGE_DIC)
 256 |                 common_op.debug_log("#############")
 257 |                 common_op.debug_log(self.child_pages)
 258 | 
 259 |                 if parser_type == NotionDump.PARSER_TYPE_MD:
 260 |                     mention_plain = content_format.get_page_format_md(key_id, database_name, export_child=self.export_child)
 261 |                 else:
 262 |                     mention_plain = database_name
 263 |         elif mention_body["type"] == "page":
 264 |             page_id = self.__page_parser(mention_body)
 265 |             key_id = page_id + "_mention"
 266 |             common_op.debug_log("__mention_parser add page id = " + page_id)
 267 |             # 获取页面的名字
 268 |             page_name = block_handle["plain_text"]
 269 |             page_link = block_handle["href"]
 270 | 
 271 |             if is_db_title:
 272 |                 mention_plain = page_name
 273 |             else:
 274 |                 # 提及页面按照链接页面处理
 275 |                 common_op.add_new_child_page(
 276 |                     self.child_pages,
 277 |                     key_id=key_id,
 278 |                     link_id=page_id,
 279 |                     link_src=page_link,
 280 |                     page_type="page",
 281 |                     page_name=page_name
 282 |                 )
 283 |                 common_op.debug_log(
 284 |                     "file_parser add page id = " + key_id + " name : " + page_name, level=NotionDump.DUMP_MODE_DEFAULT)
 285 |                 common_op.debug_log(internal_var.PAGE_DIC)
 286 |                 common_op.debug_log("#############")
 287 |                 common_op.debug_log(self.child_pages)
 288 | 
 289 |                 if parser_type == NotionDump.PARSER_TYPE_MD:
 290 |                     mention_plain = content_format.get_page_format_md(key_id, page_name, export_child=self.export_child)
 291 |                 else:
 292 |                     mention_plain = page_name
 293 |         else:
 294 |             common_op.debug_log("unknown mention type " + mention_body["type"], level=NotionDump.DUMP_MODE_DEFAULT)
 295 | 
 296 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 297 |             # 解析annotations部分，为mention_plain添加格式
 298 |             return self.__annotations_parser(block_handle["annotations"],
 299 |                                              content_format.get_mention_format(mention_plain))
 300 |         else:
 301 |             return content_format.get_mention_format(mention_plain)
 302 | 
 303 |     def __table_row_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 304 |         if block_handle["type"] != "table_row":
 305 |             common_op.debug_log("table_row type error! parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
 306 |             return ""
 307 |         table_col_cells = block_handle["table_row"]["cells"]
 308 |         table_row = []
 309 |         for cell in table_col_cells:
 310 |             table_row.append(self.__text_list_parser(cell, parser_type))
 311 |         return table_row
 312 | 
 313 |     # 数据库 title
 314 |     def title_parser(self, block_handle, page_id, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 315 |         if block_handle["type"] != "title":
 316 |             common_op.debug_log("title type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 317 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 318 |             return ""
 319 |         db_page_title = self.__text_list_parser(block_handle["title"], parser_type, is_db=True, is_db_title=True)
 320 |         if page_id == "":
 321 |             # 如果page id是空的，说明只想要内容，不需要重定位
 322 |             return db_page_title
 323 | 
 324 |         if db_page_title != "":
 325 |             # 如果存在子Page就加入到待解析队列
 326 |             common_op.debug_log("title ret = " + db_page_title)
 327 |             if parser_type != NotionDump.PARSER_TYPE_PLAIN:
 328 |                 common_op.debug_log("title_parser add page id = " + page_id, level=NotionDump.DUMP_MODE_DEFAULT)
 329 |             else:
 330 |                 common_op.debug_log("title_parser add page id = " + page_id)
 331 |             # 数据库里的都是子页面
 332 |             common_op.add_new_child_page(self.child_pages, key_id=page_id, page_name=db_page_title)
 333 | 
 334 |             # 如果有子页面就添加一个占位符，之后方便重定位
 335 |             db_page_title = content_format.get_database_title_format(page_id, db_page_title, self.export_child)
 336 |         return db_page_title
 337 | 
 338 |     # 数据库 rich_text
 339 |     def rich_text_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 340 |         if block_handle["type"] != "rich_text":
 341 |             common_op.debug_log("rich_text type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 342 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 343 |             return ""
 344 |         return self.__text_list_parser(block_handle["rich_text"], parser_type, is_db=True)
 345 | 
 346 |     # 数据库 multi_select
 347 |     def multi_select_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 348 |         if block_handle["type"] != "multi_select":
 349 |             common_op.debug_log("multi_select type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 350 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 351 |             return ""
 352 |         multi_select_list = block_handle["multi_select"]
 353 |         ret_str = ""
 354 |         if multi_select_list is None:
 355 |             return ret_str
 356 |         for multi_select in multi_select_list:
 357 |             if ret_str != "":
 358 |                 ret_str += ","  # 多个选项之间用“,”分割
 359 |             if parser_type == NotionDump.PARSER_TYPE_MD:
 360 |                 ret_str += "<span style=\"background-color:" \
 361 |                            + color_transformer_db(multi_select["color"]) \
 362 |                            + "\">&nbsp;" + multi_select["name"] + "&nbsp;</span>"
 363 |             else:
 364 |                 ret_str += multi_select["name"]
 365 |         return ret_str
 366 | 
 367 |     # 数据库 select
 368 |     def select_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 369 |         if block_handle["type"] != "select":
 370 |             common_op.debug_log("select type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 371 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 372 |             return ""
 373 |         select = block_handle["select"]
 374 |         ret_str = ""
 375 |         if select is None:
 376 |             return ret_str
 377 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 378 |             ret_str = "<span style=\"background-color:" \
 379 |                 + color_transformer_db(select["color"]) \
 380 |                 + "\">&nbsp;" + select["name"] + "&nbsp;</span>"
 381 |         else:
 382 |             ret_str = select["name"]
 383 |         return ret_str
 384 | 
 385 |     # 数据库 url
 386 |     def url_parser(self, block_handle):
 387 |         if block_handle["type"] != "url":
 388 |             common_op.debug_log("url type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 389 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 390 |             return ""
 391 |         url = block_handle["url"]
 392 |         if url is None:
 393 |             url = ""
 394 |         return content_format.get_url_format(url)
 395 | 
 396 |     # 数据库 email
 397 |     def email_parser(self, block_handle):
 398 |         if block_handle["type"] != "email":
 399 |             common_op.debug_log("email type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 400 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 401 |             return ""
 402 |         email = block_handle["email"]
 403 |         ret_str = ""
 404 |         if email is not None:
 405 |             ret_str = email
 406 |         return ret_str
 407 | 
 408 |     # 数据库 checkbox
 409 |     def checkbox_parser(self, block_handle):
 410 |         if block_handle["type"] != "checkbox":
 411 |             common_op.debug_log("checkbox type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 412 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 413 |             return ""
 414 |         checkbox = block_handle["checkbox"]
 415 |         if checkbox is True:
 416 |             ret_str = NotionDump.MD_BOOL_TRUE
 417 |         else:
 418 |             ret_str = NotionDump.MD_BOOL_FALSE
 419 |         return ret_str
 420 | 
 421 |     # 数据库 phone_number
 422 |     def phone_number_parser(self, block_handle):
 423 |         if block_handle["type"] != "phone_number":
 424 |             common_op.debug_log("phone_number type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 425 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 426 |             return ""
 427 |         phone_number = block_handle["phone_number"]
 428 |         ret_str = ""
 429 |         if phone_number is not None:
 430 |             ret_str = phone_number
 431 |         return ret_str
 432 | 
 433 |     # 数据库 date
 434 |     def date_parser(self, block_handle):
 435 |         if block_handle["type"] != "date":
 436 |             common_op.debug_log("date type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 437 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 438 |             return ""
 439 |         date = block_handle["date"]
 440 |         if date is None:
 441 |             return ""
 442 |         return content_format.get_date_format(date["start"], date["end"])
 443 | 
 444 |     # 数据库 people
 445 |     def people_parser(self, block_handle):
 446 |         if block_handle["type"] != "people":
 447 |             common_op.debug_log("people type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 448 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 449 |             return ""
 450 |         people_list = block_handle["people"]
 451 |         ret_str = ""
 452 |         if people_list is None:
 453 |             return ret_str
 454 |         for people in people_list:
 455 |             if ret_str != "":
 456 |                 ret_str += ","  # 多个用户之间用“,”分割
 457 |             ret_str += self.__people_parser(people)
 458 |         return ret_str
 459 | 
 460 |     # 数据库 number
 461 |     def number_parser(self, block_handle):
 462 |         if block_handle["type"] != "number":
 463 |             common_op.debug_log("number type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 464 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 465 |             return ""
 466 |         number = block_handle["number"]
 467 |         ret_str = ""
 468 |         if number is None:
 469 |             return ret_str
 470 |         ret_str = number
 471 |         return str(ret_str)
 472 | 
 473 |     # 数据库 files
 474 |     def files_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 475 |         if block_handle["type"] != "files":
 476 |             common_op.debug_log("files type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 477 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 478 |             return ""
 479 |         files_list = block_handle["files"]
 480 |         ret_str = ""
 481 |         if files_list is None:
 482 |             return ret_str
 483 |         for file in files_list:
 484 |             if ret_str != "":
 485 |                 if parser_type == NotionDump.PARSER_TYPE_MD:
 486 |                     ret_str += "<br>"  # 多个文件之间用“<br>”分割
 487 |                 else:
 488 |                     ret_str += ","  # 多个文件之间用“,”分割
 489 |             ret_str += self.__db_file_parser(file, parser_type)
 490 |         return ret_str
 491 | 
 492 |     # 数据库 relation 数据
 493 |     def relation_parser(self, block_handle):
 494 |         if block_handle["type"] != "relation":
 495 |             common_op.debug_log("relation type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 496 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 497 |             return ""
 498 |         relation_list = block_handle["relation"]
 499 |         relation_ret = ""
 500 |         for relation_item in relation_list:
 501 |             relation_id = relation_item["id"].replace("-", "")
 502 |             # 按照软连接处理
 503 |             common_op.add_new_child_page(
 504 |                 self.child_pages,
 505 |                 key_id=relation_id + "_relation",
 506 |                 link_id=relation_id,
 507 |                 page_type="page",
 508 |                 page_name=""
 509 |             )
 510 |             if relation_ret != "":
 511 |                 relation_ret += ","
 512 |             relation_ret += content_format.get_database_title_format(relation_id + "_relation", "", self.export_child)
 513 |         return relation_ret
 514 | 
 515 |     # 数据库 formula 数据
 516 |     def formula_parser(self, block_handle):
 517 |         if block_handle["type"] != "formula":
 518 |             common_op.debug_log("formula type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 519 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 520 |             return ""
 521 |         formula_block = block_handle["formula"]
 522 |         ret_str = ""
 523 |         if formula_block["type"] == "string":
 524 |             ret_str = formula_block["string"]
 525 |         elif formula_block["type"] == "number":
 526 |             ret_str = str(formula_block["number"])
 527 |         elif formula_block["type"] == "boolean":
 528 |             if formula_block["boolean"] is True:
 529 |                 ret_str = NotionDump.MD_BOOL_TRUE
 530 |             else:
 531 |                 ret_str = NotionDump.MD_BOOL_FALSE
 532 |             # ret_str = str(formula_block["boolean"])
 533 |         elif formula_block["type"] == "date":
 534 |             ret_str = self.date_parser(formula_block)
 535 |         else:
 536 |             ret_str = "[unknown_formula_type:" + formula_block["type"] + "]"
 537 |         return ret_str
 538 | 
 539 |     # 数据库 created_time
 540 |     def created_time_parser(self, block_handle):
 541 |         if block_handle["type"] != "created_time":
 542 |             common_op.debug_log("created_time type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 543 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 544 |             return ""
 545 |         return format_date_or_time(block_handle["created_time"])
 546 | 
 547 |     # 数据库 last_edited_time
 548 |     def last_edited_time_parser(self, block_handle):
 549 |         if block_handle["type"] != "last_edited_time":
 550 |             common_op.debug_log(
 551 |                 "last_edited_time type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 552 |                 level=NotionDump.DUMP_MODE_DEFAULT)
 553 |             return ""
 554 |         return format_date_or_time(block_handle["last_edited_time"])
 555 | 
 556 |     def created_by_parser(self, block_handle):
 557 |         if block_handle["type"] != "created_by":
 558 |             common_op.debug_log("created_by type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 559 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 560 |             return ""
 561 |         return self.__people_parser(block_handle["created_by"])
 562 | 
 563 |     # 数据库 last_edited_by
 564 |     def last_edited_by_parser(self, block_handle):
 565 |         if block_handle["type"] != "last_edited_by":
 566 |             common_op.debug_log(
 567 |                 "last_edited_by type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 568 |                 level=NotionDump.DUMP_MODE_DEFAULT)
 569 |             return ""
 570 |         return self.__people_parser(block_handle["last_edited_by"])
 571 | 
 572 |     # Page paragraph
 573 |     #   mention
 574 |     #       date
 575 |     #       user
 576 |     #       page
 577 |     #   text
 578 |     #   equation
 579 |     def paragraph_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 580 |         paragraph_ret = ""
 581 |         if block_handle["type"] != "paragraph":
 582 |             common_op.debug_log("paragraph type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 583 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 584 |             return paragraph_ret
 585 |         return self.__text_list_parser(block_handle["paragraph"]["rich_text"], parser_type)
 586 | 
 587 |     # Page heading_1
 588 |     def heading_1_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 589 |         heading_1_ret = ""
 590 |         if block_handle["type"] != "heading_1":
 591 |             common_op.debug_log("heading_1 type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 592 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 593 |             return heading_1_ret
 594 |         heading_1_ret = self.__text_list_parser(block_handle["heading_1"]["rich_text"], parser_type)
 595 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 596 |             return "# " + heading_1_ret
 597 |         else:
 598 |             return heading_1_ret
 599 | 
 600 |     # Page heading_2
 601 |     def heading_2_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 602 |         heading_2_ret = ""
 603 |         if block_handle["type"] != "heading_2":
 604 |             common_op.debug_log("heading_2 type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 605 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 606 |             return heading_2_ret
 607 |         heading_2_ret = self.__text_list_parser(block_handle["heading_2"]["rich_text"], parser_type)
 608 | 
 609 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 610 |             return "## " + heading_2_ret
 611 |         else:
 612 |             return heading_2_ret
 613 | 
 614 |     # Page heading_3
 615 |     def heading_3_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 616 |         heading_3_ret = ""
 617 |         if block_handle["type"] != "heading_3":
 618 |             common_op.debug_log("heading_3 type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 619 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 620 |             return heading_3_ret
 621 |         heading_3_ret = self.__text_list_parser(block_handle["heading_3"]["rich_text"], parser_type)
 622 | 
 623 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 624 |             return "### " + heading_3_ret
 625 |         else:
 626 |             return heading_3_ret
 627 | 
 628 |     # Page to_do
 629 |     def to_do_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 630 |         to_do_ret = ""
 631 |         if block_handle["type"] != "to_do":
 632 |             common_op.debug_log("to_do type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 633 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 634 |             return to_do_ret
 635 |         to_do_ret = self.__text_list_parser(block_handle["to_do"]["rich_text"], parser_type)
 636 | 
 637 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 638 |             if block_handle["to_do"]["checked"]:
 639 |                 return "- [x] " + to_do_ret
 640 |             else:
 641 |                 return "- [ ] " + to_do_ret
 642 |         else:
 643 |             return to_do_ret
 644 | 
 645 |     # Page bulleted_list_item
 646 |     def bulleted_list_item_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 647 |         bulleted_list_item_ret = ""
 648 |         if block_handle["type"] != "bulleted_list_item":
 649 |             common_op.debug_log(
 650 |                 "bulleted_list_item type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 651 |                 level=NotionDump.DUMP_MODE_DEFAULT)
 652 |             return bulleted_list_item_ret
 653 |         bulleted_list_item_ret = self.__text_list_parser(block_handle["bulleted_list_item"]["rich_text"], parser_type)
 654 | 
 655 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 656 |             return "- " + bulleted_list_item_ret
 657 |         else:
 658 |             return bulleted_list_item_ret
 659 | 
 660 |     # Page numbered_list_item
 661 |     def numbered_list_item_parser(self, block_handle, list_index, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 662 |         numbered_list_item_ret = ""
 663 |         if block_handle["type"] != "numbered_list_item":
 664 |             common_op.debug_log(
 665 |                 "numbered_list_item type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 666 |                 level=NotionDump.DUMP_MODE_DEFAULT)
 667 |             return numbered_list_item_ret
 668 |         numbered_list_item_ret = self.__text_list_parser(block_handle["numbered_list_item"]["rich_text"], parser_type)
 669 | 
 670 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 671 |             return str(list_index) + ". " + numbered_list_item_ret
 672 |         else:
 673 |             return numbered_list_item_ret
 674 | 
 675 |     # Page toggle
 676 |     def toggle_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 677 |         toggle_ret = ""
 678 |         if block_handle["type"] != "toggle":
 679 |             common_op.debug_log("toggle type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 680 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 681 |             return toggle_ret
 682 |         toggle_ret = self.__text_list_parser(block_handle["toggle"]["rich_text"], parser_type)
 683 | 
 684 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 685 |             return "- " + toggle_ret
 686 |         else:
 687 |             return toggle_ret
 688 | 
 689 |     # Page divider
 690 |     def divider_parser(self, block_handle):
 691 |         divider_ret = ""
 692 |         if block_handle["type"] != "divider":
 693 |             common_op.debug_log("divider type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 694 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 695 |             return divider_ret
 696 |         divider_ret = NotionDump.MD_DIVIDER
 697 |         return divider_ret
 698 | 
 699 |     # Page callout
 700 |     def callout_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 701 |         callout_ret = ""
 702 |         if block_handle["type"] != "callout":
 703 |             common_op.debug_log("callout type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 704 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 705 |             return callout_ret
 706 |         callout_ret = self.__text_list_parser(block_handle["callout"]["rich_text"], parser_type)
 707 | 
 708 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 709 |             # 这里是否每一行都操作
 710 |             return "> " + callout_ret
 711 |         else:
 712 |             return callout_ret
 713 | 
 714 |     # Page code
 715 |     def code_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 716 |         code_ret = ""
 717 |         if block_handle["type"] != "code":
 718 |             common_op.debug_log("code type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 719 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 720 |             return code_ret
 721 |         code_ret = self.__text_list_parser(block_handle["code"]["rich_text"], parser_type)
 722 | 
 723 |         code_type = block_handle["code"]["language"]
 724 |         if code_type is None:
 725 |             code_type = ""
 726 | 
 727 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 728 |             # 这里是否每一行都操作
 729 |             return "```" + code_type + "\n" + code_ret + "\n```"
 730 |         else:
 731 |             return code_ret
 732 | 
 733 |     # Page quote
 734 |     def quote_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 735 |         quote_ret = ""
 736 |         if block_handle["type"] != "quote":
 737 |             common_op.debug_log("quote type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 738 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 739 |             return quote_ret
 740 |         quote_ret = self.__text_list_parser(block_handle["quote"]["rich_text"], parser_type)
 741 |         # 最外层颜色
 742 |         if block_handle["quote"]["color"] != "default":
 743 |             # 添加颜色，区分背景色和前景色
 744 |             if NotionDump.S_THEME_TYPE == "markdown":
 745 |                 # 使用markdown默认的高亮来渲染所有的颜色类型
 746 |                 quote_ret = NotionDump.MD_HIGHLIGHT + quote_ret + NotionDump.MD_HIGHLIGHT
 747 |             else:
 748 |                 if block_handle["quote"]["color"].find("_background") != -1:
 749 |                     bg_color = block_handle["quote"]["color"][0:block_handle["quote"]["color"].rfind('_')]
 750 |                     quote_ret = "<span style=\"background-color:" + color_transformer(bg_color,
 751 |                                                                                       background=True) + "\">" + quote_ret + "</span>"
 752 |                 else:
 753 |                     quote_ret = "<font color=\"" + color_transformer(block_handle["quote"]["color"],
 754 |                                                                      background=False) + "\">" + quote_ret + "</font>"
 755 | 
 756 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 757 |             # 这里是否每一行都操作
 758 |             return "> " + quote_ret
 759 |         else:
 760 |             return quote_ret
 761 | 
 762 |     # Page equation
 763 |     def equation_parser(self, block_handle):
 764 |         equation_ret = ""
 765 |         if block_handle["type"] != "equation":
 766 |             common_op.debug_log(" type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 767 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 768 |             return equation_ret
 769 |         return self.__equation_block_parser(block_handle["equation"])
 770 | 
 771 |     # Page table_row
 772 |     def table_row_parser(self, block_handle, first_row=False, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 773 |         table_row_ret = ""
 774 |         if block_handle["type"] != "table_row":
 775 |             common_op.debug_log("table_row type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 776 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 777 |             return table_row_ret
 778 | 
 779 |         table_row_list = self.__table_row_parser(block_handle, parser_type)
 780 |         table_row_ret = "|"
 781 |         for it in table_row_list:
 782 |             table_row_ret += it.replace('\n', '<br>') + "|"
 783 |         if first_row:
 784 |             table_row_ret += "\n|"
 785 |             for i in range(len(table_row_list)):
 786 |                 table_row_ret += " --- " + "|"
 787 | 
 788 |         return table_row_ret
 789 | 
 790 |     def child_page_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 791 |         child_page_ret = ""
 792 |         if block_handle["type"] != "child_page":
 793 |             common_op.debug_log("child_page type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 794 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 795 |             return child_page_ret
 796 | 
 797 |         page_body = block_handle["child_page"]
 798 |         if page_body["title"] == "":
 799 |             if parser_type == NotionDump.PARSER_TYPE_MD:
 800 |                 return content_format.get_page_format_md("NULL Page", "NULL Page", export_child=self.export_child)
 801 |             else:
 802 |                 return content_format.get_page_format_plain("NULL Page")
 803 |         else:
 804 |             page_id = (block_handle["id"]).replace('-', '')
 805 | 
 806 |             # 保存子页面信息
 807 |             common_op.debug_log("child_page_parser add page id = " + page_id, level=NotionDump.DUMP_MODE_DEFAULT)
 808 |             common_op.add_new_child_page(self.child_pages, key_id=page_id, page_name=page_body["title"])
 809 | 
 810 |             if parser_type == NotionDump.PARSER_TYPE_MD:
 811 |                 return content_format.get_page_format_md(page_id, page_body["title"], export_child=self.export_child)
 812 |             else:
 813 |                 return content_format.get_page_format_plain(page_body["title"])
 814 | 
 815 |     # Page child_database
 816 |     def child_database_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 817 |         if block_handle["type"] != "child_database":
 818 |             common_op.debug_log("child_database type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 819 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 820 |             return ""
 821 | 
 822 |         # 子数据库保存在页面表中，不解析
 823 |         child_db_id = block_handle["id"].replace('-', '')
 824 |         common_op.add_new_child_page(
 825 |             self.child_pages,
 826 |             key_id=child_db_id,
 827 |             page_type="database",
 828 |             page_name=block_handle["child_database"]["title"]
 829 |         )
 830 |         common_op.debug_log(
 831 |             "child_database_parser add page id = " + child_db_id + "name : " + block_handle["child_database"]["title"], level=NotionDump.DUMP_MODE_DEFAULT)
 832 |         common_op.debug_log(internal_var.PAGE_DIC)
 833 |         common_op.debug_log("#############")
 834 |         common_op.debug_log(self.child_pages)
 835 | 
 836 |         # 子数据库要返回一个链接占位符，供后续解析使用
 837 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 838 |             return content_format.get_page_format_md(
 839 |                 child_db_id,
 840 |                 block_handle["child_database"]["title"],
 841 |                 export_child=self.export_child
 842 |             )
 843 |         else:
 844 |             return content_format.get_page_format_plain(block_handle["child_database"]["title"])
 845 | 
 846 |     # Page image
 847 |     def image_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 848 |         if block_handle["type"] != "image":
 849 |             common_op.debug_log("image type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 850 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 851 |             return ""
 852 | 
 853 |         # 子数据库保存在页面表中，不解析
 854 |         image_id = block_handle["id"].replace('-', '')
 855 |         image_name = self.__text_list_parser(block_handle["image"]["caption"], parser_type)
 856 |         image_url = ""
 857 |         image_type = block_handle["image"]["type"]
 858 |         if image_type in block_handle["image"].keys():
 859 |             if "url" in block_handle["image"][image_type].keys():
 860 |                 image_url = block_handle["image"][image_type]["url"]
 861 |         if image_url == "":
 862 |             common_op.debug_log("unknown image type" + block_handle["image"]["type"],
 863 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 864 |         if image_name == "":
 865 |             # 如果文件没有名字使用id作为默认名字
 866 |             image_name = image_id
 867 |         common_op.add_new_child_page(
 868 |             self.child_pages,
 869 |             key_id=image_id,
 870 |             link_src=image_url,
 871 |             page_type="image",
 872 |             page_name=image_name
 873 |         )
 874 | 
 875 |         common_op.debug_log(
 876 |             "image_parser add page id = " + image_id + "name : " + image_name, level=NotionDump.DUMP_MODE_DEFAULT)
 877 |         common_op.debug_log(internal_var.PAGE_DIC)
 878 |         common_op.debug_log("#############")
 879 |         common_op.debug_log(self.child_pages)
 880 | 
 881 |         # 图片类型要返回一个链接占位符，供后续解析使用
 882 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 883 |             return content_format.get_page_format_md(
 884 |                 image_id,
 885 |                 image_name,
 886 |                 export_child=self.export_child
 887 |             )
 888 |         else:
 889 |             return content_format.get_page_format_plain(image_name)
 890 | 
 891 |     # Page file(file,pdf,video)
 892 |     def file_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 893 |         if block_handle["type"] != "file" and block_handle["type"] != "pdf" and block_handle["type"] != "video":
 894 |             common_op.debug_log("file type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 895 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 896 |             return ""
 897 | 
 898 |         block_type = block_handle["type"]
 899 |         file_id = block_handle["id"].replace('-', '')
 900 |         file_name = self.__text_list_parser(block_handle[block_type]["caption"], parser_type)
 901 |         file_url = ""
 902 |         file_type = block_handle[block_type]["type"]
 903 |         if file_type in block_handle[block_type].keys():
 904 |             if "url" in block_handle[block_type][file_type].keys():
 905 |                 file_url = block_handle[block_type][file_type]["url"]
 906 |         if file_url == "":
 907 |             common_op.debug_log("unknown block type" + block_handle[block_type]["type"] + " with null url",
 908 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 909 |             return ""
 910 |         # 如果caption中没有文件名，尝试从url中分离
 911 |         if file_name == "":
 912 |             file_url_basic = file_url[0:file_url.rfind('?')]
 913 |             file_name = file_url_basic[file_url_basic.rfind('/')+1:]
 914 |             # url中分离的内容需要转码
 915 |             file_name = unquote(file_name, 'utf-8')
 916 |         if file_name == "":
 917 |             # 如果文件没有名字使用file作为默认名字
 918 |             file_name = "FILE"
 919 |         common_op.add_new_child_page(
 920 |             self.child_pages,
 921 |             key_id=file_id,
 922 |             link_src=file_url,
 923 |             page_type="file",
 924 |             page_name=file_name
 925 |         )
 926 | 
 927 |         common_op.debug_log(
 928 |             "file_parser add page id = " + file_id + " name : " + file_name, level=NotionDump.DUMP_MODE_DEFAULT)
 929 |         common_op.debug_log(internal_var.PAGE_DIC)
 930 |         common_op.debug_log("#############")
 931 |         common_op.debug_log(self.child_pages)
 932 | 
 933 |         # 文件类型要返回一个链接占位符，供后续解析使用
 934 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 935 |             return content_format.get_page_format_md(
 936 |                 file_id,
 937 |                 file_name,
 938 |                 export_child=self.export_child
 939 |             )
 940 |         else:
 941 |             return content_format.get_page_format_plain(file_name)
 942 | 
 943 |     # Page bookmark
 944 |     def bookmark_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 945 |         bookmark_ret = ""
 946 |         if block_handle["type"] != "bookmark":
 947 |             common_op.debug_log("bookmark type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 948 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 949 |             return bookmark_ret
 950 |         bookmark_name = self.__text_list_parser(block_handle["bookmark"]["caption"], parser_type)
 951 |         if bookmark_name == "":
 952 |             bookmark_name = "BOOKMARK"
 953 |         bookmark_url = block_handle["bookmark"]["url"]
 954 | 
 955 |         # bookmark 类型要返回一个链接占位符，供后续解析使用
 956 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 957 |             # file转换成文件链接的形式
 958 |             return content_format.get_file_format_md(bookmark_name, bookmark_url)
 959 |         else:
 960 |             return content_format.get_file_format_plain(bookmark_name, bookmark_url)
 961 | 
 962 |     # Page embed
 963 |     def embed_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 964 |         embed_ret = ""
 965 |         if block_handle["type"] != "embed":
 966 |             common_op.debug_log("embed type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 967 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 968 |             return embed_ret
 969 |         embed_name = self.__text_list_parser(block_handle["embed"]["caption"], parser_type)
 970 |         if embed_name == "":
 971 |             embed_name = "EMBED"
 972 |         embed_url = block_handle["embed"]["url"]
 973 | 
 974 |         # bookmark 类型要返回一个链接占位符，供后续解析使用
 975 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 976 |             # file转换成文件链接的形式
 977 |             return content_format.get_file_format_md(embed_name, embed_url)
 978 |         else:
 979 |             return content_format.get_file_format_plain(embed_name, embed_url)
 980 | 
 981 |     # Page link_preview
 982 |     def link_preview_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
 983 |         link_preview_ret = ""
 984 |         if block_handle["type"] != "link_preview":
 985 |             common_op.debug_log("link_preview type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
 986 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
 987 |             return link_preview_ret
 988 |         link_preview_name = "LINK_PREVIEW"
 989 |         link_preview_url = block_handle["link_preview"]["url"]
 990 | 
 991 |         # bookmark 类型要返回一个链接占位符，供后续解析使用
 992 |         if parser_type == NotionDump.PARSER_TYPE_MD:
 993 |             # file转换成文件链接的形式
 994 |             return content_format.get_file_format_md(link_preview_name, link_preview_url)
 995 |         else:
 996 |             return content_format.get_file_format_plain(link_preview_name, link_preview_url)
 997 | 
 998 |     # Page link_to_page
 999 |     def link_to_page_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
1000 |         link_to_page_ret = ""
1001 |         if block_handle["type"] != "link_to_page":
1002 |             common_op.debug_log("link_to_page type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
1003 |                                 level=NotionDump.DUMP_MODE_DEFAULT)
1004 |             return link_to_page_ret
1005 | 
1006 |         link_page = block_handle["link_to_page"]
1007 |         if link_page["type"] == "page_id":
1008 |             page_id = link_page["page_id"].replace('-', '')
1009 |             page_name = ""
1010 |             key_id = page_id + "_link_page"
1011 |             common_op.add_new_child_page(
1012 |                 self.child_pages,
1013 |                 key_id=key_id,
1014 |                 link_id=page_id,
1015 |                 page_type="page",
1016 |                 page_name=page_name
1017 |             )
1018 |             common_op.debug_log(
1019 |                 "link_to_page_parser add link_page key_id = " + key_id, level=NotionDump.DUMP_MODE_DEFAULT)
1020 |             common_op.debug_log(internal_var.PAGE_DIC)
1021 |             common_op.debug_log("#############")
1022 |             common_op.debug_log(self.child_pages)
1023 |             return content_format.get_page_format_md(
1024 |                 key_id,
1025 |                 page_name,
1026 |                 export_child=self.export_child
1027 |             )
1028 |         else:
1029 |             common_op.debug_log("unknown type " + link_page["type"], level=NotionDump.DUMP_MODE_DEFAULT)
1030 |         return link_to_page_ret
1031 | 


--------------------------------------------------------------------------------
/NotionDump/Parser/block_parser.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/08
  3 | # mail:geniusrabbit@qq.com
  4 | 
  5 | import os
  6 | 
  7 | import NotionDump
  8 | from NotionDump.Notion.Notion import NotionQuery
  9 | from NotionDump.Parser.base_parser import BaseParser
 10 | from NotionDump.utils import common_op
 11 | 
 12 | 
 13 | # Block内容解析
 14 | class BlockParser:
 15 |     # 初始化
 16 |     def __init__(
 17 |             self,
 18 |             block_id,
 19 |             query_handle: NotionQuery,
 20 |             parser_type=NotionDump.PARSER_TYPE_MD,
 21 |             export_child_pages=False
 22 |     ):
 23 |         self.block_id = block_id.replace('-', '')
 24 |         self.query_handle = query_handle
 25 |         self.parser_type = parser_type
 26 |         # 是否导出子页面,也就是递归操作
 27 |         self.export_child_page = export_child_pages
 28 | 
 29 |         # 创建临时文件夹
 30 |         self.tmp_dir = NotionDump.TMP_DIR
 31 |         if not os.path.exists(self.tmp_dir):
 32 |             os.mkdir(self.tmp_dir)
 33 | 
 34 |         # 基解析器
 35 |         self.base_parser = BaseParser(
 36 |             base_id=self.block_id,
 37 |             export_child=self.export_child_page
 38 |         )
 39 | 
 40 |     # 获取子页面字典
 41 |     def get_child_pages_dic(self):
 42 |         return self.base_parser.get_child_pages_dic()
 43 | 
 44 |     def __get_children_block_list(self, block):
 45 |         # 如果没有子页面，直接返回空
 46 |         if not block["has_children"]:
 47 |             return None
 48 | 
 49 |         if block["type"] == 'child_page':
 50 |             return None
 51 | 
 52 |         # 递归黑名单
 53 |         if block["type"] == "template":
 54 |             common_op.debug_log("type " + block["type"] + " has no child, ignore", level=NotionDump.DUMP_MODE_DEFAULT)
 55 |             return None
 56 | 
 57 |         # 指定类型才递归(白名单)
 58 |         if block["type"] != "to_do" \
 59 |                 and block["type"] != "numbered_list_item" \
 60 |                 and block["type"] != "bulleted_list_item" \
 61 |                 and block["type"] != "toggle" \
 62 |                 and block["type"] != "table" \
 63 |                 and block["type"] != "table_row"\
 64 |                 and block["type"] != "column_list" \
 65 |                 and block["type"] != "column" \
 66 |                 and block["type"] != "synced_block" \
 67 |                 and block["type"] != "heading_1" \
 68 |                 and block["type"] != "heading_2" \
 69 |                 and block["type"] != "heading_3" \
 70 |                 and block["type"] != "paragraph" \
 71 |                 and block["type"] != "quote" \
 72 |                 and block["type"] != "callout":
 73 |             common_op.debug_log("[ISSUE] type " + block["type"] + " has no child", level=NotionDump.DUMP_MODE_DEFAULT)
 74 |             return None
 75 | 
 76 |         # 获取块id下面的内容并继续解析
 77 |         if block["type"] == "synced_block" and block["synced_block"]["synced_from"] is not None:
 78 |             child_block_id = block["synced_block"]["synced_from"]["block_id"]
 79 |             common_op.debug_log("type synced_block " + child_block_id + " get child", level=NotionDump.DUMP_MODE_DEFAULT)
 80 |         else:
 81 |             child_block_id = block["id"]
 82 | 
 83 |         block_list = []
 84 |         retrieve_ret = self.query_handle.retrieve_block_children(child_block_id, parent_id=self.block_id)
 85 |         if retrieve_ret is not None:
 86 |             block_list = retrieve_ret["results"]
 87 | 
 88 |         # 如果没有获取到块，也返回空
 89 |         if len(block_list) == 0:
 90 |             return None
 91 |         # 返回获取到的块列表
 92 |         common_op.debug_log("## retrieve block " + child_block_id, level=NotionDump.DUMP_MODE_DEFAULT)
 93 |         return block_list
 94 | 
 95 |     def parser_block(self, block, list_index, last_line_is_table, prefix):
 96 |         block_type = block["type"]
 97 |         block_text = ""
 98 |         if block_type == "paragraph":
 99 |             # paragraph
100 |             block_text = self.base_parser.paragraph_parser(block, self.parser_type)
101 |         elif block_type == "heading_1":
102 |             # heading_1
103 |             block_text = self.base_parser.heading_1_parser(block, self.parser_type)
104 |         elif block_type == "heading_2":
105 |             # heading_2
106 |             block_text = self.base_parser.heading_2_parser(block, self.parser_type)
107 |         elif block_type == "heading_3":
108 |             # heading_3
109 |             block_text = self.base_parser.heading_3_parser(block, self.parser_type)
110 |         elif block_type == "to_do":
111 |             # to_do
112 |             block_text = self.base_parser.to_do_parser(block, self.parser_type)
113 |         elif block_type == "bulleted_list_item":
114 |             # bulleted_list_item
115 |             block_text = self.base_parser.bulleted_list_item_parser(block, self.parser_type)
116 |         elif block_type == "numbered_list_item":
117 |             # numbered_list_item
118 |             block_text = self.base_parser.numbered_list_item_parser(block, list_index, self.parser_type)
119 |         elif block_type == "toggle":
120 |             # toggle
121 |             block_text = self.base_parser.toggle_parser(block, self.parser_type)
122 |         elif block_type == "divider":
123 |             # divider
124 |             block_text = self.base_parser.divider_parser(block)
125 |         elif block_type == "callout":
126 |             # callout
127 |             block_text = self.base_parser.callout_parser(block, self.parser_type)
128 |             # callout内换行使用HTML符号
129 |             block_text = block_text.replace('\n', '<br>')
130 |         elif block_type == "code":
131 |             # code
132 |             code_text = self.base_parser.code_parser(block, self.parser_type)
133 |             block_text = code_text.replace('\n', '\n'+prefix)
134 |         elif block_type == "quote":
135 |             # quote
136 |             block_text = self.base_parser.quote_parser(block, self.parser_type)
137 |             block_text = block_text.replace('\n', '<br>')
138 |         elif block_type == "equation":
139 |             # Page equation
140 |             block_text = self.base_parser.equation_parser(block)
141 |         elif block_type == "table":
142 |             # table直接递归即可
143 |             pass
144 |         elif block_type == "table_row":
145 |             # Page table_row
146 |             block_text = self.base_parser.table_row_parser(
147 |                 block,
148 |                 first_row=last_line_is_table,
149 |                 parser_type=self.parser_type
150 |             )
151 |         elif block_type == "child_page":
152 |             # Page child_page 子页面只返回链接，不返回内容
153 |             block_text = self.base_parser.child_page_parser(block, self.parser_type)
154 |         elif block_type == "child_database":
155 |             # Page child_database
156 |             # Page中嵌套数据库的类型，只保存页面，不进行解析
157 |             block_text = self.base_parser.child_database_parser(block, self.parser_type)
158 |         elif block_type == "image":
159 |             # Page image
160 |             block_text = self.base_parser.image_parser(block, self.parser_type)
161 |         elif block_type == "file" or block_type == "pdf" or block_type == "video":
162 |             # Page file
163 |             block_text = self.base_parser.file_parser(block, self.parser_type)
164 |         elif block_type == "bookmark":
165 |             # Page bookmark
166 |             block_text = self.base_parser.bookmark_parser(block, self.parser_type)
167 |         elif block_type == "embed":
168 |             # Page embed
169 |             block_text = self.base_parser.embed_parser(block, self.parser_type)
170 |         elif block_type == "link_preview":
171 |             # Page bookmark
172 |             block_text = self.base_parser.link_preview_parser(block, self.parser_type)
173 |         elif block_type == "link_to_page":
174 |             # Page link_to_page
175 |             block_text = self.base_parser.link_to_page_parser(block, self.parser_type)
176 |         elif block_type == "table_of_contents":
177 |             block_text = '[TOC]'
178 |         elif block_type == "template":
179 |             # 模板内容不解析
180 |             block_text = '[TEMPLATE]'
181 |         elif block_type == "breadcrumb":
182 |             # 路径信息不解析（notion也不会返回）
183 |             block_text = "[breadcrumb]"
184 |         else:
185 |             common_op.debug_log("[ISSUE] unknown page block properties type:" + block_type, level=NotionDump.DUMP_MODE_DEFAULT)
186 |             block_text = "[unknown_type:" + block_type + "]"
187 |         if block_text is None:
188 |             block_text = ""
189 |         return block_text
190 | 
191 |     def parser_block_list(self, block_list, indent=0, line_div="\n", last_block_type="none"):
192 |         prefix = ""
193 |         p_index = 0
194 |         # line_div 为br时，是内部换行，\n时是大块换行
195 |         while p_index < indent and line_div == "\n":
196 |             prefix += "\t"  # 前缀是一个TAB
197 |             p_index += 1
198 | 
199 |         # 如果有内容先加个换行再说
200 |         block_text = ""
201 |         if indent != 0 and line_div == "\n":
202 |             block_text = line_div
203 | 
204 |         last_type = "to_do"  # 初始化不换行
205 |         list_index = 1
206 | 
207 |         # 记录解析到的表格的状态，表格会一次性解析完，所以这里不需要重新设置
208 |         last_line_is_table = True
209 | 
210 |         for block in block_list:
211 |             # 遍历block，解析内容，填充到md文件中
212 |             block_type = block["type"]
213 | 
214 |             # 在外面解析列类型
215 |             if block_type == "column_list":
216 |                 # 列类型的分解
217 |                 column_list = self.__get_children_block_list(block)
218 |                 if block_text == "\n":
219 |                     # 如果只有一个换行符，重置内容
220 |                     block_text = ""
221 |                 if column_list is not None:
222 |                     for column in column_list:
223 |                         column_rows = self.__get_children_block_list(column)
224 |                         if column_rows is not None:
225 |                             if block_text != "":
226 |                                 # 与前边得隔离开
227 |                                 block_text += "\n"
228 |                             block_text += self.parser_block_list(column_rows, indent)
229 |             elif block_type == "synced_block":
230 |                 # 同步块解析其中的内容
231 |                 synced_block_list = self.__get_children_block_list(block)
232 |                 if block_text == "\n":
233 |                     # 如果只有一个换行符，重置内容
234 |                     block_text = ""
235 |                 if synced_block_list is not None:
236 |                     block_text += self.parser_block_list(synced_block_list, indent, last_block_type="synced_block")
237 |             else:
238 |                 # 如果是连续的类型，就不需要额外加换行符
239 |                 if common_op.parser_newline(last_type, block_type) and block_text != "" and block_text != "\n":
240 |                     block_text += line_div
241 | 
242 |                 # 记录数字列表的标识
243 |                 if last_type == "numbered_list_item":
244 |                     list_index = list_index + 1
245 |                 else:
246 |                     list_index = 1
247 |                 last_type = block_type
248 |                 if block_type != "table" and block_type != "table_row":
249 |                     block_text += prefix
250 | 
251 |                 block_text += self.parser_block(
252 |                     block=block,
253 |                     list_index=list_index,
254 |                     last_line_is_table=last_line_is_table,
255 |                     prefix=prefix
256 |                 )
257 | 
258 |                 # 看改块下面有没有子块，如果有就继续解析
259 |                 children_block_list = self.__get_children_block_list(block)
260 |                 t_line_div = "\n"
261 |                 if block_type == "quote" or block_type == "callout":
262 |                     t_line_div = "<br>"
263 |                 if children_block_list is not None:
264 |                     if block_type == "heading_1" \
265 |                             or block_type == "heading_2" \
266 |                             or block_type == "heading_3" \
267 |                             or block_type == "paragraph" \
268 |                             or block_type == "quote" \
269 |                             or block_type == "callout":
270 |                         # 不需要加大indent值
271 |                         # if block_type != "quote" and block_type != "callout":
272 |                         #     # 处理quote和callout内部的换行问题
273 |                         block_text += t_line_div
274 |                         block_text += self.parser_block_list(children_block_list, indent, line_div=t_line_div)
275 |                     else:
276 |                         block_text += self.parser_block_list(children_block_list, indent + 1)
277 |                 else:
278 |                     block_text += "\n"
279 | 
280 |             if block_type == "table_row":
281 |                 # 第一行设置首行标志
282 |                 last_line_is_table = False
283 | 
284 |         return block_text
285 | 
286 |     def block_to_md(self, block_handle, page_detail=None, new_id=None):
287 |         block_list = block_handle["results"]
288 |         # 空内容不生成文件
289 |         if len(block_list) == 0 and (page_detail is None or page_detail == ""):
290 |             return ""
291 | 
292 |         # 创建Markdown文件
293 |         if new_id is not None:
294 |             self.block_id = new_id.replace('-', '')
295 |             self.base_parser.set_new_id(self.block_id)
296 |         tmp_md_filename = self.tmp_dir + self.block_id + ".md"
297 |         file = open(tmp_md_filename, "w", encoding="utf-8", newline='')
298 | 
299 |         # 如果存在属性就拼接上去
300 |         block_text = ""
301 |         if page_detail is not None and page_detail != "":
302 |             block_text = page_detail + "\n" + NotionDump.MD_DIVIDER + "\n"
303 | 
304 |         # 解析block_list
305 |         block_text += self.parser_block_list(block_list)
306 | 
307 |         # 将解析内容写入文件
308 |         file.write(block_text)
309 |         file.flush()
310 |         file.close()
311 | 
312 |         common_op.debug_log("write file " + tmp_md_filename, level=NotionDump.DUMP_MODE_DEFAULT)
313 |         # 将临时文件地址转出去，由外面进行进一步的操作
314 |         return tmp_md_filename
315 | 
316 |     # 源文件，直接输出成json; 辅助测试使用
317 |     def block_to_json(self, block_json, json_name=None):
318 |         if block_json is None:
319 |             return None
320 | 
321 |         if json_name is None:
322 |             json_name = self.tmp_dir + self.block_id + ".json"
323 |         common_op.save_json_to_file(block_json, json_name)
324 | 


--------------------------------------------------------------------------------
/NotionDump/Parser/database_parser.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/08
  3 | # mail:geniusrabbit@qq.com
  4 | 
  5 | import csv
  6 | import os
  7 | 
  8 | import NotionDump
  9 | from NotionDump.Parser.base_parser import BaseParser
 10 | from NotionDump.utils import common_op
 11 | 
 12 | 
 13 | class DatabaseParser:
 14 |     def __init__(
 15 |             self,
 16 |             database_id,
 17 |             parser_type=NotionDump.PARSER_TYPE_PLAIN,
 18 |             export_child_pages=False
 19 |     ):
 20 |         self.database_id = database_id.replace('-', '')
 21 |         self.parser_type = parser_type
 22 |         # 是否导出子页面,也就是递归操作
 23 |         self.export_child_page = export_child_pages
 24 | 
 25 |         self.tmp_dir = NotionDump.TMP_DIR
 26 |         if not os.path.exists(self.tmp_dir):
 27 |             os.mkdir(self.tmp_dir)
 28 | 
 29 |         # 块解析器
 30 |         self.base_parser = BaseParser(
 31 |             base_id=self.database_id,
 32 |             export_child=self.export_child_page
 33 |         )
 34 | 
 35 |     # 从一个页面里把列名给解析出来
 36 |     def __get_col_name_list(self, one_page):
 37 |         col_name_list = []
 38 |         title_name = ""
 39 |         for item in one_page["properties"]:
 40 |             if one_page["properties"][item]["type"] == "title":
 41 |                 title_name = item
 42 |             else:
 43 |                 col_name_list.append(item)
 44 |         if title_name == "":
 45 |             common_op.debug_log("col name no title error! id=" + self.database_id, level=NotionDump.DUMP_MODE_DEFAULT)
 46 |             return ""
 47 |         col_name_list.append(title_name)  # 把title_name放在最后一个，逆序之后就是第一个
 48 |         # 根据现有的数据库看来这里需要逆序一下才和实际的数据库一致
 49 |         col_name_list.reverse()
 50 |         return col_name_list
 51 | 
 52 |     def get_child_pages_dic(self):
 53 |         return self.base_parser.get_child_pages_dic()
 54 | 
 55 |     # 解析一列中的一项
 56 |     def __parser_item(self, item_block, page_id):
 57 |         item_ret = ""
 58 |         if item_block["type"] == "title":  # title
 59 |             item_ret = self.base_parser.title_parser(item_block, page_id, parser_type=self.parser_type)
 60 |         elif item_block["type"] == "multi_select":  # multi_select
 61 |             item_ret = self.base_parser.multi_select_parser(item_block, parser_type=self.parser_type)
 62 |         elif item_block["type"] == "select":
 63 |             item_ret = self.base_parser.select_parser(item_block, parser_type=self.parser_type)
 64 |         elif item_block["type"] == "rich_text":
 65 |             item_ret = self.base_parser.rich_text_parser(item_block, parser_type=self.parser_type)
 66 |         elif item_block["type"] == "url":
 67 |             item_ret = self.base_parser.url_parser(item_block)
 68 |         elif item_block["type"] == "email":
 69 |             item_ret = self.base_parser.email_parser(item_block)
 70 |         elif item_block["type"] == "checkbox":
 71 |             item_ret = self.base_parser.checkbox_parser(item_block)
 72 |         elif item_block["type"] == "phone_number":
 73 |             item_ret = self.base_parser.phone_number_parser(item_block)
 74 |         elif item_block["type"] == "date":
 75 |             item_ret = self.base_parser.date_parser(item_block)
 76 |         elif item_block["type"] == "people":
 77 |             item_ret = self.base_parser.people_parser(item_block)
 78 |         elif item_block["type"] == "number":
 79 |             item_ret = self.base_parser.number_parser(item_block)
 80 |         elif item_block["type"] == "files":
 81 |             item_ret = self.base_parser.files_parser(item_block, parser_type=self.parser_type)
 82 |         elif item_block["type"] == "relation":
 83 |             item_ret = self.base_parser.relation_parser(item_block)
 84 |         elif item_block["type"] == "rollup":
 85 |             # rollup类型单独解析
 86 |             rollup_block = item_block["rollup"]
 87 |             if "array" in rollup_block:
 88 |                 # 列表的解析
 89 |                 for rollup_item in rollup_block["array"]:
 90 |                     if item_ret != "":
 91 |                         item_ret += ","
 92 |                     item_ret += self.__parser_item(rollup_item, "")
 93 |             else:
 94 |                 # 单个内容的解析
 95 |                 item_ret += self.__parser_item(rollup_block, "")
 96 |         elif item_block["type"] == "formula":
 97 |             item_ret = self.base_parser.formula_parser(item_block)
 98 |         elif item_block["type"] == "created_time":
 99 |             item_ret = self.base_parser.created_time_parser(item_block)
100 |         elif item_block["type"] == "last_edited_time":
101 |             item_ret = self.base_parser.last_edited_time_parser(item_block)
102 |         elif item_block["type"] == "created_by":
103 |             item_ret = self.base_parser.created_by_parser(item_block)
104 |         elif item_block["type"] == "last_edited_by":
105 |             item_ret = self.base_parser.last_edited_by_parser(item_block)
106 |         else:
107 |             item_ret = "[unknown_type:" + item_block["type"] + "]"
108 |             common_op.debug_log("[ISSUE] unknown properties type:" + item_block["type"], level=NotionDump.DUMP_MODE_DEFAULT)
109 |         if item_ret is None:
110 |             item_ret = ""
111 |         return item_ret
112 | 
113 |     def database_to_md(self, page_properties, new_id=None):
114 |         if page_properties is None:
115 |             return "", ""
116 |         # 获取属性部分
117 |         if "properties" not in page_properties:
118 |             return "", ""
119 |         page_properties = page_properties["properties"]
120 | 
121 |         # 设置基础解析器的id
122 |         if new_id is not None:
123 |             self.base_parser.set_new_id(new_id)
124 | 
125 |         # 数据库是空的，直接返回完事
126 |         if len(page_properties) == 0:
127 |             return "", ""
128 | 
129 |         properties_md = ""
130 |         # print(page_properties.keys())
131 |         p_title = ""
132 |         p_title_name = ""
133 |         for p_name in list(page_properties.keys())[::-1]:
134 |             p_value = self.__parser_item(page_properties[p_name], page_id="").replace('\n', '<br>')
135 |             if page_properties[p_name]["type"] == "title":
136 |                 p_title = p_value
137 |                 p_title_name = p_name
138 |                 continue
139 |             # print(p_value, p_name)
140 |             properties_md += "\n" + "|" + str(p_name) + "|" + str(p_value) + "|"
141 |         if p_title != "" or p_title_name != "":
142 |             properties_md = "|" + p_title_name + "|" + p_title + "|\n|---|---|" + properties_md
143 |         else:
144 |             properties_md = "|KEY|VALUE|\n|---|---|" + properties_md
145 | 
146 |         if len(page_properties) == 1:
147 |             return "", p_title
148 |         else:
149 |             return properties_md, p_title
150 | 
151 |     # 格式化存储，这里是临时文件存在方式（在外面转成数据库，或者最终输出CSV的格式）
152 |     def database_to_file(self, database_handle, col_name_list=None, new_id=None):
153 |         page_list = database_handle.get("results")
154 |         # 数据库是空的，直接返回完事
155 |         if len(page_list) == 0:
156 |             return ""
157 | 
158 |         # col_name_list 是想要的列，并且会按照该顺序输出；如果没有给定则获取所有列
159 |         if col_name_list is None:
160 |             # 如果没有给定输出顺序，则获取到page中的所有列（注意不保证是显示的顺序！！！！）
161 |             col_name_list = self.__get_col_name_list(page_list[0])
162 | 
163 |         # 创建文件
164 |         suffix = ".csv"
165 |         if self.parser_type == NotionDump.PARSER_TYPE_MD:
166 |             suffix = ".md"
167 |         if new_id is not None:
168 |             self.base_parser.set_new_id(new_id)
169 |             tmp_filename = self.tmp_dir + new_id.replace('-', '') + suffix
170 |         else:
171 |             tmp_filename = self.tmp_dir + self.database_id + suffix
172 | 
173 |         file = open(tmp_filename, "w", encoding="utf-8", newline='')
174 | 
175 |         csv_writer = None
176 |         if self.parser_type == NotionDump.PARSER_TYPE_MD:
177 |             head_line = "|"
178 |             for it in col_name_list:
179 |                 head_line += it + "|"
180 |             head_line += "\n|"
181 |             for i in range(len(col_name_list)):
182 |                 head_line += " --- " + "|"
183 |             file.write(head_line + "\n")
184 |         else:
185 |             csv_writer = csv.writer(file)
186 |             # 首先将列的名称写入到CSV文件中
187 |             csv_writer.writerow(col_name_list)
188 | 
189 |         # 返回的内容好像是倒序的，先倒置过来吧
190 |         page_list.reverse()
191 |         # 解析每一个page的内容
192 |         for page in page_list:
193 |             # 每一个page都有page id
194 |             page_id = page["id"].replace('-', '')
195 |             common_op.debug_log("database page id" + page_id)
196 |             page_iter = []
197 |             for item in col_name_list:
198 |                 # 解析每一个方格的内容
199 |                 page_iter.append(self.__parser_item(page["properties"][item], page_id))
200 |             # 将内容填充到文件中
201 |             if self.parser_type == NotionDump.PARSER_TYPE_MD:
202 |                 page_line = "|"
203 |                 for it in page_iter:
204 |                     if isinstance(it, str):
205 |                         page_line += it.replace('\n', '<br>') + "|"
206 |                     else:
207 |                         page_line += str(it) + "|"
208 |                 file.write(page_line + "\n")
209 |             else:
210 |                 if csv_writer is not None:
211 |                     csv_writer.writerow(page_iter)
212 |                     common_op.debug_log("database page " + page_id + " write csv success")
213 |                 else:
214 |                     common_op.debug_log("database page " + page_id + " write csv fail", level=NotionDump.DUMP_MODE_DEFAULT)
215 |         file.flush()
216 |         file.close()
217 | 
218 |         common_op.debug_log("write file " + tmp_filename, level=NotionDump.DUMP_MODE_DEFAULT)
219 |         # 将临时文件地址转出去，由外面进行进一步的操作
220 |         return tmp_filename
221 | 
222 |     def database_to_dic(self, database_handle, col_name_list=None, new_id=None):
223 |         page_list = database_handle.get("results")
224 |         # 数据库是空的，直接返回完事
225 |         if len(page_list) == 0:
226 |             return
227 | 
228 |         # col_name_list 是想要的列，并且会按照该顺序输出；如果没有给定则获取所有列
229 |         if col_name_list is None:
230 |             # 如果没有给定输出顺序，则获取到page中的所有列（注意不保证是显示的顺序！！！！）
231 |             col_name_list = self.__get_col_name_list(page_list[0])
232 | 
233 |         # 返回的内容好像是倒序的，先倒置过来吧
234 |         page_list.reverse()
235 | 
236 |         db_dic = []
237 |         # 解析每一个page的内容
238 |         for page in page_list:
239 |             # 每一个page都有page id
240 |             page_id = page["id"].replace('-', '')
241 |             common_op.debug_log("database page id" + page_id)
242 |             db_dic_line = {"_page_id": page_id}
243 |             for item in col_name_list:
244 |                 # 解析每一个方格的内容
245 |                 db_dic_line[item] = self.__parser_item(page["properties"][item], page_id)
246 |             # 将内容填充list中
247 |             db_dic.append(db_dic_line)
248 |             common_op.debug_log("database page " + page_id + " get dic success")
249 | 
250 |         # 将临时文件地址转出去，由外面进行进一步的操作
251 |         return db_dic
252 | 


--------------------------------------------------------------------------------
/NotionDump/Parser/mix_parser.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/10
  3 | # mail:geniusrabbit@qq.com
  4 | import copy
  5 | import os
  6 | 
  7 | import NotionDump
  8 | from NotionDump.Notion.Notion import NotionQuery
  9 | from NotionDump.Parser.block_parser import BlockParser
 10 | from NotionDump.Parser.database_parser import DatabaseParser
 11 | from NotionDump.utils import common_op, internal_var
 12 | 
 13 | 
 14 | # 混合递归调用，主要是为Page和Database类型
 15 | class MixParser:
 16 |     # 初始化
 17 |     def __init__(
 18 |             self,
 19 |             mix_id,
 20 |             query_handle: NotionQuery,
 21 |             export_child_pages=False,
 22 |             page_parser_type=NotionDump.PARSER_TYPE_MD,
 23 |             db_parser_type=NotionDump.PARSER_TYPE_PLAIN,
 24 |             col_name_list=None,  # 数据库使用的字段
 25 |     ):
 26 |         self.mix_id = mix_id
 27 |         self.query_handle = query_handle
 28 |         self.page_parser_type = page_parser_type
 29 |         self.db_parser_type = db_parser_type
 30 | 
 31 |         # 是否导出子页面,也就是递归操作
 32 |         self.export_child_page = export_child_pages
 33 | 
 34 |         # 创建临时文件夹
 35 |         self.tmp_dir = NotionDump.TMP_DIR
 36 |         if not os.path.exists(self.tmp_dir):
 37 |             os.mkdir(self.tmp_dir)
 38 | 
 39 |         # 解析器
 40 |         # 这里传入handle是为了子块的解析
 41 |         self.block_parser = BlockParser(
 42 |             block_id=self.mix_id,
 43 |             query_handle=self.query_handle,
 44 |             parser_type=self.page_parser_type,
 45 |             export_child_pages=self.export_child_page
 46 |         )
 47 |         # 初始化一个Database对象，这里page id无关紧要
 48 |         self.database_parser = DatabaseParser(
 49 |             self.mix_id,
 50 |             parser_type=self.db_parser_type,
 51 |             export_child_pages=self.export_child_page
 52 |         )
 53 | 
 54 |         # 收集解析中发证的错误
 55 |         self.error_list = []
 56 | 
 57 |     # 调试时显示子页面内容
 58 |     def __test_show_child_page(self):
 59 |         if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG:
 60 |             print("in page_id: ", self.mix_id, internal_var.PAGE_DIC)
 61 | 
 62 |     def __recursion_mix_parser(self, is_main=False, col_name_list=None):
 63 |         root_name = None
 64 |         update_flag = False
 65 |         recursion_page = copy.deepcopy(internal_var.PAGE_DIC)
 66 |         for child_id in recursion_page:
 67 |             # 判断页面是子页面还是链接页面，链接页面不进行解析(因为添加链接页面时把原页面也加进来了)
 68 |             if common_op.is_link_page(child_id, recursion_page[child_id]):
 69 |                 common_op.update_page_recursion(child_id, recursion=True)
 70 |                 continue
 71 |             # 判断页面是否已经操作过
 72 |             if not common_op.is_page_recursion(child_id):
 73 |                 continue
 74 | 
 75 |             update_flag = True
 76 |             common_op.debug_log("start child_page_id=" + child_id)
 77 |             self.__test_show_child_page()
 78 |             # 先更新页面的状态，无论获取成功或者失败都过去了，只获取一次
 79 |             common_op.update_page_recursion(child_id, recursion=True)
 80 |             common_op.debug_log("S process id " + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
 81 |             page_title = None
 82 |             tmp_filename = None
 83 |             if common_op.is_page(child_id):
 84 |                 # 页面信息
 85 |                 page_detail = self.query_handle.retrieve_page(child_id)
 86 |                 # 页面内容
 87 |                 page_json = self.query_handle.retrieve_block_children(child_id)
 88 |                 if page_json is None or page_detail is None:
 89 |                     common_op.debug_log("get page error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
 90 |                     self.error_list.append("get page error, id=" + child_id)
 91 |                     continue
 92 |                 # 解析属性文本到变量中
 93 |                 page_properties = None
 94 |                 if NotionDump.S_PAGE_PROPERTIES or common_op.is_page_soft(child_id):
 95 |                     # 获取文本
 96 |                     page_properties, page_title = self.database_parser.database_to_md(page_detail, new_id=child_id)
 97 |                 # 解析内容到临时文件中
 98 |                 tmp_filename = self.block_parser.block_to_md(page_json, page_detail=page_properties, new_id=child_id)
 99 |                 # 处理遇到的子页面
100 |                 child_pages_dic = self.block_parser.get_child_pages_dic()
101 |                 if NotionDump.S_PAGE_PROPERTIES:
102 |                     db_child_pages_dic = self.database_parser.get_child_pages_dic()
103 |                     for db_child_dic_key in db_child_pages_dic:
104 |                         if db_child_dic_key not in child_pages_dic:
105 |                             child_pages_dic[db_child_dic_key] = db_child_pages_dic[db_child_dic_key]
106 |             elif common_op.is_db(child_id):
107 |                 db_info = self.query_handle.retrieve_database(child_id)
108 |                 # page里面搞一个Database的解析器
109 |                 db_detail = self.query_handle.query_database(child_id)
110 | 
111 |                 if db_detail is None:
112 |                     # db_info不是必须的，但是在link数据库获取不到
113 |                     common_op.debug_log("get database error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
114 |                     self.error_list.append("get database error, id=" + child_id)
115 |                     continue
116 |                 # 获取解析后的数据
117 |                 tmp_filename = self.database_parser.database_to_file(db_detail, new_id=child_id, col_name_list=col_name_list)
118 |                 child_pages_dic = self.database_parser.get_child_pages_dic()
119 |             elif common_op.is_download(child_id):
120 |                 # 可下载类型
121 |                 # 获取下载后的数据
122 |                 tmp_filename = self.query_handle.download_to_file(download_id=child_id, child_page_item=recursion_page[child_id])
123 |                 child_pages_dic = {}
124 |                 # 尝试下载，没下载成功
125 |                 if tmp_filename == "" and not NotionDump.FILE_WITH_LINK:
126 |                     common_op.debug_log("file download error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
127 |                     self.error_list.append("download error, link:" + recursion_page[child_id]["link_src"])
128 |                     continue
129 |             else:
130 |                 common_op.debug_log("!!! unknown child id type, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
131 |                 self.error_list.append("!!! unknown child id type, id=" + child_id)
132 |                 continue
133 | 
134 |             common_op.debug_log("E process id " + child_id + " success", level=NotionDump.DUMP_MODE_DEFAULT)
135 |             # 再更新本地的存放路径
136 |             common_op.update_child_page_stats(child_id, dumped=True, main_page=is_main, local_path=tmp_filename, page_title=page_title)
137 |             if is_main:
138 |                 root_name = tmp_filename
139 |             # 从页面里获取到所有的子页面,并将子页面添加到父id中
140 |             common_op.update_child_pages(child_pages_dic, child_id)
141 | 
142 |             # 调试
143 |             common_op.debug_log("# end child_page_id=", child_id)
144 |             self.__test_show_child_page()
145 | 
146 |         if update_flag:
147 |             self.__recursion_mix_parser()
148 |         return root_name
149 | 
150 |     def mix_parser(self, root_id, id_type, col_name_list=None):
151 |         # col_name_list 是数据库的可选字段
152 |         common_op.update_child_page_stats(root_id, main_page=True, page_type=id_type)
153 |         root_filename = self.__recursion_mix_parser(True, col_name_list)
154 |         internal_var.PAGE_DIC["errors"] = self.error_list
155 |         return root_filename
156 | 
157 |     def database_collection(self, json_handle, json_type, col_name_list=None):
158 |         # 只能获取数据库类型
159 |         common_op.debug_log("parser_type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
160 |         if json_type == "database":
161 |             return self.database_parser.database_to_dic(json_handle, col_name_list=col_name_list)
162 |         elif json_type == "block":
163 |             common_op.debug_log("need database get type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
164 |             return None
165 |         else:
166 |             common_op.debug_log("unknown parser_type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
167 |             return None
168 | 


--------------------------------------------------------------------------------
/NotionDump/__init__.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/08
  3 | # mail:geniusrabbit@qq.com
  4 | 
# Package metadata
__author__ = "delta1037 <geniusrabbit@qq.com>"
__version__ = "0.2.3"

from NotionDump import utils

# Temporary folder where files are stored
TMP_DIR = "./.tmp/"

# Markdown divider syntax
MD_DIVIDER = "------"
MD_BOOL_TRUE = "✓"
MD_BOOL_FALSE = "✕"
# Rollup separator: "," or "<br>" (comma or line break)
MD_ROLLUP_SEP = ","
MD_HIGHLIGHT = "=="
ID_LEN = len("921e6b4ea44046c6935bcb2c69453196")

# Log output modes
DUMP_MODE_DEBUG = 0
DUMP_MODE_DEFAULT = 1
DUMP_MODE_SILENT = 2
DUMP_MODE = DUMP_MODE_DEFAULT
 27 | 
 28 | 
 29 | # 日志控制器
 30 | class NotionBackupLogger:
 31 |     def __init__(self):
 32 |         self.prefix = "[NotionDump] "
 33 |         self.log_fd = None
 34 | 
 35 |     def log_debug(self, log_str):
 36 |         self.log_info(log_str)
 37 | 
 38 |         # debug内容写入到文件
 39 |         if self.log_fd is None:
 40 |             self.log_fd = open("notion-export-kernel-debug.log", "a+", encoding='utf-8')
 41 |         self.log_fd.write(str(log_str) + "\n")
 42 |         self.log_fd.flush()
 43 | 
 44 |     def log_info(self, log_str):
 45 |         print(self.prefix, end='')
 46 |         print(log_str)
 47 | 
 48 | 
# Package-wide logger instance
LOGGER = NotionBackupLogger()

# Dump types
DUMP_TYPE_BLOCK = 1
DUMP_TYPE_PAGE = 2
DUMP_TYPE_DB_TABLE = 4

# Parser types: Markdown or plain text
PARSER_TYPE_MD = 0
PARSER_TYPE_PLAIN = 2

# Whether to use the buffer (cache)
BUFFER_FILE = TMP_DIR + "notion_download_buffer.json"
USE_BUFFER = True

# Configuration switches
# For files (pdf\image) that are not stored in Notion, try to download them; otherwise place the link directly
FILE_WITH_LINK = True
FORMAT_DATE = "%Y/%m/%d"
FORMAT_DATETIME = "%Y/%m/%d-%H:%M:%S"
# Whether to export the properties of a page
S_PAGE_PROPERTIES = True
# Theme format: default, light, dark, markdown, self_define
S_THEME_TYPE = "default"
# Keys starting with "f" are font colors, "b" are background colors, "d" are database tag colors
S_THEME_LIGHT = {
    "f_gray": "#787774",
    "f_brown": "#9F6B53",
    "f_orange": "#D9730D",
    "f_yellow": "#CB912F",
    "f_green": "#448361",
    "f_blue": "#337EA9",
    "f_purple": "#9065B0",
    "f_pink": "#C14C8A",
    "f_red": "#D44C47",
    "b_gray": "#F1F1EF",
    "b_brown": "#F4EEEE",
    "b_orange": "#FBECDD",
    "b_yellow": "#FBF3DB",
    "b_green": "#EDF3EC",
    "b_blue": "#E7F3F8",
    "b_purple": "#F4F0F7CC",
    "b_pink": "#F9EEF3CC",
    "b_red": "#FDEBEC",
    "d_light_gray": "#E3E2E080",
    "d_gray": "#E3E2E0",
    "d_brown": "#EEE0DA",
    "d_orange": "#FADEC9",
    "d_yellow": "#FDECC8",
    "d_green": "#DBEDDB",
    "d_blue": "#D3E5EF",
    "d_purple": "#E8DEEE",
    "d_pink": "#F5E0E9",
    "d_red": "#FFE2DD",
}
104 | 
# Dark theme colors, using the same "f_" / "b_" / "d_" key prefixes as the other theme tables
S_THEME_DARK = {
    "f_gray": "#9B9B9B",
    "f_brown": "#BA856F",
    "f_orange": "#C77D48",
    "f_yellow": "#CA9849",
    "f_green": "#529E72",
    "f_blue": "#5E87C9",
    "f_purple": "#9D68D3",
    "f_pink": "#D15796",
    "f_red": "#DF5453",
    "b_gray": "#2F2F2F",
    "b_brown": "#4A3228",
    "b_orange": "#5C3B23",
    "b_yellow": "#564328",
    "b_green": "#243D30",
    "b_blue": "#143A4E",
    "b_purple": "#3C2D49",
    "b_pink": "#4E2C3C",
    "b_red": "#522E2A",
    "d_light_gray": "#373737",
    "d_gray": "#5A5A5A",
    "d_brown": "#603B2C",
    "d_orange": "#854C1D",
    "d_yellow": "#89632A",
    "d_green": "#2B593F",
    "d_blue": "#28456C",
    "d_purple": "#492F64",
    "d_pink": "#69314C",
    "d_red": "#6E3630",
}
135 | 
# Color table used when S_THEME_TYPE is "self_define"; same key set as the other theme tables
S_THEME_SELF_DEFINE = {
    "f_gray": "#787774",
    "f_brown": "#9F6B53",
    "f_orange": "#D9730D",
    "f_yellow": "#CB912F",
    "f_green": "#448361",
    "f_blue": "#337EA9",
    "f_purple": "#9065B0",
    "f_pink": "#C14C8A",
    "f_red": "#D44C47",
    "b_gray": "#F1F1EF",
    "b_brown": "#F4EEEE",
    "b_orange": "#FBECDD",
    "b_yellow": "#FBF3DB",
    "b_green": "#EDF3EC",
    "b_blue": "#E7F3F8",
    "b_purple": "#F4F0F7CC",
    "b_pink": "#F9EEF3CC",
    "b_red": "#FDEBEC",
    "d_light_gray": "#E3E2E080",
    "d_gray": "#E3E2E0",
    "d_brown": "#EEE0DA",
    "d_orange": "#FADEC9",
    "d_yellow": "#FDECC8",
    "d_green": "#DBEDDB",
    "d_blue": "#D3E5EF",
    "d_purple": "#E8DEEE",
    "d_pink": "#F5E0E9",
    "d_red": "#FFE2DD",
}
166 | 


--------------------------------------------------------------------------------
/NotionDump/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/utils/__init__.py


--------------------------------------------------------------------------------
/NotionDump/utils/common_op.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/09
  3 | # mail:geniusrabbit@qq.com
  4 | 
  5 | import copy
  6 | import json
  7 | import os.path
  8 | from json import JSONDecodeError
  9 | 
 10 | import NotionDump
 11 | from NotionDump.utils import internal_var
 12 | 
 13 | 
 14 | # 更新子页面的状态
 15 | def update_child_page_stats(child_key, dumped=False, main_page=False, local_path=None, page_type=None, page_title=None):
 16 |     if child_key not in internal_var.PAGE_DIC:
 17 |         # 如果现有的列表里没有这一条,则新加一条
 18 |         debug_log("CREATE child page " + child_key + " from temp", level=NotionDump.DUMP_MODE_DEFAULT)
 19 |         internal_var.PAGE_DIC[child_key] = copy.deepcopy(internal_var.CHILD_PAGE_TEMP)
 20 |     internal_var.PAGE_DIC[child_key]["dumped"] = dumped
 21 |     internal_var.PAGE_DIC[child_key]["main_page"] = main_page
 22 |     if local_path is not None:
 23 |         internal_var.PAGE_DIC[child_key]["local_path"] = local_path
 24 |     if page_type is not None:
 25 |         if page_type == "block" or page_type == "page":
 26 |             internal_var.PAGE_DIC[child_key]["type"] = "page"
 27 |         elif page_type == "database":
 28 |             internal_var.PAGE_DIC[child_key]["type"] = "database"
 29 |         else:
 30 |             debug_log("update_child_page_stats page type is unknown:" + str(page_type),
 31 |                       level=NotionDump.DUMP_MODE_DEFAULT)
 32 |     if page_title is not None and internal_var.PAGE_DIC[child_key]["inter_soft_page"] is True:
 33 |         internal_var.PAGE_DIC[child_key]["inter_soft_page"] = False
 34 |         if internal_var.PAGE_DIC[child_key]["page_name"] == "":
 35 |             internal_var.PAGE_DIC[child_key]["page_name"] = page_title
 36 | 
 37 | 
 38 | # 关于软连接一共有如下情况
 39 | # 同一个页面：add_new_child_page
 40 | # 在同一个页面中，软连接先于实际链接出现
 41 | #   软连接先占位，把实际链接加进去
 42 | # 在同一个页面中，软连接在实际链接后出现
 43 | # 不同的页面：update_child_pages
 44 | # 在不同页面中，软连接先于实际链接出现
 45 | #   实际链接替换，重新解析
 46 | # 在不同页面中，软连接在实际链接后出现
 47 | #   忽略软连接
 48 | # 只出现软连接而没有出现实际链接，pass
 49 | def update_child_pages(child_pages, parent_id):
 50 |     # 按理说这里一定会有父id，如果没有就是出大事了
 51 |     if parent_id not in internal_var.PAGE_DIC:
 52 |         debug_log("parent id" + parent_id + " not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
 53 |         return
 54 | 
 55 |     for child_page_id in child_pages:
 56 |         # 如果发现表里已经有了该页面，看是不是软链接创建的
 57 |         if child_page_id in internal_var.PAGE_DIC:
 58 |             # 如果页表里是软连接创建的，并且外面的不是软连接创建的
 59 |             # 如果里面是硬链接，外面是软连接则会忽略
 60 |             if internal_var.PAGE_DIC[child_page_id]["inter_soft_page"] \
 61 |                     and not child_pages[child_page_id]["inter_soft_page"]:
 62 |                 # 将外面的合入到页面表，替换之后会重新解析，不用担心已经解析过的内容
 63 |                 # 这里相当于填充了一个未开始解析的内容，而调用这个函数之后
 64 |                 # __recursion_mix_parser会在循环遍历一次，将这个页面重新解析
 65 |                 internal_var.PAGE_DIC[child_page_id] = child_pages[child_page_id]
 66 |                 debug_log("REPLACE last created soft page, id=" + child_page_id, level=NotionDump.DUMP_MODE_DEFAULT)
 67 | 
 68 |         # 包括占位的类型，如果总页面表里不存在都放进去
 69 |         if child_page_id not in internal_var.PAGE_DIC:
 70 |             # 如果现有的列表里没有这一条,则新加一条
 71 |             debug_log("CREATE child page " + child_page_id + " from child_pages", level=NotionDump.DUMP_MODE_DEFAULT)
 72 |             internal_var.PAGE_DIC[child_page_id] = copy.deepcopy(child_pages[child_page_id])
 73 | 
 74 |         # 如果该页面是占位的，则不加到父页面表里
 75 |         if not child_pages[child_page_id]["inter_soft_page"]:
 76 |             debug_log("parent id" + parent_id + " add child " + child_page_id,
 77 |                       level=NotionDump.DUMP_MODE_DEFAULT)
 78 |             internal_var.PAGE_DIC[parent_id]["child_pages"].append(child_page_id)
 79 |         else:
 80 |             debug_log("SOFT_PAGE " + child_page_id + " dont need to add to parent_id " + parent_id,
 81 |                       level=NotionDump.DUMP_MODE_DEFAULT)
 82 | 
 83 | 
 84 | # 添加一个新的子页
 85 | # 链接的key格式是 id_链接名
 86 | # 子页面的key格式是id
 87 | def add_new_child_page(child_pages, key_id, link_id=None, link_src=None, page_name=None, page_type=None,
 88 |                        inter_soft_page=False):
 89 |     # 判断id是否存在，存在就不添加了，防止覆盖
 90 |     debug_log("add new child key:" + key_id)
 91 |     # id 存在并且不是软连接创建的，就不添加了（硬链接先于软连接）
 92 |     if key_id in child_pages and not child_pages[key_id]["inter_soft_page"]:
 93 |         debug_log("WARN key_id:" + key_id + " exist, skip", level=NotionDump.DUMP_MODE_DEFAULT)
 94 |         return
 95 |     # 如果不存在或者上一个是软连接创建的，就重新赋值
 96 |     child_pages[key_id] = copy.deepcopy(internal_var.CHILD_PAGE_TEMP)
 97 |     child_pages[key_id]["inter_soft_page"] = inter_soft_page
 98 |     if link_id is not None:
 99 |         # 如果是软链接，递归看一下对应的子页面在不在,如果不在就先占个坑(忽略file和image类型)
100 |         # inter_soft_page 表明该项是软连接创建的
101 |         debug_log("SOFT_PAGE key_id " + key_id + " link_id " + link_id + ", create a null page with link_id",
102 |                   level=NotionDump.DUMP_MODE_DEFAULT)
103 |         add_new_child_page(child_pages, key_id=link_id, link_src=link_src, page_type=page_type, inter_soft_page=True)
104 |     if page_name is not None:
105 |         child_pages[key_id]["page_name"] = page_name
106 |     if link_id is not None:
107 |         child_pages[key_id]["link_id"] = link_id
108 |     if link_src is not None:
109 |         child_pages[key_id]["link_src"] = link_src
110 |     if page_type is not None:
111 |         child_pages[key_id]["type"] = page_type
112 | 
113 | 
# Precondition for this function: the page already exists in the page table
def update_page_recursion(page_id, recursion=False):
    """Set the "inter_recursion" flag of an existing PAGE_DIC entry; only log when the id is unknown."""
    if page_id in internal_var.PAGE_DIC:
        internal_var.PAGE_DIC[page_id]["inter_recursion"] = recursion
        return
    debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
120 | 
121 | 
def is_page_recursion(page_id):
    """Return True when the page's "inter_recursion" flag is falsy; False for an unknown id."""
    try:
        page_entry = internal_var.PAGE_DIC[page_id]
    except KeyError:
        debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return False
    return not page_entry["inter_recursion"]
127 | 
128 | 
def is_page_soft(page_id):
    """Return the "inter_soft_page" flag of the page entry; False for an unknown id."""
    page_dic = internal_var.PAGE_DIC
    if page_id in page_dic:
        return page_dic[page_id]["inter_soft_page"]
    debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
    return False
134 | 
135 | 
# Returns True for a page, False for a database
def is_page(page_id):
    """Return True when the entry's "type" is "page"; False otherwise or for an unknown id."""
    try:
        page_entry = internal_var.PAGE_DIC[page_id]
    except KeyError:
        debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return False
    return page_entry["type"] == "page"
142 | 
143 | 
# Returns True for a database
def is_db(db_id):
    """Return True when the entry's "type" is "database"; False otherwise or for an unknown id."""
    page_dic = internal_var.PAGE_DIC
    if db_id in page_dic:
        return page_dic[db_id]["type"] == "database"
    debug_log("db_id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
    return False
150 | 
151 | 
# Returns True for a downloadable entry (image or file)
def is_download(download_id):
    """Return True when the entry's "type" is "image" or "file"; False for an unknown id."""
    if download_id not in internal_var.PAGE_DIC:
        debug_log("download_id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return False
    # Downloadable types
    entry_type = internal_var.PAGE_DIC[download_id]["type"]
    return entry_type in ("image", "file")
159 | 
160 | 
# Tell whether an entry is a link page
def is_link_page(page_id, page_handle):
    """Return True when page_id contains "_" and the entry has a non-empty "link_id"."""
    return "_" in page_id and page_handle["link_id"] != ""
164 | 
165 | 
# Save an object to a file as JSON text
def save_json_to_file(handle, json_name):
    """Serialize handle as indented, non-ASCII-escaped JSON and write it to json_name.

    Args:
        handle: the object to serialize.
        json_name: path of the file to write (UTF-8, overwritten).

    Returns:
        None. When serialization fails the error is logged and nothing is written.
    """
    try:
        json_handle = json.dumps(handle, ensure_ascii=False, indent=4)
    except (TypeError, ValueError):
        # json.dumps reports unserializable or circular input with TypeError /
        # ValueError; JSONDecodeError (a ValueError subclass) is only raised
        # when decoding, so catching it alone never triggered here
        debug_log("json decode error", level=NotionDump.DUMP_MODE_DEFAULT)
        return

    # The context manager closes the file even when the write fails
    with open(json_name, "w+", encoding="utf-8") as file:
        file.write(json_handle)
178 | 
179 | 
# Load JSON content from a file
def load_json_from_file(json_name):
    """Return the parsed JSON content of json_name.

    Returns:
        The decoded object, or None when the file does not exist or does not
        contain valid JSON (both cases are logged).
    """
    if not os.path.exists(json_name):
        debug_log("json file not exist, path=" + json_name, level=NotionDump.DUMP_MODE_DEFAULT)
        return None
    try:
        # The context manager closes the file on success and on decode failure
        with open(json_name, "r", encoding="utf-8") as json_fd:
            return json.load(json_fd)
    except JSONDecodeError:
        debug_log("json decode error", level=NotionDump.DUMP_MODE_DEFAULT)
        return None
191 | 
192 | 
# Decide whether an extra newline is added between two blocks
def parser_newline(last_type, now_type):
    """Return False when the two consecutive block types belong together, True otherwise.

    No extra newline is wanted between two blocks of the same list-like type
    (to_do, numbered/bulleted list item, toggle, table_row) or between a table
    and its first table_row.
    """
    grouped_types = ("to_do", "numbered_list_item", "bulleted_list_item", "toggle", "table_row")
    same_group = last_type == now_type and now_type in grouped_types
    # Table handling: the header block is followed directly by its rows
    table_start = last_type == "table" and now_type == "table_row"
    return not (same_group or table_start)
209 | 
210 | 
def debug_log(debug_str, level=NotionDump.DUMP_MODE_DEBUG):
    """Route a log message according to the global NotionDump.DUMP_MODE.

    In debug mode every message goes to LOGGER.log_debug. In default mode only
    messages whose level is DUMP_MODE_DEFAULT go to LOGGER.log_info. Any other
    mode (silent) outputs nothing.
    """
    current_mode = NotionDump.DUMP_MODE
    if current_mode == NotionDump.DUMP_MODE_DEBUG:
        NotionDump.LOGGER.log_debug(debug_str)
        return
    # Default mode filters on the message level
    default_mode = NotionDump.DUMP_MODE_DEFAULT
    if current_mode == default_mode and level == default_mode:
        NotionDump.LOGGER.log_info(debug_str)
218 | 


--------------------------------------------------------------------------------
/NotionDump/utils/content_format.py:
--------------------------------------------------------------------------------
  1 | # author: delta1037
  2 | # Date: 2022/01/08
  3 | # mail:geniusrabbit@qq.com
  4 | import datetime
  5 | 
  6 | import dateutil.parser
  7 | 
  8 | 
  9 | # 获取mention的格式
 10 | import NotionDump
 11 | 
 12 | 
 13 | def get_mention_format(mention_content):
 14 |     return "@(" + mention_content + ")"
 15 | 
 16 | 
 17 | # 获取page的格式 运行过程中只填充id，后续调整页面供定位使用
 18 | def get_page_format_md(page_id, page_name, export_child):
 19 |     if export_child:
 20 |         return "[" + page_id + "]()"
 21 |     else:
 22 |         return "[" + page_name + "](" + page_id + ")"
 23 | 
 24 | 
 25 | # 数据库title格式
 26 | def get_database_title_format(title_id, title_ret, export_child):
 27 |     if export_child:
 28 |         return "[" + title_id + "]()"
 29 |     else:
 30 |         # 不导出子页面直接把标题填上去
 31 |         return title_ret
 32 | 
 33 | 
 34 | # 获取page的格式 纯文本只填充名字即可
 35 | def get_page_format_plain(page_name):
 36 |     return page_name
 37 | 
 38 | 
 39 | # 封装URL的格式
 40 | def get_url_format(url_plain, name="link"):
 41 |     return "[" + name + "](" + url_plain + ")"
 42 | 
 43 | 
 44 | def format_date_or_time(date_time):
 45 |     # print(date_time)
 46 |     t_datetime = dateutil.parser.parse(date_time)
 47 |     # print(date_time, t_datetime)
 48 |     if date_time.find('T') != -1:
 49 |         # datetime
 50 |         return t_datetime.strftime(NotionDump.FORMAT_DATETIME)
 51 |     else:
 52 |         # date
 53 |         return t_datetime.strftime(NotionDump.FORMAT_DATE)
 54 | 
 55 | 
 56 | # 封装date的格式
 57 | def get_date_format(start, end):
 58 |     ret_str = ""
 59 |     if start is not None:
 60 |         ret_str = format_date_or_time(start)
 61 |     if end is not None:
 62 |         ret_str += " ~ " + format_date_or_time(end)  # 日期之间用“~”分割
 63 |     return ret_str
 64 | 
 65 | 
 66 | # 封装文件链接格式
 67 | def get_file_format_md(filename, file_url, file_id="", export_child=False):
 68 |     if export_child:
 69 |         if file_id == "":
 70 |             return "[" + filename + "](" + file_url + ")"
 71 |         else:
 72 |             # 等待重定位
 73 |             return "[" + file_id + "]()"
 74 |     else:
 75 |         # 不导出子页面直接把标题填上去
 76 |         return "[" + filename + "](" + file_url + ")"
 77 | 
 78 | 
 79 | # 封装文件链接格式
 80 | def get_file_format_plain(filename, file_url):
 81 |     return filename + "(" + file_url + ")"
 82 | 
 83 | 
 84 | # 行内公式格式
 85 | def get_equation_inline(equation):
 86 |     return "$ " + equation + " $"
 87 | 
 88 | 
 89 | # 块级公式格式
 90 | def get_equation_block(equation):
 91 |     return "$$ " + equation + " $$"
 92 | 
 93 | 
 94 | def color_transformer(input_color, background=False):
 95 |     if background:
 96 |         color_str = "b_" + input_color
 97 |     else:
 98 |         color_str = "f_" + input_color
 99 |     color_ret = ""
100 |     if NotionDump.S_THEME_TYPE == "dark":
101 |         # dark
102 |         if color_str in NotionDump.S_THEME_DARK:
103 |             color_ret = NotionDump.S_THEME_DARK[color_str]
104 |     elif NotionDump.S_THEME_TYPE == "self_define":
105 |         # self_define
106 |         if color_str in NotionDump.S_THEME_SELF_DEFINE:
107 |             color_ret = NotionDump.S_THEME_SELF_DEFINE[color_str]
108 |     else:
109 |         # default light
110 |         if color_str in NotionDump.S_THEME_LIGHT:
111 |             color_ret = NotionDump.S_THEME_LIGHT[color_str]
112 |     if color_ret != "":
113 |         return color_ret
114 |     return input_color
115 | 
116 | 
117 | def color_transformer_db(input_color):
118 |     if input_color == "default":
119 |         color_str = "d_light_gray"
120 |     else:
121 |         color_str = "d_" + input_color
122 | 
123 |     color_ret = ""
124 |     if NotionDump.S_THEME_TYPE == "dark":
125 |         # dark
126 |         if color_str in NotionDump.S_THEME_DARK:
127 |             color_ret = NotionDump.S_THEME_DARK[color_str]
128 |     elif NotionDump.S_THEME_TYPE == "self_define":
129 |         # self_define
130 |         if color_str in NotionDump.S_THEME_SELF_DEFINE:
131 |             color_ret = NotionDump.S_THEME_SELF_DEFINE[color_str]
132 |     else:
133 |         # default light
134 |         if color_str in NotionDump.S_THEME_LIGHT:
135 |             color_ret = NotionDump.S_THEME_LIGHT[color_str]
136 |     if color_ret != "":
137 |         return color_ret
138 |     return input_color
139 | 


--------------------------------------------------------------------------------
/NotionDump/utils/internal_var.py:
--------------------------------------------------------------------------------
 1 | # author: delta1037
 2 | # Date: 2022/01/08
 3 | # mail:geniusrabbit@qq.com
 4 | 
# Values below are in ms (milliseconds)
FRIENDLY_USE_API = 400
FRIENDLY_DOWNLOAD = 1000

# Structure of the exported pages
PAGE_DIC = {}

# Format (template) of one entry in the exported page list
CHILD_PAGE_TEMP = {
    "dumped": False,
    "main_page": False,
    "type": "page",
    "local_path": "",
    "page_name": "",
    "link_id": "",
    "link_src": "",
    "child_pages": [],
    "inter_recursion": False,
    "inter_soft_page": False
}
# inter_soft_page means the entry was created by a link
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # notion-export-kernel
  2 | 
  3 | [中文](https://github.com/delta1037/notion-export-kernel/blob/main/README_zh.md)
  4 | 
  5 | ## Description
  6 | 
  7 | This repository is built on top of [notion-sdk-py](https://github.com/ramnes/notion-sdk-py) (Notion official API) and aims to export Notion pages and databases.
  8 | 
  9 | Main targets：
 10 | 
 11 | - [x] Export Notion Database and page to markdown file
 12 | - [x] Recursively export child pages
 13 | - [x] Download image and files in notion
 14 | 
 15 | ## Structure
 16 | 
 17 | ```shell
 18 | notion-dump
 19 | ├─NotionDump
 20 | │  ├─Dump   # External Interface
 21 | │  ├─Notion # Unified encapsulation interface for communication with Notion
 22 | │  ├─Parser # Some parser
 23 | │  └─utils  # Internal variables and utils functions
 24 | └─Tests 	# Test code
 25 | ```
 26 | 
 27 | #### Parser code structure
 28 | 
 29 | ```mermaid
 30 | graph TD
 31 |     A[Dump] -->B(Database)
 32 |     A[Dump] -->C(Page/Block)
 33 |     B --> D[Mix Parser]
 34 |     C --> D[Mix Parser]
 35 | 
 36 |     D --> E[Database Parser]
 37 |     D --> F[Block Parser]
 38 |     
 39 |     E --> G[Base Parser]
 40 |     F --> G[Base Parser]
 41 | ```
 42 | 
 43 | ## Usage
 44 | 
 45 | ### 3.0 install & example
 46 | 
 47 | **install `notion-dump-kernel`**
 48 | 
 49 | ```powershell
 50 | # open terminal, type the cmd (install the latest version)
 51 | pip install python-dateutil
 52 | pip install notion-dump-kernel
 53 | ```
 54 | 
 55 | **example**
 56 | 
 57 | ```python
 58 | # Example: export page
 59 | import NotionDump
 60 | from NotionDump.Dump.dump import Dump
 61 | from NotionDump.Notion.Notion import NotionQuery
 62 | from NotionDump.utils import common_op
 63 | 
 64 | TOKEN_TEST = "secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
 65 | PAGE_MIX_ID = "43e7aa8ccfb0488eb18f8a453eab0177"
 66 | # NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG
 67 | 
 68 | def test_page_parser(query, export_child=False, db_parser_type=NotionDump.PARSER_TYPE_MD):
 69 |     # init what you want to export
 70 |     # Explain:
 71 |     # 	dump_id: the ID which need to export (block, page or database)
 72 |     # 	query_handle: Notion query handle for getting data from the API (NOT the official API handle)
 73 |     # 	export_child_pages: whether to export all nested pages (sub-pages and link pages)
 74 |     # 	dump_type: the dump_id type [DUMP_TYPE_BLOCK/DUMP_TYPE_PAGE/DUMP_TYPE_DB_TABLE]
 75 |     # 	db_parser_type: PARSER_TYPE_MD means export the database as a markdown table; PARSER_TYPE_PLAIN means export the database as a CSV file
 76 |     # 	page_parser_type: PARSER_TYPE_MD means export the page as a markdown file; PARSER_TYPE_PLAIN means export the page as txt
 77 |     page_handle = Dump(
 78 |         dump_id=PAGE_MIX_ID,
 79 |         query_handle=query,
 80 |         export_child_pages=export_child,
 81 |         dump_type=NotionDump.DUMP_TYPE_PAGE,
 82 |         db_parser_type=db_parser_type,
 83 |         page_parser_type=NotionDump.PARSER_TYPE_MD
 84 |     )
 85 |     
 86 |     # Returned variable , which contain all info about dumped files structure
 87 |     # All parsered files will be save at .tmp/
 88 |     page_detail_json = page_handle.dump_to_file()
 89 | 	
 90 |     # all info about dumped files structure save as json file
 91 |     print("json output to page_parser_result")
 92 |     common_op.save_json_to_file(
 93 |         handle=page_detail_json,
 94 |         json_name=".tmp/page_parser_result.json"
 95 |     )
 96 | 
 97 | 
 98 | if __name__ == '__main__':
 99 |     # We need a qurey handle for getting data from API
100 |     query_handle = NotionQuery(token=TOKEN_TEST)
101 |     if query_handle is None:
102 |         logging.exception("query handle init error")
103 |         exit(-1)
104 | 
105 |     # export_child means export all nested pages(sub-page and link page)
106 |     test_page_parser(query_handle, True)
107 | ```
108 | 
109 | ### 3.1 Output
110 | 
111 | All exported files are written to `.tmp/`, and the **page structure is stored in the returned variable**, which contains all information about the structure of the dumped files.
112 | 
113 | The returned variable (`page_detail_json`) looks like this:
114 | 
115 | ```json
116 | {
117 |     "key_id_1": {
118 |         "dumped": true,
119 |         "main_page": true,
120 |         "type": "page",
121 |         "local_path": "xxxx",
122 |         "page_name": "",
123 |         "link_id": "",
124 |         "child_pages": [
125 |             "xxxxx",
126 |             "xxxxx"
127 |         ],
128 |         "inter_recursion": true,
129 |         "inter_soft_page": false
130 |     },
131 |     "key_id_2": {
132 |         "dumped": false,
133 |         "main_page": false,
134 |         "type": "page",
135 |         "local_path": "",
136 |         "page_name": "",
137 |         "link_id": "xxxxx",
138 |         "child_pages": [],
139 |         "inter_recursion": true,
140 |         "inter_soft_page": false
141 |     }
142 | }
143 | ```
144 | 
145 | **Output explanation**:
146 | 
147 | -   key_id_1: the key is an id (block id / page id / database id); for a link page it is the combination of the link name and the id. The id is the tag used to relocate the link in the page
148 | -   dumped: download status of the resource specified by the id
149 | -   main_page: whether the page is the one specified by dump_id (the root)
150 | -   type: id type, database or page (the page type covers both page and block); if the id is a link, the type is that of the page the link points to
151 | -   local_path: location of the exported file, for subsequent operations
152 | -   page_name: page name (for subsequent relocation of the page url)
153 | -   child_pages: ids of the subpages or databases this key_id contains
154 | -   inter_recursion: internal variable (do NOT use)
155 | -   inter_soft_page: internal variable (do NOT use)
156 | 
157 | ## Attention
158 | 
159 | - [ ] Comments are not supported
160 | 
161 | ## Others
162 | 
163 | ### 6.1 Notion Test Page
164 | 
165 | [Notion Test Page](https://delta1037.notion.site/Notion-dump-ed0a3b0f57b34712bc6bafcbdb413d50)
166 | 
167 | ### 6.2 Notion export client
168 | 
169 | Built on notion-export-kernel, it rebuilds the structure of the dumped files (dumped by notion-export-kernel) and relocates the links in pages.
170 | 
171 | [Github](https://github.com/delta1037/notion-export-client)
172 | 
173 | 


--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
  1 | # notion-export-kernel
  2 | 
  3 | 
  4 | 
  5 | ------
  6 | 
  7 | ## 一、项目说明
  8 | 
  9 | 本仓库是基于 [notion-sdk-py](https://github.com/ramnes/notion-sdk-py)（notion官方API）的开发，导出Notion页面和数据库。
 10 | 
 11 | 项目目标
 12 | 
 13 | - [x] 将Notion页面和数据库导出为Markdown文件
 14 | - [x] 递归导出所有子页面（或者链接）
 15 | - [x] 下载文件和图片
 16 | 
 17 | ## 二、项目结构
 18 | 
 19 | ```shell
 20 | notion-dump
 21 | ├─NotionDump
 22 | │  ├─Dump   # 对外接口
 23 | │  ├─Notion # 与Notion通信统一封装接口
 24 | │  ├─Parser # 实现的一些解析器
 25 | │  └─utils  # 内部变量与杂项函数
 26 | └─Tests 	# 测试代码
 27 | ```
 28 | 
 29 | ```mermaid
 30 | graph TD
 31 |     A[Dump] -->B(Database)
 32 |     A[Dump] -->C(Page/Block)
 33 |     B --> D[Mix Parser]
 34 |     C --> D[Mix Parser]
 35 | 
 36 |     D --> E[Database Parser]
 37 |     D --> F[Block Parser]
 38 |     
 39 |     E --> G[Base Parser]
 40 |     F --> G[Base Parser]
 41 | ```
 42 | 
 43 | 
 44 | 
 45 | 
 46 | 
 47 | ## 三、使用方法
 48 | 
 49 | ### 3.0 安装导入
 50 | 
 51 | **安装`notion-dump-kernel`**
 52 | 
 53 | ```powershell
 54 | # 打开终端，输入如下命令安装（安装最新版）
 55 | pip install python-dateutil
 56 | pip install notion-dump-kernel
 57 | ```
 58 | 
 59 | **导入使用**
 60 | 
 61 | ```python
 62 | import NotionDump
 63 | from NotionDump.Dump.dump import Dump
 64 | from NotionDump.Notion.Notion import NotionQuery
 65 | ```
 66 | 
 67 | 
 68 | 
 69 | ### 3.1 对外统一接口
 70 | 
 71 | ```python
 72 | # 获取Notion查询句柄
 73 | query_handle = NotionQuery(
 74 |     token=TOKEN_TEST,                  				# Token
 75 |     client_handle=None,                				# Notion官方API句柄，默认为空
 76 |     async_api=False                    				# 异步调用，默认为False
 77 | )
 78 | 
 79 | # 获取操作句柄
 80 | handle = Dump(
 81 |     dump_id=ID,                        				# 需要导出的页面ID
 82 |     query_handle=query,                				# Notion查询句柄
 83 |     export_child_pages=True, 		   				# 是否递归导出子页面
 84 |     page_parser_type=NotionDump.PARSER_TYPE_MD,  	# Page导出类型
 85 |     db_parser_type=NotionDump.PARSER_TYPE_PLAIN,	# 数据库导出类型
 86 |     dump_type=NotionDump.DUMP_TYPE_XXX 				# ID的类型，详细见后续说明
 87 | )
 88 | 
 89 | # dump类型 dump_type
 90 | DUMP_TYPE_BLOCK						   				# 块类型
 91 | DUMP_TYPE_PAGE						   				# 页面类型
 92 | DUMP_TYPE_DB_TABLE                     				# 数据库Table类型
 93 | 
 94 | # 导出类型
 95 | PARSER_TYPE_MD										# Markdown格式
 96 | PARSER_TYPE_PLAIN									# 纯文本格式
 97 | 
 98 | # 其它
 99 | # 变量自解释，不再赘述
100 | ```
101 | 
102 | [操作示例](https://github.com/delta1037/notion-dump-kernel/tree/main/Examples)
103 | 
104 | ### 3.2 获取输出
105 | 
106 | dump的结果存放在一个字典变量中，该变量包含了外部可以操作的所有信息，获取输出和输出解释如下
107 | 
108 | ```python
109 | # 获取输出
110 | dump_output = dump_handle.dump_to_file()
111 | # 其中dump_handle为上述的操作句柄（Dump(xxx)返回值）
112 | ```
113 | 
114 | 输出样例：
115 | 
116 | ```json
117 | {
118 |     "key_id_1": {
119 |         "dumped": true,
120 |         "main_page": true,
121 |         "type": "page",
122 |         "local_path": "xxxx",
123 |         "page_name": "",
124 |         "link_id": "",
125 |         "child_pages": [
126 |             "xxxxx",
127 |             "xxxxx"
128 |         ],
129 |         "inter_recursion": true,
130 |         "inter_soft_page": false
131 |     },
132 |     "key_id_2": {
133 |         "dumped": false,
134 |         "main_page": false,
135 |         "type": "page",
136 |         "local_path": "",
137 |         "page_name": "",
138 |         "link_id": "xxxxx",
139 |         "child_pages": [],
140 |         "inter_recursion": true,
141 |         "inter_soft_page": false
142 |     }
143 | }
144 | ```
145 | 
146 | **输出解释**：
147 | 
148 | -   key_id_1：键值，也是dump下来的页面需要重定位的标志
149 | -   dumped：id指向的资源是否成功下载
150 | -   main_page：页面是否是主页
151 | -   type：该id的类型，database或者page（链接的话是链接指向的页面的类型）
152 | -   local_path：导出的文件位置，供后续操作
153 | -   page_name：页面名称（后续重定位使用）
154 | -   child_pages：包含的子页面或者子数据库
155 | -   inter_recursion：内部使用变量，无需关注
156 | -   inter_soft_page：内部使用变量，无需关注
157 | 
158 | ## 五、注意
159 | 
160 | - [ ] 不支持评论内容
161 | 
162 | ## 六、附录
163 | 
164 | ### 6.1 项目测试
165 | 
166 | [项目测试页面](https://delta1037.notion.site/Notion-dump-ed0a3b0f57b34712bc6bafcbdb413d50)
167 | 
168 | ### 6.2 Notion dump client
169 | 
170 | 基于notion-dump-kernel做的一个对下载下来的页面重新组合文件结构，并对其中的链接部分进行重定位的项目
171 | 
172 | [项目Github地址](https://github.com/delta1037/notion-export-local)
173 | 
174 | 


--------------------------------------------------------------------------------
/img/get_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/img/get_data.png


--------------------------------------------------------------------------------
/img/parser_structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/img/parser_structure.png


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wheel~=0.36.2
2 | setuptools~=57.0.0
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | try:
 4 |     from setuptools import setup, find_packages
 5 |     from setuptools import Command  # NOTE(review): not referenced in this script
 6 |     from setuptools import Extension  # NOTE(review): not referenced in this script
 7 | except ImportError:  # setuptools is missing: abort with an install hint
 8 |     sys.exit(
 9 |         "We need the Python library setuptools to be installed. "
10 |         "Try running: python -m ensurepip"
11 |     )
12 | 
13 | if "bdist_wheel" in sys.argv:  # wheel builds additionally need the wheel package
14 |     try:
15 |         import wheel  # noqa: F401
16 |     except ImportError:
17 |         sys.exit(
18 |             "We need both setuptools AND wheel packages installed "
19 |             "for bdist_wheel to work. Try running: pip install wheel"
20 |         )
21 | 
22 | 
23 | REQUIRES = ["notion-client>=0.8.0"]  # runtime dependencies, passed to install_requires below
24 | 
25 | # with open("README_En.md", encoding="utf-8") as handle:
26 | #    readme_rst = handle.read()
27 | 
28 | setup(  # distribution metadata and build options
29 |     name="notion-dump-kernel",
30 |     version="0.2.4",
31 |     author="delta1037",
32 |     author_email="geniusrabbit@qq.com",
33 |     url="https://github.com/delta1037/notion-export-kernel",
34 |     description="Freely available tools for export Notion page and database.",
35 |     project_urls={
36 |         "Documentation": "https://github.com/delta1037/notion-export-kernel/blob/main/README.md",
37 |         "Source": "https://github.com/delta1037/notion-export-kernel",
38 |         "Tracker": "https://github.com/delta1037/notion-export-kernel/issues",
39 |     },
40 |     classifiers=[
41 |         "Development Status :: 3 - Alpha",
42 |         "Intended Audience :: Developers",
43 |         "License :: OSI Approved :: MIT License",
44 |         "Operating System :: OS Independent",
45 |         "Programming Language :: Python",
46 |         "Programming Language :: Python :: 3.9",
47 |         "Topic :: Text Processing :: Markup",
48 |         "Topic :: Software Development :: Libraries :: Python Modules",
49 |     ],
50 |     packages=find_packages(where='.', exclude=(), include=('*',)),  # include every package found under '.'
51 |     include_package_data=True,  # done via MANIFEST.in under setuptools
52 |     install_requires=REQUIRES,
53 | )
54 | # Build and publish
55 | # 1. python setup.py sdist
56 | # 2. twine upload dist/*
57 | 
58 | 


--------------------------------------------------------------------------------