├── .gitignore ├── Examples ├── block_dump.py ├── db_dump.py └── page_dump.py ├── LICENSE ├── MANIFEST.in ├── NotionDump ├── Dump │ ├── __init__.py │ ├── block.py │ ├── database.py │ ├── dump.py │ └── page.py ├── Notion │ ├── Buffer.py │ ├── Notion.py │ └── __init__.py ├── Parser │ ├── __init__.py │ ├── base_parser.py │ ├── block_parser.py │ ├── database_parser.py │ └── mix_parser.py ├── __init__.py └── utils │ ├── __init__.py │ ├── common_op.py │ ├── content_format.py │ └── internal_var.py ├── README.md ├── README_zh.md ├── img ├── get_data.png └── parser_structure.png ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # pycharm 132 | .idea/ 133 | 134 | # tempfile 135 | Tests/.tmp/ 136 | Examples/.tmp/ 137 | -------------------------------------------------------------------------------- /Examples/block_dump.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/11 3 | # mail:geniusrabbit@qq.com 4 | 5 | import logging 6 | 7 | import NotionDump 8 | from NotionDump.Dump.dump import Dump 9 | from NotionDump.Notion.Notion import NotionQuery 10 | from NotionDump.utils import common_op 11 | 12 | TOKEN_TEST = "secret_WRLJ9xyEawNxzRhVHVWfciTl9FAyNCd29GMUvr2hQD4" 13 | TABLE_ID = "13b914160ef740dcb64e55c5393762fa" 14 | RER_LIST_ID = "d32db4693409464b9981caec9ef11974" 15 | 16 | 17 | # 页面表格测试 18 | def test_get_table_block(query, export_child=True): 19 | block_handle = Dump( 20 | dump_id=TABLE_ID, 21 | query_handle=query, 22 | export_child_pages=export_child, 23 | dump_type=NotionDump.DUMP_TYPE_BLOCK 24 | ) 25 | # 将解析内容存储到文件中;返回内容存储为json文件 26 | page_detail_json = block_handle.dump_to_file() 27 | 28 | print("json output to block_table_parser_result") 29 | common_op.save_json_to_file( 30 | handle=page_detail_json, 31 | json_name=".tmp/block_table_parser_result.json" 32 | ) 33 | 34 | 35 | # 递归列表测试 36 | def test_get_rer_list(query, export_child=True): 37 | block_handle = Dump( 38 | dump_id=RER_LIST_ID, 39 | query_handle=query, 40 | 
export_child_pages=export_child, 41 | dump_type=NotionDump.DUMP_TYPE_BLOCK 42 | ) 43 | # 将解析内容存储到文件中;返回内容存储为json文件 44 | page_detail_json = block_handle.dump_to_file() 45 | 46 | print("json output to block_list_parser_result") 47 | common_op.save_json_to_file( 48 | handle=page_detail_json, 49 | json_name=".tmp/block_list_parser_result.json" 50 | ) 51 | 52 | 53 | if __name__ == '__main__': 54 | query_handle = NotionQuery(token=TOKEN_TEST) 55 | if query_handle is None: 56 | logging.exception("query handle init error") 57 | exit(-1) 58 | 59 | # Block解析测试 60 | # test_get_table_block(query_handle, export_child=False) 61 | test_get_table_block(query_handle, export_child=True) 62 | 63 | # Block解析测试 64 | # test_get_rer_list(query_handle, export_child=False) 65 | test_get_rer_list(query_handle, export_child=True) 66 | -------------------------------------------------------------------------------- /Examples/db_dump.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/11 3 | # mail:geniusrabbit@qq.com 4 | 5 | import logging 6 | 7 | import NotionDump 8 | from NotionDump.Dump.database import Database 9 | from NotionDump.Dump.dump import Dump 10 | from NotionDump.Notion.Notion import NotionQuery 11 | from NotionDump.utils import common_op 12 | 13 | TOKEN_TEST = "secret_ALjbBRGaZcagEjPtL1c2F139steBXjr8Fc8uQso4YLV" 14 | DB_TABLE_INLINE_ID = "0b1f524ad42b420f889a2c6adb9b8c92" 15 | NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG 16 | 17 | 18 | # 解析数据库内容测试:根据token和id解析数据库内容,得到临时CSV文件 19 | def test_db_table_inline_parser_dic(query): 20 | db_handle = Database( 21 | database_id=DB_TABLE_INLINE_ID, 22 | query_handle=query, 23 | export_child_pages=False 24 | ) 25 | # 将解析内容存储到文件中;返回内容存储为json文件 26 | page_detail_json = db_handle.dump_to_file() 27 | 28 | print("json output to db_parser_result") 29 | common_op.save_json_to_file( 30 | handle=page_detail_json, 31 | json_name=".tmp/db_parser_result.json" 32 | ) 33 | 
print(db_handle.dump_to_dic()) 34 | 35 | 36 | # 解析数据库内容测试:根据token和id解析数据库内容,得到临时CSV文件 37 | def test_db_table_inline_parser_csv(query, export_child=False): 38 | db_handle = Dump( 39 | dump_id=DB_TABLE_INLINE_ID, 40 | query_handle=query, 41 | export_child_pages=export_child, 42 | dump_type=NotionDump.DUMP_TYPE_DB_TABLE 43 | ) 44 | # 将解析内容存储到文件中;返回内容存储为json文件 45 | page_detail_json = db_handle.dump_to_file() 46 | 47 | print("json output to db_parser_result") 48 | common_op.save_json_to_file( 49 | handle=page_detail_json, 50 | json_name=".tmp/db_parser_result.json" 51 | ) 52 | 53 | 54 | def test_db_table_inline_parser_md(query, export_child=False): 55 | db_handle = Dump( 56 | dump_id=DB_TABLE_INLINE_ID, 57 | query_handle=query, 58 | export_child_pages=export_child, 59 | dump_type=NotionDump.DUMP_TYPE_DB_TABLE, 60 | db_parser_type=NotionDump.PARSER_TYPE_MD, 61 | ) 62 | # 将解析内容存储到文件中;返回内容存储为json文件 63 | page_detail_json = db_handle.dump_to_file() 64 | 65 | print("json output to db_parser_result") 66 | common_op.save_json_to_file( 67 | handle=page_detail_json, 68 | json_name=".tmp/db_parser_result.json" 69 | ) 70 | 71 | 72 | if __name__ == '__main__': 73 | query_handle = NotionQuery(token=TOKEN_TEST) 74 | if query_handle is None: 75 | logging.exception("query handle init error") 76 | exit(-1) 77 | 78 | # 数据库存储到CSV文件 79 | # test_db_table_inline_parser_csv(query_handle, True) 80 | 81 | # 数据库存储到MD文件 82 | test_db_table_inline_parser_md(query_handle, True) 83 | 84 | # 数据库存储到字典 85 | # test_db_table_inline_parser_dic(query_handle) 86 | 87 | -------------------------------------------------------------------------------- /Examples/page_dump.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/11 3 | # mail:geniusrabbit@qq.com 4 | 5 | import logging 6 | 7 | import NotionDump 8 | from NotionDump.Dump.dump import Dump 9 | from NotionDump.Notion.Notion import NotionQuery 10 | from NotionDump.utils import 
common_op 11 | 12 | TOKEN_TEST = "secret_ALjbBRGaZcagEjPtL1c2F139steBXjr8Fc8uQso4YLV" 13 | PAGE_MIX_ID = "921e6b4ea44046c6935bcb2c69453196" 14 | # NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG 15 | 16 | 17 | # 解析数据库内容测试:根据token和id解析数据库内容,得到临时CSV文件 18 | def test_page_parser(query, export_child=False, db_parser_type=NotionDump.PARSER_TYPE_MD): 19 | page_handle = Dump( 20 | dump_id=PAGE_MIX_ID, 21 | query_handle=query, 22 | export_child_pages=export_child, 23 | dump_type=NotionDump.DUMP_TYPE_PAGE, 24 | db_parser_type=db_parser_type 25 | ) 26 | # 将解析内容存储到文件中;返回内容存储为json文件 27 | page_detail_json = page_handle.dump_to_file() 28 | 29 | print("json output to page_parser_result") 30 | common_op.save_json_to_file( 31 | handle=page_detail_json, 32 | json_name=".tmp/page_parser_result.json" 33 | ) 34 | 35 | 36 | def test_page_retrieve(query: NotionQuery): 37 | # 获取页面信息测试 38 | ret = query.retrieve_page("0cee7c12f04c4157bcc025355adf2312") 39 | print(ret) 40 | 41 | 42 | if __name__ == '__main__': 43 | query_handle = NotionQuery(token=TOKEN_TEST) 44 | if query_handle is None: 45 | logging.exception("query handle init error") 46 | exit(-1) 47 | 48 | # 测试收集页面信息 49 | # test_page_retrieve(query_handle) 50 | 51 | # 页面解析测试,递归 52 | test_page_parser(query_handle, True) 53 | 54 | # 页面解析测试,非递归 55 | # test_page_parser(query_handle, False) 56 | 57 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 delta1037 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | recursive-include examples *.txt *.py 3 | prune examples/sample?/build 4 | -------------------------------------------------------------------------------- /NotionDump/Dump/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Dump/__init__.py -------------------------------------------------------------------------------- /NotionDump/Dump/block.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/08 3 | # mail:geniusrabbit@qq.com 4 | import NotionDump 5 | from NotionDump.Dump.page import Page 6 | from NotionDump.Notion.Notion import NotionQuery 7 | from NotionDump.utils import internal_var 8 | 9 | 10 | # Block内容解析 11 | class Block: 12 | # 初始化 13 | def __init__( 14 | self, 15 | block_id, 16 | query_handle: 17 | NotionQuery, 18 | export_child_pages=False, 19 | page_parser_type=NotionDump.PARSER_TYPE_MD, 20 | db_parser_type=NotionDump.PARSER_TYPE_PLAIN 21 | ): 22 | 
self.block_id = block_id.replace('-', '') 23 | self.query_handle = query_handle 24 | # 是否导出子页面 25 | self.export_child_page = export_child_pages 26 | self.page_parser_type = page_parser_type 27 | self.db_parser_type = db_parser_type 28 | 29 | # 构造解析器 30 | self.page_handle = Page( 31 | page_id=self.block_id, 32 | query_handle=self.query_handle, 33 | export_child_pages=self.export_child_page, 34 | page_parser_type=self.page_parser_type, 35 | db_parser_type=self.db_parser_type 36 | ) 37 | 38 | # show_child_page 39 | @staticmethod 40 | def get_pages_detail(): 41 | return internal_var.PAGE_DIC 42 | 43 | # 获取到所有的BLOCK数据 44 | def dump_to_file(self, file_name=None): 45 | # 递归时第一个block单独作为一个main page存放 46 | return self.page_handle.dump_to_file(file_name=file_name) 47 | 48 | def dump_to_db(self): 49 | return self.page_handle.dump_to_db() 50 | 51 | # 源文件,直接输出成json; 辅助测试使用 52 | def dump_to_json(self, json_name=None): 53 | return self.page_handle.dump_to_json(json_name=json_name) 54 | -------------------------------------------------------------------------------- /NotionDump/Dump/database.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/08 3 | # mail:geniusrabbit@qq.com 4 | 5 | import os 6 | import shutil 7 | 8 | import NotionDump 9 | from NotionDump.Notion.Notion import NotionQuery 10 | from NotionDump.Parser.mix_parser import MixParser 11 | from NotionDump.utils import common_op, internal_var 12 | 13 | 14 | class Database: 15 | # 初始化 16 | def __init__( 17 | self, 18 | database_id, 19 | query_handle: NotionQuery, 20 | export_child_pages=False, 21 | page_parser_type=NotionDump.PARSER_TYPE_MD, 22 | db_parser_type=NotionDump.PARSER_TYPE_PLAIN 23 | ): 24 | self.database_id = database_id.replace('-', '') 25 | self.query_handle = query_handle 26 | # 是否导出子页面 27 | self.export_child_page = export_child_pages 28 | self.page_parser_type = page_parser_type 29 | self.db_parser_type = db_parser_type 30 | 31 | # 
构造解析器 32 | self.mix_parser = MixParser( 33 | mix_id=self.database_id, 34 | query_handle=self.query_handle, 35 | export_child_pages=self.export_child_page, 36 | page_parser_type=self.page_parser_type, 37 | db_parser_type=self.db_parser_type 38 | ) 39 | 40 | # 创建临时文件夹 41 | self.tmp_dir = NotionDump.TMP_DIR 42 | if not os.path.exists(self.tmp_dir): 43 | os.mkdir(self.tmp_dir) 44 | 45 | # show_child_page 46 | @staticmethod 47 | def get_pages_detail(): 48 | return internal_var.PAGE_DIC 49 | 50 | # 获取到所有的数据库数据(CSV格式)(数据库导出均是CSV) 51 | def dump_to_file(self, file_name=None, col_name_list=None): 52 | # 解析到临时文件中 53 | tmp_filename = self.mix_parser.mix_parser(root_id=self.database_id, id_type="database", col_name_list=col_name_list) 54 | if tmp_filename is None: 55 | common_op.debug_log("page parser fail, id=" + self.database_id, level=NotionDump.DUMP_MODE_DEFAULT) 56 | return "" 57 | 58 | if file_name is not None: 59 | shutil.copyfile(tmp_filename, file_name) 60 | common_op.debug_log("copy " + tmp_filename + " to " + file_name, level=NotionDump.DUMP_MODE_DEFAULT) 61 | return file_name 62 | 63 | return tmp_filename 64 | 65 | def dump_to_db(self, col_name_list=None, db_q_filter="{}", db_q_sorts="[]"): 66 | # 从配置文件中获取数据库配置,打开数据库,并将csv文件写入到数据库中 67 | db_json = self.query_handle.query_database( 68 | database_id=self.database_id, 69 | db_q_filter=db_q_filter, 70 | db_q_sorts=db_q_sorts) 71 | if db_json is None: 72 | return "" 73 | 74 | # TODO 将CSV文件写入到数据库;调用SQL中的notion2sql提供的接口 75 | return 76 | 77 | # 源文件,直接输出成json; 辅助测试使用 78 | def dump_to_json(self, json_name=None, db_q_filter="{}", db_q_sorts="[]"): 79 | db_json = self.query_handle.query_database( 80 | database_id=self.database_id, 81 | db_q_filter=db_q_filter, 82 | db_q_sorts=db_q_sorts) 83 | if db_json is None: 84 | return "" 85 | 86 | if json_name is None: 87 | json_name = self.tmp_dir + self.database_id + ".json" 88 | common_op.save_json_to_file(db_json, json_name) 89 | 90 | def dump_to_dic(self, col_name_list=None, 
db_q_filter="{}", db_q_sorts="[]"): 91 | # 由于数据库修改属性不会使数据库的修改时间改变,所以这里采用强制更新的方式 92 | db_json = self.query_handle.query_database( 93 | database_id=self.database_id, 94 | db_q_filter=db_q_filter, 95 | db_q_sorts=db_q_sorts, 96 | force_update=True 97 | ) 98 | if db_json is None: 99 | common_op.debug_log("query database get nothing, id=" + self.database_id, 100 | level=NotionDump.DUMP_MODE_DEFAULT) 101 | return "" 102 | 103 | return self.mix_parser.database_collection( 104 | json_handle=db_json, 105 | json_type="database", 106 | col_name_list=col_name_list 107 | ) 108 | -------------------------------------------------------------------------------- /NotionDump/Dump/dump.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/08 3 | # mail:geniusrabbit@qq.com 4 | import copy 5 | 6 | import NotionDump 7 | from NotionDump.Dump.block import Block 8 | from NotionDump.Dump.database import Database 9 | from NotionDump.Dump.page import Page 10 | from NotionDump.Notion.Notion import NotionQuery 11 | from NotionDump.utils import internal_var, common_op 12 | 13 | 14 | class Dump: 15 | def __init__( 16 | self, 17 | dump_id, 18 | query_handle: NotionQuery, 19 | export_child_pages=False, 20 | page_parser_type=NotionDump.PARSER_TYPE_MD, 21 | db_parser_type=NotionDump.PARSER_TYPE_PLAIN, 22 | dump_type=NotionDump.DUMP_TYPE_PAGE 23 | ): 24 | self.dump_id = dump_id.replace('-', '') 25 | self.query_handle = query_handle 26 | # 是否导出子页面 27 | self.export_child_page = export_child_pages 28 | self.page_parser_type = page_parser_type 29 | self.db_parser_type = db_parser_type 30 | self.dump_type = dump_type 31 | 32 | self.handle = None 33 | if dump_type == NotionDump.DUMP_TYPE_PAGE: 34 | self.handle = Page( 35 | page_id=self.dump_id, 36 | query_handle=self.query_handle, 37 | export_child_pages=self.export_child_page, 38 | page_parser_type=self.page_parser_type, 39 | db_parser_type=self.db_parser_type 40 | ) 41 | elif 
dump_type == NotionDump.DUMP_TYPE_BLOCK: 42 | self.handle = Block( 43 | block_id=self.dump_id, 44 | query_handle=self.query_handle, 45 | export_child_pages=self.export_child_page, 46 | page_parser_type=self.page_parser_type, 47 | db_parser_type=self.db_parser_type 48 | ) 49 | elif dump_type == NotionDump.DUMP_TYPE_DB_TABLE: 50 | self.handle = Database( 51 | database_id=self.dump_id, 52 | query_handle=self.query_handle, 53 | export_child_pages=self.export_child_page, 54 | page_parser_type=self.page_parser_type, 55 | db_parser_type=self.db_parser_type 56 | ) 57 | else: 58 | common_op.debug_log("unknown dump type:" + str(self.dump_type), level=NotionDump.DUMP_MODE_DEFAULT) 59 | 60 | # show_child_page 61 | @staticmethod 62 | def __get_pages_detail(): 63 | return internal_var.PAGE_DIC 64 | 65 | # 获取到所有的BLOCK数据 66 | def dump_to_file(self, file_name=None): 67 | if self.handle is None: 68 | common_op.debug_log("dump init fail", level=NotionDump.DUMP_MODE_DEFAULT) 69 | return "" 70 | # 递归时第一个block单独作为一个main page存放 71 | self.handle.dump_to_file(file_name=file_name) 72 | self.query_handle.safe_save() 73 | 74 | pages_detail = copy.deepcopy(internal_var.PAGE_DIC) 75 | internal_var.PAGE_DIC = {} 76 | return pages_detail 77 | 78 | def dump_to_db(self): 79 | if self.handle is None: 80 | common_op.debug_log("dump init fail", level=NotionDump.DUMP_MODE_DEFAULT) 81 | return "" 82 | # 将内容导出到数据库 83 | self.handle.dump_to_db() 84 | self.query_handle.safe_save() 85 | 86 | pages_detail = copy.deepcopy(internal_var.PAGE_DIC) 87 | internal_var.PAGE_DIC = {} 88 | return pages_detail 89 | 90 | # 源文件,直接输出成json; 辅助测试使用 91 | def dump_to_json(self, json_name=None): 92 | if self.handle is None: 93 | common_op.debug_log("dump init fail", level=NotionDump.DUMP_MODE_DEFAULT) 94 | return "" 95 | 96 | self.handle.dump_to_json(json_name=json_name) 97 | self.query_handle.safe_save() 98 | 99 | pages_detail = copy.deepcopy(internal_var.PAGE_DIC) 100 | internal_var.PAGE_DIC = {} 101 | return pages_detail 102 
| -------------------------------------------------------------------------------- /NotionDump/Dump/page.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/08 3 | # mail:geniusrabbit@qq.com 4 | import os 5 | import shutil 6 | 7 | import NotionDump 8 | from NotionDump.Notion.Notion import NotionQuery 9 | from NotionDump.Parser.mix_parser import MixParser 10 | from NotionDump.utils import common_op 11 | from NotionDump.utils import internal_var 12 | 13 | 14 | class Page: 15 | # 初始化 16 | def __init__( 17 | self, 18 | page_id, 19 | query_handle: NotionQuery, 20 | export_child_pages=False, 21 | page_parser_type=NotionDump.PARSER_TYPE_MD, 22 | db_parser_type=NotionDump.PARSER_TYPE_PLAIN 23 | ): 24 | self.page_id = page_id.replace('-', '') 25 | self.query_handle = query_handle 26 | # 是否导出子页面 27 | self.export_child_page = export_child_pages 28 | self.page_parser_type = page_parser_type 29 | self.db_parser_type = db_parser_type 30 | 31 | # 构造解析器 32 | self.mix_parser = MixParser( 33 | mix_id=self.page_id, 34 | query_handle=self.query_handle, 35 | export_child_pages=self.export_child_page, 36 | page_parser_type=self.page_parser_type, 37 | db_parser_type=self.db_parser_type 38 | ) 39 | 40 | # 创建临时文件夹 41 | self.tmp_dir = NotionDump.TMP_DIR 42 | if not os.path.exists(self.tmp_dir): 43 | os.mkdir(self.tmp_dir) 44 | 45 | # show_child_page 46 | @staticmethod 47 | def get_pages_detail(): 48 | return internal_var.PAGE_DIC 49 | 50 | # 获取到所有的PAGE数据 51 | def dump_to_file(self, file_name=None): 52 | # 解析到临时文件中 53 | tmp_md_filename = self.mix_parser.mix_parser(root_id=self.page_id, id_type="block") 54 | if tmp_md_filename is None: 55 | common_op.debug_log("page parser fail, id="+self.page_id, level=NotionDump.DUMP_MODE_DEFAULT) 56 | return "" 57 | 58 | if file_name is not None: 59 | shutil.copyfile(tmp_md_filename, file_name) 60 | common_op.debug_log("copy " + tmp_md_filename + " to " + file_name, 
level=NotionDump.DUMP_MODE_DEFAULT) 61 | return file_name 62 | 63 | return tmp_md_filename 64 | 65 | def dump_to_db(self): 66 | # 从配置文件中获取数据库配置,打开数据库,并将csv文件写入到数据库中 67 | page_json = self.query_handle.retrieve_block_children(self.page_id) 68 | if page_json is None: 69 | return None 70 | 71 | # TODO 将Md文件写入到数据库;调用SQL中的notion2sql提供的接口 72 | return 73 | 74 | # 源文件,直接输出成json; 辅助测试使用 75 | def dump_to_json(self, json_name=None): 76 | page_json = self.query_handle.retrieve_block_children(self.page_id) 77 | if page_json is None: 78 | return None 79 | 80 | if json_name is None: 81 | json_name = self.tmp_dir + self.page_id + ".json" 82 | common_op.save_json_to_file(page_json, json_name) 83 | return 84 | -------------------------------------------------------------------------------- /NotionDump/Notion/Buffer.py: -------------------------------------------------------------------------------- 1 | from time import strftime, localtime 2 | 3 | import NotionDump 4 | from NotionDump.utils import common_op 5 | 6 | 7 | class Buffer: 8 | def __init__(self): 9 | self.base_time = strftime("%Y-%m-%d %H:%M:%S", localtime()) 10 | self.buffer_map = common_op.load_json_from_file(NotionDump.BUFFER_FILE) 11 | if self.buffer_map is None: 12 | self.buffer_map = {} 13 | 14 | def save_buffer(self): 15 | common_op.debug_log("save buffer file") 16 | common_op.save_json_to_file(self.buffer_map, NotionDump.BUFFER_FILE) 17 | 18 | def add_buffer(self, page_id, page_time, id_type="page"): 19 | if page_id not in self.buffer_map: 20 | common_op.debug_log("[BUFFER] add_buffer, new, id=" + page_id + ", type=" + id_type) 21 | self.buffer_map[page_id] = { 22 | "type": id_type, 23 | # 页面上次编辑时间 24 | "last_edited_time": page_time, 25 | # 页面上次下载时间 26 | "update_time": None, 27 | # 页面脏标志 28 | "dirty": True 29 | } 30 | else: 31 | if page_time != self.buffer_map[page_id]["last_edited_time"]: 32 | # 页面编辑过,需要重新下载 33 | common_op.debug_log("[BUFFER] add_buffer, update, id=" + page_id + ", type=" + id_type) 34 | 
self.buffer_map[page_id]["dirty"] = True 35 | self.buffer_map[page_id]["last_edited_time"] = page_time 36 | 37 | def update_buffer(self, page_id): 38 | # 文件已重新下载,设置更新时间 39 | if page_id in self.buffer_map: 40 | common_op.debug_log("[BUFFER] update_buffer, id=" + page_id) 41 | self.buffer_map[page_id]["update_time"] = strftime("%Y-%m-%d %H:%M:%S", localtime()) 42 | self.buffer_map[page_id]["dirty"] = False 43 | 44 | def select_buffer(self, page_id, is_child=False): 45 | # Return the dirty flag: True means the entry is missing or stale and must be downloaded again; False means the cached copy is still valid 46 | if page_id not in self.buffer_map: 47 | common_op.debug_log("[BUFFER] select_buffer, id=" + page_id + ", not exist") 48 | return True 49 | else: 50 | if is_child: 51 | if (self.buffer_map[page_id]["update_time"] or "") >= self.base_time: 52 | # 子块所在的页面刚更新过,子块也要随之更新 53 | common_op.debug_log("[BUFFER] select_buffer, child update, id=" + page_id) 54 | return True 55 | else: 56 | common_op.debug_log("[BUFFER] select_buffer, child old, id=" + page_id) 57 | return self.buffer_map[page_id]["dirty"] 58 | else: 59 | common_op.debug_log("[BUFFER] select_buffer, main, id=" + page_id) 60 | return self.buffer_map[page_id]["dirty"] 61 | -------------------------------------------------------------------------------- /NotionDump/Notion/Notion.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/10 3 | # mail:geniusrabbit@qq.com 4 | import os 5 | from time import sleep, time 6 | import urllib.request 7 | from time import time, sleep 8 | from urllib.error import URLError 9 | from urllib.parse import quote 10 | from notion_client import Client, AsyncClient 11 | from notion_client import APIErrorCode, APIResponseError 12 | 13 | import NotionDump 14 | from NotionDump.Notion.Buffer import Buffer 15 | from NotionDump.utils import common_op, internal_var 16 | 17 | 18 | class NotionQuery: 19 | def __init__(self, token, client_handle=None, async_api=False): 20 | self.token = token 21 | if
client_handle is None and token is not None: 22 | # 有的token话就初始化一下 23 | if not async_api: 24 | self.client = Client(auth=self.token) 25 | else: 26 | self.client = AsyncClient(auth=self.token) 27 | else: 28 | # 没有token,传进来handle就用,没传就不用 29 | self.client = client_handle 30 | 31 | if self.client is None: 32 | common_op.debug_log("notion query init fail", level=NotionDump.DUMP_MODE_DEFAULT) 33 | 34 | # 创建临时文件夹 35 | self.tmp_dir = NotionDump.TMP_DIR 36 | if not os.path.exists(self.tmp_dir): 37 | os.mkdir(self.tmp_dir) 38 | 39 | self.last_call_time = None 40 | self.friendly_time = internal_var.FRIENDLY_USE_API 41 | 42 | self.buffer = Buffer() 43 | 44 | def safe_save(self): 45 | self.buffer.save_buffer() 46 | 47 | def __friendly_use_api(self): 48 | now_time = time() 49 | # 睡眠时间 = 间隔时间 - 函数执行时间 50 | if self.last_call_time is None: 51 | func_exec_ms = self.friendly_time 52 | else: 53 | func_exec_ms = int(round(now_time * 1000)) - int(round(self.last_call_time * 1000)) 54 | sleep_ms = self.friendly_time - func_exec_ms 55 | while sleep_ms > 0: 56 | # 如果需要睡眠 57 | if sleep_ms > 100: 58 | sleep(0.1) 59 | else: 60 | sleep(sleep_ms / 1000.0) 61 | # 按照每次100ms累计 62 | common_op.debug_log("wait for server response..." 
+ str(sleep_ms) + "ms", level=NotionDump.DUMP_MODE_DEFAULT) 63 | sleep_ms -= 100 64 | # 更新上次执行时间 65 | self.last_call_time = time() 66 | 67 | # 获取该块下所有的子块 68 | def retrieve_block_children(self, block_id, parent_id=None, page_size=100, force_update=False): 69 | # 添加缓存系统 70 | if not force_update and NotionDump.USE_BUFFER: 71 | if parent_id is not None: 72 | dirty = self.buffer.select_buffer(parent_id, is_child=True) 73 | else: 74 | dirty = self.buffer.select_buffer(block_id) 75 | if not dirty: 76 | # 缓存命中,直接从缓存中加载数据 77 | common_op.debug_log("[##CACHE] cached and load " + block_id + ";parent is " + str(parent_id), level=NotionDump.DUMP_MODE_DEFAULT) 78 | load = self.__load_from_json(block_id, prefix="retrieve_block_") 79 | if load is not None: 80 | return load 81 | 82 | common_op.debug_log("[&&CACHE] no cached and load " + block_id + "; parent is " + str(parent_id), level=NotionDump.DUMP_MODE_DEFAULT) 83 | self.__friendly_use_api() 84 | query_post = { 85 | "block_id": block_id, 86 | "page_size": page_size 87 | } 88 | try: 89 | query_ret = self.client.blocks.children.list( 90 | **query_post 91 | ) 92 | 93 | # 大量数据一次未读完 94 | next_cur = query_ret["next_cursor"] 95 | while query_ret["has_more"]: 96 | query_post["start_cursor"] = next_cur 97 | common_op.debug_log(query_post, level=NotionDump.DUMP_MODE_DEFAULT) 98 | db_query_ret = self.client.blocks.children.list( 99 | **query_post 100 | ) 101 | next_cur = db_query_ret["next_cursor"] 102 | query_ret["results"] += db_query_ret["results"] 103 | if next_cur is None: 104 | break 105 | if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER: 106 | self.__save_to_json(query_ret, block_id, prefix="retrieve_block_") 107 | if NotionDump.USE_BUFFER and parent_id is None: 108 | # 独立的page 更新页面状态 109 | self.buffer.update_buffer(block_id) 110 | return query_ret 111 | except APIResponseError as error: 112 | if error.code == APIErrorCode.ObjectNotFound: 113 | common_op.debug_log("Block " + block_id + " Retrieve child 
is invalid", 114 | level=NotionDump.DUMP_MODE_DEFAULT) 115 | else: 116 | # Other error handling code 117 | common_op.debug_log(error) 118 | common_op.debug_log("Block " + block_id + " response error", level=NotionDump.DUMP_MODE_DEFAULT) 119 | except Exception as e: 120 | common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT) 121 | common_op.debug_log("Block " + block_id + " Not found or no authority", level=NotionDump.DUMP_MODE_DEFAULT) 122 | return None 123 | 124 | # 获取到所有的数据库数据(JSon格式) 125 | def query_database(self, database_id, db_q_filter="{}", db_q_sorts="[]", force_update=False): 126 | # 添加缓存系统 127 | if not force_update and NotionDump.USE_BUFFER: 128 | if not self.buffer.select_buffer(database_id): 129 | # 缓存命中,直接从缓存中加载数据 130 | common_op.debug_log("[##CACHE] cached and load " + database_id, level=NotionDump.DUMP_MODE_DEFAULT) 131 | load = self.__load_from_json(database_id, prefix="query_db_") 132 | if load is not None: 133 | return load 134 | common_op.debug_log("[&&CACHE] no cached and load " + database_id, level=NotionDump.DUMP_MODE_DEFAULT) 135 | 136 | self.__friendly_use_api() 137 | # 组合查询条件 138 | query_post = {"database_id": database_id} 139 | if db_q_sorts != "[]": 140 | query_post["sorts"] = db_q_sorts 141 | if db_q_filter != "{}": 142 | query_post["filter"] = db_q_filter 143 | try: 144 | query_ret = self.client.databases.query( 145 | **query_post 146 | ) 147 | 148 | # 大量数据一次未读完 149 | next_cur = query_ret["next_cursor"] 150 | while query_ret["has_more"]: 151 | query_post["start_cursor"] = next_cur 152 | common_op.debug_log(query_post, level=NotionDump.DUMP_MODE_DEFAULT) 153 | db_query_ret = self.client.databases.query( 154 | **query_post 155 | ) 156 | next_cur = db_query_ret["next_cursor"] 157 | query_ret["results"] += db_query_ret["results"] 158 | if next_cur is None: 159 | break 160 | 161 | if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER: 162 | self.__save_to_json(query_ret, database_id, prefix="query_db_") 163 | if
NotionDump.USE_BUFFER: 164 | # 独立的page 更新页面状态 165 | self.buffer.update_buffer(database_id) 166 | return query_ret 167 | except APIResponseError as error: 168 | if error.code == APIErrorCode.ObjectNotFound: 169 | common_op.debug_log("Database Query is invalid, id=" + database_id, 170 | level=NotionDump.DUMP_MODE_DEFAULT) 171 | else: 172 | # Other error handling code 173 | common_op.debug_log(error) 174 | common_op.debug_log("Database Query is invalid, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT) 175 | except Exception as e: 176 | common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT) 177 | common_op.debug_log("Database Query Not found or no authority, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT) 178 | return None 179 | 180 | # 获取数据库信息 181 | def retrieve_database(self, database_id): 182 | self.__friendly_use_api() 183 | try: 184 | retrieve_ret = self.client.databases.retrieve(database_id=database_id) 185 | if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER: 186 | self.__save_to_json(retrieve_ret, database_id, prefix="retrieve_db_") 187 | if NotionDump.USE_BUFFER and retrieve_ret is not None and "last_edited_time" in retrieve_ret: 188 | self.buffer.add_buffer(database_id, retrieve_ret["last_edited_time"], id_type="database") 189 | return retrieve_ret 190 | except APIResponseError as error: 191 | if error.code == APIErrorCode.ObjectNotFound: 192 | common_op.debug_log("Database retrieve is invalid, id=" + database_id, 193 | level=NotionDump.DUMP_MODE_DEFAULT) 194 | else: 195 | # Other error handling code 196 | common_op.debug_log(error) 197 | common_op.debug_log("Database retrieve is invalid, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT) 198 | except Exception as e: 199 | common_op.debug_log(e) 200 | common_op.debug_log("Database retrieve Not found or no authority, id=" + database_id, 201 | level=NotionDump.DUMP_MODE_DEFAULT) 202 | return None 203 | 204 | # 获取Page的信息 205 | def retrieve_page(self, page_id): 
206 | self.__friendly_use_api() 207 | try: 208 | retrieve_ret = self.client.pages.retrieve(page_id=page_id) 209 | if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER: 210 | self.__save_to_json(retrieve_ret, page_id, prefix="retrieve_page_") 211 | if NotionDump.USE_BUFFER and retrieve_ret is not None and "last_edited_time" in retrieve_ret: 212 | self.buffer.add_buffer(page_id, retrieve_ret["last_edited_time"]) 213 | return retrieve_ret 214 | except APIResponseError as error: 215 | if error.code == APIErrorCode.ObjectNotFound: 216 | common_op.debug_log("Page retrieve is invalid(api), id=" + page_id, 217 | level=NotionDump.DUMP_MODE_DEFAULT) 218 | else: 219 | # Other error handling code 220 | common_op.debug_log(error) 221 | common_op.debug_log("Page retrieve is invalid(other), id=" + page_id, 222 | level=NotionDump.DUMP_MODE_DEFAULT) 223 | except Exception as e: 224 | common_op.debug_log(e) 225 | common_op.debug_log("Page retrieve Not found or no authority, id=" + page_id, 226 | level=NotionDump.DUMP_MODE_DEFAULT) 227 | return None 228 | 229 | def download_to_file(self, download_id, child_page_item): 230 | # 设置文件链接嵌入时,只有存储在Notion的文件需要下载(不下载会由于时间问题导致链接失效) 231 | if NotionDump.FILE_WITH_LINK and "secure.notion-static.com" not in child_page_item["link_src"]: 232 | return "" 233 | 234 | # 解析文件后缀名 235 | file_url = child_page_item["link_src"] 236 | common_op.debug_log("download url is " + file_url, level=NotionDump.DUMP_MODE_DEBUG) 237 | if file_url == "": 238 | return "" 239 | # 文件名在最后一个/和?之间 240 | if file_url.find('?') != -1: 241 | filename = file_url[file_url.rfind('/') + 1:file_url.find('?')] 242 | else: 243 | filename = file_url[file_url.rfind('/') + 1:] 244 | file_suffix = filename[filename.find('.'):] 245 | # 使用后缀和id生成可识别的文件 246 | download_name = self.tmp_dir + download_id + file_suffix 247 | common_op.debug_log("download name " + download_name, level=NotionDump.DUMP_MODE_DEBUG) 248 | 249 | if NotionDump.USE_BUFFER: 250 | # 看文件是否需要重新下载 251 
| if not self.buffer.select_buffer(download_id) and os.path.exists(download_name): 252 | return download_name 253 | 254 | # 新增记录(注意这里与上面select不属于同一个执行分支) 255 | self.buffer.add_buffer(download_id, "", id_type="file") 256 | 257 | if os.path.exists(download_name): 258 | common_op.debug_log("[WARN] file " + download_name + " was covered", level=NotionDump.DUMP_MODE_DEFAULT) 259 | # 下载文件 260 | self.__friendly_use_api() 261 | try: 262 | file_url = quote(file_url, safe='/:?=&%') 263 | urllib.request.urlretrieve(file_url, download_name) 264 | if NotionDump.USE_BUFFER: 265 | self.buffer.update_buffer(download_id) 266 | return download_name 267 | except urllib.error.HTTPError as e: 268 | common_op.debug_log("download name " + download_name + " get error:HTTPError", 269 | level=NotionDump.DUMP_MODE_DEFAULT) 270 | common_op.debug_log("download url " + file_url + " get error:HTTPError", 271 | level=NotionDump.DUMP_MODE_DEFAULT) 272 | common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT) 273 | except urllib.error.ContentTooShortError as e: 274 | common_op.debug_log("download name " + download_name + " get error:ContentTooShortError", 275 | level=NotionDump.DUMP_MODE_DEFAULT) 276 | common_op.debug_log("download url " + file_url + " get error:ContentTooShortError", 277 | level=NotionDump.DUMP_MODE_DEFAULT) 278 | common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT) 279 | except urllib.error.URLError as e: 280 | common_op.debug_log("download name " + download_name + " get error:URLError", 281 | level=NotionDump.DUMP_MODE_DEFAULT) 282 | common_op.debug_log("download url " + file_url + " get error:URLError", 283 | level=NotionDump.DUMP_MODE_DEFAULT) 284 | common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT) 285 | except TimeoutError as e: 286 | common_op.debug_log("download name " + download_name + " get error:TimeoutError", 287 | level=NotionDump.DUMP_MODE_DEFAULT) 288 | common_op.debug_log("download url " + file_url + " get error:TimeoutError", 289 | 
level=NotionDump.DUMP_MODE_DEFAULT) 290 | common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT) 291 | except Exception as e: 292 | common_op.debug_log("download name " + download_name + " get error:Exception", 293 | level=NotionDump.DUMP_MODE_DEFAULT) 294 | common_op.debug_log("download url " + file_url + " get error:Exception", 295 | level=NotionDump.DUMP_MODE_DEFAULT) 296 | common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT) 297 | return "" 298 | 299 | # 源文件,直接输出成json; 辅助测试使用 300 | def __save_to_json(self, page_json, json_id, json_name=None, prefix=None): 301 | if json_name is None: 302 | if prefix is not None: 303 | json_name = self.tmp_dir + prefix + json_id + ".json" 304 | else: 305 | json_name = self.tmp_dir + json_id + ".json" 306 | common_op.save_json_to_file(page_json, json_name) 307 | 308 | def __load_from_json(self, json_id, json_name=None, prefix=None): 309 | if json_name is None: 310 | if prefix is not None: 311 | json_name = self.tmp_dir + prefix + json_id + ".json" 312 | else: 313 | json_name = self.tmp_dir + json_id + ".json" 314 | return common_op.load_json_from_file(json_name) 315 | -------------------------------------------------------------------------------- /NotionDump/Notion/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Notion/__init__.py -------------------------------------------------------------------------------- /NotionDump/Parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Parser/__init__.py -------------------------------------------------------------------------------- /NotionDump/Parser/base_parser.py: -------------------------------------------------------------------------------- 1 | # 
# author: delta1037
# Date: 2022/01/08
# mail:geniusrabbit@qq.com
import copy

import NotionDump
from NotionDump.utils import content_format, common_op
from NotionDump.utils import internal_var
from urllib.parse import unquote
from NotionDump.utils.content_format import color_transformer, color_transformer_db, format_date_or_time


class BaseParser:
    """Shared parsing helpers for Notion block and database objects.

    Child pages discovered while parsing are collected in ``self.child_pages``.
    """

    def __init__(self, base_id, export_child=False):
        """Store the dash-free ``base_id`` and the ``export_child`` flag."""
        self.export_child = export_child
        self.base_id = base_id.replace('-', '')

        # Dictionary that holds the child pages found while parsing
        self.child_pages = {}

    def set_new_id(self, parent_id):
        """Replace the stored base id with ``parent_id`` (stored as given)."""
        self.base_id = parent_id

    # Hand out the child page dictionary; it is returned only once
    def get_child_pages_dic(self):
        """Return a deep copy of the collected child pages and empty the store."""
        snapshot = copy.deepcopy(self.child_pages)
        # Drop what has just been handed out
        self.child_pages.clear()
        return snapshot

    # Build the formatted form of a piece of text
    @staticmethod
    def __annotations_parser(block_handle, str_plain):
        """Wrap ``str_plain`` according to the annotation flags in ``block_handle``.

        A single trailing newline or tab is taken off before wrapping and put
        back afterwards. ``None`` and ``""`` both give ``""``.
        """
        if not str_plain:
            return ""

        last_char = str_plain[-1:]
        ends_with_break = last_char in ("\n", "\t")
        styled = str_plain[0:-1] if ends_with_break else str_plain

        if block_handle["code"]:
            styled = "`" + styled + "`"
        if block_handle["underline"]:
            # NOTE(review): both wrapper strings are empty in this copy of the
            # file; the underline markup looks lost - confirm against upstream.
            styled = "" + styled + ""
        if block_handle["bold"]:
            styled = "**" + styled + "**"
        if block_handle["italic"]:
            styled = "*" + styled + "*"

        color = block_handle["color"]
        if color != "default":
            # Add colour, telling background colours from foreground colours
            if NotionDump.S_THEME_TYPE == "markdown":
                # Render every colour type with the default markdown highlight
                styled = NotionDump.MD_HIGHLIGHT + styled + NotionDump.MD_HIGHLIGHT
            elif color.find("_background") != -1:
                # NOTE(review): bg_color is unused and the wrappers are empty
                # here; the colour markup looks lost - confirm against upstream.
                bg_color = color[0:color.rfind('_')]
                styled = "" + styled + ""
            else:
                styled = "" + styled + ""

        if block_handle["strikethrough"]:
            styled = "~~" + styled + "~~"
        if ends_with_break:
            styled += last_char
        return styled

    def __text_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False):
        """Turn one rich-text object of type ``text`` into a string.

        In markdown mode a link (``href``) is rendered either as a plain URL
        link or, for an internal ``/...`` link whose id has the expected
        length, as a child-page placeholder that is also registered in
        ``self.child_pages``; annotations are applied last. In any other mode
        the plain text is returned. A wrong type is logged and gives ``""``.
        """
        if block_handle["type"] != "text":
            common_op.debug_log(
                "text type error! id=" + self.base_id + " not type " + block_handle["type"],
                level=NotionDump.DUMP_MODE_DEFAULT)
            return ""

        text_str = block_handle.get("plain_text")
        if text_str is None:
            text_str = ""

        as_markdown = parser_type == NotionDump.PARSER_TYPE_MD
        # Link handling, if there is a link (database titles skip link parsing)
        text_url = block_handle["href"]
        if text_url is not None and as_markdown and not is_db_title:
            # The text carries a link: either a web link or a local one
            if text_url.startswith("http") or not text_url.startswith("/"):
                # Web link, formatted in one go
                text_str = content_format.get_url_format(text_url, text_str)
            else:
                # Page or database link, relocated later
                id_start = text_url.rfind("/") + 1
                if "=" in text_url:
                    id_type = "database"
                    raw_id = text_url[id_start:text_url.rfind("?")]
                else:
                    id_type = "page"
                    raw_id = text_url[id_start:]
                page_id = raw_id.replace('-', '')

                if len(page_id) != NotionDump.ID_LEN:
                    text_str = content_format.get_url_format("", text_str)
                else:
                    key_id = page_id + "_" + text_str
                    common_op.debug_log("### page id " + page_id + " is " + id_type)
                    # Save the page so it can be processed recursively later
                    common_op.add_new_child_page(
                        self.child_pages,
                        key_id=key_id,
                        link_id=page_id,
                        link_src=text_url,
                        page_type=id_type,
                        page_name=text_str
                    )
                    # Child page information has been saved
                    common_op.debug_log("child_page_parser add page id = " + key_id,
                                        level=NotionDump.DUMP_MODE_DEFAULT)
                    text_str = content_format.get_page_format_md(key_id, text_str,
                                                                 export_child=self.export_child)

        if not as_markdown:
            return text_str
        # Apply the annotations part to give text_str its formatting
        return self.__annotations_parser(block_handle["annotations"], text_str)
115 | 116 | def __text_block_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False): 117 | paragraph_ret = "" 118 | if block_handle["type"] == "text": 119 | paragraph_ret = self.__text_parser(block_handle, parser_type) 120 | elif block_handle["type"] == "equation": 121 | paragraph_ret = self.__equation_inline_parser(block_handle) 122 | elif block_handle["type"] == "mention": 123 | paragraph_ret = self.__mention_parser(block_handle, parser_type, is_db_title=is_db_title) 124 | else: 125 | common_op.debug_log( 126 | "text type " + block_handle["type"] + " error! parent_id= " + self.base_id, 127 | level=NotionDump.DUMP_MODE_DEFAULT) 128 | return paragraph_ret 129 | 130 | def __text_list_parser(self, text_list, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db=False, is_db_title=False): 131 | plain_text = "" 132 | if text_list is not None: 133 | for text_block in text_list: 134 | plain_text += self.__text_block_parser(text_block, parser_type, is_db_title=is_db_title) 135 | if is_db: 136 | # 数据库内容特殊字符校对 137 | return plain_text.replace("|", "\\|") 138 | else: 139 | return plain_text 140 | 141 | # TODO : people只获取了名字和ID,后续可以做深度解析用户相关内容 142 | def __people_parser(self, block_handle): 143 | if block_handle["object"] != "user": 144 | common_op.debug_log("people type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT) 145 | return "" 146 | # 优先获取名字 147 | if "name" in block_handle.keys(): 148 | return block_handle["name"] 149 | # 如果无法获取名字则返回id 150 | return block_handle["id"].replace('-', '') 151 | 152 | def __user_parser(self, block_handle): 153 | if block_handle["type"] != "user": 154 | common_op.debug_log("user type error! 
id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT) 155 | return "" 156 | user_body = block_handle["user"] 157 | return self.__people_parser(user_body) 158 | 159 | def __db_file_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 160 | if block_handle["type"] != "file": 161 | common_op.debug_log("file type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT) 162 | return "" 163 | filename = block_handle["name"] 164 | file_url = block_handle["file"]["url"] 165 | 166 | # 解析文件的ID 167 | url_prefix = file_url[0:file_url.rfind("/")] 168 | file_id = url_prefix[url_prefix.rfind("/") + 1:].replace('-', '') 169 | common_op.debug_log("file id is : " + file_id) 170 | 171 | if filename == "": 172 | # 如果文件没有名字使用id作为默认名字 173 | filename = file_id 174 | common_op.add_new_child_page( 175 | self.child_pages, 176 | key_id=file_id, 177 | link_src=file_url, 178 | page_type="file", 179 | page_name=filename 180 | ) 181 | common_op.debug_log( 182 | "file_parser add page id = " + file_id + " name : " + filename, level=NotionDump.DUMP_MODE_DEFAULT) 183 | common_op.debug_log(internal_var.PAGE_DIC) 184 | common_op.debug_log("#############") 185 | common_op.debug_log(self.child_pages) 186 | 187 | # 格式处理简单格式(也可以转换成markdown格式[]()) 188 | if parser_type == NotionDump.PARSER_TYPE_MD: 189 | # file转换成文件链接的形式 190 | return content_format.get_file_format_md(filename, file_url, file_id, self.export_child) 191 | else: 192 | return content_format.get_file_format_plain(filename, file_url) 193 | 194 | # "$ equation_inline $" 195 | def __equation_inline_parser(self, block_handle): 196 | if block_handle["type"] != "equation": 197 | common_op.debug_log("equation inline type error! 
id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT) 198 | return "" 199 | # 公式删除富文本格式 200 | # return content_format.get_equation_inline( 201 | # self.__annotations_parser(block_handle["annotations"], block_handle["plain_text"]) 202 | # ) 203 | return content_format.get_equation_inline(block_handle["plain_text"]) 204 | 205 | # "$$ equation_block $$" 206 | def __equation_block_parser(self, block_handle): 207 | if block_handle["expression"] is None: 208 | common_op.debug_log("equation block no expression! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT) 209 | return "" 210 | return content_format.get_equation_block(block_handle["expression"]) 211 | 212 | # Attention!!! 关于链接到其它的Page可能需要递归处理 213 | def __page_parser(self, block_handle): 214 | if block_handle["type"] != "page": 215 | common_op.debug_log("page type error! parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT) 216 | return "" 217 | 218 | page_body = block_handle["page"] 219 | return page_body["id"].replace('-', '') 220 | 221 | # 提及到其它页面,日期,用户 222 | def __mention_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False): 223 | if block_handle["type"] != "mention": 224 | common_op.debug_log("mention type error! 
parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT) 225 | return "" 226 | 227 | mention_body = block_handle["mention"] 228 | mention_plain = "" 229 | if mention_body["type"] == "date": 230 | mention_plain = self.date_parser(mention_body) 231 | elif mention_body["type"] == "user": 232 | mention_plain = self.__user_parser(mention_body) 233 | elif mention_body["type"] == "link_preview" and "url" in mention_body["link_preview"].keys(): 234 | mention_plain = mention_body["link_preview"]["url"] 235 | elif mention_body["type"] == "database": 236 | database_id = mention_body["database"]["id"].replace('-', '') 237 | key_id = database_id + "_mention" 238 | common_op.debug_log("__mention_parser add database id = " + database_id) 239 | # 获取页面的名字 240 | database_name = block_handle["plain_text"] 241 | database_link = block_handle["href"] 242 | if is_db_title: 243 | mention_plain = database_name 244 | else: 245 | common_op.add_new_child_page( 246 | self.child_pages, 247 | key_id=key_id, 248 | link_id=database_id, 249 | link_src=database_link, 250 | page_type="database", 251 | page_name=database_name 252 | ) 253 | common_op.debug_log( 254 | "file_parser add page id = " + key_id + " name : " + database_name, level=NotionDump.DUMP_MODE_DEFAULT) 255 | common_op.debug_log(internal_var.PAGE_DIC) 256 | common_op.debug_log("#############") 257 | common_op.debug_log(self.child_pages) 258 | 259 | if parser_type == NotionDump.PARSER_TYPE_MD: 260 | mention_plain = content_format.get_page_format_md(key_id, database_name, export_child=self.export_child) 261 | else: 262 | mention_plain = database_name 263 | elif mention_body["type"] == "page": 264 | page_id = self.__page_parser(mention_body) 265 | key_id = page_id + "_mention" 266 | common_op.debug_log("__mention_parser add page id = " + page_id) 267 | # 获取页面的名字 268 | page_name = block_handle["plain_text"] 269 | page_link = block_handle["href"] 270 | 271 | if is_db_title: 272 | mention_plain = page_name 273 | else: 274 | # 
提及页面按照链接页面处理 275 | common_op.add_new_child_page( 276 | self.child_pages, 277 | key_id=key_id, 278 | link_id=page_id, 279 | link_src=page_link, 280 | page_type="page", 281 | page_name=page_name 282 | ) 283 | common_op.debug_log( 284 | "file_parser add page id = " + key_id + " name : " + page_name, level=NotionDump.DUMP_MODE_DEFAULT) 285 | common_op.debug_log(internal_var.PAGE_DIC) 286 | common_op.debug_log("#############") 287 | common_op.debug_log(self.child_pages) 288 | 289 | if parser_type == NotionDump.PARSER_TYPE_MD: 290 | mention_plain = content_format.get_page_format_md(key_id, page_name, export_child=self.export_child) 291 | else: 292 | mention_plain = page_name 293 | else: 294 | common_op.debug_log("unknown mention type " + mention_body["type"], level=NotionDump.DUMP_MODE_DEFAULT) 295 | 296 | if parser_type == NotionDump.PARSER_TYPE_MD: 297 | # 解析annotations部分,为mention_plain添加格式 298 | return self.__annotations_parser(block_handle["annotations"], 299 | content_format.get_mention_format(mention_plain)) 300 | else: 301 | return content_format.get_mention_format(mention_plain) 302 | 303 | def __table_row_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 304 | if block_handle["type"] != "table_row": 305 | common_op.debug_log("table_row type error! parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT) 306 | return "" 307 | table_col_cells = block_handle["table_row"]["cells"] 308 | table_row = [] 309 | for cell in table_col_cells: 310 | table_row.append(self.__text_list_parser(cell, parser_type)) 311 | return table_row 312 | 313 | # 数据库 title 314 | def title_parser(self, block_handle, page_id, parser_type=NotionDump.PARSER_TYPE_PLAIN): 315 | if block_handle["type"] != "title": 316 | common_op.debug_log("title type error! 
parent_id= " + self.base_id + " id= " + block_handle["id"], 317 | level=NotionDump.DUMP_MODE_DEFAULT) 318 | return "" 319 | db_page_title = self.__text_list_parser(block_handle["title"], parser_type, is_db=True, is_db_title=True) 320 | if page_id == "": 321 | # 如果page id是空的,说明只想要内容,不需要重定位 322 | return db_page_title 323 | 324 | if db_page_title != "": 325 | # 如果存在子Page就加入到待解析队列 326 | common_op.debug_log("title ret = " + db_page_title) 327 | if parser_type != NotionDump.PARSER_TYPE_PLAIN: 328 | common_op.debug_log("title_parser add page id = " + page_id, level=NotionDump.DUMP_MODE_DEFAULT) 329 | else: 330 | common_op.debug_log("title_parser add page id = " + page_id) 331 | # 数据库里的都是子页面 332 | common_op.add_new_child_page(self.child_pages, key_id=page_id, page_name=db_page_title) 333 | 334 | # 如果有子页面就添加一个占位符,之后方便重定位 335 | db_page_title = content_format.get_database_title_format(page_id, db_page_title, self.export_child) 336 | return db_page_title 337 | 338 | # 数据库 rich_text 339 | def rich_text_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 340 | if block_handle["type"] != "rich_text": 341 | common_op.debug_log("rich_text type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 342 | level=NotionDump.DUMP_MODE_DEFAULT) 343 | return "" 344 | return self.__text_list_parser(block_handle["rich_text"], parser_type, is_db=True) 345 | 346 | # 数据库 multi_select 347 | def multi_select_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 348 | if block_handle["type"] != "multi_select": 349 | common_op.debug_log("multi_select type error! 
parent_id= " + self.base_id + " id= " + block_handle["id"], 350 | level=NotionDump.DUMP_MODE_DEFAULT) 351 | return "" 352 | multi_select_list = block_handle["multi_select"] 353 | ret_str = "" 354 | if multi_select_list is None: 355 | return ret_str 356 | for multi_select in multi_select_list: 357 | if ret_str != "": 358 | ret_str += "," # 多个选项之间用“,”分割 359 | if parser_type == NotionDump.PARSER_TYPE_MD: 360 | ret_str += " " + multi_select["name"] + " " 363 | else: 364 | ret_str += multi_select["name"] 365 | return ret_str 366 | 367 | # 数据库 select 368 | def select_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 369 | if block_handle["type"] != "select": 370 | common_op.debug_log("select type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 371 | level=NotionDump.DUMP_MODE_DEFAULT) 372 | return "" 373 | select = block_handle["select"] 374 | ret_str = "" 375 | if select is None: 376 | return ret_str 377 | if parser_type == NotionDump.PARSER_TYPE_MD: 378 | ret_str = " " + select["name"] + " " 381 | else: 382 | ret_str = select["name"] 383 | return ret_str 384 | 385 | # 数据库 url 386 | def url_parser(self, block_handle): 387 | if block_handle["type"] != "url": 388 | common_op.debug_log("url type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 389 | level=NotionDump.DUMP_MODE_DEFAULT) 390 | return "" 391 | url = block_handle["url"] 392 | if url is None: 393 | url = "" 394 | return content_format.get_url_format(url) 395 | 396 | # 数据库 email 397 | def email_parser(self, block_handle): 398 | if block_handle["type"] != "email": 399 | common_op.debug_log("email type error! 
parent_id= " + self.base_id + " id= " + block_handle["id"], 400 | level=NotionDump.DUMP_MODE_DEFAULT) 401 | return "" 402 | email = block_handle["email"] 403 | ret_str = "" 404 | if email is not None: 405 | ret_str = email 406 | return ret_str 407 | 408 | # 数据库 checkbox 409 | def checkbox_parser(self, block_handle): 410 | if block_handle["type"] != "checkbox": 411 | common_op.debug_log("checkbox type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 412 | level=NotionDump.DUMP_MODE_DEFAULT) 413 | return "" 414 | checkbox = block_handle["checkbox"] 415 | if checkbox is True: 416 | ret_str = NotionDump.MD_BOOL_TRUE 417 | else: 418 | ret_str = NotionDump.MD_BOOL_FALSE 419 | return ret_str 420 | 421 | # 数据库 phone_number 422 | def phone_number_parser(self, block_handle): 423 | if block_handle["type"] != "phone_number": 424 | common_op.debug_log("phone_number type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 425 | level=NotionDump.DUMP_MODE_DEFAULT) 426 | return "" 427 | phone_number = block_handle["phone_number"] 428 | ret_str = "" 429 | if phone_number is not None: 430 | ret_str = phone_number 431 | return ret_str 432 | 433 | # 数据库 date 434 | def date_parser(self, block_handle): 435 | if block_handle["type"] != "date": 436 | common_op.debug_log("date type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 437 | level=NotionDump.DUMP_MODE_DEFAULT) 438 | return "" 439 | date = block_handle["date"] 440 | if date is None: 441 | return "" 442 | return content_format.get_date_format(date["start"], date["end"]) 443 | 444 | # 数据库 people 445 | def people_parser(self, block_handle): 446 | if block_handle["type"] != "people": 447 | common_op.debug_log("people type error! 
parent_id= " + self.base_id + " id= " + block_handle["id"], 448 | level=NotionDump.DUMP_MODE_DEFAULT) 449 | return "" 450 | people_list = block_handle["people"] 451 | ret_str = "" 452 | if people_list is None: 453 | return ret_str 454 | for people in people_list: 455 | if ret_str != "": 456 | ret_str += "," # 多个用户之间用“,”分割 457 | ret_str += self.__people_parser(people) 458 | return ret_str 459 | 460 | # 数据库 number 461 | def number_parser(self, block_handle): 462 | if block_handle["type"] != "number": 463 | common_op.debug_log("number type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 464 | level=NotionDump.DUMP_MODE_DEFAULT) 465 | return "" 466 | number = block_handle["number"] 467 | ret_str = "" 468 | if number is None: 469 | return ret_str 470 | ret_str = number 471 | return str(ret_str) 472 | 473 | # 数据库 files 474 | def files_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 475 | if block_handle["type"] != "files": 476 | common_op.debug_log("files type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 477 | level=NotionDump.DUMP_MODE_DEFAULT) 478 | return "" 479 | files_list = block_handle["files"] 480 | ret_str = "" 481 | if files_list is None: 482 | return ret_str 483 | for file in files_list: 484 | if ret_str != "": 485 | if parser_type == NotionDump.PARSER_TYPE_MD: 486 | ret_str += "
" # 多个文件之间用“
”分割 487 | else: 488 | ret_str += "," # 多个文件之间用“,”分割 489 | ret_str += self.__db_file_parser(file, parser_type) 490 | return ret_str 491 | 492 | # 数据库 relation 数据 493 | def relation_parser(self, block_handle): 494 | if block_handle["type"] != "relation": 495 | common_op.debug_log("relation type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 496 | level=NotionDump.DUMP_MODE_DEFAULT) 497 | return "" 498 | relation_list = block_handle["relation"] 499 | relation_ret = "" 500 | for relation_item in relation_list: 501 | relation_id = relation_item["id"].replace("-", "") 502 | # 按照软连接处理 503 | common_op.add_new_child_page( 504 | self.child_pages, 505 | key_id=relation_id + "_relation", 506 | link_id=relation_id, 507 | page_type="page", 508 | page_name="" 509 | ) 510 | if relation_ret != "": 511 | relation_ret += "," 512 | relation_ret += content_format.get_database_title_format(relation_id + "_relation", "", self.export_child) 513 | return relation_ret 514 | 515 | # 数据库 formula 数据 516 | def formula_parser(self, block_handle): 517 | if block_handle["type"] != "formula": 518 | common_op.debug_log("formula type error! 
parent_id= " + self.base_id + " id= " + block_handle["id"], 519 | level=NotionDump.DUMP_MODE_DEFAULT) 520 | return "" 521 | formula_block = block_handle["formula"] 522 | ret_str = "" 523 | if formula_block["type"] == "string": 524 | ret_str = formula_block["string"] 525 | elif formula_block["type"] == "number": 526 | ret_str = str(formula_block["number"]) 527 | elif formula_block["type"] == "boolean": 528 | if formula_block["boolean"] is True: 529 | ret_str = NotionDump.MD_BOOL_TRUE 530 | else: 531 | ret_str = NotionDump.MD_BOOL_FALSE 532 | # ret_str = str(formula_block["boolean"]) 533 | elif formula_block["type"] == "date": 534 | ret_str = self.date_parser(formula_block) 535 | else: 536 | ret_str = "[unknown_formula_type:" + formula_block["type"] + "]" 537 | return ret_str 538 | 539 | # 数据库 created_time 540 | def created_time_parser(self, block_handle): 541 | if block_handle["type"] != "created_time": 542 | common_op.debug_log("created_time type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 543 | level=NotionDump.DUMP_MODE_DEFAULT) 544 | return "" 545 | return format_date_or_time(block_handle["created_time"]) 546 | 547 | # 数据库 last_edited_time 548 | def last_edited_time_parser(self, block_handle): 549 | if block_handle["type"] != "last_edited_time": 550 | common_op.debug_log( 551 | "last_edited_time type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 552 | level=NotionDump.DUMP_MODE_DEFAULT) 553 | return "" 554 | return format_date_or_time(block_handle["last_edited_time"]) 555 | 556 | def created_by_parser(self, block_handle): 557 | if block_handle["type"] != "created_by": 558 | common_op.debug_log("created_by type error! 
parent_id= " + self.base_id + " id= " + block_handle["id"], 559 | level=NotionDump.DUMP_MODE_DEFAULT) 560 | return "" 561 | return self.__people_parser(block_handle["created_by"]) 562 | 563 | # 数据库 last_edited_by 564 | def last_edited_by_parser(self, block_handle): 565 | if block_handle["type"] != "last_edited_by": 566 | common_op.debug_log( 567 | "last_edited_by type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 568 | level=NotionDump.DUMP_MODE_DEFAULT) 569 | return "" 570 | return self.__people_parser(block_handle["last_edited_by"]) 571 | 572 | # Page paragraph 573 | # mention 574 | # date 575 | # user 576 | # page 577 | # text 578 | # equation 579 | def paragraph_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 580 | paragraph_ret = "" 581 | if block_handle["type"] != "paragraph": 582 | common_op.debug_log("paragraph type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 583 | level=NotionDump.DUMP_MODE_DEFAULT) 584 | return paragraph_ret 585 | return self.__text_list_parser(block_handle["paragraph"]["rich_text"], parser_type) 586 | 587 | # Page heading_1 588 | def heading_1_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 589 | heading_1_ret = "" 590 | if block_handle["type"] != "heading_1": 591 | common_op.debug_log("heading_1 type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 592 | level=NotionDump.DUMP_MODE_DEFAULT) 593 | return heading_1_ret 594 | heading_1_ret = self.__text_list_parser(block_handle["heading_1"]["rich_text"], parser_type) 595 | if parser_type == NotionDump.PARSER_TYPE_MD: 596 | return "# " + heading_1_ret 597 | else: 598 | return heading_1_ret 599 | 600 | # Page heading_2 601 | def heading_2_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 602 | heading_2_ret = "" 603 | if block_handle["type"] != "heading_2": 604 | common_op.debug_log("heading_2 type error! 
parent_id= " + self.base_id + " id= " + block_handle["id"], 605 | level=NotionDump.DUMP_MODE_DEFAULT) 606 | return heading_2_ret 607 | heading_2_ret = self.__text_list_parser(block_handle["heading_2"]["rich_text"], parser_type) 608 | 609 | if parser_type == NotionDump.PARSER_TYPE_MD: 610 | return "## " + heading_2_ret 611 | else: 612 | return heading_2_ret 613 | 614 | # Page heading_3 615 | def heading_3_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 616 | heading_3_ret = "" 617 | if block_handle["type"] != "heading_3": 618 | common_op.debug_log("heading_3 type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 619 | level=NotionDump.DUMP_MODE_DEFAULT) 620 | return heading_3_ret 621 | heading_3_ret = self.__text_list_parser(block_handle["heading_3"]["rich_text"], parser_type) 622 | 623 | if parser_type == NotionDump.PARSER_TYPE_MD: 624 | return "### " + heading_3_ret 625 | else: 626 | return heading_3_ret 627 | 628 | # Page to_do 629 | def to_do_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 630 | to_do_ret = "" 631 | if block_handle["type"] != "to_do": 632 | common_op.debug_log("to_do type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 633 | level=NotionDump.DUMP_MODE_DEFAULT) 634 | return to_do_ret 635 | to_do_ret = self.__text_list_parser(block_handle["to_do"]["rich_text"], parser_type) 636 | 637 | if parser_type == NotionDump.PARSER_TYPE_MD: 638 | if block_handle["to_do"]["checked"]: 639 | return "- [x] " + to_do_ret 640 | else: 641 | return "- [ ] " + to_do_ret 642 | else: 643 | return to_do_ret 644 | 645 | # Page bulleted_list_item 646 | def bulleted_list_item_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 647 | bulleted_list_item_ret = "" 648 | if block_handle["type"] != "bulleted_list_item": 649 | common_op.debug_log( 650 | "bulleted_list_item type error! 
parent_id= " + self.base_id + " id= " + block_handle["id"], 651 | level=NotionDump.DUMP_MODE_DEFAULT) 652 | return bulleted_list_item_ret 653 | bulleted_list_item_ret = self.__text_list_parser(block_handle["bulleted_list_item"]["rich_text"], parser_type) 654 | 655 | if parser_type == NotionDump.PARSER_TYPE_MD: 656 | return "- " + bulleted_list_item_ret 657 | else: 658 | return bulleted_list_item_ret 659 | 660 | # Page numbered_list_item 661 | def numbered_list_item_parser(self, block_handle, list_index, parser_type=NotionDump.PARSER_TYPE_PLAIN): 662 | numbered_list_item_ret = "" 663 | if block_handle["type"] != "numbered_list_item": 664 | common_op.debug_log( 665 | "numbered_list_item type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 666 | level=NotionDump.DUMP_MODE_DEFAULT) 667 | return numbered_list_item_ret 668 | numbered_list_item_ret = self.__text_list_parser(block_handle["numbered_list_item"]["rich_text"], parser_type) 669 | 670 | if parser_type == NotionDump.PARSER_TYPE_MD: 671 | return str(list_index) + ". " + numbered_list_item_ret 672 | else: 673 | return numbered_list_item_ret 674 | 675 | # Page toggle 676 | def toggle_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN): 677 | toggle_ret = "" 678 | if block_handle["type"] != "toggle": 679 | common_op.debug_log("toggle type error! parent_id= " + self.base_id + " id= " + block_handle["id"], 680 | level=NotionDump.DUMP_MODE_DEFAULT) 681 | return toggle_ret 682 | toggle_ret = self.__text_list_parser(block_handle["toggle"]["rich_text"], parser_type) 683 | 684 | if parser_type == NotionDump.PARSER_TYPE_MD: 685 | return "- " + toggle_ret 686 | else: 687 | return toggle_ret 688 | 689 | # Page divider 690 | def divider_parser(self, block_handle): 691 | divider_ret = "" 692 | if block_handle["type"] != "divider": 693 | common_op.debug_log("divider type error! 
# Page callout
def callout_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``callout`` block.

    Markdown mode prefixes the text with ``"> "``; other modes return the
    bare text. A type mismatch is logged and yields ``""``.
    """
    callout_ret = ""
    if block_handle["type"] != "callout":
        common_op.debug_log("callout type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return callout_ret
    callout_ret = self.__text_list_parser(block_handle["callout"]["rich_text"], parser_type)

    if parser_type == NotionDump.PARSER_TYPE_MD:
        # Open question from the author: should the prefix be applied to every line?
        return "> " + callout_ret
    else:
        return callout_ret

# Page code
def code_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``code`` block.

    Markdown mode wraps the text in a fenced block tagged with the block's
    ``language`` value (empty tag when that value is ``None``); other modes
    return the bare text. A type mismatch is logged and yields ``""``.
    """
    code_ret = ""
    if block_handle["type"] != "code":
        common_op.debug_log("code type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return code_ret
    code_ret = self.__text_list_parser(block_handle["code"]["rich_text"], parser_type)

    code_type = block_handle["code"]["language"]
    if code_type is None:
        # A missing language produces an untagged fence.
        code_type = ""

    if parser_type == NotionDump.PARSER_TYPE_MD:
        # Open question from the author: should this be applied to every line?
        return "```" + code_type + "\n" + code_ret + "\n```"
    else:
        return code_ret

# Page quote
def quote_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``quote`` block, optionally decorated by its block-level colour.

    Markdown mode prefixes the (possibly decorated) text with ``"> "``; other
    modes return it as is. A type mismatch is logged and yields ``""``.
    """
    quote_ret = ""
    if block_handle["type"] != "quote":
        common_op.debug_log("quote type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return quote_ret
    quote_ret = self.__text_list_parser(block_handle["quote"]["rich_text"], parser_type)
    # Outermost (block-level) colour.
    if block_handle["quote"]["color"] != "default":
        # Apply the colour, distinguishing background from foreground colours.
        if NotionDump.S_THEME_TYPE == "markdown":
            # Render every colour kind with Markdown's default highlight marker.
            quote_ret = NotionDump.MD_HIGHLIGHT + quote_ret + NotionDump.MD_HIGHLIGHT
        else:
            if block_handle["quote"]["color"].find("_background") != -1:
                # Colour name with the trailing "_background" part removed.
                bg_color = block_handle["quote"]["color"][0:block_handle["quote"]["color"].rfind('_')]
                # NOTE(review): as shown here both wrappers are empty strings and
                # bg_color is never used; this listing appears to have lost
                # markup at this point - confirm against the real source file.
                quote_ret = "" + quote_ret + ""
            else:
                quote_ret = "" + quote_ret + ""

    if parser_type == NotionDump.PARSER_TYPE_MD:
        # Open question from the author: should the prefix be applied to every line?
        return "> " + quote_ret
    else:
        return quote_ret
def equation_parser(self, block_handle):
    """Render an ``equation`` block via ``self.__equation_block_parser``.

    :param block_handle: block dict whose ``type`` must be ``"equation"``.
    :return: the helper's result for ``block_handle["equation"]``, or ``""``
        (after a debug log) on a type mismatch.
    """
    if block_handle["type"] != "equation":
        # The log message now names the block type, like every sibling parser.
        common_op.debug_log(
            "equation type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    return self.__equation_block_parser(block_handle["equation"])


def table_row_parser(self, block_handle, first_row=False, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``table_row`` block as one Markdown table line.

    :param block_handle: block dict whose ``type`` must be ``"table_row"``.
    :param first_row: when true, the ``| --- |`` separator line is appended
        after the row so that it becomes the table header.
    :param parser_type: forwarded to ``self.__table_row_parser``.
    :return: ``"|cell|cell|"`` (plus the separator line for the first row),
        or ``""`` (after a debug log) on a type mismatch.
    """
    if block_handle["type"] != "table_row":
        common_op.debug_log(
            "table_row type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    cells = self.__table_row_parser(block_handle, parser_type)
    # A raw newline would end the table row, so in-cell line breaks are
    # written as the HTML <br> tag.
    row = "|" + "".join(cell.replace('\n', '<br>') + "|" for cell in cells)
    if first_row:
        row += "\n|" + " --- |" * len(cells)
    return row
def child_page_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Turn a ``child_page`` block into a page-link placeholder.

    A page with an empty title is rendered as ``"NULL Page"`` and is not
    registered. Otherwise the page (id with dashes removed) is recorded in
    ``self.child_pages`` through ``common_op.add_new_child_page`` and a
    placeholder for it is returned. A type mismatch is logged and yields ``""``.
    """
    if block_handle["type"] != "child_page":
        common_op.debug_log(
            "child_page type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    as_markdown = parser_type == NotionDump.PARSER_TYPE_MD
    title = block_handle["child_page"]["title"]

    if title == "":
        if as_markdown:
            return content_format.get_page_format_md("NULL Page", "NULL Page", export_child=self.export_child)
        return content_format.get_page_format_plain("NULL Page")

    page_id = block_handle["id"].replace('-', '')
    # Remember the child page so it can be resolved later.
    common_op.debug_log("child_page_parser add page id = " + page_id, level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.add_new_child_page(self.child_pages, key_id=page_id, page_name=title)

    if as_markdown:
        return content_format.get_page_format_md(page_id, title, export_child=self.export_child)
    return content_format.get_page_format_plain(title)


def child_database_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Turn a ``child_database`` block into a page-link placeholder.

    The database is only recorded in ``self.child_pages`` (page type
    ``"database"``); its content is not parsed here. A type mismatch is
    logged and yields ``""``.
    """
    if block_handle["type"] != "child_database":
        common_op.debug_log(
            "child_database type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    db_title = block_handle["child_database"]["title"]
    child_db_id = block_handle["id"].replace('-', '')
    common_op.add_new_child_page(
        self.child_pages,
        key_id=child_db_id,
        page_type="database",
        page_name=db_title
    )
    common_op.debug_log(
        "child_database_parser add page id = " + child_db_id + "name : " + db_title,
        level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)

    # Return a link placeholder that is resolved in a later pass.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_page_format_plain(db_title)
    return content_format.get_page_format_md(child_db_id, db_title, export_child=self.export_child)
def image_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Turn an ``image`` block into a link placeholder.

    The image is recorded in ``self.child_pages`` (page type ``"image"``)
    with its URL as ``link_src``. The caption is used as the name, falling
    back to the dash-less block id. An image without a URL is logged but is
    still recorded with an empty ``link_src``. A type mismatch is logged and
    yields ``""``.
    """
    if block_handle["type"] != "image":
        common_op.debug_log(
            "image type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    image_body = block_handle["image"]
    image_id = block_handle["id"].replace('-', '')
    image_name = self.__text_list_parser(image_body["caption"], parser_type)

    # The URL lives under the key named by the image's own "type" field.
    image_type = image_body["type"]
    image_url = ""
    if image_type in image_body.keys() and "url" in image_body[image_type].keys():
        image_url = image_body[image_type]["url"]
    if image_url == "":
        common_op.debug_log("unknown image type" + image_body["type"],
                            level=NotionDump.DUMP_MODE_DEFAULT)

    if image_name == "":
        # No caption: use the id as the default name.
        image_name = image_id

    common_op.add_new_child_page(
        self.child_pages,
        key_id=image_id,
        link_src=image_url,
        page_type="image",
        page_name=image_name
    )
    common_op.debug_log(
        "image_parser add page id = " + image_id + "name : " + image_name, level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)

    # Return a link placeholder that is resolved in a later pass.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_page_format_plain(image_name)
    return content_format.get_page_format_md(image_id, image_name, export_child=self.export_child)
def file_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Turn a ``file``, ``pdf`` or ``video`` block into a link placeholder.

    The file is recorded in ``self.child_pages`` (page type ``"file"``) with
    its URL as ``link_src``. The name is taken from the caption, then from
    the last path segment of the URL (query string removed, percent-decoded),
    and finally defaults to ``"FILE"``.

    :param block_handle: block dict of type ``"file"``, ``"pdf"`` or ``"video"``.
    :param parser_type: ``NotionDump.PARSER_TYPE_MD`` selects the Markdown
        placeholder; any other value the plain one.
    :return: the placeholder, or ``""`` (after a debug log) when the type is
        wrong or the block carries no URL.
    """
    block_type = block_handle["type"]
    if block_type not in ("file", "pdf", "video"):
        common_op.debug_log(
            "file type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    file_body = block_handle[block_type]
    file_id = block_handle["id"].replace('-', '')
    file_name = self.__text_list_parser(file_body["caption"], parser_type)

    # The URL lives under the key named by the payload's own "type" field.
    file_type = file_body["type"]
    file_url = ""
    if file_type in file_body.keys() and "url" in file_body[file_type].keys():
        file_url = file_body[file_type]["url"]
    if file_url == "":
        common_op.debug_log("unknown block type" + file_body["type"] + " with null url",
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    if file_name == "":
        # No caption: derive the name from the URL. Only cut at '?' when one
        # is present; slicing with rfind()'s -1 would drop the last character
        # of a URL that has no query string.
        query_start = file_url.rfind('?')
        file_url_basic = file_url if query_start == -1 else file_url[:query_start]
        file_name = file_url_basic[file_url_basic.rfind('/') + 1:]
        # The URL segment is percent-encoded.
        file_name = unquote(file_name, 'utf-8')
    if file_name == "":
        # Still nothing usable: fall back to a fixed name.
        file_name = "FILE"

    common_op.add_new_child_page(
        self.child_pages,
        key_id=file_id,
        link_src=file_url,
        page_type="file",
        page_name=file_name
    )
    common_op.debug_log(
        "file_parser add page id = " + file_id + " name : " + file_name, level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)

    # Return a link placeholder that is resolved in a later pass.
    if parser_type == NotionDump.PARSER_TYPE_MD:
        return content_format.get_page_format_md(file_id, file_name, export_child=self.export_child)
    return content_format.get_page_format_plain(file_name)
def bookmark_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``bookmark`` block as a file-style link.

    The caption is the link name (``"BOOKMARK"`` when empty) and the block's
    ``url`` is the target. A type mismatch is logged and yields ``""``.
    """
    if block_handle["type"] != "bookmark":
        common_op.debug_log(
            "bookmark type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    bookmark = block_handle["bookmark"]
    name = self.__text_list_parser(bookmark["caption"], parser_type)
    if name == "":
        name = "BOOKMARK"
    url = bookmark["url"]

    # Rendered in the same link form that is used for files.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_file_format_plain(name, url)
    return content_format.get_file_format_md(name, url)


def embed_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render an ``embed`` block as a file-style link.

    The caption is the link name (``"EMBED"`` when empty) and the block's
    ``url`` is the target. A type mismatch is logged and yields ``""``.
    """
    if block_handle["type"] != "embed":
        common_op.debug_log(
            "embed type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    embed = block_handle["embed"]
    name = self.__text_list_parser(embed["caption"], parser_type)
    if name == "":
        name = "EMBED"
    url = embed["url"]

    # Rendered in the same link form that is used for files.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_file_format_plain(name, url)
    return content_format.get_file_format_md(name, url)


def link_preview_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``link_preview`` block as a file-style link named ``"LINK_PREVIEW"``.

    A type mismatch is logged and yields ``""``.
    """
    if block_handle["type"] != "link_preview":
        common_op.debug_log(
            "link_preview type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    name = "LINK_PREVIEW"
    url = block_handle["link_preview"]["url"]

    # Rendered in the same link form that is used for files.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_file_format_plain(name, url)
    return content_format.get_file_format_md(name, url)
def link_to_page_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Turn a ``link_to_page`` block into a page-link placeholder.

    Only links of type ``"page_id"`` are handled: the target is recorded in
    ``self.child_pages`` under the key ``<page id>_link_page`` with an empty
    page name, and a Markdown placeholder is returned. ``parser_type`` is
    accepted for signature parity with the other parsers but is not
    consulted. A wrong block type or any other link type is logged and
    yields ``""``.
    """
    if block_handle["type"] != "link_to_page":
        common_op.debug_log(
            "link_to_page type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    link_page = block_handle["link_to_page"]
    if link_page["type"] != "page_id":
        common_op.debug_log("unknown type " + link_page["type"], level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    target_id = link_page["page_id"].replace('-', '')
    target_name = ""
    key_id = target_id + "_link_page"
    common_op.add_new_child_page(
        self.child_pages,
        key_id=key_id,
        link_id=target_id,
        page_type="page",
        page_name=target_name
    )
    common_op.debug_log(
        "link_to_page_parser add link_page key_id = " + key_id, level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)
    return content_format.get_page_format_md(key_id, target_name, export_child=self.export_child)
def __init__(
        self,
        block_id,
        query_handle: NotionQuery,
        parser_type=NotionDump.PARSER_TYPE_MD,
        export_child_pages=False
):
    """Set up a block parser.

    :param block_id: id of the block or page being parsed; dashes are removed.
    :param query_handle: ``NotionQuery`` used to fetch child blocks.
    :param parser_type: output flavour handed to the base parser calls.
    :param export_child_pages: whether child pages are exported as well,
        i.e. whether the export recurses.
    """
    self.block_id = block_id.replace('-', '')
    self.query_handle = query_handle
    self.parser_type = parser_type
    self.export_child_page = export_child_pages

    # Make sure the temporary directory exists. makedirs(exist_ok=True)
    # has no check-then-create race and also creates missing parents.
    self.tmp_dir = NotionDump.TMP_DIR
    os.makedirs(self.tmp_dir, exist_ok=True)

    # Base parser that renders the individual block types.
    self.base_parser = BaseParser(
        base_id=self.block_id,
        export_child=self.export_child_page
    )


def get_child_pages_dic(self):
    """Return the child-page dictionary held by the base parser."""
    return self.base_parser.get_child_pages_dic()


def __get_children_block_list(self, block):
    """Fetch the child blocks of *block*, if it is a type that is recursed into.

    :param block: block dict with at least ``has_children``, ``type`` and ``id``.
    :return: the non-empty list of child blocks, or ``None`` when the block
        has no children, its type is not recursed into, or nothing was
        retrieved.
    """
    if not block["has_children"]:
        return None

    block_type = block["type"]
    # Child pages are only linked, never expanded in place.
    if block_type == 'child_page':
        return None

    # Deny list.
    if block_type == "template":
        common_op.debug_log("type " + block_type + " has no child, ignore", level=NotionDump.DUMP_MODE_DEFAULT)
        return None

    # Allow list: only these types are recursed into.
    recursive_types = (
        "to_do", "numbered_list_item", "bulleted_list_item", "toggle",
        "table", "table_row", "column_list", "column", "synced_block",
        "heading_1", "heading_2", "heading_3", "paragraph", "quote", "callout",
    )
    if block_type not in recursive_types:
        common_op.debug_log("[ISSUE] type " + block_type + " has no child", level=NotionDump.DUMP_MODE_DEFAULT)
        return None

    # A synced copy carries no content of its own; read the original block.
    if block_type == "synced_block" and block["synced_block"]["synced_from"] is not None:
        child_block_id = block["synced_block"]["synced_from"]["block_id"]
        common_op.debug_log("type synced_block " + child_block_id + " get child", level=NotionDump.DUMP_MODE_DEFAULT)
    else:
        child_block_id = block["id"]

    block_list = []
    retrieve_ret = self.query_handle.retrieve_block_children(child_block_id, parent_id=self.block_id)
    if retrieve_ret is not None:
        block_list = retrieve_ret["results"]

    # An empty result is reported the same way as "no children".
    if len(block_list) == 0:
        return None
    common_op.debug_log("## retrieve block " + child_block_id, level=NotionDump.DUMP_MODE_DEFAULT)
    return block_list
def parser_block(self, block, list_index, last_line_is_table, prefix):
    """Render a single block by dispatching on its ``type``.

    :param block: block dict; ``block["type"]`` selects the renderer.
    :param list_index: ordinal passed on for ``numbered_list_item`` blocks.
    :param last_line_is_table: passed as ``first_row`` for ``table_row`` blocks.
    :param prefix: indentation re-applied after every newline of a ``code`` block.
    :return: rendered text; ``""`` for ``table`` blocks and for renderers
        that return ``None``; ``"[unknown_type:<type>]"`` for unhandled types.
    """
    block_type = block["type"]

    # Types that need no base parser. A "table" contributes nothing itself,
    # because its rows are reached through recursion.
    placeholders = {
        "table": "",
        "table_of_contents": "[TOC]",
        "template": "[TEMPLATE]",      # template content is not parsed
        "breadcrumb": "[breadcrumb]",  # path information is not parsed
    }
    if block_type in placeholders:
        return placeholders[block_type]

    # Renderers called as <type>_parser(block, parser_type).
    typed_renderers = (
        "paragraph", "heading_1", "heading_2", "heading_3", "to_do",
        "bulleted_list_item", "toggle", "callout", "code", "quote",
        "child_page", "child_database", "image", "bookmark", "embed",
        "link_preview", "link_to_page",
    )
    # Renderers called as <type>_parser(block).
    untyped_renderers = ("divider", "equation")

    if block_type in typed_renderers:
        block_text = getattr(self.base_parser, block_type + "_parser")(block, self.parser_type)
        if block_type == "callout" or block_type == "quote":
            # Line breaks inside a callout/quote use the HTML <br> tag.
            block_text = block_text.replace('\n', '<br>')
        elif block_type == "code":
            # Keep the continuation lines of the code block indented.
            block_text = block_text.replace('\n', '\n' + prefix)
    elif block_type in ("file", "pdf", "video"):
        block_text = self.base_parser.file_parser(block, self.parser_type)
    elif block_type in untyped_renderers:
        block_text = getattr(self.base_parser, block_type + "_parser")(block)
    elif block_type == "numbered_list_item":
        block_text = self.base_parser.numbered_list_item_parser(block, list_index, self.parser_type)
    elif block_type == "table_row":
        block_text = self.base_parser.table_row_parser(
            block,
            first_row=last_line_is_table,
            parser_type=self.parser_type
        )
    else:
        common_op.debug_log("[ISSUE] unknown page block properties type:" + block_type,
                            level=NotionDump.DUMP_MODE_DEFAULT)
        block_text = "[unknown_type:" + block_type + "]"

    return "" if block_text is None else block_text
def parser_block_list(self, block_list, indent=0, line_div="\n", last_block_type="none"):
    """Render a list of blocks, recursing into their children.

    :param block_list: iterable of block dicts.
    :param indent: nesting depth; one TAB per level is put in front of each
        block when ``line_div`` is ``"\\n"``.
    :param line_div: separator between blocks: ``"\\n"`` for ordinary
        block-level breaks, ``"<br>"`` for breaks inside a quote/callout.
    :param last_block_type: accepted for callers that pass it; not consulted.
    :return: the rendered text.
    """
    block_level = line_div == "\n"
    # Indentation only applies to block-level breaks, not to inline <br> breaks.
    prefix = "\t" * indent if block_level else ""

    # Nested content starts on a fresh line.
    block_text = line_div if indent != 0 and block_level else ""

    last_type = "to_do"  # initial value chosen so that no separator is added first
    list_index = 1

    # A table is parsed in one go, so this flag never has to be set again.
    last_line_is_table = True

    for block in block_list:
        block_type = block["type"]

        if block_type == "column_list":
            # Columns are flattened: each column's rows are rendered in turn.
            column_list = self.__get_children_block_list(block)
            if block_text == "\n":
                # Only a lone line break so far: start over.
                block_text = ""
            if column_list is not None:
                for column in column_list:
                    column_rows = self.__get_children_block_list(column)
                    if column_rows is not None:
                        if block_text != "":
                            # Keep the column apart from what came before.
                            block_text += "\n"
                        block_text += self.parser_block_list(column_rows, indent)
        elif block_type == "synced_block":
            # A synced block is replaced by its content.
            synced_block_list = self.__get_children_block_list(block)
            if block_text == "\n":
                # Only a lone line break so far: start over.
                block_text = ""
            if synced_block_list is not None:
                block_text += self.parser_block_list(synced_block_list, indent, last_block_type="synced_block")
        else:
            # Consecutive blocks of a continuing kind need no extra separator.
            if common_op.parser_newline(last_type, block_type) and block_text != "" and block_text != "\n":
                block_text += line_div

            # Keep counting while the previous block was a numbered item.
            if last_type == "numbered_list_item":
                list_index = list_index + 1
            else:
                list_index = 1
            last_type = block_type
            if block_type != "table" and block_type != "table_row":
                block_text += prefix

            block_text += self.parser_block(
                block=block,
                list_index=list_index,
                last_line_is_table=last_line_is_table,
                prefix=prefix
            )

            # Continue with the block's children, if it has any.
            children_block_list = self.__get_children_block_list(block)
            t_line_div = "\n"
            if block_type == "quote" or block_type == "callout":
                t_line_div = "<br>"
            if children_block_list is not None:
                if block_type in ("heading_1", "heading_2", "heading_3", "paragraph", "quote", "callout"):
                    # Children of these types stay at the current indent level.
                    block_text += t_line_div
                    block_text += self.parser_block_list(children_block_list, indent, line_div=t_line_div)
                else:
                    block_text += self.parser_block_list(children_block_list, indent + 1)
            else:
                block_text += "\n"

            if block_type == "table_row":
                # Only the first row of a table is the header row.
                last_line_is_table = False

    return block_text
def block_to_md(self, block_handle, page_detail=None, new_id=None):
    """Render a block query result into a temporary Markdown file.

    :param block_handle: query result; its ``"results"`` list holds the blocks.
    :param page_detail: optional text put in front of the content, followed
        by ``NotionDump.MD_DIVIDER``.
    :param new_id: when given, replaces ``self.block_id`` (dashes removed)
        and is forwarded to the base parser before parsing.
    :return: path of the written file, or ``""`` when there are no blocks
        and no page detail (nothing is written in that case).
    """
    block_list = block_handle["results"]
    has_detail = page_detail is not None and page_detail != ""
    # Empty content produces no file.
    if len(block_list) == 0 and not has_detail:
        return ""

    if new_id is not None:
        self.block_id = new_id.replace('-', '')
        self.base_parser.set_new_id(self.block_id)
    tmp_md_filename = self.tmp_dir + self.block_id + ".md"

    # Build the whole document first, so that a parsing error neither leaks
    # an open handle nor leaves an empty file behind.
    block_text = ""
    if has_detail:
        block_text = page_detail + "\n" + NotionDump.MD_DIVIDER + "\n"
    block_text += self.parser_block_list(block_list)

    with open(tmp_md_filename, "w", encoding="utf-8", newline='') as file:
        file.write(block_text)

    common_op.debug_log("write file " + tmp_md_filename, level=NotionDump.DUMP_MODE_DEFAULT)
    # The caller decides what happens to the temporary file.
    return tmp_md_filename


def block_to_json(self, block_json, json_name=None):
    """Dump the raw block JSON to a file; a testing aid.

    :param block_json: data to save; ``None`` makes the call a no-op.
    :param json_name: target path; defaults to ``<tmp_dir><block_id>.json``.
    :return: always ``None``.
    """
    if block_json is None:
        return None

    if json_name is None:
        json_name = self.tmp_dir + self.block_id + ".json"
    common_op.save_json_to_file(block_json, json_name)
def __init__(
        self,
        database_id,
        parser_type=NotionDump.PARSER_TYPE_PLAIN,
        export_child_pages=False
):
    """Set up a database parser.

    :param database_id: id of the database being parsed; dashes are removed.
    :param parser_type: output flavour; ``NotionDump.PARSER_TYPE_MD`` selects
        Markdown table output in ``database_to_file``, anything else CSV.
    :param export_child_pages: whether child pages are exported as well,
        i.e. whether the export recurses.
    """
    self.database_id = database_id.replace('-', '')
    self.parser_type = parser_type
    self.export_child_page = export_child_pages

    # Make sure the temporary directory exists. makedirs(exist_ok=True)
    # has no check-then-create race and also creates missing parents.
    self.tmp_dir = NotionDump.TMP_DIR
    os.makedirs(self.tmp_dir, exist_ok=True)

    # Base parser that renders the individual property types.
    self.base_parser = BaseParser(
        base_id=self.database_id,
        export_child=self.export_child_page
    )


def __get_col_name_list(self, one_page):
    """Derive the column names from one page of the database.

    :param one_page: page dict; the keys of ``one_page["properties"]`` are
        the column names.
    :return: the title column first, followed by the remaining columns in
        reverse property order; an empty list (after a debug log) when the
        page has no title property.
    """
    title_name = ""
    other_names = []
    for name, prop in one_page["properties"].items():
        if prop["type"] == "title":
            title_name = name
        else:
            other_names.append(name)

    if title_name == "":
        common_op.debug_log("col name no title error! id=" + self.database_id,
                            level=NotionDump.DUMP_MODE_DEFAULT)
        # An empty list keeps the return type consistent for callers, which
        # iterate over the result and take its length.
        return []

    # Judging by existing databases, the properties come back in reverse
    # display order, so the non-title columns are reversed.
    return [title_name] + other_names[::-1]
def get_child_pages_dic(self):
    """Return the child-page dictionary held by the base parser."""
    return self.base_parser.get_child_pages_dic()


def __parser_item(self, item_block, page_id):
    """Render one database cell (one property of one page).

    :param item_block: property dict; ``item_block["type"]`` selects the renderer.
    :param page_id: id of the owning page, forwarded to the title renderer.
    :return: the renderer's result, ``""`` when it returned ``None``, or
        ``"[unknown_type:<type>]"`` for an unhandled type.
    """
    item_type = item_block["type"]
    base = self.base_parser

    # Renderers that take the output flavour as a keyword argument.
    flavoured = ("multi_select", "select", "rich_text", "files")
    # Renderers that take only the property dict.
    simple = (
        "url", "email", "checkbox", "phone_number", "date", "people",
        "number", "relation", "formula", "created_time",
        "last_edited_time", "created_by", "last_edited_by",
    )

    if item_type == "title":
        item_ret = base.title_parser(item_block, page_id, parser_type=self.parser_type)
    elif item_type in flavoured:
        item_ret = getattr(base, item_type + "_parser")(item_block, parser_type=self.parser_type)
    elif item_type in simple:
        item_ret = getattr(base, item_type + "_parser")(item_block)
    elif item_type == "rollup":
        # A rollup wraps either a list of properties or a single one.
        item_ret = ""
        rollup_block = item_block["rollup"]
        if "array" in rollup_block:
            for rollup_item in rollup_block["array"]:
                if item_ret != "":
                    item_ret += ","
                item_ret += self.__parser_item(rollup_item, "")
        else:
            item_ret += self.__parser_item(rollup_block, "")
    else:
        item_ret = "[unknown_type:" + item_type + "]"
        common_op.debug_log("[ISSUE] unknown properties type:" + item_type,
                            level=NotionDump.DUMP_MODE_DEFAULT)

    return "" if item_ret is None else item_ret
def database_to_md(self, page_properties, new_id=None):
    """Render a page's properties as a two-column Markdown table.

    :param page_properties: page dict carrying a ``"properties"`` mapping.
    :param new_id: when given, forwarded to the base parser via ``set_new_id``.
    :return: ``(table, title)``. Both are ``""`` when the input is ``None``,
        has no ``"properties"`` key, or the mapping is empty. When there is
        exactly one property the table is ``""`` and only the title is
        returned.
    """
    if page_properties is None or "properties" not in page_properties:
        return "", ""
    properties = page_properties["properties"]

    if new_id is not None:
        self.base_parser.set_new_id(new_id)

    # Nothing to render.
    if len(properties) == 0:
        return "", ""

    rows = ""
    p_title = ""
    p_title_name = ""
    # Properties are walked in reverse key order.
    for p_name in list(properties.keys())[::-1]:
        # Coerce to str first: database_to_file treats renderer results as
        # possibly non-string, and .replace() would fail on such values.
        # Line breaks become <br> so they do not end the table row.
        p_value = str(self.__parser_item(properties[p_name], page_id="")).replace('\n', '<br>')
        if properties[p_name]["type"] == "title":
            # The title becomes the header row instead of a body row.
            p_title = p_value
            p_title_name = p_name
            continue
        rows += "\n" + "|" + str(p_name) + "|" + p_value + "|"

    if p_title != "" or p_title_name != "":
        properties_md = "|" + p_title_name + "|" + p_title + "|\n|---|---|" + rows
    else:
        properties_md = "|KEY|VALUE|\n|---|---|" + rows

    if len(properties) == 1:
        return "", p_title
    return properties_md, p_title
def database_to_file(self, database_handle, col_name_list=None, new_id=None):
    """Write a database query result to a temporary CSV or Markdown file.

    :param database_handle: query result; ``"results"`` holds the pages.
    :param col_name_list: columns to write, in output order; when ``None``
        the columns are derived from the first page (display order is not
        guaranteed).
    :param new_id: when given, names the output file (dashes removed) and is
        forwarded to the base parser via ``set_new_id``.
    :return: path of the written file, or ``""`` when there are no pages.
    """
    page_list = database_handle.get("results")
    # Empty (or missing) results: nothing to write.
    if not page_list:
        return ""

    if col_name_list is None:
        col_name_list = self.__get_col_name_list(page_list[0])

    as_markdown = self.parser_type == NotionDump.PARSER_TYPE_MD
    suffix = ".md" if as_markdown else ".csv"
    if new_id is not None:
        self.base_parser.set_new_id(new_id)
        tmp_filename = self.tmp_dir + new_id.replace('-', '') + suffix
    else:
        tmp_filename = self.tmp_dir + self.database_id + suffix

    # The context manager closes the file even if a property fails to parse.
    with open(tmp_filename, "w", encoding="utf-8", newline='') as file:
        csv_writer = None
        if as_markdown:
            head_line = "|" + "".join(it + "|" for it in col_name_list)
            head_line += "\n|" + " --- |" * len(col_name_list)
            file.write(head_line + "\n")
        else:
            csv_writer = csv.writer(file)
            # The column names form the first CSV row.
            csv_writer.writerow(col_name_list)

        # The pages appear to come back newest first; walk them in reverse
        # without reordering the caller's list in place.
        for page in reversed(page_list):
            page_id = page["id"].replace('-', '')
            common_op.debug_log("database page id" + page_id)
            page_iter = [self.__parser_item(page["properties"][item], page_id) for item in col_name_list]

            if as_markdown:
                page_line = "|"
                for it in page_iter:
                    if isinstance(it, str):
                        # Line breaks become <br> so they do not end the row.
                        page_line += it.replace('\n', '<br>') + "|"
                    else:
                        page_line += str(it) + "|"
                file.write(page_line + "\n")
            else:
                csv_writer.writerow(page_iter)
                common_op.debug_log("database page " + page_id + " write csv success")

    common_op.debug_log("write file " + tmp_filename, level=NotionDump.DUMP_MODE_DEFAULT)
    # The caller decides what happens to the temporary file.
    return tmp_filename
') + "|" 206 | else: 207 | page_line += str(it) + "|" 208 | file.write(page_line + "\n") 209 | else: 210 | if csv_writer is not None: 211 | csv_writer.writerow(page_iter) 212 | common_op.debug_log("database page " + page_id + " write csv success") 213 | else: 214 | common_op.debug_log("database page " + page_id + " write csv fail", level=NotionDump.DUMP_MODE_DEFAULT) 215 | file.flush() 216 | file.close() 217 | 218 | common_op.debug_log("write file " + tmp_filename, level=NotionDump.DUMP_MODE_DEFAULT) 219 | # 将临时文件地址转出去,由外面进行进一步的操作 220 | return tmp_filename 221 | 222 | def database_to_dic(self, database_handle, col_name_list=None, new_id=None): 223 | page_list = database_handle.get("results") 224 | # 数据库是空的,直接返回完事 225 | if len(page_list) == 0: 226 | return 227 | 228 | # col_name_list 是想要的列,并且会按照该顺序输出;如果没有给定则获取所有列 229 | if col_name_list is None: 230 | # 如果没有给定输出顺序,则获取到page中的所有列(注意不保证是显示的顺序!!!!) 231 | col_name_list = self.__get_col_name_list(page_list[0]) 232 | 233 | # 返回的内容好像是倒序的,先倒置过来吧 234 | page_list.reverse() 235 | 236 | db_dic = [] 237 | # 解析每一个page的内容 238 | for page in page_list: 239 | # 每一个page都有page id 240 | page_id = page["id"].replace('-', '') 241 | common_op.debug_log("database page id" + page_id) 242 | db_dic_line = {"_page_id": page_id} 243 | for item in col_name_list: 244 | # 解析每一个方格的内容 245 | db_dic_line[item] = self.__parser_item(page["properties"][item], page_id) 246 | # 将内容填充list中 247 | db_dic.append(db_dic_line) 248 | common_op.debug_log("database page " + page_id + " get dic success") 249 | 250 | # 将临时文件地址转出去,由外面进行进一步的操作 251 | return db_dic 252 | -------------------------------------------------------------------------------- /NotionDump/Parser/mix_parser.py: -------------------------------------------------------------------------------- 1 | # author: delta1037 2 | # Date: 2022/01/10 3 | # mail:geniusrabbit@qq.com 4 | import copy 5 | import os 6 | 7 | import NotionDump 8 | from NotionDump.Notion.Notion import NotionQuery 9 | from 
class MixParser:
    """Export a page/database tree by alternating block and database parsers.

    Every entry registered in ``internal_var.PAGE_DIC`` is visited once;
    pages go through ``BlockParser``, databases through ``DatabaseParser``
    and image/file entries through ``query_handle.download_to_file``.
    Children found on the way are merged back into ``PAGE_DIC`` and handled
    by the next pass.
    """

    def __init__(
            self,
            mix_id,
            query_handle: NotionQuery,
            export_child_pages=False,
            page_parser_type=NotionDump.PARSER_TYPE_MD,
            db_parser_type=NotionDump.PARSER_TYPE_PLAIN,
            col_name_list=None,  # database columns; not read by the constructor
    ):
        self.mix_id = mix_id
        self.query_handle = query_handle
        self.page_parser_type = page_parser_type
        self.db_parser_type = db_parser_type

        # Whether child pages are exported too (i.e. whether to recurse).
        self.export_child_page = export_child_pages

        # Temporary output directory; exist_ok avoids the exists/mkdir race
        # and also creates missing parent directories.
        self.tmp_dir = NotionDump.TMP_DIR
        os.makedirs(self.tmp_dir, exist_ok=True)

        # The query handle is passed so the block parser can fetch child blocks.
        self.block_parser = BlockParser(
            block_id=self.mix_id,
            query_handle=self.query_handle,
            parser_type=self.page_parser_type,
            export_child_pages=self.export_child_page
        )
        # The id given to the database parser here does not matter (per the
        # original note); a new id is supplied on each parse call.
        self.database_parser = DatabaseParser(
            self.mix_id,
            parser_type=self.db_parser_type,
            export_child_pages=self.export_child_page
        )

        # Errors collected while parsing.
        self.error_list = []

    def __test_show_child_page(self):
        """Print the page table when running in debug mode."""
        if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG:
            print("in page_id: ", self.mix_id, internal_var.PAGE_DIC)

    def __recursion_mix_parser(self, is_main=False, col_name_list=None):
        """Process every not-yet-visited entry of ``PAGE_DIC``.

        :param is_main: mark the entries handled in this pass as main page.
        :param col_name_list: database columns to export in this pass.
        :return: temp file of the last entry handled when ``is_main`` is
            true, otherwise ``None``.
        """
        root_name = None
        update_flag = False
        # Iterate over a snapshot: the live table grows while parsing.
        recursion_page = copy.deepcopy(internal_var.PAGE_DIC)
        for child_id in recursion_page:
            # Link entries are not parsed themselves; the page they point to
            # was registered alongside them.
            if common_op.is_link_page(child_id, recursion_page[child_id]):
                common_op.update_page_recursion(child_id, recursion=True)
                continue
            # Skip entries that were already handled.
            if not common_op.is_page_recursion(child_id):
                continue

            update_flag = True
            common_op.debug_log("start child_page_id=" + child_id)
            self.__test_show_child_page()
            # Mark as visited first: each entry is attempted exactly once,
            # whether or not fetching succeeds.
            common_op.update_page_recursion(child_id, recursion=True)
            common_op.debug_log("S process id " + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
            page_title = None
            tmp_filename = None
            if common_op.is_page(child_id):
                # Page metadata and page content.
                page_detail = self.query_handle.retrieve_page(child_id)
                page_json = self.query_handle.retrieve_block_children(child_id)
                if page_json is None or page_detail is None:
                    common_op.debug_log("get page error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
                    self.error_list.append("get page error, id=" + child_id)
                    continue
                # Render the properties to text when requested, or when the
                # entry was created from a link and still needs its title.
                page_properties = None
                if NotionDump.S_PAGE_PROPERTIES or common_op.is_page_soft(child_id):
                    page_properties, page_title = self.database_parser.database_to_md(page_detail, new_id=child_id)
                # Render the content into a temporary file.
                tmp_filename = self.block_parser.block_to_md(page_json, page_detail=page_properties, new_id=child_id)
                # Children met while parsing the blocks ...
                child_pages_dic = self.block_parser.get_child_pages_dic()
                if NotionDump.S_PAGE_PROPERTIES:
                    # ... plus those met while parsing the properties.
                    db_child_pages_dic = self.database_parser.get_child_pages_dic()
                    for db_child_dic_key in db_child_pages_dic:
                        if db_child_dic_key not in child_pages_dic:
                            child_pages_dic[db_child_dic_key] = db_child_pages_dic[db_child_dic_key]
            elif common_op.is_db(child_id):
                # Fetched as in the original; the result is not used below.
                db_info = self.query_handle.retrieve_database(child_id)
                db_detail = self.query_handle.query_database(child_id)

                if db_detail is None:
                    # db_info is optional, but a linked database yields no rows.
                    common_op.debug_log("get database error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
                    self.error_list.append("get database error, id=" + child_id)
                    continue
                tmp_filename = self.database_parser.database_to_file(db_detail, new_id=child_id, col_name_list=col_name_list)
                child_pages_dic = self.database_parser.get_child_pages_dic()
            elif common_op.is_download(child_id):
                # Downloadable entry (image/file).
                tmp_filename = self.query_handle.download_to_file(download_id=child_id, child_page_item=recursion_page[child_id])
                child_pages_dic = {}
                # Download was attempted and failed.
                if tmp_filename == "" and not NotionDump.FILE_WITH_LINK:
                    common_op.debug_log("file download error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
                    self.error_list.append("download error, link:" + recursion_page[child_id]["link_src"])
                    continue
            else:
                common_op.debug_log("!!! unknown child id type, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
                self.error_list.append("!!! unknown child id type, id=" + child_id)
                continue

            common_op.debug_log("E process id " + child_id + " success", level=NotionDump.DUMP_MODE_DEFAULT)
            # Record where the result was stored.
            common_op.update_child_page_stats(child_id, dumped=True, main_page=is_main, local_path=tmp_filename, page_title=page_title)
            if is_main:
                root_name = tmp_filename
            # Register the discovered children under this entry.
            common_op.update_child_pages(child_pages_dic, child_id)

            # The id is concatenated into the message; it used to be passed
            # as the `level` argument and therefore never appeared in the log.
            common_op.debug_log("# end child_page_id=" + child_id)
            self.__test_show_child_page()

        # Something was processed, so new children may have been registered.
        if update_flag:
            self.__recursion_mix_parser()
        return root_name

    def mix_parser(self, root_id, id_type, col_name_list=None):
        """Export ``root_id`` and everything reachable from it.

        :param root_id: id of the root page or database.
        :param id_type: ``"block"``, ``"page"`` or ``"database"``.
        :param col_name_list: optional database columns to export.
        :return: temporary file produced for the root entry.
        """
        common_op.update_child_page_stats(root_id, main_page=True, page_type=id_type)
        root_filename = self.__recursion_mix_parser(True, col_name_list)
        internal_var.PAGE_DIC["errors"] = self.error_list
        return root_filename

    def database_collection(self, json_handle, json_type, col_name_list=None):
        """Return a database query result as a list of dicts.

        Only ``json_type == "database"`` is supported; any other type is
        logged and yields ``None``.
        """
        common_op.debug_log("parser_type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
        if json_type == "database":
            return self.database_parser.database_to_dic(json_handle, col_name_list=col_name_list)
        elif json_type == "block":
            common_op.debug_log("need database get type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
            return None
        else:
            common_op.debug_log("unknown parser_type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
            return None
# Presumably the separator between rollup values; the original note lists
# ',', '、' or '<br>' (comma or line break) as options — TODO confirm usage.
MD_ROLLUP_SEP = ","
MD_HIGHLIGHT = "=="
ID_LEN = len("921e6b4ea44046c6935bcb2c69453196")

# Log output modes
DUMP_MODE_DEBUG = 0
DUMP_MODE_DEFAULT = 1
DUMP_MODE_SILENT = 2
DUMP_MODE = DUMP_MODE_DEFAULT


# Log controller
class NotionBackupLogger:
    def __init__(self):
        self.prefix = "[NotionDump] "
        # Debug log file; opened on the first log_debug call.
        self.log_fd = None

    def log_debug(self, log_str):
        # Debug output is printed like info output ...
        self.log_info(log_str)

        # ... and additionally appended to the debug log file.
        if self.log_fd is None:
            self.log_fd = open("notion-export-kernel-debug.log", "a+", encoding='utf-8')
        self.log_fd.write(str(log_str) + "\n")
        self.log_fd.flush()

    def log_info(self, log_str):
        # Print the message with the package prefix.
        print(self.prefix, end='')
        print(log_str)


LOGGER = NotionBackupLogger()

# Export (dump) types
DUMP_TYPE_BLOCK = 1
DUMP_TYPE_PAGE = 2
DUMP_TYPE_DB_TABLE = 4

# Parser output types: Markdown or plain text
PARSER_TYPE_MD = 0
PARSER_TYPE_PLAIN = 2

# Whether to use the download buffer
BUFFER_FILE = TMP_DIR + "notion_download_buffer.json"
USE_BUFFER = True

# Configuration switches
# Original note: for files not stored in Notion (pdf/image) try to download
# them, otherwise place the link directly — TODO confirm which value does which.
FILE_WITH_LINK = True
FORMAT_DATE = "%Y/%m/%d"
FORMAT_DATETIME = "%Y/%m/%d-%H:%M:%S"
# Whether to export the properties of a page
S_PAGE_PROPERTIES = True
# Theme: default, light, dark, markdown, self_define
S_THEME_TYPE = "default"
# Keys starting with f_ are font colours, b_ background colours,
# d_ database tag colours.
S_THEME_LIGHT = {
    "f_gray": "#787774",
    "f_brown": "#9F6B53",
    "f_orange": "#D9730D",
    "f_yellow": "#CB912F",
    "f_green": "#448361",
    "f_blue": "#337EA9",
    "f_purple": "#9065B0",
    "f_pink": "#C14C8A",
    "f_red": "#D44C47",
    "b_gray": "#F1F1EF",
    "b_brown": "#F4EEEE",
    "b_orange": "#FBECDD",
    "b_yellow": "#FBF3DB",
    "b_green": "#EDF3EC",
    "b_blue": "#E7F3F8",
    "b_purple": "#F4F0F7CC",
    "b_pink": "#F9EEF3CC",
    "b_red": "#FDEBEC",
    "d_light_gray": "#E3E2E080",
    "d_gray": "#E3E2E0",
    "d_brown": "#EEE0DA",
    "d_orange": "#FADEC9",
    "d_yellow": "#FDECC8",
    "d_green": "#DBEDDB",
    "d_blue": "#D3E5EF",
    "d_purple": "#E8DEEE",
    "d_pink": "#F5E0E9",
    "d_red": "#FFE2DD",
}

S_THEME_DARK = {
    "f_gray": "#9B9B9B",
    "f_brown": "#BA856F",
    "f_orange": "#C77D48",
    "f_yellow": "#CA9849",
    "f_green": "#529E72",
    "f_blue": "#5E87C9",
    "f_purple": "#9D68D3",
    "f_pink": "#D15796",
    "f_red": "#DF5453",
    "b_gray": "#2F2F2F",
    "b_brown": "#4A3228",
    "b_orange": "#5C3B23",
    "b_yellow": "#564328",
    "b_green": "#243D30",
    "b_blue": "#143A4E",
    "b_purple": "#3C2D49",
    "b_pink": "#4E2C3C",
    "b_red": "#522E2A",
    "d_light_gray": "#373737",
    "d_gray": "#5A5A5A",
    "d_brown": "#603B2C",
    "d_orange": "#854C1D",
    "d_yellow": "#89632A",
    "d_green": "#2B593F",
    "d_blue": "#28456C",
    "d_purple": "#492F64",
    "d_pink": "#69314C",
    "d_red": "#6E3630",
}

# User-defined theme; ships with the same values as the light theme.
S_THEME_SELF_DEFINE = {
    "f_gray": "#787774",
    "f_brown": "#9F6B53",
    "f_orange": "#D9730D",
    "f_yellow": "#CB912F",
    "f_green": "#448361",
    "f_blue": "#337EA9",
    "f_purple": "#9065B0",
    "f_pink": "#C14C8A",
    "f_red": "#D44C47",
    "b_gray": "#F1F1EF",
    "b_brown": "#F4EEEE",
    "b_orange": "#FBECDD",
    "b_yellow": "#FBF3DB",
    "b_green": "#EDF3EC",
    "b_blue": "#E7F3F8",
    "b_purple": "#F4F0F7CC",
    "b_pink": "#F9EEF3CC",
    "b_red": "#FDEBEC",
    "d_light_gray": "#E3E2E080",
    "d_gray": "#E3E2E0",
    "d_brown": "#EEE0DA",
    "d_orange": "#FADEC9",
    "d_yellow": "#FDECC8",
    "d_green": "#DBEDDB",
    "d_blue": "#D3E5EF",
    "d_purple": "#E8DEEE",
    "d_pink": "#F5E0E9",
    "d_red": "#FFE2DD",
}
def update_child_page_stats(child_key, dumped=False, main_page=False, local_path=None, page_type=None, page_title=None):
    """Create or refresh the ``PAGE_DIC`` entry for ``child_key``.

    ``dumped`` and ``main_page`` are always overwritten; ``local_path`` and
    ``page_type`` only when given. ``page_type`` "block" and "page" are both
    stored as "page"; unknown types are logged and leave the entry as is.
    """
    page_table = internal_var.PAGE_DIC
    if child_key not in page_table:
        # Unknown key: start from a copy of the entry template.
        debug_log("CREATE child page " + child_key + " from temp", level=NotionDump.DUMP_MODE_DEFAULT)
        page_table[child_key] = copy.deepcopy(internal_var.CHILD_PAGE_TEMP)

    entry = page_table[child_key]
    entry["dumped"] = dumped
    entry["main_page"] = main_page
    if local_path is not None:
        entry["local_path"] = local_path
    if page_type is not None:
        if page_type in ("block", "page"):
            entry["type"] = "page"
        elif page_type == "database":
            entry["type"] = "database"
        else:
            debug_log("update_child_page_stats page type is unknown:" + str(page_type),
                      level=NotionDump.DUMP_MODE_DEFAULT)
    # A link-created placeholder becomes a regular entry once its title is
    # known. NOTE(review): the page_name update is assumed to be nested in
    # this branch; the collapsed source does not show indentation — confirm.
    if page_title is not None and entry["inter_soft_page"] is True:
        entry["inter_soft_page"] = False
        if entry["page_name"] == "":
            entry["page_name"] = page_title
# Soft links ("link pages") can show up in these situations:
#   same page (handled by add_new_child_page):
#     - soft link before the real page: the soft link reserves a placeholder
#       and the real page later replaces it
#     - soft link after the real page: the existing entry is kept
#   different pages (handled by update_child_pages):
#     - soft link before the real page: the real entry replaces the
#       placeholder and is parsed again
#     - soft link after the real page: the soft link is ignored
#   only a soft link and never the real page: nothing to do
def update_child_pages(child_pages, parent_id):
    """Merge the children found on one page into ``PAGE_DIC``.

    :param child_pages: mapping of child id to entry, as collected by a parser.
    :param parent_id: id of the page the children were found on; it must
        already be registered, otherwise the call is logged and ignored.
    """
    page_table = internal_var.PAGE_DIC
    if parent_id not in page_table:
        debug_log("parent id" + parent_id + " not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return

    # NOTE(review): the three steps below are assumed to sit side by side at
    # loop level; the collapsed source does not show indentation — confirm.
    for child_page_id, child_entry in child_pages.items():
        if child_page_id in page_table:
            # A placeholder created by a soft link is superseded by a real
            # entry; a real entry is never replaced by a soft one.
            if page_table[child_page_id]["inter_soft_page"] and not child_entry["inter_soft_page"]:
                # The fresh entry is unvisited, so the next parser pass
                # handles it again. Deep-copied like the CREATE path so the
                # global table does not alias the parser's own dict.
                page_table[child_page_id] = copy.deepcopy(child_entry)
                debug_log("REPLACE last created soft page, id=" + child_page_id, level=NotionDump.DUMP_MODE_DEFAULT)

        # Anything not yet known is registered, placeholders included.
        if child_page_id not in page_table:
            debug_log("CREATE child page " + child_page_id + " from child_pages", level=NotionDump.DUMP_MODE_DEFAULT)
            page_table[child_page_id] = copy.deepcopy(child_entry)

        # Placeholders are not listed as children of the parent.
        if not child_entry["inter_soft_page"]:
            debug_log("parent id" + parent_id + " add child " + child_page_id,
                      level=NotionDump.DUMP_MODE_DEFAULT)
            page_table[parent_id]["child_pages"].append(child_page_id)
        else:
            debug_log("SOFT_PAGE " + child_page_id + " dont need to add to parent_id " + parent_id,
                      level=NotionDump.DUMP_MODE_DEFAULT)


# Key format: "id_linkname" for links, plain id for child pages.
def add_new_child_page(child_pages, key_id, link_id=None, link_src=None, page_name=None, page_type=None,
                       inter_soft_page=False):
    """Register ``key_id`` in ``child_pages`` unless a real entry already exists.

    When ``link_id`` is given the entry is a link, and a placeholder for the
    linked target is registered as well (flagged ``inter_soft_page``).
    """
    debug_log("add new child key:" + key_id)
    # A real (non-placeholder) entry is never overwritten.
    if key_id in child_pages and not child_pages[key_id]["inter_soft_page"]:
        debug_log("WARN key_id:" + key_id + " exist, skip", level=NotionDump.DUMP_MODE_DEFAULT)
        return

    # New key, or the previous entry was only a placeholder: start afresh.
    entry = copy.deepcopy(internal_var.CHILD_PAGE_TEMP)
    child_pages[key_id] = entry
    entry["inter_soft_page"] = inter_soft_page
    if link_id is not None:
        # Reserve a slot for the link target; the call returns early if the
        # target is already present as a real entry.
        debug_log("SOFT_PAGE key_id " + key_id + " link_id " + link_id + ", create a null page with link_id",
                  level=NotionDump.DUMP_MODE_DEFAULT)
        add_new_child_page(child_pages, key_id=link_id, link_src=link_src, page_type=page_type, inter_soft_page=True)
    if page_name is not None:
        entry["page_name"] = page_name
    if link_id is not None:
        entry["link_id"] = link_id
    if link_src is not None:
        entry["link_src"] = link_src
    if page_type is not None:
        entry["type"] = page_type


def update_page_recursion(page_id, recursion=False):
    """Set the visited flag of ``page_id``; the entry must already exist."""
    if page_id not in internal_var.PAGE_DIC:
        debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return
    internal_var.PAGE_DIC[page_id]["inter_recursion"] = recursion


def is_page_recursion(page_id):
    """Return True when ``page_id`` is known and has not been visited yet."""
    if page_id not in internal_var.PAGE_DIC:
        debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return False
    return not internal_var.PAGE_DIC[page_id]["inter_recursion"]


def is_page_soft(page_id):
    """Return the ``inter_soft_page`` flag of ``page_id`` (False if unknown)."""
    if page_id not in internal_var.PAGE_DIC:
        debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return False
    return internal_var.PAGE_DIC[page_id]["inter_soft_page"]
def is_page(page_id):
    """Return True when ``page_id`` is registered with type "page"."""
    if page_id not in internal_var.PAGE_DIC:
        debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return False
    return internal_var.PAGE_DIC[page_id]["type"] == "page"


def is_db(db_id):
    """Return True when ``db_id`` is registered with type "database"."""
    if db_id not in internal_var.PAGE_DIC:
        debug_log("db_id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return False
    return internal_var.PAGE_DIC[db_id]["type"] == "database"


def is_download(download_id):
    """Return True when ``download_id`` is a downloadable entry (image/file)."""
    if download_id not in internal_var.PAGE_DIC:
        debug_log("download_id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return False
    return internal_var.PAGE_DIC[download_id]["type"] in ("image", "file")


def is_link_page(page_id, page_handle):
    """Return True for link entries: key contains "_" and ``link_id`` is set."""
    return "_" in page_id and page_handle["link_id"] != ""


def save_json_to_file(handle, json_name):
    """Serialise ``handle`` as indented UTF-8 JSON into ``json_name``.

    Objects that cannot be serialised are logged and nothing is written.
    """
    try:
        json_handle = json.dumps(handle, ensure_ascii=False, indent=4)
    except (TypeError, ValueError):
        # json.dumps reports unserialisable input with TypeError/ValueError;
        # JSONDecodeError (a ValueError subclass) is only raised on decoding.
        debug_log("json decode error", level=NotionDump.DUMP_MODE_DEFAULT)
        return

    # The context manager flushes and closes the file even on write errors.
    with open(json_name, "w+", encoding="utf-8") as file:
        file.write(json_handle)


def load_json_from_file(json_name):
    """Load and return the JSON document stored in ``json_name``.

    :return: the decoded object, or ``None`` when the file is missing or
        does not contain valid JSON.
    """
    if not os.path.exists(json_name):
        debug_log("json file not exist, path=" + json_name, level=NotionDump.DUMP_MODE_DEFAULT)
        return None
    try:
        # Close the handle once decoding is done (it used to stay open).
        with open(json_name, "r", encoding="utf-8") as json_fd:
            return json.load(json_fd)
    except json.JSONDecodeError:
        debug_log("json decode error", level=NotionDump.DUMP_MODE_DEFAULT)
        return None


def parser_newline(last_type, now_type):
    """Decide whether an extra newline goes between two consecutive blocks.

    Returns False for runs of the same list-like type and for table rows
    following a table or another row; True otherwise.
    """
    run_types = ("to_do", "numbered_list_item", "bulleted_list_item", "toggle", "table_row")
    if last_type == now_type and now_type in run_types:
        return False
    # A table header is followed directly by its rows.
    if last_type == "table" and now_type == "table_row":
        return False
    return True


def debug_log(debug_str, level=NotionDump.DUMP_MODE_DEBUG):
    """Log ``debug_str`` according to ``NotionDump.DUMP_MODE``.

    Debug mode logs everything via ``LOGGER.log_debug``; default mode only
    logs messages whose ``level`` is ``DUMP_MODE_DEFAULT``; any other mode
    logs nothing.
    """
    if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG:
        NotionDump.LOGGER.log_debug(debug_str)
    elif NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEFAULT and level == NotionDump.DUMP_MODE_DEFAULT:
        # Default mode filters by level.
        NotionDump.LOGGER.log_info(debug_str)
def get_mention_format(mention_content):
    """Format a mention as ``@(content)``."""
    return "@(" + mention_content + ")"


def get_page_format_md(page_id, page_name, export_child):
    """Markdown link to a page.

    When child pages are exported only the id is written (``[id]()``) so the
    link can be relocated later; otherwise ``[name](id)``.
    """
    if export_child:
        return "[" + page_id + "]()"
    return "[" + page_name + "](" + page_id + ")"


def get_database_title_format(title_id, title_ret, export_child):
    """Database title cell: an id placeholder link, or the plain title."""
    if export_child:
        return "[" + title_id + "]()"
    # Child pages are not exported: use the title text directly.
    return title_ret


def get_page_format_plain(page_name):
    """Plain-text page reference: just the page name."""
    return page_name


def get_url_format(url_plain, name="link"):
    """Markdown link ``[name](url)``."""
    return "[" + name + "](" + url_plain + ")"


def format_date_or_time(date_time):
    """Format an ISO date or datetime string.

    Strings containing ``T`` are formatted with ``NotionDump.FORMAT_DATETIME``,
    all others with ``NotionDump.FORMAT_DATE``.
    """
    t_datetime = dateutil.parser.parse(date_time)
    if 'T' in date_time:
        return t_datetime.strftime(NotionDump.FORMAT_DATETIME)
    return t_datetime.strftime(NotionDump.FORMAT_DATE)


def get_date_format(start, end):
    """Format a date range as ``start ~ end``; either side may be ``None``."""
    ret_str = ""
    if start is not None:
        ret_str = format_date_or_time(start)
    if end is not None:
        ret_str += " ~ " + format_date_or_time(end)  # dates are separated by "~"
    return ret_str


def get_file_format_md(filename, file_url, file_id="", export_child=False):
    """Markdown link to a file.

    With ``export_child`` and a non-empty ``file_id`` an id placeholder
    (``[id]()``) is written for later relocation; otherwise ``[name](url)``.
    """
    if export_child and file_id != "":
        return "[" + file_id + "]()"
    return "[" + filename + "](" + file_url + ")"


def get_file_format_plain(filename, file_url):
    """Plain-text file reference ``name(url)``."""
    return filename + "(" + file_url + ")"


def get_equation_inline(equation):
    """Inline equation ``$ ... $``."""
    return "$ " + equation + " $"


def get_equation_block(equation):
    """Block equation ``$$ ... $$``."""
    return "$$ " + equation + " $$"


def _theme_color(color_key, fallback):
    """Look up ``color_key`` in the active theme; return ``fallback`` if absent or empty."""
    if NotionDump.S_THEME_TYPE == "dark":
        theme = NotionDump.S_THEME_DARK
    elif NotionDump.S_THEME_TYPE == "self_define":
        theme = NotionDump.S_THEME_SELF_DEFINE
    else:
        # Every other theme type uses the light palette.
        theme = NotionDump.S_THEME_LIGHT
    color = theme.get(color_key, "")
    return color if color != "" else fallback


def color_transformer(input_color, background=False):
    """Map a colour name to the active theme's value.

    Uses the ``b_`` (background) or ``f_`` (font) key; returns
    ``input_color`` unchanged when the theme has no such key.
    """
    prefix = "b_" if background else "f_"
    return _theme_color(prefix + input_color, input_color)


def color_transformer_db(input_color):
    """Map a database tag colour name to the active theme's ``d_`` value.

    ``"default"`` maps to the ``d_light_gray`` key; unknown names are
    returned unchanged.
    """
    if input_color == "default":
        color_key = "d_light_gray"
    else:
        color_key = "d_" + input_color
    return _theme_color(color_key, input_color)
[notion-sdk-py](https://github.com/ramnes/notion-sdk-py)(notion official API), target to export notion pages and database 8 | 9 | Main targets: 10 | 11 | - [x] Export Notion Database and page to markdown file 12 | - [x] Recursion Export child Pages 13 | - [x] Download image and files in notion 14 | 15 | ## Structure 16 | 17 | ```shell 18 | notoin-dump 19 | ├─NotionDump 20 | │ ├─Dump # External Interface 21 | │ ├─Notion # Unified encapsulation interface for communication with Notion 22 | │ ├─Parser # Some parser 23 | │ └─utils # Internal variables and utils functions 24 | └─Tests # Test code 25 | ``` 26 | 27 | #### Parser code structure 28 | 29 | ```mermaid 30 | graph TD 31 | A[Dump] -->B(Database) 32 | A[Dump] -->C(Page/Block) 33 | B --> D[Mix Parser] 34 | C --> D[Mix Parser] 35 | 36 | D --> E[Database Parser] 37 | D --> F[Block Parser] 38 | 39 | E --> G[Base Parser] 40 | F --> G[Base Parser] 41 | ``` 42 | 43 | ## Usage 44 | 45 | ### 3.0 install & example 46 | 47 | **install `notion-dump-kernel`** 48 | 49 | ```powershell 50 | # open terminal, type the cmd (install the latest version) 51 | pip install python-dateutil 52 | pip install notion-dump-kernel 53 | ``` 54 | 55 | **example** 56 | 57 | ```python 58 | # Example: export page 59 | import NotionDump 60 | from NotionDump.Dump.dump import Dump 61 | from NotionDump.Notion.Notion import NotionQuery 62 | from NotionDump.utils import common_op 63 | 64 | TOKEN_TEST = "secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" 65 | PAGE_MIX_ID = "43e7aa8ccfb0488eb18f8a453eab0177" 66 | # NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG 67 | 68 | def test_page_parser(query, export_child=False, db_parser_type=NotionDump.PARSER_TYPE_MD): 69 | # init what you want to export 70 | # Explain: 71 | # dump_id: the ID which need to export (block, page or database) 72 | # query_handle: Notion query handle for getting data from API (NOT the offical API handle) 73 | # export_child_pages: whether export all nested pages(sub-page and link page) 74 
| # dump_type: the dump_id type [DUMP_TYPE_BLOCK/DUMP_TYPE_PAGE/DUMP_TYPE_DB_TABLE] 75 | # db_parser_type: PARSER_TYPE_MD meas export database as markdown table; PARSER_TYPE_PLAIN means export database as CSV file 76 | # page_parser_type: PARSER_TYPE_MD meas export page as markdown file; PARSER_TYPE_PLAIN means export page as txt 77 | page_handle = Dump( 78 | dump_id=PAGE_MIX_ID, 79 | query_handle=query, 80 | export_child_pages=export_child, 81 | dump_type=NotionDump.DUMP_TYPE_PAGE, 82 | db_parser_type=db_parser_type, 83 | page_parser_type=NotionDump.PARSER_TYPE_MD 84 | ) 85 | 86 | # Returned variable , which contain all info about dumped files structure 87 | # All parsered files will be save at .tmp/ 88 | page_detail_json = page_handle.dump_to_file() 89 | 90 | # all info about dumped files structure save as json file 91 | print("json output to page_parser_result") 92 | common_op.save_json_to_file( 93 | handle=page_detail_json, 94 | json_name=".tmp/page_parser_result.json" 95 | ) 96 | 97 | 98 | if __name__ == '__main__': 99 | # We need a qurey handle for getting data from API 100 | query_handle = NotionQuery(token=TOKEN_TEST) 101 | if query_handle is None: 102 | logging.exception("query handle init error") 103 | exit(-1) 104 | 105 | # export_child means export all nested pages(sub-page and link page) 106 | test_page_parser(query_handle, True) 107 | ``` 108 | 109 | ### 3.1 Output 110 | 111 | All export files will be seen at `.tmp/` and the **page structure save at returned variable**, which contain all info about dumped files structure. 
112 | 113 | The returned variable (`page_detail_json`) looks like this: 114 | 115 | ```json 116 | { 117 | "key_id_1": { 118 | "dumped": true, 119 | "main_page": true, 120 | "type": "page", 121 | "local_path": "xxxx", 122 | "page_name": "", 123 | "link_id": "", 124 | "child_pages": [ 125 | "xxxxx", 126 | "xxxxx" 127 | ], 128 | "inter_recursion": true, 129 | "inter_soft_page": false 130 | }, 131 | "key_id_2": { 132 | "dumped": false, 133 | "main_page": false, 134 | "type": "page", 135 | "local_path": "", 136 | "page_name": "", 137 | "link_id": "xxxxx", 138 | "child_pages": [], 139 | "inter_recursion": true, 140 | "inter_soft_page": false 141 | } 142 | } 143 | ``` 144 | 145 | **Output explanation**: 146 | 147 | - key_id_1: the key is an id (block id/page id/database id); for a link page it is the combination of the link name and the id. The id is the tag used to relocate links in a page 148 | - dumped: download status of the resource specified by the id 149 | - main_page: whether the page is the one specified by dump_id (root) 150 | - type: id type, database or page (the page type covers both pages and blocks); if the key is a link, the type is that of the page the link points to 151 | - local_path: the location of the exported file, for subsequent operations 152 | - page_name: page name (for subsequent relocation of page URLs) 153 | - child_pages: ids of the subpages or databases this key_id contains 154 | - inter_recursion: internal variable (do not use) 155 | - inter_soft_page: internal variable (do not use) 156 | 157 | ## Attention 158 | 159 | - [ ] Comments are not supported 160 | 161 | ## Others 162 | 163 | ### 6.1 Notion Test Page 164 | 165 | [Notion Test Page](https://delta1037.notion.site/Notion-dump-ed0a3b0f57b34712bc6bafcbdb413d50) 166 | 167 | ### 6.2 Notion export client 168 | 169 | It is based on notion-export-kernel and is used to rebuild the structure of dumped files (dumped by notion-export-kernel) and relocate the links in pages 170 | 171 | [Github](https://github.com/delta1037/notion-export-client) 172 | 173 | 
-------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 | # notion-export-kernel 2 | 3 | 4 | 5 | ------ 6 | 7 | ## 一、项目说明 8 | 9 | 本仓库是基于 [notion-sdk-py](https://github.com/ramnes/notion-sdk-py)(notion官方API)的开发,导出Notion页面和数据库。 10 | 11 | 项目目标 12 | 13 | - [x] 将Notion页面和数据库导出为Markdown文件 14 | - [x] 递归导出所有子页面(或者链接) 15 | - [x] 下载文件和图片 16 | 17 | ## 二、项目结构 18 | 19 | ```shell 20 | notion-dump 21 | ├─NotionDump 22 | │ ├─Dump # 对外接口 23 | │ ├─Notion # 与Notion通信统一封装接口 24 | │ ├─Parser # 实现的一些解析器 25 | │ └─utils # 内部变量与杂项函数 26 | └─Tests # 测试代码 27 | ``` 28 | 29 | ```mermaid 30 | graph TD 31 | A[Dump] -->B(Database) 32 | A[Dump] -->C(Page/Block) 33 | B --> D[Mix Parser] 34 | C --> D[Mix Parser] 35 | 36 | D --> E[Database Parser] 37 | D --> F[Block Parser] 38 | 39 | E --> G[Base Parser] 40 | F --> G[Base Parser] 41 | ``` 42 | 43 | 44 | 45 | 46 | 47 | ## 三、使用方法 48 | 49 | ### 3.0 安装导入 50 | 51 | **安装`notion-dump-kernel`** 52 | 53 | ```powershell 54 | # 打开终端,输入如下命令安装(安装最新版) 55 | pip install python-dateutil 56 | pip install notion-dump-kernel 57 | ``` 58 | 59 | **导入使用** 60 | 61 | ```python 62 | import NotionDump 63 | from NotionDump.Dump.dump import Dump 64 | from NotionDump.Notion.Notion import NotionQuery 65 | ``` 66 | 67 | 68 | 69 | ### 3.1 对外统一接口 70 | 71 | ```python 72 | # 获取Notion查询句柄 73 | query_handle = NotionQuery( 74 | token=TOKEN_TEST, # Token 75 | client_handle=None, # Notion官方API句柄,默认为空 76 | async_api=False # 异步调用,默认为False 77 | ) 78 | 79 | # 获取操作句柄 80 | handle = Dump( 81 | dump_id=ID, # 需要导出的页面ID 82 | query_handle=query, # Notion查询句柄 83 | export_child_pages=True, # 是否递归导出子页面 84 | page_parser_type=NotionDump.PARSER_TYPE_MD, # Page导出类型 85 | db_parser_type=NotionDump.PARSER_TYPE_PLAIN, # 数据库导出类型 86 | dump_type=NotionDump.DUMP_TYPE_XXX # ID的类型,详细见后续说明 87 | ) 88 | 89 | # dump类型 dump_type 90 | DUMP_TYPE_BLOCK # 块类型 91 | DUMP_TYPE_PAGE # 页面类型 92 | 
DUMP_TYPE_DB_TABLE # 数据库Table类型 93 | 94 | # 导出类型 95 | PARSER_TYPE_MD # Markdown格式 96 | PARSER_TYPE_PLAIN # 纯文本格式 97 | 98 | # 其它 99 | # 变量自解释,不再赘述 100 | ``` 101 | 102 | [操作示例](https://github.com/delta1037/notion-dump-kernel/tree/main/Examples) 103 | 104 | ### 3.2 获取输出 105 | 106 | dump的结果存放在一个字典变量中,该变量包含了外部可以操作的所有信息,获取输出和输出解释如下 107 | 108 | ```python 109 | # 获取输出 110 | dump_output = dump_handle.dump_to_file() 111 | # 其中dump_handle为上述的操作句柄(Dump(xxx)返回值) 112 | ``` 113 | 114 | 输出样例: 115 | 116 | ```json 117 | { 118 | "key_id_1": { 119 | "dumped": true, 120 | "main_page": true, 121 | "type": "page", 122 | "local_path": "xxxx", 123 | "page_name": "", 124 | "link_id": "", 125 | "child_pages": [ 126 | "xxxxx", 127 | "xxxxx" 128 | ], 129 | "inter_recursion": true, 130 | "inter_soft_page": false 131 | }, 132 | "key_id_2": { 133 | "dumped": false, 134 | "main_page": false, 135 | "type": "page", 136 | "local_path": "", 137 | "page_name": "", 138 | "link_id": "xxxxx", 139 | "child_pages": [], 140 | "inter_recursion": true, 141 | "inter_soft_page": false 142 | } 143 | } 144 | ``` 145 | 146 | **输出解释**: 147 | 148 | - key_id_1:键值,也是dump下来的页面需要重定位的标志 149 | - dumped:id指向的资源是否成功下载 150 | - main_page:页面是否是主页 151 | - type:该id的类型,database或者page(链接的话是链接指向的页面的类型) 152 | - local_path:导出的文件位置,供后续操作 153 | - page_name:页面名称(后续重定位使用) 154 | - child_pages:包含的子页面或者子数据库 155 | - inter_recursion:内部使用变量,无需关注 156 | - inter_soft_page:内部使用变量,无需关注 157 | 158 | ## 五、注意 159 | 160 | - [ ] 不支持评论内容 161 | 162 | ## 六、附录 163 | 164 | ### 6.1、项目测试 165 | 166 | [项目测试页面](https://delta1037.notion.site/Notion-dump-ed0a3b0f57b34712bc6bafcbdb413d50) 167 | 168 | ### 6.2 Notion dump client 169 | 170 | 基于notion-dump-kernel做的一个对下载下来的页面重新组合文件结构,并对其中的链接部分进行重定位的项目 171 | 172 | [项目Github地址](https://github.com/delta1037/notion-export-local) 173 | 174 | -------------------------------------------------------------------------------- /img/get_data.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/img/get_data.png -------------------------------------------------------------------------------- /img/parser_structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/img/parser_structure.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel~=0.36.2 2 | setuptools~=57.0.0 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | try: 4 | from setuptools import setup, find_packages 5 | from setuptools import Command 6 | from setuptools import Extension 7 | except ImportError: 8 | sys.exit( 9 | "We need the Python library setuptools to be installed. " 10 | "Try running: python -m ensurepip" 11 | ) 12 | 13 | if "bdist_wheel" in sys.argv: 14 | try: 15 | import wheel # noqa: F401 16 | except ImportError: 17 | sys.exit( 18 | "We need both setuptools AND wheel packages installed " 19 | "for bdist_wheel to work. 
Try running: pip install wheel" 20 | ) 21 | 22 | 23 | REQUIRES = ["notion-client>=0.8.0"] 24 | 25 | # with open("README_En.md", encoding="utf-8") as handle: 26 | # readme_rst = handle.read() 27 | 28 | setup( 29 | name="notion-dump-kernel", 30 | version="0.2.4", 31 | author="delta1037", 32 | author_email="geniusrabbit@qq.com", 33 | url="https://github.com/delta1037/notion-export-kernel", 34 | description="Freely available tools for export Notion page and database.", 35 | project_urls={ 36 | "Documentation": "https://github.com/delta1037/notion-export-kernel/blob/main/README.md", 37 | "Source": "https://github.com/delta1037/notion-export-kernel", 38 | "Tracker": "https://github.com/delta1037/notion-export-kernel/issues", 39 | }, 40 | classifiers=[ 41 | "Development Status :: 3 - Alpha", 42 | "Intended Audience :: Developers", 43 | "License :: OSI Approved :: MIT License", 44 | "Operating System :: OS Independent", 45 | "Programming Language :: Python", 46 | "Programming Language :: Python :: 3.9", 47 | "Topic :: Text Processing :: Markup", 48 | "Topic :: Software Development :: Libraries :: Python Modules", 49 | ], 50 | packages=find_packages(where='.', exclude=(), include=('*',)), 51 | include_package_data=True, # done via MANIFEST.in under setuptools 52 | install_requires=REQUIRES, 53 | ) 54 | # 打包发布 55 | # 1、python setup.py sdist 56 | # 2、twine upload dist/* 57 | 58 | --------------------------------------------------------------------------------