├── .gitignore
├── Examples
├── block_dump.py
├── db_dump.py
└── page_dump.py
├── LICENSE
├── MANIFEST.in
├── NotionDump
├── Dump
│ ├── __init__.py
│ ├── block.py
│ ├── database.py
│ ├── dump.py
│ └── page.py
├── Notion
│ ├── Buffer.py
│ ├── Notion.py
│ └── __init__.py
├── Parser
│ ├── __init__.py
│ ├── base_parser.py
│ ├── block_parser.py
│ ├── database_parser.py
│ └── mix_parser.py
├── __init__.py
└── utils
│ ├── __init__.py
│ ├── common_op.py
│ ├── content_format.py
│ └── internal_var.py
├── README.md
├── README_zh.md
├── img
├── get_data.png
└── parser_structure.png
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # pycharm
132 | .idea/
133 |
134 | # tempfile
135 | Tests/.tmp/
136 | Examples/.tmp/
137 |
--------------------------------------------------------------------------------
/Examples/block_dump.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/11
3 | # mail:geniusrabbit@qq.com
4 |
5 | import logging
6 |
7 | import NotionDump
8 | from NotionDump.Dump.dump import Dump
9 | from NotionDump.Notion.Notion import NotionQuery
10 | from NotionDump.utils import common_op
11 |
import os

# Notion integration token used by this example.
# NOTE(review): a token is committed in this file; export NOTION_TOKEN to
# override it instead of editing the source. The committed value is kept only
# as a fallback so the example behaves as before when the variable is unset.
TOKEN_TEST = os.environ.get("NOTION_TOKEN", "secret_WRLJ9xyEawNxzRhVHVWfciTl9FAyNCd29GMUvr2hQD4")
# Id of the block dumped by test_get_table_block.
TABLE_ID = "13b914160ef740dcb64e55c5393762fa"
# Id of the block dumped by test_get_rer_list.
RER_LIST_ID = "d32db4693409464b9981caec9ef11974"
15 |
16 |
def test_get_table_block(query, export_child=True):
    """Dump the table block ``TABLE_ID`` and save the returned page details as JSON.

    :param query: NotionQuery handle used for all API requests.
    :param export_child: forwarded to ``Dump`` as ``export_child_pages``.
    """
    dump_options = {
        "dump_id": TABLE_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_BLOCK,
    }
    # dump_to_file() stores the parsed content in a file and hands back the
    # collected page details; those details are what gets saved as JSON below.
    pages_detail = Dump(**dump_options).dump_to_file()

    print("json output to block_table_parser_result")
    common_op.save_json_to_file(
        handle=pages_detail,
        json_name=".tmp/block_table_parser_result.json",
    )
33 |
34 |
def test_get_rer_list(query, export_child=True):
    """Dump the nested-list block ``RER_LIST_ID`` and save the page details as JSON.

    :param query: NotionQuery handle used for all API requests.
    :param export_child: forwarded to ``Dump`` as ``export_child_pages``.
    """
    dump_options = {
        "dump_id": RER_LIST_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_BLOCK,
    }
    # dump_to_file() stores the parsed content in a file and hands back the
    # collected page details; those details are what gets saved as JSON below.
    pages_detail = Dump(**dump_options).dump_to_file()

    print("json output to block_list_parser_result")
    common_op.save_json_to_file(
        handle=pages_detail,
        json_name=".tmp/block_list_parser_result.json",
    )
51 |
52 |
if __name__ == '__main__':
    query_handle = NotionQuery(token=TOKEN_TEST)
    # A constructor call never evaluates to None, so the old
    # `query_handle is None` check could not fire. NotionQuery reports a failed
    # initialisation by leaving its `client` attribute as None.
    if query_handle.client is None:
        # logging.exception is only meaningful inside an except block.
        logging.error("query handle init error")
        raise SystemExit(-1)

    # Block parsing example: table block.
    # test_get_table_block(query_handle, export_child=False)
    test_get_table_block(query_handle, export_child=True)

    # Block parsing example: nested list.
    # test_get_rer_list(query_handle, export_child=False)
    test_get_rer_list(query_handle, export_child=True)
66 |
--------------------------------------------------------------------------------
/Examples/db_dump.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/11
3 | # mail:geniusrabbit@qq.com
4 |
5 | import logging
6 |
7 | import NotionDump
8 | from NotionDump.Dump.database import Database
9 | from NotionDump.Dump.dump import Dump
10 | from NotionDump.Notion.Notion import NotionQuery
11 | from NotionDump.utils import common_op
12 |
import os

# Notion integration token used by this example.
# NOTE(review): a token is committed in this file; export NOTION_TOKEN to
# override it instead of editing the source. The committed value is kept only
# as a fallback so the example behaves as before when the variable is unset.
TOKEN_TEST = os.environ.get("NOTION_TOKEN", "secret_ALjbBRGaZcagEjPtL1c2F139steBXjr8Fc8uQso4YLV")
# Id of the inline database dumped by the functions below.
DB_TABLE_INLINE_ID = "0b1f524ad42b420f889a2c6adb9b8c92"
# Switch the library to debug dump mode for this example.
NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG
16 |
17 |
def test_db_table_inline_parser_dic(query):
    """Dump the inline database through ``Database`` directly, then print it as a dict.

    :param query: NotionQuery handle used for all API requests.
    """
    database = Database(
        database_id=DB_TABLE_INLINE_ID,
        query_handle=query,
        export_child_pages=False,
    )
    # Database.dump_to_file() returns the path of the generated file (or "" on
    # failure); that return value is what gets written as JSON here.
    dump_result = database.dump_to_file()

    print("json output to db_parser_result")
    common_op.save_json_to_file(
        handle=dump_result,
        json_name=".tmp/db_parser_result.json",
    )

    as_dict = database.dump_to_dic()
    print(as_dict)
34 |
35 |
def test_db_table_inline_parser_csv(query, export_child=False):
    """Dump the inline database with the default database parser type.

    :param query: NotionQuery handle used for all API requests.
    :param export_child: forwarded to ``Dump`` as ``export_child_pages``.
    """
    dump_options = {
        "dump_id": DB_TABLE_INLINE_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_DB_TABLE,
    }
    # dump_to_file() stores the parsed content in a file and hands back the
    # collected page details; those details are what gets saved as JSON below.
    pages_detail = Dump(**dump_options).dump_to_file()

    print("json output to db_parser_result")
    common_op.save_json_to_file(
        handle=pages_detail,
        json_name=".tmp/db_parser_result.json",
    )
52 |
53 |
def test_db_table_inline_parser_md(query, export_child=False):
    """Dump the inline database with the markdown database parser type.

    :param query: NotionQuery handle used for all API requests.
    :param export_child: forwarded to ``Dump`` as ``export_child_pages``.
    """
    dump_options = {
        "dump_id": DB_TABLE_INLINE_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_DB_TABLE,
        "db_parser_type": NotionDump.PARSER_TYPE_MD,
    }
    # dump_to_file() stores the parsed content in a file and hands back the
    # collected page details; those details are what gets saved as JSON below.
    pages_detail = Dump(**dump_options).dump_to_file()

    print("json output to db_parser_result")
    common_op.save_json_to_file(
        handle=pages_detail,
        json_name=".tmp/db_parser_result.json",
    )
70 |
71 |
if __name__ == '__main__':
    query_handle = NotionQuery(token=TOKEN_TEST)
    # A constructor call never evaluates to None, so the old
    # `query_handle is None` check could not fire. NotionQuery reports a failed
    # initialisation by leaving its `client` attribute as None.
    if query_handle.client is None:
        # logging.exception is only meaningful inside an except block.
        logging.error("query handle init error")
        raise SystemExit(-1)

    # Store the database with the default (CSV) parser.
    # test_db_table_inline_parser_csv(query_handle, True)

    # Store the database as markdown.
    test_db_table_inline_parser_md(query_handle, True)

    # Load the database into a dictionary.
    # test_db_table_inline_parser_dic(query_handle)
86 |
87 |
--------------------------------------------------------------------------------
/Examples/page_dump.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/11
3 | # mail:geniusrabbit@qq.com
4 |
5 | import logging
6 |
7 | import NotionDump
8 | from NotionDump.Dump.dump import Dump
9 | from NotionDump.Notion.Notion import NotionQuery
10 | from NotionDump.utils import common_op
11 |
import os

# Notion integration token used by this example.
# NOTE(review): a token is committed in this file; export NOTION_TOKEN to
# override it instead of editing the source. The committed value is kept only
# as a fallback so the example behaves as before when the variable is unset.
TOKEN_TEST = os.environ.get("NOTION_TOKEN", "secret_ALjbBRGaZcagEjPtL1c2F139steBXjr8Fc8uQso4YLV")
# Id of the page dumped by test_page_parser.
PAGE_MIX_ID = "921e6b4ea44046c6935bcb2c69453196"
# Uncomment to switch the library to debug dump mode.
# NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG
15 |
16 |
def test_page_parser(query, export_child=False, db_parser_type=NotionDump.PARSER_TYPE_MD):
    """Dump the page ``PAGE_MIX_ID`` and save the returned page details as JSON.

    :param query: NotionQuery handle used for all API requests.
    :param export_child: forwarded to ``Dump`` as ``export_child_pages``.
    :param db_parser_type: parser type applied to databases found in the page.
    """
    dump_options = {
        "dump_id": PAGE_MIX_ID,
        "query_handle": query,
        "export_child_pages": export_child,
        "dump_type": NotionDump.DUMP_TYPE_PAGE,
        "db_parser_type": db_parser_type,
    }
    # dump_to_file() stores the parsed content in a file and hands back the
    # collected page details; those details are what gets saved as JSON below.
    pages_detail = Dump(**dump_options).dump_to_file()

    print("json output to page_parser_result")
    common_op.save_json_to_file(
        handle=pages_detail,
        json_name=".tmp/page_parser_result.json",
    )
34 |
35 |
def test_page_retrieve(query: NotionQuery):
    """Retrieve the metadata of one fixed page and print whatever comes back."""
    page_id = "0cee7c12f04c4157bcc025355adf2312"
    print(query.retrieve_page(page_id))
40 |
41 |
if __name__ == '__main__':
    query_handle = NotionQuery(token=TOKEN_TEST)
    # A constructor call never evaluates to None, so the old
    # `query_handle is None` check could not fire. NotionQuery reports a failed
    # initialisation by leaving its `client` attribute as None.
    if query_handle.client is None:
        # logging.exception is only meaningful inside an except block.
        logging.error("query handle init error")
        raise SystemExit(-1)

    # Page metadata retrieval example.
    # test_page_retrieve(query_handle)

    # Page parsing example, child pages included.
    test_page_parser(query_handle, True)

    # Page parsing example, child pages excluded.
    # test_page_parser(query_handle, False)
56 |
57 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 delta1037
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt
2 | recursive-include examples *.txt *.py
3 | prune examples/sample?/build
4 |
--------------------------------------------------------------------------------
/NotionDump/Dump/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Dump/__init__.py
--------------------------------------------------------------------------------
/NotionDump/Dump/block.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 | import NotionDump
5 | from NotionDump.Dump.page import Page
6 | from NotionDump.Notion.Notion import NotionQuery
7 | from NotionDump.utils import internal_var
8 |
9 |
10 | # Block内容解析
class Block:
    """Dump entry point for a single Notion block.

    All work is delegated to a ``Page`` built on the same id, so a block is
    dumped exactly like a page.
    """

    def __init__(
            self,
            block_id,
            query_handle: NotionQuery,
            export_child_pages=False,
            page_parser_type=NotionDump.PARSER_TYPE_MD,
            db_parser_type=NotionDump.PARSER_TYPE_PLAIN
    ):
        """Store the options and build the delegate ``Page``.

        :param block_id: block id; dashes are removed before use.
        :param query_handle: NotionQuery used for all API requests.
        :param export_child_pages: whether child pages are exported as well.
        :param page_parser_type: parser type used for page content.
        :param db_parser_type: parser type used for database content.
        """
        normalized_id = block_id.replace('-', '')
        self.block_id = normalized_id
        self.query_handle = query_handle
        self.export_child_page = export_child_pages
        self.page_parser_type = page_parser_type
        self.db_parser_type = db_parser_type

        # The delegate that performs the actual parsing.
        self.page_handle = Page(
            page_id=normalized_id,
            query_handle=query_handle,
            export_child_pages=export_child_pages,
            page_parser_type=page_parser_type,
            db_parser_type=db_parser_type,
        )

    @staticmethod
    def get_pages_detail():
        """Return the shared page dictionary kept in ``internal_var``."""
        return internal_var.PAGE_DIC

    def dump_to_file(self, file_name=None):
        """Parse the block to a file; see ``Page.dump_to_file`` for the result."""
        page = self.page_handle
        return page.dump_to_file(file_name=file_name)

    def dump_to_db(self):
        """Forward to ``Page.dump_to_db``."""
        page = self.page_handle
        return page.dump_to_db()

    def dump_to_json(self, json_name=None):
        """Write the raw API response as JSON (testing aid); forwards to ``Page``."""
        page = self.page_handle
        return page.dump_to_json(json_name=json_name)
54 |
--------------------------------------------------------------------------------
/NotionDump/Dump/database.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 |
5 | import os
6 | import shutil
7 |
8 | import NotionDump
9 | from NotionDump.Notion.Notion import NotionQuery
10 | from NotionDump.Parser.mix_parser import MixParser
11 | from NotionDump.utils import common_op, internal_var
12 |
13 |
class Database:
    """Dump a single Notion database to a file, to raw JSON or to a collection.

    Parsing is delegated to ``MixParser``; raw queries go through the supplied
    ``NotionQuery`` handle.
    """

    def __init__(
            self,
            database_id,
            query_handle: NotionQuery,
            export_child_pages=False,
            page_parser_type=NotionDump.PARSER_TYPE_MD,
            db_parser_type=NotionDump.PARSER_TYPE_PLAIN
    ):
        """Store the options, build the parser and make sure the temp dir exists.

        :param database_id: database id; dashes are removed before use.
        :param query_handle: NotionQuery used for all API requests.
        :param export_child_pages: whether child pages are exported as well.
        :param page_parser_type: parser type used for page content.
        :param db_parser_type: parser type used for database content.
        """
        self.database_id = database_id.replace('-', '')
        self.query_handle = query_handle
        # Whether child pages are exported too.
        self.export_child_page = export_child_pages
        self.page_parser_type = page_parser_type
        self.db_parser_type = db_parser_type

        # Build the parser.
        self.mix_parser = MixParser(
            mix_id=self.database_id,
            query_handle=self.query_handle,
            export_child_pages=self.export_child_page,
            page_parser_type=self.page_parser_type,
            db_parser_type=self.db_parser_type
        )

        # Create the temp directory. makedirs(exist_ok=True) avoids the
        # exists-then-mkdir race and also copes with nested paths.
        self.tmp_dir = NotionDump.TMP_DIR
        os.makedirs(self.tmp_dir, exist_ok=True)

    @staticmethod
    def get_pages_detail():
        """Return the shared page dictionary kept in ``internal_var``."""
        return internal_var.PAGE_DIC

    def dump_to_file(self, file_name=None, col_name_list=None):
        """Parse the database into a file.

        :param file_name: optional destination; the parser output is copied there.
        :param col_name_list: forwarded to the parser unchanged.
        :return: ``file_name`` when given, otherwise the parser's temp file
            name; ``""`` when parsing failed.
        """
        # Parse into a temporary file first.
        tmp_filename = self.mix_parser.mix_parser(
            root_id=self.database_id,
            id_type="database",
            col_name_list=col_name_list
        )
        if tmp_filename is None:
            # The original message said "page parser fail", copied from Page.
            common_op.debug_log("database parser fail, id=" + self.database_id, level=NotionDump.DUMP_MODE_DEFAULT)
            return ""

        if file_name is not None:
            shutil.copyfile(tmp_filename, file_name)
            common_op.debug_log("copy " + tmp_filename + " to " + file_name, level=NotionDump.DUMP_MODE_DEFAULT)
            return file_name

        return tmp_filename

    def dump_to_db(self, col_name_list=None, db_q_filter="{}", db_q_sorts="[]"):
        """Query the database; writing the result to SQL is not implemented yet.

        :return: ``""`` when the query returned nothing, otherwise ``None``.
        """
        db_json = self.query_handle.query_database(
            database_id=self.database_id,
            db_q_filter=db_q_filter,
            db_q_sorts=db_q_sorts)
        if db_json is None:
            return ""

        # TODO: write the data into a database through the notion2sql interface.
        return

    def dump_to_json(self, json_name=None, db_q_filter="{}", db_q_sorts="[]"):
        """Save the raw query response as JSON (testing aid).

        :param json_name: destination file; defaults to ``<tmp_dir>/<id>.json``.
        :return: ``""`` when the query returned nothing, otherwise ``None``.
        """
        db_json = self.query_handle.query_database(
            database_id=self.database_id,
            db_q_filter=db_q_filter,
            db_q_sorts=db_q_sorts)
        if db_json is None:
            return ""

        if json_name is None:
            # os.path.join stays correct whether or not tmp_dir ends with a separator.
            json_name = os.path.join(self.tmp_dir, self.database_id + ".json")
        common_op.save_json_to_file(db_json, json_name)

    def dump_to_dic(self, col_name_list=None, db_q_filter="{}", db_q_sorts="[]"):
        """Query the database and return the parser's collection of its rows.

        The query is always forced: changing a database property does not
        change the database's last-edited time, so the cache cannot be trusted.

        :return: result of ``MixParser.database_collection``; ``""`` when the
            query returned nothing.
        """
        db_json = self.query_handle.query_database(
            database_id=self.database_id,
            db_q_filter=db_q_filter,
            db_q_sorts=db_q_sorts,
            force_update=True
        )
        if db_json is None:
            common_op.debug_log("query database get nothing, id=" + self.database_id,
                                level=NotionDump.DUMP_MODE_DEFAULT)
            return ""

        return self.mix_parser.database_collection(
            json_handle=db_json,
            json_type="database",
            col_name_list=col_name_list
        )
108 |
--------------------------------------------------------------------------------
/NotionDump/Dump/dump.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 | import copy
5 |
6 | import NotionDump
7 | from NotionDump.Dump.block import Block
8 | from NotionDump.Dump.database import Database
9 | from NotionDump.Dump.page import Page
10 | from NotionDump.Notion.Notion import NotionQuery
11 | from NotionDump.utils import internal_var, common_op
12 |
13 |
class Dump:
    """Facade that picks a ``Page``, ``Block`` or ``Database`` handler by dump type.

    Every ``dump_to_*`` method runs the handler, saves the query buffer and
    returns a deep copy of the shared page dictionary, which is then reset.
    """

    def __init__(
            self,
            dump_id,
            query_handle: NotionQuery,
            export_child_pages=False,
            page_parser_type=NotionDump.PARSER_TYPE_MD,
            db_parser_type=NotionDump.PARSER_TYPE_PLAIN,
            dump_type=NotionDump.DUMP_TYPE_PAGE
    ):
        """Store the options and create the handler matching ``dump_type``.

        An unknown ``dump_type`` is logged and leaves ``self.handle`` as None;
        the ``dump_to_*`` methods then return ``""``.
        """
        self.dump_id = dump_id.replace('-', '')
        self.query_handle = query_handle
        self.export_child_page = export_child_pages
        self.page_parser_type = page_parser_type
        self.db_parser_type = db_parser_type
        self.dump_type = dump_type

        # Keyword arguments shared by all three handler types.
        shared_kwargs = {
            "query_handle": self.query_handle,
            "export_child_pages": self.export_child_page,
            "page_parser_type": self.page_parser_type,
            "db_parser_type": self.db_parser_type,
        }

        self.handle = None
        if dump_type == NotionDump.DUMP_TYPE_PAGE:
            self.handle = Page(page_id=self.dump_id, **shared_kwargs)
        elif dump_type == NotionDump.DUMP_TYPE_BLOCK:
            self.handle = Block(block_id=self.dump_id, **shared_kwargs)
        elif dump_type == NotionDump.DUMP_TYPE_DB_TABLE:
            self.handle = Database(database_id=self.dump_id, **shared_kwargs)
        else:
            common_op.debug_log("unknown dump type:" + str(self.dump_type), level=NotionDump.DUMP_MODE_DEFAULT)

    @staticmethod
    def __get_pages_detail():
        """Return the shared page dictionary kept in ``internal_var``."""
        return internal_var.PAGE_DIC

    def __finish_dump(self):
        """Save the query buffer, then return a copy of the page dict and reset it."""
        self.query_handle.safe_save()

        snapshot = copy.deepcopy(internal_var.PAGE_DIC)
        internal_var.PAGE_DIC = {}
        return snapshot

    def __handle_missing(self):
        """Log and report whether handler construction failed."""
        if self.handle is not None:
            return False
        common_op.debug_log("dump init fail", level=NotionDump.DUMP_MODE_DEFAULT)
        return True

    def dump_to_file(self, file_name=None):
        """Dump to a file through the handler; return the page details (or ``""``)."""
        if self.__handle_missing():
            return ""
        # With recursion the first block is stored on its own as the main page.
        self.handle.dump_to_file(file_name=file_name)
        return self.__finish_dump()

    def dump_to_db(self):
        """Dump to a database through the handler; return the page details (or ``""``)."""
        if self.__handle_missing():
            return ""
        self.handle.dump_to_db()
        return self.__finish_dump()

    def dump_to_json(self, json_name=None):
        """Write the raw response as JSON (testing aid); return the page details (or ``""``)."""
        if self.__handle_missing():
            return ""
        self.handle.dump_to_json(json_name=json_name)
        return self.__finish_dump()
102 |
--------------------------------------------------------------------------------
/NotionDump/Dump/page.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 | import os
5 | import shutil
6 |
7 | import NotionDump
8 | from NotionDump.Notion.Notion import NotionQuery
9 | from NotionDump.Parser.mix_parser import MixParser
10 | from NotionDump.utils import common_op
11 | from NotionDump.utils import internal_var
12 |
13 |
class Page:
    """Dump a single Notion page (or block) to a file or to raw JSON.

    Parsing is delegated to ``MixParser``; raw queries go through the supplied
    ``NotionQuery`` handle.
    """

    def __init__(
            self,
            page_id,
            query_handle: NotionQuery,
            export_child_pages=False,
            page_parser_type=NotionDump.PARSER_TYPE_MD,
            db_parser_type=NotionDump.PARSER_TYPE_PLAIN
    ):
        """Store the options, build the parser and make sure the temp dir exists.

        :param page_id: page id; dashes are removed before use.
        :param query_handle: NotionQuery used for all API requests.
        :param export_child_pages: whether child pages are exported as well.
        :param page_parser_type: parser type used for page content.
        :param db_parser_type: parser type used for database content.
        """
        self.page_id = page_id.replace('-', '')
        self.query_handle = query_handle
        # Whether child pages are exported too.
        self.export_child_page = export_child_pages
        self.page_parser_type = page_parser_type
        self.db_parser_type = db_parser_type

        # Build the parser.
        self.mix_parser = MixParser(
            mix_id=self.page_id,
            query_handle=self.query_handle,
            export_child_pages=self.export_child_page,
            page_parser_type=self.page_parser_type,
            db_parser_type=self.db_parser_type
        )

        # Create the temp directory. makedirs(exist_ok=True) avoids the
        # exists-then-mkdir race and also copes with nested paths.
        self.tmp_dir = NotionDump.TMP_DIR
        os.makedirs(self.tmp_dir, exist_ok=True)

    @staticmethod
    def get_pages_detail():
        """Return the shared page dictionary kept in ``internal_var``."""
        return internal_var.PAGE_DIC

    def dump_to_file(self, file_name=None):
        """Parse the page into a file.

        :param file_name: optional destination; the parser output is copied there.
        :return: ``file_name`` when given, otherwise the parser's temp file
            name; ``""`` when parsing failed.
        """
        # Parse into a temporary file first.
        tmp_md_filename = self.mix_parser.mix_parser(root_id=self.page_id, id_type="block")
        if tmp_md_filename is None:
            common_op.debug_log("page parser fail, id=" + self.page_id, level=NotionDump.DUMP_MODE_DEFAULT)
            return ""

        if file_name is not None:
            shutil.copyfile(tmp_md_filename, file_name)
            common_op.debug_log("copy " + tmp_md_filename + " to " + file_name, level=NotionDump.DUMP_MODE_DEFAULT)
            return file_name

        return tmp_md_filename

    def dump_to_db(self):
        """Fetch the page's children; writing them to SQL is not implemented yet.

        :return: always ``None``.
        """
        page_json = self.query_handle.retrieve_block_children(self.page_id)
        if page_json is None:
            return None

        # TODO: write the markdown into a database through the notion2sql interface.
        return

    def dump_to_json(self, json_name=None):
        """Save the raw block-children response as JSON (testing aid).

        :param json_name: destination file; defaults to ``<tmp_dir>/<id>.json``.
        :return: always ``None``.
        """
        page_json = self.query_handle.retrieve_block_children(self.page_id)
        if page_json is None:
            return None

        if json_name is None:
            # os.path.join stays correct whether or not tmp_dir ends with a separator.
            json_name = os.path.join(self.tmp_dir, self.page_id + ".json")
        common_op.save_json_to_file(page_json, json_name)
        return
84 |
--------------------------------------------------------------------------------
/NotionDump/Notion/Buffer.py:
--------------------------------------------------------------------------------
1 | from time import strftime, localtime
2 |
3 | import NotionDump
4 | from NotionDump.utils import common_op
5 |
6 |
class Buffer:
    """Per-id download state, persisted as JSON in ``NotionDump.BUFFER_FILE``.

    Each entry records the object's type, its last-edited time, the time it
    was last downloaded and a dirty flag that says it must be fetched again.
    """

    def __init__(self):
        """Record the start time of this run and load the saved state, if any."""
        # Timestamps use a fixed-width "%Y-%m-%d %H:%M:%S" format, so comparing
        # them as strings orders them chronologically.
        self.base_time = strftime("%Y-%m-%d %H:%M:%S", localtime())
        self.buffer_map = common_op.load_json_from_file(NotionDump.BUFFER_FILE)
        if self.buffer_map is None:
            self.buffer_map = {}

    def save_buffer(self):
        """Write the current state back to ``NotionDump.BUFFER_FILE``."""
        common_op.debug_log("save buffer file")
        common_op.save_json_to_file(self.buffer_map, NotionDump.BUFFER_FILE)

    def add_buffer(self, page_id, page_time, id_type="page"):
        """Register ``page_id`` or refresh its last-edited time.

        A new entry starts dirty; an existing entry becomes dirty when
        ``page_time`` differs from the stored last-edited time.
        """
        entry = self.buffer_map.get(page_id)
        if entry is None:
            common_op.debug_log("[BUFFER] add_buffer, new, id=" + page_id + ", type=" + id_type)
            self.buffer_map[page_id] = {
                "type": id_type,
                # When the page was last edited.
                "last_edited_time": page_time,
                # When the page was last downloaded.
                "update_time": None,
                # Dirty flag: the page must be downloaded.
                "dirty": True
            }
        elif page_time != entry["last_edited_time"]:
            # The page was edited and has to be downloaded again.
            common_op.debug_log("[BUFFER] add_buffer, update, id=" + page_id + ", type=" + id_type)
            entry["dirty"] = True
            entry["last_edited_time"] = page_time

    def update_buffer(self, page_id):
        """Mark ``page_id`` as freshly downloaded; unknown ids are ignored."""
        if page_id in self.buffer_map:
            common_op.debug_log("[BUFFER] update_buffer, id=" + page_id)
            self.buffer_map[page_id]["update_time"] = strftime("%Y-%m-%d %H:%M:%S", localtime())
            self.buffer_map[page_id]["dirty"] = False

    def select_buffer(self, page_id, is_child=False):
        """Tell whether the data for ``page_id`` must be downloaded again.

        :param page_id: id to look up; for a child block, the parent page id.
        :param is_child: True when the lookup is made for a child block.
        :return: True when the cached copy cannot be used (unknown id, dirty
            entry, or parent refreshed during this run); False when it can.
        """
        if page_id not in self.buffer_map:
            common_op.debug_log("[BUFFER] select_buffer, id=" + page_id + ", not exist")
            return True

        entry = self.buffer_map[page_id]
        if not is_child:
            common_op.debug_log("[BUFFER] select_buffer, main, id=" + page_id)
            return entry["dirty"]

        update_time = entry["update_time"]
        # update_time is None until the page has been downloaded once. Comparing
        # None with a string raises TypeError, so fall back to the dirty flag.
        if update_time is not None and update_time >= self.base_time:
            # The parent page was refreshed in this run, so refresh the child too.
            common_op.debug_log("[BUFFER] select_buffer, child update, id=" + page_id)
            return True

        common_op.debug_log("[BUFFER] select_buffer, child old, id=" + page_id)
        return entry["dirty"]
61 |
--------------------------------------------------------------------------------
/NotionDump/Notion/Notion.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/10
3 | # mail:geniusrabbit@qq.com
4 | import os
5 | from time import sleep, time
6 | import urllib.request
7 | from time import time, sleep
8 | from urllib.error import URLError
9 | from urllib.parse import quote
10 | from notion_client import Client, AsyncClient
11 | from notion_client import APIErrorCode, APIResponseError
12 |
13 | import NotionDump
14 | from NotionDump.Notion.Buffer import Buffer
15 | from NotionDump.utils import common_op, internal_var
16 |
17 |
18 | class NotionQuery:
def __init__(self, token, client_handle=None, async_api=False):
    """Set up the API client, the temp directory, the throttle state and the buffer.

    :param token: Notion integration token; used to build a client only when
        no ``client_handle`` is supplied.
    :param client_handle: optional ready-made client; takes precedence over
        ``token`` when given.
    :param async_api: build an ``AsyncClient`` instead of a ``Client`` when a
        client is created from ``token``.
    """
    self.token = token
    if client_handle is None and token is not None:
        # A token is available and no client was passed in: build one.
        if not async_api:
            self.client = Client(auth=self.token)
        else:
            self.client = AsyncClient(auth=self.token)
    else:
        # Use the supplied handle; this is None when neither was given.
        self.client = client_handle

    if self.client is None:
        common_op.debug_log("notion query init fail", level=NotionDump.DUMP_MODE_DEFAULT)

    # Create the temp directory. makedirs(exist_ok=True) avoids the
    # exists-then-mkdir race and also copes with nested paths.
    self.tmp_dir = NotionDump.TMP_DIR
    os.makedirs(self.tmp_dir, exist_ok=True)

    # Throttle state: time of the previous API call and the minimum spacing
    # between calls, compared against milliseconds in __friendly_use_api.
    self.last_call_time = None
    self.friendly_time = internal_var.FRIENDLY_USE_API

    self.buffer = Buffer()
43 |
def safe_save(self):
    """Persist the download-state buffer through ``Buffer.save_buffer``."""
    download_state = self.buffer
    download_state.save_buffer()
46 |
def __friendly_use_api(self):
    """Sleep until ``friendly_time`` ms have passed since the previous call.

    The wait is taken in slices of at most 100 ms, logging before each next
    slice. The very first call does not wait. The call time is recorded at
    the end.
    """
    current = time()
    previous = self.last_call_time
    # Time to sleep = required spacing - time already spent since the last call.
    if previous is None:
        elapsed_ms = self.friendly_time
    else:
        elapsed_ms = int(round(current * 1000)) - int(round(previous * 1000))

    remaining_ms = self.friendly_time - elapsed_ms
    while remaining_ms > 0:
        sleep(0.1 if remaining_ms > 100 else remaining_ms / 1000.0)
        # The countdown advances in fixed 100 ms steps.
        common_op.debug_log("wait for server response..." + str(remaining_ms) + "ms", level=NotionDump.DUMP_MODE_DEFAULT)
        remaining_ms -= 100

    # Remember when this call went through.
    self.last_call_time = time()
66 |
67 | # 获取该块下所有的子块
def retrieve_block_children(self, block_id, parent_id=None, page_size=100, force_update=False):
    """Return all child blocks of ``block_id``, following pagination.

    :param block_id: id of the block whose children are listed.
    :param parent_id: id of the page containing the block; when given, cache
        validity is decided from the parent page's buffer entry.
    :param page_size: number of results requested per API call.
    :param force_update: skip the cache lookup and always call the API.
    :return: the first response dict with the ``results`` of all following
        pages appended, or ``None`` when the request failed.
    """
    # Cache lookup.
    if not force_update and NotionDump.USE_BUFFER:
        if parent_id is not None:
            dirty = self.buffer.select_buffer(parent_id, is_child=True)
        else:
            dirty = self.buffer.select_buffer(block_id)
        if not dirty:
            # Cache hit: load the stored response instead of calling the API.
            common_op.debug_log("[##CACHE] cached and load " + block_id + ";parent is " + str(parent_id), level=NotionDump.DUMP_MODE_DEFAULT)
            load = self.__load_from_json(block_id, prefix="retrieve_block_")
            if load is not None:
                return load

    common_op.debug_log("[&&CACHE] no cached and load " + block_id + "; parent is " + str(parent_id), level=NotionDump.DUMP_MODE_DEFAULT)
    self.__friendly_use_api()
    query_post = {
        "block_id": block_id,
        "page_size": page_size
    }
    try:
        query_ret = self.client.blocks.children.list(
            **query_post
        )

        # More data than fits into one response: keep following the cursor.
        # The loop ends through the `next_cur is None` break; `has_more` of
        # the first response is not updated.
        next_cur = query_ret["next_cursor"]
        while query_ret["has_more"]:
            query_post["start_cursor"] = next_cur
            common_op.debug_log(query_post, level=NotionDump.DUMP_MODE_DEFAULT)
            # Follow-up pages are API calls too, so throttle them as well.
            self.__friendly_use_api()
            db_query_ret = self.client.blocks.children.list(
                **query_post
            )
            next_cur = db_query_ret["next_cursor"]
            query_ret["results"] += db_query_ret["results"]
            if next_cur is None:
                break
        if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER:
            self.__save_to_json(query_ret, block_id, prefix="retrieve_block_")
        if NotionDump.USE_BUFFER and parent_id is None:
            # A standalone page: mark it as freshly downloaded.
            self.buffer.update_buffer(block_id)
        return query_ret
    except APIResponseError as error:
        if error.code == APIErrorCode.ObjectNotFound:
            common_op.debug_log("Block " + block_id + " Retrieve child is invalid",
                                level=NotionDump.DUMP_MODE_DEFAULT)
        else:
            # Any other API error.
            common_op.debug_log(error)
            common_op.debug_log("Block " + block_id + " response error", level=NotionDump.DUMP_MODE_DEFAULT)
    except Exception as e:
        # Best-effort boundary: log and report failure through the None return.
        common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
        common_op.debug_log("Block " + block_id + " Not found or no authority", level=NotionDump.DUMP_MODE_DEFAULT)
    return None
123 |
124 | # 获取到所有的数据库数据(JSon格式)
def query_database(self, database_id, db_q_filter="{}", db_q_sorts="[]", force_update=False):
    """Return all rows of a database, following pagination.

    :param database_id: id of the database to query.
    :param db_q_filter: filter sent to the API; the default ``"{}"`` means
        "no filter" and is not sent.
    :param db_q_sorts: sorts sent to the API; the default ``"[]"`` means
        "no sorting" and is not sent.
    :param force_update: skip the cache lookup and always call the API.
    :return: the first response dict with the ``results`` of all following
        pages appended, or ``None`` when the request failed.
    """
    # Cache lookup.
    if not force_update and NotionDump.USE_BUFFER:
        if not self.buffer.select_buffer(database_id):
            # Cache hit: load the stored response instead of calling the API.
            common_op.debug_log("[##CACHE] cached and load " + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
            load = self.__load_from_json(database_id, prefix="query_db_")
            if load is not None:
                return load
    common_op.debug_log("[&&CACHE] no cached and load " + database_id, level=NotionDump.DUMP_MODE_DEFAULT)

    self.__friendly_use_api()
    # Assemble the query.
    query_post = {"database_id": database_id}
    if db_q_sorts != "[]":
        query_post["sorts"] = db_q_sorts
    if db_q_filter != "{}":
        # Fixed: the filter slot used to be filled with db_q_sorts.
        query_post["filter"] = db_q_filter
    try:
        query_ret = self.client.databases.query(
            **query_post
        )

        # More data than fits into one response: keep following the cursor.
        # The loop ends through the `next_cur is None` break; `has_more` of
        # the first response is not updated.
        next_cur = query_ret["next_cursor"]
        while query_ret["has_more"]:
            query_post["start_cursor"] = next_cur
            common_op.debug_log(query_post, level=NotionDump.DUMP_MODE_DEFAULT)
            # Follow-up pages are API calls too, so throttle them as well.
            self.__friendly_use_api()
            db_query_ret = self.client.databases.query(
                **query_post
            )
            next_cur = db_query_ret["next_cursor"]
            query_ret["results"] += db_query_ret["results"]
            if next_cur is None:
                break

        if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER:
            self.__save_to_json(query_ret, database_id, prefix="query_db_")
        if NotionDump.USE_BUFFER:
            # Mark the database as freshly downloaded.
            self.buffer.update_buffer(database_id)
        return query_ret
    except APIResponseError as error:
        if error.code == APIErrorCode.ObjectNotFound:
            common_op.debug_log("Database Query is invalid, id=" + database_id,
                                level=NotionDump.DUMP_MODE_DEFAULT)
        else:
            # Any other API error.
            common_op.debug_log(error)
            common_op.debug_log("Database Query is invalid, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
    except Exception as e:
        # Best-effort boundary: log and report failure through the None return.
        common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
        common_op.debug_log("Database Query Not found or no authority, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
    return None
179 |
180 | # 获取数据库信息
def retrieve_database(self, database_id):
    """Fetch the metadata object of a database.

    On success the response is optionally dumped to JSON (debug mode or
    buffering enabled) and, when buffering is enabled and the response has a
    ``last_edited_time``, registered in the buffer as a ``"database"`` entry.

    :param database_id: id of the database to retrieve.
    :return: the API response, or ``None`` when the request failed.
    """
    self.__friendly_use_api()
    try:
        db_info = self.client.databases.retrieve(database_id=database_id)
        should_dump = NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER
        if should_dump:
            self.__save_to_json(db_info, database_id, prefix="retrieve_db_")
        if NotionDump.USE_BUFFER and db_info is not None and "last_edited_time" in db_info:
            self.buffer.add_buffer(database_id, db_info["last_edited_time"], id_type="database")
        return db_info
    except APIResponseError as error:
        if error.code != APIErrorCode.ObjectNotFound:
            # Unexpected API error: log the error object before the summary.
            common_op.debug_log(error)
            common_op.debug_log("Database retrieve is invalid, id=" + database_id, level=NotionDump.DUMP_MODE_DEFAULT)
        else:
            common_op.debug_log("Database retrieve is invalid, id=" + database_id,
                                level=NotionDump.DUMP_MODE_DEFAULT)
    except Exception as e:
        common_op.debug_log(e)
        common_op.debug_log("Database retrieve Not found or no authority, id=" + database_id,
                            level=NotionDump.DUMP_MODE_DEFAULT)
    return None
203 |
204 | # 获取Page的信息
def retrieve_page(self, page_id):
    """Fetch the metadata object of a page.

    On success the response is optionally dumped to JSON (debug mode or
    buffering enabled) and, when buffering is enabled and the response has a
    ``last_edited_time``, registered in the buffer.

    :param page_id: id of the page to retrieve.
    :return: the API response, or ``None`` when the request failed.
    """
    self.__friendly_use_api()
    try:
        page_info = self.client.pages.retrieve(page_id=page_id)
        should_dump = NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG or NotionDump.USE_BUFFER
        if should_dump:
            self.__save_to_json(page_info, page_id, prefix="retrieve_page_")
        if NotionDump.USE_BUFFER and page_info is not None and "last_edited_time" in page_info:
            self.buffer.add_buffer(page_id, page_info["last_edited_time"])
        return page_info
    except APIResponseError as error:
        if error.code != APIErrorCode.ObjectNotFound:
            # Unexpected API error: log the error object before the summary.
            common_op.debug_log(error)
            common_op.debug_log("Page retrieve is invalid(other), id=" + page_id,
                                level=NotionDump.DUMP_MODE_DEFAULT)
        else:
            common_op.debug_log("Page retrieve is invalid(api), id=" + page_id,
                                level=NotionDump.DUMP_MODE_DEFAULT)
    except Exception as e:
        common_op.debug_log(e)
        common_op.debug_log("Page retrieve Not found or no authority, id=" + page_id,
                            level=NotionDump.DUMP_MODE_DEFAULT)
    return None
228 |
def download_to_file(self, download_id, child_page_item):
    """Download the file behind ``child_page_item["link_src"]`` into tmp_dir.

    The local name is ``self.tmp_dir + download_id + <suffix>``, where the
    suffix is everything from the first ``.`` of the URL's file name (empty
    when the name has no dot).

    :param download_id: id used both as the local file stem and buffer key.
    :param child_page_item: dict that holds the source URL under ``"link_src"``.
    :return: path of the downloaded (or still valid cached) file, or ``""``
        when nothing was downloaded.
    """
    file_url = child_page_item["link_src"]
    # When files are embedded as links, only Notion-hosted files are fetched
    # (per the original note, those links stop working after some time).
    if NotionDump.FILE_WITH_LINK and "secure.notion-static.com" not in file_url:
        return ""

    common_op.debug_log("download url is " + file_url, level=NotionDump.DUMP_MODE_DEBUG)
    if file_url == "":
        return ""
    # The file name is the last path segment.  Drop the query string first so
    # that a '/' inside the query cannot be mistaken for a path separator.
    url_path = file_url.split('?', 1)[0]
    filename = url_path[url_path.rfind('/') + 1:]
    # Without this guard a dot-less name would yield its last character as
    # the "suffix" (find() returns -1).
    dot_pos = filename.find('.')
    file_suffix = filename[dot_pos:] if dot_pos != -1 else ""
    # Build a recognisable local name from the id and the suffix.
    download_name = self.tmp_dir + download_id + file_suffix
    common_op.debug_log("download name " + download_name, level=NotionDump.DUMP_MODE_DEBUG)

    if NotionDump.USE_BUFFER:
        # A falsy select_buffer() plus an existing file means no re-download.
        if not self.buffer.select_buffer(download_id) and os.path.exists(download_name):
            return download_name

        # Register the entry (separate from the select branch above).
        self.buffer.add_buffer(download_id, "", id_type="file")

    if os.path.exists(download_name):
        common_op.debug_log("[WARN] file " + download_name + " was covered", level=NotionDump.DUMP_MODE_DEFAULT)
    # Download the file.
    self.__friendly_use_api()
    try:
        file_url = quote(file_url, safe='/:?=&%')
        urllib.request.urlretrieve(file_url, download_name)
        if NotionDump.USE_BUFFER:
            self.buffer.update_buffer(download_id)
        return download_name
    except Exception as e:
        # Best effort: every failure is logged and reported as "".  Labels
        # are checked most-specific first (HTTPError and
        # ContentTooShortError are subclasses of URLError).
        error_name = "Exception"
        for error_cls, label in (
            (urllib.error.HTTPError, "HTTPError"),
            (urllib.error.ContentTooShortError, "ContentTooShortError"),
            (urllib.error.URLError, "URLError"),
            (TimeoutError, "TimeoutError"),
        ):
            if isinstance(e, error_cls):
                error_name = label
                break
        common_op.debug_log("download name " + download_name + " get error:" + error_name,
                            level=NotionDump.DUMP_MODE_DEFAULT)
        common_op.debug_log("download url " + file_url + " get error:" + error_name,
                            level=NotionDump.DUMP_MODE_DEFAULT)
        common_op.debug_log(e, level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
298 |
299 | # 源文件,直接输出成json; 辅助测试使用
def __save_to_json(self, page_json, json_id, json_name=None, prefix=None):
    """Write ``page_json`` to a JSON file (raw dump, used as a testing aid).

    When ``json_name`` is not given the target is
    ``self.tmp_dir + [prefix] + json_id + ".json"``.
    """
    target = json_name
    if target is None:
        stem = json_id if prefix is None else prefix + json_id
        target = self.tmp_dir + stem + ".json"
    common_op.save_json_to_file(page_json, target)
307 |
def __load_from_json(self, json_id, json_name=None, prefix=None):
    """Load a JSON file and return whatever ``load_json_from_file`` yields.

    When ``json_name`` is not given the source is
    ``self.tmp_dir + [prefix] + json_id + ".json"``.
    """
    source = json_name
    if source is None:
        stem = json_id if prefix is None else prefix + json_id
        source = self.tmp_dir + stem + ".json"
    return common_op.load_json_from_file(source)
315 |
--------------------------------------------------------------------------------
/NotionDump/Notion/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Notion/__init__.py
--------------------------------------------------------------------------------
/NotionDump/Parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/Parser/__init__.py
--------------------------------------------------------------------------------
/NotionDump/Parser/base_parser.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 | import copy
5 |
6 | import NotionDump
7 | from NotionDump.utils import content_format, common_op
8 | from NotionDump.utils import internal_var
9 | from urllib.parse import unquote
10 | from NotionDump.utils.content_format import color_transformer, color_transformer_db, format_date_or_time
11 |
12 |
13 | class BaseParser:
def __init__(self, base_id, export_child=False):
    """Set up parser state.

    :param base_id: id of the object being parsed; dashes are removed.
    :param export_child: stored as-is and passed on to the formatters.
    """
    self.export_child = export_child
    # Child pages discovered while parsing, keyed by their key id.
    self.child_pages = {}
    self.base_id = base_id.replace('-', '')
20 |
def set_new_id(self, parent_id):
    """Replace the stored base id with ``parent_id`` (stored unchanged)."""
    self.base_id = parent_id
23 |
24 | # 获取子页面字典,只返回一次,离台概不负责
def get_child_pages_dic(self):
    """Return a deep copy of the collected child pages and reset the store.

    The entries are handed out only once: the internal dict is emptied
    before returning.
    """
    snapshot = copy.deepcopy(self.child_pages)
    # Drop the collected entries so the next call starts from scratch.
    self.child_pages.clear()
    return snapshot
29 |
30 | # 文本的格式生成
@staticmethod
def __annotations_parser(block_handle, str_plain):
    """Wrap ``str_plain`` in Markdown markers according to its annotations.

    :param block_handle: annotations dict; the keys ``code``, ``underline``,
        ``bold``, ``italic``, ``color`` and ``strikethrough`` are read.
    :param str_plain: text to decorate.
    :return: the decorated text, or ``""`` when ``str_plain`` is None/empty.
    """
    if str_plain is None or str_plain == "":
        return ""
    # A single trailing newline/tab is kept outside the markers: it is cut
    # off here and appended again at the end.
    last_char = str_plain[-1:]
    if last_char == "\n" or last_char == "\t":
        str_ret = str_plain[0:-1]
    else:
        str_ret = str_plain
    if block_handle["code"]:
        str_ret = "`" + str_ret + "`"
    if block_handle["underline"]:
        # NOTE(review): wrapping with empty strings is a no-op; this looks
        # like underline markup was lost from the literals - confirm.
        str_ret = "" + str_ret + ""
    if block_handle["bold"]:
        str_ret = "**" + str_ret + "**"
    if block_handle["italic"]:
        str_ret = "*" + str_ret + "*"
    if block_handle["color"] != "default":
        # Apply colour; background and foreground colours are distinguished.
        if NotionDump.S_THEME_TYPE == "markdown":
            # Render every colour with the default Markdown highlight marker.
            str_ret = NotionDump.MD_HIGHLIGHT + str_ret + NotionDump.MD_HIGHLIGHT
        else:
            if block_handle["color"].find("_background") != -1:
                # NOTE(review): bg_color is computed but never used and the
                # wrappers below are empty strings - the colour markup seems
                # to be missing from both branches; confirm.
                bg_color = block_handle["color"][0:block_handle["color"].rfind('_')]
                str_ret = "" + str_ret + ""
            else:
                str_ret = "" + str_ret + ""
    if block_handle["strikethrough"]:
        str_ret = "~~" + str_ret + "~~"
    if last_char == "\n" or last_char == "\t":
        str_ret += last_char
    return str_ret
64 |
def __text_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False):
    """Render one rich-text item of type ``"text"``.

    In Markdown mode, and unless ``is_db_title`` is set, an ``href`` on the
    item is turned into a link: hrefs that do not start with ``/`` are
    formatted directly as URLs, the others are treated as page/database
    links and registered in ``self.child_pages``.  In Markdown mode the
    result is finally passed through the annotations formatter.

    :param block_handle: rich-text item; ``type``, ``href`` and (in Markdown
        mode) ``annotations`` are read, ``plain_text`` is optional.
    :param parser_type: output flavour (plain or Markdown).
    :param is_db_title: when true, link handling is skipped.
    :return: the rendered text, or ``""`` on a type mismatch.
    """
    if block_handle["type"] != "text":
        common_op.debug_log(
            "text type error! id=" + self.base_id + " not type " + block_handle["type"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    text_str = ""
    if "plain_text" in block_handle:
        text_str = block_handle["plain_text"]
    if text_str is None:
        text_str = ""
    # Handle a link attached to the text, if any.
    text_url = block_handle["href"]
    if text_url is not None and parser_type == NotionDump.PARSER_TYPE_MD and not is_db_title:  # database titles skip link parsing
        # The link is either a web link or a local (workspace) link.
        if text_url.startswith("http") or not text_url.startswith("/"):
            # Web link: format it right away.
            text_str = content_format.get_url_format(text_url, text_str)
        else:
            # Page or database link: it is relocated later.
            if text_url.find("=") != -1:
                # An '=' in the link is taken to mean a database view; the id
                # sits between the last '/' and the last '?'.
                id_type = "database"
                page_id = text_url[text_url.rfind("/") + 1:text_url.rfind("?")].replace('-', '')
            else:
                id_type = "page"
                page_id = text_url[text_url.rfind("/") + 1:].replace('-', '')
            if len(page_id) == NotionDump.ID_LEN:
                common_op.debug_log("### page id " + page_id + " is " + id_type)
                common_op.add_new_child_page(
                    self.child_pages,
                    key_id=page_id + "_" + text_str,
                    link_id=page_id,
                    link_src=text_url,
                    page_type=id_type,
                    page_name=text_str
                )
                # The page is stored and waits for further recursive handling.
                # Log the stored child page.
                common_op.debug_log("child_page_parser add page id = " + page_id + "_" + text_str, level=NotionDump.DUMP_MODE_DEFAULT)
                text_str = content_format.get_page_format_md(page_id + "_" + text_str, text_str,
                                                             export_child=self.export_child)
            else:
                # Not a well-formed id: emit a link with an empty target.
                text_str = content_format.get_url_format("", text_str)

    if parser_type == NotionDump.PARSER_TYPE_MD:
        # Apply the annotations (bold, italic, ...) to the text.
        return self.__annotations_parser(block_handle["annotations"], text_str)
    else:
        return text_str
115 |
def __text_block_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False):
    """Render one rich-text item by dispatching on its ``type``.

    Supported types are ``text``, ``equation`` and ``mention``; any other
    type is logged and rendered as ``""``.

    :param block_handle: rich-text item to render.
    :param parser_type: output flavour (plain or Markdown).
    :param is_db_title: forwarded to the text and mention parsers so that
        database titles skip link/child-page handling.
    :return: the rendered text.
    """
    item_type = block_handle["type"]
    paragraph_ret = ""
    if item_type == "text":
        # Fixed: is_db_title used to be dropped here, so __text_parser never
        # saw it and always ran its link handling for database titles.
        paragraph_ret = self.__text_parser(block_handle, parser_type, is_db_title=is_db_title)
    elif item_type == "equation":
        paragraph_ret = self.__equation_inline_parser(block_handle)
    elif item_type == "mention":
        paragraph_ret = self.__mention_parser(block_handle, parser_type, is_db_title=is_db_title)
    else:
        common_op.debug_log(
            "text type " + block_handle["type"] + " error! parent_id= " + self.base_id,
            level=NotionDump.DUMP_MODE_DEFAULT)
    return paragraph_ret
129 |
def __text_list_parser(self, text_list, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db=False, is_db_title=False):
    """Render a list of rich-text items and concatenate the results.

    :param text_list: iterable of rich-text items, or ``None`` (gives ``""``).
    :param parser_type: output flavour (plain or Markdown).
    :param is_db: when true, ``|`` is escaped as ``\\|`` in the result.
    :param is_db_title: forwarded to the per-item parser.
    :return: the concatenated text.
    """
    fragments = []
    if text_list is not None:
        for item in text_list:
            fragments.append(self.__text_block_parser(item, parser_type, is_db_title=is_db_title))
    joined = "".join(fragments)
    if not is_db:
        return joined
    # Database content: escape the pipe character.
    return joined.replace("|", "\\|")
140 |
141 | # TODO : people只获取了名字和ID,后续可以做深度解析用户相关内容
def __people_parser(self, block_handle):
    """Return a display string for a user object.

    Only the name and the id are used (a deeper parse of the user data is
    still a TODO).

    :param block_handle: dict whose ``object`` must be ``"user"``.
    :return: the user's name when one is available, otherwise the id with
        dashes removed; ``""`` when the object is not a user.
    """
    if block_handle["object"] != "user":
        common_op.debug_log("people type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    # Prefer the name.  A present-but-None name is treated as missing:
    # callers concatenate the result into a string, so None must not leak.
    user_name = block_handle.get("name")
    if user_name is not None:
        return user_name
    # Fall back to the id when no name is available.
    return block_handle["id"].replace('-', '')
151 |
def __user_parser(self, block_handle):
    """Render a ``user``-typed wrapper by parsing its ``user`` payload.

    :return: the rendered user, or ``""`` when the type is not ``"user"``.
    """
    if block_handle["type"] == "user":
        return self.__people_parser(block_handle["user"])
    common_op.debug_log("user type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
158 |
def __db_file_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render one entry of a database ``files`` property.

    The file is registered in ``self.child_pages`` (page type ``"file"``)
    and rendered as a file link.

    :param block_handle: file entry; ``type`` must be ``"file"`` and
        ``name`` and ``file.url`` are read.
    :param parser_type: output flavour (plain or Markdown).
    :return: the rendered file reference, or ``""`` on a type mismatch.
    """
    if block_handle["type"] != "file":
        common_op.debug_log("file type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    filename = block_handle["name"]
    file_url = block_handle["file"]["url"]

    # Derive the file id: it is the second-to-last path segment of the URL
    # (dashes removed).
    url_prefix = file_url[0:file_url.rfind("/")]
    file_id = url_prefix[url_prefix.rfind("/") + 1:].replace('-', '')
    common_op.debug_log("file id is : " + file_id)

    if filename == "":
        # A file without a name uses its id as the default name.
        filename = file_id
    common_op.add_new_child_page(
        self.child_pages,
        key_id=file_id,
        link_src=file_url,
        page_type="file",
        page_name=filename
    )
    common_op.debug_log(
        "file_parser add page id = " + file_id + " name : " + filename, level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)

    # Simple formatting (could also be converted to the Markdown []() form).
    if parser_type == NotionDump.PARSER_TYPE_MD:
        # Render the file as a file link.
        return content_format.get_file_format_md(filename, file_url, file_id, self.export_child)
    else:
        return content_format.get_file_format_plain(filename, file_url)
193 |
194 | # "$ equation_inline $"
def __equation_inline_parser(self, block_handle):
    """Render an inline equation item from its ``plain_text``.

    Rich-text annotations are deliberately not applied to equations.

    :return: the formatted equation, or ``""`` on a type mismatch.
    """
    if block_handle["type"] == "equation":
        return content_format.get_equation_inline(block_handle["plain_text"])
    common_op.debug_log("equation inline type error! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
204 |
205 | # "$$ equation_block $$"
def __equation_block_parser(self, block_handle):
    """Render a block equation from its ``expression``.

    :return: the formatted equation, or ``""`` when the expression is None.
    """
    expression = block_handle["expression"]
    if expression is not None:
        return content_format.get_equation_block(expression)
    common_op.debug_log("equation block no expression! id=" + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
211 |
212 | # Attention!!! 关于链接到其它的Page可能需要递归处理
def __page_parser(self, block_handle):
    """Return the dash-less id of a ``page``-typed wrapper.

    Linked pages may need recursive handling by the caller.

    :return: the page id, or ``""`` when the type is not ``"page"``.
    """
    if block_handle["type"] == "page":
        return block_handle["page"]["id"].replace('-', '')
    common_op.debug_log("page type error! parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
220 |
221 | # 提及到其它页面,日期,用户
def __mention_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN, is_db_title=False):
    """Render a ``mention`` rich-text item (date, user, link preview, page or database).

    Page and database mentions are handled like linked pages: unless
    ``is_db_title`` is set they are registered in ``self.child_pages`` under
    the key ``<id>_mention`` and, in Markdown mode, rendered as a page link.

    :param block_handle: rich-text item; ``mention``, ``plain_text``,
        ``href`` and (in Markdown mode) ``annotations`` are read.
    :param parser_type: output flavour (plain or Markdown).
    :param is_db_title: when true, page/database mentions render as their
        plain name and nothing is registered.
    :return: the rendered mention, or ``""`` on a type mismatch.
    """
    if block_handle["type"] != "mention":
        common_op.debug_log("mention type error! parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    mention_body = block_handle["mention"]
    mention_type = mention_body["type"]
    mention_plain = ""
    if mention_type == "date":
        mention_plain = self.date_parser(mention_body)
    elif mention_type == "user":
        mention_plain = self.__user_parser(mention_body)
    elif mention_type == "link_preview" and "url" in mention_body["link_preview"].keys():
        mention_plain = mention_body["link_preview"]["url"]
    elif mention_type in ("database", "page"):
        # Both kinds share one code path; only the id lookup differs.
        if mention_type == "database":
            target_id = mention_body["database"]["id"].replace('-', '')
        else:
            target_id = self.__page_parser(mention_body)
        key_id = target_id + "_mention"
        common_op.debug_log("__mention_parser add " + mention_type + " id = " + target_id)
        # Name and link of the mentioned object.
        target_name = block_handle["plain_text"]
        target_link = block_handle["href"]

        if is_db_title:
            mention_plain = target_name
        else:
            # A mention is treated like a link to the page/database.
            common_op.add_new_child_page(
                self.child_pages,
                key_id=key_id,
                link_id=target_id,
                link_src=target_link,
                page_type=mention_type,
                page_name=target_name
            )
            # The label used to read "file_parser" (copy-paste leftover).
            common_op.debug_log(
                "__mention_parser add page id = " + key_id + " name : " + target_name,
                level=NotionDump.DUMP_MODE_DEFAULT)
            common_op.debug_log(internal_var.PAGE_DIC)
            common_op.debug_log("#############")
            common_op.debug_log(self.child_pages)

            if parser_type == NotionDump.PARSER_TYPE_MD:
                mention_plain = content_format.get_page_format_md(key_id, target_name, export_child=self.export_child)
            else:
                mention_plain = target_name
    else:
        common_op.debug_log("unknown mention type " + mention_body["type"], level=NotionDump.DUMP_MODE_DEFAULT)

    if parser_type == NotionDump.PARSER_TYPE_MD:
        # Apply the annotations to the formatted mention.
        return self.__annotations_parser(block_handle["annotations"],
                                         content_format.get_mention_format(mention_plain))
    else:
        return content_format.get_mention_format(mention_plain)
302 |
def __table_row_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render every cell of a ``table_row`` block.

    :return: a list with one rendered string per cell, or ``""`` on a type
        mismatch.
    """
    if block_handle["type"] != "table_row":
        common_op.debug_log("table_row type error! parent_id= " + self.base_id, level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    cells = block_handle["table_row"]["cells"]
    return [self.__text_list_parser(cell, parser_type) for cell in cells]
312 |
313 | # 数据库 title
def title_parser(self, block_handle, page_id, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a database ``title`` property.

    With a non-empty ``page_id`` and a non-empty title the page is
    registered in ``self.child_pages`` and the title is replaced by the
    database-title placeholder format so it can be relocated later.

    :param block_handle: property value whose ``type`` must be ``"title"``.
    :param page_id: id of the row's page; ``""`` means "content only".
    :param parser_type: output flavour (plain or Markdown).
    :return: the rendered title, or ``""`` on a type mismatch.
    """
    if block_handle["type"] != "title":
        common_op.debug_log("title type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    title_text = self.__text_list_parser(block_handle["title"], parser_type, is_db=True, is_db_title=True)
    # No page id (caller only wants the text) or nothing to link: done.
    if page_id == "" or title_text == "":
        return title_text

    # Queue the row's page for parsing.
    common_op.debug_log("title ret = " + title_text)
    if parser_type == NotionDump.PARSER_TYPE_PLAIN:
        common_op.debug_log("title_parser add page id = " + page_id)
    else:
        common_op.debug_log("title_parser add page id = " + page_id, level=NotionDump.DUMP_MODE_DEFAULT)
    # Every row of a database is a child page.
    common_op.add_new_child_page(self.child_pages, key_id=page_id, page_name=title_text)

    # Emit a placeholder that can be relocated afterwards.
    return content_format.get_database_title_format(page_id, title_text, self.export_child)
337 |
338 | # 数据库 rich_text
def rich_text_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a database ``rich_text`` property (pipes are escaped).

    :return: the rendered text, or ``""`` on a type mismatch.
    """
    if block_handle["type"] == "rich_text":
        return self.__text_list_parser(block_handle["rich_text"], parser_type, is_db=True)
    common_op.debug_log("rich_text type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                        level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
345 |
346 | # 数据库 multi_select
def multi_select_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a database ``multi_select`` property as a comma-separated list.

    :param block_handle: property value whose ``type`` must be
        ``"multi_select"``; each option's ``name`` is used.
    :param parser_type: output flavour (plain or Markdown).
    :return: the option names joined by ``","``; ``""`` when there are no
        options or on a type mismatch.
    """
    if block_handle["type"] != "multi_select":
        common_op.debug_log("multi_select type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    multi_select_list = block_handle["multi_select"]
    ret_str = ""
    if multi_select_list is None:
        return ret_str
    for multi_select in multi_select_list:
        if ret_str != "":
            ret_str += ","  # options are separated by ","
        if parser_type == NotionDump.PARSER_TYPE_MD:
            # NOTE(review): in Markdown mode the name is only padded with
            # spaces here; styling markup around it may have been lost -
            # confirm against the upstream file.
            ret_str += " " + multi_select["name"] + " "
        else:
            ret_str += multi_select["name"]
    return ret_str
366 |
367 | # 数据库 select
def select_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a database ``select`` property.

    :param block_handle: property value whose ``type`` must be ``"select"``;
        the selected option's ``name`` is used.
    :param parser_type: output flavour (plain or Markdown).
    :return: the option name; ``""`` when nothing is selected or on a type
        mismatch.
    """
    if block_handle["type"] != "select":
        common_op.debug_log("select type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    select = block_handle["select"]
    ret_str = ""
    if select is None:
        return ret_str
    if parser_type == NotionDump.PARSER_TYPE_MD:
        # NOTE(review): in Markdown mode the name is only padded with spaces
        # here; styling markup around it may have been lost - confirm
        # against the upstream file.
        ret_str = " " + select["name"] + " "
    else:
        ret_str = select["name"]
    return ret_str
384 |
385 | # 数据库 url
def url_parser(self, block_handle):
    """Render a database ``url`` property through the URL formatter.

    A ``None`` URL is formatted as the empty string.

    :return: the formatted URL, or ``""`` on a type mismatch.
    """
    if block_handle["type"] != "url":
        common_op.debug_log("url type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    raw_url = block_handle["url"]
    return content_format.get_url_format("" if raw_url is None else raw_url)
395 |
396 | # 数据库 email
def email_parser(self, block_handle):
    """Render a database ``email`` property.

    :return: the e-mail value; ``""`` when it is None or on a type mismatch.
    """
    if block_handle["type"] != "email":
        common_op.debug_log("email type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    address = block_handle["email"]
    return "" if address is None else address
407 |
408 | # 数据库 checkbox
def checkbox_parser(self, block_handle):
    """Render a database ``checkbox`` property.

    :return: ``NotionDump.MD_BOOL_TRUE`` when the value is exactly ``True``,
        otherwise ``NotionDump.MD_BOOL_FALSE``; ``""`` on a type mismatch.
    """
    if block_handle["type"] != "checkbox":
        common_op.debug_log("checkbox type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    is_checked = block_handle["checkbox"] is True
    return NotionDump.MD_BOOL_TRUE if is_checked else NotionDump.MD_BOOL_FALSE
420 |
421 | # 数据库 phone_number
def phone_number_parser(self, block_handle):
    """Render a database ``phone_number`` property.

    :return: the phone number; ``""`` when it is None or on a type mismatch.
    """
    if block_handle["type"] != "phone_number":
        common_op.debug_log("phone_number type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    phone = block_handle["phone_number"]
    return "" if phone is None else phone
432 |
433 | # 数据库 date
def date_parser(self, block_handle):
    """Render a database ``date`` property from its ``start``/``end`` values.

    :return: the formatted date; ``""`` when the date is None or on a type
        mismatch.
    """
    if block_handle["type"] != "date":
        common_op.debug_log("date type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    date_value = block_handle["date"]
    if date_value is not None:
        return content_format.get_date_format(date_value["start"], date_value["end"])
    return ""
443 |
444 | # 数据库 people
def people_parser(self, block_handle):
    """Render a database ``people`` property as a comma-separated list.

    :return: the rendered users joined by ``","``; ``""`` when the list is
        None/empty or on a type mismatch.
    """
    if block_handle["type"] != "people":
        common_op.debug_log("people type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    members = block_handle["people"]
    if members is None:
        return ""
    joined = ""
    for member in members:
        # A separator is only added once something has been emitted.
        separator = "," if joined != "" else ""
        joined = joined + separator + self.__people_parser(member)
    return joined
459 |
460 | # 数据库 number
def number_parser(self, block_handle):
    """Render a database ``number`` property as a string.

    :return: ``str(number)``; ``""`` when the number is None or on a type
        mismatch.
    """
    if block_handle["type"] != "number":
        common_op.debug_log("number type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    value = block_handle["number"]
    return "" if value is None else str(value)
472 |
473 | # 数据库 files
def files_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a database ``files`` property.

    :param block_handle: property value whose ``type`` must be ``"files"``.
    :param parser_type: output flavour (plain or Markdown).
    :return: the rendered files, separated by ``"<br>"`` in Markdown mode
        and by ``","`` otherwise; ``""`` when the list is None/empty or on a
        type mismatch.
    """
    if block_handle["type"] != "files":
        common_op.debug_log("files type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    files_list = block_handle["files"]
    ret_str = ""
    if files_list is None:
        return ret_str
    # The Markdown separator must be the literal "<br>" tag: a raw line
    # break inside the quotes is not valid Python, and would also split the
    # rendered value across lines.
    separator = "<br>" if parser_type == NotionDump.PARSER_TYPE_MD else ","
    for file in files_list:
        if ret_str != "":
            ret_str += separator
        ret_str += self.__db_file_parser(file, parser_type)
    return ret_str
491 |
492 | # 数据库 relation 数据
def relation_parser(self, block_handle):
    """Render a database ``relation`` property.

    Every related page is registered in ``self.child_pages`` under the key
    ``<id>_relation`` (handled like a soft link) and rendered with the
    database-title placeholder format.

    :return: the placeholders joined by ``","``; ``""`` on a type mismatch.
    """
    if block_handle["type"] != "relation":
        common_op.debug_log("relation type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    rendered = ""
    for item in block_handle["relation"]:
        target_id = item["id"].replace("-", "")
        key_id = target_id + "_relation"
        # Treat the relation like a soft link to the page.
        common_op.add_new_child_page(
            self.child_pages,
            key_id=key_id,
            link_id=target_id,
            page_type="page",
            page_name=""
        )
        if rendered != "":
            rendered += ","
        rendered += content_format.get_database_title_format(key_id, "", self.export_child)
    return rendered
514 |
515 | # 数据库 formula 数据
def formula_parser(self, block_handle):
    """Render a database ``formula`` property according to its result type.

    Supported result types are ``string``, ``number``, ``boolean`` and
    ``date``; any other type renders as ``[unknown_formula_type:<type>]``.

    :param block_handle: property value whose ``type`` must be ``"formula"``.
    :return: the rendered result as a string; ``""`` for a None string or
        number result and on a type mismatch.
    """
    if block_handle["type"] != "formula":
        common_op.debug_log("formula type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    formula_block = block_handle["formula"]
    formula_type = formula_block["type"]
    if formula_type == "string":
        # An empty result must stay a string for the callers.
        value = formula_block["string"]
        ret_str = "" if value is None else value
    elif formula_type == "number":
        # Same convention as number_parser: no value renders as "" rather
        # than the text "None".
        value = formula_block["number"]
        ret_str = "" if value is None else str(value)
    elif formula_type == "boolean":
        if formula_block["boolean"] is True:
            ret_str = NotionDump.MD_BOOL_TRUE
        else:
            ret_str = NotionDump.MD_BOOL_FALSE
    elif formula_type == "date":
        # The formula payload has the same type/date layout date_parser reads.
        ret_str = self.date_parser(formula_block)
    else:
        ret_str = "[unknown_formula_type:" + formula_type + "]"
    return ret_str
538 |
539 | # 数据库 created_time
def created_time_parser(self, block_handle):
    """Render a database ``created_time`` property.

    :return: the formatted timestamp, or ``""`` on a type mismatch.
    """
    if block_handle["type"] == "created_time":
        return format_date_or_time(block_handle["created_time"])
    common_op.debug_log("created_time type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                        level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
546 |
547 | # 数据库 last_edited_time
def last_edited_time_parser(self, block_handle):
    """Render a database ``last_edited_time`` property.

    :return: the formatted timestamp, or ``""`` on a type mismatch.
    """
    if block_handle["type"] == "last_edited_time":
        return format_date_or_time(block_handle["last_edited_time"])
    common_op.debug_log(
        "last_edited_time type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
        level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
555 |
def created_by_parser(self, block_handle):
    """Render a database ``created_by`` property as a user.

    :return: the rendered user, or ``""`` on a type mismatch.
    """
    if block_handle["type"] == "created_by":
        return self.__people_parser(block_handle["created_by"])
    common_op.debug_log("created_by type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                        level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
562 |
563 | # 数据库 last_edited_by
def last_edited_by_parser(self, block_handle):
    """Render a database ``last_edited_by`` property as a user.

    :return: the rendered user, or ``""`` on a type mismatch.
    """
    if block_handle["type"] == "last_edited_by":
        return self.__people_parser(block_handle["last_edited_by"])
    common_op.debug_log(
        "last_edited_by type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
        level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
571 |
572 | # Page paragraph
573 | # mention
574 | # date
575 | # user
576 | # page
577 | # text
578 | # equation
def paragraph_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a page ``paragraph`` block from its rich text.

    :return: the rendered text, or ``""`` on a type mismatch.
    """
    if block_handle["type"] == "paragraph":
        return self.__text_list_parser(block_handle["paragraph"]["rich_text"], parser_type)
    common_op.debug_log("paragraph type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                        level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
586 |
587 | # Page heading_1
def heading_1_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a page ``heading_1`` block; Markdown output is prefixed with ``"# "``.

    :return: the rendered heading, or ``""`` on a type mismatch.
    """
    if block_handle["type"] != "heading_1":
        common_op.debug_log("heading_1 type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    heading_text = self.__text_list_parser(block_handle["heading_1"]["rich_text"], parser_type)
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return heading_text
    return "# " + heading_text
599 |
600 | # Page heading_2
def heading_2_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a page ``heading_2`` block; Markdown output is prefixed with ``"## "``.

    :return: the rendered heading, or ``""`` on a type mismatch.
    """
    if block_handle["type"] != "heading_2":
        common_op.debug_log("heading_2 type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    heading_text = self.__text_list_parser(block_handle["heading_2"]["rich_text"], parser_type)
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return heading_text
    return "## " + heading_text
613 |
614 | # Page heading_3
def heading_3_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a page ``heading_3`` block; Markdown output is prefixed with ``"### "``.

    :return: the rendered heading, or ``""`` on a type mismatch.
    """
    if block_handle["type"] != "heading_3":
        common_op.debug_log("heading_3 type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    heading_text = self.__text_list_parser(block_handle["heading_3"]["rich_text"], parser_type)
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return heading_text
    return "### " + heading_text
627 |
628 | # Page to_do
def to_do_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Parse a ``to_do`` block.

    In Markdown mode the text is prefixed with ``"- [x] "`` or ``"- [ ] "``
    depending on the block's ``checked`` flag; otherwise the bare text is
    returned. A block of the wrong type is logged and yields "".
    """
    if block_handle["type"] != "to_do":
        common_op.debug_log(
            "to_do type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    text = self.__text_list_parser(block_handle["to_do"]["rich_text"], parser_type)
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return text

    # The checked flag is only consulted for Markdown output.
    checkbox = "- [x] " if block_handle["to_do"]["checked"] else "- [ ] "
    return checkbox + text
644 |
645 | # Page bulleted_list_item
def bulleted_list_item_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Parse a ``bulleted_list_item`` block.

    In Markdown mode the text is prefixed with ``"- "``; otherwise the bare
    text is returned. A block of the wrong type is logged and yields "".
    """
    if block_handle["type"] != "bulleted_list_item":
        common_op.debug_log(
            "bulleted_list_item type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    item_text = self.__text_list_parser(block_handle["bulleted_list_item"]["rich_text"], parser_type)
    return "- " + item_text if parser_type == NotionDump.PARSER_TYPE_MD else item_text
659 |
660 | # Page numbered_list_item
def numbered_list_item_parser(self, block_handle, list_index, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Parse a ``numbered_list_item`` block.

    In Markdown mode the text is prefixed with ``"<list_index>. "``;
    otherwise the bare text is returned. A block of the wrong type is logged
    and yields "".
    """
    if block_handle["type"] != "numbered_list_item":
        common_op.debug_log(
            "numbered_list_item type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    item_text = self.__text_list_parser(block_handle["numbered_list_item"]["rich_text"], parser_type)
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return item_text
    return str(list_index) + ". " + item_text
674 |
675 | # Page toggle
def toggle_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Parse a ``toggle`` block.

    In Markdown mode the toggle is emitted like a bullet (``"- "`` prefix);
    otherwise the bare text is returned. A block of the wrong type is logged
    and yields "".
    """
    if block_handle["type"] != "toggle":
        common_op.debug_log(
            "toggle type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    summary = self.__text_list_parser(block_handle["toggle"]["rich_text"], parser_type)
    return "- " + summary if parser_type == NotionDump.PARSER_TYPE_MD else summary
688 |
689 | # Page divider
def divider_parser(self, block_handle):
    """Return ``NotionDump.MD_DIVIDER`` for a ``divider`` block.

    A block of the wrong type is logged and yields "".
    """
    if block_handle["type"] == "divider":
        return NotionDump.MD_DIVIDER

    common_op.debug_log(
        "divider type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
        level=NotionDump.DUMP_MODE_DEFAULT)
    return ""
698 |
699 | # Page callout
def callout_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Parse a ``callout`` block.

    In Markdown mode the text is prefixed with ``"> "``; otherwise the bare
    text is returned. A block of the wrong type is logged and yields "".
    """
    if block_handle["type"] != "callout":
        common_op.debug_log(
            "callout type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    body = self.__text_list_parser(block_handle["callout"]["rich_text"], parser_type)
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return body
    # Open question from the original author: should every line get the prefix?
    return "> " + body
713 |
714 | # Page code
def code_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Parse a ``code`` block.

    In Markdown mode the text is wrapped in a fenced code block tagged with
    the block's ``language`` (empty when the language is ``None``);
    otherwise the bare text is returned. A block of the wrong type is logged
    and yields "".
    """
    if block_handle["type"] != "code":
        common_op.debug_log(
            "code type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    source = self.__text_list_parser(block_handle["code"]["rich_text"], parser_type)

    # The language is read in both modes, as in the original flow.
    language = block_handle["code"]["language"]
    if language is None:
        language = ""

    if parser_type != NotionDump.PARSER_TYPE_MD:
        return source
    # Open question from the original author: should every line be processed?
    return "```" + language + "\n" + source + "\n```"
732 |
733 | # Page quote
def quote_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Parse a ``quote`` block, applying its block-level colour.

    When the block colour is not ``"default"`` the text is wrapped: with
    ``NotionDump.MD_HIGHLIGHT`` when ``NotionDump.S_THEME_TYPE`` is
    ``"markdown"``, otherwise with an HTML background/foreground colour tag.
    In Markdown mode the result is prefixed with ``"> "``. A block of the
    wrong type is logged and yields "".
    """
    if block_handle["type"] != "quote":
        common_op.debug_log(
            "quote type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    quote_ret = self.__text_list_parser(block_handle["quote"]["rich_text"], parser_type)

    # Outermost (block-level) colour.
    color = block_handle["quote"]["color"]
    if color != "default":
        if NotionDump.S_THEME_TYPE == "markdown":
            # Render every colour with the default Markdown highlight.
            quote_ret = NotionDump.MD_HIGHLIGHT + quote_ret + NotionDump.MD_HIGHLIGHT
        elif color.find("_background") != -1:
            # "<name>_background" -> background colour "<name>".
            # NOTE(review): the HTML wrappers in this branch and the next were
            # lost from the source (it read `"" + quote_ret + ""`, leaving
            # bg_color unused); the markup below is a reconstruction - confirm
            # against the intended theme output.
            bg_color = color[0:color.rfind('_')]
            quote_ret = "<span style=\"background-color:" + bg_color + "\">" + quote_ret + "</span>"
        else:
            quote_ret = "<font color=\"" + color + "\">" + quote_ret + "</font>"

    if parser_type == NotionDump.PARSER_TYPE_MD:
        # Open question from the original author: should every line get the prefix?
        return "> " + quote_ret
    return quote_ret
761 |
762 | # Page equation
def equation_parser(self, block_handle):
    """Parse an ``equation`` block via the equation-block parser.

    A block of the wrong type is logged and yields "".
    """
    if block_handle["type"] != "equation":
        # The message now names the block kind, matching the sibling parsers
        # (it previously read " type error!").
        common_op.debug_log(
            "equation type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""
    return self.__equation_block_parser(block_handle["equation"])
770 |
771 | # Page table_row
def table_row_parser(self, block_handle, first_row=False, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``table_row`` block as one Markdown table line.

    Each cell is emitted between ``|`` separators with embedded newlines
    replaced by ``<br>`` so the row stays on a single line. When
    ``first_row`` is true a ``| --- |`` separator line is appended so the
    row acts as the table header. A block of the wrong type is logged and
    yields "".
    """
    if block_handle["type"] != "table_row":
        common_op.debug_log(
            "table_row type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    cells = self.__table_row_parser(block_handle, parser_type)
    # The '<br>' literal had been stripped from the source, leaving a string
    # broken across two lines; it is restored here.
    row = "|" + "".join(cell.replace('\n', '<br>') + "|" for cell in cells)
    if first_row:
        row += "\n|" + " --- |" * len(cells)
    return row
789 |
def child_page_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Return a link placeholder for a ``child_page`` block.

    An untitled page yields a "NULL Page" placeholder and is not recorded.
    A titled page is recorded in ``self.child_pages`` under its dash-free id
    and a placeholder for it is returned. A block of the wrong type is
    logged and yields "".
    """
    if block_handle["type"] != "child_page":
        common_op.debug_log(
            "child_page type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    title = block_handle["child_page"]["title"]
    as_markdown = parser_type == NotionDump.PARSER_TYPE_MD

    if title == "":
        if as_markdown:
            return content_format.get_page_format_md("NULL Page", "NULL Page", export_child=self.export_child)
        return content_format.get_page_format_plain("NULL Page")

    page_id = block_handle["id"].replace('-', '')

    # Record the child page.
    common_op.debug_log("child_page_parser add page id = " + page_id, level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.add_new_child_page(self.child_pages, key_id=page_id, page_name=title)

    if as_markdown:
        return content_format.get_page_format_md(page_id, title, export_child=self.export_child)
    return content_format.get_page_format_plain(title)
814 |
815 | # Page child_database
def child_database_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Record a ``child_database`` block and return a link placeholder for it.

    The database itself is not parsed here; it is only registered in
    ``self.child_pages`` with ``page_type="database"``. A block of the wrong
    type is logged and yields "".
    """
    if block_handle["type"] != "child_database":
        common_op.debug_log(
            "child_database type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    # Child databases are stored in the page table and not parsed.
    db_id = block_handle["id"].replace('-', '')
    db_title = block_handle["child_database"]["title"]
    common_op.add_new_child_page(
        self.child_pages,
        key_id=db_id,
        page_type="database",
        page_name=db_title
    )
    common_op.debug_log(
        "child_database_parser add page id = " + db_id + "name : " + db_title,
        level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)

    # Return a link placeholder that a later stage resolves.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_page_format_plain(db_title)
    return content_format.get_page_format_md(
        db_id,
        db_title,
        export_child=self.export_child
    )
845 |
846 | # Page image
def image_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Record an ``image`` block and return a link placeholder for it.

    The caption is used as the image name, falling back to the dash-free
    block id. The URL is read from the sub-object named by the image's
    ``type`` field; a missing URL is logged but the image is still recorded
    with an empty ``link_src``. A block of the wrong type is logged and
    yields "".
    """
    if block_handle["type"] != "image":
        common_op.debug_log(
            "image type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    # Images are stored in the page table; their content is not parsed here.
    image_id = block_handle["id"].replace('-', '')
    image_name = self.__text_list_parser(block_handle["image"]["caption"], parser_type)

    image_url = ""
    source_kind = block_handle["image"]["type"]
    if source_kind in block_handle["image"].keys() \
            and "url" in block_handle["image"][source_kind].keys():
        image_url = block_handle["image"][source_kind]["url"]
    if image_url == "":
        common_op.debug_log("unknown image type" + block_handle["image"]["type"],
                            level=NotionDump.DUMP_MODE_DEFAULT)

    if image_name == "":
        # No caption: use the id as the default name.
        image_name = image_id

    common_op.add_new_child_page(
        self.child_pages,
        key_id=image_id,
        link_src=image_url,
        page_type="image",
        page_name=image_name
    )

    common_op.debug_log(
        "image_parser add page id = " + image_id + "name : " + image_name,
        level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)

    # Return a link placeholder that a later stage resolves.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_page_format_plain(image_name)
    return content_format.get_page_format_md(
        image_id,
        image_name,
        export_child=self.export_child
    )
890 |
891 | # Page file(file,pdf,video)
def file_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Record a ``file`` / ``pdf`` / ``video`` block and return a link placeholder.

    The caption is used as the file name; when it is empty the name is taken
    from the last path segment of the URL (query string removed,
    percent-decoded), falling back to ``"FILE"``. A block with no URL is
    logged and yields "". A block of any other type is logged and yields "".
    """
    block_type = block_handle["type"]
    if block_type not in ("file", "pdf", "video"):
        common_op.debug_log("file type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    file_id = block_handle["id"].replace('-', '')
    file_name = self.__text_list_parser(block_handle[block_type]["caption"], parser_type)
    file_url = ""
    file_type = block_handle[block_type]["type"]
    if file_type in block_handle[block_type].keys():
        if "url" in block_handle[block_type][file_type].keys():
            file_url = block_handle[block_type][file_type]["url"]
    if file_url == "":
        common_op.debug_log("unknown block type" + block_handle[block_type]["type"] + " with null url",
                            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    # No caption: try to split a file name out of the URL.
    if file_name == "":
        # Strip the query string. The previous slice `url[0:url.rfind('?')]`
        # chopped the final character whenever the URL had no '?' (rfind
        # returns -1); rpartition leaves such URLs intact.
        head, sep, _ = file_url.rpartition('?')
        file_url_basic = head if sep else file_url
        file_name = file_url_basic[file_url_basic.rfind('/') + 1:]
        # Names taken from the URL are percent-encoded and need decoding.
        file_name = unquote(file_name, 'utf-8')
    if file_name == "":
        # Still nothing usable: fall back to a fixed default name.
        file_name = "FILE"

    common_op.add_new_child_page(
        self.child_pages,
        key_id=file_id,
        link_src=file_url,
        page_type="file",
        page_name=file_name
    )

    common_op.debug_log(
        "file_parser add page id = " + file_id + " name : " + file_name, level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)

    # Return a link placeholder that a later stage resolves.
    if parser_type == NotionDump.PARSER_TYPE_MD:
        return content_format.get_page_format_md(
            file_id,
            file_name,
            export_child=self.export_child
        )
    return content_format.get_page_format_plain(file_name)
942 |
943 | # Page bookmark
def bookmark_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``bookmark`` block as a named link.

    The caption is the link name, defaulting to ``"BOOKMARK"``. A block of
    the wrong type is logged and yields "".
    """
    if block_handle["type"] != "bookmark":
        common_op.debug_log(
            "bookmark type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    label = self.__text_list_parser(block_handle["bookmark"]["caption"], parser_type)
    if label == "":
        label = "BOOKMARK"
    target = block_handle["bookmark"]["url"]

    # Bookmarks are emitted in the file-link format.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_file_format_plain(label, target)
    return content_format.get_file_format_md(label, target)
961 |
962 | # Page embed
def embed_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render an ``embed`` block as a named link.

    The caption is the link name, defaulting to ``"EMBED"``. A block of the
    wrong type is logged and yields "".
    """
    if block_handle["type"] != "embed":
        common_op.debug_log(
            "embed type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    label = self.__text_list_parser(block_handle["embed"]["caption"], parser_type)
    if label == "":
        label = "EMBED"
    target = block_handle["embed"]["url"]

    # Embeds are emitted in the file-link format.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_file_format_plain(label, target)
    return content_format.get_file_format_md(label, target)
980 |
981 | # Page link_preview
def link_preview_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Render a ``link_preview`` block as a link named ``"LINK_PREVIEW"``.

    A block of the wrong type is logged and yields "".
    """
    if block_handle["type"] != "link_preview":
        common_op.debug_log(
            "link_preview type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    label = "LINK_PREVIEW"
    target = block_handle["link_preview"]["url"]

    # Link previews are emitted in the file-link format.
    if parser_type != NotionDump.PARSER_TYPE_MD:
        return content_format.get_file_format_plain(label, target)
    return content_format.get_file_format_md(label, target)
997 |
998 | # Page link_to_page
def link_to_page_parser(self, block_handle, parser_type=NotionDump.PARSER_TYPE_PLAIN):
    """Record a ``link_to_page`` block and return a link placeholder for it.

    Only links of type ``page_id`` are handled: the target is registered in
    ``self.child_pages`` under ``"<page_id>_link_page"`` with an empty name
    and ``link_id`` set to the target page id. Any other link type is logged
    and yields "". A block of the wrong type is logged and yields "".
    """
    if block_handle["type"] != "link_to_page":
        common_op.debug_log(
            "link_to_page type error! parent_id= " + self.base_id + " id= " + block_handle["id"],
            level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    link_page = block_handle["link_to_page"]
    if link_page["type"] != "page_id":
        common_op.debug_log("unknown type " + link_page["type"], level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    target_id = link_page["page_id"].replace('-', '')
    display_name = ""
    key_id = target_id + "_link_page"
    common_op.add_new_child_page(
        self.child_pages,
        key_id=key_id,
        link_id=target_id,
        page_type="page",
        page_name=display_name
    )
    common_op.debug_log(
        "link_to_page_parser add link_page key_id = " + key_id, level=NotionDump.DUMP_MODE_DEFAULT)
    common_op.debug_log(internal_var.PAGE_DIC)
    common_op.debug_log("#############")
    common_op.debug_log(self.child_pages)
    # NOTE(review): the Markdown placeholder is returned regardless of
    # parser_type, unlike the sibling parsers - confirm this is intended.
    return content_format.get_page_format_md(
        key_id,
        display_name,
        export_child=self.export_child
    )
1031 |
--------------------------------------------------------------------------------
/NotionDump/Parser/block_parser.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 |
5 | import os
6 |
7 | import NotionDump
8 | from NotionDump.Notion.Notion import NotionQuery
9 | from NotionDump.Parser.base_parser import BaseParser
10 | from NotionDump.utils import common_op
11 |
12 |
13 | # Block内容解析
class BlockParser:
    """Turns a list of Notion blocks into Markdown/plain text.

    Individual blocks are rendered by ``BaseParser``; this class walks the
    block tree, fetches children through ``query_handle`` and assembles the
    text with indentation and line separators.
    """

    # Block types whose children are fetched and parsed (whitelist).
    _RECURSIVE_TYPES = frozenset((
        "to_do", "numbered_list_item", "bulleted_list_item", "toggle",
        "table", "table_row", "column_list", "column", "synced_block",
        "heading_1", "heading_2", "heading_3", "paragraph", "quote", "callout",
    ))

    # Block types whose children are rendered at the parent's indent level.
    _SAME_INDENT_TYPES = frozenset((
        "heading_1", "heading_2", "heading_3", "paragraph", "quote", "callout",
    ))

    # Block type -> BaseParser method called as method(block, parser_type).
    _SIMPLE_PARSERS = {
        "paragraph": "paragraph_parser",
        "heading_1": "heading_1_parser",
        "heading_2": "heading_2_parser",
        "heading_3": "heading_3_parser",
        "to_do": "to_do_parser",
        "bulleted_list_item": "bulleted_list_item_parser",
        "toggle": "toggle_parser",
        "child_page": "child_page_parser",
        "child_database": "child_database_parser",
        "image": "image_parser",
        "file": "file_parser",
        "pdf": "file_parser",
        "video": "file_parser",
        "bookmark": "bookmark_parser",
        "embed": "embed_parser",
        "link_preview": "link_preview_parser",
        "link_to_page": "link_to_page_parser",
    }

    # Block types that are not parsed and are replaced by a fixed marker.
    _LITERAL_BLOCKS = {
        "table_of_contents": '[TOC]',
        "template": '[TEMPLATE]',
        "breadcrumb": "[breadcrumb]",
    }

    def __init__(
            self,
            block_id,
            query_handle: NotionQuery,
            parser_type=NotionDump.PARSER_TYPE_MD,
            export_child_pages=False
    ):
        """Store the configuration, ensure the temp dir exists, build the BaseParser.

        :param block_id: id of the block/page being parsed (dashes are removed)
        :param query_handle: handle used to retrieve child blocks
        :param parser_type: output flavour passed on to the BaseParser methods
        :param export_child_pages: whether child pages are exported (recursion)
        """
        self.block_id = block_id.replace('-', '')
        self.query_handle = query_handle
        self.parser_type = parser_type
        # Whether to export child pages, i.e. recurse.
        self.export_child_page = export_child_pages

        # Create the temporary directory. makedirs also creates missing
        # parents and tolerates another worker creating it in between.
        self.tmp_dir = NotionDump.TMP_DIR
        if not os.path.exists(self.tmp_dir):
            os.makedirs(self.tmp_dir, exist_ok=True)

        # Base parser that renders the individual blocks.
        self.base_parser = BaseParser(
            base_id=self.block_id,
            export_child=self.export_child_page
        )

    def get_child_pages_dic(self):
        """Return the child-page dictionary collected by the base parser."""
        return self.base_parser.get_child_pages_dic()

    def __get_children_block_list(self, block):
        """Fetch the child blocks of ``block``.

        :return: the non-empty list of child blocks, or ``None`` when the
                 block has no children, is not a type that is recursed into,
                 or nothing could be retrieved
        """
        # No children: nothing to fetch.
        if not block["has_children"]:
            return None

        block_type = block["type"]
        if block_type == 'child_page':
            return None

        # Recursion blacklist.
        if block_type == "template":
            common_op.debug_log("type " + block_type + " has no child, ignore", level=NotionDump.DUMP_MODE_DEFAULT)
            return None

        # Only whitelisted types are recursed into.
        if block_type not in self._RECURSIVE_TYPES:
            common_op.debug_log("[ISSUE] type " + block_type + " has no child", level=NotionDump.DUMP_MODE_DEFAULT)
            return None

        # A synced copy reads its children from the block it is synced from.
        if block_type == "synced_block" and block["synced_block"]["synced_from"] is not None:
            child_block_id = block["synced_block"]["synced_from"]["block_id"]
            common_op.debug_log("type synced_block " + child_block_id + " get child", level=NotionDump.DUMP_MODE_DEFAULT)
        else:
            child_block_id = block["id"]

        retrieve_ret = self.query_handle.retrieve_block_children(child_block_id, parent_id=self.block_id)
        block_list = retrieve_ret["results"] if retrieve_ret is not None else []

        # Nothing retrieved is treated the same as "no children".
        if not block_list:
            return None
        common_op.debug_log("## retrieve block " + child_block_id, level=NotionDump.DUMP_MODE_DEFAULT)
        return block_list

    def parser_block(self, block, list_index, last_line_is_table, prefix):
        """Render a single block (without its children) to text.

        :param block: the block dict; ``block["type"]`` selects the parser
        :param list_index: number used for ``numbered_list_item`` blocks
        :param last_line_is_table: passed as ``first_row`` to the table-row parser
        :param prefix: indent inserted after every newline inside code blocks
        :return: the rendered text, never ``None``
        """
        block_type = block["type"]
        base = self.base_parser

        if block_type in self._SIMPLE_PARSERS:
            block_text = getattr(base, self._SIMPLE_PARSERS[block_type])(block, self.parser_type)
        elif block_type in self._LITERAL_BLOCKS:
            # TOC, template content and breadcrumbs are not parsed.
            block_text = self._LITERAL_BLOCKS[block_type]
        elif block_type == "numbered_list_item":
            block_text = base.numbered_list_item_parser(block, list_index, self.parser_type)
        elif block_type == "divider":
            block_text = base.divider_parser(block)
        elif block_type == "callout":
            # Line breaks inside a callout use the HTML tag. The '<br>'
            # literal had been stripped from the source; it is restored here.
            block_text = base.callout_parser(block, self.parser_type).replace('\n', '<br>')
        elif block_type == "quote":
            block_text = base.quote_parser(block, self.parser_type).replace('\n', '<br>')
        elif block_type == "code":
            # Keep every line of the code block at the current indent.
            block_text = base.code_parser(block, self.parser_type).replace('\n', '\n' + prefix)
        elif block_type == "equation":
            block_text = base.equation_parser(block)
        elif block_type == "table":
            # A table has no text of its own; its rows are reached by recursion.
            block_text = ""
        elif block_type == "table_row":
            block_text = base.table_row_parser(
                block,
                first_row=last_line_is_table,
                parser_type=self.parser_type
            )
        else:
            common_op.debug_log("[ISSUE] unknown page block properties type:" + block_type,
                                level=NotionDump.DUMP_MODE_DEFAULT)
            block_text = "[unknown_type:" + block_type + "]"

        if block_text is None:
            block_text = ""
        return block_text

    def parser_block_list(self, block_list, indent=0, line_div="\n", last_block_type="none"):
        """Render a list of sibling blocks, recursing into their children.

        :param block_list: blocks to render, in order
        :param indent: nesting depth; one TAB of prefix per level
        :param line_div: separator between blocks; ``"\\n"`` for top-level
                         layout, ``"<br>"`` inside a quote/callout
        :param last_block_type: accepted for callers; not used in this method
        :return: the rendered text
        """
        # Indent only applies to real line breaks, not to <br>-joined content.
        prefix = "\t" * indent if line_div == "\n" else ""

        # Nested content starts on a fresh line.
        block_text = ""
        if indent != 0 and line_div == "\n":
            block_text = line_div

        last_type = "to_do"  # initial value that suppresses a leading newline
        list_index = 1

        # A table is parsed in one go, so this flag is never set back to True.
        last_line_is_table = True

        for block in block_list:
            block_type = block["type"]

            if block_type == "column_list":
                # Columns are flattened: every column's rows are parsed in turn.
                column_list = self.__get_children_block_list(block)
                if block_text == "\n":
                    # Only a lone newline so far: reset.
                    block_text = ""
                if column_list is not None:
                    for column in column_list:
                        column_rows = self.__get_children_block_list(column)
                        if column_rows is not None:
                            if block_text != "":
                                # Separate from what came before.
                                block_text += "\n"
                            block_text += self.parser_block_list(column_rows, indent)
                continue

            if block_type == "synced_block":
                # A synced block is replaced by its content.
                synced_block_list = self.__get_children_block_list(block)
                if block_text == "\n":
                    block_text = ""
                if synced_block_list is not None:
                    block_text += self.parser_block_list(
                        synced_block_list, indent, last_block_type="synced_block")
                continue

            # Consecutive blocks of certain types need no extra separator.
            if common_op.parser_newline(last_type, block_type) and block_text != "" and block_text != "\n":
                block_text += line_div

            # Running number for numbered lists.
            if last_type == "numbered_list_item":
                list_index = list_index + 1
            else:
                list_index = 1
            last_type = block_type

            if block_type != "table" and block_type != "table_row":
                block_text += prefix

            block_text += self.parser_block(
                block=block,
                list_index=list_index,
                last_line_is_table=last_line_is_table,
                prefix=prefix
            )

            # Parse the children of this block, if any.
            children_block_list = self.__get_children_block_list(block)
            # Quote/callout children are joined with the HTML line break (the
            # '<br>' literal had been stripped from the source).
            t_line_div = "<br>" if block_type in ("quote", "callout") else "\n"
            if children_block_list is None:
                block_text += "\n"
            elif block_type in self._SAME_INDENT_TYPES:
                # These types keep their children at the same indent.
                block_text += t_line_div
                block_text += self.parser_block_list(children_block_list, indent, line_div=t_line_div)
            else:
                block_text += self.parser_block_list(children_block_list, indent + 1)

            if block_type == "table_row":
                # Only the first row gets the header separator.
                last_line_is_table = False

        return block_text

    def block_to_md(self, block_handle, page_detail=None, new_id=None):
        """Render ``block_handle["results"]`` into a temporary ``.md`` file.

        :param block_handle: query result containing a ``results`` block list
        :param page_detail: optional text placed before the content,
                            followed by a divider
        :param new_id: optional id that replaces the current block id
        :return: path of the written file, or "" when there is nothing to write
        """
        block_list = block_handle["results"]
        # Empty content produces no file.
        if len(block_list) == 0 and (page_detail is None or page_detail == ""):
            return ""

        if new_id is not None:
            self.block_id = new_id.replace('-', '')
            self.base_parser.set_new_id(self.block_id)
        tmp_md_filename = self.tmp_dir + self.block_id + ".md"

        # Prepend the properties, if there are any.
        block_text = ""
        if page_detail is not None and page_detail != "":
            block_text = page_detail + "\n" + NotionDump.MD_DIVIDER + "\n"

        block_text += self.parser_block_list(block_list)

        # Parse first, then write inside a context manager so the handle is
        # closed even when writing fails and no file is left open (or empty)
        # when parsing raises.
        with open(tmp_md_filename, "w", encoding="utf-8", newline='') as file:
            file.write(block_text)

        common_op.debug_log("write file " + tmp_md_filename, level=NotionDump.DUMP_MODE_DEFAULT)
        # The caller does the further processing of the temporary file.
        return tmp_md_filename

    def block_to_json(self, block_json, json_name=None):
        """Dump the raw block JSON to a file (testing aid).

        Does nothing when ``block_json`` is ``None``. The file name defaults
        to ``<tmp_dir><block_id>.json``.
        """
        if block_json is None:
            return None

        if json_name is None:
            json_name = self.tmp_dir + self.block_id + ".json"
        common_op.save_json_to_file(block_json, json_name)
324 |
--------------------------------------------------------------------------------
/NotionDump/Parser/database_parser.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 |
5 | import csv
6 | import os
7 |
8 | import NotionDump
9 | from NotionDump.Parser.base_parser import BaseParser
10 | from NotionDump.utils import common_op
11 |
12 |
13 | class DatabaseParser:
def __init__(
        self,
        database_id,
        parser_type=NotionDump.PARSER_TYPE_PLAIN,
        export_child_pages=False
):
    """Store the configuration, ensure the temp dir exists, build the BaseParser.

    :param database_id: id of the database being parsed (dashes are removed)
    :param parser_type: output flavour passed on to the BaseParser methods
    :param export_child_pages: whether child pages are exported (recursion)
    """
    self.database_id = database_id.replace('-', '')
    self.parser_type = parser_type
    # Whether to export child pages, i.e. recurse.
    self.export_child_page = export_child_pages

    # makedirs also creates missing parents and tolerates another worker
    # creating the directory between the check and the call.
    self.tmp_dir = NotionDump.TMP_DIR
    if not os.path.exists(self.tmp_dir):
        os.makedirs(self.tmp_dir, exist_ok=True)

    # Base parser that renders the individual property values.
    self.base_parser = BaseParser(
        base_id=self.database_id,
        export_child=self.export_child_page
    )
34 |
35 | # 从一个页面里把列名给解析出来
def __get_col_name_list(self, one_page):
    """Derive the column names from one page's ``properties``.

    :return: a list with the title column first, followed by the remaining
             property names in reverse iteration order; "" (after logging)
             when the page has no ``title`` property
    """
    properties = one_page["properties"]
    title_name = ""
    other_names = []
    for name in properties:
        if properties[name]["type"] == "title":
            title_name = name
        else:
            other_names.append(name)

    if title_name == "":
        common_op.debug_log("col name no title error! id=" + self.database_id, level=NotionDump.DUMP_MODE_DEFAULT)
        return ""

    # Judging by existing databases, the order has to be reversed to match
    # the real database; appending the title last puts it first afterwards.
    other_names.append(title_name)
    other_names.reverse()
    return other_names
51 |
def get_child_pages_dic(self):
    """Return the child-page dictionary collected by the base parser."""
    child_pages = self.base_parser.get_child_pages_dic()
    return child_pages
54 |
55 | # 解析一列中的一项
def __parser_item(self, item_block, page_id):
    """Render one property value by dispatching on ``item_block["type"]``.

    ``rollup`` values are unwrapped and their content parsed recursively
    (array entries are joined with ","). Unknown types are logged and
    rendered as ``[unknown_type:<type>]``. Never returns ``None``.
    """
    item_type = item_block["type"]
    base = self.base_parser
    # Types whose BaseParser method is "<type>_parser" and takes parser_type.
    typed = ("multi_select", "select", "rich_text", "files")
    # Types whose BaseParser method is "<type>_parser" and takes only the block.
    untyped = (
        "url", "email", "checkbox", "phone_number", "date", "people",
        "number", "relation", "formula", "created_time", "last_edited_time",
        "created_by", "last_edited_by",
    )

    item_ret = ""
    if item_type == "title":
        item_ret = base.title_parser(item_block, page_id, parser_type=self.parser_type)
    elif item_type in typed:
        item_ret = getattr(base, item_type + "_parser")(item_block, parser_type=self.parser_type)
    elif item_type in untyped:
        item_ret = getattr(base, item_type + "_parser")(item_block)
    elif item_type == "rollup":
        # Rollups are handled separately.
        rollup_block = item_block["rollup"]
        if "array" in rollup_block:
            # A list of values: a comma is added only once some text exists.
            for rollup_item in rollup_block["array"]:
                if item_ret != "":
                    item_ret += ","
                item_ret += self.__parser_item(rollup_item, "")
        else:
            # A single value.
            item_ret += self.__parser_item(rollup_block, "")
    else:
        item_ret = "[unknown_type:" + item_block["type"] + "]"
        common_op.debug_log("[ISSUE] unknown properties type:" + item_block["type"],
                            level=NotionDump.DUMP_MODE_DEFAULT)

    return "" if item_ret is None else item_ret
112 |
def database_to_md(self, page_properties, new_id=None):
    """Render the properties of a page as a two-column Markdown table.

    :param page_properties: page object; its "properties" entry maps a
        property name to a property block that carries a "type" key.
    :param new_id: when given, forwarded to ``self.base_parser.set_new_id``
        before any property is parsed.
    :return: ``(properties_md, title)``. Both are empty strings when there
        is nothing to render; ``properties_md`` is also empty when the page
        has exactly one property.
    """
    if page_properties is None or "properties" not in page_properties:
        return "", ""
    page_properties = page_properties["properties"]

    # Point the base parser at the page that is being rendered.
    if new_id is not None:
        self.base_parser.set_new_id(new_id)

    # No properties at all: nothing to render.
    if len(page_properties) == 0:
        return "", ""

    rows = []
    p_title = ""
    p_title_name = ""
    # Properties are walked in reverse key order.
    for p_name in reversed(list(page_properties)):
        # A raw newline would terminate the Markdown table row, so line
        # breaks inside a cell are emitted as <br>.
        p_value = self.__parser_item(page_properties[p_name], page_id="").replace('\n', '<br>')
        if page_properties[p_name]["type"] == "title":
            # The title property becomes the table header instead of a row.
            p_title = p_value
            p_title_name = p_name
            continue
        rows.append("\n" + "|" + str(p_name) + "|" + str(p_value) + "|")
    properties_md = "".join(rows)

    if p_title != "" or p_title_name != "":
        properties_md = "|" + p_title_name + "|" + p_title + "|\n|---|---|" + properties_md
    else:
        properties_md = "|KEY|VALUE|\n|---|---|" + properties_md

    # With a single property no table is returned, only the title.
    if len(page_properties) == 1:
        return "", p_title
    return properties_md, p_title
150 |
151 | # 格式化存储,这里是临时文件存在方式(在外面转成数据库,或者最终输出CSV的格式)
def database_to_file(self, database_handle, col_name_list=None, new_id=None):
    """Write the rows of a database query result to a temporary file.

    The file is a Markdown table when ``self.parser_type`` is
    ``NotionDump.PARSER_TYPE_MD`` and a CSV file otherwise.

    :param database_handle: query result; its "results" entry is the list of
        page objects (each with "id" and "properties").
    :param col_name_list: columns to export, in output order. When omitted,
        the columns of the first page are used (display order is not
        guaranteed).
    :param new_id: when given, forwarded to the base parser and used (with
        dashes removed) as the file name instead of ``self.database_id``.
    :return: path of the written file, or ``""`` when there are no rows.
    """
    page_list = database_handle.get("results")
    # Empty (or missing) result list: nothing to write.
    if not page_list:
        return ""

    if col_name_list is None:
        col_name_list = self.__get_col_name_list(page_list[0])

    is_md = self.parser_type == NotionDump.PARSER_TYPE_MD
    suffix = ".md" if is_md else ".csv"
    if new_id is not None:
        self.base_parser.set_new_id(new_id)
        tmp_filename = self.tmp_dir + new_id.replace('-', '') + suffix
    else:
        tmp_filename = self.tmp_dir + self.database_id + suffix

    # newline='' lets the csv module control line endings; the context
    # manager closes the file even if parsing a cell raises.
    with open(tmp_filename, "w", encoding="utf-8", newline='') as file:
        csv_writer = None
        if is_md:
            head_line = "|" + "".join(it + "|" for it in col_name_list)
            head_line += "\n|" + " --- |" * len(col_name_list)
            file.write(head_line + "\n")
        else:
            csv_writer = csv.writer(file)
            # The first CSV row holds the column names.
            csv_writer.writerow(col_name_list)

        # The source comments note that results appear to arrive in reverse
        # order, so rows are written back to front. reversed() is used so
        # that the caller's list is not mutated.
        for page in reversed(page_list):
            page_id = page["id"].replace('-', '')
            common_op.debug_log("database page id" + page_id)
            # Parse every requested cell of this row.
            page_iter = [self.__parser_item(page["properties"][item], page_id) for item in col_name_list]
            if is_md:
                page_line = "|"
                for it in page_iter:
                    if isinstance(it, str):
                        # Keep multi-line cells on one table row.
                        page_line += it.replace('\n', '<br>') + "|"
                    else:
                        page_line += str(it) + "|"
                file.write(page_line + "\n")
            else:
                csv_writer.writerow(page_iter)
                common_op.debug_log("database page " + page_id + " write csv success")

    common_op.debug_log("write file " + tmp_filename, level=NotionDump.DUMP_MODE_DEFAULT)
    # The caller post-processes the temporary file.
    return tmp_filename
221 |
def database_to_dic(self, database_handle, col_name_list=None, new_id=None):
    """Convert the rows of a database query result into a list of dicts.

    :param database_handle: query result; its "results" entry is the list of
        page objects (each with "id" and "properties").
    :param col_name_list: columns to export. When omitted, the columns of
        the first page are used (display order is not guaranteed).
    :param new_id: accepted for signature parity with ``database_to_file``;
        not used here.
    :return: one dict per page holding ``"_page_id"`` plus one entry per
        column, or ``None`` when there are no rows.
    """
    page_list = database_handle.get("results")
    # Empty (or missing) result list: nothing to convert.
    if not page_list:
        return

    if col_name_list is None:
        col_name_list = self.__get_col_name_list(page_list[0])

    db_dic = []
    # The source comments note that results appear to arrive in reverse
    # order, so pages are walked back to front. reversed() is used so that
    # the caller's list is not mutated.
    for page in reversed(page_list):
        page_id = page["id"].replace('-', '')
        common_op.debug_log("database page id" + page_id)
        db_dic_line = {"_page_id": page_id}
        for item in col_name_list:
            # Parse one cell of this row.
            db_dic_line[item] = self.__parser_item(page["properties"][item], page_id)
        db_dic.append(db_dic_line)
        common_op.debug_log("database page " + page_id + " get dic success")

    return db_dic
252 |
--------------------------------------------------------------------------------
/NotionDump/Parser/mix_parser.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/10
3 | # mail:geniusrabbit@qq.com
4 | import copy
5 | import os
6 |
7 | import NotionDump
8 | from NotionDump.Notion.Notion import NotionQuery
9 | from NotionDump.Parser.block_parser import BlockParser
10 | from NotionDump.Parser.database_parser import DatabaseParser
11 | from NotionDump.utils import common_op, internal_var
12 |
13 |
14 | # 混合递归调用,主要是为Page和Database类型
class MixParser:
    """Export a page or database together with the pages it references.

    Pages still to be processed are tracked in ``internal_var.PAGE_DIC``.
    Every pass parses the entries that have not been visited yet and
    registers the children they reference, until no new entry shows up.
    """

    def __init__(
            self,
            mix_id,
            query_handle: NotionQuery,
            export_child_pages=False,
            page_parser_type=NotionDump.PARSER_TYPE_MD,
            db_parser_type=NotionDump.PARSER_TYPE_PLAIN,
            col_name_list=None,  # database columns; not read by the constructor
    ):
        self.mix_id = mix_id
        self.query_handle = query_handle
        self.page_parser_type = page_parser_type
        self.db_parser_type = db_parser_type

        # Whether child pages are exported as well (i.e. recursion).
        self.export_child_page = export_child_pages

        # Make sure the temporary output directory exists.
        self.tmp_dir = NotionDump.TMP_DIR
        os.makedirs(self.tmp_dir, exist_ok=True)

        # The block parser receives the query handle so that it can fetch
        # nested child blocks itself.
        self.block_parser = BlockParser(
            block_id=self.mix_id,
            query_handle=self.query_handle,
            parser_type=self.page_parser_type,
            export_child_pages=self.export_child_page
        )
        # The id given here is only a placeholder; a new id is supplied on
        # every parse call.
        self.database_parser = DatabaseParser(
            self.mix_id,
            parser_type=self.db_parser_type,
            export_child_pages=self.export_child_page
        )

        # Errors collected while parsing.
        self.error_list = []

    def __test_show_child_page(self):
        """Print the global page table when running in debug mode."""
        if NotionDump.DUMP_MODE == NotionDump.DUMP_MODE_DEBUG:
            print("in page_id: ", self.mix_id, internal_var.PAGE_DIC)

    def __recursion_mix_parser(self, is_main=False, col_name_list=None):
        """Process every not-yet-visited entry of the page table.

        :param is_main: marks the entries handled in this pass as main page.
        :param col_name_list: optional column selection for databases
            handled in this pass; it is not forwarded to later passes.
        :return: temporary file name of the main page, or ``None``.
        """
        root_name = None
        update_flag = False
        # Iterate over a snapshot: the table is extended while parsing.
        recursion_page = copy.deepcopy(internal_var.PAGE_DIC)
        for child_id in recursion_page:
            # Link entries are not parsed themselves (the linked page was
            # registered alongside the link); just mark them as visited.
            if common_op.is_link_page(child_id, recursion_page[child_id]):
                common_op.update_page_recursion(child_id, recursion=True)
                continue
            # Skip entries that have already been handled.
            if not common_op.is_page_recursion(child_id):
                continue

            update_flag = True
            common_op.debug_log("start child_page_id=" + child_id)
            self.__test_show_child_page()
            # Mark as visited first: every entry is fetched only once,
            # whether or not fetching succeeds.
            common_op.update_page_recursion(child_id, recursion=True)
            common_op.debug_log("S process id " + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
            page_title = None
            tmp_filename = None
            if common_op.is_page(child_id):
                # Page metadata.
                page_detail = self.query_handle.retrieve_page(child_id)
                # Page content.
                page_json = self.query_handle.retrieve_block_children(child_id)
                if page_json is None or page_detail is None:
                    common_op.debug_log("get page error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
                    self.error_list.append("get page error, id=" + child_id)
                    continue
                # Render the page properties as text.
                page_properties = None
                if NotionDump.S_PAGE_PROPERTIES or common_op.is_page_soft(child_id):
                    page_properties, page_title = self.database_parser.database_to_md(page_detail, new_id=child_id)
                # Parse the page content into a temporary file.
                tmp_filename = self.block_parser.block_to_md(page_json, page_detail=page_properties, new_id=child_id)
                # Collect the child pages met while parsing.
                child_pages_dic = self.block_parser.get_child_pages_dic()
                if NotionDump.S_PAGE_PROPERTIES:
                    db_child_pages_dic = self.database_parser.get_child_pages_dic()
                    for db_child_dic_key in db_child_pages_dic:
                        if db_child_dic_key not in child_pages_dic:
                            child_pages_dic[db_child_dic_key] = db_child_pages_dic[db_child_dic_key]
            elif common_op.is_db(child_id):
                # NOTE(review): the result is not used below; the call is
                # kept in case it has side effects in the query layer.
                db_info = self.query_handle.retrieve_database(child_id)
                db_detail = self.query_handle.query_database(child_id)

                if db_detail is None:
                    # db_info is optional (it cannot be fetched for linked
                    # databases), but the query result is required.
                    common_op.debug_log("get database error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
                    self.error_list.append("get database error, id=" + child_id)
                    continue
                # Parse the database into a temporary file.
                tmp_filename = self.database_parser.database_to_file(db_detail, new_id=child_id, col_name_list=col_name_list)
                child_pages_dic = self.database_parser.get_child_pages_dic()
            elif common_op.is_download(child_id):
                # Downloadable entry (image or file).
                tmp_filename = self.query_handle.download_to_file(download_id=child_id, child_page_item=recursion_page[child_id])
                child_pages_dic = {}
                # The download was attempted and did not succeed.
                if tmp_filename == "" and not NotionDump.FILE_WITH_LINK:
                    common_op.debug_log("file download error, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
                    self.error_list.append("download error, link:" + recursion_page[child_id]["link_src"])
                    continue
            else:
                common_op.debug_log("!!! unknown child id type, id=" + child_id, level=NotionDump.DUMP_MODE_DEFAULT)
                self.error_list.append("!!! unknown child id type, id=" + child_id)
                continue

            common_op.debug_log("E process id " + child_id + " success", level=NotionDump.DUMP_MODE_DEFAULT)
            # Record where the entry was stored locally.
            common_op.update_child_page_stats(child_id, dumped=True, main_page=is_main, local_path=tmp_filename, page_title=page_title)
            if is_main:
                root_name = tmp_filename
            # Register all children found in this entry under their parent.
            common_op.update_child_pages(child_pages_dic, child_id)

            # The id is concatenated into the message: debug_log's second
            # positional parameter is the log level, not a message part.
            common_op.debug_log("# end child_page_id=" + child_id)
            self.__test_show_child_page()

        # Something was processed, so new children may have been added.
        if update_flag:
            self.__recursion_mix_parser()
        return root_name

    def mix_parser(self, root_id, id_type, col_name_list=None):
        """Export ``root_id`` and everything reachable from it.

        :param root_id: id of the root page or database.
        :param id_type: type of the root, passed to
            ``common_op.update_child_page_stats`` as ``page_type``.
        :param col_name_list: optional columns to export for databases.
        :return: temporary file name of the root entry (``None`` if it could
            not be exported). The collected errors are stored under
            ``internal_var.PAGE_DIC["errors"]``.
        """
        common_op.update_child_page_stats(root_id, main_page=True, page_type=id_type)
        root_filename = self.__recursion_mix_parser(True, col_name_list)
        internal_var.PAGE_DIC["errors"] = self.error_list
        return root_filename

    def database_collection(self, json_handle, json_type, col_name_list=None):
        """Return the rows of a database as a list of dicts.

        Only ``json_type == "database"`` is supported; any other type is
        logged and yields ``None``.
        """
        common_op.debug_log("parser_type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
        if json_type == "database":
            return self.database_parser.database_to_dic(json_handle, col_name_list=col_name_list)
        elif json_type == "block":
            common_op.debug_log("need database get type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
            return None
        else:
            common_op.debug_log("unknown parser_type:" + json_type, level=NotionDump.DUMP_MODE_DEFAULT)
            return None
168 |
--------------------------------------------------------------------------------
/NotionDump/__init__.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 |
5 | __author__ = "delta1037 "
6 | __version__ = "0.2.3"
7 |
8 | from NotionDump import utils
9 |
10 | # 临时存放文件夹
11 | TMP_DIR = "./.tmp/"
12 |
13 | # Markdown的分割条语法
14 | MD_DIVIDER = "------"
15 | MD_BOOL_TRUE = "✓"
16 | MD_BOOL_FALSE = "✕"
17 | # Rollup separator: ",", "、" or "<br>" (a comma or a line break)
18 | MD_ROLLUP_SEP = ","
19 | MD_HIGHLIGHT = "=="
20 | ID_LEN = len("921e6b4ea44046c6935bcb2c69453196")
21 |
22 | # 日志输出模式
23 | DUMP_MODE_DEBUG = 0
24 | DUMP_MODE_DEFAULT = 1
25 | DUMP_MODE_SILENT = 2
26 | DUMP_MODE = DUMP_MODE_DEFAULT
27 |
28 |
29 | # 日志控制器
class NotionBackupLogger:
    """Console logger that also mirrors debug messages to a log file."""

    def __init__(self):
        self.prefix = "[NotionDump] "
        # Opened lazily on the first debug message.
        self.log_fd = None

    def log_debug(self, log_str):
        """Print the message, then append it to the debug log file."""
        self.log_info(log_str)

        if self.log_fd is None:
            self.log_fd = open("notion-export-kernel-debug.log", "a+", encoding='utf-8')
        debug_file = self.log_fd
        debug_file.write(str(log_str) + "\n")
        debug_file.flush()

    def log_info(self, log_str):
        """Print the message to stdout, preceded by the logger prefix."""
        print(self.prefix, log_str, sep='')
47 |
48 |
49 | LOGGER = NotionBackupLogger()
50 |
51 | # 导出的类型
52 | DUMP_TYPE_BLOCK = 1
53 | DUMP_TYPE_PAGE = 2
54 | DUMP_TYPE_DB_TABLE = 4
55 |
56 | # 解析的类型:分为Markdown和纯文本
57 | PARSER_TYPE_MD = 0
58 | PARSER_TYPE_PLAIN = 2
59 |
60 | # 是否使用缓存
61 | BUFFER_FILE = TMP_DIR + "notion_download_buffer.json"
62 | USE_BUFFER = True
63 |
64 | # 一些配置开关
65 | # 对没有在notion保存的文件(pdf\image)尝试下载,否则直接放置链接
66 | FILE_WITH_LINK = True
67 | FORMAT_DATE = "%Y/%m/%d"
68 | FORMAT_DATETIME = "%Y/%m/%d-%H:%M:%S"
69 | # 是否导出page的properties
70 | S_PAGE_PROPERTIES = True
71 | # 主题的格式,default,light,dark,markdown,self_define
72 | S_THEME_TYPE = "default"
73 | # f开头的是字体颜色,b开头的是背景颜色,d开头的是数据库标签
74 | S_THEME_LIGHT = {
75 | "f_gray": "#787774",
76 | "f_brown": "#9F6B53",
77 | "f_orange": "#D9730D",
78 | "f_yellow": "#CB912F",
79 | "f_green": "#448361",
80 | "f_blue": "#337EA9",
81 | "f_purple": "#9065B0",
82 | "f_pink": "#C14C8A",
83 | "f_red": "#D44C47",
84 | "b_gray": "#F1F1EF",
85 | "b_brown": "#F4EEEE",
86 | "b_orange": "#FBECDD",
87 | "b_yellow": "#FBF3DB",
88 | "b_green": "#EDF3EC",
89 | "b_blue": "#E7F3F8",
90 | "b_purple": "#F4F0F7CC",
91 | "b_pink": "#F9EEF3CC",
92 | "b_red": "#FDEBEC",
93 | "d_light_gray": "#E3E2E080",
94 | "d_gray": "#E3E2E0",
95 | "d_brown": "#EEE0DA",
96 | "d_orange": "#FADEC9",
97 | "d_yellow": "#FDECC8",
98 | "d_green": "#DBEDDB",
99 | "d_blue": "#D3E5EF",
100 | "d_purple": "#E8DEEE",
101 | "d_pink": "#F5E0E9",
102 | "d_red": "#FFE2DD",
103 | }
104 |
105 | S_THEME_DARK = {
106 | "f_gray": "#9B9B9B",
107 | "f_brown": "#BA856F",
108 | "f_orange": "#C77D48",
109 | "f_yellow": "#CA9849",
110 | "f_green": "#529E72",
111 | "f_blue": "#5E87C9",
112 | "f_purple": "#9D68D3",
113 | "f_pink": "#D15796",
114 | "f_red": "#DF5453",
115 | "b_gray": "#2F2F2F",
116 | "b_brown": "#4A3228",
117 | "b_orange": "#5C3B23",
118 | "b_yellow": "#564328",
119 | "b_green": "#243D30",
120 | "b_blue": "#143A4E",
121 | "b_purple": "#3C2D49",
122 | "b_pink": "#4E2C3C",
123 | "b_red": "#522E2A",
124 | "d_light_gray": "#373737",
125 | "d_gray": "#5A5A5A",
126 | "d_brown": "#603B2C",
127 | "d_orange": "#854C1D",
128 | "d_yellow": "#89632A",
129 | "d_green": "#2B593F",
130 | "d_blue": "#28456C",
131 | "d_purple": "#492F64",
132 | "d_pink": "#69314C",
133 | "d_red": "#6E3630",
134 | }
135 |
136 | S_THEME_SELF_DEFINE = {
137 | "f_gray": "#787774",
138 | "f_brown": "#9F6B53",
139 | "f_orange": "#D9730D",
140 | "f_yellow": "#CB912F",
141 | "f_green": "#448361",
142 | "f_blue": "#337EA9",
143 | "f_purple": "#9065B0",
144 | "f_pink": "#C14C8A",
145 | "f_red": "#D44C47",
146 | "b_gray": "#F1F1EF",
147 | "b_brown": "#F4EEEE",
148 | "b_orange": "#FBECDD",
149 | "b_yellow": "#FBF3DB",
150 | "b_green": "#EDF3EC",
151 | "b_blue": "#E7F3F8",
152 | "b_purple": "#F4F0F7CC",
153 | "b_pink": "#F9EEF3CC",
154 | "b_red": "#FDEBEC",
155 | "d_light_gray": "#E3E2E080",
156 | "d_gray": "#E3E2E0",
157 | "d_brown": "#EEE0DA",
158 | "d_orange": "#FADEC9",
159 | "d_yellow": "#FDECC8",
160 | "d_green": "#DBEDDB",
161 | "d_blue": "#D3E5EF",
162 | "d_purple": "#E8DEEE",
163 | "d_pink": "#F5E0E9",
164 | "d_red": "#FFE2DD",
165 | }
166 |
--------------------------------------------------------------------------------
/NotionDump/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/NotionDump/utils/__init__.py
--------------------------------------------------------------------------------
/NotionDump/utils/common_op.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/09
3 | # mail:geniusrabbit@qq.com
4 |
5 | import copy
6 | import json
7 | import os.path
8 | from json import JSONDecodeError
9 |
10 | import NotionDump
11 | from NotionDump.utils import internal_var
12 |
13 |
14 | # 更新子页面的状态
def update_child_page_stats(child_key, dumped=False, main_page=False, local_path=None, page_type=None, page_title=None):
    """Create or update the page-table entry stored under ``child_key``.

    A missing entry is created from ``internal_var.CHILD_PAGE_TEMP``.
    ``dumped`` and ``main_page`` are always overwritten; ``local_path`` and
    ``page_type`` only when given. A ``page_title`` clears the soft-page
    flag of an entry and fills in its name if the name is still empty.
    """
    page_table = internal_var.PAGE_DIC
    if child_key not in page_table:
        # Unknown key: start from a fresh copy of the template.
        debug_log("CREATE child page " + child_key + " from temp", level=NotionDump.DUMP_MODE_DEFAULT)
        page_table[child_key] = copy.deepcopy(internal_var.CHILD_PAGE_TEMP)

    entry = page_table[child_key]
    entry["dumped"] = dumped
    entry["main_page"] = main_page
    if local_path is not None:
        entry["local_path"] = local_path

    if page_type is not None:
        # Blocks are stored as pages.
        if page_type in ("block", "page"):
            entry["type"] = "page"
        elif page_type == "database":
            entry["type"] = "database"
        else:
            debug_log("update_child_page_stats page type is unknown:" + str(page_type),
                      level=NotionDump.DUMP_MODE_DEFAULT)

    if page_title is not None and entry["inter_soft_page"] is True:
        entry["inter_soft_page"] = False
        if entry["page_name"] == "":
            entry["page_name"] = page_title
36 |
37 |
38 | # 关于软连接一共有如下情况
39 | # 同一个页面:add_new_child_page
40 | # 在同一个页面中,软连接先于实际链接出现
41 | # 软连接先占位,把实际链接加进去
42 | # 在同一个页面中,软连接在实际链接后出现
43 | # 不同的页面:update_child_pages
44 | # 在不同页面中,软连接先于实际链接出现
45 | # 实际链接替换,重新解析
46 | # 在不同页面中,软连接在实际链接后出现
47 | # 忽略软连接
48 | # 只出现软连接而没有出现实际链接,pass
def update_child_pages(child_pages, parent_id):
    """Merge the children found in one page into the global page table.

    :param child_pages: mapping of child id to its entry (each entry carries
        an "inter_soft_page" flag).
    :param parent_id: id of the page the children were found in; it must
        already be present in ``internal_var.PAGE_DIC``.
    """
    # The parent is expected to exist; if it does not, something went wrong.
    if parent_id not in internal_var.PAGE_DIC:
        debug_log("parent id" + parent_id + " not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
        return

    for child_page_id in child_pages:
        # The table already knows this id: check whether that entry was
        # created by a soft link.
        if child_page_id in internal_var.PAGE_DIC:
            # Replace only when the stored entry is a soft-link placeholder
            # and the incoming one is a real page; a real stored entry is
            # never replaced by a soft link.
            if internal_var.PAGE_DIC[child_page_id]["inter_soft_page"] \
                    and not child_pages[child_page_id]["inter_soft_page"]:
                # Per the original comments, the replacement entry counts as
                # not yet parsed, so the recursive parser picks the page up
                # again on its next pass.
                internal_var.PAGE_DIC[child_page_id] = child_pages[child_page_id]
                debug_log("REPLACE last created soft page, id=" + child_page_id, level=NotionDump.DUMP_MODE_DEFAULT)

        # Anything not in the table yet (placeholders included) is added.
        if child_page_id not in internal_var.PAGE_DIC:
            debug_log("CREATE child page " + child_page_id + " from child_pages", level=NotionDump.DUMP_MODE_DEFAULT)
            internal_var.PAGE_DIC[child_page_id] = copy.deepcopy(child_pages[child_page_id])

        # Placeholders are not listed among the parent's children.
        if not child_pages[child_page_id]["inter_soft_page"]:
            debug_log("parent id" + parent_id + " add child " + child_page_id,
                      level=NotionDump.DUMP_MODE_DEFAULT)
            internal_var.PAGE_DIC[parent_id]["child_pages"].append(child_page_id)
        else:
            debug_log("SOFT_PAGE " + child_page_id + " dont need to add to parent_id " + parent_id,
                      level=NotionDump.DUMP_MODE_DEFAULT)
82 |
83 |
84 | # 添加一个新的子页
85 | # 链接的key格式是 id_链接名
86 | # 子页面的key格式是id
def add_new_child_page(child_pages, key_id, link_id=None, link_src=None, page_name=None, page_type=None,
                       inter_soft_page=False):
    """Register a child entry under ``key_id`` in ``child_pages``.

    An existing entry is kept unless it was created by a soft link. When
    ``link_id`` is given, a placeholder entry for the link target is
    registered as well (flagged as created by a soft link).
    """
    debug_log("add new child key:" + key_id)
    already_known = key_id in child_pages
    # A real (non-soft) entry is never overwritten.
    if already_known and not child_pages[key_id]["inter_soft_page"]:
        debug_log("WARN key_id:" + key_id + " exist, skip", level=NotionDump.DUMP_MODE_DEFAULT)
        return

    # New key, or the previous entry was a soft-link placeholder: reset it.
    child_pages[key_id] = copy.deepcopy(internal_var.CHILD_PAGE_TEMP)
    child_pages[key_id]["inter_soft_page"] = inter_soft_page

    if link_id is not None:
        # Reserve a slot for the link target so that it exists in the table.
        debug_log("SOFT_PAGE key_id " + key_id + " link_id " + link_id + ", create a null page with link_id",
                  level=NotionDump.DUMP_MODE_DEFAULT)
        add_new_child_page(child_pages, key_id=link_id, link_src=link_src, page_type=page_type, inter_soft_page=True)

    # Copy over every optional field that was supplied.
    optional_fields = (
        ("page_name", page_name),
        ("link_id", link_id),
        ("link_src", link_src),
        ("type", page_type),
    )
    for field_name, field_value in optional_fields:
        if field_value is not None:
            child_pages[key_id][field_name] = field_value
112 |
113 |
114 | # 用此函数的前提是page表中已经存在
def update_page_recursion(page_id, recursion=False):
    """Set the "inter_recursion" flag of a page that is already in the table."""
    if page_id in internal_var.PAGE_DIC:
        internal_var.PAGE_DIC[page_id]["inter_recursion"] = recursion
    else:
        debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
120 |
121 |
def is_page_recursion(page_id):
    """Return True when the page is known and its "inter_recursion" flag is unset."""
    if page_id in internal_var.PAGE_DIC:
        return not internal_var.PAGE_DIC[page_id]["inter_recursion"]
    debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
    return False
127 |
128 |
def is_page_soft(page_id):
    """Return the "inter_soft_page" flag of a page; False for unknown ids."""
    if page_id in internal_var.PAGE_DIC:
        return internal_var.PAGE_DIC[page_id]["inter_soft_page"]
    debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
    return False
134 |
135 |
136 | # page 返回True,DB返回False
def is_page(page_id):
    """Return True when the table entry has type "page"; False for unknown ids."""
    if page_id in internal_var.PAGE_DIC:
        return internal_var.PAGE_DIC[page_id]["type"] == "page"
    debug_log("page id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
    return False
142 |
143 |
144 | # database 返回True
def is_db(db_id):
    """Return True when the table entry has type "database"; False for unknown ids."""
    if db_id in internal_var.PAGE_DIC:
        return internal_var.PAGE_DIC[db_id]["type"] == "database"
    debug_log("db_id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
    return False
150 |
151 |
152 | # database 返回True
def is_download(download_id):
    """Return True when the table entry is downloadable (type "image" or "file")."""
    if download_id in internal_var.PAGE_DIC:
        entry_type = internal_var.PAGE_DIC[download_id]["type"]
        return entry_type == "image" or entry_type == "file"
    debug_log("download_id not exist!!!", level=NotionDump.DUMP_MODE_DEFAULT)
    return False
159 |
160 |
161 | # 判断是否是链接页面
def is_link_page(page_id, page_handle):
    """Return True for a link entry: an id containing "_" with a non-empty link_id."""
    if page_id.find("_") == -1:
        return False
    return page_handle["link_id"] != ""
164 |
165 |
166 | # 将文本保存为json文件
def save_json_to_file(handle, json_name):
    """Serialize ``handle`` as indented JSON and write it to ``json_name``.

    Non-ASCII characters are written as-is (UTF-8). When ``handle`` cannot
    be serialized, the failure is logged and no file is written.
    """
    try:
        json_handle = json.dumps(handle, ensure_ascii=False, indent=4)
    except (TypeError, ValueError):
        # json.dumps reports unserializable input with TypeError/ValueError;
        # JSONDecodeError is only raised while decoding.
        debug_log("json encode error", level=NotionDump.DUMP_MODE_DEFAULT)
        return

    # The context manager closes the file even when the write fails.
    with open(json_name, "w", encoding="utf-8") as file:
        file.write(json_handle)
178 |
179 |
180 | # 从文件中加载json文件
def load_json_from_file(json_name):
    """Load and return the JSON document stored in ``json_name``.

    :return: the decoded object, or ``None`` when the file does not exist
        or does not contain valid JSON (both cases are logged).
    """
    if not os.path.exists(json_name):
        debug_log("json file not exist, path=" + json_name, level=NotionDump.DUMP_MODE_DEFAULT)
        return None
    try:
        # The context manager closes the handle on success and on failure.
        with open(json_name, "r", encoding="utf-8") as json_fd:
            return json.load(json_fd)
    except JSONDecodeError:
        debug_log("json decode error", level=NotionDump.DUMP_MODE_DEFAULT)
        return None
191 |
192 |
193 | # 判断是否添加额外的换行
def parser_newline(last_type, now_type):
    """Tell whether an extra newline goes between two consecutive blocks.

    No extra newline is added between two blocks of the same list-like type
    (to-do, numbered list, bulleted list, toggle, table row), nor between a
    table and its first row.
    """
    compact_types = ("to_do", "numbered_list_item", "bulleted_list_item", "toggle", "table_row")
    same_compact_run = last_type == now_type and now_type in compact_types
    table_start = last_type == "table" and now_type == "table_row"
    return not (same_compact_run or table_start)
209 |
210 |
def debug_log(debug_str, level=NotionDump.DUMP_MODE_DEBUG):
    """Emit ``debug_str`` according to the global ``NotionDump.DUMP_MODE``.

    Debug mode logs every message through ``log_debug``. Default mode logs
    only messages whose ``level`` is ``DUMP_MODE_DEFAULT``. Any other mode
    (silent) logs nothing.
    """
    mode = NotionDump.DUMP_MODE
    if mode == NotionDump.DUMP_MODE_DEBUG:
        NotionDump.LOGGER.log_debug(debug_str)
        return
    # Default mode filters on the message level.
    if mode == NotionDump.DUMP_MODE_DEFAULT and level == NotionDump.DUMP_MODE_DEFAULT:
        NotionDump.LOGGER.log_info(debug_str)
218 |
--------------------------------------------------------------------------------
/NotionDump/utils/content_format.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 | import datetime
5 |
6 | import dateutil.parser
7 |
8 |
9 | # 获取mention的格式
10 | import NotionDump
11 |
12 |
def get_mention_format(mention_content):
    """Wrap a mention as ``@(content)``."""
    return "".join(("@(", mention_content, ")"))
15 |
16 |
17 | # 获取page的格式 运行过程中只填充id,后续调整页面供定位使用
def get_page_format_md(page_id, page_name, export_child):
    """Build the Markdown link for a page.

    When child pages are exported, only the id is emitted as link text with
    an empty target. Otherwise the link is ``[name](id)``.
    """
    if not export_child:
        return "".join(("[", page_name, "](", page_id, ")"))
    return "".join(("[", page_id, "]()"))
23 |
24 |
25 | # 数据库title格式
def get_database_title_format(title_id, title_ret, export_child):
    """Format a database title cell.

    Without child export the plain title is returned unchanged; with child
    export the id is emitted as link text with an empty target.
    """
    if not export_child:
        return title_ret
    return "".join(("[", title_id, "]()"))
32 |
33 |
34 | # 获取page的格式 纯文本只填充名字即可
def get_page_format_plain(page_name):
    """Return the plain-text form of a page reference: just its name."""
    plain_text = page_name
    return plain_text
37 |
38 |
39 | # 封装URL的格式
def get_url_format(url_plain, name="link"):
    """Build a Markdown link ``[name](url)``; the text defaults to "link"."""
    return "".join(("[", name, "](", url_plain, ")"))
42 |
43 |
def format_date_or_time(date_time):
    """Format a date or datetime string using the configured formats.

    Input containing a ``T`` is rendered with ``NotionDump.FORMAT_DATETIME``;
    anything else is rendered with ``NotionDump.FORMAT_DATE``.
    """
    parsed = dateutil.parser.parse(date_time)
    if 'T' in date_time:
        out_format = NotionDump.FORMAT_DATETIME
    else:
        out_format = NotionDump.FORMAT_DATE
    return parsed.strftime(out_format)
54 |
55 |
56 | # 封装date的格式
def get_date_format(start, end):
    """Format a date range as ``start ~ end``.

    Either bound may be ``None``. A missing start contributes an empty
    string; a missing end leaves out the `` ~ end`` part.
    """
    head = "" if start is None else format_date_or_time(start)
    if end is None:
        return head
    # The two dates are separated by "~".
    return head + " ~ " + format_date_or_time(end)
64 |
65 |
66 | # 封装文件链接格式
def get_file_format_md(filename, file_url, file_id="", export_child=False):
    """Build the Markdown link for a file.

    With child export and a non-empty ``file_id``, the id is emitted as link
    text with an empty target (per the original comment, it is relocated
    later). In every other case the link is ``[filename](file_url)``.
    """
    if export_child and file_id != "":
        return "".join(("[", file_id, "]()"))
    return "".join(("[", filename, "](", file_url, ")"))
77 |
78 |
79 | # 封装文件链接格式
def get_file_format_plain(filename, file_url):
    """Build the plain-text form of a file reference: ``filename(url)``."""
    return "".join((filename, "(", file_url, ")"))
82 |
83 |
84 | # 行内公式格式
def get_equation_inline(equation):
    """Wrap an equation in inline math delimiters: ``$ ... $``."""
    return "".join(("$ ", equation, " $"))
87 |
88 |
89 | # 块级公式格式
def get_equation_block(equation):
    """Wrap an equation in block math delimiters: ``$$ ... $$``."""
    return "".join(("$$ ", equation, " $$"))
92 |
93 |
def color_transformer(input_color, background=False):
    """Map a colour name to the value configured in the active theme.

    The lookup key is ``b_<color>`` for backgrounds and ``f_<color>`` for
    text. The theme follows ``NotionDump.S_THEME_TYPE`` ("dark",
    "self_define", anything else means light). The input is returned
    unchanged when the theme has no non-empty value for the key.
    """
    color_str = ("b_" if background else "f_") + input_color

    theme_type = NotionDump.S_THEME_TYPE
    if theme_type == "dark":
        theme = NotionDump.S_THEME_DARK
    elif theme_type == "self_define":
        theme = NotionDump.S_THEME_SELF_DEFINE
    else:
        # Default: light theme.
        theme = NotionDump.S_THEME_LIGHT

    color_ret = theme[color_str] if color_str in theme else ""
    return color_ret if color_ret != "" else input_color
115 |
116 |
def color_transformer_db(input_color):
    """Map a database tag colour to the value configured in the active theme.

    The lookup key is ``d_<color>``, with "default" mapped to
    ``d_light_gray``. The theme follows ``NotionDump.S_THEME_TYPE`` ("dark",
    "self_define", anything else means light). The input is returned
    unchanged when the theme has no non-empty value for the key.
    """
    color_str = "d_light_gray" if input_color == "default" else "d_" + input_color

    theme_type = NotionDump.S_THEME_TYPE
    if theme_type == "dark":
        theme = NotionDump.S_THEME_DARK
    elif theme_type == "self_define":
        theme = NotionDump.S_THEME_SELF_DEFINE
    else:
        # Default: light theme.
        theme = NotionDump.S_THEME_LIGHT

    color_ret = theme[color_str] if color_str in theme else ""
    return color_ret if color_ret != "" else input_color
139 |
--------------------------------------------------------------------------------
/NotionDump/utils/internal_var.py:
--------------------------------------------------------------------------------
1 | # author: delta1037
2 | # Date: 2022/01/08
3 | # mail:geniusrabbit@qq.com
4 |
5 | # ms
6 | FRIENDLY_USE_API = 400
7 | FRIENDLY_DOWNLOAD = 1000
8 |
9 | # 导出页面结构
10 | PAGE_DIC = {}
11 |
12 | # 导出页面列表的格式
13 | CHILD_PAGE_TEMP = {
14 | "dumped": False,
15 | "main_page": False,
16 | "type": "page",
17 | "local_path": "",
18 | "page_name": "",
19 | "link_id": "",
20 | "link_src": "",
21 | "child_pages": [],
22 | "inter_recursion": False,
23 | "inter_soft_page": False
24 | }
25 | # inter_soft_link 表示该页是由链接创建的
26 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # notion-export-kernel
2 |
3 | [中文](https://github.com/delta1037/notion-export-kernel/blob/main/README_zh.md)
4 |
5 | ## Description
6 |
7 | This repository builds on [notion-sdk-py](https://github.com/ramnes/notion-sdk-py) (a client for the official Notion API) and aims to export Notion pages and databases.
8 |
9 | Main targets:
10 |
11 | - [x] Export Notion Database and page to markdown file
12 | - [x] Recursion Export child Pages
13 | - [x] Download image and files in notion
14 |
15 | ## Structure
16 |
17 | ```shell
18 | notion-dump
19 | ├─NotionDump
20 | │ ├─Dump # External Interface
21 | │ ├─Notion # Unified encapsulation interface for communication with Notion
22 | │ ├─Parser # Some parser
23 | │ └─utils # Internal variables and utils functions
24 | └─Tests # Test code
25 | ```
26 |
27 | #### Parser code structure
28 |
29 | ```mermaid
30 | graph TD
31 | A[Dump] -->B(Database)
32 | A[Dump] -->C(Page/Block)
33 | B --> D[Mix Parser]
34 | C --> D[Mix Parser]
35 |
36 | D --> E[Database Parser]
37 | D --> F[Block Parser]
38 |
39 | E --> G[Base Parser]
40 | F --> G[Base Parser]
41 | ```
42 |
43 | ## Usage
44 |
45 | ### 3.0 install & example
46 |
47 | **install `notion-dump-kernel`**
48 |
49 | ```powershell
50 | # open terminal, type the cmd (install the latest version)
51 | pip install python-dateutil
52 | pip install notion-dump-kernel
53 | ```
54 |
55 | **example**
56 |
57 | ```python
# Example: export a page
import logging

import NotionDump
from NotionDump.Dump.dump import Dump
from NotionDump.Notion.Notion import NotionQuery
from NotionDump.utils import common_op

TOKEN_TEST = "secret_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
PAGE_MIX_ID = "43e7aa8ccfb0488eb18f8a453eab0177"
# NotionDump.DUMP_MODE = NotionDump.DUMP_MODE_DEBUG


def test_page_parser(query, export_child=False, db_parser_type=NotionDump.PARSER_TYPE_MD):
    """Export the page PAGE_MIX_ID and save the resulting structure as JSON.

    query: Notion query handle used to fetch data from the API
        (NOT the official API handle).
    export_child: whether to export all nested pages (sub-pages and link pages).
    db_parser_type: PARSER_TYPE_MD exports databases as markdown tables;
        PARSER_TYPE_PLAIN exports databases as CSV files.
    """
    # Initialise what you want to export.
    # Parameters of Dump:
    #   dump_id: the ID to export (block, page or database)
    #   query_handle: Notion query handle for getting data from the API
    #       (NOT the official API handle)
    #   export_child_pages: whether to export all nested pages
    #       (sub-pages and link pages)
    #   dump_type: the type of dump_id
    #       [DUMP_TYPE_BLOCK/DUMP_TYPE_PAGE/DUMP_TYPE_DB_TABLE]
    #   db_parser_type: PARSER_TYPE_MD means export databases as markdown
    #       tables; PARSER_TYPE_PLAIN means export databases as CSV files
    #   page_parser_type: PARSER_TYPE_MD means export pages as markdown files;
    #       PARSER_TYPE_PLAIN means export pages as txt
    page_handle = Dump(
        dump_id=PAGE_MIX_ID,
        query_handle=query,
        export_child_pages=export_child,
        dump_type=NotionDump.DUMP_TYPE_PAGE,
        db_parser_type=db_parser_type,
        page_parser_type=NotionDump.PARSER_TYPE_MD
    )

    # The returned variable contains all info about the structure of the
    # dumped files. All parsed files are saved under .tmp/
    page_detail_json = page_handle.dump_to_file()

    # Save the info about the dumped file structure as a JSON file
    print("json output to page_parser_result")
    common_op.save_json_to_file(
        handle=page_detail_json,
        json_name=".tmp/page_parser_result.json"
    )


if __name__ == '__main__':
    # We need a query handle for getting data from the API
    query_handle = NotionQuery(token=TOKEN_TEST)
    if query_handle is None:
        # logging.error rather than logging.exception: no exception is being
        # handled here, so there is no traceback to attach.
        logging.error("query handle init error")
        exit(-1)

    # export_child=True means export all nested pages (sub-pages and link pages)
    test_page_parser(query_handle, True)
107 | ```
108 |
109 | ### 3.1 Output
110 |
111 | All exported files are written to `.tmp/`, and the **page structure is stored in the returned variable**, which contains all the information about the structure of the dumped files.
112 |
113 | The returned variable (`page_detail_json`) looks like this:
114 |
115 | ```json
116 | {
117 | "key_id_1": {
118 | "dumped": true,
119 | "main_page": true,
120 | "type": "page",
121 | "local_path": "xxxx",
122 | "page_name": "",
123 | "link_id": "",
124 | "child_pages": [
125 | "xxxxx",
126 | "xxxxx"
127 | ],
128 | "inter_recursion": true,
129 | "inter_soft_page": false
130 | },
131 | "key_id_2": {
132 | "dumped": false,
133 | "main_page": false,
134 | "type": "page",
135 | "local_path": "",
136 | "page_name": "",
137 | "link_id": "xxxxx",
138 | "child_pages": [],
139 | "inter_recursion": true,
140 | "inter_soft_page": false
141 | }
142 | }
143 | ```
144 |
145 | **output explain**:
146 |
147 | - key_id_1: the key is an id (block id/page id/database id); in a link page it is the combination of the link name and the id. The id is the tag used to relocate links in the page
148 | - dumped: download status of the resource specified by the id
149 | - main_page: whether the page is the one specified by dump_id (the root)
150 | - type: id type, database or page (the page type covers both pages and blocks); if the id is a link, the type is that of the page the link points to
151 | - local_path: the location of the exported file, for subsequent operations
152 | - page_name: page name (for subsequent relocation of the page URL)
153 | - child_pages: ids of the subpages or databases this key_id contains
154 | - inter_recursion: internal variable (do NOT use)
155 | - inter_soft_page: internal variable (do NOT use)
156 |
157 | ## Attention
158 |
159 | - [ ] Comments are not supported
160 |
161 | ## Others
162 |
163 | ### 6.1、Notion Test Page
164 |
165 | [Notion Test Page](https://delta1037.notion.site/Notion-dump-ed0a3b0f57b34712bc6bafcbdb413d50)
166 |
167 | ### 6.2 Notion export client
168 |
169 | A client based on notion-export-kernel; it rebuilds the structure of the dumped files (dumped by notion-export-kernel) and relocates the links in the pages.
170 |
171 | [Github](https://github.com/delta1037/notion-export-client)
172 |
173 |
--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
1 | # notion-export-kernel
2 |
3 |
4 |
5 | ------
6 |
7 | ## 一、项目说明
8 |
9 | 本仓库是基于 [notion-sdk-py](https://github.com/ramnes/notion-sdk-py)(notion官方API)的开发,导出Notion页面和数据库。
10 |
11 | 项目目标
12 |
13 | - [x] 将Notion页面和数据库导出为Markdown文件
14 | - [x] 递归导出所有子页面(或者链接)
15 | - [x] 下载文件和图片
16 |
17 | ## 二、项目结构
18 |
19 | ```shell
20 | notion-dump
21 | ├─NotionDump
22 | │ ├─Dump # 对外接口
23 | │ ├─Notion # 与Notion通信统一封装接口
24 | │ ├─Parser # 实现的一些解析器
25 | │ └─utils # 内部变量与杂项函数
26 | └─Tests # 测试代码
27 | ```
28 |
29 | ```mermaid
30 | graph TD
31 | A[Dump] -->B(Database)
32 | A[Dump] -->C(Page/Block)
33 | B --> D[Mix Parser]
34 | C --> D[Mix Parser]
35 |
36 | D --> E[Database Parser]
37 | D --> F[Block Parser]
38 |
39 | E --> G[Base Parser]
40 | F --> G[Base Parser]
41 | ```
42 |
43 |
44 |
45 |
46 |
47 | ## 三、使用方法
48 |
49 | ### 3.0 安装导入
50 |
51 | **安装`notion-dump-kernel`**
52 |
53 | ```powershell
54 | # 打开终端,输入如下命令安装(安装最新版)
55 | pip install python-dateutil
56 | pip install notion-dump-kernel
57 | ```
58 |
59 | **导入使用**
60 |
61 | ```python
62 | import NotionDump
63 | from NotionDump.Dump.dump import Dump
64 | from NotionDump.Notion.Notion import NotionQuery
65 | ```
66 |
67 |
68 |
69 | ### 3.1 对外统一接口
70 |
71 | ```python
72 | # 获取Notion查询句柄
73 | query_handle = NotionQuery(
74 | token=TOKEN_TEST, # Token
75 | client_handle=None, # Notion官方API句柄,默认为空
76 | async_api=False # 异步调用,默认为False
77 | )
78 |
79 | # 获取操作句柄
80 | handle = Dump(
81 | dump_id=ID, # 需要导出的页面ID
82 | query_handle=query, # Notion查询句柄
83 | export_child_pages=True, # 是否递归导出子页面
84 | page_parser_type=NotionDump.PARSER_TYPE_MD, # Page导出类型
85 | db_parser_type=NotionDump.PARSER_TYPE_PLAIN, # 数据库导出类型
86 | dump_type=NotionDump.DUMP_TYPE_XXX # ID的类型,详细见后续说明
87 | )
88 |
89 | # dump类型 dump_type
90 | DUMP_TYPE_BLOCK # 块类型
91 | DUMP_TYPE_PAGE # 页面类型
92 | DUMP_TYPE_DB_TABLE # 数据库Table类型
93 |
94 | # 导出类型
95 | PARSER_TYPE_MD # Markdown格式
96 | PARSER_TYPE_PLAIN # 纯文本格式
97 |
98 | # 其它
99 | # 变量自解释,不再赘述
100 | ```
101 |
102 | [操作示例](https://github.com/delta1037/notion-dump-kernel/tree/main/Examples)
103 |
104 | ### 3.2 获取输出
105 |
106 | dump的结果存放在一个字典变量中,该变量包含了外部可以操作的所有信息,获取输出和输出解释如下
107 |
108 | ```python
109 | # 获取输出
110 | dump_output = dump_handle.dump_to_file()
111 | # 其中dump_handle为上述的操作句柄(Dump(xxx)返回值)
112 | ```
113 |
114 | 输出样例:
115 |
116 | ```json
117 | {
118 | "key_id_1": {
119 | "dumped": true,
120 | "main_page": true,
121 | "type": "page",
122 | "local_path": "xxxx",
123 | "page_name": "",
124 | "link_id": "",
125 | "child_pages": [
126 | "xxxxx",
127 | "xxxxx"
128 | ],
129 | "inter_recursion": true,
130 | "inter_soft_page": false
131 | },
132 | "key_id_2": {
133 | "dumped": false,
134 | "main_page": false,
135 | "type": "page",
136 | "local_path": "",
137 | "page_name": "",
138 | "link_id": "xxxxx",
139 | "child_pages": [],
140 | "inter_recursion": true,
141 | "inter_soft_page": false
142 | }
143 | }
144 | ```
145 |
146 | **输出解释**:
147 |
148 | - key_id_1:键值,也是dump下来的页面需要重定位的标志
149 | - dumped:id指向的资源是否成功下载
150 | - main_page:页面是否是主页
151 | - type:该id的类型,database或者page(链接的话是链接指向的页面的类型)
152 | - local_path:导出的文件位置,供后续操作
153 | - page_name:页面名称(后续重定位使用)
154 | - child_pages:包含的子页面或者子数据库
155 | - inter_recursion:内部使用变量,无需关注
156 | - inter_soft_page:内部使用变量,无需关注
157 |
158 | ## 五、注意
159 |
160 | - [ ] 不支持评论内容
161 |
162 | ## 六、附录
163 |
164 | ### 6.1、项目测试
165 |
166 | [项目测试页面](https://delta1037.notion.site/Notion-dump-ed0a3b0f57b34712bc6bafcbdb413d50)
167 |
168 | ### 6.2 Notion dump client
169 |
170 | 基于notion-dump-kernel做的一个对下载下来的页面重新组合文件结构,并对其中的链接部分进行重定位的项目
171 |
172 | [项目Github地址](https://github.com/delta1037/notion-export-local)
173 |
174 |
--------------------------------------------------------------------------------
/img/get_data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/img/get_data.png
--------------------------------------------------------------------------------
/img/parser_structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/delta1037/notion-export-kernel/fa7154054c0f0cfe1dd404f5c6a2a87672816a64/img/parser_structure.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | wheel~=0.36.2
2 | setuptools~=57.0.0
3 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Packaging script for notion-dump-kernel."""
import sys
from pathlib import Path

try:
    from setuptools import setup, find_packages
    from setuptools import Command  # noqa: F401
    from setuptools import Extension  # noqa: F401
except ImportError:
    sys.exit(
        "We need the Python library setuptools to be installed. "
        "Try running: python -m ensurepip"
    )

# Building a wheel needs the separate `wheel` package; fail early with a
# helpful message instead of a cryptic "invalid command" error.
if "bdist_wheel" in sys.argv:
    try:
        import wheel  # noqa: F401
    except ImportError:
        sys.exit(
            "We need both setuptools AND wheel packages installed "
            "for bdist_wheel to work. Try running: pip install wheel"
        )


# Runtime dependencies. python-dateutil is declared here because the README
# instructs users to install it manually before installing this package.
# NOTE(review): confirm the package actually imports dateutil.
REQUIRES = ["notion-client>=0.8.0", "python-dateutil"]

# Use the English README as the long description shown on the package index.
# The file may be absent (e.g. in a stripped source tree), so fall back to an
# empty description rather than failing the build.
_README_PATH = Path(__file__).resolve().parent / "README.md"
if _README_PATH.is_file():
    LONG_DESCRIPTION = _README_PATH.read_text(encoding="utf-8")
else:
    LONG_DESCRIPTION = ""

setup(
    name="notion-dump-kernel",
    version="0.2.4",
    author="delta1037",
    author_email="geniusrabbit@qq.com",
    url="https://github.com/delta1037/notion-export-kernel",
    description="Freely available tools for export Notion page and database.",
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    project_urls={
        "Documentation": "https://github.com/delta1037/notion-export-kernel/blob/main/README.md",
        "Source": "https://github.com/delta1037/notion-export-kernel",
        "Tracker": "https://github.com/delta1037/notion-export-kernel/issues",
    },
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3.9",
        "Topic :: Text Processing :: Markup",
        "Topic :: Software Development :: Libraries :: Python Modules",
    ],
    packages=find_packages(where='.', exclude=(), include=('*',)),
    include_package_data=True,  # done via MANIFEST.in under setuptools
    install_requires=REQUIRES,
)
# Build and publish:
# 1. python setup.py sdist
# 2. twine upload dist/*
57 |
58 |
--------------------------------------------------------------------------------