├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── graph.py
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
└── templates
└── index.html
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.toptal.com/developers/gitignore/api/python,node,macos,linux,windows
3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,node,macos,linux,windows
4 |
5 | ### Linux ###
6 | *~
7 |
8 | # temporary files which can be created if a process still has a handle open of a deleted file
9 | .fuse_hidden*
10 |
11 | # KDE directory preferences
12 | .directory
13 |
14 | # Linux trash folder which might appear on any partition or disk
15 | .Trash-*
16 |
17 | # .nfs files are created when an open file is removed but is still being accessed
18 | .nfs*
19 |
20 | ### macOS ###
21 | # General
22 | .DS_Store
23 | .AppleDouble
24 | .LSOverride
25 |
26 | # Icon must end with two \r
27 | Icon
28 |
29 |
30 | # Thumbnails
31 | ._*
32 |
33 | # Files that might appear in the root of a volume
34 | .DocumentRevisions-V100
35 | .fseventsd
36 | .Spotlight-V100
37 | .TemporaryItems
38 | .Trashes
39 | .VolumeIcon.icns
40 | .com.apple.timemachine.donotpresent
41 |
42 | # Directories potentially created on remote AFP share
43 | .AppleDB
44 | .AppleDesktop
45 | Network Trash Folder
46 | Temporary Items
47 | .apdisk
48 |
49 | ### macOS Patch ###
50 | # iCloud generated files
51 | *.icloud
52 |
53 | ### Node ###
54 | # Logs
55 | logs
56 | *.log
57 | npm-debug.log*
58 | yarn-debug.log*
59 | yarn-error.log*
60 | lerna-debug.log*
61 | .pnpm-debug.log*
62 |
63 | # Diagnostic reports (https://nodejs.org/api/report.html)
64 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
65 |
66 | # Runtime data
67 | pids
68 | *.pid
69 | *.seed
70 | *.pid.lock
71 |
72 | # Directory for instrumented libs generated by jscoverage/JSCover
73 | lib-cov
74 |
75 | # Coverage directory used by tools like istanbul
76 | coverage
77 | *.lcov
78 |
79 | # nyc test coverage
80 | .nyc_output
81 |
82 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
83 | .grunt
84 |
85 | # Bower dependency directory (https://bower.io/)
86 | bower_components
87 |
88 | # node-waf configuration
89 | .lock-wscript
90 |
91 | # Compiled binary addons (https://nodejs.org/api/addons.html)
92 | build/Release
93 |
94 | # Dependency directories
95 | node_modules/
96 | jspm_packages/
97 |
98 | # Snowpack dependency directory (https://snowpack.dev/)
99 | web_modules/
100 |
101 | # TypeScript cache
102 | *.tsbuildinfo
103 |
104 | # Optional npm cache directory
105 | .npm
106 |
107 | # Optional eslint cache
108 | .eslintcache
109 |
110 | # Optional stylelint cache
111 | .stylelintcache
112 |
113 | # Microbundle cache
114 | .rpt2_cache/
115 | .rts2_cache_cjs/
116 | .rts2_cache_es/
117 | .rts2_cache_umd/
118 |
119 | # Optional REPL history
120 | .node_repl_history
121 |
122 | # Output of 'npm pack'
123 | *.tgz
124 |
125 | # Yarn Integrity file
126 | .yarn-integrity
127 |
128 | # dotenv environment variable files
129 | .env
130 | .env.development.local
131 | .env.test.local
132 | .env.production.local
133 | .env.local
134 |
135 | # parcel-bundler cache (https://parceljs.org/)
136 | .cache
137 | .parcel-cache
138 |
139 | # Next.js build output
140 | .next
141 | out
142 |
143 | # Nuxt.js build / generate output
144 | .nuxt
145 | dist
146 |
147 | # Gatsby files
148 | .cache/
149 | # Comment in the public line if your project uses Gatsby and not Next.js
150 | # https://nextjs.org/blog/next-9-1#public-directory-support
151 | # public
152 |
153 | # vuepress build output
154 | .vuepress/dist
155 |
156 | # vuepress v2.x temp and cache directory
157 | .temp
158 |
159 | # Docusaurus cache and generated files
160 | .docusaurus
161 |
162 | # Serverless directories
163 | .serverless/
164 |
165 | # FuseBox cache
166 | .fusebox/
167 |
168 | # DynamoDB Local files
169 | .dynamodb/
170 |
171 | # TernJS port file
172 | .tern-port
173 |
174 | # Stores VSCode versions used for testing VSCode extensions
175 | .vscode-test
176 |
177 | # yarn v2
178 | .yarn/cache
179 | .yarn/unplugged
180 | .yarn/build-state.yml
181 | .yarn/install-state.gz
182 | .pnp.*
183 |
184 | ### Node Patch ###
185 | # Serverless Webpack directories
186 | .webpack/
187 |
188 | # Optional stylelint cache
189 |
190 | # SvelteKit build / generate output
191 | .svelte-kit
192 |
193 | ### Python ###
194 | # Byte-compiled / optimized / DLL files
195 | __pycache__/
196 | *.py[cod]
197 | *$py.class
198 |
199 | # C extensions
200 | *.so
201 |
202 | # Distribution / packaging
203 | .Python
204 | build/
205 | develop-eggs/
206 | dist/
207 | downloads/
208 | eggs/
209 | .eggs/
210 | lib/
211 | lib64/
212 | parts/
213 | sdist/
214 | var/
215 | wheels/
216 | share/python-wheels/
217 | *.egg-info/
218 | .installed.cfg
219 | *.egg
220 | MANIFEST
221 |
222 | # PyInstaller
223 | # Usually these files are written by a python script from a template
224 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
225 | *.manifest
226 | *.spec
227 |
228 | # Installer logs
229 | pip-log.txt
230 | pip-delete-this-directory.txt
231 |
232 | # Unit test / coverage reports
233 | htmlcov/
234 | .tox/
235 | .nox/
236 | .coverage
237 | .coverage.*
238 | nosetests.xml
239 | coverage.xml
240 | *.cover
241 | *.py,cover
242 | .hypothesis/
243 | .pytest_cache/
244 | cover/
245 |
246 | # Translations
247 | *.mo
248 | *.pot
249 |
250 | # Django stuff:
251 | local_settings.py
252 | db.sqlite3
253 | db.sqlite3-journal
254 |
255 | # Flask stuff:
256 | instance/
257 | .webassets-cache
258 |
259 | # Scrapy stuff:
260 | .scrapy
261 |
262 | # Sphinx documentation
264 | docs/_build/
265 |
266 | # PyBuilder
267 | .pybuilder/
268 | target/
269 |
270 | # Jupyter Notebook
271 | .ipynb_checkpoints
272 |
273 | # IPython
274 | profile_default/
275 | ipython_config.py
276 |
277 | # pyenv
278 | # For a library or package, you might want to ignore these files since the code is
279 | # intended to run in multiple environments; otherwise, check them in:
280 | # .python-version
281 |
282 | # pipenv
283 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
284 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
285 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
286 | # install all needed dependencies.
287 | #Pipfile.lock
288 |
289 | # poetry
290 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
291 | # This is especially recommended for binary packages to ensure reproducibility, and is more
292 | # commonly ignored for libraries.
293 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
294 | #poetry.lock
295 |
296 | # pdm
297 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
298 | #pdm.lock
299 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
300 | # in version control.
301 | # https://pdm.fming.dev/#use-with-ide
302 | .pdm.toml
303 |
304 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
305 | __pypackages__/
306 |
307 | # Celery stuff
308 | celerybeat-schedule
309 | celerybeat.pid
310 |
311 | # SageMath parsed files
312 | *.sage.py
313 |
314 | # Environments
315 | .venv
316 | env/
317 | venv/
318 | ENV/
319 | env.bak/
320 | venv.bak/
321 |
322 | # Spyder project settings
323 | .spyderproject
324 | .spyproject
325 |
326 | # Rope project settings
327 | .ropeproject
328 |
329 | # mkdocs documentation
330 | /site
331 |
332 | # mypy
333 | .mypy_cache/
334 | .dmypy.json
335 | dmypy.json
336 |
337 | # Pyre type checker
338 | .pyre/
339 |
340 | # pytype static type analyzer
341 | .pytype/
342 |
343 | # Cython debug symbols
344 | cython_debug/
345 |
346 | # PyCharm
347 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
348 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
349 | # and can be added to the global gitignore or merged into this file. For a more nuclear
350 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
351 | #.idea/
352 |
353 | ### Windows ###
354 | # Windows thumbnail cache files
355 | Thumbs.db
356 | Thumbs.db:encryptable
357 | ehthumbs.db
358 | ehthumbs_vista.db
359 |
360 | # Dump file
361 | *.stackdump
362 |
363 | # Folder config file
364 | [Dd]esktop.ini
365 |
366 | # Recycle Bin used on file shares
367 | $RECYCLE.BIN/
368 |
369 | # Windows Installer files
370 | *.cab
371 | *.msi
372 | *.msix
373 | *.msm
374 | *.msp
375 |
376 | # Windows shortcuts
377 | *.lnk
378 |
379 | # End of https://www.toptal.com/developers/gitignore/api/python,node,macos,linux,windows
380 |
381 | graph.json
382 | data.json
383 | data/
384 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v4.6.0
4 | hooks:
5 | - id: check-docstring-first
6 | - id: check-yaml
7 | - id: double-quote-string-fixer
8 | - id: end-of-file-fixer
9 | - id: requirements-txt-fixer
10 | - id: trailing-whitespace
11 | - repo: https://github.com/PyCQA/flake8
12 | rev: 7.0.0
13 | hooks:
14 | - id: flake8
15 | - repo: https://github.com/asottile/reorder-python-imports
16 | rev: v3.13.0
17 | hooks:
18 | - id: reorder-python-imports
19 | args: [--py37-plus, --add-import, "from __future__ import annotations"]
20 | - repo: https://github.com/asottile/pyupgrade
21 | rev: v3.16.0
22 | hooks:
23 | - id: pyupgrade
24 | args: [--py310-plus]
25 | - repo: https://github.com/pre-commit/mirrors-mypy
26 | rev: v1.10.0
27 | hooks:
28 | - id: mypy
29 | additional_dependencies: ["types-PyYAML", "types-python-dateutil"]
30 | - repo: https://github.com/psf/black
31 | rev: 24.4.2
32 | hooks:
33 | - id: black
34 | additional_dependencies: ["click==8.0.2"]
35 | language_version: python3.10
36 | args: [--skip-string-normalization]
37 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Dominique Garmier
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/DominiqueGarmier/notion-graph/main.svg)](https://results.pre-commit.ci/latest/github/DominiqueGarmier/notion-graph/main)
2 |
3 | # notion-graph
4 |
5 | open-source graph view of your notion pages, inspired by [Obsidian](https://obsidian.md/).
6 |
7 | ## WARNING THIS IS STILL IN DEVELOPMENT
8 |
9 | #### what currently works:
10 |
11 | - a simple flask server (see the gif below)
12 | - background parsing and auto updating (re-parses every 5 minutes by default, configurable via `GRAPH_REFRESH_INTERVAL`)
13 | - retrying logic (it hasn't crashed into an unrecoverable state for me yet)
14 | - partial updates (only parse pages that were edited since last parse)
15 |
16 |
17 |
18 |
19 |
20 | ## Installing
21 |
22 | Clone this repo.
23 |
24 | ```
25 | git clone git@github.com:dominiquegarmier/notion-graph
26 | cd notion-graph
27 | ```
28 |
29 | Install dependencies.
30 |
31 | ```
32 | virtualenv .venv -ppython3.10
33 | source .venv/bin/activate
34 | pip install -r requirements.txt
35 | ```
36 |
37 | ## Setup
38 |
39 | - set the environment variable `NOTION_KEY` to your notion api key that has read access to some pages (see the [notion docs](https://developers.notion.com/docs/create-a-notion-integration)), for example via a `.env` file as sketched below.
40 |
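   | since the app calls `load_dotenv()` on startup, you can also put the key in a `.env` file at the repo root. a minimal sketch (the `GRAPH_*` variables are optional overrides, defaults shown):
   |
   | ```
   | NOTION_KEY=secret_xxx           # your integration token (required)
   | GRAPH_DATA_PATH=data            # where parsed pages and links are stored
   | GRAPH_REFRESH_INTERVAL=300     # seconds between background re-parses
   | GRAPH_N_WORKERS=4               # number of async parser workers
   | ```
   |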
41 | ## Usage
42 |
43 | you can now run the following command to start notion-graph
44 |
45 | ```
46 | python graph.py
47 | ```
48 |
49 | This will automatically discover any page shared with your notion integration. It then creates a task queue to query every discovered page. The initial parse of your documents might take a while, since notion's api is limited to three requests per second. You will notice that the program creates a new folder `data/` containing the parsed pages and links. Subsequent parses only refresh pages that have been edited since the last parse.
50 |
51 | The graph view will be served on `localhost:8080`. Make sure to hit refresh when the parsing is done.
52 |
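   | the underlying graph data is served as json from the `/data` endpoint, so you can also inspect it directly:
   |
   | ```
   | curl http://localhost:8080/data
   | ```
   |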
53 | ## Development
54 |
55 | Install dev-dependencies
56 |
57 | ```
58 | pip install -r requirements-dev.txt
59 | ```
60 |
61 | Install pre-commit hooks.
62 |
63 | ```
64 | pre-commit install
65 | ```
66 |
--------------------------------------------------------------------------------
/graph.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import asyncio
4 | import dataclasses
5 | import json
6 | import logging
7 | import math
8 | import os
9 | import tempfile
10 | from collections import defaultdict
11 | from collections.abc import AsyncGenerator
12 | from collections.abc import Collection
13 | from collections.abc import Generator
14 | from contextlib import asynccontextmanager
15 | from contextlib import contextmanager
16 | from dataclasses import dataclass
17 | from datetime import datetime
18 | from datetime import timezone
19 | from logging import getLogger
20 | from pathlib import Path
21 | from threading import Thread
22 | from typing import Any
23 | from typing import Literal
24 | from typing import NoReturn
25 | from uuid import UUID
26 | from uuid import uuid4
27 |
28 | from aiohttp import ClientSession
29 | from aiolimiter import AsyncLimiter
30 | from dateutil.parser import parse as parse_date
31 | from dotenv import load_dotenv
32 | from flask import Flask
33 | from flask import jsonify
34 | from flask import render_template
35 | from werkzeug import Response
36 |
37 | logger = getLogger(__name__)
38 | logging.basicConfig(level=logging.DEBUG)
39 |
40 | NOTION_URL = 'https://www.notion.so'
41 | NOTION_API_URL = 'https://api.notion.com/v1'
42 |
43 | DEFAULT_DATA_PATH = Path(__file__).parent / 'data'
44 | DEFAULT_REFRESH_INTERVAL = 60 * 5
45 | DEFAULT_N_WORKERS = 4
46 |
47 | # api config
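   | # notion's api allows roughly three requests per second on average;
   | # AsyncLimiter(RATE_LIMIT, RATE_LIMIT_BURST) admits at most RATE_LIMIT
   | # acquisitions per RATE_LIMIT_BURST-second window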
48 | RATE_LIMIT_BURST = 1
49 | RATE_LIMIT = 3 * RATE_LIMIT_BURST
50 | RATE_LIMITER = AsyncLimiter(RATE_LIMIT, RATE_LIMIT_BURST)
51 | TIMEOUT = 30
52 | MAX_RETRY = 1
53 |
54 | SKIP_PROPAGATION_BLOCK_TYPES = (
55 | 'child_page',
56 | 'child_database',
57 | )
58 |
59 |
60 | @dataclass(frozen=True)
61 | class Config:
62 | notion_key: str
63 | data_path: Path
64 | refresh_interval: int
65 | n_workers: int
66 |
67 |
68 | def load_config() -> Config:
69 | load_dotenv()
70 | try:
71 | notion_key = os.environ['NOTION_KEY']
72 | except KeyError:
73 | raise ValueError('Missing NOTION_KEY environment variable')
74 |
75 | data_path = Path(os.environ.get('GRAPH_DATA_PATH', DEFAULT_DATA_PATH))
76 | refresh_interval = int(
77 | os.environ.get('GRAPH_REFRESH_INTERVAL', DEFAULT_REFRESH_INTERVAL)
78 | )
79 | n_workers = int(os.environ.get('GRAPH_N_WORKERS', DEFAULT_N_WORKERS))
80 |
81 | return Config(notion_key, data_path, refresh_interval, n_workers)
82 |
83 |
84 | @dataclass
85 | class Page:
86 | id: UUID
87 | url: str
88 | title: str
89 | last_parsed: datetime
90 |
91 | def __hash__(self) -> int:
92 | return hash(self.id)
93 |
94 | def __eq__(self, other: object) -> bool:
95 | if not isinstance(other, Page):
96 | return NotImplemented
97 | return self.id == other.id
98 |
99 |
100 | def serialize_page(page: Page) -> dict[str, str]:
101 | return {
102 | 'id': str(page.id),
103 | 'url': str(page.url),
104 | 'title': str(page.title),
105 | 'last_parsed': page.last_parsed.isoformat(),
106 | }
107 |
108 |
109 | def deserialize_page(data: dict[str, str]) -> Page:
110 | return Page(
111 | id=UUID(data['id']),
112 | url=data['url'],
113 | title=data['title'],
114 | last_parsed=datetime.fromisoformat(data['last_parsed']),
115 | )
116 |
117 |
118 | @dataclass
119 | class Link:
120 | id: UUID
121 | source: UUID
122 | target: UUID
123 | link_type: Literal['page', 'database', 'mention', 'href']
124 |
125 | def __hash__(self) -> int:
126 | return hash(self.id)
127 |
128 | def __eq__(self, other: object) -> bool:
129 | if not isinstance(other, Link):
130 | return NotImplemented
131 | return self.id == other.id
132 |
133 |
134 | def serialize_link(link: Link) -> dict[str, str]:
135 | return {
136 | 'id': str(link.id),
137 | 'source': str(link.source),
138 | 'target': str(link.target),
139 | 'link_type': link.link_type,
140 | }
141 |
142 |
143 | def deserialize_link(data: dict[str, str]) -> Link:
144 | return Link(
145 | id=UUID(data['id']),
146 | source=UUID(data['source']),
147 | target=UUID(data['target']),
148 | link_type=data['link_type'], # type: ignore
149 | )
150 |
151 |
152 | class Graph:
153 | _pages: dict[UUID, Page]
154 | _links: dict[UUID, Link]
155 |
156 | def __init__(
157 | self,
158 | pages: Collection[Page] | None = None,
159 | links: Collection[Link] | None = None,
160 | ) -> None:
161 | self._pages = {}
162 | self._links = {}
163 |
164 | if pages is None:
165 | pages = []
166 | if links is None:
167 | links = []
168 |
169 | for page in pages:
170 | self.add(page)
171 | for link in links:
172 | self.add(link)
173 |
174 | def __contains__(self, item: Page | UUID) -> bool:
175 | if isinstance(item, Page):
176 | return item.id in self._pages
177 | elif isinstance(item, UUID):
178 | return item in self._pages
179 | else:
180 | raise TypeError(f'Cannot check for item of type {type(item)}')
181 |
182 | @classmethod
183 | def deserialize(cls, data: str) -> Graph:
184 | dct = json.loads(data)
185 | try:
186 | pages = [deserialize_page(page) for page in dct['pages']]
187 | links = [deserialize_link(link) for link in dct['links']]
188 | except Exception as e:
189 | raise ValueError(f'Invalid data {e}')
190 | return cls(pages, links)
191 |
192 | def serialize(self) -> str:
193 | pages = [serialize_page(page) for page in self._pages.values()]
194 | links = [serialize_link(link) for link in self._links.values()]
195 | return json.dumps({'pages': pages, 'links': links})
196 |
197 | def save(self, path: str | os.PathLike) -> None:
198 | with open(path, 'w') as f:
199 | f.write(self.serialize())
200 |
201 | @classmethod
202 | def load(cls, path: str | os.PathLike) -> Graph:
203 | with open(path) as f:
204 | return cls.deserialize(f.read())
205 |
206 | @property
207 | def pages(self) -> list[Page]:
208 | return list(self._pages.values())
209 |
210 | @property
211 | def links(self) -> list[Link]:
212 | return list(self._links.values())
213 |
214 | def add(self, item: Page | Link) -> None:
215 | if isinstance(item, Page):
216 | if item.id in self._pages:
217 | raise ValueError(f'Page {item.id} already exists')
218 | self._pages[item.id] = item
219 | elif isinstance(item, Link):
220 | if item.id in self._links:
221 | raise ValueError(f'Link {item.id} already exists')
222 | self._links[item.id] = item
223 | else:
224 | raise TypeError(f'Cannot add item of type {type(item)}')
225 |
226 | def prune(self) -> Graph:
227 | links = set()
228 | for link in self._links.values():
229 | if link.source in self and link.target in self:
230 | links.add(link)
231 | return self.__class__(list(self._pages.values()), links)
232 |
233 | def update(self, pages: Collection[Page], links: Collection[Link]) -> None:
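   |         # links get a fresh uuid4 each parse, so re-parsing a page would
   |         # accumulate duplicates; drop its old outgoing links before merging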
234 | new_pages = {page.id: page for page in pages}
235 |
236 | delete_ids = []
237 | for link in self._links.values():
238 | if link.source in new_pages:
239 | delete_ids.append(link.id)
240 | for id_ in delete_ids:
241 | del self._links[id_]
242 |
243 | self._pages.update(new_pages)
244 | self._links.update({link.id: link for link in links})
245 |
246 |
247 | @dataclass
248 | class DisplayLink:
249 | source: str
250 | target: str
251 | rotation: float
252 | curvature: float
253 |
254 |
255 | @dataclass
256 | class DisplayNode:
257 | id: str
258 | title: str
259 | url: str
260 |
261 |
262 | @dataclass
263 | class DisplayGraph:
264 | nodes: list[DisplayNode]
265 | links: list[DisplayLink]
266 |
267 |
268 | def to_display_graph(graph: Graph) -> DisplayGraph:
269 | node_ids: set[str] = set()
270 | nodes = []
271 | for page in graph.pages:
272 | node_ids.add(str(page.id))
273 | node = DisplayNode(id=str(page.id), title=page.title, url=page.url)
274 | nodes.append(node)
275 |
276 | links_dict: dict[tuple[UUID, UUID], list[Link]] = defaultdict(list)
277 | for link in graph.links:
278 | if str(link.source) not in node_ids or str(link.target) not in node_ids:
279 | continue
280 | links_dict[(link.source, link.target)].append(link)
281 |
282 | links_list = []
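   |     # parallel edges between the same two nodes are fanned out with distinct
   |     # rotations; self-links get a base curvature so they render as loops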
283 | for ids_tp, links in links_dict.items():
284 | if ids_tp[0] == ids_tp[1]:
285 | base_curvature = 0.5
286 | else:
287 | base_curvature = 0
288 | n = len(links)
289 | for i, link in enumerate(links):
290 | rotation = 2 * math.pi * i / n
291 | new_link = DisplayLink(
292 | source=str(link.source),
293 | target=str(link.target),
294 | rotation=rotation,
295 | curvature=base_curvature + min((n - 1) / 10, 0.5),
296 | )
297 | links_list.append(new_link)
298 |
299 | return DisplayGraph(nodes=nodes, links=links_list)
300 |
301 |
302 | @contextmanager
303 | def persisted_graph(
304 | path: str | Path, flush: bool = False, persist: bool = True
305 | ) -> Generator[Graph, None, None]:
306 | if not flush:
307 | try:
308 | graph = Graph.load(path)
309 | except Exception:
310 | graph = Graph()
311 | logger.warning(f'Could not load graph from {path}, creating new graph.')
312 | else:
313 | graph = Graph()
314 |
315 | try:
316 | yield graph
317 | except Exception:
318 | raise
319 | else:
320 |         if persist:
321 |             Path(path).parent.mkdir(parents=True, exist_ok=True)
322 |             # write the temp file next to the target so os.replace stays atomic
323 |             fd, tmp_path = tempfile.mkstemp(dir=Path(path).parent)
324 |             with open(fd, 'w') as f:
325 |                 f.write(graph.serialize())
326 |             os.replace(tmp_path, path)
327 |
328 |
329 | @asynccontextmanager
330 | async def RateLimitedSession(
331 | config: Config,
332 | auth: str | None = None,
333 | ) -> AsyncGenerator[ClientSession, None]:
334 | auth = auth or config.notion_key
335 | headers = {
336 | 'Authorization': f'Bearer {auth}',
337 | 'Notion-Version': '2022-02-22',
338 | }
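   |     # each session wraps exactly one request (see paginated), so acquiring
   |     # the limiter here throttles individual api calls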
339 | async with RATE_LIMITER:
340 | async with ClientSession(headers=headers) as session:
341 | yield session
342 |
343 |
344 | async def paginated(
345 | method: Literal['GET', 'POST'],
346 | url: str,
347 | config: Config,
348 | initial_params: dict[str, Any] | None = None,
349 | ) -> list[dict]:
350 | results = []
351 |
352 | cursor: str | None = None
353 | has_more = True
354 |
355 | while has_more:
356 | params = initial_params or {}
357 | if cursor is not None:
358 | params = params | {'start_cursor': cursor}
359 | data = {}
360 | async with RateLimitedSession(config=config) as session:
361 | if method == 'GET':
362 | async with session.get(url, params=params) as resp:
363 | data = await resp.json()
364 | elif method == 'POST':
365 | async with session.post(url, json=params) as resp:
366 | data = await resp.json()
367 |
368 | results.extend(data['results'])
369 |
370 | cursor = data.get('next_cursor')
371 | has_more = data.get('has_more', False) and cursor is not None
372 |
373 | return results
374 |
375 |
376 | def _uuid_to_url(uuid: UUID) -> str:
377 | return f"{NOTION_URL}/{str(uuid).replace('-', '')}"
378 |
379 |
380 | def _strip_uuid(href: str) -> UUID:
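   |     # internal hrefs look like '/<uuid>', '/<uuid>#<block>' or '/<uuid>?<query>'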
381 | if not href.startswith('/'):
382 | raise ValueError
383 | no_slash = href[1:]
384 | try:
385 | return UUID(no_slash.split('#')[0])
386 | except ValueError:
387 | pass
388 | try:
389 | return UUID(no_slash.split('?')[0])
390 | except ValueError:
391 | raise
392 |
393 |
394 | def _parse_page(
395 | page_data: dict[str, Any], last_parsed: dict[UUID, datetime]
396 | ) -> Page | None:
397 | # skip archived pages
398 | if page_data['archived']:
399 | return None
400 |
401 | # only parse page if it has been updated since last parse
402 | page_id = UUID(page_data['id'])
403 | if page_id in last_parsed:
404 | last_edited = parse_date(page_data['last_edited_time'])
405 |
406 | time = last_parsed[page_id]
407 | if time.tzinfo is None:
408 | time = time.replace(tzinfo=timezone.utc)
409 |
410 |         if last_edited < time:
411 | return None
412 |
413 | properties = page_data.get('properties', {})
414 | for value in properties.values():
415 | if isinstance(value, dict) and value.get('type') == 'title':
416 | title_rich_text = value.get('title', [])
417 | break
418 | else:
419 | title_rich_text = []
420 |
421 | title_rich_text = [rt for rt in title_rich_text if rt['type'] == 'text']
422 |     title = ''.join(rt['text']['content'] for rt in title_rich_text)
423 |
424 | return Page(
425 | id=page_id,
426 | url=_uuid_to_url(page_id),
427 | title=title,
428 | last_parsed=datetime.now(timezone.utc),
429 | )
430 |
431 |
432 | async def parse_pages(last_parsed: dict[UUID, datetime], config: Config) -> set[Page]:
433 | param = {'filter': {'value': 'page', 'property': 'object'}}
434 | logger.debug('getting page ids...')
435 | resp = await paginated(
436 | 'POST', url=f'{NOTION_API_URL}/search', config=config, initial_params=param
437 | )
438 |
439 | ret = []
440 | for data in resp:
441 | page = _parse_page(page_data=data, last_parsed=last_parsed)
442 | if page is None:
443 | continue
444 | ret.append(page)
445 | return set(ret)
446 |
447 |
448 | def _parse_rich_text(page: UUID, rich_text: dict[str, Any]) -> Link | None:
449 | if rich_text['type'] == 'mention':
450 | mention = rich_text['mention']
451 | if mention['type'] == 'page':
452 | return Link(
453 | id=uuid4(),
454 | source=page,
455 | target=UUID(mention['page']['id']),
456 | link_type='mention',
457 | )
458 | elif rich_text['type'] == 'text':
459 | if rich_text.get('href') is not None:
460 | try:
461 | uuid = _strip_uuid(rich_text['href'])
462 | except ValueError:
463 | logger.debug(f"failed to parse href format: {rich_text['href']}")
464 | return None
465 | return Link(id=uuid4(), source=page, target=uuid, link_type='href')
466 | return None
467 |
468 |
469 | def parse_links(page: UUID, data: dict[str, Any]) -> list[Link]:
470 | block_type = data['type']
471 |     if block_type in SKIP_PROPAGATION_BLOCK_TYPES:
472 | return []
473 | if block_type not in data:
474 | return []
475 |
476 | ret: list[Link] = []
477 | block = data.get(block_type, {})
478 | for rich_text in block.get('rich_text', []):
479 | try:
480 | link = _parse_rich_text(page=page, rich_text=rich_text)
481 | except KeyError:
482 | pass
483 | else:
484 | if link is not None:
485 | ret.append(link)
486 | return ret
487 |
488 |
489 | async def parse_children(
490 | page: UUID, block: UUID, config: Config
491 | ) -> tuple[list[UUID], list[Link]]:
492 | # logger.info(f"parsing children of {block} in {page}...")
493 | resp = await paginated(
494 | 'GET', url=f'{NOTION_API_URL}/blocks/{block}/children', config=config
495 | )
496 |
497 | links: list[Link] = []
498 | children: list[UUID] = []
499 |
500 | for data in resp:
501 | # handle child_pages separately
502 | tp = data['type']
503 |         target = UUID(data['id'])
504 | if tp == 'child_page':
505 | links.append(Link(id=uuid4(), source=page, target=target, link_type='page'))
506 | if tp == 'child_database':
507 | links.append(
508 | Link(id=uuid4(), source=page, target=target, link_type='database')
509 | )
510 |
511 | # handle any other links such as mentions and hrefs
512 | links.extend(parse_links(page=page, data=data))
513 |
514 | # handle propagation to children
515 | if tp in SKIP_PROPAGATION_BLOCK_TYPES:
516 | continue
517 |
518 | if data.get('has_children'):
519 |             children.append(target)
520 |
521 | return children, links
522 |
523 |
524 | @dataclass
525 | class Task:
526 | page: UUID
527 | block: UUID
528 | retry: int = 0
529 |
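   |     # hash/compare by block id so the `enqueued` set deduplicates blocks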
530 | def __hash__(self) -> int:
531 | return hash(self.block)
532 |
533 | def __eq__(self, other: Any) -> bool:
534 | if not isinstance(other, Task):
535 | return NotImplemented
536 | return self.block == other.block
537 |
538 |
539 | async def worker(
540 | queue: asyncio.Queue[Task],
541 | links: set[Link],
542 | enqueued: set[Task],
543 | done: set[Task],
544 | failed: set[Task],
545 | config: Config,
546 | ) -> None:
547 | while True:
548 | task = await queue.get()
549 | try:
550 | children, new_links = await asyncio.wait_for(
551 | parse_children(page=task.page, block=task.block, config=config), TIMEOUT
552 | )
553 | except Exception:
554 | if task.retry >= MAX_RETRY:
555 | logger.debug(f'task failed: {task!r}')
556 | failed.add(task)
557 | else:
558 | logger.debug(f'retrying task: {task!r}')
559 | task.retry += 1
560 | queue.put_nowait(task)
561 | else:
562 |             # no awaits below, so these set/queue updates are atomic on the event loop
563 |             for child in children:
564 |                 # don't parse blocks twice
565 |                 if child not in enqueued:
566 |                     new_task = Task(page=task.page, block=child)
567 |                     enqueued.add(new_task)
568 |                     queue.put_nowait(new_task)
569 |
570 |             for link in new_links:
571 |                 links.add(link)
572 |             done.add(task)
573 | finally:
574 | queue.task_done()
575 |
576 |
577 | async def parse(
578 | last_parsed: dict[UUID, datetime],
579 | config: Config,
580 | ) -> tuple[set[Page], set[Link]]:
581 | pages = await parse_pages(config=config, last_parsed=last_parsed)
582 | links: set[Link] = set()
583 |
584 | # monitor the queue
585 | queue: asyncio.Queue[Task] = asyncio.Queue()
586 | enqueued: set[Task] = set()
587 | done: set[Task] = set()
588 | failed: set[Task] = set()
589 |
590 | workers = []
591 | for _ in range(config.n_workers):
592 | task = asyncio.create_task(
593 | worker(queue, links, enqueued, done, failed, config=config)
594 | )
595 | workers.append(task)
596 |
597 | for page in pages:
598 | new_task = Task(page=page.id, block=page.id)
599 | queue.put_nowait(new_task)
600 | enqueued.add(new_task)
601 |
602 | # wait for all tasks to be done
603 | async def monitor() -> None:
604 | while True:
605 | await asyncio.sleep(1)
606 | logger.debug(
607 | f'ENQUEUED: {len(enqueued)}, DONE: {len(done)}, FAILED: {len(failed)}'
608 | )
609 |
610 | logger_task = asyncio.create_task(monitor())
611 |
612 | await queue.join()
613 |
614 | logger_task.cancel()
615 | logger.debug('work done, cancelling workers...')
616 |
617 | for w in workers:
618 | w.cancel()
619 |
620 | logger.info(f'done: {len(done)}, failed: {len(failed)}')
621 |
622 | return pages, links
623 |
624 |
625 | async def partial_parse(config: Config, flush: bool = False) -> None:
626 | with persisted_graph(config.data_path / 'graph.json', flush=flush) as graph:
627 | last_parsed = {page.id: page.last_parsed for page in graph.pages}
628 | pages, links = await parse(last_parsed=last_parsed, config=config)
629 | graph.update(pages, links)
630 |
631 |
632 | async def run_daemon(config: Config) -> NoReturn:
633 | while True:
634 | try:
635 | logger.info('refreshing graph...')
636 | await partial_parse(config=config, flush=False)
637 | await asyncio.sleep(config.refresh_interval)
638 | except Exception:
639 | logger.exception('error while parsing, retrying in 5s...')
640 | await asyncio.sleep(5)
641 |
642 |
643 | def flask_app(config: Config) -> Flask:
644 | app = Flask(__name__)
645 |
646 | def index() -> Any:
647 | return render_template('index.html')
648 |
649 | def data() -> Response:
650 |         with persisted_graph(config.data_path / 'graph.json', persist=False) as graph:
651 | display_graph = to_display_graph(graph)
652 | return jsonify(dataclasses.asdict(display_graph))
653 |
654 | app.add_url_rule('/', view_func=index)
655 | app.add_url_rule('/data', view_func=data)
656 | return app
657 |
658 |
659 | def main() -> int:
660 | config = load_config()
661 | app = flask_app(config)
662 |
663 | daemon = Thread(target=lambda: asyncio.run(run_daemon(config)))
664 | flask = Thread(target=lambda: app.run(host='0.0.0.0', port=8080))
665 |
666 | daemon.start()
667 | flask.start()
668 |
669 | daemon.join()
670 | flask.join()
671 | return 0
672 |
673 |
674 | if __name__ == '__main__':
675 | raise SystemExit(main())
676 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | black
2 | flake8
3 | mypy
4 | pre-commit
5 | pytest
6 | pytest-asyncio
7 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp
2 | aiolimiter
3 | flask
4 | python-dateutil
5 | python-dotenv
6 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | count = True
3 | statistics = True
4 | max-line-length = 127
5 | extend-exclude = .venv, .pyc
6 | ignore = F401,E123,W503
7 |
8 | [mypy]
9 | python_version = 3.10
10 | warn_return_any = True
11 | warn_unused_configs = True
12 | exclude = (setup.py|build/|tests/)
13 |
14 | [tool:pytest]
15 | python_files = tests/*.py tests/**/*.py tests.py test_*.py *_tests.py
16 | asyncio_mode = auto
17 |
--------------------------------------------------------------------------------
/templates/index.html:
--------------------------------------------------------------------------------
1 | <!-- page markup not preserved in this dump; this template is served at '/' and
2 |      is expected to fetch the graph json from '/data' and render it client-side -->
--------------------------------------------------------------------------------