├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── graph.py ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg └── templates └── index.html /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,node,macos,linux,windows 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,node,macos,linux,windows 4 | 5 | ### Linux ### 6 | *~ 7 | 8 | # temporary files which can be created if a process still has a handle open of a deleted file 9 | .fuse_hidden* 10 | 11 | # KDE directory preferences 12 | .directory 13 | 14 | # Linux trash folder which might appear on any partition or disk 15 | .Trash-* 16 | 17 | # .nfs files are created when an open file is removed but is still being accessed 18 | .nfs* 19 | 20 | ### macOS ### 21 | # General 22 | .DS_Store 23 | .AppleDouble 24 | .LSOverride 25 | 26 | # Icon must end with two \r 27 | Icon 28 | 29 | 30 | # Thumbnails 31 | ._* 32 | 33 | # Files that might appear in the root of a volume 34 | .DocumentRevisions-V100 35 | .fseventsd 36 | .Spotlight-V100 37 | .TemporaryItems 38 | .Trashes 39 | .VolumeIcon.icns 40 | .com.apple.timemachine.donotpresent 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | ### macOS Patch ### 50 | # iCloud generated files 51 | *.icloud 52 | 53 | ### Node ### 54 | # Logs 55 | logs 56 | *.log 57 | npm-debug.log* 58 | yarn-debug.log* 59 | yarn-error.log* 60 | lerna-debug.log* 61 | .pnpm-debug.log* 62 | 63 | # Diagnostic reports (https://nodejs.org/api/report.html) 64 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 65 | 66 | # Runtime data 67 | pids 68 | *.pid 69 | *.seed 70 | *.pid.lock 71 | 72 | # Directory for instrumented libs generated by jscoverage/JSCover 73 | lib-cov 74 | 75 | # Coverage directory used by tools like istanbul 76 | coverage 77 | *.lcov 78 | 79 | # nyc test coverage 80 | .nyc_output 81 | 82 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 83 | .grunt 84 | 85 | # Bower dependency directory (https://bower.io/) 86 | bower_components 87 | 88 | # node-waf configuration 89 | .lock-wscript 90 | 91 | # Compiled binary addons (https://nodejs.org/api/addons.html) 92 | build/Release 93 | 94 | # Dependency directories 95 | node_modules/ 96 | jspm_packages/ 97 | 98 | # Snowpack dependency directory (https://snowpack.dev/) 99 | web_modules/ 100 | 101 | # TypeScript cache 102 | *.tsbuildinfo 103 | 104 | # Optional npm cache directory 105 | .npm 106 | 107 | # Optional eslint cache 108 | .eslintcache 109 | 110 | # Optional stylelint cache 111 | .stylelintcache 112 | 113 | # Microbundle cache 114 | .rpt2_cache/ 115 | .rts2_cache_cjs/ 116 | .rts2_cache_es/ 117 | .rts2_cache_umd/ 118 | 119 | # Optional REPL history 120 | .node_repl_history 121 | 122 | # Output of 'npm pack' 123 | *.tgz 124 | 125 | # Yarn Integrity file 126 | .yarn-integrity 127 | 128 | # dotenv environment variable files 129 | .env 130 | .env.development.local 131 | .env.test.local 132 | .env.production.local 133 | .env.local 134 | 135 | # parcel-bundler cache (https://parceljs.org/) 136 | .cache 137 | .parcel-cache 138 | 139 | # Next.js build output 140 | .next 141 | out 142 | 143 | # Nuxt.js build / generate output 144 | .nuxt 145 | dist 146 | 147 | # Gatsby files 148 | .cache/ 149 | # Comment in the public line in if your project uses Gatsby and not Next.js 
150 | # https://nextjs.org/blog/next-9-1#public-directory-support 151 | # public 152 | 153 | # vuepress build output 154 | .vuepress/dist 155 | 156 | # vuepress v2.x temp and cache directory 157 | .temp 158 | 159 | # Docusaurus cache and generated files 160 | .docusaurus 161 | 162 | # Serverless directories 163 | .serverless/ 164 | 165 | # FuseBox cache 166 | .fusebox/ 167 | 168 | # DynamoDB Local files 169 | .dynamodb/ 170 | 171 | # TernJS port file 172 | .tern-port 173 | 174 | # Stores VSCode versions used for testing VSCode extensions 175 | .vscode-test 176 | 177 | # yarn v2 178 | .yarn/cache 179 | .yarn/unplugged 180 | .yarn/build-state.yml 181 | .yarn/install-state.gz 182 | .pnp.* 183 | 184 | ### Node Patch ### 185 | # Serverless Webpack directories 186 | .webpack/ 187 | 188 | # Optional stylelint cache 189 | 190 | # SvelteKit build / generate output 191 | .svelte-kit 192 | 193 | ### Python ### 194 | # Byte-compiled / optimized / DLL files 195 | __pycache__/ 196 | *.py[cod] 197 | *$py.class 198 | 199 | # C extensions 200 | *.so 201 | 202 | # Distribution / packaging 203 | .Python 204 | build/ 205 | develop-eggs/ 206 | dist/ 207 | downloads/ 208 | eggs/ 209 | .eggs/ 210 | lib/ 211 | lib64/ 212 | parts/ 213 | sdist/ 214 | var/ 215 | wheels/ 216 | share/python-wheels/ 217 | *.egg-info/ 218 | .installed.cfg 219 | *.egg 220 | MANIFEST 221 | 222 | # PyInstaller 223 | # Usually these files are written by a python script from a template 224 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 225 | *.manifest 226 | *.spec 227 | 228 | # Installer logs 229 | pip-log.txt 230 | pip-delete-this-directory.txt 231 | 232 | # Unit test / coverage reports 233 | htmlcov/ 234 | .tox/ 235 | .nox/ 236 | .coverage 237 | .coverage.* 238 | nosetests.xml 239 | coverage.xml 240 | *.cover 241 | *.py,cover 242 | .hypothesis/ 243 | .pytest_cache/ 244 | cover/ 245 | 246 | # Translations 247 | *.mo 248 | *.pot 249 | 250 | # Django stuff: 251 | local_settings.py 252 | db.sqlite3 253 | db.sqlite3-journal 254 | 255 | # Flask stuff: 256 | instance/ 257 | .webassets-cache 258 | 259 | # Scrapy stuff: 260 | .scrapy 261 | 262 | # Sphinx documentation 263 | types-python-dateutil 264 | docs/_build/ 265 | 266 | # PyBuilder 267 | .pybuilder/ 268 | target/ 269 | 270 | # Jupyter Notebook 271 | .ipynb_checkpoints 272 | 273 | # IPython 274 | profile_default/ 275 | ipython_config.py 276 | 277 | # pyenv 278 | # For a library or package, you might want to ignore these files since the code is 279 | # intended to run in multiple environments; otherwise, check them in: 280 | # .python-version 281 | 282 | # pipenv 283 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 284 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 285 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 286 | # install all needed dependencies. 287 | #Pipfile.lock 288 | 289 | # poetry 290 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 291 | # This is especially recommended for binary packages to ensure reproducibility, and is more 292 | # commonly ignored for libraries. 293 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 294 | #poetry.lock 295 | 296 | # pdm 297 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
298 | #pdm.lock 299 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 300 | # in version control. 301 | # https://pdm.fming.dev/#use-with-ide 302 | .pdm.toml 303 | 304 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 305 | __pypackages__/ 306 | 307 | # Celery stuff 308 | celerybeat-schedule 309 | celerybeat.pid 310 | 311 | # SageMath parsed files 312 | *.sage.py 313 | 314 | # Environments 315 | .venv 316 | env/ 317 | venv/ 318 | ENV/ 319 | env.bak/ 320 | venv.bak/ 321 | 322 | # Spyder project settings 323 | .spyderproject 324 | .spyproject 325 | 326 | # Rope project settings 327 | .ropeproject 328 | 329 | # mkdocs documentation 330 | /site 331 | 332 | # mypy 333 | .mypy_cache/ 334 | .dmypy.json 335 | dmypy.json 336 | 337 | # Pyre type checker 338 | .pyre/ 339 | 340 | # pytype static type analyzer 341 | .pytype/ 342 | 343 | # Cython debug symbols 344 | cython_debug/ 345 | 346 | # PyCharm 347 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 348 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 349 | # and can be added to the global gitignore or merged into this file. For a more nuclear 350 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 351 | #.idea/ 352 | 353 | ### Windows ### 354 | # Windows thumbnail cache files 355 | Thumbs.db 356 | Thumbs.db:encryptable 357 | ehthumbs.db 358 | ehthumbs_vista.db 359 | 360 | # Dump file 361 | *.stackdump 362 | 363 | # Folder config file 364 | [Dd]esktop.ini 365 | 366 | # Recycle Bin used on file shares 367 | $RECYCLE.BIN/ 368 | 369 | # Windows Installer files 370 | *.cab 371 | *.msi 372 | *.msix 373 | *.msm 374 | *.msp 375 | 376 | # Windows shortcuts 377 | *.lnk 378 | 379 | # End of https://www.toptal.com/developers/gitignore/api/python,node,macos,linux,windows 380 | 381 | graph.json 382 | data.json 383 | data/ 384 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: check-docstring-first 6 | - id: check-yaml 7 | - id: double-quote-string-fixer 8 | - id: end-of-file-fixer 9 | - id: requirements-txt-fixer 10 | - id: trailing-whitespace 11 | - repo: https://github.com/PyCQA/flake8 12 | rev: 7.0.0 13 | hooks: 14 | - id: flake8 15 | - repo: https://github.com/asottile/reorder-python-imports 16 | rev: v3.13.0 17 | hooks: 18 | - id: reorder-python-imports 19 | args: [--py37-plus, --add-import, "from __future__ import annotations"] 20 | - repo: https://github.com/asottile/pyupgrade 21 | rev: v3.16.0 22 | hooks: 23 | - id: pyupgrade 24 | args: [--py310-plus] 25 | - repo: https://github.com/pre-commit/mirrors-mypy 26 | rev: v1.10.0 27 | hooks: 28 | - id: mypy 29 | additional_dependencies: ["types-PyYAML", "types-python-dateutil"] 30 | - repo: https://github.com/psf/black 31 | rev: 24.4.2 32 | hooks: 33 | - id: black 34 | additional_dependencies: ["click==8.0.2"] 35 | language_version: python3.10 36 | args: [--skip-string-normalization] 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Dominique Garmier 4 | 5 | Permission is hereby granted, free of 
charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/DominiqueGarmier/notion-graph/main.svg)](https://results.pre-commit.ci/latest/github/DominiqueGarmier/notion-graph/main) 2 | 3 | # notion-graph 4 | 5 | open-source graph view of your Notion pages, inspired by [Obsidian](https://obsidian.md/). 6 | 7 | ## WARNING: THIS IS STILL IN DEVELOPMENT 8 | 9 | #### What currently works: 10 | 11 | - a simple Flask server (see the gif below) 12 | - background parsing and auto-updating (parses every X minutes automatically) 13 | - retrying logic (it hasn't crashed into an unrecoverable state for me yet) 14 | - partial updates (only parses pages that were edited since the last parse) 15 | 16 |
17 | *notion-graph preview* 18 |
19 | 20 | ## Installing 21 | 22 | Clone this repo. 23 | 24 | ``` 25 | git clone git@github.com:dominiquegarmier/notion-graph 26 | cd notion-graph 27 | ``` 28 | 29 | Install dependencies. 30 | 31 | ``` 32 | virtualenv .venv -ppython3.10 33 | source .venv/bin/activate 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | ## Setup 38 | 39 | - Set the environment variable `NOTION_KEY` to your Notion API key that has read access to some pages (see the [notion docs](https://developers.notion.com/docs/create-a-notion-integration)). Since the program calls `load_dotenv()`, a `.env` file in the project root works too. 40 | 41 | ## Usage 42 | 43 | You can now run the following command to start notion-graph: 44 | 45 | ``` 46 | python graph.py 47 | ``` 48 | 49 | This will automatically discover any page shared with your Notion integration. It then creates a task queue to query every discovered page. The initial parse of your pages might take a while, since Notion's API is limited to three requests per second. The program will create a new folder `data/`, which contains the parsed pages and links. Subsequent parses only refresh pages that have been edited since the last parse. 50 | 51 | The graph view will be served on `localhost:8080`. Make sure to hit refresh when the parsing is done. 52 | 53 | ## Development 54 | 55 | Install dev dependencies. 56 | 57 | ``` 58 | pip install -r requirements-dev.txt 59 | ``` 60 | 61 | Install pre-commit hooks. 62 | 63 | ``` 64 | pre-commit install 65 | ``` 66 | -------------------------------------------------------------------------------- /graph.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import dataclasses 5 | import json 6 | import logging 7 | import math 8 | import os 9 | import tempfile 10 | from collections import defaultdict 11 | from collections.abc import AsyncGenerator 12 | from collections.abc import Collection 13 | from collections.abc import Generator 14 | from contextlib import asynccontextmanager 15 | from contextlib import contextmanager 16 | from dataclasses import dataclass 17 | from datetime import datetime 18 | from datetime import timezone 19 | from logging import getLogger 20 | from pathlib import Path 21 | from threading import Thread 22 | from typing import Any 23 | from typing import Literal 24 | from typing import NoReturn 25 | from uuid import UUID 26 | from uuid import uuid4 27 | 28 | from aiohttp import ClientSession 29 | from aiolimiter import AsyncLimiter 30 | from dateutil.parser import parse as parse_date 31 | from dotenv import load_dotenv 32 | from flask import Flask 33 | from flask import jsonify 34 | from flask import render_template 35 | from werkzeug import Response 36 | 37 | logger = getLogger(__name__) 38 | logging.basicConfig(level=logging.DEBUG) 39 | 40 | NOTION_URL = 'https://www.notion.so' 41 | NOTION_API_URL = 'https://api.notion.com/v1' 42 | 43 | DEFAULT_DATA_PATH = Path(__file__).parent / 'data' 44 | DEFAULT_REFRESH_INTERVAL = 60 * 5 45 | DEFAULT_N_WORKERS = 4 46 | 47 | # api config: three requests per second, matching Notion's rate limit 48 | RATE_LIMIT_BURST = 1 49 | RATE_LIMIT = 3 * RATE_LIMIT_BURST 50 | RATE_LIMITER = AsyncLimiter(RATE_LIMIT, RATE_LIMIT_BURST) 51 | TIMEOUT = 30 52 | MAX_RETRY = 1 53 | 54 | SKIP_PROPAGATION_BLOCK_TYPES = ( 55 | 'child_page', 56 | 'child_database', 57 | ) 58 | 59 | 60 | @dataclass(frozen=True) 61 | class Config: 62 | notion_key: str 63 | data_path: Path 64 | refresh_interval: int 65 | n_workers: int 66 | 67 | 68 | def load_config() -> Config: 69 | load_dotenv() 70 | try: 71 | notion_key
= os.environ['NOTION_KEY'] 72 | except KeyError: 73 | raise ValueError('Missing NOTION_KEY environment variable') 74 | 75 | data_path = Path(os.environ.get('GRAPH_DATA_PATH', DEFAULT_DATA_PATH)) 76 | refresh_interval = int( 77 | os.environ.get('GRAPH_REFRESH_INTERVAL', DEFAULT_REFRESH_INTERVAL) 78 | ) 79 | n_workers = int(os.environ.get('GRAPH_N_WORKERS', DEFAULT_N_WORKERS)) 80 | 81 | return Config(notion_key, data_path, refresh_interval, n_workers) 82 | 83 | 84 | @dataclass 85 | class Page: 86 | id: UUID 87 | url: str 88 | title: str 89 | last_parsed: datetime 90 | 91 | def __hash__(self) -> int: 92 | return hash(self.id) 93 | 94 | def __eq__(self, other: object) -> bool: 95 | if not isinstance(other, Page): 96 | return NotImplemented 97 | return self.id == other.id 98 | 99 | 100 | def serialize_page(page: Page) -> dict[str, str]: 101 | return { 102 | 'id': str(page.id), 103 | 'url': str(page.url), 104 | 'title': str(page.title), 105 | 'last_parsed': page.last_parsed.isoformat(), 106 | } 107 | 108 | 109 | def deserialize_page(data: dict[str, str]) -> Page: 110 | return Page( 111 | id=UUID(data['id']), 112 | url=data['url'], 113 | title=data['title'], 114 | last_parsed=datetime.fromisoformat(data['last_parsed']), 115 | ) 116 | 117 | 118 | @dataclass 119 | class Link: 120 | id: UUID 121 | source: UUID 122 | target: UUID 123 | link_type: Literal['page', 'database', 'mention', 'href'] 124 | 125 | def __hash__(self) -> int: 126 | return hash(self.id) 127 | 128 | def __eq__(self, other: object) -> bool: 129 | if not isinstance(other, Link): 130 | return NotImplemented 131 | return self.id == other.id 132 | 133 | 134 | def serialize_link(link: Link) -> dict[str, str]: 135 | return { 136 | 'id': str(link.id), 137 | 'source': str(link.source), 138 | 'target': str(link.target), 139 | 'link_type': link.link_type, 140 | } 141 | 142 | 143 | def deserialize_link(data: dict[str, str]) -> Link: 144 | return Link( 145 | id=UUID(data['id']), 146 | source=UUID(data['source']), 147 | target=UUID(data['target']), 148 | link_type=data['link_type'], # type: ignore 149 | ) 150 | 151 | 152 | class Graph: 153 | _pages: dict[UUID, Page] 154 | _links: dict[UUID, Link] 155 | 156 | def __init__( 157 | self, 158 | pages: Collection[Page] | None = None, 159 | links: Collection[Link] | None = None, 160 | ) -> None: 161 | self._pages = {} 162 | self._links = {} 163 | 164 | if pages is None: 165 | pages = [] 166 | if links is None: 167 | links = [] 168 | 169 | for page in pages: 170 | self.add(page) 171 | for link in links: 172 | self.add(link) 173 | 174 | def __contains__(self, item: Page | UUID) -> bool: 175 | if isinstance(item, Page): 176 | return item.id in self._pages 177 | elif isinstance(item, UUID): 178 | return item in self._pages 179 | else: 180 | raise TypeError(f'Cannot check for item of type {type(item)}') 181 | 182 | @classmethod 183 | def deserialize(cls, data: str) -> Graph: 184 | dct = json.loads(data) 185 | try: 186 | pages = [deserialize_page(page) for page in dct['pages']] 187 | links = [deserialize_link(link) for link in dct['links']] 188 | except Exception as e: 189 | raise ValueError(f'Invalid data {e}') 190 | return cls(pages, links) 191 | 192 | def serialize(self) -> str: 193 | pages = [serialize_page(page) for page in self._pages.values()] 194 | links = [serialize_link(link) for link in self._links.values()] 195 | return json.dumps({'pages': pages, 'links': links}) 196 | 197 | def save(self, path: str | os.PathLike) -> None: 198 | with open(path, 'w') as f: 199 | f.write(self.serialize()) 200 | 
201 | @classmethod 202 | def load(cls, path: str | os.PathLike) -> Graph: 203 | with open(path) as f: 204 | return cls.deserialize(f.read()) 205 | 206 | @property 207 | def pages(self) -> list[Page]: 208 | return list(self._pages.values()) 209 | 210 | @property 211 | def links(self) -> list[Link]: 212 | return list(self._links.values()) 213 | 214 | def add(self, item: Page | Link) -> None: 215 | if isinstance(item, Page): 216 | if item.id in self._pages: 217 | raise ValueError(f'Page {item.id} already exists') 218 | self._pages[item.id] = item 219 | elif isinstance(item, Link): 220 | if item.id in self._links: 221 | raise ValueError(f'Link {item.id} already exists') 222 | self._links[item.id] = item 223 | else: 224 | raise TypeError(f'Cannot add item of type {type(item)}') 225 | 226 | def prune(self) -> Graph: 227 | links = set() 228 | for link in self._links.values(): 229 | if link.source in self and link.target in self: 230 | links.add(link) 231 | return self.__class__(list(self._pages.values()), links) 232 | 233 | def update(self, pages: Collection[Page], links: Collection[Link]) -> None: 234 | new_pages = {page.id: page for page in pages} 235 | 236 | delete_ids = [] 237 | for link in self._links.values(): 238 | if link.source in new_pages: 239 | delete_ids.append(link.id) 240 | for id_ in delete_ids: 241 | del self._links[id_] 242 | 243 | self._pages.update(new_pages) 244 | self._links.update({link.id: link for link in links}) 245 | 246 | 247 | @dataclass 248 | class DisplayLink: 249 | source: str 250 | target: str 251 | rotation: float 252 | curvature: float 253 | 254 | 255 | @dataclass 256 | class DisplayNode: 257 | id: str 258 | title: str 259 | url: str 260 | 261 | 262 | @dataclass 263 | class DisplayGraph: 264 | nodes: list[DisplayNode] 265 | links: list[DisplayLink] 266 | 267 | 268 | def to_display_graph(graph: Graph) -> DisplayGraph: 269 | node_ids: set[str] = set() 270 | nodes = [] 271 | for page in graph.pages: 272 | node_ids.add(str(page.id)) 273 | node = DisplayNode(id=str(page.id), title=page.title, url=page.url) 274 | nodes.append(node) 275 | 276 | links_dict: dict[tuple[UUID, UUID], list[Link]] = defaultdict(list) 277 | for link in graph.links: 278 | if str(link.source) not in node_ids or str(link.target) not in node_ids: 279 | continue 280 | links_dict[(link.source, link.target)].append(link) 281 | 282 | links_list = [] 283 | for ids_tp, links in links_dict.items(): 284 | if ids_tp[0] == ids_tp[1]: 285 | base_curvature = 0.5 286 | else: 287 | base_curvature = 0 288 | n = len(links) 289 | for i, link in enumerate(links): 290 | rotation = 2 * math.pi * i / n 291 | new_link = DisplayLink( 292 | source=str(link.source), 293 | target=str(link.target), 294 | rotation=rotation, 295 | curvature=base_curvature + min((n - 1) / 10, 0.5), 296 | ) 297 | links_list.append(new_link) 298 | 299 | return DisplayGraph(nodes=nodes, links=links_list) 300 | 301 | 302 | @contextmanager 303 | def persisted_graph( 304 | path: str | Path, flush: bool = False, persist: bool = True 305 | ) -> Generator[Graph, None, None]: 306 | if not flush: 307 | try: 308 | graph = Graph.load(path) 309 | except Exception: 310 | graph = Graph() 311 | logger.warning(f'Could not load graph from {path}, creating new graph.') 312 | else: 313 | graph = Graph() 314 | 315 | try: 316 | yield graph 317 | except Exception: 318 | raise 319 | else: 320 | if persist: 321 | Path(path).parent.mkdir(parents=True, exist_ok=True) 322 | # write to a temp file in the target directory: os.replace stays atomic and never crosses filesystems 323 | fd, tmp_path = tempfile.mkstemp(dir=Path(path).parent) 324 | with open(fd, 'w') as f: 325 | f.write(graph.serialize()) 326 | os.replace(tmp_path, path) 327 | 328 | 329 | @asynccontextmanager 330 | async def RateLimitedSession( 331 | config: Config, 332 | auth: str | None = None, 333 | ) -> AsyncGenerator[ClientSession, None]: 334 | auth = auth or config.notion_key 335 | headers = { 336 | 'Authorization': f'Bearer {auth}', 337 | 'Notion-Version': '2022-02-22', 338 | } 339 | async with RATE_LIMITER:  # paces session creation; paginated() opens one session per request 340 | async with ClientSession(headers=headers) as session: 341 | yield session 342 | 343 | 344 | async def paginated( 345 | method: Literal['GET', 'POST'], 346 | url: str, 347 | config: Config, 348 | initial_params: dict[str, Any] | None = None, 349 | ) -> list[dict]: 350 | results = [] 351 | 352 | cursor: str | None = None 353 | has_more = True 354 | 355 | while has_more: 356 | params = initial_params or {} 357 | if cursor is not None: 358 | params = params | {'start_cursor': cursor} 359 | data = {} 360 | async with RateLimitedSession(config=config) as session: 361 | if method == 'GET': 362 | async with session.get(url, params=params) as resp: 363 | data = await resp.json() 364 | elif method == 'POST': 365 | async with session.post(url, json=params) as resp: 366 | data = await resp.json() 367 | 368 | results.extend(data['results']) 369 | 370 | cursor = data.get('next_cursor') 371 | has_more = data.get('has_more', False) and cursor is not None 372 | 373 | return results 374 | 375 | 376 | def _uuid_to_url(uuid: UUID) -> str: 377 | return f"{NOTION_URL}/{str(uuid).replace('-', '')}" 378 | 379 | 380 | def _strip_uuid(href: str) -> UUID: 381 | if not href.startswith('/'): 382 | raise ValueError 383 | no_slash = href[1:] 384 | try: 385 | return UUID(no_slash.split('#')[0]) 386 | except ValueError: 387 | pass 388 | try: 389 | return UUID(no_slash.split('?')[0]) 390 | except ValueError: 391 | raise 392 | 393 | 394 | def _parse_page( 395 | page_data: dict[str, Any], last_parsed: dict[UUID, datetime] 396 | ) -> Page | None: 397 | # skip archived pages 398 | if page_data['archived']: 399 | return None 400 | 401 | # only parse page if it has been updated since last parse 402 | page_id = UUID(page_data['id']) 403 | if page_id in last_parsed: 404 | last_edited = parse_date(page_data['last_edited_time']) 405 | 406 | time = last_parsed[page_id] 407 | if time.tzinfo is None: 408 | time = time.replace(tzinfo=timezone.utc) 409 | 410 | if last_edited < time:  # compare against the tz-normalized timestamp 411 | return None 412 | 413 | properties = page_data.get('properties', {}) 414 | for value in properties.values(): 415 | if isinstance(value, dict) and value.get('type') == 'title': 416 | title_rich_text = value.get('title', []) 417 | break 418 | else: 419 | title_rich_text = [] 420 | 421 | title_rich_text = [rt for rt in title_rich_text if rt['type'] == 'text'] 422 | title = '-'.join([rt['text']['content'] for rt in title_rich_text]) 423 | 424 | return Page( 425 | id=page_id, 426 | url=_uuid_to_url(page_id), 427 | title=title, 428 | last_parsed=datetime.now(timezone.utc), 429 | ) 430 | 431 | 432 | async def parse_pages(last_parsed: dict[UUID, datetime], config: Config) -> set[Page]: 433 | param = {'filter': {'value': 'page', 'property': 'object'}} 434 | logger.debug('getting page ids...') 435 | resp = await paginated( 436 | 'POST', url=f'{NOTION_API_URL}/search', config=config, initial_params=param 437 | ) 438 | 439 | ret = [] 440 | for data in resp: 441 | page = _parse_page(page_data=data, last_parsed=last_parsed) 442 | if page is None: 443 | continue 444 | ret.append(page) 445 | return set(ret) 446 | 447 | 448 | def _parse_rich_text(page:
UUID, rich_text: dict[str, Any]) -> Link | None: 449 | if rich_text['type'] == 'mention': 450 | mention = rich_text['mention'] 451 | if mention['type'] == 'page': 452 | return Link( 453 | id=uuid4(), 454 | source=page, 455 | target=UUID(mention['page']['id']), 456 | link_type='mention', 457 | ) 458 | elif rich_text['type'] == 'text': 459 | if rich_text.get('href') is not None: 460 | try: 461 | uuid = _strip_uuid(rich_text['href']) 462 | except ValueError: 463 | logger.debug(f"failed to parse href format: {rich_text['href']}") 464 | return None 465 | return Link(id=uuid4(), source=page, target=uuid, link_type='href') 466 | return None 467 | 468 | 469 | def parse_links(page: UUID, data: dict[str, Any]) -> list[Link]: 470 | block_type = data['type'] 471 | if block_type in ('child_page', 'child_database'): 472 | return [] 473 | if block_type not in data: 474 | return [] 475 | 476 | ret: list[Link] = [] 477 | block = data.get(block_type, {}) 478 | for rich_text in block.get('rich_text', []): 479 | try: 480 | link = _parse_rich_text(page=page, rich_text=rich_text) 481 | except KeyError: 482 | pass 483 | else: 484 | if link is not None: 485 | ret.append(link) 486 | return ret 487 | 488 | 489 | async def parse_children( 490 | page: UUID, block: UUID, config: Config 491 | ) -> tuple[list[UUID], list[Link]]: 492 | # logger.info(f"parsing children of {block} in {page}...") 493 | resp = await paginated( 494 | 'GET', url=f'{NOTION_API_URL}/blocks/{block}/children', config=config 495 | ) 496 | 497 | links: list[Link] = [] 498 | children: list[UUID] = [] 499 | 500 | for data in resp: 501 | # handle child_pages separately 502 | tp = data['type'] 503 | target = UUID(data['id'])  # convert once so Link targets are real UUIDs 504 | if tp == 'child_page': 505 | links.append(Link(id=uuid4(), source=page, target=target, link_type='page')) 506 | if tp == 'child_database': 507 | links.append( 508 | Link(id=uuid4(), source=page, target=target, link_type='database') 509 | ) 510 | 511 | # handle any other links such as mentions and hrefs 512 | links.extend(parse_links(page=page, data=data)) 513 | 514 | # handle propagation to children 515 | if tp in SKIP_PROPAGATION_BLOCK_TYPES: 516 | continue 517 | 518 | if data.get('has_children'): 519 | children.append(target) 520 | 521 | return children, links 522 | 523 | 524 | @dataclass 525 | class Task: 526 | page: UUID 527 | block: UUID 528 | retry: int = 0 529 | 530 | def __hash__(self) -> int: 531 | return hash(self.block) 532 | 533 | def __eq__(self, other: Any) -> bool: 534 | if not isinstance(other, Task): 535 | return NotImplemented 536 | return self.block == other.block 537 | 538 | 539 | async def worker( 540 | queue: asyncio.Queue[Task], 541 | links: set[Link], 542 | enqueued: set[Task], 543 | done: set[Task], 544 | failed: set[Task], 545 | config: Config, 546 | ): 547 | while True: 548 | task = await queue.get() 549 | try: 550 | children, new_links = await asyncio.wait_for( 551 | parse_children(page=task.page, block=task.block, config=config), TIMEOUT 552 | ) 553 | except Exception: 554 | if task.retry >= MAX_RETRY: 555 | logger.debug(f'task failed: {task!r}') 556 | failed.add(task) 557 | else: 558 | logger.debug(f'retrying task: {task!r}') 559 | task.retry += 1 560 | queue.put_nowait(task) 561 | else: 562 | # no lock needed: all workers run on one event loop and there is no await below 563 | for child in children: 564 | # don't parse blocks twice 565 | if child not in enqueued: 566 | new_task = Task(page=task.page, block=child) 567 | enqueued.add(new_task) 568 | queue.put_nowait(new_task) 569 | 570 | for link in new_links: 571 | links.add(link) 572 |
done.add(task) 573 | finally: 574 | queue.task_done() 575 | 576 | 577 | async def parse( 578 | last_parsed: dict[UUID, datetime], 579 | config: Config, 580 | ) -> tuple[set[Page], set[Link]]: 581 | pages = await parse_pages(config=config, last_parsed=last_parsed) 582 | links: set[Link] = set() 583 | 584 | # monitor the queue 585 | queue: asyncio.Queue[Task] = asyncio.Queue() 586 | enqueued: set[Task] = set() 587 | done: set[Task] = set() 588 | failed: set[Task] = set() 589 | 590 | workers = [] 591 | for _ in range(config.n_workers): 592 | task = asyncio.create_task( 593 | worker(queue, links, enqueued, done, failed, config=config) 594 | ) 595 | workers.append(task) 596 | 597 | for page in pages: 598 | new_task = Task(page=page.id, block=page.id) 599 | queue.put_nowait(new_task) 600 | enqueued.add(new_task) 601 | 602 | # wait for all tasks to be done 603 | async def monitor() -> None: 604 | while True: 605 | await asyncio.sleep(1) 606 | logger.debug( 607 | f'ENQUEUED: {len(enqueued)}, DONE: {len(done)}, FAILED: {len(failed)}' 608 | ) 609 | 610 | logger_task = asyncio.create_task(monitor()) 611 | 612 | await queue.join() 613 | 614 | logger_task.cancel() 615 | logger.debug('work done, cancelling workers...') 616 | 617 | for w in workers: 618 | w.cancel() 619 | 620 | logger.info(f'done: {len(done)}, failed: {len(failed)}') 621 | 622 | return pages, links 623 | 624 | 625 | async def partial_parse(config: Config, flush: bool = False) -> None: 626 | with persisted_graph(config.data_path / 'graph.json', flush=flush) as graph: 627 | last_parsed = {page.id: page.last_parsed for page in graph.pages} 628 | pages, links = await parse(last_parsed=last_parsed, config=config) 629 | graph.update(pages, links) 630 | 631 | 632 | async def run_daemon(config: Config) -> NoReturn: 633 | while True: 634 | try: 635 | logger.info('refreshing graph...') 636 | await partial_parse(config=config, flush=False) 637 | await asyncio.sleep(config.refresh_interval) 638 | except Exception: 639 | logger.exception('error while parsing, retrying in 5s...') 640 | await asyncio.sleep(5) 641 | 642 | 643 | def flask_app(config: Config) -> Flask: 644 | app = Flask(__name__) 645 | 646 | def index() -> Any: 647 | return render_template('index.html') 648 | 649 | def data() -> Response: 650 | # read-only view: persist=False avoids rewriting the file on every request 651 | with persisted_graph(config.data_path / 'graph.json', persist=False) as graph: 652 | display_graph = to_display_graph(graph) 653 | return jsonify(dataclasses.asdict(display_graph)) 654 | 655 | app.add_url_rule('/', view_func=index) 656 | app.add_url_rule('/data', view_func=data) 657 | return app 658 | 659 | 660 | def main() -> int: 661 | config = load_config() 662 | app = flask_app(config) 663 | 664 | daemon = Thread(target=lambda: asyncio.run(run_daemon(config))) 665 | flask = Thread(target=lambda: app.run(host='0.0.0.0', port=8080)) 666 | 667 | daemon.start() 668 | flask.start() 669 | 670 | daemon.join() 671 | flask.join() 672 | return 0 673 | 674 | 675 | if __name__ == '__main__': 676 | raise SystemExit(main()) 677 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | mypy 4 | pre-commit 5 | pytest 6 | pytest-asyncio 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | aiolimiter 3 | flask 4 | python-dateutil 5 | python-dotenv 6 |
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | count = True 3 | statistics = True 4 | max-line-length = 127 5 | extend-exclude = .venv, .pyc 6 | ignore = F401, E123, W503 7 | 8 | [mypy] 9 | python_version = 3.10 10 | warn_return_any = True 11 | warn_unused_configs = True 12 | exclude = (setup.py|build/|tests/) 13 | 14 | [tool:pytest] 15 | python_files = tests/*.py tests/**/*.py tests.py test_*.py *_tests.py 16 | asyncio_mode = auto 17 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | 23 | 24 | --------------------------------------------------------------------------------
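For orientation, a minimal `templates/index.html` compatible with the `/data` payload could look like the sketch below. This is an assumption-laden reconstruction, not the repo's actual markup: it assumes the `force-graph` library (whose `linkCurvature` and `linkCurveRotation` accessors happen to match the `curvature` and `rotation` fields emitted by `to_display_graph`), and the CDN URL and the `graph` element id are illustrative.

```html
<!-- hypothetical sketch, not the repo's actual template -->
<!DOCTYPE html>
<html>
  <head>
    <style>
      body { margin: 0; }
    </style>
    <!-- force-graph loaded from a CDN; a real template may pin a version -->
    <script src="https://unpkg.com/force-graph"></script>
  </head>
  <body>
    <div id="graph"></div>
    <script>
      // fetch the DisplayGraph JSON served by flask_app's /data route
      fetch('/data')
        .then((res) => res.json())
        .then((data) => {
          ForceGraph()(document.getElementById('graph'))
            .nodeId('id') // DisplayNode.id
            .nodeLabel('title') // hover label from DisplayNode.title
            .linkCurvature('curvature') // DisplayLink.curvature
            .linkCurveRotation('rotation') // DisplayLink.rotation
            .onNodeClick((node) => window.open(node.url, '_blank'))
            .graphData({ nodes: data.nodes, links: data.links });
        });
    </script>
  </body>
</html>
```

Since `to_display_graph` serializes `source` and `target` as plain id strings, force-graph's default link accessors resolve them against `nodes` by `id`, so the `/data` response can be passed to `graphData` unchanged.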