├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── graph.py ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg └── templates └── index.html /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,node,macos,linux,windows 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,node,macos,linux,windows 4 | 5 | ### Linux ### 6 | *~ 7 | 8 | # temporary files which can be created if a process still has a handle open of a deleted file 9 | .fuse_hidden* 10 | 11 | # KDE directory preferences 12 | .directory 13 | 14 | # Linux trash folder which might appear on any partition or disk 15 | .Trash-* 16 | 17 | # .nfs files are created when an open file is removed but is still being accessed 18 | .nfs* 19 | 20 | ### macOS ### 21 | # General 22 | .DS_Store 23 | .AppleDouble 24 | .LSOverride 25 | 26 | # Icon must end with two \r 27 | Icon 28 | 29 | 30 | # Thumbnails 31 | ._* 32 | 33 | # Files that might appear in the root of a volume 34 | .DocumentRevisions-V100 35 | .fseventsd 36 | .Spotlight-V100 37 | .TemporaryItems 38 | .Trashes 39 | .VolumeIcon.icns 40 | .com.apple.timemachine.donotpresent 41 | 42 | # Directories potentially created on remote AFP share 43 | .AppleDB 44 | .AppleDesktop 45 | Network Trash Folder 46 | Temporary Items 47 | .apdisk 48 | 49 | ### macOS Patch ### 50 | # iCloud generated files 51 | *.icloud 52 | 53 | ### Node ### 54 | # Logs 55 | logs 56 | *.log 57 | npm-debug.log* 58 | yarn-debug.log* 59 | yarn-error.log* 60 | lerna-debug.log* 61 | .pnpm-debug.log* 62 | 63 | # Diagnostic reports (https://nodejs.org/api/report.html) 64 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 65 | 66 | # Runtime data 67 | pids 68 | *.pid 69 | *.seed 70 | *.pid.lock 71 | 72 | # Directory for instrumented libs generated by jscoverage/JSCover 73 | lib-cov 74 | 75 | # Coverage directory used by tools like istanbul 76 | coverage 77 | *.lcov 78 | 79 | # nyc test coverage 80 | .nyc_output 81 | 82 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 83 | .grunt 84 | 85 | # Bower dependency directory (https://bower.io/) 86 | bower_components 87 | 88 | # node-waf configuration 89 | .lock-wscript 90 | 91 | # Compiled binary addons (https://nodejs.org/api/addons.html) 92 | build/Release 93 | 94 | # Dependency directories 95 | node_modules/ 96 | jspm_packages/ 97 | 98 | # Snowpack dependency directory (https://snowpack.dev/) 99 | web_modules/ 100 | 101 | # TypeScript cache 102 | *.tsbuildinfo 103 | 104 | # Optional npm cache directory 105 | .npm 106 | 107 | # Optional eslint cache 108 | .eslintcache 109 | 110 | # Optional stylelint cache 111 | .stylelintcache 112 | 113 | # Microbundle cache 114 | .rpt2_cache/ 115 | .rts2_cache_cjs/ 116 | .rts2_cache_es/ 117 | .rts2_cache_umd/ 118 | 119 | # Optional REPL history 120 | .node_repl_history 121 | 122 | # Output of 'npm pack' 123 | *.tgz 124 | 125 | # Yarn Integrity file 126 | .yarn-integrity 127 | 128 | # dotenv environment variable files 129 | .env 130 | .env.development.local 131 | .env.test.local 132 | .env.production.local 133 | .env.local 134 | 135 | # parcel-bundler cache (https://parceljs.org/) 136 | .cache 137 | .parcel-cache 138 | 139 | # Next.js build output 140 | .next 141 | out 142 | 143 | # Nuxt.js build / generate output 144 | .nuxt 145 | dist 146 | 147 | # Gatsby files 148 | .cache/ 149 | # Comment in the public line in if your project uses Gatsby and not Next.js 
150 | # https://nextjs.org/blog/next-9-1#public-directory-support 151 | # public 152 | 153 | # vuepress build output 154 | .vuepress/dist 155 | 156 | # vuepress v2.x temp and cache directory 157 | .temp 158 | 159 | # Docusaurus cache and generated files 160 | .docusaurus 161 | 162 | # Serverless directories 163 | .serverless/ 164 | 165 | # FuseBox cache 166 | .fusebox/ 167 | 168 | # DynamoDB Local files 169 | .dynamodb/ 170 | 171 | # TernJS port file 172 | .tern-port 173 | 174 | # Stores VSCode versions used for testing VSCode extensions 175 | .vscode-test 176 | 177 | # yarn v2 178 | .yarn/cache 179 | .yarn/unplugged 180 | .yarn/build-state.yml 181 | .yarn/install-state.gz 182 | .pnp.* 183 | 184 | ### Node Patch ### 185 | # Serverless Webpack directories 186 | .webpack/ 187 | 188 | # Optional stylelint cache 189 | 190 | # SvelteKit build / generate output 191 | .svelte-kit 192 | 193 | ### Python ### 194 | # Byte-compiled / optimized / DLL files 195 | __pycache__/ 196 | *.py[cod] 197 | *$py.class 198 | 199 | # C extensions 200 | *.so 201 | 202 | # Distribution / packaging 203 | .Python 204 | build/ 205 | develop-eggs/ 206 | dist/ 207 | downloads/ 208 | eggs/ 209 | .eggs/ 210 | lib/ 211 | lib64/ 212 | parts/ 213 | sdist/ 214 | var/ 215 | wheels/ 216 | share/python-wheels/ 217 | *.egg-info/ 218 | .installed.cfg 219 | *.egg 220 | MANIFEST 221 | 222 | # PyInstaller 223 | # Usually these files are written by a python script from a template 224 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 225 | *.manifest 226 | *.spec 227 | 228 | # Installer logs 229 | pip-log.txt 230 | pip-delete-this-directory.txt 231 | 232 | # Unit test / coverage reports 233 | htmlcov/ 234 | .tox/ 235 | .nox/ 236 | .coverage 237 | .coverage.* 238 | nosetests.xml 239 | coverage.xml 240 | *.cover 241 | *.py,cover 242 | .hypothesis/ 243 | .pytest_cache/ 244 | cover/ 245 | 246 | # Translations 247 | *.mo 248 | *.pot 249 | 250 | # Django stuff: 251 | local_settings.py 252 | db.sqlite3 253 | db.sqlite3-journal 254 | 255 | # Flask stuff: 256 | instance/ 257 | .webassets-cache 258 | 259 | # Scrapy stuff: 260 | .scrapy 261 | 262 | # Sphinx documentation 263 | types-python-dateutil 264 | docs/_build/ 265 | 266 | # PyBuilder 267 | .pybuilder/ 268 | target/ 269 | 270 | # Jupyter Notebook 271 | .ipynb_checkpoints 272 | 273 | # IPython 274 | profile_default/ 275 | ipython_config.py 276 | 277 | # pyenv 278 | # For a library or package, you might want to ignore these files since the code is 279 | # intended to run in multiple environments; otherwise, check them in: 280 | # .python-version 281 | 282 | # pipenv 283 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 284 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 285 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 286 | # install all needed dependencies. 287 | #Pipfile.lock 288 | 289 | # poetry 290 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 291 | # This is especially recommended for binary packages to ensure reproducibility, and is more 292 | # commonly ignored for libraries. 293 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 294 | #poetry.lock 295 | 296 | # pdm 297 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
298 | #pdm.lock 299 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 300 | # in version control. 301 | # https://pdm.fming.dev/#use-with-ide 302 | .pdm.toml 303 | 304 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 305 | __pypackages__/ 306 | 307 | # Celery stuff 308 | celerybeat-schedule 309 | celerybeat.pid 310 | 311 | # SageMath parsed files 312 | *.sage.py 313 | 314 | # Environments 315 | .venv 316 | env/ 317 | venv/ 318 | ENV/ 319 | env.bak/ 320 | venv.bak/ 321 | 322 | # Spyder project settings 323 | .spyderproject 324 | .spyproject 325 | 326 | # Rope project settings 327 | .ropeproject 328 | 329 | # mkdocs documentation 330 | /site 331 | 332 | # mypy 333 | .mypy_cache/ 334 | .dmypy.json 335 | dmypy.json 336 | 337 | # Pyre type checker 338 | .pyre/ 339 | 340 | # pytype static type analyzer 341 | .pytype/ 342 | 343 | # Cython debug symbols 344 | cython_debug/ 345 | 346 | # PyCharm 347 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 348 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 349 | # and can be added to the global gitignore or merged into this file. For a more nuclear 350 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 351 | #.idea/ 352 | 353 | ### Windows ### 354 | # Windows thumbnail cache files 355 | Thumbs.db 356 | Thumbs.db:encryptable 357 | ehthumbs.db 358 | ehthumbs_vista.db 359 | 360 | # Dump file 361 | *.stackdump 362 | 363 | # Folder config file 364 | [Dd]esktop.ini 365 | 366 | # Recycle Bin used on file shares 367 | $RECYCLE.BIN/ 368 | 369 | # Windows Installer files 370 | *.cab 371 | *.msi 372 | *.msix 373 | *.msm 374 | *.msp 375 | 376 | # Windows shortcuts 377 | *.lnk 378 | 379 | # End of https://www.toptal.com/developers/gitignore/api/python,node,macos,linux,windows 380 | 381 | graph.json 382 | data.json 383 | data/ 384 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.6.0 4 | hooks: 5 | - id: check-docstring-first 6 | - id: check-yaml 7 | - id: double-quote-string-fixer 8 | - id: end-of-file-fixer 9 | - id: requirements-txt-fixer 10 | - id: trailing-whitespace 11 | - repo: https://github.com/PyCQA/flake8 12 | rev: 7.0.0 13 | hooks: 14 | - id: flake8 15 | - repo: https://github.com/asottile/reorder-python-imports 16 | rev: v3.13.0 17 | hooks: 18 | - id: reorder-python-imports 19 | args: [--py37-plus, --add-import, "from __future__ import annotations"] 20 | - repo: https://github.com/asottile/pyupgrade 21 | rev: v3.16.0 22 | hooks: 23 | - id: pyupgrade 24 | args: [--py310-plus] 25 | - repo: https://github.com/pre-commit/mirrors-mypy 26 | rev: v1.10.0 27 | hooks: 28 | - id: mypy 29 | additional_dependencies: ["types-PyYAML", "types-python-dateutil"] 30 | - repo: https://github.com/psf/black 31 | rev: 24.4.2 32 | hooks: 33 | - id: black 34 | additional_dependencies: ["click==8.0.2"] 35 | language_version: python3.10 36 | args: [--skip-string-normalization] 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Dominique Garmier 4 | 5 | Permission is hereby granted, free of 
charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/DominiqueGarmier/notion-graph/main.svg)](https://results.pre-commit.ci/latest/github/DominiqueGarmier/notion-graph/main) 2 | 3 | # notion-graph 4 | 5 | open-source graph view of your Notion pages, inspired by [Obsidian](https://obsidian.md/). 6 | 7 | ## WARNING: THIS IS STILL IN DEVELOPMENT 8 | 9 | #### What currently works: 10 | 11 | - a simple Flask server (see the gif below) 12 | - background parsing and auto-updating (parses every X minutes automatically) 13 | - retrying logic (it hasn't crashed into an unrecoverable state for me yet) 14 | - partial updates (only parses pages that were edited since the last parse) 15 | 16 |
17 | *notion-graph preview* 18 |
19 | 20 | ## Installing 21 | 22 | Clone this repo. 23 | 24 | ``` 25 | git clone git@github.com:dominiquegarmier/notion-graph 26 | cd notion-graph 27 | ``` 28 | 29 | Install dependencies. 30 | 31 | ``` 32 | virtualenv .venv -ppython3.10 33 | source .venv/bin/activate 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | ## Setup 38 | 39 | - Set the environment variable `NOTION_KEY` to your Notion API key that has read access to some pages (see the [notion docs](https://developers.notion.com/docs/create-a-notion-integration)). Since the program calls `load_dotenv()`, a `.env` file in the project root works too. 40 | 41 | ## Usage 42 | 43 | You can now run the following command to start notion-graph: 44 | 45 | ``` 46 | python graph.py 47 | ``` 48 | 49 | This will automatically discover any page shared with your Notion integration. It then creates a task queue to query every discovered page. The initial parse of your pages might take a while, since Notion's API is limited to three requests per second. The program will create a new folder `data/`, which contains the parsed pages and links. Subsequent parses only refresh pages that have been edited since the last parse. 50 | 51 | The graph view will be served on `localhost:8080`. Make sure to hit refresh when the parsing is done. 52 | 53 | ## Development 54 | 55 | Install dev dependencies. 56 | 57 | ``` 58 | pip install -r requirements-dev.txt 59 | ``` 60 | 61 | Install pre-commit hooks. 62 | 63 | ``` 64 | pre-commit install 65 | ``` 66 | -------------------------------------------------------------------------------- /graph.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | import asyncio 4 | import dataclasses 5 | import json 6 | import logging 7 | import math 8 | import os 9 | import tempfile 10 | from collections import defaultdict 11 | from collections.abc import AsyncGenerator 12 | from collections.abc import Collection 13 | from collections.abc import Generator 14 | from contextlib import asynccontextmanager 15 | from contextlib import contextmanager 16 | from dataclasses import dataclass 17 | from datetime import datetime 18 | from datetime import timezone 19 | from logging import getLogger 20 | from pathlib import Path 21 | from threading import Thread 22 | from typing import Any 23 | from typing import Literal 24 | from typing import NoReturn 25 | from uuid import UUID 26 | from uuid import uuid4 27 | 28 | from aiohttp import ClientSession 29 | from aiolimiter import AsyncLimiter 30 | from dateutil.parser import parse as parse_date 31 | from dotenv import load_dotenv 32 | from flask import Flask 33 | from flask import jsonify 34 | from flask import render_template 35 | from werkzeug import Response 36 | 37 | logger = getLogger(__name__) 38 | logging.basicConfig(level=logging.DEBUG) 39 | 40 | NOTION_URL = 'https://www.notion.so' 41 | NOTION_API_URL = 'https://api.notion.com/v1' 42 | 43 | DEFAULT_DATA_PATH = Path(__file__).parent / 'data' 44 | DEFAULT_REFRESH_INTERVAL = 60 * 5 45 | DEFAULT_N_WORKERS = 4 46 | 47 | # api config: three requests per second, matching Notion's rate limit 48 | RATE_LIMIT_BURST = 1 49 | RATE_LIMIT = 3 * RATE_LIMIT_BURST 50 | RATE_LIMITER = AsyncLimiter(RATE_LIMIT, RATE_LIMIT_BURST) 51 | TIMEOUT = 30 52 | MAX_RETRY = 1 53 | 54 | SKIP_PROPAGATION_BLOCK_TYPES = ( 55 | 'child_page', 56 | 'child_database', 57 | ) 58 | 59 | 60 | @dataclass(frozen=True) 61 | class Config: 62 | notion_key: str 63 | data_path: Path 64 | refresh_interval: int 65 | n_workers: int 66 | 67 | 68 | def load_config() -> Config: 69 | load_dotenv() 70 | try: 71 | notion_key
= os.environ['NOTION_KEY'] 72 | except KeyError: 73 | raise ValueError('Missing NOTION_KEY environment variable') 74 | 75 | data_path = Path(os.environ.get('GRAPH_DATA_PATH', DEFAULT_DATA_PATH)) 76 | refresh_interval = int( 77 | os.environ.get('GRAPH_REFRESH_INTERVAL', DEFAULT_REFRESH_INTERVAL) 78 | ) 79 | n_workers = int(os.environ.get('GRAPH_N_WORKERS', DEFAULT_N_WORKERS)) 80 | 81 | return Config(notion_key, data_path, refresh_interval, n_workers) 82 | 83 | 84 | @dataclass 85 | class Page: 86 | id: UUID 87 | url: str 88 | title: str 89 | last_parsed: datetime 90 | 91 | def __hash__(self) -> int: 92 | return hash(self.id) 93 | 94 | def __eq__(self, other: object) -> bool: 95 | if not isinstance(other, Page): 96 | return NotImplemented 97 | return self.id == other.id 98 | 99 | 100 | def serialize_page(page: Page) -> dict[str, str]: 101 | return { 102 | 'id': str(page.id), 103 | 'url': str(page.url), 104 | 'title': str(page.title), 105 | 'last_parsed': page.last_parsed.isoformat(), 106 | } 107 | 108 | 109 | def deserialize_page(data: dict[str, str]) -> Page: 110 | return Page( 111 | id=UUID(data['id']), 112 | url=data['url'], 113 | title=data['title'], 114 | last_parsed=datetime.fromisoformat(data['last_parsed']), 115 | ) 116 | 117 | 118 | @dataclass 119 | class Link: 120 | id: UUID 121 | source: UUID 122 | target: UUID 123 | link_type: Literal['page', 'database', 'mention', 'href'] 124 | 125 | def __hash__(self) -> int: 126 | return hash(self.id) 127 | 128 | def __eq__(self, other: object) -> bool: 129 | if not isinstance(other, Link): 130 | return NotImplemented 131 | return self.id == other.id 132 | 133 | 134 | def serialize_link(link: Link) -> dict[str, str]: 135 | return { 136 | 'id': str(link.id), 137 | 'source': str(link.source), 138 | 'target': str(link.target), 139 | 'link_type': link.link_type, 140 | } 141 | 142 | 143 | def deserialize_link(data: dict[str, str]) -> Link: 144 | return Link( 145 | id=UUID(data['id']), 146 | source=UUID(data['source']), 147 | target=UUID(data['target']), 148 | link_type=data['link_type'], # type: ignore 149 | ) 150 | 151 | 152 | class Graph: 153 | _pages: dict[UUID, Page] 154 | _links: dict[UUID, Link] 155 | 156 | def __init__( 157 | self, 158 | pages: Collection[Page] | None = None, 159 | links: Collection[Link] | None = None, 160 | ) -> None: 161 | self._pages = {} 162 | self._links = {} 163 | 164 | if pages is None: 165 | pages = [] 166 | if links is None: 167 | links = [] 168 | 169 | for page in pages: 170 | self.add(page) 171 | for link in links: 172 | self.add(link) 173 | 174 | def __contains__(self, item: Page | UUID) -> bool: 175 | if isinstance(item, Page): 176 | return item.id in self._pages 177 | elif isinstance(item, UUID): 178 | return item in self._pages 179 | else: 180 | raise TypeError(f'Cannot check for item of type {type(item)}') 181 | 182 | @classmethod 183 | def deserialize(cls, data: str) -> Graph: 184 | dct = json.loads(data) 185 | try: 186 | pages = [deserialize_page(page) for page in dct['pages']] 187 | links = [deserialize_link(link) for link in dct['links']] 188 | except Exception as e: 189 | raise ValueError(f'Invalid data {e}') 190 | return cls(pages, links) 191 | 192 | def serialize(self) -> str: 193 | pages = [serialize_page(page) for page in self._pages.values()] 194 | links = [serialize_link(link) for link in self._links.values()] 195 | return json.dumps({'pages': pages, 'links': links}) 196 | 197 | def save(self, path: str | os.PathLike) -> None: 198 | with open(path, 'w') as f: 199 | f.write(self.serialize()) 200 | 
201 | @classmethod 202 | def load(cls, path: str | os.PathLike) -> Graph: 203 | with open(path) as f: 204 | return cls.deserialize(f.read()) 205 | 206 | @property 207 | def pages(self) -> list[Page]: 208 | return list(self._pages.values()) 209 | 210 | @property 211 | def links(self) -> list[Link]: 212 | return list(self._links.values()) 213 | 214 | def add(self, item: Page | Link) -> None: 215 | if isinstance(item, Page): 216 | if item.id in self._pages: 217 | raise ValueError(f'Page {item.id} already exists') 218 | self._pages[item.id] = item 219 | elif isinstance(item, Link): 220 | if item.id in self._links: 221 | raise ValueError(f'Link {item.id} already exists') 222 | self._links[item.id] = item 223 | else: 224 | raise TypeError(f'Cannot add item of type {type(item)}') 225 | 226 | def prune(self) -> Graph: 227 | links = set() 228 | for link in self._links.values(): 229 | if link.source in self and link.target in self: 230 | links.add(link) 231 | return self.__class__(list(self._pages.values()), links) 232 | 233 | def update(self, pages: Collection[Page], links: Collection[Link]) -> None: 234 | new_pages = {page.id: page for page in pages} 235 | 236 | delete_ids = [] 237 | for link in self._links.values(): 238 | if link.source in new_pages: 239 | delete_ids.append(link.id) 240 | for id_ in delete_ids: 241 | del self._links[id_] 242 | 243 | self._pages.update(new_pages) 244 | self._links.update({link.id: link for link in links}) 245 | 246 | 247 | @dataclass 248 | class DisplayLink: 249 | source: str 250 | target: str 251 | rotation: float 252 | curvature: float 253 | 254 | 255 | @dataclass 256 | class DisplayNode: 257 | id: str 258 | title: str 259 | url: str 260 | 261 | 262 | @dataclass 263 | class DisplayGraph: 264 | nodes: list[DisplayNode] 265 | links: list[DisplayLink] 266 | 267 | 268 | def to_display_graph(graph: Graph) -> DisplayGraph: 269 | node_ids: set[str] = set() 270 | nodes = [] 271 | for page in graph.pages: 272 | node_ids.add(str(page.id)) 273 | node = DisplayNode(id=str(page.id), title=page.title, url=page.url) 274 | nodes.append(node) 275 | 276 | links_dict: dict[tuple[UUID, UUID], list[Link]] = defaultdict(list) 277 | for link in graph.links: 278 | if str(link.source) not in node_ids or str(link.target) not in node_ids: 279 | continue 280 | links_dict[(link.source, link.target)].append(link) 281 | 282 | links_list = [] 283 | for ids_tp, links in links_dict.items(): 284 | if ids_tp[0] == ids_tp[1]: 285 | base_curvature = 0.5 286 | else: 287 | base_curvature = 0 288 | n = len(links) 289 | for i, link in enumerate(links): 290 | rotation = 2 * math.pi * i / n 291 | new_link = DisplayLink( 292 | source=str(link.source), 293 | target=str(link.target), 294 | rotation=rotation, 295 | curvature=base_curvature + min((n - 1) / 10, 0.5), 296 | ) 297 | links_list.append(new_link) 298 | 299 | return DisplayGraph(nodes=nodes, links=links_list) 300 | 301 | 302 | @contextmanager 303 | def persisted_graph( 304 | path: str | Path, flush: bool = False, persist: bool = True 305 | ) -> Generator[Graph, None, None]: 306 | if not flush: 307 | try: 308 | graph = Graph.load(path) 309 | except Exception: 310 | graph = Graph() 311 | logger.warning(f'Could not load graph from {path}, creating new graph.') 312 | else: 313 | graph = Graph() 314 | 315 | try: 316 | yield graph 317 | except Exception: 318 | raise 319 | else: 320 | if persist: 321 | Path(path).parent.mkdir(parents=True, exist_ok=True) 322 | # write to a temp file in the target directory: os.replace stays atomic and never crosses filesystems 323 | fd, tmp_path = tempfile.mkstemp(dir=Path(path).parent) 324 | with open(fd, 'w') as f: 325 | f.write(graph.serialize()) 326 | os.replace(tmp_path, path) 327 | 328 | 329 | @asynccontextmanager 330 | async def RateLimitedSession( 331 | config: Config, 332 | auth: str | None = None, 333 | ) -> AsyncGenerator[ClientSession, None]: 334 | auth = auth or config.notion_key 335 | headers = { 336 | 'Authorization': f'Bearer {auth}', 337 | 'Notion-Version': '2022-02-22', 338 | } 339 | async with RATE_LIMITER:  # paces session creation; paginated() opens one session per request 340 | async with ClientSession(headers=headers) as session: 341 | yield session 342 | 343 | 344 | async def paginated( 345 | method: Literal['GET', 'POST'], 346 | url: str, 347 | config: Config, 348 | initial_params: dict[str, Any] | None = None, 349 | ) -> list[dict]: 350 | results = [] 351 | 352 | cursor: str | None = None 353 | has_more = True 354 | 355 | while has_more: 356 | params = initial_params or {} 357 | if cursor is not None: 358 | params = params | {'start_cursor': cursor} 359 | data = {} 360 | async with RateLimitedSession(config=config) as session: 361 | if method == 'GET': 362 | async with session.get(url, params=params) as resp: 363 | data = await resp.json() 364 | elif method == 'POST': 365 | async with session.post(url, json=params) as resp: 366 | data = await resp.json() 367 | 368 | results.extend(data['results']) 369 | 370 | cursor = data.get('next_cursor') 371 | has_more = data.get('has_more', False) and cursor is not None 372 | 373 | return results 374 | 375 | 376 | def _uuid_to_url(uuid: UUID) -> str: 377 | return f"{NOTION_URL}/{str(uuid).replace('-', '')}" 378 | 379 | 380 | def _strip_uuid(href: str) -> UUID: 381 | if not href.startswith('/'): 382 | raise ValueError 383 | no_slash = href[1:] 384 | try: 385 | return UUID(no_slash.split('#')[0]) 386 | except ValueError: 387 | pass 388 | try: 389 | return UUID(no_slash.split('?')[0]) 390 | except ValueError: 391 | raise 392 | 393 | 394 | def _parse_page( 395 | page_data: dict[str, Any], last_parsed: dict[UUID, datetime] 396 | ) -> Page | None: 397 | # skip archived pages 398 | if page_data['archived']: 399 | return None 400 | 401 | # only parse page if it has been updated since last parse 402 | page_id = UUID(page_data['id']) 403 | if page_id in last_parsed: 404 | last_edited = parse_date(page_data['last_edited_time']) 405 | 406 | time = last_parsed[page_id] 407 | if time.tzinfo is None: 408 | time = time.replace(tzinfo=timezone.utc) 409 | 410 | if last_edited < time:  # compare against the tz-normalized timestamp 411 | return None 412 | 413 | properties = page_data.get('properties', {}) 414 | for value in properties.values(): 415 | if isinstance(value, dict) and value.get('type') == 'title': 416 | title_rich_text = value.get('title', []) 417 | break 418 | else: 419 | title_rich_text = [] 420 | 421 | title_rich_text = [rt for rt in title_rich_text if rt['type'] == 'text'] 422 | title = '-'.join([rt['text']['content'] for rt in title_rich_text]) 423 | 424 | return Page( 425 | id=page_id, 426 | url=_uuid_to_url(page_id), 427 | title=title, 428 | last_parsed=datetime.now(timezone.utc), 429 | ) 430 | 431 | 432 | async def parse_pages(last_parsed: dict[UUID, datetime], config: Config) -> set[Page]: 433 | param = {'filter': {'value': 'page', 'property': 'object'}} 434 | logger.debug('getting page ids...') 435 | resp = await paginated( 436 | 'POST', url=f'{NOTION_API_URL}/search', config=config, initial_params=param 437 | ) 438 | 439 | ret = [] 440 | for data in resp: 441 | page = _parse_page(page_data=data, last_parsed=last_parsed) 442 | if page is None: 443 | continue 444 | ret.append(page) 445 | return set(ret) 446 | 447 | 448 | def _parse_rich_text(page:
UUID, rich_text: dict[str, Any]) -> Link | None: 449 | if rich_text['type'] == 'mention': 450 | mention = rich_text['mention'] 451 | if mention['type'] == 'page': 452 | return Link( 453 | id=uuid4(), 454 | source=page, 455 | target=UUID(mention['page']['id']), 456 | link_type='mention', 457 | ) 458 | elif rich_text['type'] == 'text': 459 | if rich_text.get('href') is not None: 460 | try: 461 | uuid = _strip_uuid(rich_text['href']) 462 | except ValueError: 463 | logger.debug(f"failed to parse href format: {rich_text['href']}") 464 | return None 465 | return Link(id=uuid4(), source=page, target=uuid, link_type='href') 466 | return None 467 | 468 | 469 | def parse_links(page: UUID, data: dict[str, Any]) -> list[Link]: 470 | block_type = data['type'] 471 | if block_type in ('child_page', 'child_database'): 472 | return [] 473 | if block_type not in data: 474 | return [] 475 | 476 | ret: list[Link] = [] 477 | block = data.get(block_type, {}) 478 | for rich_text in block.get('rich_text', []): 479 | try: 480 | link = _parse_rich_text(page=page, rich_text=rich_text) 481 | except KeyError: 482 | pass 483 | else: 484 | if link is not None: 485 | ret.append(link) 486 | return ret 487 | 488 | 489 | async def parse_children( 490 | page: UUID, block: UUID, config: Config 491 | ) -> tuple[list[UUID], list[Link]]: 492 | # logger.info(f"parsing children of {block} in {page}...") 493 | resp = await paginated( 494 | 'GET', url=f'{NOTION_API_URL}/blocks/{block}/children', config=config 495 | ) 496 | 497 | links: list[Link] = [] 498 | children: list[UUID] = [] 499 | 500 | for data in resp: 501 | # handle child_pages separately 502 | tp = data['type'] 503 | target = UUID(data['id'])  # convert once so Link targets are real UUIDs 504 | if tp == 'child_page': 505 | links.append(Link(id=uuid4(), source=page, target=target, link_type='page')) 506 | if tp == 'child_database': 507 | links.append( 508 | Link(id=uuid4(), source=page, target=target, link_type='database') 509 | ) 510 | 511 | # handle any other links such as mentions and hrefs 512 | links.extend(parse_links(page=page, data=data)) 513 | 514 | # handle propagation to children 515 | if tp in SKIP_PROPAGATION_BLOCK_TYPES: 516 | continue 517 | 518 | if data.get('has_children'): 519 | children.append(target) 520 | 521 | return children, links 522 | 523 | 524 | @dataclass 525 | class Task: 526 | page: UUID 527 | block: UUID 528 | retry: int = 0 529 | 530 | def __hash__(self) -> int: 531 | return hash(self.block) 532 | 533 | def __eq__(self, other: Any) -> bool: 534 | if not isinstance(other, Task): 535 | return NotImplemented 536 | return self.block == other.block 537 | 538 | 539 | async def worker( 540 | queue: asyncio.Queue[Task], 541 | links: set[Link], 542 | enqueued: set[Task], 543 | done: set[Task], 544 | failed: set[Task], 545 | config: Config, 546 | ): 547 | while True: 548 | task = await queue.get() 549 | try: 550 | children, new_links = await asyncio.wait_for( 551 | parse_children(page=task.page, block=task.block, config=config), TIMEOUT 552 | ) 553 | except Exception: 554 | if task.retry >= MAX_RETRY: 555 | logger.debug(f'task failed: {task!r}') 556 | failed.add(task) 557 | else: 558 | logger.debug(f'retrying task: {task!r}') 559 | task.retry += 1 560 | queue.put_nowait(task) 561 | else: 562 | # no lock needed: all workers run on one event loop and there is no await below 563 | for child in children: 564 | # don't parse blocks twice 565 | if child not in enqueued: 566 | new_task = Task(page=task.page, block=child) 567 | enqueued.add(new_task) 568 | queue.put_nowait(new_task) 569 | 570 | for link in new_links: 571 | links.add(link) 572 |
done.add(task) 573 | finally: 574 | queue.task_done() 575 | 576 | 577 | async def parse( 578 | last_parsed: dict[UUID, datetime], 579 | config: Config, 580 | ) -> tuple[set[Page], set[Link]]: 581 | pages = await parse_pages(config=config, last_parsed=last_parsed) 582 | links: set[Link] = set() 583 | 584 | # monitor the queue 585 | queue: asyncio.Queue[Task] = asyncio.Queue() 586 | enqueued: set[Task] = set() 587 | done: set[Task] = set() 588 | failed: set[Task] = set() 589 | 590 | workers = [] 591 | for _ in range(config.n_workers): 592 | task = asyncio.create_task( 593 | worker(queue, links, enqueued, done, failed, config=config) 594 | ) 595 | workers.append(task) 596 | 597 | for page in pages: 598 | new_task = Task(page=page.id, block=page.id) 599 | queue.put_nowait(new_task) 600 | enqueued.add(new_task) 601 | 602 | # wait for all tasks to be done 603 | async def monitor() -> None: 604 | while True: 605 | await asyncio.sleep(1) 606 | logger.debug( 607 | f'ENQUEUED: {len(enqueued)}, DONE: {len(done)}, FAILED: {len(failed)}' 608 | ) 609 | 610 | logger_task = asyncio.create_task(monitor()) 611 | 612 | await queue.join() 613 | 614 | logger_task.cancel() 615 | logger.debug('work done, cancelling workers...') 616 | 617 | for w in workers: 618 | w.cancel() 619 | 620 | logger.info(f'done: {len(done)}, failed: {len(failed)}') 621 | 622 | return pages, links 623 | 624 | 625 | async def partial_parse(config: Config, flush: bool = False) -> None: 626 | with persisted_graph(config.data_path / 'graph.json', flush=flush) as graph: 627 | last_parsed = {page.id: page.last_parsed for page in graph.pages} 628 | pages, links = await parse(last_parsed=last_parsed, config=config) 629 | graph.update(pages, links) 630 | 631 | 632 | async def run_daemon(config: Config) -> NoReturn: 633 | while True: 634 | try: 635 | logger.info('refreshing graph...') 636 | await partial_parse(config=config, flush=False) 637 | await asyncio.sleep(config.refresh_interval) 638 | except Exception: 639 | logger.exception('error while parsing, retrying in 5s...') 640 | await asyncio.sleep(5) 641 | 642 | 643 | def flask_app(config: Config) -> Flask: 644 | app = Flask(__name__) 645 | 646 | def index() -> Any: 647 | return render_template('index.html') 648 | 649 | def data() -> Response: 650 | # read-only view: persist=False avoids rewriting the file on every request 651 | with persisted_graph(config.data_path / 'graph.json', persist=False) as graph: 652 | display_graph = to_display_graph(graph) 653 | return jsonify(dataclasses.asdict(display_graph)) 654 | 655 | app.add_url_rule('/', view_func=index) 656 | app.add_url_rule('/data', view_func=data) 657 | return app 658 | 659 | 660 | def main() -> int: 661 | config = load_config() 662 | app = flask_app(config) 663 | 664 | daemon = Thread(target=lambda: asyncio.run(run_daemon(config))) 665 | flask = Thread(target=lambda: app.run(host='0.0.0.0', port=8080)) 666 | 667 | daemon.start() 668 | flask.start() 669 | 670 | daemon.join() 671 | flask.join() 672 | return 0 673 | 674 | 675 | if __name__ == '__main__': 676 | raise SystemExit(main()) 677 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | mypy 4 | pre-commit 5 | pytest 6 | pytest-asyncio 7 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp 2 | aiolimiter 3 | flask 4 | python-dateutil 5 | python-dotenv 6 |
-------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | count = True 3 | statistics = True 4 | max-line-length = 127 5 | extend-exclude = .venv, .pyc 6 | ignore = F401, E123, W503 7 | 8 | [mypy] 9 | python_version = 3.10 10 | warn_return_any = True 11 | warn_unused_configs = True 12 | exclude = (setup.py|build/|tests/) 13 | 14 | [tool:pytest] 15 | python_files = tests/*.py tests/**/*.py tests.py test_*.py *_tests.py 16 | asyncio_mode = auto 17 | -------------------------------------------------------------------------------- /templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
11 | 12 | 23 | 24 | --------------------------------------------------------------------------------
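For orientation, a minimal `templates/index.html` compatible with the `/data` payload could look like the sketch below. This is an assumption-laden reconstruction, not the repo's actual markup: it assumes the `force-graph` library (whose `linkCurvature` and `linkCurveRotation` accessors happen to match the `curvature` and `rotation` fields emitted by `to_display_graph`), and the CDN URL and the `graph` element id are illustrative.

```html
<!-- hypothetical sketch, not the repo's actual template -->
<!DOCTYPE html>
<html>
  <head>
    <style>
      body { margin: 0; }
    </style>
    <!-- force-graph loaded from a CDN; a real template may pin a version -->
    <script src="https://unpkg.com/force-graph"></script>
  </head>
  <body>
    <div id="graph"></div>
    <script>
      // fetch the DisplayGraph JSON served by flask_app's /data route
      fetch('/data')
        .then((res) => res.json())
        .then((data) => {
          ForceGraph()(document.getElementById('graph'))
            .nodeId('id') // DisplayNode.id
            .nodeLabel('title') // hover label from DisplayNode.title
            .linkCurvature('curvature') // DisplayLink.curvature
            .linkCurveRotation('rotation') // DisplayLink.rotation
            .onNodeClick((node) => window.open(node.url, '_blank'))
            .graphData({ nodes: data.nodes, links: data.links });
        });
    </script>
  </body>
</html>
```

Since `to_display_graph` serializes `source` and `target` as plain id strings, force-graph's default link accessors resolve them against `nodes` by `id`, so the `/data` response can be passed to `graphData` unchanged.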