├── README.rst
├── index
    ├── .managed.json
    └── meta.json
├── main.py
├── requirements.txt
├── setup.py
├── .github
    └── workflows
    │   └── python-publish.yml
├── .gitignore
└── onote
    ├── auth.py
    └── command_line.py


/README.rst:
--------------------------------------------------------------------------------
1 | OneNote Search Client.


--------------------------------------------------------------------------------
/index/.managed.json:
--------------------------------------------------------------------------------
1 | ["meta.json"]
2 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
# Runs the onote command-line entry point when this file is executed as a script.
from onote.command_line import main

# Only start the CLI on direct execution, not when this module is imported.
if __name__ == '__main__':
    main()
5 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | certifi==2023.11.17
 2 | cffi==1.16.0
 3 | chardet==5.2.0
 4 | cryptography==41.0.7
 5 | docopt==0.6.2
 6 | flake8==7.0.0
 7 | idna==3.6
 8 | mccabe==0.7.0
 9 | msal==1.26.0
10 | pycodestyle==2.11.1
11 | pycparser==2.21
12 | pyflakes==3.2.0
13 | PyJWT==2.8.0
14 | requests==2.31.0
15 | six==1.16.0
16 | urllib3==2.1.0
17 | 


--------------------------------------------------------------------------------
/index/meta.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "segments": [],
 3 |   "schema": [
 4 |     {
 5 |       "name": "title",
 6 |       "type": "text",
 7 |       "options": {
 8 |         "indexing": {
 9 |           "record": "position",
10 |           "tokenizer": "en_stem"
11 |         },
12 |         "stored": true
13 |       }
14 |     },
15 |     {
16 |       "name": "content",
17 |       "type": "text",
18 |       "options": {
19 |         "indexing": {
20 |           "record": "position",
21 |           "tokenizer": "en_stem"
22 |         },
23 |         "stored": false
24 |       }
25 |     },
26 |     {
27 |       "name": "url",
28 |       "type": "text",
29 |       "options": {
30 |         "indexing": null,
31 |         "stored": true
32 |       }
33 |     }
34 |   ],
35 |   "opstamp": 0
36 | }
37 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | from onote.command_line import VERSION
 4 | 
 5 | 
 6 | def readme():
 7 |     with open('README.rst') as f:
 8 |         return f.read()
 9 | 
10 | 
11 | setup(name='onote',
12 |       version=VERSION,
13 |       python_requires='>=3.7',
14 |       description='Search onenote pages',
15 |       long_description=readme(),
16 |       url='https://github.com/antonydeepak/onote',
17 |       author='Antony Thomas',
18 |       author_email='gogsbread@gmail.com',
19 |       packages=['onote'],
20 |       install_requires=[
21 |           'docopt',
22 |           'msal',
23 |           'requests',
24 |       ],
25 |       zip_safe=False,
26 |       entry_points={
27 |           'console_scripts': ['onote=onote.command_line:main'],
28 |       })
29 | 


--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will upload a Python Package using Twine when a release is created
 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
 3 | 
 4 | name: Upload Python Package
 5 | 
 6 | on:
 7 |   release:
 8 |     types: [created]
 9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 
134 | # pytype static type analyzer
135 | .pytype/
136 | 
137 | # Cython debug symbols
138 | cython_debug/
139 | 


--------------------------------------------------------------------------------
/onote/auth.py:
--------------------------------------------------------------------------------
 1 | import atexit
 2 | import logging
 3 | import msal
 4 | import os
 5 | import requests
 6 | 
 7 | from pathlib import Path
 8 | from typing import List
 9 | 
# File holding the serialized MSAL token cache, under ~/.onote.
CACHE_PATH = Path.home() / ".onote" / "user_token.bin"

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
13 | 
14 | 
class OneNoteAuthenticator():
    """Callable that returns an access token for the configured user.

    Tokens are kept in an ``msal.SerializableTokenCache``. The cache is loaded
    from CACHE_PATH on construction (when that file exists) and written back
    at interpreter exit if its state has changed.
    """

    def __init__(self, user_name: str, client_id: str, scopes: List[str]):
        """Load the token cache and build the MSAL public client application.

        Args:
            user_name: Account name passed to ``get_accounts`` to look up a
                previously signed-in account.
            client_id: Application (client) id handed to MSAL.
            scopes: Scopes requested for the access token.
        """
        self.cache = msal.SerializableTokenCache()
        if CACHE_PATH.exists():
            # Close the file deterministically instead of leaking the handle.
            with open(CACHE_PATH, "r") as f:
                self.cache.deserialize(f.read())
        atexit.register(self._serialize_cache)

        self.user_name = user_name
        self.app = msal.PublicClientApplication(
            client_id=client_id,
            token_cache=self.cache
        )
        self.scopes = scopes

    def __call__(self):
        """Return an access token, using the cache first and device flow otherwise.

        Device code flow is used instead of authorization code flow: the
        latter is meant for servers with synchronized clocks, and token
        decoding in msal fails when the local time is slightly off.

        Returns:
            The "access_token" value of the MSAL result.

        Raises:
            Exception: If the device flow cannot be started or MSAL returns
                an error result.
        """
        accounts = self.app.get_accounts(self.user_name)
        result = None
        if accounts:
            account = accounts[0]
            logger.info(f"User must have logged-in before. Taking the first account '{account['username']}'")
            result = self.app.acquire_token_silent(self.scopes, account=account)
        if not result:
            logger.info("Could not use credentials from cache; using device flow to obtain credentials")
            flow = self.app.initiate_device_flow(scopes=self.scopes)
            if "user_code" not in flow:
                # On failure MSAL returns an error dict without "user_code"
                # (and without "message"); report it instead of a KeyError.
                raise Exception(flow.get("error_description") or f"Failed to create device flow: {flow}")
            logger.info(flow["message"])
            result = self.app.acquire_token_by_device_flow(flow)

        if "error" in result:
            # "error_description" may be absent; fall back to the error code.
            raise Exception(result.get("error_description", result["error"]))

        return result["access_token"]

    def _serialize_cache(self):
        """Write the token cache to CACHE_PATH if it changed since loading."""
        if not self.cache.has_state_changed:
            return
        CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
        # NOTE(review): the serialized cache contains tokens; consider
        # restricting the file permissions to the current user.
        with open(CACHE_PATH, "w") as f:
            f.write(self.cache.serialize())
56 | 
57 | 
class OneNoteSession(requests.Session):
    """Session that authenticates with a bearer token and retries once on 401.

    ``token_fetcher`` is called with no arguments and must return the token
    string placed in the ``Authorization`` header.
    """

    def __init__(self, token_fetcher: callable):
        super().__init__()

        self.token_fetcher = token_fetcher
        # Fetch the token first so a failing fetcher leaves headers untouched.
        bearer = f"Bearer {self.token_fetcher()}"
        self.headers["User-Agent"] = "onoteClient"
        self.headers["Authorization"] = bearer

    def request(self, *args, **kwargs):
        """Send the request; on a 401 response refresh the token and resend once."""
        response = super().request(*args, **kwargs)
        if response.status_code != 401:
            return response

        # The token was rejected: obtain a new one and repeat the same request.
        self.headers["Authorization"] = f"Bearer {self.token_fetcher()}"
        return super().request(*args, **kwargs)
75 | 


--------------------------------------------------------------------------------
/onote/command_line.py:
--------------------------------------------------------------------------------
  1 | """Usage: onote search [-i <index>] QUERY...
  2 |           onote index [-d <directory>] [-u <user>] [-p]
  3 | 
  4 | -i <index>, --index <index>              Path to index directory
  5 | -d <directory>, --directory <directory>  Path to index directory
  6 | -u <user>, --user <user>                 User account to use for indexing
  7 | -p, --purge                              Purge the index instead of creating it
  8 | --version                                Show version
  9 | --help
 10 | """
 11 | 
 12 | import concurrent.futures
 13 | import json
 14 | import logging
 15 | import os
 16 | import shutil
 17 | import subprocess
 18 | 
 19 | from collections import namedtuple
 20 | from concurrent.futures import ThreadPoolExecutor
 21 | from docopt import docopt
 22 | from html.parser import HTMLParser
 23 | from pathlib import Path
 24 | from typing import Generator
 25 | 
 26 | from onote.auth import OneNoteAuthenticator, OneNoteSession
 27 | 
 28 | logger = logging.getLogger(__name__)
 29 | 
 30 | PAGES_URL = "https://graph.microsoft.com/v1.0/me/onenote/pages"
 31 | 
 32 | VERSION = 0.1
 33 | INDEX_DIR_PATH = Path(os.path.join(Path.home(), ".onote", "index"))
 34 | 
 35 | 
 36 | def create_index(path: Path):
 37 |     index_managed = path.joinpath(".managed.json")
 38 |     index_meta = path.joinpath("meta.json")
 39 |     if not path.exists():
 40 |         logger.info(f"Creating '{path}'")
 41 |         os.makedirs(path)
 42 |     if not index_managed.exists():
 43 |         logger.info(f"Creating '{index_managed}'")
 44 |         with open("index/.managed.json", "r") as r:
 45 |             with open(index_managed, "w") as w:
 46 |                 w.write(r.read())
 47 |     if not index_meta.exists():
 48 |         logger.info(f"Creating '{index_meta}'")
 49 |         with open("index/meta.json", "r") as r:
 50 |             with open(index_meta, "w") as w:
 51 |                 w.write(r.read())
 52 | 
 53 | 
 54 | def purge_index(path: Path):
 55 |     if path.exists():
 56 |         logger.info(f"Purging '{path}'")
 57 |         shutil.rmtree(path)
 58 | 
 59 | 
 60 | class HtmlOnenoteContentParser(HTMLParser):
 61 |     def __init__(self, *args, **kwargs):
 62 |         super().__init__(*args, **kwargs)
 63 | 
 64 |         self.is_ptag = False
 65 |         self._content = []
 66 | 
 67 |     def handle_starttag(self, tag, attrs):
 68 |         self.is_ptag = (tag == "p")
 69 | 
 70 |     def handle_data(self, data):
 71 |         if self.is_ptag:
 72 |             self._content.append(data)
 73 | 
 74 |     @property
 75 |     def content(self):
 76 |         return " ".join(self._content)
 77 | 
 78 | 
 79 | class IndexError(Exception):
 80 |     pass
 81 | 
 82 | 
 83 | class SearchError(Exception):
 84 |     pass
 85 | 
 86 | 
 87 | def index(index_path, downloader):
 88 |     """
 89 |     Onenote PAGES_URL returns paginated list pages.
 90 |     Idea is to concurrently download a list of page content urls and index in a single batch using tantivy
 91 |     """
 92 |     Page = namedtuple('Page', ['title', 'content_url', 'weblink'])
 93 | 
 94 |     def tantivy(pages):
 95 |         p = subprocess.run(["tantivy", "index", "--index", index_path],
 96 |                            input=pages, encoding="utf8", stderr=subprocess.PIPE)
 97 |         if p.returncode != 0:
 98 |             raise IndexError(p.stderr)
 99 | 
100 |     future_to_page = {}
101 |     with ThreadPoolExecutor() as executor:
102 |         # download page content
103 |         pages_url = PAGES_URL
104 |         while pages_url is not None:
105 |             logger.debug(f"Downloading {pages_url}")
106 |             r = downloader(pages_url)
107 |             r.raise_for_status()
108 | 
109 |             c = json.loads(r.text)
110 |             pages = c["value"]
111 |             for p in pages:
112 |                 page = Page(title=p["title"], content_url=p["contentUrl"], weblink=p["links"]["oneNoteWebUrl"]["href"])
113 |                 future = executor.submit(downloader, page.content_url)
114 |                 future_to_page[future] = page
115 | 
116 |                 logger.info(f"Downloading content page '{page.title}' from '{page.content_url}'")
117 | 
118 |             pages_url = c.get("@odata.nextLink")
119 | 
120 |         # index pages
121 |         logger.info("Indexing pages")
122 |         indexable_pages = []
123 |         for future in concurrent.futures.as_completed(future_to_page):
124 |             page = future_to_page[future]
125 |             if future.exception():
126 |                 logger.warning(f"Failed to index {page.title} because {future.exception()}")
127 |                 continue
128 | 
129 |             r = future.result()
130 |             content_parser = HtmlOnenoteContentParser()
131 |             content_parser.feed(r.text)
132 |             content = content_parser.content
133 | 
134 |             indexable_pages.append(json.dumps({
135 |                 "title": page.title,
136 |                 "content": content,
137 |                 "url": page.weblink
138 |             }))
139 |         d = "\n".join(indexable_pages)
140 |         tantivy(d)
141 | 
142 |         logger.info(f"Total {len(indexable_pages)} pages have been indexed")
143 | 
144 | 
# A single search hit: the page title and its URL.
SearchResult = namedtuple('SearchResult', ['title', 'url'])


def search(query, index_path) -> Generator[SearchResult, None, None]:
    """Run ``tantivy search`` for *query* and yield one SearchResult per hit.

    Each non-empty output line is parsed as a JSON document; the first value
    of its "title" and "url" fields is used. Being a generator, nothing is
    executed until iteration starts.

    Raises:
        SearchError: If the tantivy process exits with a non-zero status.
    """
    command = ["tantivy", "search", "--index", index_path, "-q", query]
    completed = subprocess.run(command, encoding="utf8", capture_output=True)
    if completed.returncode != 0:
        raise SearchError(completed.stderr)

    output = completed.stdout.strip()
    if not output:
        return

    for line in output.split('\n'):
        hit = json.loads(line)
        yield SearchResult(title=hit["title"][0], url=hit["url"][0])
158 | 
159 | 
def main():
    """Entry point of the ``onote`` command: dispatch to indexing or search.

    ``onote index`` purges the index directory and, unless ``--purge`` was
    given, recreates it and indexes all pages of the user's OneNote account.
    ``onote search`` prints the title and URL of every hit for the query.

    The process exits with status 0 after the index command succeeds and
    with status 1 when indexing or searching reports an error.
    """
    # Imported locally to keep this function self-contained. sys.exit is used
    # because the bare exit() helper is installed by the site module for
    # interactive sessions and is not guaranteed to exist otherwise.
    import sys

    logging.basicConfig(level=logging.INFO)

    args = docopt(__doc__, version=VERSION)

    # index
    if args["index"]:
        path = Path(args["--directory"] if args["--directory"] else INDEX_DIR_PATH)
        if args["--purge"]:
            purge_index(path)
            sys.exit(0)

        logger.debug(f"Looking for index at path {path}")
        # TODO: have to look into deleting an existing document in Tantivy. Until then
        # we have to purge existing index and re-index it again.
        purge_index(path)
        create_index(path)

        client_id = "595d2745-c735-44f9-b568-c709fbefce81"
        scopes = ["user.read", "notes.read"]
        user_name = args["--user"]
        authenticator = OneNoteAuthenticator(user_name, client_id, scopes)
        downloader = OneNoteSession(authenticator).get
        try:
            index(path, downloader)
        except IndexError as e:
            logger.error(f"Indexing failed with message '{e}'")
            sys.exit(1)
        sys.exit(0)

    # search
    if args["search"]:
        q = ' '.join(args["QUERY"]).strip()
        path = args["--index"] if args["--index"] else INDEX_DIR_PATH
        try:
            for r in search(q, path):
                print(f"title: {r.title}\nurl: {r.url}")
                print()
        except SearchError as e:
            logger.error(f"Search failed with message '{e}'")
            sys.exit(1)
201 | 


--------------------------------------------------------------------------------