├── README.rst ├── index ├── .managed.json └── meta.json ├── main.py ├── requirements.txt ├── setup.py ├── .github └── workflows │ └── python-publish.yml ├── .gitignore └── onote ├── auth.py └── command_line.py /README.rst: -------------------------------------------------------------------------------- 1 | OneNote Search Client. -------------------------------------------------------------------------------- /index/.managed.json: -------------------------------------------------------------------------------- 1 | ["meta.json"] 2 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from onote.command_line import main 2 | 3 | if __name__ == '__main__': 4 | main() 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2023.11.17 2 | cffi==1.16.0 3 | chardet==5.2.0 4 | cryptography==41.0.7 5 | docopt==0.6.2 6 | flake8==7.0.0 7 | idna==3.6 8 | mccabe==0.7.0 9 | msal==1.26.0 10 | pycodestyle==2.11.1 11 | pycparser==2.21 12 | pyflakes==3.2.0 13 | PyJWT==2.8.0 14 | requests==2.31.0 15 | six==1.16.0 16 | urllib3==2.1.0 17 | -------------------------------------------------------------------------------- /index/meta.json: -------------------------------------------------------------------------------- 1 | { 2 | "segments": [], 3 | "schema": [ 4 | { 5 | "name": "title", 6 | "type": "text", 7 | "options": { 8 | "indexing": { 9 | "record": "position", 10 | "tokenizer": "en_stem" 11 | }, 12 | "stored": true 13 | } 14 | }, 15 | { 16 | "name": "content", 17 | "type": "text", 18 | "options": { 19 | "indexing": { 20 | "record": "position", 21 | "tokenizer": "en_stem" 22 | }, 23 | "stored": false 24 | } 25 | }, 26 | { 27 | "name": "url", 28 | "type": "text", 29 | "options": { 30 | "indexing": null, 31 | "stored": true 32 | } 33 | } 34 | ], 35 | "opstamp": 0 36 | } 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | from onote.command_line import VERSION 4 | 5 | 6 | def readme(): 7 | with open('README.rst') as f: 8 | return f.read() 9 | 10 | 11 | setup(name='onote', 12 | version=VERSION, 13 | python_requires='>=3.7', 14 | description='Search onenote pages', 15 | long_description=readme(), 16 | url='https://github.com/antonydeepak/onote', 17 | author='Antony Thomas', 18 | author_email='gogsbread@gmail.com', 19 | packages=['onote'], 20 | install_requires=[ 21 | 'docopt', 22 | 'msal', 23 | 'requests', 24 | ], 25 | zip_safe=False, 26 | entry_points={ 27 | 'console_scripts': ['onote=onote.command_line:main'], 28 | }) 29 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | -------------------------------------------------------------------------------- /onote/auth.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import logging 3 | import msal 4 | import os 5 | import requests 6 | 7 | from pathlib import Path 8 | from typing import List 9 | 10 | CACHE_PATH = Path(os.path.join(Path.home(), ".onote", "user_token.bin")) 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class OneNoteAuthenticator(): 16 | def __init__(self, user_name: str, client_id: str, scopes: List[str]): 17 | self.cache = msal.SerializableTokenCache() 18 | if CACHE_PATH.exists(): 19 | self.cache.deserialize(open(CACHE_PATH, "r").read()) 20 | atexit.register(self._serialize_cache) 21 | 22 | self.user_name = user_name 23 | self.app = msal.PublicClientApplication( 24 | client_id=client_id, 25 | token_cache=self.cache 26 | ) 27 | self.scopes = scopes 28 | 29 | def __call__(self): 30 | """Uses device code flow 31 | Authorization code flow is annoying because it is meant to be used in servers which should be 32 | time synced. Else, the token barfs because the time is slightly off when msal tries to decode """ 33 | accounts = self.app.get_accounts(self.user_name) 34 | result = None 35 | if accounts: 36 | a = accounts[0] 37 | logger.info(f"User must have logged-in before. Taking the first account \'{a['username']}\'") 38 | result = self.app.acquire_token_silent(self.scopes, account=a) 39 | if not result: 40 | logger.info("Could not use credentials from cache; using device flow to obtain credentials") 41 | flow = self.app.initiate_device_flow(scopes=self.scopes) 42 | logger.info(flow["message"]) 43 | result = self.app.acquire_token_by_device_flow(flow) 44 | 45 | if "error" in result: 46 | raise Exception(result["error_description"]) 47 | 48 | return result["access_token"] 49 | 50 | def _serialize_cache(self): 51 | if self.cache.has_state_changed: 52 | cache_dir = CACHE_PATH.parent 53 | if not cache_dir.exists(): 54 | os.makedirs(cache_dir) 55 | open(CACHE_PATH, "w").write(self.cache.serialize()) 56 | 57 | 58 | class OneNoteSession(requests.Session): 59 | def __init__(self, token_fetcher: callable): 60 | super().__init__() 61 | 62 | self.token_fetcher = token_fetcher 63 | token = self.token_fetcher() 64 | self.headers.update( 65 | {"User-Agent": "onoteClient", "Authorization": f"Bearer {token}"} 66 | ) 67 | 68 | def request(self, *args, **kwargs): 69 | resp = super().request(*args, **kwargs) 70 | if resp.status_code == 401: 71 | token = self.token_fetcher() 72 | self.headers["Authorization"] = f"Bearer {token}" 73 | resp = super().request(*args, **kwargs) 74 | return resp 75 | -------------------------------------------------------------------------------- /onote/command_line.py: -------------------------------------------------------------------------------- 1 | """Usage: onote search [-i ] QUERY... 2 | onote index [-d ] [-u ] [-p] 3 | 4 | -i , --index Path to index directory 5 | -d , --directory Path to index directory 6 | -u , --user User account to use for indexing 7 | -p, --purge Purge the index instead of creating it 8 | --version Show version 9 | --help 10 | """ 11 | 12 | import concurrent.futures 13 | import json 14 | import logging 15 | import os 16 | import shutil 17 | import subprocess 18 | 19 | from collections import namedtuple 20 | from concurrent.futures import ThreadPoolExecutor 21 | from docopt import docopt 22 | from html.parser import HTMLParser 23 | from pathlib import Path 24 | from typing import Generator 25 | 26 | from onote.auth import OneNoteAuthenticator, OneNoteSession 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | PAGES_URL = "https://graph.microsoft.com/v1.0/me/onenote/pages" 31 | 32 | VERSION = 0.1 33 | INDEX_DIR_PATH = Path(os.path.join(Path.home(), ".onote", "index")) 34 | 35 | 36 | def create_index(path: Path): 37 | index_managed = path.joinpath(".managed.json") 38 | index_meta = path.joinpath("meta.json") 39 | if not path.exists(): 40 | logger.info(f"Creating '{path}'") 41 | os.makedirs(path) 42 | if not index_managed.exists(): 43 | logger.info(f"Creating '{index_managed}'") 44 | with open("index/.managed.json", "r") as r: 45 | with open(index_managed, "w") as w: 46 | w.write(r.read()) 47 | if not index_meta.exists(): 48 | logger.info(f"Creating '{index_meta}'") 49 | with open("index/meta.json", "r") as r: 50 | with open(index_meta, "w") as w: 51 | w.write(r.read()) 52 | 53 | 54 | def purge_index(path: Path): 55 | if path.exists(): 56 | logger.info(f"Purging '{path}'") 57 | shutil.rmtree(path) 58 | 59 | 60 | class HtmlOnenoteContentParser(HTMLParser): 61 | def __init__(self, *args, **kwargs): 62 | super().__init__(*args, **kwargs) 63 | 64 | self.is_ptag = False 65 | self._content = [] 66 | 67 | def handle_starttag(self, tag, attrs): 68 | self.is_ptag = (tag == "p") 69 | 70 | def handle_data(self, data): 71 | if self.is_ptag: 72 | self._content.append(data) 73 | 74 | @property 75 | def content(self): 76 | return " ".join(self._content) 77 | 78 | 79 | class IndexError(Exception): 80 | pass 81 | 82 | 83 | class SearchError(Exception): 84 | pass 85 | 86 | 87 | def index(index_path, downloader): 88 | """ 89 | Onenote PAGES_URL returns paginated list pages. 90 | Idea is to concurrently download a list of page content urls and index in a single batch using tantivy 91 | """ 92 | Page = namedtuple('Page', ['title', 'content_url', 'weblink']) 93 | 94 | def tantivy(pages): 95 | p = subprocess.run(["tantivy", "index", "--index", index_path], 96 | input=pages, encoding="utf8", stderr=subprocess.PIPE) 97 | if p.returncode != 0: 98 | raise IndexError(p.stderr) 99 | 100 | future_to_page = {} 101 | with ThreadPoolExecutor() as executor: 102 | # download page content 103 | pages_url = PAGES_URL 104 | while pages_url is not None: 105 | logger.debug(f"Downloading {pages_url}") 106 | r = downloader(pages_url) 107 | r.raise_for_status() 108 | 109 | c = json.loads(r.text) 110 | pages = c["value"] 111 | for p in pages: 112 | page = Page(title=p["title"], content_url=p["contentUrl"], weblink=p["links"]["oneNoteWebUrl"]["href"]) 113 | future = executor.submit(downloader, page.content_url) 114 | future_to_page[future] = page 115 | 116 | logger.info(f"Downloading content page '{page.title}' from '{page.content_url}'") 117 | 118 | pages_url = c.get("@odata.nextLink") 119 | 120 | # index pages 121 | logger.info("Indexing pages") 122 | indexable_pages = [] 123 | for future in concurrent.futures.as_completed(future_to_page): 124 | page = future_to_page[future] 125 | if future.exception(): 126 | logger.warning(f"Failed to index {page.title} because {future.exception()}") 127 | continue 128 | 129 | r = future.result() 130 | content_parser = HtmlOnenoteContentParser() 131 | content_parser.feed(r.text) 132 | content = content_parser.content 133 | 134 | indexable_pages.append(json.dumps({ 135 | "title": page.title, 136 | "content": content, 137 | "url": page.weblink 138 | })) 139 | d = "\n".join(indexable_pages) 140 | tantivy(d) 141 | 142 | logger.info(f"Total {len(indexable_pages)} pages have been indexed") 143 | 144 | 145 | SearchResult = namedtuple('SearchResult', ['title', 'url']) 146 | 147 | 148 | def search(query, index_path) -> Generator[SearchResult, None, None]: 149 | p = subprocess.run(["tantivy", "search", "--index", index_path, "-q", query], 150 | encoding="utf8", capture_output=True) 151 | if p.returncode != 0: 152 | raise SearchError(p.stderr) 153 | out = p.stdout.strip() 154 | if out: 155 | for r in out.split('\n'): 156 | c = json.loads(r) 157 | yield SearchResult(title=c["title"][0], url=c["url"][0]) 158 | 159 | 160 | def main(): 161 | logging.basicConfig(level=logging.INFO) 162 | 163 | args = docopt(__doc__, version=VERSION) 164 | 165 | # index 166 | if args["index"]: 167 | path = Path(args["--directory"] if args["--directory"] else INDEX_DIR_PATH) 168 | if args["--purge"]: 169 | purge_index(path) 170 | exit(0) 171 | 172 | logger.debug(f"Looking for index at path {path}") 173 | # TODO: have to look into deleting an existing document in Tantivy. Until then 174 | # we have to purge existing index and re-index it again. 175 | purge_index(path) 176 | create_index(path) 177 | 178 | client_id = "595d2745-c735-44f9-b568-c709fbefce81" 179 | scopes = ["user.read", "notes.read"] 180 | user_name = args["--user"] 181 | authenticator = OneNoteAuthenticator(user_name, client_id, scopes) 182 | downloader = OneNoteSession(authenticator).get 183 | try: 184 | index(path, downloader) 185 | exit(0) 186 | except IndexError as e: 187 | logger.error(f"Indexing failed with message '{e}'") 188 | exit(1) 189 | 190 | # search 191 | if args["search"]: 192 | q = ' '.join(args["QUERY"]).strip() 193 | path = args["--index"] if args["--index"] else INDEX_DIR_PATH 194 | try: 195 | for r in search(q, path): 196 | print(f"title: {r.title}\nurl: {r.url}") 197 | print() 198 | except SearchError as e: 199 | logger.error(f"Search failed with message '{e}'") 200 | exit(1) 201 | --------------------------------------------------------------------------------