├── .github └── workflows │ ├── docker.yml │ └── main.yml ├── .gitignore ├── .vscode ├── extensions.json ├── launch.json └── settings.json ├── Dockerfile.linux ├── LICENSE ├── Makefile ├── README.md ├── bookworm_genai ├── __init__.py ├── __main__.py ├── commands │ ├── __init__.py │ ├── ask.py │ ├── export.py │ └── sync.py ├── integrations.py ├── metadata.py ├── models.py ├── storage.py └── utils.py ├── poetry.lock ├── pyproject.toml └── tests ├── test_ask.py ├── test_export.py ├── test_main.py ├── test_models.py ├── test_storage.py ├── test_sync.py └── test_utils.py /.github/workflows/docker.yml: -------------------------------------------------------------------------------- 1 | name: docker 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - main 7 | pull_request: 8 | jobs: 9 | linux: 10 | runs-on: ubuntu-latest 11 | timeout-minutes: 10 12 | strategy: 13 | fail-fast: false 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Run Docker 17 | run: | 18 | make docker_linux -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | on: 3 | push: 4 | branches: 5 | - master 6 | - main 7 | pull_request: 8 | jobs: 9 | build: 10 | runs-on: ${{ matrix.os }} 11 | timeout-minutes: 10 12 | permissions: 13 | contents: read 14 | pull-requests: write 15 | strategy: 16 | fail-fast: false 17 | matrix: 18 | # https://devguide.python.org/versions/ 19 | python-version: ['3.9', '3.10', '3.11', '3.12'] 20 | poetry-version: ['1.4.2'] 21 | os: [ubuntu-latest] # , windows-latest] 22 | steps: 23 | - uses: actions/checkout@v3 24 | - uses: actions/setup-python@v4 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - uses: abatilo/actions-poetry@v2 28 | with: 29 | poetry-version: ${{ matrix.poetry-version }} 30 | - name: Poetry Check 31 | run: poetry check 32 | - name: Install Dependencies 33 | run: | 34 | poetry build 35 | 
poetry install 36 | - name: Lint 37 | run: make lint 38 | - name: Test 39 | run: make test 40 | - name: Code Coverage 41 | if: matrix.python-version == '3.12' && matrix.poetry-version == '1.4.2' && matrix.os == 'ubuntu-latest' && github.ref != 'refs/heads/main' 42 | run: make coverage 43 | - name: Code Coverage Report 44 | if: matrix.python-version == '3.12' && matrix.poetry-version == '1.4.2' && matrix.os == 'ubuntu-latest' && github.ref != 'refs/heads/main' 45 | uses: romeovs/lcov-reporter-action@2a28ec3e25fb7eae9cb537e9141603486f810d1a # https://github.com/romeovs/lcov-reporter-action/issues/46 46 | with: 47 | lcov-file: coverage.lcov 48 | github-token: ${{ secrets.GITHUB_TOKEN }} 49 | delete-old-comments: true 50 | release: 51 | runs-on: ubuntu-latest 52 | timeout-minutes: 10 53 | needs: [build] 54 | permissions: 55 | contents: write 56 | pull-requests: write 57 | steps: 58 | - uses: actions/checkout@v3 59 | - uses: abatilo/actions-poetry@v2 60 | - name: Release 61 | id: release 62 | uses: go-semantic-release/action@v1 63 | if: github.ref == 'refs/heads/main' 64 | with: 65 | github-token: ${{ secrets.GITHUB_TOKEN }} 66 | allow-initial-development-versions: true 67 | # Prerelease 68 | - name: Bump Poetry (Prerelease) 69 | id: prerelease 70 | if: github.ref != 'refs/heads/main' && github.event_name == 'pull_request' 71 | run: | 72 | python -m pip install semantic-version 73 | 74 | # extract the current version 75 | VERSION=$(cat pyproject.toml | grep "^version =" | cut -d= -f2 | sed 's/"//g' | sed 's/^\s*\|\s*$//g') 76 | 77 | # bump the version 78 | # it's difficult to know if this should be patch, minor, major ahead of time as it would require running the actual release action 79 | VERSION=$(python -c "from semantic_version import Version; v = Version('${VERSION}'); v = v.next_patch(); print(v)") 80 | 81 | # add prerelease version identifier 82 | VERSION="${VERSION}b${{ github.run_number }}" # _${{ github.run_attempt }}" 83 | 84 | poetry version $VERSION 85 | 86 
| # ensure the published package links back to the pull request 87 | PULL_REQUEST_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH") 88 | sed "s/repository =.*/repository = \"https:\/\/github.com\/kiran94\/bookworm\/pull\/$PULL_REQUEST_NUMBER\"/" pyproject.toml -i 89 | 90 | echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT 91 | - name: Deploy (Prerelease) 92 | if: github.ref != 'refs/heads/main' && github.event_name == 'pull_request' 93 | run: | 94 | poetry build 95 | poetry publish -u __token__ -p $POETRY_PYPI_TOKEN_PYPI 96 | env: 97 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }} 98 | - name: Comment (Prerelease) 99 | if: github.ref != 'refs/heads/main' && github.event_name == 'pull_request' 100 | uses: thollander/actions-comment-pull-request@v2 101 | with: 102 | message: | 103 | Change was prereleased to pypi. Try it out :rocket: 104 | 105 | ```bash 106 | python -m pip install bookworm_genai==${{steps.prerelease.outputs.VERSION}} 107 | ``` 108 | 109 | https://pypi.org/project/bookworm_genai/${{steps.prerelease.outputs.VERSION}}/ 110 | comment_tag: execution 111 | # Main Release 112 | - name: Bump Poetry 113 | if: github.ref == 'refs/heads/main' && steps.release.outputs.version != '' 114 | run: poetry version ${{ steps.release.outputs.version }} 115 | - name: Deploy 116 | if: github.ref == 'refs/heads/main' && steps.release.outputs.version != '' 117 | run: | 118 | poetry build 119 | poetry publish -u __token__ -p $POETRY_PYPI_TOKEN_PYPI 120 | env: 121 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }} 122 | - uses: EndBug/add-and-commit@v9 123 | if: github.ref == 'refs/heads/main' && steps.release.outputs.version != '' 124 | with: 125 | message: "chore(version): bump" 126 | default_author: github_actions 127 | add: pyproject.toml 128 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by 
https://www.toptal.com/developers/gitignore/api/python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | cover/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | .pybuilder/ 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | # For a library or package, you might want to ignore these files since the code is 91 | # intended to run in multiple environments; otherwise, check them in: 92 | # .python-version 93 | 94 | # pipenv 95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 98 | # install all needed dependencies. 99 | #Pipfile.lock 100 | 101 | # poetry 102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 103 | # This is especially recommended for binary packages to ensure reproducibility, and is more 104 | # commonly ignored for libraries. 105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 106 | #poetry.lock 107 | 108 | # pdm 109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 110 | #pdm.lock 111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 112 | # in version control. 113 | # https://pdm.fming.dev/#use-with-ide 114 | .pdm.toml 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged 
into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 164 | #.idea/ 165 | 166 | ### Python Patch ### 167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 168 | poetry.toml 169 | 170 | # ruff 171 | .ruff_cache/ 172 | 173 | # LSP config files 174 | pyrightconfig.json 175 | 176 | # End of https://www.toptal.com/developers/gitignore/api/python 177 | 178 | .duckdb 179 | .json 180 | .sqlite 181 | *.csv 182 | 183 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-python.python", 4 | "ms-python.vscode-pylance", 5 | "ms-python.debugpy", 6 | "ms-toolsai.jupyter", 7 | "yzhang.markdown-all-in-one", 8 | "charliermarsh.ruff", 9 | "hbenl.vscode-test-explorer", 10 | ] 11 | } -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Bookworm: Sync", 6 | "type": "debugpy", 7 | "request": "launch", 8 | "module": "bookworm_genai", 9 | "envFile": "${workspaceFolder}/.env", 10 | "env": { 11 | "LOGGING_LEVEL": "DEBUG" 12 | }, 13 | "args": [ 14 | "sync" 15 | ] 16 | }, 17 | { 18 | "name": "Bookworm: Sync (Browser Filter)", 19 | "type": "debugpy", 20 | "request": "launch", 21 | "module": "bookworm_genai", 22 | "envFile": "${workspaceFolder}/.env", 23 | "env": { 24 | "LOGGING_LEVEL": "DEBUG" 25 | }, 26 | "args": [ 27 | "sync", 28 | "--browser-filter", "${input:browser}" 29 | ] 30 | }, 31 | { 32 | "name": "Bookworm: Ask", 33 | "type": "debugpy", 34 | "request": "launch", 35 | "module": "bookworm_genai", 36 | "envFile": "${workspaceFolder}/.env", 37 | "env": { 38 | "LOGGING_LEVEL": "DEBUG" 
39 | }, 40 | "args": [ 41 | "ask" 42 | ] 43 | } 44 | ], 45 | "inputs": [ 46 | { 47 | "id": "browser", 48 | "type": "pickString", 49 | "description": "Pick the browser you want to filter on:", 50 | "options": [ 51 | "chrome", 52 | "firefox", 53 | "brave" 54 | ], 55 | "default": "chrome" 56 | } 57 | ] 58 | } 59 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "notebook.formatOnSave.enabled": true, 8 | "[python]": { 9 | "editor.formatOnSave": true, 10 | "editor.defaultFormatter": "charliermarsh.ruff", 11 | } 12 | } -------------------------------------------------------------------------------- /Dockerfile.linux: -------------------------------------------------------------------------------- 1 | FROM python:3.9-slim 2 | 3 | RUN python3 -m pip install poetry==1.4.2 4 | 5 | RUN useradd -m -s /bin/bash my_user \ 6 | && echo "my_user:password" | chpasswd \ 7 | && usermod -aG sudo my_user 8 | 9 | COPY . 
/home/my_user/app 10 | WORKDIR /home/my_user/app 11 | RUN chown -R my_user:my_user /home/my_user/app 12 | 13 | USER my_user 14 | 15 | RUN poetry install 16 | RUN poetry run pytest -v -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2024 Kiran Patel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | OS := $(shell uname) 2 | 3 | test: 4 | poetry run pytest -vv 5 | 6 | lint: 7 | poetry run ruff check $(if $(GITHUB_ACTIONS),--output-format github,) . 
8 | 9 | format: 10 | poetry run ruff format 11 | 12 | coverage: 13 | poetry run pytest -q --cov=bookworm_genai --cov-report=term # for local 14 | poetry run pytest -q --cov=bookworm_genai --cov-report=html # for local 15 | 16 | # for sonarqube 17 | $(if $(GITHUB_ACTIONS),poetry run pytest -q --cov=bookworm_genai --cov-report=xml,) 18 | 19 | # for github action 20 | $(if $(GITHUB_ACTIONS),poetry run pytest -q --cov=bookworm_genai --cov-report=lcov,) 21 | 22 | check_database: 23 | ifeq ($(OS),Darwin) 24 | duckdb "/Users/kiran/Library/Application Support/bookworm/bookmarks.duckdb" -c 'SELECT * FROM embeddings LIMIT 5; SELECT COUNT(*) FROM embeddings' 25 | else ifeq ($(OS),Linux) 26 | duckdb ~/.local/share/bookworm/bookmarks.duckdb -c 'SELECT * FROM embeddings LIMIT 5; SELECT COUNT(*) FROM embeddings' 27 | else 28 | @echo "OS not supported" 29 | endif 30 | 31 | # Useful if you are running on non-linux machine 32 | # and want to verify tests are still working on that platform 33 | docker_linux: 34 | docker build -f Dockerfile.linux -t bookworm_linux . 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # bookworm 📖 2 | 3 | [![main](https://github.com/kiran94/bookworm/actions/workflows/main.yml/badge.svg)](https://github.com/kiran94/bookworm/actions/workflows/main.yml) [![PyPI version](https://badge.fury.io/py/bookworm_genai.svg)](https://badge.fury.io/py/bookworm_genai) 4 | 5 | > LLM-powered bookmark search engine 6 | 7 | `bookworm` allows you to search from your local browser bookmarks using natural language. For times when you have a large collection of bookmarks and you can't quite remember where you put that one website you need at the moment. 
8 | 9 | [![asciicast](https://asciinema.org/a/696722.svg)](https://asciinema.org/a/696722) 10 | 11 | *In the example above, we search for the term “Japan.” While some results don’t explicitly mention the word, terms like “Osaka” appear because they are closely related to the search term based on OpenAI embeddings.* 12 | 13 | ## Install 14 | 15 | ```bash 16 | python -m pip install bookworm_genai 17 | ``` 18 | 19 | > [!TIP] 20 | > If you are using [`uvx`](https://docs.astral.sh/uv/guides/tools/) then you can also just run this: 21 | > ```bash 22 | > uvx --from bookworm_genai bookworm --help 23 | > ``` 24 | 25 | ## Usage 26 | 27 | ```bash 28 | export OPENAI_API_KEY= 29 | 30 | # Run once and then anytime bookmarks across supported browsers changes 31 | bookworm sync 32 | 33 | # Sync bookmarks only from a specific browser 34 | bookworm sync --browser-filter chrome 35 | 36 | # Ask questions against the bookmark database 37 | bookworm ask 38 | 39 | # Ask questions against the bookmark database 40 | # Specify the query when invoking the command 41 | # If you omit this then you will be asked for a query when the tool is running 42 | bookworm ask -q pandas 43 | 44 | # Ask questions against the bookmark database and specify the number of results that should come back 45 | bookworm ask -n 1 46 | ``` 47 | 48 | The `sync` process currently supports the following configurations: 49 | 50 | | Operating System | Google Chrome | Mozilla Firefox | Brave | Microsoft Edge | 51 | | ------------------ | --------------- | ----------------- | ------- | ---------------- | 52 | | **Linux** | ✅ | ✅ | ✅ | ❌ | 53 | | **macOS** | ✅ | ✅ | ✅ | ❌ | 54 | | **Windows** | ❌ | ❌ | ❌ | ❌ | 55 | 56 | > [!TIP] 57 | > ✨ Want to contribute? See the [adding an integration](#adding-an-integration) section. 58 | 59 | ## Processes 60 | 61 | *`bookworm sync`* 62 | 63 | Vectorize your bookmarks across all supported browsers. 
64 | 65 | ```mermaid 66 | graph LR 67 | 68 | subgraph Bookmarks 69 | Chrome(Chrome Bookmarks) 70 | Brave(Brave Bookmarks) 71 | Firefox(Firefox Bookmarks) 72 | end 73 | 74 | Bookworm(bookworm sync) 75 | 76 | EmbeddingsService(Embeddings Service e.g OpenAIEmbeddings) 77 | 78 | VectorStore(Vector Store e.g DuckDB) 79 | 80 | Chrome -->|load bookmarks|Bookworm 81 | Brave -->|load bookmarks|Bookworm 82 | Firefox -->|load bookmarks|Bookworm 83 | 84 | Bookworm -->|vectorize bookmarks|EmbeddingsService-->|store embeddings|VectorStore 85 | ``` 86 | 87 |
88 | Details 89 | 90 | The vector database depicted above is stored locally on your machine. You can check its location by running the following after installing this project: 91 | 92 | ```python 93 | from platformdirs import PlatformDirs 94 | 95 | print(PlatformDirs('bookworm').user_data_dir) 96 | ``` 97 | 98 |
99 | 100 | --- 101 | 102 | *`bookworm ask`* 103 | 104 | Search from your bookmarks 105 | 106 | ```mermaid 107 | graph LR 108 | 109 | query 110 | Bookworm(bookworm ask) 111 | 112 | subgraph _ 113 | LLM(LLM e.g OpenAI) 114 | VectorStore(Vector Store e.g DuckDB) 115 | end 116 | 117 | query -->|user queries for information|Bookworm 118 | 119 | Bookworm -->|similarity search|VectorStore -->|send similar docs + user query|LLM 120 | LLM -->|send back response|Bookworm 121 | ``` 122 | 123 | --- 124 | 125 | *`bookworm export`* 126 | 127 | Export your bookmarks across all supported browsers into an output (e.g CSV) 128 | 129 | ```mermaid 130 | graph LR 131 | 132 | VectorStore 133 | Bookworm(bookworm export) 134 | CSV(bookmarks.csv) 135 | 136 | VectorStore -->|extract all bookmarks|Bookworm 137 | Bookworm -->|export into file|CSV 138 | ``` 139 | 140 | ## Developer Setup 141 | 142 | ```bash 143 | # LLMs 144 | export OPENAI_API_KEY= 145 | 146 | # Langchain (optional, but useful for debugging) 147 | export LANGCHAIN_API_KEY= 148 | export LANGCHAIN_TRACING_V2=true 149 | export LANGCHAIN_PROJECT=bookworm 150 | 151 | # Misc (optional) 152 | export LOGGING_LEVEL=INFO 153 | ``` 154 | 155 | Recommendations: 156 | 157 | - Install [`pyenv`](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) and ensure [build dependencies are installed](https://github.com/pyenv/pyenv?tab=readme-ov-file#install-python-build-dependencies) for your OS. 158 | - Install [Poetry](https://python-poetry.org/docs/) we will be using [environment management](https://python-poetry.org/docs/managing-environments/) below. 159 | - VS Code Extensions recommendations can be found [here](./.vscode/extensions.json) and will be suggested upon first opening the project. 160 | 161 | 162 | ```bash 163 | poetry env use 3.9 # or path to your 3.9 installation 164 | 165 | poetry shell 166 | poetry install 167 | 168 | bookworm --help 169 | ``` 170 | 171 |
172 | Running Linux tests on MacOS/Windows 173 | 174 | If you are running on a non-Linux machine, it may be helpful to build the provided [Dockerfile](./Dockerfile.linux) to verify that the tests still pass in a Linux environment. 175 | 176 | You can build this via: 177 | 178 | ```bash 179 | make docker_linux 180 | ``` 181 | 182 | You will need to have Docker installed to run this. 183 | 184 |
185 | 186 | ## Adding an Integration 187 | 188 | As you can see from [usage](#usage), bookworm supports various integrations but not all. If you find one that is missing and you want to add support for it, then a change is needed inside [integrations.py](./bookworm_genai/integrations.py). 189 | 190 | You can see in that file there is a variable called `browsers` that follows this structure: 191 | 192 | ```python 193 | browsers = { 194 | "BROWSER": { 195 | "PLATFORM": { 196 | ... 197 | } 198 | } 199 | } 200 | ``` 201 | 202 | For example, to add Chrome support on Windows, you would go under the Chrome key and add a `win32` key that holds all the details. You can refer to existing examples, but generally those details describe *where* to find the bookmarks on the user's system along with how to *interpret* them. 203 | 204 | You can also find a full list of the document loaders supported [here](https://python.langchain.com/docs/integrations/document_loaders/). -------------------------------------------------------------------------------- /bookworm_genai/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import importlib.metadata 4 | 5 | from rich.logging import RichHandler 6 | 7 | LOGGING_LEVEL = os.environ.get("LOGGING_LEVEL", logging.INFO) 8 | LOGGING_FORMAT = os.environ.get("LOGGING_FORMAT", "%(message)s") 9 | 10 | __version__ = importlib.metadata.version(__name__) 11 | _is_debug = logging.getLevelName(LOGGING_LEVEL) == logging.DEBUG 12 | 13 | logging.basicConfig( 14 | level=LOGGING_LEVEL, 15 | format=LOGGING_FORMAT, 16 | handlers=[RichHandler(markup=True, show_path=_is_debug, show_time=_is_debug, show_level=_is_debug)], 17 | ) 18 | 19 | logging.getLogger("httpx").setLevel(logging.WARNING) 20 | -------------------------------------------------------------------------------- /bookworm_genai/__main__.py: -------------------------------------------------------------------------------- 1
def main():
    """Run the ``bookworm`` command line interface.

    Parses ``sys.argv`` and dispatches to one of the sub-commands:

    - ``sync``: load bookmarks from the configured browsers, or only estimate
      the cost of doing so when ``--estimate-cost`` is given.
    - ``ask``: search the bookmark database (prompting for a query when ``-q``
      is omitted), list the matches and open the one the user selects.
    - ``export``: write the stored bookmarks to the ``--output`` CSV file.
    """
    arg_parser = argparse.ArgumentParser(description="LLM-powered bookmark search engine")
    arg_parser.add_argument("--version", action="version", version=__version__)

    sub_parsers = arg_parser.add_subparsers(dest="command", help="Available commands", required=True)

    sync_parser = sub_parsers.add_parser("sync", help="Sync the bookmark database with the latest changes")
    sync_parser.add_argument("--estimate-cost", action="store_true", default=False, help="Estimate the cost of syncing the bookmark database")
    sync_parser.add_argument("--browser-filter", default=[], help="Only sync a subset of browsers", choices=Browser.list())

    ask_parser = sub_parsers.add_parser("ask", help="Search for a bookmark")
    ask_parser.add_argument("-n", "--top-n", type=int, default=3, help="Number of bookmarks to return")
    ask_parser.add_argument("-q", "--query", help="The Search Query")

    export_parser = sub_parsers.add_parser("export", help="Export bookmarks")
    export_parser.add_argument("--format", choices=["csv"], default="csv")
    export_parser.add_argument("--output", default="bookmarks.csv")

    args = arg_parser.parse_args(sys.argv[1:])

    logger.info("[bold green]Starting Bookworm 📖")
    logger.debug("Running on platform '%s' with version '%s'", sys.platform, __version__)

    logger.debug("Arguments: %s", args)

    if args.command == "sync":
        sync(browsers, estimate_cost=args.estimate_cost, browser_filter=args.browser_filter)

    elif args.command == "ask":
        # Fall back to an interactive prompt when no query was given on the command line.
        if not args.query:
            logger.info("What would you like to search for?")
            query = input("> ")
        else:
            query = args.query

        logger.debug("query: %s", query)

        with BookmarkChain(vector_store_search_n=args.top_n) as bookmark_chain:
            if not bookmark_chain.is_valid():
                logger.debug("bookmark chain is not valid, exiting early.")
                return

            logger.info("Searching for bookmarks...")
            bookmarks = bookmark_chain.ask(query)

            if not bookmarks.bookmarks:
                logger.info("""
                    No bookmarks found for the query 🙁. Please ensure you have performed a "bookworm sync" to update the database
                    and the query is relevant to the bookmarks stored.
                """)
                return

            for index, bookmark in enumerate(bookmarks.bookmarks):
                if logger.isEnabledFor(logging.DEBUG):
                    # also shows the source of the bookmark
                    logger.info(
                        f"[green][{index}] [/] {bookmark.title} - [link={bookmark.url}]{bookmark.url}[/link] ([green]{bookmark.source}[/])"
                    )  # pragma: no cover
                else:
                    logger.info(f"[green][{index}] [/] {bookmark.title} - [link={bookmark.url}]{bookmark.url}[/link] ([green]{bookmark.browser}[/])")

            logger.info("Press a number to open the bookmark:")
            # Keep prompting until the user enters one of the indexes listed above.
            while True:
                try:
                    raw_input = input("> ")
                    selected_index = int(raw_input)

                    if selected_index < 0:
                        # A negative index would silently wrap around and open a
                        # bookmark the user never selected; treat it as out of range.
                        raise IndexError(selected_index)

                    bookmarks.bookmarks[selected_index].open()

                    break
                except ValueError:
                    logger.warning(f"Invalid input: '{raw_input}'. Please enter a number.")
                except IndexError:
                    logger.warning(f"Invalid index: '{selected_index}'. Please select a valid index.")

    elif args.command == "export":
        bookmarks = export()

        logger.info(f"[blue]Exporting bookmarks to '{args.output}' [/]")
        bookmarks.to_csv(args.output, index=False)
class BookmarkChain:
    """Chain that answers bookmark queries from the local DuckDB vector store.

    Use it as a context manager so that the DuckDB connection opened in
    ``__init__`` is closed when the block exits.
    """

    def __init__(self, vector_store_search_n: int = 3):
        """Open the vector database and assemble the retrieval chain.

        Args:
            vector_store_search_n: number of documents the retriever fetches
                from the vector store for each query (passed as ``k``).

        Raises:
            Any error raised while building the chain (for example the
            ``ValueError`` from ``_get_llm``) is re-raised after the DuckDB
            connection has been closed.
        """
        full_database_path = _get_local_store()
        logger.debug("Connecting to vector database at: %s", full_database_path)
        self._duckdb_connection = duckdb.connect(full_database_path, read_only=False)

        try:
            self.vector_store = DuckDBVectorStore(connection=self._duckdb_connection, embedding=_get_embedding_store())

            # The model is asked to answer with a structured ``Bookmarks`` object.
            llm = _get_llm().with_structured_output(Bookmarks)

            prompt = ChatPromptTemplate.from_messages([("system", _system_message), ("human", "{query}")])

            search_kwargs = {"k": vector_store_search_n}

            self.chain = {"context": self.vector_store.as_retriever(search_kwargs=search_kwargs), "query": RunnablePassthrough()} | prompt | llm
        except Exception:
            # __exit__ never runs when construction fails, so release the
            # connection here instead of leaking it.
            self._duckdb_connection.close()
            raise

    def ask(self, query: str) -> Bookmarks:
        """Invoke the chain with ``query`` and return its result."""
        logger.debug("Searching for bookmarks with query: %s", query)

        return self.chain.invoke(query)

    def is_valid(self) -> bool:
        """Return True when the ``embeddings`` table holds at least one row.

        Logs a warning and returns False when the table is empty or when the
        count query comes back in an unexpected shape.
        """
        res = self._duckdb_connection.execute("SELECT COUNT(*) FROM embeddings").fetchall()

        try:
            res = res[0][0]
        except (IndexError, TypeError) as e:
            logger.warning("validation check failed due to unexpected response from the database.")
            logger.debug("Error: %s", e)
            logger.debug("Raw DuckDB Response: %s", res)

            return False

        if res == 0:
            logger.warning("No bookmarks were found in database. Please ensure you run 'bookworm sync' before asking questions")
            return False
        else:
            return True

    def __enter__(self):
        """Return the chain itself for use inside a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the DuckDB connection; exceptions from the block are not suppressed."""
        logger.debug("Closing DuckDB connection")

        self._duckdb_connection.close()
df.drop(columns=["id", "metadata", "text", "embedding"]) 27 | 28 | bookmark_summary_df = pd.concat([cleaned_df, name_col, url_col, browser_col, source_col], axis=1) 29 | return bookmark_summary_df 30 | -------------------------------------------------------------------------------- /bookworm_genai/commands/sync.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import logging 5 | import shutil 6 | from typing import Optional, Union 7 | 8 | import tiktoken 9 | from langchain_core.documents import Document 10 | 11 | from bookworm_genai.integrations import Browser, browsers, BrowserManifest 12 | from bookworm_genai.storage import store_documents, _get_embedding_store 13 | from bookworm_genai.metadata import attach_metadata 14 | 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | def sync(browsers: BrowserManifest = browsers, estimate_cost: bool = False, browser_filter: list[str] = []) -> Union[None, float]: 20 | docs: list[Document] = [] 21 | 22 | for browser, config in browsers.items(): 23 | browser: Browser = browser 24 | 25 | if browser_filter and (browser.value not in browser_filter): 26 | logger.debug(f"browser {browser.value} skipped due to filter") 27 | continue 28 | 29 | try: 30 | platform_config = config[sys.platform] 31 | except KeyError: 32 | logger.warning(f"🔄 browser {browser.value} is not supported on {sys.platform} yet") 33 | continue 34 | else: 35 | if "copy" in platform_config: 36 | try: 37 | _copy(platform_config["copy"]) 38 | except BrowserBookmarkFileNotFound as e: 39 | logger.warning(f"🔄 browser {browser.value} skipped due to missing file '{e.file}'") 40 | continue 41 | 42 | _log_bookmark_source(browser, platform_config) 43 | 44 | config = platform_config["bookmark_loader_kwargs"] 45 | if "db" in config: 46 | if callable(config["db"]): 47 | config["db"] = config["db"](None) 48 | 49 | loader = platform_config["bookmark_loader"](**config) 50 | 51 | 
def _estimate_cost(docs: list[Document], cost_per_million: Optional[float] = None) -> float:
    """
    Estimate what it would cost to embed ``docs`` with the configured embedding model.

    Args:
        docs: documents whose ``page_content`` is tokenized and counted.
        cost_per_million: price per one million tokens. When None the user is prompted
            for it on stdin; an explicit ``0.0`` is honoured and does not trigger the prompt.

    Returns:
        The token count multiplied by the per-token price.

    Raises:
        ValueError: if the price typed at the prompt cannot be parsed as a float.
    """
    embedding = _get_embedding_store()

    # NOTE: using _get_embedding_store here means that it's more likely that the model we are using
    # in the actual embedding is the one we use for cost estimation
    # however note that .model here is not part of the contract for Embeddings
    # so this is a bit of a hack
    # if we add more embeddings options in the future, we need to re-evaluate this.
    encoding = tiktoken.encoding_for_model(embedding.model)

    logger.info(f"Estimating cost for {embedding.model}")

    tokens: int = sum(len(encoding.encode(doc.page_content)) for doc in docs)

    # Compare against None rather than truthiness: 0.0 is a legitimate caller-supplied price
    # and must not fall through to the interactive prompt.
    if cost_per_million is None:
        # https://openai.com/api/pricing/
        price = float(input(f"what is the current cost for {embedding.model} per million? (non-batch) "))
    else:
        price = cost_per_million

    # price is often advertised per million; so find the price per token
    price_per_token = price / 1_000_000

    # given the total number of tokens we have, apply the price per token
    cost = tokens * price_per_token

    logger.info(f"Estimated cost: ${cost} (tokens: {tokens}) ")

    return cost
class Browser(str, Enum):
    """Browser identifiers; each member is also a plain ``str`` equal to its lowercase name."""

    BRAVE = "brave"
    CHROME = "chrome"
    FIREFOX = "firefox"

    @classmethod
    def list(cls):
        """Return the string value of every member, in declaration order."""
        return [member.value for member in cls]
"jq_schema": CHROMIUM_JQ_COMMAND, 45 | "text_content": False, 46 | }, 47 | }, 48 | # "win32": {}, 49 | }, 50 | Browser.CHROME: { 51 | "linux": { 52 | "bookmark_loader": JSONLoader, 53 | "bookmark_loader_kwargs": { 54 | "file_path": os.path.expanduser("~/.config/google-chrome/Default/Bookmarks"), 55 | "jq_schema": CHROMIUM_JQ_COMMAND, 56 | "text_content": False, 57 | }, 58 | }, 59 | "darwin": { 60 | "bookmark_loader": JSONLoader, 61 | "bookmark_loader_kwargs": { 62 | "file_path": os.path.expanduser("~/Library/Application Support/Google/Chrome/Default/Bookmarks"), 63 | "jq_schema": CHROMIUM_JQ_COMMAND, 64 | "text_content": False, 65 | }, 66 | }, 67 | # "win32": {}, 68 | }, 69 | Browser.FIREFOX: { 70 | "linux": { 71 | "bookmark_loader": SQLDatabaseLoader, 72 | "bookmark_loader_kwargs": { 73 | "db": lambda _: SQLDatabase.from_uri("sqlite:////tmp/bookworm/firefox.sqlite"), 74 | "query": sql_loader_firefox_sql_query(), 75 | "source_columns": ["source"], 76 | "page_content_mapper": lambda row: sql_loader_page_content_mapper(row), 77 | }, 78 | "copy": { 79 | "from": sql_loader_firefox_copy_path(), 80 | "to": "/tmp/bookworm/firefox.sqlite", 81 | }, 82 | }, 83 | "darwin": { 84 | "bookmark_loader": SQLDatabaseLoader, 85 | "bookmark_loader_kwargs": { 86 | "db": lambda _: SQLDatabase.from_uri("sqlite:////tmp/bookworm/firefox.sqlite"), 87 | "query": sql_loader_firefox_sql_query(), 88 | "source_columns": ["source"], 89 | "page_content_mapper": lambda row: sql_loader_page_content_mapper(row), 90 | }, 91 | "copy": { 92 | "from": sql_loader_firefox_copy_path(), 93 | "to": "/tmp/bookworm/firefox.sqlite", 94 | }, 95 | }, 96 | # "win32": {}, 97 | }, 98 | } 99 | -------------------------------------------------------------------------------- /bookworm_genai/metadata.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from langchain_core.documents import Document 3 | 4 | from bookworm_genai import __version__ 5 | from 
class Bookmark(BaseModel):
    """
    A bookmark to a website
    """

    title: str = Field(description="The title of the bookmark")
    url: str = Field(description="The URL of the bookmark")
    source: str = Field(description="The source of the bookmark")
    browser: str = Field(description="The browser that the bookmark was saved from")

    def open(self):
        # Hand the URL to the platform's opener command without waiting for it to finish;
        # on an unrecognised platform the URL is only logged.
        platform = sys.platform

        if platform == "win32":
            # `start` is a shell builtin on Windows, so it has to be run through the shell.
            subprocess.Popen(["start", self.url], shell=True)
            return

        launcher = {"darwin": "open", "linux": "xdg-open"}.get(platform)
        if launcher is not None:
            subprocess.Popen([launcher, self.url])
            return

        logger.warning(f'Platform "{platform}" not supported. Printing URL instead')
        logger.info(self.url)
def sql_loader_page_content_mapper(row: RowMapping) -> str:
    """
    Serialise a SQL Loader row into the JSON page content stored in the vector database.

    The langchain SQLLoader and JSONLoader output different formats, so the row's "title" key is
    renamed to "name" here to keep the stored output consistent. The incoming row is copied
    first and is not modified.
    """
    content = dict(row)
    content["name"] = content.pop("title")

    return json.dumps(content)
52 | This query also embeds a literal column called 'source' which is the path to the database file. This is needed in the query so 53 | that when the SQL Loader runs we can tell it to put this source into the metadata. 54 | """ 55 | return f""" 56 | SELECT 57 | CAST(moz_places.id AS TEXT) AS id, 58 | moz_bookmarks.title, 59 | moz_places.url, 60 | CAST(moz_bookmarks.dateAdded AS TEXT) AS dateAdded, 61 | CAST(moz_bookmarks.lastModified AS TEXT) AS lastModified, 62 | '{sql_loader_firefox_copy_path()}' as source 63 | FROM 64 | moz_bookmarks 65 | LEFT JOIN 66 | moz_places 67 | ON 68 | moz_bookmarks.fk = moz_places.id 69 | WHERE 70 | moz_bookmarks.type = 1 71 | AND 72 | moz_bookmarks.title IS NOT NULL 73 | """ 74 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "bookworm_genai" 3 | version = "0.13.0" 4 | description = "Bookworm - A LLM-powered bookmark search engine" 5 | authors = ["kiran94"] 6 | readme = "README.md" 7 | license = "MIT" 8 | homepage = "https://pypi.org/project/bookworm_genai/" 9 | repository = "https://github.com/kiran94/bookworm" 10 | documentation = "https://github.com/kiran94/bookworm/blob/main/README.md" 11 | keywords = [ "bookmarks", "bookmark-manager", "genai", "chatbots" ] 12 | classifiers = [ 13 | "Intended Audience :: Developers", 14 | "Operating System :: OS Independent", 15 | "Topic :: Utilities", 16 | ] 17 | 18 | [tool.poetry.dependencies] 19 | python = "^3.9" 20 | langchain = "^0.2.12" 21 | langchain-community = "^0.2.11" 22 | langchain-openai = "^0.1.20" 23 | jq = "^1.7.0" 24 | duckdb = "^1.0.0" 25 | rich = "^13.7.1" 26 | platformdirs = "^4.2.2" 27 | pandas = "^2.2.2" 28 | tiktoken = "^0.7.0" 29 | 30 | [tool.poetry.group.dev.dependencies] 31 | pytest = "^8.3.2" 32 | ruff = "^0.5.6" 33 | pytest-cov = "^5.0.0" 34 | pytest-github-actions-annotate-failures = "^0.2.0" 35 | litecli = 
"^1.11.0" 36 | 37 | [build-system] 38 | requires = ["poetry-core"] 39 | build-backend = "poetry.core.masonry.api" 40 | 41 | [tool.poetry.scripts] 42 | bookworm = 'bookworm_genai.__main__:main' 43 | 44 | [tool.ruff] 45 | line-length = 160 46 | -------------------------------------------------------------------------------- /tests/test_ask.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import patch, Mock 3 | 4 | import pytest 5 | 6 | from bookworm_genai.commands.ask import BookmarkChain, _system_message, _get_llm 7 | from bookworm_genai.models import Bookmarks 8 | 9 | 10 | @patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True) 11 | @patch("bookworm_genai.commands.ask.ChatPromptTemplate") 12 | @patch("bookworm_genai.commands.ask.ChatOpenAI") 13 | @patch("bookworm_genai.commands.ask.DuckDBVectorStore") 14 | @patch("bookworm_genai.commands.ask.duckdb") 15 | @patch("bookworm_genai.commands.ask._get_embedding_store") 16 | @patch("bookworm_genai.commands.ask._get_local_store") 17 | def test_bookmark_chain_ask( 18 | mock_local_store: Mock, 19 | mock_embedding_store: Mock, 20 | mock_duckdb: Mock, 21 | mock_duckdb_vector: Mock, 22 | mock_chatopenai: Mock, 23 | mock_chat_prompt_template: Mock, 24 | ): 25 | mock_local_store.return_value = "/test/bookmark.duckdb" 26 | 27 | mock_duckdb_connection = Mock() 28 | mock_duckdb.connect.return_value = mock_duckdb_connection 29 | 30 | mock_embedding = Mock() 31 | mock_embedding_store.return_value = mock_embedding 32 | 33 | mock_llm = Mock() 34 | mock_chatopenai.return_value = mock_llm 35 | 36 | mock_chain = Mock(name="chain") 37 | mock_chat_prompt_template.from_messages.return_value.__ror__.return_value.__or__.return_value = mock_chain 38 | 39 | with BookmarkChain() as bc: 40 | # If this checks fails then most likely the chain constructed in the BookmarkChain has changed 41 | # review the mock_chain 42 | assert mock_chain == bc.chain 43 | 44 | 
bc.ask("test") 45 | 46 | mock_duckdb.connect.assert_called_once_with("/test/bookmark.duckdb", read_only=False) 47 | mock_duckdb_vector.assert_called_once_with(connection=mock_duckdb_connection, embedding=mock_embedding) 48 | mock_duckdb_vector.return_value.as_retriever.assert_called_once_with(search_kwargs={"k": 3}) 49 | assert mock_duckdb_connection.close.called 50 | 51 | mock_chatopenai.assert_called_once_with(temperature=0.0) 52 | mock_llm.with_structured_output.assert_called_once_with(Bookmarks) 53 | mock_chat_prompt_template.from_messages.assert_called_once_with([("system", _system_message), ("human", "{query}")]) 54 | 55 | mock_chain.invoke.assert_called_once_with("test") 56 | 57 | 58 | @patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True) 59 | @patch("bookworm_genai.commands.ask.ChatPromptTemplate") 60 | @patch("bookworm_genai.commands.ask.ChatOpenAI") 61 | @patch("bookworm_genai.commands.ask.DuckDBVectorStore") 62 | @patch("bookworm_genai.commands.ask.duckdb") 63 | @patch("bookworm_genai.commands.ask._get_embedding_store") 64 | @patch("bookworm_genai.commands.ask._get_local_store") 65 | def test_bookmark_chain_ask_n_parameter( 66 | mock_local_store: Mock, 67 | mock_embedding_store: Mock, 68 | mock_duckdb: Mock, 69 | mock_duckdb_vector: Mock, 70 | mock_chatopenai: Mock, 71 | mock_chat_prompt_template: Mock, 72 | ): 73 | n = 15 74 | with BookmarkChain(vector_store_search_n=n): 75 | pass 76 | 77 | mock_duckdb_vector.return_value.as_retriever.assert_called_once_with(search_kwargs={"k": n}) 78 | 79 | 80 | @patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True) 81 | @patch("bookworm_genai.commands.ask.ChatPromptTemplate") 82 | @patch("bookworm_genai.commands.ask.ChatOpenAI") 83 | @patch("bookworm_genai.commands.ask.DuckDBVectorStore") 84 | @patch("bookworm_genai.commands.ask.duckdb") 85 | @patch("bookworm_genai.commands.ask._get_embedding_store") 86 | @patch("bookworm_genai.commands.ask._get_local_store") 87 | def 
test_bookmark_chain_is_valid( 88 | mock_local_store: Mock, 89 | mock_embedding_store: Mock, 90 | mock_duckdb: Mock, 91 | mock_duckdb_vector: Mock, 92 | mock_chatopenai: Mock, 93 | mock_chat_prompt_template: Mock, 94 | ): 95 | mock_duckdb_connection = Mock() 96 | mock_duckdb.connect.return_value = mock_duckdb_connection 97 | 98 | mock_duckdb_connection.execute.return_value.fetchall.return_value = [(1,)] 99 | 100 | with BookmarkChain() as bc: 101 | assert bc.is_valid() 102 | 103 | mock_duckdb_connection.execute.assert_called_once_with("SELECT COUNT(*) FROM embeddings") 104 | 105 | 106 | @patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True) 107 | @patch("bookworm_genai.commands.ask.ChatPromptTemplate") 108 | @patch("bookworm_genai.commands.ask.ChatOpenAI") 109 | @patch("bookworm_genai.commands.ask.DuckDBVectorStore") 110 | @patch("bookworm_genai.commands.ask.duckdb") 111 | @patch("bookworm_genai.commands.ask._get_embedding_store") 112 | @patch("bookworm_genai.commands.ask._get_local_store") 113 | def test_bookmark_chain_is_valid_zero_count( 114 | mock_local_store: Mock, 115 | mock_embedding_store: Mock, 116 | mock_duckdb: Mock, 117 | mock_duckdb_vector: Mock, 118 | mock_chatopenai: Mock, 119 | mock_chat_prompt_template: Mock, 120 | ): 121 | mock_duckdb_connection = Mock() 122 | mock_duckdb.connect.return_value = mock_duckdb_connection 123 | 124 | mock_duckdb_connection.execute.return_value.fetchall.return_value = [(0,)] 125 | 126 | with BookmarkChain() as bc: 127 | assert not bc.is_valid() 128 | 129 | 130 | @patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True) 131 | @patch("bookworm_genai.commands.ask.ChatPromptTemplate") 132 | @patch("bookworm_genai.commands.ask.ChatOpenAI") 133 | @patch("bookworm_genai.commands.ask.DuckDBVectorStore") 134 | @patch("bookworm_genai.commands.ask.duckdb") 135 | @patch("bookworm_genai.commands.ask._get_embedding_store") 136 | @patch("bookworm_genai.commands.ask._get_local_store") 137 | @pytest.mark.parametrize( 
138 | "duckdb_response", 139 | [[], None], 140 | ) 141 | def test_bookmark_chain_is_valid_invalid_response( 142 | mock_local_store: Mock, 143 | mock_embedding_store: Mock, 144 | mock_duckdb: Mock, 145 | mock_duckdb_vector: Mock, 146 | mock_chatopenai: Mock, 147 | mock_chat_prompt_template: Mock, 148 | duckdb_response, 149 | ): 150 | mock_duckdb_connection = Mock() 151 | mock_duckdb.connect.return_value = mock_duckdb_connection 152 | 153 | mock_duckdb_connection.execute.return_value.fetchall.return_value = duckdb_response 154 | 155 | with BookmarkChain() as bc: 156 | assert not bc.is_valid() 157 | 158 | 159 | @patch.dict(os.environ, {}, clear=True) 160 | def test_get_llm_no_env(): 161 | with pytest.raises(ValueError, match="LLM service could not be configured"): 162 | _get_llm() 163 | -------------------------------------------------------------------------------- /tests/test_export.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch, call 2 | 3 | import pandas as pd 4 | 5 | from bookworm_genai.commands.export import export 6 | 7 | 8 | @patch("bookworm_genai.commands.export._get_local_store") 9 | @patch("bookworm_genai.commands.export.duckdb") 10 | def test_export(mock_duckdb: Mock, mock_get_local_store: Mock): 11 | df = pd.DataFrame( 12 | data={ 13 | "id": ["id"], 14 | "text": ['{"name": "my_bookmark", "url": "https://bookmark.com"}'], 15 | "embedding": [1], 16 | "metadata": ['{"source": "my_source", "browser": "chrome"}'], 17 | } 18 | ) 19 | 20 | mock_duckdb.connect.return_value.__enter__.return_value.execute.return_value.df.return_value = df 21 | 22 | result = export() 23 | 24 | assert mock_duckdb.connect.call_args_list == [call(mock_get_local_store.return_value, read_only=True)] 25 | 26 | expected_df = pd.DataFrame(data={"name": "my_bookmark", "url": "https://bookmark.com", "browser": "chrome", "source": "my_source"}, index=[0]) 27 | pd.testing.assert_frame_equal(expected_df, result) 28 | 
-------------------------------------------------------------------------------- /tests/test_main.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch, call 2 | 3 | import pytest 4 | 5 | from bookworm_genai.__main__ import main 6 | 7 | 8 | @patch("bookworm_genai.__main__.sys") 9 | def test_main_no_arguments(mock_sys: Mock): 10 | mock_sys.argv = ["script"] 11 | with pytest.raises(SystemExit, match="2"): 12 | main() 13 | 14 | 15 | @patch("bookworm_genai.__main__.browsers") 16 | @patch("bookworm_genai.__main__.sync") 17 | @patch("bookworm_genai.__main__.sys") 18 | def test_main_sync(mock_sys: Mock, mock_sync: Mock, mock_browsers: Mock): 19 | mock_sys.argv = ["script", "sync"] 20 | 21 | main() 22 | 23 | assert mock_sync.call_args_list == [call(mock_browsers, estimate_cost=False, browser_filter=[])] 24 | 25 | 26 | @patch("builtins.input") 27 | @patch("bookworm_genai.__main__.BookmarkChain") 28 | @patch("bookworm_genai.__main__.sys") 29 | def test_main_ask(mock_sys: Mock, mock_bookmark_chain: Mock, mock_input: Mock): 30 | mock_sys.argv = ["script", "ask"] 31 | mock_input.side_effect = ["pandas column", "0"] 32 | 33 | bc = Mock() 34 | mock_bookmark_chain.return_value.__enter__.return_value = bc 35 | 36 | bc.is_valid.return_value = True 37 | bc.ask.return_value = Mock( 38 | bookmarks=[ 39 | Mock(title="first", url="http://google.com", source="/file/hello.txt"), 40 | Mock(title="second", url="http://google.com", source="/file/hello.txt"), 41 | ] 42 | ) 43 | 44 | main() 45 | 46 | # We expect that this is called because in the mock_input above we are selecting index 0 to open 47 | assert bc.ask.return_value.bookmarks[0].open.called 48 | 49 | 50 | @patch("builtins.input") 51 | @patch("bookworm_genai.__main__.BookmarkChain") 52 | @patch("bookworm_genai.__main__.sys") 53 | def test_main_ask_query(mock_sys: Mock, mock_bookmark_chain: Mock, mock_input: Mock): 54 | query = "dummy search query" 55 | 56 | 
@patch("builtins.input")
@patch("bookworm_genai.__main__.BookmarkChain")
@patch("bookworm_genai.__main__.sys")
def test_main_ask_invalid_input(mock_sys: Mock, mock_bookmark_chain: Mock, mock_input: Mock):
    """Invalid selections are followed by further input until a valid index opens that bookmark."""
    mock_sys.argv = ["script", "ask"]

    # This simulates asking for a bookmark related to pandas columns,
    # then entering an invalid non-numeric input,
    # then entering an out-of-range index,
    # and then entering a valid number to open the bookmark
    mock_input.side_effect = ["pandas column", "NOT_A_NUMBER", "999", "1"]

    bc = Mock()
    mock_bookmark_chain.return_value.__enter__.return_value = bc

    bc.is_valid.return_value = True
    bc.ask.return_value = Mock(
        bookmarks=[
            Mock(title="first", url="http://google.com", source="/file/hello.txt"),
            Mock(title="second", url="http://google.com", source="/file/hello.txt"),
        ]
    )

    main()

    # The final input was "1", so the second bookmark is the one expected to be opened
    assert bc.ask.return_value.bookmarks[1].open.called
18 | @patch("bookworm_genai.models.sys") 19 | def test_bookmark_open(mock_platform: Mock, mock_subprocess: Mock, platform: str, expected_subprocess_call: call): 20 | mock_platform.platform = platform 21 | 22 | bookmark = Bookmark(title="Google", url="https://www.google.com", source="Google", browser=Browser.CHROME.value) 23 | bookmark.open() 24 | 25 | assert mock_subprocess.Popen.call_args == expected_subprocess_call 26 | 27 | 28 | @patch("bookworm_genai.models.logger") 29 | @patch("bookworm_genai.models.subprocess") 30 | @patch("bookworm_genai.models.sys") 31 | def test_bookmark_unsupported_os(mock_platform: Mock, mock_subprocess: Mock, mock_logger: Mock): 32 | mock_platform.platform = "chromeos" 33 | 34 | bookmark = Bookmark(title="Google", url="https://www.google.com", source="Google", browser=Browser.CHROME.value) 35 | bookmark.open() 36 | 37 | assert mock_logger.warning.call_args == call('Platform "chromeos" not supported. Printing URL instead') 38 | assert mock_logger.info.call_args == call("https://www.google.com") 39 | -------------------------------------------------------------------------------- /tests/test_storage.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import patch, Mock, call 3 | 4 | import pytest 5 | 6 | from bookworm_genai.storage import store_documents 7 | 8 | 9 | @patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True) 10 | @patch("bookworm_genai.storage.OpenAIEmbeddings") 11 | @patch("bookworm_genai.storage.DuckDBVectorStore") 12 | @patch("bookworm_genai.storage.duckdb") 13 | @patch("bookworm_genai.storage.PlatformDirs") 14 | @patch("bookworm_genai.storage.os.makedirs") 15 | def test_store_documents( 16 | mock_os_makedirs: Mock, 17 | mock_platform_dirs: Mock, 18 | mock_duckdb: Mock, 19 | mock_duckdb_vector: Mock, 20 | mock_openai_embeddings: Mock, 21 | ): 22 | docs = [Mock(), Mock()] 23 | 24 | mock_user_data_dir = "/test" 25 | 
mock_platform_dirs.return_value.user_data_dir = mock_user_data_dir 26 | 27 | store_documents(docs) 28 | 29 | assert mock_platform_dirs.call_args_list == [call("bookworm", "bookworm")] 30 | assert mock_duckdb.connect.call_args_list == [call(f"{mock_user_data_dir}/bookmarks.duckdb")] 31 | assert mock_os_makedirs.call_args_list == [call(mock_user_data_dir, exist_ok=True)] 32 | 33 | mock_duckb_connection = mock_duckdb.connect.return_value.__enter__.return_value 34 | assert mock_duckb_connection.execute.call_args_list == [call("DROP TABLE IF EXISTS embeddings")] 35 | assert mock_duckdb_vector.from_documents.call_args_list == [call(docs, mock_openai_embeddings.return_value, connection=mock_duckb_connection)] 36 | 37 | 38 | @patch.dict(os.environ, {}, clear=True) 39 | @patch("bookworm_genai.storage.PlatformDirs") 40 | @patch("bookworm_genai.storage.os.makedirs") 41 | def test_no_proper_embedding_environment( 42 | mock_os_makedirs: Mock, 43 | mock_platform_dirs: Mock, 44 | ): 45 | docs = [Mock(), Mock()] 46 | 47 | with pytest.raises(ValueError, match="Embeddings service could not be configured"): 48 | store_documents(docs) 49 | -------------------------------------------------------------------------------- /tests/test_sync.py: -------------------------------------------------------------------------------- 1 | import os 2 | from getpass import getuser 3 | import sys 4 | from unittest.mock import patch, Mock, call, ANY 5 | 6 | import pytest 7 | 8 | from bookworm_genai import __version__ 9 | from bookworm_genai.commands.sync import _estimate_cost, sync 10 | from bookworm_genai.integrations import Browser, browsers 11 | from bookworm_genai.metadata import Metadata 12 | from bookworm_genai.utils import sql_loader_firefox_sql_query 13 | 14 | 15 | def _mock_browsers_config(platform: str = "linux", mocked_documents: list[any] = ["DOC1", "DOC2"]): 16 | new_browsers = browsers.copy() 17 | 18 | for browser, config in new_browsers.items(): 19 | mock_loader = Mock() 20 | 
mock_loader.return_value.lazy_load.return_value = mocked_documents 21 | 22 | for platform in config: 23 | try: 24 | config[platform]["bookmark_loader"] = mock_loader 25 | except KeyError: 26 | continue 27 | 28 | if "db" in config[platform]["bookmark_loader_kwargs"]: 29 | mock_sqlite = Mock() 30 | mock_sqlite.return_value.return_value._engine.url = "mocked_database_connection" 31 | 32 | config[platform]["bookmark_loader_kwargs"]["db"] = mock_sqlite 33 | 34 | return new_browsers 35 | 36 | 37 | def _collect_browser_calls(platform: str, browsers: dict) -> tuple[list[str], list[call]]: 38 | collected_file_paths: list[str] = [] 39 | collected_loader_calls: list[call] = [] 40 | 41 | for browser, config in browsers.items(): 42 | if platform not in config: 43 | continue 44 | 45 | if "file_path" in config[platform]["bookmark_loader_kwargs"]: 46 | collected_file_paths.append(config[platform]["bookmark_loader_kwargs"]["file_path"]) 47 | elif "db" in config[platform]["bookmark_loader_kwargs"]: 48 | path = config[platform]["bookmark_loader_kwargs"]["db"] 49 | if callable(path): 50 | path = path(path) 51 | collected_file_paths.append(path._engine.url) 52 | 53 | collected_loader_calls.extend(config[platform]["bookmark_loader"].call_args_list) 54 | 55 | return collected_file_paths, collected_loader_calls 56 | 57 | 58 | @pytest.mark.skipif(sys.platform != "linux", reason="this test is only for linux") 59 | @patch.dict(browsers, _mock_browsers_config(), clear=True) 60 | @patch("bookworm_genai.commands.sync.glob") 61 | @patch("bookworm_genai.commands.sync.shutil") 62 | @patch("bookworm_genai.commands.sync.os.makedirs") 63 | @patch("bookworm_genai.commands.sync.store_documents") 64 | @patch("bookworm_genai.commands.sync.sys") 65 | def test_sync_linux(mock_sys: Mock, mock_store_documents: Mock, mock_makedirs: Mock, mock_shutil: Mock, mock_glob: Mock): 66 | platform = "linux" 67 | 68 | mock_sys.platform = platform 69 | user = getuser() 70 | mock_glob.glob.return_value = 
["/mocked/firefox.sqlite"] 71 | 72 | browsers = _mock_browsers_config(mocked_documents=[Mock("DOC1", metadata={}, page_content=""), Mock("DOC2", metadata={}, page_content="")]) 73 | sync(browsers) 74 | 75 | collected_file_paths, collected_loader_calls = _collect_browser_calls(platform, browsers) 76 | 77 | assert collected_file_paths == [ 78 | f"/home/{user}/.config/BraveSoftware/Brave-Browser/Default/Bookmarks", 79 | f"/home/{user}/.config/google-chrome/Default/Bookmarks", 80 | "mocked_database_connection", 81 | ] 82 | 83 | assert collected_loader_calls == [ 84 | call( 85 | file_path=ANY, 86 | jq_schema='\n [.roots.bookmark_bar.children, .roots.other.children] |\n flatten |\n .. |\n objects |\n select(.type == "url")\n', 87 | text_content=False, 88 | ), 89 | call( 90 | file_path=ANY, 91 | jq_schema='\n [.roots.bookmark_bar.children, .roots.other.children] |\n flatten |\n .. |\n objects |\n select(.type == "url")\n', 92 | text_content=False, 93 | ), 94 | call(db=ANY, query=sql_loader_firefox_sql_query(), source_columns=["source"], page_content_mapper=ANY), 95 | ] 96 | 97 | assert mock_store_documents.call_count == 1, "store_documents should be called once" 98 | 99 | args, _ = mock_store_documents.call_args_list[0] 100 | assert len(args) == 1, "store_documents should be called with one argument" 101 | 102 | stored_documents = args[0] 103 | assert len(stored_documents) == 6, "store_documents should be called with 6 documents. 
2 per browser" 104 | 105 | assert mock_makedirs.call_args_list == [call("/tmp/bookworm", exist_ok=True)] 106 | assert mock_shutil.copy.call_args_list == [call(mock_glob.glob.return_value[0], "/tmp/bookworm/firefox.sqlite")] 107 | 108 | 109 | @pytest.mark.skipif(sys.platform != "darwin", reason="this test is only for macos") 110 | @patch.dict(browsers, _mock_browsers_config(), clear=True) 111 | @patch("bookworm_genai.commands.sync.glob") 112 | @patch("bookworm_genai.commands.sync.shutil") 113 | @patch("bookworm_genai.commands.sync.os.makedirs") 114 | @patch("bookworm_genai.commands.sync.store_documents") 115 | @patch("bookworm_genai.commands.sync.sys") 116 | def test_sync_macos(mock_sys: Mock, mock_store_documents: Mock, mock_makedirs: Mock, mock_shutil: Mock, mock_glob: Mock): 117 | platform = "darwin" 118 | 119 | mock_sys.platform = platform 120 | user = getuser() 121 | 122 | browsers = _mock_browsers_config(platform, mocked_documents=[Mock("DOC1", metadata={}, page_content=""), Mock("DOC2", metadata={}, page_content="")]) 123 | sync(browsers) 124 | 125 | collected_file_paths, collected_loader_calls = _collect_browser_calls(platform, browsers) 126 | 127 | assert collected_file_paths == [ 128 | # brave 129 | f"/Users/{user}/Library/Application Support/BraveSoftware/Brave-Browser/Default/Bookmarks", 130 | # chrome 131 | f"/Users/{user}/Library/Application Support/Google/Chrome/Default/Bookmarks", 132 | # firefox 133 | "mocked_database_connection", 134 | ] 135 | assert collected_loader_calls == [ 136 | # brave 137 | call( 138 | file_path=f"/Users/{user}/Library/Application Support/BraveSoftware/Brave-Browser/Default/Bookmarks", 139 | jq_schema='\n [.roots.bookmark_bar.children, .roots.other.children] |\n flatten |\n .. 
|\n objects |\n select(.type == "url")\n', 140 | text_content=False, 141 | ), 142 | # chrome 143 | call( 144 | file_path=f"/Users/{user}/Library/Application Support/Google/Chrome/Default/Bookmarks", 145 | jq_schema='\n [.roots.bookmark_bar.children, .roots.other.children] |\n flatten |\n .. |\n objects |\n select(.type == "url")\n', 146 | text_content=False, 147 | ), 148 | # firefox 149 | call(db=ANY, query=sql_loader_firefox_sql_query(), source_columns=["source"], page_content_mapper=ANY), 150 | ] 151 | 152 | 153 | @patch("bookworm_genai.commands.sync.store_documents") 154 | @patch.dict(browsers, _mock_browsers_config(), clear=True) 155 | @patch("bookworm_genai.commands.sync.sys") 156 | def test_sync_platform_unsupported(mock_sys: Mock, mock_store_documents: Mock, caplog): 157 | platform = "unsupported" 158 | 159 | mock_sys.platform = platform 160 | 161 | browsers = _mock_browsers_config() 162 | sync(browsers) 163 | 164 | assert not mock_store_documents.called 165 | 166 | logs = [log.message for log in caplog.records if log.levelname == "WARNING"] 167 | logs.sort() 168 | assert logs == [ 169 | "🔄 browser brave is not supported on unsupported yet", 170 | "🔄 browser chrome is not supported on unsupported yet", 171 | "🔄 browser firefox is not supported on unsupported yet", 172 | ] 173 | 174 | 175 | @patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True) 176 | @patch.dict(browsers, _mock_browsers_config(), clear=True) 177 | @patch("builtins.input") 178 | @patch("bookworm_genai.commands.sync.tiktoken") 179 | @patch("bookworm_genai.commands.sync.glob") 180 | @patch("bookworm_genai.commands.sync.shutil") 181 | @patch("bookworm_genai.commands.sync.os.makedirs") 182 | @patch("bookworm_genai.commands.sync.store_documents") 183 | @patch("bookworm_genai.commands.sync.sys") 184 | def test_sync_estimate_cost( 185 | mock_sys: Mock, 186 | mock_store_documents: Mock, 187 | mock_makedirs: Mock, 188 | mock_shutil: Mock, 189 | mock_glob: Mock, 190 | mock_tiktoken: Mock, 191 
| mocked_input: Mock, 192 | caplog, 193 | ): 194 | platform = "linux" 195 | mock_sys.platform = platform 196 | 197 | mock_encoding = Mock() 198 | mock_encoding.encode.return_value = "mocked_page_content" * 100 # The multiplier just simulates a larger document 199 | mock_tiktoken.encoding_for_model.return_value = mock_encoding 200 | 201 | # At the time of writing ada v2 is priced at $0.100 per 1M tokens 202 | # so this is what we are using for this unit test 203 | # https://openai.com/api/pricing/ 204 | mocked_input.return_value = "0.100" 205 | 206 | mocked_documents = [ 207 | Mock(page_content="mocked_page_content", metadata={}), 208 | ] 209 | 210 | browsers = _mock_browsers_config(mocked_documents=mocked_documents) 211 | cost = sync(browsers, estimate_cost=True) 212 | 213 | assert not mock_store_documents.called 214 | assert mock_encoding.encode.call_args_list == [ 215 | call("mocked_page_content"), 216 | call("mocked_page_content"), 217 | call("mocked_page_content"), 218 | ] 219 | 220 | assert cost == 0.0005700000000000001 221 | 222 | 223 | @patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True) 224 | @patch("builtins.input") 225 | @patch("bookworm_genai.commands.sync.tiktoken") 226 | def test_sync_estimate_cost_non_interactive(mock_tiktoken: Mock, mock_input: Mock): 227 | mocked_documents = [ 228 | Mock(page_content="mocked_page_content"), 229 | ] 230 | 231 | mock_encoding = Mock() 232 | mock_encoding.encode.return_value = "mocked_page_content" * 100 # The multiplier just simulates a larger document 233 | mock_tiktoken.encoding_for_model.return_value = mock_encoding 234 | 235 | cost = _estimate_cost(mocked_documents, cost_per_million=0.100) 236 | 237 | assert cost == 0.00019 238 | assert not mock_input.called 239 | 240 | 241 | @patch("bookworm_genai.commands.sync.glob") 242 | @patch("bookworm_genai.commands.sync.shutil") 243 | @patch("bookworm_genai.commands.sync.os.makedirs") 244 | @patch("bookworm_genai.commands.sync.store_documents") 245 | 
@patch("bookworm_genai.commands.sync.sys") 246 | def test_sync_browser_filter(mock_sys: Mock, mock_store_documents: Mock, mock_makedirs: Mock, mock_shutil: Mock, mock_glob: Mock): 247 | browser_filter = [Browser.CHROME.value] 248 | 249 | platform = "darwin" 250 | mock_sys.platform = platform 251 | 252 | browsers = _mock_browsers_config(mocked_documents=[Mock("DOC1", metadata={}, page_content=""), Mock("DOC2", metadata={}, page_content="")]) 253 | sync(browsers, browser_filter=browser_filter) 254 | 255 | assert browsers[Browser.CHROME][platform]["bookmark_loader"].called 256 | assert not browsers[Browser.FIREFOX][platform]["bookmark_loader"].called 257 | 258 | 259 | @patch("bookworm_genai.commands.sync.store_documents") 260 | @patch("bookworm_genai.commands.sync.os") 261 | @patch("bookworm_genai.commands.sync.shutil") 262 | @patch("bookworm_genai.commands.sync.glob") 263 | def test_sync_copy_source_missing(mock_glob: Mock, mock_shutil: Mock, mock_os: Mock, mock_store_documents: Mock): 264 | path_to_missing_file = "/path/to/missing/file" 265 | 266 | mock_docs_loader = Mock() 267 | mock_docs_loader.return_value.lazy_load.return_value = [Mock("DOC1", metadata={}, page_content=""), Mock("DOC2", metadata={}, page_content="")] 268 | 269 | browsers = { 270 | # this one will fail and be skipped due to missing file 271 | # ensure that even if this one fails, the next one will still be processed 272 | Browser.FIREFOX: { 273 | sys.platform: { 274 | "bookmark_loader": Mock(), 275 | "bookmark_loader_kwargs": {}, 276 | "copy": { 277 | "from": path_to_missing_file, 278 | "to": "/path/to/destination", 279 | }, 280 | } 281 | }, 282 | # this one will be processed 283 | Browser.CHROME: { 284 | sys.platform: { 285 | "bookmark_loader": mock_docs_loader, 286 | "bookmark_loader_kwargs": {}, 287 | } 288 | }, 289 | } 290 | 291 | mock_glob.glob.return_value = [] 292 | 293 | sync(browsers=browsers) 294 | 295 | mock_glob.glob.assert_called_once_with(path_to_missing_file) 296 | 297 | # ensures 
that even if the first browser fails, the second one still extracts docs and submits to storage 298 | assert mock_store_documents.call_count == 1 299 | assert len(mock_store_documents.call_args_list[0]) == 2 300 | 301 | 302 | @patch("bookworm_genai.commands.sync.store_documents") 303 | def test_sync_metadata_attached(store_document: Mock): 304 | document_mock = Mock("DOC1", metadata={}, page_content="") 305 | mock_browsers = _mock_browsers_config(sys.platform, [document_mock]) 306 | 307 | sync(mock_browsers, browser_filter=Browser.CHROME) 308 | 309 | assert document_mock.metadata == {Metadata.Browser.value: Browser.CHROME.value, Metadata.BookwormVersion.value: __version__} 310 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import Mock, patch 2 | 3 | import pytest 4 | from bookworm_genai.utils import sql_loader_firefox_copy_path, sql_loader_page_content_mapper 5 | 6 | 7 | def test_sql_loader_page_content_mapper(): 8 | row = {"id": 1, "title": "title", "url": "url", "dateAdded": "dateAdded", "lastModified": "lastModified", "source": "source"} 9 | 10 | result = sql_loader_page_content_mapper(row) 11 | assert result == '{"id": 1, "url": "url", "dateAdded": "dateAdded", "lastModified": "lastModified", "source": "source", "name": "title"}' 12 | 13 | 14 | @pytest.mark.parametrize( 15 | "platform,mocked_expanduser", 16 | [ 17 | pytest.param("linux", "/home/user/.mozilla/firefox/*.default-release/places.sqlite", id="linux"), 18 | pytest.param("darwin", "/Users/user/Library/Application Support/Firefox/Profiles/*.default-release/places.sqlite", id="darwin"), 19 | ], 20 | ) 21 | @patch("bookworm_genai.utils.os.path.expanduser") 22 | @patch("bookworm_genai.utils.sys") 23 | def test_sql_loader_firefox_copy_path_linux(mock_sys: Mock, mock_expanduser: Mock, platform: str, mocked_expanduser: str): 24 | 
sql_loader_firefox_copy_path.cache_clear() 25 | 26 | mock_sys.platform = platform 27 | mock_expanduser.return_value = mocked_expanduser 28 | 29 | assert sql_loader_firefox_copy_path() == mocked_expanduser 30 | 31 | 32 | @patch("bookworm_genai.utils.sys") 33 | def test_sql_loader_firefox_copy_path_unknown(mock_sys: Mock): 34 | sql_loader_firefox_copy_path.cache_clear() 35 | 36 | mock_sys.platform = "unknown" 37 | 38 | with pytest.raises(NotImplementedError, match="Platform unknown is not supported"): 39 | sql_loader_firefox_copy_path() 40 | --------------------------------------------------------------------------------