├── .github
└── workflows
│ ├── docker.yml
│ └── main.yml
├── .gitignore
├── .vscode
├── extensions.json
├── launch.json
└── settings.json
├── Dockerfile.linux
├── LICENSE
├── Makefile
├── README.md
├── bookworm_genai
├── __init__.py
├── __main__.py
├── commands
│ ├── __init__.py
│ ├── ask.py
│ ├── export.py
│ └── sync.py
├── integrations.py
├── metadata.py
├── models.py
├── storage.py
└── utils.py
├── poetry.lock
├── pyproject.toml
└── tests
├── test_ask.py
├── test_export.py
├── test_main.py
├── test_models.py
├── test_storage.py
├── test_sync.py
└── test_utils.py
/.github/workflows/docker.yml:
--------------------------------------------------------------------------------
1 | name: docker
2 | on:
3 | push:
4 | branches:
5 | - master
6 | - main
7 | pull_request:
8 | jobs:
9 | linux:
10 | runs-on: ubuntu-latest
11 | timeout-minutes: 10
12 | strategy:
13 | fail-fast: false
14 | steps:
15 | - uses: actions/checkout@v3
16 | - name: Run Docker
17 | run: |
18 | make docker_linux
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: main
2 | on:
3 | push:
4 | branches:
5 | - master
6 | - main
7 | pull_request:
8 | jobs:
9 | build:
10 | runs-on: ${{ matrix.os }}
11 | timeout-minutes: 10
12 | permissions:
13 | contents: read
14 | pull-requests: write
15 | strategy:
16 | fail-fast: false
17 | matrix:
18 | # https://devguide.python.org/versions/
19 | python-version: ['3.9', '3.10', '3.11', '3.12']
20 | poetry-version: ['1.4.2']
21 | os: [ubuntu-latest] # , windows-latest]
22 | steps:
23 | - uses: actions/checkout@v3
24 | - uses: actions/setup-python@v4
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - uses: abatilo/actions-poetry@v2
28 | with:
29 | poetry-version: ${{ matrix.poetry-version }}
30 | - name: Poetry Check
31 | run: poetry check
32 | - name: Install Dependencies
33 | run: |
34 | poetry build
35 | poetry install
36 | - name: Lint
37 | run: make lint
38 | - name: Test
39 | run: make test
40 | - name: Code Coverage
41 | if: matrix.python-version == '3.12' && matrix.poetry-version == '1.4.2' && matrix.os == 'ubuntu-latest' && github.ref != 'refs/heads/main'
42 | run: make coverage
43 | - name: Code Coverage Report
44 | if: matrix.python-version == '3.12' && matrix.poetry-version == '1.4.2' && matrix.os == 'ubuntu-latest' && github.ref != 'refs/heads/main'
45 | uses: romeovs/lcov-reporter-action@2a28ec3e25fb7eae9cb537e9141603486f810d1a # https://github.com/romeovs/lcov-reporter-action/issues/46
46 | with:
47 | lcov-file: coverage.lcov
48 | github-token: ${{ secrets.GITHUB_TOKEN }}
49 | delete-old-comments: true
50 | release:
51 | runs-on: ubuntu-latest
52 | timeout-minutes: 10
53 | needs: [build]
54 | permissions:
55 | contents: write
56 | pull-requests: write
57 | steps:
58 | - uses: actions/checkout@v3
59 | - uses: abatilo/actions-poetry@v2
60 | - name: Release
61 | id: release
62 | uses: go-semantic-release/action@v1
63 | if: github.ref == 'refs/heads/main'
64 | with:
65 | github-token: ${{ secrets.GITHUB_TOKEN }}
66 | allow-initial-development-versions: true
67 | # Prerelease
68 | - name: Bump Poetry (Prerelease)
69 | id: prerelease
70 | if: github.ref != 'refs/heads/main' && github.event_name == 'pull_request'
71 | run: |
72 | python -m pip install semantic-version
73 |
74 | # extract the current version
75 | VERSION=$(cat pyproject.toml | grep "^version =" | cut -d= -f2 | sed 's/"//g' | sed 's/^\s*\|\s*$//g')
76 |
77 | # bump the version
78 | # it's difficult to know if this should be patch, minor, major ahead of time as it would require running the actual release action
79 | VERSION=$(python -c "from semantic_version import Version; v = Version('${VERSION}'); v = v.next_patch(); print(v)")
80 |
81 | # add prerelease version identifier
82 | VERSION="${VERSION}b${{ github.run_number }}" # _${{ github.run_attempt }}"
83 |
84 | poetry version $VERSION
85 |
86 | # ensure the published package links back to the pull request
87 | PULL_REQUEST_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH")
88 | sed "s/repository =.*/repository = \"https:\/\/github.com\/kiran94\/bookworm\/pull\/$PULL_REQUEST_NUMBER\"/" pyproject.toml -i
89 |
90 | echo "VERSION=${VERSION}" >> $GITHUB_OUTPUT
91 | - name: Deploy (Prerelease)
92 | if: github.ref != 'refs/heads/main' && github.event_name == 'pull_request'
93 | run: |
94 | poetry build
95 | poetry publish -u __token__ -p $POETRY_PYPI_TOKEN_PYPI
96 | env:
97 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }}
98 | - name: Comment (Prerelease)
99 | if: github.ref != 'refs/heads/main' && github.event_name == 'pull_request'
100 | uses: thollander/actions-comment-pull-request@v2
101 | with:
102 | message: |
103 | Change was prereleased to pypi. Try it out :rocket:
104 |
105 | ```bash
106 | python -m pip install bookworm_genai==${{steps.prerelease.outputs.VERSION}}
107 | ```
108 |
109 | https://pypi.org/project/bookworm_genai/${{steps.prerelease.outputs.VERSION}}/
110 | comment_tag: execution
111 | # Main Release
112 | - name: Bump Poetry
113 | if: github.ref == 'refs/heads/main' && steps.release.outputs.version != ''
114 | run: poetry version ${{ steps.release.outputs.version }}
115 | - name: Deploy
116 | if: github.ref == 'refs/heads/main' && steps.release.outputs.version != ''
117 | run: |
118 | poetry build
119 | poetry publish -u __token__ -p $POETRY_PYPI_TOKEN_PYPI
120 | env:
121 | POETRY_PYPI_TOKEN_PYPI: ${{ secrets.POETRY_PYPI_TOKEN_PYPI }}
122 | - uses: EndBug/add-and-commit@v9
123 | if: github.ref == 'refs/heads/main' && steps.release.outputs.version != ''
124 | with:
125 | message: "chore(version): bump"
126 | default_author: github_actions
127 | add: pyproject.toml
128 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.toptal.com/developers/gitignore/api/python
2 | # Edit at https://www.toptal.com/developers/gitignore?templates=python
3 |
4 | ### Python ###
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | cover/
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | .pybuilder/
80 | target/
81 |
82 | # Jupyter Notebook
83 | .ipynb_checkpoints
84 |
85 | # IPython
86 | profile_default/
87 | ipython_config.py
88 |
89 | # pyenv
90 | # For a library or package, you might want to ignore these files since the code is
91 | # intended to run in multiple environments; otherwise, check them in:
92 | # .python-version
93 |
94 | # pipenv
95 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
96 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
97 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
98 | # install all needed dependencies.
99 | #Pipfile.lock
100 |
101 | # poetry
102 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
103 | # This is especially recommended for binary packages to ensure reproducibility, and is more
104 | # commonly ignored for libraries.
105 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
106 | #poetry.lock
107 |
108 | # pdm
109 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
110 | #pdm.lock
111 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
112 | # in version control.
113 | # https://pdm.fming.dev/#use-with-ide
114 | .pdm.toml
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
166 | ### Python Patch ###
167 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
168 | poetry.toml
169 |
170 | # ruff
171 | .ruff_cache/
172 |
173 | # LSP config files
174 | pyrightconfig.json
175 |
176 | # End of https://www.toptal.com/developers/gitignore/api/python
177 |
178 | .duckdb
179 | .json
180 | .sqlite
181 | *.csv
182 |
183 |
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 | "recommendations": [
3 | "ms-python.python",
4 | "ms-python.vscode-pylance",
5 | "ms-python.debugpy",
6 | "ms-toolsai.jupyter",
7 | "yzhang.markdown-all-in-one",
8 | "charliermarsh.ruff",
9 | "hbenl.vscode-test-explorer",
10 | ]
11 | }
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": "0.2.0",
3 | "configurations": [
4 | {
5 | "name": "Bookworm: Sync",
6 | "type": "debugpy",
7 | "request": "launch",
8 | "module": "bookworm_genai",
9 | "envFile": "${workspaceFolder}/.env",
10 | "env": {
11 | "LOGGING_LEVEL": "DEBUG"
12 | },
13 | "args": [
14 | "sync"
15 | ]
16 | },
17 | {
18 | "name": "Bookworm: Sync (Browser Filter)",
19 | "type": "debugpy",
20 | "request": "launch",
21 | "module": "bookworm_genai",
22 | "envFile": "${workspaceFolder}/.env",
23 | "env": {
24 | "LOGGING_LEVEL": "DEBUG"
25 | },
26 | "args": [
27 | "sync",
28 | "--browser-filter", "${input:browser}"
29 | ]
30 | },
31 | {
32 | "name": "Bookworm: Ask",
33 | "type": "debugpy",
34 | "request": "launch",
35 | "module": "bookworm_genai",
36 | "envFile": "${workspaceFolder}/.env",
37 | "env": {
38 | "LOGGING_LEVEL": "DEBUG"
39 | },
40 | "args": [
41 | "ask"
42 | ]
43 | }
44 | ],
45 | "inputs": [
46 | {
47 | "id": "browser",
48 | "type": "pickString",
49 | "description": "Pick the browser you want to filter on:",
50 | "options": [
51 | "chrome",
52 | "firefox",
53 | "brave"
54 | ],
55 | "default": "chrome"
56 | }
57 | ]
58 | }
59 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.testing.pytestArgs": [
3 | "tests"
4 | ],
5 | "python.testing.unittestEnabled": false,
6 | "python.testing.pytestEnabled": true,
7 | "notebook.formatOnSave.enabled": true,
8 | "[python]": {
9 | "editor.formatOnSave": true,
10 | "editor.defaultFormatter": "charliermarsh.ruff",
11 | }
12 | }
--------------------------------------------------------------------------------
/Dockerfile.linux:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim
2 |
3 | RUN python3 -m pip install poetry==1.4.2
4 |
5 | RUN useradd -m -s /bin/bash my_user \
6 | && echo "my_user:password" | chpasswd \
7 | && usermod -aG sudo my_user
8 |
9 | COPY . /home/my_user/app
10 | WORKDIR /home/my_user/app
11 | RUN chown -R my_user:my_user /home/my_user/app
12 |
13 | USER my_user
14 |
15 | RUN poetry install
16 | RUN poetry run pytest -v
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2024 Kiran Patel
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | OS := $(shell uname)
2 |
3 | test:
4 | poetry run pytest -vv
5 |
6 | lint:
7 | poetry run ruff check $(if $(GITHUB_ACTIONS),--output-format github,) .
8 |
9 | format:
10 | poetry run ruff format
11 |
12 | coverage:
13 | poetry run pytest -q --cov=bookworm_genai --cov-report=term # for local
14 | poetry run pytest -q --cov=bookworm_genai --cov-report=html # for local
15 |
16 | # for sonarqube
17 | $(if $(GITHUB_ACTIONS),poetry run pytest -q --cov=bookworm_genai --cov-report=xml,)
18 |
19 | # for github action
20 | $(if $(GITHUB_ACTIONS),poetry run pytest -q --cov=bookworm_genai --cov-report=lcov,)
21 |
22 | check_database:
23 | ifeq ($(OS),Darwin)
24 | duckdb "/Users/kiran/Library/Application Support/bookworm/bookmarks.duckdb" -c 'SELECT * FROM embeddings LIMIT 5; SELECT COUNT(*) FROM embeddings'
25 | else ifeq ($(OS),Linux)
26 | duckdb ~/.local/share/bookworm/bookmarks.duckdb -c 'SELECT * FROM embeddings LIMIT 5; SELECT COUNT(*) FROM embeddings'
27 | else
28 | @echo "OS not supported"
29 | endif
30 |
31 | # Useful if you are running on non-linux machine
32 | # and want to verify tests are still working on that platform
33 | docker_linux:
34 | docker build -f Dockerfile.linux -t bookworm_linux .
35 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # bookworm 📖
2 |
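3 | [![main](https://github.com/kiran94/bookworm/actions/workflows/main.yml/badge.svg)](https://github.com/kiran94/bookworm/actions/workflows/main.yml) [![PyPI version](https://badge.fury.io/py/bookworm_genai.svg)](https://badge.fury.io/py/bookworm_genai)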
4 |
5 | > LLM-powered bookmark search engine
6 |
7 | `bookworm` allows you to search from your local browser bookmarks using natural language. For times when you have a large collection of bookmarks and you can't quite remember where you put that one website you need at the moment.
8 |
9 | [![asciicast](https://asciinema.org/a/696722.svg)](https://asciinema.org/a/696722)
10 |
11 | *In the example above, we search for the term “Japan.” While some results don’t explicitly mention the word, terms like “Osaka” appear because they are closely related to the search term based on OpenAI embeddings.*
12 |
13 | ## Install
14 |
15 | ```bash
16 | python -m pip install bookworm_genai
17 | ```
18 |
19 | > [!TIP]
20 | > If you are using [`uvx`](https://docs.astral.sh/uv/guides/tools/) then you can also just run this:
21 | > ```bash
22 | > uvx --from bookworm_genai bookworm --help
23 | > ```
24 |
25 | ## Usage
26 |
27 | ```bash
28 | export OPENAI_API_KEY=
29 |
30 | # Run once, and then again whenever bookmarks in any supported browser change
31 | bookworm sync
32 |
33 | # Sync bookmarks only from a specific browser
34 | bookworm sync --browser-filter chrome
35 |
36 | # Ask questions against the bookmark database
37 | bookworm ask
38 |
39 | # Ask questions against the bookmark database
40 | # Specify the query when invoking the command
41 | # If you omit this then you will be asked for a query when the tool is running
42 | bookworm ask -q pandas
43 |
44 | # Ask questions against the bookmark database and specify the number of results that should come back
45 | bookworm ask -n 1
46 | ```
47 |
48 | The `sync` process currently supports the following configurations:
49 |
50 | | Operating System | Google Chrome | Mozilla Firefox | Brave | Microsoft Edge |
51 | | ------------------ | --------------- | ----------------- | ------- | ---------------- |
52 | | **Linux** | ✅ | ✅ | ✅ | ❌ |
53 | | **macOS** | ✅ | ✅ | ✅ | ❌ |
54 | | **Windows** | ❌ | ❌ | ❌ | ❌ |
55 |
56 | > [!TIP]
57 | > ✨ Want to contribute? See the [adding an integration](#adding-an-integration) section.
58 |
59 | ## Processes
60 |
61 | *`bookworm sync`*
62 |
63 | Vectorize your bookmarks across all supported browsers.
64 |
65 | ```mermaid
66 | graph LR
67 |
68 | subgraph Bookmarks
69 | Chrome(Chrome Bookmarks)
70 | Brave(Brave Bookmarks)
71 | Firefox(Firefox Bookmarks)
72 | end
73 |
74 | Bookworm(bookworm sync)
75 |
76 | EmbeddingsService(Embeddings Service e.g OpenAIEmbeddings)
77 |
78 | VectorStore(Vector Store e.g DuckDB)
79 |
80 | Chrome -->|load bookmarks|Bookworm
81 | Brave -->|load bookmarks|Bookworm
82 | Firefox -->|load bookmarks|Bookworm
83 |
84 | Bookworm -->|vectorize bookmarks|EmbeddingsService-->|store embeddings|VectorStore
85 | ```
86 |
87 |
88 | Details
89 |
90 | The vector database depicted above is stored locally on your machine. You can check its location by running the following after installing this project:
91 |
92 | ```python
93 | from platformdirs import PlatformDirs
94 |
95 | print(PlatformDirs('bookworm').user_data_dir)
96 | ```
97 |
98 |
99 |
100 | ---
101 |
102 | *`bookworm ask`*
103 |
104 | Search from your bookmarks
105 |
106 | ```mermaid
107 | graph LR
108 |
109 | query
110 | Bookworm(bookworm ask)
111 |
112 | subgraph _
113 | LLM(LLM e.g OpenAI)
114 | VectorStore(Vector Store e.g DuckDB)
115 | end
116 |
117 | query -->|user queries for information|Bookworm
118 |
119 | Bookworm -->|similarity search|VectorStore -->|send similar docs + user query|LLM
120 | LLM -->|send back response|Bookworm
121 | ```
122 |
123 | ---
124 |
125 | *`bookworm export`*
126 |
127 | Export your bookmarks across all supported browsers into an output (e.g CSV)
128 |
129 | ```mermaid
130 | graph LR
131 |
132 | VectorStore
133 | Bookworm(bookworm export)
134 | CSV(bookmarks.csv)
135 |
136 | VectorStore -->|extract all bookmarks|Bookworm
137 | Bookworm -->|export into file|CSV
138 | ```
139 |
140 | ## Developer Setup
141 |
142 | ```bash
143 | # LLMs
144 | export OPENAI_API_KEY=
145 |
146 | # Langchain (optional, but useful for debugging)
147 | export LANGCHAIN_API_KEY=
148 | export LANGCHAIN_TRACING_V2=true
149 | export LANGCHAIN_PROJECT=bookworm
150 |
151 | # Misc (optional)
152 | export LOGGING_LEVEL=INFO
153 | ```
154 |
155 | Recommendations:
156 |
157 | - Install [`pyenv`](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation) and ensure [build dependencies are installed](https://github.com/pyenv/pyenv?tab=readme-ov-file#install-python-build-dependencies) for your OS.
158 | - Install [Poetry](https://python-poetry.org/docs/); we will be using its [environment management](https://python-poetry.org/docs/managing-environments/) below.
159 | - VS Code Extensions recommendations can be found [here](./.vscode/extensions.json) and will be suggested upon first opening the project.
160 |
161 |
162 | ```bash
163 | poetry env use 3.9 # or path to your 3.9 installation
164 |
165 | poetry shell
166 | poetry install
167 |
168 | bookworm --help
169 | ```
170 |
171 |
172 | Running Linux tests on MacOS/Windows
173 |
174 | If you are running on a non-linux machine, it may be helpful to run the provided [Dockerfile](./Dockerfile.linux) to verify it's working on that environment.
175 |
176 | You can build this via:
177 |
178 | ```bash
179 | make docker_linux
180 | ```
181 |
182 | You will need to have Docker installed to run this.
183 |
184 |
185 |
186 | ## Adding an Integration
187 |
188 | As you can see from [usage](#usage), bookworm supports various integrations, but not all. If you find one that you want to support, then a change is needed inside [integrations.py](./bookworm_genai/integrations.py).
189 |
190 | You can see in that file there is a variable called `browsers` that follows this structure:
191 |
192 | ```python
193 | browsers = {
194 | "BROWSER": {
195 | "PLATFORM": {
196 | ...
197 | }
198 | }
199 | }
200 | ```
201 |
202 | So say you wanted to add Chrome support in Windows then you would go under the Chrome key and then add a `win32` key which has all the details. You can refer to existing examples but generally the contents of those details are *where* to find the bookmarks on the user's system along with how to *interpret* them.
203 |
204 | You can also find a full list of the document loaders supported [here](https://python.langchain.com/docs/integrations/document_loaders/).
--------------------------------------------------------------------------------
/bookworm_genai/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import importlib.metadata
4 |
5 | from rich.logging import RichHandler
6 |
# Logging level requested through the LOGGING_LEVEL environment variable.
# Level names are accepted in any case ("debug", "INFO") and numeric levels may
# be given as digits ("10"). An unset or blank variable falls back to INFO.
# An unrecognised name is passed through unchanged so that logging.basicConfig
# still rejects it with ValueError instead of being silently ignored.
_requested_level = os.environ.get("LOGGING_LEVEL", "").strip().upper()
if not _requested_level:
    LOGGING_LEVEL = logging.INFO
elif _requested_level.isdigit():
    LOGGING_LEVEL = int(_requested_level)
else:
    LOGGING_LEVEL = _requested_level

# Format string handed to logging.basicConfig; the default prints the message only.
LOGGING_FORMAT = os.environ.get("LOGGING_FORMAT", "%(message)s")

__version__ = importlib.metadata.version(__name__)

# True only when DEBUG was requested, whether by name or by numeric value.
_is_debug = LOGGING_LEVEL in ("DEBUG", logging.DEBUG)

# The extra RichHandler columns (path, time, level) are shown only in debug mode.
logging.basicConfig(
    level=LOGGING_LEVEL,
    format=LOGGING_FORMAT,
    handlers=[RichHandler(markup=True, show_path=_is_debug, show_time=_is_debug, show_level=_is_debug)],
)

# Raise the threshold of the "httpx" logger so only warnings and above are emitted.
logging.getLogger("httpx").setLevel(logging.WARNING)
20 |
--------------------------------------------------------------------------------
/bookworm_genai/__main__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import logging
3 | import argparse
4 |
5 | from bookworm_genai import __version__
6 | from bookworm_genai.integrations import browsers, Browser
7 | from bookworm_genai.commands.sync import sync
8 | from bookworm_genai.commands.ask import BookmarkChain
9 | from bookworm_genai.commands.export import export
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
def main():
    """Parse the command line and run the selected bookworm command.

    Three sub-commands are available:

    * ``sync``   - calls ``sync`` with the known browsers, the ``--estimate-cost``
      flag and the optional ``--browser-filter``.
    * ``ask``    - takes the query from ``-q/--query`` or prompts for it, asks the
      ``BookmarkChain`` for matching bookmarks, lists them and opens the one the
      user selects by number.
    * ``export`` - calls ``export`` and writes the returned frame to ``--output``
      as CSV.

    Returns:
        None. The ``ask`` command returns early when the bookmark chain reports
        that it is not valid or when no bookmarks match the query.
    """
    arg_parser = argparse.ArgumentParser(description="LLM-powered bookmark search engine")
    arg_parser.add_argument("--version", action="version", version=__version__)

    sub_parsers = arg_parser.add_subparsers(dest="command", help="Available commands", required=True)

    sync_parser = sub_parsers.add_parser("sync", help="Sync the bookmark database with the latest changes")
    sync_parser.add_argument("--estimate-cost", action="store_true", default=False, help="Estimate the cost of syncing the bookmark database")
    sync_parser.add_argument("--browser-filter", default=[], help="Only sync a subset of browsers", choices=Browser.list())

    ask_parser = sub_parsers.add_parser("ask", help="Search for a bookmark")
    ask_parser.add_argument("-n", "--top-n", type=int, default=3, help="Number of bookmarks to return")
    ask_parser.add_argument("-q", "--query", help="The Search Query")

    export_parser = sub_parsers.add_parser("export", help="Export bookmarks")
    export_parser.add_argument("--format", choices=["csv"], default="csv")
    export_parser.add_argument("--output", default="bookmarks.csv")

    args = arg_parser.parse_args(sys.argv[1:])

    logger.info("[bold green]Starting Bookworm 📖")
    logger.debug("Running on platform '%s' with version '%s'", sys.platform, __version__)

    logger.debug("Arguments: %s", args)

    if args.command == "sync":
        # argparse stores a single string for --browser-filter, but sync() tests
        # membership with `in`. Wrapping the string in a list makes that an exact
        # name match instead of a substring match.
        browser_filter = args.browser_filter
        if isinstance(browser_filter, str):
            browser_filter = [browser_filter]

        sync(browsers, estimate_cost=args.estimate_cost, browser_filter=browser_filter)

    elif args.command == "ask":
        if not args.query:
            logger.info("What would you like to search for?")
            query = input("> ")
        else:
            query = args.query

        logger.debug("query: %s", query)

        with BookmarkChain(vector_store_search_n=args.top_n) as bookmark_chain:
            if not bookmark_chain.is_valid():
                logger.debug("bookmark chain is not valid, exiting early.")
                return

            logger.info("Searching for bookmarks...")
            bookmarks = bookmark_chain.ask(query)

            if not bookmarks.bookmarks:
                logger.info("""
                    No bookmarks found for the query 🙁. Please ensure you have performed a "bookworm sync" to update the database
                    and the query is relevant to the bookmarks stored.
                """)
                return

            for index, bookmark in enumerate(bookmarks.bookmarks):
                if logger.isEnabledFor(logging.DEBUG):
                    # also shows the source of the bookmark
                    logger.info(
                        f"[green][{index}] [/] {bookmark.title} - [link={bookmark.url}]{bookmark.url}[/link] ([green]{bookmark.source}[/])"
                    )  # pragma: no cover
                else:
                    logger.info(f"[green][{index}] [/] {bookmark.title} - [link={bookmark.url}]{bookmark.url}[/link] ([green]{bookmark.browser}[/])")

            logger.info("Press a number to open the bookmark:")
            # Keep prompting until the user enters one of the listed indices.
            while True:
                try:
                    raw_input = input("> ")
                    selected_index = int(raw_input)

                    if selected_index < 0:
                        # A negative number would otherwise index from the end of
                        # the list and open a bookmark the user did not pick.
                        raise IndexError(selected_index)

                    bookmarks.bookmarks[selected_index].open()

                    break
                except ValueError:
                    logger.warning(f"Invalid input: '{raw_input}'. Please enter a number.")
                except IndexError:
                    logger.warning(f"Invalid index: '{selected_index}'. Please select a valid index.")

    elif args.command == "export":
        bookmarks = export()

        logger.info(f"[blue]Exporting bookmarks to '{args.output}' [/]")
        bookmarks.to_csv(args.output, index=False)
94 |
95 | if __name__ == "__main__":
96 | main() # pragma: no cover
97 |
--------------------------------------------------------------------------------
/bookworm_genai/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kiran94/bookworm/4a23d5472f12177560e9caf18a3e0a987881e82e/bookworm_genai/commands/__init__.py
--------------------------------------------------------------------------------
/bookworm_genai/commands/ask.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 |
4 | import duckdb
5 | from langchain_community.vectorstores import DuckDB as DuckDBVectorStore
6 | from langchain_openai import ChatOpenAI
7 | from langchain_core.prompts import ChatPromptTemplate
8 | from langchain_core.runnables import RunnablePassthrough
9 | from langchain_core.language_models.chat_models import BaseChatModel
10 |
11 | from bookworm_genai.models import Bookmarks
12 | from bookworm_genai.storage import _get_local_store, _get_embedding_store
13 |
14 | logger = logging.getLogger(__name__)
15 |
16 |
17 | _system_message = """
18 | You have knowledge about all the browser bookmarks stored by an individual.
19 | When a user asks a question, you should be able to search the bookmarks and return the most relevant bookmark title and URL.
20 | It could be multiple bookmarks.
21 | If you don't have anything in the context then return empty list
22 |
23 | The bookmarks available are from the context:
24 | {context}
25 | """
26 |
27 |
class BookmarkChain:
    """Retrieval chain that answers bookmark queries from the local DuckDB vector store.

    The instance owns a DuckDB connection, so use it as a context manager;
    leaving the ``with`` block closes the connection.
    """

    def __init__(self, vector_store_search_n: int = 3):
        """Open the vector database and assemble the retrieval chain.

        Args:
            vector_store_search_n: Number of documents the retriever is asked
                for (passed as ``k`` in the retriever's ``search_kwargs``).

        Raises:
            Exception: Whatever the embedding, LLM or chain setup raises is
                propagated after the DuckDB connection has been closed.
        """
        full_database_path = _get_local_store()
        logger.debug("Connecting to vector database at: %s", full_database_path)
        self._duckdb_connection = duckdb.connect(full_database_path, read_only=False)

        try:
            self.vector_store = DuckDBVectorStore(connection=self._duckdb_connection, embedding=_get_embedding_store())

            llm = _get_llm()
            llm = llm.with_structured_output(Bookmarks)

            prompt = ChatPromptTemplate.from_messages([("system", _system_message), ("human", "{query}")])

            search_kwargs = {"k": vector_store_search_n}

            # The retriever fills {context}; the raw query string is passed through as {query}.
            self.chain = {"context": self.vector_store.as_retriever(search_kwargs=search_kwargs), "query": RunnablePassthrough()} | prompt | llm
        except Exception:
            # __exit__ is never invoked when __init__ raises, so the connection
            # must be released here or it would stay open.
            self._duckdb_connection.close()
            raise

    def ask(self, query: str) -> Bookmarks:
        """Run ``query`` through the chain and return its structured result."""
        logger.debug("Searching for bookmarks with query: %s", query)

        return self.chain.invoke(query)

    def is_valid(self) -> bool:
        """Report whether the ``embeddings`` table contains at least one row.

        Returns:
            ``True`` when the row count is non-zero. ``False`` (after logging a
            warning) when the count is zero or the database response does not
            have the expected ``[[count]]`` shape.
        """
        res = self._duckdb_connection.execute("SELECT COUNT(*) FROM embeddings").fetchall()

        try:
            res = res[0][0]
        except (IndexError, TypeError) as e:
            logger.warning("validation check failed due to unexpected response from the database.")
            logger.debug("Error: %s", e)
            logger.debug("Raw DuckDB Response: %s", res)

            return False

        if res == 0:
            logger.warning("No bookmarks were found in database. Please ensure you run 'bookworm sync' before asking questions")
            return False

        return True

    def __enter__(self):
        """Return the chain itself for use inside the ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the DuckDB connection; exceptions from the block are not suppressed."""
        logger.debug("Closing DuckDB connection")

        self._duckdb_connection.close()
74 |
75 |
def _get_llm() -> BaseChatModel:
    """Create the chat model used to answer bookmark queries.

    Returns:
        A ``ChatOpenAI`` instance configured with ``temperature=0.0``.

    Raises:
        ValueError: If the ``OPENAI_API_KEY`` environment variable is unset or empty.
    """
    if not os.environ.get("OPENAI_API_KEY"):
        raise ValueError(
            "LLM service could not be configured. Ensure you have OPENAI_API_KEY. If you are using OpenAI then please ensure you have the OPENAI_API_KEY environment variable set."
        )

    # https://api.python.langchain.com/en/latest/chat_models/langchain_openai.chat_models.base.ChatOpenAI.html
    return ChatOpenAI(temperature=0.0)
89 |
--------------------------------------------------------------------------------
/bookworm_genai/commands/export.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 |
4 | import pandas as pd
5 | import duckdb
6 |
7 | from bookworm_genai.storage import _get_local_store
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
def export() -> pd.DataFrame:
    """Load every stored bookmark from the local vector store into a DataFrame.

    The ``metadata`` and ``text`` columns of the ``embeddings`` table hold JSON
    documents. The ``browser`` and ``source`` fields are taken from ``metadata``
    and the ``name`` and ``url`` fields from ``text``.

    Returns:
        A DataFrame with the ``id``, ``metadata``, ``text`` and ``embedding``
        columns removed and ``name``, ``url``, ``browser`` and ``source``
        columns appended, in that order.
    """
    store = _get_local_store()

    logger.debug("reading from vector store %s", store)
    with duckdb.connect(store, read_only=True) as duck:
        df = duck.execute("select * from embeddings").df()

    logger.debug("extracting useful information from structured columns")
    # Decode each JSON column once and reuse the parsed objects for both fields.
    metadata = df["metadata"].apply(json.loads)
    text = df["text"].apply(json.loads)

    browser_col = metadata.apply(lambda x: x["browser"]).rename(index="browser")
    source_col = metadata.apply(lambda x: x["source"]).rename(index="source")
    name_col = text.apply(lambda x: x["name"]).rename(index="name")
    url_col = text.apply(lambda x: x["url"]).rename(index="url")

    logger.debug("dropping unnecessary columns")
    cleaned_df = df.drop(columns=["id", "metadata", "text", "embedding"])

    return pd.concat([cleaned_df, name_col, url_col, browser_col, source_col], axis=1)
30 |
--------------------------------------------------------------------------------
/bookworm_genai/commands/sync.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import glob
4 | import logging
5 | import shutil
6 | from typing import Optional, Union
7 |
8 | import tiktoken
9 | from langchain_core.documents import Document
10 |
11 | from bookworm_genai.integrations import Browser, browsers, BrowserManifest
12 | from bookworm_genai.storage import store_documents, _get_embedding_store
13 | from bookworm_genai.metadata import attach_metadata
14 |
15 |
16 | logger = logging.getLogger(__name__)
17 |
18 |
def sync(browsers: BrowserManifest = browsers, estimate_cost: bool = False, browser_filter: Optional[list[str]] = None) -> Union[None, float]:
    """
    Load bookmarks from every configured browser and store them, or estimate the cost of doing so.

    Args:
        browsers: manifest mapping each browser to its per-platform loader configuration.
        estimate_cost: when true, return the estimated embedding cost instead of storing anything.
        browser_filter: browser values to include; ``None`` or an empty list means all browsers.

    Returns:
        The estimated cost when ``estimate_cost`` is true, otherwise ``None``.
    """
    docs: list[Document] = []

    for browser, config in browsers.items():
        if browser_filter and (browser.value not in browser_filter):
            logger.debug(f"browser {browser.value} skipped due to filter")
            continue

        try:
            platform_config = config[sys.platform]
        except KeyError:
            logger.warning(f"🔄 browser {browser.value} is not supported on {sys.platform} yet")
            continue

        if "copy" in platform_config:
            try:
                _copy(platform_config["copy"])
            except BrowserBookmarkFileNotFound as e:
                logger.warning(f"🔄 browser {browser.value} skipped due to missing file '{e.file}'")
                continue

        _log_bookmark_source(browser, platform_config)

        # Resolve the lazy "db" factory on a shallow copy so the shared manifest entry
        # keeps its factory and is not overwritten with a live database object.
        loader_kwargs = dict(platform_config["bookmark_loader_kwargs"])
        if callable(loader_kwargs.get("db")):
            loader_kwargs["db"] = loader_kwargs["db"](None)

        loader = platform_config["bookmark_loader"](**loader_kwargs)

        current_docs: list[Document] = list(loader.lazy_load())
        for doc in current_docs:
            logger.debug(doc.page_content)
            docs.append(attach_metadata(doc, browser))

    logger.debug(f"{len(docs)} Bookmarks loaded")

    if estimate_cost:
        return _estimate_cost(docs)

    if docs:
        store_documents(docs)

    return None
66 |
67 |
def _copy(config: dict):
    """
    Copy the first file matching the glob ``config["from"]`` to the path ``config["to"]``.

    Args:
        config: mapping with a ``"from"`` glob expression and a ``"to"`` destination path.

    Raises:
        BrowserBookmarkFileNotFound: if the glob expression matches no file.
    """
    logger.debug(f"Copying {config['from']} to {config['to']}")

    matches = glob.glob(config["from"])

    try:
        source = matches[0]
    except IndexError as e:
        logger.debug(f"source {config['from']} not found")
        raise BrowserBookmarkFileNotFound(config["from"]) from e

    directory = os.path.dirname(config["to"])
    # A bare file name has no directory component, and os.makedirs("") raises FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)

    shutil.copy(source, config["to"])
83 |
84 |
def _log_bookmark_source(browser: Browser, platform_config: dict):
    """
    Log which browser is being handled and, at debug level, where its bookmarks come from.

    The location is the loader's ``file_path`` when present. If a ``db`` entry is present,
    it takes precedence and the location is that database's engine URL.

    Args:
        browser: the browser whose bookmarks are being loaded.
        platform_config: the platform entry from the browser manifest.
    """
    logger.info(f"✅ browser {browser.value} bookmarks loaded!")

    path = ""

    try:
        path = platform_config["bookmark_loader_kwargs"]["file_path"]
    except KeyError:
        pass

    # Only the lookup is guarded, so a KeyError raised inside the factory itself is not silently swallowed.
    try:
        db = platform_config["bookmark_loader_kwargs"]["db"]
    except KeyError:
        pass
    else:
        if callable(db):
            # The factory ignores its argument; pass None, as sync() does, rather than the factory itself.
            db = db(None)

        path = db._engine.url

    logger.debug("Loading bookmarks from %s", path)
106 |
107 |
def _estimate_cost(docs: list[Document], cost_per_million: Optional[float] = None) -> float:
    """
    Estimate the price of embedding the given documents.

    Args:
        docs: documents whose ``page_content`` would be embedded.
        cost_per_million: price per million tokens; when ``None`` the user is prompted for it.

    Returns:
        The token count across all documents multiplied by the per-token price.

    Raises:
        ValueError: if the prompted price cannot be converted to a float.
    """
    embedding = _get_embedding_store()

    # NOTE: using _get_embedding_store here means that it's more likely that the model we are using
    # in the actual embedding is the one we use for cost estimation
    # however note that .model here is not part of the contract for Embeddings
    # so this is a bit of a hack
    # if we add more embeddings options in the future, we need to re-evaluate this.
    encoding = tiktoken.encoding_for_model(embedding.model)

    logger.info(f"Estimating cost for {embedding.model}")

    tokens: int = sum(len(encoding.encode(doc.page_content)) for doc in docs)

    # Compare against None so that an explicit price of 0 is honoured instead of triggering the prompt.
    if cost_per_million is None:
        # https://openai.com/api/pricing/
        price = float(input(f"what is the current cost for {embedding.model} per million? (non-batch) "))
    else:
        price = cost_per_million

    # price is often advertised per million; so find the price per token
    price_per_token = price / 1_000_000

    # given the total number of tokens we have, apply the price per token
    cost = tokens * price_per_token

    logger.info(f"Estimated cost: ${cost} (tokens: {tokens}) ")

    return cost
139 |
140 |
class BrowserBookmarkFileNotFound(Exception):
    """
    Signals that a bookmark file could not be located on the local file system.
    A typical case is a configured glob expression such as /my/path/*.sqlite that matches nothing.
    """

    def __init__(self, file: str):
        message = f"Could not resolve file: {file}"
        super().__init__(message)
        # Kept on the instance so handlers can report which path or pattern failed.
        self.file = file
150 |
--------------------------------------------------------------------------------
/bookworm_genai/integrations.py:
--------------------------------------------------------------------------------
1 | import os
2 | from enum import Enum
3 | from typing import Any
4 |
5 | from langchain_community.document_loaders import JSONLoader
6 | from langchain_community.document_loaders.sql_database import SQLDatabaseLoader
7 | from langchain_community.utilities.sql_database import SQLDatabase
8 |
9 | from bookworm_genai.utils import CHROMIUM_JQ_COMMAND, sql_loader_page_content_mapper, sql_loader_firefox_copy_path, sql_loader_firefox_sql_query
10 |
11 |
class Browser(str, Enum):
    """Browsers known to the manifest; each member is also a plain string."""

    BRAVE = "brave"
    CHROME = "chrome"
    FIREFOX = "firefox"

    @classmethod
    def list(cls):
        """Return the string value of every member, in declaration order."""
        return [member.value for member in cls]
20 |
21 |
BrowserManifest = dict[Browser, dict[str, dict[str, Any]]]


def _chromium_platform(bookmarks_path: str) -> dict[str, Any]:
    """Build the platform entry for a browser whose bookmarks are read from a JSON file with JSONLoader."""
    return {
        "bookmark_loader": JSONLoader,
        "bookmark_loader_kwargs": {
            "file_path": os.path.expanduser(bookmarks_path),
            "jq_schema": CHROMIUM_JQ_COMMAND,
            "text_content": False,
        },
    }


def _firefox_platform() -> dict[str, Any]:
    """Build the platform entry for Firefox, whose bookmarks are read from a copied SQLite database."""
    return {
        "bookmark_loader": SQLDatabaseLoader,
        "bookmark_loader_kwargs": {
            # Kept as a factory so the database is only opened when the entry is actually used.
            "db": lambda _: SQLDatabase.from_uri("sqlite:////tmp/bookworm/firefox.sqlite"),
            "query": sql_loader_firefox_sql_query(),
            "source_columns": ["source"],
            "page_content_mapper": lambda row: sql_loader_page_content_mapper(row),
        },
        "copy": {
            "from": sql_loader_firefox_copy_path(),
            "to": "/tmp/bookworm/firefox.sqlite",
        },
    }


# Configuration for the supported browsers.
#
# Each browser maps platform names to a loader class ("bookmark_loader"), the keyword arguments
# used to construct it ("bookmark_loader_kwargs") and, optionally, a "copy" step describing a file
# to copy ("from" -> "to") before loading. File paths are passed through os.path.expanduser so a
# leading "~" resolves to the user's home directory.
#
# The platform configuration is keyed off the values from https://docs.python.org/3/library/sys.html#sys.platform
#
browsers: BrowserManifest = {
    Browser.BRAVE: {
        "linux": _chromium_platform("~/.config/BraveSoftware/Brave-Browser/Default/Bookmarks"),
        "darwin": _chromium_platform("~/Library/Application Support/BraveSoftware/Brave-Browser/Default/Bookmarks"),
        # "win32": {},
    },
    Browser.CHROME: {
        "linux": _chromium_platform("~/.config/google-chrome/Default/Bookmarks"),
        "darwin": _chromium_platform("~/Library/Application Support/Google/Chrome/Default/Bookmarks"),
        # "win32": {},
    },
    Browser.FIREFOX: {
        "linux": _firefox_platform(),
        "darwin": _firefox_platform(),
        # "win32": {},
    },
}
99 |
--------------------------------------------------------------------------------
/bookworm_genai/metadata.py:
--------------------------------------------------------------------------------
1 | import enum
2 | from langchain_core.documents import Document
3 |
4 | from bookworm_genai import __version__
5 | from bookworm_genai.integrations import Browser
6 |
7 |
class Metadata(str, enum.Enum):
    """Keys written into a document's metadata dictionary."""

    # Value of the browser the bookmark was loaded from.
    Browser = "browser"
    # Version of bookworm that produced the document.
    BookwormVersion = "bookworm_version"
11 |
12 |
def attach_metadata(doc: Document, browser: Browser) -> Document:
    """
    Record the originating browser and the bookworm version in the document's metadata.

    The document is modified in place and the same object is returned.
    """
    doc.metadata.update(
        {
            Metadata.Browser.value: browser.value,
            Metadata.BookwormVersion.value: __version__,
        }
    )
    return doc
17 |
--------------------------------------------------------------------------------
/bookworm_genai/models.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import subprocess
3 | import logging
4 |
5 | from langchain_core.pydantic_v1 import BaseModel, Field
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
class Bookmark(BaseModel):
    """
    A bookmark to a website
    """

    # NOTE(review): the class docstring and the field descriptions presumably end up in the schema
    # handed to the LLM for structured output, so they are runtime text — confirm before rewording.
    title: str = Field(description="The title of the bookmark")
    url: str = Field(description="The URL of the bookmark")
    source: str = Field(description="The source of the bookmark")
    browser: str = Field(description="The browser that the bookmark was saved from")

    def open(self):
        """Launch the platform's URL opener for this bookmark, or log the URL on other platforms."""
        platform = sys.platform

        if platform == "win32":
            subprocess.Popen(["start", self.url], shell=True)
            return

        opener = {"darwin": "open", "linux": "xdg-open"}.get(platform)
        if opener is not None:
            subprocess.Popen([opener, self.url])
            return

        logger.warning(f'Platform "{platform}" not supported. Printing URL instead')
        logger.info(self.url)
30 |
31 |
class Bookmarks(BaseModel):
    """
    A list of bookmarks
    """

    # NOTE(review): docstring and description are presumably part of the structured-output schema;
    # treat them as runtime text — confirm before rewording.
    bookmarks: list[Bookmark] = Field(description="A list of bookmarks")
38 |
--------------------------------------------------------------------------------
/bookworm_genai/storage.py:
--------------------------------------------------------------------------------
1 | import os
2 | import duckdb
3 | import logging
4 |
5 | from platformdirs import PlatformDirs
6 | from langchain_community.vectorstores import DuckDB as DuckDBVectorStore
7 | from langchain_community.vectorstores.duckdb import DEFAULT_TABLE_NAME
8 | from langchain_core.documents import Document
9 | from langchain_core.embeddings.embeddings import Embeddings
10 | from langchain_openai.embeddings import OpenAIEmbeddings
11 |
12 | logger = logging.getLogger(__name__)
13 |
14 |
def store_documents(docs: list[Document]):
    """
    Embed the given documents and write them to the local DuckDB database.

    Any existing embeddings table is dropped first, so the stored content is fully
    replaced by ``docs``.
    """
    database_path = _get_local_store()
    embedder = _get_embedding_store()
    doc_count = len(docs)

    logger.info(f"vectorizing and storing {doc_count} documents locally")
    logger.debug(f"storing into {database_path}")

    with duckdb.connect(database_path) as conn:
        logger.debug(f"dropping existing embeddings table '{DEFAULT_TABLE_NAME}' if exists")
        conn.execute(f"DROP TABLE IF EXISTS {DEFAULT_TABLE_NAME}")

        logger.debug(f"loading {doc_count} documents")
        DuckDBVectorStore.from_documents(docs, embedder, connection=conn)
29 |
30 |
def _get_local_store() -> str:
    """
    Return the path of the ``bookmarks.duckdb`` file inside the user data directory.

    The directory is created when it does not exist yet.
    """
    data_dir = PlatformDirs("bookworm", "bookworm").user_data_dir

    logger.debug(f"creating folder {data_dir}")
    os.makedirs(data_dir, exist_ok=True)

    return os.path.join(data_dir, "bookmarks.duckdb")
40 |
41 |
def _get_embedding_store() -> Embeddings:
    """
    Create the embeddings client.

    Returns:
        An ``OpenAIEmbeddings`` instance.

    Raises:
        ValueError: if the ``OPENAI_API_KEY`` environment variable is unset or empty.
    """
    if not os.environ.get("OPENAI_API_KEY"):
        raise ValueError("Embeddings service could not be configured. Ensure you have OPENAI_API_KEY set.")

    logger.debug("Using OpenAI Embeddings")
    # https://api.python.langchain.com/en/latest/embeddings/langchain_openai.embeddings.base.OpenAIEmbeddings.html
    return OpenAIEmbeddings()
50 |
--------------------------------------------------------------------------------
/bookworm_genai/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import sys
3 | import os
4 | from sqlalchemy import RowMapping
5 | from functools import cache
6 |
7 |
8 | ## CHROMIUM
9 |
10 | CHROMIUM_JQ_COMMAND = """
11 | [.roots.bookmark_bar.children, .roots.other.children] |
12 | flatten |
13 | .. |
14 | objects |
15 | select(.type == "url")
16 | """
17 |
18 | ## SQL LOADER
19 |
20 |
def sql_loader_page_content_mapper(row: RowMapping) -> str:
    """
    Serialise a SQL Loader row into the JSON page content stored in the vector database.

    The ``title`` key is renamed to ``name`` so the output uses the same key as the
    page content produced by the JSONLoader-based browsers.
    """
    content = dict(row)
    content["name"] = content.pop("title")
    return json.dumps(content)
33 |
34 |
@cache
def sql_loader_firefox_copy_path() -> str:
    """
    Return the glob pattern locating the Firefox ``places.sqlite`` database on this platform.

    Raises:
        NotImplementedError: if ``sys.platform`` is neither ``linux`` nor ``darwin``.
    """
    patterns = {
        "linux": "~/.mozilla/firefox/*.default-release/places.sqlite",
        "darwin": "~/Library/Application Support/Firefox/Profiles/*.default-release/places.sqlite",
    }

    pattern = patterns.get(sys.platform)
    if pattern is None:
        raise NotImplementedError(f"Platform {sys.platform} is not supported")

    return os.path.expanduser(pattern)
46 |
47 |
@cache
def sql_loader_firefox_sql_query() -> str:
    """
    Generates the SQL query for the SQL Loader to extract the bookmarks from the Firefox database.
    This query also embeds a literal column called 'source' which is the path to the database file. This is needed in the query so
    that when the SQL Loader runs we can tell it to put this source into the metadata.
    """
    # The path is embedded as a single-quoted SQL string literal, so any single quote in it
    # (e.g. a home directory such as /home/o'brien) must be doubled to keep the query valid.
    source_literal = sql_loader_firefox_copy_path().replace("'", "''")

    return f"""
    SELECT
        CAST(moz_places.id AS TEXT) AS id,
        moz_bookmarks.title,
        moz_places.url,
        CAST(moz_bookmarks.dateAdded AS TEXT) AS dateAdded,
        CAST(moz_bookmarks.lastModified AS TEXT) AS lastModified,
        '{source_literal}' as source
    FROM
        moz_bookmarks
    LEFT JOIN
        moz_places
    ON
        moz_bookmarks.fk = moz_places.id
    WHERE
        moz_bookmarks.type = 1
    AND
        moz_bookmarks.title IS NOT NULL
    """
74 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "bookworm_genai"
3 | version = "0.13.0"
4 | description = "Bookworm - A LLM-powered bookmark search engine"
5 | authors = ["kiran94"]
6 | readme = "README.md"
7 | license = "MIT"
8 | homepage = "https://pypi.org/project/bookworm_genai/"
9 | repository = "https://github.com/kiran94/bookworm"
10 | documentation = "https://github.com/kiran94/bookworm/blob/main/README.md"
11 | keywords = [ "bookmarks", "bookmark-manager", "genai", "chatbots" ]
12 | classifiers = [
13 | "Intended Audience :: Developers",
14 | "Operating System :: OS Independent",
15 | "Topic :: Utilities",
16 | ]
17 |
18 | [tool.poetry.dependencies]
19 | python = "^3.9"
20 | langchain = "^0.2.12"
21 | langchain-community = "^0.2.11"
22 | langchain-openai = "^0.1.20"
23 | jq = "^1.7.0"
24 | duckdb = "^1.0.0"
25 | rich = "^13.7.1"
26 | platformdirs = "^4.2.2"
27 | pandas = "^2.2.2"
28 | tiktoken = "^0.7.0"
29 |
30 | [tool.poetry.group.dev.dependencies]
31 | pytest = "^8.3.2"
32 | ruff = "^0.5.6"
33 | pytest-cov = "^5.0.0"
34 | pytest-github-actions-annotate-failures = "^0.2.0"
35 | litecli = "^1.11.0"
36 |
37 | [build-system]
38 | requires = ["poetry-core"]
39 | build-backend = "poetry.core.masonry.api"
40 |
41 | [tool.poetry.scripts]
42 | bookworm = 'bookworm_genai.__main__:main'
43 |
44 | [tool.ruff]
45 | line-length = 160
46 |
--------------------------------------------------------------------------------
/tests/test_ask.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest.mock import patch, Mock
3 |
4 | import pytest
5 |
6 | from bookworm_genai.commands.ask import BookmarkChain, _system_message, _get_llm
7 | from bookworm_genai.models import Bookmarks
8 |
9 |
@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch("bookworm_genai.commands.ask.ChatPromptTemplate")
@patch("bookworm_genai.commands.ask.ChatOpenAI")
@patch("bookworm_genai.commands.ask.DuckDBVectorStore")
@patch("bookworm_genai.commands.ask.duckdb")
@patch("bookworm_genai.commands.ask._get_embedding_store")
@patch("bookworm_genai.commands.ask._get_local_store")
def test_bookmark_chain_ask(
    mock_local_store: Mock,
    mock_embedding_store: Mock,
    mock_duckdb: Mock,
    mock_duckdb_vector: Mock,
    mock_chatopenai: Mock,
    mock_chat_prompt_template: Mock,
):
    """BookmarkChain wires the store, retriever, LLM and prompt together and forwards the query to the chain."""
    mock_local_store.return_value = "/test/bookmark.duckdb"

    connection = Mock()
    mock_duckdb.connect.return_value = connection

    embedding = Mock()
    mock_embedding_store.return_value = embedding

    llm = Mock()
    mock_chatopenai.return_value = llm

    chain = Mock(name="chain")
    prompt = mock_chat_prompt_template.from_messages.return_value
    prompt.__ror__.return_value.__or__.return_value = chain

    with BookmarkChain() as bc:
        # If this check fails then most likely the chain constructed in the BookmarkChain has changed;
        # review how the mocked chain is assembled above.
        assert chain == bc.chain

        bc.ask("test")

    mock_duckdb.connect.assert_called_once_with("/test/bookmark.duckdb", read_only=False)
    mock_duckdb_vector.assert_called_once_with(connection=connection, embedding=embedding)
    mock_duckdb_vector.return_value.as_retriever.assert_called_once_with(search_kwargs={"k": 3})
    assert connection.close.called

    mock_chatopenai.assert_called_once_with(temperature=0.0)
    llm.with_structured_output.assert_called_once_with(Bookmarks)
    mock_chat_prompt_template.from_messages.assert_called_once_with([("system", _system_message), ("human", "{query}")])

    chain.invoke.assert_called_once_with("test")
56 |
57 |
@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch("bookworm_genai.commands.ask.ChatPromptTemplate")
@patch("bookworm_genai.commands.ask.ChatOpenAI")
@patch("bookworm_genai.commands.ask.DuckDBVectorStore")
@patch("bookworm_genai.commands.ask.duckdb")
@patch("bookworm_genai.commands.ask._get_embedding_store")
@patch("bookworm_genai.commands.ask._get_local_store")
def test_bookmark_chain_ask_n_parameter(
    mock_local_store: Mock,
    mock_embedding_store: Mock,
    mock_duckdb: Mock,
    mock_duckdb_vector: Mock,
    mock_chatopenai: Mock,
    mock_chat_prompt_template: Mock,
):
    """``vector_store_search_n`` is passed to the retriever as its ``k`` search argument."""
    requested_results = 15

    with BookmarkChain(vector_store_search_n=requested_results):
        pass

    retriever_factory = mock_duckdb_vector.return_value.as_retriever
    retriever_factory.assert_called_once_with(search_kwargs={"k": requested_results})
78 |
79 |
@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch("bookworm_genai.commands.ask.ChatPromptTemplate")
@patch("bookworm_genai.commands.ask.ChatOpenAI")
@patch("bookworm_genai.commands.ask.DuckDBVectorStore")
@patch("bookworm_genai.commands.ask.duckdb")
@patch("bookworm_genai.commands.ask._get_embedding_store")
@patch("bookworm_genai.commands.ask._get_local_store")
def test_bookmark_chain_is_valid(
    mock_local_store: Mock,
    mock_embedding_store: Mock,
    mock_duckdb: Mock,
    mock_duckdb_vector: Mock,
    mock_chatopenai: Mock,
    mock_chat_prompt_template: Mock,
):
    """``is_valid`` is true when the embeddings count query reports at least one row."""
    connection = Mock()
    connection.execute.return_value.fetchall.return_value = [(1,)]
    mock_duckdb.connect.return_value = connection

    with BookmarkChain() as bc:
        assert bc.is_valid()

    connection.execute.assert_called_once_with("SELECT COUNT(*) FROM embeddings")
104 |
105 |
@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch("bookworm_genai.commands.ask.ChatPromptTemplate")
@patch("bookworm_genai.commands.ask.ChatOpenAI")
@patch("bookworm_genai.commands.ask.DuckDBVectorStore")
@patch("bookworm_genai.commands.ask.duckdb")
@patch("bookworm_genai.commands.ask._get_embedding_store")
@patch("bookworm_genai.commands.ask._get_local_store")
def test_bookmark_chain_is_valid_zero_count(
    mock_local_store: Mock,
    mock_embedding_store: Mock,
    mock_duckdb: Mock,
    mock_duckdb_vector: Mock,
    mock_chatopenai: Mock,
    mock_chat_prompt_template: Mock,
):
    """``is_valid`` is false when the embeddings count query reports zero rows."""
    connection = Mock()
    connection.execute.return_value.fetchall.return_value = [(0,)]
    mock_duckdb.connect.return_value = connection

    with BookmarkChain() as bc:
        assert not bc.is_valid()
128 |
129 |
@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch("bookworm_genai.commands.ask.ChatPromptTemplate")
@patch("bookworm_genai.commands.ask.ChatOpenAI")
@patch("bookworm_genai.commands.ask.DuckDBVectorStore")
@patch("bookworm_genai.commands.ask.duckdb")
@patch("bookworm_genai.commands.ask._get_embedding_store")
@patch("bookworm_genai.commands.ask._get_local_store")
@pytest.mark.parametrize(
    "duckdb_response",
    [[], None],
)
def test_bookmark_chain_is_valid_invalid_response(
    mock_local_store: Mock,
    mock_embedding_store: Mock,
    mock_duckdb: Mock,
    mock_duckdb_vector: Mock,
    mock_chatopenai: Mock,
    mock_chat_prompt_template: Mock,
    duckdb_response,
):
    """``is_valid`` is false when the count query returns an empty list or ``None``."""
    connection = Mock()
    connection.execute.return_value.fetchall.return_value = duckdb_response
    mock_duckdb.connect.return_value = connection

    with BookmarkChain() as bc:
        assert not bc.is_valid()
157 |
158 |
@patch.dict(os.environ, {}, clear=True)
def test_get_llm_no_env():
    """``_get_llm`` raises ``ValueError`` when the environment holds no API key."""
    expected_message = "LLM service could not be configured"

    with pytest.raises(ValueError, match=expected_message):
        _get_llm()
163 |
--------------------------------------------------------------------------------
/tests/test_export.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import Mock, patch, call
2 |
3 | import pandas as pd
4 |
5 | from bookworm_genai.commands.export import export
6 |
7 |
@patch("bookworm_genai.commands.export._get_local_store")
@patch("bookworm_genai.commands.export.duckdb")
def test_export(mock_duckdb: Mock, mock_get_local_store: Mock):
    """``export`` opens the store read-only and flattens the JSON columns into name/url/browser/source."""
    stored_rows = pd.DataFrame(
        data={
            "id": ["id"],
            "text": ['{"name": "my_bookmark", "url": "https://bookmark.com"}'],
            "embedding": [1],
            "metadata": ['{"source": "my_source", "browser": "chrome"}'],
        }
    )

    connection = mock_duckdb.connect.return_value.__enter__.return_value
    connection.execute.return_value.df.return_value = stored_rows

    result = export()

    assert mock_duckdb.connect.call_args_list == [call(mock_get_local_store.return_value, read_only=True)]

    expected_columns = {"name": "my_bookmark", "url": "https://bookmark.com", "browser": "chrome", "source": "my_source"}
    expected_df = pd.DataFrame(data=expected_columns, index=[0])
    pd.testing.assert_frame_equal(expected_df, result)
28 |
--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import Mock, patch, call
2 |
3 | import pytest
4 |
5 | from bookworm_genai.__main__ import main
6 |
7 |
@patch("bookworm_genai.__main__.sys")
def test_main_no_arguments(mock_sys: Mock):
    """Running without a sub-command exits with a ``SystemExit`` matching ``2``."""
    mock_sys.argv = ["script"]

    with pytest.raises(SystemExit, match="2"):
        main()
13 |
14 |
@patch("bookworm_genai.__main__.browsers")
@patch("bookworm_genai.__main__.sync")
@patch("bookworm_genai.__main__.sys")
def test_main_sync(mock_sys: Mock, mock_sync: Mock, mock_browsers: Mock):
    """The ``sync`` sub-command calls ``sync`` once with the manifest, no cost estimate and no filter."""
    mock_sys.argv = ["script", "sync"]

    main()

    expected_calls = [call(mock_browsers, estimate_cost=False, browser_filter=[])]
    assert mock_sync.call_args_list == expected_calls
24 |
25 |
@patch("builtins.input")
@patch("bookworm_genai.__main__.BookmarkChain")
@patch("bookworm_genai.__main__.sys")
def test_main_ask(mock_sys: Mock, mock_bookmark_chain: Mock, mock_input: Mock):
    """The ``ask`` sub-command opens the bookmark whose index the user enters."""
    mock_sys.argv = ["script", "ask"]
    mock_input.side_effect = ["pandas column", "0"]

    bookmarks = [
        Mock(title="first", url="http://google.com", source="/file/hello.txt"),
        Mock(title="second", url="http://google.com", source="/file/hello.txt"),
    ]

    bc = Mock()
    bc.is_valid.return_value = True
    bc.ask.return_value = Mock(bookmarks=bookmarks)
    mock_bookmark_chain.return_value.__enter__.return_value = bc

    main()

    # The second mocked input selects index 0, so the first bookmark must have been opened.
    assert bookmarks[0].open.called
48 |
49 |
@patch("builtins.input")
@patch("bookworm_genai.__main__.BookmarkChain")
@patch("bookworm_genai.__main__.sys")
def test_main_ask_query(mock_sys: Mock, mock_bookmark_chain: Mock, mock_input: Mock):
    """A query supplied with ``-q`` is forwarded to ``ask`` as given."""
    query = "dummy search query"

    mock_sys.argv = ["script", "ask", "-q", query]
    mock_input.side_effect = ["0"]

    bookmarks = [
        Mock(title="first", url="http://google.com", source="/file/hello.txt"),
        Mock(title="second", url="http://google.com", source="/file/hello.txt"),
    ]

    bc = Mock()
    bc.is_valid.return_value = True
    bc.ask.return_value = Mock(bookmarks=bookmarks)
    mock_bookmark_chain.return_value.__enter__.return_value = bc

    main()

    assert bc.ask.call_args_list == [call(query)]
73 |
74 |
@patch("builtins.input")
@patch("bookworm_genai.__main__.BookmarkChain")
@patch("bookworm_genai.__main__.sys")
def test_main_ask_not_valid(mock_sys: Mock, mock_bookmark_chain: Mock, mock_input: Mock):
    """When the chain reports it is not valid, no question is asked."""
    mock_sys.argv = ["script", "ask"]
    mock_input.side_effect = ["pandas column", "0"]

    bc = Mock()
    bc.is_valid.return_value = False
    mock_bookmark_chain.return_value.__enter__.return_value = bc

    main()

    assert not bc.ask.called
90 |
91 |
@patch("builtins.input")
@patch("bookworm_genai.__main__.BookmarkChain")
@patch("bookworm_genai.__main__.sys")
def test_main_ask_no_results(mock_sys: Mock, mock_bookmark_chain: Mock, mock_input: Mock, caplog):
    """The ``ask`` sub-command completes without raising when the chain returns no bookmarks."""
    mock_sys.argv = ["script", "ask"]
    mock_input.side_effect = ["pandas column", "0"]

    bc = Mock()
    bc.is_valid.return_value = True
    bc.ask.return_value = Mock(bookmarks=[])
    mock_bookmark_chain.return_value.__enter__.return_value = bc

    main()
106 |
107 |
@patch("builtins.input")
@patch("bookworm_genai.__main__.BookmarkChain")
@patch("bookworm_genai.__main__.sys")
def test_main_ask_invalid_input(mock_sys: Mock, mock_bookmark_chain: Mock, mock_input: Mock):
    """Invalid selections are followed by further input until a usable index is entered."""
    mock_sys.argv = ["script", "ask"]

    # Simulated session: the question itself, then a non-numeric selection,
    # then an out-of-range index, and finally a valid index that opens the bookmark.
    mock_input.side_effect = ["pandas column", "NOT_A_NUMBER", "999", "1"]

    bookmarks = [
        Mock(title="first", url="http://google.com", source="/file/hello.txt"),
        Mock(title="second", url="http://google.com", source="/file/hello.txt"),
    ]

    bc = Mock()
    bc.is_valid.return_value = True
    bc.ask.return_value = Mock(bookmarks=bookmarks)
    mock_bookmark_chain.return_value.__enter__.return_value = bc

    main()

    assert bookmarks[1].open.called
134 |
135 |
@pytest.mark.parametrize(
    "arguments, expected_call",
    [
        pytest.param([], [call("bookmarks.csv", index=False)], id="no_output_override"),
        pytest.param(["--output", "hello.csv"], [call("hello.csv", index=False)], id="output_override"),
    ],
)
@patch("bookworm_genai.__main__.export")
@patch("bookworm_genai.__main__.sys")
def test_main_export(mock_sys: Mock, mock_export: Mock, arguments: list[str], expected_call):
    """The ``export`` sub-command writes the exported frame to CSV, honouring ``--output`` when given."""
    mock_sys.argv = ["script", "export", *arguments]

    exported_frame = Mock()
    mock_export.return_value = exported_frame

    main()

    assert exported_frame.to_csv.call_args_list == expected_call
154 |
--------------------------------------------------------------------------------
/tests/test_models.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import call, patch, Mock
2 |
3 | import pytest
4 |
5 | from bookworm_genai.integrations import Browser
6 | from bookworm_genai.models import Bookmark
7 |
8 |
@pytest.mark.parametrize(
    "platform, expected_subprocess_call",
    [
        ("linux", call(["xdg-open", "https://www.google.com"])),
        ("win32", call(["start", "https://www.google.com"], shell=True)),
        ("darwin", call(["open", "https://www.google.com"])),
    ],
)
@patch("bookworm_genai.models.subprocess")
@patch("bookworm_genai.models.sys")
def test_bookmark_open(mock_platform: Mock, mock_subprocess: Mock, platform: str, expected_subprocess_call: call):
    """``Bookmark.open`` spawns the opener command that belongs to the current platform."""
    # mock_platform stands in for the ``sys`` module used by bookworm_genai.models.
    mock_platform.platform = platform

    google = Bookmark(title="Google", url="https://www.google.com", source="Google", browser=Browser.CHROME.value)
    google.open()

    assert mock_subprocess.Popen.call_args == expected_subprocess_call
26 |
27 |
@patch("bookworm_genai.models.logger")
@patch("bookworm_genai.models.subprocess")
@patch("bookworm_genai.models.sys")
def test_bookmark_unsupported_os(mock_platform: Mock, mock_subprocess: Mock, mock_logger: Mock):
    """On an unrecognised platform ``Bookmark.open`` logs a warning and then the URL."""
    # mock_platform stands in for the ``sys`` module used by bookworm_genai.models.
    mock_platform.platform = "chromeos"

    google = Bookmark(title="Google", url="https://www.google.com", source="Google", browser=Browser.CHROME.value)
    google.open()

    expected_warning = call('Platform "chromeos" not supported. Printing URL instead')
    assert mock_logger.warning.call_args == expected_warning
    assert mock_logger.info.call_args == call("https://www.google.com")
39 |
--------------------------------------------------------------------------------
/tests/test_storage.py:
--------------------------------------------------------------------------------
1 | import os
2 | from unittest.mock import patch, Mock, call
3 |
4 | import pytest
5 |
6 | from bookworm_genai.storage import store_documents
7 |
8 |
@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch("bookworm_genai.storage.OpenAIEmbeddings")
@patch("bookworm_genai.storage.DuckDBVectorStore")
@patch("bookworm_genai.storage.duckdb")
@patch("bookworm_genai.storage.PlatformDirs")
@patch("bookworm_genai.storage.os.makedirs")
def test_store_documents(
    mock_os_makedirs: Mock,
    mock_platform_dirs: Mock,
    mock_duckdb: Mock,
    mock_duckdb_vector: Mock,
    mock_openai_embeddings: Mock,
):
    """Check the calls ``store_documents`` makes when an OpenAI key is configured.

    Every collaborator of ``bookworm_genai.storage`` is patched, so the test
    only inspects recorded calls: the platform directory lookup, directory
    creation, the DuckDB connection path, the table reset statement and the
    hand-off of the documents to the vector store.
    """
    data_dir = "/test"
    documents = [Mock(), Mock()]
    mock_platform_dirs.return_value.user_data_dir = data_dir

    store_documents(documents)

    # Where the database file is expected to live.
    assert mock_platform_dirs.call_args_list == [call("bookworm", "bookworm")]
    assert mock_duckdb.connect.call_args_list == [call(f"{data_dir}/bookmarks.duckdb")]
    assert mock_os_makedirs.call_args_list == [call(data_dir, exist_ok=True)]

    # The connection is used as a context manager, so the object handed to
    # the vector store is whatever ``__enter__`` returned.
    connection = mock_duckdb.connect.return_value.__enter__.return_value
    expected_vector_store_call = call(documents, mock_openai_embeddings.return_value, connection=connection)

    assert connection.execute.call_args_list == [call("DROP TABLE IF EXISTS embeddings")]
    assert mock_duckdb_vector.from_documents.call_args_list == [expected_vector_store_call]
36 |
37 |
@patch.dict(os.environ, {}, clear=True)
@patch("bookworm_genai.storage.PlatformDirs")
@patch("bookworm_genai.storage.os.makedirs")
def test_no_proper_embedding_environment(
    mock_os_makedirs: Mock,
    mock_platform_dirs: Mock,
):
    """Check that ``store_documents`` raises ``ValueError`` when the environment is empty."""
    documents = [Mock(), Mock()]
    expected_message = "Embeddings service could not be configured"

    with pytest.raises(ValueError, match=expected_message):
        store_documents(documents)
49 |
--------------------------------------------------------------------------------
/tests/test_sync.py:
--------------------------------------------------------------------------------
1 | import os
2 | from getpass import getuser
3 | import sys
4 | from unittest.mock import patch, Mock, call, ANY
5 |
6 | import pytest
7 |
8 | from bookworm_genai import __version__
9 | from bookworm_genai.commands.sync import _estimate_cost, sync
10 | from bookworm_genai.integrations import Browser, browsers
11 | from bookworm_genai.metadata import Metadata
12 | from bookworm_genai.utils import sql_loader_firefox_sql_query
13 |
14 |
def _mock_browsers_config(platform: str = "linux", mocked_documents: list = None):
    """Build a copy of the ``browsers`` registry with loaders and databases mocked.

    For every browser, one ``Mock`` loader is installed on all of that
    browser's platform entries; calling it and then ``lazy_load()`` yields
    ``mocked_documents``. Any ``"db"`` entry in ``bookmark_loader_kwargs`` is
    replaced by a ``Mock`` factory whose doubly-called result exposes
    ``_engine.url == "mocked_database_connection"``.

    Args:
        platform: Accepted for backward compatibility only. Every platform
            entry is mocked regardless of this value (the original loop
            variable shadowed the parameter, so it never had an effect).
        mocked_documents: Documents returned by each mocked loader's
            ``lazy_load()``. Defaults to ``["DOC1", "DOC2"]``.

    Returns:
        A new mapping with the same keys as ``browsers`` whose nested
        dictionaries are independent of the shared registry.
    """
    # Local import keeps this helper self-contained without touching the
    # module's import block.
    import copy

    if mocked_documents is None:
        mocked_documents = ["DOC1", "DOC2"]

    # A shallow ``dict.copy()`` would share the nested per-platform dicts with
    # the module-level registry, so the assignments below would leak into it.
    new_browsers = copy.deepcopy(browsers)

    for config in new_browsers.values():
        mock_loader = Mock()
        mock_loader.return_value.lazy_load.return_value = mocked_documents

        for platform_config in config.values():
            platform_config["bookmark_loader"] = mock_loader

            loader_kwargs = platform_config.get("bookmark_loader_kwargs", {})
            if "db" in loader_kwargs:
                mock_sqlite = Mock()
                mock_sqlite.return_value.return_value._engine.url = "mocked_database_connection"

                loader_kwargs["db"] = mock_sqlite

    return new_browsers
35 |
36 |
37 | def _collect_browser_calls(platform: str, browsers: dict) -> tuple[list[str], list[call]]:
38 | collected_file_paths: list[str] = []
39 | collected_loader_calls: list[call] = []
40 |
41 | for browser, config in browsers.items():
42 | if platform not in config:
43 | continue
44 |
45 | if "file_path" in config[platform]["bookmark_loader_kwargs"]:
46 | collected_file_paths.append(config[platform]["bookmark_loader_kwargs"]["file_path"])
47 | elif "db" in config[platform]["bookmark_loader_kwargs"]:
48 | path = config[platform]["bookmark_loader_kwargs"]["db"]
49 | if callable(path):
50 | path = path(path)
51 | collected_file_paths.append(path._engine.url)
52 |
53 | collected_loader_calls.extend(config[platform]["bookmark_loader"].call_args_list)
54 |
55 | return collected_file_paths, collected_loader_calls
56 |
57 |
@pytest.mark.skipif(sys.platform != "linux", reason="this test is only for linux")
@patch.dict(browsers, _mock_browsers_config(), clear=True)
@patch("bookworm_genai.commands.sync.glob")
@patch("bookworm_genai.commands.sync.shutil")
@patch("bookworm_genai.commands.sync.os.makedirs")
@patch("bookworm_genai.commands.sync.store_documents")
@patch("bookworm_genai.commands.sync.sys")
def test_sync_linux(mock_sys: Mock, mock_store_documents: Mock, mock_makedirs: Mock, mock_shutil: Mock, mock_glob: Mock):
    """Run ``sync`` with the platform forced to linux and inspect what it touched.

    Verifies the bookmark sources and loader calls collected per browser, that
    all documents reach ``store_documents`` in a single call, and that the
    globbed Firefox database is copied under ``/tmp/bookworm``.
    """
    platform = "linux"
    mock_sys.platform = platform
    mock_glob.glob.return_value = ["/mocked/firefox.sqlite"]

    home_dir = f"/home/{getuser()}"
    chromium_jq_schema = '\n [.roots.bookmark_bar.children, .roots.other.children] |\n flatten |\n .. |\n objects |\n select(.type == "url")\n'

    documents = [Mock("DOC1", metadata={}, page_content=""), Mock("DOC2", metadata={}, page_content="")]
    mocked_browsers = _mock_browsers_config(mocked_documents=documents)
    sync(mocked_browsers)

    collected_file_paths, collected_loader_calls = _collect_browser_calls(platform, mocked_browsers)

    expected_file_paths = [
        f"{home_dir}/.config/BraveSoftware/Brave-Browser/Default/Bookmarks",
        f"{home_dir}/.config/google-chrome/Default/Bookmarks",
        "mocked_database_connection",
    ]
    assert collected_file_paths == expected_file_paths

    chromium_loader_call = call(file_path=ANY, jq_schema=chromium_jq_schema, text_content=False)
    firefox_loader_call = call(db=ANY, query=sql_loader_firefox_sql_query(), source_columns=["source"], page_content_mapper=ANY)
    assert collected_loader_calls == [chromium_loader_call, chromium_loader_call, firefox_loader_call]

    assert mock_store_documents.call_count == 1, "store_documents should be called once"

    positional_args, _ = mock_store_documents.call_args_list[0]
    assert len(positional_args) == 1, "store_documents should be called with one argument"

    stored_documents = positional_args[0]
    assert len(stored_documents) == 6, "store_documents should be called with 6 documents. 2 per browser"

    globbed_database = mock_glob.glob.return_value[0]
    assert mock_makedirs.call_args_list == [call("/tmp/bookworm", exist_ok=True)]
    assert mock_shutil.copy.call_args_list == [call(globbed_database, "/tmp/bookworm/firefox.sqlite")]
107 |
108 |
@pytest.mark.skipif(sys.platform != "darwin", reason="this test is only for macos")
@patch.dict(browsers, _mock_browsers_config(), clear=True)
@patch("bookworm_genai.commands.sync.glob")
@patch("bookworm_genai.commands.sync.shutil")
@patch("bookworm_genai.commands.sync.os.makedirs")
@patch("bookworm_genai.commands.sync.store_documents")
@patch("bookworm_genai.commands.sync.sys")
def test_sync_macos(mock_sys: Mock, mock_store_documents: Mock, mock_makedirs: Mock, mock_shutil: Mock, mock_glob: Mock):
    """Run ``sync`` with the platform forced to darwin and inspect the loader usage.

    Verifies, per browser, the bookmark source that was collected and the
    keyword arguments each mocked loader was constructed with.
    """
    platform = "darwin"
    mock_sys.platform = platform

    app_support = f"/Users/{getuser()}/Library/Application Support"
    brave_bookmarks = f"{app_support}/BraveSoftware/Brave-Browser/Default/Bookmarks"
    chrome_bookmarks = f"{app_support}/Google/Chrome/Default/Bookmarks"
    chromium_jq_schema = '\n [.roots.bookmark_bar.children, .roots.other.children] |\n flatten |\n .. |\n objects |\n select(.type == "url")\n'

    documents = [Mock("DOC1", metadata={}, page_content=""), Mock("DOC2", metadata={}, page_content="")]
    mocked_browsers = _mock_browsers_config(platform, mocked_documents=documents)
    sync(mocked_browsers)

    collected_file_paths, collected_loader_calls = _collect_browser_calls(platform, mocked_browsers)

    # Order follows the browser registry: brave, chrome, firefox.
    assert collected_file_paths == [brave_bookmarks, chrome_bookmarks, "mocked_database_connection"]

    expected_loader_calls = [
        call(file_path=brave_bookmarks, jq_schema=chromium_jq_schema, text_content=False),
        call(file_path=chrome_bookmarks, jq_schema=chromium_jq_schema, text_content=False),
        call(db=ANY, query=sql_loader_firefox_sql_query(), source_columns=["source"], page_content_mapper=ANY),
    ]
    assert collected_loader_calls == expected_loader_calls
151 |
152 |
@patch("bookworm_genai.commands.sync.store_documents")
@patch.dict(browsers, _mock_browsers_config(), clear=True)
@patch("bookworm_genai.commands.sync.sys")
def test_sync_platform_unsupported(mock_sys: Mock, mock_store_documents: Mock, caplog):
    """Check that an unknown platform stores nothing and warns once per browser."""
    mock_sys.platform = "unsupported"

    sync(_mock_browsers_config())

    assert not mock_store_documents.called

    # Sorted so the assertion does not depend on browser iteration order.
    warnings = sorted(record.message for record in caplog.records if record.levelname == "WARNING")
    assert warnings == [
        "🔄 browser brave is not supported on unsupported yet",
        "🔄 browser chrome is not supported on unsupported yet",
        "🔄 browser firefox is not supported on unsupported yet",
    ]
173 |
174 |
@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch.dict(browsers, _mock_browsers_config(), clear=True)
@patch("builtins.input")
@patch("bookworm_genai.commands.sync.tiktoken")
@patch("bookworm_genai.commands.sync.glob")
@patch("bookworm_genai.commands.sync.shutil")
@patch("bookworm_genai.commands.sync.os.makedirs")
@patch("bookworm_genai.commands.sync.store_documents")
@patch("bookworm_genai.commands.sync.sys")
def test_sync_estimate_cost(
    mock_sys: Mock,
    mock_store_documents: Mock,
    mock_makedirs: Mock,
    mock_shutil: Mock,
    mock_glob: Mock,
    mock_tiktoken: Mock,
    mocked_input: Mock,
    caplog,
):
    """Check that ``sync(..., estimate_cost=True)`` returns a cost and stores nothing.

    The tokenizer is mocked so that every ``encode`` call returns a
    1900-character string, and the interactive price prompt is answered with
    ``"0.100"``. The expected cost is compared with ``pytest.approx`` so the
    test does not depend on the exact floating-point rounding of the
    computation under test.
    """
    platform = "linux"
    mock_sys.platform = platform

    mock_encoding = Mock()
    mock_encoding.encode.return_value = "mocked_page_content" * 100  # The multiplier just simulates a larger document
    mock_tiktoken.encoding_for_model.return_value = mock_encoding

    # At the time of writing ada v2 is priced at $0.100 per 1M tokens
    # so this is what we are using for this unit test
    # https://openai.com/api/pricing/
    mocked_input.return_value = "0.100"

    mocked_documents = [
        Mock(page_content="mocked_page_content", metadata={}),
    ]

    browsers = _mock_browsers_config(mocked_documents=mocked_documents)
    cost = sync(browsers, estimate_cost=True)

    assert not mock_store_documents.called

    # One document per browser, three browsers on this platform.
    assert mock_encoding.encode.call_args_list == [
        call("mocked_page_content"),
        call("mocked_page_content"),
        call("mocked_page_content"),
    ]

    # Presumably 3 documents x 1900 tokens at $0.10 per million tokens;
    # confirm against _estimate_cost if the pricing formula changes.
    assert cost == pytest.approx(0.00057)
221 |
222 |
@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch("builtins.input")
@patch("bookworm_genai.commands.sync.tiktoken")
def test_sync_estimate_cost_non_interactive(mock_tiktoken: Mock, mock_input: Mock):
    """Check that ``_estimate_cost`` does not prompt when the price is passed in.

    The tokenizer is mocked so ``encode`` returns a 1900-character string for
    the single document. The cost is compared with ``pytest.approx`` rather
    than exact float equality so the test is insensitive to rounding in the
    computation under test.
    """
    mocked_documents = [
        Mock(page_content="mocked_page_content"),
    ]

    mock_encoding = Mock()
    mock_encoding.encode.return_value = "mocked_page_content" * 100  # The multiplier just simulates a larger document
    mock_tiktoken.encoding_for_model.return_value = mock_encoding

    cost = _estimate_cost(mocked_documents, cost_per_million=0.100)

    assert cost == pytest.approx(0.00019)
    assert not mock_input.called
239 |
240 |
@patch("bookworm_genai.commands.sync.glob")
@patch("bookworm_genai.commands.sync.shutil")
@patch("bookworm_genai.commands.sync.os.makedirs")
@patch("bookworm_genai.commands.sync.store_documents")
@patch("bookworm_genai.commands.sync.sys")
def test_sync_browser_filter(mock_sys: Mock, mock_store_documents: Mock, mock_makedirs: Mock, mock_shutil: Mock, mock_glob: Mock):
    """Check that a browser filter limits which loaders ``sync`` invokes.

    With only Chrome selected, the Chrome loader must have been called and the
    Firefox loader must not.
    """
    platform = "darwin"
    mock_sys.platform = platform

    documents = [Mock("DOC1", metadata={}, page_content=""), Mock("DOC2", metadata={}, page_content="")]
    mocked_browsers = _mock_browsers_config(mocked_documents=documents)

    sync(mocked_browsers, browser_filter=[Browser.CHROME.value])

    chrome_loader = mocked_browsers[Browser.CHROME][platform]["bookmark_loader"]
    firefox_loader = mocked_browsers[Browser.FIREFOX][platform]["bookmark_loader"]

    assert chrome_loader.called
    assert not firefox_loader.called
257 |
258 |
@patch("bookworm_genai.commands.sync.store_documents")
@patch("bookworm_genai.commands.sync.os")
@patch("bookworm_genai.commands.sync.shutil")
@patch("bookworm_genai.commands.sync.glob")
def test_sync_copy_source_missing(mock_glob: Mock, mock_shutil: Mock, mock_os: Mock, mock_store_documents: Mock):
    """Check that a browser whose copy source is missing does not block the others.

    Firefox is configured with a ``copy`` step whose source glob matches
    nothing; Chrome has a working mocked loader yielding two documents. The
    test expects the glob to be attempted once and exactly Chrome's two
    documents to reach ``store_documents``.
    """
    path_to_missing_file = "/path/to/missing/file"

    mock_docs_loader = Mock()
    mock_docs_loader.return_value.lazy_load.return_value = [Mock("DOC1", metadata={}, page_content=""), Mock("DOC2", metadata={}, page_content="")]

    browsers = {
        # this one will fail and be skipped due to missing file
        # ensure that even if this one fails, the next one will still be processed
        Browser.FIREFOX: {
            sys.platform: {
                "bookmark_loader": Mock(),
                "bookmark_loader_kwargs": {},
                "copy": {
                    "from": path_to_missing_file,
                    "to": "/path/to/destination",
                },
            }
        },
        # this one will be processed
        Browser.CHROME: {
            sys.platform: {
                "bookmark_loader": mock_docs_loader,
                "bookmark_loader_kwargs": {},
            }
        },
    }

    # An empty glob result simulates the missing source file.
    mock_glob.glob.return_value = []

    sync(browsers=browsers)

    mock_glob.glob.assert_called_once_with(path_to_missing_file)

    # ensures that even if the first browser fails, the second one still extracts docs and submits to storage
    assert mock_store_documents.call_count == 1

    # A call object always unpacks to (args, kwargs), so its own length is
    # always 2; the meaningful check is the size of the stored document list.
    args, _ = mock_store_documents.call_args_list[0]
    assert len(args[0]) == 2
300 |
301 |
@patch("bookworm_genai.commands.sync.store_documents")
def test_sync_metadata_attached(store_document: Mock):
    """Check that ``sync`` stamps browser and version metadata onto a document."""
    document_mock = Mock("DOC1", metadata={}, page_content="")
    mock_browsers = _mock_browsers_config(sys.platform, [document_mock])

    # NOTE(review): other tests pass browser_filter as a list of Browser
    # values; here the enum member itself is passed - confirm sync accepts both.
    sync(mock_browsers, browser_filter=Browser.CHROME)

    expected_metadata = {
        Metadata.Browser.value: Browser.CHROME.value,
        Metadata.BookwormVersion.value: __version__,
    }
    assert document_mock.metadata == expected_metadata
310 |
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import Mock, patch
2 |
3 | import pytest
4 | from bookworm_genai.utils import sql_loader_firefox_copy_path, sql_loader_page_content_mapper
5 |
6 |
def test_sql_loader_page_content_mapper():
    """Check the JSON produced for a row: ``title`` is emitted as ``name``, last."""
    row = {
        "id": 1,
        "title": "title",
        "url": "url",
        "dateAdded": "dateAdded",
        "lastModified": "lastModified",
        "source": "source",
    }
    expected_json = '{"id": 1, "url": "url", "dateAdded": "dateAdded", "lastModified": "lastModified", "source": "source", "name": "title"}'

    assert sql_loader_page_content_mapper(row) == expected_json
12 |
13 |
@pytest.mark.parametrize(
    "platform,mocked_expanduser",
    [
        pytest.param("linux", "/home/user/.mozilla/firefox/*.default-release/places.sqlite", id="linux"),
        pytest.param("darwin", "/Users/user/Library/Application Support/Firefox/Profiles/*.default-release/places.sqlite", id="darwin"),
    ],
)
@patch("bookworm_genai.utils.os.path.expanduser")
@patch("bookworm_genai.utils.sys")
def test_sql_loader_firefox_copy_path_linux(mock_sys: Mock, mock_expanduser: Mock, platform: str, mocked_expanduser: str):
    """Check that the Firefox copy path is whatever ``expanduser`` returns.

    Despite the name, the parametrization covers both linux and darwin.
    """
    # The function exposes ``cache_clear``; reset it so a value cached by an
    # earlier test or parameter set cannot be returned here.
    sql_loader_firefox_copy_path.cache_clear()

    mock_expanduser.return_value = mocked_expanduser
    mock_sys.platform = platform

    resolved_path = sql_loader_firefox_copy_path()
    assert resolved_path == mocked_expanduser
30 |
31 |
@patch("bookworm_genai.utils.sys")
def test_sql_loader_firefox_copy_path_unknown(mock_sys: Mock):
    """Check that an unrecognised platform raises ``NotImplementedError``."""
    # Reset the cache so a path cached by another test is not returned.
    sql_loader_firefox_copy_path.cache_clear()
    mock_sys.platform = "unknown"

    expected_message = "Platform unknown is not supported"
    with pytest.raises(NotImplementedError, match=expected_message):
        sql_loader_firefox_copy_path()
40 |
--------------------------------------------------------------------------------