├── .gitignore ├── community_scripts ├── Karanki │ └── README.md ├── karakeep-time-tagger │ ├── karakeep-time-tagger.timer │ ├── karakeep-time-tagger.service │ ├── README.md │ └── karakeep-time-tagger.py ├── karakeep-archive-before-date │ ├── README.md │ └── archiving_before_date.py ├── README.md ├── karakeep-list-to-tag │ ├── README.md │ └── karakeep-list-to-tag.py ├── karakeep-remove-ai-tags │ ├── README.md │ └── karakeep-remove-ai-tags.py ├── pocket2karakeep-archived │ ├── README.md │ └── pocket_archiving_status_updater.py ├── omnivore2karakeep-archived │ ├── README.md │ └── omnivore2karakeep-archived.py └── omnivore2karakeep-highlights │ ├── README.md │ ├── omnivore2karakeep-highlights.py │ └── string_context_matcher.py ├── MANIFEST.in ├── tests ├── PDF Bookmark Sample.pdf └── conftest.py ├── .gitmodules ├── bumpver.toml ├── karakeep_python_api ├── __init__.py ├── datatypes.py └── __main__.py ├── .pre-commit-config.yaml ├── setup.py ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .aider* 2 | **/__pycache__ 3 | .env* 4 | *.egg-info 5 | *author* 6 | **/*log 7 | **/*.temp 8 | -------------------------------------------------------------------------------- /community_scripts/Karanki/README.md: -------------------------------------------------------------------------------- 1 | Moved to its own repository: [Karanki](https://github.com/thiswillbeyourgithub/Karanki/) 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the OpenAPI specification file in the distribution 2 | include karakeep_python_api/openapi_reference.json 3 | -------------------------------------------------------------------------------- /tests/PDF Bookmark Sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thiswillbeyourgithub/karakeep_python_api/HEAD/tests/PDF Bookmark Sample.pdf -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "community_scripts/Freshrss-To-Karakeep"] 2 | path = community_scripts/Freshrss-To-Karakeep 3 | url = https://github.com/thiswillbeyourgithub/freshrss_to_karakeep 4 | branch = main 5 | -------------------------------------------------------------------------------- /community_scripts/karakeep-time-tagger/karakeep-time-tagger.timer: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Run Karakeep-Time-Tagger every 4 hours 3 | Requires=karakeep-time-tagger.service 4 | 5 | [Timer] 6 | OnBootSec=15min 7 | OnUnitActiveSec=4h 8 | Persistent=true 9 | 10 | [Install] 11 | WantedBy=timers.target 12 | -------------------------------------------------------------------------------- /bumpver.toml: -------------------------------------------------------------------------------- 1 | [bumpver] 2 | current_version = "1.5.0" 3 | version_pattern = "MAJOR.MINOR.PATCH" 4 | commit_message = "bump version {old_version} -> {new_version}" 5 | tag_message = "{new_version}" 6 | tag_scope = "default" 7 | commit = true 8 | tag = true 9 | push = false 10 | 11 | [bumpver.file_patterns] 12 | "bumpver.toml" = ['current_version = "{version}"'] 13 | "setup.py" = ['version="{version}"'] 14 | "karakeep_python_api/karakeep_api.py" = ['VERSION: str = 
"{version}"'] 15 | -------------------------------------------------------------------------------- /karakeep_python_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Import API class and errors directly from the module 2 | from .karakeep_api import KarakeepAPI, APIError, AuthenticationError 3 | 4 | # Import the datatypes module so users can do `from karakeep_python_api.datatypes import ...` 5 | from . import datatypes 6 | 7 | # Define the package version 8 | # This is the single source of truth, read by setup.py and updated by bumpver. 9 | __version__ = KarakeepAPI.VERSION 10 | 11 | __all__ = [ 12 | "KarakeepAPI", 13 | "APIError", 14 | "AuthenticationError", 15 | "datatypes", # Expose the datatypes module 16 | "__version__", 17 | ] 18 | 19 | # Models are available via `from karakeep_python_api.datatypes import ...` 20 | -------------------------------------------------------------------------------- /community_scripts/karakeep-time-tagger/karakeep-time-tagger.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Add time-to-read tags to Karakeep bookmarks by calling Karakeep-Time-Tagger 3 | After=network-online.target 4 | Wants=network-online.target 5 | 6 | [Service] 7 | Type=oneshot 8 | # Update this path to your karakeep-python-api repository location 9 | WorkingDirectory=%h/repos/karakeep-python-api 10 | ExecStart=/usr/bin/python3 community_scripts/karakeep-time-tagger/karakeep-time-tagger.py --verbose=false 11 | # Uncomment and set path to environment file containing KARAKEEP_PYTHON_API_KEY and KARAKEEP_PYTHON_API_ENDPOINT 12 | EnvironmentFile=%h/.config/karakeep/env 13 | StandardOutput=journal 14 | StandardError=journal 15 | 16 | [Install] 17 | WantedBy=default.target 18 | -------------------------------------------------------------------------------- /community_scripts/karakeep-archive-before-date/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep Archive before date 2 | 3 | Small cleaning script to clean old article not archived after an import from another readlater app. 4 | 5 | ## Prerequisites 6 | 7 | N/A 8 | 9 | ## Usage 10 | 11 | Define a date to limit archiving. All not archived bookmarks before this date will be archived. 12 | 13 | ```bash 14 | python archiving_before_date.py --before-date 2023-12-24 15 | ``` 16 | 17 | `--before-date` format is `YYYY-MM-DD` 18 | 19 | You might need to set up environment variables for the Karakeep API client or pass them as arguments if the script supports it (e.g., `KARAKEEP_PYTHON_API_ENDPOINT` and `KARAKEEP_PYTHON_API_KEY`). Refer to the script's help or the `karakeep-python-api` documentation for more details on authentication. 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /community_scripts/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep Community Scripts 2 | 3 | This part of the repository is where Community Scripts are stored. 4 | 5 | Don't hesitate to create PR to add your own: 6 | - Think of a good name, ideally something that is easy to find when using search engines. And the name of the directory should ideally match the name of the script. 7 | - Include a README.md. 8 | - Include a VERSION variable, for example `VERSION: str = "1.0.0"`. 9 | - Mention your script to the table in the README.md at the root of the repository. 
10 | - If you think your script should be known of the entire community, think about adding it to the table in [the official Karakeep documentation](https://docs.karakeep.app/community-projects) 11 | - If possible run [ruff](https://github.com/astral-sh/ruff/) on your code before doing the PR. 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: pytest 5 | name: pytest 6 | # entry: pytest tests 7 | entry: pytest tests --quiet 8 | language: system 9 | pass_filenames: false 10 | always_run: true 11 | stages: [pre-merge-commit] 12 | # - repo: https://github.com/psf/black 13 | # rev: 22.10.0 14 | # hooks: 15 | # - id: black 16 | # args: ["--quiet"] 17 | # language: system 18 | # - repo: https://github.com/pycqa/isort 19 | # rev: 5.12.0 20 | # hooks: 21 | # - id: isort 22 | # args: ["--profile", "black", "--quiet"] 23 | # language: system 24 | 25 | # https://github.com/astral-sh/ruff-pre-commit 26 | repos: 27 | - repo: https://github.com/astral-sh/ruff-pre-commit 28 | # Ruff version. 29 | rev: v0.14.1 30 | hooks: 31 | # # Run the linter. 32 | # - id: ruff-check 33 | # args: [ --fix ] 34 | # Run the formatter. 35 | - id: ruff-format 36 | -------------------------------------------------------------------------------- /community_scripts/karakeep-list-to-tag/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep-List-To-Tag 2 | 3 | This script allows you to convert a list into a tag by adding a specified tag to all bookmarks in a specified list. 4 | 5 | ## Purpose 6 | 7 | Sometimes it's useful to "turn a list into a tag" so you can then create more flexible smart lists. For example, if you have a list called "Omnivore Imports", you can tag all those bookmarks with `#omnivore`, then create a smart list with the query `#omnivore -is:archived` to show only unarchived Omnivore bookmarks. 8 | 9 | ## Usage 10 | 11 | ```bash 12 | python karakeep-list-to-tag.py "My List Name" "my-tag" 13 | ``` 14 | 15 | This will: 16 | 1. Find the list with the specified name 17 | 2. Get all bookmarks from that list 18 | 3. Add the specified tag to each bookmark (skipping any that already have the tag) 19 | 20 | ## Example 21 | 22 | ```bash 23 | python karakeep-list-to-tag.py "Omnivore Imports" "omnivore" 24 | ``` 25 | 26 | After running this, you can create a smart list with query `#omnivore -is:archived` to filter your Omnivore bookmarks by archived status. 27 | 28 | --- 29 | *This documentation was created with assistance from [aider.chat](https://github.com/Aider-AI/aider/).* 30 | -------------------------------------------------------------------------------- /community_scripts/karakeep-remove-ai-tags/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep-Remove-AI-Tags 2 | 3 | This script allows you to identify and remove tags that were attached by AI and have no human attachments. 4 | 5 | ## Purpose 6 | 7 | Karakeep can automatically tag bookmarks using AI. Sometimes, you may want to clean up tags that were only added by AI and not by humans. This script helps you identify and remove such tags. 8 | 9 | ## Usage 10 | 11 | ```bash 12 | python karakeep-remove-ai-tags.py [--dry-run] 13 | ``` 14 | 15 | ### Parameters 16 | 17 | - `--dry-run`: Optional. 
If provided, the script will only list the tags that would be removed without actually removing them. 18 | 19 | ## What the script does 20 | 21 | This script will: 22 | 1. Fetch all tags from your Karakeep account 23 | 2. Identify tags that are attached by AI and have no human attachments 24 | 3. List these tags with their IDs and the number of AI attachments 25 | 4. If not in dry-run mode, ask for confirmation before removing the tags 26 | 5. Remove the confirmed tags 27 | 28 | ## Examples 29 | 30 | ### Dry run (preview only) 31 | 32 | ```bash 33 | python karakeep-remove-ai-tags.py --dry-run 34 | ``` 35 | 36 | This will list all tags that would be removed without actually removing them. 37 | 38 | ### Remove AI-only tags 39 | 40 | ```bash 41 | python karakeep-remove-ai-tags.py 42 | ``` 43 | 44 | This will list all tags that are attached by AI and have no human attachments, ask for confirmation, and then remove the confirmed tags. 45 | 46 | --- 47 | *This documentation was created with assistance from AI.* 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from setuptools import find_packages, setup 4 | 5 | with open("README.md", "r") as readme: 6 | long_description = readme.read() 7 | 8 | setup( 9 | name="karakeep_python_api", 10 | version="1.5.0", 11 | description="Community python client for the Karakeep API.", # Simplified description 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/thiswillbeyourgithub/karakeep_python_api/", 15 | packages=find_packages(), 16 | include_package_data=True, 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "Operating System :: OS Independent", 20 | ], 21 | keywords=[ 22 | "rss", 23 | "karakeep", 24 | "hoarder", 25 | "data-hoarding", 26 | "python", 27 | "api", 28 | "feeds", 29 | "openapi", 30 | ], 31 | python_requires=">=3.9", 32 | install_requires=[ 33 | "requests >= 2.32.3", 34 | "loguru >= 0.7.3", 35 | "pydantic >= 2.0", # For data validation and modeling based on datatypes.py 36 | "click >= 8.0", # For the CLI 37 | ], 38 | extras_require={ 39 | "dev": [ 40 | # "openapi-pydantic >= 0.5.1", # For generating datatypes.py from OpenAPI spec 41 | "beartype >= 0.20.2", # Optional runtime type checking 42 | "pytest >= 8.3.4", 43 | "build >= 1.2.2.post1", 44 | "twine >= 6.1.0", 45 | "bumpver >= 2024.1130", 46 | ], 47 | }, 48 | entry_points={ 49 | "console_scripts": [ 50 | "karakeep=karakeep_python_api.__main__:cli", 51 | ], 52 | }, 53 | ) 54 | -------------------------------------------------------------------------------- /community_scripts/pocket2karakeep-archived/README.md: -------------------------------------------------------------------------------- 1 | # Pocket2Karakeep-Archived 2 | 3 | This script addresses an issue in Karakeep (as of version 0.24.1) where the "archived" status of bookmarks imported from Pocket is not preserved. See [karakeep issue #703](https://github.com/karakeep-app/karakeep/issues/703) for more details. 4 | 5 | This tool reads your Pocket export data and updates the corresponding bookmarks in your Karakeep instance to reflect their original "archived" status. 6 | 7 | ## Prerequisites 8 | 9 | 1. **Pocket Export Directory**: You need to have an export of your Pocket data. This should be a directory containing the `part_00000X.csv` files provided by Pocket. 
The script will automatically find and process all such files within the specified directory. 10 | 11 | ## Usage 12 | 13 | Ensure you have your Pocket export directory ready. Then, run the script: 14 | 15 | ```bash 16 | python pocket_archiving_status_updater.py --pocket-export-dir /path/to/your/pocket_export_directory 17 | ``` 18 | 19 | You might need to set up environment variables for the Karakeep API client or pass them as arguments if the script supports it (e.g., `KARAKEEP_PYTHON_API_ENDPOINT` and `KARAKEEP_PYTHON_API_KEY`). Refer to the script's help or the `karakeep-python-api` documentation for more details on authentication. 20 | 21 | The script will: 22 | 1. Scan the specified Pocket export directory for `part_00000X.csv` files. 23 | 2. Load and combine data from all found CSV files to identify articles that should be "Archived". 24 | 3. Fetch all bookmarks from your Karakeep instance. (This can take a while and is cached locally in `karakeep_bookmarks.temp` by default to speed up subsequent runs). 25 | 4. For each Pocket article marked as "Archived", it will find the corresponding bookmark in Karakeep (matching by URL or title) and update its status to "archived" if it's not already. 26 | 27 | 28 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-archived/README.md: -------------------------------------------------------------------------------- 1 | # Omnivore2Karakeep-archived 2 | 3 | This script addresses an issue in Karakeep (as of version 0.24.1) where the "archived" status of bookmarks imported from Omnivore is not preserved. See [karakeep issue #703](https://github.com/karakeep-app/karakeep/issues/703) for more details. 4 | 5 | This tool reads your Omnivore export data and updates the corresponding bookmarks in your Karakeep instance to reflect their original "archived" status. 6 | 7 | ## Prerequisites 8 | 9 | 1. **Omnivore Export Directory**: You need to have an export of your Omnivore data. This should be a directory containing the `metadata_X_to_Y.json` files provided by Omnivore. The script will automatically find and process all such files within the specified directory. 10 | 11 | ## Usage 12 | 13 | Ensure you have your Omnivore export directory ready. Then, run the script: 14 | 15 | ```bash 16 | python omnivore2karakeep-archived.py --omnivore-export-dir /path/to/your/omnivore_export_directory 17 | ``` 18 | 19 | You might need to set up environment variables for the Karakeep API client or pass them as arguments if the script supports it (e.g., `KARAKEEP_PYTHON_API_ENDPOINT` and `KARAKEEP_PYTHON_API_KEY`). Refer to the script's help or the `karakeep-python-api` documentation for more details on authentication. 20 | 21 | The script will: 22 | 1. Scan the specified Omnivore export directory for `metadata_*_to_*.json` files. 23 | 2. Load and combine data from all found JSON files to identify articles that should be "Archived". 24 | 3. Fetch all bookmarks from your Karakeep instance. (This can take a while and is cached locally in `karakeep_bookmarks.temp` by default to speed up subsequent runs). 25 | 4. For each Omnivore article marked as "Archived", it will find the corresponding bookmark in Karakeep (matching by URL or title) and update its status to "archived" if it's not already. 26 | 27 | --- 28 | This tool was developed with assistance from [aider.chat](https://github.com/Aider-AI/aider/). 
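For reference, the core Karakeep API interaction behind this script (and the Pocket variant) is roughly the sketch below. It is simplified: the real scripts add local caching, URL/title matching against the export, and retries. `KarakeepAPI()` reads `KARAKEEP_PYTHON_API_ENDPOINT` and `KARAKEEP_PYTHON_API_KEY` from the environment.

```python
from karakeep_python_api import KarakeepAPI

k = KarakeepAPI()  # credentials come from the environment variables above

# Page through every bookmark; 100 is the maximum page size the API allows.
bookmarks = []
page = k.get_all_bookmarks(include_content=False, limit=100)
bookmarks.extend(page.bookmarks)
while page.nextCursor:
    page = k.get_all_bookmarks(include_content=False, limit=100, cursor=page.nextCursor)
    bookmarks.extend(page.bookmarks)

# Archive a single bookmark (the real script first matches each export entry
# to a bookmark by URL or title before flipping this flag).
target = bookmarks[0]
if not target.archived:
    k.update_a_bookmark(bookmark_id=target.id, update_data={"archived": True})
```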
29 | -------------------------------------------------------------------------------- /community_scripts/karakeep-list-to-tag/karakeep-list-to-tag.py: -------------------------------------------------------------------------------- 1 | """Adds a specified tag to all bookmarks in a specified list.""" 2 | 3 | import json 4 | import fire 5 | from tqdm import tqdm 6 | from karakeep_python_api import KarakeepAPI 7 | 8 | 9 | def main(list_name: str, tag_to_add: str): 10 | """ 11 | Adds a specified tag to all bookmarks in a specified list. 12 | 13 | Parameters: 14 | list_name: Name of the list to get bookmarks from 15 | tag_to_add: Name of the tag to add to bookmarks 16 | """ 17 | k = KarakeepAPI() 18 | 19 | # Get all lists and find the one with the specified name 20 | lists = k.get_all_lists() 21 | target_list = None 22 | for l in lists: 23 | if l.name == list_name: 24 | target_list = l 25 | break 26 | 27 | if not target_list: 28 | print(f"List '{list_name}' not found") 29 | return 30 | 31 | list_id = target_list.id 32 | 33 | # Get all bookmarks from the list 34 | bookmarks = [] 35 | cursor = None 36 | while True: 37 | page = k.get_bookmarks_in_the_list( 38 | list_id=list_id, include_content=False, limit=50, cursor=cursor 39 | ) 40 | cursor = page.nextCursor 41 | new = page.bookmarks 42 | if not new: 43 | print("No new bookmarks") 44 | break 45 | bookmarks.extend(new) 46 | print(f"Added {len(new)} bookmarks, total is {len(bookmarks)}") 47 | if not cursor: 48 | print("No cursor") 49 | break 50 | 51 | # Add tag to bookmarks that don't already have it 52 | skipped = 0 53 | added = 0 54 | for b in tqdm(bookmarks): 55 | # Check if bookmark already has the tag 56 | existing_tag_names = [tag.name for tag in b.tags] if b.tags else [] 57 | if tag_to_add in existing_tag_names: 58 | tqdm.write(f"Skipping bookmark {b.id} - already has tag '{tag_to_add}'") 59 | skipped += 1 60 | continue 61 | 62 | out = k.attach_tags_to_a_bookmark( 63 | bookmark_id=b.id, 64 | tag_names=[tag_to_add], 65 | ) 66 | tqdm.write(f"Title: '{b.title}' Answer: '{json.dumps(out)}'") 67 | added += 1 68 | 69 | print( 70 | f"Added tag '{tag_to_add}' to {added} bookmarks, skipped {skipped} bookmarks that already had the tag" 71 | ) 72 | 73 | 74 | if __name__ == "__main__": 75 | fire.Fire(main) 76 | -------------------------------------------------------------------------------- /community_scripts/karakeep-time-tagger/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep-Time-Tagger 2 | 3 | Automatically adds time-to-read tags to your Karakeep bookmarks based on content length. 
4 | 5 | ## What it does 6 | 7 | - Analyzes bookmark content (text and HTML) to estimate reading time 8 | - Adds appropriate time tags: `0-1m`, `1-5m`, `5-10m`, `10-15m`, `15-30m`, `30m+` 9 | - Removes conflicting time tags to ensure each bookmark has only one time estimate 10 | - Creates smart lists for each time category (can be disabled) 11 | - Supports both link and text bookmark types 12 | - Uses caching to speed up repeated runs 13 | 14 | ## Usage 15 | 16 | Basic usage with default settings (200 WPM): 17 | ```bash 18 | python karakeep-time-tagger.py 19 | ``` 20 | 21 | Customize reading speed: 22 | ```bash 23 | python karakeep-time-tagger.py --wpm 250 24 | ``` 25 | 26 | Process all bookmarks (including those already tagged): 27 | ```bash 28 | python karakeep-time-tagger.py --reset_all 29 | ``` 30 | 31 | Enable verbose logging: 32 | ```bash 33 | python karakeep-time-tagger.py --verbose 34 | ``` 35 | 36 | Use custom cache file location: 37 | ```bash 38 | python karakeep-time-tagger.py --cache_file ./my_bookmarks.cache 39 | ``` 40 | 41 | Skip creating smart lists: 42 | ```bash 43 | python karakeep-time-tagger.py --create_lists False 44 | ``` 45 | 46 | ## Options 47 | 48 | - `--wpm`: Words per minute reading speed (default: 200) 49 | - `--reset_all`: Process all bookmarks, even those already tagged (default: False) 50 | - `--verbose`: Show debug logs in console (default: False) 51 | - `--cache_file`: Path to bookmark cache file (default: ./bookmarks.temp) 52 | - `--create_lists`: Create smart lists for each time slot (default: True) 53 | 54 | ## Prerequisites 55 | 56 | - Karakeep API credentials configured (via environment variables or command line) 57 | - Python packages: `fire`, `tqdm`, `beautifulsoup4`, `loguru`, `karakeep-python-api` 58 | 59 | ## Behavior 60 | 61 | By default, the script skips bookmarks that already have exactly one time tag (assumes they're correct). It only processes: 62 | - Bookmarks with no time tags 63 | - Bookmarks with multiple conflicting time tags 64 | 65 | Use `--reset_all` to force reprocessing of all bookmarks. 66 | 67 | ## Caching 68 | 69 | The script caches downloaded bookmarks to speed up repeated runs during testing. Delete the cache file to force a fresh download from the API. 70 | 71 | --- 72 | 73 | *This tool was created with assistance from [aider.chat](https://github.com/Aider-AI/aider/).* 74 | -------------------------------------------------------------------------------- /community_scripts/karakeep-remove-ai-tags/karakeep-remove-ai-tags.py: -------------------------------------------------------------------------------- 1 | """Removes tags that are attached by AI and have no human attachments.""" 2 | 3 | import json 4 | import fire 5 | from tqdm import tqdm 6 | from karakeep_python_api import KarakeepAPI 7 | 8 | VERSION: str = "1.0.0" 9 | 10 | 11 | def main(dry_run: bool = False): 12 | """ 13 | Lists all tags and removes those that are attached by AI and have no human attachments. 
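Example (from the README): `python karakeep-remove-ai-tags.py --dry-run` previews the tags that would be removed without deleting anything.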
14 | 15 | Parameters: 16 | dry_run: If True, only lists the tags that would be removed without actually removing them 17 | """ 18 | k = KarakeepAPI() 19 | 20 | # Get all tags 21 | print("Fetching all tags...") 22 | tags = k.get_all_tags() 23 | print(f"Found {len(tags)} tags") 24 | 25 | # Identify tags that are attached by AI and have no human attachments 26 | ai_only_tags = [] 27 | for tag in tags: 28 | ai_count = tag.numBookmarksByAttachedType.ai or 0 29 | human_count = tag.numBookmarksByAttachedType.human or 0 30 | 31 | if ai_count > 0 and human_count == 0: 32 | ai_only_tags.append(tag) 33 | 34 | print( 35 | f"Found {len(ai_only_tags)} tags that are attached by AI and have no human attachments" 36 | ) 37 | 38 | # List the tags that will be removed 39 | if ai_only_tags: 40 | print("\nTags that will be removed:") 41 | for tag in ai_only_tags: 42 | print( 43 | f"- {tag.name} (ID: {tag.id}, AI attachments: {tag.numBookmarksByAttachedType.ai})" 44 | ) 45 | else: 46 | print("No tags to remove") 47 | return 48 | 49 | # If dry_run is True, don't actually remove the tags 50 | if dry_run: 51 | print("\nDRY RUN: No tags were removed") 52 | return 53 | 54 | # Confirm before removing tags 55 | confirm = input("\nAre you sure you want to remove these tags? (y/n): ") 56 | if confirm.lower() != "y": 57 | print("Operation cancelled") 58 | return 59 | 60 | # Remove the tags 61 | print("\nRemoving tags...") 62 | removed = 0 63 | for tag in tqdm(ai_only_tags): 64 | try: 65 | k.delete_a_tag(tag.id) 66 | tqdm.write(f"Removed tag: {tag.name} (ID: {tag.id})") 67 | removed += 1 68 | except Exception as e: 69 | tqdm.write(f"Error removing tag {tag.name} (ID: {tag.id}): {str(e)}") 70 | 71 | print(f"\nRemoved {removed} tags out of {len(ai_only_tags)} AI-only tags") 72 | 73 | 74 | if __name__ == "__main__": 75 | fire.Fire(main) 76 | -------------------------------------------------------------------------------- /community_scripts/karakeep-archive-before-date/archiving_before_date.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small script to clean old article not archived after an import from another readlater app. 3 | 4 | Parameters: 5 | before_date: Date in YYYY-MM-DD format. Articles created before this date will be archived. 6 | """ 7 | 8 | import time 9 | from datetime import datetime 10 | 11 | from Levenshtein import ratio 12 | import pickle 13 | from fire import Fire 14 | from typing import Optional 15 | from pathlib import Path 16 | import json 17 | import csv 18 | from karakeep_python_api import KarakeepAPI 19 | from tqdm import tqdm 20 | 21 | VERSION: str = "1.0.0" 22 | 23 | karakeep = KarakeepAPI(verbose=False) 24 | 25 | 26 | def main(before_date: str) -> None: 27 | """Archive articles created before the specified date. 
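Example (from the README): `python archiving_before_date.py --before-date 2023-12-24` archives every non-archived bookmark created before that date.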
28 | 29 | Args: 30 | before_date: Date string in YYYY-MM-DD format 31 | """ 32 | before_date = datetime.strptime(before_date, "%Y-%m-%d") 33 | 34 | n = karakeep.get_current_user_stats()["numBookmarks"] 35 | pbar = tqdm(total=n, desc="Fetching bookmarks") 36 | all_bm = [] 37 | batch_size = 100 # if you set it too high, you can crash the karakeep instance, 100 being the maximum allowed 38 | page = karakeep.get_all_bookmarks( 39 | include_content=False, 40 | limit=batch_size, 41 | ) 42 | all_bm.extend(page.bookmarks) 43 | pbar.update(len(all_bm)) 44 | while page.nextCursor: 45 | page = karakeep.get_all_bookmarks( 46 | include_content=False, 47 | limit=batch_size, 48 | cursor=page.nextCursor, 49 | ) 50 | all_bm.extend(page.bookmarks) 51 | pbar.update(len(page.bookmarks)) 52 | 53 | assert len(all_bm) == n, f"Only retrieved {len(all_bm)} bookmarks instead of {n}" 54 | pbar.close() 55 | 56 | failed = [] 57 | for bookmark in all_bm: 58 | # skip already archived 59 | if bookmark.archived: 60 | continue 61 | 62 | # tqdm.write(f"Creation Date: {bookmark.createdAt}") 63 | creation_date = datetime.strptime(bookmark.createdAt, "%Y-%m-%dT%H:%M:%S.%fZ") 64 | 65 | if creation_date > before_date: 66 | continue 67 | 68 | # do the archiving 69 | retries = 3 70 | for attempt in range(retries): 71 | try: 72 | res_arch = karakeep.update_a_bookmark( 73 | bookmark_id=bookmark.id, 74 | update_data={"archived": True}, 75 | ) 76 | break 77 | except Exception as e: 78 | if attempt == retries - 1: 79 | raise e 80 | tqdm.write(f"Update failed, retrying ({attempt + 1}/{retries})") 81 | time.sleep(1) 82 | if isinstance(res_arch, dict): 83 | assert res_arch["archived"], res_arch 84 | else: 85 | assert res_arch.archived, res_arch 86 | tqdm.write(f"Successfuly archived: {bookmark.title}") 87 | 88 | 89 | if __name__ == "__main__": 90 | Fire(main) 91 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import time 4 | import random 5 | import string 6 | from typing import Optional 7 | import beartype # to trigger the runtime typechecking 8 | from karakeep_python_api import KarakeepAPI, datatypes 9 | 10 | 11 | @pytest.fixture 12 | def karakeep_client(): 13 | """ 14 | Fixture that provides a configured Karakeep API client. 15 | 16 | Requires the following environment variables: 17 | - KARAKEEP_PYTHON_API_ENDPOINT 18 | - KARAKEEP_PYTHON_API_KEY 19 | - KARAKEEP_PYTHON_API_VERIFY_SSL (optional, defaults to true) 20 | """ 21 | api_endpoint = os.environ.get("KARAKEEP_PYTHON_API_ENDPOINT") 22 | api_key = os.environ.get("KARAKEEP_PYTHON_API_KEY") 23 | verify_ssl_str = os.environ.get("KARAKEEP_PYTHON_API_VERIFY_SSL", "true") 24 | 25 | if not api_endpoint or not api_key: 26 | missing = [] 27 | if not api_endpoint: 28 | missing.append("KARAKEEP_PYTHON_API_ENDPOINT") 29 | if not api_key: 30 | missing.append("KARAKEEP_PYTHON_API_KEY") 31 | pytest.skip( 32 | f"Missing required environment variables for Karakeep API tests: {', '.join(missing)}. Set these to run integration tests." 
33 | ) 34 | 35 | verify_ssl = verify_ssl_str.lower() in ("true", "1", "yes") 36 | 37 | # Instantiate the client using standard environment variables 38 | # KarakeepAPI constructor handles api_endpoint and api_key directly 39 | return KarakeepAPI( 40 | api_endpoint=api_endpoint, 41 | api_key=api_key, 42 | verify_ssl=verify_ssl, 43 | verbose=True, # Enable verbose logging for tests 44 | ) 45 | 46 | 47 | @pytest.fixture 48 | def managed_bookmark(karakeep_client: KarakeepAPI) -> datatypes.Bookmark: 49 | """ 50 | Fixture to create a bookmark before a test and delete it afterwards. 51 | Yields the created bookmark object. 52 | """ 53 | created_bookmark_id: Optional[str] = None 54 | # Generate unique URL and title to avoid collisions and aid debugging 55 | timestamp = int(time.time()) 56 | random_suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=6)) 57 | test_url = f"https://example.com/test_page_fixture_{timestamp}_{random_suffix}" 58 | original_title = f"Managed Fixture Bookmark {timestamp}-{random_suffix}" 59 | 60 | print( 61 | f"\n FIXTURE SETUP: Attempting to create bookmark (URL: {test_url}, Title: '{original_title}')" 62 | ) 63 | try: 64 | # Create the bookmark 65 | bookmark = karakeep_client.create_a_new_bookmark( 66 | type="link", url=test_url, title=original_title 67 | ) 68 | assert isinstance(bookmark, datatypes.Bookmark), ( 69 | "Fixture: create_a_new_bookmark should return a Bookmark model" 70 | ) 71 | assert bookmark.id, "Fixture: Created bookmark must have an ID" 72 | created_bookmark_id = bookmark.id 73 | print( 74 | f" FIXTURE SETUP: ✓ Successfully created bookmark with ID: {created_bookmark_id}" 75 | ) 76 | 77 | yield bookmark # Provide the bookmark to the test function 78 | 79 | finally: 80 | # Teardown: Delete the bookmark 81 | if created_bookmark_id: 82 | print( 83 | f"\n FIXTURE TEARDOWN: Attempting to delete bookmark ID: {created_bookmark_id}" 84 | ) 85 | try: 86 | karakeep_client.delete_a_bookmark(bookmark_id=created_bookmark_id) 87 | print( 88 | f" FIXTURE TEARDOWN: ✓ Successfully deleted bookmark ID: {created_bookmark_id}" 89 | ) 90 | except Exception as e: 91 | # Log error during teardown but don't let it mask original test failure 92 | print( 93 | f" FIXTURE TEARDOWN: ERROR during bookmark deletion for ID {created_bookmark_id}: {e}" 94 | ) 95 | else: 96 | print("\n FIXTURE TEARDOWN: No bookmark ID recorded, skipping deletion.") 97 | -------------------------------------------------------------------------------- /karakeep_python_api/datatypes.py: -------------------------------------------------------------------------------- 1 | """ 2 | The datatype file was originally generated by datamodel-codegen, then refactored and manually (or via LLMs) kept up to date with upstream 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | from enum import Enum 8 | from typing import List, Optional, Union, Literal 9 | 10 | from pydantic import BaseModel, Field, RootModel 11 | 12 | 13 | class StatusTypes(str, Enum): 14 | success = "success" 15 | failure = "failure" 16 | pending = "pending" 17 | 18 | 19 | class NumBookmarksByAttachedType(BaseModel): 20 | ai: Optional[int] = None 21 | human: Optional[int] = None 22 | 23 | 24 | class TagShort(BaseModel): 25 | id: str 26 | name: str 27 | attachedBy: Literal["ai", "human"] 28 | 29 | 30 | class Tag(BaseModel): 31 | id: str 32 | name: str 33 | numBookmarks: int 34 | numBookmarksByAttachedType: NumBookmarksByAttachedType 35 | 36 | 37 | class Type(str, Enum): 38 | link = "link" 39 | 40 | 41 | class 
ContentTypeLink(BaseModel): 42 | type: Literal["link"] = "link" 43 | url: str 44 | title: Optional[str] = None 45 | description: Optional[str] = None 46 | imageUrl: Optional[str] = None 47 | imageAssetId: Optional[str] = None 48 | screenshotAssetId: Optional[str] = None 49 | fullPageArchiveAssetId: Optional[str] = None 50 | precrawledArchiveAssetId: Optional[str] = None 51 | videoAssetId: Optional[str] = None 52 | favicon: Optional[str] = None 53 | htmlContent: Optional[str] = None 54 | contentAssetId: Optional[str] = None 55 | crawledAt: Optional[str] = None 56 | author: Optional[str] = None 57 | publisher: Optional[str] = None 58 | datePublished: Optional[str] = None 59 | dateModified: Optional[str] = None 60 | 61 | 62 | class ContentTypeUnknown(BaseModel): 63 | type: Literal["unknown"] = "unknown" 64 | 65 | 66 | class ContentTypeText(BaseModel): 67 | type: Literal["text"] = "text" 68 | text: str 69 | sourceUrl: Optional[str] = None 70 | 71 | 72 | class ContentTypeAsset(BaseModel): 73 | type: Literal["asset"] = "asset" 74 | assetType: Literal["image", "pdf"] 75 | assetId: str 76 | fileName: Optional[str] = None 77 | sourceUrl: Optional[str] = None 78 | size: Optional[float] = None 79 | content: Optional[str] = None 80 | 81 | 82 | class BookmarkAsset(BaseModel): 83 | id: str 84 | assetType: Literal[ 85 | "linkHtmlContent", 86 | "screenshot", 87 | "assetScreenshot", 88 | "bannerImage", 89 | "fullPageArchive", 90 | "video", 91 | "bookmarkAsset", 92 | "precrawledArchive", 93 | "userUploaded", 94 | "unknown", 95 | ] 96 | fileName: Optional[str] = None 97 | 98 | 99 | class Asset(BaseModel): 100 | assetId: str 101 | contentType: str 102 | size: float 103 | fileName: str 104 | 105 | 106 | class Bookmark(BaseModel): 107 | id: str 108 | createdAt: str 109 | modifiedAt: Optional[str] 110 | title: Optional[str] = None 111 | archived: bool 112 | favourited: bool 113 | taggingStatus: Literal["success", "failure", "pending"] 114 | summarizationStatus: Optional[Literal["success", "failure", "pending"]] = None 115 | note: Optional[str] = None 116 | summary: Optional[str] = None 117 | source: Optional[ 118 | Literal[ 119 | "api", "web", "cli", "mobile", "extension", "singlefile", "rss", "import" 120 | ] 121 | ] = None 122 | userId: str 123 | tags: List[TagShort] 124 | content: Union[ 125 | ContentTypeLink, ContentTypeText, ContentTypeAsset, ContentTypeUnknown 126 | ] 127 | assets: List[BookmarkAsset] 128 | 129 | 130 | class PaginatedBookmarks(BaseModel): 131 | bookmarks: List[Bookmark] 132 | nextCursor: Optional[str] = "" 133 | 134 | 135 | class ListModel(BaseModel): 136 | id: str 137 | name: str 138 | description: Optional[str] = None 139 | icon: str 140 | parentId: Optional[str] 141 | type: Optional[Literal["manual", "smart"]] = "manual" 142 | query: Optional[str] = None 143 | public: bool 144 | hasCollaborators: bool 145 | userRole: Literal["owner", "editor", "viewer", "public"] 146 | 147 | 148 | class Highlight(BaseModel): 149 | bookmarkId: str 150 | startOffset: float 151 | endOffset: float 152 | color: Optional[Literal["yellow", "red", "green", "blue"]] = "yellow" 153 | text: Optional[str] 154 | note: Optional[str] 155 | id: str 156 | userId: str 157 | createdAt: str 158 | 159 | 160 | class PaginatedHighlights(BaseModel): 161 | highlights: List[Highlight] 162 | nextCursor: Optional[str] = "" 163 | 164 | 165 | class PaginatedTags(BaseModel): 166 | tags: List[Tag] 167 | nextCursor: Optional[str] = "" 168 | 169 | 170 | class Backup(BaseModel): 171 | id: str 172 | userId: str 173 | assetId: 
Optional[str] 174 | createdAt: str 175 | size: float 176 | bookmarkCount: int 177 | status: Literal["pending", "success", "failure"] 178 | errorMessage: Optional[str] = None 179 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-highlights/README.md: -------------------------------------------------------------------------------- 1 | # Omnivore2Karakeep-highlights 2 | 3 | This script imports highlights from an Omnivore export to a Karakeep instance. It matches Omnivore bookmarks to existing Karakeep bookmarks and creates corresponding highlights with position information. 4 | 5 | *Note: This tool was developed with assistance from [aider.chat](https://github.com/Aider-AI/aider/).* 6 | 7 | ## Features 8 | 9 | - **Probabilistic bookmark matching**: Uses multiple strategies to match Omnivore bookmarks to Karakeep bookmarks, including exact URL matching, exact title matching, and fuzzy title matching with configurable thresholds 10 | - **Position detection**: Intelligently determines highlight positions within documents using multiple strategies including direct text matching and fuzzy matching algorithms 11 | - **Safe operation**: Only creates new highlights without modifying existing bookmarks or data - all operations are additive and reversible 12 | - **Metadata preservation**: Stores import metadata in each highlight's note field for full traceability and potential cleanup operations 13 | - **Caching**: Caches Karakeep bookmarks locally to avoid repeated API calls during development 14 | - **Progress tracking**: Shows progress bars for long-running operations 15 | - **Dry run mode**: Test the import process without actually creating highlights 16 | 17 | ## Important Notes 18 | 19 | ### Safety and Data Integrity 20 | 21 | ✅ **Database Safety:** 22 | - **Non-destructive**: This script only creates new highlights and never modifies or deletes existing bookmarks or highlights 23 | - **Additive operations**: All changes are purely additive to your Karakeep database 24 | - **Reversible**: Import metadata is stored in each highlight's note field, allowing for easy identification and cleanup if needed 25 | - **No data loss risk**: The import process cannot damage your existing Karakeep data 26 | 27 | ### Matching Process 28 | 29 | 🎯 **Probabilistic Matching:** 30 | - **Multi-strategy approach**: Uses exact URL matching, exact title matching, and fuzzy title matching as fallbacks 31 | - **Configurable thresholds**: Fuzzy matching uses a 95% similarity threshold by default, which can be adjusted 32 | - **Best-effort matching**: Some Omnivore bookmarks may not find matches due to title differences or missing URLs 33 | - **Manual review recommended**: Check the console output for unmatched bookmarks that may need manual attention 34 | 35 | ### Current Limitations 36 | 37 | ⚠️ **Known Limitations:** 38 | - **PDF highlights not supported**: The script currently skips PDF files by default (`skip_pdf=True`) as PDF highlight positioning is not yet implemented 39 | - **HTML content dependency**: Only works with web page bookmarks that have HTML content available in Karakeep 40 | - **Single highlight color**: All imported highlights use yellow color because Omnivore exports do not include color information - original highlight colors cannot be preserved (the default color can be modified in the code) 41 | - **No duplicate detection**: The script doesn't check if highlights already exist before creating them (relies on user to clean up 
duplicates if needed) 42 | 43 | ## Prerequisites 44 | 45 | 1. **Omnivore Export**: You need a complete Omnivore export containing: 46 | - `highlights/` directory with `.md` files 47 | - `content/` directory with `.html` and `.pdf` files 48 | - `metadata_*.json` files with bookmark information 49 | 50 | 2. **Karakeep Instance**: A running Karakeep instance with API access 51 | 52 | 3. **Environment Setup**: Karakeep API credentials configured via environment variables or parameters 53 | 54 | ## Installation 55 | 56 | Ensure you have the required dependencies installed: 57 | 58 | ```bash 59 | pip install karakeep-python-api fire tqdm pathlib beautifulsoup4 html2text markdown python-levenshtein 60 | ``` 61 | 62 | ## Usage 63 | 64 | ### Basic Usage (Dry Run) 65 | 66 | ```bash 67 | python omnivore2karakeep-highlights.py /path/to/omnivore/export 68 | ``` 69 | 70 | ### Actually Import Highlights 71 | 72 | ```bash 73 | python omnivore2karakeep-highlights.py /path/to/omnivore/export --dry=False 74 | ``` 75 | 76 | ### Include PDF Processing (Experimental) 77 | 78 | ```bash 79 | python omnivore2karakeep-highlights.py /path/to/omnivore/export --skip_pdf=False --dry=False 80 | ``` 81 | 82 | ### Custom Cache File Location 83 | 84 | ```bash 85 | python omnivore2karakeep-highlights.py /path/to/omnivore/export --karakeep_path=./my_bookmarks.temp --dry=False 86 | ``` 87 | 88 | ## Parameters 89 | 90 | - `omnivore_export_dir` (required): Path to the Omnivore export directory 91 | - `karakeep_path` (optional): Path for caching Karakeep bookmarks (default: `./karakeep_bookmarks.temp`) 92 | - `dry` (optional): If True, simulates the import without creating highlights (default: `True`) 93 | - `skip_pdf` (optional): If True, skips PDF files (default: `True`) 94 | 95 | ## How It Works 96 | 97 | 1. **Load Omnivore Data**: Reads metadata files and highlight files from the export 98 | 2. **Cache Karakeep Bookmarks**: Fetches all bookmarks from Karakeep (cached locally for performance) 99 | 3. **Match Bookmarks**: For each Omnivore highlight file: 100 | - Finds the corresponding Omnivore bookmark metadata 101 | - Matches it to a Karakeep bookmark using: 102 | - Exact URL matching 103 | - Exact title matching 104 | - Fuzzy title matching (95% threshold) 105 | 4. **Position Detection**: Determines highlight positions using multiple strategies: 106 | - Direct text matching in plain text 107 | - Markdown content matching with position scaling 108 | - Fuzzy matching using Levenshtein distance 109 | - Link extraction for link-only highlights 110 | 5. **Create Highlights**: Creates highlights in Karakeep with calculated positions and stores import metadata in the note field for future reference 111 | 112 | ## Expected Directory Structure 113 | 114 | Your Omnivore export should have this structure: 115 | 116 | ``` 117 | omnivore_export/ 118 | ├── highlights/ 119 | │ ├── article1.md 120 | │ ├── article2.md 121 | │ └── ... 122 | ├── content/ 123 | │ ├── article1.html 124 | │ ├── article2.pdf 125 | │ └── ... 
126 | └── metadata_YYYY-MM-DD_to_YYYY-MM-DD.json 127 | ``` 128 | 129 | ## Environment Variables 130 | 131 | Set up your Karakeep API credentials: 132 | 133 | ```bash 134 | export KARAKEEP_PYTHON_API_ENDPOINT="https://your-instance.com/api/v1/" 135 | export KARAKEEP_PYTHON_API_KEY="your-api-key" 136 | ``` 137 | 138 | ## Highlight Metadata and Cleanup 139 | 140 | ### Stored Metadata 141 | 142 | Each imported highlight includes metadata in its note field containing: 143 | - Original Omnivore bookmark information 144 | - Import timestamp 145 | - Matching strategy used 146 | - Position detection method 147 | - Source file information 148 | 149 | This metadata enables: 150 | - **Full traceability** of imported highlights 151 | - **Easy identification** of imported vs. native highlights 152 | - **Batch cleanup operations** if needed 153 | - **Debugging** of matching and positioning issues 154 | 155 | ### Cleanup Operations 156 | 157 | To identify all imported highlights, you can search for highlights containing specific metadata markers in their notes. The metadata format allows for easy filtering and bulk operations if you need to remove imported highlights later. 158 | 159 | ## Troubleshooting 160 | 161 | ### Common Issues 162 | 163 | 1. **"Could not find bookmark"**: The script couldn't match an Omnivore bookmark to a Karakeep bookmark 164 | - This is normal for some bookmarks due to probabilistic matching 165 | - Ensure the bookmark exists in Karakeep with similar URL or title 166 | - Check if URLs or titles match between systems 167 | - Consider manual import for important unmatched highlights 168 | 169 | 2. **"No HTML content available"**: The Karakeep bookmark doesn't have HTML content 170 | - Some bookmarks may not have been fully processed by Karakeep 171 | - Wait for Karakeep to finish processing the bookmark content 172 | - Manual content extraction may be needed for complex cases 173 | 174 | 3. **"Could not match highlight text to corpus"**: The highlight text couldn't be located in the document 175 | - This may happen with heavily formatted content or dynamic web pages 176 | - The highlight will be skipped but logged for manual review 177 | - Consider manual review of problematic highlights 178 | 179 | ### Performance Notes 180 | 181 | - The initial bookmark cache creation can take several minutes for large Karakeep instances 182 | - The cache file is automatically deleted upon successful completion 183 | - Subsequent runs use the cached data for faster processing 184 | - Monitor console output for matching statistics and any issues 185 | 186 | ## Version 187 | 188 | Current version: 0.0.1 189 | 190 | ## Contributing 191 | 192 | This script is part of the karakeep-python-api project. Please report issues or contribute improvements through the main repository. 193 | -------------------------------------------------------------------------------- /community_scripts/pocket2karakeep-archived/pocket_archiving_status_updater.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small script made to solve the karakeep issue where Pocket's imported document would not preserve the "archived" value. 3 | 4 | This script reads a CSV file exported from Pocket with the following format: 5 | title,url,time_added,tags,status 6 | 7 | It identifies entries with status "archive" and updates their status in Karakeep. 
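Example: `python pocket_archiving_status_updater.py --pocket-export-dir /path/to/pocket_export` (the path may also point directly at a single CSV file).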
8 | 9 | """ 10 | 11 | import time 12 | 13 | from Levenshtein import ratio 14 | import pickle 15 | from fire import Fire 16 | from typing import Optional 17 | from pathlib import Path 18 | import json 19 | import csv 20 | from karakeep_python_api import KarakeepAPI 21 | from tqdm import tqdm 22 | 23 | VERSION: str = "1.1.0" 24 | 25 | karakeep = KarakeepAPI(verbose=False) 26 | 27 | 28 | def get_pocket_archived(pocket_export_dir: str) -> list[dict]: 29 | """ 30 | Loads and parses a CSV file from the specified directory. 31 | Filters and returns a list of articles that are marked as "archive" in the status column. 32 | 33 | CSV format: 34 | title,url,time_added,tags,status 35 | """ 36 | export_dir = Path(pocket_export_dir) 37 | all_data: list[dict] = [] 38 | 39 | # Check if the provided path is a file or directory 40 | if export_dir.is_file(): 41 | csv_file = export_dir 42 | else: 43 | # Look for CSV files in the directory 44 | csv_files = list(export_dir.glob("*.csv")) 45 | if not csv_files: 46 | print(f"Warning: No CSV files found in {pocket_export_dir}.") 47 | return [] 48 | # Use the first CSV file found 49 | csv_file = csv_files[0] 50 | 51 | try: 52 | with open(csv_file, "r", encoding="utf-8") as f: 53 | reader = csv.DictReader(f) 54 | for row in reader: 55 | all_data.append(row) 56 | except Exception as e: 57 | print(f"Warning: Could not read or process {csv_file.name}: {e}") 58 | return [] 59 | 60 | if not all_data: 61 | print(f"Warning: No data loaded from {csv_file}.") 62 | return [] 63 | 64 | # Filter for articles with status "archive" 65 | archived = [] 66 | for d in all_data: 67 | if d.get("status", "").lower() == "archive": 68 | # Ensure the dictionary has the required fields 69 | if "url" not in d: 70 | print(f"Warning: Entry missing URL: {d}") 71 | continue 72 | 73 | # Create a dictionary with the expected structure 74 | archived_entry = { 75 | "url": d["url"], 76 | "title": d.get("title", ""), # Use empty string if title is missing 77 | "time_added": d.get("time_added", ""), 78 | "tags": d.get("tags", ""), 79 | "state": "Archived", # Add state field for compatibility with the rest of the script 80 | } 81 | archived.append(archived_entry) 82 | 83 | return archived 84 | 85 | 86 | def main( 87 | pocket_export_dir: str, 88 | karakeep_path: Optional[str] = "./karakeep_bookmarks.temp", 89 | ) -> None: 90 | archived = get_pocket_archived(pocket_export_dir) 91 | 92 | if not archived: 93 | print("No archived Pocket articles found or loaded. 
Exiting.") 94 | return 95 | 96 | # fetch all the bookmarks from karakeep, as the search feature is unreliable 97 | # as the loading can be pretty long, we store it to a local file 98 | if Path(karakeep_path).exists(): 99 | with Path(karakeep_path).open("rb") as f: 100 | all_bm = pickle.load(f) 101 | else: 102 | n = karakeep.get_current_user_stats()["numBookmarks"] 103 | pbar = tqdm(total=n, desc="Fetching bookmarks") 104 | all_bm = [] 105 | batch_size = 100 # if you set it too high, you can crash the karakeep instance, 100 being the maximum allowed 106 | page = karakeep.get_all_bookmarks( 107 | include_content=False, 108 | limit=batch_size, 109 | ) 110 | all_bm.extend(page.bookmarks) 111 | pbar.update(len(all_bm)) 112 | while page.nextCursor: 113 | page = karakeep.get_all_bookmarks( 114 | include_content=False, 115 | limit=batch_size, 116 | cursor=page.nextCursor, 117 | ) 118 | all_bm.extend(page.bookmarks) 119 | pbar.update(len(page.bookmarks)) 120 | 121 | assert len(all_bm) == n, ( 122 | f"Only retrieved {len(all_bm)} bookmarks instead of {n}" 123 | ) 124 | pbar.close() 125 | 126 | with Path(karakeep_path).open("wb") as f: 127 | pickle.dump(all_bm, f) 128 | 129 | failed = [] 130 | for pocket in tqdm(archived, desc="Archiving", unit="doc"): 131 | url = pocket["url"] 132 | 133 | found_it = False 134 | for bookmark in all_bm: 135 | found_url = None 136 | content = bookmark.content 137 | if hasattr(content, "url"): 138 | found_url = content.url 139 | elif hasattr(content, "sourceUrl"): 140 | found_url = content.sourceUrl 141 | else: 142 | found_url = "" 143 | 144 | if found_url == url: 145 | found_it = True 146 | break 147 | 148 | # couldn't find a matching url, match by title 149 | # exact title match: 150 | if ( 151 | "title" in pocket 152 | and pocket["title"] 153 | and hasattr(content, "title") 154 | and content.title 155 | ): 156 | if pocket["title"].lower() == content.title.lower(): 157 | found_it = True 158 | break 159 | if ( 160 | "title" in pocket 161 | and pocket["title"] 162 | and hasattr(bookmark, "title") 163 | and bookmark.title 164 | ): 165 | if pocket["title"].lower() == bookmark.title.lower(): 166 | found_it = True 167 | break 168 | 169 | # fuzzy matching, as a last resort 170 | threshold = 0.95 171 | if ( 172 | "title" in pocket 173 | and pocket["title"] 174 | and hasattr(content, "title") 175 | and content.title 176 | ): 177 | r = ratio(pocket["title"].lower(), content.title.lower()) 178 | if r >= threshold: 179 | found_it = True 180 | # breakpoint() 181 | break 182 | 183 | if ( 184 | "title" in pocket 185 | and pocket["title"] 186 | and hasattr(bookmark, "title") 187 | and bookmark.title 188 | ): 189 | r = ratio(pocket["title"].lower(), bookmark.title.lower()) 190 | if r >= threshold: 191 | found_it = True 192 | break 193 | 194 | # couldn't be found 195 | if not found_it: 196 | failed.append(pocket) 197 | tqdm.write(f"Failed to find {url}") 198 | # breakpoint() 199 | with open("./omnivore_archiver_failed.txt", "a") as f: 200 | f.write(f"\n{pocket}") 201 | continue 202 | 203 | # skip already archived 204 | if bookmark.archived: 205 | tqdm.write(f"Already archived: {url}") 206 | continue 207 | for attempt in range(5): 208 | try: 209 | fresh = karakeep.get_a_single_bookmark( 210 | bookmark_id=bookmark.id, include_content=False 211 | ) 212 | break 213 | except Exception as e: 214 | if attempt == 4: 215 | raise e 216 | tqdm.write(f"Get single bookmark failed, retrying ({attempt + 1}/5)") 217 | time.sleep(1) 218 | if fresh.archived: 219 | tqdm.write(f"Already archived: {url}") 
220 | continue 221 | 222 | # do the archiving 223 | retries = 10 224 | for attempt in range(retries): 225 | try: 226 | res_arch = karakeep.update_a_bookmark( 227 | bookmark_id=bookmark.id, 228 | update_data={"archived": True}, 229 | ) 230 | break 231 | except Exception as e: 232 | if attempt == retries - 1: 233 | raise e 234 | tqdm.write(f"Update failed, retrying ({attempt + 1}/{retries})") 235 | time.sleep(1) 236 | if isinstance(res_arch, dict): 237 | assert res_arch["archived"], res_arch 238 | else: 239 | assert res_arch.archived, res_arch 240 | tqdm.write(f"Succesfuly archived: {url}") 241 | 242 | 243 | if __name__ == "__main__": 244 | Fire(main) 245 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-archived/omnivore2karakeep-archived.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small script made to solve the karakeep issue where Omnivore's imported document would not preserve the "archived" value. 3 | 4 | Link: https://github.com/karakeep-app/karakeep/issues/703 5 | 6 | """ 7 | 8 | from Levenshtein import ratio 9 | import pickle 10 | from typing import Optional 11 | from fire import Fire 12 | from pathlib import Path 13 | import json 14 | from karakeep_python_api import KarakeepAPI 15 | from tqdm import tqdm 16 | from loguru import logger 17 | 18 | # Configure loguru to log debug messages to a local file 19 | logger.add("omniore2karakeep-archived.log", level="DEBUG", rotation="10 MB") 20 | 21 | karakeep = KarakeepAPI(verbose=False) 22 | 23 | VERSION: str = "2.0.0" 24 | 25 | 26 | def match_omnivore_to_bookmark(omnivore: dict, bookmark) -> tuple[bool, float]: 27 | """ 28 | Determines if an Omnivore article matches a Karakeep bookmark. 29 | 30 | Uses URL matching first, then title matching (exact and fuzzy). 
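Fuzzy title matching uses a Levenshtein ratio with a 0.95 similarity threshold, and Omnivore-hosted URLs (local PDFs) are ignored during URL matching.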
31 | 32 | Parameters: 33 | - omnivore: Omnivore article dictionary 34 | - bookmark: Karakeep bookmark object 35 | 36 | Returns: 37 | - tuple[bool, float]: (is_match, ratio) where ratio is 1.0 for exact matches 38 | and Levenshtein ratio for fuzzy matches 39 | """ 40 | url = omnivore["url"] 41 | 42 | # Try URL matching first 43 | found_url = None 44 | content = bookmark.content 45 | if hasattr(content, "url"): 46 | found_url = content.url 47 | elif hasattr(content, "sourceUrl"): 48 | found_url = content.sourceUrl 49 | else: 50 | raise ValueError(content) 51 | 52 | # handling local PDF, they don't have proper url 53 | if found_url and found_url.startswith("https://omnivore.app"): 54 | found_url = None 55 | 56 | if found_url == url: 57 | return True, 1.0 58 | 59 | # couldn't find a matching url, match by title 60 | # exact title match: 61 | if ( 62 | "title" in omnivore 63 | and omnivore["title"] 64 | and hasattr(content, "title") 65 | and content.title 66 | ): 67 | if omnivore["title"].lower() == content.title.lower(): 68 | return True, 1.0 69 | if ( 70 | "title" in omnivore 71 | and omnivore["title"] 72 | and hasattr(bookmark, "title") 73 | and bookmark.title 74 | ): 75 | if omnivore["title"].lower() == bookmark.title.lower(): 76 | return True, 1.0 77 | 78 | # fuzzy matching, as a last resort 79 | threshold = 0.95 80 | best_ratio = 0.0 81 | 82 | if ( 83 | "title" in omnivore 84 | and omnivore["title"] 85 | and hasattr(content, "title") 86 | and content.title 87 | ): 88 | r = ratio(omnivore["title"].lower(), content.title.lower()) 89 | best_ratio = max(best_ratio, r) 90 | 91 | if ( 92 | "title" in omnivore 93 | and omnivore["title"] 94 | and hasattr(bookmark, "title") 95 | and bookmark.title 96 | ): 97 | r = ratio(omnivore["title"].lower(), bookmark.title.lower()) 98 | best_ratio = max(best_ratio, r) 99 | 100 | if best_ratio >= threshold: 101 | return True, best_ratio 102 | 103 | return False, best_ratio 104 | 105 | 106 | def get_omnivores_archived( 107 | omnivore_export_dir: str, 108 | read_threshold: int = 80, 109 | treat_read_as_archived: bool = True, 110 | ) -> list[dict]: 111 | """ 112 | Loads and concatenates all Omnivore metadata JSON files from the specified directory. 113 | Filters and returns a list of articles that are marked as "Archived". 114 | 115 | Parameters: 116 | - read_threshold: Reading progress percentage threshold to consider an article as "read" (default: 80) 117 | - treat_read_as_archived: If True, treat articles above read_threshold as archived (default: True) 118 | """ 119 | export_dir = Path(omnivore_export_dir) 120 | all_data: list[dict] = [] 121 | 122 | # Find all metadata_*.json files, load, and concatenate their lists 123 | for json_file in export_dir.glob("metadata_*_to_*.json"): 124 | try: 125 | content = json_file.read_text() 126 | data: list[dict] = json.loads(content) 127 | all_data.extend(data) 128 | except json.JSONDecodeError as e: 129 | logger.warning(f"Could not decode JSON from {json_file.name}: {e}") 130 | except Exception as e: 131 | logger.warning(f"Could not read or process {json_file.name}: {e}") 132 | 133 | if not all_data: 134 | logger.warning( 135 | f"No data loaded from {omnivore_export_dir}. Ensure 'metadata_*_to_*.json' files exist and are valid." 
136 | ) 137 | return [] 138 | 139 | # figure out which should have been archived 140 | data = all_data # Use the concatenated data 141 | active = [] 142 | archived = [] 143 | read = [] 144 | unknown = [] 145 | for d in data: 146 | if int(d["readingProgress"]) > read_threshold: 147 | read.append(d) 148 | if d["state"] == "Archived": 149 | archived.append(d) 150 | elif d["state"] == "Active": 151 | active.append(d) 152 | elif d["state"] == "Unknown": 153 | unknown.append(d) 154 | else: 155 | raise ValueError(json.dumps(d)) 156 | 157 | # If treat_read_as_archived is True, add read articles to archived list (avoiding duplicates) 158 | if treat_read_as_archived: 159 | archived_urls = { 160 | d["url"] for d in archived 161 | } # Create set of already archived URLs 162 | for read_article in read: 163 | if read_article["url"] not in archived_urls: 164 | archived.append(read_article) 165 | 166 | return archived 167 | 168 | 169 | def main( 170 | omnivore_export_dir: str, 171 | karakeep_temp_path: Optional[str] = "./karakeep_bookmarks.temp", 172 | read_threshold: int = 80, 173 | treat_read_as_archived: bool = True, 174 | ) -> None: 175 | assert Path(omnivore_export_dir).exists(), "Omnivore export dir does not exist" 176 | assert Path(omnivore_export_dir).is_dir(), "Omnivore export dir is not a dir" 177 | archived = get_omnivores_archived( 178 | omnivore_export_dir, read_threshold, treat_read_as_archived 179 | ) 180 | 181 | if not archived: 182 | logger.info("No archived Omnivore articles found or loaded. Exiting.") 183 | return 184 | 185 | # fetch all the bookmarks from karakeep, as the search feature is unreliable 186 | # as the loading can be pretty long, we store it to a local file 187 | if Path(karakeep_temp_path).exists(): 188 | with Path(karakeep_temp_path).open("rb") as f: 189 | all_bm = pickle.load(f) 190 | else: 191 | n = karakeep.get_current_user_stats()["numBookmarks"] 192 | pbar = tqdm(total=n, desc="Fetching bookmarks") 193 | all_bm = [] 194 | batch_size = 100 # if you set it too high, you can crash the karakeep instance, 100 being the maximum allowed 195 | page = karakeep.get_all_bookmarks( 196 | include_content=False, 197 | limit=batch_size, 198 | ) 199 | all_bm.extend(page.bookmarks) 200 | pbar.update(len(all_bm)) 201 | while page.nextCursor: 202 | page = karakeep.get_all_bookmarks( 203 | include_content=False, 204 | limit=batch_size, 205 | cursor=page.nextCursor, 206 | ) 207 | all_bm.extend(page.bookmarks) 208 | pbar.update(len(page.bookmarks)) 209 | 210 | assert len(all_bm) == n, ( 211 | f"Only retrieved {len(all_bm)} bookmarks instead of {n}" 212 | ) 213 | pbar.close() 214 | 215 | with Path(karakeep_temp_path).open("wb") as f: 216 | pickle.dump(all_bm, f) 217 | 218 | failed = [] 219 | for omnivore in tqdm(archived, desc="Archiving", unit="doc"): 220 | url = omnivore["url"] 221 | 222 | # Collect all potential matches with their ratios 223 | potential_matches = [] 224 | for bookmark in all_bm: 225 | is_match, match_ratio = match_omnivore_to_bookmark(omnivore, bookmark) 226 | if is_match: 227 | potential_matches.append((bookmark, match_ratio)) 228 | 229 | # couldn't be found 230 | if not potential_matches: 231 | failed.append(omnivore) 232 | tqdm.write(f"Failed to find {url}") 233 | with open("./omnivore_archiver_failed.txt", "a") as f: 234 | f.write(f"\n{omnivore}") 235 | continue 236 | 237 | # Choose the bookmark with the highest ratio 238 | bookmark = max(potential_matches, key=lambda x: x[1])[0] 239 | 240 | # skip already archived 241 | if bookmark.archived: 242 | 
tqdm.write(f"Already archived: {url}") 243 | continue 244 | fresh = karakeep.get_a_single_bookmark( 245 | bookmark_id=bookmark.id, include_content=False 246 | ) 247 | if fresh.archived: 248 | tqdm.write(f"Already archived: {url}") 249 | continue 250 | 251 | # do the archiving 252 | res_arch = karakeep.update_a_bookmark( 253 | bookmark_id=bookmark.id, 254 | update_data={"archived": True}, 255 | ) 256 | assert res_arch["archived"], res_arch 257 | tqdm.write(f"Successfully archived: {url}") 258 | 259 | # Clean up the temporary file since everything worked successfully 260 | if Path(karakeep_temp_path).exists(): 261 | Path(karakeep_temp_path).unlink() 262 | 263 | 264 | if __name__ == "__main__": 265 | Fire(main) 266 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Karakeep Python API Client 2 | 3 | [![PyPI version](https://badge.fury.io/py/karakeep-python-api.svg)](https://badge.fury.io/py/karakeep-python-api) 4 | 5 | A community-developed Python client for the [Karakeep](https://karakeep.app/) API. 6 | 7 | **Disclaimer:** This is an unofficial, community-driven project. The developers of Karakeep were not consulted during its creation. Use at your own discretion. 8 | 9 | ## Table of Contents 10 | 11 | - [Overview](#overview) 12 | - [Current Status & Caveats](#current-status--caveats) 13 | - [API Method Coverage](#api-method-coverage) 14 | - [Installation](#installation) 15 | - [Usage](#usage) 16 | - [Environment Variables](#environment-variables) 17 | - [Command Line Interface (CLI)](#command-line-interface-cli) 18 | - [Python Library](#python-library) 19 | - [Community Scripts](#community-scripts) 20 | - [Development](#development) 21 | 22 | ## Overview 23 | 24 | This library provides a Python interface (both a class and a command-line tool) to interact with a Karakeep instance's API. The author also developed [freshrss_to_karakeep](https://github.com/thiswillbeyourgithub/freshrss_to_karakeep), a Python script that periodically sends FreshRSS "favourite" articles to Karakeep (a bookmarking and read-it-later app, see [Karakeep on GitHub](https://github.com/karakeep-app/karakeep)). 25 | 26 | The development process involved: 27 | 28 | 1. Starting with the official Karakeep OpenAPI specification: [karakeep-openapi-spec.json](https://github.com/karakeep-app/karakeep/blob/main/packages/open-api/karakeep-openapi-spec.json). 29 | 2. Generating Pydantic data models from the specification using [datamodel-code-generator](https://koxudaxi.github.io/datamodel-code-generator/). 30 | 3. Using [aider.chat](https://aider.chat), an AI pair programming tool, to write the `KarakeepAPI` client class, the Click-based CLI, and the initial Pytest suite. 31 | 32 | ## Current Status & Caveats 33 | 34 | * **Experimental Methods:** The included Pytest suite currently only covers a subset of the available API methods (primarily 'get all' endpoints and client initialization). Methods *not* explicitly tested should be considered **experimental**. 35 | * **Ongoing Development:** The author intends to improve and validate methods as they are needed for personal use cases. Contributions and bug reports are welcome! 36 | * **Updating process**: I keep local scripts that track changes to the OpenAPI spec on the server side. From time to time I use aider to make the code reflect the latest spec and push that to the `dev` branch.
When there is a new karakeep release, I will merge the `dev` branch to the `main` branch and create my own release. This way, `dev` hopefully is up to date with the latest code of karakeep, while `main` is up to date with the latest release of karakeep. 37 | 38 | ## API Method Coverage 39 | 40 | The following table lists the public methods available in the `KarakeepAPI` class. 41 | * The "Pytest" column indicates whether the Python library method is covered by the automated test suite (`tests/test_karakeep_api.py`). 42 | * The "CLI" column indicates whether the corresponding CLI command for that method is tested within the Pytest suite (typically via `subprocess`). 43 | Methods or CLI commands marked with ❌ should be used with caution as their behavior has not been automatically verified within the test suite. 44 | 45 | | Method Name | Pytest | CLI | Remarks | 46 | | -------------------------------- | :----: | :--: | -------------------------------------------- | 47 | | `get_all_bookmarks` | ✅ | ✅ | Tested with pagination. | 48 | | `create_a_new_bookmark` | ✅ | ❌ | Pytest for `type="link"` via fixture and `type="asset"` via PDF test. CLI not directly tested. | 49 | | `search_bookmarks` | ✅ | ✅ | Seems to be nondeterministic and fails if using more than 3 words | 50 | | `get_a_single_bookmark` | ✅ | ❌ | | 51 | | `delete_a_bookmark` | ✅ | ❌ | | 52 | | `update_a_bookmark` | ✅ | ✅ | Tested for title updates. | 53 | | `summarize_a_bookmark` | ❌ | ❌ | | 54 | | `attach_tags_to_a_bookmark` | ✅ | ❌ | | 55 | | `detach_tags_from_a_bookmark` | ✅ | ❌ | | 56 | | `get_highlights_of_a_bookmark` | ❌ | ❌ | Works from the CLI; not yet added to Pytest. | 57 | | `attach_asset` | ❌ | ❌ | | 58 | | `replace_asset` | ❌ | ❌ | | 59 | | `detach_asset` | ❌ | ❌ | | 60 | | `get_all_lists` | ✅ | ✅ | | 61 | | `create_a_new_list` | ✅ | ❌ | | 62 | | `get_a_single_list` | ✅ | ❌ | | 63 | | `delete_a_list` | ✅ | ❌ | | 64 | | `update_a_list` | ❌ | ❌ | | 65 | | `get_bookmarks_in_the_list` | ❌ | ❌ | | 66 | | `add_a_bookmark_to_a_list` | ❌ | ❌ | | 67 | | `remove_a_bookmark_from_a_list` | ❌ | ❌ | | 68 | | `get_all_tags` | ✅ | ✅ | | 69 | | `create_a_new_tag` | ❌ | ❌ | | 70 | | `get_a_single_tag` | ✅ | ❌ | | 71 | | `delete_a_tag` | ✅ | ❌ | | 72 | | `update_a_tag` | ✅ | ❌ | No output validation due to [server bug](https://github.com/karakeep-app/karakeep/issues/1365). | 73 | | `get_bookmarks_with_the_tag` | ❌ | ❌ | | 74 | | `get_all_highlights` | ✅ | ✅ | Tested with pagination. | 75 | | `create_a_new_highlight` | ❌ | ❌ | | 76 | | `get_a_single_highlight` | ❌ | ❌ | | 77 | | `delete_a_highlight` | ❌ | ❌ | Works from the CLI; not yet added to Pytest. | 78 | | `update_a_highlight` | ❌ | ❌ | | 79 | | `upload_a_new_asset` | ✅ | ❌ | Tested in PDF asset lifecycle test. | 80 | | `get_a_single_asset` | ✅ | ❌ | Tested in PDF asset lifecycle test. | 81 | | `get_current_user_info` | ✅ | ❌ | Pytest: Tested indirectly during client init. CLI not directly tested. | 82 | | `get_current_user_stats` | ✅ | ✅ | | 83 | | `update_user` | ❌ | ❌ | | 84 | | `get_all_backups` | ✅ | ✅ | Tested in backup lifecycle test. | 85 | | `trigger_a_new_backup` | ✅ | ❌ | Tested in backup lifecycle test. | 86 | | `get_a_single_backup` | ✅ | ❌ | Tested in backup lifecycle test. | 87 | | `delete_a_backup` | ✅ | ❌ | Tested in backup lifecycle test. | 88 | | `download_a_backup` | ✅ | ❌ | Tested in backup lifecycle test. 
| 89 | 90 | ## Installation 91 | 92 | It is recommended to use `uv` for faster installation: 93 | 94 | ```bash 95 | uv pip install karakeep-python-api 96 | ``` 97 | 98 | Alternatively, use standard `pip`: 99 | 100 | ```bash 101 | pip install karakeep-python-api 102 | ``` 103 | 104 | ## Usage 105 | 106 | This package can be used as a Python library or as a command-line interface (CLI). 107 | 108 | ### Environment Variables 109 | 110 | The client can be configured using the following environment variables: 111 | 112 | * `KARAKEEP_PYTHON_API_ENDPOINT`: **Required**. The full URL of your Karakeep API, including the `/api/v1/` path (e.g., `https://karakeep.domain.com/api/v1/` or `https://try.karakeep.app/api/v1/`). 113 | * `KARAKEEP_PYTHON_API_KEY`: **Required**. Your Karakeep API key (Bearer token). 114 | * `KARAKEEP_PYTHON_API_VERIFY_SSL`: Set to `false` to disable SSL certificate verification (default: `true`). 115 | * `KARAKEEP_PYTHON_API_VERBOSE`: Set to `true` to enable verbose debug logging for the client and CLI (default: `false`). 116 | * `KARAKEEP_PYTHON_API_DISABLE_RESPONSE_VALIDATION`: Set to `true` to disable Pydantic validation of API responses. The client will return raw dictionary/list data instead of Pydantic models (default: `false`). 117 | * `KARAKEEP_PYTHON_API_ENSURE_ASCII`: Set to `true` to escape non-ASCII characters in the JSON output (default: `false`, which means Unicode characters are kept). 118 | 119 | ### Command Line Interface (CLI) 120 | 121 | The CLI dynamically generates commands based on the API methods. You need to provide your API key and endpoint either via environment variables (recommended) or command-line options. 122 | 123 | **Basic Structure:** 124 | 125 | ```bash 126 | python -m karakeep_python_api [GLOBAL_OPTIONS] [COMMAND_OPTIONS] 127 | ``` 128 | 129 | **Getting Help:** 130 | 131 | ```bash 132 | # General help and list of commands 133 | python -m karakeep_python_api --help 134 | 135 | # Help for a specific command 136 | python -m karakeep_python_api get-all-bookmarks --help 137 | ``` 138 | 139 | **Examples:** 140 | 141 | ```bash 142 | # List all tags (requires env vars set) 143 | python -m karakeep_python_api get-all-tags 144 | 145 | # Get the first page of bookmarks with a limit, overriding env vars if needed 146 | # Note: The /api/v1/ path will be automatically appended if not present 147 | python -m karakeep_python_api --base-url https://karakeep.domain.com/api/v1/ --api-key YOUR_API_KEY get-all-bookmarks --limit 10 148 | 149 | # Get all lists and pipe the JSON output to jq to extract the first list 150 | python -m karakeep_python_api get-all-lists | jq '.[0]' 151 | 152 | # Create a new bookmark from a link (body provided as JSON string) 153 | python -m karakeep_python_api create-a-new-bookmark --data '{"type": "link", "url": "https://example.com"}' 154 | 155 | # Get all tags and ensure ASCII output (e.g., for compatibility with systems that don't handle Unicode well) 156 | python -m karakeep_python_api --ascii get-all-tags 157 | 158 | # Dump the raw OpenAPI spec used by the client 159 | python -m karakeep_python_api --dump-openapi-specification 160 | ``` 161 | 162 | ### Python Library 163 | 164 | Import the `KarakeepAPI` class and instantiate it. 
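For example, assuming a self-hosted instance at `karakeep.domain.com` (placeholder values — replace them with your own endpoint and API key), the two required environment variables can be exported in your shell before running the snippet below:

```bash
# Placeholders: point these at your own Karakeep instance and API key
export KARAKEEP_PYTHON_API_ENDPOINT="https://karakeep.domain.com/api/v1/"
export KARAKEEP_PYTHON_API_KEY="your_secret_api_key"
```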
165 | 166 | ```python 167 | import os 168 | from karakeep_python_api import KarakeepAPI, APIError, AuthenticationError, datatypes 169 | 170 | # Ensure required environment variables are set 171 | # Example: os.environ["KARAKEEP_PYTHON_API_ENDPOINT"] = "https://karakeep.domain.com/api/v1/" 172 | # Example: os.environ["KARAKEEP_PYTHON_API_KEY"] = "your_secret_api_key" 173 | 174 | try: 175 | # Initialize the client (reads from env vars by default) 176 | client = KarakeepAPI( 177 | # Optionally override env vars: 178 | # api_endpoint="https://karakeep.domain.com/api/v1/", 179 | # api_key="another_key", 180 | # verbose=True, 181 | # disable_response_validation=False 182 | ) 183 | 184 | # Example: Get all lists 185 | all_lists = client.get_all_lists() 186 | if all_lists: 187 | print(f"Retrieved {len(all_lists)} lists.") 188 | # Access list properties (uses Pydantic models by default) 189 | print(f"First list name: {all_lists[0].name}") 190 | print(f"First list ID: {all_lists[0].id}") 191 | else: 192 | print("No lists found.") 193 | 194 | # Example: Get first page of bookmarks 195 | bookmarks_page = client.get_all_bookmarks(limit=5) 196 | print(f"\nRetrieved {len(bookmarks_page.bookmarks)} bookmarks.") 197 | if bookmarks_page.bookmarks: 198 | print(f"First bookmark title: {bookmarks_page.bookmarks[0].title}") 199 | if bookmarks_page.nextCursor: 200 | print(f"Next page cursor: {bookmarks_page.nextCursor}") 201 | 202 | 203 | except AuthenticationError as e: 204 | print(f"Authentication failed: {e}") 205 | except APIError as e: 206 | print(f"An API error occurred: {e}") 207 | except ValueError as e: 208 | # Handles missing API key/endpoint during initialization 209 | print(f"Configuration error: {e}") 210 | except Exception as e: 211 | print(f"An unexpected error occurred: {e}") 212 | 213 | ``` 214 | 215 | ## Community Scripts 216 | 217 | Community Scripts are standalone scripts made to solve specific issues. They are written by the community, so don't hesitate to submit your own or to open an issue if you hit a bug. They also serve as examples of how to use the API. 218 | 219 | They can be found in the [./community_scripts](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts) folder. The contribution guidelines are in the README.md file of the community_scripts directory. 220 | 221 | | Community Script | Description | 222 | |----------------|--------------------------------------------------------------------------------------------------------------| 223 | | [Karakeep-Time-Tagger](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/karakeep-time-tagger) | Automatically adds time-to-read tags (`0-5m`, `5-10m`, etc.) to bookmarks based on content length analysis. Includes systemd service and timer files for automated periodic execution. | 224 | | [Karakeep-List-To-Tag](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/karakeep-list-to-tag) | Converts a Karakeep list into tags by adding a specified tag to all bookmarks within that list. | 225 | | [Omnivore2Karakeep-Highlights](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/omnivore2karakeep-highlights) | Imports highlights from Omnivore export data to Karakeep, with intelligent position detection and bookmark matching. Supports dry-run mode for testing.
| 226 | | [Omnivore2Karakeep-Archived](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/omnivore2karakeep-archived) | (Should not be needed anymore) Fixes the archived status of bookmarks imported from Omnivore by reading export data and updating Karakeep accordingly. | 227 | | [pocket2karakeep-archived](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/pocket2karakeep-archived) by [@youenchene](https://github.com/youenchene) | (Should not be needed anymore) Fixes the archived status of bookmarks imported from Pocket by reading export data and updating Karakeep accordingly. | 228 | | [Karakeep-Archive-Before-Date](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/karakeep-archive-before-date) by [@youenchene](https://github.com/youenchene) | Archives all non-archived bookmarks older than a given date | 229 | | [Freshrss-To-Karakeep](https://github.com/thiswillbeyourgithub/freshrss_to_karakeep) | Periodically sends FreshRSS "favourite" articles to Karakeep | 230 | | [Karanki (WIP)](https://github.com/thiswillbeyourgithub/Karanki) | Bidirectional sync between Anki notes and Karakeep highlights | 231 | | [Karakeep-remove-ai-tags](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/karakeep-remove-ai-tags) by [@youenchene](https://github.com/youenchene) | Removes all tags that were attached by AI and not by a human | 232 | 233 | ## Development 234 | 235 | 1. Clone the repository. 236 | 2. Create a virtual environment and activate it. 237 | 3. Install dependencies, including development tools (using `uv` recommended): 238 | 239 | ```bash 240 | uv pip install -e ".[dev]" 241 | ``` 242 | 4. Set the required environment variables (`KARAKEEP_PYTHON_API_ENDPOINT`, `KARAKEEP_PYTHON_API_KEY`) for running tests against a live instance. 243 | 5. Run tests: 244 | 245 | ```bash 246 | pytest 247 | ``` 248 | 249 | 250 | --- 251 | 252 | *This README was generated with assistance from [aider.chat](https://aider.chat).* 253 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-highlights/omnivore2karakeep-highlights.py: -------------------------------------------------------------------------------- 1 | import re 2 | import markdown 3 | from Levenshtein import ratio 4 | import re 5 | from typing import Optional 6 | import json 7 | import pickle 8 | from karakeep_python_api import KarakeepAPI 9 | from tqdm import tqdm 10 | from pathlib import Path 11 | from bs4 import BeautifulSoup 12 | from html2text import html2text 13 | import fire 14 | 15 | from string_context_matcher import match_highlight_to_corpus 16 | 17 | VERSION: str = "1.0.0" 18 | 19 | 20 | def find_highlight_position( 21 | highlight: str, as_text: str, as_md: str, kara_content: str 22 | ) -> tuple[int, int]: 23 | """ 24 | Find the start and end positions of a highlight within the document content. 25 | 26 | This function uses multiple strategies to locate highlights: 27 | 1. Direct text matching in plain text 28 | 2. Markdown content matching with position scaling 29 | 3. Fuzzy matching using string context matcher 30 | 4.
Link extraction for highlights containing only links 31 | 32 | Parameters 33 | ---------- 34 | highlight : str 35 | The original highlight text (may contain markdown/HTML) 36 | as_text : str 37 | The full document content as plain text 38 | as_md : str 39 | The full document content as markdown 40 | kara_content : str 41 | The raw HTML content of the document 42 | 43 | Returns 44 | ------- 45 | tuple[int, int] 46 | A tuple containing (start_position, end_position) of the highlight 47 | """ 48 | # Convert highlight to plain text for matching 49 | high_as_text = BeautifulSoup(markdown.markdown(highlight), "html.parser").get_text() 50 | 51 | start = 0 52 | 53 | # Strategy 1: Direct text matching 54 | if high_as_text in as_text: 55 | start = as_text.index(high_as_text) 56 | 57 | # Strategy 2: Markdown content matching with position scaling 58 | if highlight in as_md: 59 | if start == 0: 60 | start = int(as_md.index(highlight) / len(as_md) * len(as_text)) 61 | else: 62 | start = ( 63 | start + int(as_md.index(highlight) / len(as_md) * len(as_text)) 64 | ) // 2 65 | 66 | # Strategy 3: Fuzzy matching when direct matching fails 67 | if start == 0: 68 | match_text = match_highlight_to_corpus(query=high_as_text, corpus=as_text) 69 | match_md = match_highlight_to_corpus(query=highlight, corpus=as_md) 70 | 71 | if match_text.matches and match_md.matches: 72 | position_text = as_text.index(match_text.matches[0]) / len(as_text) 73 | position_md = as_md.index(match_md.matches[0]) / len(as_md) 74 | diff = abs(position_text - position_md) 75 | 76 | if diff >= 0.20: 77 | # if differ too much, assume html has a too large overhead 78 | rel_pos = position_md 79 | else: 80 | rel_pos = (position_text + position_md) / 2 81 | del diff 82 | elif match_text.matches: 83 | rel_pos = as_text.index(match_text.matches[0]) / len(as_text) 84 | elif match_md.matches: 85 | rel_pos = as_md.index(match_md.matches[0]) / len(as_md) 86 | elif not high_as_text: # probably contains only a link, so we have to find that link in the raw html 87 | links = re.findall( 88 | r"\bhttp:\/\/[-\w+&@#\/%?=~()|!:,.;]*[-\w+&@#\/%=~()|]", 89 | highlight, 90 | ) 91 | positions = [ 92 | kara_content.index(link) for link in links if link in kara_content 93 | ] 94 | assert positions, highlight 95 | rel_pos = int(sum(positions) / len(positions)) 96 | else: 97 | raise ValueError( 98 | f"Could not match highlight text to corpus for highlight: {highlight[:100]}{'...' if len(highlight) > 100 else ''}" 99 | ) 100 | start = int(rel_pos * len(high_as_text)) 101 | del rel_pos 102 | 103 | end = start + len(high_as_text) 104 | return start, end 105 | 106 | 107 | def find_matching_bookmark( 108 | omnivore: dict, url: str, all_bm: list, is_pdf: bool, threshold: float = 0.95 109 | ): 110 | """ 111 | Find a matching bookmark in Karakeep based on Omnivore bookmark data. 112 | 113 | This function attempts to match an Omnivore bookmark to a Karakeep bookmark using 114 | multiple strategies: URL matching, exact title matching, and fuzzy title matching. 
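A fuzzy title match is only accepted when its Levenshtein ratio reaches the `threshold` parameter (0.95 by default).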
115 | 116 | Parameters 117 | ---------- 118 | omnivore : dict 119 | The Omnivore bookmark data containing URL and title information 120 | url : str 121 | The URL of the Omnivore bookmark 122 | all_bm : list 123 | List of all Karakeep bookmarks to search through 124 | is_pdf : bool 125 | Whether the bookmark is a PDF (not yet supported) 126 | threshold : float, optional 127 | Minimum similarity threshold for fuzzy matching, by default 0.95 128 | 129 | Returns 130 | ------- 131 | bookmark 132 | The matched Karakeep bookmark 133 | 134 | Raises 135 | ------ 136 | NotImplementedError 137 | If the bookmark is a PDF (not yet supported) 138 | RuntimeError 139 | If no matching bookmark is found or if bookmark content lacks URL attributes 140 | """ 141 | found_bm = False 142 | best_bookmark = None 143 | best_score = 0.0 144 | 145 | for bookmark in all_bm: 146 | found_url = None 147 | content = bookmark.content 148 | 149 | if is_pdf: 150 | raise NotImplementedError("PDF highlights are not yet supported") 151 | 152 | if hasattr(content, "url"): 153 | found_url = content.url 154 | elif hasattr(content, "sourceUrl"): 155 | found_url = content.sourceUrl 156 | else: 157 | raise RuntimeError( 158 | f"Bookmark content has no 'url' or 'sourceUrl' attribute. Available attributes: {[attr for attr in dir(content) if not attr.startswith('_')]}" 159 | ) 160 | 161 | # handling local PDF, they don't have proper url 162 | if found_url and found_url.startswith("https://omnivore.app"): 163 | found_url = None 164 | 165 | if found_url == url: 166 | found_bm = True 167 | break 168 | 169 | # couldn't find a matching url, match by title 170 | # exact title match: 171 | if ( 172 | "title" in omnivore 173 | and omnivore["title"] 174 | and hasattr(content, "title") 175 | and content.title 176 | ): 177 | if omnivore["title"].lower() == content.title.lower(): 178 | found_bm = True 179 | break 180 | if ( 181 | "title" in omnivore 182 | and omnivore["title"] 183 | and hasattr(bookmark, "title") 184 | and bookmark.title 185 | ): 186 | if omnivore["title"].lower() == bookmark.title.lower(): 187 | found_bm = True 188 | break 189 | 190 | # fuzzy matching, as a last resort - track the best match 191 | if ( 192 | "title" in omnivore 193 | and omnivore["title"] 194 | and hasattr(content, "title") 195 | and content.title 196 | ): 197 | r = ratio(omnivore["title"].lower(), content.title.lower()) 198 | if r > best_score: 199 | best_score = r 200 | best_bookmark = bookmark 201 | 202 | if ( 203 | "title" in omnivore 204 | and omnivore["title"] 205 | and hasattr(bookmark, "title") 206 | and bookmark.title 207 | ): 208 | r = ratio(omnivore["title"].lower(), bookmark.title.lower()) 209 | if r > best_score: 210 | best_score = r 211 | best_bookmark = bookmark 212 | 213 | # Use the best fuzzy match if it meets the threshold 214 | if not found_bm and best_score >= threshold: 215 | found_bm = True 216 | bookmark = best_bookmark 217 | 218 | if not found_bm: 219 | raise RuntimeError( 220 | f"Could not find bookmark for highlight file: {omnivore.get('slug', 'unknown')}" 221 | ) 222 | 223 | return bookmark 224 | 225 | 226 | def load_bookmarks_from_karakeep(karakeep: KarakeepAPI, karakeep_path: str) -> list: 227 | """ 228 | Load all bookmarks from Karakeep API, using local cache if available. 229 | 230 | This function fetches all bookmarks from the Karakeep instance, with content included. 231 | To avoid repeated API calls during development, bookmarks are cached locally. 
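Delete the cache file at `karakeep_path` to force a fresh fetch from the API.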
232 | 233 | Parameters 234 | ---------- 235 | karakeep : KarakeepAPI 236 | The Karakeep API client instance 237 | karakeep_path : str 238 | Path to the local cache file for storing bookmarks 239 | 240 | Returns 241 | ------- 242 | list 243 | List of all bookmarks from the Karakeep instance 244 | """ 245 | if Path(karakeep_path).exists(): 246 | with Path(karakeep_path).open("rb") as f: 247 | all_bm = pickle.load(f) 248 | else: 249 | n = karakeep.get_current_user_stats()["numBookmarks"] 250 | pbar = tqdm(total=n, desc="Fetching bookmarks") 251 | all_bm = [] 252 | batch_size = 100 # if you set it too high, you can crash the karakeep instance, 100 being the maximum allowed 253 | page = karakeep.get_all_bookmarks( 254 | include_content=True, 255 | limit=batch_size, 256 | ) 257 | all_bm.extend(page.bookmarks) 258 | pbar.update(len(all_bm)) 259 | while page.nextCursor: 260 | page = karakeep.get_all_bookmarks( 261 | include_content=True, 262 | limit=batch_size, 263 | cursor=page.nextCursor, 264 | ) 265 | all_bm.extend(page.bookmarks) 266 | pbar.update(len(page.bookmarks)) 267 | 268 | assert len(all_bm) == n, ( 269 | f"Only retrieved {len(all_bm)} bookmarks instead of {n}" 270 | ) 271 | pbar.close() 272 | 273 | with Path(karakeep_path).open("wb") as f: 274 | pickle.dump(all_bm, f) 275 | 276 | return all_bm 277 | 278 | 279 | def get_omnivores_bookmarks(omnivore_export_dir: str) -> list[dict]: 280 | """ 281 | Load and concatenate bookmark data from all Omnivore export metadata files. 282 | 283 | This function searches for metadata files matching the pattern 'metadata_*_to_*.json' 284 | in the specified Omnivore export directory and combines all bookmark data into a 285 | single list. Each metadata file is expected to contain a JSON list of bookmarks. 286 | 287 | Parameters 288 | ---------- 289 | omnivore_export_dir : str 290 | Path to the Omnivore export directory containing metadata files 291 | 292 | Returns 293 | ------- 294 | list[dict] 295 | Combined list of all bookmark dictionaries found in the metadata files. 296 | Returns empty list if no valid metadata files are found. 297 | 298 | Notes 299 | ----- 300 | Files that cannot be decoded as JSON or do not contain lists will be skipped 301 | with warning messages. The function processes files in sorted order to ensure 302 | consistent results. 303 | """ 304 | export_path = Path(omnivore_export_dir) 305 | all_data: list[dict] = [] 306 | 307 | # Glob for metadata files and sort them to ensure consistent order (e.g., by date if named accordingly) 308 | metadata_files = sorted(export_path.glob("metadata_*_to_*.json")) 309 | 310 | if not metadata_files: 311 | print( 312 | f"Warning: No metadata files matching 'metadata_*_to_*.json' found in {omnivore_export_dir}" 313 | ) 314 | return [] 315 | 316 | for file_path in metadata_files: 317 | try: 318 | content = file_path.read_text() 319 | # Each metadata file is expected to contain a JSON list of bookmarks 320 | data_from_file: list[dict] = json.loads(content) 321 | if isinstance(data_from_file, list): 322 | all_data.extend(data_from_file) 323 | else: 324 | print( 325 | f"Warning: Metadata file {file_path.name} does not contain a JSON list. Skipping." 326 | ) 327 | except json.JSONDecodeError: 328 | print(f"Warning: Could not decode JSON from {file_path.name}. Skipping.") 329 | except Exception as e: 330 | print( 331 | f"Warning: An error occurred while processing {file_path.name}: {e}. Skipping." 
332 | ) 333 | 334 | return all_data 335 | 336 | 337 | def main( 338 | omnivore_export_dir: str, 339 | karakeep_path: Optional[str] = "./karakeep_bookmarks.temp", 340 | dry: bool = True, 341 | skip_pdf: bool = True, 342 | ) -> None: 343 | """ 344 | Import highlights from Omnivore export to Karakeep. 345 | 346 | This function processes Omnivore export data to import highlights into a Karakeep instance. 347 | It matches Omnivore bookmarks to Karakeep bookmarks and creates corresponding highlights. 348 | 349 | The temporary Karakeep bookmarks cache file will be automatically deleted upon successful 350 | completion to avoid leaving temporary files behind. 351 | 352 | Parameters 353 | ---------- 354 | omnivore_export_dir : str 355 | Path to the Omnivore export directory containing highlights and content 356 | karakeep_path : str, optional 357 | Path to temporary file for caching Karakeep bookmarks, by default "./karakeep_bookmarks.temp" 358 | dry : bool, optional 359 | If True, only simulate the import without actually creating highlights, by default True 360 | skip_pdf : bool, optional 361 | If True, skip processing PDF highlights (not yet supported), by default True 362 | """ 363 | omnivore_export_path = Path(omnivore_export_dir) 364 | highlights_dir_path = omnivore_export_path / "highlights" 365 | omnivore_content_dir_path = omnivore_export_path / "content" 366 | 367 | assert omnivore_export_path.exists() and omnivore_export_path.is_dir(), ( 368 | f"Omnivore export directory not found: {omnivore_export_dir}" 369 | ) 370 | assert highlights_dir_path.exists() and highlights_dir_path.is_dir(), ( 371 | f"Highlights directory not found: {highlights_dir_path}" 372 | ) 373 | assert omnivore_content_dir_path.exists() and omnivore_content_dir_path.is_dir(), ( 374 | f"Omnivore content directory not found: {omnivore_content_dir_path}" 375 | ) 376 | 377 | highlights_files = [ 378 | p 379 | for p in highlights_dir_path.iterdir() 380 | if p.name.endswith(".md") and p.read_text().strip() 381 | ] 382 | content_files: dict = { 383 | p.stem: p.suffix for p in omnivore_content_dir_path.iterdir() 384 | } 385 | 386 | data = get_omnivores_bookmarks(omnivore_export_dir) 387 | 388 | karakeep = KarakeepAPI(verbose=False) 389 | 390 | # fetch all the bookmarks from karakeep, as the search feature is unreliable 391 | # as the loading can be pretty long, we store it to a local file 392 | all_bm = load_bookmarks_from_karakeep(karakeep, karakeep_path) 393 | 394 | for f_ind, f in enumerate( 395 | tqdm(highlights_files, unit="highlight", desc="importing highlights") 396 | ): 397 | name = f.stem 398 | 399 | highlights = f.read_text().strip().split("\n> ") 400 | highlights = [h.strip() for h in highlights if h.strip()] 401 | if not highlights: 402 | continue 403 | 404 | found_omni = False 405 | for omnivore in data: 406 | if omnivore["slug"] == name: 407 | found_omni = True 408 | break 409 | 410 | if not found_omni: 411 | print("Couldn't find the omnivore 'bookmark' for that highlight") 412 | raise RuntimeError( 413 | f"Could not find omnivore bookmark for highlight file: {name}" 414 | ) 415 | url = omnivore["url"] 416 | 417 | # check if the highlight is from a pdf or an html 418 | assert name in content_files, name 419 | if content_files[name] == ".pdf": 420 | is_pdf = True 421 | if skip_pdf: 422 | continue 423 | else: 424 | raise NotImplementedError("PDF highlights are not yet supported") 425 | elif content_files[name] == ".html": 426 | is_pdf = False 427 | else: 428 | raise RuntimeError( 429 | f"Unexpected file extension 
'{content_files[name]}' for file '{name}'. Expected '.pdf' or '.html'" 430 | ) 431 | 432 | bookmark = find_matching_bookmark(omnivore, url, all_bm, is_pdf) 433 | 434 | kara_content = bookmark.content.htmlContent 435 | 436 | if not kara_content: 437 | print( 438 | f"Skipping bookmark '{bookmark.title or name}' (ID: {bookmark.id}) - no HTML content available" 439 | ) 440 | continue 441 | 442 | as_md = html2text(kara_content, bodywidth=9999999) 443 | 444 | as_text = BeautifulSoup(kara_content, "html.parser").get_text() 445 | 446 | for highlight in highlights: 447 | if highlight.startswith("> "): 448 | highlight = highlight[1:] 449 | highlight = highlight.strip()  # drop the leftover leading whitespace 450 | 451 | # fix URLs of omnivore to point to the original source 452 | highlight = re.sub( 453 | r"https://proxy-prod.omnivore-image-cache.app/.*https://", 454 | "https://", 455 | highlight, 456 | ) 457 | 458 | link_pattern = r"\[.*?\]\((.*?)\)" 459 | link_replaced = re.sub(link_pattern, r" (Link to \1)", highlight) 460 | high_link_replaced_as_text = BeautifulSoup( 461 | markdown.markdown(link_replaced), "html.parser" 462 | ).get_text() 463 | 464 | if not high_link_replaced_as_text: 465 | assert high_link_replaced_as_text, ( 466 | f"Empty highlight text after processing. Original highlight: {highlight[:200]}{'...' if len(highlight) > 200 else ''}, Link replaced: {link_replaced[:200]}{'...' if len(link_replaced) > 200 else ''}" 467 | ) 468 | 469 | start, end = find_highlight_position( 470 | highlight=highlight, 471 | as_text=as_text, 472 | as_md=as_md, 473 | kara_content=kara_content, 474 | ) 475 | 476 | if not dry: 477 | # Create metadata dict for this highlight 478 | highlight_metadata = { 479 | "omnivore_bookmark_id": omnivore["id"], 480 | "omnivore_highlight_filename": f.name, 481 | "omnivore_highlight_importer_version": VERSION, 482 | } 483 | 484 | resp = karakeep.create_a_new_highlight( 485 | bookmark_id=bookmark.id, 486 | start_offset=start, 487 | end_offset=end, 488 | color="yellow", 489 | text=high_link_replaced_as_text, 490 | note=json.dumps(highlight_metadata, ensure_ascii=False), 491 | ) 492 | assert resp, highlight 493 | 494 | 495 | if __name__ == "__main__": 496 | fire.Fire(main) 497 | -------------------------------------------------------------------------------- /community_scripts/karakeep-time-tagger/karakeep-time-tagger.py: -------------------------------------------------------------------------------- 1 | """Adds time-to-read tags to bookmarks based on content length.""" 2 | 3 | import sys 4 | import pickle 5 | import fire 6 | from pathlib import Path 7 | from tqdm import tqdm 8 | from bs4 import BeautifulSoup 9 | from loguru import logger 10 | from karakeep_python_api import KarakeepAPI 11 | 12 | 13 | VERSION: str = "1.3.0" 14 | 15 | 16 | class AddTimeToRead: 17 | """Class to add time-to-read tags to bookmarks based on content length.""" 18 | 19 | # Define the time-to-read tags 20 | TIME_TAGS = ["0-1m", "1-5m", "5-10m", "10-15m", "15-30m", "30m+"] 21 | 22 | def __init__(self): 23 | """Initialize the AddTimeToRead class.""" 24 | self.karakeep = None 25 | 26 | def create_time_reading_lists(self): 27 | """Create smart lists for each time-to-read tag.""" 28 | # Get existing lists to avoid duplicates 29 | try: 30 | existing_lists = self.karakeep.get_all_lists() 31 | existing_list_names = {lst.name for lst in existing_lists} 32 | logger.info(f"Found {len(existing_lists)} existing lists") 33 | except Exception as e: 34 | logger.error(f"Failed to fetch existing lists: {e}") 35 | return 36 | 37 | # Mapping of time tags to
zero-padded list names and descriptions 38 | list_configs = [ 39 | { 40 | "name": "00-05m", 41 | "tag": "0-5m", 42 | "description": "Quick reads under 5 minutes", 43 | }, 44 | { 45 | "name": "05-10m", 46 | "tag": "5-10m", 47 | "description": "Short reads 5-10 minutes", 48 | }, 49 | { 50 | "name": "10-15m", 51 | "tag": "10-15m", 52 | "description": "Medium reads 10-15 minutes", 53 | }, 54 | { 55 | "name": "15-30m", 56 | "tag": "15-30m", 57 | "description": "Long reads 15-30 minutes", 58 | }, 59 | { 60 | "name": "30m+", 61 | "tag": "30m+", 62 | "description": "Extended reads over 30 minutes", 63 | }, 64 | ] 65 | 66 | for config in list_configs: 67 | list_name = config["name"] 68 | 69 | # Skip if list already exists 70 | if list_name in existing_list_names: 71 | logger.info(f"List '{list_name}' already exists, skipping creation") 72 | continue 73 | 74 | tag_name = config["tag"] 75 | description = config["description"] 76 | query = f"#{tag_name} -is:archived" 77 | 78 | logger.info(f"Creating smart list '{list_name}' with query '{query}'") 79 | 80 | try: 81 | result = self.karakeep.create_a_new_list( 82 | name=list_name, 83 | icon="⏱️", # Clock icon for time-based lists 84 | description=description, 85 | list_type="smart", 86 | query=query, 87 | ) 88 | logger.info( 89 | f"Successfully created list '{list_name}' with ID: {result.id if hasattr(result, 'id') else 'unknown'}" 90 | ) 91 | 92 | except Exception as e: 93 | logger.error(f"Failed to create list '{list_name}': {e}") 94 | 95 | def setup_logging(self, verbose: bool = False): 96 | """Setup loguru logging with file output and console output based on verbosity.""" 97 | # Remove default logger 98 | # logger.remove() 99 | 100 | # Add file logger with debug level 101 | logger.add("karakeep-time-tagger.log", level="DEBUG", rotation="10 MB") 102 | 103 | # Add console logger based on verbosity 104 | if verbose: 105 | logger.add(sys.stderr, level="DEBUG") 106 | else: 107 | logger.add(sys.stderr, level="INFO") 108 | 109 | def extract_content_text(self, bookmark) -> str: 110 | """ 111 | Extract text content from bookmark based on its type. 112 | 113 | Args: 114 | bookmark: Bookmark object with content 115 | 116 | Returns: 117 | str: Text content to analyze 118 | """ 119 | if bookmark.content.type == "link": 120 | # For link bookmarks, content is in bookmark.content.content 121 | return bookmark.content.htmlContent 122 | elif bookmark.content.type == "text": 123 | # For text bookmarks, content is in bookmark.content.text 124 | return bookmark.content.text 125 | else: 126 | logger.debug(f"Unsupported content type: {bookmark.content.type}") 127 | return "" 128 | 129 | def estimate_reading_time(self, bookmark, wpm: int) -> str: 130 | """ 131 | Estimate reading time for given bookmark and return appropriate tag. 
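For example, at the default 200 wpm, a 1000-word article works out to about 5 minutes and falls in the "0-5m" bucket.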
132 | 133 | Args: 134 | bookmark: Bookmark object to analyze 135 | wpm: Words per minute reading speed 136 | 137 | Returns: 138 | str: Time tag (0-5m, 5-10m, 10-15m, 15-30m, 30m+) 139 | """ 140 | # Extract text content based on bookmark type 141 | content = self.extract_content_text(bookmark) 142 | 143 | if not content: 144 | logger.debug("Empty content, returning 0-5m tag") 145 | return "0-5m" 146 | 147 | # Parse HTML and extract text 148 | soup = BeautifulSoup(content, "html.parser") 149 | text = soup.get_text() 150 | 151 | # Count words (split by whitespace) 152 | word_count = len(text.split()) 153 | logger.debug(f"Word count: {word_count}") 154 | 155 | # Calculate reading time in minutes 156 | reading_time_minutes = word_count / wpm 157 | logger.debug(f"Estimated reading time: {reading_time_minutes:.2f} minutes") 158 | 159 | # Determine appropriate tag 160 | if reading_time_minutes <= 5: 161 | return "0-5m" 162 | elif reading_time_minutes <= 10: 163 | return "5-10m" 164 | elif reading_time_minutes <= 15: 165 | return "10-15m" 166 | elif reading_time_minutes <= 30: 167 | return "15-30m" 168 | else: 169 | return "30m+" 170 | 171 | def get_current_time_tags(self, bookmark) -> list: 172 | """Get list of current time-to-read tags on bookmark.""" 173 | if not bookmark.tags: 174 | return [] 175 | 176 | current_time_tags = [] 177 | for tag in bookmark.tags: 178 | if tag.name in self.TIME_TAGS: 179 | current_time_tags.append(tag.name) 180 | 181 | return current_time_tags 182 | 183 | def should_skip_bookmark(self, bookmark, reset_all: bool) -> bool: 184 | """ 185 | Determine if bookmark should be skipped based on reset_all setting. 186 | 187 | Logic: 188 | - If reset_all is True: never skip 189 | - If reset_all is False: 190 | - When using search mode, we've already filtered to untagged bookmarks, so never skip 191 | - This method is kept for consistency but simplified logic when not reset_all 192 | """ 193 | if reset_all: 194 | return False 195 | 196 | # When reset_all is False, we've already used search to find untagged bookmarks 197 | # So we should process all bookmarks in the filtered set 198 | # However, we still check for multiple time tags that might need reset 199 | current_time_tags = self.get_current_time_tags(bookmark) 200 | 201 | # If exactly one time tag, this shouldn't happen in search mode but handle gracefully 202 | if len(current_time_tags) == 1: 203 | logger.debug( 204 | f"Unexpected: bookmark {bookmark.id} found in search but has time tag: {current_time_tags[0]}" 205 | ) 206 | return True 207 | 208 | # Process bookmarks with multiple time tags or no time tags 209 | return False 210 | 211 | def needs_reset(self, bookmark) -> bool: 212 | """Check if bookmark has multiple time tags and needs reset.""" 213 | current_time_tags = self.get_current_time_tags(bookmark) 214 | return len(current_time_tags) > 1 215 | 216 | def process_bookmark(self, bookmark, wpm: int): 217 | """Process a single bookmark to add appropriate time-to-read tag.""" 218 | logger.debug(f"Processing bookmark {bookmark.id}: {bookmark.title}") 219 | 220 | # Only process link and text bookmarks 221 | if bookmark.content.type not in ["link", "text"]: 222 | logger.debug( 223 | f"Skipping bookmark {bookmark.id} - type {bookmark.content.type} not supported" 224 | ) 225 | return 226 | 227 | # Estimate reading time 228 | target_tag = self.estimate_reading_time(bookmark, wpm) 229 | logger.debug(f"Target tag for bookmark {bookmark.id}: {target_tag}") 230 | 231 | # Get current time tags 232 | current_time_tags = 
self.get_current_time_tags(bookmark) 233 | logger.debug( 234 | f"Current time tags for bookmark {bookmark.id}: {current_time_tags}" 235 | ) 236 | 237 | # If bookmark already has the correct tag and no others, skip 238 | if current_time_tags == [target_tag]: 239 | logger.debug(f"Bookmark {bookmark.id} already has correct tag") 240 | return 241 | 242 | # Remove all existing time tags if any 243 | if current_time_tags: 244 | logger.info( 245 | f"Removing existing time tags {current_time_tags} from bookmark {bookmark.id}" 246 | ) 247 | try: 248 | self.karakeep.detach_tags_from_a_bookmark( 249 | bookmark_id=bookmark.id, tag_names=current_time_tags 250 | ) 251 | except Exception as e: 252 | logger.error(f"Failed to remove tags from bookmark {bookmark.id}: {e}") 253 | return 254 | 255 | # Add the target tag 256 | logger.info( 257 | f"Adding tag '{target_tag}' to bookmark {bookmark.id}: {bookmark.title}" 258 | ) 259 | try: 260 | self.karakeep.attach_tags_to_a_bookmark( 261 | bookmark_id=bookmark.id, tag_names=[target_tag] 262 | ) 263 | except Exception as e: 264 | logger.error(f"Failed to add tag to bookmark {bookmark.id}: {e}") 265 | 266 | def run( 267 | self, 268 | wpm: int = 200, 269 | reset_all: bool = False, 270 | verbose: bool = False, 271 | cache_file: str = "./bookmarks.temp", 272 | create_lists: bool = True, 273 | ): 274 | """ 275 | Main method to process all bookmarks and add time-to-read tags. 276 | 277 | Args: 278 | wpm: Words per minute reading speed (default: 200) 279 | reset_all: If True, process all bookmarks. If False, skip bookmarks that already have a single time tag. 280 | verbose: If True, show debug level logs in console 281 | cache_file: Path to cache file for bookmarks (default: ./bookmarks.temp) 282 | create_lists: If True, create smart lists for each time slot (default: False) 283 | """ 284 | # Setup logging 285 | self.setup_logging(verbose) 286 | 287 | logger.info(f"Starting AddTimeToRead with wpm={wpm}, reset_all={reset_all}") 288 | 289 | # Connect to Karakeep 290 | try: 291 | self.karakeep = KarakeepAPI() 292 | logger.info("Connected to Karakeep API") 293 | except Exception as e: 294 | logger.error(f"Failed to connect to Karakeep API: {e}") 295 | return 296 | 297 | # Create smart lists if requested 298 | if create_lists: 299 | logger.info("Creating smart lists for time-to-read tags...") 300 | self.create_time_reading_lists() 301 | 302 | # Determine cache file name based on reset_all mode 303 | if reset_all: 304 | cache_file_final = cache_file 305 | else: 306 | # Use different cache for untagged bookmarks search 307 | cache_parts = Path(cache_file).parts 308 | cache_file_final = str( 309 | Path(*cache_parts[:-1]) / f"untagged_{cache_parts[-1]}" 310 | ) 311 | 312 | # Fetch bookmarks with content, using cache to speed up testing 313 | # As the loading can be pretty long, we store it to a local file 314 | if reset_all: 315 | if Path(cache_file_final).exists(): 316 | logger.info(f"Loading bookmarks from cache file: {cache_file_final}") 317 | with Path(cache_file_final).open("rb") as f: 318 | bookmarks = pickle.load(f) 319 | logger.info(f"Loaded {len(bookmarks)} bookmarks from cache") 320 | else: 321 | logger.info("Cache file not found, fetching bookmarks from API...") 322 | 323 | # Fetch all bookmarks when reset_all is True 324 | try: 325 | n = self.karakeep.get_current_user_stats()["numBookmarks"] 326 | logger.info(f"Total bookmarks to fetch: {n}") 327 | except Exception as e: 328 | logger.error(f"Failed to get bookmark count: {e}") 329 | return 330 | 331 | 
logger.info("Fetching all bookmarks with content...") 332 | pbar = tqdm(total=n, desc="Fetching bookmarks") 333 | bookmarks = [] 334 | batch_size = 100 # Maximum allowed batch size to avoid crashing the karakeep instance 335 | 336 | try: 337 | page = self.karakeep.get_all_bookmarks( 338 | include_content=True, 339 | limit=batch_size, 340 | ) 341 | bookmarks.extend(page.bookmarks) 342 | pbar.update(len(page.bookmarks)) 343 | 344 | while page.nextCursor: 345 | page = self.karakeep.get_all_bookmarks( 346 | include_content=True, 347 | limit=batch_size, 348 | cursor=page.nextCursor, 349 | ) 350 | bookmarks.extend(page.bookmarks) 351 | pbar.update(len(page.bookmarks)) 352 | 353 | assert len(bookmarks) == n, ( 354 | f"Only retrieved {len(bookmarks)} bookmarks instead of {n}" 355 | ) 356 | pbar.close() 357 | 358 | except Exception as e: 359 | pbar.close() 360 | logger.error(f"Error fetching bookmarks: {e}") 361 | return 362 | 363 | # Save bookmarks to cache file 364 | logger.info( 365 | f"Saving {len(bookmarks)} bookmarks to cache file: {cache_file_final}" 366 | ) 367 | with Path(cache_file_final).open("wb") as f: 368 | pickle.dump(bookmarks, f) 369 | else: 370 | # Use search to find bookmarks without time tags when reset_all is False 371 | search_query = "-#0-5m -#5-10m -#10-15m -#15-30m -#30m+" 372 | logger.info( 373 | f"Searching for bookmarks without time tags using query: {search_query}" 374 | ) 375 | 376 | bookmarks = [] 377 | batch_size = 100 378 | 379 | try: 380 | page = self.karakeep.search_bookmarks( 381 | q=search_query, 382 | include_content=True, 383 | limit=batch_size, 384 | ) 385 | bookmarks.extend(page.bookmarks) 386 | logger.info(f"Found {len(page.bookmarks)} bookmarks in first page") 387 | 388 | while page.nextCursor: 389 | page = self.karakeep.search_bookmarks( 390 | q=search_query, 391 | include_content=True, 392 | limit=batch_size, 393 | cursor=page.nextCursor, 394 | ) 395 | bookmarks.extend(page.bookmarks) 396 | logger.info(f"Found {len(page.bookmarks)} additional bookmarks") 397 | 398 | logger.info(f"Total untagged bookmarks found: {len(bookmarks)}") 399 | 400 | except Exception as e: 401 | logger.error(f"Error searching for untagged bookmarks: {e}") 402 | return 403 | 404 | logger.info(f"Total bookmarks fetched: {len(bookmarks)}") 405 | 406 | # Process bookmarks 407 | processed = 0 408 | skipped_by_policy = 0 409 | skipped_by_type = 0 410 | errors = 0 411 | 412 | for bookmark in tqdm(bookmarks, desc="Processing bookmarks"): 413 | try: 414 | # Check bookmark type first 415 | if bookmark.content.type not in ["link", "text"]: 416 | skipped_by_type += 1 417 | continue 418 | 419 | # Check if we should skip this bookmark based on reset policy 420 | if self.should_skip_bookmark(bookmark, reset_all): 421 | skipped_by_policy += 1 422 | continue 423 | 424 | # Check if bookmark needs reset (has multiple time tags) 425 | if self.needs_reset(bookmark): 426 | logger.info( 427 | f"Bookmark {bookmark.id} has multiple time tags, will be reset" 428 | ) 429 | 430 | # Process the bookmark 431 | self.process_bookmark(bookmark, wpm) 432 | processed += 1 433 | 434 | except Exception as e: 435 | logger.error(f"Error processing bookmark {bookmark.id}: {e}") 436 | errors += 1 437 | 438 | logger.info( 439 | f"Processing complete. 
Processed: {processed}, Skipped (policy): {skipped_by_policy}, Skipped (type): {skipped_by_type}, Errors: {errors}" 440 | ) 441 | 442 | # Clean up cache file after successful completion 443 | if Path(cache_file_final).exists(): 444 | try: 445 | Path(cache_file_final).unlink() 446 | logger.info(f"Cleaned up cache file: {cache_file_final}") 447 | except Exception as e: 448 | logger.warning(f"Failed to delete cache file {cache_file_final}: {e}") 449 | 450 | 451 | def main( 452 | wpm: int = 200, 453 | reset_all: bool = False, 454 | verbose: bool = False, 455 | cache_file: str = "./bookmarks.temp", 456 | create_lists: bool = True, 457 | ): 458 | """ 459 | Main entry point for the script. 460 | 461 | Args: 462 | wpm: Words per minute reading speed (default: 200) 463 | reset_all: If True, process all bookmarks. If False, skip bookmarks that already have a single time tag. 464 | verbose: If True, show debug level logs in console 465 | cache_file: Path to cache file for bookmarks (default: ./bookmarks.temp) 466 | create_lists: If True, create smart lists for each time slot (default: False) 467 | """ 468 | add_time_to_read = AddTimeToRead() 469 | add_time_to_read.run( 470 | wpm=wpm, 471 | reset_all=reset_all, 472 | verbose=verbose, 473 | cache_file=cache_file, 474 | create_lists=create_lists, 475 | ) 476 | 477 | 478 | if __name__ == "__main__": 479 | fire.Fire(main) 480 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-highlights/string_context_matcher.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from math import inf 3 | from dataclasses import dataclass 4 | 5 | from joblib import Parallel, delayed, Memory 6 | from rapidfuzz.distance.Levenshtein import normalized_distance as lev_dist 7 | from rapidfuzz.fuzz import ratio as lev_ratio 8 | 9 | # Initialize joblib Memory for caching 10 | # Using a distinct cache name for this module 11 | mem = Memory(".cache_string_matcher", verbose=False) 12 | 13 | 14 | @dataclass 15 | class MatchResult: 16 | """ 17 | Holds the results of a highlight matching operation. 18 | 19 | Attributes: 20 | - matches: List of best matching substrings of corpus. 21 | - ratio: Levenshtein ratio of the closest match. 22 | - distance: Levenshtein distance of the closest match. 23 | - quick_match_used: True if the quick matching algorithm was used, False otherwise. 24 | """ 25 | 26 | matches: List[str] 27 | ratio: float 28 | distance: ( 29 | float # Levenshtein distance is usually int, but normalized_distance is float 30 | ) 31 | quick_match_used: bool 32 | 33 | 34 | @mem.cache(ignore=["n_jobs"]) 35 | def match_highlight_to_corpus( 36 | query: str, 37 | corpus: str, 38 | case_sensitive: bool = True, 39 | step_factor: int = 500, 40 | n_jobs: int = -1, 41 | ) -> MatchResult: 42 | """ 43 | Source: https://stackoverflow.com/questions/36013295/find-best-substring-match 44 | Returns the substring of the corpus with the least Levenshtein distance from the query 45 | (May not always return optimal answer). 46 | 47 | Arguments 48 | - query: str 49 | - corpus: str 50 | - case_sensitive: bool 51 | - step_factor: int 52 | Only used in the long way. 53 | Influences the resolution of the thorough search once the general region is found. 54 | The increment in ngrams lengths used for the thorough search is calculated as len(query)//step_factor. 
55 | Increasing this increases the number of ngram lengths used in the thorough search and increases the chances 56 | of getting the optimal solution at the cost of runtime and memory. 57 | - n_jobs: int 58 | number of jobs to use for multithreading. 1 to disable 59 | 60 | Returns 61 | MatchResult object containing: 62 | - matches: List of best matching substrings of corpus, 63 | - ratio: Levenshtein ratio of closest match, 64 | - distance: Levenshtein distance of closest match, 65 | - quick_match_used: True if used the quick way False if using the long way, 66 | """ 67 | 68 | # quick way 69 | lq = len(query) 70 | lc = len(corpus) 71 | 72 | # Prepare query and corpus for caseless comparison if needed for word matching 73 | # but original query/corpus are used for levenshtein to respect case_sensitive flag later if it were used. 74 | # Note: The current 'quick way' does not explicitly use the case_sensitive flag for its Levenshtein comparisons. 75 | # It uses casefolded strings for identifying regions. 76 | lquery_caseless = query.casefold() 77 | lcorp_caseless = corpus.casefold() 78 | 79 | # 1. find most probably region that contains the appropriate words 80 | qwords = [w.strip() for w in set(lquery_caseless.split(" ")) if len(w.strip()) > 3] 81 | indexes = [] 82 | for w in qwords: 83 | m = [] 84 | prev = 0 85 | # Search for word occurrences in the case-folded corpus 86 | while True: 87 | try: 88 | found_idx = lcorp_caseless.index(w, prev) 89 | m.append(found_idx) 90 | prev = found_idx + 1 91 | if len(m) >= 20: # Limit number of matches per word 92 | break 93 | except ValueError: # Substring not found 94 | break 95 | if len(m) > 20: # if limit was reached (and thus potentially many more matches) 96 | continue # this word might be too common, skip it 97 | if m: 98 | indexes.append(m) 99 | 100 | if indexes: 101 | mins = [min(ind_list) for ind_list in indexes] 102 | maxs = [max(ind_list) for ind_list in indexes] 103 | # Calculate mean start and end points, expand by 1.2 * query length 104 | mean_min = max(0, int(sum(mins) / len(mins)) - int(lq * 1.2)) 105 | mean_max = min(lc, int(sum(maxs) / len(maxs)) + int(lq * 1.2)) 106 | 107 | mini_corp = corpus[mean_min : mean_max + 1] 108 | 109 | # 2. in the region, check the lev ratio in a sliding window 110 | # to determine best sub region 111 | # Create batches of query length from the mini_corp 112 | batches = [ 113 | mini_corp[i * lq : (i + 1) * lq] for i in range(0, len(mini_corp) // lq + 1) 114 | ] 115 | batches = [b for b in batches if b.strip()] # Filter out empty batches 116 | 117 | if not batches: # No suitable batches found 118 | pass # Will proceed to the "long way" or return based on later logic 119 | else: 120 | ratios = Parallel( 121 | backend="threading", 122 | n_jobs=n_jobs, 123 | )(delayed(lev_ratio)(query, b) for b in batches) # Use lev_ratio 124 | max_rat = max(ratios) if ratios else -1.0 125 | max_rat_idx = [i for i, r in enumerate(ratios) if r == max_rat] 126 | 127 | # 3. 
in the best sub region, find the best substring with a 1 128 | # character sliding window using both ratio and distance 129 | best_ratio = -inf 130 | best_dist = inf 131 | best_matches = [] 132 | 133 | def get_rat_dist(s1, s2): 134 | # Corrected to use imported lev_ratio and lev_dist 135 | return [lev_ratio(s1, s2), lev_dist(s1, s2)] 136 | 137 | for current_region_idx_in_batches in max_rat_idx: 138 | # Define area based on batches around the current max_rat_idx 139 | # Original: "".join(batches[current_region_idx_in_batches-1:current_region_idx_in_batches+1]) 140 | # This needs careful handling of start/end of batches list 141 | start_slice = max(0, current_region_idx_in_batches - 1) 142 | end_slice = ( 143 | current_region_idx_in_batches + 1 144 | ) # Slicing is exclusive at end 145 | 146 | # The string to find index of, from the original batches 147 | string_markers_for_iidx = "".join(batches[start_slice:end_slice]) 148 | 149 | try: 150 | # Find this concatenated marker string within mini_corp to get a starting point 151 | iidx = mini_corp.index(string_markers_for_iidx) 152 | except ValueError: 153 | # If the joined string isn't found (e.g., if batches were empty or logic error) 154 | # try to use the start of the current batch element as a fallback index. 155 | if batches and current_region_idx_in_batches < len(batches): 156 | try: 157 | iidx = mini_corp.index( 158 | batches[current_region_idx_in_batches] 159 | ) 160 | except ValueError: 161 | continue # Skip this max_rat_idx if problematic 162 | else: 163 | continue # Skip this max_rat_idx if problematic 164 | 165 | area = mini_corp[ 166 | iidx : iidx + 3 * lq 167 | ] # Define search area in mini_corp 168 | if not area.strip(): 169 | continue 170 | 171 | # Generate sub-batches (batches2) from this 'area' 172 | # Original: [area[i:lq+i] for i in range(0, len(area) + 1)] 173 | # This creates n-grams of length lq, then shorter suffixes. 
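# Illustrative example: with area = "abcdefg" and lq = 3, the comprehension below yields
# the windows "abc", "bcd", "cde", "def", "efg", and the loop then appends the shorter
# suffixes "fg" and "g" (empty strings are filtered out afterwards).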
174 | batches2 = [area[i : i + lq] for i in range(0, len(area) - lq + 1)] 175 | # Add shorter suffixes as in original intent (approximation): 176 | for k in range(1, lq): 177 | if len(area) - lq + k < len(area): 178 | batches2.append(area[len(area) - lq + k :]) 179 | batches2 = [b for b in batches2 if b] # Ensure no empty strings 180 | 181 | if not batches2: 182 | continue 183 | 184 | ratdist2 = Parallel( 185 | backend="threading", 186 | n_jobs=n_jobs, 187 | )(delayed(get_rat_dist)(query, b) for b in batches2) 188 | 189 | ratios2 = [it[0] for it in ratdist2] 190 | distances2 = [it[1] for it in ratdist2] 191 | 192 | current_batch_max_r = max(ratios2) if ratios2 else -inf 193 | current_batch_min_d = min(distances2) if distances2 else inf 194 | 195 | # Original logic for updating global best_matches 196 | if ( 197 | current_batch_max_r >= best_ratio 198 | and current_batch_min_d <= best_dist 199 | ): 200 | # Find all strings in batches2 that yield current_batch_max_r 201 | indices_for_current_max_r = [ 202 | i 203 | for i, r_val in enumerate(ratios2) 204 | if r_val == current_batch_max_r 205 | ] 206 | 207 | if ( 208 | not indices_for_current_max_r 209 | ): # Should not happen if current_batch_max_r is from ratios2 210 | continue 211 | 212 | # Pick the first one as per original's implied logic (using index()) 213 | candidate_string_from_batch = batches2[indices_for_current_max_r[0]] 214 | 215 | if ( 216 | current_batch_max_r == best_ratio 217 | and current_batch_min_d == best_dist 218 | ): 219 | best_matches.append(candidate_string_from_batch) 220 | else: # New global bests found from this sub-batch's characteristics 221 | best_ratio = current_batch_max_r 222 | best_dist = current_batch_min_d 223 | best_matches = [candidate_string_from_batch] 224 | 225 | if best_matches: 226 | best_matches = list(set(best_matches)) # Deduplicate 227 | return MatchResult( 228 | matches=best_matches, 229 | ratio=best_ratio, 230 | distance=best_dist, 231 | quick_match_used=True, 232 | ) 233 | 234 | # Fallback or "long way" if quick way did not yield results or was skipped 235 | query_to_compare = query if case_sensitive else query.casefold() 236 | corpus_to_compare = corpus if case_sensitive else corpus.casefold() 237 | 238 | corpus_len = len(corpus_to_compare) 239 | query_len = len(query_to_compare) 240 | if query_len == 0: 241 | return MatchResult( 242 | matches=[], ratio=0.0, distance=1.0, quick_match_used=False 243 | ) # Or handle as error 244 | if corpus_len == 0: 245 | return MatchResult(matches=[], ratio=0.0, distance=1.0, quick_match_used=False) 246 | 247 | query_len_by_2 = max(query_len // 2, 1) 248 | query_len_by_step_factor = max(query_len // step_factor, 1) 249 | 250 | min_dist_val = inf # Renamed from min_dist to avoid clash with the variable from quick path if it ran partially 251 | 252 | # Initial search of corpus: ngrams of same length as query, step half query length 253 | corpus_ngrams_initial = [ 254 | corpus_to_compare[i : i + query_len] 255 | for i in range(0, corpus_len - query_len + 1, query_len_by_2) 256 | ] 257 | if not corpus_ngrams_initial: # e.g. corpus shorter than query 258 | # Try one comparison with the full corpus if it's shorter than query_len 259 | if corpus_len < query_len: 260 | corpus_ngrams_initial = [corpus_to_compare] 261 | else: # No ngrams to check, means cannot find match. 
262 | # Check what ratio/distance to return for "no match" 263 | # An empty list of matches, ratio 0, distance 1 (max normalized distance) 264 | return MatchResult( 265 | matches=[], ratio=0.0, distance=1.0, quick_match_used=False 266 | ) 267 | 268 | dists_initial = Parallel( 269 | backend="threading", 270 | n_jobs=n_jobs, 271 | )(delayed(lev_dist)(ngram, query_to_compare) for ngram in corpus_ngrams_initial) 272 | 273 | closest_match_idx_initial = 0 274 | if dists_initial: 275 | min_dist_val = min(dists_initial) 276 | closest_match_idx_initial = dists_initial.index(min_dist_val) 277 | else: # No initial distances, implies no ngrams, return no match 278 | return MatchResult(matches=[], ratio=0.0, distance=1.0, quick_match_used=False) 279 | 280 | # Determine narrowed search region based on initial best match 281 | closest_match_corpus_start_idx = closest_match_idx_initial * query_len_by_2 282 | 283 | # Define search window around this initial best match point 284 | # Original boundaries: 285 | # left = max(closest_match_idx - query_len_by_2 - 1, 0) 286 | # right = min((closest_match_idx+query_len-1) + query_len_by_2 + 2, corpus_len) 287 | # Using corpus indices: 288 | left_boundary = max(0, closest_match_corpus_start_idx - query_len_by_2 - 1) 289 | # The end of the initial best ngram is closest_match_corpus_start_idx + query_len 290 | right_boundary = min( 291 | corpus_len, 292 | (closest_match_corpus_start_idx + query_len - 1) + query_len_by_2 + 2, 293 | ) 294 | 295 | narrowed_corpus_to_compare = corpus_to_compare[left_boundary:right_boundary] 296 | # Important: We need to map findings in narrowed_corpus_to_compare back to original `corpus` strings 297 | narrowed_corpus_original_case = corpus[left_boundary:right_boundary] 298 | 299 | narrowed_corpus_len = len(narrowed_corpus_to_compare) 300 | if narrowed_corpus_len == 0: 301 | return MatchResult(matches=[], ratio=0.0, distance=1.0, quick_match_used=False) 302 | 303 | # Generate ngram lengths for thorough search in the narrowed region 304 | # From narrowed_corpus_len down to query_len_by_2, stepping by -query_len_by_step_factor 305 | ngram_lens_thorough = [ 306 | l 307 | for l in range( 308 | narrowed_corpus_len, query_len_by_2 - 1, -query_len_by_step_factor 309 | ) 310 | if l > 0 311 | ] 312 | if ( 313 | not ngram_lens_thorough 314 | ): # If narrowed_corpus_len is too small or other edge cases 315 | ngram_lens_thorough.append( 316 | min(query_len, narrowed_corpus_len) 317 | ) # Ensure at least one sensible length 318 | ngram_lens_thorough = [l for l in ngram_lens_thorough if l > 0] 319 | 320 | # Construct sets of ngrams from narrowed_corpus for each length 321 | narrowed_corpus_ngrams_thorough_sets = [] 322 | narrowed_corpus_ngrams_original_case_sets = [] 323 | 324 | for ngram_len in ngram_lens_thorough: 325 | if ngram_len > narrowed_corpus_len: 326 | continue # Should not happen if ngram_lens_thorough is generated correctly 327 | current_set_compare = [ 328 | narrowed_corpus_to_compare[i : i + ngram_len] 329 | for i in range(0, narrowed_corpus_len - ngram_len + 1) 330 | ] 331 | current_set_original = [ 332 | narrowed_corpus_original_case[i : i + ngram_len] 333 | for i in range(0, narrowed_corpus_len - ngram_len + 1) 334 | ] 335 | if current_set_compare: 336 | narrowed_corpus_ngrams_thorough_sets.append(current_set_compare) 337 | narrowed_corpus_ngrams_original_case_sets.append(current_set_original) 338 | 339 | if not narrowed_corpus_ngrams_thorough_sets: 340 | # This can happen if narrowed_corpus is shorter than all generated ngram_lens 
341 | # e.g. query_len_by_2 is too large relative to narrowed_corpus_len 342 | # As a fallback, compare query against the whole narrowed_corpus_original_case 343 | dist_val = lev_dist(narrowed_corpus_to_compare, query_to_compare) 344 | ratio_val = lev_ratio(narrowed_corpus_to_compare, query_to_compare) 345 | if dist_val <= min_dist_val: # Using min_dist_val from initial pass 346 | return MatchResult( 347 | matches=[narrowed_corpus_original_case], 348 | ratio=ratio_val, 349 | distance=dist_val, 350 | quick_match_used=False, 351 | ) 352 | else: # Initial pass was better or no match found 353 | # This part needs to re-evaluate what to return if narrowed search fails. 354 | # Fallback to returning based on min_dist_val if nothing better is found here. 355 | # For now, let's assume if we reach here with no ngrams, original min_dist_val holds the best. 356 | # This path implies the more thorough search didn't find anything or couldn't run. 357 | # Find the string associated with min_dist_val from initial pass: 358 | best_ngram_from_initial_pass_idx = dists_initial.index(min_dist_val) 359 | best_ngram_str_initial_pass_original_case = corpus[ 360 | best_ngram_from_initial_pass_idx 361 | * query_len_by_2 : best_ngram_from_initial_pass_idx * query_len_by_2 362 | + query_len 363 | ] 364 | ratio_for_initial_best = lev_ratio( 365 | ( 366 | best_ngram_str_initial_pass_original_case.casefold() 367 | if not case_sensitive 368 | else best_ngram_str_initial_pass_original_case 369 | ), 370 | query_to_compare, 371 | ) 372 | return MatchResult( 373 | matches=[best_ngram_str_initial_pass_original_case], 374 | ratio=ratio_for_initial_best, 375 | distance=min_dist_val, 376 | quick_match_used=False, 377 | ) 378 | 379 | # Calculate distances for all ngrams in the thorough search sets 380 | dist_list_thorough = [] 381 | for ngram_set in narrowed_corpus_ngrams_thorough_sets: 382 | dist_list_thorough.append( 383 | Parallel(backend="threading", n_jobs=n_jobs)( 384 | delayed(lev_dist)(ngram, query_to_compare) for ngram in ngram_set 385 | ) 386 | ) 387 | 388 | final_best_matches = [] 389 | # min_dist_val still holds the minimum distance found so far (from initial pass) 390 | 391 | for i_set, ngram_set_original_case in enumerate( 392 | narrowed_corpus_ngrams_original_case_sets 393 | ): 394 | current_dists_for_set = dist_list_thorough[i_set] 395 | for i_ngram, ngram_original_case in enumerate(ngram_set_original_case): 396 | ngram_dist = current_dists_for_set[i_ngram] 397 | if ngram_dist < min_dist_val: 398 | min_dist_val = ngram_dist 399 | final_best_matches = [ngram_original_case] 400 | elif ngram_dist == min_dist_val: 401 | final_best_matches.append(ngram_original_case) 402 | 403 | # If initial pass found a better or equal min_dist_val and thorough search didn't improve OR final_best_matches empty 404 | if not final_best_matches: 405 | # Fallback to best from initial pass if thorough search yielded nothing 406 | # This case should ideally be covered by min_dist_val initialization and updates 407 | # For safety, ensure if final_best_matches is empty, we use the best known from initial scan. 
408 | idx = dists_initial.index(min_dist_val) 409 | # Original string from corpus that corresponds to this match 410 | original_string_match = corpus[ 411 | idx * query_len_by_2 : idx * query_len_by_2 + query_len 412 | ] 413 | final_best_matches = [original_string_match] 414 | 415 | final_best_matches = list(set(final_best_matches)) # Deduplicate 416 | if not final_best_matches: # Should not be empty if corpus & query were not empty 417 | return MatchResult(matches=[], ratio=0.0, distance=1.0, quick_match_used=False) 418 | 419 | # Calculate ratio for the first best match found (or an aggregate if multiple have same min_dist_val) 420 | # Ensure query_to_compare is used for ratio calculation consistency 421 | # best_ratio_val = lev_ratio( # Original calculation commented for review 422 | # (final_best_matches[0].casefold() if not case_sensitive else final_best_matches[0]), 423 | # query_to_compare 424 | # ) 425 | # Re-calculate max ratio among all best_matches to be robust 426 | all_ratios = [ 427 | lev_ratio((bm.casefold() if not case_sensitive else bm), query_to_compare) 428 | for bm in final_best_matches 429 | ] 430 | best_ratio_val = max(all_ratios) if all_ratios else 0.0 431 | # The min_dist_val should already be correct for these final_best_matches 432 | 433 | return MatchResult( 434 | matches=final_best_matches, 435 | ratio=best_ratio_val, 436 | distance=min_dist_val, 437 | quick_match_used=False, 438 | ) # False: used long way 439 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. 
However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 
105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 
162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 
222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 
284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 
402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. 
The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. 
You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. 
If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published by 637 | the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 662 | -------------------------------------------------------------------------------- /karakeep_python_api/__main__.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | import inspect 3 | import json 4 | import sys 5 | import os 6 | import functools 7 | import re # Import re module 8 | import click 9 | import traceback # Moved import to top 10 | from typing import ( 11 | Any, 12 | List, 13 | Dict, 14 | Optional, 15 | Callable, 16 | Union, 17 | get_origin, 18 | get_args, 19 | Literal, 20 | ) 21 | from pydantic import BaseModel, ValidationError 22 | from loguru import logger # Import logger 23 | 24 | # Attempt relative imports for package execution 25 | try: 26 | # Import API class and errors directly from the module 27 | from .karakeep_api import KarakeepAPI, APIError, AuthenticationError 28 | 29 | # Models are not directly used here, API methods handle data types 30 | except ImportError: 31 | # Fallback for direct script execution (e.g., python -m karakeep_python_api ...) 32 | # Import API class and errors directly from the module 33 | from karakeep_api import KarakeepAPI, APIError, AuthenticationError 34 | 35 | 36 | # --- Serialization Helper --- 37 | def serialize_output(data: Any) -> Any: 38 | """ 39 | Recursively serialize data for JSON output, handling Pydantic models, 40 | dataclasses, lists, and dicts. 41 | """ 42 | if isinstance(data, BaseModel): 43 | return data.model_dump( 44 | mode="json" 45 | ) # Use Pydantic's built-in JSON serialization 46 | elif isinstance(data, list): 47 | return [serialize_output(item) for item in data] 48 | elif isinstance(data, dict): 49 | # Serialize dictionary values 50 | return {k: serialize_output(v) for k, v in data.items()} 51 | # Add handling for other types like datetime if needed, though Pydantic's 52 | # model_dump(mode='json') often handles them. 53 | # Basic types (str, int, float, bool, None) are returned as is. 
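# Illustrative example (Tag is a hypothetical Pydantic model): serialize_output({"tags": [Tag(id="1")]})
# would return {"tags": [{"id": "1"}]}, i.e. nested models are dumped to plain dicts
# that json.dumps can handle.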
54 | return data 55 | 56 | 57 | # --- Click CLI Setup --- 58 | 59 | # Shared options for the API client 60 | shared_options = [ 61 | click.option( 62 | "--api-endpoint", 63 | envvar="KARAKEEP_PYTHON_API_ENDPOINT", 64 | help="Full Karakeep API endpoint URL, including /api/v1/ (e.g., https://instance.com/api/v1/).", 65 | ), 66 | click.option( 67 | "--api-key", 68 | envvar="KARAKEEP_PYTHON_API_KEY", 69 | help="Karakeep API Key (required, uses env var if not provided).", 70 | required=False, 71 | ), # Made not required here, checked in context 72 | click.option( 73 | "--verify-ssl/--no-verify-ssl", 74 | default=True, 75 | envvar="KARAKEEP_PYTHON_API_VERIFY_SSL", 76 | help="Verify SSL certificates.", 77 | ), 78 | click.option( 79 | "--verbose", 80 | "-v", 81 | is_flag=True, 82 | default=False, 83 | help="Enable verbose logging.", 84 | ), 85 | click.option( 86 | "--disable-response-validation", 87 | is_flag=True, 88 | default=False, 89 | envvar="KARAKEEP_PYTHON_API_DISABLE_RESPONSE_VALIDATION", 90 | help="Disable Pydantic validation of API responses (returns raw data).", 91 | ), 92 | click.option( 93 | "--ascii", 94 | "ensure_ascii", # Use 'ensure_ascii' as the destination variable name 95 | is_flag=True, 96 | default=False, # Default is False, meaning ensure_ascii=False by default 97 | envvar="KARAKEEP_PYTHON_API_ENSURE_ASCII", 98 | help="Escape non-ASCII characters in the JSON output (default: keep Unicode).", 99 | ), 100 | ] 101 | 102 | 103 | def add_options(options): 104 | """Decorator to add a list of click options to a command.""" 105 | 106 | def _add_options(func): 107 | for option in reversed(options): 108 | func = option(func) 109 | return func 110 | 111 | return _add_options 112 | 113 | 114 | # --- Callback for --dump-openapi-specification --- 115 | def print_openapi_spec(ctx, param, value): 116 | """Callback function for the --dump-openapi-specification option.""" 117 | if not value or ctx.resilient_parsing: 118 | # Exit if the flag is not set, or if Click is doing resilient parsing (e.g., for completion) 119 | return 120 | try: 121 | package_dir = os.path.dirname(__file__) 122 | spec_path = os.path.join(package_dir, "openapi_reference.json") 123 | if not os.path.exists(spec_path): 124 | click.echo( 125 | f"Error: Specification file not found at expected location: {spec_path}", 126 | err=True, 127 | ) 128 | ctx.exit(1) # Use ctx.exit 129 | with open(spec_path, "r") as f: 130 | click.echo(f.read()) # Use click.echo 131 | except Exception as e: 132 | click.echo(f"Error reading or printing specification file: {e}", err=True) 133 | ctx.exit(1) # Exit with error code if reading failed 134 | # Exit successfully *after* the try/except block if no error occurred 135 | ctx.exit(0) 136 | 137 | 138 | @click.group(context_settings=dict(help_option_names=["-h", "--help"])) 139 | @click.option( 140 | "--dump-openapi-specification", 141 | is_flag=True, 142 | callback=print_openapi_spec, 143 | expose_value=False, # Don't pass the value to the main cli function 144 | is_eager=True, # Process this option before others 145 | help="Dump the OpenAPI specification JSON to stdout and exit.", 146 | ) 147 | @add_options( 148 | shared_options 149 | ) # Apply shared options to the group (ensure_ascii is now included) 150 | @click.pass_context 151 | def cli( 152 | ctx, 153 | api_endpoint, 154 | api_key, 155 | verify_ssl, 156 | verbose, 157 | disable_response_validation, 158 | ensure_ascii, 159 | ): 160 | """ 161 | Karakeep Python API Command Line Interface. 
162 | 163 | Dynamically generates commands based on the OpenAPI specification. 164 | Requires KARAKEEP_PYTHON_API_KEY environment variable or --api-key option. 165 | """ 166 | # Ensure the context object exists 167 | ctx.ensure_object(dict) 168 | 169 | # --- Strict Check for API Key and Endpoint --- 170 | # Check for API key (must be provided via arg or env) 171 | resolved_api_key = api_key or os.environ.get("KARAKEEP_PYTHON_API_KEY") 172 | if not resolved_api_key: 173 | raise click.UsageError( 174 | "API Key is required. Provide --api-key option or set KARAKEEP_PYTHON_API_KEY environment variable." 175 | ) 176 | 177 | # Check for API endpoint (must be provided via arg or env) 178 | resolved_api_endpoint = api_endpoint or os.environ.get( 179 | "KARAKEEP_PYTHON_API_ENDPOINT" 180 | ) 181 | if not resolved_api_endpoint: 182 | raise click.UsageError( 183 | "API endpoint is required. Provide --api-endpoint option or set KARAKEEP_PYTHON_API_ENDPOINT environment variable. " 184 | "The URL must include the API path, e.g., 'https://your-instance.com/api/v1/'." 185 | ) 186 | 187 | # Store common API parameters in the context for commands to use 188 | ctx.obj["API_ENDPOINT"] = resolved_api_endpoint # Store the resolved endpoint 189 | ctx.obj["API_KEY"] = resolved_api_key # Store the resolved key 190 | ctx.obj["VERIFY_SSL"] = verify_ssl 191 | ctx.obj["VERBOSE"] = verbose 192 | ctx.obj["DISABLE_RESPONSE_VALIDATION"] = ( 193 | disable_response_validation # Store the flag 194 | ) 195 | ctx.obj["ENSURE_ASCII"] = ensure_ascii # Store the ensure_ascii flag 196 | 197 | 198 | def create_click_command( 199 | api_method_name: str, api_method: Callable 200 | ) -> Optional[click.Command]: 201 | """ 202 | Dynamically creates a Click command for a given API method instance, 203 | inspecting its signature for arguments. Returns None if creation fails. 
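For example, a Python parameter named include_content would typically be exposed as an --include-content option; the generated wrapper converts the kebab-case keys received from Click back to snake_case before calling the underlying API method.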
204 | """ 205 | # Get the signature from the bound method (which copied it from the original function) 206 | try: 207 | sig = inspect.signature(api_method) 208 | # Exclude 'self' parameter from the signature when creating CLI options 209 | params = [p for p in sig.parameters.values() if p.name != "self"] 210 | except (ValueError, TypeError) as e: 211 | logger.warning(f"Could not get signature for method '{api_method_name}': {e}") 212 | return None 213 | 214 | # Define the command function template using a closure 215 | def command_func_factory(method_name, signature): 216 | @click.pass_context 217 | def command_func(ctx, **kwargs): 218 | """Dynamically generated command function wrapper.""" 219 | # Retrieve API parameters from context, ensuring API key is present now 220 | api_endpoint = ctx.obj["API_ENDPOINT"] 221 | api_key = ctx.obj["API_KEY"] 222 | verify_ssl = ctx.obj["VERIFY_SSL"] 223 | verbose = ctx.obj["VERBOSE"] 224 | disable_validation = ctx.obj["DISABLE_RESPONSE_VALIDATION"] # Retrieve flag 225 | ensure_ascii_output = ctx.obj["ENSURE_ASCII"] # Retrieve ensure_ascii flag 226 | 227 | if not api_key: 228 | click.echo( 229 | "Error: API Key is required via --api-key or KARAKEEP_PYTHON_API_KEY environment variable.", 230 | err=True, 231 | ) 232 | ctx.exit(1) 233 | 234 | try: 235 | # Initialize API client within the command context 236 | # Method generation already happened during inspection phase or initial load 237 | api = KarakeepAPI( 238 | api_key=api_key, 239 | api_endpoint=api_endpoint, 240 | verify_ssl=verify_ssl, 241 | verbose=verbose, 242 | disable_response_validation=disable_validation, # Pass flag to constructor 243 | ) 244 | # Get the actual bound method from the initialized API instance 245 | instance_method = getattr( 246 | api, method_name 247 | ) # Use the captured method_name 248 | 249 | # Prepare arguments for the API call from Click's kwargs 250 | call_args = {} 251 | sig_params = signature.parameters # Use captured signature 252 | 253 | # Process Click kwargs into API call arguments 254 | # Convert kebab-case keys from Click back to snake_case for Python call 255 | call_args = { 256 | k.replace("-", "_"): v for k, v in kwargs.items() if v is not None 257 | } 258 | 259 | # Remove arguments that are not part of the method signature 260 | # (e.g., if extra options were somehow passed) 261 | valid_arg_names = set(signature.parameters.keys()) 262 | call_args = {k: v for k, v in call_args.items() if k in valid_arg_names} 263 | 264 | # --- JSON Parsing for Dict/List Parameters --- 265 | # Iterate through the expected parameters from the signature 266 | for param_name, param_sig in signature.parameters.items(): 267 | if param_name in call_args: 268 | param_value = call_args[param_name] 269 | param_annotation = param_sig.annotation 270 | origin = getattr(param_annotation, "__origin__", None) 271 | 272 | # Check if the annotation is dict/list or typing.Dict/List 273 | # and if the received value is a string (needs parsing) 274 | if ( 275 | param_annotation in (dict, list) 276 | or origin in (dict, list, Dict, List) 277 | ) and isinstance(param_value, str): 278 | try: 279 | # Attempt to parse the JSON string 280 | call_args[param_name] = json.loads(param_value) 281 | logger.debug( 282 | f"Parsed JSON string for parameter '{param_name}'." 
283 | ) 284 | except json.JSONDecodeError as json_err: 285 | # Handle invalid JSON input from the user 286 | click.echo( 287 | f"Error: Invalid JSON provided for parameter '{param_name.replace('_', '-')}': {json_err}", 288 | err=True, 289 | ) 290 | click.echo(f"Provided value: {param_value}", err=True) 291 | ctx.exit(1) 292 | 293 | # Call the API method 294 | try: 295 | if method_name == "get_all_bookmarks": 296 | logger.debug( 297 | f"Special CLI pagination handling for '{method_name}'." 298 | ) 299 | cli_total_limit = call_args.pop("limit", None) 300 | # Other relevant params for get_all_bookmarks 301 | archived_filter = call_args.get("archived") 302 | favourited_filter = call_args.get("favourited") 303 | include_content_cli = call_args.get("include_content", True) 304 | 305 | call_args.pop("cursor", None) # Ignore CLI cursor 306 | 307 | all_bookmarks_data = [] 308 | current_page_api_cursor = None 309 | fetched_count = 0 310 | API_INTERNAL_PAGE_SIZE = 50 # Define a page size for API calls 311 | 312 | while True: 313 | api_call_limit = API_INTERNAL_PAGE_SIZE 314 | if cli_total_limit is not None: 315 | remaining_needed = cli_total_limit - fetched_count 316 | if remaining_needed <= 0: 317 | break # Reached or exceeded CLI total limit 318 | api_call_limit = min( 319 | API_INTERNAL_PAGE_SIZE, remaining_needed 320 | ) 321 | 322 | if ( 323 | api_call_limit <= 0 and cli_total_limit is not None 324 | ): # Avoid asking for 0 or negative items unless fetching all 325 | break 326 | 327 | logger.debug( 328 | f"Fetching page for '{method_name}' with cursor: {current_page_api_cursor}, api_limit: {api_call_limit}" 329 | ) 330 | 331 | page_call_args = { 332 | "archived": archived_filter, 333 | "favourited": favourited_filter, 334 | "limit": api_call_limit, 335 | "cursor": current_page_api_cursor, 336 | "include_content": include_content_cli, 337 | } 338 | page_call_args_filtered = { 339 | k: v for k, v in page_call_args.items() if v is not None 340 | } 341 | 342 | try: 343 | page_result_obj = instance_method( 344 | **page_call_args_filtered 345 | ) 346 | except TypeError as call_error_page: 347 | logger.error( 348 | f"Error calling API method '{method_name}' (paginated): {call_error_page}" 349 | ) 350 | logger.error( 351 | f"Provided arguments for page: {page_call_args_filtered}" 352 | ) 353 | if verbose: 354 | logger.debug(traceback.format_exc()) 355 | ctx.exit(1) 356 | 357 | bookmarks_on_this_page = [] 358 | next_api_cursor = None 359 | 360 | # Convert Pydantic model to dict using model_dump if available 361 | if hasattr(page_result_obj, "model_dump"): 362 | result_dict = page_result_obj.model_dump() 363 | elif isinstance(page_result_obj, dict): 364 | result_dict = page_result_obj 365 | else: 366 | logger.warning( 367 | f"Unexpected result type: {type(page_result_obj)}" 368 | ) 369 | result_dict = {} 370 | 371 | # Extract data and cursor from the dict 372 | bookmarks_on_this_page = result_dict.get("bookmarks", []) 373 | next_api_cursor = result_dict.get("nextCursor") 374 | 375 | logger.debug( 376 | f"Extracted {len(bookmarks_on_this_page)} bookmarks and cursor: {next_api_cursor}" 377 | ) 378 | 379 | if not isinstance(bookmarks_on_this_page, list): 380 | logger.warning( 381 | f"Expected a list of bookmarks, got {type(bookmarks_on_this_page)}. Stopping pagination." 382 | ) 383 | break 384 | 385 | all_bookmarks_data.extend(bookmarks_on_this_page) 386 | fetched_count += len(bookmarks_on_this_page) 387 | logger.debug( 388 | f"Fetched {len(bookmarks_on_this_page)} bookmarks this page. 
Total fetched: {fetched_count}." 389 | ) 390 | 391 | current_page_api_cursor = next_api_cursor 392 | if not current_page_api_cursor: 393 | logger.debug( 394 | "No nextCursor from API, pagination complete." 395 | ) 396 | break 397 | if ( 398 | cli_total_limit is not None 399 | and fetched_count >= cli_total_limit 400 | ): 401 | logger.debug( 402 | f"CLI total limit of {cli_total_limit} reached or exceeded." 403 | ) 404 | break 405 | if not bookmarks_on_this_page and api_call_limit > 0: 406 | logger.debug( 407 | "API returned an empty list of bookmarks while a positive limit was set, assuming end of data." 408 | ) 409 | break 410 | 411 | result = all_bookmarks_data # This will be a list of Bookmark models or dicts 412 | elif method_name == "get_all_highlights": 413 | logger.debug( 414 | f"Special CLI pagination handling for '{method_name}'." 415 | ) 416 | cli_total_limit = call_args.pop("limit", None) 417 | 418 | call_args.pop("cursor", None) # Ignore CLI cursor 419 | 420 | all_highlights_data = [] 421 | current_page_api_cursor = None 422 | fetched_count = 0 423 | API_INTERNAL_PAGE_SIZE = 50 # Define a page size for API calls 424 | 425 | while True: 426 | api_call_limit = API_INTERNAL_PAGE_SIZE 427 | if cli_total_limit is not None: 428 | remaining_needed = cli_total_limit - fetched_count 429 | if remaining_needed <= 0: 430 | break # Reached or exceeded CLI total limit 431 | api_call_limit = min( 432 | API_INTERNAL_PAGE_SIZE, remaining_needed 433 | ) 434 | 435 | if ( 436 | api_call_limit <= 0 and cli_total_limit is not None 437 | ): # Avoid asking for 0 or negative items unless fetching all 438 | break 439 | 440 | logger.debug( 441 | f"Fetching page for '{method_name}' with cursor: {current_page_api_cursor}, api_limit: {api_call_limit}" 442 | ) 443 | 444 | page_call_args = { 445 | "limit": api_call_limit, 446 | "cursor": current_page_api_cursor, 447 | } 448 | page_call_args_filtered = { 449 | k: v for k, v in page_call_args.items() if v is not None 450 | } 451 | 452 | try: 453 | page_result_obj = instance_method( 454 | **page_call_args_filtered 455 | ) 456 | except TypeError as call_error_page: 457 | logger.error( 458 | f"Error calling API method '{method_name}' (paginated): {call_error_page}" 459 | ) 460 | logger.error( 461 | f"Provided arguments for page: {page_call_args_filtered}" 462 | ) 463 | if verbose: 464 | logger.debug(traceback.format_exc()) 465 | ctx.exit(1) 466 | 467 | highlights_on_this_page = [] 468 | next_api_cursor = None 469 | 470 | # Convert Pydantic model to dict using model_dump if available 471 | if hasattr(page_result_obj, "model_dump"): 472 | result_dict = page_result_obj.model_dump() 473 | elif isinstance(page_result_obj, dict): 474 | result_dict = page_result_obj 475 | else: 476 | logger.warning( 477 | f"Unexpected result type: {type(page_result_obj)}" 478 | ) 479 | result_dict = {} 480 | 481 | # Extract data and cursor from the dict 482 | highlights_on_this_page = result_dict.get("highlights", []) 483 | next_api_cursor = result_dict.get("nextCursor") 484 | 485 | logger.debug( 486 | f"Extracted {len(highlights_on_this_page)} highlights and cursor: {next_api_cursor}" 487 | ) 488 | 489 | if not isinstance(highlights_on_this_page, list): 490 | logger.warning( 491 | f"Expected a list of highlights, got {type(highlights_on_this_page)}. Stopping pagination." 
492 | ) 493 | break 494 | 495 | all_highlights_data.extend(highlights_on_this_page) 496 | fetched_count += len(highlights_on_this_page) 497 | logger.debug( 498 | f"Fetched {len(highlights_on_this_page)} highlights this page. Total fetched: {fetched_count}." 499 | ) 500 | 501 | current_page_api_cursor = next_api_cursor 502 | if not current_page_api_cursor: 503 | logger.debug( 504 | "No nextCursor from API, pagination complete." 505 | ) 506 | break 507 | if ( 508 | cli_total_limit is not None 509 | and fetched_count >= cli_total_limit 510 | ): 511 | logger.debug( 512 | f"CLI total limit of {cli_total_limit} reached or exceeded." 513 | ) 514 | break 515 | if not highlights_on_this_page and api_call_limit > 0: 516 | logger.debug( 517 | "API returned an empty list of highlights while a positive limit was set, assuming end of data." 518 | ) 519 | break 520 | 521 | result = all_highlights_data # This will be a list of Highlight models or dicts 522 | else: 523 | # Original behavior for other commands 524 | logger.debug( 525 | f"Calling API method '{method_name}' with args: {call_args}" 526 | ) 527 | result = instance_method(**call_args) 528 | 529 | except TypeError as call_error: 530 | logger.error( 531 | f"Error calling API method '{method_name}': {call_error}" 532 | ) 533 | logger.error(f"Provided arguments: {call_args}") 534 | logger.error(f"Expected signature: {signature}") 535 | # Add traceback in verbose mode 536 | if verbose: 537 | logger.debug(traceback.format_exc()) 538 | ctx.exit(1) 539 | 540 | # Serialize and print the result 541 | if result is not None: 542 | output_data = serialize_output(result) 543 | # Use ensure_ascii_output flag to control JSON encoding 544 | click.echo( 545 | json.dumps( 546 | output_data, indent=2, ensure_ascii=ensure_ascii_output 547 | ) 548 | ) 549 | else: 550 | # Handle None result (e.g., 204 No Content) gracefully 551 | # Verbose check is implicitly handled by logger level 552 | logger.debug("Operation successful (No content returned).") 553 | 554 | except ( 555 | APIError, 556 | AuthenticationError, 557 | ValueError, 558 | ValidationError, 559 | TypeError, 560 | ) as e: 561 | logger.error(f"Error: {e}") 562 | # Provide more detail (traceback) for TypeErrors during binding/call in verbose mode 566 | if isinstance(e, TypeError) and verbose: 567 | logger.debug(traceback.format_exc()) # Use top-level import 568 | sys.exit(1) 569 | except Exception as e: 570 | logger.error(f"An unexpected error occurred: {e}") 571 | if verbose: 572 | logger.debug(traceback.format_exc()) # Use top-level import 573 | sys.exit(1) 574 | 575 | # Set the name of the inner function for help display purposes 576 | command_func.__name__ = method_name 577 | return command_func 578 | 579 | # Create the actual command function instance using the factory 580 | command_func = command_func_factory(api_method_name, sig) 581 | 582 | # --- Add Click options/arguments based on the captured method signature --- 583 | click_params = [] 584 | # Use the docstring from the original method (captured by functools.update_wrapper) 585 | docstring = api_method.__doc__ or f"Execute the {api_method_name} API operation."
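# NOTE: the parsing below assumes each API method carries a Google-style docstring with
# literal "Args:", "Returns:" and "Raises:" sections, roughly like the sketch here
# (illustrative only -- the parameter names depend on the actual method):
#
#     Get a single bookmark.
#
#     Args:
#         bookmark_id: The ID of the bookmark to retrieve.
#         include_content: Whether to include cached content in the response.
#
#     Returns:
#         The bookmark data.
#
#     Raises:
#         APIError: If the request fails.
#
# The "Args:" descriptions are extracted below to build the --help text of the generated CLI options.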
586 | docstring = dedent(docstring) 587 | docstring_lines = docstring.split("\n") 588 | help_text = " ".join( 589 | docstring.split("\n\n")[0].splitlines() 590 | ).strip() # First lines as short help 591 | # Full docstring as help 592 | full_help = docstring 593 | 594 | # tweak the whitespaces in the full help: 595 | full_help = full_help.replace("\n ", " ") 596 | full_help = full_help.replace("\n", "\n\n") 597 | 598 | # Extract parameter descriptions from the Args section of the docstring 599 | param_descriptions = {} 600 | in_args_section = False 601 | args_section_lines = [] 602 | assert "Returns:" in docstring 603 | assert "Raises:" in docstring 604 | for line in docstring_lines: 605 | stripped_line = line.strip() 606 | if stripped_line == "Args:": 607 | in_args_section = True 608 | elif stripped_line == "Returns:" or stripped_line == "Raises:": 609 | in_args_section = False # Stop capturing when Returns/Raises section starts 610 | elif in_args_section and stripped_line: 611 | args_section_lines.append(stripped_line) 612 | # Use regex to capture 'param_name: description' structure, allowing leading whitespace 613 | # Pattern: ^\s+ (parameter_name): \s* (description) $ 614 | match = re.match(r"^\s+([a-zA-Z_][a-zA-Z0-9_]*):\s+(.*)$", line) 615 | if match and match.group(1) != "Example": 616 | param_name = match.group(1) 617 | description = match.group(2).strip() 618 | param_descriptions[param_name] = description 619 | logger.trace( 620 | f"Parsed docstring param: '{param_name}' -> '{description}'" 621 | ) 622 | else: 623 | param_descriptions[param_name] += " " + stripped_line 624 | 625 | # Removed breakpoint() that was added for debugging 626 | # Add parameters from signature to Click command 627 | for param in params: # Use the filtered list from signature inspection 628 | param_name_cli = param.name.replace("_", "-") # Use kebab-case for CLI options 629 | is_required_in_sig = param.default is inspect.Parameter.empty 630 | default_value = param.default if not is_required_in_sig else None 631 | param_type = click.STRING # Default to string for CLI 632 | 633 | # Basic type mapping for Click 634 | annotation = param.annotation 635 | origin = getattr(annotation, "__origin__", None) 636 | args = getattr(annotation, "__args__", []) 637 | 638 | # Determine Click type and if it's a flag 639 | is_flag = False 640 | click_type = click.STRING 641 | if annotation is int: 642 | click_type = click.INT 643 | elif annotation is float: 644 | click_type = click.FLOAT 645 | elif annotation is bool: 646 | click_type = click.BOOL 647 | # Boolean options are flags if they don't have a default or default is False 648 | is_flag = is_required_in_sig or default_value is False 649 | # Handle Optional[T] - makes the option not required unless T is bool 650 | elif origin is Union and type(None) in args and len(args) == 2: 651 | non_none_type = args[0] if args[1] is type(None) else args[1] 652 | if non_none_type is int: 653 | click_type = click.INT 654 | elif non_none_type is float: 655 | click_type = click.FLOAT 656 | elif non_none_type is bool: 657 | click_type = click.BOOL 658 | # Optional bools are typically flags like --enable-feature/--disable-feature 659 | # For simplicity, treat as a standard option unless explicitly designed as toggle 660 | is_flag = ( 661 | False # Treat Optional[bool] as --option/--no-option by default 662 | ) 663 | # Keep click_type as STRING for Optional[List/Dict/str/Any] 664 | is_required_in_sig = False # Optional means not required 665 | default_value = None # Default for Optional is 
None 666 | 667 | # Handle List[T] or Dict[K, V] - expect JSON string 668 | elif origin in (list, dict, List, Dict) or annotation in (list, dict): 669 | click_type = click.STRING # Expect JSON string 670 | # Handle Literal[...] for choices 671 | elif origin is Literal: 672 | choices = get_args(annotation) 673 | # Ensure all choices are strings for click.Choice 674 | if all(isinstance(c, str) for c in choices): 675 | click_type = click.Choice(choices, case_sensitive=False) 676 | else: 677 | logger.warning( 678 | f"Parameter '{param.name}' is Literal but contains non-string types. Treating as STRING." 679 | ) 680 | click_type = click.STRING # Fallback 681 | 682 | # Determine option name(s) and help text 683 | option_names = [f"--{param_name_cli}"] 684 | # Add /--no- option for boolean flags that are not required and default to True 685 | if ( 686 | is_flag 687 | and annotation is bool 688 | and not is_required_in_sig 689 | and default_value is True 690 | ): 691 | option_names.append(f"--no-{param_name_cli}") 692 | 693 | param_help = param_descriptions.get(param.name, f"Parameter '{param.name}'.") 694 | if click_type is click.STRING and ( 695 | origin in (list, dict) or annotation in (list, dict) 696 | ): 697 | param_help += " (Provide as JSON string)" 698 | elif isinstance(click_type, click.Choice): 699 | param_help += f" (Choices: {', '.join(click_type.choices)})" 700 | 701 | click_required = is_required_in_sig and default_value is None and not is_flag 702 | 703 | # Make copies of properties that might be modified for specific commands/params 704 | current_param_help = param_help 705 | current_click_required = click_required 706 | current_default_value = default_value 707 | current_is_flag = ( 708 | is_flag # Though is_flag interpretation might change help/required 709 | ) 710 | 711 | # Special handling for 'get_all_bookmarks' command parameters 712 | if api_method_name == "get_all_bookmarks": 713 | if param.name == "cursor": 714 | current_param_help = ( 715 | "[Ignored by CLI for get-all-bookmarks] " + param_help 716 | ) 717 | current_click_required = ( 718 | False # Cursor is handled by CLI, not required from user 719 | ) 720 | current_default_value = ( 721 | None # Explicitly set default to None for ignored param 722 | ) 723 | elif param.name == "limit": 724 | current_param_help = "Total maximum number of bookmarks to fetch across pages for get-all-bookmarks. If omitted, all are fetched." 725 | # For 'limit', required status and default remain as derived from its Optional[int] type hint 726 | # current_click_required and current_default_value will be correctly False and None respectively. 727 | 728 | # Special handling for 'get_all_highlights' command parameters 729 | if api_method_name == "get_all_highlights": 730 | if param.name == "cursor": 731 | current_param_help = ( 732 | "[Ignored by CLI for get-all-highlights] " + param_help 733 | ) 734 | current_click_required = ( 735 | False # Cursor is handled by CLI, not required from user 736 | ) 737 | current_default_value = ( 738 | None # Explicitly set default to None for ignored param 739 | ) 740 | elif param.name == "limit": 741 | current_param_help = "Total maximum number of highlights to fetch across pages for get-all-highlights. If omitted, all are fetched." 742 | # For 'limit', required status and default remain as derived from its Optional[int] type hint 743 | # current_click_required and current_default_value will be correctly False and None respectively. 
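# Illustrative effect of the overrides above (assuming the CLI is run as a module;
# an installed console script may expose a different entry-point name):
#   python -m karakeep_python_api get-all-bookmarks --limit 120   # fetch at most 120 bookmarks, paginating internally
#   python -m karakeep_python_api get-all-highlights              # fetch every highlight; --cursor is ignored by the CLI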
744 | 745 | # Add the Click Option 746 | click_params.append( 747 | click.Option( 748 | option_names, 749 | type=click_type, 750 | required=current_click_required, 751 | default=current_default_value if not current_is_flag else None, 752 | help=current_param_help, 753 | is_flag=(current_is_flag if len(option_names) == 1 else False), 754 | show_default=not current_is_flag and current_default_value is not None, 755 | # Click derives the Python identifier (e.g., 'bookmark_id') from the first long option name 756 | ) 757 | ) 758 | 759 | # Create the Click command 760 | try: 761 | dynamic_command = click.Command( 762 | name=api_method_name.replace("_", "-"), # Use kebab-case for command names 763 | callback=command_func, 764 | params=click_params, 765 | help=full_help, 766 | short_help=help_text, 767 | ) 768 | return dynamic_command 769 | except Exception as e: 770 | logger.warning(f"Failed to create click command for '{api_method_name}': {e}") 771 | return None 772 | 773 | 774 | # --- Dynamically Add Commands to CLI Group --- 775 | def add_commands_to_cli(cli_group): 776 | """ 777 | Inspects the KarakeepAPI class *statically* to find public methods 778 | and adds them as Click commands. Does NOT require API keys or URL for inspection. 779 | """ 780 | logger.info("Statically inspecting KarakeepAPI class and generating commands...") 781 | 782 | try: 783 | added_count = 0 784 | skipped_count = 0 785 | # Inspect the KarakeepAPI class directly, not an instance 786 | for name, member in inspect.getmembers(KarakeepAPI): 787 | # Check if it's a public function/method defined in the class 788 | if ( 789 | not name.startswith("_") 790 | and inspect.isfunction( 791 | member 792 | ) # Check if it's a function (method in class def) 793 | # Add further checks if needed, e.g., based on naming convention or decorators 794 | ): 795 | try: 796 | # Attempt to create a command for the method 797 | # We pass the function object directly. The command_func will later 798 | # get the bound method from the API instance created at runtime. 799 | command = create_click_command(name, member) 800 | if command: 801 | cli_group.add_command(command) 802 | added_count += 1 803 | else: 804 | logger.warning(f"Skipped command generation for method: {name}") 805 | skipped_count += 1 806 | except Exception as cmd_gen_e: 807 | logger.warning( 808 | f"Failed to create command for method '{name}': {cmd_gen_e}" 809 | ) 810 | skipped_count += 1 811 | 812 | if added_count == 0: 813 | logger.warning( 814 | "No API commands were dynamically added. Check KarakeepAPI class definition and logs." 815 | ) 816 | else: 817 | logger.info(f"Added {added_count} API commands. 
Skipped {skipped_count}.") 818 | 819 | except Exception as e: 820 | # Handle errors during static inspection or command creation 821 | logger.error(f"Unexpected error during dynamic command setup: {e}") 822 | # Determine verbosity from environment for traceback logging during setup 823 | verbose_setup = os.environ.get("KARAKEEP_PYTHON_API_VERBOSE", "").lower() in ( 824 | "true", 825 | "1", 826 | "yes", 827 | ) 828 | if verbose_setup: 829 | logger.debug(traceback.format_exc()) # Use top-level import 830 | # Raise an exception to halt execution if setup fails 831 | error_message = f"Error: Unexpected error during dynamic command setup: {e}" 832 | raise click.ClickException(error_message) 833 | 834 | 835 | # Add commands when the script is loaded by calling the function 836 | add_commands_to_cli(cli) 837 | 838 | # Main entry point for the script 839 | if __name__ == "__main__": 840 | # Normal Click execution starts here. The --dump-openapi-specification 841 | # is now handled by its callback function defined above. 842 | cli(obj={}) # Pass initial empty object for context 843 | --------------------------------------------------------------------------------