├── .gitignore ├── community_scripts ├── Karanki │ └── README.md ├── karakeep-time-tagger │ ├── karakeep-time-tagger.timer │ ├── karakeep-time-tagger.service │ ├── README.md │ └── karakeep-time-tagger.py ├── karakeep-archive-before-date │ ├── README.md │ └── archiving_before_date.py ├── README.md ├── karakeep-list-to-tag │ ├── README.md │ └── karakeep-list-to-tag.py ├── karakeep-remove-ai-tags │ ├── README.md │ └── karakeep-remove-ai-tags.py ├── pocket2karakeep-archived │ ├── README.md │ └── pocket_archiving_status_updater.py ├── omnivore2karakeep-archived │ ├── README.md │ └── omnivore2karakeep-archived.py └── omnivore2karakeep-highlights │ ├── README.md │ ├── omnivore2karakeep-highlights.py │ └── string_context_matcher.py ├── MANIFEST.in ├── tests ├── PDF Bookmark Sample.pdf └── conftest.py ├── .gitmodules ├── bumpver.toml ├── karakeep_python_api ├── __init__.py ├── datatypes.py └── __main__.py ├── .pre-commit-config.yaml ├── setup.py ├── README.md └── LICENSE /.gitignore: -------------------------------------------------------------------------------- 1 | .aider* 2 | **/__pycache__ 3 | .env* 4 | *.egg-info 5 | *author* 6 | **/*log 7 | **/*.temp 8 | -------------------------------------------------------------------------------- /community_scripts/Karanki/README.md: -------------------------------------------------------------------------------- 1 | Moved to its own repository: [Karanki](https://github.com/thiswillbeyourgithub/Karanki/) 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the OpenAPI specification file in the distribution 2 | include karakeep_python_api/openapi_reference.json 3 | -------------------------------------------------------------------------------- /tests/PDF Bookmark Sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thiswillbeyourgithub/karakeep_python_api/HEAD/tests/PDF Bookmark Sample.pdf -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "community_scripts/Freshrss-To-Karakeep"] 2 | path = community_scripts/Freshrss-To-Karakeep 3 | url = https://github.com/thiswillbeyourgithub/freshrss_to_karakeep 4 | branch = main 5 | -------------------------------------------------------------------------------- /community_scripts/karakeep-time-tagger/karakeep-time-tagger.timer: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Run Karakeep-Time-Tagger every 4 hours 3 | Requires=karakeep-time-tagger.service 4 | 5 | [Timer] 6 | OnBootSec=15min 7 | OnUnitActiveSec=4h 8 | Persistent=true 9 | 10 | [Install] 11 | WantedBy=timers.target 12 | -------------------------------------------------------------------------------- /bumpver.toml: -------------------------------------------------------------------------------- 1 | [bumpver] 2 | current_version = "1.5.0" 3 | version_pattern = "MAJOR.MINOR.PATCH" 4 | commit_message = "bump version {old_version} -> {new_version}" 5 | tag_message = "{new_version}" 6 | tag_scope = "default" 7 | commit = true 8 | tag = true 9 | push = false 10 | 11 | [bumpver.file_patterns] 12 | "bumpver.toml" = ['current_version = "{version}"'] 13 | "setup.py" = ['version="{version}"'] 14 | "karakeep_python_api/karakeep_api.py" = ['VERSION: str = 
"{version}"'] 15 | -------------------------------------------------------------------------------- /karakeep_python_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Import API class and errors directly from the module 2 | from .karakeep_api import KarakeepAPI, APIError, AuthenticationError 3 | 4 | # Import the datatypes module so users can do `from karakeep_python_api.datatypes import ...` 5 | from . import datatypes 6 | 7 | # Define the package version 8 | # This is the single source of truth, read by setup.py and updated by bumpver. 9 | __version__ = KarakeepAPI.VERSION 10 | 11 | __all__ = [ 12 | "KarakeepAPI", 13 | "APIError", 14 | "AuthenticationError", 15 | "datatypes", # Expose the datatypes module 16 | "__version__", 17 | ] 18 | 19 | # Models are available via `from karakeep_python_api.datatypes import ...` 20 | -------------------------------------------------------------------------------- /community_scripts/karakeep-time-tagger/karakeep-time-tagger.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Add time-to-read tags to Karakeep bookmarks by calling Karakeep-Time-Tagger 3 | After=network-online.target 4 | Wants=network-online.target 5 | 6 | [Service] 7 | Type=oneshot 8 | # Update this path to your karakeep-python-api repository location 9 | WorkingDirectory=%h/repos/karakeep-python-api 10 | ExecStart=/usr/bin/python3 community_scripts/karakeep-time-tagger/karakeep-time-tagger.py --verbose=false 11 | # Uncomment and set path to environment file containing KARAKEEP_PYTHON_API_KEY and KARAKEEP_PYTHON_API_ENDPOINT 12 | EnvironmentFile=%h/.config/karakeep/env 13 | StandardOutput=journal 14 | StandardError=journal 15 | 16 | [Install] 17 | WantedBy=default.target 18 | -------------------------------------------------------------------------------- /community_scripts/karakeep-archive-before-date/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep Archive before date 2 | 3 | Small cleaning script to clean old article not archived after an import from another readlater app. 4 | 5 | ## Prerequisites 6 | 7 | N/A 8 | 9 | ## Usage 10 | 11 | Define a date to limit archiving. All not archived bookmarks before this date will be archived. 12 | 13 | ```bash 14 | python archiving_before_date.py --before-date 2023-12-24 15 | ``` 16 | 17 | `--before-date` format is `YYYY-MM-DD` 18 | 19 | You might need to set up environment variables for the Karakeep API client or pass them as arguments if the script supports it (e.g., `KARAKEEP_PYTHON_API_ENDPOINT` and `KARAKEEP_PYTHON_API_KEY`). Refer to the script's help or the `karakeep-python-api` documentation for more details on authentication. 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /community_scripts/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep Community Scripts 2 | 3 | This part of the repository is where Community Scripts are stored. 4 | 5 | Don't hesitate to create PR to add your own: 6 | - Think of a good name, ideally something that is easy to find when using search engines. And the name of the directory should ideally match the name of the script. 7 | - Include a README.md. 8 | - Include a VERSION variable, for example `VERSION: str = "1.0.0"`. 9 | - Mention your script to the table in the README.md at the root of the repository. 
10 | - If you think your script should be known of the entire community, think about adding it to the table in [the official Karakeep documentation](https://docs.karakeep.app/community-projects) 11 | - If possible run [ruff](https://github.com/astral-sh/ruff/) on your code before doing the PR. 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: pytest 5 | name: pytest 6 | # entry: pytest tests 7 | entry: pytest tests --quiet 8 | language: system 9 | pass_filenames: false 10 | always_run: true 11 | stages: [pre-merge-commit] 12 | # - repo: https://github.com/psf/black 13 | # rev: 22.10.0 14 | # hooks: 15 | # - id: black 16 | # args: ["--quiet"] 17 | # language: system 18 | # - repo: https://github.com/pycqa/isort 19 | # rev: 5.12.0 20 | # hooks: 21 | # - id: isort 22 | # args: ["--profile", "black", "--quiet"] 23 | # language: system 24 | 25 | # https://github.com/astral-sh/ruff-pre-commit 26 | repos: 27 | - repo: https://github.com/astral-sh/ruff-pre-commit 28 | # Ruff version. 29 | rev: v0.14.1 30 | hooks: 31 | # # Run the linter. 32 | # - id: ruff-check 33 | # args: [ --fix ] 34 | # Run the formatter. 35 | - id: ruff-format 36 | -------------------------------------------------------------------------------- /community_scripts/karakeep-list-to-tag/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep-List-To-Tag 2 | 3 | This script allows you to convert a list into a tag by adding a specified tag to all bookmarks in a specified list. 4 | 5 | ## Purpose 6 | 7 | Sometimes it's useful to "turn a list into a tag" so you can then create more flexible smart lists. For example, if you have a list called "Omnivore Imports", you can tag all those bookmarks with `#omnivore`, then create a smart list with the query `#omnivore -is:archived` to show only unarchived Omnivore bookmarks. 8 | 9 | ## Usage 10 | 11 | ```bash 12 | python karakeep-list-to-tag.py "My List Name" "my-tag" 13 | ``` 14 | 15 | This will: 16 | 1. Find the list with the specified name 17 | 2. Get all bookmarks from that list 18 | 3. Add the specified tag to each bookmark (skipping any that already have the tag) 19 | 20 | ## Example 21 | 22 | ```bash 23 | python karakeep-list-to-tag.py "Omnivore Imports" "omnivore" 24 | ``` 25 | 26 | After running this, you can create a smart list with query `#omnivore -is:archived` to filter your Omnivore bookmarks by archived status. 27 | 28 | --- 29 | *This documentation was created with assistance from [aider.chat](https://github.com/Aider-AI/aider/).* 30 | -------------------------------------------------------------------------------- /community_scripts/karakeep-remove-ai-tags/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep-Remove-AI-Tags 2 | 3 | This script allows you to identify and remove tags that were attached by AI and have no human attachments. 4 | 5 | ## Purpose 6 | 7 | Karakeep can automatically tag bookmarks using AI. Sometimes, you may want to clean up tags that were only added by AI and not by humans. This script helps you identify and remove such tags. 8 | 9 | ## Usage 10 | 11 | ```bash 12 | python karakeep-remove-ai-tags.py [--dry-run] 13 | ``` 14 | 15 | ### Parameters 16 | 17 | - `--dry-run`: Optional. 
If provided, the script will only list the tags that would be removed without actually removing them. 18 | 19 | ## What the script does 20 | 21 | This script will: 22 | 1. Fetch all tags from your Karakeep account 23 | 2. Identify tags that are attached by AI and have no human attachments 24 | 3. List these tags with their IDs and the number of AI attachments 25 | 4. If not in dry-run mode, ask for confirmation before removing the tags 26 | 5. Remove the confirmed tags 27 | 28 | ## Examples 29 | 30 | ### Dry run (preview only) 31 | 32 | ```bash 33 | python karakeep-remove-ai-tags.py --dry-run 34 | ``` 35 | 36 | This will list all tags that would be removed without actually removing them. 37 | 38 | ### Remove AI-only tags 39 | 40 | ```bash 41 | python karakeep-remove-ai-tags.py 42 | ``` 43 | 44 | This will list all tags that are attached by AI and have no human attachments, ask for confirmation, and then remove the confirmed tags. 45 | 46 | --- 47 | *This documentation was created with assistance from AI.* 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | from setuptools import find_packages, setup 4 | 5 | with open("README.md", "r") as readme: 6 | long_description = readme.read() 7 | 8 | setup( 9 | name="karakeep_python_api", 10 | version="1.5.0", 11 | description="Community python client for the Karakeep API.", # Simplified description 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/thiswillbeyourgithub/karakeep_python_api/", 15 | packages=find_packages(), 16 | include_package_data=True, 17 | classifiers=[ 18 | "Programming Language :: Python :: 3", 19 | "Operating System :: OS Independent", 20 | ], 21 | keywords=[ 22 | "rss", 23 | "karakeep", 24 | "hoarder", 25 | "data-hoarding", 26 | "python", 27 | "api", 28 | "feeds", 29 | "openapi", 30 | ], 31 | python_requires=">=3.9", 32 | install_requires=[ 33 | "requests >= 2.32.3", 34 | "loguru >= 0.7.3", 35 | "pydantic >= 2.0", # For data validation and modeling based on datatypes.py 36 | "click >= 8.0", # For the CLI 37 | ], 38 | extras_require={ 39 | "dev": [ 40 | # "openapi-pydantic >= 0.5.1", # For generating datatypes.py from OpenAPI spec 41 | "beartype >= 0.20.2", # Optional runtime type checking 42 | "pytest >= 8.3.4", 43 | "build >= 1.2.2.post1", 44 | "twine >= 6.1.0", 45 | "bumpver >= 2024.1130", 46 | ], 47 | }, 48 | entry_points={ 49 | "console_scripts": [ 50 | "karakeep=karakeep_python_api.__main__:cli", 51 | ], 52 | }, 53 | ) 54 | -------------------------------------------------------------------------------- /community_scripts/pocket2karakeep-archived/README.md: -------------------------------------------------------------------------------- 1 | # Pocket2Karakeep-Archived 2 | 3 | This script addresses an issue in Karakeep (as of version 0.24.1) where the "archived" status of bookmarks imported from Pocket is not preserved. See [karakeep issue #703](https://github.com/karakeep-app/karakeep/issues/703) for more details. 4 | 5 | This tool reads your Pocket export data and updates the corresponding bookmarks in your Karakeep instance to reflect their original "archived" status. 6 | 7 | ## Prerequisites 8 | 9 | 1. **Pocket Export Directory**: You need to have an export of your Pocket data. This should be a directory containing the `part_00000X.csv` files provided by Pocket. 
The script will automatically find and process all such files within the specified directory. 10 | 11 | ## Usage 12 | 13 | Ensure you have your Pocket export directory ready. Then, run the script: 14 | 15 | ```bash 16 | python pocket_archiving_status_updater.py --pocket-export-dir /path/to/your/pocket_export_directory 17 | ``` 18 | 19 | You might need to set up environment variables for the Karakeep API client or pass them as arguments if the script supports it (e.g., `KARAKEEP_PYTHON_API_ENDPOINT` and `KARAKEEP_PYTHON_API_KEY`). Refer to the script's help or the `karakeep-python-api` documentation for more details on authentication. 20 | 21 | The script will: 22 | 1. Scan the specified Pocket export directory for `part_00000X.csv` files. 23 | 2. Load and combine data from all found CSV files to identify articles that should be "Archived". 24 | 3. Fetch all bookmarks from your Karakeep instance. (This can take a while and is cached locally in `karakeep_bookmarks.temp` by default to speed up subsequent runs). 25 | 4. For each Pocket article marked as "Archived", it will find the corresponding bookmark in Karakeep (matching by URL or title) and update its status to "archived" if it's not already. 26 | 27 | 28 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-archived/README.md: -------------------------------------------------------------------------------- 1 | # Omnivore2Karakeep-archived 2 | 3 | This script addresses an issue in Karakeep (as of version 0.24.1) where the "archived" status of bookmarks imported from Omnivore is not preserved. See [karakeep issue #703](https://github.com/karakeep-app/karakeep/issues/703) for more details. 4 | 5 | This tool reads your Omnivore export data and updates the corresponding bookmarks in your Karakeep instance to reflect their original "archived" status. 6 | 7 | ## Prerequisites 8 | 9 | 1. **Omnivore Export Directory**: You need to have an export of your Omnivore data. This should be a directory containing the `metadata_X_to_Y.json` files provided by Omnivore. The script will automatically find and process all such files within the specified directory. 10 | 11 | ## Usage 12 | 13 | Ensure you have your Omnivore export directory ready. Then, run the script: 14 | 15 | ```bash 16 | python omnivore2karakeep-archived.py --omnivore-export-dir /path/to/your/omnivore_export_directory 17 | ``` 18 | 19 | You might need to set up environment variables for the Karakeep API client or pass them as arguments if the script supports it (e.g., `KARAKEEP_PYTHON_API_ENDPOINT` and `KARAKEEP_PYTHON_API_KEY`). Refer to the script's help or the `karakeep-python-api` documentation for more details on authentication. 20 | 21 | The script will: 22 | 1. Scan the specified Omnivore export directory for `metadata_*_to_*.json` files. 23 | 2. Load and combine data from all found JSON files to identify articles that should be "Archived". 24 | 3. Fetch all bookmarks from your Karakeep instance. (This can take a while and is cached locally in `karakeep_bookmarks.temp` by default to speed up subsequent runs). 25 | 4. For each Omnivore article marked as "Archived", it will find the corresponding bookmark in Karakeep (matching by URL or title) and update its status to "archived" if it's not already. 26 | 27 | --- 28 | This tool was developed with assistance from [aider.chat](https://github.com/Aider-AI/aider/). 
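For reference, the core Karakeep API interaction behind this script (and the Pocket variant) is roughly the sketch below. It is simplified: the real scripts add local caching, URL/title matching against the export, and retries. `KarakeepAPI()` reads `KARAKEEP_PYTHON_API_ENDPOINT` and `KARAKEEP_PYTHON_API_KEY` from the environment.

```python
from karakeep_python_api import KarakeepAPI

k = KarakeepAPI()  # credentials come from the environment variables above

# Page through every bookmark; 100 is the maximum page size the API allows.
bookmarks = []
page = k.get_all_bookmarks(include_content=False, limit=100)
bookmarks.extend(page.bookmarks)
while page.nextCursor:
    page = k.get_all_bookmarks(include_content=False, limit=100, cursor=page.nextCursor)
    bookmarks.extend(page.bookmarks)

# Archive a single bookmark (the real script first matches each export entry
# to a bookmark by URL or title before flipping this flag).
target = bookmarks[0]
if not target.archived:
    k.update_a_bookmark(bookmark_id=target.id, update_data={"archived": True})
```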
29 | -------------------------------------------------------------------------------- /community_scripts/karakeep-list-to-tag/karakeep-list-to-tag.py: -------------------------------------------------------------------------------- 1 | """Adds a specified tag to all bookmarks in a specified list.""" 2 | 3 | import json 4 | import fire 5 | from tqdm import tqdm 6 | from karakeep_python_api import KarakeepAPI 7 | 8 | 9 | def main(list_name: str, tag_to_add: str): 10 | """ 11 | Adds a specified tag to all bookmarks in a specified list. 12 | 13 | Parameters: 14 | list_name: Name of the list to get bookmarks from 15 | tag_to_add: Name of the tag to add to bookmarks 16 | """ 17 | k = KarakeepAPI() 18 | 19 | # Get all lists and find the one with the specified name 20 | lists = k.get_all_lists() 21 | target_list = None 22 | for l in lists: 23 | if l.name == list_name: 24 | target_list = l 25 | break 26 | 27 | if not target_list: 28 | print(f"List '{list_name}' not found") 29 | return 30 | 31 | list_id = target_list.id 32 | 33 | # Get all bookmarks from the list 34 | bookmarks = [] 35 | cursor = None 36 | while True: 37 | page = k.get_bookmarks_in_the_list( 38 | list_id=list_id, include_content=False, limit=50, cursor=cursor 39 | ) 40 | cursor = page.nextCursor 41 | new = page.bookmarks 42 | if not new: 43 | print("No new bookmarks") 44 | break 45 | bookmarks.extend(new) 46 | print(f"Added {len(new)} bookmarks, total is {len(bookmarks)}") 47 | if not cursor: 48 | print("No cursor") 49 | break 50 | 51 | # Add tag to bookmarks that don't already have it 52 | skipped = 0 53 | added = 0 54 | for b in tqdm(bookmarks): 55 | # Check if bookmark already has the tag 56 | existing_tag_names = [tag.name for tag in b.tags] if b.tags else [] 57 | if tag_to_add in existing_tag_names: 58 | tqdm.write(f"Skipping bookmark {b.id} - already has tag '{tag_to_add}'") 59 | skipped += 1 60 | continue 61 | 62 | out = k.attach_tags_to_a_bookmark( 63 | bookmark_id=b.id, 64 | tag_names=[tag_to_add], 65 | ) 66 | tqdm.write(f"Title: '{b.title}' Answer: '{json.dumps(out)}'") 67 | added += 1 68 | 69 | print( 70 | f"Added tag '{tag_to_add}' to {added} bookmarks, skipped {skipped} bookmarks that already had the tag" 71 | ) 72 | 73 | 74 | if __name__ == "__main__": 75 | fire.Fire(main) 76 | -------------------------------------------------------------------------------- /community_scripts/karakeep-time-tagger/README.md: -------------------------------------------------------------------------------- 1 | # Karakeep-Time-Tagger 2 | 3 | Automatically adds time-to-read tags to your Karakeep bookmarks based on content length. 
4 | 5 | ## What it does 6 | 7 | - Analyzes bookmark content (text and HTML) to estimate reading time 8 | - Adds appropriate time tags: `0-1m`, `1-5m`, `5-10m`, `10-15m`, `15-30m`, `30m+` 9 | - Removes conflicting time tags to ensure each bookmark has only one time estimate 10 | - Creates smart lists for each time category (can be disabled) 11 | - Supports both link and text bookmark types 12 | - Uses caching to speed up repeated runs 13 | 14 | ## Usage 15 | 16 | Basic usage with default settings (200 WPM): 17 | ```bash 18 | python karakeep-time-tagger.py 19 | ``` 20 | 21 | Customize reading speed: 22 | ```bash 23 | python karakeep-time-tagger.py --wpm 250 24 | ``` 25 | 26 | Process all bookmarks (including those already tagged): 27 | ```bash 28 | python karakeep-time-tagger.py --reset_all 29 | ``` 30 | 31 | Enable verbose logging: 32 | ```bash 33 | python karakeep-time-tagger.py --verbose 34 | ``` 35 | 36 | Use custom cache file location: 37 | ```bash 38 | python karakeep-time-tagger.py --cache_file ./my_bookmarks.cache 39 | ``` 40 | 41 | Skip creating smart lists: 42 | ```bash 43 | python karakeep-time-tagger.py --create_lists False 44 | ``` 45 | 46 | ## Options 47 | 48 | - `--wpm`: Words per minute reading speed (default: 200) 49 | - `--reset_all`: Process all bookmarks, even those already tagged (default: False) 50 | - `--verbose`: Show debug logs in console (default: False) 51 | - `--cache_file`: Path to bookmark cache file (default: ./bookmarks.temp) 52 | - `--create_lists`: Create smart lists for each time slot (default: True) 53 | 54 | ## Prerequisites 55 | 56 | - Karakeep API credentials configured (via environment variables or command line) 57 | - Python packages: `fire`, `tqdm`, `beautifulsoup4`, `loguru`, `karakeep-python-api` 58 | 59 | ## Behavior 60 | 61 | By default, the script skips bookmarks that already have exactly one time tag (assumes they're correct). It only processes: 62 | - Bookmarks with no time tags 63 | - Bookmarks with multiple conflicting time tags 64 | 65 | Use `--reset_all` to force reprocessing of all bookmarks. 66 | 67 | ## Caching 68 | 69 | The script caches downloaded bookmarks to speed up repeated runs during testing. Delete the cache file to force a fresh download from the API. 70 | 71 | --- 72 | 73 | *This tool was created with assistance from [aider.chat](https://github.com/Aider-AI/aider/).* 74 | -------------------------------------------------------------------------------- /community_scripts/karakeep-remove-ai-tags/karakeep-remove-ai-tags.py: -------------------------------------------------------------------------------- 1 | """Removes tags that are attached by AI and have no human attachments.""" 2 | 3 | import json 4 | import fire 5 | from tqdm import tqdm 6 | from karakeep_python_api import KarakeepAPI 7 | 8 | VERSION: str = "1.0.0" 9 | 10 | 11 | def main(dry_run: bool = False): 12 | """ 13 | Lists all tags and removes those that are attached by AI and have no human attachments. 
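Example (from the README): `python karakeep-remove-ai-tags.py --dry-run` previews the tags that would be removed without deleting anything.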
14 | 15 | Parameters: 16 | dry_run: If True, only lists the tags that would be removed without actually removing them 17 | """ 18 | k = KarakeepAPI() 19 | 20 | # Get all tags 21 | print("Fetching all tags...") 22 | tags = k.get_all_tags() 23 | print(f"Found {len(tags)} tags") 24 | 25 | # Identify tags that are attached by AI and have no human attachments 26 | ai_only_tags = [] 27 | for tag in tags: 28 | ai_count = tag.numBookmarksByAttachedType.ai or 0 29 | human_count = tag.numBookmarksByAttachedType.human or 0 30 | 31 | if ai_count > 0 and human_count == 0: 32 | ai_only_tags.append(tag) 33 | 34 | print( 35 | f"Found {len(ai_only_tags)} tags that are attached by AI and have no human attachments" 36 | ) 37 | 38 | # List the tags that will be removed 39 | if ai_only_tags: 40 | print("\nTags that will be removed:") 41 | for tag in ai_only_tags: 42 | print( 43 | f"- {tag.name} (ID: {tag.id}, AI attachments: {tag.numBookmarksByAttachedType.ai})" 44 | ) 45 | else: 46 | print("No tags to remove") 47 | return 48 | 49 | # If dry_run is True, don't actually remove the tags 50 | if dry_run: 51 | print("\nDRY RUN: No tags were removed") 52 | return 53 | 54 | # Confirm before removing tags 55 | confirm = input("\nAre you sure you want to remove these tags? (y/n): ") 56 | if confirm.lower() != "y": 57 | print("Operation cancelled") 58 | return 59 | 60 | # Remove the tags 61 | print("\nRemoving tags...") 62 | removed = 0 63 | for tag in tqdm(ai_only_tags): 64 | try: 65 | k.delete_a_tag(tag.id) 66 | tqdm.write(f"Removed tag: {tag.name} (ID: {tag.id})") 67 | removed += 1 68 | except Exception as e: 69 | tqdm.write(f"Error removing tag {tag.name} (ID: {tag.id}): {str(e)}") 70 | 71 | print(f"\nRemoved {removed} tags out of {len(ai_only_tags)} AI-only tags") 72 | 73 | 74 | if __name__ == "__main__": 75 | fire.Fire(main) 76 | -------------------------------------------------------------------------------- /community_scripts/karakeep-archive-before-date/archiving_before_date.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small script to clean old article not archived after an import from another readlater app. 3 | 4 | Parameters: 5 | before_date: Date in YYYY-MM-DD format. Articles created before this date will be archived. 6 | """ 7 | 8 | import time 9 | from datetime import datetime 10 | 11 | from Levenshtein import ratio 12 | import pickle 13 | from fire import Fire 14 | from typing import Optional 15 | from pathlib import Path 16 | import json 17 | import csv 18 | from karakeep_python_api import KarakeepAPI 19 | from tqdm import tqdm 20 | 21 | VERSION: str = "1.0.0" 22 | 23 | karakeep = KarakeepAPI(verbose=False) 24 | 25 | 26 | def main(before_date: str) -> None: 27 | """Archive articles created before the specified date. 
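Example (from the README): `python archiving_before_date.py --before-date 2023-12-24` archives every non-archived bookmark created before that date.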
28 | 29 | Args: 30 | before_date: Date string in YYYY-MM-DD format 31 | """ 32 | before_date = datetime.strptime(before_date, "%Y-%m-%d") 33 | 34 | n = karakeep.get_current_user_stats()["numBookmarks"] 35 | pbar = tqdm(total=n, desc="Fetching bookmarks") 36 | all_bm = [] 37 | batch_size = 100 # if you set it too high, you can crash the karakeep instance, 100 being the maximum allowed 38 | page = karakeep.get_all_bookmarks( 39 | include_content=False, 40 | limit=batch_size, 41 | ) 42 | all_bm.extend(page.bookmarks) 43 | pbar.update(len(all_bm)) 44 | while page.nextCursor: 45 | page = karakeep.get_all_bookmarks( 46 | include_content=False, 47 | limit=batch_size, 48 | cursor=page.nextCursor, 49 | ) 50 | all_bm.extend(page.bookmarks) 51 | pbar.update(len(page.bookmarks)) 52 | 53 | assert len(all_bm) == n, f"Only retrieved {len(all_bm)} bookmarks instead of {n}" 54 | pbar.close() 55 | 56 | failed = [] 57 | for bookmark in all_bm: 58 | # skip already archived 59 | if bookmark.archived: 60 | continue 61 | 62 | # tqdm.write(f"Creation Date: {bookmark.createdAt}") 63 | creation_date = datetime.strptime(bookmark.createdAt, "%Y-%m-%dT%H:%M:%S.%fZ") 64 | 65 | if creation_date > before_date: 66 | continue 67 | 68 | # do the archiving 69 | retries = 3 70 | for attempt in range(retries): 71 | try: 72 | res_arch = karakeep.update_a_bookmark( 73 | bookmark_id=bookmark.id, 74 | update_data={"archived": True}, 75 | ) 76 | break 77 | except Exception as e: 78 | if attempt == retries - 1: 79 | raise e 80 | tqdm.write(f"Update failed, retrying ({attempt + 1}/{retries})") 81 | time.sleep(1) 82 | if isinstance(res_arch, dict): 83 | assert res_arch["archived"], res_arch 84 | else: 85 | assert res_arch.archived, res_arch 86 | tqdm.write(f"Successfuly archived: {bookmark.title}") 87 | 88 | 89 | if __name__ == "__main__": 90 | Fire(main) 91 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import time 4 | import random 5 | import string 6 | from typing import Optional 7 | import beartype # to trigger the runtime typechecking 8 | from karakeep_python_api import KarakeepAPI, datatypes 9 | 10 | 11 | @pytest.fixture 12 | def karakeep_client(): 13 | """ 14 | Fixture that provides a configured Karakeep API client. 15 | 16 | Requires the following environment variables: 17 | - KARAKEEP_PYTHON_API_ENDPOINT 18 | - KARAKEEP_PYTHON_API_KEY 19 | - KARAKEEP_PYTHON_API_VERIFY_SSL (optional, defaults to true) 20 | """ 21 | api_endpoint = os.environ.get("KARAKEEP_PYTHON_API_ENDPOINT") 22 | api_key = os.environ.get("KARAKEEP_PYTHON_API_KEY") 23 | verify_ssl_str = os.environ.get("KARAKEEP_PYTHON_API_VERIFY_SSL", "true") 24 | 25 | if not api_endpoint or not api_key: 26 | missing = [] 27 | if not api_endpoint: 28 | missing.append("KARAKEEP_PYTHON_API_ENDPOINT") 29 | if not api_key: 30 | missing.append("KARAKEEP_PYTHON_API_KEY") 31 | pytest.skip( 32 | f"Missing required environment variables for Karakeep API tests: {', '.join(missing)}. Set these to run integration tests." 
33 | ) 34 | 35 | verify_ssl = verify_ssl_str.lower() in ("true", "1", "yes") 36 | 37 | # Instantiate the client using standard environment variables 38 | # KarakeepAPI constructor handles api_endpoint and api_key directly 39 | return KarakeepAPI( 40 | api_endpoint=api_endpoint, 41 | api_key=api_key, 42 | verify_ssl=verify_ssl, 43 | verbose=True, # Enable verbose logging for tests 44 | ) 45 | 46 | 47 | @pytest.fixture 48 | def managed_bookmark(karakeep_client: KarakeepAPI) -> datatypes.Bookmark: 49 | """ 50 | Fixture to create a bookmark before a test and delete it afterwards. 51 | Yields the created bookmark object. 52 | """ 53 | created_bookmark_id: Optional[str] = None 54 | # Generate unique URL and title to avoid collisions and aid debugging 55 | timestamp = int(time.time()) 56 | random_suffix = "".join(random.choices(string.ascii_lowercase + string.digits, k=6)) 57 | test_url = f"https://example.com/test_page_fixture_{timestamp}_{random_suffix}" 58 | original_title = f"Managed Fixture Bookmark {timestamp}-{random_suffix}" 59 | 60 | print( 61 | f"\n FIXTURE SETUP: Attempting to create bookmark (URL: {test_url}, Title: '{original_title}')" 62 | ) 63 | try: 64 | # Create the bookmark 65 | bookmark = karakeep_client.create_a_new_bookmark( 66 | type="link", url=test_url, title=original_title 67 | ) 68 | assert isinstance(bookmark, datatypes.Bookmark), ( 69 | "Fixture: create_a_new_bookmark should return a Bookmark model" 70 | ) 71 | assert bookmark.id, "Fixture: Created bookmark must have an ID" 72 | created_bookmark_id = bookmark.id 73 | print( 74 | f" FIXTURE SETUP: ✓ Successfully created bookmark with ID: {created_bookmark_id}" 75 | ) 76 | 77 | yield bookmark # Provide the bookmark to the test function 78 | 79 | finally: 80 | # Teardown: Delete the bookmark 81 | if created_bookmark_id: 82 | print( 83 | f"\n FIXTURE TEARDOWN: Attempting to delete bookmark ID: {created_bookmark_id}" 84 | ) 85 | try: 86 | karakeep_client.delete_a_bookmark(bookmark_id=created_bookmark_id) 87 | print( 88 | f" FIXTURE TEARDOWN: ✓ Successfully deleted bookmark ID: {created_bookmark_id}" 89 | ) 90 | except Exception as e: 91 | # Log error during teardown but don't let it mask original test failure 92 | print( 93 | f" FIXTURE TEARDOWN: ERROR during bookmark deletion for ID {created_bookmark_id}: {e}" 94 | ) 95 | else: 96 | print("\n FIXTURE TEARDOWN: No bookmark ID recorded, skipping deletion.") 97 | -------------------------------------------------------------------------------- /karakeep_python_api/datatypes.py: -------------------------------------------------------------------------------- 1 | """ 2 | The datatype file was originally generated by datamodel-codegen, then refactored and manually (or via LLMs) kept up to date with upstream 3 | """ 4 | 5 | from __future__ import annotations 6 | 7 | from enum import Enum 8 | from typing import List, Optional, Union, Literal 9 | 10 | from pydantic import BaseModel, Field, RootModel 11 | 12 | 13 | class StatusTypes(str, Enum): 14 | success = "success" 15 | failure = "failure" 16 | pending = "pending" 17 | 18 | 19 | class NumBookmarksByAttachedType(BaseModel): 20 | ai: Optional[int] = None 21 | human: Optional[int] = None 22 | 23 | 24 | class TagShort(BaseModel): 25 | id: str 26 | name: str 27 | attachedBy: Literal["ai", "human"] 28 | 29 | 30 | class Tag(BaseModel): 31 | id: str 32 | name: str 33 | numBookmarks: int 34 | numBookmarksByAttachedType: NumBookmarksByAttachedType 35 | 36 | 37 | class Type(str, Enum): 38 | link = "link" 39 | 40 | 41 | class 
ContentTypeLink(BaseModel): 42 | type: Literal["link"] = "link" 43 | url: str 44 | title: Optional[str] = None 45 | description: Optional[str] = None 46 | imageUrl: Optional[str] = None 47 | imageAssetId: Optional[str] = None 48 | screenshotAssetId: Optional[str] = None 49 | fullPageArchiveAssetId: Optional[str] = None 50 | precrawledArchiveAssetId: Optional[str] = None 51 | videoAssetId: Optional[str] = None 52 | favicon: Optional[str] = None 53 | htmlContent: Optional[str] = None 54 | contentAssetId: Optional[str] = None 55 | crawledAt: Optional[str] = None 56 | author: Optional[str] = None 57 | publisher: Optional[str] = None 58 | datePublished: Optional[str] = None 59 | dateModified: Optional[str] = None 60 | 61 | 62 | class ContentTypeUnknown(BaseModel): 63 | type: Literal["unknown"] = "unknown" 64 | 65 | 66 | class ContentTypeText(BaseModel): 67 | type: Literal["text"] = "text" 68 | text: str 69 | sourceUrl: Optional[str] = None 70 | 71 | 72 | class ContentTypeAsset(BaseModel): 73 | type: Literal["asset"] = "asset" 74 | assetType: Literal["image", "pdf"] 75 | assetId: str 76 | fileName: Optional[str] = None 77 | sourceUrl: Optional[str] = None 78 | size: Optional[float] = None 79 | content: Optional[str] = None 80 | 81 | 82 | class BookmarkAsset(BaseModel): 83 | id: str 84 | assetType: Literal[ 85 | "linkHtmlContent", 86 | "screenshot", 87 | "assetScreenshot", 88 | "bannerImage", 89 | "fullPageArchive", 90 | "video", 91 | "bookmarkAsset", 92 | "precrawledArchive", 93 | "userUploaded", 94 | "unknown", 95 | ] 96 | fileName: Optional[str] = None 97 | 98 | 99 | class Asset(BaseModel): 100 | assetId: str 101 | contentType: str 102 | size: float 103 | fileName: str 104 | 105 | 106 | class Bookmark(BaseModel): 107 | id: str 108 | createdAt: str 109 | modifiedAt: Optional[str] 110 | title: Optional[str] = None 111 | archived: bool 112 | favourited: bool 113 | taggingStatus: Literal["success", "failure", "pending"] 114 | summarizationStatus: Optional[Literal["success", "failure", "pending"]] = None 115 | note: Optional[str] = None 116 | summary: Optional[str] = None 117 | source: Optional[ 118 | Literal[ 119 | "api", "web", "cli", "mobile", "extension", "singlefile", "rss", "import" 120 | ] 121 | ] = None 122 | userId: str 123 | tags: List[TagShort] 124 | content: Union[ 125 | ContentTypeLink, ContentTypeText, ContentTypeAsset, ContentTypeUnknown 126 | ] 127 | assets: List[BookmarkAsset] 128 | 129 | 130 | class PaginatedBookmarks(BaseModel): 131 | bookmarks: List[Bookmark] 132 | nextCursor: Optional[str] = "" 133 | 134 | 135 | class ListModel(BaseModel): 136 | id: str 137 | name: str 138 | description: Optional[str] = None 139 | icon: str 140 | parentId: Optional[str] 141 | type: Optional[Literal["manual", "smart"]] = "manual" 142 | query: Optional[str] = None 143 | public: bool 144 | hasCollaborators: bool 145 | userRole: Literal["owner", "editor", "viewer", "public"] 146 | 147 | 148 | class Highlight(BaseModel): 149 | bookmarkId: str 150 | startOffset: float 151 | endOffset: float 152 | color: Optional[Literal["yellow", "red", "green", "blue"]] = "yellow" 153 | text: Optional[str] 154 | note: Optional[str] 155 | id: str 156 | userId: str 157 | createdAt: str 158 | 159 | 160 | class PaginatedHighlights(BaseModel): 161 | highlights: List[Highlight] 162 | nextCursor: Optional[str] = "" 163 | 164 | 165 | class PaginatedTags(BaseModel): 166 | tags: List[Tag] 167 | nextCursor: Optional[str] = "" 168 | 169 | 170 | class Backup(BaseModel): 171 | id: str 172 | userId: str 173 | assetId: 
Optional[str] 174 | createdAt: str 175 | size: float 176 | bookmarkCount: int 177 | status: Literal["pending", "success", "failure"] 178 | errorMessage: Optional[str] = None 179 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-highlights/README.md: -------------------------------------------------------------------------------- 1 | # Omnivore2Karakeep-highlights 2 | 3 | This script imports highlights from an Omnivore export to a Karakeep instance. It matches Omnivore bookmarks to existing Karakeep bookmarks and creates corresponding highlights with position information. 4 | 5 | *Note: This tool was developed with assistance from [aider.chat](https://github.com/Aider-AI/aider/).* 6 | 7 | ## Features 8 | 9 | - **Probabilistic bookmark matching**: Uses multiple strategies to match Omnivore bookmarks to Karakeep bookmarks, including exact URL matching, exact title matching, and fuzzy title matching with configurable thresholds 10 | - **Position detection**: Intelligently determines highlight positions within documents using multiple strategies including direct text matching and fuzzy matching algorithms 11 | - **Safe operation**: Only creates new highlights without modifying existing bookmarks or data - all operations are additive and reversible 12 | - **Metadata preservation**: Stores import metadata in each highlight's note field for full traceability and potential cleanup operations 13 | - **Caching**: Caches Karakeep bookmarks locally to avoid repeated API calls during development 14 | - **Progress tracking**: Shows progress bars for long-running operations 15 | - **Dry run mode**: Test the import process without actually creating highlights 16 | 17 | ## Important Notes 18 | 19 | ### Safety and Data Integrity 20 | 21 | ✅ **Database Safety:** 22 | - **Non-destructive**: This script only creates new highlights and never modifies or deletes existing bookmarks or highlights 23 | - **Additive operations**: All changes are purely additive to your Karakeep database 24 | - **Reversible**: Import metadata is stored in each highlight's note field, allowing for easy identification and cleanup if needed 25 | - **No data loss risk**: The import process cannot damage your existing Karakeep data 26 | 27 | ### Matching Process 28 | 29 | 🎯 **Probabilistic Matching:** 30 | - **Multi-strategy approach**: Uses exact URL matching, exact title matching, and fuzzy title matching as fallbacks 31 | - **Configurable thresholds**: Fuzzy matching uses a 95% similarity threshold by default, which can be adjusted 32 | - **Best-effort matching**: Some Omnivore bookmarks may not find matches due to title differences or missing URLs 33 | - **Manual review recommended**: Check the console output for unmatched bookmarks that may need manual attention 34 | 35 | ### Current Limitations 36 | 37 | ⚠️ **Known Limitations:** 38 | - **PDF highlights not supported**: The script currently skips PDF files by default (`skip_pdf=True`) as PDF highlight positioning is not yet implemented 39 | - **HTML content dependency**: Only works with web page bookmarks that have HTML content available in Karakeep 40 | - **Single highlight color**: All imported highlights use yellow color because Omnivore exports do not include color information - original highlight colors cannot be preserved (the default color can be modified in the code) 41 | - **No duplicate detection**: The script doesn't check if highlights already exist before creating them (relies on user to clean up 
duplicates if needed) 42 | 43 | ## Prerequisites 44 | 45 | 1. **Omnivore Export**: You need a complete Omnivore export containing: 46 | - `highlights/` directory with `.md` files 47 | - `content/` directory with `.html` and `.pdf` files 48 | - `metadata_*.json` files with bookmark information 49 | 50 | 2. **Karakeep Instance**: A running Karakeep instance with API access 51 | 52 | 3. **Environment Setup**: Karakeep API credentials configured via environment variables or parameters 53 | 54 | ## Installation 55 | 56 | Ensure you have the required dependencies installed: 57 | 58 | ```bash 59 | pip install karakeep-python-api fire tqdm pathlib beautifulsoup4 html2text markdown python-levenshtein 60 | ``` 61 | 62 | ## Usage 63 | 64 | ### Basic Usage (Dry Run) 65 | 66 | ```bash 67 | python omnivore2karakeep-highlights.py /path/to/omnivore/export 68 | ``` 69 | 70 | ### Actually Import Highlights 71 | 72 | ```bash 73 | python omnivore2karakeep-highlights.py /path/to/omnivore/export --dry=False 74 | ``` 75 | 76 | ### Include PDF Processing (Experimental) 77 | 78 | ```bash 79 | python omnivore2karakeep-highlights.py /path/to/omnivore/export --skip_pdf=False --dry=False 80 | ``` 81 | 82 | ### Custom Cache File Location 83 | 84 | ```bash 85 | python omnivore2karakeep-highlights.py /path/to/omnivore/export --karakeep_path=./my_bookmarks.temp --dry=False 86 | ``` 87 | 88 | ## Parameters 89 | 90 | - `omnivore_export_dir` (required): Path to the Omnivore export directory 91 | - `karakeep_path` (optional): Path for caching Karakeep bookmarks (default: `./karakeep_bookmarks.temp`) 92 | - `dry` (optional): If True, simulates the import without creating highlights (default: `True`) 93 | - `skip_pdf` (optional): If True, skips PDF files (default: `True`) 94 | 95 | ## How It Works 96 | 97 | 1. **Load Omnivore Data**: Reads metadata files and highlight files from the export 98 | 2. **Cache Karakeep Bookmarks**: Fetches all bookmarks from Karakeep (cached locally for performance) 99 | 3. **Match Bookmarks**: For each Omnivore highlight file: 100 | - Finds the corresponding Omnivore bookmark metadata 101 | - Matches it to a Karakeep bookmark using: 102 | - Exact URL matching 103 | - Exact title matching 104 | - Fuzzy title matching (95% threshold) 105 | 4. **Position Detection**: Determines highlight positions using multiple strategies: 106 | - Direct text matching in plain text 107 | - Markdown content matching with position scaling 108 | - Fuzzy matching using Levenshtein distance 109 | - Link extraction for link-only highlights 110 | 5. **Create Highlights**: Creates highlights in Karakeep with calculated positions and stores import metadata in the note field for future reference 111 | 112 | ## Expected Directory Structure 113 | 114 | Your Omnivore export should have this structure: 115 | 116 | ``` 117 | omnivore_export/ 118 | ├── highlights/ 119 | │ ├── article1.md 120 | │ ├── article2.md 121 | │ └── ... 122 | ├── content/ 123 | │ ├── article1.html 124 | │ ├── article2.pdf 125 | │ └── ... 
126 | └── metadata_YYYY-MM-DD_to_YYYY-MM-DD.json 127 | ``` 128 | 129 | ## Environment Variables 130 | 131 | Set up your Karakeep API credentials: 132 | 133 | ```bash 134 | export KARAKEEP_PYTHON_API_ENDPOINT="https://your-instance.com/api/v1/" 135 | export KARAKEEP_PYTHON_API_KEY="your-api-key" 136 | ``` 137 | 138 | ## Highlight Metadata and Cleanup 139 | 140 | ### Stored Metadata 141 | 142 | Each imported highlight includes metadata in its note field containing: 143 | - Original Omnivore bookmark information 144 | - Import timestamp 145 | - Matching strategy used 146 | - Position detection method 147 | - Source file information 148 | 149 | This metadata enables: 150 | - **Full traceability** of imported highlights 151 | - **Easy identification** of imported vs. native highlights 152 | - **Batch cleanup operations** if needed 153 | - **Debugging** of matching and positioning issues 154 | 155 | ### Cleanup Operations 156 | 157 | To identify all imported highlights, you can search for highlights containing specific metadata markers in their notes. The metadata format allows for easy filtering and bulk operations if you need to remove imported highlights later. 158 | 159 | ## Troubleshooting 160 | 161 | ### Common Issues 162 | 163 | 1. **"Could not find bookmark"**: The script couldn't match an Omnivore bookmark to a Karakeep bookmark 164 | - This is normal for some bookmarks due to probabilistic matching 165 | - Ensure the bookmark exists in Karakeep with similar URL or title 166 | - Check if URLs or titles match between systems 167 | - Consider manual import for important unmatched highlights 168 | 169 | 2. **"No HTML content available"**: The Karakeep bookmark doesn't have HTML content 170 | - Some bookmarks may not have been fully processed by Karakeep 171 | - Wait for Karakeep to finish processing the bookmark content 172 | - Manual content extraction may be needed for complex cases 173 | 174 | 3. **"Could not match highlight text to corpus"**: The highlight text couldn't be located in the document 175 | - This may happen with heavily formatted content or dynamic web pages 176 | - The highlight will be skipped but logged for manual review 177 | - Consider manual review of problematic highlights 178 | 179 | ### Performance Notes 180 | 181 | - The initial bookmark cache creation can take several minutes for large Karakeep instances 182 | - The cache file is automatically deleted upon successful completion 183 | - Subsequent runs use the cached data for faster processing 184 | - Monitor console output for matching statistics and any issues 185 | 186 | ## Version 187 | 188 | Current version: 0.0.1 189 | 190 | ## Contributing 191 | 192 | This script is part of the karakeep-python-api project. Please report issues or contribute improvements through the main repository. 193 | -------------------------------------------------------------------------------- /community_scripts/pocket2karakeep-archived/pocket_archiving_status_updater.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small script made to solve the karakeep issue where Pocket's imported document would not preserve the "archived" value. 3 | 4 | This script reads a CSV file exported from Pocket with the following format: 5 | title,url,time_added,tags,status 6 | 7 | It identifies entries with status "archive" and updates their status in Karakeep. 
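Example: `python pocket_archiving_status_updater.py --pocket-export-dir /path/to/pocket_export` (the path may also point directly at a single CSV file).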
8 | 9 | """ 10 | 11 | import time 12 | 13 | from Levenshtein import ratio 14 | import pickle 15 | from fire import Fire 16 | from typing import Optional 17 | from pathlib import Path 18 | import json 19 | import csv 20 | from karakeep_python_api import KarakeepAPI 21 | from tqdm import tqdm 22 | 23 | VERSION: str = "1.1.0" 24 | 25 | karakeep = KarakeepAPI(verbose=False) 26 | 27 | 28 | def get_pocket_archived(pocket_export_dir: str) -> list[dict]: 29 | """ 30 | Loads and parses a CSV file from the specified directory. 31 | Filters and returns a list of articles that are marked as "archive" in the status column. 32 | 33 | CSV format: 34 | title,url,time_added,tags,status 35 | """ 36 | export_dir = Path(pocket_export_dir) 37 | all_data: list[dict] = [] 38 | 39 | # Check if the provided path is a file or directory 40 | if export_dir.is_file(): 41 | csv_file = export_dir 42 | else: 43 | # Look for CSV files in the directory 44 | csv_files = list(export_dir.glob("*.csv")) 45 | if not csv_files: 46 | print(f"Warning: No CSV files found in {pocket_export_dir}.") 47 | return [] 48 | # Use the first CSV file found 49 | csv_file = csv_files[0] 50 | 51 | try: 52 | with open(csv_file, "r", encoding="utf-8") as f: 53 | reader = csv.DictReader(f) 54 | for row in reader: 55 | all_data.append(row) 56 | except Exception as e: 57 | print(f"Warning: Could not read or process {csv_file.name}: {e}") 58 | return [] 59 | 60 | if not all_data: 61 | print(f"Warning: No data loaded from {csv_file}.") 62 | return [] 63 | 64 | # Filter for articles with status "archive" 65 | archived = [] 66 | for d in all_data: 67 | if d.get("status", "").lower() == "archive": 68 | # Ensure the dictionary has the required fields 69 | if "url" not in d: 70 | print(f"Warning: Entry missing URL: {d}") 71 | continue 72 | 73 | # Create a dictionary with the expected structure 74 | archived_entry = { 75 | "url": d["url"], 76 | "title": d.get("title", ""), # Use empty string if title is missing 77 | "time_added": d.get("time_added", ""), 78 | "tags": d.get("tags", ""), 79 | "state": "Archived", # Add state field for compatibility with the rest of the script 80 | } 81 | archived.append(archived_entry) 82 | 83 | return archived 84 | 85 | 86 | def main( 87 | pocket_export_dir: str, 88 | karakeep_path: Optional[str] = "./karakeep_bookmarks.temp", 89 | ) -> None: 90 | archived = get_pocket_archived(pocket_export_dir) 91 | 92 | if not archived: 93 | print("No archived Pocket articles found or loaded. 
Exiting.") 94 | return 95 | 96 | # fetch all the bookmarks from karakeep, as the search feature is unreliable 97 | # as the loading can be pretty long, we store it to a local file 98 | if Path(karakeep_path).exists(): 99 | with Path(karakeep_path).open("rb") as f: 100 | all_bm = pickle.load(f) 101 | else: 102 | n = karakeep.get_current_user_stats()["numBookmarks"] 103 | pbar = tqdm(total=n, desc="Fetching bookmarks") 104 | all_bm = [] 105 | batch_size = 100 # if you set it too high, you can crash the karakeep instance, 100 being the maximum allowed 106 | page = karakeep.get_all_bookmarks( 107 | include_content=False, 108 | limit=batch_size, 109 | ) 110 | all_bm.extend(page.bookmarks) 111 | pbar.update(len(all_bm)) 112 | while page.nextCursor: 113 | page = karakeep.get_all_bookmarks( 114 | include_content=False, 115 | limit=batch_size, 116 | cursor=page.nextCursor, 117 | ) 118 | all_bm.extend(page.bookmarks) 119 | pbar.update(len(page.bookmarks)) 120 | 121 | assert len(all_bm) == n, ( 122 | f"Only retrieved {len(all_bm)} bookmarks instead of {n}" 123 | ) 124 | pbar.close() 125 | 126 | with Path(karakeep_path).open("wb") as f: 127 | pickle.dump(all_bm, f) 128 | 129 | failed = [] 130 | for pocket in tqdm(archived, desc="Archiving", unit="doc"): 131 | url = pocket["url"] 132 | 133 | found_it = False 134 | for bookmark in all_bm: 135 | found_url = None 136 | content = bookmark.content 137 | if hasattr(content, "url"): 138 | found_url = content.url 139 | elif hasattr(content, "sourceUrl"): 140 | found_url = content.sourceUrl 141 | else: 142 | found_url = "" 143 | 144 | if found_url == url: 145 | found_it = True 146 | break 147 | 148 | # couldn't find a matching url, match by title 149 | # exact title match: 150 | if ( 151 | "title" in pocket 152 | and pocket["title"] 153 | and hasattr(content, "title") 154 | and content.title 155 | ): 156 | if pocket["title"].lower() == content.title.lower(): 157 | found_it = True 158 | break 159 | if ( 160 | "title" in pocket 161 | and pocket["title"] 162 | and hasattr(bookmark, "title") 163 | and bookmark.title 164 | ): 165 | if pocket["title"].lower() == bookmark.title.lower(): 166 | found_it = True 167 | break 168 | 169 | # fuzzy matching, as a last resort 170 | threshold = 0.95 171 | if ( 172 | "title" in pocket 173 | and pocket["title"] 174 | and hasattr(content, "title") 175 | and content.title 176 | ): 177 | r = ratio(pocket["title"].lower(), content.title.lower()) 178 | if r >= threshold: 179 | found_it = True 180 | # breakpoint() 181 | break 182 | 183 | if ( 184 | "title" in pocket 185 | and pocket["title"] 186 | and hasattr(bookmark, "title") 187 | and bookmark.title 188 | ): 189 | r = ratio(pocket["title"].lower(), bookmark.title.lower()) 190 | if r >= threshold: 191 | found_it = True 192 | break 193 | 194 | # couldn't be found 195 | if not found_it: 196 | failed.append(pocket) 197 | tqdm.write(f"Failed to find {url}") 198 | # breakpoint() 199 | with open("./omnivore_archiver_failed.txt", "a") as f: 200 | f.write(f"\n{pocket}") 201 | continue 202 | 203 | # skip already archived 204 | if bookmark.archived: 205 | tqdm.write(f"Already archived: {url}") 206 | continue 207 | for attempt in range(5): 208 | try: 209 | fresh = karakeep.get_a_single_bookmark( 210 | bookmark_id=bookmark.id, include_content=False 211 | ) 212 | break 213 | except Exception as e: 214 | if attempt == 4: 215 | raise e 216 | tqdm.write(f"Get single bookmark failed, retrying ({attempt + 1}/5)") 217 | time.sleep(1) 218 | if fresh.archived: 219 | tqdm.write(f"Already archived: {url}") 
220 | continue 221 | 222 | # do the archiving 223 | retries = 10 224 | for attempt in range(retries): 225 | try: 226 | res_arch = karakeep.update_a_bookmark( 227 | bookmark_id=bookmark.id, 228 | update_data={"archived": True}, 229 | ) 230 | break 231 | except Exception as e: 232 | if attempt == retries - 1: 233 | raise e 234 | tqdm.write(f"Update failed, retrying ({attempt + 1}/{retries})") 235 | time.sleep(1) 236 | if isinstance(res_arch, dict): 237 | assert res_arch["archived"], res_arch 238 | else: 239 | assert res_arch.archived, res_arch 240 | tqdm.write(f"Succesfuly archived: {url}") 241 | 242 | 243 | if __name__ == "__main__": 244 | Fire(main) 245 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-archived/omnivore2karakeep-archived.py: -------------------------------------------------------------------------------- 1 | """ 2 | Small script made to solve the karakeep issue where Omnivore's imported document would not preserve the "archived" value. 3 | 4 | Link: https://github.com/karakeep-app/karakeep/issues/703 5 | 6 | """ 7 | 8 | from Levenshtein import ratio 9 | import pickle 10 | from typing import Optional 11 | from fire import Fire 12 | from pathlib import Path 13 | import json 14 | from karakeep_python_api import KarakeepAPI 15 | from tqdm import tqdm 16 | from loguru import logger 17 | 18 | # Configure loguru to log debug messages to a local file 19 | logger.add("omniore2karakeep-archived.log", level="DEBUG", rotation="10 MB") 20 | 21 | karakeep = KarakeepAPI(verbose=False) 22 | 23 | VERSION: str = "2.0.0" 24 | 25 | 26 | def match_omnivore_to_bookmark(omnivore: dict, bookmark) -> tuple[bool, float]: 27 | """ 28 | Determines if an Omnivore article matches a Karakeep bookmark. 29 | 30 | Uses URL matching first, then title matching (exact and fuzzy). 
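Fuzzy title matching uses a Levenshtein ratio with a 0.95 similarity threshold, and Omnivore-hosted URLs (local PDFs) are ignored during URL matching.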
31 | 32 | Parameters: 33 | - omnivore: Omnivore article dictionary 34 | - bookmark: Karakeep bookmark object 35 | 36 | Returns: 37 | - tuple[bool, float]: (is_match, ratio) where ratio is 1.0 for exact matches 38 | and Levenshtein ratio for fuzzy matches 39 | """ 40 | url = omnivore["url"] 41 | 42 | # Try URL matching first 43 | found_url = None 44 | content = bookmark.content 45 | if hasattr(content, "url"): 46 | found_url = content.url 47 | elif hasattr(content, "sourceUrl"): 48 | found_url = content.sourceUrl 49 | else: 50 | raise ValueError(content) 51 | 52 | # handling local PDF, they don't have proper url 53 | if found_url and found_url.startswith("https://omnivore.app"): 54 | found_url = None 55 | 56 | if found_url == url: 57 | return True, 1.0 58 | 59 | # couldn't find a matching url, match by title 60 | # exact title match: 61 | if ( 62 | "title" in omnivore 63 | and omnivore["title"] 64 | and hasattr(content, "title") 65 | and content.title 66 | ): 67 | if omnivore["title"].lower() == content.title.lower(): 68 | return True, 1.0 69 | if ( 70 | "title" in omnivore 71 | and omnivore["title"] 72 | and hasattr(bookmark, "title") 73 | and bookmark.title 74 | ): 75 | if omnivore["title"].lower() == bookmark.title.lower(): 76 | return True, 1.0 77 | 78 | # fuzzy matching, as a last resort 79 | threshold = 0.95 80 | best_ratio = 0.0 81 | 82 | if ( 83 | "title" in omnivore 84 | and omnivore["title"] 85 | and hasattr(content, "title") 86 | and content.title 87 | ): 88 | r = ratio(omnivore["title"].lower(), content.title.lower()) 89 | best_ratio = max(best_ratio, r) 90 | 91 | if ( 92 | "title" in omnivore 93 | and omnivore["title"] 94 | and hasattr(bookmark, "title") 95 | and bookmark.title 96 | ): 97 | r = ratio(omnivore["title"].lower(), bookmark.title.lower()) 98 | best_ratio = max(best_ratio, r) 99 | 100 | if best_ratio >= threshold: 101 | return True, best_ratio 102 | 103 | return False, best_ratio 104 | 105 | 106 | def get_omnivores_archived( 107 | omnivore_export_dir: str, 108 | read_threshold: int = 80, 109 | treat_read_as_archived: bool = True, 110 | ) -> list[dict]: 111 | """ 112 | Loads and concatenates all Omnivore metadata JSON files from the specified directory. 113 | Filters and returns a list of articles that are marked as "Archived". 114 | 115 | Parameters: 116 | - read_threshold: Reading progress percentage threshold to consider an article as "read" (default: 80) 117 | - treat_read_as_archived: If True, treat articles above read_threshold as archived (default: True) 118 | """ 119 | export_dir = Path(omnivore_export_dir) 120 | all_data: list[dict] = [] 121 | 122 | # Find all metadata_*.json files, load, and concatenate their lists 123 | for json_file in export_dir.glob("metadata_*_to_*.json"): 124 | try: 125 | content = json_file.read_text() 126 | data: list[dict] = json.loads(content) 127 | all_data.extend(data) 128 | except json.JSONDecodeError as e: 129 | logger.warning(f"Could not decode JSON from {json_file.name}: {e}") 130 | except Exception as e: 131 | logger.warning(f"Could not read or process {json_file.name}: {e}") 132 | 133 | if not all_data: 134 | logger.warning( 135 | f"No data loaded from {omnivore_export_dir}. Ensure 'metadata_*_to_*.json' files exist and are valid." 
136 | ) 137 | return [] 138 | 139 | # figure out which should have been archived 140 | data = all_data # Use the concatenated data 141 | active = [] 142 | archived = [] 143 | read = [] 144 | unknown = [] 145 | for d in data: 146 | if int(d["readingProgress"]) > read_threshold: 147 | read.append(d) 148 | if d["state"] == "Archived": 149 | archived.append(d) 150 | elif d["state"] == "Active": 151 | active.append(d) 152 | elif d["state"] == "Unknown": 153 | unknown.append(d) 154 | else: 155 | raise ValueError(json.dumps(d)) 156 | 157 | # If treat_read_as_archived is True, add read articles to archived list (avoiding duplicates) 158 | if treat_read_as_archived: 159 | archived_urls = { 160 | d["url"] for d in archived 161 | } # Create set of already archived URLs 162 | for read_article in read: 163 | if read_article["url"] not in archived_urls: 164 | archived.append(read_article) 165 | 166 | return archived 167 | 168 | 169 | def main( 170 | omnivore_export_dir: str, 171 | karakeep_temp_path: Optional[str] = "./karakeep_bookmarks.temp", 172 | read_threshold: int = 80, 173 | treat_read_as_archived: bool = True, 174 | ) -> None: 175 | assert Path(omnivore_export_dir).exists(), "Omnivore export dir does not exist" 176 | assert Path(omnivore_export_dir).is_dir(), "Omnivore export dir is not a dir" 177 | archived = get_omnivores_archived( 178 | omnivore_export_dir, read_threshold, treat_read_as_archived 179 | ) 180 | 181 | if not archived: 182 | logger.info("No archived Omnivore articles found or loaded. Exiting.") 183 | return 184 | 185 | # fetch all the bookmarks from karakeep, as the search feature is unreliable 186 | # as the loading can be pretty long, we store it to a local file 187 | if Path(karakeep_temp_path).exists(): 188 | with Path(karakeep_temp_path).open("rb") as f: 189 | all_bm = pickle.load(f) 190 | else: 191 | n = karakeep.get_current_user_stats()["numBookmarks"] 192 | pbar = tqdm(total=n, desc="Fetching bookmarks") 193 | all_bm = [] 194 | batch_size = 100 # if you set it too high, you can crash the karakeep instance, 100 being the maximum allowed 195 | page = karakeep.get_all_bookmarks( 196 | include_content=False, 197 | limit=batch_size, 198 | ) 199 | all_bm.extend(page.bookmarks) 200 | pbar.update(len(all_bm)) 201 | while page.nextCursor: 202 | page = karakeep.get_all_bookmarks( 203 | include_content=False, 204 | limit=batch_size, 205 | cursor=page.nextCursor, 206 | ) 207 | all_bm.extend(page.bookmarks) 208 | pbar.update(len(page.bookmarks)) 209 | 210 | assert len(all_bm) == n, ( 211 | f"Only retrieved {len(all_bm)} bookmarks instead of {n}" 212 | ) 213 | pbar.close() 214 | 215 | with Path(karakeep_temp_path).open("wb") as f: 216 | pickle.dump(all_bm, f) 217 | 218 | failed = [] 219 | for omnivore in tqdm(archived, desc="Archiving", unit="doc"): 220 | url = omnivore["url"] 221 | 222 | # Collect all potential matches with their ratios 223 | potential_matches = [] 224 | for bookmark in all_bm: 225 | is_match, match_ratio = match_omnivore_to_bookmark(omnivore, bookmark) 226 | if is_match: 227 | potential_matches.append((bookmark, match_ratio)) 228 | 229 | # couldn't be found 230 | if not potential_matches: 231 | failed.append(omnivore) 232 | tqdm.write(f"Failed to find {url}") 233 | with open("./omnivore_archiver_failed.txt", "a") as f: 234 | f.write(f"\n{omnivore}") 235 | continue 236 | 237 | # Choose the bookmark with the highest ratio 238 | bookmark = max(potential_matches, key=lambda x: x[1])[0] 239 | 240 | # skip already archived 241 | if bookmark.archived: 242 | 
tqdm.write(f"Already archived: {url}") 243 | continue 244 | fresh = karakeep.get_a_single_bookmark( 245 | bookmark_id=bookmark.id, include_content=False 246 | ) 247 | if fresh.archived: 248 | tqdm.write(f"Already archived: {url}") 249 | continue 250 | 251 | # do the archiving 252 | res_arch = karakeep.update_a_bookmark( 253 | bookmark_id=bookmark.id, 254 | update_data={"archived": True}, 255 | ) 256 | assert res_arch["archived"], res_arch 257 | tqdm.write(f"Successfully archived: {url}") 258 | 259 | # Clean up the temporary file since everything worked successfully 260 | if Path(karakeep_temp_path).exists(): 261 | Path(karakeep_temp_path).unlink() 262 | 263 | 264 | if __name__ == "__main__": 265 | Fire(main) 266 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Karakeep Python API Client 2 | 3 | [![PyPI version](https://badge.fury.io/py/karakeep-python-api.svg)](https://badge.fury.io/py/karakeep-python-api) 4 | 5 | A community-developed Python client for the [Karakeep](https://karakeep.app/) API. 6 | 7 | **Disclaimer:** This is an unofficial, community-driven project. The developers of Karakeep were not consulted during its creation. Use at your own discretion. 8 | 9 | ## Table of Contents 10 | 11 | - [Overview](#overview) 12 | - [Current Status & Caveats](#current-status--caveats) 13 | - [API Method Coverage](#api-method-coverage) 14 | - [Installation](#installation) 15 | - [Usage](#usage) 16 | - [Environment Variables](#environment-variables) 17 | - [Command Line Interface (CLI)](#command-line-interface-cli) 18 | - [Python Library](#python-library) 19 | - [Community Scripts](#community-scripts) 20 | - [Development](#development) 21 | 22 | ## Overview 23 | 24 | This library provides a Python interface (both a class and a command-line tool) to interact with a Karakeep instance's API. The author also developed [freshrss_to_karakeep](https://github.com/thiswillbeyourgithub/freshrss_to_karakeep), a Python script that periodically sends FreshRSS "favourite" articles to Karakeep (a bookmarking and read-it-later app, see [Karakeep on GitHub](https://github.com/karakeep-app/karakeep)). 25 | 26 | The development process involved: 27 | 28 | 1. Starting with the official Karakeep OpenAPI specification: [karakeep-openapi-spec.json](https://github.com/karakeep-app/karakeep/blob/main/packages/open-api/karakeep-openapi-spec.json). 29 | 2. Generating Pydantic data models from the specification using [datamodel-code-generator](https://koxudaxi.github.io/datamodel-code-generator/). 30 | 3. Using [aider.chat](https://aider.chat), an AI pair programming tool, to write the `KarakeepAPI` client class, the Click-based CLI, and the initial Pytest suite. 31 | 32 | ## Current Status & Caveats 33 | 34 | * **Experimental Methods:** The included Pytest suite currently only covers a subset of the available API methods (primarily 'get all' endpoints and client initialization). Methods *not* explicitly tested should be considered **experimental**. 35 | * **Ongoing Development:** The author intends to improve and validate methods as they are needed for personal use cases. Contributions and bug reports are welcome! 36 | * **Updating process**: I keep local scripts that track changes to the OpenAPI spec on the server side. From time to time I use aider to make the code reflect the latest spec and push that to the `dev` branch.
When there is a new karakeep release, I will merge the `dev` branch to the `main` branch and create my own release. This way, `dev` hopefully is up to date with the latest code of karakeep, while `main` is up to date with the latest release of karakeep. 37 | 38 | ## API Method Coverage 39 | 40 | The following table lists the public methods available in the `KarakeepAPI` class. 41 | * The "Pytest" column indicates whether the Python library method is covered by the automated test suite (`tests/test_karakeep_api.py`). 42 | * The "CLI" column indicates whether the corresponding CLI command for that method is tested within the Pytest suite (typically via `subprocess`). 43 | Methods or CLI commands marked with ❌ should be used with caution as their behavior has not been automatically verified within the test suite. 44 | 45 | | Method Name | Pytest | CLI | Remarks | 46 | | -------------------------------- | :----: | :--: | -------------------------------------------- | 47 | | `get_all_bookmarks` | ✅ | ✅ | Tested with pagination. | 48 | | `create_a_new_bookmark` | ✅ | ❌ | Pytest for `type="link"` via fixture and `type="asset"` via PDF test. CLI not directly tested. | 49 | | `search_bookmarks` | ✅ | ✅ | Seems to be nondeterministic and fails if using more than 3 words | 50 | | `get_a_single_bookmark` | ✅ | ❌ | | 51 | | `delete_a_bookmark` | ✅ | ❌ | | 52 | | `update_a_bookmark` | ✅ | ✅ | Tested for title updates. | 53 | | `summarize_a_bookmark` | ❌ | ❌ | | 54 | | `attach_tags_to_a_bookmark` | ✅ | ❌ | | 55 | | `detach_tags_from_a_bookmark` | ✅ | ❌ | | 56 | | `get_highlights_of_a_bookmark` | ❌ | ❌ | Works from the CLI; not yet added to Pytest. | 57 | | `attach_asset` | ❌ | ❌ | | 58 | | `replace_asset` | ❌ | ❌ | | 59 | | `detach_asset` | ❌ | ❌ | | 60 | | `get_all_lists` | ✅ | ✅ | | 61 | | `create_a_new_list` | ✅ | ❌ | | 62 | | `get_a_single_list` | ✅ | ❌ | | 63 | | `delete_a_list` | ✅ | ❌ | | 64 | | `update_a_list` | ❌ | ❌ | | 65 | | `get_bookmarks_in_the_list` | ❌ | ❌ | | 66 | | `add_a_bookmark_to_a_list` | ❌ | ❌ | | 67 | | `remove_a_bookmark_from_a_list` | ❌ | ❌ | | 68 | | `get_all_tags` | ✅ | ✅ | | 69 | | `create_a_new_tag` | ❌ | ❌ | | 70 | | `get_a_single_tag` | ✅ | ❌ | | 71 | | `delete_a_tag` | ✅ | ❌ | | 72 | | `update_a_tag` | ✅ | ❌ | No output validation due to [server bug](https://github.com/karakeep-app/karakeep/issues/1365). | 73 | | `get_bookmarks_with_the_tag` | ❌ | ❌ | | 74 | | `get_all_highlights` | ✅ | ✅ | Tested with pagination. | 75 | | `create_a_new_highlight` | ❌ | ❌ | | 76 | | `get_a_single_highlight` | ❌ | ❌ | | 77 | | `delete_a_highlight` | ❌ | ❌ | Works from the CLI; not yet added to Pytest. | 78 | | `update_a_highlight` | ❌ | ❌ | | 79 | | `upload_a_new_asset` | ✅ | ❌ | Tested in PDF asset lifecycle test. | 80 | | `get_a_single_asset` | ✅ | ❌ | Tested in PDF asset lifecycle test. | 81 | | `get_current_user_info` | ✅ | ❌ | Pytest: Tested indirectly during client init. CLI not directly tested. | 82 | | `get_current_user_stats` | ✅ | ✅ | | 83 | | `update_user` | ❌ | ❌ | | 84 | | `get_all_backups` | ✅ | ✅ | Tested in backup lifecycle test. | 85 | | `trigger_a_new_backup` | ✅ | ❌ | Tested in backup lifecycle test. | 86 | | `get_a_single_backup` | ✅ | ❌ | Tested in backup lifecycle test. | 87 | | `delete_a_backup` | ✅ | ❌ | Tested in backup lifecycle test. | 88 | | `download_a_backup` | ✅ | ❌ | Tested in backup lifecycle test. 
| 89 | 90 | ## Installation 91 | 92 | It is recommended to use `uv` for faster installation: 93 | 94 | ```bash 95 | uv pip install karakeep-python-api 96 | ``` 97 | 98 | Alternatively, use standard `pip`: 99 | 100 | ```bash 101 | pip install karakeep-python-api 102 | ``` 103 | 104 | ## Usage 105 | 106 | This package can be used as a Python library or as a command-line interface (CLI). 107 | 108 | ### Environment Variables 109 | 110 | The client can be configured using the following environment variables: 111 | 112 | * `KARAKEEP_PYTHON_API_ENDPOINT`: **Required**. The full URL of your Karakeep API, including the `/api/v1/` path (e.g., `https://karakeep.domain.com/api/v1/` or `https://try.karakeep.app/api/v1/`). 113 | * `KARAKEEP_PYTHON_API_KEY`: **Required**. Your Karakeep API key (Bearer token). 114 | * `KARAKEEP_PYTHON_API_VERIFY_SSL`: Set to `false` to disable SSL certificate verification (default: `true`). 115 | * `KARAKEEP_PYTHON_API_VERBOSE`: Set to `true` to enable verbose debug logging for the client and CLI (default: `false`). 116 | * `KARAKEEP_PYTHON_API_DISABLE_RESPONSE_VALIDATION`: Set to `true` to disable Pydantic validation of API responses. The client will return raw dictionary/list data instead of Pydantic models (default: `false`). 117 | * `KARAKEEP_PYTHON_API_ENSURE_ASCII`: Set to `true` to escape non-ASCII characters in the JSON output (default: `false`, which means Unicode characters are kept). 118 | 119 | ### Command Line Interface (CLI) 120 | 121 | The CLI dynamically generates commands based on the API methods. You need to provide your API key and endpoint either via environment variables (recommended) or command-line options. 122 | 123 | **Basic Structure:** 124 | 125 | ```bash 126 | python -m karakeep_python_api [GLOBAL_OPTIONS] [COMMAND_OPTIONS] 127 | ``` 128 | 129 | **Getting Help:** 130 | 131 | ```bash 132 | # General help and list of commands 133 | python -m karakeep_python_api --help 134 | 135 | # Help for a specific command 136 | python -m karakeep_python_api get-all-bookmarks --help 137 | ``` 138 | 139 | **Examples:** 140 | 141 | ```bash 142 | # List all tags (requires env vars set) 143 | python -m karakeep_python_api get-all-tags 144 | 145 | # Get the first page of bookmarks with a limit, overriding env vars if needed 146 | # Note: The /api/v1/ path will be automatically appended if not present 147 | python -m karakeep_python_api --base-url https://karakeep.domain.com/api/v1/ --api-key YOUR_API_KEY get-all-bookmarks --limit 10 148 | 149 | # Get all lists and pipe the JSON output to jq to extract the first list 150 | python -m karakeep_python_api get-all-lists | jq '.[0]' 151 | 152 | # Create a new bookmark from a link (body provided as JSON string) 153 | python -m karakeep_python_api create-a-new-bookmark --data '{"type": "link", "url": "https://example.com"}' 154 | 155 | # Get all tags and ensure ASCII output (e.g., for compatibility with systems that don't handle Unicode well) 156 | python -m karakeep_python_api --ascii get-all-tags 157 | 158 | # Dump the raw OpenAPI spec used by the client 159 | python -m karakeep_python_api --dump-openapi-specification 160 | ``` 161 | 162 | ### Python Library 163 | 164 | Import the `KarakeepAPI` class and instantiate it. 
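For example, assuming a self-hosted instance at `karakeep.domain.com` (placeholder values — replace them with your own endpoint and API key), the two required environment variables can be exported in your shell before running the snippet below:

```bash
# Placeholders: point these at your own Karakeep instance and API key
export KARAKEEP_PYTHON_API_ENDPOINT="https://karakeep.domain.com/api/v1/"
export KARAKEEP_PYTHON_API_KEY="your_secret_api_key"
```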
165 | 166 | ```python 167 | import os 168 | from karakeep_python_api import KarakeepAPI, APIError, AuthenticationError, datatypes 169 | 170 | # Ensure required environment variables are set 171 | # Example: os.environ["KARAKEEP_PYTHON_API_ENDPOINT"] = "https://karakeep.domain.com/api/v1/" 172 | # Example: os.environ["KARAKEEP_PYTHON_API_KEY"] = "your_secret_api_key" 173 | 174 | try: 175 | # Initialize the client (reads from env vars by default) 176 | client = KarakeepAPI( 177 | # Optionally override env vars: 178 | # api_endpoint="https://karakeep.domain.com/api/v1/", 179 | # api_key="another_key", 180 | # verbose=True, 181 | # disable_response_validation=False 182 | ) 183 | 184 | # Example: Get all lists 185 | all_lists = client.get_all_lists() 186 | if all_lists: 187 | print(f"Retrieved {len(all_lists)} lists.") 188 | # Access list properties (uses Pydantic models by default) 189 | print(f"First list name: {all_lists[0].name}") 190 | print(f"First list ID: {all_lists[0].id}") 191 | else: 192 | print("No lists found.") 193 | 194 | # Example: Get first page of bookmarks 195 | bookmarks_page = client.get_all_bookmarks(limit=5) 196 | print(f"\nRetrieved {len(bookmarks_page.bookmarks)} bookmarks.") 197 | if bookmarks_page.bookmarks: 198 | print(f"First bookmark title: {bookmarks_page.bookmarks[0].title}") 199 | if bookmarks_page.nextCursor: 200 | print(f"Next page cursor: {bookmarks_page.nextCursor}") 201 | 202 | 203 | except AuthenticationError as e: 204 | print(f"Authentication failed: {e}") 205 | except APIError as e: 206 | print(f"An API error occurred: {e}") 207 | except ValueError as e: 208 | # Handles missing API key/endpoint during initialization 209 | print(f"Configuration error: {e}") 210 | except Exception as e: 211 | print(f"An unexpected error occurred: {e}") 212 | 213 | ``` 214 | 215 | ## Community Scripts 216 | 217 | Community Scripts are standalone scripts made to solve specific issues. They are written by the community, so don't hesitate to submit your own or to open an issue if you hit a bug. They also serve as examples of how to use the API. 218 | 219 | They can be found in the [./community_scripts](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts) folder. The contribution guidelines are in the README.md file of the community_scripts directory. 220 | 221 | | Community Script | Description | 222 | |----------------|--------------------------------------------------------------------------------------------------------------| 223 | | [Karakeep-Time-Tagger](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/karakeep-time-tagger) | Automatically adds time-to-read tags (`0-5m`, `5-10m`, etc.) to bookmarks based on content length analysis. Includes systemd service and timer files for automated periodic execution. | 224 | | [Karakeep-List-To-Tag](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/karakeep-list-to-tag) | Converts a Karakeep list into tags by adding a specified tag to all bookmarks within that list. | 225 | | [Omnivore2Karakeep-Highlights](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/omnivore2karakeep-highlights) | Imports highlights from Omnivore export data to Karakeep, with intelligent position detection and bookmark matching. Supports dry-run mode for testing.
| 226 | | [Omnivore2Karakeep-Archived](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/omnivore2karakeep-archived) | (Should not be needed anymore) Fixes the archived status of bookmarks imported from Omnivore by reading export data and updating Karakeep accordingly. | 227 | | [pocket2karakeep-archived](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/pocket2karakeep-archived) by [@youenchene](https://github.com/youenchene) | (Should not be needed anymore) Fixes the archived status of bookmarks imported from Pocket by reading export data and updating Karakeep accordingly. | 228 | | [Karakeep-Archive-Before-Date](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/karakeep-archive-before-date) by [@youenchene](https://github.com/youenchene) | Archives all non-archived bookmarks older than a given date | 229 | | [Freshrss-To-Karakeep](https://github.com/thiswillbeyourgithub/freshrss_to_karakeep) | Periodically sends FreshRSS "favourite" articles to Karakeep | 230 | | [Karanki (WIP)](https://github.com/thiswillbeyourgithub/Karanki) | Bidirectional sync between Anki notes and Karakeep highlights | 231 | | [Karakeep-remove-ai-tags](https://github.com/thiswillbeyourgithub/karakeep_python_api/tree/main/community_scripts/karakeep-remove-ai-tags) by [@youenchene](https://github.com/youenchene) | Removes all tags that were attached by AI and not by a human | 232 | 233 | ## Development 234 | 235 | 1. Clone the repository. 236 | 2. Create a virtual environment and activate it. 237 | 3. Install dependencies, including development tools (using `uv` recommended): 238 | 239 | ```bash 240 | uv pip install -e ".[dev]" 241 | ``` 242 | 4. Set the required environment variables (`KARAKEEP_PYTHON_API_ENDPOINT`, `KARAKEEP_PYTHON_API_KEY`) for running tests against a live instance. 243 | 5. Run tests: 244 | 245 | ```bash 246 | pytest 247 | ``` 248 | 249 | 250 | --- 251 | 252 | *This README was generated with assistance from [aider.chat](https://aider.chat).* 253 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-highlights/omnivore2karakeep-highlights.py: -------------------------------------------------------------------------------- 1 | import re 2 | import markdown 3 | from Levenshtein import ratio 4 | import re 5 | from typing import Optional 6 | import json 7 | import pickle 8 | from karakeep_python_api import KarakeepAPI 9 | from tqdm import tqdm 10 | from pathlib import Path 11 | from bs4 import BeautifulSoup 12 | from html2text import html2text 13 | import fire 14 | 15 | from string_context_matcher import match_highlight_to_corpus 16 | 17 | VERSION: str = "1.0.0" 18 | 19 | 20 | def find_highlight_position( 21 | highlight: str, as_text: str, as_md: str, kara_content: str 22 | ) -> tuple[int, int]: 23 | """ 24 | Find the start and end positions of a highlight within the document content. 25 | 26 | This function uses multiple strategies to locate highlights: 27 | 1. Direct text matching in plain text 28 | 2. Markdown content matching with position scaling 29 | 3. Fuzzy matching using string context matcher 30 | 4.
Link extraction for highlights containing only links 31 | 32 | Parameters 33 | ---------- 34 | highlight : str 35 | The original highlight text (may contain markdown/HTML) 36 | as_text : str 37 | The full document content as plain text 38 | as_md : str 39 | The full document content as markdown 40 | kara_content : str 41 | The raw HTML content of the document 42 | 43 | Returns 44 | ------- 45 | tuple[int, int] 46 | A tuple containing (start_position, end_position) of the highlight 47 | """ 48 | # Convert highlight to plain text for matching 49 | high_as_text = BeautifulSoup(markdown.markdown(highlight), "html.parser").get_text() 50 | 51 | start = 0 52 | 53 | # Strategy 1: Direct text matching 54 | if high_as_text in as_text: 55 | start = as_text.index(high_as_text) 56 | 57 | # Strategy 2: Markdown content matching with position scaling 58 | if highlight in as_md: 59 | if start == 0: 60 | start = int(as_md.index(highlight) / len(as_md) * len(as_text)) 61 | else: 62 | start = ( 63 | start + int(as_md.index(highlight) / len(as_md) * len(as_text)) 64 | ) // 2 65 | 66 | # Strategy 3: Fuzzy matching when direct matching fails 67 | if start == 0: 68 | match_text = match_highlight_to_corpus(query=high_as_text, corpus=as_text) 69 | match_md = match_highlight_to_corpus(query=highlight, corpus=as_md) 70 | 71 | if match_text.matches and match_md.matches: 72 | position_text = as_text.index(match_text.matches[0]) / len(as_text) 73 | position_md = as_md.index(match_md.matches[0]) / len(as_md) 74 | diff = abs(position_text - position_md) 75 | 76 | if diff >= 0.20: 77 | # if differ too much, assume html has a too large overhead 78 | rel_pos = position_md 79 | else: 80 | rel_pos = (position_text + position_md) / 2 81 | del diff 82 | elif match_text.matches: 83 | rel_pos = as_text.index(match_text.matches[0]) / len(as_text) 84 | elif match_md.matches: 85 | rel_pos = as_md.index(match_md.matches[0]) / len(as_md) 86 | elif not high_as_text: # probably contains only a link, so we have to find that link in the raw html 87 | links = re.findall( 88 | r"\bhttp:\/\/[-\w+&@#\/%?=~()|!:,.;]*[-\w+&@#\/%=~()|]", 89 | highlight, 90 | ) 91 | positions = [ 92 | kara_content.index(link) for link in links if link in kara_content 93 | ] 94 | assert positions, highlight 95 | rel_pos = int(sum(positions) / len(positions)) 96 | else: 97 | raise ValueError( 98 | f"Could not match highlight text to corpus for highlight: {highlight[:100]}{'...' if len(highlight) > 100 else ''}" 99 | ) 100 | start = int(rel_pos * len(high_as_text)) 101 | del rel_pos 102 | 103 | end = start + len(high_as_text) 104 | return start, end 105 | 106 | 107 | def find_matching_bookmark( 108 | omnivore: dict, url: str, all_bm: list, is_pdf: bool, threshold: float = 0.95 109 | ): 110 | """ 111 | Find a matching bookmark in Karakeep based on Omnivore bookmark data. 112 | 113 | This function attempts to match an Omnivore bookmark to a Karakeep bookmark using 114 | multiple strategies: URL matching, exact title matching, and fuzzy title matching. 
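A fuzzy title match is only accepted when its Levenshtein ratio reaches the `threshold` parameter (0.95 by default).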
115 | 116 | Parameters 117 | ---------- 118 | omnivore : dict 119 | The Omnivore bookmark data containing URL and title information 120 | url : str 121 | The URL of the Omnivore bookmark 122 | all_bm : list 123 | List of all Karakeep bookmarks to search through 124 | is_pdf : bool 125 | Whether the bookmark is a PDF (not yet supported) 126 | threshold : float, optional 127 | Minimum similarity threshold for fuzzy matching, by default 0.95 128 | 129 | Returns 130 | ------- 131 | bookmark 132 | The matched Karakeep bookmark 133 | 134 | Raises 135 | ------ 136 | NotImplementedError 137 | If the bookmark is a PDF (not yet supported) 138 | RuntimeError 139 | If no matching bookmark is found or if bookmark content lacks URL attributes 140 | """ 141 | found_bm = False 142 | best_bookmark = None 143 | best_score = 0.0 144 | 145 | for bookmark in all_bm: 146 | found_url = None 147 | content = bookmark.content 148 | 149 | if is_pdf: 150 | raise NotImplementedError("PDF highlights are not yet supported") 151 | 152 | if hasattr(content, "url"): 153 | found_url = content.url 154 | elif hasattr(content, "sourceUrl"): 155 | found_url = content.sourceUrl 156 | else: 157 | raise RuntimeError( 158 | f"Bookmark content has no 'url' or 'sourceUrl' attribute. Available attributes: {[attr for attr in dir(content) if not attr.startswith('_')]}" 159 | ) 160 | 161 | # handling local PDF, they don't have proper url 162 | if found_url and found_url.startswith("https://omnivore.app"): 163 | found_url = None 164 | 165 | if found_url == url: 166 | found_bm = True 167 | break 168 | 169 | # couldn't find a matching url, match by title 170 | # exact title match: 171 | if ( 172 | "title" in omnivore 173 | and omnivore["title"] 174 | and hasattr(content, "title") 175 | and content.title 176 | ): 177 | if omnivore["title"].lower() == content.title.lower(): 178 | found_bm = True 179 | break 180 | if ( 181 | "title" in omnivore 182 | and omnivore["title"] 183 | and hasattr(bookmark, "title") 184 | and bookmark.title 185 | ): 186 | if omnivore["title"].lower() == bookmark.title.lower(): 187 | found_bm = True 188 | break 189 | 190 | # fuzzy matching, as a last resort - track the best match 191 | if ( 192 | "title" in omnivore 193 | and omnivore["title"] 194 | and hasattr(content, "title") 195 | and content.title 196 | ): 197 | r = ratio(omnivore["title"].lower(), content.title.lower()) 198 | if r > best_score: 199 | best_score = r 200 | best_bookmark = bookmark 201 | 202 | if ( 203 | "title" in omnivore 204 | and omnivore["title"] 205 | and hasattr(bookmark, "title") 206 | and bookmark.title 207 | ): 208 | r = ratio(omnivore["title"].lower(), bookmark.title.lower()) 209 | if r > best_score: 210 | best_score = r 211 | best_bookmark = bookmark 212 | 213 | # Use the best fuzzy match if it meets the threshold 214 | if not found_bm and best_score >= threshold: 215 | found_bm = True 216 | bookmark = best_bookmark 217 | 218 | if not found_bm: 219 | raise RuntimeError( 220 | f"Could not find bookmark for highlight file: {omnivore.get('slug', 'unknown')}" 221 | ) 222 | 223 | return bookmark 224 | 225 | 226 | def load_bookmarks_from_karakeep(karakeep: KarakeepAPI, karakeep_path: str) -> list: 227 | """ 228 | Load all bookmarks from Karakeep API, using local cache if available. 229 | 230 | This function fetches all bookmarks from the Karakeep instance, with content included. 231 | To avoid repeated API calls during development, bookmarks are cached locally. 
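Delete the cache file at `karakeep_path` to force a fresh fetch from the API.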
232 | 233 | Parameters 234 | ---------- 235 | karakeep : KarakeepAPI 236 | The Karakeep API client instance 237 | karakeep_path : str 238 | Path to the local cache file for storing bookmarks 239 | 240 | Returns 241 | ------- 242 | list 243 | List of all bookmarks from the Karakeep instance 244 | """ 245 | if Path(karakeep_path).exists(): 246 | with Path(karakeep_path).open("rb") as f: 247 | all_bm = pickle.load(f) 248 | else: 249 | n = karakeep.get_current_user_stats()["numBookmarks"] 250 | pbar = tqdm(total=n, desc="Fetching bookmarks") 251 | all_bm = [] 252 | batch_size = 100 # if you set it too high, you can crash the karakeep instance, 100 being the maximum allowed 253 | page = karakeep.get_all_bookmarks( 254 | include_content=True, 255 | limit=batch_size, 256 | ) 257 | all_bm.extend(page.bookmarks) 258 | pbar.update(len(all_bm)) 259 | while page.nextCursor: 260 | page = karakeep.get_all_bookmarks( 261 | include_content=True, 262 | limit=batch_size, 263 | cursor=page.nextCursor, 264 | ) 265 | all_bm.extend(page.bookmarks) 266 | pbar.update(len(page.bookmarks)) 267 | 268 | assert len(all_bm) == n, ( 269 | f"Only retrieved {len(all_bm)} bookmarks instead of {n}" 270 | ) 271 | pbar.close() 272 | 273 | with Path(karakeep_path).open("wb") as f: 274 | pickle.dump(all_bm, f) 275 | 276 | return all_bm 277 | 278 | 279 | def get_omnivores_bookmarks(omnivore_export_dir: str) -> list[dict]: 280 | """ 281 | Load and concatenate bookmark data from all Omnivore export metadata files. 282 | 283 | This function searches for metadata files matching the pattern 'metadata_*_to_*.json' 284 | in the specified Omnivore export directory and combines all bookmark data into a 285 | single list. Each metadata file is expected to contain a JSON list of bookmarks. 286 | 287 | Parameters 288 | ---------- 289 | omnivore_export_dir : str 290 | Path to the Omnivore export directory containing metadata files 291 | 292 | Returns 293 | ------- 294 | list[dict] 295 | Combined list of all bookmark dictionaries found in the metadata files. 296 | Returns empty list if no valid metadata files are found. 297 | 298 | Notes 299 | ----- 300 | Files that cannot be decoded as JSON or do not contain lists will be skipped 301 | with warning messages. The function processes files in sorted order to ensure 302 | consistent results. 303 | """ 304 | export_path = Path(omnivore_export_dir) 305 | all_data: list[dict] = [] 306 | 307 | # Glob for metadata files and sort them to ensure consistent order (e.g., by date if named accordingly) 308 | metadata_files = sorted(export_path.glob("metadata_*_to_*.json")) 309 | 310 | if not metadata_files: 311 | print( 312 | f"Warning: No metadata files matching 'metadata_*_to_*.json' found in {omnivore_export_dir}" 313 | ) 314 | return [] 315 | 316 | for file_path in metadata_files: 317 | try: 318 | content = file_path.read_text() 319 | # Each metadata file is expected to contain a JSON list of bookmarks 320 | data_from_file: list[dict] = json.loads(content) 321 | if isinstance(data_from_file, list): 322 | all_data.extend(data_from_file) 323 | else: 324 | print( 325 | f"Warning: Metadata file {file_path.name} does not contain a JSON list. Skipping." 326 | ) 327 | except json.JSONDecodeError: 328 | print(f"Warning: Could not decode JSON from {file_path.name}. Skipping.") 329 | except Exception as e: 330 | print( 331 | f"Warning: An error occurred while processing {file_path.name}: {e}. Skipping." 
332 | ) 333 | 334 | return all_data 335 | 336 | 337 | def main( 338 | omnivore_export_dir: str, 339 | karakeep_path: Optional[str] = "./karakeep_bookmarks.temp", 340 | dry: bool = True, 341 | skip_pdf: bool = True, 342 | ) -> None: 343 | """ 344 | Import highlights from Omnivore export to Karakeep. 345 | 346 | This function processes Omnivore export data to import highlights into a Karakeep instance. 347 | It matches Omnivore bookmarks to Karakeep bookmarks and creates corresponding highlights. 348 | 349 | The temporary Karakeep bookmarks cache file will be automatically deleted upon successful 350 | completion to avoid leaving temporary files behind. 351 | 352 | Parameters 353 | ---------- 354 | omnivore_export_dir : str 355 | Path to the Omnivore export directory containing highlights and content 356 | karakeep_path : str, optional 357 | Path to temporary file for caching Karakeep bookmarks, by default "./karakeep_bookmarks.temp" 358 | dry : bool, optional 359 | If True, only simulate the import without actually creating highlights, by default True 360 | skip_pdf : bool, optional 361 | If True, skip processing PDF highlights (not yet supported), by default True 362 | """ 363 | omnivore_export_path = Path(omnivore_export_dir) 364 | highlights_dir_path = omnivore_export_path / "highlights" 365 | omnivore_content_dir_path = omnivore_export_path / "content" 366 | 367 | assert omnivore_export_path.exists() and omnivore_export_path.is_dir(), ( 368 | f"Omnivore export directory not found: {omnivore_export_dir}" 369 | ) 370 | assert highlights_dir_path.exists() and highlights_dir_path.is_dir(), ( 371 | f"Highlights directory not found: {highlights_dir_path}" 372 | ) 373 | assert omnivore_content_dir_path.exists() and omnivore_content_dir_path.is_dir(), ( 374 | f"Omnivore content directory not found: {omnivore_content_dir_path}" 375 | ) 376 | 377 | highlights_files = [ 378 | p 379 | for p in highlights_dir_path.iterdir() 380 | if p.name.endswith(".md") and p.read_text().strip() 381 | ] 382 | content_files: dict = { 383 | p.stem: p.suffix for p in omnivore_content_dir_path.iterdir() 384 | } 385 | 386 | data = get_omnivores_bookmarks(omnivore_export_dir) 387 | 388 | karakeep = KarakeepAPI(verbose=False) 389 | 390 | # fetch all the bookmarks from karakeep, as the search feature is unreliable 391 | # as the loading can be pretty long, we store it to a local file 392 | all_bm = load_bookmarks_from_karakeep(karakeep, karakeep_path) 393 | 394 | for f_ind, f in enumerate( 395 | tqdm(highlights_files, unit="highlight", desc="importing highlights") 396 | ): 397 | name = f.stem 398 | 399 | highlights = f.read_text().strip().split("\n> ") 400 | highlights = [h.strip() for h in highlights if h.strip()] 401 | if not highlights: 402 | continue 403 | 404 | found_omni = False 405 | for omnivore in data: 406 | if omnivore["slug"] == name: 407 | found_omni = True 408 | break 409 | 410 | if not found_omni: 411 | print("Couldn't find the omnivore 'bookmark' for that highlight") 412 | raise RuntimeError( 413 | f"Could not find omnivore bookmark for highlight file: {name}" 414 | ) 415 | url = omnivore["url"] 416 | 417 | # check if the highlight is from a pdf or an html 418 | assert name in content_files, name 419 | if content_files[name] == ".pdf": 420 | is_pdf = True 421 | if skip_pdf: 422 | continue 423 | else: 424 | raise NotImplementedError("PDF highlights are not yet supported") 425 | elif content_files[name] == ".html": 426 | is_pdf = False 427 | else: 428 | raise RuntimeError( 429 | f"Unexpected file extension 
'{content_files[name]}' for file '{name}'. Expected '.pdf' or '.html'" 430 | ) 431 | 432 | bookmark = find_matching_bookmark(omnivore, url, all_bm, is_pdf) 433 | 434 | kara_content = bookmark.content.htmlContent 435 | 436 | if not kara_content: 437 | print( 438 | f"Skipping bookmark '{bookmark.title or name}' (ID: {bookmark.id}) - no HTML content available" 439 | ) 440 | continue 441 | 442 | as_md = html2text(kara_content, bodywidth=9999999) 443 | 444 | as_text = BeautifulSoup(kara_content, "html.parser").get_text() 445 | 446 | for highlight in highlights: 447 | if highlight.startswith("> "): 448 | highlight = highlight[1:] 449 | highlight = highlight.strip()  # drop the leftover leading whitespace 450 | 451 | # fix URLs of omnivore to point to the original source 452 | highlight = re.sub( 453 | r"https://proxy-prod.omnivore-image-cache.app/.*https://", 454 | "https://", 455 | highlight, 456 | ) 457 | 458 | link_pattern = r"\[.*?\]\((.*?)\)" 459 | link_replaced = re.sub(link_pattern, r" (Link to \1)", highlight) 460 | high_link_replaced_as_text = BeautifulSoup( 461 | markdown.markdown(link_replaced), "html.parser" 462 | ).get_text() 463 | 464 | if not high_link_replaced_as_text: 465 | assert high_link_replaced_as_text, ( 466 | f"Empty highlight text after processing. Original highlight: {highlight[:200]}{'...' if len(highlight) > 200 else ''}, Link replaced: {link_replaced[:200]}{'...' if len(link_replaced) > 200 else ''}" 467 | ) 468 | 469 | start, end = find_highlight_position( 470 | highlight=highlight, 471 | as_text=as_text, 472 | as_md=as_md, 473 | kara_content=kara_content, 474 | ) 475 | 476 | if not dry: 477 | # Create metadata dict for this highlight 478 | highlight_metadata = { 479 | "omnivore_bookmark_id": omnivore["id"], 480 | "omnivore_highlight_filename": f.name, 481 | "omnivore_highlight_importer_version": VERSION, 482 | } 483 | 484 | resp = karakeep.create_a_new_highlight( 485 | bookmark_id=bookmark.id, 486 | start_offset=start, 487 | end_offset=end, 488 | color="yellow", 489 | text=high_link_replaced_as_text, 490 | note=json.dumps(highlight_metadata, ensure_ascii=False), 491 | ) 492 | assert resp, highlight 493 | 494 | 495 | if __name__ == "__main__": 496 | fire.Fire(main) 497 | -------------------------------------------------------------------------------- /community_scripts/karakeep-time-tagger/karakeep-time-tagger.py: -------------------------------------------------------------------------------- 1 | """Adds time-to-read tags to bookmarks based on content length.""" 2 | 3 | import sys 4 | import pickle 5 | import fire 6 | from pathlib import Path 7 | from tqdm import tqdm 8 | from bs4 import BeautifulSoup 9 | from loguru import logger 10 | from karakeep_python_api import KarakeepAPI 11 | 12 | 13 | VERSION: str = "1.3.0" 14 | 15 | 16 | class AddTimeToRead: 17 | """Class to add time-to-read tags to bookmarks based on content length.""" 18 | 19 | # Define the time-to-read tags 20 | TIME_TAGS = ["0-1m", "1-5m", "5-10m", "10-15m", "15-30m", "30m+"] 21 | 22 | def __init__(self): 23 | """Initialize the AddTimeToRead class.""" 24 | self.karakeep = None 25 | 26 | def create_time_reading_lists(self): 27 | """Create smart lists for each time-to-read tag.""" 28 | # Get existing lists to avoid duplicates 29 | try: 30 | existing_lists = self.karakeep.get_all_lists() 31 | existing_list_names = {lst.name for lst in existing_lists} 32 | logger.info(f"Found {len(existing_lists)} existing lists") 33 | except Exception as e: 34 | logger.error(f"Failed to fetch existing lists: {e}") 35 | return 36 | 37 | # Mapping of time tags to
zero-padded list names and descriptions 38 | list_configs = [ 39 | { 40 | "name": "00-05m", 41 | "tag": "0-5m", 42 | "description": "Quick reads under 5 minutes", 43 | }, 44 | { 45 | "name": "05-10m", 46 | "tag": "5-10m", 47 | "description": "Short reads 5-10 minutes", 48 | }, 49 | { 50 | "name": "10-15m", 51 | "tag": "10-15m", 52 | "description": "Medium reads 10-15 minutes", 53 | }, 54 | { 55 | "name": "15-30m", 56 | "tag": "15-30m", 57 | "description": "Long reads 15-30 minutes", 58 | }, 59 | { 60 | "name": "30m+", 61 | "tag": "30m+", 62 | "description": "Extended reads over 30 minutes", 63 | }, 64 | ] 65 | 66 | for config in list_configs: 67 | list_name = config["name"] 68 | 69 | # Skip if list already exists 70 | if list_name in existing_list_names: 71 | logger.info(f"List '{list_name}' already exists, skipping creation") 72 | continue 73 | 74 | tag_name = config["tag"] 75 | description = config["description"] 76 | query = f"#{tag_name} -is:archived" 77 | 78 | logger.info(f"Creating smart list '{list_name}' with query '{query}'") 79 | 80 | try: 81 | result = self.karakeep.create_a_new_list( 82 | name=list_name, 83 | icon="⏱️", # Clock icon for time-based lists 84 | description=description, 85 | list_type="smart", 86 | query=query, 87 | ) 88 | logger.info( 89 | f"Successfully created list '{list_name}' with ID: {result.id if hasattr(result, 'id') else 'unknown'}" 90 | ) 91 | 92 | except Exception as e: 93 | logger.error(f"Failed to create list '{list_name}': {e}") 94 | 95 | def setup_logging(self, verbose: bool = False): 96 | """Setup loguru logging with file output and console output based on verbosity.""" 97 | # Remove default logger 98 | # logger.remove() 99 | 100 | # Add file logger with debug level 101 | logger.add("karakeep-time-tagger.log", level="DEBUG", rotation="10 MB") 102 | 103 | # Add console logger based on verbosity 104 | if verbose: 105 | logger.add(sys.stderr, level="DEBUG") 106 | else: 107 | logger.add(sys.stderr, level="INFO") 108 | 109 | def extract_content_text(self, bookmark) -> str: 110 | """ 111 | Extract text content from bookmark based on its type. 112 | 113 | Args: 114 | bookmark: Bookmark object with content 115 | 116 | Returns: 117 | str: Text content to analyze 118 | """ 119 | if bookmark.content.type == "link": 120 | # For link bookmarks, content is in bookmark.content.content 121 | return bookmark.content.htmlContent 122 | elif bookmark.content.type == "text": 123 | # For text bookmarks, content is in bookmark.content.text 124 | return bookmark.content.text 125 | else: 126 | logger.debug(f"Unsupported content type: {bookmark.content.type}") 127 | return "" 128 | 129 | def estimate_reading_time(self, bookmark, wpm: int) -> str: 130 | """ 131 | Estimate reading time for given bookmark and return appropriate tag. 
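For example, at the default 200 wpm, a 1000-word article works out to about 5 minutes and falls in the "0-5m" bucket.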
132 | 133 | Args: 134 | bookmark: Bookmark object to analyze 135 | wpm: Words per minute reading speed 136 | 137 | Returns: 138 | str: Time tag (0-5m, 5-10m, 10-15m, 15-30m, 30m+) 139 | """ 140 | # Extract text content based on bookmark type 141 | content = self.extract_content_text(bookmark) 142 | 143 | if not content: 144 | logger.debug("Empty content, returning 0-5m tag") 145 | return "0-5m" 146 | 147 | # Parse HTML and extract text 148 | soup = BeautifulSoup(content, "html.parser") 149 | text = soup.get_text() 150 | 151 | # Count words (split by whitespace) 152 | word_count = len(text.split()) 153 | logger.debug(f"Word count: {word_count}") 154 | 155 | # Calculate reading time in minutes 156 | reading_time_minutes = word_count / wpm 157 | logger.debug(f"Estimated reading time: {reading_time_minutes:.2f} minutes") 158 | 159 | # Determine appropriate tag 160 | if reading_time_minutes <= 5: 161 | return "0-5m" 162 | elif reading_time_minutes <= 10: 163 | return "5-10m" 164 | elif reading_time_minutes <= 15: 165 | return "10-15m" 166 | elif reading_time_minutes <= 30: 167 | return "15-30m" 168 | else: 169 | return "30m+" 170 | 171 | def get_current_time_tags(self, bookmark) -> list: 172 | """Get list of current time-to-read tags on bookmark.""" 173 | if not bookmark.tags: 174 | return [] 175 | 176 | current_time_tags = [] 177 | for tag in bookmark.tags: 178 | if tag.name in self.TIME_TAGS: 179 | current_time_tags.append(tag.name) 180 | 181 | return current_time_tags 182 | 183 | def should_skip_bookmark(self, bookmark, reset_all: bool) -> bool: 184 | """ 185 | Determine if bookmark should be skipped based on reset_all setting. 186 | 187 | Logic: 188 | - If reset_all is True: never skip 189 | - If reset_all is False: 190 | - When using search mode, we've already filtered to untagged bookmarks, so never skip 191 | - This method is kept for consistency but simplified logic when not reset_all 192 | """ 193 | if reset_all: 194 | return False 195 | 196 | # When reset_all is False, we've already used search to find untagged bookmarks 197 | # So we should process all bookmarks in the filtered set 198 | # However, we still check for multiple time tags that might need reset 199 | current_time_tags = self.get_current_time_tags(bookmark) 200 | 201 | # If exactly one time tag, this shouldn't happen in search mode but handle gracefully 202 | if len(current_time_tags) == 1: 203 | logger.debug( 204 | f"Unexpected: bookmark {bookmark.id} found in search but has time tag: {current_time_tags[0]}" 205 | ) 206 | return True 207 | 208 | # Process bookmarks with multiple time tags or no time tags 209 | return False 210 | 211 | def needs_reset(self, bookmark) -> bool: 212 | """Check if bookmark has multiple time tags and needs reset.""" 213 | current_time_tags = self.get_current_time_tags(bookmark) 214 | return len(current_time_tags) > 1 215 | 216 | def process_bookmark(self, bookmark, wpm: int): 217 | """Process a single bookmark to add appropriate time-to-read tag.""" 218 | logger.debug(f"Processing bookmark {bookmark.id}: {bookmark.title}") 219 | 220 | # Only process link and text bookmarks 221 | if bookmark.content.type not in ["link", "text"]: 222 | logger.debug( 223 | f"Skipping bookmark {bookmark.id} - type {bookmark.content.type} not supported" 224 | ) 225 | return 226 | 227 | # Estimate reading time 228 | target_tag = self.estimate_reading_time(bookmark, wpm) 229 | logger.debug(f"Target tag for bookmark {bookmark.id}: {target_tag}") 230 | 231 | # Get current time tags 232 | current_time_tags = 
self.get_current_time_tags(bookmark) 233 | logger.debug( 234 | f"Current time tags for bookmark {bookmark.id}: {current_time_tags}" 235 | ) 236 | 237 | # If bookmark already has the correct tag and no others, skip 238 | if current_time_tags == [target_tag]: 239 | logger.debug(f"Bookmark {bookmark.id} already has correct tag") 240 | return 241 | 242 | # Remove all existing time tags if any 243 | if current_time_tags: 244 | logger.info( 245 | f"Removing existing time tags {current_time_tags} from bookmark {bookmark.id}" 246 | ) 247 | try: 248 | self.karakeep.detach_tags_from_a_bookmark( 249 | bookmark_id=bookmark.id, tag_names=current_time_tags 250 | ) 251 | except Exception as e: 252 | logger.error(f"Failed to remove tags from bookmark {bookmark.id}: {e}") 253 | return 254 | 255 | # Add the target tag 256 | logger.info( 257 | f"Adding tag '{target_tag}' to bookmark {bookmark.id}: {bookmark.title}" 258 | ) 259 | try: 260 | self.karakeep.attach_tags_to_a_bookmark( 261 | bookmark_id=bookmark.id, tag_names=[target_tag] 262 | ) 263 | except Exception as e: 264 | logger.error(f"Failed to add tag to bookmark {bookmark.id}: {e}") 265 | 266 | def run( 267 | self, 268 | wpm: int = 200, 269 | reset_all: bool = False, 270 | verbose: bool = False, 271 | cache_file: str = "./bookmarks.temp", 272 | create_lists: bool = True, 273 | ): 274 | """ 275 | Main method to process all bookmarks and add time-to-read tags. 276 | 277 | Args: 278 | wpm: Words per minute reading speed (default: 200) 279 | reset_all: If True, process all bookmarks. If False, skip bookmarks that already have a single time tag. 280 | verbose: If True, show debug level logs in console 281 | cache_file: Path to cache file for bookmarks (default: ./bookmarks.temp) 282 | create_lists: If True, create smart lists for each time slot (default: False) 283 | """ 284 | # Setup logging 285 | self.setup_logging(verbose) 286 | 287 | logger.info(f"Starting AddTimeToRead with wpm={wpm}, reset_all={reset_all}") 288 | 289 | # Connect to Karakeep 290 | try: 291 | self.karakeep = KarakeepAPI() 292 | logger.info("Connected to Karakeep API") 293 | except Exception as e: 294 | logger.error(f"Failed to connect to Karakeep API: {e}") 295 | return 296 | 297 | # Create smart lists if requested 298 | if create_lists: 299 | logger.info("Creating smart lists for time-to-read tags...") 300 | self.create_time_reading_lists() 301 | 302 | # Determine cache file name based on reset_all mode 303 | if reset_all: 304 | cache_file_final = cache_file 305 | else: 306 | # Use different cache for untagged bookmarks search 307 | cache_parts = Path(cache_file).parts 308 | cache_file_final = str( 309 | Path(*cache_parts[:-1]) / f"untagged_{cache_parts[-1]}" 310 | ) 311 | 312 | # Fetch bookmarks with content, using cache to speed up testing 313 | # As the loading can be pretty long, we store it to a local file 314 | if reset_all: 315 | if Path(cache_file_final).exists(): 316 | logger.info(f"Loading bookmarks from cache file: {cache_file_final}") 317 | with Path(cache_file_final).open("rb") as f: 318 | bookmarks = pickle.load(f) 319 | logger.info(f"Loaded {len(bookmarks)} bookmarks from cache") 320 | else: 321 | logger.info("Cache file not found, fetching bookmarks from API...") 322 | 323 | # Fetch all bookmarks when reset_all is True 324 | try: 325 | n = self.karakeep.get_current_user_stats()["numBookmarks"] 326 | logger.info(f"Total bookmarks to fetch: {n}") 327 | except Exception as e: 328 | logger.error(f"Failed to get bookmark count: {e}") 329 | return 330 | 331 | 
logger.info("Fetching all bookmarks with content...") 332 | pbar = tqdm(total=n, desc="Fetching bookmarks") 333 | bookmarks = [] 334 | batch_size = 100 # Maximum allowed batch size to avoid crashing the karakeep instance 335 | 336 | try: 337 | page = self.karakeep.get_all_bookmarks( 338 | include_content=True, 339 | limit=batch_size, 340 | ) 341 | bookmarks.extend(page.bookmarks) 342 | pbar.update(len(page.bookmarks)) 343 | 344 | while page.nextCursor: 345 | page = self.karakeep.get_all_bookmarks( 346 | include_content=True, 347 | limit=batch_size, 348 | cursor=page.nextCursor, 349 | ) 350 | bookmarks.extend(page.bookmarks) 351 | pbar.update(len(page.bookmarks)) 352 | 353 | assert len(bookmarks) == n, ( 354 | f"Only retrieved {len(bookmarks)} bookmarks instead of {n}" 355 | ) 356 | pbar.close() 357 | 358 | except Exception as e: 359 | pbar.close() 360 | logger.error(f"Error fetching bookmarks: {e}") 361 | return 362 | 363 | # Save bookmarks to cache file 364 | logger.info( 365 | f"Saving {len(bookmarks)} bookmarks to cache file: {cache_file_final}" 366 | ) 367 | with Path(cache_file_final).open("wb") as f: 368 | pickle.dump(bookmarks, f) 369 | else: 370 | # Use search to find bookmarks without time tags when reset_all is False 371 | search_query = "-#0-5m -#5-10m -#10-15m -#15-30m -#30m+" 372 | logger.info( 373 | f"Searching for bookmarks without time tags using query: {search_query}" 374 | ) 375 | 376 | bookmarks = [] 377 | batch_size = 100 378 | 379 | try: 380 | page = self.karakeep.search_bookmarks( 381 | q=search_query, 382 | include_content=True, 383 | limit=batch_size, 384 | ) 385 | bookmarks.extend(page.bookmarks) 386 | logger.info(f"Found {len(page.bookmarks)} bookmarks in first page") 387 | 388 | while page.nextCursor: 389 | page = self.karakeep.search_bookmarks( 390 | q=search_query, 391 | include_content=True, 392 | limit=batch_size, 393 | cursor=page.nextCursor, 394 | ) 395 | bookmarks.extend(page.bookmarks) 396 | logger.info(f"Found {len(page.bookmarks)} additional bookmarks") 397 | 398 | logger.info(f"Total untagged bookmarks found: {len(bookmarks)}") 399 | 400 | except Exception as e: 401 | logger.error(f"Error searching for untagged bookmarks: {e}") 402 | return 403 | 404 | logger.info(f"Total bookmarks fetched: {len(bookmarks)}") 405 | 406 | # Process bookmarks 407 | processed = 0 408 | skipped_by_policy = 0 409 | skipped_by_type = 0 410 | errors = 0 411 | 412 | for bookmark in tqdm(bookmarks, desc="Processing bookmarks"): 413 | try: 414 | # Check bookmark type first 415 | if bookmark.content.type not in ["link", "text"]: 416 | skipped_by_type += 1 417 | continue 418 | 419 | # Check if we should skip this bookmark based on reset policy 420 | if self.should_skip_bookmark(bookmark, reset_all): 421 | skipped_by_policy += 1 422 | continue 423 | 424 | # Check if bookmark needs reset (has multiple time tags) 425 | if self.needs_reset(bookmark): 426 | logger.info( 427 | f"Bookmark {bookmark.id} has multiple time tags, will be reset" 428 | ) 429 | 430 | # Process the bookmark 431 | self.process_bookmark(bookmark, wpm) 432 | processed += 1 433 | 434 | except Exception as e: 435 | logger.error(f"Error processing bookmark {bookmark.id}: {e}") 436 | errors += 1 437 | 438 | logger.info( 439 | f"Processing complete. 
Processed: {processed}, Skipped (policy): {skipped_by_policy}, Skipped (type): {skipped_by_type}, Errors: {errors}" 440 | ) 441 | 442 | # Clean up cache file after successful completion 443 | if Path(cache_file_final).exists(): 444 | try: 445 | Path(cache_file_final).unlink() 446 | logger.info(f"Cleaned up cache file: {cache_file_final}") 447 | except Exception as e: 448 | logger.warning(f"Failed to delete cache file {cache_file_final}: {e}") 449 | 450 | 451 | def main( 452 | wpm: int = 200, 453 | reset_all: bool = False, 454 | verbose: bool = False, 455 | cache_file: str = "./bookmarks.temp", 456 | create_lists: bool = True, 457 | ): 458 | """ 459 | Main entry point for the script. 460 | 461 | Args: 462 | wpm: Words per minute reading speed (default: 200) 463 | reset_all: If True, process all bookmarks. If False, skip bookmarks that already have a single time tag. 464 | verbose: If True, show debug level logs in console 465 | cache_file: Path to cache file for bookmarks (default: ./bookmarks.temp) 466 | create_lists: If True, create smart lists for each time slot (default: False) 467 | """ 468 | add_time_to_read = AddTimeToRead() 469 | add_time_to_read.run( 470 | wpm=wpm, 471 | reset_all=reset_all, 472 | verbose=verbose, 473 | cache_file=cache_file, 474 | create_lists=create_lists, 475 | ) 476 | 477 | 478 | if __name__ == "__main__": 479 | fire.Fire(main) 480 | -------------------------------------------------------------------------------- /community_scripts/omnivore2karakeep-highlights/string_context_matcher.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from math import inf 3 | from dataclasses import dataclass 4 | 5 | from joblib import Parallel, delayed, Memory 6 | from rapidfuzz.distance.Levenshtein import normalized_distance as lev_dist 7 | from rapidfuzz.fuzz import ratio as lev_ratio 8 | 9 | # Initialize joblib Memory for caching 10 | # Using a distinct cache name for this module 11 | mem = Memory(".cache_string_matcher", verbose=False) 12 | 13 | 14 | @dataclass 15 | class MatchResult: 16 | """ 17 | Holds the results of a highlight matching operation. 18 | 19 | Attributes: 20 | - matches: List of best matching substrings of corpus. 21 | - ratio: Levenshtein ratio of the closest match. 22 | - distance: Levenshtein distance of the closest match. 23 | - quick_match_used: True if the quick matching algorithm was used, False otherwise. 24 | """ 25 | 26 | matches: List[str] 27 | ratio: float 28 | distance: ( 29 | float # Levenshtein distance is usually int, but normalized_distance is float 30 | ) 31 | quick_match_used: bool 32 | 33 | 34 | @mem.cache(ignore=["n_jobs"]) 35 | def match_highlight_to_corpus( 36 | query: str, 37 | corpus: str, 38 | case_sensitive: bool = True, 39 | step_factor: int = 500, 40 | n_jobs: int = -1, 41 | ) -> MatchResult: 42 | """ 43 | Source: https://stackoverflow.com/questions/36013295/find-best-substring-match 44 | Returns the substring of the corpus with the least Levenshtein distance from the query 45 | (May not always return optimal answer). 46 | 47 | Arguments 48 | - query: str 49 | - corpus: str 50 | - case_sensitive: bool 51 | - step_factor: int 52 | Only used in the long way. 53 | Influences the resolution of the thorough search once the general region is found. 54 | The increment in ngrams lengths used for the thorough search is calculated as len(query)//step_factor. 
55 | Increasing this increases the number of ngram lengths used in the thorough search and increases the chances 56 | of getting the optimal solution at the cost of runtime and memory. 57 | - n_jobs: int 58 | number of jobs to use for multithreading. 1 to disable 59 | 60 | Returns 61 | MatchResult object containing: 62 | - matches: List of best matching substrings of corpus, 63 | - ratio: Levenshtein ratio of closest match, 64 | - distance: Levenshtein distance of closest match, 65 | - quick_match_used: True if used the quick way False if using the long way, 66 | """ 67 | 68 | # quick way 69 | lq = len(query) 70 | lc = len(corpus) 71 | 72 | # Prepare query and corpus for caseless comparison if needed for word matching 73 | # but original query/corpus are used for levenshtein to respect case_sensitive flag later if it were used. 74 | # Note: The current 'quick way' does not explicitly use the case_sensitive flag for its Levenshtein comparisons. 75 | # It uses casefolded strings for identifying regions. 76 | lquery_caseless = query.casefold() 77 | lcorp_caseless = corpus.casefold() 78 | 79 | # 1. find most probably region that contains the appropriate words 80 | qwords = [w.strip() for w in set(lquery_caseless.split(" ")) if len(w.strip()) > 3] 81 | indexes = [] 82 | for w in qwords: 83 | m = [] 84 | prev = 0 85 | # Search for word occurrences in the case-folded corpus 86 | while True: 87 | try: 88 | found_idx = lcorp_caseless.index(w, prev) 89 | m.append(found_idx) 90 | prev = found_idx + 1 91 | if len(m) >= 20: # Limit number of matches per word 92 | break 93 | except ValueError: # Substring not found 94 | break 95 | if len(m) > 20: # if limit was reached (and thus potentially many more matches) 96 | continue # this word might be too common, skip it 97 | if m: 98 | indexes.append(m) 99 | 100 | if indexes: 101 | mins = [min(ind_list) for ind_list in indexes] 102 | maxs = [max(ind_list) for ind_list in indexes] 103 | # Calculate mean start and end points, expand by 1.2 * query length 104 | mean_min = max(0, int(sum(mins) / len(mins)) - int(lq * 1.2)) 105 | mean_max = min(lc, int(sum(maxs) / len(maxs)) + int(lq * 1.2)) 106 | 107 | mini_corp = corpus[mean_min : mean_max + 1] 108 | 109 | # 2. in the region, check the lev ratio in a sliding window 110 | # to determine best sub region 111 | # Create batches of query length from the mini_corp 112 | batches = [ 113 | mini_corp[i * lq : (i + 1) * lq] for i in range(0, len(mini_corp) // lq + 1) 114 | ] 115 | batches = [b for b in batches if b.strip()] # Filter out empty batches 116 | 117 | if not batches: # No suitable batches found 118 | pass # Will proceed to the "long way" or return based on later logic 119 | else: 120 | ratios = Parallel( 121 | backend="threading", 122 | n_jobs=n_jobs, 123 | )(delayed(lev_ratio)(query, b) for b in batches) # Use lev_ratio 124 | max_rat = max(ratios) if ratios else -1.0 125 | max_rat_idx = [i for i, r in enumerate(ratios) if r == max_rat] 126 | 127 | # 3. 
in the best sub region, find the best substring with a 1 128 | # character sliding window using both ratio and distance 129 | best_ratio = -inf 130 | best_dist = inf 131 | best_matches = [] 132 | 133 | def get_rat_dist(s1, s2): 134 | # Corrected to use imported lev_ratio and lev_dist 135 | return [lev_ratio(s1, s2), lev_dist(s1, s2)] 136 | 137 | for current_region_idx_in_batches in max_rat_idx: 138 | # Define area based on batches around the current max_rat_idx 139 | # Original: "".join(batches[current_region_idx_in_batches-1:current_region_idx_in_batches+1]) 140 | # This needs careful handling of start/end of batches list 141 | start_slice = max(0, current_region_idx_in_batches - 1) 142 | end_slice = ( 143 | current_region_idx_in_batches + 1 144 | ) # Slicing is exclusive at end 145 | 146 | # The string to find index of, from the original batches 147 | string_markers_for_iidx = "".join(batches[start_slice:end_slice]) 148 | 149 | try: 150 | # Find this concatenated marker string within mini_corp to get a starting point 151 | iidx = mini_corp.index(string_markers_for_iidx) 152 | except ValueError: 153 | # If the joined string isn't found (e.g., if batches were empty or logic error) 154 | # try to use the start of the current batch element as a fallback index. 155 | if batches and current_region_idx_in_batches < len(batches): 156 | try: 157 | iidx = mini_corp.index( 158 | batches[current_region_idx_in_batches] 159 | ) 160 | except ValueError: 161 | continue # Skip this max_rat_idx if problematic 162 | else: 163 | continue # Skip this max_rat_idx if problematic 164 | 165 | area = mini_corp[ 166 | iidx : iidx + 3 * lq 167 | ] # Define search area in mini_corp 168 | if not area.strip(): 169 | continue 170 | 171 | # Generate sub-batches (batches2) from this 'area' 172 | # Original: [area[i:lq+i] for i in range(0, len(area) + 1)] 173 | # This creates n-grams of length lq, then shorter suffixes. 
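# Illustrative example: with area = "abcdefg" and lq = 3, the comprehension below yields
# the windows "abc", "bcd", "cde", "def", "efg", and the loop then appends the shorter
# suffixes "fg" and "g" (empty strings are filtered out afterwards).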
174 | batches2 = [area[i : i + lq] for i in range(0, len(area) - lq + 1)] 175 | # Add shorter suffixes as in original intent (approximation): 176 | for k in range(1, lq): 177 | if len(area) - lq + k < len(area): 178 | batches2.append(area[len(area) - lq + k :]) 179 | batches2 = [b for b in batches2 if b] # Ensure no empty strings 180 | 181 | if not batches2: 182 | continue 183 | 184 | ratdist2 = Parallel( 185 | backend="threading", 186 | n_jobs=n_jobs, 187 | )(delayed(get_rat_dist)(query, b) for b in batches2) 188 | 189 | ratios2 = [it[0] for it in ratdist2] 190 | distances2 = [it[1] for it in ratdist2] 191 | 192 | current_batch_max_r = max(ratios2) if ratios2 else -inf 193 | current_batch_min_d = min(distances2) if distances2 else inf 194 | 195 | # Original logic for updating global best_matches 196 | if ( 197 | current_batch_max_r >= best_ratio 198 | and current_batch_min_d <= best_dist 199 | ): 200 | # Find all strings in batches2 that yield current_batch_max_r 201 | indices_for_current_max_r = [ 202 | i 203 | for i, r_val in enumerate(ratios2) 204 | if r_val == current_batch_max_r 205 | ] 206 | 207 | if ( 208 | not indices_for_current_max_r 209 | ): # Should not happen if current_batch_max_r is from ratios2 210 | continue 211 | 212 | # Pick the first one as per original's implied logic (using index()) 213 | candidate_string_from_batch = batches2[indices_for_current_max_r[0]] 214 | 215 | if ( 216 | current_batch_max_r == best_ratio 217 | and current_batch_min_d == best_dist 218 | ): 219 | best_matches.append(candidate_string_from_batch) 220 | else: # New global bests found from this sub-batch's characteristics 221 | best_ratio = current_batch_max_r 222 | best_dist = current_batch_min_d 223 | best_matches = [candidate_string_from_batch] 224 | 225 | if best_matches: 226 | best_matches = list(set(best_matches)) # Deduplicate 227 | return MatchResult( 228 | matches=best_matches, 229 | ratio=best_ratio, 230 | distance=best_dist, 231 | quick_match_used=True, 232 | ) 233 | 234 | # Fallback or "long way" if quick way did not yield results or was skipped 235 | query_to_compare = query if case_sensitive else query.casefold() 236 | corpus_to_compare = corpus if case_sensitive else corpus.casefold() 237 | 238 | corpus_len = len(corpus_to_compare) 239 | query_len = len(query_to_compare) 240 | if query_len == 0: 241 | return MatchResult( 242 | matches=[], ratio=0.0, distance=1.0, quick_match_used=False 243 | ) # Or handle as error 244 | if corpus_len == 0: 245 | return MatchResult(matches=[], ratio=0.0, distance=1.0, quick_match_used=False) 246 | 247 | query_len_by_2 = max(query_len // 2, 1) 248 | query_len_by_step_factor = max(query_len // step_factor, 1) 249 | 250 | min_dist_val = inf # Renamed from min_dist to avoid clash with the variable from quick path if it ran partially 251 | 252 | # Initial search of corpus: ngrams of same length as query, step half query length 253 | corpus_ngrams_initial = [ 254 | corpus_to_compare[i : i + query_len] 255 | for i in range(0, corpus_len - query_len + 1, query_len_by_2) 256 | ] 257 | if not corpus_ngrams_initial: # e.g. corpus shorter than query 258 | # Try one comparison with the full corpus if it's shorter than query_len 259 | if corpus_len < query_len: 260 | corpus_ngrams_initial = [corpus_to_compare] 261 | else: # No ngrams to check, means cannot find match. 
262 | # Check what ratio/distance to return for "no match" 263 | # An empty list of matches, ratio 0, distance 1 (max normalized distance) 264 | return MatchResult( 265 | matches=[], ratio=0.0, distance=1.0, quick_match_used=False 266 | ) 267 | 268 | dists_initial = Parallel( 269 | backend="threading", 270 | n_jobs=n_jobs, 271 | )(delayed(lev_dist)(ngram, query_to_compare) for ngram in corpus_ngrams_initial) 272 | 273 | closest_match_idx_initial = 0 274 | if dists_initial: 275 | min_dist_val = min(dists_initial) 276 | closest_match_idx_initial = dists_initial.index(min_dist_val) 277 | else: # No initial distances, implies no ngrams, return no match 278 | return MatchResult(matches=[], ratio=0.0, distance=1.0, quick_match_used=False) 279 | 280 | # Determine narrowed search region based on initial best match 281 | closest_match_corpus_start_idx = closest_match_idx_initial * query_len_by_2 282 | 283 | # Define search window around this initial best match point 284 | # Original boundaries: 285 | # left = max(closest_match_idx - query_len_by_2 - 1, 0) 286 | # right = min((closest_match_idx+query_len-1) + query_len_by_2 + 2, corpus_len) 287 | # Using corpus indices: 288 | left_boundary = max(0, closest_match_corpus_start_idx - query_len_by_2 - 1) 289 | # The end of the initial best ngram is closest_match_corpus_start_idx + query_len 290 | right_boundary = min( 291 | corpus_len, 292 | (closest_match_corpus_start_idx + query_len - 1) + query_len_by_2 + 2, 293 | ) 294 | 295 | narrowed_corpus_to_compare = corpus_to_compare[left_boundary:right_boundary] 296 | # Important: We need to map findings in narrowed_corpus_to_compare back to original `corpus` strings 297 | narrowed_corpus_original_case = corpus[left_boundary:right_boundary] 298 | 299 | narrowed_corpus_len = len(narrowed_corpus_to_compare) 300 | if narrowed_corpus_len == 0: 301 | return MatchResult(matches=[], ratio=0.0, distance=1.0, quick_match_used=False) 302 | 303 | # Generate ngram lengths for thorough search in the narrowed region 304 | # From narrowed_corpus_len down to query_len_by_2, stepping by -query_len_by_step_factor 305 | ngram_lens_thorough = [ 306 | l 307 | for l in range( 308 | narrowed_corpus_len, query_len_by_2 - 1, -query_len_by_step_factor 309 | ) 310 | if l > 0 311 | ] 312 | if ( 313 | not ngram_lens_thorough 314 | ): # If narrowed_corpus_len is too small or other edge cases 315 | ngram_lens_thorough.append( 316 | min(query_len, narrowed_corpus_len) 317 | ) # Ensure at least one sensible length 318 | ngram_lens_thorough = [l for l in ngram_lens_thorough if l > 0] 319 | 320 | # Construct sets of ngrams from narrowed_corpus for each length 321 | narrowed_corpus_ngrams_thorough_sets = [] 322 | narrowed_corpus_ngrams_original_case_sets = [] 323 | 324 | for ngram_len in ngram_lens_thorough: 325 | if ngram_len > narrowed_corpus_len: 326 | continue # Should not happen if ngram_lens_thorough is generated correctly 327 | current_set_compare = [ 328 | narrowed_corpus_to_compare[i : i + ngram_len] 329 | for i in range(0, narrowed_corpus_len - ngram_len + 1) 330 | ] 331 | current_set_original = [ 332 | narrowed_corpus_original_case[i : i + ngram_len] 333 | for i in range(0, narrowed_corpus_len - ngram_len + 1) 334 | ] 335 | if current_set_compare: 336 | narrowed_corpus_ngrams_thorough_sets.append(current_set_compare) 337 | narrowed_corpus_ngrams_original_case_sets.append(current_set_original) 338 | 339 | if not narrowed_corpus_ngrams_thorough_sets: 340 | # This can happen if narrowed_corpus is shorter than all generated ngram_lens 
341 | # e.g. query_len_by_2 is too large relative to narrowed_corpus_len 342 | # As a fallback, compare query against the whole narrowed_corpus_original_case 343 | dist_val = lev_dist(narrowed_corpus_to_compare, query_to_compare) 344 | ratio_val = lev_ratio(narrowed_corpus_to_compare, query_to_compare) 345 | if dist_val <= min_dist_val: # Using min_dist_val from initial pass 346 | return MatchResult( 347 | matches=[narrowed_corpus_original_case], 348 | ratio=ratio_val, 349 | distance=dist_val, 350 | quick_match_used=False, 351 | ) 352 | else: # Initial pass was better or no match found 353 | # This part needs to re-evaluate what to return if narrowed search fails. 354 | # Fallback to returning based on min_dist_val if nothing better is found here. 355 | # For now, let's assume if we reach here with no ngrams, original min_dist_val holds the best. 356 | # This path implies the more thorough search didn't find anything or couldn't run. 357 | # Find the string associated with min_dist_val from initial pass: 358 | best_ngram_from_initial_pass_idx = dists_initial.index(min_dist_val) 359 | best_ngram_str_initial_pass_original_case = corpus[ 360 | best_ngram_from_initial_pass_idx 361 | * query_len_by_2 : best_ngram_from_initial_pass_idx * query_len_by_2 362 | + query_len 363 | ] 364 | ratio_for_initial_best = lev_ratio( 365 | ( 366 | best_ngram_str_initial_pass_original_case.casefold() 367 | if not case_sensitive 368 | else best_ngram_str_initial_pass_original_case 369 | ), 370 | query_to_compare, 371 | ) 372 | return MatchResult( 373 | matches=[best_ngram_str_initial_pass_original_case], 374 | ratio=ratio_for_initial_best, 375 | distance=min_dist_val, 376 | quick_match_used=False, 377 | ) 378 | 379 | # Calculate distances for all ngrams in the thorough search sets 380 | dist_list_thorough = [] 381 | for ngram_set in narrowed_corpus_ngrams_thorough_sets: 382 | dist_list_thorough.append( 383 | Parallel(backend="threading", n_jobs=n_jobs)( 384 | delayed(lev_dist)(ngram, query_to_compare) for ngram in ngram_set 385 | ) 386 | ) 387 | 388 | final_best_matches = [] 389 | # min_dist_val still holds the minimum distance found so far (from initial pass) 390 | 391 | for i_set, ngram_set_original_case in enumerate( 392 | narrowed_corpus_ngrams_original_case_sets 393 | ): 394 | current_dists_for_set = dist_list_thorough[i_set] 395 | for i_ngram, ngram_original_case in enumerate(ngram_set_original_case): 396 | ngram_dist = current_dists_for_set[i_ngram] 397 | if ngram_dist < min_dist_val: 398 | min_dist_val = ngram_dist 399 | final_best_matches = [ngram_original_case] 400 | elif ngram_dist == min_dist_val: 401 | final_best_matches.append(ngram_original_case) 402 | 403 | # If initial pass found a better or equal min_dist_val and thorough search didn't improve OR final_best_matches empty 404 | if not final_best_matches: 405 | # Fallback to best from initial pass if thorough search yielded nothing 406 | # This case should ideally be covered by min_dist_val initialization and updates 407 | # For safety, ensure if final_best_matches is empty, we use the best known from initial scan. 
408 | idx = dists_initial.index(min_dist_val) 409 | # Original string from corpus that corresponds to this match 410 | original_string_match = corpus[ 411 | idx * query_len_by_2 : idx * query_len_by_2 + query_len 412 | ] 413 | final_best_matches = [original_string_match] 414 | 415 | final_best_matches = list(set(final_best_matches)) # Deduplicate 416 | if not final_best_matches: # Should not be empty if corpus & query were not empty 417 | return MatchResult(matches=[], ratio=0.0, distance=1.0, quick_match_used=False) 418 | 419 | # Calculate ratio for the first best match found (or an aggregate if multiple have same min_dist_val) 420 | # Ensure query_to_compare is used for ratio calculation consistency 421 | # best_ratio_val = lev_ratio( # Original calculation commented for review 422 | # (final_best_matches[0].casefold() if not case_sensitive else final_best_matches[0]), 423 | # query_to_compare 424 | # ) 425 | # Re-calculate max ratio among all best_matches to be robust 426 | all_ratios = [ 427 | lev_ratio((bm.casefold() if not case_sensitive else bm), query_to_compare) 428 | for bm in final_best_matches 429 | ] 430 | best_ratio_val = max(all_ratios) if all_ratios else 0.0 431 | # The min_dist_val should already be correct for these final_best_matches 432 | 433 | return MatchResult( 434 | matches=final_best_matches, 435 | ratio=best_ratio_val, 436 | distance=min_dist_val, 437 | quick_match_used=False, 438 | ) # False: used long way 439 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. 
However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 
105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 
162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 
222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 
284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 
402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. 
The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. 
You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. 
If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | 633 | Copyright (C) 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published by 637 | the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see . 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | . 662 | -------------------------------------------------------------------------------- /karakeep_python_api/__main__.py: -------------------------------------------------------------------------------- 1 | from textwrap import dedent 2 | import inspect 3 | import json 4 | import sys 5 | import os 6 | import functools 7 | import re # Import re module 8 | import click 9 | import traceback # Moved import to top 10 | from typing import ( 11 | Any, 12 | List, 13 | Dict, 14 | Optional, 15 | Callable, 16 | Union, 17 | get_origin, 18 | get_args, 19 | Literal, 20 | ) 21 | from pydantic import BaseModel, ValidationError 22 | from loguru import logger # Import logger 23 | 24 | # Attempt relative imports for package execution 25 | try: 26 | # Import API class and errors directly from the module 27 | from .karakeep_api import KarakeepAPI, APIError, AuthenticationError 28 | 29 | # Models are not directly used here, API methods handle data types 30 | except ImportError: 31 | # Fallback for direct script execution (e.g., python -m karakeep_python_api ...) 32 | # Import API class and errors directly from the module 33 | from karakeep_api import KarakeepAPI, APIError, AuthenticationError 34 | 35 | 36 | # --- Serialization Helper --- 37 | def serialize_output(data: Any) -> Any: 38 | """ 39 | Recursively serialize data for JSON output, handling Pydantic models, 40 | dataclasses, lists, and dicts. 41 | """ 42 | if isinstance(data, BaseModel): 43 | return data.model_dump( 44 | mode="json" 45 | ) # Use Pydantic's built-in JSON serialization 46 | elif isinstance(data, list): 47 | return [serialize_output(item) for item in data] 48 | elif isinstance(data, dict): 49 | # Serialize dictionary values 50 | return {k: serialize_output(v) for k, v in data.items()} 51 | # Add handling for other types like datetime if needed, though Pydantic's 52 | # model_dump(mode='json') often handles them. 53 | # Basic types (str, int, float, bool, None) are returned as is. 
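# Illustrative example (Tag is a hypothetical Pydantic model): serialize_output({"tags": [Tag(id="1")]})
# would return {"tags": [{"id": "1"}]}, i.e. nested models are dumped to plain dicts
# that json.dumps can handle.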
54 | return data 55 | 56 | 57 | # --- Click CLI Setup --- 58 | 59 | # Shared options for the API client 60 | shared_options = [ 61 | click.option( 62 | "--api-endpoint", 63 | envvar="KARAKEEP_PYTHON_API_ENDPOINT", 64 | help="Full Karakeep API endpoint URL, including /api/v1/ (e.g., https://instance.com/api/v1/).", 65 | ), 66 | click.option( 67 | "--api-key", 68 | envvar="KARAKEEP_PYTHON_API_KEY", 69 | help="Karakeep API Key (required, uses env var if not provided).", 70 | required=False, 71 | ), # Made not required here, checked in context 72 | click.option( 73 | "--verify-ssl/--no-verify-ssl", 74 | default=True, 75 | envvar="KARAKEEP_PYTHON_API_VERIFY_SSL", 76 | help="Verify SSL certificates.", 77 | ), 78 | click.option( 79 | "--verbose", 80 | "-v", 81 | is_flag=True, 82 | default=False, 83 | help="Enable verbose logging.", 84 | ), 85 | click.option( 86 | "--disable-response-validation", 87 | is_flag=True, 88 | default=False, 89 | envvar="KARAKEEP_PYTHON_API_DISABLE_RESPONSE_VALIDATION", 90 | help="Disable Pydantic validation of API responses (returns raw data).", 91 | ), 92 | click.option( 93 | "--ascii", 94 | "ensure_ascii", # Use 'ensure_ascii' as the destination variable name 95 | is_flag=True, 96 | default=False, # Default is False, meaning ensure_ascii=False by default 97 | envvar="KARAKEEP_PYTHON_API_ENSURE_ASCII", 98 | help="Escape non-ASCII characters in the JSON output (default: keep Unicode).", 99 | ), 100 | ] 101 | 102 | 103 | def add_options(options): 104 | """Decorator to add a list of click options to a command.""" 105 | 106 | def _add_options(func): 107 | for option in reversed(options): 108 | func = option(func) 109 | return func 110 | 111 | return _add_options 112 | 113 | 114 | # --- Callback for --dump-openapi-specification --- 115 | def print_openapi_spec(ctx, param, value): 116 | """Callback function for the --dump-openapi-specification option.""" 117 | if not value or ctx.resilient_parsing: 118 | # Exit if the flag is not set, or if Click is doing resilient parsing (e.g., for completion) 119 | return 120 | try: 121 | package_dir = os.path.dirname(__file__) 122 | spec_path = os.path.join(package_dir, "openapi_reference.json") 123 | if not os.path.exists(spec_path): 124 | click.echo( 125 | f"Error: Specification file not found at expected location: {spec_path}", 126 | err=True, 127 | ) 128 | ctx.exit(1) # Use ctx.exit 129 | with open(spec_path, "r") as f: 130 | click.echo(f.read()) # Use click.echo 131 | except Exception as e: 132 | click.echo(f"Error reading or printing specification file: {e}", err=True) 133 | ctx.exit(1) # Exit with error code if reading failed 134 | # Exit successfully *after* the try/except block if no error occurred 135 | ctx.exit(0) 136 | 137 | 138 | @click.group(context_settings=dict(help_option_names=["-h", "--help"])) 139 | @click.option( 140 | "--dump-openapi-specification", 141 | is_flag=True, 142 | callback=print_openapi_spec, 143 | expose_value=False, # Don't pass the value to the main cli function 144 | is_eager=True, # Process this option before others 145 | help="Dump the OpenAPI specification JSON to stdout and exit.", 146 | ) 147 | @add_options( 148 | shared_options 149 | ) # Apply shared options to the group (ensure_ascii is now included) 150 | @click.pass_context 151 | def cli( 152 | ctx, 153 | api_endpoint, 154 | api_key, 155 | verify_ssl, 156 | verbose, 157 | disable_response_validation, 158 | ensure_ascii, 159 | ): 160 | """ 161 | Karakeep Python API Command Line Interface. 
162 | 163 | Dynamically generates commands based on the OpenAPI specification. 164 | Requires KARAKEEP_PYTHON_API_KEY environment variable or --api-key option. 165 | """ 166 | # Ensure the context object exists 167 | ctx.ensure_object(dict) 168 | 169 | # --- Strict Check for API Key and Endpoint --- 170 | # Check for API key (must be provided via arg or env) 171 | resolved_api_key = api_key or os.environ.get("KARAKEEP_PYTHON_API_KEY") 172 | if not resolved_api_key: 173 | raise click.UsageError( 174 | "API Key is required. Provide --api-key option or set KARAKEEP_PYTHON_API_KEY environment variable." 175 | ) 176 | 177 | # Check for API endpoint (must be provided via arg or env) 178 | resolved_api_endpoint = api_endpoint or os.environ.get( 179 | "KARAKEEP_PYTHON_API_ENDPOINT" 180 | ) 181 | if not resolved_api_endpoint: 182 | raise click.UsageError( 183 | "API endpoint is required. Provide --api-endpoint option or set KARAKEEP_PYTHON_API_ENDPOINT environment variable. " 184 | "The URL must include the API path, e.g., 'https://your-instance.com/api/v1/'." 185 | ) 186 | 187 | # Store common API parameters in the context for commands to use 188 | ctx.obj["API_ENDPOINT"] = resolved_api_endpoint # Store the resolved endpoint 189 | ctx.obj["API_KEY"] = resolved_api_key # Store the resolved key 190 | ctx.obj["VERIFY_SSL"] = verify_ssl 191 | ctx.obj["VERBOSE"] = verbose 192 | ctx.obj["DISABLE_RESPONSE_VALIDATION"] = ( 193 | disable_response_validation # Store the flag 194 | ) 195 | ctx.obj["ENSURE_ASCII"] = ensure_ascii # Store the ensure_ascii flag 196 | 197 | 198 | def create_click_command( 199 | api_method_name: str, api_method: Callable 200 | ) -> Optional[click.Command]: 201 | """ 202 | Dynamically creates a Click command for a given API method instance, 203 | inspecting its signature for arguments. Returns None if creation fails. 
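For example, a Python parameter named include_content would typically be exposed as an --include-content option; the generated wrapper converts the kebab-case keys received from Click back to snake_case before calling the underlying API method.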
204 | """ 205 | # Get the signature from the bound method (which copied it from the original function) 206 | try: 207 | sig = inspect.signature(api_method) 208 | # Exclude 'self' parameter from the signature when creating CLI options 209 | params = [p for p in sig.parameters.values() if p.name != "self"] 210 | except (ValueError, TypeError) as e: 211 | logger.warning(f"Could not get signature for method '{api_method_name}': {e}") 212 | return None 213 | 214 | # Define the command function template using a closure 215 | def command_func_factory(method_name, signature): 216 | @click.pass_context 217 | def command_func(ctx, **kwargs): 218 | """Dynamically generated command function wrapper.""" 219 | # Retrieve API parameters from context, ensuring API key is present now 220 | api_endpoint = ctx.obj["API_ENDPOINT"] 221 | api_key = ctx.obj["API_KEY"] 222 | verify_ssl = ctx.obj["VERIFY_SSL"] 223 | verbose = ctx.obj["VERBOSE"] 224 | disable_validation = ctx.obj["DISABLE_RESPONSE_VALIDATION"] # Retrieve flag 225 | ensure_ascii_output = ctx.obj["ENSURE_ASCII"] # Retrieve ensure_ascii flag 226 | 227 | if not api_key: 228 | click.echo( 229 | "Error: API Key is required via --api-key or KARAKEEP_PYTHON_API_KEY environment variable.", 230 | err=True, 231 | ) 232 | ctx.exit(1) 233 | 234 | try: 235 | # Initialize API client within the command context 236 | # Method generation already happened during inspection phase or initial load 237 | api = KarakeepAPI( 238 | api_key=api_key, 239 | api_endpoint=api_endpoint, 240 | verify_ssl=verify_ssl, 241 | verbose=verbose, 242 | disable_response_validation=disable_validation, # Pass flag to constructor 243 | ) 244 | # Get the actual bound method from the initialized API instance 245 | instance_method = getattr( 246 | api, method_name 247 | ) # Use the captured method_name 248 | 249 | # Prepare arguments for the API call from Click's kwargs 250 | call_args = {} 251 | sig_params = signature.parameters # Use captured signature 252 | 253 | # Process Click kwargs into API call arguments 254 | # Convert kebab-case keys from Click back to snake_case for Python call 255 | call_args = { 256 | k.replace("-", "_"): v for k, v in kwargs.items() if v is not None 257 | } 258 | 259 | # Remove arguments that are not part of the method signature 260 | # (e.g., if extra options were somehow passed) 261 | valid_arg_names = set(signature.parameters.keys()) 262 | call_args = {k: v for k, v in call_args.items() if k in valid_arg_names} 263 | 264 | # --- JSON Parsing for Dict/List Parameters --- 265 | # Iterate through the expected parameters from the signature 266 | for param_name, param_sig in signature.parameters.items(): 267 | if param_name in call_args: 268 | param_value = call_args[param_name] 269 | param_annotation = param_sig.annotation 270 | origin = getattr(param_annotation, "__origin__", None) 271 | 272 | # Check if the annotation is dict/list or typing.Dict/List 273 | # and if the received value is a string (needs parsing) 274 | if ( 275 | param_annotation in (dict, list) 276 | or origin in (dict, list, Dict, List) 277 | ) and isinstance(param_value, str): 278 | try: 279 | # Attempt to parse the JSON string 280 | call_args[param_name] = json.loads(param_value) 281 | logger.debug( 282 | f"Parsed JSON string for parameter '{param_name}'." 
283 | ) 284 | except json.JSONDecodeError as json_err: 285 | # Handle invalid JSON input from the user 286 | click.echo( 287 | f"Error: Invalid JSON provided for parameter '{param_name.replace('_', '-')}': {json_err}", 288 | err=True, 289 | ) 290 | click.echo(f"Provided value: {param_value}", err=True) 291 | ctx.exit(1) 292 | 293 | # Call the API method 294 | try: 295 | if method_name == "get_all_bookmarks": 296 | logger.debug( 297 | f"Special CLI pagination handling for '{method_name}'." 298 | ) 299 | cli_total_limit = call_args.pop("limit", None) 300 | # Other relevant params for get_all_bookmarks 301 | archived_filter = call_args.get("archived") 302 | favourited_filter = call_args.get("favourited") 303 | include_content_cli = call_args.get("include_content", True) 304 | 305 | call_args.pop("cursor", None) # Ignore CLI cursor 306 | 307 | all_bookmarks_data = [] 308 | current_page_api_cursor = None 309 | fetched_count = 0 310 | API_INTERNAL_PAGE_SIZE = 50 # Define a page size for API calls 311 | 312 | while True: 313 | api_call_limit = API_INTERNAL_PAGE_SIZE 314 | if cli_total_limit is not None: 315 | remaining_needed = cli_total_limit - fetched_count 316 | if remaining_needed <= 0: 317 | break # Reached or exceeded CLI total limit 318 | api_call_limit = min( 319 | API_INTERNAL_PAGE_SIZE, remaining_needed 320 | ) 321 | 322 | if ( 323 | api_call_limit <= 0 and cli_total_limit is not None 324 | ): # Avoid asking for 0 or negative items unless fetching all 325 | break 326 | 327 | logger.debug( 328 | f"Fetching page for '{method_name}' with cursor: {current_page_api_cursor}, api_limit: {api_call_limit}" 329 | ) 330 | 331 | page_call_args = { 332 | "archived": archived_filter, 333 | "favourited": favourited_filter, 334 | "limit": api_call_limit, 335 | "cursor": current_page_api_cursor, 336 | "include_content": include_content_cli, 337 | } 338 | page_call_args_filtered = { 339 | k: v for k, v in page_call_args.items() if v is not None 340 | } 341 | 342 | try: 343 | page_result_obj = instance_method( 344 | **page_call_args_filtered 345 | ) 346 | except TypeError as call_error_page: 347 | logger.error( 348 | f"Error calling API method '{method_name}' (paginated): {call_error_page}" 349 | ) 350 | logger.error( 351 | f"Provided arguments for page: {page_call_args_filtered}" 352 | ) 353 | if verbose: 354 | logger.debug(traceback.format_exc()) 355 | ctx.exit(1) 356 | 357 | bookmarks_on_this_page = [] 358 | next_api_cursor = None 359 | 360 | # Convert Pydantic model to dict using model_dump if available 361 | if hasattr(page_result_obj, "model_dump"): 362 | result_dict = page_result_obj.model_dump() 363 | elif isinstance(page_result_obj, dict): 364 | result_dict = page_result_obj 365 | else: 366 | logger.warning( 367 | f"Unexpected result type: {type(page_result_obj)}" 368 | ) 369 | result_dict = {} 370 | 371 | # Extract data and cursor from the dict 372 | bookmarks_on_this_page = result_dict.get("bookmarks", []) 373 | next_api_cursor = result_dict.get("nextCursor") 374 | 375 | logger.debug( 376 | f"Extracted {len(bookmarks_on_this_page)} bookmarks and cursor: {next_api_cursor}" 377 | ) 378 | 379 | if not isinstance(bookmarks_on_this_page, list): 380 | logger.warning( 381 | f"Expected a list of bookmarks, got {type(bookmarks_on_this_page)}. Stopping pagination." 382 | ) 383 | break 384 | 385 | all_bookmarks_data.extend(bookmarks_on_this_page) 386 | fetched_count += len(bookmarks_on_this_page) 387 | logger.debug( 388 | f"Fetched {len(bookmarks_on_this_page)} bookmarks this page. 
Total fetched: {fetched_count}." 389 | ) 390 | 391 | current_page_api_cursor = next_api_cursor 392 | if not current_page_api_cursor: 393 | logger.debug( 394 | "No nextCursor from API, pagination complete." 395 | ) 396 | break 397 | if ( 398 | cli_total_limit is not None 399 | and fetched_count >= cli_total_limit 400 | ): 401 | logger.debug( 402 | f"CLI total limit of {cli_total_limit} reached or exceeded." 403 | ) 404 | break 405 | if not bookmarks_on_this_page and api_call_limit > 0: 406 | logger.debug( 407 | "API returned an empty list of bookmarks while a positive limit was set, assuming end of data." 408 | ) 409 | break 410 | 411 | result = all_bookmarks_data # This will be a list of Bookmark models or dicts 412 | elif method_name == "get_all_highlights": 413 | logger.debug( 414 | f"Special CLI pagination handling for '{method_name}'." 415 | ) 416 | cli_total_limit = call_args.pop("limit", None) 417 | 418 | call_args.pop("cursor", None) # Ignore CLI cursor 419 | 420 | all_highlights_data = [] 421 | current_page_api_cursor = None 422 | fetched_count = 0 423 | API_INTERNAL_PAGE_SIZE = 50 # Define a page size for API calls 424 | 425 | while True: 426 | api_call_limit = API_INTERNAL_PAGE_SIZE 427 | if cli_total_limit is not None: 428 | remaining_needed = cli_total_limit - fetched_count 429 | if remaining_needed <= 0: 430 | break # Reached or exceeded CLI total limit 431 | api_call_limit = min( 432 | API_INTERNAL_PAGE_SIZE, remaining_needed 433 | ) 434 | 435 | if ( 436 | api_call_limit <= 0 and cli_total_limit is not None 437 | ): # Avoid asking for 0 or negative items unless fetching all 438 | break 439 | 440 | logger.debug( 441 | f"Fetching page for '{method_name}' with cursor: {current_page_api_cursor}, api_limit: {api_call_limit}" 442 | ) 443 | 444 | page_call_args = { 445 | "limit": api_call_limit, 446 | "cursor": current_page_api_cursor, 447 | } 448 | page_call_args_filtered = { 449 | k: v for k, v in page_call_args.items() if v is not None 450 | } 451 | 452 | try: 453 | page_result_obj = instance_method( 454 | **page_call_args_filtered 455 | ) 456 | except TypeError as call_error_page: 457 | logger.error( 458 | f"Error calling API method '{method_name}' (paginated): {call_error_page}" 459 | ) 460 | logger.error( 461 | f"Provided arguments for page: {page_call_args_filtered}" 462 | ) 463 | if verbose: 464 | logger.debug(traceback.format_exc()) 465 | ctx.exit(1) 466 | 467 | highlights_on_this_page = [] 468 | next_api_cursor = None 469 | 470 | # Convert Pydantic model to dict using model_dump if available 471 | if hasattr(page_result_obj, "model_dump"): 472 | result_dict = page_result_obj.model_dump() 473 | elif isinstance(page_result_obj, dict): 474 | result_dict = page_result_obj 475 | else: 476 | logger.warning( 477 | f"Unexpected result type: {type(page_result_obj)}" 478 | ) 479 | result_dict = {} 480 | 481 | # Extract data and cursor from the dict 482 | highlights_on_this_page = result_dict.get("highlights", []) 483 | next_api_cursor = result_dict.get("nextCursor") 484 | 485 | logger.debug( 486 | f"Extracted {len(highlights_on_this_page)} highlights and cursor: {next_api_cursor}" 487 | ) 488 | 489 | if not isinstance(highlights_on_this_page, list): 490 | logger.warning( 491 | f"Expected a list of highlights, got {type(highlights_on_this_page)}. Stopping pagination." 
492 | ) 493 | break 494 | 495 | all_highlights_data.extend(highlights_on_this_page) 496 | fetched_count += len(highlights_on_this_page) 497 | logger.debug( 498 | f"Fetched {len(highlights_on_this_page)} highlights this page. Total fetched: {fetched_count}." 499 | ) 500 | 501 | current_page_api_cursor = next_api_cursor 502 | if not current_page_api_cursor: 503 | logger.debug( 504 | "No nextCursor from API, pagination complete." 505 | ) 506 | break 507 | if ( 508 | cli_total_limit is not None 509 | and fetched_count >= cli_total_limit 510 | ): 511 | logger.debug( 512 | f"CLI total limit of {cli_total_limit} reached or exceeded." 513 | ) 514 | break 515 | if not highlights_on_this_page and api_call_limit > 0: 516 | logger.debug( 517 | "API returned an empty list of highlights while a positive limit was set, assuming end of data." 518 | ) 519 | break 520 | 521 | result = all_highlights_data # This will be a list of Highlight models or dicts 522 | else: 523 | # Original behavior for other commands 524 | logger.debug( 525 | f"Calling API method '{method_name}' with args: {call_args}" 526 | ) 527 | result = instance_method(**call_args) 528 | 529 | except TypeError as call_error: 530 | logger.error( 531 | f"Error calling API method '{method_name}': {call_error}" 532 | ) 533 | logger.error(f"Provided arguments: {call_args}") 534 | logger.error(f"Expected signature: {signature}") 535 | # Add traceback in verbose mode 536 | if verbose: 537 | logger.debug(traceback.format_exc()) 538 | ctx.exit(1) 539 | 540 | # Serialize and print the result 541 | if result is not None: 542 | output_data = serialize_output(result) 543 | # Use ensure_ascii_output flag to control JSON encoding 544 | click.echo( 545 | json.dumps( 546 | output_data, indent=2, ensure_ascii=ensure_ascii_output 547 | ) 548 | ) 549 | else: 550 | # Handle None result (e.g., 204 No Content) gracefully 551 | # Verbose check is implicitly handled by logger level 552 | logger.debug("Operation successful (No content returned).") 553 | 554 | except ( 555 | APIError, 556 | AuthenticationError, 557 | ValueError, 558 | ValidationError, 559 | TypeError, 560 | ) as e: 561 | logger.error(f"Error: {e}") 562 | # Provide more detail (traceback) for TypeErrors during binding/call in verbose mode 566 | if isinstance(e, TypeError) and verbose: 567 | logger.debug(traceback.format_exc()) # Use top-level import 568 | sys.exit(1) 569 | except Exception as e: 570 | logger.error(f"An unexpected error occurred: {e}") 571 | if verbose: 572 | logger.debug(traceback.format_exc()) # Use top-level import 573 | sys.exit(1) 574 | 575 | # Set the name of the inner function for help display purposes 576 | command_func.__name__ = method_name 577 | return command_func 578 | 579 | # Create the actual command function instance using the factory 580 | command_func = command_func_factory(api_method_name, sig) 581 | 582 | # --- Add Click options/arguments based on the captured method signature --- 583 | click_params = [] 584 | # Use the docstring from the original method (captured by functools.update_wrapper) 585 | docstring = api_method.__doc__ or f"Execute the {api_method_name} API operation."
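# NOTE: the parsing below assumes each API method carries a Google-style docstring with
# literal "Args:", "Returns:" and "Raises:" sections, roughly like the sketch here
# (illustrative only -- the parameter names depend on the actual method):
#
#     Get a single bookmark.
#
#     Args:
#         bookmark_id: The ID of the bookmark to retrieve.
#         include_content: Whether to include cached content in the response.
#
#     Returns:
#         The bookmark data.
#
#     Raises:
#         APIError: If the request fails.
#
# The "Args:" descriptions are extracted below to build the --help text of the generated CLI options.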
586 | docstring = dedent(docstring) 587 | docstring_lines = docstring.split("\n") 588 | help_text = " ".join( 589 | docstring.split("\n\n")[0].splitlines() 590 | ).strip() # First lines as short help 591 | # Full docstring as help 592 | full_help = docstring 593 | 594 | # tweak the whitespaces in the full help: 595 | full_help = full_help.replace("\n ", " ") 596 | full_help = full_help.replace("\n", "\n\n") 597 | 598 | # Extract parameter descriptions from the Args section of the docstring 599 | param_descriptions = {} 600 | in_args_section = False 601 | args_section_lines = [] 602 | assert "Returns:" in docstring 603 | assert "Raises:" in docstring 604 | for line in docstring_lines: 605 | stripped_line = line.strip() 606 | if stripped_line == "Args:": 607 | in_args_section = True 608 | elif stripped_line == "Returns:" or stripped_line == "Raises:": 609 | in_args_section = False # Stop capturing when Returns/Raises section starts 610 | elif in_args_section and stripped_line: 611 | args_section_lines.append(stripped_line) 612 | # Use regex to capture 'param_name: description' structure, allowing leading whitespace 613 | # Pattern: ^\s+ (parameter_name): \s* (description) $ 614 | match = re.match(r"^\s+([a-zA-Z_][a-zA-Z0-9_]*):\s+(.*)$", line) 615 | if match and match.group(1) != "Example": 616 | param_name = match.group(1) 617 | description = match.group(2).strip() 618 | param_descriptions[param_name] = description 619 | logger.trace( 620 | f"Parsed docstring param: '{param_name}' -> '{description}'" 621 | ) 622 | else: 623 | param_descriptions[param_name] += " " + stripped_line 624 | 625 | # Removed breakpoint() that was added for debugging 626 | # Add parameters from signature to Click command 627 | for param in params: # Use the filtered list from signature inspection 628 | param_name_cli = param.name.replace("_", "-") # Use kebab-case for CLI options 629 | is_required_in_sig = param.default is inspect.Parameter.empty 630 | default_value = param.default if not is_required_in_sig else None 631 | param_type = click.STRING # Default to string for CLI 632 | 633 | # Basic type mapping for Click 634 | annotation = param.annotation 635 | origin = getattr(annotation, "__origin__", None) 636 | args = getattr(annotation, "__args__", []) 637 | 638 | # Determine Click type and if it's a flag 639 | is_flag = False 640 | click_type = click.STRING 641 | if annotation is int: 642 | click_type = click.INT 643 | elif annotation is float: 644 | click_type = click.FLOAT 645 | elif annotation is bool: 646 | click_type = click.BOOL 647 | # Boolean options are flags if they don't have a default or default is False 648 | is_flag = is_required_in_sig or default_value is False 649 | # Handle Optional[T] - makes the option not required unless T is bool 650 | elif origin is Union and type(None) in args and len(args) == 2: 651 | non_none_type = args[0] if args[1] is type(None) else args[1] 652 | if non_none_type is int: 653 | click_type = click.INT 654 | elif non_none_type is float: 655 | click_type = click.FLOAT 656 | elif non_none_type is bool: 657 | click_type = click.BOOL 658 | # Optional bools are typically flags like --enable-feature/--disable-feature 659 | # For simplicity, treat as a standard option unless explicitly designed as toggle 660 | is_flag = ( 661 | False # Treat Optional[bool] as --option/--no-option by default 662 | ) 663 | # Keep click_type as STRING for Optional[List/Dict/str/Any] 664 | is_required_in_sig = False # Optional means not required 665 | default_value = None # Default for Optional is 
None 666 | 667 | # Handle List[T] or Dict[K, V] - expect JSON string 668 | elif origin in (list, dict, List, Dict) or annotation in (list, dict): 669 | click_type = click.STRING # Expect JSON string 670 | # Handle Literal[...] for choices 671 | elif origin is Literal: 672 | choices = get_args(annotation) 673 | # Ensure all choices are strings for click.Choice 674 | if all(isinstance(c, str) for c in choices): 675 | click_type = click.Choice(choices, case_sensitive=False) 676 | else: 677 | logger.warning( 678 | f"Parameter '{param.name}' is Literal but contains non-string types. Treating as STRING." 679 | ) 680 | click_type = click.STRING # Fallback 681 | 682 | # Determine option name(s) and help text 683 | option_names = [f"--{param_name_cli}"] 684 | # Add /--no- option for boolean flags that are not required and default to True 685 | if ( 686 | is_flag 687 | and annotation is bool 688 | and not is_required_in_sig 689 | and default_value is True 690 | ): 691 | option_names.append(f"--no-{param_name_cli}") 692 | 693 | param_help = param_descriptions.get(param.name, f"Parameter '{param.name}'.") 694 | if click_type is click.STRING and ( 695 | origin in (list, dict) or annotation in (list, dict) 696 | ): 697 | param_help += " (Provide as JSON string)" 698 | elif isinstance(click_type, click.Choice): 699 | param_help += f" (Choices: {', '.join(click_type.choices)})" 700 | 701 | click_required = is_required_in_sig and default_value is None and not is_flag 702 | 703 | # Make copies of properties that might be modified for specific commands/params 704 | current_param_help = param_help 705 | current_click_required = click_required 706 | current_default_value = default_value 707 | current_is_flag = ( 708 | is_flag # Though is_flag interpretation might change help/required 709 | ) 710 | 711 | # Special handling for 'get_all_bookmarks' command parameters 712 | if api_method_name == "get_all_bookmarks": 713 | if param.name == "cursor": 714 | current_param_help = ( 715 | "[Ignored by CLI for get-all-bookmarks] " + param_help 716 | ) 717 | current_click_required = ( 718 | False # Cursor is handled by CLI, not required from user 719 | ) 720 | current_default_value = ( 721 | None # Explicitly set default to None for ignored param 722 | ) 723 | elif param.name == "limit": 724 | current_param_help = "Total maximum number of bookmarks to fetch across pages for get-all-bookmarks. If omitted, all are fetched." 725 | # For 'limit', required status and default remain as derived from its Optional[int] type hint 726 | # current_click_required and current_default_value will be correctly False and None respectively. 727 | 728 | # Special handling for 'get_all_highlights' command parameters 729 | if api_method_name == "get_all_highlights": 730 | if param.name == "cursor": 731 | current_param_help = ( 732 | "[Ignored by CLI for get-all-highlights] " + param_help 733 | ) 734 | current_click_required = ( 735 | False # Cursor is handled by CLI, not required from user 736 | ) 737 | current_default_value = ( 738 | None # Explicitly set default to None for ignored param 739 | ) 740 | elif param.name == "limit": 741 | current_param_help = "Total maximum number of highlights to fetch across pages for get-all-highlights. If omitted, all are fetched." 742 | # For 'limit', required status and default remain as derived from its Optional[int] type hint 743 | # current_click_required and current_default_value will be correctly False and None respectively. 
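# Illustrative effect of the overrides above (assuming the CLI is run as a module;
# an installed console script may expose a different entry-point name):
#   python -m karakeep_python_api get-all-bookmarks --limit 120   # fetch at most 120 bookmarks, paginating internally
#   python -m karakeep_python_api get-all-highlights              # fetch every highlight; --cursor is ignored by the CLI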
744 | 745 | # Add the Click Option 746 | click_params.append( 747 | click.Option( 748 | option_names, 749 | type=click_type, 750 | required=current_click_required, 751 | default=current_default_value if not current_is_flag else None, 752 | help=current_param_help, 753 | is_flag=(current_is_flag if len(option_names) == 1 else False), 754 | show_default=not current_is_flag and current_default_value is not None, 755 | # Click derives the Python identifier (e.g., 'bookmark_id') from the first long option name 756 | ) 757 | ) 758 | 759 | # Create the Click command 760 | try: 761 | dynamic_command = click.Command( 762 | name=api_method_name.replace("_", "-"), # Use kebab-case for command names 763 | callback=command_func, 764 | params=click_params, 765 | help=full_help, 766 | short_help=help_text, 767 | ) 768 | return dynamic_command 769 | except Exception as e: 770 | logger.warning(f"Failed to create click command for '{api_method_name}': {e}") 771 | return None 772 | 773 | 774 | # --- Dynamically Add Commands to CLI Group --- 775 | def add_commands_to_cli(cli_group): 776 | """ 777 | Inspects the KarakeepAPI class *statically* to find public methods 778 | and adds them as Click commands. Does NOT require API keys or URL for inspection. 779 | """ 780 | logger.info("Statically inspecting KarakeepAPI class and generating commands...") 781 | 782 | try: 783 | added_count = 0 784 | skipped_count = 0 785 | # Inspect the KarakeepAPI class directly, not an instance 786 | for name, member in inspect.getmembers(KarakeepAPI): 787 | # Check if it's a public function/method defined in the class 788 | if ( 789 | not name.startswith("_") 790 | and inspect.isfunction( 791 | member 792 | ) # Check if it's a function (method in class def) 793 | # Add further checks if needed, e.g., based on naming convention or decorators 794 | ): 795 | try: 796 | # Attempt to create a command for the method 797 | # We pass the function object directly. The command_func will later 798 | # get the bound method from the API instance created at runtime. 799 | command = create_click_command(name, member) 800 | if command: 801 | cli_group.add_command(command) 802 | added_count += 1 803 | else: 804 | logger.warning(f"Skipped command generation for method: {name}") 805 | skipped_count += 1 806 | except Exception as cmd_gen_e: 807 | logger.warning( 808 | f"Failed to create command for method '{name}': {cmd_gen_e}" 809 | ) 810 | skipped_count += 1 811 | 812 | if added_count == 0: 813 | logger.warning( 814 | "No API commands were dynamically added. Check KarakeepAPI class definition and logs." 815 | ) 816 | else: 817 | logger.info(f"Added {added_count} API commands. 
Skipped {skipped_count}.") 818 | 819 | except Exception as e: 820 | # Handle errors during static inspection or command creation 821 | logger.error(f"Unexpected error during dynamic command setup: {e}") 822 | # Determine verbosity from environment for traceback logging during setup 823 | verbose_setup = os.environ.get("KARAKEEP_PYTHON_API_VERBOSE", "").lower() in ( 824 | "true", 825 | "1", 826 | "yes", 827 | ) 828 | if verbose_setup: 829 | logger.debug(traceback.format_exc()) # Use top-level import 830 | # Raise an exception to halt execution if setup fails 831 | error_message = f"Error: Unexpected error during dynamic command setup: {e}" 832 | raise click.ClickException(error_message) 833 | 834 | 835 | # Add commands when the script is loaded by calling the function 836 | add_commands_to_cli(cli) 837 | 838 | # Main entry point for the script 839 | if __name__ == "__main__": 840 | # Normal Click execution starts here. The --dump-openapi-specification 841 | # is now handled by its callback function defined above. 842 | cli(obj={}) # Pass initial empty object for context 843 | --------------------------------------------------------------------------------