├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── AGENTS.md ├── LICENSE ├── README.md ├── llm_arxiv.py ├── pyproject.toml └── tests ├── pytest.py └── test_arxiv.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | # Optional: Add trigger for pushes to master if you want to publish on every push 7 | # push: 8 | # branches: 9 | # - master 10 | 11 | permissions: 12 | contents: read 13 | 14 | jobs: 15 | test: 16 | runs-on: ubuntu-latest 17 | strategy: 18 | matrix: 19 | python-version: ["3.10", "3.11", "3.12"] 20 | steps: 21 | - name: Check Python version (skip 3.9) 22 | if: matrix.python-version == '3.9' 23 | run: | 24 | echo "Skipping test job for Python 3.9 as it is explicitly excluded" 25 | exit 0 # Exit successfully to prevent job failure if 3.9 somehow runs 26 | 27 | - uses: actions/checkout@v4 28 | - name: Set up Python ${{ matrix.python-version }} 29 | if: matrix.python-version != '3.9' 30 | uses: actions/setup-python@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | cache: pip 34 | cache-dependency-path: pyproject.toml 35 | - name: Install dependencies 36 | if: matrix.python-version != '3.9' 37 | run: | 38 | python -m pip install --upgrade pip 39 | python -m pip install -e '.[test]' 40 | - name: Run tests 41 | if: matrix.python-version != '3.9' 42 | run: | 43 | python -m pytest 44 | deploy: 45 | runs-on: ubuntu-latest 46 | needs: [test] 47 | permissions: 48 | id-token: write 49 | steps: 50 | - uses: actions/checkout@v4 51 | - name: Set up Python 52 | uses: actions/setup-python@v5 53 | with: 54 | python-version: "3.12" 55 | cache: pip 56 | cache-dependency-path: pyproject.toml 57 | - name: Install dependencies 58 | run: | 59 | python -m pip install setuptools wheel build 60 | - name: Build 61 | run: | 62 | python -m build 63 | - name: Publish 64 | uses: 
pypa/gh-action-pypi-publish@release/v1 65 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.10", "3.11", "3.12"] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v5 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install -e '.[test]' 28 | - name: Run tests 29 | run: | 30 | python -m pytest 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | venv 6 | .eggs 7 | .pytest_cache 8 | *.egg-info 9 | .DS_Store 10 | .vscode 11 | dist 12 | build 13 | -------------------------------------------------------------------------------- /AGENTS.md: -------------------------------------------------------------------------------- 1 | # AI Agent Instructions for `llm-arxiv` 2 | 3 | This document provides guidance for AI agents assisting with the development and maintenance of the `llm-arxiv` repository. 4 | 5 | ## Project Overview 6 | 7 | `llm-arxiv` is a Python-based plugin for the [LLM CLI tool](https://llm.datasette.io/) that enables users to load and process academic papers from arXiv. It fetches paper metadata and content, typically the PDF, and makes it available for language model processing. 8 | 9 | ## Key Files and Directories 10 | 11 | * **`llm_arxiv.py`**: This is the core file containing the plugin's logic. 
It implements the LLM plugin interface and handles fetching and processing arXiv papers. The `[project.entry-points.llm]` section in `pyproject.toml` points to this file (`arxiv = "llm_arxiv"`). 12 | * **`pyproject.toml`**: The main configuration file for the project. It defines dependencies (e.g., `llm`, `arxiv`, `PyMuPDF`), build system settings, project metadata, and the plugin entry point. 13 | * **`arxiv.py`**: Note: no `arxiv.py` appears in the current repository tree; `arxiv` is the third-party package imported by `llm_arxiv.py`. If such a file exists in your checkout, it might contain utility functions or a script related to interacting with the `arxiv` package or API. Be cautious if modifying, as its role needs to be clearly understood in context of the main `llm_arxiv.py` plugin. 14 | * **`fitz.py`**: Note: no `fitz.py` appears in the current repository tree; `fitz` is the import name of `PyMuPDF` used by `llm_arxiv.py`. If such a file exists in your checkout, it likely contains utility functions or a script related to PDF processing, using `PyMuPDF` (which provides Fitz bindings). Similar to `arxiv.py`, understand its specific role before making changes. 15 | * **`README.md`**: The primary documentation for human users of the plugin. It should be kept up-to-date with features, installation instructions, and usage examples. 16 | * **`AGENTS.md`**: (This file) Contains specific instructions and context for AI agents working on this codebase. 17 | * **`tests/`**: This directory contains automated tests for the plugin, likely using `pytest`. New features and bug fixes should ideally be accompanied by relevant tests. 18 | * **`.github/workflows/`**: Contains GitHub Actions workflow definitions, for example, for running tests automatically. 19 | 20 | ## Common Development Tasks 21 | 22 | When asked to perform tasks, consider the following: 23 | 24 | * **Adding Features**: 25 | * Modifications will likely center around `llm_arxiv.py`. 26 | * Consider how new features impact dependencies (`pyproject.toml`). 27 | * Add corresponding tests in the `tests/` directory. 28 | * Update `README.md` with user-facing documentation for the new feature. 29 | * **Bug Fixing**: 30 | * Identify the relevant module (`llm_arxiv.py`, `arxiv.py`, `fitz.py`). 
31 | * Write a test case that reproduces the bug if possible. 32 | * Ensure the fix doesn't break existing functionality by running all tests. 33 | * **Dependency Management**: 34 | * Changes to dependencies are made in `pyproject.toml`. 35 | * Be mindful of version compatibility. 36 | * **Documentation**: 37 | * Keep `README.md` clear and up-to-date. 38 | * Update this file (`AGENTS.md`) if there are significant changes to the development workflow or codebase structure relevant to AI agents. 39 | 40 | ## Important Considerations 41 | 42 | * **arXiv API Usage**: If directly interacting with the arXiv API (via the `arxiv` package or otherwise), be mindful of rate limits and terms of service. 43 | * **PDF Parsing**: PDF parsing can be complex. `PyMuPDF` (Fitz) is used. Ensure robustness and handle potential errors gracefully. 44 | * **Code Style and Quality**: Follow existing code style. Ensure code is clear, well-commented where non-obvious, and efficient. 45 | * **Testing**: Always aim to maintain or increase test coverage. 46 | 47 | By following these guidelines, AI agents can contribute effectively to the `llm-arxiv` project. 48 | 49 | ## Known Issues and Debugging Notes 50 | 51 | ### Image Resizing Bug (Incorrectly Resizes to 1x1 Pixels) 52 | 53 | * **Problem Description**: 54 | * When using the `llm arxiv ... -r` command (enabling image resizing), all extracted images from the PDF are being incorrectly resized to 1x1 pixels. 55 | * This causes any downstream vision-enabled LLM to interpret these images as simple solid blocks of color, rather than recognizing their actual content. 56 | * The issue is specific to the resizing operation. If resizing is disabled (e.g., `llm arxiv ID -i all` without `-r`, or via the fragment loader `llm -f arxiv:ID?i=all`), images are processed and described correctly by the LLM (though at their original, unresized dimensions). 
57 | 58 | * **Location of Buggy Code**: 59 | * File: `llm_arxiv.py` 60 | * Function: `_process_arxiv_paper` 61 | * Specific Block: The section responsible for calculating `new_width` and `new_height` within the `if perform_resize:` block. Debug logs confirm that `new_width` and `new_height` are both evaluating to `1` before being passed to `img.resize()`. 62 | 63 | * **Debugging Steps (to isolate the miscalculation)**: 64 | 1. Focus on the arithmetic that calculates `new_width` and `new_height`: 65 | ```python 66 | if img.width > img.height: 67 | new_width = max_dim_to_use 68 | new_height = max(1, int(max_dim_to_use * img.height / img.width)) 69 | else: 70 | new_height = max_dim_to_use 71 | new_width = max(1, int(max_dim_to_use * img.width / img.height)) 72 | ``` 73 | 2. Add detailed debug print statements *immediately before* these calculations to log the exact runtime values of: 74 | * `img.width` (original width of the image being processed) 75 | * `img.height` (original height) 76 | * `max_dim_to_use` (the target maximum dimension, e.g., 512) 77 | 3. Also, print the result of the intermediate floating-point calculation *before* it's passed to `int()`: 78 | * e.g., `value_before_int = max_dim_to_use * img.height / img.width` (and its counterpart for `new_width`) 79 | 4. Finally, print the calculated `new_width` and `new_height` *immediately after* they are computed and before they are used in `img.resize()`. 80 | * This detailed logging should reveal why the aspect ratio calculation is resulting in a value that, when truncated by `int()`, becomes 0 (which `max(1, ...)` then turns into 1). 81 | 82 | * **How to Prove It's Solved**: 83 | 1. 
**Check Debug Output**: After applying a fix, the new debug print statements (from step 4 above) should show that `new_width` and `new_height` are sensible dimensions that maintain the aspect ratio and respect `max_dim_to_use` (e.g., for a 1500x500 image with `max_dim_to_use=512`, the new dimensions should be around 512x170, not 1x1). 84 | 2. **Check LLM Output**: Run the command `llm arxiv ID -i all -r "describe the images"`. The LLM should now describe the actual content of the figures/diagrams in the paper, not just solid colors. 85 | 3. **Verify `img.size` after resize**: The existing debug line `print(f"Debug: Image *after* resize: Mode: {img.mode}, Size: {img.size}, Info: {img.info}", file=sys.stderr)` should show the corrected, non-1x1 dimensions. 86 | 4. **(Optional Advanced Test)**: For a more robust automated test, one could theoretically (in `tests/test_arxiv.py`): 87 | * Mock `_process_arxiv_paper` or have a test utility that calls it directly with a known image that requires resizing. 88 | * Capture the `attachments` list returned. 89 | * For each attachment, use `Image.open(io.BytesIO(attachment.content))` to load the processed image. 90 | * Assert that the dimensions of this re-loaded image are the expected resized dimensions (e.g., not 1x1, and respecting the aspect ratio and max dimension). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # llm-arxiv 2 | 3 | [![PyPI](https://img.shields.io/pypi/v/llm-arxiv.svg)](https://pypi.org/project/llm-arxiv/) 4 | [![Changelog](https://img.shields.io/github/v/release/agustif/llm-arxiv?include_prereleases&label=changelog)](https://github.com/agustif/llm-arxiv/releases) 5 | [![Tests](https://github.com/agustif/llm-arxiv/actions/workflows/test.yml/badge.svg)](https://github.com/agustif/llm-arxiv/actions/workflows/test.yml) 6 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/agustif/llm-arxiv/blob/main/LICENSE) 7 | 8 | LLM plugin for loading arXiv papers and their images. 9 | 10 | This plugin allows you to search for arXiv papers, fetch their text content, and optionally, their images directly into `llm`. 11 | 12 | ## Installation 13 | 14 | Install this plugin in the same environment as [LLM](https://llm.datasette.io/). 
15 | 16 | ```bash 17 | llm install llm-arxiv 18 | ``` 19 | 20 | The command above will also install the necessary dependencies: `arxiv`, `PyMuPDF`, and `Pillow`. 21 | 22 | ## Usage 23 | 24 | This plugin provides three main ways to interact with arXiv papers: 25 | 26 | 1. **As a fragment loader:** Allows you to inject arXiv paper content (text and optionally images) directly into a prompt using the `-f` or `--fragment` option with `llm`. 27 | 2. **As a standalone command (`llm arxiv`):** Provides an `llm arxiv` command to fetch, process, and output paper content directly to stdout, which can then be piped to other commands or models. 28 | 3. **As a search command (`llm arxiv-search`):** Allows you to search arXiv for papers based on a query string. 29 | 30 | ### 1. Fragment Loader (`-f arxiv:...`) 31 | 32 | You can load an arXiv paper by its ID or full URL. The text content (converted to Markdown) and any selected images (as attachments) will be passed to the language model. 33 | 34 | **Syntax:** 35 | 36 | `llm -f 'arxiv:PAPER_ID_OR_URL[?options]' "Your prompt here..."` 37 | 38 | * `PAPER_ID_OR_URL`: Can be an arXiv ID (e.g., `2310.06825`, `astro-ph/0601009`) or a full arXiv URL (e.g., `https://arxiv.org/abs/2310.06825`, `http://arxiv.org/pdf/2310.06825.pdf`). 39 | * `[?options]`: Optional query parameters to control image inclusion and resizing. (Remember to quote the argument if using `?` or `&` in your shell). 40 | 41 | **Fragment Loader Options:** 42 | 43 | * `i` / `include_images`: Controls image inclusion. If not specified, no images are included. 44 | * `?i` or `?i=` or `?i=all`: Include all images from the paper. 45 | * `?i=none`: Include no images (same as omitting `?i`). 46 | * `?i=P:pages`: Include all images from specified pages. `pages` is a comma-separated list of page numbers or ranges (e.g., `P:1`, `P:1,3-5`, `P:2,4`). Page numbers are 1-indexed. 
47 | * `?i=G:indices`: Include images by their global index in the document (sequentially numbered as they appear). `indices` is a comma-separated list of image indices or ranges (e.g., `G:1`, `G:1-5,10`). Indices are 1-indexed. 48 | * `r` / `resize_images`: Controls image resizing. Resizing only applies if images are included. 49 | * `?r` or `?r=true`: Enable image resizing. Images will be resized to a maximum dimension of 512px by default, preserving aspect ratio. Only images larger than this will be downscaled. 50 | * `?r=PIXELS`: Enable image resizing and set a custom maximum dimension (e.g., `?r=800`). 51 | 52 | **Examples (Fragment Loader):** 53 | 54 | * Load text only: 55 | ```bash 56 | llm -f 'arxiv:2310.06825' "Summarize this paper." 57 | ``` 58 | * Load text and all images (resized to default 512px max): 59 | ```bash 60 | llm -f 'arxiv:2310.06825?i&r' -m gpt-4-vision-preview "Explain the diagrams in this paper." 61 | ``` 62 | * Load text and images from page 1 and 3, resized to 800px max: 63 | ```bash 64 | llm -f 'arxiv:2310.06825?i=P:1,3&r=800' -m gemini-pro-vision "Describe the images on pages 1 and 3." 65 | ``` 66 | * Load text and the first 5 globally indexed images, no resizing: 67 | ```bash 68 | llm -f 'arxiv:2310.06825?i=G:1-5' -m some-image-model "What do the first five images show?" 69 | ``` 70 | 71 | ### 2. Standalone Command (`llm arxiv ...`) 72 | 73 | The `llm arxiv` command fetches and processes an arXiv paper. 74 | * If no prompt is provided, it outputs the paper's content as Markdown to standard output. This can be piped to other commands or LLMs. 75 | * If a `PROMPT` is provided, it processes the paper content (including any selected images as attachments) with the specified or default LLM. 76 | 77 | **Syntax:** 78 | 79 | `llm arxiv PAPER_ID_OR_URL [PROMPT] [OPTIONS]` 80 | 81 | **Arguments:** 82 | 83 | * `PAPER_ID_OR_URL`: The arXiv ID (e.g., `2310.06825`) or full URL. 
84 | * `PROMPT` (Optional): A prompt to send to an LLM along with the paper's content. 85 | 86 | **Command Options:** 87 | 88 | * `-i SPEC` / `--include-images SPEC`: 89 | Controls image inclusion. If `-i` is not specified, no images are included, whether or not a prompt is given. 90 | * `-i all` or (if `PROMPT` is present) simply `-i` with no value: Include all images. 91 | * `-i ""` (empty string value): Include all images. 92 | * `-i none`: Include no images. 93 | * `-i P:pages`: Include all images from specified pages (e.g., `P:1`, `P:1,3-5`). 94 | * `-i G:indices`: Include images by their global index (e.g., `G:1`, `G:1-5,10`). 95 | * `-r` / `--resize-images`: 96 | Enable image resizing. Images will be resized to a maximum dimension of 512px by default, preserving aspect ratio. Only images larger than this will be downscaled. 97 | * `-d PIXELS` / `--max-dimension PIXELS`: 98 | Set a custom maximum dimension in pixels for resizing. Requires `-r` to be active. 99 | * `-m MODEL_ID` / `--model MODEL_ID`: 100 | Specify the LLM model to use if a `PROMPT` is provided. 101 | * `-s SYSTEM_PROMPT` / `--system SYSTEM_PROMPT`: 102 | Specify a system prompt to use with the LLM if a `PROMPT` is provided. 103 | 104 | **Examples (Standalone Command):** 105 | 106 | * Get Markdown content of a paper: 107 | ```bash 108 | llm arxiv 2310.06825 109 | ``` 110 | * Get Markdown, prepare all images (resized), then pipe to a model: 111 | ```bash 112 | llm arxiv 2310.06825 -i all -r | llm -m gpt-4-vision-preview "Summarize this, paying attention to figures." 113 | ``` 114 | * Directly prompt an LLM with the paper's content and images from pages 2 and 4 (resized to 600px): 115 | ```bash 116 | llm arxiv 2310.06825 "Explain figures on page 2 and 4." 
-i P:2,4 -r -d 600 -m gpt-4o 117 | ``` 118 | * Summarize a paper using the default LLM and include all images: 119 | ```bash 120 | llm arxiv 2310.06825 "Summarize the key findings." -i all 121 | ``` 122 | 123 | ### 3. Search Command (`llm arxiv-search ...`) 124 | 125 | The `llm arxiv-search` command allows you to search for papers on arXiv using a query string. 126 | 127 | **Syntax:** 128 | 129 | `llm arxiv-search [OPTIONS] QUERY_STRING` 130 | 131 | **Arguments:** 132 | 133 | * `QUERY_STRING`: The search query (e.g., "quantum computing", "au:Hawking AND ti:\"black holes\""). See [arXiv API user manual](https://arxiv.org/help/api/user-manual#query_details) for advanced query syntax. 134 | 135 | **Options:** 136 | 137 | * `-n INT`, `--max-results INT`: Maximum number of search results to return (Default: `5`). 138 | * `--sort-by [relevance|lastUpdatedDate|submittedDate]`: Sort order for search results (Default: `relevance`). 139 | * `--details`: Show more details for each result, including authors, full abstract, categories, publication/update dates, and PDF link. 140 | 141 | **Output:** 142 | 143 | For each search result, the command will display: 144 | * The paper's ID and Title. 145 | * A suggested command to fetch the full paper with `llm arxiv PAPER_ID`. This command is styled (e.g., bold, green, underlined, prefixed with `$`) for visibility. 146 | * A brief abstract (or full details if `--details` is used). 147 | 148 | Additionally, the script will attempt to copy all the suggested `llm arxiv PAPER_ID` commands (newline-separated) to your system clipboard using an OSC 52 escape sequence. A message like `(Attempted to copy N command(s) to clipboard)` will be printed to stderr. The success of this automatic copy depends on your terminal emulator's support and configuration (e.g., iTerm2 needs clipboard access enabled for applications). 
149 | 150 | **Examples (Search Command):** 151 | 152 | * Search for "large language models" and get top 3 results (brief): 153 | ```bash 154 | llm arxiv-search -n 3 "large language models" 155 | ``` 156 | (This will also attempt to copy the 3 suggested `llm arxiv` commands to your clipboard.) 157 | 158 | * Search for papers by author "Hinton" on "neural networks", sorted by submission date, with full details: 159 | ```bash 160 | llm arxiv-search --sort-by submittedDate --details "au:Hinton AND ti:\"neural network\"" 161 | ``` 162 | 163 | ## Image Handling Notes 164 | 165 | * **Rationale for Optional Images:** Processing and including images can significantly increase the data size sent to language models. Many models have limitations on input context window size, and some may not support image inputs at all or may incur higher costs for them. The granular controls for image inclusion (all, none, specific pages/indices) and resizing allow users to manage this, ensuring that only necessary visual information is passed to the LLM, optimizing for cost, speed, and model compatibility. 166 | * Images are extracted from the PDF, converted to Markdown placeholders `[IMAGE: http://arxiv.org/abs/ID#page_X_img_Y]`, and attached as `llm.Attachment` objects if selected. 167 | * Supported input image formats from PDFs include common types like JPEG, PNG, GIF, BMP. Efforts are made to convert others, but complex or rare formats might be skipped. 168 | * When resized, images are converted to JPEG (for most common types) or PNG (if transparency or other features warrant it) to save tokens and improve compatibility with models. 169 | * Image processing errors are printed to `stderr` but do not stop the text extraction. 
170 | 171 | ## Development 172 | 173 | To contribute to this plugin, clone the repository and install it in editable mode: 174 | 175 | ```bash 176 | git clone https://github.com/agustif/llm-arxiv.git 177 | cd llm-arxiv 178 | # It's recommended to use a virtual environment 179 | python -m venv venv 180 | source venv/bin/activate # On Windows use `venv\\Scripts\\activate` 181 | # Install in editable mode 182 | pip install -e . 183 | # Install additional dependencies for testing (e.g., pytest, pytest-cov) 184 | pip install pytest pytest-cov 185 | # Run tests 186 | pytest tests/ 187 | ``` 188 | 189 | ## AGENTS.md 190 | 191 | See [AGENTS.md](AGENTS.md) for notes on how AI agents should interpret and use this tool and its outputs. 192 | -------------------------------------------------------------------------------- /llm_arxiv.py: -------------------------------------------------------------------------------- 1 | import llm 2 | import arxiv 3 | # Keep specific import for this one as it seemed to work 4 | from arxiv import UnexpectedEmptyPageError, HTTPError 5 | import fitz # PyMuPDF 6 | import tempfile 7 | import re 8 | from typing import List, Union, Tuple, Optional, Set, TypedDict, Literal # Added Set, TypedDict, Literal 9 | import base64 # For image encoding 10 | import markdownify # Added for HTML to Markdown conversion 11 | import io # For handling image bytes 12 | from PIL import Image # Added for image resizing 13 | import os # Added for environment variable access 14 | from urllib.parse import parse_qs # Added for parsing options from argument 15 | import click # For the new command 16 | import sys # Ensure sys is imported for stderr printing 17 | import datetime # For formatting dates from arxiv results 18 | 19 | # --- Types for Image Selection --- 20 | class ImageSelectionCriteria(TypedDict, total=False): 21 | mode: Literal["all", "global", "pages"] 22 | indices: Set[int] # For "global" mode: global image indices. For "pages" mode: page numbers. 
# --- Helper function to parse range strings like "1,3-5,7" ---
def parse_ranges_to_set(range_str: str) -> Set[int]:
    """Expand a comma-separated list of numbers and ranges into a set.

    ``"1,3-5,7"`` becomes ``{1, 3, 4, 5, 7}``. Tokens are stripped of
    surrounding whitespace and empty tokens are ignored. An empty or falsy
    ``range_str`` yields an empty set.

    Raises:
        ValueError: if a token is not an integer, is not strictly positive,
            or is a range whose start exceeds its end. The message names the
            offending token and embeds the underlying reason.
    """
    collected: Set[int] = set()
    if not range_str:
        return collected

    for raw_token in range_str.split(','):
        token = raw_token.strip()
        if not token:
            continue

        # A dash anywhere in the token marks it as a "start-end" range.
        is_range = '-' in token
        try:
            if is_range:
                low_text, high_text = token.split('-', 1)
                low = int(low_text)
                high = int(high_text)
                if low <= 0 or high <= 0:
                    raise ValueError("Page/image numbers must be positive.")
                if low > high:
                    raise ValueError(f"Invalid range: start ({low}) > end ({high}).")
                collected.update(range(low, high + 1))
            else:
                number = int(token)
                if number <= 0:
                    raise ValueError("Page/image numbers must be positive.")
                collected.add(number)
        except ValueError as e:
            # Both the int() failures and the explicit checks above land here,
            # so every error is reported with the token that caused it.
            if is_range:
                raise ValueError(f"Invalid range part: '{token}'. {e}") from e
            raise ValueError(f"Invalid number in range string: '{token}'. {e}") from e
    return collected
from ?i= or -i without arg) means all 71 | return {"mode": "all"} 72 | if s_lower in ["none", "false", "no", "0"]: 73 | return None 74 | 75 | if s_lower.startswith("g:"): 76 | try: 77 | indices = parse_ranges_to_set(spec_string[2:]) 78 | if not indices: 79 | raise ValueError("Global image selection ('G:') requires at least one image index or range.") 80 | return {"mode": "global", "indices": indices} 81 | except ValueError as e: # Catch errors from parse_ranges_to_set 82 | raise ValueError(f"Invalid global image selection format ('{spec_string}'): {e}") from e 83 | elif s_lower.startswith("p:"): 84 | try: 85 | page_numbers = parse_ranges_to_set(spec_string[2:]) 86 | if not page_numbers: 87 | raise ValueError("Page selection ('P:') requires at least one page number or range.") 88 | return {"mode": "pages", "indices": page_numbers} # Using 'indices' key for page numbers 89 | except ValueError as e: # Catch errors from parse_ranges_to_set 90 | raise ValueError(f"Invalid page selection format ('{spec_string}'): {e}") from e 91 | 92 | raise ValueError( 93 | f"Invalid image selection format: '{spec_string}'. " 94 | "Expected 'all', 'none', 'G:1,2-5', 'P:1,2-4', or blank for all." 95 | ) 96 | 97 | 98 | # --- Helper Function for Core Logic --- 99 | def _process_arxiv_paper( 100 | arxiv_id_or_url_main: str, 101 | image_selection_criteria: Optional[ImageSelectionCriteria], 102 | resize_option: Union[bool, int], 103 | ) -> Tuple[str, List[llm.Attachment], str]: 104 | """ 105 | Internal helper to fetch and process an arXiv paper. 106 | Returns markdown text, list of llm.Attachment objects, and the paper's source URL. 
107 | """ 108 | arxiv_id = extract_arxiv_id(arxiv_id_or_url_main) 109 | if not arxiv_id: 110 | raise ValueError( 111 | f"Invalid arXiv identifier or URL passed to _process_arxiv_paper: {arxiv_id_or_url_main}.") 112 | 113 | search = arxiv.Search(id_list=[arxiv_id], max_results=1) 114 | results = list(search.results()) 115 | if not results: 116 | raise ValueError(f"No paper found for arXiv ID: {arxiv_id}") 117 | paper = results[0] 118 | paper_source_url = paper.entry_id 119 | 120 | attachments_list: List[llm.Attachment] = [] 121 | full_html_parts: List[str] = [] 122 | 123 | global_image_document_idx_counter = 0 # For 'G:' mode selection 124 | 125 | with tempfile.TemporaryDirectory() as temp_dir_for_pdf: 126 | pdf_path = paper.download_pdf(dirpath=temp_dir_for_pdf) 127 | try: 128 | with fitz.open(pdf_path) as doc: 129 | for page_num, page in enumerate(doc): 130 | page_html_content = page.get_text("html") # Get HTML first 131 | 132 | current_page_conceptual_refs_for_placeholders: List[str] = [] 133 | current_page_attachments_for_this_page: List[llm.Attachment] = [] 134 | 135 | if image_selection_criteria: # Only attempt to process images if criteria exist 136 | image_list = page.get_images(full=True) 137 | for img_idx_on_page, img_info in enumerate(image_list): 138 | global_image_document_idx_counter += 1 # Count every image found in doc order 139 | 140 | # Determine if this specific image should be included 141 | should_include_this_specific_image = False 142 | mode = image_selection_criteria["mode"] 143 | 144 | if mode == "all": 145 | should_include_this_specific_image = True 146 | elif mode == "global": 147 | # Explicitly check if indices is not None, though TypedDict implies it exists for this mode 148 | if image_selection_criteria.get("indices") and global_image_document_idx_counter in image_selection_criteria["indices"]: 149 | should_include_this_specific_image = True 150 | elif mode == "pages": 151 | # Explicitly check for indices 152 | if 
image_selection_criteria.get("indices") and (page_num + 1) in image_selection_criteria["indices"]: 153 | should_include_this_specific_image = True 154 | 155 | if not should_include_this_specific_image: 156 | continue # Skip this image, don't process or add placeholder 157 | 158 | # --- Start of actual image processing for selected image --- 159 | xref = img_info[0] 160 | try: 161 | base_image = doc.extract_image(xref) 162 | except Exception: # Skip if extraction fails 163 | print(f"Warning: Failed to extract image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1}. Skipping.", file=sys.stderr) 164 | continue 165 | 166 | image_bytes = base_image["image"] 167 | original_ext_from_pdf = base_image["ext"].lower() 168 | 169 | pillow_input_ext_guess = original_ext_from_pdf 170 | # jpx (JPEG2000) is not well supported by default Pillow, treat as png for broader compatibility attempt 171 | if original_ext_from_pdf not in ["png", "jpeg", "jpg", "gif", "bmp"] or original_ext_from_pdf == "jpx": 172 | pillow_input_ext_guess = "png" 173 | 174 | try: 175 | img = Image.open(io.BytesIO(image_bytes)) 176 | # Ensure a common mode BEFORE load() and resize() 177 | if img.mode == 'P': 178 | img = img.convert('RGBA' if img.info.get('transparency') is not None else 'RGB') 179 | elif img.mode not in ['RGB', 'RGBA', 'L', 'LA']: 180 | # For CMYK, YCbCr, or other complex modes, convert to RGBA early 181 | img = img.convert('RGBA') 182 | 183 | img.load() # Force loading of image data 184 | 185 | # More detailed logging before the check 186 | print(f"Debug: Image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1}: Original PDF ext: {original_ext_from_pdf}, Pillow mode: {img.mode}, Pillow w: {img.width}, h: {img.height}", file=sys.stderr) 187 | 188 | # Check for zero dimensions immediately after opening 189 | if img.width <= 0 or img.height <= 0: 190 | print(f"Warning: Image {img_idx_on_page} (global 
{global_image_document_idx_counter}) on page {page_num + 1} has zero or negative dimensions (w={img.width}, h={img.height}) after opening. Skipping.", file=sys.stderr) 191 | continue # Skip to the next image 192 | 193 | perform_resize = False 194 | max_dim_to_use = 512 # Default for when resize_option is True 195 | 196 | if isinstance(resize_option, int) and resize_option > 0: 197 | perform_resize = True 198 | max_dim_to_use = resize_option 199 | elif resize_option is True: 200 | perform_resize = True 201 | # max_dim_to_use is already set to default (512) 202 | 203 | if perform_resize: 204 | if img.width > max_dim_to_use or img.height > max_dim_to_use: 205 | if img.width > img.height: 206 | new_width = max_dim_to_use 207 | new_height = max(1, int(max_dim_to_use * img.height / img.width)) 208 | else: 209 | new_height = max_dim_to_use 210 | new_width = max(1, int(max_dim_to_use * img.width / img.height)) 211 | img = img.resize((new_width, new_height), Image.Resampling.BILINEAR) 212 | print(f"Debug: Image *after* resize: Mode: {img.mode}, Size: {img.size}, Info: {img.info}", file=sys.stderr) 213 | # Explicitly convert after resize to ensure a common mode 214 | if img.mode == 'P': 215 | img = img.convert('RGBA' if img.info.get('transparency') is not None else 'RGB') 216 | elif img.mode not in ['RGB', 'RGBA', 'L', 'LA']: 217 | img = img.convert('RGBA') # Default to RGBA if not a simple mode 218 | 219 | output_buffer = io.BytesIO() 220 | processed_image_final_ext = None 221 | 222 | if pillow_input_ext_guess in ["jpeg", "jpg"]: 223 | if img.mode not in ['RGB', 'L']: # If not RGB or Grayscale 224 | img = img.convert('RGB') # Convert to RGB (strips alpha if any) 225 | img.save(output_buffer, format="JPEG", quality=70, optimize=True) 226 | processed_image_final_ext = "jpeg" 227 | else: 228 | # Default to PNG for non-JPEG originals 229 | # Ensure mode is suitable for PNG saving (L, LA, RGB, RGBA) 230 | if img.mode == 'P': # Palette 231 | # Convert to RGBA if transparency is 
present, else RGB 232 | img = img.convert('RGBA' if img.info.get('transparency') is not None else 'RGB') 233 | elif img.mode in ['CMYK', 'YCbCr']: 234 | img = img.convert('RGBA') # Convert to RGBA for broader compatibility 235 | elif img.mode not in ['L', 'LA', 'RGB', 'RGBA']: 236 | # For other unhandled modes (e.g., 'F', '1'), attempt conversion to RGBA 237 | # This is a fallback; specific handling might be better if such modes are common 238 | print(f"Warning: Image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1} has unusual mode {img.mode}, converting to RGBA for PNG saving.", file=sys.stderr) 239 | img = img.convert('RGBA') 240 | 241 | # At this point, img.mode should be L, LA, RGB, or RGBA, all saveable as PNG 242 | img.save(output_buffer, format="PNG") # Temporarily remove optimize=True 243 | processed_image_final_ext = "png" 244 | 245 | processed_image_bytes = output_buffer.getvalue() 246 | 247 | # Conceptual ref uses page_num and img_idx_on_page for placeholder uniqueness 248 | conceptual_ref = f"{paper_source_url}#page_{page_num + 1}_img_{img_idx_on_page + 1}" 249 | current_page_conceptual_refs_for_placeholders.append(conceptual_ref) 250 | 251 | attachment = llm.Attachment(content=processed_image_bytes) 252 | attachment.type = f"image/{processed_image_final_ext}" 253 | current_page_attachments_for_this_page.append(attachment) 254 | 255 | except Exception as processing_error: 256 | print(f"Warning: Failed to process image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1} (original ext: {original_ext_from_pdf}). Skipping. 
Error: {processing_error}", file=sys.stderr) 257 | # --- End of actual image processing --- 258 | 259 | # Replace tags in HTML with placeholders for *selected and processed* images 260 | placeholder_iter = iter(current_page_conceptual_refs_for_placeholders) 261 | def replace_img_with_placeholder_fn(match_obj): 262 | try: 263 | conceptual_ref_for_match = next(placeholder_iter) 264 | return f"

[IMAGE: {conceptual_ref_for_match}]

" # Wrap placeholder in

def extract_arxiv_id(argument: str) -> Union[str, None]:
    """Return the arXiv identifier contained in ``argument``, or ``None``.

    Accepted inputs:
      * new-style IDs, optionally versioned: ``2310.06825``, ``2310.06825v2``
      * old-style IDs, optionally versioned: ``hep-th/0101001``,
        ``math.GT/0309136v1``
      * ``http(s)://arxiv.org/abs/<id>`` and ``http(s)://arxiv.org/pdf/<id>``
        URLs (with an optional ``.pdf`` suffix) wrapping either ID style

    Old-style IDs inside URLs and old-style IDs carrying a version suffix are
    recognised so that entry URLs such as
    ``http://arxiv.org/abs/hep-th/0101001v1`` resolve to an ID instead of
    ``None``.

    Args:
        argument: A bare identifier or an arxiv.org abs/pdf URL.

    Returns:
        The identifier (including any version suffix) or ``None`` when the
        argument is not a string or does not look like an arXiv reference.
    """
    if not isinstance(argument, str):
        return None

    new_style = r"\d{4,}\.\d{4,}(?:v\d+)?"
    old_style = r"[a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?"
    # Plain concatenation (not an f-string) because the patterns contain braces.
    any_id = "(" + new_style + "|" + old_style + ")"

    match_url = re.match(
        r"https?://arxiv\.org/(?:abs|pdf)/" + any_id + r"(?:\.pdf)?$", argument
    )
    if match_url:
        return match_url.group(1)

    # re.match anchors at the start; "$" anchors the end.
    match_id = re.match(any_id + r"$", argument)
    if match_id:
        # Return the captured text rather than the raw argument so the result
        # is exactly what the pattern validated.
        return match_id.group(1)

    return None
311 | SPEC can be 'all' (default if ?i present), 'none', 312 | 'G:1,3-5' (global images), 'P:1,2-4' (images from pages). 313 | - ?r[=VAL] or ?resize_images[=VAL]: VAL can be 'true' (default 512px) or PIXELS. 314 | """ 315 | main_argument_part = argument 316 | query_string = "" 317 | if '?' in argument: 318 | main_argument_part, query_string = argument.split('?', 1) 319 | 320 | options = parse_qs(query_string) 321 | 322 | # Image selection for fragment loader 323 | image_spec_str_loader: Optional[str] = None 324 | raw_values_i = options.get('i', []) 325 | raw_values_include_images = options.get('include_images', []) 326 | 327 | chosen_raw_value_for_images: Optional[str] = None 328 | if raw_values_i: 329 | chosen_raw_value_for_images = raw_values_i[0] 330 | elif raw_values_include_images: 331 | chosen_raw_value_for_images = raw_values_include_images[0] 332 | # If chosen_raw_value_for_images is None here, it means no ?i or ?include_images param was present. 333 | # parse_image_selection_spec handles None correctly (-> no images). 334 | # It also handles "" (from ?i=) as "all". 
335 | 336 | image_criteria_loader: Optional[ImageSelectionCriteria] = None 337 | try: 338 | # DEBUG PRINT for chosen_raw_value_for_images 339 | print(f"Debug arxiv_loader: chosen_raw_value_for_images = {repr(chosen_raw_value_for_images)}", file=sys.stderr) 340 | image_criteria_loader = parse_image_selection_spec(chosen_raw_value_for_images) 341 | except ValueError as e: 342 | raise ValueError(f"Invalid image selection option in fragment ('{chosen_raw_value_for_images}'): {e}") from e 343 | 344 | # Resize option for fragment loader 345 | resize_option_loader: Union[bool, int] = False # Default to no resize 346 | resize_values = options.get('resize_images', []) + options.get('r', []) 347 | if resize_values: 348 | val = resize_values[0].lower() 349 | if val in ['true', '1', 'yes', '']: # Empty means ?r was present 350 | resize_option_loader = True 351 | else: 352 | try: 353 | pixel_value = int(val) 354 | if pixel_value > 0: 355 | resize_option_loader = pixel_value 356 | else: # Non-positive int, treat as just enabling default resize 357 | resize_option_loader = True 358 | except ValueError: # Not a bool-like string and not an int, treat as enabling default resize if ?r was present 359 | resize_option_loader = True 360 | 361 | temp_arxiv_id = extract_arxiv_id(main_argument_part) 362 | if not temp_arxiv_id: 363 | raise ValueError( 364 | f"Invalid arXiv identifier or URL in fragment argument: {main_argument_part}.") 365 | 366 | try: 367 | markdown_text, attachments, paper_source_url_for_fragment = _process_arxiv_paper( 368 | main_argument_part, 369 | image_criteria_loader, 370 | resize_option_loader 371 | ) 372 | 373 | fragments_and_attachments: List[Union[llm.Fragment, llm.Attachment]] = [ 374 | llm.Fragment(content=markdown_text, source=paper_source_url_for_fragment) 375 | ] 376 | fragments_and_attachments.extend(attachments) 377 | return fragments_and_attachments 378 | 379 | except UnexpectedEmptyPageError as e: 380 | raise ValueError(f"arXiv search returned an 
# --- New Command: arxiv-search ---
@llm.hookimpl
def register_commands(cli):
    """Register the ``arxiv`` and ``arxiv-search`` commands on the llm CLI.

    Args:
        cli: The click group the two commands are attached to.
    """

    @cli.command(name="arxiv")
    @click.argument("paper_id_or_url", required=True)
    @click.argument("prompt", required=False, default=None)
    @click.option(
        "--include-images",
        "-i",
        "include_images_spec_str",
        type=str,
        default=None,
        help="Include images. Examples: 'all', 'none', 'G:1,3-5', 'P:1,2-4'. If omitted, no images included."
    )
    @click.option(
        "--resize-images",
        "-r",
        is_flag=True,
        help="Enable image resizing (default 512px, or use --max-dimension)."
    )
    @click.option(
        "--max-dimension",
        "-d",
        type=int,
        default=None,
        help="Set custom max dimension (pixels) for resizing. Requires -r."
    )
    @click.option(
        "-m",
        "--model",
        "model_id_option",
        type=str,
        default=None,
        help="LLM model to use for the prompt (if provided)."
    )
    @click.option(
        "-s",
        "--system",
        "system_prompt_option",
        type=str,
        default=None,
        help="System prompt to use with the LLM (if prompt provided)."
    )
    def arxiv_command(
        paper_id_or_url: str,
        prompt: Optional[str],
        include_images_spec_str: Optional[str],
        resize_images: bool,
        max_dimension: Optional[int],
        model_id_option: Optional[str],
        system_prompt_option: Optional[str]
    ):
        """
        Fetch and process an arXiv paper.
        Outputs Markdown text or, if a PROMPT is given, processes with an LLM.

        Examples:
          llm arxiv 2310.06825 -i P:1-3 # Markdown with images from pages 1-3
          llm arxiv 2310.06825 "Summarize this paper." -m gpt-4o # Summarize with gpt-4o
          llm arxiv 2310.06825 "What are the key contributions?" -i all -r
        """
        # Click exceptions (usage / bad parameter) propagate to click; every
        # other failure is reported on stderr and the command returns normally.
        try:
            temp_arxiv_id_cmd = extract_arxiv_id(paper_id_or_url)
            if not temp_arxiv_id_cmd:
                click.echo(f"Error: Invalid arXiv identifier or URL provided: {paper_id_or_url}", err=True)
                click.echo("Expected format like '2310.06825' or 'https://arxiv.org/abs/...'.", err=True)
                raise click.UsageError("Invalid arXiv identifier.")

            image_criteria_cmd: Optional[ImageSelectionCriteria] = None
            try:
                image_criteria_cmd = parse_image_selection_spec(include_images_spec_str)
            except ValueError as e:
                click.echo(f"Error in --include-images value ('{include_images_spec_str}'): {e}", err=True)
                raise click.BadParameter(str(e), param_hint='--include-images')

            # -d only takes effect together with -r; a missing or non-positive
            # value falls back to the default resize size.
            actual_resize_option: Union[bool, int] = False
            if resize_images:
                if max_dimension and max_dimension > 0:
                    actual_resize_option = max_dimension
                else:
                    actual_resize_option = True

            markdown_text, attachments, paper_source_url = _process_arxiv_paper(
                paper_id_or_url,
                image_criteria_cmd,
                actual_resize_option
            )

            if prompt:
                # Process with LLM
                model_obj = None

                if model_id_option:
                    try:
                        model_obj = llm.get_model(model_id_option)
                    except llm.UnknownModelError:
                        raise click.UsageError(f"Unknown model: {model_id_option}. See 'llm models list'.")
                else:
                    try:
                        # Passing None asks llm for the configured default model.
                        model_obj = llm.get_model(None)
                        if not model_obj:
                            raise llm.UnknownModelError("No default model configured.")
                    except llm.UnknownModelError:
                        # The previous "are any model plugins installed?" probe
                        # raised its UsageError inside `except Exception: pass`,
                        # so it could never surface; it has been removed and
                        # this message is what users always saw.
                        raise click.UsageError(
                            "No model specified with -m/--model, and no default model is set or found. "
                            "Ensure a default model is set (e.g., 'llm default-model MODEL_NAME') or provide one with -m."
                        )
                    except Exception as e:  # Any other error from get_model(None)
                        raise click.UsageError(f"Could not load default LLM model: {e}")

                if not model_obj:  # Should be caught above, but as a safeguard
                    raise click.UsageError("Failed to load an LLM model.")

                doc_fragment = llm.Fragment(content=markdown_text, source=paper_source_url)

                response_obj = model_obj.prompt(
                    prompt=prompt,
                    system=system_prompt_option,
                    fragments=[doc_fragment],
                    attachments=attachments
                )

                # Stream the response chunk by chunk, then end the line.
                for chunk in response_obj:
                    click.echo(chunk, nl=False)
                click.echo()

            else:
                # No prompt: print the Markdown; image status goes to stderr.
                click.echo(markdown_text)
                if image_criteria_cmd:  # Only report if images were requested
                    if attachments:
                        print(f"---Processed {len(attachments)} image attachment(s) based on selection criteria '{include_images_spec_str}'.---", file=sys.stderr)
                    elif include_images_spec_str and include_images_spec_str.lower() not in ["none", "false", "no", "0"]:
                        print(f"---Image inclusion was specified ('{include_images_spec_str}'), but no images were found or selected in the document.---", file=sys.stderr)

        except UnexpectedEmptyPageError as e:
            click.echo(f"Error: arXiv search returned an unexpected empty page for '{paper_id_or_url}'. Check the ID/URL. Details: {e}", err=True)
        except HTTPError as e:
            click.echo(f"Error: Failed to fetch paper details from arXiv for '{paper_id_or_url}'. Check network or ID/URL. Details: {e}", err=True)
        except ValueError as e:
            click.echo(f"Error processing {paper_id_or_url}: {e}", err=True)
        except click.ClickException:
            raise
        except Exception as e:
            click.echo(f"An unexpected error occurred while processing {paper_id_or_url}: {e}", err=True)

    # New arxiv_search command registration
    @cli.command(name="arxiv-search")
    @click.argument("query_string", required=True)
    @click.option(
        "--max-results", "-n",
        type=int,
        default=5,
        show_default=True,
        help="Maximum number of search results to return."
    )
    @click.option(
        "--sort-by",
        type=click.Choice(["relevance", "lastUpdatedDate", "submittedDate"], case_sensitive=False),
        default="relevance",
        show_default=True,
        help="Sort order for search results."
    )
    @click.option(
        "--details",
        is_flag=True,
        help="Show more details for each result (authors, full abstract, categories, dates)."
    )
    def arxiv_search_command(query_string: str, max_results: int, sort_by: str, details: bool):
        """Search arXiv for papers matching the QUERY_STRING."""
        try:
            # --sort-by is case-insensitive, so look the criterion up by lowercase key.
            sort_criterion_map = {
                "relevance": arxiv.SortCriterion.Relevance,
                "lastupdateddate": arxiv.SortCriterion.LastUpdatedDate,
                "submitteddate": arxiv.SortCriterion.SubmittedDate
            }
            actual_sort_criterion = sort_criterion_map[sort_by.lower()]

            search = arxiv.Search(
                query=query_string,
                max_results=max_results,
                sort_by=actual_sort_criterion
            )

            results = list(search.results())

            if not results:
                click.echo(f"No results found for query: '{query_string}'")
                return

            click.echo(f"Found {len(results)} result(s) for '{query_string}' (sorted by {sort_by}):\n")

            all_commands_to_copy = []  # One "llm arxiv <id>" command per result

            for position, paper in enumerate(results, start=1):
                clean_id = extract_arxiv_id(paper.entry_id)
                if not clean_id:
                    # The ID helper may not recognise every entry URL (for
                    # example old-style IDs such as .../abs/hep-th/0101001v1).
                    # Fall back to the path after "/abs/", preferring a form
                    # the helper accepts so the suggested command is runnable.
                    short_id = str(paper.entry_id).rsplit("/abs/", 1)[-1]
                    unversioned_id = re.sub(r"v\d+$", "", short_id)
                    clean_id = (
                        extract_arxiv_id(short_id)
                        or extract_arxiv_id(unversioned_id)
                        or short_id
                    )
                click.echo(f"[{position}] ID: {clean_id}")
                click.echo(f" Title: {paper.title}")

                command_to_run = f"llm arxiv {clean_id}"
                all_commands_to_copy.append(command_to_run)

                # Styled command for display: bold, green, and underlined
                display_command = click.style(f"$ {command_to_run}", fg="green", bold=True, underline=True)
                click.echo(f" Command: {display_command}")

                if details:
                    authors_str = ", ".join([author.name for author in paper.authors])
                    click.echo(f" Authors: {authors_str}")
                    click.echo(f" Published: {paper.published.strftime('%Y-%m-%d %H:%M:%S %Z') if paper.published else 'N/A'}")
                    click.echo(f" Updated: {paper.updated.strftime('%Y-%m-%d %H:%M:%S %Z') if paper.updated else 'N/A'}")
                    primary_category = paper.primary_category
                    categories_str = ", ".join(paper.categories)
                    click.echo(f" Primary Category: {primary_category if primary_category else 'N/A'}")
                    click.echo(f" Categories: {categories_str if categories_str else 'N/A'}")
                    # The newline replacement is done outside the f-string:
                    # a backslash inside an f-string expression is a
                    # SyntaxError before Python 3.12.
                    abstract_one_line = paper.summary.replace('\n', ' ')
                    click.echo(f" Abstract: {abstract_one_line}")
                    click.echo(f" PDF Link: {paper.pdf_url}")
                else:
                    brief_summary = (paper.summary[:200] + '...') if len(paper.summary) > 200 else paper.summary
                    brief_one_line = brief_summary.replace('\n', ' ')
                    click.echo(f" Abstract (brief): {brief_one_line}")
                click.echo("---")

            # After the loop, try to copy all suggested commands at once.
            if all_commands_to_copy:
                concatenated_commands = "\n".join(all_commands_to_copy)
                b64_concatenated_commands = base64.b64encode(concatenated_commands.encode('utf-8')).decode('utf-8')
                osc_clipboard_all_seq = f"\033]52;c;{b64_concatenated_commands}\a"
                # Written with sys.stdout.write so no newline or click
                # formatting is added to the escape sequence.
                sys.stdout.write(osc_clipboard_all_seq)
                sys.stdout.flush()  # Ensure it gets sent
                click.echo(f"\n(Attempted to copy {len(all_commands_to_copy)} command(s) to clipboard)", err=True)

        except HTTPError as e:
            click.echo(f"Error connecting to arXiv for search: {e}", err=True)
        except Exception as e:
            click.echo(f"An unexpected error occurred during search: {e}", err=True)
class RaisesContext:
    """Minimal stand-in for the context manager returned by ``pytest.raises``.

    Usage::

        with RaisesContext(ValueError) as ctx:
            ...
        ctx.value  # the exception instance that was raised

    ``exc_type`` may be a single exception class or a tuple of classes.
    """

    def __init__(self, exc_type):
        self.exc_type = exc_type
        # Set to the caught exception instance once the block has exited.
        self.value = None

    def __enter__(self):
        return self

    def _expected_name(self):
        """Readable name of the expected type(s), also for a tuple of types."""
        if isinstance(self.exc_type, tuple):
            return " or ".join(t.__name__ for t in self.exc_type)
        return self.exc_type.__name__

    def __exit__(self, exc_type, exc, tb):
        if exc_type is None:
            # The body finished without raising anything at all.
            raise AssertionError(f"{self._expected_name()} not raised")
        if not issubclass(exc_type, self.exc_type):
            # Unexpected exception type: return False so the interpreter
            # propagates the original exception, rather than re-raising it
            # from inside __exit__.
            return False
        self.value = exc
        return True
lines = text.replace('\r\n', '\n').split('\n') 17 | # Strip whitespace from each line AND filter out lines that become empty after stripping 18 | # This makes it robust to varying numbers of blank lines. 19 | processed_lines = [line.strip() for line in lines if line.strip()] 20 | return '\n'.join(processed_lines) 21 | 22 | 23 | @pytest.mark.parametrize( 24 | "argument, expected_id", 25 | [ 26 | # Standard IDs 27 | ("2310.06825", "2310.06825"), 28 | ("2310.06825v1", "2310.06825v1"), 29 | ("1234.56789", "1234.56789"), 30 | # URLs 31 | ("https://arxiv.org/abs/2310.06825", "2310.06825"), 32 | ("http://arxiv.org/abs/2310.06825v2", "2310.06825v2"), 33 | ("https://arxiv.org/pdf/1234.56789.pdf", "1234.56789"), 34 | ("http://arxiv.org/pdf/1234.56789v3.pdf", "1234.56789v3"), 35 | # Older IDs 36 | ("hep-th/0101001", "hep-th/0101001"), 37 | ("math.GT/0309136", "math.GT/0309136"), 38 | ("cs.AI/0101001", "cs.AI/0101001"), 39 | # Invalid cases 40 | ("not an id", None), 41 | ("https://example.com/abs/2310.06825", None), 42 | ("arxiv.org/abs/2310.06825", None), # Missing scheme 43 | ("123.456", None), # Incorrect format 44 | ("cs.AI/123456", None), # Incorrect old format (needs 7 digits) 45 | ] 46 | ) 47 | def test_extract_arxiv_id(argument, expected_id): 48 | assert extract_arxiv_id(argument) == expected_id 49 | 50 | 51 | @patch("llm_arxiv.fitz.open") 52 | @patch("llm_arxiv.arxiv.Search") 53 | @patch("llm_arxiv.Image.open") 54 | def test_arxiv_loader_success(mock_llm_image_open, mock_search_class, mock_fitz_open): 55 | # --- Mock arXiv Search and Result --- 56 | mock_search_instance = MagicMock() 57 | mock_paper = MagicMock(spec=arxiv.Result) 58 | mock_paper.entry_id = "http://arxiv.org/abs/1234.5678v1" 59 | mock_paper.download_pdf.return_value = "/tmp/fake_paper.pdf" 60 | mock_search_instance.results.return_value = iter([mock_paper]) 61 | mock_search_class.return_value = mock_search_instance 62 | 63 | # --- Mock PyMuPDF (fitz) --- 64 | mock_doc = MagicMock() 65 | mock_page1 = 
MagicMock() 66 | mock_page1.get_text.return_value = "Page 1 text " # HTML with img 67 | mock_page1.get_images.return_value = [(10,)] # (xref,) - one image on page 1 68 | mock_page2 = MagicMock() 69 | mock_page2.get_text.return_value = "Page 2 text" # No images on page 2 70 | mock_page2.get_images.return_value = [] 71 | mock_doc.__iter__.return_value = iter([mock_page1, mock_page2]) 72 | # Mock the extract_image call for the image on page 1 73 | mock_doc.extract_image.return_value = {"image": b"fake_image_bytes_for_10", "ext": "png"} 74 | 75 | # Mock Pillow for image processing 76 | mock_pil_image = MagicMock() 77 | mock_pil_image.width = 100 78 | mock_pil_image.height = 100 79 | mock_pil_image.mode = 'RGB' # ensure mode is set 80 | # Define a side effect for save to simulate writing to BytesIO 81 | def mock_save(buffer, format, optimize=None, quality=None, **kwargs): 82 | buffer.write(b"processed_fake_image_bytes") 83 | return None 84 | mock_pil_image.save = mock_save 85 | mock_llm_image_open.return_value = mock_pil_image 86 | 87 | mock_doc.__enter__.return_value = mock_doc 88 | 89 | # Ensure __exit__ calls close() and returns None 90 | def mock_exit_calls_close(*args): 91 | mock_doc.close() # Call the close method on mock_doc 92 | return None 93 | mock_doc.__exit__.side_effect = mock_exit_calls_close 94 | 95 | # mock_fitz_open should return mock_doc 96 | mock_fitz_open.return_value = mock_doc 97 | 98 | # --- Call the loader with image request --- 99 | fragments = arxiv_loader("1234.5678?i=all") # Request all images 100 | 101 | # --- Assertions --- 102 | assert isinstance(fragments, list) 103 | assert len(fragments) == 2 # Text fragment + 1 image attachment (processed) 104 | 105 | # Check the first fragment (text) 106 | text_fragment = fragments[0] 107 | assert isinstance(text_fragment, llm.Fragment) 108 | assert text_fragment.source == "http://arxiv.org/abs/1234.5678v1" # Source URL from paper 109 | 110 | # Expected text: HTML from pages, with replaced by [IMAGE: 
conceptual_url], then markdownified 111 | # Mocked HTML for page 1: "Page 1 text " 112 | # Placeholder generated by _process_arxiv_paper: [IMAGE: http://arxiv.org/abs/1234.5678v1#page_1_img_1] 113 | # Markdownify will convert the rest. 114 | expected_normalized_parts = [ 115 | "Page 1 text", 116 | "[IMAGE: http://arxiv.org/abs/1234.5678v1#page\\_1\\_img\\_1]", # Changed from \\\\_ 117 | "Page 2 text" 118 | ] 119 | expected_normalized_str = "\\n".join(expected_normalized_parts) 120 | actual_normalized_str = normalize_markdown_for_compare(str(text_fragment)) 121 | # print(f"ACTUAL NORM: {repr(actual_normalized_str)}") 122 | # print(f"EXPECT NORM: {repr(expected_normalized_str)}") 123 | assert actual_normalized_str == expected_normalized_str 124 | 125 | # Check the attachment (the processed image) 126 | attachment = fragments[1] 127 | assert isinstance(attachment, llm.Attachment) 128 | assert attachment.content == b"processed_fake_image_bytes" # Processed by mocked Pillow 129 | assert attachment.type == "image/png" # Default processing output is PNG if not JPEG 130 | 131 | # Check mocks were called correctly 132 | mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1) 133 | mock_search_instance.results.assert_called_once() 134 | mock_paper.download_pdf.assert_called_once() 135 | mock_fitz_open.assert_called_once_with("/tmp/fake_paper.pdf") 136 | 137 | # _process_arxiv_paper calls get_text and get_images 138 | assert mock_page1.get_text.call_count == 1 139 | assert mock_page2.get_text.call_count == 1 140 | assert mock_page1.get_images.call_count == 1 # Called to find images 141 | assert mock_page2.get_images.call_count == 1 # Called even if no images 142 | 143 | # Ensure doc.extract_image was called for the image on page 1 144 | mock_doc.extract_image.assert_called_once_with(10) 145 | # Ensure Pillow was involved 146 | mock_llm_image_open.assert_called_once() 147 | actual_call_args = mock_llm_image_open.call_args[0] 148 | assert 
isinstance(actual_call_args[0], io.BytesIO) 149 | assert actual_call_args[0].getvalue() == b"fake_image_bytes_for_10" 150 | 151 | # Ensure doc.close() was called 152 | mock_doc.close.assert_called_once() 153 | 154 | 155 | @pytest.mark.parametrize( 156 | "argument, expected_error_msg_part", 157 | [ 158 | ("invalid-id", "Invalid arXiv identifier or URL in fragment argument: invalid-id"), 159 | ("http://example.com/1234.5678", "Invalid arXiv identifier or URL in fragment argument: http://example.com/1234.5678"), 160 | ] 161 | ) 162 | def test_arxiv_loader_invalid_id(argument, expected_error_msg_part): 163 | with pytest.raises(ValueError) as excinfo: 164 | arxiv_loader(argument) # Reverted to use argument 165 | assert expected_error_msg_part in str(excinfo.value) # Original assertion should now pass 166 | 167 | 168 | @patch("llm_arxiv.arxiv.Search") 169 | def test_arxiv_loader_no_results(mock_search_class): 170 | # Configure Search to return an empty iterator 171 | mock_search_instance = MagicMock() 172 | mock_search_instance.results.return_value = iter([]) 173 | mock_search_class.return_value = mock_search_instance 174 | 175 | with pytest.raises(ValueError) as excinfo: 176 | arxiv_loader("1234.5678") 177 | assert "No paper found for arXiv ID: 1234.5678" in str(excinfo.value) 178 | mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1) 179 | 180 | 181 | @patch("llm_arxiv.arxiv.Search") 182 | def test_arxiv_loader_arxiv_api_error(mock_search_class): 183 | # Configure Search results to raise an exception 184 | mock_search_instance = MagicMock() 185 | # Use arxiv.HTTPError for the side_effect, providing only required args 186 | mock_search_instance.results.side_effect = arxiv.HTTPError( 187 | url="http://fake.export.arxiv.org", 188 | status=500, 189 | retry=False 190 | ) 191 | mock_search_class.return_value = mock_search_instance 192 | 193 | with pytest.raises(ValueError) as excinfo: 194 | arxiv_loader("1234.5678") 195 | # Check that the error 
message contains the actual HTTPError string representation 196 | expected_msg = "Failed to fetch paper details from arXiv. Check network or ID/URL. Error: Page request resulted in HTTP 500 (http://fake.export.arxiv.org)" 197 | assert expected_msg in str(excinfo.value) 198 | mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1) 199 | 200 | 201 | @patch("llm_arxiv.arxiv.Search") 202 | def test_arxiv_loader_pdf_download_error(mock_search_class): 203 | # Configure download_pdf to raise an exception 204 | mock_search_instance = MagicMock() 205 | mock_paper = MagicMock(spec=arxiv.Result) 206 | mock_paper.entry_id = "http://arxiv.org/abs/1234.5678v1" 207 | mock_paper.download_pdf.side_effect = Exception("Download failed") 208 | mock_search_instance.results.return_value = iter([mock_paper]) 209 | mock_search_class.return_value = mock_search_instance 210 | 211 | with pytest.raises(ValueError) as excinfo: 212 | arxiv_loader("1234.5678") 213 | # The error message wraps the original exception 214 | assert "Error processing arXiv paper 1234.5678 for fragment: Download failed" in str(excinfo.value) 215 | mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1) 216 | mock_paper.download_pdf.assert_called_once() 217 | 218 | 219 | @patch("llm_arxiv.fitz.open") 220 | @patch("llm_arxiv.arxiv.Search") 221 | def test_arxiv_loader_pdf_extract_error(mock_search_class, mock_fitz_open): 222 | # Configure search and download to succeed 223 | mock_search_instance = MagicMock() 224 | mock_paper = MagicMock(spec=arxiv.Result) 225 | mock_paper.entry_id = "http://arxiv.org/abs/1234.5678v1" 226 | mock_paper.download_pdf.return_value = "/tmp/fake_paper.pdf" 227 | mock_search_instance.results.return_value = iter([mock_paper]) 228 | mock_search_class.return_value = mock_search_instance 229 | 230 | # Configure fitz.open to raise an exception 231 | mock_fitz_open.side_effect = Exception("Fitz error") 232 | 233 | with pytest.raises(ValueError) as 
excinfo: 234 | arxiv_loader("1234.5678") 235 | 236 | # Check the wrapped error message 237 | expected_msg = "Failed to extract content from PDF /tmp/fake_paper.pdf: Fitz error" 238 | assert expected_msg in str(excinfo.value) 239 | mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1) 240 | mock_paper.download_pdf.assert_called_once() 241 | mock_fitz_open.assert_called_once_with("/tmp/fake_paper.pdf") 242 | 243 | 244 | # --- Tests for parse_ranges_to_set --- pytest.py tests/test_arxiv.py 245 | @pytest.mark.parametrize( 246 | "range_str, expected_set", 247 | [ 248 | ("1", {1}), 249 | ("1,2,3", {1, 2, 3}), 250 | ("1-3", {1, 2, 3}), 251 | ("1,3-5,7", {1, 3, 4, 5, 7}), 252 | ("5-3", None), # Invalid range order 253 | ("1-3,5-4", None), # One invalid range 254 | ("abc", None), # Non-numeric 255 | ("1,abc,3", None), # Mixed non-numeric 256 | ("1-abc", None), # Non-numeric in range 257 | ("", set()), 258 | (" 1, 2 , 3-4 ", {1,2,3,4}), 259 | ("0", None), # Zero not allowed 260 | ("1-0", None), # Zero in range not allowed 261 | ("-2", None) # Negative not allowed 262 | ] 263 | ) 264 | def test_parse_ranges_to_set(range_str, expected_set): 265 | if expected_set is None: 266 | with pytest.raises(ValueError): 267 | parse_ranges_to_set(range_str) 268 | else: 269 | assert parse_ranges_to_set(range_str) == expected_set 270 | 271 | 272 | # --- Tests for parse_image_selection_spec --- pytest.py tests/test_arxiv.py 273 | @pytest.mark.parametrize( 274 | "spec_string, expected_criteria", 275 | [ 276 | (None, None), 277 | ("all", {"mode": "all"}), 278 | ("ALL", {"mode": "all"}), 279 | ("true", {"mode": "all"}), 280 | ("yes", {"mode": "all"}), 281 | ("1", {"mode": "all"}), 282 | ("", {"mode": "all"}), # Empty string implies all (e.g. 
from -i with no arg if const was used) 283 | ("none", None), 284 | ("NONE", None), 285 | ("false", None), 286 | ("no", None), 287 | ("0", None), 288 | ("G:1", {"mode": "global", "indices": {1}}), 289 | ("g:1,2,3", {"mode": "global", "indices": {1, 2, 3}}), 290 | ("G:1-3,5", {"mode": "global", "indices": {1, 2, 3, 5}}), 291 | ("P:1", {"mode": "pages", "indices": {1}}), 292 | ("p:1,2,3", {"mode": "pages", "indices": {1, 2, 3}}), 293 | ("P:1-3,5", {"mode": "pages", "indices": {1, 2, 3, 5}}), 294 | # Invalid cases for parse_image_selection_spec 295 | ("invalid_spec", None), # Will raise ValueError 296 | ("G:", None), # Missing indices, raises ValueError 297 | ("P:", None), # Missing indices, raises ValueError 298 | ("G:abc", None), # Invalid indices, raises ValueError from parse_ranges_to_set 299 | ("P:1-abc", None), # Invalid indices, raises ValueError from parse_ranges_to_set 300 | ("G:0", None), # Invalid index 0, raises ValueError 301 | ("P:1,0,3", None) # Invalid index 0, raises ValueError 302 | ] 303 | ) 304 | def test_parse_image_selection_spec(spec_string, expected_criteria): 305 | if expected_criteria is None and spec_string not in [None, "none", "NONE", "false", "no", "0"]: 306 | # These are cases that should raise ValueError 307 | with pytest.raises(ValueError): 308 | parse_image_selection_spec(spec_string) 309 | else: 310 | assert parse_image_selection_spec(spec_string) == expected_criteria 311 | 312 | 313 | # Mocking Pillow Image.open and save for image processing tests 314 | @patch("llm_arxiv.Image.open") 315 | @patch("llm_arxiv.arxiv.Search") # Keep this for consistency if _process_arxiv_paper calls it 316 | @patch("llm_arxiv.fitz.open") # And this too 317 | def test_process_arxiv_paper_image_selection_global(mock_fitz_open, mock_search_class, mock_image_open): 318 | # --- Mock arXiv Search and Result (minimal for this test) --- 319 | mock_search_instance = MagicMock() 320 | mock_paper_obj = MagicMock(spec=arxiv.Result) 321 | mock_paper_obj.entry_id = 
"http://arxiv.org/abs/2301.12345v1" # Use a valid new-style ID format 322 | mock_paper_obj.download_pdf.return_value = "/tmp/2301.12345.pdf" 323 | mock_search_instance.results.return_value = iter([mock_paper_obj]) 324 | mock_search_class.return_value = mock_search_instance 325 | 326 | # --- Mock PyMuPDF (fitz) --- 327 | mock_doc = MagicMock() 328 | mock_page1 = MagicMock(name="Page1") 329 | mock_page1.get_text.return_value = "Page 1 text " 330 | # Image list for page 1: (xref, ...other_fields) 331 | mock_page1.get_images.return_value = [(10,), (11,)] 332 | mock_page2 = MagicMock(name="Page2") 333 | mock_page2.get_text.return_value = "Page 2 text " 334 | mock_page2.get_images.return_value = [(20,)] 335 | mock_doc.__iter__.return_value = iter([mock_page1, mock_page2]) 336 | mock_doc.extract_image.side_effect = lambda xref: {"image": f"img_bytes_xref_{xref}".encode(), "ext": "png"} 337 | mock_doc.__enter__.return_value = mock_doc 338 | mock_doc.__exit__.return_value = None 339 | mock_fitz_open.return_value = mock_doc 340 | 341 | # --- Mock Pillow --- 342 | # mock_pil_image = MagicMock() 343 | # mock_pil_image.width = 100 344 | # mock_pil_image.height = 100 345 | # mock_pil_image.mode = 'RGB' # Ensure mode is set 346 | # mock_image_open.return_value = mock_pil_image 347 | 348 | # Define a side effect for Image.open to capture input bytes and set up save 349 | def mock_image_open_side_effect(bytes_io_arg): 350 | captured_bytes = bytes_io_arg.getvalue() # Capture the bytes for this specific image 351 | 352 | mock_specific_pil_image = MagicMock(name=f"PILImageMock_{len(mock_image_open.mock_calls)}") 353 | mock_specific_pil_image.width = 100 354 | mock_specific_pil_image.height = 100 355 | # The code converts to 'RGB' or 'RGBA' before saving PNGs if mode is 'P', 356 | # or 'RGB' for JPEGs. Let's set a common mode that doesn't trigger complex conversion. 
357 | mock_specific_pil_image.mode = 'RGB' 358 | 359 | # Mock the save method for this specific PIL image instance 360 | def mock_specific_save(buffer, format, optimize=None, quality=None, **kwargs): 361 | # For this test, if no resize, assume original bytes are "saved" 362 | # (as the test expects original bytes if no processing happens) 363 | buffer.write(captured_bytes) 364 | mock_specific_pil_image.save = mock_specific_save 365 | return mock_specific_pil_image 366 | 367 | mock_image_open.side_effect = mock_image_open_side_effect 368 | 369 | # --- Call _process_arxiv_paper with global image selection: G:1,3 --- 370 | # Global image 1 is on page 1 (xref 10) 371 | # Global image 2 is on page 1 (xref 11) - SKIPPED 372 | # Global image 3 is on page 2 (xref 20) 373 | criteria: ImageSelectionCriteria = {"mode": "global", "indices": {1, 3}} 374 | markdown_text, attachments, _ = _process_arxiv_paper( 375 | "2301.12345", # Use the same valid ID 376 | image_selection_criteria=criteria, 377 | resize_option=False 378 | ) 379 | 380 | # Assertions for text content (placeholders) 381 | # Image 1 (g1) from page 1 (p1) should be included -> placeholder 1 382 | # Image 2 (g2) from page 1 (p1) should be SKIPPED 383 | # Image 3 (g3) from page 2 (p2) should be included -> placeholder 2 384 | expected_normalized_parts_global = [ 385 | "Page 1 text", 386 | "[IMAGE: http://arxiv.org/abs/2301.12345v1#page\\_1\\_img\\_1]", # Changed from \\\\_ 387 | "Page 2 text", 388 | "[IMAGE: http://arxiv.org/abs/2301.12345v1#page\\_2\\_img\\_1]" # Changed from \\\\_ 389 | ] 390 | expected_normalized_str_global = "\n".join(expected_normalized_parts_global) 391 | actual_normalized_str_global = normalize_markdown_for_compare(markdown_text) 392 | # print(f"ACTUAL NORM GLOBAL: {repr(actual_normalized_str_global)}") 393 | # print(f"EXPECT NORM GLOBAL: {repr(expected_normalized_str_global)}") 394 | assert actual_normalized_str_global == expected_normalized_str_global 395 | 396 | # Assertions for attachments 
397 | assert len(attachments) == 2 398 | assert attachments[0].type == "image/png" 399 | assert attachments[0].content == f"img_bytes_xref_10".encode() # img_bytes_xref_10 -> G:1 400 | assert attachments[1].type == "image/png" 401 | assert attachments[1].content == f"img_bytes_xref_20".encode() # img_bytes_xref_20 -> G:3 402 | 403 | # Verify extract_image calls 404 | mock_doc.extract_image.assert_has_calls([ 405 | call(10), # Global image 1 406 | call(20) # Global image 3 407 | ], any_order=False) # Check order as global_image_document_idx_counter matters 408 | assert mock_doc.extract_image.call_count == 2 409 | mock_image_open.call_count == 2 # Pillow should be called for each selected image 410 | 411 | 412 | # --- Tests for CLI Commands --- pytest.py tests/test_arxiv.py 413 | 414 | # Helper to invoke LLM CLI commands 415 | runner = CliRunner() 416 | 417 | @patch("llm_arxiv._process_arxiv_paper") 418 | def test_llm_arxiv_command_image_selection(mock_process_paper): 419 | mock_process_paper.return_value = ("markdown output", [], "http://example.com/src") 420 | 421 | # Test -i P:1,3-4 422 | result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "P:1,3-4"]) 423 | assert result.exit_code == 0 424 | mock_process_paper.assert_called_once() 425 | args, kwargs = mock_process_paper.call_args 426 | assert args[0] == "1234.5678" 427 | assert args[1] == {"mode": "pages", "indices": {1,3,4}} # image_selection_criteria is args[1] 428 | # resize_option is args[2], custom_max_dim_cmd is args[3] if passed directly 429 | # In arxiv_command, resize_option is computed and passed as the 3rd arg. 430 | # Let's check it too, assuming default resize_option value when -r is not explicitly set with -i 431 | # Actually, arxiv_command's resize_option_val combines resize_images and max_dimension. 432 | # If only -i is passed, resize_images is False, max_dimension is None. 433 | # This leads to resize_option_val being False. 
434 | assert args[2] is False # resize_option 435 | mock_process_paper.reset_mock() 436 | 437 | # Test -i G:2 438 | result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "G:2"]) 439 | assert result.exit_code == 0 440 | mock_process_paper.assert_called_once() 441 | args, kwargs = mock_process_paper.call_args 442 | assert args[1] == {"mode": "global", "indices": {2}} 443 | assert args[2] is False # resize_option 444 | mock_process_paper.reset_mock() 445 | 446 | # Test -i all 447 | result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "all"]) 448 | assert result.exit_code == 0 449 | mock_process_paper.assert_called_once() 450 | args, kwargs = mock_process_paper.call_args 451 | assert args[1] == {"mode": "all"} 452 | assert args[2] is False # resize_option 453 | mock_process_paper.reset_mock() 454 | 455 | # Test no -i (should be None criteria -> no images) 456 | result = runner.invoke(llm_cli, ["arxiv", "1234.5678"]) 457 | assert result.exit_code == 0 458 | mock_process_paper.assert_called_once() 459 | args, kwargs = mock_process_paper.call_args 460 | assert args[1] is None # Important: None implies no images for image_selection_criteria 461 | assert args[2] is False # resize_option 462 | mock_process_paper.reset_mock() 463 | 464 | # Test -i with resize options 465 | result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "all", "-r", "-d", "300"]) 466 | assert result.exit_code == 0 467 | mock_process_paper.assert_called_once() 468 | args, kwargs = mock_process_paper.call_args 469 | assert args[1] == {"mode": "all"} 470 | assert args[2] == 300 # resize_option should be the dimension 471 | mock_process_paper.reset_mock() 472 | 473 | result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "all", "-r"]) 474 | assert result.exit_code == 0 475 | mock_process_paper.assert_called_once() 476 | args, kwargs = mock_process_paper.call_args 477 | assert args[1] == {"mode": "all"} 478 | assert args[2] is True # resize_option should be True (for default 
512px) 479 | mock_process_paper.reset_mock() 480 | 481 | @patch("llm_arxiv.arxiv.Search") 482 | def test_llm_arxiv_search_command(mock_arxiv_search_class): 483 | mock_search_instance = MagicMock() 484 | mock_paper1 = MagicMock(spec=arxiv.Result) 485 | mock_paper1.entry_id = "http://arxiv.org/abs/2301.0001v1" # Use valid ID format 486 | mock_paper1.title = "Search Result Paper 1 Title" 487 | mock_paper1.summary = "Summary of paper 1." 488 | # Correct author mocking 489 | author_a_mock = MagicMock(spec=arxiv.Result.Author) 490 | author_a_mock.name = "Author A" 491 | mock_paper1.authors = [author_a_mock] 492 | mock_paper1.published = datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc) 493 | mock_paper1.updated = datetime.datetime(2023, 1, 2, 12, 0, 0, tzinfo=datetime.timezone.utc) 494 | mock_paper1.primary_category = "cs.AI" 495 | mock_paper1.categories = ["cs.AI", "cs.LG"] 496 | mock_paper1.pdf_url = "http://arxiv.org/pdf/2301.0001v1.pdf" 497 | 498 | # Configure mock_search_instance.results to return a new iterator each time it's called 499 | # Set this on the instance that mock_arxiv_search_class will return. 500 | mock_search_instance = MagicMock() # This will be the returned instance 501 | mock_search_instance.results.side_effect = lambda: iter([mock_paper1]) 502 | mock_arxiv_search_class.return_value = mock_search_instance 503 | 504 | # Test basic search 505 | result = runner.invoke(llm_cli, ["arxiv-search", "test query", "-n", "1"]) 506 | assert result.exit_code == 0 507 | assert "Found 1 result(s)" in result.output 508 | assert "ID: 2301.0001v1" in result.output # Updated ID 509 | assert "Title: Search Result Paper 1 Title" in result.output 510 | assert "Command: $ llm arxiv 2301.0001v1" in result.output 511 | assert "Abstract (brief): Summary of paper 1." 
in result.output 512 | assert "(Attempted to copy 1 command(s) to clipboard)" in result.output 513 | 514 | # Test --details 515 | result_details = runner.invoke(llm_cli, ["arxiv-search", "test query", "-n", "1", "--details"]) 516 | assert result_details.exit_code == 0 517 | assert "Authors: Author A" in result_details.output 518 | assert "Abstract: Summary of paper 1." in result_details.output 519 | assert "Published: 2023-01-01" in result_details.output 520 | assert "Updated: 2023-01-02" in result_details.output 521 | assert "Primary Category: cs.AI" in result_details.output 522 | assert "Categories: cs.AI, cs.LG" in result_details.output 523 | assert "PDF Link: http://arxiv.org/pdf/2301.0001v1.pdf" in result_details.output 524 | 525 | mock_arxiv_search_class.assert_called_with( 526 | query="test query", 527 | max_results=1, 528 | sort_by=arxiv.SortCriterion.Relevance 529 | ) 530 | 531 | @patch("llm_arxiv.arxiv.Search") 532 | def test_llm_arxiv_search_no_results(mock_arxiv_search_class): 533 | mock_search_instance = MagicMock() 534 | mock_search_instance.results.return_value = iter([]) 535 | mock_arxiv_search_class.return_value = mock_search_instance 536 | 537 | result = runner.invoke(llm_cli, ["arxiv-search", "very_specific_query_no_one_uses"]) 538 | assert result.exit_code == 0 539 | assert "No results found for query: 'very_specific_query_no_one_uses'" in result.output 540 | 541 | # Note: More tests for _process_arxiv_paper (page selection, resize options, no images) 542 | # and arxiv_loader (parsing ?i= options) would be beneficial for full coverage. 543 | # The test_arxiv_loader_success needs significant update to reflect image processing changes. 
544 | 545 | # test_arxiv_loader_success needs to be updated for the new _process_arxiv_paper signature and image handling 546 | @patch("llm_arxiv._process_arxiv_paper") # Patch the helper directly 547 | def test_arxiv_loader_fragment_options(mock_process_paper): 548 | mock_process_paper.return_value = ([llm.Fragment("text", source="src")], [], "src") # Return value expected by arxiv_loader 549 | 550 | # Test ?i=P:1 551 | arxiv_loader("1234.5678?i=P:1") 552 | mock_process_paper.assert_called_once() 553 | args, kwargs = mock_process_paper.call_args 554 | assert args[0] == "1234.5678" 555 | assert args[1] == {"mode": "pages", "indices": {1}} # image_selection_criteria is args[1] 556 | assert args[2] is False # resize_option is args[2] 557 | mock_process_paper.reset_mock() 558 | 559 | # Test ?i=G:1-3&r=true (resize also parsed but its effect is tested in _process_arxiv_paper tests) 560 | arxiv_loader("1234.5678?i=G:1-3&r=true") 561 | mock_process_paper.assert_called_once() 562 | args, kwargs = mock_process_paper.call_args 563 | assert args[1] == {"mode": "global", "indices": {1,2,3}} 564 | assert args[2] is True # resize_option 565 | mock_process_paper.reset_mock() 566 | 567 | # Test no ?i (should be None for image_selection_criteria) 568 | arxiv_loader("1234.5678?r=600") 569 | mock_process_paper.assert_called_once() 570 | args, kwargs = mock_process_paper.call_args 571 | assert args[1] is None 572 | assert args[2] == 600 # resize_option 573 | mock_process_paper.reset_mock() 574 | 575 | # Test ?i= (empty value, implies all images) 576 | # parse_image_selection_spec("") is expected to return {"mode": "all"} (see its own parametrized test), 577 | # so arxiv_loader must forward that criteria; None here would mean the empty spec was dropped on the way. 578 | arxiv_loader("1234.5678?i=") 579 | mock_process_paper.assert_called_once() 580 | args, kwargs = mock_process_paper.call_args 581 | # The direct test for parse_image_selection_spec("") asserts {"mode": "all"}.
582 | # The loader is therefore expected to pass the same criteria through, not None. 583 | assert args[1] == {"mode": "all"} # Empty ?i= means "all images" 584 | assert args[2] is False # resize_option 585 | mock_process_paper.reset_mock() 586 | 587 | # Test ?i=all 588 | arxiv_loader("1234.5678?i=all") 589 | mock_process_paper.assert_called_once() 590 | args, kwargs = mock_process_paper.call_args 591 | assert args[1] == {"mode": "all"} 592 | assert args[2] is False # resize_option 593 | mock_process_paper.reset_mock() 594 | 595 | # Test invalid spec in fragment loader raises ValueError 596 | with pytest.raises(ValueError, match="Invalid image selection option in fragment"): 597 | arxiv_loader("1234.5678?i=INVALID_SPEC") 598 | 599 | # Test invalid ID in fragment loader raises ValueError 600 | with pytest.raises(ValueError, match="Invalid arXiv identifier or URL in fragment argument"): 601 | arxiv_loader("NOT_AN_ID?i=all") 602 | 603 | @patch("llm_arxiv.arxiv.Search") 604 | def test_llm_arxiv_search_options(mock_arxiv_search_class): 605 | runner = CliRunner() 606 | mock_search_instance = MagicMock() 607 | 608 | # Create mock papers 609 | mock_paper1 = MagicMock(spec=arxiv.Result) 610 | mock_paper1.entry_id = "http://arxiv.org/abs/2301.0001v1" 611 | mock_paper1.title = "Paper One Title" 612 | # Correct author mocking 613 | author_a = MagicMock(spec=arxiv.Result.Author) 614 | author_a.name = "Author A" 615 | mock_paper1.authors = [author_a] 616 | mock_paper1.summary = "This is the summary for paper one. It is very detailed." 
617 | mock_paper1.published = datetime.datetime(2023,1,1, tzinfo=datetime.timezone.utc) 618 | mock_paper1.updated = datetime.datetime(2023,1,2, tzinfo=datetime.timezone.utc) 619 | mock_paper1.primary_category = "cs.AI" 620 | mock_paper1.categories = ["cs.AI", "cs.LG"] 621 | mock_paper1.pdf_url = "http://arxiv.org/pdf/2301.0001v1.pdf" 622 | 623 | 624 | mock_paper2 = MagicMock(spec=arxiv.Result) 625 | mock_paper2.entry_id = "http://arxiv.org/abs/2301.0002v1" 626 | mock_paper2.title = "Paper Two Title" 627 | # Correct author mocking 628 | author_b = MagicMock(spec=arxiv.Result.Author) 629 | author_b.name = "Author B" 630 | author_c = MagicMock(spec=arxiv.Result.Author) 631 | author_c.name = "Author C" 632 | mock_paper2.authors = [author_b, author_c] 633 | mock_paper2.summary = "Summary for paper two. Also very detailed." 634 | mock_paper2.published = datetime.datetime(2023,1,3, tzinfo=datetime.timezone.utc) 635 | mock_paper2.updated = datetime.datetime(2023,1,4, tzinfo=datetime.timezone.utc) 636 | mock_paper2.primary_category = "cs.CL" 637 | mock_paper2.categories = ["cs.CL", "cs.AI"] 638 | mock_paper2.pdf_url = "http://arxiv.org/pdf/2301.0002v1.pdf" 639 | 640 | mock_search_instance.results.return_value = iter([mock_paper1, mock_paper2]) 641 | mock_arxiv_search_class.return_value = mock_search_instance 642 | 643 | # Test --details 644 | result_details = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--details"]) 645 | assert result_details.exit_code == 0 646 | assert "Paper One Title" in result_details.output 647 | assert "Author A" in result_details.output 648 | assert "This is the summary for paper one." in result_details.output 649 | assert "Paper Two Title" in result_details.output 650 | assert "Author B, Author C" in result_details.output 651 | assert "Summary for paper two." 
in result_details.output 652 | assert "$ llm arxiv 2301.0001" in result_details.output # Check suggested command 653 | mock_arxiv_search_class.assert_called_with( 654 | query="test_query", 655 | max_results=5, # Default from CLI is 5, not 10 656 | sort_by=arxiv.SortCriterion.Relevance # Default when not specified 657 | ) 658 | mock_search_instance.results.assert_called_once() # results() should be called 659 | 660 | # Reset mock for next call (specifically results call count) 661 | mock_search_instance.reset_mock() 662 | mock_search_instance.results.return_value = iter([mock_paper1, mock_paper2]) # Re-assign iterator 663 | 664 | # Test --sort-by lastUpdatedDate 665 | result_sort_updated = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--sort-by", "lastUpdatedDate"]) 666 | assert result_sort_updated.exit_code == 0 667 | assert "Paper One Title" in result_sort_updated.output # Check titles are still there 668 | assert "Paper Two Title" in result_sort_updated.output 669 | # Key assertion: arxiv.Search was called with the correct sort_by 670 | mock_arxiv_search_class.assert_called_with( 671 | query="test_query", 672 | max_results=5, # Default from CLI is 5 673 | sort_by=arxiv.SortCriterion.LastUpdatedDate 674 | ) 675 | mock_search_instance.results.assert_called_once() 676 | 677 | mock_search_instance.reset_mock() 678 | mock_search_instance.results.return_value = iter([mock_paper1, mock_paper2]) 679 | 680 | # Test --sort-by submittedDate 681 | result_sort_submitted = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--sort-by", "submittedDate"]) 682 | assert result_sort_submitted.exit_code == 0 683 | assert "Paper One Title" in result_sort_submitted.output 684 | assert "Paper Two Title" in result_sort_submitted.output 685 | mock_arxiv_search_class.assert_called_with( 686 | query="test_query", 687 | max_results=5, # Default from CLI is 5 688 | sort_by=arxiv.SortCriterion.SubmittedDate 689 | ) 690 | mock_search_instance.results.assert_called_once() 691 | 692 
| mock_search_instance.reset_mock() 693 | mock_search_instance.results.return_value = iter([mock_paper1, mock_paper2]) 694 | 695 | # Test --sort-by relevance (explicitly) 696 | result_sort_relevance = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--sort-by", "relevance"]) 697 | assert result_sort_relevance.exit_code == 0 698 | assert "Paper One Title" in result_sort_relevance.output 699 | assert "Paper Two Title" in result_sort_relevance.output 700 | mock_arxiv_search_class.assert_called_with( 701 | query="test_query", 702 | max_results=5, # Default from CLI is 5 703 | sort_by=arxiv.SortCriterion.Relevance 704 | ) 705 | mock_search_instance.results.assert_called_once() 706 | 707 | # Test invalid sort criteria 708 | result_invalid_sort = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--sort-by", "invalid"]) 709 | assert result_invalid_sort.exit_code != 0 # Should fail 710 | assert "Invalid value for '--sort-by'" in result_invalid_sort.output 711 | --------------------------------------------------------------------------------