├── .github
    └── workflows
    │   ├── publish.yml
    │   └── test.yml
├── .gitignore
├── AGENTS.md
├── LICENSE
├── README.md
├── llm_arxiv.py
├── pyproject.toml
└── tests
    ├── pytest.py
    └── test_arxiv.py


/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
 1 | name: Publish Python Package
 2 | 
 3 | on:
 4 |   release:
 5 |     types: [created]
 6 |   # Optional: Add trigger for pushes to master if you want to publish on every push
 7 |   # push:
 8 |   #   branches:
 9 |   #     - master
10 | 
11 | # Default token permissions. A job-level `permissions` block replaces these
12 | # rather than extending them.
13 | permissions:
14 |   contents: read
15 | 
16 | jobs:
17 |   # Run the test suite on every supported Python version before publishing.
18 |   test:
19 |     runs-on: ubuntu-latest
20 |     strategy:
21 |       matrix:
22 |         python-version: ["3.10", "3.11", "3.12"]
23 |     steps:
24 |     - uses: actions/checkout@v4
25 |     - name: Set up Python ${{ matrix.python-version }}
26 |       uses: actions/setup-python@v5
27 |       with:
28 |         python-version: ${{ matrix.python-version }}
29 |         cache: pip
30 |         cache-dependency-path: pyproject.toml
31 |     - name: Install dependencies
32 |       run: |
33 |         python -m pip install --upgrade pip
34 |         python -m pip install -e '.[test]'
35 |     - name: Run tests
36 |       run: |
37 |         python -m pytest
38 | 
39 |   # Build the distributions and upload them to PyPI once every test job passes.
40 |   deploy:
41 |     runs-on: ubuntu-latest
42 |     needs: [test]
43 |     permissions:
44 |       # Restated for actions/checkout: job-level permissions override the
45 |       # workflow-level block above.
46 |       contents: read
47 |       # Lets pypa/gh-action-pypi-publish request an OIDC token (trusted publishing).
48 |       id-token: write
49 |     steps:
50 |     - uses: actions/checkout@v4
51 |     - name: Set up Python
52 |       uses: actions/setup-python@v5
53 |       with:
54 |         python-version: "3.12"
55 |         cache: pip
56 |         cache-dependency-path: pyproject.toml
57 |     - name: Install dependencies
58 |       run: |
59 |         python -m pip install setuptools wheel build
60 |     - name: Build
61 |       run: |
62 |         python -m build
63 |     - name: Publish
64 |       uses: pypa/gh-action-pypi-publish@release/v1
65 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |   pull_request:
 8 |     branches:
 9 |       - master
10 | 
11 | # Least-privilege token: the job below only needs to read the repository.
12 | permissions:
13 |   contents: read
14 | 
15 | jobs:
16 |   # Run the test suite against each supported Python version.
17 |   test:
18 |     runs-on: ubuntu-latest
19 |     strategy:
20 |       matrix:
21 |         python-version: ["3.10", "3.11", "3.12"]
22 | 
23 |     steps:
24 |     - uses: actions/checkout@v4
25 |     - name: Set up Python ${{ matrix.python-version }}
26 |       uses: actions/setup-python@v5
27 |       with:
28 |         python-version: ${{ matrix.python-version }}
29 |         # Cache pip downloads keyed on pyproject.toml, matching publish.yml.
30 |         cache: pip
31 |         cache-dependency-path: pyproject.toml
32 |     - name: Install dependencies
33 |       run: |
34 |         python -m pip install --upgrade pip
35 |         python -m pip install -e '.[test]'
36 |     - name: Run tests
37 |       run: |
38 |         python -m pytest
39 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .venv
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | venv
 6 | .eggs
 7 | .pytest_cache
 8 | *.egg-info
 9 | .DS_Store
10 | .vscode
11 | dist
12 | build
13 | 


--------------------------------------------------------------------------------
/AGENTS.md:
--------------------------------------------------------------------------------
 1 | # AI Agent Instructions for `llm-arxiv`
 2 | 
 3 | This document provides guidance for AI agents assisting with the development and maintenance of the `llm-arxiv` repository.
 4 | 
 5 | ## Project Overview
 6 | 
 7 | `llm-arxiv` is a Python-based plugin for the [LLM CLI tool](https://llm.datasette.io/) that enables users to load and process academic papers from arXiv. It fetches paper metadata and content, typically the PDF, and makes it available for language model processing.
 8 | 
 9 | ## Key Files and Directories
10 | 
11 | *   **`llm_arxiv.py`**: This is the core file containing the plugin's logic. It implements the LLM plugin interface and handles fetching and processing arXiv papers. The `[project.entry-points.llm]` section in `pyproject.toml` points to this file (`arxiv = "llm_arxiv"`).
12 | *   **`pyproject.toml`**: The main configuration file for the project. It defines dependencies (e.g., `llm`, `arxiv`, `PyMuPDF`), build system settings, project metadata, and the plugin entry point.
13 | *   **`arxiv.py`**: Not present in the current repository tree. If it exists in your checkout, it may contain utility functions or a script for interacting with the `arxiv` package or API; understand its role relative to the main `llm_arxiv.py` plugin before modifying it.
14 | *   **`fitz.py`**: Not present in the current repository tree. If it exists in your checkout, it likely contains utility functions or a script for PDF processing with `PyMuPDF` (which provides the Fitz bindings); as with `arxiv.py`, understand its specific role before making changes.
15 | *   **`README.md`**: The primary documentation for human users of the plugin. It should be kept up-to-date with features, installation instructions, and usage examples.
16 | *   **`AGENTS.md`**: (This file) Contains specific instructions and context for AI agents working on this codebase.
17 | *   **`tests/`**: This directory contains the automated tests for the plugin, which the CI workflows run with `python -m pytest`. New features and bug fixes should ideally be accompanied by relevant tests.
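18 | *   **`.github/workflows/`**: Contains the GitHub Actions workflow definitions: `test.yml` runs the test suite on pushes and pull requests to `master`, and `publish.yml` runs the tests and then publishes the package to PyPI when a release is created.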
19 | 
20 | ## Common Development Tasks
21 | 
22 | When asked to perform tasks, consider the following:
23 | 
24 | *   **Adding Features**:
25 |     *   Modifications will likely center around `llm_arxiv.py`.
26 |     *   Consider how new features impact dependencies (`pyproject.toml`).
27 |     *   Add corresponding tests in the `tests/` directory.
28 |     *   Update `README.md` with user-facing documentation for the new feature.
29 | *   **Bug Fixing**:
30 |     *   Identify the relevant module (`llm_arxiv.py`, `arxiv.py`, `fitz.py`).
31 |     *   Write a test case that reproduces the bug if possible.
32 |     *   Ensure the fix doesn't break existing functionality by running all tests.
33 | *   **Dependency Management**:
34 |     *   Changes to dependencies are made in `pyproject.toml`.
35 |     *   Be mindful of version compatibility.
36 | *   **Documentation**:
37 |     *   Keep `README.md` clear and up-to-date.
38 |     *   Update this file (`AGENTS.md`) if there are significant changes to the development workflow or codebase structure relevant to AI agents.
39 | 
40 | ## Important Considerations
41 | 
42 | *   **arXiv API Usage**: If directly interacting with the arXiv API (via the `arxiv` package or otherwise), be mindful of rate limits and terms of service.
43 | *   **PDF Parsing**: PDF parsing can be complex. `PyMuPDF` (Fitz) is used. Ensure robustness and handle potential errors gracefully.
44 | *   **Code Style and Quality**: Follow existing code style. Ensure code is clear, well-commented where non-obvious, and efficient.
45 | *   **Testing**: Always aim to maintain or increase test coverage.
46 | 
47 | By following these guidelines, AI agents can contribute effectively to the `llm-arxiv` project. 
48 | 
49 | ## Known Issues and Debugging Notes
50 | 
51 | ### Image Resizing Bug (Incorrectly Resizes to 1x1 Pixels)
52 | 
53 | *   **Problem Description**:
54 |     *   When using the `llm arxiv ... -r` command (enabling image resizing), all extracted images from the PDF are being incorrectly resized to 1x1 pixels.
55 |     *   This causes any downstream vision-enabled LLM to interpret these images as simple solid blocks of color, rather than recognizing their actual content.
56 |     *   The issue is specific to the resizing operation. If resizing is disabled (e.g., `llm arxiv ID -i all` without `-r`, or via the fragment loader `llm -f arxiv:ID?i=all`), images are processed and described correctly by the LLM (though at their original, unresized dimensions).
57 | 
58 | *   **Location of Buggy Code**:
59 |     *   File: `llm_arxiv.py`
60 |     *   Function: `_process_arxiv_paper`
61 |     *   Specific Block: The section responsible for calculating `new_width` and `new_height` within the `if perform_resize:` block. Debug logs confirm that `new_width` and `new_height` are both evaluating to `1` before being passed to `img.resize()`.
62 | 
63 | *   **Debugging Steps (to isolate the miscalculation)**:
64 |     1.  Focus on the arithmetic that calculates `new_width` and `new_height`:
65 |         ```python
66 |         if img.width > img.height:
67 |             new_width = max_dim_to_use 
68 |             new_height = max(1, int(max_dim_to_use * img.height / img.width))
69 |         else:
70 |             new_height = max_dim_to_use
71 |             new_width = max(1, int(max_dim_to_use * img.width / img.height))
72 |         ```
73 |     2.  Add detailed debug print statements *immediately before* these calculations to log the exact runtime values of:
74 |         *   `img.width` (original width of the image being processed)
75 |         *   `img.height` (original height)
76 |         *   `max_dim_to_use` (the target maximum dimension, e.g., 512)
77 |     3.  Also, print the result of the intermediate floating-point calculation *before* it's passed to `int()`:
78 |         *   e.g., `value_before_int = max_dim_to_use * img.height / img.width` (and its counterpart for `new_width`)
79 |     4.  Finally, print the calculated `new_width` and `new_height` *immediately after* they are computed and before they are used in `img.resize()`.
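80 |     *   This detailed logging should reveal why both dimensions end up as 1. Note that the longer side is assigned `max_dim_to_use` directly (with no `int()` truncation), so a 1x1 result implies that `max_dim_to_use` itself is 1 at runtime; the other side is then computed as at most 1 (a value truncated to 0 by `int()` is turned into 1 by `max(1, ...)`). Check how the resize option is converted into `max_dim_to_use` (for example, a boolean `True` flag value behaves as the integer 1 in Python).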
81 | 
82 | *   **How to Prove It's Solved**:
83 |     1.  **Check Debug Output**: After applying a fix, the new debug print statements (from step 4 above) should show that `new_width` and `new_height` are sensible dimensions that maintain the aspect ratio and respect `max_dim_to_use` (e.g., for a 1500x500 image with `max_dim_to_use=512`, the new dimensions should be around 512x170, not 1x1).
84 |     2.  **Check LLM Output**: Run the command `llm arxiv <PAPER_ID> -i all -r "describe the images"`. The LLM should now describe the actual content of the figures/diagrams in the paper, not just solid colors.
85 |     3.  **Verify `img.size` after resize**: The existing debug line `print(f"Debug: Image *after* resize: Mode: {img.mode}, Size: {img.size}, Info: {img.info}", file=sys.stderr)` should show the corrected, non-1x1 dimensions.
86 |     4.  **(Optional Advanced Test)**: For a more robust automated test, one could theoretically (in `tests/test_arxiv.py`):
87 |         *   Mock `_process_arxiv_paper` or have a test utility that calls it directly with a known image that requires resizing.
88 |         *   Capture the `attachments` list returned.
89 |         *   For each attachment, use `Image.open(io.BytesIO(attachment.content))` to load the processed image.
90 |         *   Assert that the dimensions of this re-loaded image are the expected resized dimensions (e.g., not 1x1, and respecting the aspect ratio and max dimension). 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # llm-arxiv
  2 | 
  3 | [![PyPI](https://img.shields.io/pypi/v/llm-arxiv.svg)](https://pypi.org/project/llm-arxiv/)
  4 | [![Changelog](https://img.shields.io/github/v/release/agustif/llm-arxiv?include_prereleases&label=changelog)](https://github.com/agustif/llm-arxiv/releases)
  5 | [![Tests](https://github.com/agustif/llm-arxiv/actions/workflows/test.yml/badge.svg)](https://github.com/agustif/llm-arxiv/actions/workflows/test.yml)
  6 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/agustif/llm-arxiv/blob/master/LICENSE)
  7 | 
  8 | LLM plugin for loading arXiv papers and their images.
  9 | 
 10 | This plugin allows you to search for arXiv papers, fetch their text content, and optionally, their images directly into `llm`.
 11 | 
 12 | ## Installation
 13 | 
 14 | Install this plugin in the same environment as [LLM](https://llm.datasette.io/).
 15 | 
 16 | ```bash
 17 | llm install llm-arxiv
 18 | ```
 19 | 
 20 | The command above will also install the necessary dependencies: `arxiv`, `PyMuPDF`, and `Pillow`.
 21 | 
 22 | ## Usage
 23 | 
 24 | This plugin provides three main ways to interact with arXiv papers:
 25 | 
 26 | 1.  **As a fragment loader:** Allows you to inject arXiv paper content (text and optionally images) directly into a prompt using the `-f` or `--fragment` option with `llm`.
 27 | 2.  **As a standalone command (`llm arxiv`):** Provides an `llm arxiv` command to fetch, process, and output paper content directly to stdout, which can then be piped to other commands or models.
 28 | 3.  **As a search command (`llm arxiv-search`):** Allows you to search arXiv for papers based on a query string.
 29 | 
 30 | ### 1. Fragment Loader (`-f arxiv:...`)
 31 | 
 32 | You can load an arXiv paper by its ID or full URL. The text content (converted to Markdown) and any selected images (as attachments) will be passed to the language model.
 33 | 
 34 | **Syntax:**
 35 | 
 36 | `llm -f 'arxiv:PAPER_ID_OR_URL[?options]' "Your prompt here..."`
 37 | 
 38 | *   `PAPER_ID_OR_URL`: Can be an arXiv ID (e.g., `2310.06825`, `astro-ph/0601009`) or a full arXiv URL (e.g., `https://arxiv.org/abs/2310.06825`, `http://arxiv.org/pdf/2310.06825.pdf`).
 39 | *   `[?options]`: Optional query parameters to control image inclusion and resizing. (Remember to quote the argument if using `?` or `&` in your shell).
 40 | 
 41 | **Fragment Loader Options:**
 42 | 
 43 | *   `i` / `include_images`: Controls image inclusion. If not specified, no images are included.
 44 |     *   `?i` or `?i=` or `?i=all`: Include all images from the paper.
 45 |     *   `?i=none`: Include no images (same as omitting `?i`).
 46 |     *   `?i=P:pages`: Include all images from specified pages. `pages` is a comma-separated list of page numbers or ranges (e.g., `P:1`, `P:1,3-5`, `P:2,4`). Page numbers are 1-indexed.
 47 |     *   `?i=G:indices`: Include images by their global index in the document (sequentially numbered as they appear). `indices` is a comma-separated list of image indices or ranges (e.g., `G:1`, `G:1-5,10`). Indices are 1-indexed.
 48 | *   `r` / `resize_images`: Controls image resizing. Resizing only applies if images are included.
 49 |     *   `?r` or `?r=true`: Enable image resizing. Images will be resized to a maximum dimension of 512px by default, preserving aspect ratio. Only images larger than this will be downscaled.
 50 |     *   `?r=PIXELS`: Enable image resizing and set a custom maximum dimension (e.g., `?r=800`).
 51 | 
 52 | **Examples (Fragment Loader):**
 53 | 
 54 | *   Load text only:
 55 |     ```bash
 56 |     llm -f 'arxiv:2310.06825' "Summarize this paper."
 57 |     ```
 58 | *   Load text and all images (resized to default 512px max):
 59 |     ```bash
 60 |     llm -f 'arxiv:2310.06825?i&r' -m gpt-4-vision-preview "Explain the diagrams in this paper."
 61 |     ```
 62 | *   Load text and images from page 1 and 3, resized to 800px max:
 63 |     ```bash
 64 |     llm -f 'arxiv:2310.06825?i=P:1,3&r=800' -m gemini-pro-vision "Describe the images on pages 1 and 3."
 65 |     ```
 66 | *   Load text and the first 5 globally indexed images, no resizing:
 67 |     ```bash
 68 |     llm -f 'arxiv:2310.06825?i=G:1-5' -m some-image-model "What do the first five images show?"
 69 |     ```
 70 | 
 71 | ### 2. Standalone Command (`llm arxiv ...`)
 72 | 
 73 | The `llm arxiv` command fetches and processes an arXiv paper.
 74 | *   If no prompt is provided, it outputs the paper's content as Markdown to standard output. This can be piped to other commands or LLMs.
 75 | *   If a `PROMPT` is provided, it processes the paper content (including any selected images as attachments) with the specified or default LLM.
 76 | 
 77 | **Syntax:**
 78 | 
 79 | `llm arxiv PAPER_ID_OR_URL [PROMPT] [OPTIONS]`
 80 | 
 81 | **Arguments:**
 82 | 
 83 | *   `PAPER_ID_OR_URL`: The arXiv ID (e.g., `2310.06825`) or full URL.
 84 | *   `PROMPT` (Optional): A prompt to send to an LLM along with the paper's content.
 85 | 
 86 | **Command Options:**
 87 | 
 88 | *   `-i SPEC` / `--include-images SPEC`:
 89 |     Controls image inclusion. If this option is omitted, no images are included, whether or not a `PROMPT` is given (internally, `parse_image_selection_spec` treats a missing spec, `None`, as "no images").
 90 |     *   `-i all` or (if `PROMPT` is present) simply `-i` with no value: Include all images.
 91 |     *   `-i ""` (empty string value): Include all images.
 92 |     *   `-i none`: Include no images.
 93 |     *   `-i P:pages`: Include all images from specified pages (e.g., `P:1`, `P:1,3-5`).
 94 |     *   `-i G:indices`: Include images by their global index (e.g., `G:1`, `G:1-5,10`).
 95 | *   `-r` / `--resize-images`:
 96 |     Enable image resizing. Images will be resized to a maximum dimension of 512px by default, preserving aspect ratio. Only images larger than this will be downscaled.
 97 | *   `-d PIXELS` / `--max-dimension PIXELS`:
 98 |     Set a custom maximum dimension in pixels for resizing. Requires `-r` to be active.
 99 | *   `-m MODEL_ID` / `--model MODEL_ID`:
100 |     Specify the LLM model to use if a `PROMPT` is provided.
101 | *   `-s SYSTEM_PROMPT` / `--system SYSTEM_PROMPT`:
102 |     Specify a system prompt to use with the LLM if a `PROMPT` is provided.
103 | 
104 | **Examples (Standalone Command):**
105 | 
106 | *   Get Markdown content of a paper:
107 |     ```bash
108 |     llm arxiv 2310.06825
109 |     ```
110 | *   Get Markdown, prepare all images (resized), then pipe to a model:
111 |     ```bash
112 |     llm arxiv 2310.06825 -i all -r | llm -m gpt-4-vision-preview "Summarize this, paying attention to figures."
113 |     ```
114 | *   Directly prompt an LLM with the paper's content and images from pages 2 and 4 (resized to 600px):
115 |     ```bash
116 |     llm arxiv 2310.06825 "Explain figures on page 2 and 4." -i P:2,4 -r -d 600 -m gpt-4o
117 |     ```
118 | *   Summarize a paper using the default LLM and include all images:
119 |     ```bash
120 |     llm arxiv 2310.06825 "Summarize the key findings." -i all
121 |     ```
122 | 
123 | ### 3. Search Command (`llm arxiv-search ...`)
124 | 
125 | The `llm arxiv-search` command allows you to search for papers on arXiv using a query string.
126 | 
127 | **Syntax:**
128 | 
129 | `llm arxiv-search [OPTIONS] QUERY_STRING`
130 | 
131 | **Arguments:**
132 | 
133 | *   `QUERY_STRING`: The search query (e.g., `"quantum computing"`, or with arXiv field prefixes, `au:Hawking AND ti:"black holes"`). See [arXiv API user manual](https://arxiv.org/help/api/user-manual#query_details) for advanced query syntax.
134 | 
135 | **Options:**
136 | 
137 | *   `-n INT`, `--max-results INT`: Maximum number of search results to return (Default: `5`).
138 | *   `--sort-by [relevance|lastUpdatedDate|submittedDate]`: Sort order for search results (Default: `relevance`).
139 | *   `--details`: Show more details for each result, including authors, full abstract, categories, publication/update dates, and PDF link.
140 | 
141 | **Output:**
142 | 
143 | For each search result, the command will display:
144 | *   The paper's ID and Title.
145 | *   A suggested command to fetch the full paper with `llm arxiv <ID>`. This command is styled (e.g., bold, green, underlined, prefixed with `$`) for visibility.
146 | *   A brief abstract (or full details if `--details` is used).
147 | 
148 | Additionally, the script will attempt to copy all the suggested `llm arxiv <ID>` commands (newline-separated) to your system clipboard using an OSC 52 escape sequence. A message like `(Attempted to copy N command(s) to clipboard)` will be printed to stderr. The success of this automatic copy depends on your terminal emulator's support and configuration (e.g., iTerm2 needs clipboard access enabled for applications).
149 | 
150 | **Examples (Search Command):**
151 | 
152 | *   Search for "large language models" and get top 3 results (brief):
153 |     ```bash
154 |     llm arxiv-search -n 3 "large language models"
155 |     ```
156 |     (This will also attempt to copy the 3 suggested `llm arxiv` commands to your clipboard.)
157 | 
158 | *   Search for papers by author "Hinton" on "neural networks", sorted by submission date, with full details:
159 |     ```bash
160 |     llm arxiv-search --sort-by submittedDate --details "au:Hinton AND ti:\"neural network\""
161 |     ```
162 | 
163 | ## Image Handling Notes
164 | 
165 | *   **Rationale for Optional Images:** Processing and including images can significantly increase the data size sent to language models. Many models have limitations on input context window size, and some may not support image inputs at all or may incur higher costs for them. The granular controls for image inclusion (all, none, specific pages/indices) and resizing allow users to manage this, ensuring that only necessary visual information is passed to the LLM, optimizing for cost, speed, and model compatibility.
166 | *   Images are extracted from the PDF, converted to Markdown placeholders `[IMAGE: http://arxiv.org/abs/ID#page_X_img_Y]`, and attached as `llm.Attachment` objects if selected.
167 | *   Supported input image formats from PDFs include common types like JPEG, PNG, GIF, BMP. Efforts are made to convert others, but complex or rare formats might be skipped.
168 | *   Selected images are re-encoded before being attached, whether or not resizing is enabled: images stored as JPEG in the PDF are saved as JPEG, and every other format is saved as PNG. This is done to save tokens and improve compatibility with models.
169 | *   Image processing errors are printed to `stderr` but do not stop the text extraction.
170 | 
171 | ## Development
172 | 
173 | To contribute to this plugin, clone the repository and install it in editable mode:
174 | 
175 | ```bash
176 | git clone https://github.com/agustif/llm-arxiv.git
177 | cd llm-arxiv
178 | # It's recommended to use a virtual environment
179 | python -m venv venv
180 | source venv/bin/activate  # On Windows use `venv\Scripts\activate`
181 | # Install in editable mode
182 | pip install -e .
183 | # Install additional dependencies for testing (e.g., pytest, pytest-cov)
184 | pip install pytest pytest-cov
185 | # Run tests
186 | pytest tests/
187 | ```
188 | 
189 | ## AGENTS.md
190 | 
191 | See [AGENTS.md](AGENTS.md) for notes on how AI agents should interpret and use this tool and its outputs.
192 | 


--------------------------------------------------------------------------------
/llm_arxiv.py:
--------------------------------------------------------------------------------
  1 | import llm
  2 | import arxiv
  3 | # Keep specific import for this one as it seemed to work
  4 | from arxiv import UnexpectedEmptyPageError, HTTPError
  5 | import fitz  # PyMuPDF
  6 | import tempfile
  7 | import re
  8 | from typing import List, Union, Tuple, Optional, Set, TypedDict, Literal # Added Set, TypedDict, Literal
  9 | import base64 # For image encoding
 10 | import markdownify # Added for HTML to Markdown conversion
 11 | import io # For handling image bytes
 12 | from PIL import Image # Added for image resizing
 13 | import os # Added for environment variable access
 14 | from urllib.parse import parse_qs # Added for parsing options from argument
 15 | import click # For the new command
 16 | import sys # Ensure sys is imported for stderr printing
 17 | import datetime # For formatting dates from arxiv results
 18 | 
 19 | # --- Types for Image Selection ---
class ImageSelectionCriteria(TypedDict, total=False):
    """Describes which images of a paper should be extracted.

    All keys are optional (``total=False``); ``indices`` is only populated
    for the "global" and "pages" modes.
    """
    # Selection strategy: every image ("all"), images picked by their
    # document-wide index ("global"), or every image on chosen pages ("pages").
    mode: Literal["all", "global", "pages"]
    # For "global" mode: global image indices. For "pages" mode: page numbers.
    indices: Set[int]
 23 | 
 24 | 
 25 | # --- Helper function to parse range strings like "1,3-5,7" ---
 26 | def parse_ranges_to_set(range_str: str) -> Set[int]:
 27 |     """Parses a string like '1,3-5,7' into a set of integers {1, 3, 4, 5, 7}."""
 28 |     result: Set[int] = set()
 29 |     if not range_str:
 30 |         return result
 31 |     parts = range_str.split(',')
 32 |     for part in parts:
 33 |         part = part.strip()
 34 |         if not part:
 35 |             continue
 36 |         if '-' in part:
 37 |             start_str, end_str = part.split('-', 1)
 38 |             try:
 39 |                 start = int(start_str)
 40 |                 end = int(end_str)
 41 |                 if start <= 0 or end <= 0:
 42 |                     raise ValueError("Page/image numbers must be positive.")
 43 |                 if start > end:
 44 |                     raise ValueError(f"Invalid range: start ({start}) > end ({end}).")
 45 |                 result.update(range(start, end + 1))
 46 |             except ValueError as e:
 47 |                 raise ValueError(f"Invalid range part: '{part}'. {e}") from e
 48 |         else:
 49 |             try:
 50 |                 val = int(part)
 51 |                 if val <= 0:
 52 |                     raise ValueError("Page/image numbers must be positive.")
 53 |                 result.add(val)
 54 |             except ValueError as e:
 55 |                 raise ValueError(f"Invalid number in range string: '{part}'. {e}") from e
 56 |     return result
 57 | 
 58 | # --- Helper function to parse image selection specification string ---
 59 | def parse_image_selection_spec(spec_string: Optional[str]) -> Optional[ImageSelectionCriteria]:
 60 |     """
 61 |     Parses an image selection string.
 62 |     Returns None if no images should be included.
 63 |     Returns a dict like {"mode": "all"} or {"mode": "global", "indices": {1,2,3}}
 64 |     or {"mode": "pages", "indices": {1,2,3}}.
 65 |     """
 66 |     if spec_string is None:
 67 |         return None
 68 | 
 69 |     s_lower = spec_string.lower().strip()
 70 |     if not s_lower or s_lower in ["all", "true", "yes", "1"]: # Empty string (e.g. from ?i= or -i without arg) means all
 71 |         return {"mode": "all"}
 72 |     if s_lower in ["none", "false", "no", "0"]:
 73 |         return None
 74 | 
 75 |     if s_lower.startswith("g:"):
 76 |         try:
 77 |             indices = parse_ranges_to_set(spec_string[2:])
 78 |             if not indices:
 79 |                  raise ValueError("Global image selection ('G:') requires at least one image index or range.")
 80 |             return {"mode": "global", "indices": indices}
 81 |         except ValueError as e: # Catch errors from parse_ranges_to_set
 82 |             raise ValueError(f"Invalid global image selection format ('{spec_string}'): {e}") from e
 83 |     elif s_lower.startswith("p:"):
 84 |         try:
 85 |             page_numbers = parse_ranges_to_set(spec_string[2:])
 86 |             if not page_numbers:
 87 |                 raise ValueError("Page selection ('P:') requires at least one page number or range.")
 88 |             return {"mode": "pages", "indices": page_numbers} # Using 'indices' key for page numbers
 89 |         except ValueError as e: # Catch errors from parse_ranges_to_set
 90 |             raise ValueError(f"Invalid page selection format ('{spec_string}'): {e}") from e
 91 |     
 92 |     raise ValueError(
 93 |         f"Invalid image selection format: '{spec_string}'. "
 94 |         "Expected 'all', 'none', 'G:1,2-5', 'P:1,2-4', or blank for all."
 95 |     )
 96 | 
 97 | 
 98 | # --- Helper Function for Core Logic ---
 99 | def _process_arxiv_paper(
100 |     arxiv_id_or_url_main: str, 
101 |     image_selection_criteria: Optional[ImageSelectionCriteria], 
102 |     resize_option: Union[bool, int], 
103 | ) -> Tuple[str, List[llm.Attachment], str]:
104 |     """
105 |     Internal helper to fetch and process an arXiv paper.
106 |     Returns markdown text, list of llm.Attachment objects, and the paper's source URL.
107 |     """
108 |     arxiv_id = extract_arxiv_id(arxiv_id_or_url_main)
109 |     if not arxiv_id:
110 |         raise ValueError(
111 |             f"Invalid arXiv identifier or URL passed to _process_arxiv_paper: {arxiv_id_or_url_main}.")
112 | 
113 |     search = arxiv.Search(id_list=[arxiv_id], max_results=1)
114 |     results = list(search.results())
115 |     if not results:
116 |         raise ValueError(f"No paper found for arXiv ID: {arxiv_id}")
117 |     paper = results[0]
118 |     paper_source_url = paper.entry_id
119 | 
120 |     attachments_list: List[llm.Attachment] = []
121 |     full_html_parts: List[str] = []
122 |     
123 |     global_image_document_idx_counter = 0 # For 'G:' mode selection
124 | 
125 |     with tempfile.TemporaryDirectory() as temp_dir_for_pdf:
126 |         pdf_path = paper.download_pdf(dirpath=temp_dir_for_pdf)
127 |         try:
128 |             with fitz.open(pdf_path) as doc:
129 |                 for page_num, page in enumerate(doc):
130 |                     page_html_content = page.get_text("html") # Get HTML first
131 |                     
132 |                     current_page_conceptual_refs_for_placeholders: List[str] = []
133 |                     current_page_attachments_for_this_page: List[llm.Attachment] = []
134 | 
135 |                     if image_selection_criteria: # Only attempt to process images if criteria exist
136 |                         image_list = page.get_images(full=True)
137 |                         for img_idx_on_page, img_info in enumerate(image_list):
138 |                             global_image_document_idx_counter += 1 # Count every image found in doc order
139 | 
140 |                             # Determine if this specific image should be included
141 |                             should_include_this_specific_image = False
142 |                             mode = image_selection_criteria["mode"]
143 |                             
144 |                             if mode == "all":
145 |                                 should_include_this_specific_image = True
146 |                             elif mode == "global":
147 |                                 # Explicitly check if indices is not None, though TypedDict implies it exists for this mode
148 |                                 if image_selection_criteria.get("indices") and global_image_document_idx_counter in image_selection_criteria["indices"]:
149 |                                     should_include_this_specific_image = True
150 |                             elif mode == "pages":
151 |                                 # Explicitly check for indices
152 |                                 if image_selection_criteria.get("indices") and (page_num + 1) in image_selection_criteria["indices"]:
153 |                                     should_include_this_specific_image = True
154 |                             
155 |                             if not should_include_this_specific_image:
156 |                                 continue # Skip this image, don't process or add placeholder
157 | 
158 |                             # --- Start of actual image processing for selected image ---
159 |                             xref = img_info[0]
160 |                             try:
161 |                                 base_image = doc.extract_image(xref)
162 |                             except Exception: # Skip if extraction fails
163 |                                 print(f"Warning: Failed to extract image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1}. Skipping.", file=sys.stderr)
164 |                                 continue 
165 |                             
166 |                             image_bytes = base_image["image"]
167 |                             original_ext_from_pdf = base_image["ext"].lower()
168 |                             
169 |                             pillow_input_ext_guess = original_ext_from_pdf
170 |                             # jpx (JPEG2000) is not well supported by default Pillow, treat as png for broader compatibility attempt
171 |                             if original_ext_from_pdf not in ["png", "jpeg", "jpg", "gif", "bmp"] or original_ext_from_pdf == "jpx":
172 |                                 pillow_input_ext_guess = "png"
173 | 
174 |                             try:
175 |                                 img = Image.open(io.BytesIO(image_bytes))
176 |                                 # Ensure a common mode BEFORE load() and resize()
177 |                                 if img.mode == 'P':
178 |                                     img = img.convert('RGBA' if img.info.get('transparency') is not None else 'RGB')
179 |                                 elif img.mode not in ['RGB', 'RGBA', 'L', 'LA']:
180 |                                     # For CMYK, YCbCr, or other complex modes, convert to RGBA early
181 |                                     img = img.convert('RGBA')
182 |                                 
183 |                                 img.load() # Force loading of image data
184 | 
185 |                                 # More detailed logging before the check
186 |                                 print(f"Debug: Image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1}: Original PDF ext: {original_ext_from_pdf}, Pillow mode: {img.mode}, Pillow w: {img.width}, h: {img.height}", file=sys.stderr)
187 | 
188 |                                 # Check for zero dimensions immediately after opening
189 |                                 if img.width <= 0 or img.height <= 0:
190 |                                     print(f"Warning: Image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1} has zero or negative dimensions (w={img.width}, h={img.height}) after opening. Skipping.", file=sys.stderr)
191 |                                     continue # Skip to the next image
192 | 
193 |                                 perform_resize = False
194 |                                 max_dim_to_use = 512 # Default for when resize_option is True
195 | 
196 |                                 if isinstance(resize_option, int) and resize_option > 0:
197 |                                     perform_resize = True
198 |                                     max_dim_to_use = resize_option
199 |                                 elif resize_option is True: 
200 |                                     perform_resize = True
201 |                                     # max_dim_to_use is already set to default (512)
202 |                                 
203 |                                 if perform_resize:
204 |                                     if img.width > max_dim_to_use or img.height > max_dim_to_use:
205 |                                         if img.width > img.height:
206 |                                             new_width = max_dim_to_use
207 |                                             new_height = max(1, int(max_dim_to_use * img.height / img.width))
208 |                                         else:
209 |                                             new_height = max_dim_to_use
210 |                                             new_width = max(1, int(max_dim_to_use * img.width / img.height))
211 |                                         img = img.resize((new_width, new_height), Image.Resampling.BILINEAR)
212 |                                         print(f"Debug: Image *after* resize: Mode: {img.mode}, Size: {img.size}, Info: {img.info}", file=sys.stderr)
213 |                                         # Explicitly convert after resize to ensure a common mode
214 |                                         if img.mode == 'P':
215 |                                             img = img.convert('RGBA' if img.info.get('transparency') is not None else 'RGB')
216 |                                         elif img.mode not in ['RGB', 'RGBA', 'L', 'LA']:
217 |                                             img = img.convert('RGBA') # Default to RGBA if not a simple mode
218 |                                 
219 |                                 output_buffer = io.BytesIO()
220 |                                 processed_image_final_ext = None
221 | 
222 |                                 if pillow_input_ext_guess in ["jpeg", "jpg"]:
223 |                                     if img.mode not in ['RGB', 'L']: # If not RGB or Grayscale
224 |                                         img = img.convert('RGB') # Convert to RGB (strips alpha if any)
225 |                                     img.save(output_buffer, format="JPEG", quality=70, optimize=True) 
226 |                                     processed_image_final_ext = "jpeg"
227 |                                 else: 
228 |                                     # Default to PNG for non-JPEG originals
229 |                                     # Ensure mode is suitable for PNG saving (L, LA, RGB, RGBA)
230 |                                     if img.mode == 'P': # Palette
231 |                                         # Convert to RGBA if transparency is present, else RGB
232 |                                         img = img.convert('RGBA' if img.info.get('transparency') is not None else 'RGB')
233 |                                     elif img.mode in ['CMYK', 'YCbCr']:
234 |                                         img = img.convert('RGBA') # Convert to RGBA for broader compatibility
235 |                                     elif img.mode not in ['L', 'LA', 'RGB', 'RGBA']:
236 |                                         # For other unhandled modes (e.g., 'F', '1'), attempt conversion to RGBA
237 |                                         # This is a fallback; specific handling might be better if such modes are common
238 |                                         print(f"Warning: Image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1} has unusual mode {img.mode}, converting to RGBA for PNG saving.", file=sys.stderr)
239 |                                         img = img.convert('RGBA')
240 |                                     
241 |                                     # At this point, img.mode should be L, LA, RGB, or RGBA, all saveable as PNG
242 |                                     img.save(output_buffer, format="PNG") # Temporarily remove optimize=True
243 |                                     processed_image_final_ext = "png"
244 |                                 
245 |                                 processed_image_bytes = output_buffer.getvalue()
246 |                                 
247 |                                 # Conceptual ref uses page_num and img_idx_on_page for placeholder uniqueness
248 |                                 conceptual_ref = f"{paper_source_url}#page_{page_num + 1}_img_{img_idx_on_page + 1}"
249 |                                 current_page_conceptual_refs_for_placeholders.append(conceptual_ref)
250 |                                 
251 |                                 attachment = llm.Attachment(content=processed_image_bytes) 
252 |                                 attachment.type = f"image/{processed_image_final_ext}" 
253 |                                 current_page_attachments_for_this_page.append(attachment)
254 | 
255 |                             except Exception as processing_error:
256 |                                 print(f"Warning: Failed to process image {img_idx_on_page} (global {global_image_document_idx_counter}) on page {page_num + 1} (original ext: {original_ext_from_pdf}). Skipping. Error: {processing_error}", file=sys.stderr)
257 |                             # --- End of actual image processing ---
258 |                     
259 |                     # Replace <img> tags in HTML with placeholders for *selected and processed* images
260 |                     placeholder_iter = iter(current_page_conceptual_refs_for_placeholders)
261 |                     def replace_img_with_placeholder_fn(match_obj):
262 |                         try:
263 |                             conceptual_ref_for_match = next(placeholder_iter)
264 |                             return f"<p>[IMAGE: {conceptual_ref_for_match}]</p>" # Wrap placeholder in <p> for markdownify
265 |                         except StopIteration: # Should not happen if lists are in sync
266 |                             return "" 
267 |                     
268 |                     # Apply replacement to the original HTML content of the page
269 |                     processed_page_html_content = re.sub(r"<img[^>]*>", replace_img_with_placeholder_fn, page_html_content, flags=re.IGNORECASE)
270 |                     full_html_parts.append(processed_page_html_content)
271 |                     attachments_list.extend(current_page_attachments_for_this_page) # Add processed attachments
272 |                     
273 |         except Exception as e:
274 |             raise ValueError(f"Failed to extract content from PDF {pdf_path}: {e}") from e
275 | 
276 |     full_combined_html = "".join(full_html_parts)
277 |     # Convert the final aggregated HTML (with placeholders) to Markdown
278 |     markdown_text = markdownify.markdownify(full_combined_html, strip=['img']) # strip=['img'] redundant if placeholders work perfectly, but good safeguard.
279 |     
280 |     return markdown_text, attachments_list, paper_source_url
281 | 
282 | 
@llm.hookimpl
def register_fragment_loaders(register):
    """Register `arxiv_loader` as the fragment loader for the "arxiv" prefix."""
    register("arxiv", arxiv_loader)
286 | 
287 | 
def extract_arxiv_id(argument: str) -> Union[str, None]:
    """Extracts an arXiv ID from a URL, or returns the ID if the argument already is one.

    Accepted forms:
      - New-style IDs, optionally versioned: ``2310.06825``, ``2310.06825v2``.
      - Old-style IDs, optionally versioned: ``hep-th/9901001``,
        ``math.GT/0309136v1``.
      - ``http(s)://arxiv.org/abs/<id>`` and ``.../pdf/<id>[.pdf]`` URLs for
        either ID style, also on the ``www.`` and ``export.`` hosts.

    Args:
        argument: Candidate identifier or URL.

    Returns:
        The extracted ID (including any version suffix), or None if the
        argument is not recognised.
    """
    new_style_id = r"\d{4,}\.\d{4,}(?:v\d+)?"
    old_style_id = r"[a-z-]+(?:\.[A-Z]{2})?/\d{7}(?:v\d+)?"
    any_id = "(?:" + new_style_id + "|" + old_style_id + ")"

    match_url = re.match(
        r"https?://(?:www\.|export\.)?arxiv\.org/(?:abs|pdf)/(" + any_id + r")(?:\.pdf)?$",
        argument,
    )
    if match_url:
        return match_url.group(1)

    match_id = re.match(r"^(" + any_id + r")$", argument)
    if match_id:
        return match_id.group(1)

    return None
303 | 
304 | 
def arxiv_loader(argument: str) -> List[Union[llm.Fragment, llm.Attachment]]:
    """
    Load text and images from an arXiv paper PDF. Fragment loader.
    Usage: llm -f arxiv:PAPER_ID_OR_URL[?options] "prompt"
    Options (append to ID/URL):
    - ?i[=SPEC] or ?include_images[=SPEC]: Include images.
        SPEC can be 'all' (default if ?i present), 'none',
        'G:1,3-5' (global images), 'P:1,2-4' (images from pages).
        If both are given, ?i takes precedence.
    - ?r[=VAL] or ?resize_images[=VAL]: VAL can be 'true' (default 512px) or PIXELS.
        If both are given, ?resize_images takes precedence.

    Returns:
        A list whose first element is an llm.Fragment holding the paper's
        Markdown text, followed by one llm.Attachment per selected image.

    Raises:
        ValueError: If the identifier or an option is invalid, or if fetching
            or processing the paper fails.
    """
    main_argument_part = argument
    query_string = ""
    if '?' in argument:
        main_argument_part, query_string = argument.split('?', 1)

    # keep_blank_values=True is required so that value-less options such as
    # "?i", "?i=" or "?r" survive parsing (as empty strings). By default
    # parse_qs drops them, which would silently ignore those options.
    options = parse_qs(query_string, keep_blank_values=True)

    # Image selection for fragment loader
    raw_values_i = options.get('i', [])
    raw_values_include_images = options.get('include_images', [])

    chosen_raw_value_for_images: Optional[str] = None
    if raw_values_i:
        chosen_raw_value_for_images = raw_values_i[0]
    elif raw_values_include_images:
        chosen_raw_value_for_images = raw_values_include_images[0]
    # If chosen_raw_value_for_images is None here, no ?i or ?include_images param was present.
    # parse_image_selection_spec handles None (-> no images) and "" (-> all images).

    image_criteria_loader: Optional[ImageSelectionCriteria] = None
    try:
        # DEBUG PRINT for chosen_raw_value_for_images
        print(f"Debug arxiv_loader: chosen_raw_value_for_images = {repr(chosen_raw_value_for_images)}", file=sys.stderr)
        image_criteria_loader = parse_image_selection_spec(chosen_raw_value_for_images)
    except ValueError as e:
        raise ValueError(f"Invalid image selection option in fragment ('{chosen_raw_value_for_images}'): {e}") from e

    # Resize option for fragment loader
    resize_option_loader: Union[bool, int] = False  # Default to no resize
    resize_values = options.get('resize_images', []) + options.get('r', [])
    if resize_values:
        val = resize_values[0].lower()
        if val in ['true', '1', 'yes', '']:  # Empty means ?r was present without a value
            resize_option_loader = True
        else:
            try:
                pixel_value = int(val)
                if pixel_value > 0:
                    resize_option_loader = pixel_value
                else:  # Non-positive int, treat as just enabling default resize
                    resize_option_loader = True
            except ValueError:  # Neither bool-like nor an int: enable default resize
                resize_option_loader = True

    # Validate the identifier up front so the error names the fragment argument.
    temp_arxiv_id = extract_arxiv_id(main_argument_part)
    if not temp_arxiv_id:
        raise ValueError(
            f"Invalid arXiv identifier or URL in fragment argument: {main_argument_part}.")

    try:
        markdown_text, attachments, paper_source_url_for_fragment = _process_arxiv_paper(
            main_argument_part,
            image_criteria_loader,
            resize_option_loader
        )

        fragments_and_attachments: List[Union[llm.Fragment, llm.Attachment]] = [
            llm.Fragment(content=markdown_text, source=paper_source_url_for_fragment)
        ]
        fragments_and_attachments.extend(attachments)
        return fragments_and_attachments

    except UnexpectedEmptyPageError as e:
        raise ValueError(f"arXiv search returned an unexpected empty page. Check the ID/URL. Error: {e}") from e
    except HTTPError as e:
        raise ValueError(f"Failed to fetch paper details from arXiv. Check network or ID/URL. Error: {e}") from e
    except ValueError:
        # Already a user-facing error; pass it through unchanged.
        raise
    except Exception as e:
        # temp_arxiv_id is guaranteed non-empty here (validated above).
        raise ValueError(f"Error processing arXiv paper {temp_arxiv_id} for fragment: {str(e)}") from e
388 | 
389 | # --- New Command: arxiv-search ---
390 | @llm.hookimpl
391 | def register_commands(cli):
392 |     @cli.command(name="arxiv")
393 |     @click.argument("paper_id_or_url", required=True)
394 |     @click.argument("prompt", required=False, default=None)
395 |     @click.option(
396 |         "--include-images",
397 |         "-i",
398 |         "include_images_spec_str",
399 |         type=str,
400 |         default=None,
401 |         help="Include images. Examples: 'all', 'none', 'G:1,3-5', 'P:1,2-4'. If omitted, no images included."
402 |     )
403 |     @click.option(
404 |         "--resize-images",
405 |         "-r",
406 |         is_flag=True,
407 |         help="Enable image resizing (default 512px, or use --max-dimension)."
408 |     )
409 |     @click.option(
410 |         "--max-dimension",
411 |         "-d",
412 |         type=int,
413 |         default=None,
414 |         help="Set custom max dimension (pixels) for resizing. Requires -r."
415 |     )
416 |     @click.option(
417 |         "-m",
418 |         "--model",
419 |         "model_id_option",
420 |         type=str,
421 |         default=None,
422 |         help="LLM model to use for the prompt (if provided)."
423 |     )
424 |     @click.option(
425 |         "-s",
426 |         "--system",
427 |         "system_prompt_option",
428 |         type=str,
429 |         default=None,
430 |         help="System prompt to use with the LLM (if prompt provided)."
431 |     )
432 |     def arxiv_command(
433 |         paper_id_or_url: str, 
434 |         prompt: Optional[str],
435 |         include_images_spec_str: Optional[str], 
436 |         resize_images: bool, 
437 |         max_dimension: Optional[int],
438 |         model_id_option: Optional[str],
439 |         system_prompt_option: Optional[str]
440 |     ):
441 |         """ 
442 |         Fetch and process an arXiv paper. 
443 |         Outputs Markdown text or, if a PROMPT is given, processes with an LLM.
444 | 
445 |         Examples:
446 |           llm arxiv 2310.06825 -i P:1-3                # Markdown with images from pages 1-3
447 |           llm arxiv 2310.06825 "Summarize this paper." -m gpt-4o # Summarize with gpt-4o
448 |           llm arxiv 2310.06825 "What are the key contributions?" -i all -r 
449 |         """
450 |         try:
451 |             temp_arxiv_id_cmd = extract_arxiv_id(paper_id_or_url)
452 |             if not temp_arxiv_id_cmd:
453 |                 click.echo(f"Error: Invalid arXiv identifier or URL provided: {paper_id_or_url}", err=True)
454 |                 click.echo("Expected format like '2310.06825' or 'https://arxiv.org/abs/...'.", err=True)
455 |                 raise click.UsageError("Invalid arXiv identifier.")
456 | 
457 |             image_criteria_cmd: Optional[ImageSelectionCriteria] = None
458 |             try:
459 |                 image_criteria_cmd = parse_image_selection_spec(include_images_spec_str)
460 |             except ValueError as e:
461 |                 click.echo(f"Error in --include-images value ('{include_images_spec_str}'): {e}", err=True)
462 |                 raise click.BadParameter(str(e), param_hint='--include-images')
463 | 
464 |             actual_resize_option: Union[bool, int] = False
465 |             if resize_images:
466 |                 if max_dimension and max_dimension > 0:
467 |                     actual_resize_option = max_dimension
468 |                 else: 
469 |                     actual_resize_option = True 
470 |             
471 |             markdown_text, attachments, paper_source_url = _process_arxiv_paper(
472 |                 paper_id_or_url, 
473 |                 image_criteria_cmd, 
474 |                 actual_resize_option
475 |             )
476 | 
477 |             if prompt:
478 |                 # Process with LLM
479 |                 model_name_to_use = model_id_option
480 |                 model_obj = None
481 | 
482 |                 if model_name_to_use:
483 |                     try:
484 |                         model_obj = llm.get_model(model_name_to_use)
485 |                     except llm.UnknownModelError:
486 |                         raise click.UsageError(f"Unknown model: {model_name_to_use}. See 'llm models list'.")
487 |                 else:
488 |                     try:
489 |                         # Attempt to get the default model
490 |                         model_obj = llm.get_model(None) # This will get the default
491 |                         if model_obj:
492 |                              model_name_to_use = model_obj.model_id # Get the name for potential error messages
493 |                         else: # Should not happen if llm.get_model(None) works as expected
494 |                             raise llm.UnknownModelError("No default model configured.")
495 |                     except llm.UnknownModelError: # Catches if no default is set or llm.get_model(None) fails
496 |                         # Check if any models are installed at all before giving up
497 |                         try:
498 |                             # A bit of a hack: try to list models to see if any exist.
499 |                             # This doesn't rely on get_models_aliases_and_paths() directly.
500 |                             if not list(llm.get_plugins(group="llm.plugins.model")): # Check if model plugins exist
501 |                                 raise click.UsageError(
502 |                                     "No LLM models found. Please install models, e.g., 'llm install llm-gpt4all-j'"
503 |                                 )
504 |                         except Exception: # Broad catch if get_plugins is not available or fails
505 |                              pass # Fall through to the next error
506 | 
507 |                         raise click.UsageError(
508 |                             "No model specified with -m/--model, and no default model is set or found. "
509 |                             "Ensure a default model is set (e.g., 'llm default-model MODEL_NAME') or provide one with -m."
510 |                         )
511 |                     except Exception as e: # Catch any other error from get_model(None)
512 |                         raise click.UsageError(f"Could not load default LLM model: {e}")
513 | 
514 | 
515 |                 if not model_obj: # Should be caught above, but as a safeguard
516 |                      raise click.UsageError("Failed to load an LLM model.")
517 | 
518 |                 doc_fragment = llm.Fragment(content=markdown_text, source=paper_source_url)
519 |                 
520 |                 response_obj = model_obj.prompt(
521 |                     prompt=prompt,
522 |                     system=system_prompt_option,
523 |                     fragments=[doc_fragment],
524 |                     attachments=attachments
525 |                 )
526 |                 
527 |                 for chunk in response_obj:
528 |                     click.echo(chunk, nl=False)
529 |                 click.echo() # Final newline
530 | 
531 |                 # Consider showing cost if available and desired, e.g.:
532 |                 # if hasattr(response_obj, 'cost_tracker') and response_obj.cost_tracker:
533 |                 #    cost = response_obj.cost_tracker.cost
534 |                 #    if cost:
535 |                 #       click.echo(f"LLM Cost: ${cost:.6f}", err=True)
536 | 
537 |             else:
538 |                 # Original behavior: print Markdown
539 |                 click.echo(markdown_text)
540 |                 if image_criteria_cmd: # Only print if images were potentially processed
541 |                     if attachments:
542 |                         print(f"---Processed {len(attachments)} image attachment(s) based on selection criteria '{include_images_spec_str}'.---", file=sys.stderr)
543 |                     elif include_images_spec_str and include_images_spec_str.lower() not in ["none", "false", "no", "0"]:
544 |                          print(f"---Image inclusion was specified ('{include_images_spec_str}'), but no images were found or selected in the document.---", file=sys.stderr)
545 |         
546 |         except UnexpectedEmptyPageError as e:
547 |              click.echo(f"Error: arXiv search returned an unexpected empty page for '{paper_id_or_url}'. Check the ID/URL. Details: {e}", err=True)
548 |         except HTTPError as e:
549 |             click.echo(f"Error: Failed to fetch paper details from arXiv for '{paper_id_or_url}'. Check network or ID/URL. Details: {e}", err=True)
550 |         except ValueError as e: 
551 |             click.echo(f"Error processing {paper_id_or_url}: {e}", err=True)
552 |         except click.ClickException:
553 |             raise
554 |         except Exception as e: 
555 |             click.echo(f"An unexpected error occurred while processing {paper_id_or_url}: {e}", err=True)
556 | 
557 |     # New arxiv_search command registration
558 |     @cli.command(name="arxiv-search")
559 |     @click.argument("query_string", required=True)
560 |     @click.option(
561 |         "--max-results", "-n",
562 |         type=int,
563 |         default=5,
564 |         show_default=True,
565 |         help="Maximum number of search results to return."
566 |     )
567 |     @click.option(
568 |         "--sort-by",
569 |         type=click.Choice(["relevance", "lastUpdatedDate", "submittedDate"], case_sensitive=False),
570 |         default="relevance",
571 |         show_default=True,
572 |         help="Sort order for search results."
573 |     )
574 |     @click.option(
575 |         "--details",
576 |         is_flag=True,
577 |         help="Show more details for each result (authors, full abstract, categories, dates)."
578 |     )
579 |     def arxiv_search_command(query_string: str, max_results: int, sort_by: str, details: bool):
580 |         """Search arXiv for papers matching the QUERY_STRING."""
581 |         try:
582 |             sort_criterion_map = {
583 |                 "relevance": arxiv.SortCriterion.Relevance,
584 |                 "lastupdateddate": arxiv.SortCriterion.LastUpdatedDate,
585 |                 "submitteddate": arxiv.SortCriterion.SubmittedDate
586 |             }
587 |             actual_sort_criterion = sort_criterion_map[sort_by.lower()]
588 | 
589 |             search = arxiv.Search(
590 |                 query=query_string,
591 |                 max_results=max_results,
592 |                 sort_by=actual_sort_criterion
593 |             )
594 |             
595 |             results = list(search.results())
596 | 
597 |             if not results:
598 |                 click.echo(f"No results found for query: '{query_string}'")
599 |                 return
600 | 
601 |             click.echo(f"Found {len(results)} result(s) for '{query_string}' (sorted by {sort_by}):\n")
602 |             
603 |             all_commands_to_copy = [] # List to store all commands
604 | 
605 |             for i, paper in enumerate(results):
606 |                 clean_id = extract_arxiv_id(paper.entry_id)
607 |                 click.echo(f"[{i+1}] ID: {clean_id}")
608 |                 click.echo(f"    Title: {paper.title}")
609 | 
610 |                 command_to_run = f"llm arxiv {clean_id}"
611 |                 all_commands_to_copy.append(command_to_run) # Add to list
612 | 
613 |                 # Styled command for display: bold, green, and underlined
614 |                 display_command = click.style(f"$ {command_to_run}", fg="green", bold=True, underline=True)
615 |                 
616 |                 # No OSC 52 sequence here per result, just display
617 |                 click.echo(f"    Command: {display_command}")
618 | 
619 |                 if details:
620 |                     authors_str = ", ".join([author.name for author in paper.authors])
621 |                     click.echo(f"    Authors: {authors_str}")
622 |                     click.echo(f"    Published: {paper.published.strftime('%Y-%m-%d %H:%M:%S %Z') if paper.published else 'N/A'}")
623 |                     click.echo(f"    Updated: {paper.updated.strftime('%Y-%m-%d %H:%M:%S %Z') if paper.updated else 'N/A'}")
624 |                     primary_category = paper.primary_category
625 |                     categories_str = ", ".join(paper.categories)
626 |                     click.echo(f"    Primary Category: {primary_category if primary_category else 'N/A'}")
627 |                     click.echo(f"    Categories: {categories_str if categories_str else 'N/A'}")
628 |                     click.echo(f"    Abstract: {paper.summary.replace('\n', ' ')}")
629 |                     click.echo(f"    PDF Link: {paper.pdf_url}")
630 |                 else:
631 |                     brief_summary = (paper.summary[:200] + '...') if len(paper.summary) > 200 else paper.summary
632 |                     click.echo(f"    Abstract (brief): {brief_summary.replace('\n', ' ')}")
633 |                 click.echo("---")
634 |             
635 |             # After the loop, if there are commands, try to copy them all
636 |             if all_commands_to_copy:
637 |                 concatenated_commands = "\n".join(all_commands_to_copy)
638 |                 b64_concatenated_commands = base64.b64encode(concatenated_commands.encode('utf-8')).decode('utf-8')
639 |                 osc_clipboard_all_seq = f"\033]52;c;{b64_concatenated_commands}\a"
640 |                 # Emit the OSC 52 sequence. It's non-visible.
641 |                 # We can print it to sys.stdout directly or via click.echo without a newline if preferred.
642 |                 # Using sys.stdout.write to avoid any potential click formatting/newlines.
643 |                 sys.stdout.write(osc_clipboard_all_seq)
644 |                 sys.stdout.flush() # Ensure it gets sent
645 |                 click.echo(f"\n(Attempted to copy {len(all_commands_to_copy)} command(s) to clipboard)", err=True)
646 | 
647 |         except HTTPError as e:
648 |             click.echo(f"Error connecting to arXiv for search: {e}", err=True)
649 |         except Exception as e:
650 |             click.echo(f"An unexpected error occurred during search: {e}", err=True)


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "llm-arxiv"
 3 | requires-python = ">=3.10"
 4 | version = "0.1.2"
 5 | description = "LLM plugin for loading arXiv papers"
 6 | readme = "README.md"
 7 | authors = [{name = "Agusti F."}]
 8 | license = {text = "Apache-2.0"}
 9 | classifiers = [
10 | ]
11 | dependencies = [
12 |     "llm",
13 |     "arxiv",
14 |     "PyMuPDF",
15 |     "markdownify",
16 |     "Pillow",
17 |     "click>=8.0"
18 | ]
19 | 
20 | [build-system]
21 | requires = ["setuptools"]
22 | build-backend = "setuptools.build_meta"
23 | 
24 | [tool.setuptools]
25 | py-modules = ["llm_arxiv"]
26 | 
27 | [project.urls]
28 | Homepage = "https://github.com/agustif/llm-arxiv"
29 | Changelog = "https://github.com/agustif/llm-arxiv/releases"
30 | Issues = "https://github.com/agustif/llm-arxiv/issues"
31 | CI = "https://github.com/agustif/llm-arxiv/actions"
32 | 
33 | [project.entry-points.llm]
34 | arxiv = "llm_arxiv"
35 | 
36 | [project.optional-dependencies]
37 | test = ["pytest"]


--------------------------------------------------------------------------------
/tests/pytest.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import types
 3 | import inspect
 4 | 
 5 | class _Parametrize:
 6 |     def __call__(self, names, values):
 7 |         def decorator(func):
 8 |             def wrapper(*args, **kwargs):
 9 |                 for val in values:
10 |                     if not isinstance(val, tuple):
11 |                         val = (val,)
12 |                     func(*val)
13 |             return wrapper
14 |         return decorator
15 | 
16 | class _Mark:  # stand-in for the ``pytest.mark`` namespace; ``parametrize`` is its only attribute
17 |     parametrize = _Parametrize()
18 | 
19 | mark = _Mark()  # module-level ``mark``, so ``<this module>.mark.parametrize(...)`` resolves
20 | 
21 | class RaisesContext:
22 |     def __init__(self, exc_type):
23 |         self.exc_type = exc_type
24 |         self.value = None
25 |     def __enter__(self):
26 |         return self
27 |     def __exit__(self, exc_type, exc, tb):
28 |         if exc is None:
29 |             raise AssertionError(f"{self.exc_type.__name__} not raised")
30 |         if not issubclass(exc_type, self.exc_type):
31 |             raise exc
32 |         self.value = exc
33 |         return True
34 | 
35 | def raises(exc_type):  # stand-in for ``pytest.raises``: ``with raises(Err) as excinfo:`` then read ``excinfo.value``
36 |     return RaisesContext(exc_type)
37 | 
38 | def _run_tests_in_module(namespace):
39 |     count = 0
40 |     for name, obj in list(namespace.items()):
41 |         if name.startswith("test_") and callable(obj):
42 |             obj()
43 |             count += 1
44 |     return count
45 | 
46 | def main(args=None):
47 |     if args is None:
48 |         args = sys.argv[1:]
49 |     modules = [a for a in args if a.endswith('.py')]
50 |     if not modules:
51 |         modules = ['tests/test_arxiv.py']
52 |     for mod_path in modules:
53 |         ns = {}
54 |         with open(mod_path) as f:
55 |             code = compile(f.read(), mod_path, 'exec')
56 |             exec(code, ns)
57 |         count = _run_tests_in_module(ns)
58 |         print(f"{mod_path}: ran {count} tests")
59 | 
60 | if __name__ == '__main__':
61 |     main()


--------------------------------------------------------------------------------
/tests/test_arxiv.py:
--------------------------------------------------------------------------------
  1 | import pytest
  2 | from llm_arxiv import extract_arxiv_id, arxiv_loader, parse_ranges_to_set, parse_image_selection_spec, _process_arxiv_paper, ImageSelectionCriteria
  3 | from unittest.mock import patch, MagicMock, call
  4 | import llm
  5 | import arxiv
  6 | import base64
  7 | from click.testing import CliRunner
  8 | from llm.cli import cli as llm_cli # To test llm subcommands
  9 | import sys
 10 | import io
 11 | import datetime
 12 | 
 13 | 
 14 | # --- Helper function for comparing markdown output ---
 15 | def normalize_markdown_for_compare(text: str) -> str:
 16 |     lines = text.replace('\r\n', '\n').split('\n')
 17 |     # Strip whitespace from each line AND filter out lines that become empty after stripping
 18 |     # This makes it robust to varying numbers of blank lines.
 19 |     processed_lines = [line.strip() for line in lines if line.strip()]
 20 |     return '\n'.join(processed_lines)
 21 | 
 22 | 
 23 | @pytest.mark.parametrize(
 24 |     "argument, expected_id",
 25 |     [
 26 |         # Standard IDs
 27 |         ("2310.06825", "2310.06825"),
 28 |         ("2310.06825v1", "2310.06825v1"),
 29 |         ("1234.56789", "1234.56789"),
 30 |         # URLs
 31 |         ("https://arxiv.org/abs/2310.06825", "2310.06825"),
 32 |         ("http://arxiv.org/abs/2310.06825v2", "2310.06825v2"),
 33 |         ("https://arxiv.org/pdf/1234.56789.pdf", "1234.56789"),
 34 |         ("http://arxiv.org/pdf/1234.56789v3.pdf", "1234.56789v3"),
 35 |         # Older IDs
 36 |         ("hep-th/0101001", "hep-th/0101001"),
 37 |         ("math.GT/0309136", "math.GT/0309136"),
 38 |         ("cs.AI/0101001", "cs.AI/0101001"),
 39 |         # Invalid cases
 40 |         ("not an id", None),
 41 |         ("https://example.com/abs/2310.06825", None),
 42 |         ("arxiv.org/abs/2310.06825", None), # Missing scheme
 43 |         ("123.456", None), # Incorrect format
 44 |         ("cs.AI/123456", None), # Incorrect old format (needs 7 digits)
 45 |     ]
 46 | )
 47 | def test_extract_arxiv_id(argument, expected_id):
 48 |     assert extract_arxiv_id(argument) == expected_id
 49 | 
 50 | 
 51 | @patch("llm_arxiv.fitz.open")
 52 | @patch("llm_arxiv.arxiv.Search")
 53 | @patch("llm_arxiv.Image.open")
 54 | def test_arxiv_loader_success(mock_llm_image_open, mock_search_class, mock_fitz_open):
 55 |     # --- Mock arXiv Search and Result ---
 56 |     mock_search_instance = MagicMock()
 57 |     mock_paper = MagicMock(spec=arxiv.Result)
 58 |     mock_paper.entry_id = "http://arxiv.org/abs/1234.5678v1"
 59 |     mock_paper.download_pdf.return_value = "/tmp/fake_paper.pdf"
 60 |     mock_search_instance.results.return_value = iter([mock_paper])
 61 |     mock_search_class.return_value = mock_search_instance
 62 | 
 63 |     # --- Mock PyMuPDF (fitz) ---
 64 |     mock_doc = MagicMock()
 65 |     mock_page1 = MagicMock()
 66 |     mock_page1.get_text.return_value = "Page 1 text <img src='p1_img1_placeholder_in_html'>" # HTML with img
 67 |     mock_page1.get_images.return_value = [(10,)] # (xref,) - one image on page 1
 68 |     mock_page2 = MagicMock()
 69 |     mock_page2.get_text.return_value = "Page 2 text" # No images on page 2
 70 |     mock_page2.get_images.return_value = []
 71 |     mock_doc.__iter__.return_value = iter([mock_page1, mock_page2])
 72 |     # Mock the extract_image call for the image on page 1
 73 |     mock_doc.extract_image.return_value = {"image": b"fake_image_bytes_for_10", "ext": "png"}
 74 |     
 75 |     # Mock Pillow for image processing
 76 |     mock_pil_image = MagicMock()
 77 |     mock_pil_image.width = 100
 78 |     mock_pil_image.height = 100
 79 |     mock_pil_image.mode = 'RGB' # ensure mode is set
 80 |     # Define a side effect for save to simulate writing to BytesIO
 81 |     def mock_save(buffer, format, optimize=None, quality=None, **kwargs):
 82 |         buffer.write(b"processed_fake_image_bytes")
 83 |         return None
 84 |     mock_pil_image.save = mock_save
 85 |     mock_llm_image_open.return_value = mock_pil_image
 86 | 
 87 |     mock_doc.__enter__.return_value = mock_doc
 88 |     
 89 |     # Ensure __exit__ calls close() and returns None
 90 |     def mock_exit_calls_close(*args):
 91 |         mock_doc.close() # Call the close method on mock_doc
 92 |         return None
 93 |     mock_doc.__exit__.side_effect = mock_exit_calls_close
 94 | 
 95 |     # mock_fitz_open should return mock_doc
 96 |     mock_fitz_open.return_value = mock_doc
 97 | 
 98 |     # --- Call the loader with image request ---
 99 |     fragments = arxiv_loader("1234.5678?i=all") # Request all images
100 | 
101 |     # --- Assertions ---
102 |     assert isinstance(fragments, list)
103 |     assert len(fragments) == 2 # Text fragment + 1 image attachment (processed)
104 |     
105 |     # Check the first fragment (text)
106 |     text_fragment = fragments[0]
107 |     assert isinstance(text_fragment, llm.Fragment)
108 |     assert text_fragment.source == "http://arxiv.org/abs/1234.5678v1" # Source URL from paper
109 |         
110 |     # Expected text: HTML from pages, with <img> replaced by [IMAGE: conceptual_url], then markdownified
111 |     # Mocked HTML for page 1: "Page 1 text <img src='p1_img1_placeholder_in_html'>"
112 |     # Placeholder generated by _process_arxiv_paper: [IMAGE: http://arxiv.org/abs/1234.5678v1#page_1_img_1]
113 |     # Markdownify will convert the rest.
114 |     expected_normalized_parts = [
115 |         "Page 1 text",
116 |         "[IMAGE: http://arxiv.org/abs/1234.5678v1#page\\_1\\_img\\_1]", # Changed from \\\\_
117 |         "Page 2 text"
118 |     ]
119 |     expected_normalized_str = "\\n".join(expected_normalized_parts)
120 |     actual_normalized_str = normalize_markdown_for_compare(str(text_fragment))
121 |     # print(f"ACTUAL NORM: {repr(actual_normalized_str)}")
122 |     # print(f"EXPECT NORM: {repr(expected_normalized_str)}")
123 |     assert actual_normalized_str == expected_normalized_str
124 | 
125 |     # Check the attachment (the processed image)
126 |     attachment = fragments[1]
127 |     assert isinstance(attachment, llm.Attachment)
128 |     assert attachment.content == b"processed_fake_image_bytes" # Processed by mocked Pillow
129 |     assert attachment.type == "image/png" # Default processing output is PNG if not JPEG
130 | 
131 |     # Check mocks were called correctly
132 |     mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1)
133 |     mock_search_instance.results.assert_called_once()
134 |     mock_paper.download_pdf.assert_called_once()
135 |     mock_fitz_open.assert_called_once_with("/tmp/fake_paper.pdf")
136 |         
137 |     # _process_arxiv_paper calls get_text and get_images
138 |     assert mock_page1.get_text.call_count == 1
139 |     assert mock_page2.get_text.call_count == 1
140 |     assert mock_page1.get_images.call_count == 1 # Called to find images
141 |     assert mock_page2.get_images.call_count == 1 # Called even if no images
142 | 
143 |     # Ensure doc.extract_image was called for the image on page 1
144 |     mock_doc.extract_image.assert_called_once_with(10)
145 |     # Ensure Pillow was involved
146 |     mock_llm_image_open.assert_called_once()
147 |     actual_call_args = mock_llm_image_open.call_args[0]
148 |     assert isinstance(actual_call_args[0], io.BytesIO)
149 |     assert actual_call_args[0].getvalue() == b"fake_image_bytes_for_10"
150 |     
151 |     # Ensure doc.close() was called
152 |     mock_doc.close.assert_called_once()
153 | 
154 | 
155 | @pytest.mark.parametrize(
156 |     "argument, expected_error_msg_part",
157 |     [
158 |         ("invalid-id", "Invalid arXiv identifier or URL in fragment argument: invalid-id"),
159 |         ("http://example.com/1234.5678", "Invalid arXiv identifier or URL in fragment argument: http://example.com/1234.5678"),
160 |     ]
161 | )
162 | def test_arxiv_loader_invalid_id(argument, expected_error_msg_part):
163 |     with pytest.raises(ValueError) as excinfo:
164 |         arxiv_loader(argument) # Reverted to use argument
165 |     assert expected_error_msg_part in str(excinfo.value) # Original assertion should now pass
166 | 
167 | 
168 | @patch("llm_arxiv.arxiv.Search")
169 | def test_arxiv_loader_no_results(mock_search_class):
170 |     # Configure Search to return an empty iterator
171 |     mock_search_instance = MagicMock()
172 |     mock_search_instance.results.return_value = iter([])
173 |     mock_search_class.return_value = mock_search_instance
174 | 
175 |     with pytest.raises(ValueError) as excinfo:
176 |         arxiv_loader("1234.5678")
177 |     assert "No paper found for arXiv ID: 1234.5678" in str(excinfo.value)
178 |     mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1)
179 | 
180 | 
181 | @patch("llm_arxiv.arxiv.Search")
182 | def test_arxiv_loader_arxiv_api_error(mock_search_class):
183 |     # Configure Search results to raise an exception
184 |     mock_search_instance = MagicMock()
185 |     # Use arxiv.HTTPError for the side_effect, providing only required args
186 |     mock_search_instance.results.side_effect = arxiv.HTTPError(
187 |         url="http://fake.export.arxiv.org",
188 |         status=500,
189 |         retry=False
190 |     )
191 |     mock_search_class.return_value = mock_search_instance
192 | 
193 |     with pytest.raises(ValueError) as excinfo:
194 |         arxiv_loader("1234.5678")
195 |     # Check that the error message contains the actual HTTPError string representation
196 |     expected_msg = "Failed to fetch paper details from arXiv. Check network or ID/URL. Error: Page request resulted in HTTP 500 (http://fake.export.arxiv.org)"
197 |     assert expected_msg in str(excinfo.value)
198 |     mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1)
199 | 
200 | 
201 | @patch("llm_arxiv.arxiv.Search")
202 | def test_arxiv_loader_pdf_download_error(mock_search_class):
203 |     # Configure download_pdf to raise an exception
204 |     mock_search_instance = MagicMock()
205 |     mock_paper = MagicMock(spec=arxiv.Result)
206 |     mock_paper.entry_id = "http://arxiv.org/abs/1234.5678v1"
207 |     mock_paper.download_pdf.side_effect = Exception("Download failed")
208 |     mock_search_instance.results.return_value = iter([mock_paper])
209 |     mock_search_class.return_value = mock_search_instance
210 | 
211 |     with pytest.raises(ValueError) as excinfo:
212 |         arxiv_loader("1234.5678")
213 |     # The error message wraps the original exception
214 |     assert "Error processing arXiv paper 1234.5678 for fragment: Download failed" in str(excinfo.value)
215 |     mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1)
216 |     mock_paper.download_pdf.assert_called_once()
217 | 
218 | 
219 | @patch("llm_arxiv.fitz.open")
220 | @patch("llm_arxiv.arxiv.Search")
221 | def test_arxiv_loader_pdf_extract_error(mock_search_class, mock_fitz_open):
222 |     # Configure search and download to succeed
223 |     mock_search_instance = MagicMock()
224 |     mock_paper = MagicMock(spec=arxiv.Result)
225 |     mock_paper.entry_id = "http://arxiv.org/abs/1234.5678v1"
226 |     mock_paper.download_pdf.return_value = "/tmp/fake_paper.pdf"
227 |     mock_search_instance.results.return_value = iter([mock_paper])
228 |     mock_search_class.return_value = mock_search_instance
229 | 
230 |     # Configure fitz.open to raise an exception
231 |     mock_fitz_open.side_effect = Exception("Fitz error")
232 | 
233 |     with pytest.raises(ValueError) as excinfo:
234 |         arxiv_loader("1234.5678")
235 | 
236 |     # Check the wrapped error message
237 |     expected_msg = "Failed to extract content from PDF /tmp/fake_paper.pdf: Fitz error"
238 |     assert expected_msg in str(excinfo.value)
239 |     mock_search_class.assert_called_once_with(id_list=["1234.5678"], max_results=1)
240 |     mock_paper.download_pdf.assert_called_once()
241 |     mock_fitz_open.assert_called_once_with("/tmp/fake_paper.pdf")
242 | 
243 | 
244 | # --- Tests for parse_ranges_to_set ---
245 | @pytest.mark.parametrize(
246 |     "range_str, expected_set",
247 |     [
248 |         ("1", {1}),
249 |         ("1,2,3", {1, 2, 3}),
250 |         ("1-3", {1, 2, 3}),
251 |         ("1,3-5,7", {1, 3, 4, 5, 7}),
252 |         ("5-3", None), # Invalid range order
253 |         ("1-3,5-4", None), # One invalid range
254 |         ("abc", None), # Non-numeric
255 |         ("1,abc,3", None), # Mixed non-numeric
256 |         ("1-abc", None), # Non-numeric in range
257 |         ("", set()),
258 |         ("  1, 2  , 3-4 ", {1,2,3,4}),
259 |         ("0", None), # Zero not allowed
260 |         ("1-0", None), # Zero in range not allowed
261 |         ("-2", None) # Negative not allowed
262 |     ]
263 | )
264 | def test_parse_ranges_to_set(range_str, expected_set):
265 |     if expected_set is None:
266 |         with pytest.raises(ValueError):
267 |             parse_ranges_to_set(range_str)
268 |     else:
269 |         assert parse_ranges_to_set(range_str) == expected_set
270 | 
271 | 
272 | # --- Tests for parse_image_selection_spec ---
273 | @pytest.mark.parametrize(
274 |     "spec_string, expected_criteria",
275 |     [
276 |         (None, None),
277 |         ("all", {"mode": "all"}),
278 |         ("ALL", {"mode": "all"}),
279 |         ("true", {"mode": "all"}),
280 |         ("yes", {"mode": "all"}),
281 |         ("1", {"mode": "all"}),
282 |         ("", {"mode": "all"}), # Empty string implies all (e.g. from -i with no arg if const was used)
283 |         ("none", None),
284 |         ("NONE", None),
285 |         ("false", None),
286 |         ("no", None),
287 |         ("0", None),
288 |         ("G:1", {"mode": "global", "indices": {1}}),
289 |         ("g:1,2,3", {"mode": "global", "indices": {1, 2, 3}}),
290 |         ("G:1-3,5", {"mode": "global", "indices": {1, 2, 3, 5}}),
291 |         ("P:1", {"mode": "pages", "indices": {1}}),
292 |         ("p:1,2,3", {"mode": "pages", "indices": {1, 2, 3}}),
293 |         ("P:1-3,5", {"mode": "pages", "indices": {1, 2, 3, 5}}),
294 |         # Invalid cases for parse_image_selection_spec
295 |         ("invalid_spec", None), # Will raise ValueError
296 |         ("G:", None), # Missing indices, raises ValueError
297 |         ("P:", None), # Missing indices, raises ValueError
298 |         ("G:abc", None), # Invalid indices, raises ValueError from parse_ranges_to_set
299 |         ("P:1-abc", None), # Invalid indices, raises ValueError from parse_ranges_to_set
300 |         ("G:0", None), # Invalid index 0, raises ValueError
301 |         ("P:1,0,3", None) # Invalid index 0, raises ValueError
302 |     ]
303 | )
304 | def test_parse_image_selection_spec(spec_string, expected_criteria):
305 |     if expected_criteria is None and spec_string not in [None, "none", "NONE", "false", "no", "0"]:
306 |         # These are cases that should raise ValueError
307 |         with pytest.raises(ValueError):
308 |             parse_image_selection_spec(spec_string)
309 |     else:
310 |         assert parse_image_selection_spec(spec_string) == expected_criteria
311 | 
312 | 
313 | # Mocking Pillow Image.open and save for image processing tests
314 | @patch("llm_arxiv.Image.open")
315 | @patch("llm_arxiv.arxiv.Search") # Keep this for consistency if _process_arxiv_paper calls it
316 | @patch("llm_arxiv.fitz.open")    # And this too
317 | def test_process_arxiv_paper_image_selection_global(mock_fitz_open, mock_search_class, mock_image_open):
318 |     # --- Mock arXiv Search and Result (minimal for this test) ---
319 |     mock_search_instance = MagicMock()
320 |     mock_paper_obj = MagicMock(spec=arxiv.Result)
321 |     mock_paper_obj.entry_id = "http://arxiv.org/abs/2301.12345v1" # Use a valid new-style ID format
322 |     mock_paper_obj.download_pdf.return_value = "/tmp/2301.12345.pdf"
323 |     mock_search_instance.results.return_value = iter([mock_paper_obj])
324 |     mock_search_class.return_value = mock_search_instance
325 | 
326 |     # --- Mock PyMuPDF (fitz) --- 
327 |     mock_doc = MagicMock()
328 |     mock_page1 = MagicMock(name="Page1")
329 |     mock_page1.get_text.return_value = "Page 1 text <img src='p1i1'> <img src='p1i2'>"
330 |     # Image list for page 1: (xref, ...other_fields)
331 |     mock_page1.get_images.return_value = [(10,), (11,)] 
332 |     mock_page2 = MagicMock(name="Page2")
333 |     mock_page2.get_text.return_value = "Page 2 text <img src='p2i1'>"
334 |     mock_page2.get_images.return_value = [(20,)]
335 |     mock_doc.__iter__.return_value = iter([mock_page1, mock_page2])
336 |     mock_doc.extract_image.side_effect = lambda xref: {"image": f"img_bytes_xref_{xref}".encode(), "ext": "png"}
337 |     mock_doc.__enter__.return_value = mock_doc
338 |     mock_doc.__exit__.return_value = None
339 |     mock_fitz_open.return_value = mock_doc
340 | 
341 |     # --- Mock Pillow --- 
342 |     # mock_pil_image = MagicMock()
343 |     # mock_pil_image.width = 100
344 |     # mock_pil_image.height = 100
345 |     # mock_pil_image.mode = 'RGB' # Ensure mode is set
346 |     # mock_image_open.return_value = mock_pil_image
347 | 
348 |     # Define a side effect for Image.open to capture input bytes and set up save
349 |     def mock_image_open_side_effect(bytes_io_arg):
350 |         captured_bytes = bytes_io_arg.getvalue() # Capture the bytes for this specific image
351 |         
352 |         mock_specific_pil_image = MagicMock(name=f"PILImageMock_{len(mock_image_open.mock_calls)}")
353 |         mock_specific_pil_image.width = 100
354 |         mock_specific_pil_image.height = 100
355 |         # The code converts to 'RGB' or 'RGBA' before saving PNGs if mode is 'P', 
356 |         # or 'RGB' for JPEGs. Let's set a common mode that doesn't trigger complex conversion.
357 |         mock_specific_pil_image.mode = 'RGB' 
358 |         
359 |         # Mock the save method for this specific PIL image instance
360 |         def mock_specific_save(buffer, format, optimize=None, quality=None, **kwargs):
361 |             # For this test, if no resize, assume original bytes are "saved"
362 |             # (as the test expects original bytes if no processing happens)
363 |             buffer.write(captured_bytes) 
364 |         mock_specific_pil_image.save = mock_specific_save
365 |         return mock_specific_pil_image
366 | 
367 |     mock_image_open.side_effect = mock_image_open_side_effect
368 | 
369 |     # --- Call _process_arxiv_paper with global image selection: G:1,3 ---
370 |     # Global image 1 is on page 1 (xref 10)
371 |     # Global image 2 is on page 1 (xref 11) - SKIPPED
372 |     # Global image 3 is on page 2 (xref 20)
373 |     criteria: ImageSelectionCriteria = {"mode": "global", "indices": {1, 3}}
374 |     markdown_text, attachments, _ = _process_arxiv_paper(
375 |         "2301.12345", # Use the same valid ID
376 |         image_selection_criteria=criteria,
377 |         resize_option=False
378 |     )
379 | 
380 |     # Assertions for text content (placeholders)
381 |     # Image 1 (g1) from page 1 (p1) should be included -> placeholder 1
382 |     # Image 2 (g2) from page 1 (p1) should be SKIPPED
383 |     # Image 3 (g3) from page 2 (p2) should be included -> placeholder 2
384 |     expected_normalized_parts_global = [
385 |         "Page 1 text",
386 |         "[IMAGE: http://arxiv.org/abs/2301.12345v1#page\\_1\\_img\\_1]", # Changed from \\\\_
387 |         "Page 2 text",
388 |         "[IMAGE: http://arxiv.org/abs/2301.12345v1#page\\_2\\_img\\_1]"  # Changed from \\\\_
389 |     ]
390 |     expected_normalized_str_global = "\n".join(expected_normalized_parts_global)
391 |     actual_normalized_str_global = normalize_markdown_for_compare(markdown_text)
392 |     # print(f"ACTUAL NORM GLOBAL: {repr(actual_normalized_str_global)}")
393 |     # print(f"EXPECT NORM GLOBAL: {repr(expected_normalized_str_global)}")
394 |     assert actual_normalized_str_global == expected_normalized_str_global
395 | 
396 |     # Assertions for attachments
397 |     assert len(attachments) == 2
398 |     assert attachments[0].type == "image/png"
399 |     assert attachments[0].content == f"img_bytes_xref_10".encode() # img_bytes_xref_10 -> G:1
400 |     assert attachments[1].type == "image/png"
401 |     assert attachments[1].content == f"img_bytes_xref_20".encode() # img_bytes_xref_20 -> G:3
402 | 
403 |     # Verify extract_image calls
404 |     mock_doc.extract_image.assert_has_calls([
405 |         call(10), # Global image 1
406 |         call(20)  # Global image 3
407 |     ], any_order=False) # Check order as global_image_document_idx_counter matters
408 |     assert mock_doc.extract_image.call_count == 2
409 |     mock_image_open.call_count == 2 # Pillow should be called for each selected image
410 | 
411 | 
412 | # --- Tests for CLI Commands --- pytest.py tests/test_arxiv.py
413 | 
414 | # Shared CliRunner used by the CLI tests in this module to invoke llm_cli commands
415 | runner = CliRunner()
416 | 
417 | @patch("llm_arxiv._process_arxiv_paper")
418 | def test_llm_arxiv_command_image_selection(mock_process_paper):  # `llm arxiv` -i/-r/-d flags -> positional args passed to _process_arxiv_paper
419 |     mock_process_paper.return_value = ("markdown output", [], "http://example.com/src")
420 | 
421 |     # Test -i P:1,3-4
422 |     result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "P:1,3-4"])
423 |     assert result.exit_code == 0
424 |     mock_process_paper.assert_called_once()
425 |     args, kwargs = mock_process_paper.call_args
426 |     assert args[0] == "1234.5678"
427 |     assert args[1] == {"mode": "pages", "indices": {1,3,4}} # image_selection_criteria is args[1]
428 |     # The third positional argument (args[2]) is resize_option.
429 |     # Per the original author's notes, arxiv_command derives it from the
430 |     # -r (resize) and -d (max dimension) flags; that code is not visible here,
431 |     # so confirm against llm_arxiv.py.
432 |     # With only -i supplied the expected value is False; the later scenarios
433 |     # expect 300 for "-r -d 300" and True for a bare "-r".
434 |     assert args[2] is False # resize_option
435 |     mock_process_paper.reset_mock()  # clear call history so assert_called_once() holds for the next scenario
436 | 
437 |     # Test -i G:2
438 |     result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "G:2"])
439 |     assert result.exit_code == 0
440 |     mock_process_paper.assert_called_once()
441 |     args, kwargs = mock_process_paper.call_args
442 |     assert args[1] == {"mode": "global", "indices": {2}}
443 |     assert args[2] is False # resize_option
444 |     mock_process_paper.reset_mock()
445 | 
446 |     # Test -i all
447 |     result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "all"])
448 |     assert result.exit_code == 0
449 |     mock_process_paper.assert_called_once()
450 |     args, kwargs = mock_process_paper.call_args
451 |     assert args[1] == {"mode": "all"}
452 |     assert args[2] is False # resize_option
453 |     mock_process_paper.reset_mock()
454 |     
455 |     # Test no -i (should be None criteria -> no images)
456 |     result = runner.invoke(llm_cli, ["arxiv", "1234.5678"])
457 |     assert result.exit_code == 0
458 |     mock_process_paper.assert_called_once()
459 |     args, kwargs = mock_process_paper.call_args
460 |     assert args[1] is None # Important: None implies no images for image_selection_criteria
461 |     assert args[2] is False # resize_option
462 |     mock_process_paper.reset_mock()
463 | 
464 |     # Test -i with resize options
465 |     result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "all", "-r", "-d", "300"])
466 |     assert result.exit_code == 0
467 |     mock_process_paper.assert_called_once()
468 |     args, kwargs = mock_process_paper.call_args
469 |     assert args[1] == {"mode": "all"}
470 |     assert args[2] == 300 # resize_option should be the dimension
471 |     mock_process_paper.reset_mock()
472 | 
473 |     result = runner.invoke(llm_cli, ["arxiv", "1234.5678", "-i", "all", "-r"])  # bare -r, no -d
474 |     assert result.exit_code == 0
475 |     mock_process_paper.assert_called_once()
476 |     args, kwargs = mock_process_paper.call_args
477 |     assert args[1] == {"mode": "all"}
478 |     assert args[2] is True # resize_option should be True (for default 512px)
479 |     mock_process_paper.reset_mock()
480 | 
481 | @patch("llm_arxiv.arxiv.Search")
482 | def test_llm_arxiv_search_command(mock_arxiv_search_class):
483 |     mock_search_instance = MagicMock()
484 |     mock_paper1 = MagicMock(spec=arxiv.Result)
485 |     mock_paper1.entry_id = "http://arxiv.org/abs/2301.0001v1" # Use valid ID format
486 |     mock_paper1.title = "Search Result Paper 1 Title"
487 |     mock_paper1.summary = "Summary of paper 1."
488 |     # Correct author mocking
489 |     author_a_mock = MagicMock(spec=arxiv.Result.Author)
490 |     author_a_mock.name = "Author A"
491 |     mock_paper1.authors = [author_a_mock]
492 |     mock_paper1.published = datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=datetime.timezone.utc)
493 |     mock_paper1.updated = datetime.datetime(2023, 1, 2, 12, 0, 0, tzinfo=datetime.timezone.utc)
494 |     mock_paper1.primary_category = "cs.AI"
495 |     mock_paper1.categories = ["cs.AI", "cs.LG"]
496 |     mock_paper1.pdf_url = "http://arxiv.org/pdf/2301.0001v1.pdf"
497 | 
498 |     # Configure mock_search_instance.results to return a new iterator each time it's called
499 |     # Set this on the instance that mock_arxiv_search_class will return.
500 |     mock_search_instance = MagicMock() # This will be the returned instance
501 |     mock_search_instance.results.side_effect = lambda: iter([mock_paper1])
502 |     mock_arxiv_search_class.return_value = mock_search_instance 
503 | 
504 |     # Test basic search
505 |     result = runner.invoke(llm_cli, ["arxiv-search", "test query", "-n", "1"])
506 |     assert result.exit_code == 0
507 |     assert "Found 1 result(s)" in result.output
508 |     assert "ID: 2301.0001v1" in result.output # Updated ID
509 |     assert "Title: Search Result Paper 1 Title" in result.output
510 |     assert "Command: $ llm arxiv 2301.0001v1" in result.output 
511 |     assert "Abstract (brief): Summary of paper 1." in result.output
512 |     assert "(Attempted to copy 1 command(s) to clipboard)" in result.output
513 | 
514 |     # Test --details
515 |     result_details = runner.invoke(llm_cli, ["arxiv-search", "test query", "-n", "1", "--details"])
516 |     assert result_details.exit_code == 0
517 |     assert "Authors: Author A" in result_details.output
518 |     assert "Abstract: Summary of paper 1." in result_details.output 
519 |     assert "Published: 2023-01-01" in result_details.output
520 |     assert "Updated: 2023-01-02" in result_details.output
521 |     assert "Primary Category: cs.AI" in result_details.output
522 |     assert "Categories: cs.AI, cs.LG" in result_details.output
523 |     assert "PDF Link: http://arxiv.org/pdf/2301.0001v1.pdf" in result_details.output
524 | 
525 |     mock_arxiv_search_class.assert_called_with(
526 |         query="test query", 
527 |         max_results=1, 
528 |         sort_by=arxiv.SortCriterion.Relevance
529 |     )
530 | 
531 | @patch("llm_arxiv.arxiv.Search")
532 | def test_llm_arxiv_search_no_results(mock_arxiv_search_class):  # `arxiv-search` with an empty result set still exits 0 and prints a "No results" message
533 |     mock_search_instance = MagicMock()
534 |     mock_search_instance.results.return_value = iter([])  # the mocked search yields no papers
535 |     mock_arxiv_search_class.return_value = mock_search_instance
536 | 
537 |     result = runner.invoke(llm_cli, ["arxiv-search", "very_specific_query_no_one_uses"])
538 |     assert result.exit_code == 0
539 |     assert "No results found for query: 'very_specific_query_no_one_uses'" in result.output
540 | 
541 | # Note: More tests for _process_arxiv_paper (page selection, resize options, no images)
542 | # and for arxiv_loader (parsing of ?i= options) would be beneficial for full coverage.
543 | # test_arxiv_loader_success needs a significant update to reflect the image processing changes.
544 | 
545 | # TODO: update test_arxiv_loader_success for the new _process_arxiv_paper signature and image handling.
546 | @patch("llm_arxiv._process_arxiv_paper") # Patch the helper directly
547 | def test_arxiv_loader_fragment_options(mock_process_paper):  # arxiv_loader: "?i=...&r=..." options -> positional args of _process_arxiv_paper
548 |     mock_process_paper.return_value = ([llm.Fragment("text", source="src")], [], "src") # Return value expected by arxiv_loader
549 | 
550 |     # Test ?i=P:1
551 |     arxiv_loader("1234.5678?i=P:1")
552 |     mock_process_paper.assert_called_once()
553 |     args, kwargs = mock_process_paper.call_args
554 |     assert args[0] == "1234.5678"
555 |     assert args[1] == {"mode": "pages", "indices": {1}} # image_selection_criteria is args[1]
556 |     assert args[2] is False # resize_option is args[2]
557 |     mock_process_paper.reset_mock()  # clear call history so assert_called_once() holds for the next scenario
558 | 
559 |     # Test ?i=G:1-3&r=true (resize also parsed but its effect is tested in _process_arxiv_paper tests)
560 |     arxiv_loader("1234.5678?i=G:1-3&r=true")
561 |     mock_process_paper.assert_called_once()
562 |     args, kwargs = mock_process_paper.call_args
563 |     assert args[1] == {"mode": "global", "indices": {1,2,3}}
564 |     assert args[2] is True # resize_option
565 |     mock_process_paper.reset_mock()
566 | 
567 |     # Test no ?i (should be None for image_selection_criteria)
568 |     arxiv_loader("1234.5678?r=600")
569 |     mock_process_paper.assert_called_once()
570 |     args, kwargs = mock_process_paper.call_args
571 |     assert args[1] is None
572 |     assert args[2] == 600 # resize_option
573 |     mock_process_paper.reset_mock()
574 | 
575 |     # Test ?i= (present but empty; expected to select all images)
576 |     # Per the original author's note, parse_image_selection_spec("") has its own test asserting
577 |     # {"mode": "all"} (not visible in this chunk); this scenario expects the loader to agree with it.
578 |     arxiv_loader("1234.5678?i=")
579 |     mock_process_paper.assert_called_once()
580 |     args, kwargs = mock_process_paper.call_args
581 |     # A value of None here would mean the empty spec was treated like a missing ?i,
582 |     # which is the "no ?i" scenario above, not this one.
583 |     assert args[1] == {"mode": "all"} # not None: an empty spec means "all"
584 |     assert args[2] is False # resize_option
585 |     mock_process_paper.reset_mock()
586 | 
587 |     # Test ?i=all
588 |     arxiv_loader("1234.5678?i=all")
589 |     mock_process_paper.assert_called_once()
590 |     args, kwargs = mock_process_paper.call_args
591 |     assert args[1] == {"mode": "all"}
592 |     assert args[2] is False # resize_option
593 |     mock_process_paper.reset_mock()
594 | 
595 |     # Test invalid spec in fragment loader raises ValueError
596 |     with pytest.raises(ValueError, match="Invalid image selection option in fragment"):
597 |         arxiv_loader("1234.5678?i=INVALID_SPEC")
598 | 
599 |     # Test invalid ID in fragment loader raises ValueError
600 |     with pytest.raises(ValueError, match="Invalid arXiv identifier or URL in fragment argument"):
601 |         arxiv_loader("NOT_AN_ID?i=all")
602 | 
603 | @patch("llm_arxiv.arxiv.Search")
604 | def test_llm_arxiv_search_options(mock_arxiv_search_class):  # `arxiv-search`: --details output and --sort-by mapping onto arxiv.SortCriterion
605 |     runner = CliRunner()  # local runner; shadows the module-level `runner`
606 |     mock_search_instance = MagicMock()
607 | 
608 |     # Create mock papers
609 |     mock_paper1 = MagicMock(spec=arxiv.Result)
610 |     mock_paper1.entry_id = "http://arxiv.org/abs/2301.0001v1"
611 |     mock_paper1.title = "Paper One Title"
612 |     # `name` is assigned after construction: MagicMock(name=...) would name the mock itself, not set the attribute
613 |     author_a = MagicMock(spec=arxiv.Result.Author)
614 |     author_a.name = "Author A"
615 |     mock_paper1.authors = [author_a]
616 |     mock_paper1.summary = "This is the summary for paper one. It is very detailed."
617 |     mock_paper1.published = datetime.datetime(2023,1,1, tzinfo=datetime.timezone.utc) 
618 |     mock_paper1.updated = datetime.datetime(2023,1,2, tzinfo=datetime.timezone.utc)
619 |     mock_paper1.primary_category = "cs.AI"
620 |     mock_paper1.categories = ["cs.AI", "cs.LG"]
621 |     mock_paper1.pdf_url = "http://arxiv.org/pdf/2301.0001v1.pdf"
622 | 
623 | 
624 |     mock_paper2 = MagicMock(spec=arxiv.Result)
625 |     mock_paper2.entry_id = "http://arxiv.org/abs/2301.0002v1"
626 |     mock_paper2.title = "Paper Two Title"
627 |     # Same pattern as above: set `name` as an attribute on each author mock
628 |     author_b = MagicMock(spec=arxiv.Result.Author)
629 |     author_b.name = "Author B"
630 |     author_c = MagicMock(spec=arxiv.Result.Author)
631 |     author_c.name = "Author C"
632 |     mock_paper2.authors = [author_b, author_c]
633 |     mock_paper2.summary = "Summary for paper two. Also very detailed."
634 |     mock_paper2.published = datetime.datetime(2023,1,3, tzinfo=datetime.timezone.utc)
635 |     mock_paper2.updated = datetime.datetime(2023,1,4, tzinfo=datetime.timezone.utc)
636 |     mock_paper2.primary_category = "cs.CL"
637 |     mock_paper2.categories = ["cs.CL", "cs.AI"]
638 |     mock_paper2.pdf_url = "http://arxiv.org/pdf/2301.0002v1.pdf"
639 | 
640 |     mock_search_instance.results.return_value = iter([mock_paper1, mock_paper2])  # one-shot iterator; re-assigned before each later invoke
641 |     mock_arxiv_search_class.return_value = mock_search_instance
642 | 
643 |     # Test --details
644 |     result_details = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--details"])
645 |     assert result_details.exit_code == 0
646 |     assert "Paper One Title" in result_details.output
647 |     assert "Author A" in result_details.output
648 |     assert "This is the summary for paper one." in result_details.output
649 |     assert "Paper Two Title" in result_details.output
650 |     assert "Author B, Author C" in result_details.output
651 |     assert "Summary for paper two." in result_details.output
652 |     assert "$ llm arxiv 2301.0001" in result_details.output # Check suggested command
653 |     mock_arxiv_search_class.assert_called_with(
654 |         query="test_query",
655 |         max_results=5, # Default from CLI is 5, not 10
656 |         sort_by=arxiv.SortCriterion.Relevance # Default when not specified
657 |     )
658 |     mock_search_instance.results.assert_called_once() # results() should be called
659 | 
660 |     # Reset call counts for the next invocation; the consumed iterator must be replaced as well
661 |     mock_search_instance.reset_mock()
662 |     mock_search_instance.results.return_value = iter([mock_paper1, mock_paper2]) # Re-assign iterator
663 | 
664 |     # Test --sort-by lastUpdatedDate
665 |     result_sort_updated = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--sort-by", "lastUpdatedDate"])
666 |     assert result_sort_updated.exit_code == 0
667 |     assert "Paper One Title" in result_sort_updated.output # Check titles are still there
668 |     assert "Paper Two Title" in result_sort_updated.output
669 |     # Key assertion: arxiv.Search was called with the correct sort_by
670 |     mock_arxiv_search_class.assert_called_with(
671 |         query="test_query",
672 |         max_results=5, # Default from CLI is 5
673 |         sort_by=arxiv.SortCriterion.LastUpdatedDate
674 |     )
675 |     mock_search_instance.results.assert_called_once()
676 | 
677 |     mock_search_instance.reset_mock()
678 |     mock_search_instance.results.return_value = iter([mock_paper1, mock_paper2])
679 | 
680 |     # Test --sort-by submittedDate
681 |     result_sort_submitted = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--sort-by", "submittedDate"])
682 |     assert result_sort_submitted.exit_code == 0
683 |     assert "Paper One Title" in result_sort_submitted.output
684 |     assert "Paper Two Title" in result_sort_submitted.output
685 |     mock_arxiv_search_class.assert_called_with(
686 |         query="test_query",
687 |         max_results=5, # Default from CLI is 5
688 |         sort_by=arxiv.SortCriterion.SubmittedDate
689 |     )
690 |     mock_search_instance.results.assert_called_once()
691 | 
692 |     mock_search_instance.reset_mock()
693 |     mock_search_instance.results.return_value = iter([mock_paper1, mock_paper2])
694 | 
695 |     # Test --sort-by relevance (explicitly)
696 |     result_sort_relevance = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--sort-by", "relevance"])
697 |     assert result_sort_relevance.exit_code == 0
698 |     assert "Paper One Title" in result_sort_relevance.output
699 |     assert "Paper Two Title" in result_sort_relevance.output
700 |     mock_arxiv_search_class.assert_called_with(
701 |         query="test_query",
702 |         max_results=5, # Default from CLI is 5
703 |         sort_by=arxiv.SortCriterion.Relevance
704 |     )
705 |     mock_search_instance.results.assert_called_once()
706 | 
707 |     # Test invalid sort criteria
708 |     result_invalid_sort = runner.invoke(llm_cli, ["arxiv-search", "test_query", "--sort-by", "invalid"])
709 |     assert result_invalid_sort.exit_code != 0 # Should fail
710 |     assert "Invalid value for '--sort-by'" in result_invalid_sort.output
711 | 


--------------------------------------------------------------------------------