├── MANIFEST.in ├── .github └── funding.yml ├── mcp_simple_arxiv ├── __main__.py ├── __init__.py ├── update_taxonomy.py ├── taxonomy.json ├── categories.py ├── server.py └── arxiv_client.py ├── smithery.yaml ├── Dockerfile.web ├── Dockerfile ├── pyproject.toml ├── LICENSE ├── TODO.md ├── README.md ├── test_client.py ├── .gitignore ├── DEPLOYMENT.md └── test_web_client.py /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include mcp_simple_arxiv/taxonomy.json -------------------------------------------------------------------------------- /.github/funding.yml: -------------------------------------------------------------------------------- 1 | # If you find this useful you can support this by using the funding links below 2 | # 3 | github: andybrandt 4 | buy_me_a_coffee: andybrandt 5 | -------------------------------------------------------------------------------- /mcp_simple_arxiv/__main__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Main entry point for the MCP server. 3 | """ 4 | 5 | from .server import main 6 | 7 | if __name__ == "__main__": 8 | main() -------------------------------------------------------------------------------- /mcp_simple_arxiv/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | MCP server providing access to arXiv papers through their API. 3 | """ 4 | import asyncio 5 | from .server import main as server_main 6 | 7 | __version__ = "0.1.0" 8 | 9 | def main(): 10 | """Main entry point for the package.""" 11 | asyncio.run(server_main()) -------------------------------------------------------------------------------- /smithery.yaml: -------------------------------------------------------------------------------- 1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml 2 | 3 | startCommand: 4 | type: stdio 5 | configSchema: 6 | # JSON Schema defining the configuration options for the MCP. 7 | type: object 8 | required: [] 9 | properties: {} 10 | commandFunction: 11 | # A function that produces the CLI command to start the MCP on stdio. 12 | |- 13 | config => ({command: 'python', args: ['-m', 'mcp_simple_arxiv']}) -------------------------------------------------------------------------------- /Dockerfile.web: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /usr/src/app 6 | 7 | # Copy the entire project into the container 8 | COPY . . 9 | 10 | # Install the project dependencies 11 | # Using --no-cache-dir is a good practice for keeping image size down 12 | RUN pip install --no-cache-dir . 13 | 14 | # Expose the port the app runs on 15 | EXPOSE 8000 16 | 17 | # Run the web server when the container launches 18 | CMD ["python", "-m", "mcp_simple_arxiv.web_server"] -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile 2 | # Use an official Python runtime as a parent image 3 | FROM python:3.10-slim 4 | 5 | # Set the working directory in the container 6 | WORKDIR /app 7 | 8 | # Copy the current directory contents into the container at /app 9 | ADD . /app 10 | 11 | # Install any needed packages specified in requirements.txt 12 | RUN pip install --no-cache-dir . 13 | 14 | # Make port 80 available to the world outside this container 15 | EXPOSE 80 16 | 17 | # Run mcp_simple_arxiv when the container launches 18 | ENTRYPOINT ["python", "-m", "mcp_simple_arxiv"] -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "mcp-simple-arxiv" 7 | version = "0.3.0" 8 | description = "An MCP server that provides access to arXiv papers through their API." 9 | readme = "README.md" 10 | authors = [ 11 | {name = "Andy Brandt", email = "andy@codesprinters.com"} 12 | ] 13 | requires-python = ">=3.10" 14 | license = "MIT" 15 | classifiers = [ 16 | "Programming Language :: Python :: 3", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "fastmcp", 21 | "feedparser", # for parsing arXiv API responses 22 | "httpx", # for making HTTP requests with async support 23 | "beautifulsoup4", # for parsing arXiv taxonomy page 24 | ] 25 | 26 | [project.urls] 27 | Homepage = "https://github.com/andybrandt/mcp-simple-arxiv" 28 | 29 | [project.scripts] 30 | mcp-simple-arxiv = "mcp_simple_arxiv:main" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Andy Brandt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mcp_simple_arxiv/update_taxonomy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to verify and optionally update arXiv category taxonomy. 3 | """ 4 | 5 | import json 6 | from pathlib import Path 7 | from .categories import CATEGORIES 8 | 9 | TAXONOMY_FILE = Path(__file__).parent / "taxonomy.json" 10 | 11 | def update_taxonomy_file(): 12 | """ 13 | Create taxonomy.json from the built-in categories. 14 | Returns the taxonomy dictionary. 15 | """ 16 | print(f"Creating taxonomy file at {TAXONOMY_FILE}...") 17 | with open(TAXONOMY_FILE, 'w', encoding='utf-8') as f: 18 | json.dump(CATEGORIES, f, indent=2, ensure_ascii=False) 19 | print("Done!") 20 | return CATEGORIES 21 | 22 | def load_taxonomy() -> dict: 23 | """ 24 | Load taxonomy from the JSON file. 25 | If file doesn't exist, create it from built-in categories. 26 | """ 27 | if not TAXONOMY_FILE.exists(): 28 | print(f"Taxonomy file not found at {TAXONOMY_FILE}, creating it...") 29 | return update_taxonomy_file() 30 | 31 | print(f"Loading taxonomy from {TAXONOMY_FILE}") 32 | with open(TAXONOMY_FILE, 'r', encoding='utf-8') as f: 33 | return json.load(f) 34 | 35 | if __name__ == "__main__": 36 | # When run directly, create/update the taxonomy file 37 | print("Creating taxonomy file from built-in categories...") 38 | taxonomy = update_taxonomy_file() 39 | print(f"\nCreated taxonomy with {len(taxonomy)} primary categories:") 40 | for primary, data in taxonomy.items(): 41 | print(f"- {primary}: {data['name']} ({len(data['subcategories'])} subcategories)") -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Planned Features and Improvements 2 | 3 | 1. **Total Match Count in Search Results** 4 | - Add total number of matches found to search results 5 | - This helps users and AIs determine if search needs refinement 6 | - Should handle cases where there are more results than displayed 7 | - Example: "Found 1234 matches, showing first 10" 8 | 9 | 2. **Categories in Search Results** 10 | - Show primary category and all subcategories for each paper in search results 11 | - Makes it easier to quickly assess paper's field and relevance 12 | - Categories should be clearly labeled (primary vs additional) 13 | - Example: "Primary: cs.AI, Additional: cs.LG, cs.CL" 14 | 15 | 3. **Abstract Preview in Search Results** 16 | - Add first 2-3 sentences of abstract to each search result 17 | - Helps quickly assess paper relevance without fetching full details 18 | - Should handle varying abstract lengths gracefully 19 | - Should end with ellipsis (...) if truncated 20 | 21 | 4. **Flexible Search Result Sorting** 22 | - Add support for different sorting options in search 23 | - Support sorting by: submission date, last update date, relevance 24 | - Make sort order configurable (ascending/descending) 25 | - Expose sorting options in tool description 26 | 27 | 5. **Date Range Filters** 28 | - Allow filtering papers by submission/update date range 29 | - Support both absolute dates and relative ranges (last week/month/year) 30 | - Implement using arXiv API's date filtering capabilities 31 | 32 | 6. **DOI Integration** [DONE] 33 | - Add DOI (Digital Object Identifier) to paper details when available 34 | - Extract from arXiv API response 35 | - Include DOI URL for easy access 36 | 37 | 7. **Enhanced Category Presentation** 38 | - Improve how categories are displayed in paper details 39 | - Clearly distinguish primary and secondary categories 40 | - Include category descriptions where helpful 41 | - Group related categories together 42 | 43 | 8. **Advanced Category Search** 44 | - Support complex category combinations in search 45 | - Allow AND/OR/NOT operations between categories 46 | - Support parentheses for grouping 47 | - Example: "(cs.AI OR cs.LG) AND NOT cs.DB" 48 | 49 | 9. **Citation Format Support** 50 | - Generate citation strings in common formats (BibTeX, APA, etc.) 51 | - Include all necessary metadata (authors, title, arXiv ID, etc.) 52 | - Handle special characters in titles and names correctly 53 | 54 | 10. **Impact Metrics** 55 | - Add citation count or other impact metrics if available 56 | - Consider alternative metrics like downloads or social media mentions 57 | - NOTE: Might require integration with additional APIs 58 | 59 | 11. **HTML Paper Access** [DONE] 60 | - Add detection of HTML version availability 61 | - Include HTML URL in paper metadata when available 62 | - Add URL construction logic (changing PDF URL to HTML) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mcp-simple-arxiv 2 | 3 | [![Trust Score](https://archestra.ai/mcp-catalog/api/badge/quality/andybrandt/mcp-simple-arxiv)](https://archestra.ai/mcp-catalog/andybrandt__mcp-simple-arxiv) 4 | [![smithery badge](https://smithery.ai/badge/mcp-simple-arxiv)](https://smithery.ai/server/mcp-simple-arxiv) 5 | [![Verified on MseeP](https://mseep.ai/badge.svg)](https://mseep.ai/app/dc95dba9-149a-4eaa-bf08-36e0cb0f3a5a) 6 | 7 | An MCP server that provides access to arXiv papers through their API. 8 | 9 | mcp-simple-arxiv MCP server 10 | 11 | ## Features 12 | 13 | This server allows LLM clients (like Claude Desktop) to: 14 | - Search for scientific papers on arXiv by title and abstract content 15 | - Get paper metadata and abstracts 16 | - Access links to available paper formats (PDF/HTML) 17 | 18 | To use the web version just add this connector to Claude.ai https://mcp.andybrandt.net/arxiv . 19 | You can also install & use it locally. 20 | 21 | ## Installation and Deployment 22 | 23 | This server can be run in two modes: as a local `stdio` server for desktop clients or as a network-accessible web server. 24 | 25 | ### Installing via Smithery 26 | 27 | To install Simple Arxiv for Claude Desktop automatically via [Smithery](https://smithery.ai/server/mcp-simple-arxiv): 28 | 29 | ```bash 30 | npx -y @smithery/cli install mcp-simple-arxiv --client claude 31 | ``` 32 | 33 | ### Manual Installation 34 | ```bash 35 | pip install mcp-simple-arxiv 36 | ``` 37 | 38 | ## Usage with Claude Desktop 39 | 40 | Add this configuration to your `claude_desktop_config.json`: 41 | 42 | (Mac OS) 43 | 44 | ```json 45 | { 46 | "mcpServers": { 47 | "simple-arxiv": { 48 | "command": "python", 49 | "args": ["-m", "mcp_simple_arxiv"] 50 | } 51 | } 52 | } 53 | ``` 54 | 55 | (Windows version): 56 | 57 | ```json 58 | { 59 | "mcpServers": { 60 | "simple-arxiv": { 61 | "command": "C:\\Users\\YOUR_USERNAME\\AppData\\Local\\Programs\\Python\\Python311\\python.exe", 62 | "args": [ 63 | "-m", 64 | "mcp_simple_arxiv" 65 | ] 66 | } 67 | } 68 | } 69 | ``` 70 | 71 | After restarting Claude Desktop, the following capabilities will be available: 72 | 73 | ### Searching Papers 74 | 75 | You can ask Claude to search for papers using queries like: 76 | ``` 77 | Can you search arXiv for recent papers about large language models? 78 | ``` 79 | 80 | The search will return basic information about matching papers including: 81 | - Paper title 82 | - Authors 83 | - arXiv ID 84 | - Publication date 85 | 86 | ### Getting Paper Details 87 | 88 | Once you have a paper ID, you can ask for more details: 89 | ``` 90 | Can you show me the details for paper 2103.08220? 91 | ``` 92 | 93 | This will return: 94 | - Full paper title 95 | - Authors 96 | - Publication and update dates 97 | - Journal reference (if available) 98 | - Paper abstract 99 | - Links to available formats (PDF/HTML) 100 | 101 | 102 | *For web deployment see [DEPLOYMENT.md](DEPLOYMENT.md)*. 103 | 104 | ## Development 105 | 106 | To install for development: 107 | ```bash 108 | git clone https://github.com/andybrandt/mcp-simple-arxiv 109 | cd mcp-simple-arxiv 110 | pip install -e . 111 | ``` 112 | 113 | ### arXiv API Guidelines 114 | 115 | This server follows arXiv API usage guidelines: 116 | - Rate limiting to max 1 request per 3 seconds 117 | - Single connection at a time 118 | - Proper error handling and retry logic 119 | 120 | ## License 121 | 122 | MIT 123 | -------------------------------------------------------------------------------- /test_client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging 3 | from fastmcp.client import Client, StdioTransport 4 | 5 | # Configure logging 6 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 7 | 8 | async def main(): 9 | """ 10 | Test client for the mcp-simple-arxiv server. 11 | Connects to the stdio server, lists tools, and calls each tool to verify functionality. 12 | """ 13 | logging.info("Starting test client for mcp-simple-arxiv...") 14 | 15 | # Configure the stdio transport to run the server as a module 16 | transport = StdioTransport( 17 | command="python", 18 | args=["-m", "mcp_simple_arxiv"] 19 | ) 20 | 21 | # Create a client with the transport 22 | client = Client(transport) 23 | 24 | async with client: 25 | try: 26 | # 1. List available tools 27 | logging.info("--- Testing tools/list ---") 28 | tools = await client.list_tools() 29 | logging.info(f"Found {len(tools)} tools:") 30 | for tool in tools: 31 | logging.info(f"- {tool.name}: {tool.description.splitlines()[0]}") 32 | assert len(tools) == 4, "Expected 4 tools" 33 | logging.info("✅ tools/list test PASSED") 34 | 35 | # 2. Test search_papers 36 | logging.info("\n--- Testing search_papers ---") 37 | query = "electron" 38 | logging.info(f"Calling search_papers with query: '{query}'") 39 | result = await client.call_tool("search_papers", {"query": query, "max_results": 2}) 40 | logging.info(f"Result:\n{result.data}") 41 | assert "Search Results" in result.data 42 | logging.info("✅ search_papers test PASSED") 43 | 44 | # 3. Test get_paper_data 45 | logging.info("\n--- Testing get_paper_data ---") 46 | paper_id = "0808.3772" # A known paper 47 | logging.info(f"Calling get_paper_data with paper_id: '{paper_id}'") 48 | result = await client.call_tool("get_paper_data", {"paper_id": paper_id}) 49 | logging.info(f"Result:\n{result.data}") 50 | assert "A common mass scale for satellite galaxies of the Milky Way" in result.data 51 | logging.info("✅ get_paper_data test PASSED") 52 | 53 | # 4. Test list_categories 54 | logging.info("\n--- Testing list_categories ---") 55 | logging.info("Calling list_categories without a filter...") 56 | result = await client.call_tool("list_categories") 57 | logging.info(f"Result snippet:\n{result.data[:300]}...") 58 | assert "arXiv Categories" in result.data 59 | logging.info("✅ list_categories (no filter) test PASSED") 60 | 61 | logging.info("Calling list_categories with filter 'cs'...") 62 | result = await client.call_tool("list_categories", {"primary_category": "cs"}) 63 | logging.info(f"Result snippet:\n{result.data[:300]}...") 64 | assert "cs: Computer Science" in result.data 65 | assert "math: Mathematics" not in result.data 66 | logging.info("✅ list_categories (with filter) test PASSED") 67 | 68 | # 5. Test update_categories - This might take a moment 69 | logging.info("\n--- Testing update_categories ---") 70 | logging.info("Calling update_categories...") 71 | result = await client.call_tool("update_categories") 72 | logging.info(f"Result:\n{result.data}") 73 | assert "Successfully updated category taxonomy" in result.data 74 | logging.info("✅ update_categories test PASSED") 75 | 76 | except Exception as e: 77 | logging.error(f"An error occurred during testing: {e}", exc_info=True) 78 | finally: 79 | logging.info("\nTest run finished.") 80 | 81 | if __name__ == "__main__": 82 | asyncio.run(main()) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /DEPLOYMENT.md: -------------------------------------------------------------------------------- 1 | # Web Server Deployment Guide 2 | 3 | This guide explains how to run the network-hostable version of the `mcp-simple-arxiv` server. This version uses the MCP Streamable HTTP transport and is ideal for deployments where clients connect over a network. 4 | 5 | ## Overview 6 | 7 | The web server is a stateless service that exposes four tools for interacting with the arXiv API: 8 | - `search_papers`: Search for papers by keyword. 9 | - `get_paper_data`: Fetch detailed information for a specific paper ID. 10 | - `list_categories`: List the available arXiv subject categories. 11 | - `update_categories`: Refresh the locally cached category list from arXiv. 12 | 13 | It runs using FastMCP’s built-in web server (based on Uvicorn/Starlette). 14 | 15 | ## Local Development 16 | 17 | ### 1. Install Dependencies 18 | 19 | First, ensure you have a virtual environment set up and the project installed in editable mode: 20 | ```bash 21 | python3 -m venv venv 22 | source venv/bin/activate 23 | pip install -e . 24 | ``` 25 | 26 | ### 2. Run the Server 27 | 28 | The server can be run directly using its Python module: 29 | ```bash 30 | python -m mcp_simple_arxiv.web_server 31 | ``` 32 | This will start the server on `http://0.0.0.0:8000`. 33 | 34 | ### 3. Test the Server 35 | 36 | You can test the running server from your command line using `curl`. 37 | 38 | **List Tools Request:** 39 | ```bash 40 | curl -X POST http://127.0.0.1:8000/mcp/ \ 41 | -H "Content-Type: application/json" \ 42 | -H "Accept: application/json, text/event-stream" \ 43 | -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}' 44 | ``` 45 | 46 | **Tool Call Request (`search_papers`):** 47 | ```bash 48 | curl -X POST http://127.0.0.1:8000/mcp/ \ 49 | -H "Content-Type: application/json" \ 50 | -H "Accept: application/json, text/event-stream" \ 51 | -d '{"jsonrpc":"2.0","id":2,"method":"tools/call","params":{"name":"search_papers","arguments":{"query": "quantum computing"}}}' 52 | ``` 53 | 54 | ## Docker Deployment 55 | 56 | The project includes a `Dockerfile.web` for easy containerization. 57 | 58 | ### 1. Build the Docker Image 59 | ```bash 60 | docker build -f Dockerfile.web -t mcp-simple-arxiv:web . 61 | ``` 62 | 63 | ### 2. Run the Docker Container 64 | 65 | To run the container for use with a local reverse proxy (like Apache or Nginx), you should map the container’s port only to the host’s loopback interface: 66 | ```bash 67 | docker run -d -p 127.0.0.1:8000:8000 --name arxiv-web mcp-simple-arxiv:web 68 | ``` 69 | This command does two important things: 70 | 1. It runs the container in detached mode (`-d`). 71 | 2. It maps port 8000 inside the container to port 8000 on the host machine’s `localhost` interface only (`-p 127.0.0.1:8000:8000`). This ensures the server is not directly accessible from the network, which is the recommended setup when placing it behind a reverse proxy. 72 | 73 | For persistence, you can set the container to restart automatically: 74 | ```bash 75 | docker run -d --restart always -p 127.0.0.1:8000:8000 --name arxiv-web mcp-simple-arxiv:web 76 | ``` 77 | 78 | ### 3. Transferring the Image 79 | 80 | If you built the image on a different machine, you can package it for transfer: 81 | ```bash 82 | # On the source machine 83 | docker save -o mcp-simple-arxiv-web.tar mcp-simple-arxiv:web 84 | gzip mcp-simple-arxiv-web.tar 85 | 86 | # On the destination machine 87 | gunzip mcp-simple-arxiv-web.tar.gz 88 | docker load -i mcp-simple-arxiv-web.tar 89 | ``` 90 | 91 | ## Changing the Port 92 | 93 | The server is configured to run on port 8000 inside the container. To map this to a different host port, change the first value in the `-p` parameter. The format is `-p :`. 94 | ```bash 95 | # Map container's port 8000 to host's port 9001 96 | docker run -d -p 127.0.0.1:9001:8000 --name arxiv-web mcp-simple-arxiv:web 97 | ``` 98 | The server will now be accessible at `http://127.0.0.1:9001` on the host. 99 | 100 | ## Apache Reverse Proxy Configuration 101 | 102 | **Important**: MCP clients may request URLs with or without trailing slashes. Your Apache configuration must handle both cases to avoid 404 errors. 103 | 104 | Example Configuration: 105 | ```apache 106 | 107 | ServerName mcp.yourdomain.com 108 | 109 | # SSL Configuration (recommended for production) 110 | # SSLEngine on 111 | # SSLCertificateFile /path/to/cert.pem 112 | # SSLCertificateKeyFile /path/to/key.pem 113 | 114 | # Main proxy configuration 115 | 116 | ProxyPass http://127.0.0.1:8000/mcp/ 117 | ProxyPassReverse http://127.0.0.1:8000/mcp/ 118 | 119 | 120 | ``` 121 | This configuration will make your arXiv MCP server available at `https://mcp.yourdomain.com/arxiv`. -------------------------------------------------------------------------------- /mcp_simple_arxiv/taxonomy.json: -------------------------------------------------------------------------------- 1 | { 2 | "cs": { 3 | "name": "Computer Science", 4 | "subcategories": { 5 | "AI": "Artificial Intelligence", 6 | "CL": "Computation and Language", 7 | "CC": "Computational Complexity", 8 | "CE": "Computational Engineering, Finance, and Science", 9 | "CG": "Computational Geometry", 10 | "GT": "Computer Science and Game Theory", 11 | "CV": "Computer Vision and Pattern Recognition", 12 | "CY": "Computers and Society", 13 | "CR": "Cryptography and Security", 14 | "DB": "Databases", 15 | "DS": "Data Structures and Algorithms", 16 | "DL": "Digital Libraries", 17 | "DM": "Discrete Mathematics", 18 | "DC": "Distributed Computing", 19 | "ET": "Emerging Technologies", 20 | "FL": "Formal Languages and Automata Theory", 21 | "GL": "General Literature", 22 | "GR": "Graphics", 23 | "AR": "Hardware Architecture", 24 | "HC": "Human-Computer Interaction", 25 | "IR": "Information Retrieval", 26 | "IT": "Information Theory", 27 | "LG": "Machine Learning", 28 | "LO": "Logic in Computer Science", 29 | "MS": "Mathematical Software", 30 | "MA": "Multiagent Systems", 31 | "MM": "Multimedia", 32 | "NI": "Networking and Internet Architecture", 33 | "NE": "Neural and Evolutionary Computing", 34 | "NA": "Numerical Analysis", 35 | "OS": "Operating Systems", 36 | "OH": "Other Computer Science", 37 | "PF": "Performance", 38 | "PL": "Programming Languages", 39 | "RO": "Robotics", 40 | "SI": "Social and Information Networks", 41 | "SE": "Software Engineering", 42 | "SD": "Sound", 43 | "SC": "Symbolic Computation", 44 | "SY": "Systems and Control" 45 | } 46 | }, 47 | "econ": { 48 | "name": "Economics", 49 | "subcategories": { 50 | "EM": "Econometrics", 51 | "GN": "General Economics", 52 | "TH": "Theoretical Economics" 53 | } 54 | }, 55 | "eess": { 56 | "name": "Electrical Engineering and Systems Science", 57 | "subcategories": { 58 | "AS": "Audio and Speech Processing", 59 | "IV": "Image and Video Processing", 60 | "SP": "Signal Processing", 61 | "SY": "Systems and Control" 62 | } 63 | }, 64 | "math": { 65 | "name": "Mathematics", 66 | "subcategories": { 67 | "AG": "Algebraic Geometry", 68 | "AT": "Algebraic Topology", 69 | "AP": "Analysis of PDEs", 70 | "CT": "Category Theory", 71 | "CA": "Classical Analysis and ODEs", 72 | "CO": "Combinatorics", 73 | "AC": "Commutative Algebra", 74 | "CV": "Complex Variables", 75 | "DG": "Differential Geometry", 76 | "DS": "Dynamical Systems", 77 | "FA": "Functional Analysis", 78 | "GM": "General Mathematics", 79 | "GN": "General Topology", 80 | "GT": "Geometric Topology", 81 | "GR": "Group Theory", 82 | "HO": "History and Overview", 83 | "IT": "Information Theory", 84 | "KT": "K-Theory and Homology", 85 | "LO": "Logic", 86 | "MP": "Mathematical Physics", 87 | "MG": "Metric Geometry", 88 | "NT": "Number Theory", 89 | "NA": "Numerical Analysis", 90 | "OA": "Operator Algebras", 91 | "OC": "Optimization and Control", 92 | "PR": "Probability", 93 | "QA": "Quantum Algebra", 94 | "RT": "Representation Theory", 95 | "RA": "Rings and Algebras", 96 | "SP": "Spectral Theory", 97 | "ST": "Statistics Theory", 98 | "SG": "Symplectic Geometry" 99 | } 100 | }, 101 | "physics": { 102 | "name": "Physics", 103 | "subcategories": { 104 | "acc-ph": "Accelerator Physics", 105 | "ao-ph": "Atmospheric and Oceanic Physics", 106 | "atom-ph": "Atomic Physics", 107 | "bio-ph": "Biological Physics", 108 | "chem-ph": "Chemical Physics", 109 | "class-ph": "Classical Physics", 110 | "comp-ph": "Computational Physics", 111 | "data-an": "Data Analysis, Statistics and Probability", 112 | "flu-dyn": "Fluid Dynamics", 113 | "gen-ph": "General Physics", 114 | "geo-ph": "Geophysics", 115 | "hist-ph": "History and Philosophy of Physics", 116 | "ins-det": "Instrumentation and Detectors", 117 | "med-ph": "Medical Physics", 118 | "optics": "Optics", 119 | "ed-ph": "Physics Education", 120 | "soc-ph": "Physics and Society", 121 | "plasm-ph": "Plasma Physics", 122 | "pop-ph": "Popular Physics", 123 | "space-ph": "Space Physics" 124 | } 125 | }, 126 | "q-bio": { 127 | "name": "Quantitative Biology", 128 | "subcategories": { 129 | "BM": "Biomolecules", 130 | "CB": "Cell Behavior", 131 | "GN": "Genomics", 132 | "MN": "Molecular Networks", 133 | "NC": "Neurons and Cognition", 134 | "OT": "Other Quantitative Biology", 135 | "PE": "Populations and Evolution", 136 | "QM": "Quantitative Methods", 137 | "SC": "Subcellular Processes", 138 | "TO": "Tissues and Organs" 139 | } 140 | }, 141 | "q-fin": { 142 | "name": "Quantitative Finance", 143 | "subcategories": { 144 | "CP": "Computational Finance", 145 | "EC": "Economics", 146 | "GN": "General Finance", 147 | "MF": "Mathematical Finance", 148 | "PM": "Portfolio Management", 149 | "PR": "Pricing of Securities", 150 | "RM": "Risk Management", 151 | "ST": "Statistical Finance", 152 | "TR": "Trading and Market Microstructure" 153 | } 154 | }, 155 | "stat": { 156 | "name": "Statistics", 157 | "subcategories": { 158 | "AP": "Applications", 159 | "CO": "Computation", 160 | "ME": "Methodology", 161 | "ML": "Machine Learning", 162 | "OT": "Other Statistics", 163 | "TH": "Statistics Theory" 164 | } 165 | } 166 | } -------------------------------------------------------------------------------- /test_web_client.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import httpx 3 | import logging 4 | import subprocess 5 | import time 6 | import sys 7 | import signal 8 | 9 | # Configure logging 10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 11 | 12 | SERVER_URL = "http://127.0.0.1:8000/mcp/" 13 | HEADERS = { 14 | "Accept": "application/json, text/event-stream", 15 | "Content-Type": "application/json" 16 | } 17 | 18 | async def check_server_ready(client: httpx.AsyncClient): 19 | """Polls the server until it is ready to accept connections.""" 20 | for _ in range(20): # Poll for up to 10 seconds 21 | try: 22 | response = await client.post(SERVER_URL, json={"jsonrpc": "2.0", "id": 0, "method": "tools/list"}, headers=HEADERS) 23 | if response.status_code == 200: 24 | logging.info("Web server is up and running.") 25 | return True 26 | except httpx.ConnectError: 27 | pass 28 | await asyncio.sleep(0.5) 29 | logging.error("Web server did not start in time.") 30 | return False 31 | 32 | async def call_tool(client: httpx.AsyncClient, tool_name: str, params: dict = None) -> dict: 33 | """Helper function to call a tool via JSON-RPC.""" 34 | method = "tools/call" if tool_name != "tools/list" else "tools/list" 35 | 36 | payload = { 37 | "jsonrpc": "2.0", 38 | "id": 1, 39 | "method": method, 40 | } 41 | 42 | if method == "tools/call": 43 | payload["params"] = {"name": tool_name, "arguments": params or {}} 44 | 45 | response = await client.post(SERVER_URL, json=payload, headers=HEADERS) 46 | response.raise_for_status() 47 | # The response is Server-Sent Events, we need to parse it 48 | for line in response.text.strip().split('\n'): 49 | if line.startswith('data:'): 50 | import json 51 | return json.loads(line[len('data:'):].strip()) 52 | raise ValueError("Did not receive a valid data event from the server.") 53 | 54 | 55 | async def main(): 56 | """ 57 | Test client for the mcp-simple-arxiv web server. 58 | Starts the server, runs tests, and then stops it. 59 | """ 60 | server_process = None 61 | try: 62 | logging.info("Starting web server process...") 63 | # Start the server as a subprocess 64 | server_process = subprocess.Popen( 65 | [sys.executable, "-m", "mcp_simple_arxiv.web_server"], 66 | stdout=sys.stdout, 67 | stderr=sys.stderr 68 | ) 69 | 70 | async with httpx.AsyncClient(timeout=30.0) as client: 71 | if not await check_server_ready(client): 72 | raise RuntimeError("Could not connect to the web server.") 73 | 74 | # 1. List available tools 75 | logging.info("\n--- Testing tools/list ---") 76 | response_json = await call_tool(client, "tools/list") # Using call_tool to simplify logic 77 | tools = response_json['result']['tools'] 78 | logging.info(f"Found {len(tools)} tools.") 79 | assert len(tools) == 4 80 | logging.info("✅ tools/list test PASSED") 81 | 82 | # 2. Test search_papers 83 | logging.info("\n--- Testing search_papers ---") 84 | query = "dark matter" 85 | response_json = await call_tool(client, "search_papers", {"query": query, "max_results": 1}) 86 | result = response_json['result']['structuredContent']['result'] 87 | logging.info(f"Result for '{query}':\n{result}") 88 | assert "Search Results" in result 89 | logging.info("✅ search_papers test PASSED") 90 | 91 | # 3. Test get_paper_data 92 | logging.info("\n--- Testing get_paper_data ---") 93 | paper_id = "0808.3772" # Using the same ID as the stdio test for consistency 94 | response_json = await call_tool(client, "get_paper_data", {"paper_id": paper_id}) 95 | result = response_json['result']['structuredContent']['result'] 96 | logging.info(f"Result for paper '{paper_id}':\n{result}") 97 | assert "A common mass scale for satellite galaxies of the Milky Way" in result 98 | logging.info("✅ get_paper_data test PASSED") 99 | 100 | # 4. Test list_categories 101 | logging.info("\n--- Testing list_categories ---") 102 | response_json = await call_tool(client, "list_categories") 103 | result = response_json['result']['structuredContent']['result'] 104 | logging.info("Result snippet:\n" + result[:200] + "...") 105 | assert "arXiv Categories" in result 106 | logging.info("✅ list_categories test PASSED") 107 | 108 | # 5. Test update_categories 109 | logging.info("\n--- Testing update_categories ---") 110 | response_json = await call_tool(client, "update_categories") 111 | result = response_json['result']['structuredContent']['result'] 112 | logging.info(f"Result:\n{result}") 113 | assert "Successfully updated category taxonomy" in result 114 | logging.info("✅ update_categories test PASSED") 115 | 116 | except Exception as e: 117 | logging.error(f"An error occurred during testing: {e}", exc_info=True) 118 | finally: 119 | if server_process: 120 | logging.info("\nStopping web server process...") 121 | server_process.send_signal(signal.SIGINT) # Send Ctrl+C 122 | try: 123 | server_process.wait(timeout=10) 124 | logging.info("Web server stopped gracefully.") 125 | except subprocess.TimeoutExpired: 126 | logging.warning("Web server did not stop gracefully, killing.") 127 | server_process.kill() 128 | logging.info("Test run finished.") 129 | 130 | if __name__ == "__main__": 131 | asyncio.run(main()) -------------------------------------------------------------------------------- /mcp_simple_arxiv/categories.py: -------------------------------------------------------------------------------- 1 | """ 2 | arXiv category taxonomy. 3 | """ 4 | 5 | CATEGORIES = { 6 | "cs": { 7 | "name": "Computer Science", 8 | "subcategories": { 9 | "AI": "Artificial Intelligence", 10 | "CL": "Computation and Language", 11 | "CC": "Computational Complexity", 12 | "CE": "Computational Engineering, Finance, and Science", 13 | "CG": "Computational Geometry", 14 | "GT": "Computer Science and Game Theory", 15 | "CV": "Computer Vision and Pattern Recognition", 16 | "CY": "Computers and Society", 17 | "CR": "Cryptography and Security", 18 | "DB": "Databases", 19 | "DS": "Data Structures and Algorithms", 20 | "DL": "Digital Libraries", 21 | "DM": "Discrete Mathematics", 22 | "DC": "Distributed Computing", 23 | "ET": "Emerging Technologies", 24 | "FL": "Formal Languages and Automata Theory", 25 | "GL": "General Literature", 26 | "GR": "Graphics", 27 | "AR": "Hardware Architecture", 28 | "HC": "Human-Computer Interaction", 29 | "IR": "Information Retrieval", 30 | "IT": "Information Theory", 31 | "LG": "Machine Learning", 32 | "LO": "Logic in Computer Science", 33 | "MS": "Mathematical Software", 34 | "MA": "Multiagent Systems", 35 | "MM": "Multimedia", 36 | "NI": "Networking and Internet Architecture", 37 | "NE": "Neural and Evolutionary Computing", 38 | "NA": "Numerical Analysis", 39 | "OS": "Operating Systems", 40 | "OH": "Other Computer Science", 41 | "PF": "Performance", 42 | "PL": "Programming Languages", 43 | "RO": "Robotics", 44 | "SI": "Social and Information Networks", 45 | "SE": "Software Engineering", 46 | "SD": "Sound", 47 | "SC": "Symbolic Computation", 48 | "SY": "Systems and Control", 49 | } 50 | }, 51 | "econ": { 52 | "name": "Economics", 53 | "subcategories": { 54 | "EM": "Econometrics", 55 | "GN": "General Economics", 56 | "TH": "Theoretical Economics", 57 | } 58 | }, 59 | "eess": { 60 | "name": "Electrical Engineering and Systems Science", 61 | "subcategories": { 62 | "AS": "Audio and Speech Processing", 63 | "IV": "Image and Video Processing", 64 | "SP": "Signal Processing", 65 | "SY": "Systems and Control", 66 | } 67 | }, 68 | "math": { 69 | "name": "Mathematics", 70 | "subcategories": { 71 | "AG": "Algebraic Geometry", 72 | "AT": "Algebraic Topology", 73 | "AP": "Analysis of PDEs", 74 | "CT": "Category Theory", 75 | "CA": "Classical Analysis and ODEs", 76 | "CO": "Combinatorics", 77 | "AC": "Commutative Algebra", 78 | "CV": "Complex Variables", 79 | "DG": "Differential Geometry", 80 | "DS": "Dynamical Systems", 81 | "FA": "Functional Analysis", 82 | "GM": "General Mathematics", 83 | "GN": "General Topology", 84 | "GT": "Geometric Topology", 85 | "GR": "Group Theory", 86 | "HO": "History and Overview", 87 | "IT": "Information Theory", 88 | "KT": "K-Theory and Homology", 89 | "LO": "Logic", 90 | "MP": "Mathematical Physics", 91 | "MG": "Metric Geometry", 92 | "NT": "Number Theory", 93 | "NA": "Numerical Analysis", 94 | "OA": "Operator Algebras", 95 | "OC": "Optimization and Control", 96 | "PR": "Probability", 97 | "QA": "Quantum Algebra", 98 | "RT": "Representation Theory", 99 | "RA": "Rings and Algebras", 100 | "SP": "Spectral Theory", 101 | "ST": "Statistics Theory", 102 | "SG": "Symplectic Geometry", 103 | } 104 | }, 105 | "physics": { 106 | "name": "Physics", 107 | "subcategories": { 108 | "acc-ph": "Accelerator Physics", 109 | "ao-ph": "Atmospheric and Oceanic Physics", 110 | "atom-ph": "Atomic Physics", 111 | "bio-ph": "Biological Physics", 112 | "chem-ph": "Chemical Physics", 113 | "class-ph": "Classical Physics", 114 | "comp-ph": "Computational Physics", 115 | "data-an": "Data Analysis, Statistics and Probability", 116 | "flu-dyn": "Fluid Dynamics", 117 | "gen-ph": "General Physics", 118 | "geo-ph": "Geophysics", 119 | "hist-ph": "History and Philosophy of Physics", 120 | "ins-det": "Instrumentation and Detectors", 121 | "med-ph": "Medical Physics", 122 | "optics": "Optics", 123 | "ed-ph": "Physics Education", 124 | "soc-ph": "Physics and Society", 125 | "plasm-ph": "Plasma Physics", 126 | "pop-ph": "Popular Physics", 127 | "space-ph": "Space Physics", 128 | } 129 | }, 130 | "q-bio": { 131 | "name": "Quantitative Biology", 132 | "subcategories": { 133 | "BM": "Biomolecules", 134 | "CB": "Cell Behavior", 135 | "GN": "Genomics", 136 | "MN": "Molecular Networks", 137 | "NC": "Neurons and Cognition", 138 | "OT": "Other Quantitative Biology", 139 | "PE": "Populations and Evolution", 140 | "QM": "Quantitative Methods", 141 | "SC": "Subcellular Processes", 142 | "TO": "Tissues and Organs", 143 | } 144 | }, 145 | "q-fin": { 146 | "name": "Quantitative Finance", 147 | "subcategories": { 148 | "CP": "Computational Finance", 149 | "EC": "Economics", 150 | "GN": "General Finance", 151 | "MF": "Mathematical Finance", 152 | "PM": "Portfolio Management", 153 | "PR": "Pricing of Securities", 154 | "RM": "Risk Management", 155 | "ST": "Statistical Finance", 156 | "TR": "Trading and Market Microstructure", 157 | } 158 | }, 159 | "stat": { 160 | "name": "Statistics", 161 | "subcategories": { 162 | "AP": "Applications", 163 | "CO": "Computation", 164 | "ME": "Methodology", 165 | "ML": "Machine Learning", 166 | "OT": "Other Statistics", 167 | "TH": "Statistics Theory", 168 | } 169 | }, 170 | } -------------------------------------------------------------------------------- /mcp_simple_arxiv/server.py: -------------------------------------------------------------------------------- 1 | """ 2 | MCP server for accessing arXiv papers. 3 | """ 4 | 5 | import sys 6 | sys.stdout.reconfigure(encoding='utf-8') 7 | sys.stdin.reconfigure(encoding='utf-8') 8 | 9 | import asyncio 10 | import logging 11 | 12 | from fastmcp import FastMCP 13 | 14 | from .arxiv_client import ArxivClient 15 | from .update_taxonomy import load_taxonomy, update_taxonomy_file 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | def get_first_sentence(text: str, max_len: int = 200) -> str: 20 | """Extract first sentence from text, limiting length.""" 21 | # Look for common sentence endings 22 | for end in ['. ', '! ', '? ']: 23 | pos = text.find(end) 24 | if pos != -1 and pos < max_len: 25 | return text[:pos + 1] 26 | # If no sentence ending found, just take first max_len chars 27 | if len(text) > max_len: 28 | return text[:max_len].rstrip() + '...' 29 | return text 30 | 31 | def create_app(): 32 | """Creates and configures the FastMCP app instance and its tools.""" 33 | app = FastMCP("arxiv-server") 34 | arxiv_client = ArxivClient() 35 | 36 | @app.tool( 37 | annotations={ 38 | "title": "Search arXiv Papers", 39 | "readOnlyHint": True, 40 | "openWorldHint": True 41 | } 42 | ) 43 | async def search_papers(query: str, max_results: int = 10) -> str: 44 | """ 45 | Search for papers on arXiv by title and abstract content. 46 | 47 | You can use advanced search syntax: 48 | - Search in title: ti:"search terms" 49 | - Search in abstract: abs:"search terms" 50 | - Search by author: au:"author name" 51 | - Combine terms with: AND, OR, ANDNOT 52 | - Filter by category: cat:cs.AI (use list_categories tool to see available categories) 53 | 54 | Examples: 55 | - "machine learning" (searches all fields) 56 | - ti:"neural networks" AND cat:cs.AI (title with category) 57 | - au:bengio AND ti:"deep learning" (author and title) 58 | """ 59 | max_results = min(max_results, 50) 60 | papers = await arxiv_client.search(query, max_results) 61 | 62 | # Format results in a readable way 63 | result = "Search Results:\n\n" 64 | for i, paper in enumerate(papers, 1): 65 | result += f"{i}. {paper['title']}\n" 66 | result += f" Authors: {', '.join(paper['authors'])}\n" 67 | result += f" ID: {paper['id']}\n" 68 | result += f" Categories: " 69 | if paper['primary_category']: 70 | result += f"Primary: {paper['primary_category']}" 71 | if paper['categories']: 72 | result += f", Additional: {', '.join(paper['categories'])}" 73 | result += f"\n Published: {paper['published']}\n" 74 | 75 | # Add first sentence of abstract 76 | abstract_preview = get_first_sentence(paper['summary']) 77 | result += f" Preview: {abstract_preview}\n" 78 | result += "\n" 79 | 80 | return result 81 | 82 | @app.tool( 83 | annotations={ 84 | "title": "Get arXiv Paper Data", 85 | "readOnlyHint": True, 86 | "openWorldHint": True 87 | } 88 | ) 89 | async def get_paper_data(paper_id: str) -> str: 90 | """Get detailed information about a specific paper including abstract and available formats.""" 91 | paper = await arxiv_client.get_paper(paper_id) 92 | 93 | # Format paper details in a readable way with clear sections 94 | result = f"Title: {paper['title']}\n\n" 95 | 96 | # Metadata section 97 | result += "Metadata:\n" 98 | result += f"- Authors: {', '.join(paper['authors'])}\n" 99 | result += f"- Published: {paper['published']}\n" 100 | result += f"- Last Updated: {paper['updated']}\n" 101 | result += "- Categories: " 102 | if paper['primary_category']: 103 | result += f"Primary: {paper['primary_category']}" 104 | if paper['categories']: 105 | result += f", Additional: {', '.join(paper['categories'])}" 106 | result += "\n" 107 | 108 | if paper['doi']: 109 | result += f"- DOI: {paper['doi']}\n" 110 | if paper["journal_ref"]: 111 | result += f"- Journal Reference: {paper['journal_ref']}\n" 112 | 113 | # Abstract section 114 | result += "\nAbstract:\n" 115 | result += paper["summary"] 116 | result += "\n" 117 | 118 | # Access options section 119 | result += "\nAccess Options:\n" 120 | result += "- Abstract page: " + paper["abstract_url"] + "\n" 121 | if paper["html_url"]: # Add HTML version if available 122 | result += "- Full text HTML version: " + paper["html_url"] + "\n" 123 | result += "- PDF version: " + paper["pdf_url"] + "\n" 124 | 125 | # Additional information section 126 | if paper["comment"] or "code" in paper["comment"].lower(): 127 | result += "\nAdditional Information:\n" 128 | if paper["comment"]: 129 | result += "- Comment: " + paper["comment"] + "\n" 130 | 131 | return result 132 | 133 | @app.tool( 134 | annotations={ 135 | "title": "List arXiv Categories", 136 | "readOnlyHint": True, 137 | "openWorldHint": False 138 | } 139 | ) 140 | def list_categories(primary_category: str = None) -> str: 141 | """List all available arXiv categories and how to use them in search.""" 142 | try: 143 | taxonomy = load_taxonomy() 144 | except Exception as e: 145 | logger.error(f"Error loading taxonomy: {e}") 146 | return f"Error loading category taxonomy. Try using update_categories tool to refresh it." 147 | 148 | result = "arXiv Categories:\n\n" 149 | 150 | for primary, data in taxonomy.items(): 151 | if primary_category and primary != primary_category: 152 | continue 153 | 154 | result += f"{primary}: {data['name']}\n" 155 | for code, desc in data['subcategories'].items(): 156 | result += f" {primary}.{code}: {desc}\n" 157 | result += "\n" 158 | 159 | result += "\nUsage in search:\n" 160 | result += '- Search in specific category: cat:cs.AI\n' 161 | result += '- Combine with other terms: "neural networks" AND cat:cs.AI\n' 162 | result += '- Multiple categories: (cat:cs.AI OR cat:cs.LG)\n' 163 | result += '\nNote: If categories seem outdated, use the update_categories tool to refresh them.\n' 164 | 165 | return result 166 | 167 | @app.tool( 168 | annotations={ 169 | "title": "Update arXiv Categories", 170 | "readOnlyHint": False, 171 | "openWorldHint": True 172 | } 173 | ) 174 | def update_categories() -> str: 175 | """Update the stored category taxonomy by fetching the latest version from arxiv.org""" 176 | try: 177 | taxonomy = update_taxonomy_file() 178 | result = "Successfully updated category taxonomy.\n\n" 179 | result += f"Found {len(taxonomy)} primary categories:\n" 180 | for primary, data in taxonomy.items(): 181 | result += f"- {primary}: {data['name']} ({len(data['subcategories'])} subcategories)\n" 182 | return result 183 | except Exception as e: 184 | logger.error(f"Error updating taxonomy: {e}") 185 | # FastMCP will handle raising this as a proper JSON-RPC error 186 | raise e 187 | 188 | return app 189 | 190 | app = create_app() 191 | 192 | def main(): 193 | """Run the MCP server.""" 194 | app.run() 195 | 196 | if __name__ == "__main__": 197 | logging.basicConfig(level=logging.INFO) 198 | main() -------------------------------------------------------------------------------- /mcp_simple_arxiv/arxiv_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | arXiv API client with rate limiting. 3 | """ 4 | 5 | import asyncio 6 | import logging 7 | from datetime import datetime, timedelta 8 | import feedparser 9 | import httpx 10 | from typing import Optional, Dict, List, Any 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class ArxivClient: 15 | """ 16 | arXiv API client with built-in rate limiting. 17 | Ensures no more than 1 request every 3 seconds. 18 | """ 19 | 20 | def __init__(self): 21 | self.base_url = "https://export.arxiv.org/api/query" 22 | self._last_request: Optional[datetime] = None 23 | self._lock = asyncio.Lock() 24 | 25 | async def _wait_for_rate_limit(self) -> None: 26 | """Ensures we respect arXiv's rate limit of 1 request every 3 seconds.""" 27 | async with self._lock: 28 | if self._last_request is not None: 29 | elapsed = datetime.now() - self._last_request 30 | if elapsed < timedelta(seconds=3): 31 | await asyncio.sleep(3 - elapsed.total_seconds()) 32 | self._last_request = datetime.now() 33 | 34 | def _clean_text(self, text: str) -> str: 35 | """Clean up text by removing extra whitespace and newlines.""" 36 | return " ".join(text.split()) 37 | 38 | def _get_html_url(self, arxiv_id: str) -> str: 39 | """ 40 | Construct HTML version URL for a paper. 41 | 42 | The HTML version URL is not provided by the API but can be constructed 43 | by modifying the PDF URL pattern. 44 | """ 45 | # Remove version suffix if present (e.g., v1, v2) 46 | base_id = arxiv_id.split('v')[0] 47 | return f"https://arxiv.org/html/{arxiv_id}" 48 | 49 | def _parse_entry(self, entry: Dict[str, Any]) -> Dict[str, Any]: 50 | """Parse a feed entry into a paper dictionary.""" 51 | # Extract PDF and HTML links 52 | pdf_url = None 53 | abstract_url = None # This is the URL to the abstract page 54 | for link in entry.get('links', []): 55 | if isinstance(link, dict): 56 | if link.get('type') == 'application/pdf': 57 | pdf_url = link.get('href') 58 | elif link.get('type') == 'text/html': 59 | abstract_url = link.get('href') 60 | 61 | # Get paper ID 62 | paper_id = entry.get('id', '').split("/abs/")[-1].rstrip() 63 | 64 | # Create HTML version URL 65 | html_url = self._get_html_url(paper_id) if paper_id else None 66 | 67 | # Get authors 68 | authors = [] 69 | for author in entry.get('authors', []): 70 | if isinstance(author, dict) and 'name' in author: 71 | authors.append(author['name']) 72 | elif hasattr(author, 'name'): 73 | authors.append(author.name) 74 | 75 | # Get categories 76 | categories = [] 77 | primary_category = None 78 | 79 | # Get primary category 80 | if 'arxiv_primary_category' in entry: 81 | if isinstance(entry['arxiv_primary_category'], dict): 82 | primary_category = entry['arxiv_primary_category'].get('term') 83 | elif hasattr(entry['arxiv_primary_category'], 'term'): 84 | primary_category = entry['arxiv_primary_category'].term 85 | 86 | # Get all categories 87 | for category in entry.get('tags', []): 88 | if isinstance(category, dict) and 'term' in category: 89 | categories.append(category['term']) 90 | elif hasattr(category, 'term'): 91 | categories.append(category.term) 92 | 93 | # Remove primary category from regular categories if it's there 94 | if primary_category and primary_category in categories: 95 | categories.remove(primary_category) 96 | 97 | return { 98 | "id": paper_id, 99 | "title": self._clean_text(entry.get('title', '')), 100 | "authors": authors, 101 | "primary_category": primary_category, 102 | "categories": categories, 103 | "published": entry.get('published', ''), 104 | "updated": entry.get('updated', ''), 105 | "summary": self._clean_text(entry.get('summary', '')), 106 | "comment": self._clean_text(entry.get('arxiv_comment', '')), 107 | "journal_ref": entry.get('arxiv_journal_ref', ''), 108 | "doi": entry.get('arxiv_doi', ''), 109 | "pdf_url": pdf_url, 110 | "abstract_url": abstract_url, # URL to abstract page 111 | "html_url": html_url # URL to HTML version if available 112 | } 113 | 114 | async def search(self, query: str, max_results: int = 10) -> List[Dict[str, Any]]: 115 | """ 116 | Search arXiv papers. 117 | 118 | The query string supports arXiv's advanced search syntax: 119 | - Search in title: ti:"search terms" 120 | - Search in abstract: abs:"search terms" 121 | - Search by author: au:"author name" 122 | - Combine terms with: AND, OR, ANDNOT 123 | - Filter by category: cat:cs.AI 124 | 125 | Examples: 126 | - "machine learning" (searches all fields) 127 | - ti:"neural networks" AND cat:cs.AI (title with category) 128 | - au:bengio AND ti:"deep learning" (author and title) 129 | """ 130 | await self._wait_for_rate_limit() 131 | 132 | # Ensure max_results is within API limits 133 | max_results = min(max_results, 2000) # API limit: 2000 per request 134 | 135 | params = { 136 | "search_query": query, 137 | "max_results": max_results, 138 | "sortBy": "submittedDate", # Default to newest papers first 139 | "sortOrder": "descending", 140 | } 141 | 142 | async with httpx.AsyncClient(timeout=20.0) as client: 143 | try: 144 | response = await client.get(self.base_url, params=params) 145 | response.raise_for_status() # Raise an exception for bad status codes 146 | 147 | # Parse the Atom feed response 148 | feed = feedparser.parse(response.text) 149 | 150 | if not isinstance(feed, dict) or 'entries' not in feed: 151 | logger.error("Invalid response from arXiv API") 152 | logger.debug(f"Response text: {response.text[:1000]}...") 153 | raise ValueError("Invalid response from arXiv API") 154 | 155 | if not feed.get('entries'): 156 | # Empty results are ok - return empty list 157 | return [] 158 | 159 | return [self._parse_entry(entry) for entry in feed.entries] 160 | 161 | except httpx.HTTPError as e: 162 | logger.error(f"HTTP error while searching: {e}") 163 | raise ValueError(f"arXiv API HTTP error: {str(e)}") 164 | 165 | async def get_paper(self, paper_id: str) -> Dict[str, Any]: 166 | """ 167 | Get detailed information about a specific paper. 168 | 169 | Args: 170 | paper_id: arXiv paper ID (e.g., "2103.08220") 171 | 172 | Returns: 173 | Dictionary containing paper metadata, including: 174 | - Basic metadata (title, authors, dates) 175 | - Categories (primary and others) 176 | - Abstract and comments 177 | - URLs (abstract page, PDF version, HTML version if available) 178 | - DOI if available 179 | """ 180 | await self._wait_for_rate_limit() 181 | 182 | params = { 183 | "id_list": paper_id, 184 | "max_results": 1 185 | } 186 | 187 | async with httpx.AsyncClient(timeout=20.0) as client: 188 | try: 189 | response = await client.get(self.base_url, params=params) 190 | response.raise_for_status() 191 | 192 | feed = feedparser.parse(response.text) 193 | if not isinstance(feed, dict) or 'entries' not in feed: 194 | logger.error("Invalid response from arXiv API") 195 | logger.debug(f"Response text: {response.text[:1000]}...") 196 | raise ValueError("Invalid response from arXiv API") 197 | 198 | if not feed.get('entries'): 199 | raise ValueError(f"Paper not found: {paper_id}") 200 | 201 | return self._parse_entry(feed.entries[0]) 202 | 203 | except httpx.HTTPError as e: 204 | logger.error(f"HTTP error while fetching paper: {e}") 205 | raise ValueError(f"arXiv API HTTP error: {str(e)}") 206 | --------------------------------------------------------------------------------