├── MANIFEST.in
├── .github
└── funding.yml
├── mcp_simple_arxiv
├── __main__.py
├── __init__.py
├── update_taxonomy.py
├── taxonomy.json
├── categories.py
├── server.py
└── arxiv_client.py
├── smithery.yaml
├── Dockerfile.web
├── Dockerfile
├── pyproject.toml
├── LICENSE
├── TODO.md
├── README.md
├── test_client.py
├── .gitignore
├── DEPLOYMENT.md
└── test_web_client.py
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include mcp_simple_arxiv/taxonomy.json
--------------------------------------------------------------------------------
/.github/funding.yml:
--------------------------------------------------------------------------------
1 | # If you find this useful you can support this by using the funding links below
2 | #
3 | github: andybrandt
4 | buy_me_a_coffee: andybrandt
5 |
--------------------------------------------------------------------------------
/mcp_simple_arxiv/__main__.py:
--------------------------------------------------------------------------------
1 | """
2 | Main entry point for the MCP server.
3 | """
4 |
5 | from .server import main
6 |
# Start the server only when the package is executed
# (``python -m mcp_simple_arxiv``), never as a side effect of an import.
if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/mcp_simple_arxiv/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | MCP server providing access to arXiv papers through their API.
3 | """
4 | import asyncio
5 | from .server import main as server_main
6 |
# Keep in sync with the ``version`` field in pyproject.toml.
__version__ = "0.3.0"
8 |
def main():
    """Main entry point for the package.

    Calls the server's ``main`` and, only if that call hands back a
    coroutine, drives it to completion with ``asyncio.run``.

    ``server.main`` may be either a plain function (which runs the server
    itself and returns when it stops) or an ``async def``. Passing the
    ``None`` returned by a plain function to ``asyncio.run`` would raise
    ``ValueError``, so the result is checked first.
    """
    result = server_main()
    if asyncio.iscoroutine(result):
        asyncio.run(result)
--------------------------------------------------------------------------------
/smithery.yaml:
--------------------------------------------------------------------------------
1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml
2 |
3 | startCommand:
4 | type: stdio
5 | configSchema:
6 | # JSON Schema defining the configuration options for the MCP.
7 | type: object
8 | required: []
9 | properties: {}
10 | commandFunction:
11 | # A function that produces the CLI command to start the MCP on stdio.
12 | |-
13 | config => ({command: 'python', args: ['-m', 'mcp_simple_arxiv']})
--------------------------------------------------------------------------------
/Dockerfile.web:
--------------------------------------------------------------------------------
1 | # Use an official Python runtime as a parent image
2 | FROM python:3.11-slim
3 |
4 | # Set the working directory in the container
5 | WORKDIR /usr/src/app
6 |
7 | # Copy the entire project into the container
8 | COPY . .
9 |
10 | # Install the project dependencies
11 | # Using --no-cache-dir is a good practice for keeping image size down
12 | RUN pip install --no-cache-dir .
13 |
14 | # Expose the port the app runs on
15 | EXPOSE 8000
16 |
17 | # Run the web server when the container launches
18 | CMD ["python", "-m", "mcp_simple_arxiv.web_server"]
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile
2 | # Use an official Python runtime as a parent image
3 | FROM python:3.10-slim
4 |
5 | # Set the working directory in the container
6 | WORKDIR /app
7 |
8 | # Copy the current directory contents into the container at /app
9 | ADD . /app
10 |
11 | # Install the project and its dependencies as declared in pyproject.toml
12 | RUN pip install --no-cache-dir .
13 |
14 | # Make port 80 available to the world outside this container
15 | EXPOSE 80
16 |
17 | # Run mcp_simple_arxiv when the container launches
18 | ENTRYPOINT ["python", "-m", "mcp_simple_arxiv"]
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "mcp-simple-arxiv"
7 | version = "0.3.0"
8 | description = "An MCP server that provides access to arXiv papers through their API."
9 | readme = "README.md"
10 | authors = [
11 | {name = "Andy Brandt", email = "andy@codesprinters.com"}
12 | ]
13 | requires-python = ">=3.10"
14 | license = "MIT"
15 | classifiers = [
16 | "Programming Language :: Python :: 3",
17 | "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 | "fastmcp",
21 | "feedparser", # for parsing arXiv API responses
22 | "httpx", # for making HTTP requests with async support
23 | "beautifulsoup4", # for parsing arXiv taxonomy page
24 | ]
25 |
26 | [project.urls]
27 | Homepage = "https://github.com/andybrandt/mcp-simple-arxiv"
28 |
29 | [project.scripts]
30 | mcp-simple-arxiv = "mcp_simple_arxiv:main"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Andy Brandt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/mcp_simple_arxiv/update_taxonomy.py:
--------------------------------------------------------------------------------
1 | """
2 | Script to verify and optionally update arXiv category taxonomy.
3 | """
4 |
5 | import json
6 | from pathlib import Path
7 | from .categories import CATEGORIES
8 |
9 | TAXONOMY_FILE = Path(__file__).parent / "taxonomy.json"
10 |
def update_taxonomy_file():
    """
    Write the built-in ``CATEGORIES`` mapping to ``TAXONOMY_FILE`` as JSON.

    An existing file is overwritten. Progress messages are printed to stdout.
    Returns the ``CATEGORIES`` object that was serialised.
    """
    print(f"Creating taxonomy file at {TAXONOMY_FILE}...")
    # Serialise first, then write the whole document in one call.
    serialized = json.dumps(CATEGORIES, indent=2, ensure_ascii=False)
    TAXONOMY_FILE.write_text(serialized, encoding='utf-8')
    print("Done!")
    return CATEGORIES
21 |
def load_taxonomy() -> dict:
    """
    Return the taxonomy stored in ``TAXONOMY_FILE``.

    If the file is missing, it is generated by ``update_taxonomy_file()``
    and that function's return value is handed back instead.
    """
    if TAXONOMY_FILE.exists():
        print(f"Loading taxonomy from {TAXONOMY_FILE}")
        return json.loads(TAXONOMY_FILE.read_text(encoding='utf-8'))

    # No file on disk yet: build it from the built-in categories.
    print(f"Taxonomy file not found at {TAXONOMY_FILE}, creating it...")
    return update_taxonomy_file()
34 |
# Script mode: regenerate taxonomy.json from the built-in categories and print
# a one-line summary per primary category.
# NOTE: this module uses a relative import (``from .categories import ...``),
# so it has to be run as a module (``python -m mcp_simple_arxiv.update_taxonomy``)
# rather than as a bare file path.
if __name__ == "__main__":
    # When run directly, create/update the taxonomy file
    print("Creating taxonomy file from built-in categories...")
    taxonomy = update_taxonomy_file()
    print(f"\nCreated taxonomy with {len(taxonomy)} primary categories:")
    for primary, data in taxonomy.items():
        print(f"- {primary}: {data['name']} ({len(data['subcategories'])} subcategories)")
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | # Planned Features and Improvements
2 |
3 | 1. **Total Match Count in Search Results**
4 | - Add total number of matches found to search results
5 | - This helps users and AIs determine if search needs refinement
6 | - Should handle cases where there are more results than displayed
7 | - Example: "Found 1234 matches, showing first 10"
8 |
9 | 2. **Categories in Search Results**
10 | - Show primary category and all subcategories for each paper in search results
11 | - Makes it easier to quickly assess paper's field and relevance
12 | - Categories should be clearly labeled (primary vs additional)
13 | - Example: "Primary: cs.AI, Additional: cs.LG, cs.CL"
14 |
15 | 3. **Abstract Preview in Search Results**
16 | - Add first 2-3 sentences of abstract to each search result
17 | - Helps quickly assess paper relevance without fetching full details
18 | - Should handle varying abstract lengths gracefully
19 | - Should end with ellipsis (...) if truncated
20 |
21 | 4. **Flexible Search Result Sorting**
22 | - Add support for different sorting options in search
23 | - Support sorting by: submission date, last update date, relevance
24 | - Make sort order configurable (ascending/descending)
25 | - Expose sorting options in tool description
26 |
27 | 5. **Date Range Filters**
28 | - Allow filtering papers by submission/update date range
29 | - Support both absolute dates and relative ranges (last week/month/year)
30 | - Implement using arXiv API's date filtering capabilities
31 |
32 | 6. **DOI Integration** [DONE]
33 | - Add DOI (Digital Object Identifier) to paper details when available
34 | - Extract from arXiv API response
35 | - Include DOI URL for easy access
36 |
37 | 7. **Enhanced Category Presentation**
38 | - Improve how categories are displayed in paper details
39 | - Clearly distinguish primary and secondary categories
40 | - Include category descriptions where helpful
41 | - Group related categories together
42 |
43 | 8. **Advanced Category Search**
44 | - Support complex category combinations in search
45 | - Allow AND/OR/NOT operations between categories
46 | - Support parentheses for grouping
47 | - Example: "(cs.AI OR cs.LG) AND NOT cs.DB"
48 |
49 | 9. **Citation Format Support**
50 | - Generate citation strings in common formats (BibTeX, APA, etc.)
51 | - Include all necessary metadata (authors, title, arXiv ID, etc.)
52 | - Handle special characters in titles and names correctly
53 |
54 | 10. **Impact Metrics**
55 | - Add citation count or other impact metrics if available
56 | - Consider alternative metrics like downloads or social media mentions
57 | - NOTE: Might require integration with additional APIs
58 |
59 | 11. **HTML Paper Access** [DONE]
60 | - Add detection of HTML version availability
61 | - Include HTML URL in paper metadata when available
62 | - Add URL construction logic (changing PDF URL to HTML)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mcp-simple-arxiv
2 |
3 | [Archestra MCP catalog entry](https://archestra.ai/mcp-catalog/andybrandt__mcp-simple-arxiv)
4 | [Smithery listing](https://smithery.ai/server/mcp-simple-arxiv)
5 | [MseeP.ai listing](https://mseep.ai/app/dc95dba9-149a-4eaa-bf08-36e0cb0f3a5a)
6 |
7 | An MCP server that provides access to arXiv papers through their API.
8 |
9 |
10 |
11 | ## Features
12 |
13 | This server allows LLM clients (like Claude Desktop) to:
14 | - Search for scientific papers on arXiv by title and abstract content
15 | - Get paper metadata and abstracts
16 | - Access links to available paper formats (PDF/HTML)
17 |
18 | To use the hosted web version, add this connector URL to Claude.ai: https://mcp.andybrandt.net/arxiv
19 | You can also install & use it locally.
20 |
21 | ## Installation and Deployment
22 |
23 | This server can be run in two modes: as a local `stdio` server for desktop clients or as a network-accessible web server.
24 |
25 | ### Installing via Smithery
26 |
27 | To install Simple Arxiv for Claude Desktop automatically via [Smithery](https://smithery.ai/server/mcp-simple-arxiv):
28 |
29 | ```bash
30 | npx -y @smithery/cli install mcp-simple-arxiv --client claude
31 | ```
32 |
33 | ### Manual Installation
34 | ```bash
35 | pip install mcp-simple-arxiv
36 | ```
37 |
38 | ## Usage with Claude Desktop
39 |
40 | Add this configuration to your `claude_desktop_config.json`:
41 |
42 | (Mac OS)
43 |
44 | ```json
45 | {
46 | "mcpServers": {
47 | "simple-arxiv": {
48 | "command": "python",
49 | "args": ["-m", "mcp_simple_arxiv"]
50 | }
51 | }
52 | }
53 | ```
54 |
55 | (Windows version):
56 |
57 | ```json
58 | {
59 | "mcpServers": {
60 | "simple-arxiv": {
61 | "command": "C:\\Users\\YOUR_USERNAME\\AppData\\Local\\Programs\\Python\\Python311\\python.exe",
62 | "args": [
63 | "-m",
64 | "mcp_simple_arxiv"
65 | ]
66 | }
67 | }
68 | }
69 | ```
70 |
71 | After restarting Claude Desktop, the following capabilities will be available:
72 |
73 | ### Searching Papers
74 |
75 | You can ask Claude to search for papers using queries like:
76 | ```
77 | Can you search arXiv for recent papers about large language models?
78 | ```
79 |
80 | The search will return basic information about matching papers including:
81 | - Paper title
82 | - Authors
83 | - arXiv ID
84 | - Publication date
85 |
86 | ### Getting Paper Details
87 |
88 | Once you have a paper ID, you can ask for more details:
89 | ```
90 | Can you show me the details for paper 2103.08220?
91 | ```
92 |
93 | This will return:
94 | - Full paper title
95 | - Authors
96 | - Publication and update dates
97 | - Journal reference (if available)
98 | - Paper abstract
99 | - Links to available formats (PDF/HTML)
100 |
101 |
102 | *For web deployment see [DEPLOYMENT.md](DEPLOYMENT.md)*.
103 |
104 | ## Development
105 |
106 | To install for development:
107 | ```bash
108 | git clone https://github.com/andybrandt/mcp-simple-arxiv
109 | cd mcp-simple-arxiv
110 | pip install -e .
111 | ```
112 |
113 | ### arXiv API Guidelines
114 |
115 | This server follows arXiv API usage guidelines:
116 | - Rate limiting to max 1 request per 3 seconds
117 | - Single connection at a time
118 | - Proper error handling and retry logic
119 |
120 | ## License
121 |
122 | MIT
123 |
--------------------------------------------------------------------------------
/test_client.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import logging
3 | from fastmcp.client import Client, StdioTransport
4 |
5 | # Configure logging
6 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
7 |
async def main():
    """
    Test client for the mcp-simple-arxiv server.

    Connects to the server over stdio (started as ``python -m mcp_simple_arxiv``),
    lists its tools, and calls each tool, asserting on the text it returns.

    Raises:
        Exception: any assertion failure or tool-call error is logged with its
            traceback and then re-raised, so a failed run terminates with an
            error instead of looking like a success.
    """
    logging.info("Starting test client for mcp-simple-arxiv...")

    # Configure the stdio transport to run the server as a module
    transport = StdioTransport(
        command="python",
        args=["-m", "mcp_simple_arxiv"]
    )

    # Create a client with the transport
    client = Client(transport)

    async with client:
        try:
            # 1. List available tools
            logging.info("--- Testing tools/list ---")
            tools = await client.list_tools()
            logging.info(f"Found {len(tools)} tools:")
            for tool in tools:
                # A tool may have no description; log an empty summary
                # rather than failing on None or an empty string.
                summary = (tool.description or "").splitlines()
                logging.info(f"- {tool.name}: {summary[0] if summary else ''}")
            assert len(tools) == 4, "Expected 4 tools"
            logging.info("✅ tools/list test PASSED")

            # 2. Test search_papers
            logging.info("\n--- Testing search_papers ---")
            query = "electron"
            logging.info(f"Calling search_papers with query: '{query}'")
            result = await client.call_tool("search_papers", {"query": query, "max_results": 2})
            logging.info(f"Result:\n{result.data}")
            assert "Search Results" in result.data
            logging.info("✅ search_papers test PASSED")

            # 3. Test get_paper_data
            logging.info("\n--- Testing get_paper_data ---")
            paper_id = "0808.3772"  # A known paper
            logging.info(f"Calling get_paper_data with paper_id: '{paper_id}'")
            result = await client.call_tool("get_paper_data", {"paper_id": paper_id})
            logging.info(f"Result:\n{result.data}")
            assert "A common mass scale for satellite galaxies of the Milky Way" in result.data
            logging.info("✅ get_paper_data test PASSED")

            # 4. Test list_categories
            logging.info("\n--- Testing list_categories ---")
            logging.info("Calling list_categories without a filter...")
            result = await client.call_tool("list_categories")
            logging.info(f"Result snippet:\n{result.data[:300]}...")
            assert "arXiv Categories" in result.data
            logging.info("✅ list_categories (no filter) test PASSED")

            logging.info("Calling list_categories with filter 'cs'...")
            result = await client.call_tool("list_categories", {"primary_category": "cs"})
            logging.info(f"Result snippet:\n{result.data[:300]}...")
            assert "cs: Computer Science" in result.data
            assert "math: Mathematics" not in result.data
            logging.info("✅ list_categories (with filter) test PASSED")

            # 5. Test update_categories - This might take a moment
            logging.info("\n--- Testing update_categories ---")
            logging.info("Calling update_categories...")
            result = await client.call_tool("update_categories")
            logging.info(f"Result:\n{result.data}")
            assert "Successfully updated category taxonomy" in result.data
            logging.info("✅ update_categories test PASSED")

        except Exception as e:
            logging.error(f"An error occurred during testing: {e}", exc_info=True)
            # Propagate the failure: swallowing it here would make a broken
            # run indistinguishable from a passing one.
            raise
        finally:
            logging.info("\nTest run finished.")
80 |
# Run the async test sequence in a fresh event loop when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
--------------------------------------------------------------------------------
/DEPLOYMENT.md:
--------------------------------------------------------------------------------
1 | # Web Server Deployment Guide
2 |
3 | This guide explains how to run the network-hostable version of the `mcp-simple-arxiv` server. This version uses the MCP Streamable HTTP transport and is ideal for deployments where clients connect over a network.
4 |
5 | ## Overview
6 |
7 | The web server is a stateless service that exposes four tools for interacting with the arXiv API:
8 | - `search_papers`: Search for papers by keyword.
9 | - `get_paper_data`: Fetch detailed information for a specific paper ID.
10 | - `list_categories`: List the available arXiv subject categories.
11 | - `update_categories`: Refresh the locally cached category list from arXiv.
12 |
13 | It runs using FastMCP’s built-in web server (based on Uvicorn/Starlette).
14 |
15 | ## Local Development
16 |
17 | ### 1. Install Dependencies
18 |
19 | First, ensure you have a virtual environment set up and the project installed in editable mode:
20 | ```bash
21 | python3 -m venv venv
22 | source venv/bin/activate
23 | pip install -e .
24 | ```
25 |
26 | ### 2. Run the Server
27 |
28 | The server can be run directly using its Python module:
29 | ```bash
30 | python -m mcp_simple_arxiv.web_server
31 | ```
32 | This will start the server on `http://0.0.0.0:8000`.
33 |
34 | ### 3. Test the Server
35 |
36 | You can test the running server from your command line using `curl`.
37 |
38 | **List Tools Request:**
39 | ```bash
40 | curl -X POST http://127.0.0.1:8000/mcp/ \
41 | -H "Content-Type: application/json" \
42 | -H "Accept: application/json, text/event-stream" \
43 | -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}'
44 | ```
45 |
46 | **Tool Call Request (`search_papers`):**
47 | ```bash
48 | curl -X POST http://127.0.0.1:8000/mcp/ \
49 | -H "Content-Type: application/json" \
50 | -H "Accept: application/json, text/event-stream" \
51 | -d '{"jsonrpc":"2.0","id":2,"method":"tools/call","params":{"name":"search_papers","arguments":{"query": "quantum computing"}}}'
52 | ```
53 |
54 | ## Docker Deployment
55 |
56 | The project includes a `Dockerfile.web` for easy containerization.
57 |
58 | ### 1. Build the Docker Image
59 | ```bash
60 | docker build -f Dockerfile.web -t mcp-simple-arxiv:web .
61 | ```
62 |
63 | ### 2. Run the Docker Container
64 |
65 | To run the container for use with a local reverse proxy (like Apache or Nginx), you should map the container’s port only to the host’s loopback interface:
66 | ```bash
67 | docker run -d -p 127.0.0.1:8000:8000 --name arxiv-web mcp-simple-arxiv:web
68 | ```
69 | This command does two important things:
70 | 1. It runs the container in detached mode (`-d`).
71 | 2. It maps port 8000 inside the container to port 8000 on the host machine’s `localhost` interface only (`-p 127.0.0.1:8000:8000`). This ensures the server is not directly accessible from the network, which is the recommended setup when placing it behind a reverse proxy.
72 |
73 | For persistence, you can set the container to restart automatically:
74 | ```bash
75 | docker run -d --restart always -p 127.0.0.1:8000:8000 --name arxiv-web mcp-simple-arxiv:web
76 | ```
77 |
78 | ### 3. Transferring the Image
79 |
80 | If you built the image on a different machine, you can package it for transfer:
81 | ```bash
82 | # On the source machine
83 | docker save -o mcp-simple-arxiv-web.tar mcp-simple-arxiv:web
84 | gzip mcp-simple-arxiv-web.tar
85 |
86 | # On the destination machine
87 | gunzip mcp-simple-arxiv-web.tar.gz
88 | docker load -i mcp-simple-arxiv-web.tar
89 | ```
90 |
91 | ## Changing the Port
92 |
93 | The server is configured to run on port 8000 inside the container. To map this to a different host port, change the host-port value in the `-p` parameter. The format is `-p [<host_ip>:]<host_port>:<container_port>`.
94 | ```bash
95 | # Map container's port 8000 to host's port 9001
96 | docker run -d -p 127.0.0.1:9001:8000 --name arxiv-web mcp-simple-arxiv:web
97 | ```
98 | The server will now be accessible at `http://127.0.0.1:9001` on the host.
99 |
100 | ## Apache Reverse Proxy Configuration
101 |
102 | **Important**: MCP clients may request URLs with or without trailing slashes. Your Apache configuration must handle both cases to avoid 404 errors.
103 |
104 | Example Configuration:
105 | ```apache
106 | <VirtualHost *:443>
107 |     ServerName mcp.yourdomain.com
108 |
109 |     # SSL Configuration (recommended for production)
110 |     # SSLEngine on
111 |     # SSLCertificateFile /path/to/cert.pem
112 |     # SSLCertificateKeyFile /path/to/key.pem
113 |
114 |     # Main proxy configuration
115 |     <Location /arxiv>
116 |         ProxyPass http://127.0.0.1:8000/mcp/
117 |         ProxyPassReverse http://127.0.0.1:8000/mcp/
118 |     </Location>
119 | </VirtualHost>
120 | ```
121 | This configuration will make your arXiv MCP server available at `https://mcp.yourdomain.com/arxiv`.
--------------------------------------------------------------------------------
/mcp_simple_arxiv/taxonomy.json:
--------------------------------------------------------------------------------
1 | {
2 | "cs": {
3 | "name": "Computer Science",
4 | "subcategories": {
5 | "AI": "Artificial Intelligence",
6 | "CL": "Computation and Language",
7 | "CC": "Computational Complexity",
8 | "CE": "Computational Engineering, Finance, and Science",
9 | "CG": "Computational Geometry",
10 | "GT": "Computer Science and Game Theory",
11 | "CV": "Computer Vision and Pattern Recognition",
12 | "CY": "Computers and Society",
13 | "CR": "Cryptography and Security",
14 | "DB": "Databases",
15 | "DS": "Data Structures and Algorithms",
16 | "DL": "Digital Libraries",
17 | "DM": "Discrete Mathematics",
18 | "DC": "Distributed Computing",
19 | "ET": "Emerging Technologies",
20 | "FL": "Formal Languages and Automata Theory",
21 | "GL": "General Literature",
22 | "GR": "Graphics",
23 | "AR": "Hardware Architecture",
24 | "HC": "Human-Computer Interaction",
25 | "IR": "Information Retrieval",
26 | "IT": "Information Theory",
27 | "LG": "Machine Learning",
28 | "LO": "Logic in Computer Science",
29 | "MS": "Mathematical Software",
30 | "MA": "Multiagent Systems",
31 | "MM": "Multimedia",
32 | "NI": "Networking and Internet Architecture",
33 | "NE": "Neural and Evolutionary Computing",
34 | "NA": "Numerical Analysis",
35 | "OS": "Operating Systems",
36 | "OH": "Other Computer Science",
37 | "PF": "Performance",
38 | "PL": "Programming Languages",
39 | "RO": "Robotics",
40 | "SI": "Social and Information Networks",
41 | "SE": "Software Engineering",
42 | "SD": "Sound",
43 | "SC": "Symbolic Computation",
44 | "SY": "Systems and Control"
45 | }
46 | },
47 | "econ": {
48 | "name": "Economics",
49 | "subcategories": {
50 | "EM": "Econometrics",
51 | "GN": "General Economics",
52 | "TH": "Theoretical Economics"
53 | }
54 | },
55 | "eess": {
56 | "name": "Electrical Engineering and Systems Science",
57 | "subcategories": {
58 | "AS": "Audio and Speech Processing",
59 | "IV": "Image and Video Processing",
60 | "SP": "Signal Processing",
61 | "SY": "Systems and Control"
62 | }
63 | },
64 | "math": {
65 | "name": "Mathematics",
66 | "subcategories": {
67 | "AG": "Algebraic Geometry",
68 | "AT": "Algebraic Topology",
69 | "AP": "Analysis of PDEs",
70 | "CT": "Category Theory",
71 | "CA": "Classical Analysis and ODEs",
72 | "CO": "Combinatorics",
73 | "AC": "Commutative Algebra",
74 | "CV": "Complex Variables",
75 | "DG": "Differential Geometry",
76 | "DS": "Dynamical Systems",
77 | "FA": "Functional Analysis",
78 | "GM": "General Mathematics",
79 | "GN": "General Topology",
80 | "GT": "Geometric Topology",
81 | "GR": "Group Theory",
82 | "HO": "History and Overview",
83 | "IT": "Information Theory",
84 | "KT": "K-Theory and Homology",
85 | "LO": "Logic",
86 | "MP": "Mathematical Physics",
87 | "MG": "Metric Geometry",
88 | "NT": "Number Theory",
89 | "NA": "Numerical Analysis",
90 | "OA": "Operator Algebras",
91 | "OC": "Optimization and Control",
92 | "PR": "Probability",
93 | "QA": "Quantum Algebra",
94 | "RT": "Representation Theory",
95 | "RA": "Rings and Algebras",
96 | "SP": "Spectral Theory",
97 | "ST": "Statistics Theory",
98 | "SG": "Symplectic Geometry"
99 | }
100 | },
101 | "physics": {
102 | "name": "Physics",
103 | "subcategories": {
104 | "acc-ph": "Accelerator Physics",
105 | "ao-ph": "Atmospheric and Oceanic Physics",
106 | "atom-ph": "Atomic Physics",
107 | "bio-ph": "Biological Physics",
108 | "chem-ph": "Chemical Physics",
109 | "class-ph": "Classical Physics",
110 | "comp-ph": "Computational Physics",
111 | "data-an": "Data Analysis, Statistics and Probability",
112 | "flu-dyn": "Fluid Dynamics",
113 | "gen-ph": "General Physics",
114 | "geo-ph": "Geophysics",
115 | "hist-ph": "History and Philosophy of Physics",
116 | "ins-det": "Instrumentation and Detectors",
117 | "med-ph": "Medical Physics",
118 | "optics": "Optics",
119 | "ed-ph": "Physics Education",
120 | "soc-ph": "Physics and Society",
121 | "plasm-ph": "Plasma Physics",
122 | "pop-ph": "Popular Physics",
123 | "space-ph": "Space Physics"
124 | }
125 | },
126 | "q-bio": {
127 | "name": "Quantitative Biology",
128 | "subcategories": {
129 | "BM": "Biomolecules",
130 | "CB": "Cell Behavior",
131 | "GN": "Genomics",
132 | "MN": "Molecular Networks",
133 | "NC": "Neurons and Cognition",
134 | "OT": "Other Quantitative Biology",
135 | "PE": "Populations and Evolution",
136 | "QM": "Quantitative Methods",
137 | "SC": "Subcellular Processes",
138 | "TO": "Tissues and Organs"
139 | }
140 | },
141 | "q-fin": {
142 | "name": "Quantitative Finance",
143 | "subcategories": {
144 | "CP": "Computational Finance",
145 | "EC": "Economics",
146 | "GN": "General Finance",
147 | "MF": "Mathematical Finance",
148 | "PM": "Portfolio Management",
149 | "PR": "Pricing of Securities",
150 | "RM": "Risk Management",
151 | "ST": "Statistical Finance",
152 | "TR": "Trading and Market Microstructure"
153 | }
154 | },
155 | "stat": {
156 | "name": "Statistics",
157 | "subcategories": {
158 | "AP": "Applications",
159 | "CO": "Computation",
160 | "ME": "Methodology",
161 | "ML": "Machine Learning",
162 | "OT": "Other Statistics",
163 | "TH": "Statistics Theory"
164 | }
165 | }
166 | }
--------------------------------------------------------------------------------
/test_web_client.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import httpx
3 | import logging
4 | import subprocess
5 | import time
6 | import sys
7 | import signal
8 |
9 | # Configure logging
10 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
11 |
12 | SERVER_URL = "http://127.0.0.1:8000/mcp/"
13 | HEADERS = {
14 | "Accept": "application/json, text/event-stream",
15 | "Content-Type": "application/json"
16 | }
17 |
async def check_server_ready(
    client: httpx.AsyncClient,
    attempts: int = 20,
    delay: float = 0.5,
) -> bool:
    """Poll the web server until it answers a ``tools/list`` request.

    Args:
        client: HTTP client used to send the probe requests.
        attempts: Maximum number of probes before giving up. The default of
            20 probes with a 0.5 s delay polls for roughly 10 seconds.
        delay: Seconds to sleep after each unsuccessful probe.

    Returns:
        True as soon as one probe is answered with HTTP 200, False when all
        probes have been used up without a successful answer.
    """
    probe = {"jsonrpc": "2.0", "id": 0, "method": "tools/list"}
    for _ in range(attempts):
        try:
            response = await client.post(SERVER_URL, json=probe, headers=HEADERS)
        except httpx.TransportError:
            # Refused, reset or timed-out connections all mean the same thing
            # while the subprocess is booting: not ready yet, try again.
            pass
        else:
            if response.status_code == 200:
                logging.info("Web server is up and running.")
                return True
        await asyncio.sleep(delay)
    logging.error("Web server did not start in time.")
    return False
31 |
async def call_tool(client: httpx.AsyncClient, tool_name: str, params: dict = None) -> dict:
    """Send one JSON-RPC request to the MCP endpoint and return the decoded reply.

    Args:
        client: HTTP client used for the request.
        tool_name: Name of the tool to invoke. The special value
            ``"tools/list"`` sends a ``tools/list`` request instead of a
            ``tools/call`` request.
        params: Arguments for the tool; ``None`` is sent as an empty dict.

    Returns:
        The decoded JSON-RPC response object.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
        ValueError: If an event-stream reply contains no usable ``data:`` line.
    """
    # Function-scope import: this module does not import json at file level.
    import json

    payload = {"jsonrpc": "2.0", "id": 1}
    if tool_name == "tools/list":
        payload["method"] = "tools/list"
    else:
        payload["method"] = "tools/call"
        payload["params"] = {"name": tool_name, "arguments": params or {}}

    response = await client.post(SERVER_URL, json=payload, headers=HEADERS)
    response.raise_for_status()

    # HEADERS advertises both JSON and SSE, so the server may pick either one.
    content_type = response.headers.get("content-type", "").lower()
    if content_type.startswith("application/json"):
        return response.json()

    # Otherwise treat the body as Server-Sent Events and decode the first
    # non-empty data line.
    for line in response.text.strip().split('\n'):
        if line.startswith('data:'):
            data = line[len('data:'):].strip()
            if data:
                return json.loads(data)
    raise ValueError("Did not receive a valid data event from the server.")
53 |
54 |
async def main():
    """
    Test client for the mcp-simple-arxiv web server.

    Starts the server as a subprocess, runs the tool checks against it over
    HTTP, and then stops it again.

    Returns:
        0 when every check passed, 1 when any step (startup, request or
        assertion) failed. The failure itself is logged with its traceback.
    """
    server_process = None
    exit_code = 1  # pessimistic default; flipped only after the last check passes
    try:
        logging.info("Starting web server process...")
        # Start the server as a subprocess
        server_process = subprocess.Popen(
            [sys.executable, "-m", "mcp_simple_arxiv.web_server"],
            stdout=sys.stdout,
            stderr=sys.stderr
        )

        async with httpx.AsyncClient(timeout=30.0) as client:
            if not await check_server_ready(client):
                raise RuntimeError("Could not connect to the web server.")

            # 1. List available tools
            logging.info("\n--- Testing tools/list ---")
            response_json = await call_tool(client, "tools/list")  # Using call_tool to simplify logic
            tools = response_json['result']['tools']
            logging.info(f"Found {len(tools)} tools.")
            assert len(tools) == 4
            logging.info("✅ tools/list test PASSED")

            # 2. Test search_papers
            logging.info("\n--- Testing search_papers ---")
            query = "dark matter"
            response_json = await call_tool(client, "search_papers", {"query": query, "max_results": 1})
            result = response_json['result']['structuredContent']['result']
            logging.info(f"Result for '{query}':\n{result}")
            assert "Search Results" in result
            logging.info("✅ search_papers test PASSED")

            # 3. Test get_paper_data
            logging.info("\n--- Testing get_paper_data ---")
            paper_id = "0808.3772"  # Using the same ID as the stdio test for consistency
            response_json = await call_tool(client, "get_paper_data", {"paper_id": paper_id})
            result = response_json['result']['structuredContent']['result']
            logging.info(f"Result for paper '{paper_id}':\n{result}")
            assert "A common mass scale for satellite galaxies of the Milky Way" in result
            logging.info("✅ get_paper_data test PASSED")

            # 4. Test list_categories
            logging.info("\n--- Testing list_categories ---")
            response_json = await call_tool(client, "list_categories")
            result = response_json['result']['structuredContent']['result']
            logging.info("Result snippet:\n" + result[:200] + "...")
            assert "arXiv Categories" in result
            logging.info("✅ list_categories test PASSED")

            # 5. Test update_categories
            logging.info("\n--- Testing update_categories ---")
            response_json = await call_tool(client, "update_categories")
            result = response_json['result']['structuredContent']['result']
            logging.info(f"Result:\n{result}")
            assert "Successfully updated category taxonomy" in result
            logging.info("✅ update_categories test PASSED")

        exit_code = 0
    except Exception as e:
        # Logged rather than re-raised so that the server is still shut down
        # below; the failure is reported to the caller through exit_code.
        logging.error(f"An error occurred during testing: {e}", exc_info=True)
    finally:
        if server_process:
            logging.info("\nStopping web server process...")
            if server_process.poll() is None:
                server_process.send_signal(signal.SIGINT)  # Send Ctrl+C
                try:
                    server_process.wait(timeout=10)
                    logging.info("Web server stopped gracefully.")
                except subprocess.TimeoutExpired:
                    logging.warning("Web server did not stop gracefully, killing.")
                    server_process.kill()
                    server_process.wait()  # reap the killed child
            else:
                # The server died on its own (e.g. failed to start); nothing to signal.
                logging.warning(
                    f"Web server had already exited with code {server_process.returncode}."
                )
        logging.info("Test run finished.")
    return exit_code


if __name__ == "__main__":
    # Propagate the result so shells and CI see a non-zero status on failure.
    sys.exit(asyncio.run(main()))
--------------------------------------------------------------------------------
/mcp_simple_arxiv/categories.py:
--------------------------------------------------------------------------------
1 | """
2 | arXiv category taxonomy.
3 | """
4 |
# Static snapshot of the arXiv category taxonomy.
#
# Layout: primary category code -> {"name": display name,
#                                   "subcategories": {code: display name}}.
# A full category identifier is "<primary>.<subcategory>", e.g. "cs.AI".
#
# NOTE(review): server.py reads the taxonomy through
# update_taxonomy.load_taxonomy() rather than this constant - confirm whether
# this module is still referenced anywhere before relying on it or editing it.
CATEGORIES = {
    "cs": {
        "name": "Computer Science",
        "subcategories": {
            "AI": "Artificial Intelligence",
            "CL": "Computation and Language",
            "CC": "Computational Complexity",
            "CE": "Computational Engineering, Finance, and Science",
            "CG": "Computational Geometry",
            "GT": "Computer Science and Game Theory",
            "CV": "Computer Vision and Pattern Recognition",
            "CY": "Computers and Society",
            "CR": "Cryptography and Security",
            "DB": "Databases",
            "DS": "Data Structures and Algorithms",
            "DL": "Digital Libraries",
            "DM": "Discrete Mathematics",
            "DC": "Distributed Computing",
            "ET": "Emerging Technologies",
            "FL": "Formal Languages and Automata Theory",
            "GL": "General Literature",
            "GR": "Graphics",
            "AR": "Hardware Architecture",
            "HC": "Human-Computer Interaction",
            "IR": "Information Retrieval",
            "IT": "Information Theory",
            "LG": "Machine Learning",
            "LO": "Logic in Computer Science",
            "MS": "Mathematical Software",
            "MA": "Multiagent Systems",
            "MM": "Multimedia",
            "NI": "Networking and Internet Architecture",
            "NE": "Neural and Evolutionary Computing",
            "NA": "Numerical Analysis",
            "OS": "Operating Systems",
            "OH": "Other Computer Science",
            "PF": "Performance",
            "PL": "Programming Languages",
            "RO": "Robotics",
            "SI": "Social and Information Networks",
            "SE": "Software Engineering",
            "SD": "Sound",
            "SC": "Symbolic Computation",
            "SY": "Systems and Control",
        }
    },
    "econ": {
        "name": "Economics",
        "subcategories": {
            "EM": "Econometrics",
            "GN": "General Economics",
            "TH": "Theoretical Economics",
        }
    },
    "eess": {
        "name": "Electrical Engineering and Systems Science",
        "subcategories": {
            "AS": "Audio and Speech Processing",
            "IV": "Image and Video Processing",
            "SP": "Signal Processing",
            "SY": "Systems and Control",
        }
    },
    "math": {
        "name": "Mathematics",
        "subcategories": {
            "AG": "Algebraic Geometry",
            "AT": "Algebraic Topology",
            "AP": "Analysis of PDEs",
            "CT": "Category Theory",
            "CA": "Classical Analysis and ODEs",
            "CO": "Combinatorics",
            "AC": "Commutative Algebra",
            "CV": "Complex Variables",
            "DG": "Differential Geometry",
            "DS": "Dynamical Systems",
            "FA": "Functional Analysis",
            "GM": "General Mathematics",
            "GN": "General Topology",
            "GT": "Geometric Topology",
            "GR": "Group Theory",
            "HO": "History and Overview",
            "IT": "Information Theory",
            "KT": "K-Theory and Homology",
            "LO": "Logic",
            "MP": "Mathematical Physics",
            "MG": "Metric Geometry",
            "NT": "Number Theory",
            "NA": "Numerical Analysis",
            "OA": "Operator Algebras",
            "OC": "Optimization and Control",
            "PR": "Probability",
            "QA": "Quantum Algebra",
            "RT": "Representation Theory",
            "RA": "Rings and Algebras",
            "SP": "Spectral Theory",
            "ST": "Statistics Theory",
            "SG": "Symplectic Geometry",
        }
    },
    # Physics subcategory codes are lower-case, hyphenated names rather than
    # the two-letter codes used by the other primary categories.
    "physics": {
        "name": "Physics",
        "subcategories": {
            "acc-ph": "Accelerator Physics",
            "ao-ph": "Atmospheric and Oceanic Physics",
            "atom-ph": "Atomic Physics",
            "bio-ph": "Biological Physics",
            "chem-ph": "Chemical Physics",
            "class-ph": "Classical Physics",
            "comp-ph": "Computational Physics",
            "data-an": "Data Analysis, Statistics and Probability",
            "flu-dyn": "Fluid Dynamics",
            "gen-ph": "General Physics",
            "geo-ph": "Geophysics",
            "hist-ph": "History and Philosophy of Physics",
            "ins-det": "Instrumentation and Detectors",
            "med-ph": "Medical Physics",
            "optics": "Optics",
            "ed-ph": "Physics Education",
            "soc-ph": "Physics and Society",
            "plasm-ph": "Plasma Physics",
            "pop-ph": "Popular Physics",
            "space-ph": "Space Physics",
        }
    },
    "q-bio": {
        "name": "Quantitative Biology",
        "subcategories": {
            "BM": "Biomolecules",
            "CB": "Cell Behavior",
            "GN": "Genomics",
            "MN": "Molecular Networks",
            "NC": "Neurons and Cognition",
            "OT": "Other Quantitative Biology",
            "PE": "Populations and Evolution",
            "QM": "Quantitative Methods",
            "SC": "Subcellular Processes",
            "TO": "Tissues and Organs",
        }
    },
    "q-fin": {
        "name": "Quantitative Finance",
        "subcategories": {
            "CP": "Computational Finance",
            "EC": "Economics",
            "GN": "General Finance",
            "MF": "Mathematical Finance",
            "PM": "Portfolio Management",
            "PR": "Pricing of Securities",
            "RM": "Risk Management",
            "ST": "Statistical Finance",
            "TR": "Trading and Market Microstructure",
        }
    },
    "stat": {
        "name": "Statistics",
        "subcategories": {
            "AP": "Applications",
            "CO": "Computation",
            "ME": "Methodology",
            "ML": "Machine Learning",
            "OT": "Other Statistics",
            "TH": "Statistics Theory",
        }
    },
}
--------------------------------------------------------------------------------
/mcp_simple_arxiv/server.py:
--------------------------------------------------------------------------------
1 | """
2 | MCP server for accessing arXiv papers.
3 | """
4 |
5 | import sys
6 | sys.stdout.reconfigure(encoding='utf-8')
7 | sys.stdin.reconfigure(encoding='utf-8')
8 |
9 | import asyncio
10 | import logging
11 |
12 | from fastmcp import FastMCP
13 |
14 | from .arxiv_client import ArxivClient
15 | from .update_taxonomy import load_taxonomy, update_taxonomy_file
16 |
17 | logger = logging.getLogger(__name__)
18 |
def get_first_sentence(text: str, max_len: int = 200) -> str:
    """Extract the first sentence from text, limiting its length.

    A sentence ends at the earliest occurrence of ". ", "! " or "? ". The
    terminator character is kept, the following space is not.

    Args:
        text: Text to take the first sentence from.
        max_len: A sentence end only counts if it lies before this index;
            it is also the cut-off length when no such end exists.

    Returns:
        The first sentence, or - when no sentence end is found within
        max_len - the text itself if it is short enough, otherwise its first
        max_len characters (right-stripped) followed by "...".
    """
    # Collect the position of each terminator and pick the earliest one, so
    # that "Why? Because." yields "Why?" instead of running on to the period.
    positions = [
        pos
        for pos in (text.find(end) for end in ('. ', '! ', '? '))
        if pos != -1
    ]
    if positions:
        first_end = min(positions)
        if first_end < max_len:
            return text[:first_end + 1]

    # No usable sentence ending: fall back to a plain length cut.
    if len(text) > max_len:
        return text[:max_len].rstrip() + '...'
    return text
30 |
def _format_category_line(paper) -> str:
    """Render a paper's categories as "Primary: X, Additional: a, b".

    Either part is left out when the paper dict has no value for it, so the
    line never starts with a dangling separator.
    """
    parts = []
    if paper['primary_category']:
        parts.append(f"Primary: {paper['primary_category']}")
    if paper['categories']:
        parts.append(f"Additional: {', '.join(paper['categories'])}")
    return ", ".join(parts)


def create_app():
    """Creates and configures the FastMCP app instance and its tools.

    Four tools are registered on the returned app: search_papers,
    get_paper_data, list_categories and update_categories. The tool
    docstrings below are kept verbatim because they double as the tool
    descriptions shown to clients.

    Returns:
        The configured FastMCP application.
    """
    app = FastMCP("arxiv-server")
    # One shared client so that all tools go through the same rate limiter.
    arxiv_client = ArxivClient()

    @app.tool(
        annotations={
            "title": "Search arXiv Papers",
            "readOnlyHint": True,
            "openWorldHint": True
        }
    )
    async def search_papers(query: str, max_results: int = 10) -> str:
        """
        Search for papers on arXiv by title and abstract content.

        You can use advanced search syntax:
        - Search in title: ti:"search terms"
        - Search in abstract: abs:"search terms"
        - Search by author: au:"author name"
        - Combine terms with: AND, OR, ANDNOT
        - Filter by category: cat:cs.AI (use list_categories tool to see available categories)

        Examples:
        - "machine learning" (searches all fields)
        - ti:"neural networks" AND cat:cs.AI (title with category)
        - au:bengio AND ti:"deep learning" (author and title)
        """
        # Keep the request size between 1 and 50 whatever the caller passes.
        max_results = max(1, min(max_results, 50))
        papers = await arxiv_client.search(query, max_results)

        # Format results in a readable way
        chunks = ["Search Results:\n\n"]
        if not papers:
            chunks.append("No papers found for this query.\n")
        for i, paper in enumerate(papers, 1):
            chunks.append(
                f"{i}. {paper['title']}\n"
                f" Authors: {', '.join(paper['authors'])}\n"
                f" ID: {paper['id']}\n"
                f" Categories: {_format_category_line(paper)}\n"
                f" Published: {paper['published']}\n"
                # Only the first sentence of the abstract is shown as a preview.
                f" Preview: {get_first_sentence(paper['summary'])}\n"
                "\n"
            )
        return "".join(chunks)

    @app.tool(
        annotations={
            "title": "Get arXiv Paper Data",
            "readOnlyHint": True,
            "openWorldHint": True
        }
    )
    async def get_paper_data(paper_id: str) -> str:
        """Get detailed information about a specific paper including abstract and available formats."""
        paper = await arxiv_client.get_paper(paper_id)

        # Title and metadata section
        chunks = [
            f"Title: {paper['title']}\n\n",
            "Metadata:\n",
            f"- Authors: {', '.join(paper['authors'])}\n",
            f"- Published: {paper['published']}\n",
            f"- Last Updated: {paper['updated']}\n",
            f"- Categories: {_format_category_line(paper)}\n",
        ]
        if paper['doi']:
            chunks.append(f"- DOI: {paper['doi']}\n")
        if paper["journal_ref"]:
            chunks.append(f"- Journal Reference: {paper['journal_ref']}\n")

        # Abstract section
        chunks.append("\nAbstract:\n")
        chunks.append(paper["summary"])
        chunks.append("\n")

        # Access options section. The client leaves a URL as None when the
        # feed entry carries no matching link, so each line is optional.
        chunks.append("\nAccess Options:\n")
        if paper["abstract_url"]:
            chunks.append(f"- Abstract page: {paper['abstract_url']}\n")
        if paper["html_url"]:  # Add HTML version if available
            chunks.append(f"- Full text HTML version: {paper['html_url']}\n")
        if paper["pdf_url"]:
            chunks.append(f"- PDF version: {paper['pdf_url']}\n")

        # Additional information section
        if paper["comment"]:
            chunks.append("\nAdditional Information:\n")
            chunks.append(f"- Comment: {paper['comment']}\n")

        return "".join(chunks)

    @app.tool(
        annotations={
            "title": "List arXiv Categories",
            "readOnlyHint": True,
            "openWorldHint": False
        }
    )
    def list_categories(primary_category: str = None) -> str:
        """List all available arXiv categories and how to use them in search."""
        try:
            taxonomy = load_taxonomy()
        except Exception as e:
            logger.error(f"Error loading taxonomy: {e}")
            return "Error loading category taxonomy. Try using update_categories tool to refresh it."

        chunks = ["arXiv Categories:\n\n"]

        if primary_category and primary_category not in taxonomy:
            # Tell the caller why the listing is empty instead of silently
            # returning only the header.
            chunks.append(
                f"Unknown primary category '{primary_category}'. "
                f"Available primary categories: {', '.join(taxonomy)}\n\n"
            )

        for primary, data in taxonomy.items():
            if primary_category and primary != primary_category:
                continue

            chunks.append(f"{primary}: {data['name']}\n")
            for code, desc in data['subcategories'].items():
                chunks.append(f" {primary}.{code}: {desc}\n")
            chunks.append("\n")

        chunks.append("\nUsage in search:\n")
        chunks.append('- Search in specific category: cat:cs.AI\n')
        chunks.append('- Combine with other terms: "neural networks" AND cat:cs.AI\n')
        chunks.append('- Multiple categories: (cat:cs.AI OR cat:cs.LG)\n')
        chunks.append('\nNote: If categories seem outdated, use the update_categories tool to refresh them.\n')

        return "".join(chunks)

    @app.tool(
        annotations={
            "title": "Update arXiv Categories",
            "readOnlyHint": False,
            "openWorldHint": True
        }
    )
    def update_categories() -> str:
        """Update the stored category taxonomy by fetching the latest version from arxiv.org"""
        try:
            taxonomy = update_taxonomy_file()
        except Exception as e:
            logger.error(f"Error updating taxonomy: {e}")
            # Re-raise unchanged so that FastMCP reports it to the client.
            raise

        chunks = [
            "Successfully updated category taxonomy.\n\n",
            f"Found {len(taxonomy)} primary categories:\n",
        ]
        for primary, data in taxonomy.items():
            chunks.append(
                f"- {primary}: {data['name']} ({len(data['subcategories'])} subcategories)\n"
            )
        return "".join(chunks)

    return app
189 |
# Module-level application instance, built once at import time so that other
# modules can import `app` directly.
app = create_app()


def main():
    """Start the arXiv MCP server by running the module-level app with its default settings."""
    app.run()


# Script entry point: logging is only configured here, i.e. when this file is
# executed directly rather than imported.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()
--------------------------------------------------------------------------------
/mcp_simple_arxiv/arxiv_client.py:
--------------------------------------------------------------------------------
1 | """
2 | arXiv API client with rate limiting.
3 | """
4 |
5 | import asyncio
6 | import logging
7 | from datetime import datetime, timedelta
8 | import feedparser
9 | import httpx
10 | from typing import Optional, Dict, List, Any
11 |
12 | logger = logging.getLogger(__name__)
13 |
class ArxivClient:
    """
    arXiv API client with built-in rate limiting.
    Ensures no more than 1 request every 3 seconds.
    """

    # Minimum spacing between two API requests, in seconds.
    _MIN_REQUEST_INTERVAL = 3.0

    def __init__(self):
        self.base_url = "https://export.arxiv.org/api/query"
        # Event-loop clock reading taken at the previous request (None until
        # the first request has been made).
        self._last_request: Optional[float] = None
        # Serialises the rate-limit bookkeeping between concurrent callers.
        self._lock = asyncio.Lock()

    async def _wait_for_rate_limit(self) -> None:
        """Ensures we respect arXiv's rate limit of 1 request every 3 seconds.

        The interval is measured with the event loop's monotonic clock, so a
        wall-clock adjustment (DST change, clock sync) can neither shorten the
        wait nor stretch it.
        """
        async with self._lock:
            loop = asyncio.get_running_loop()
            if self._last_request is not None:
                remaining = self._MIN_REQUEST_INTERVAL - (loop.time() - self._last_request)
                if remaining > 0:
                    await asyncio.sleep(remaining)
            self._last_request = loop.time()

    def _clean_text(self, text: str) -> str:
        """Clean up text by removing extra whitespace and newlines."""
        return " ".join(text.split())

    def _get_html_url(self, arxiv_id: str) -> str:
        """
        Construct HTML version URL for a paper.

        The HTML version URL is not provided by the API, so it is built from
        the paper ID. The ID is used exactly as given, including any version
        suffix (e.g. v1, v2).
        """
        return f"https://arxiv.org/html/{arxiv_id}"

    def _parse_entry(self, entry: Dict[str, Any]) -> Dict[str, Any]:
        """Parse a feed entry into a paper dictionary.

        Args:
            entry: One entry of the parsed Atom feed.

        Returns:
            Dictionary with the keys id, title, authors, primary_category,
            categories, published, updated, summary, comment, journal_ref,
            doi, pdf_url, abstract_url and html_url. Text fields missing from
            the entry become empty strings; pdf_url and abstract_url are None
            when the entry has no link of the matching type.
        """
        # Extract PDF and abstract-page links by their MIME type
        pdf_url = None
        abstract_url = None  # This is the URL to the abstract page
        for link in entry.get('links', []):
            if not isinstance(link, dict):
                continue
            link_type = link.get('type')
            if link_type == 'application/pdf':
                pdf_url = link.get('href')
            elif link_type == 'text/html':
                abstract_url = link.get('href')

        # The paper ID is whatever follows "/abs/" in the entry's id URL
        paper_id = entry.get('id', '').split("/abs/")[-1].rstrip()

        # Create HTML version URL
        html_url = self._get_html_url(paper_id) if paper_id else None

        # Get authors (entries may be mappings or attribute-style objects)
        authors = []
        for author in entry.get('authors', []):
            if isinstance(author, dict) and 'name' in author:
                authors.append(author['name'])
            elif hasattr(author, 'name'):
                authors.append(author.name)

        # Get primary category
        primary_category = None
        if 'arxiv_primary_category' in entry:
            primary = entry['arxiv_primary_category']
            if isinstance(primary, dict):
                primary_category = primary.get('term')
            elif hasattr(primary, 'term'):
                primary_category = primary.term

        # Get all categories
        categories = []
        for category in entry.get('tags', []):
            if isinstance(category, dict) and 'term' in category:
                categories.append(category['term'])
            elif hasattr(category, 'term'):
                categories.append(category.term)

        # Remove primary category from regular categories if it's there
        if primary_category and primary_category in categories:
            categories.remove(primary_category)

        return {
            "id": paper_id,
            "title": self._clean_text(entry.get('title', '')),
            "authors": authors,
            "primary_category": primary_category,
            "categories": categories,
            "published": entry.get('published', ''),
            "updated": entry.get('updated', ''),
            "summary": self._clean_text(entry.get('summary', '')),
            "comment": self._clean_text(entry.get('arxiv_comment', '')),
            # Normalised like the other free-text fields so that wrapped
            # references do not carry embedded newlines.
            "journal_ref": self._clean_text(entry.get('arxiv_journal_ref', '')),
            "doi": entry.get('arxiv_doi', ''),
            "pdf_url": pdf_url,
            "abstract_url": abstract_url,  # URL to abstract page
            "html_url": html_url  # URL to HTML version if available
        }

    async def _fetch_feed(self, params: Dict[str, Any], action: str):
        """Query the API (respecting the rate limit) and return the parsed feed.

        Args:
            params: Query parameters for the API request.
            action: Short description of the operation, used in the error log.

        Returns:
            The parsed feed object, guaranteed to contain an 'entries' key.

        Raises:
            ValueError: On an HTTP error or when the response is not a usable feed.
        """
        await self._wait_for_rate_limit()

        async with httpx.AsyncClient(timeout=20.0) as client:
            try:
                response = await client.get(self.base_url, params=params)
                response.raise_for_status()  # Raise an exception for bad status codes
            except httpx.HTTPError as e:
                logger.error(f"HTTP error while {action}: {e}")
                raise ValueError(f"arXiv API HTTP error: {str(e)}") from e

        # Parse the Atom feed response
        feed = feedparser.parse(response.text)
        if not isinstance(feed, dict) or 'entries' not in feed:
            logger.error("Invalid response from arXiv API")
            logger.debug(f"Response text: {response.text[:1000]}...")
            raise ValueError("Invalid response from arXiv API")
        return feed

    async def search(self, query: str, max_results: int = 10) -> List[Dict[str, Any]]:
        """
        Search arXiv papers.

        The query string supports arXiv's advanced search syntax:
        - Search in title: ti:"search terms"
        - Search in abstract: abs:"search terms"
        - Search by author: au:"author name"
        - Combine terms with: AND, OR, ANDNOT
        - Filter by category: cat:cs.AI

        Examples:
        - "machine learning" (searches all fields)
        - ti:"neural networks" AND cat:cs.AI (title with category)
        - au:bengio AND ti:"deep learning" (author and title)

        Args:
            query: Search expression as described above.
            max_results: Maximum number of papers to return (capped at 2000).

        Returns:
            List of paper dictionaries, newest submissions first; empty when
            nothing matches.

        Raises:
            ValueError: On an HTTP error or an invalid API response.
        """
        params = {
            "search_query": query,
            # Ensure max_results is within API limits (2000 per request)
            "max_results": min(max_results, 2000),
            "sortBy": "submittedDate",  # Default to newest papers first
            "sortOrder": "descending",
        }

        feed = await self._fetch_feed(params, "searching")
        # Empty results are ok - this simply yields an empty list
        return [self._parse_entry(entry) for entry in feed.get('entries') or []]

    async def get_paper(self, paper_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific paper.

        Args:
            paper_id: arXiv paper ID (e.g., "2103.08220")

        Returns:
            Dictionary containing paper metadata, including:
            - Basic metadata (title, authors, dates)
            - Categories (primary and others)
            - Abstract and comments
            - URLs (abstract page, PDF version, HTML version if available)
            - DOI if available

        Raises:
            ValueError: If the paper is not found, on an HTTP error, or on an
                invalid API response.
        """
        params = {
            "id_list": paper_id,
            "max_results": 1
        }

        feed = await self._fetch_feed(params, "fetching paper")
        entries = feed.get('entries')
        if not entries:
            raise ValueError(f"Paper not found: {paper_id}")

        return self._parse_entry(entries[0])
206 |
--------------------------------------------------------------------------------