├── MANIFEST.in
├── .github
    └── funding.yml
├── mcp_simple_arxiv
    ├── __main__.py
    ├── __init__.py
    ├── update_taxonomy.py
    ├── taxonomy.json
    ├── categories.py
    ├── server.py
    └── arxiv_client.py
├── smithery.yaml
├── Dockerfile.web
├── Dockerfile
├── pyproject.toml
├── LICENSE
├── TODO.md
├── README.md
├── test_client.py
├── .gitignore
├── DEPLOYMENT.md
└── test_web_client.py


/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include mcp_simple_arxiv/taxonomy.json


--------------------------------------------------------------------------------
/.github/funding.yml:
--------------------------------------------------------------------------------
1 | # If you find this useful you can support this by using the funding links below
2 | #
3 | github: andybrandt
4 | buy_me_a_coffee: andybrandt
5 | 


--------------------------------------------------------------------------------
/mcp_simple_arxiv/__main__.py:
--------------------------------------------------------------------------------
1 | """
2 | Main entry point for the MCP server.
3 | """
4 | 
5 | from .server import main
6 | 
7 | if __name__ == "__main__":  # true when started with `python -m mcp_simple_arxiv`
8 |     main()


--------------------------------------------------------------------------------
/mcp_simple_arxiv/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | MCP server providing access to arXiv papers through their API.
 3 | """
 4 | import asyncio
 5 | from .server import main as server_main
 6 | 
 7 | __version__ = "0.1.0"
 8 | 
 9 | def main():
10 |     """Main entry point for the package."""
11 |     asyncio.run(server_main())


--------------------------------------------------------------------------------
/smithery.yaml:
--------------------------------------------------------------------------------
 1 | # Smithery configuration file: https://smithery.ai/docs/config#smitheryyaml
 2 | 
 3 | startCommand:
 4 |   type: stdio
 5 |   configSchema:
 6 |     # JSON Schema defining the configuration options for the MCP.
 7 |     type: object
 8 |     required: []
 9 |     properties: {}
10 |   commandFunction:
11 |     # A function that produces the CLI command to start the MCP on stdio.
12 |     |-
13 |     config => ({command: 'python', args: ['-m', 'mcp_simple_arxiv']})


--------------------------------------------------------------------------------
/Dockerfile.web:
--------------------------------------------------------------------------------
 1 | # Use an official Python runtime as a parent image
 2 | FROM python:3.11-slim
 3 | 
 4 | # Set the working directory in the container
 5 | WORKDIR /usr/src/app
 6 | 
 7 | # Copy the entire project into the container
 8 | COPY . .
 9 | 
10 | # Install the project dependencies
11 | # Using --no-cache-dir is a good practice for keeping image size down
12 | RUN pip install --no-cache-dir .
13 | 
14 | # Expose the port the app runs on
15 | EXPOSE 8000
16 | 
17 | # Run the web server when the container launches
18 | CMD ["python", "-m", "mcp_simple_arxiv.web_server"] 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # Generated by https://smithery.ai. See: https://smithery.ai/docs/config#dockerfile
 2 | # Use an official Python runtime as a parent image
 3 | FROM python:3.10-slim
 4 | 
 5 | # Set the working directory in the container
 6 | WORKDIR /app
 7 | 
 8 | # Copy the current directory contents into the container at /app
 9 | ADD . /app
10 | 
11 | # Install the package and its dependencies (declared in pyproject.toml)
12 | RUN pip install --no-cache-dir .
13 | 
14 | # Make port 80 available to the world outside this container
15 | EXPOSE 80
16 | 
17 | # Run mcp_simple_arxiv when the container launches
18 | ENTRYPOINT ["python", "-m", "mcp_simple_arxiv"]


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "mcp-simple-arxiv"
 7 | version = "0.3.0"
 8 | description = "An MCP server that provides access to arXiv papers through their API."
 9 | readme = "README.md"
10 | authors = [
11 |     {name = "Andy Brandt", email = "andy@codesprinters.com"}
12 | ]
13 | requires-python = ">=3.10"
14 | license = "MIT"
15 | classifiers = [
16 |     "Programming Language :: Python :: 3",
17 |     "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 |     "fastmcp",
21 |     "feedparser",  # for parsing arXiv API responses
22 |     "httpx",      # for making HTTP requests with async support
23 |     "beautifulsoup4",  # for parsing arXiv taxonomy page
24 | ]
25 | 
26 | [project.urls]
27 | Homepage = "https://github.com/andybrandt/mcp-simple-arxiv"
28 | 
29 | [project.scripts]
30 | mcp-simple-arxiv = "mcp_simple_arxiv:main"


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Andy Brandt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/mcp_simple_arxiv/update_taxonomy.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Helpers to create and load the arXiv category taxonomy file (taxonomy.json).
 3 | """
 4 | 
 5 | import json
 6 | from pathlib import Path
 7 | from .categories import CATEGORIES
 8 | 
 9 | TAXONOMY_FILE = Path(__file__).parent / "taxonomy.json"
10 | 
11 | def update_taxonomy_file():
12 |     """
13 |     Create taxonomy.json from the built-in categories.
14 |     Returns the taxonomy dictionary.
15 |     """
16 |     print(f"Creating taxonomy file at {TAXONOMY_FILE}...")
17 |     with open(TAXONOMY_FILE, 'w', encoding='utf-8') as f:
18 |         json.dump(CATEGORIES, f, indent=2, ensure_ascii=False)
19 |     print("Done!")
20 |     return CATEGORIES
21 | 
22 | def load_taxonomy() -> dict:
23 |     """
24 |     Load taxonomy from the JSON file.
25 |     If file doesn't exist, create it from built-in categories.
26 |     """
27 |     if not TAXONOMY_FILE.exists():
28 |         print(f"Taxonomy file not found at {TAXONOMY_FILE}, creating it...")
29 |         return update_taxonomy_file()
30 |     
31 |     print(f"Loading taxonomy from {TAXONOMY_FILE}")
32 |     with open(TAXONOMY_FILE, 'r', encoding='utf-8') as f:
33 |         return json.load(f)
34 | 
35 | if __name__ == "__main__":
36 |     # When run directly, create/update the taxonomy file
37 |     print("Creating taxonomy file from built-in categories...")
38 |     taxonomy = update_taxonomy_file()
39 |     print(f"\nCreated taxonomy with {len(taxonomy)} primary categories:")
40 |     for primary, data in taxonomy.items():
41 |         print(f"- {primary}: {data['name']} ({len(data['subcategories'])} subcategories)")


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | # Planned Features and Improvements
 2 | 
 3 | 1. **Total Match Count in Search Results**
 4 |    - Add total number of matches found to search results
 5 |    - This helps users and AIs determine if search needs refinement
 6 |    - Should handle cases where there are more results than displayed
 7 |    - Example: "Found 1234 matches, showing first 10"
 8 | 
 9 | 2. **Categories in Search Results**
10 |    - Show primary category and all subcategories for each paper in search results
11 |    - Makes it easier to quickly assess paper's field and relevance
12 |    - Categories should be clearly labeled (primary vs additional)
13 |    - Example: "Primary: cs.AI, Additional: cs.LG, cs.CL"
14 | 
15 | 3. **Abstract Preview in Search Results**
16 |    - Add first 2-3 sentences of abstract to each search result
17 |    - Helps quickly assess paper relevance without fetching full details
18 |    - Should handle varying abstract lengths gracefully
19 |    - Should end with ellipsis (...) if truncated
20 | 
21 | 4. **Flexible Search Result Sorting**
22 |    - Add support for different sorting options in search
23 |    - Support sorting by: submission date, last update date, relevance
24 |    - Make sort order configurable (ascending/descending)
25 |    - Expose sorting options in tool description
26 | 
27 | 5. **Date Range Filters**
28 |    - Allow filtering papers by submission/update date range
29 |    - Support both absolute dates and relative ranges (last week/month/year)
30 |    - Implement using arXiv API's date filtering capabilities
31 | 
32 | 6. **DOI Integration** [DONE]
33 |    - Add DOI (Digital Object Identifier) to paper details when available
34 |    - Extract from arXiv API response
35 |    - Include DOI URL for easy access
36 | 
37 | 7. **Enhanced Category Presentation**
38 |    - Improve how categories are displayed in paper details
39 |    - Clearly distinguish primary and secondary categories
40 |    - Include category descriptions where helpful
41 |    - Group related categories together
42 | 
43 | 8. **Advanced Category Search**
44 |    - Support complex category combinations in search
45 |    - Allow AND/OR/NOT operations between categories
46 |    - Support parentheses for grouping
47 |    - Example: "(cs.AI OR cs.LG) AND NOT cs.DB"
48 | 
49 | 9. **Citation Format Support**
50 |    - Generate citation strings in common formats (BibTeX, APA, etc.)
51 |    - Include all necessary metadata (authors, title, arXiv ID, etc.)
52 |    - Handle special characters in titles and names correctly
53 | 
54 | 10. **Impact Metrics**
55 |     - Add citation count or other impact metrics if available
56 |     - Consider alternative metrics like downloads or social media mentions
57 |     - NOTE: Might require integration with additional APIs
58 | 
59 | 11. **HTML Paper Access**  [DONE]
60 |     - Add detection of HTML version availability
61 |     - Include HTML URL in paper metadata when available
62 |     - Add URL construction logic (changing PDF URL to HTML)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # mcp-simple-arxiv
  2 | 
  3 | [![Trust Score](https://archestra.ai/mcp-catalog/api/badge/quality/andybrandt/mcp-simple-arxiv)](https://archestra.ai/mcp-catalog/andybrandt__mcp-simple-arxiv)
  4 | [![smithery badge](https://smithery.ai/badge/mcp-simple-arxiv)](https://smithery.ai/server/mcp-simple-arxiv)
  5 | [![Verified on MseeP](https://mseep.ai/badge.svg)](https://mseep.ai/app/dc95dba9-149a-4eaa-bf08-36e0cb0f3a5a)
  6 | 
  7 | An MCP server that provides access to arXiv papers through their API.
  8 | 
  9 | <a href="https://glama.ai/mcp/servers/p38q3nagwb"><img width="380" height="200" src="https://glama.ai/mcp/servers/p38q3nagwb/badge" alt="mcp-simple-arxiv MCP server" /></a>
 10 | 
 11 | ## Features
 12 | 
 13 | This server allows LLM clients (like Claude Desktop) to:
 14 | - Search for scientific papers on arXiv by title and abstract content
 15 | - Get paper metadata and abstracts
 16 | - Access links to available paper formats (PDF/HTML)
 17 | 
 18 | To use the hosted web version, add `https://mcp.andybrandt.net/arxiv` as a connector in Claude.ai.
 19 | You can also install and use it locally.
 20 | 
 21 | ## Installation and Deployment
 22 | 
 23 | This server can be run in two modes: as a local `stdio` server for desktop clients or as a network-accessible web server.
 24 | 
 25 | ### Installing via Smithery
 26 | 
 27 | To install Simple Arxiv for Claude Desktop automatically via [Smithery](https://smithery.ai/server/mcp-simple-arxiv):
 28 | 
 29 | ```bash
 30 | npx -y @smithery/cli install mcp-simple-arxiv --client claude
 31 | ```
 32 | 
 33 | ### Manual Installation
 34 | ```bash
 35 | pip install mcp-simple-arxiv
 36 | ```
 37 | 
 38 | ## Usage with Claude Desktop
 39 | 
 40 | Add this configuration to your `claude_desktop_config.json`:
 41 | 
 42 | (macOS version):
 43 | 
 44 | ```json
 45 | {
 46 |   "mcpServers": {
 47 |     "simple-arxiv": {
 48 |       "command": "python",
 49 |       "args": ["-m", "mcp_simple_arxiv"]
 50 |     }
 51 |   }
 52 | }
 53 | ```
 54 | 
 55 | (Windows version):
 56 | 
 57 | ```json
 58 | {
 59 |   "mcpServers": {
 60 |     "simple-arxiv": {
 61 |       "command": "C:\\Users\\YOUR_USERNAME\\AppData\\Local\\Programs\\Python\\Python311\\python.exe",
 62 |       "args": [
 63 |         "-m",
 64 |         "mcp_simple_arxiv"
 65 |       ]
 66 |     }
 67 |   }
 68 | }
 69 | ```
 70 | 
 71 | After restarting Claude Desktop, the following capabilities will be available:
 72 | 
 73 | ### Searching Papers
 74 | 
 75 | You can ask Claude to search for papers using queries like:
 76 | ```
 77 | Can you search arXiv for recent papers about large language models?
 78 | ```
 79 | 
 80 | The search will return basic information about matching papers including:
 81 | - Paper title
 82 | - Authors
 83 | - arXiv ID
 84 | - Publication date
 85 | 
 86 | ### Getting Paper Details
 87 | 
 88 | Once you have a paper ID, you can ask for more details:
 89 | ```
 90 | Can you show me the details for paper 2103.08220?
 91 | ```
 92 | 
 93 | This will return:
 94 | - Full paper title
 95 | - Authors
 96 | - Publication and update dates
 97 | - Journal reference (if available)
 98 | - Paper abstract
 99 | - Links to available formats (PDF/HTML)
100 | 
101 | 
102 | *For web deployment see [DEPLOYMENT.md](DEPLOYMENT.md)*.
103 | 
104 | ## Development
105 | 
106 | To install for development:
107 | ```bash
108 | git clone https://github.com/andybrandt/mcp-simple-arxiv
109 | cd mcp-simple-arxiv
110 | pip install -e .
111 | ```
112 | 
113 | ### arXiv API Guidelines
114 | 
115 | This server follows arXiv API usage guidelines:
116 | - Rate limiting to max 1 request per 3 seconds
117 | - Single connection at a time
118 | - Proper error handling and retry logic
119 | 
120 | ## License
121 | 
122 | MIT
123 | 


--------------------------------------------------------------------------------
/test_client.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import logging
 3 | from fastmcp.client import Client, StdioTransport
 4 | 
 5 | # Configure logging
 6 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 7 | 
 8 | async def main():
 9 |     """
10 |     Test client for the mcp-simple-arxiv server.
11 |     Connects to the stdio server, lists tools, and calls each tool to verify functionality.
12 |     """
13 |     logging.info("Starting test client for mcp-simple-arxiv...")
14 |     
15 |     # Configure the stdio transport to run the server as a module
16 |     transport = StdioTransport(
17 |         command="python",
18 |         args=["-m", "mcp_simple_arxiv"]
19 |     )
20 |     
21 |     # Create a client with the transport
22 |     client = Client(transport)
23 |     
24 |     async with client:
25 |         try:
26 |             # 1. List available tools
27 |             logging.info("--- Testing tools/list ---")
28 |             tools = await client.list_tools()
29 |             logging.info(f"Found {len(tools)} tools:")
30 |             for tool in tools:
31 |                 logging.info(f"- {tool.name}: {tool.description.splitlines()[0]}")
32 |             assert len(tools) == 4, "Expected 4 tools"
33 |             logging.info("✅ tools/list test PASSED")
34 |             
35 |             # 2. Test search_papers
36 |             logging.info("\n--- Testing search_papers ---")
37 |             query = "electron"
38 |             logging.info(f"Calling search_papers with query: '{query}'")
39 |             result = await client.call_tool("search_papers", {"query": query, "max_results": 2})
40 |             logging.info(f"Result:\n{result.data}")
41 |             assert "Search Results" in result.data
42 |             logging.info("✅ search_papers test PASSED")
43 | 
44 |             # 3. Test get_paper_data
45 |             logging.info("\n--- Testing get_paper_data ---")
46 |             paper_id = "0808.3772" # A known paper
47 |             logging.info(f"Calling get_paper_data with paper_id: '{paper_id}'")
48 |             result = await client.call_tool("get_paper_data", {"paper_id": paper_id})
49 |             logging.info(f"Result:\n{result.data}")
50 |             assert "A common mass scale for satellite galaxies of the Milky Way" in result.data
51 |             logging.info("✅ get_paper_data test PASSED")
52 |             
53 |             # 4. Test list_categories
54 |             logging.info("\n--- Testing list_categories ---")
55 |             logging.info("Calling list_categories without a filter...")
56 |             result = await client.call_tool("list_categories")
57 |             logging.info(f"Result snippet:\n{result.data[:300]}...")
58 |             assert "arXiv Categories" in result.data
59 |             logging.info("✅ list_categories (no filter) test PASSED")
60 |             
61 |             logging.info("Calling list_categories with filter 'cs'...")
62 |             result = await client.call_tool("list_categories", {"primary_category": "cs"})
63 |             logging.info(f"Result snippet:\n{result.data[:300]}...")
64 |             assert "cs: Computer Science" in result.data
65 |             assert "math: Mathematics" not in result.data
66 |             logging.info("✅ list_categories (with filter) test PASSED")
67 | 
68 |             # 5. Test update_categories - This might take a moment
69 |             logging.info("\n--- Testing update_categories ---")
70 |             logging.info("Calling update_categories...")
71 |             result = await client.call_tool("update_categories")
72 |             logging.info(f"Result:\n{result.data}")
73 |             assert "Successfully updated category taxonomy" in result.data
74 |             logging.info("✅ update_categories test PASSED")
75 | 
76 |         except Exception as e:
77 |             logging.error(f"An error occurred during testing: {e}", exc_info=True)
78 |         finally:
79 |             logging.info("\nTest run finished.")
80 | 
81 | if __name__ == "__main__":
82 |     asyncio.run(main())  # drive the async test routine to completion on a fresh event loop


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | 


--------------------------------------------------------------------------------
/DEPLOYMENT.md:
--------------------------------------------------------------------------------
  1 | # Web Server Deployment Guide
  2 | 
  3 | This guide explains how to run the network-hostable version of the `mcp-simple-arxiv` server. This version uses the MCP Streamable HTTP transport and is ideal for deployments where clients connect over a network.
  4 | 
  5 | ## Overview
  6 | 
  7 | The web server is a stateless service that exposes four tools for interacting with the arXiv API:
  8 | - `search_papers`: Search for papers by keyword.
  9 | - `get_paper_data`: Fetch detailed information for a specific paper ID.
 10 | - `list_categories`: List the available arXiv subject categories.
 11 | - `update_categories`: Refresh the locally cached category list from arXiv.
 12 | 
 13 | It runs using FastMCP’s built-in web server (based on Uvicorn/Starlette).
 14 | 
 15 | ## Local Development
 16 | 
 17 | ### 1. Install Dependencies
 18 | 
 19 | First, ensure you have a virtual environment set up and the project installed in editable mode:
 20 | ```bash
 21 | python3 -m venv venv
 22 | source venv/bin/activate
 23 | pip install -e .
 24 | ```
 25 | 
 26 | ### 2. Run the Server
 27 | 
 28 | The server can be run directly using its Python module:
 29 | ```bash
 30 | python -m mcp_simple_arxiv.web_server
 31 | ```
 32 | This will start the server on `http://0.0.0.0:8000`.
 33 | 
 34 | ### 3. Test the Server
 35 | 
 36 | You can test the running server from your command line using `curl`.
 37 | 
 38 | **List Tools Request:**
 39 | ```bash
 40 | curl -X POST http://127.0.0.1:8000/mcp/ \
 41 |   -H "Content-Type: application/json" \
 42 |   -H "Accept: application/json, text/event-stream" \
 43 |   -d '{"jsonrpc":"2.0","id":1,"method":"tools/list"}'
 44 | ```
 45 | 
 46 | **Tool Call Request (`search_papers`):**
 47 | ```bash
 48 | curl -X POST http://127.0.0.1:8000/mcp/ \
 49 |   -H "Content-Type: application/json" \
 50 |   -H "Accept: application/json, text/event-stream" \
 51 |   -d '{"jsonrpc":"2.0","id":2,"method":"tools/call","params":{"name":"search_papers","arguments":{"query": "quantum computing"}}}'
 52 | ```
 53 | 
 54 | ## Docker Deployment
 55 | 
 56 | The project includes a `Dockerfile.web` for easy containerization.
 57 | 
 58 | ### 1. Build the Docker Image
 59 | ```bash
 60 | docker build -f Dockerfile.web -t mcp-simple-arxiv:web .
 61 | ```
 62 | 
 63 | ### 2. Run the Docker Container
 64 | 
 65 | To run the container for use with a local reverse proxy (like Apache or Nginx), you should map the container’s port only to the host’s loopback interface:
 66 | ```bash
 67 | docker run -d -p 127.0.0.1:8000:8000 --name arxiv-web mcp-simple-arxiv:web
 68 | ```
 69 | This command does two important things:
 70 | 1.  It runs the container in detached mode (`-d`).
 71 | 2.  It maps port 8000 inside the container to port 8000 on the host machine’s `localhost` interface only (`-p 127.0.0.1:8000:8000`). This ensures the server is not directly accessible from the network, which is the recommended setup when placing it behind a reverse proxy.
 72 | 
 73 | For persistence, you can set the container to restart automatically:
 74 | ```bash
 75 | docker run -d --restart always -p 127.0.0.1:8000:8000 --name arxiv-web mcp-simple-arxiv:web
 76 | ```
 77 | 
 78 | ### 3. Transferring the Image
 79 | 
 80 | If you built the image on a different machine, you can package it for transfer:
 81 | ```bash
 82 | # On the source machine
 83 | docker save -o mcp-simple-arxiv-web.tar mcp-simple-arxiv:web
 84 | gzip mcp-simple-arxiv-web.tar
 85 | 
 86 | # On the destination machine
 87 | gunzip mcp-simple-arxiv-web.tar.gz
 88 | docker load -i mcp-simple-arxiv-web.tar
 89 | ```
 90 | 
 91 | ## Changing the Port
 92 | 
 93 | The server is configured to run on port 8000 inside the container. To map it to a different host port, change the host-port value in the `-p` parameter. The format is `-p [<host_ip>:]<host_port>:<container_port>`.
 94 | ```bash
 95 | # Map container's port 8000 to host's port 9001
 96 | docker run -d -p 127.0.0.1:9001:8000 --name arxiv-web mcp-simple-arxiv:web
 97 | ```
 98 | The server will now be accessible at `http://127.0.0.1:9001` on the host.
 99 | 
100 | ## Apache Reverse Proxy Configuration
101 | 
102 | **Important**: MCP clients may request URLs with or without trailing slashes. Your Apache configuration must handle both cases to avoid 404 errors.
103 | 
104 | Example Configuration:
105 | ```apache
106 | <VirtualHost *:443>
107 |     ServerName mcp.yourdomain.com
108 | 
109 |     # SSL Configuration (recommended for production)
110 |     # SSLEngine on
111 |     # SSLCertificateFile /path/to/cert.pem
112 |     # SSLCertificateKeyFile /path/to/key.pem
113 | 
114 |     # Main proxy configuration
115 |     <Location /arxiv>
116 |         ProxyPass http://127.0.0.1:8000/mcp/
117 |         ProxyPassReverse http://127.0.0.1:8000/mcp/
118 |     </Location>
119 | </VirtualHost>
120 | ```
121 | This configuration will make your arXiv MCP server available at `https://mcp.yourdomain.com/arxiv`. 


--------------------------------------------------------------------------------
/mcp_simple_arxiv/taxonomy.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cs": {
  3 |     "name": "Computer Science",
  4 |     "subcategories": {
  5 |       "AI": "Artificial Intelligence",
  6 |       "CL": "Computation and Language",
  7 |       "CC": "Computational Complexity",
  8 |       "CE": "Computational Engineering, Finance, and Science",
  9 |       "CG": "Computational Geometry",
 10 |       "GT": "Computer Science and Game Theory",
 11 |       "CV": "Computer Vision and Pattern Recognition",
 12 |       "CY": "Computers and Society",
 13 |       "CR": "Cryptography and Security",
 14 |       "DB": "Databases",
 15 |       "DS": "Data Structures and Algorithms",
 16 |       "DL": "Digital Libraries",
 17 |       "DM": "Discrete Mathematics",
 18 |       "DC": "Distributed Computing",
 19 |       "ET": "Emerging Technologies",
 20 |       "FL": "Formal Languages and Automata Theory",
 21 |       "GL": "General Literature",
 22 |       "GR": "Graphics",
 23 |       "AR": "Hardware Architecture",
 24 |       "HC": "Human-Computer Interaction",
 25 |       "IR": "Information Retrieval",
 26 |       "IT": "Information Theory",
 27 |       "LG": "Machine Learning",
 28 |       "LO": "Logic in Computer Science",
 29 |       "MS": "Mathematical Software",
 30 |       "MA": "Multiagent Systems",
 31 |       "MM": "Multimedia",
 32 |       "NI": "Networking and Internet Architecture",
 33 |       "NE": "Neural and Evolutionary Computing",
 34 |       "NA": "Numerical Analysis",
 35 |       "OS": "Operating Systems",
 36 |       "OH": "Other Computer Science",
 37 |       "PF": "Performance",
 38 |       "PL": "Programming Languages",
 39 |       "RO": "Robotics",
 40 |       "SI": "Social and Information Networks",
 41 |       "SE": "Software Engineering",
 42 |       "SD": "Sound",
 43 |       "SC": "Symbolic Computation",
 44 |       "SY": "Systems and Control"
 45 |     }
 46 |   },
 47 |   "econ": {
 48 |     "name": "Economics",
 49 |     "subcategories": {
 50 |       "EM": "Econometrics",
 51 |       "GN": "General Economics",
 52 |       "TH": "Theoretical Economics"
 53 |     }
 54 |   },
 55 |   "eess": {
 56 |     "name": "Electrical Engineering and Systems Science",
 57 |     "subcategories": {
 58 |       "AS": "Audio and Speech Processing",
 59 |       "IV": "Image and Video Processing",
 60 |       "SP": "Signal Processing",
 61 |       "SY": "Systems and Control"
 62 |     }
 63 |   },
 64 |   "math": {
 65 |     "name": "Mathematics",
 66 |     "subcategories": {
 67 |       "AG": "Algebraic Geometry",
 68 |       "AT": "Algebraic Topology",
 69 |       "AP": "Analysis of PDEs",
 70 |       "CT": "Category Theory",
 71 |       "CA": "Classical Analysis and ODEs",
 72 |       "CO": "Combinatorics",
 73 |       "AC": "Commutative Algebra",
 74 |       "CV": "Complex Variables",
 75 |       "DG": "Differential Geometry",
 76 |       "DS": "Dynamical Systems",
 77 |       "FA": "Functional Analysis",
 78 |       "GM": "General Mathematics",
 79 |       "GN": "General Topology",
 80 |       "GT": "Geometric Topology",
 81 |       "GR": "Group Theory",
 82 |       "HO": "History and Overview",
 83 |       "IT": "Information Theory",
 84 |       "KT": "K-Theory and Homology",
 85 |       "LO": "Logic",
 86 |       "MP": "Mathematical Physics",
 87 |       "MG": "Metric Geometry",
 88 |       "NT": "Number Theory",
 89 |       "NA": "Numerical Analysis",
 90 |       "OA": "Operator Algebras",
 91 |       "OC": "Optimization and Control",
 92 |       "PR": "Probability",
 93 |       "QA": "Quantum Algebra",
 94 |       "RT": "Representation Theory",
 95 |       "RA": "Rings and Algebras",
 96 |       "SP": "Spectral Theory",
 97 |       "ST": "Statistics Theory",
 98 |       "SG": "Symplectic Geometry"
 99 |     }
100 |   },
101 |   "physics": {
102 |     "name": "Physics",
103 |     "subcategories": {
104 |       "acc-ph": "Accelerator Physics",
105 |       "ao-ph": "Atmospheric and Oceanic Physics",
106 |       "atom-ph": "Atomic Physics",
107 |       "bio-ph": "Biological Physics",
108 |       "chem-ph": "Chemical Physics",
109 |       "class-ph": "Classical Physics",
110 |       "comp-ph": "Computational Physics",
111 |       "data-an": "Data Analysis, Statistics and Probability",
112 |       "flu-dyn": "Fluid Dynamics",
113 |       "gen-ph": "General Physics",
114 |       "geo-ph": "Geophysics",
115 |       "hist-ph": "History and Philosophy of Physics",
116 |       "ins-det": "Instrumentation and Detectors",
117 |       "med-ph": "Medical Physics",
118 |       "optics": "Optics",
119 |       "ed-ph": "Physics Education",
120 |       "soc-ph": "Physics and Society",
121 |       "plasm-ph": "Plasma Physics",
122 |       "pop-ph": "Popular Physics",
123 |       "space-ph": "Space Physics"
124 |     }
125 |   },
126 |   "q-bio": {
127 |     "name": "Quantitative Biology",
128 |     "subcategories": {
129 |       "BM": "Biomolecules",
130 |       "CB": "Cell Behavior",
131 |       "GN": "Genomics",
132 |       "MN": "Molecular Networks",
133 |       "NC": "Neurons and Cognition",
134 |       "OT": "Other Quantitative Biology",
135 |       "PE": "Populations and Evolution",
136 |       "QM": "Quantitative Methods",
137 |       "SC": "Subcellular Processes",
138 |       "TO": "Tissues and Organs"
139 |     }
140 |   },
141 |   "q-fin": {
142 |     "name": "Quantitative Finance",
143 |     "subcategories": {
144 |       "CP": "Computational Finance",
145 |       "EC": "Economics",
146 |       "GN": "General Finance",
147 |       "MF": "Mathematical Finance",
148 |       "PM": "Portfolio Management",
149 |       "PR": "Pricing of Securities",
150 |       "RM": "Risk Management",
151 |       "ST": "Statistical Finance",
152 |       "TR": "Trading and Market Microstructure"
153 |     }
154 |   },
155 |   "stat": {
156 |     "name": "Statistics",
157 |     "subcategories": {
158 |       "AP": "Applications",
159 |       "CO": "Computation",
160 |       "ME": "Methodology",
161 |       "ML": "Machine Learning",
162 |       "OT": "Other Statistics",
163 |       "TH": "Statistics Theory"
164 |     }
165 |   }
166 | }


--------------------------------------------------------------------------------
/test_web_client.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import httpx
  3 | import logging
  4 | import subprocess
  5 | import time
  6 | import sys
  7 | import signal
  8 | 
# Configure logging: INFO level, with timestamp and level name on every record.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Endpoint that every JSON-RPC request in this script is POSTed to.
SERVER_URL = "http://127.0.0.1:8000/mcp/"
# Headers sent with every request; the Accept header lists both plain JSON and
# Server-Sent Events as acceptable response formats.
HEADERS = {
    "Accept": "application/json, text/event-stream",
    "Content-Type": "application/json"
}
 17 | 
 18 | async def check_server_ready(client: httpx.AsyncClient):
 19 |     """Polls the server until it is ready to accept connections."""
 20 |     for _ in range(20):  # Poll for up to 10 seconds
 21 |         try:
 22 |             response = await client.post(SERVER_URL, json={"jsonrpc": "2.0", "id": 0, "method": "tools/list"}, headers=HEADERS)
 23 |             if response.status_code == 200:
 24 |                 logging.info("Web server is up and running.")
 25 |                 return True
 26 |         except httpx.ConnectError:
 27 |             pass
 28 |         await asyncio.sleep(0.5)
 29 |     logging.error("Web server did not start in time.")
 30 |     return False
 31 | 
 32 | async def call_tool(client: httpx.AsyncClient, tool_name: str, params: dict = None) -> dict:
 33 |     """Helper function to call a tool via JSON-RPC."""
 34 |     method = "tools/call" if tool_name != "tools/list" else "tools/list"
 35 |     
 36 |     payload = {
 37 |         "jsonrpc": "2.0",
 38 |         "id": 1,
 39 |         "method": method,
 40 |     }
 41 | 
 42 |     if method == "tools/call":
 43 |         payload["params"] = {"name": tool_name, "arguments": params or {}}
 44 |     
 45 |     response = await client.post(SERVER_URL, json=payload, headers=HEADERS)
 46 |     response.raise_for_status()
 47 |     # The response is Server-Sent Events, we need to parse it
 48 |     for line in response.text.strip().split('\n'):
 49 |         if line.startswith('data:'):
 50 |             import json
 51 |             return json.loads(line[len('data:'):].strip())
 52 |     raise ValueError("Did not receive a valid data event from the server.")
 53 | 
 54 | 
 55 | async def main():
 56 |     """
 57 |     Test client for the mcp-simple-arxiv web server.
 58 |     Starts the server, runs tests, and then stops it.
 59 |     """
 60 |     server_process = None
 61 |     try:
 62 |         logging.info("Starting web server process...")
 63 |         # Start the server as a subprocess
 64 |         server_process = subprocess.Popen(
 65 |             [sys.executable, "-m", "mcp_simple_arxiv.web_server"],
 66 |             stdout=sys.stdout,
 67 |             stderr=sys.stderr
 68 |         )
 69 | 
 70 |         async with httpx.AsyncClient(timeout=30.0) as client:
 71 |             if not await check_server_ready(client):
 72 |                 raise RuntimeError("Could not connect to the web server.")
 73 | 
 74 |             # 1. List available tools
 75 |             logging.info("\n--- Testing tools/list ---")
 76 |             response_json = await call_tool(client, "tools/list") # Using call_tool to simplify logic
 77 |             tools = response_json['result']['tools']
 78 |             logging.info(f"Found {len(tools)} tools.")
 79 |             assert len(tools) == 4
 80 |             logging.info("✅ tools/list test PASSED")
 81 | 
 82 |             # 2. Test search_papers
 83 |             logging.info("\n--- Testing search_papers ---")
 84 |             query = "dark matter"
 85 |             response_json = await call_tool(client, "search_papers", {"query": query, "max_results": 1})
 86 |             result = response_json['result']['structuredContent']['result']
 87 |             logging.info(f"Result for '{query}':\n{result}")
 88 |             assert "Search Results" in result
 89 |             logging.info("✅ search_papers test PASSED")
 90 | 
 91 |             # 3. Test get_paper_data
 92 |             logging.info("\n--- Testing get_paper_data ---")
 93 |             paper_id = "0808.3772"  # Using the same ID as the stdio test for consistency
 94 |             response_json = await call_tool(client, "get_paper_data", {"paper_id": paper_id})
 95 |             result = response_json['result']['structuredContent']['result']
 96 |             logging.info(f"Result for paper '{paper_id}':\n{result}")
 97 |             assert "A common mass scale for satellite galaxies of the Milky Way" in result
 98 |             logging.info("✅ get_paper_data test PASSED")
 99 | 
100 |             # 4. Test list_categories
101 |             logging.info("\n--- Testing list_categories ---")
102 |             response_json = await call_tool(client, "list_categories")
103 |             result = response_json['result']['structuredContent']['result']
104 |             logging.info("Result snippet:\n" + result[:200] + "...")
105 |             assert "arXiv Categories" in result
106 |             logging.info("✅ list_categories test PASSED")
107 | 
108 |             # 5. Test update_categories
109 |             logging.info("\n--- Testing update_categories ---")
110 |             response_json = await call_tool(client, "update_categories")
111 |             result = response_json['result']['structuredContent']['result']
112 |             logging.info(f"Result:\n{result}")
113 |             assert "Successfully updated category taxonomy" in result
114 |             logging.info("✅ update_categories test PASSED")
115 | 
116 |     except Exception as e:
117 |         logging.error(f"An error occurred during testing: {e}", exc_info=True)
118 |     finally:
119 |         if server_process:
120 |             logging.info("\nStopping web server process...")
121 |             server_process.send_signal(signal.SIGINT)  # Send Ctrl+C
122 |             try:
123 |                 server_process.wait(timeout=10)
124 |                 logging.info("Web server stopped gracefully.")
125 |             except subprocess.TimeoutExpired:
126 |                 logging.warning("Web server did not stop gracefully, killing.")
127 |                 server_process.kill()
128 |         logging.info("Test run finished.")
129 | 
# Script entry point: run the async test sequence on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())


--------------------------------------------------------------------------------
/mcp_simple_arxiv/categories.py:
--------------------------------------------------------------------------------
  1 | """
  2 | arXiv category taxonomy.
  3 | """
  4 | 
# Static arXiv taxonomy table.
#
# Each top-level key is a primary category prefix (e.g. "cs", "math"). Its
# value holds the human-readable "name" and a "subcategories" dict mapping a
# subcategory code to its description. The server's listing output joins the
# two as "<primary>.<code>" (e.g. "cs.AI").
#
# NOTE(review): server.py obtains its taxonomy through load_taxonomy() from
# update_taxonomy rather than from this module - confirm whether this static
# copy is still used anywhere.
CATEGORIES = {
    "cs": {
        "name": "Computer Science",
        "subcategories": {
            "AI": "Artificial Intelligence",
            "CL": "Computation and Language",
            "CC": "Computational Complexity",
            "CE": "Computational Engineering, Finance, and Science",
            "CG": "Computational Geometry",
            "GT": "Computer Science and Game Theory",
            "CV": "Computer Vision and Pattern Recognition",
            "CY": "Computers and Society",
            "CR": "Cryptography and Security",
            "DB": "Databases",
            "DS": "Data Structures and Algorithms",
            "DL": "Digital Libraries",
            "DM": "Discrete Mathematics",
            "DC": "Distributed Computing",
            "ET": "Emerging Technologies",
            "FL": "Formal Languages and Automata Theory",
            "GL": "General Literature",
            "GR": "Graphics",
            "AR": "Hardware Architecture",
            "HC": "Human-Computer Interaction",
            "IR": "Information Retrieval",
            "IT": "Information Theory",
            "LG": "Machine Learning",
            "LO": "Logic in Computer Science",
            "MS": "Mathematical Software",
            "MA": "Multiagent Systems",
            "MM": "Multimedia",
            "NI": "Networking and Internet Architecture",
            "NE": "Neural and Evolutionary Computing",
            "NA": "Numerical Analysis",
            "OS": "Operating Systems",
            "OH": "Other Computer Science",
            "PF": "Performance",
            "PL": "Programming Languages",
            "RO": "Robotics",
            "SI": "Social and Information Networks",
            "SE": "Software Engineering",
            "SD": "Sound",
            "SC": "Symbolic Computation",
            "SY": "Systems and Control",
        }
    },
    "econ": {
        "name": "Economics",
        "subcategories": {
            "EM": "Econometrics",
            "GN": "General Economics",
            "TH": "Theoretical Economics",
        }
    },
    "eess": {
        "name": "Electrical Engineering and Systems Science",
        "subcategories": {
            "AS": "Audio and Speech Processing",
            "IV": "Image and Video Processing",
            "SP": "Signal Processing",
            "SY": "Systems and Control",
        }
    },
    "math": {
        "name": "Mathematics",
        "subcategories": {
            "AG": "Algebraic Geometry",
            "AT": "Algebraic Topology",
            "AP": "Analysis of PDEs",
            "CT": "Category Theory",
            "CA": "Classical Analysis and ODEs",
            "CO": "Combinatorics",
            "AC": "Commutative Algebra",
            "CV": "Complex Variables",
            "DG": "Differential Geometry",
            "DS": "Dynamical Systems",
            "FA": "Functional Analysis",
            "GM": "General Mathematics",
            "GN": "General Topology",
            "GT": "Geometric Topology",
            "GR": "Group Theory",
            "HO": "History and Overview",
            "IT": "Information Theory",
            "KT": "K-Theory and Homology",
            "LO": "Logic",
            "MP": "Mathematical Physics",
            "MG": "Metric Geometry",
            "NT": "Number Theory",
            "NA": "Numerical Analysis",
            "OA": "Operator Algebras",
            "OC": "Optimization and Control",
            "PR": "Probability",
            "QA": "Quantum Algebra",
            "RT": "Representation Theory",
            "RA": "Rings and Algebras",
            "SP": "Spectral Theory",
            "ST": "Statistics Theory",
            "SG": "Symplectic Geometry",
        }
    },
    "physics": {
        "name": "Physics",
        "subcategories": {
            "acc-ph": "Accelerator Physics",
            "ao-ph": "Atmospheric and Oceanic Physics",
            "atom-ph": "Atomic Physics",
            "bio-ph": "Biological Physics",
            "chem-ph": "Chemical Physics",
            "class-ph": "Classical Physics",
            "comp-ph": "Computational Physics",
            "data-an": "Data Analysis, Statistics and Probability",
            "flu-dyn": "Fluid Dynamics",
            "gen-ph": "General Physics",
            "geo-ph": "Geophysics",
            "hist-ph": "History and Philosophy of Physics",
            "ins-det": "Instrumentation and Detectors",
            "med-ph": "Medical Physics",
            "optics": "Optics",
            "ed-ph": "Physics Education",
            "soc-ph": "Physics and Society",
            "plasm-ph": "Plasma Physics",
            "pop-ph": "Popular Physics",
            "space-ph": "Space Physics",
        }
    },
    "q-bio": {
        "name": "Quantitative Biology",
        "subcategories": {
            "BM": "Biomolecules",
            "CB": "Cell Behavior",
            "GN": "Genomics",
            "MN": "Molecular Networks",
            "NC": "Neurons and Cognition",
            "OT": "Other Quantitative Biology",
            "PE": "Populations and Evolution",
            "QM": "Quantitative Methods",
            "SC": "Subcellular Processes",
            "TO": "Tissues and Organs",
        }
    },
    "q-fin": {
        "name": "Quantitative Finance",
        "subcategories": {
            "CP": "Computational Finance",
            "EC": "Economics",
            "GN": "General Finance",
            "MF": "Mathematical Finance",
            "PM": "Portfolio Management",
            "PR": "Pricing of Securities",
            "RM": "Risk Management",
            "ST": "Statistical Finance",
            "TR": "Trading and Market Microstructure",
        }
    },
    "stat": {
        "name": "Statistics",
        "subcategories": {
            "AP": "Applications",
            "CO": "Computation",
            "ME": "Methodology",
            "ML": "Machine Learning",
            "OT": "Other Statistics",
            "TH": "Statistics Theory",
        }
    },
}


--------------------------------------------------------------------------------
/mcp_simple_arxiv/server.py:
--------------------------------------------------------------------------------
  1 | """
  2 | MCP server for accessing arXiv papers.
  3 | """
  4 | 
import sys
# Switch stdout and stdin to UTF-8 before the remaining imports run.
# NOTE(review): presumably guards against non-UTF-8 platform default
# encodings on the stdio transport - confirm.
sys.stdout.reconfigure(encoding='utf-8')
sys.stdin.reconfigure(encoding='utf-8')
  8 | 
  9 | import asyncio
 10 | import logging
 11 | 
 12 | from fastmcp import FastMCP
 13 | 
 14 | from .arxiv_client import ArxivClient
 15 | from .update_taxonomy import load_taxonomy, update_taxonomy_file
 16 | 
 17 | logger = logging.getLogger(__name__)
 18 | 
 19 | def get_first_sentence(text: str, max_len: int = 200) -> str:
 20 |     """Extract first sentence from text, limiting length."""
 21 |     # Look for common sentence endings
 22 |     for end in ['. ', '! ', '? ']:
 23 |         pos = text.find(end)
 24 |         if pos != -1 and pos < max_len:
 25 |             return text[:pos + 1]
 26 |     # If no sentence ending found, just take first max_len chars
 27 |     if len(text) > max_len:
 28 |         return text[:max_len].rstrip() + '...'
 29 |     return text
 30 | 
 31 | def create_app():
 32 |     """Creates and configures the FastMCP app instance and its tools."""
 33 |     app = FastMCP("arxiv-server")
 34 |     arxiv_client = ArxivClient()
 35 | 
 36 |     @app.tool(
 37 |         annotations={
 38 |             "title": "Search arXiv Papers",
 39 |             "readOnlyHint": True,
 40 |             "openWorldHint": True
 41 |         }
 42 |     )
 43 |     async def search_papers(query: str, max_results: int = 10) -> str:
 44 |         """
 45 | Search for papers on arXiv by title and abstract content.
 46 | 
 47 | You can use advanced search syntax:
 48 | - Search in title: ti:"search terms"
 49 | - Search in abstract: abs:"search terms"
 50 | - Search by author: au:"author name"
 51 | - Combine terms with: AND, OR, ANDNOT
 52 | - Filter by category: cat:cs.AI (use list_categories tool to see available categories)
 53 | 
 54 | Examples:
 55 | - "machine learning"  (searches all fields)
 56 | - ti:"neural networks" AND cat:cs.AI  (title with category)
 57 | - au:bengio AND ti:"deep learning"  (author and title)
 58 |         """
 59 |         max_results = min(max_results, 50)
 60 |         papers = await arxiv_client.search(query, max_results)
 61 |         
 62 |         # Format results in a readable way
 63 |         result = "Search Results:\n\n"
 64 |         for i, paper in enumerate(papers, 1):
 65 |             result += f"{i}. {paper['title']}\n"
 66 |             result += f"   Authors: {', '.join(paper['authors'])}\n"
 67 |             result += f"   ID: {paper['id']}\n"
 68 |             result += f"   Categories: "
 69 |             if paper['primary_category']:
 70 |                 result += f"Primary: {paper['primary_category']}"
 71 |             if paper['categories']:
 72 |                 result += f", Additional: {', '.join(paper['categories'])}"
 73 |             result += f"\n   Published: {paper['published']}\n"
 74 |             
 75 |             # Add first sentence of abstract
 76 |             abstract_preview = get_first_sentence(paper['summary'])
 77 |             result += f"   Preview: {abstract_preview}\n"
 78 |             result += "\n"
 79 |         
 80 |         return result
 81 | 
 82 |     @app.tool(
 83 |         annotations={
 84 |             "title": "Get arXiv Paper Data",
 85 |             "readOnlyHint": True,
 86 |             "openWorldHint": True
 87 |         }
 88 |     )
 89 |     async def get_paper_data(paper_id: str) -> str:
 90 |         """Get detailed information about a specific paper including abstract and available formats."""
 91 |         paper = await arxiv_client.get_paper(paper_id)
 92 |         
 93 |         # Format paper details in a readable way with clear sections
 94 |         result = f"Title: {paper['title']}\n\n"
 95 |         
 96 |         # Metadata section
 97 |         result += "Metadata:\n"
 98 |         result += f"- Authors: {', '.join(paper['authors'])}\n"
 99 |         result += f"- Published: {paper['published']}\n"
100 |         result += f"- Last Updated: {paper['updated']}\n"
101 |         result += "- Categories: "
102 |         if paper['primary_category']:
103 |             result += f"Primary: {paper['primary_category']}"
104 |         if paper['categories']:
105 |             result += f", Additional: {', '.join(paper['categories'])}"
106 |         result += "\n"
107 |         
108 |         if paper['doi']:
109 |             result += f"- DOI: {paper['doi']}\n"
110 |         if paper["journal_ref"]:
111 |             result += f"- Journal Reference: {paper['journal_ref']}\n"
112 |         
113 |         # Abstract section
114 |         result += "\nAbstract:\n"
115 |         result += paper["summary"]
116 |         result += "\n"
117 |         
118 |         # Access options section
119 |         result += "\nAccess Options:\n"
120 |         result += "- Abstract page: " + paper["abstract_url"] + "\n"
121 |         if paper["html_url"]:  # Add HTML version if available
122 |             result += "- Full text HTML version: " + paper["html_url"] + "\n"
123 |         result += "- PDF version: " + paper["pdf_url"] + "\n"
124 |         
125 |         # Additional information section
126 |         if paper["comment"] or "code" in paper["comment"].lower():
127 |             result += "\nAdditional Information:\n"
128 |             if paper["comment"]:
129 |                 result += "- Comment: " + paper["comment"] + "\n"
130 |                 
131 |         return result
132 | 
133 |     @app.tool(
134 |         annotations={
135 |             "title": "List arXiv Categories",
136 |             "readOnlyHint": True,
137 |             "openWorldHint": False
138 |         }
139 |     )
140 |     def list_categories(primary_category: str = None) -> str:
141 |         """List all available arXiv categories and how to use them in search."""
142 |         try:
143 |             taxonomy = load_taxonomy()
144 |         except Exception as e:
145 |             logger.error(f"Error loading taxonomy: {e}")
146 |             return f"Error loading category taxonomy. Try using update_categories tool to refresh it."
147 | 
148 |         result = "arXiv Categories:\n\n"
149 |         
150 |         for primary, data in taxonomy.items():
151 |             if primary_category and primary != primary_category:
152 |                 continue
153 |                 
154 |             result += f"{primary}: {data['name']}\n"
155 |             for code, desc in data['subcategories'].items():
156 |                 result += f"  {primary}.{code}: {desc}\n"
157 |             result += "\n"
158 |             
159 |         result += "\nUsage in search:\n"
160 |         result += '- Search in specific category: cat:cs.AI\n'
161 |         result += '- Combine with other terms: "neural networks" AND cat:cs.AI\n'
162 |         result += '- Multiple categories: (cat:cs.AI OR cat:cs.LG)\n'
163 |         result += '\nNote: If categories seem outdated, use the update_categories tool to refresh them.\n'
164 |         
165 |         return result
166 | 
167 |     @app.tool(
168 |         annotations={
169 |             "title": "Update arXiv Categories",
170 |             "readOnlyHint": False,
171 |             "openWorldHint": True
172 |         }
173 |     )
174 |     def update_categories() -> str:
175 |         """Update the stored category taxonomy by fetching the latest version from arxiv.org"""
176 |         try:
177 |             taxonomy = update_taxonomy_file()
178 |             result = "Successfully updated category taxonomy.\n\n"
179 |             result += f"Found {len(taxonomy)} primary categories:\n"
180 |             for primary, data in taxonomy.items():
181 |                 result += f"- {primary}: {data['name']} ({len(data['subcategories'])} subcategories)\n"
182 |             return result
183 |         except Exception as e:
184 |             logger.error(f"Error updating taxonomy: {e}")
185 |             # FastMCP will handle raising this as a proper JSON-RPC error
186 |             raise e
187 |             
188 |     return app
189 | 
# Module-level app instance, built once at import time; main() below runs it.
app = create_app()
191 | 
def main():
    """Start the module-level FastMCP ``app`` by calling its ``run()`` with no arguments."""
    # NOTE(review): mcp_simple_arxiv/__init__.py passes this function's return
    # value to asyncio.run(), but this function is synchronous and returns
    # None - confirm that package entry point.
    app.run()
195 | 
# Direct execution: enable INFO-level logging, then start the server.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    main()


--------------------------------------------------------------------------------
/mcp_simple_arxiv/arxiv_client.py:
--------------------------------------------------------------------------------
  1 | """
  2 | arXiv API client with rate limiting.
  3 | """
  4 | 
  5 | import asyncio
  6 | import logging
  7 | from datetime import datetime, timedelta
  8 | import feedparser
  9 | import httpx
 10 | from typing import Optional, Dict, List, Any
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | class ArxivClient:
 15 |     """
 16 |     arXiv API client with built-in rate limiting.
 17 |     Ensures no more than 1 request every 3 seconds.
 18 |     """
 19 |     
 20 |     def __init__(self):
 21 |         self.base_url = "https://export.arxiv.org/api/query"
 22 |         self._last_request: Optional[datetime] = None
 23 |         self._lock = asyncio.Lock()
 24 |         
 25 |     async def _wait_for_rate_limit(self) -> None:
 26 |         """Ensures we respect arXiv's rate limit of 1 request every 3 seconds."""
 27 |         async with self._lock:
 28 |             if self._last_request is not None:
 29 |                 elapsed = datetime.now() - self._last_request
 30 |                 if elapsed < timedelta(seconds=3):
 31 |                     await asyncio.sleep(3 - elapsed.total_seconds())
 32 |             self._last_request = datetime.now()
 33 | 
 34 |     def _clean_text(self, text: str) -> str:
 35 |         """Clean up text by removing extra whitespace and newlines."""
 36 |         return " ".join(text.split())
 37 | 
 38 |     def _get_html_url(self, arxiv_id: str) -> str:
 39 |         """
 40 |         Construct HTML version URL for a paper.
 41 |         
 42 |         The HTML version URL is not provided by the API but can be constructed
 43 |         by modifying the PDF URL pattern.
 44 |         """
 45 |         # Remove version suffix if present (e.g., v1, v2)
 46 |         base_id = arxiv_id.split('v')[0]
 47 |         return f"https://arxiv.org/html/{arxiv_id}"
 48 | 
 49 |     def _parse_entry(self, entry: Dict[str, Any]) -> Dict[str, Any]:
 50 |         """Parse a feed entry into a paper dictionary."""
 51 |         # Extract PDF and HTML links
 52 |         pdf_url = None
 53 |         abstract_url = None  # This is the URL to the abstract page
 54 |         for link in entry.get('links', []):
 55 |             if isinstance(link, dict):
 56 |                 if link.get('type') == 'application/pdf':
 57 |                     pdf_url = link.get('href')
 58 |                 elif link.get('type') == 'text/html':
 59 |                     abstract_url = link.get('href')
 60 | 
 61 |         # Get paper ID
 62 |         paper_id = entry.get('id', '').split("/abs/")[-1].rstrip()
 63 |         
 64 |         # Create HTML version URL
 65 |         html_url = self._get_html_url(paper_id) if paper_id else None
 66 | 
 67 |         # Get authors
 68 |         authors = []
 69 |         for author in entry.get('authors', []):
 70 |             if isinstance(author, dict) and 'name' in author:
 71 |                 authors.append(author['name'])
 72 |             elif hasattr(author, 'name'):
 73 |                 authors.append(author.name)
 74 | 
 75 |         # Get categories
 76 |         categories = []
 77 |         primary_category = None
 78 |         
 79 |         # Get primary category
 80 |         if 'arxiv_primary_category' in entry:
 81 |             if isinstance(entry['arxiv_primary_category'], dict):
 82 |                 primary_category = entry['arxiv_primary_category'].get('term')
 83 |             elif hasattr(entry['arxiv_primary_category'], 'term'):
 84 |                 primary_category = entry['arxiv_primary_category'].term
 85 |         
 86 |         # Get all categories
 87 |         for category in entry.get('tags', []):
 88 |             if isinstance(category, dict) and 'term' in category:
 89 |                 categories.append(category['term'])
 90 |             elif hasattr(category, 'term'):
 91 |                 categories.append(category.term)
 92 | 
 93 |         # Remove primary category from regular categories if it's there
 94 |         if primary_category and primary_category in categories:
 95 |             categories.remove(primary_category)
 96 | 
 97 |         return {
 98 |             "id": paper_id,
 99 |             "title": self._clean_text(entry.get('title', '')),
100 |             "authors": authors,
101 |             "primary_category": primary_category,
102 |             "categories": categories,
103 |             "published": entry.get('published', ''),
104 |             "updated": entry.get('updated', ''),
105 |             "summary": self._clean_text(entry.get('summary', '')),
106 |             "comment": self._clean_text(entry.get('arxiv_comment', '')),
107 |             "journal_ref": entry.get('arxiv_journal_ref', ''),
108 |             "doi": entry.get('arxiv_doi', ''),
109 |             "pdf_url": pdf_url,
110 |             "abstract_url": abstract_url,  # URL to abstract page
111 |             "html_url": html_url  # URL to HTML version if available
112 |         }
113 | 
114 |     async def search(self, query: str, max_results: int = 10) -> List[Dict[str, Any]]:
115 |         """
116 |         Search arXiv papers.
117 |         
118 |         The query string supports arXiv's advanced search syntax:
119 |         - Search in title: ti:"search terms"
120 |         - Search in abstract: abs:"search terms"
121 |         - Search by author: au:"author name"
122 |         - Combine terms with: AND, OR, ANDNOT
123 |         - Filter by category: cat:cs.AI
124 |         
125 |         Examples:
126 |         - "machine learning"  (searches all fields)
127 |         - ti:"neural networks" AND cat:cs.AI  (title with category)
128 |         - au:bengio AND ti:"deep learning"  (author and title)
129 |         """
130 |         await self._wait_for_rate_limit()
131 |         
132 |         # Ensure max_results is within API limits
133 |         max_results = min(max_results, 2000)  # API limit: 2000 per request
134 |         
135 |         params = {
136 |             "search_query": query,
137 |             "max_results": max_results,
138 |             "sortBy": "submittedDate",  # Default to newest papers first
139 |             "sortOrder": "descending",
140 |         }
141 |         
142 |         async with httpx.AsyncClient(timeout=20.0) as client:
143 |             try:
144 |                 response = await client.get(self.base_url, params=params)
145 |                 response.raise_for_status() # Raise an exception for bad status codes
146 |                 
147 |                 # Parse the Atom feed response
148 |                 feed = feedparser.parse(response.text)
149 |                 
150 |                 if not isinstance(feed, dict) or 'entries' not in feed:
151 |                     logger.error("Invalid response from arXiv API")
152 |                     logger.debug(f"Response text: {response.text[:1000]}...")
153 |                     raise ValueError("Invalid response from arXiv API")
154 |                     
155 |                 if not feed.get('entries'):
156 |                     # Empty results are ok - return empty list
157 |                     return []
158 |                 
159 |                 return [self._parse_entry(entry) for entry in feed.entries]
160 |                 
161 |             except httpx.HTTPError as e:
162 |                 logger.error(f"HTTP error while searching: {e}")
163 |                 raise ValueError(f"arXiv API HTTP error: {str(e)}")
164 |             
165 |     async def get_paper(self, paper_id: str) -> Dict[str, Any]:
166 |         """
167 |         Get detailed information about a specific paper.
168 |         
169 |         Args:
170 |             paper_id: arXiv paper ID (e.g., "2103.08220")
171 |             
172 |         Returns:
173 |             Dictionary containing paper metadata, including:
174 |             - Basic metadata (title, authors, dates)
175 |             - Categories (primary and others)
176 |             - Abstract and comments
177 |             - URLs (abstract page, PDF version, HTML version if available)
178 |             - DOI if available
179 |         """
180 |         await self._wait_for_rate_limit()
181 |         
182 |         params = {
183 |             "id_list": paper_id,
184 |             "max_results": 1
185 |         }
186 |         
187 |         async with httpx.AsyncClient(timeout=20.0) as client:
188 |             try:
189 |                 response = await client.get(self.base_url, params=params)
190 |                 response.raise_for_status()
191 |                 
192 |                 feed = feedparser.parse(response.text)
193 |                 if not isinstance(feed, dict) or 'entries' not in feed:
194 |                     logger.error("Invalid response from arXiv API")
195 |                     logger.debug(f"Response text: {response.text[:1000]}...")
196 |                     raise ValueError("Invalid response from arXiv API")
197 |                 
198 |                 if not feed.get('entries'):
199 |                     raise ValueError(f"Paper not found: {paper_id}")
200 |                     
201 |                 return self._parse_entry(feed.entries[0])
202 |                 
203 |             except httpx.HTTPError as e:
204 |                 logger.error(f"HTTP error while fetching paper: {e}")
205 |                 raise ValueError(f"arXiv API HTTP error: {str(e)}")
206 | 


--------------------------------------------------------------------------------