├── src └── alex_mcp │ ├── __init__.py │ ├── data_objects.py │ └── server.py ├── img ├── oam_logo_avatar.png └── oam_logo_rectangular.png ├── alex-mcp-wrapper.sh ├── requirements.txt ├── LICENSE ├── pyproject.toml ├── examples ├── test_institution_resolution.py └── test_author_disambiguation.py ├── setup.py ├── .gitignore ├── INSTALL.md └── README.md /src/alex_mcp/__init__.py: -------------------------------------------------------------------------------- 1 | """OpenAlex MCP Server.""" 2 | __version__ = "4.1.0" -------------------------------------------------------------------------------- /img/oam_logo_avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drAbreu/alex-mcp/HEAD/img/oam_logo_avatar.png -------------------------------------------------------------------------------- /img/oam_logo_rectangular.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drAbreu/alex-mcp/HEAD/img/oam_logo_rectangular.png -------------------------------------------------------------------------------- /alex-mcp-wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Wrapper script for alex-mcp that activates the virtual environment 3 | 4 | # Get the directory where this script is located 5 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 6 | 7 | # Activate the virtual environment 8 | source "$SCRIPT_DIR/venv/bin/activate" 9 | 10 | # Run the MCP server 11 | exec python -m alex_mcp.server "$@" 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # OpenAlex + PubMed Hybrid Author Disambiguation MCP Server Dependencies 2 | # Following MCP best practices with FastMCP 3 | 4 | # FastMCP server framework (server.py imports `fastmcp`; pinned as in pyproject.toml) 5 | fastmcp>=2.8.1 6 | 7 | # HTTP client for OpenAlex API and ORCID integration 8 | httpx>=0.25.0 9 | aiohttp>=3.8.0 10 | 11 | # Optional: For enhanced logging and debugging 12 | rich>=13.0.0 13 | 14 | # OpenAlex API wrapper 15 | pyalex==0.18 16 | 17 | # PubMed API integration 18 | biopython>=1.83 19 | requests>=2.31.0 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Author Disambiguation MCP Server 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "alex-mcp" 7 | version = "4.8.2" 8 | description = "MCP server for OpenAlex academic research API" 9 | authors = [{name = "Jorge Abreu Vicente", email = "jorge.abreu@embo.org"}] 10 | license = {text = "MIT"} 11 | readme = "README.md" 12 | requires-python = ">=3.10" 13 | classifiers = [ 14 | "Development Status :: 3 - Alpha", 15 | "Intended Audience :: Developers", 16 | "License :: OSI Approved :: MIT License", 17 | "Programming Language :: Python :: 3", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Python :: 3.12", 21 | ] 22 | dependencies = [ 23 | "fastmcp>=2.8.1", 24 | "httpx>=0.28.1", 25 | "pydantic>=2.7.2", 26 | "rich>=13.9.4", 27 | "pyalex==0.18", 28 | "aiohttp>=3.8.0" 29 | ] 30 | 31 | [project.urls] 32 | Homepage = "https://github.com/drAbreu/alex-mcp" 33 | Repository = "https://github.com/drAbreu/alex-mcp" 34 | Issues = "https://github.com/drAbreu/alex-mcp/issues" 35 | 36 | [project.scripts] 37 | alex-mcp = "alex_mcp.server:main" 38 | 39 | [tool.setuptools.packages.find] 40 | where = ["src"] 41 | include = ["alex_mcp*"] 42 | 43 | -------------------------------------------------------------------------------- /examples/test_institution_resolution.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for resolve_institution using the MCP server and pyalex. 3 | Focus: EMBO, MPIA, IRAM. 
4 | """ 5 | 6 | import pytest 7 | import pyalex 8 | 9 | pyalex.config.email = "test@example.com" 10 | pyalex.config.max_retries = 2 11 | pyalex.config.retry_backoff_factor = 0.1 12 | pyalex.config.retry_http_codes = [429, 500, 503] 13 | 14 | import sys 15 | import os 16 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 17 | from src.alex_mcp.server import _resolve_institution_impl as resolve_institution 18 | 19 | def test_resolve_institution_embo(): 20 | result = resolve_institution("EMBO") 21 | assert result["best_match"] is not None 22 | assert "i1303691731" in result["best_match"]["id"].lower() 23 | 24 | def test_resolve_institution_mpia(): 25 | result = resolve_institution("MPIA") 26 | assert result["best_match"] is not None 27 | assert "i4210109156" in result["best_match"]["id"].lower() 28 | 29 | def test_resolve_institution_iram(): 30 | result = resolve_institution("IRAM") 31 | assert result["best_match"] is not None 32 | assert "i4210096876" in result["best_match"]["id"].lower() 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="alex-mcp", 8 | version="4.2.5", 9 | author="OpenAlex MCP Team", 10 | description="OpenAlex Author Disambiguation MCP Server", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | url="https://github.com/drAbreu/alex-mcp", 14 | package_dir={"": "src"}, 15 | packages=find_packages(where="src"), 16 | classifiers=[ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Science/Research", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | "Programming Language :: Python :: 3", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | ], 26 | python_requires=">=3.10", # pyalex itself only needs 3.8+; this package targets 3.10+ 27 | install_requires=[ 28 | "fastmcp>=2.8.1", 29 | "httpx>=0.28.1", 30 | "pydantic>=2.7.2", 31 | "rich>=13.9.4", 32 | "pyalex==0.18", 33 | ], 34 | entry_points={ 35 | "console_scripts": [ 36 | "alex-mcp=alex_mcp.server:main", 37 | ], 38 | }, 39 | include_package_data=True, 40 | zip_safe=False, 41 | ) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb 2 | 3 | # Python 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | *.so 8 | .Python 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # PyInstaller 27 | *.manifest 28 | *.spec 29 | 30 | # Installer logs 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | 34 | # Unit test / coverage reports 35 | htmlcov/ 36 | .tox/ 37 | .nox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *.cover 44 | .hypothesis/ 45 | .pytest_cache/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 |
local_settings.py 54 | db.sqlite3 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # IPython 73 | profile_default/ 74 | ipython_config.py 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | .dmypy.json 107 | dmypy.json 108 | 109 | # Pyre type checker 110 | .pyre/ 111 | 112 | # IDEs 113 | .vscode/ 114 | .idea/ 115 | *.swp 116 | *.swo 117 | *~ 118 | 119 | # OS 120 | .DS_Store 121 | .DS_Store? 122 | ._* 123 | .Spotlight-V100 124 | .Trashes 125 | ehthumbs.db 126 | Thumbs.db 127 | 128 | # Project specific 129 | *.json 130 | !package.json 131 | !tsconfig.json 132 | test_results/ 133 | logs/ 134 | temp/ 135 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # OpenAlex MCP Server Installation Guide 2 | 3 | This guide provides instructions for installing and running the OpenAlex MCP server. 4 | 5 | ## Prerequisites 6 | 7 | - Python 3.10 or higher 8 | - pip (Python package installer) 9 | 10 | ## Installation 11 | 12 | 1. Clone the repository: 13 | ```bash 14 | git clone https://github.com/drAbreu/alex-mcp.git 15 | cd alex-mcp 16 | ``` 17 | 18 | 2. Create a virtual environment: 19 | ```bash 20 | python3 -m venv venv 21 | ``` 22 | 23 | 3. Activate the virtual environment: 24 | ```bash 25 | source venv/bin/activate # On Windows: venv\Scripts\activate 26 | ``` 27 | 28 | 4. Install the package in development mode: 29 | ```bash 30 | pip install -e . 31 | ``` 32 | 33 | ## Running the Server 34 | 35 | ### Option 1: Using the wrapper script 36 | 37 | The easiest way to run the server is to use the provided wrapper script (set the `OPENALEX_MAILTO` environment variable to your email address first; the server exits at startup without it): 38 | 39 | ```bash 40 | ./alex-mcp-wrapper.sh 41 | ``` 42 | 43 | This script activates the virtual environment and runs the server. 44 | 45 | ### Option 2: Manual execution 46 | 47 | 1. Activate the virtual environment: 48 | ```bash 49 | source venv/bin/activate # On Windows: venv\Scripts\activate 50 | ``` 51 | 52 | 2. Run the server: 53 | ```bash 54 | python -m alex_mcp.server 55 | ``` 56 | 57 | ## Using with Claude Desktop 58 | 59 | To use this MCP server with Claude Desktop, add the following configuration: 60 | 61 | ```json 62 | { 63 | "mcpServers": { 64 | "alex-mcp": { 65 | "command": "/path/to/alex-mcp/alex-mcp-wrapper.sh" 66 | } 67 | } 68 | } 69 | ``` 70 | 71 | Replace `/path/to/alex-mcp` with the actual path to the repository on your system. 72 | 73 | ## Available Tools 74 | 75 | The OpenAlex MCP server provides the following tools: 76 | 77 | 1. **disambiguate_author**: Disambiguate an author using OpenAlex's ML-powered disambiguation system. 78 | 2. **search_authors**: Search for authors with advanced filtering capabilities. 79 | 3. **get_author_profile**: Get detailed author profile by OpenAlex ID. 80 | 4. **resolve_institution**: Resolve institution name or abbreviation to full OpenAlex data.
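For quick experimentation outside an MCP client, the underlying tool implementations can be imported directly, exactly as the test files under `examples/` do. A minimal sketch (run from the repository root with `OPENALEX_MAILTO` set; the `_*_impl` names mirror the imports in `examples/test_institution_resolution.py` and `examples/test_author_disambiguation.py`):

```python
# Minimal sketch: call two of the tool implementations directly.
# Requires OPENALEX_MAILTO to be exported; the server module exits at
# import time if it is missing.
from src.alex_mcp.server import (
    _resolve_institution_impl as resolve_institution,
    _disambiguate_author_impl as disambiguate_author,
)

# Resolve an institution abbreviation to its OpenAlex record
institution = resolve_institution("EMBO")
if institution["best_match"] is not None:
    print(institution["best_match"]["id"])  # OpenAlex institution ID

# Disambiguate an author, optionally narrowed by affiliation
author = disambiguate_author(name="Fiona M Watt", affiliation="EMBO")
if author["most_likely"] is not None:
    print(author["most_likely"]["author"]["id"])  # OpenAlex author ID
```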
81 | 82 | ## Troubleshooting 83 | 84 | If you encounter any issues, make sure: 85 | 86 | 1. You're using Python 3.10 or higher 87 | 2. The virtual environment is activated 88 | 3. All dependencies are installed correctly 89 | 90 | For more information, see the [README.md](README.md) file. 91 | -------------------------------------------------------------------------------- /examples/test_author_disambiguation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for disambiguate_author using the MCP server and pyalex. 3 | Focus: Fiona M. Watt and Jorge Abreu Vicente. 4 | """ 5 | 6 | import pytest 7 | import pyalex 8 | 9 | pyalex.config.email = "test@example.com" 10 | pyalex.config.max_retries = 2 11 | pyalex.config.retry_backoff_factor = 0.1 12 | pyalex.config.retry_http_codes = [429, 500, 503] 13 | 14 | import sys 15 | import os 16 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 17 | from src.alex_mcp.server import _disambiguate_author_impl as disambiguate_author 18 | 19 | def test_disambiguate_fiona_watt_name_only(): 20 | result = disambiguate_author(name="Fiona M Watt") 21 | print(f"Disambiguation result for Fiona M Watt: {result}") 22 | assert result["most_likely"] is not None 23 | assert "A5068471552" in result["most_likely"]["author"]["id"] 24 | 25 | def test_disambiguate_fiona_watt_with_institution(): 26 | result = disambiguate_author(name="Fiona M Watt", affiliation="EMBO") 27 | print(f"Disambiguation result for Fiona M Watt: {result}") 28 | assert result["most_likely"] is not None 29 | assert "A5068471552" in result["most_likely"]["author"]["id"] 30 | 31 | def test_disambiguate_fiona_watt_with_topic(): 32 | result = disambiguate_author(name="Fiona M Watt", research_field="Stem Cells") 33 | print(f"Disambiguation result for Fiona M Watt: {result}") 34 | assert result["most_likely"] is not None 35 | assert "A5068471552" in result["most_likely"]["author"]["id"] 36 | 37 | def test_disambiguate_jorge_abreu_name_only(): 38 | result = disambiguate_author(name="Jorge Abreu Vicente") 39 | print(f"Disambiguation result for J. Abreu-Vicente: {result}") 40 | assert result["most_likely"] is not None 41 | assert "A5058921480" in result["most_likely"]["author"]["id"] 42 | 43 | def test_disambiguate_jorge_abreu_with_institution(): 44 | result = disambiguate_author(name="Jorge Abreu Vicente", affiliation="MPIA") 45 | print(f"Disambiguation result for J. Abreu-Vicente: {result}") 46 | assert result["most_likely"] is not None 47 | assert "A5058921480" in result["most_likely"]["author"]["id"] 48 | 49 | def test_disambiguate_jorge_abreu_with_topic(): 50 | result = disambiguate_author(name="Jorge Abreu Vicente", research_field="molecular clouds") 51 | print(f"Disambiguation result for J. Abreu-Vicente: {result}") 52 | assert result["most_likely"] is not None 53 | assert "A5058921480" in result["most_likely"]["author"]["id"] 54 | -------------------------------------------------------------------------------- /src/alex_mcp/data_objects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Optimized data models for the OpenAlex MCP server. 4 | 5 | Streamlined versions focusing on essential information for author disambiguation 6 | and work retrieval while minimizing token usage. Enhanced to preserve comprehensive 7 | ID information (DOI, PMID, PMCID, OpenAlex, MAG). 
8 | """ 9 | 10 | from typing import List, Optional, Dict, Any 11 | from datetime import datetime 12 | from pydantic import BaseModel, Field 13 | 14 | 15 | class WorkIDs(BaseModel): 16 | """ 17 | Comprehensive work identifiers from OpenAlex. 18 | 19 | Preserves all available identifiers for cross-database linkage. 20 | """ 21 | openalex: Optional[str] = None 22 | doi: Optional[str] = None 23 | pmid: Optional[str] = None 24 | pmcid: Optional[str] = None 25 | mag: Optional[str] = None 26 | 27 | 28 | class OptimizedAuthorResult(BaseModel): 29 | """ 30 | Streamlined author representation focusing on disambiguation essentials. 31 | 32 | Reduces token usage by ~70% compared to full OpenAlex author object. 33 | """ 34 | id: str 35 | display_name: str 36 | orcid: Optional[str] = None 37 | display_name_alternatives: Optional[List[str]] = None 38 | 39 | # Simplified affiliations - just institution names as strings 40 | current_affiliations: Optional[List[str]] = None 41 | past_affiliations: Optional[List[str]] = None 42 | 43 | # Key metrics for research impact 44 | cited_by_count: int = 0 45 | works_count: int = 0 46 | h_index: Optional[int] = None 47 | i10_index: Optional[int] = None 48 | 49 | # Research fields (simplified) 50 | research_fields: Optional[List[str]] = None 51 | 52 | # Basic metadata 53 | last_known_institutions: Optional[List[str]] = None 54 | countries: Optional[List[str]] = None 55 | 56 | # For API access 57 | works_api_url: Optional[str] = None 58 | 59 | 60 | class OptimizedWorkResult(BaseModel): 61 | """ 62 | Streamlined work representation focusing on essential publication info. 63 | 64 | Reduces token usage by ~80% compared to full OpenAlex work object while 65 | preserving comprehensive identifier information. 66 | """ 67 | id: str 68 | title: Optional[str] = None 69 | doi: Optional[str] = None # Kept for backward compatibility 70 | publication_year: Optional[int] = None 71 | type: Optional[str] = None # journal-article, book-chapter, etc. 72 | 73 | # COMPREHENSIVE ID INFORMATION - This was missing! 74 | ids: Optional[WorkIDs] = None 75 | 76 | # Citation metrics 77 | cited_by_count: Optional[int] = 0 78 | 79 | # Publication venue (simplified) 80 | journal_name: Optional[str] = None 81 | journal_issn: Optional[str] = None 82 | publisher: Optional[str] = None 83 | 84 | # Open access info (simplified) 85 | is_open_access: Optional[bool] = None 86 | 87 | # Author info (minimal) 88 | author_count: Optional[int] = None 89 | first_author: Optional[str] = None 90 | corresponding_author: Optional[str] = None 91 | 92 | # Research categorization (simplified) 93 | primary_field: Optional[str] = None 94 | concepts: Optional[List[str]] = None 95 | 96 | 97 | class OptimizedSearchResponse(BaseModel): 98 | """ 99 | Streamlined search response. 100 | """ 101 | query: str 102 | total_count: int 103 | results: List[OptimizedAuthorResult] 104 | search_time: Optional[datetime] = Field(default_factory=datetime.now) 105 | 106 | 107 | class OptimizedWorksSearchResponse(BaseModel): 108 | """ 109 | Streamlined works search response for author works. 110 | """ 111 | author_id: str 112 | author_name: Optional[str] = None 113 | total_count: int 114 | results: List[OptimizedWorkResult] 115 | search_time: Optional[datetime] = Field(default_factory=datetime.now) 116 | filters: Optional[Dict[str, Any]] = None 117 | 118 | 119 | class OptimizedGeneralWorksSearchResponse(BaseModel): 120 | """ 121 | Streamlined works search response for general work searches. 
122 | """ 123 | query: str 124 | total_count: int 125 | results: List[OptimizedWorkResult] 126 | search_time: Optional[datetime] = Field(default_factory=datetime.now) 127 | filters: Optional[Dict[str, Any]] = None 128 | 129 | 130 | class AutocompleteAuthorCandidate(BaseModel): 131 | """ 132 | A single author candidate from autocomplete API. 133 | 134 | Optimized for fast disambiguation with essential context. 135 | """ 136 | openalex_id: str 137 | display_name: str 138 | institution_hint: Optional[str] = None # Current/last known institution 139 | works_count: int = 0 140 | cited_by_count: int = 0 141 | entity_type: str = "author" 142 | external_id: Optional[str] = None # ORCID or other external ID 143 | 144 | 145 | class AutocompleteAuthorsResponse(BaseModel): 146 | """ 147 | Response model for author autocomplete with multiple candidates. 148 | 149 | Enables intelligent disambiguation by providing multiple options 150 | with institutional context and research metrics. 151 | """ 152 | query: str 153 | context: Optional[str] = None 154 | total_candidates: int 155 | candidates: List[AutocompleteAuthorCandidate] 156 | search_metadata: Dict[str, Any] = Field(default_factory=dict) 157 | 158 | 159 | def extract_institution_names(affiliations: List[Dict[str, Any]]) -> tuple[List[str], List[str]]: 160 | """ 161 | Extract and categorize institution names from OpenAlex affiliation objects. 162 | 163 | Returns: 164 | tuple: (current_affiliations, past_affiliations) 165 | """ 166 | current = [] 167 | past = [] 168 | 169 | if not affiliations: 170 | return current, past 171 | 172 | for affiliation in affiliations: 173 | institution = affiliation.get('institution', {}) 174 | if not institution: 175 | continue 176 | 177 | institution_name = institution.get('display_name') 178 | if not institution_name: 179 | continue 180 | 181 | # Determine if current or past based on years 182 | years = affiliation.get('years', []) 183 | if years: 184 | current_year = datetime.now().year 185 | # Consider current if active in last 3 years 186 | if max(years) >= current_year - 3: 187 | current.append(institution_name) 188 | else: 189 | past.append(institution_name) 190 | else: 191 | # Default to current if no year info 192 | current.append(institution_name) 193 | 194 | return current, past 195 | 196 | 197 | def extract_research_fields(concepts_or_topics: List[Dict[str, Any]]) -> List[str]: 198 | """ 199 | Extract research field names from concepts or topics. 200 | 201 | Args: 202 | concepts_or_topics: List of concept/topic objects from OpenAlex 203 | 204 | Returns: 205 | List of field names, limited to top 5 most relevant 206 | """ 207 | fields = [] 208 | 209 | if not concepts_or_topics: 210 | return fields 211 | 212 | # Sort by score/level and take top fields 213 | sorted_items = sorted( 214 | concepts_or_topics, 215 | key=lambda x: x.get('score', 0) or x.get('count', 0), 216 | reverse=True 217 | ) 218 | 219 | for item in sorted_items[:5]: # Limit to top 5 220 | name = item.get('display_name') 221 | if name: 222 | fields.append(name) 223 | 224 | return fields 225 | 226 | 227 | def extract_journal_info(locations: List[Dict[str, Any]]) -> tuple[Optional[str], Optional[str], Optional[str]]: 228 | """ 229 | Extract journal information from OpenAlex locations. 
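Prefers the first location whose source has type 'journal'; if none matches, falls back to the first location's source name and publisher (the ISSN is then left as None).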
230 | 231 | Returns: 232 | tuple: (journal_name, journal_issn, publisher) 233 | """ 234 | if not locations: 235 | return None, None, None 236 | 237 | # Look for primary location (usually first) or journal location 238 | for location in locations: 239 | source = location.get('source', {}) 240 | if source and source.get('type') == 'journal': 241 | journal_name = source.get('display_name') 242 | issn = None 243 | if source.get('issn'): 244 | issn = source['issn'][0] if isinstance(source['issn'], list) else source['issn'] 245 | 246 | publisher = source.get('host_organization_name') 247 | return journal_name, issn, publisher 248 | 249 | # Fallback to first location 250 | if locations: 251 | source = locations[0].get('source', {}) 252 | if source: 253 | return source.get('display_name'), None, source.get('host_organization_name') 254 | 255 | return None, None, None 256 | 257 | 258 | def extract_authorship_info(authorships: List[Dict[str, Any]]) -> tuple[Optional[int], Optional[str], Optional[str]]: 259 | """ 260 | Extract simplified authorship information. 261 | 262 | Returns: 263 | tuple: (author_count, first_author, corresponding_author) 264 | """ 265 | if not authorships: 266 | return None, None, None 267 | 268 | author_count = len(authorships) 269 | first_author = None 270 | corresponding_author = None 271 | 272 | # Find first author (author_position == 'first') 273 | for authorship in authorships: 274 | if authorship.get('author_position') == 'first': 275 | author = authorship.get('author', {}) 276 | first_author = author.get('display_name') 277 | break 278 | 279 | # Find corresponding author 280 | for authorship in authorships: 281 | if authorship.get('is_corresponding'): 282 | author = authorship.get('author', {}) 283 | corresponding_author = author.get('display_name') 284 | break 285 | 286 | return author_count, first_author, corresponding_author 287 | 288 | 289 | def extract_comprehensive_ids(work_data: Dict[str, Any]) -> WorkIDs: 290 | """ 291 | Extract comprehensive identifier information from OpenAlex work data. 292 | 293 | This was the missing piece! OpenAlex provides comprehensive IDs in the 'ids' object. 294 | 295 | Args: 296 | work_data: Full OpenAlex work object 297 | 298 | Returns: 299 | WorkIDs object with all available identifiers 300 | """ 301 | ids_data = work_data.get('ids', {}) 302 | 303 | # Extract all available IDs 304 | openalex_id = ids_data.get('openalex') or work_data.get('id') 305 | doi = ids_data.get('doi') or work_data.get('doi') # Fallback to standalone doi 306 | pmid = ids_data.get('pmid') 307 | pmcid = ids_data.get('pmcid') 308 | mag = ids_data.get('mag') 309 | 310 | return WorkIDs( 311 | openalex=openalex_id, 312 | doi=doi, 313 | pmid=pmid, 314 | pmcid=pmcid, 315 | mag=mag 316 | ) 317 | 318 | 319 | def optimize_author_data(author_data: Dict[str, Any]) -> OptimizedAuthorResult: 320 | """ 321 | Convert full OpenAlex author object to optimized version. 
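List fields are truncated to keep responses compact: at most three name alternatives, three current and three past affiliations, five research fields, and three countries.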
322 | 323 | Args: 324 | author_data: Full OpenAlex author object 325 | 326 | Returns: 327 | OptimizedAuthorResult with essential information only 328 | """ 329 | # Extract basic info 330 | author_id = author_data.get('id', '') 331 | display_name = author_data.get('display_name', '') 332 | orcid = author_data.get('orcid') 333 | alternatives = author_data.get('display_name_alternatives', []) 334 | 335 | # Process affiliations 336 | affiliations = author_data.get('affiliations', []) 337 | current_affiliations, past_affiliations = extract_institution_names(affiliations) 338 | 339 | # Extract metrics 340 | cited_by_count = author_data.get('cited_by_count', 0) 341 | works_count = author_data.get('works_count', 0) 342 | 343 | # Extract summary stats 344 | summary_stats = author_data.get('summary_stats', {}) 345 | h_index = summary_stats.get('h_index') 346 | i10_index = summary_stats.get('i10_index') 347 | 348 | # Extract research fields from concepts or topics 349 | research_fields = [] 350 | concepts = author_data.get('x_concepts', []) or author_data.get('topics', []) 351 | research_fields = extract_research_fields(concepts) 352 | 353 | # Extract geographic info 354 | countries = [] 355 | if affiliations: 356 | for affiliation in affiliations: 357 | institution = affiliation.get('institution', {}) 358 | country = institution.get('country_code') 359 | if country and country not in countries: 360 | countries.append(country) 361 | 362 | # API URL 363 | works_api_url = author_data.get('works_api_url') 364 | 365 | return OptimizedAuthorResult( 366 | id=author_id, 367 | display_name=display_name, 368 | orcid=orcid, 369 | display_name_alternatives=alternatives[:3] if alternatives else None, # Limit alternatives 370 | current_affiliations=current_affiliations[:3] if current_affiliations else None, # Limit to 3 most recent 371 | past_affiliations=past_affiliations[:3] if past_affiliations else None, # Limit to 3 most recent 372 | cited_by_count=cited_by_count, 373 | works_count=works_count, 374 | h_index=h_index, 375 | i10_index=i10_index, 376 | research_fields=research_fields[:5] if research_fields else None, # Top 5 fields 377 | last_known_institutions=current_affiliations[:2] if current_affiliations else past_affiliations[:2], 378 | countries=countries[:3] if countries else None, # Limit countries 379 | works_api_url=works_api_url 380 | ) 381 | 382 | 383 | def optimize_work_data(work_data: Dict[str, Any]) -> OptimizedWorkResult: 384 | """ 385 | Convert full OpenAlex work object to optimized version. 386 | 387 | NOW INCLUDES COMPREHENSIVE ID EXTRACTION! 388 | 389 | Args: 390 | work_data: Full OpenAlex work object 391 | 392 | Returns: 393 | OptimizedWorkResult with essential information AND comprehensive IDs 394 | """ 395 | # Basic work info 396 | work_id = work_data.get('id', '') 397 | title = work_data.get('title') 398 | doi = work_data.get('doi') # Kept for backward compatibility 399 | publication_year = work_data.get('publication_year') 400 | work_type = work_data.get('type') 401 | 402 | # EXTRACT COMPREHENSIVE IDS - This is the fix! 
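# OpenAlex exposes every known identifier on the work's 'ids' object
# (openalex, doi, pmid, pmcid, mag); extract_comprehensive_ids() reads that
# object and falls back to the top-level 'id'/'doi' fields for missing entries.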
403 | comprehensive_ids = extract_comprehensive_ids(work_data) 404 | 405 | # Citation metrics 406 | cited_by_count = work_data.get('cited_by_count', 0) 407 | 408 | # Journal information 409 | locations = work_data.get('locations', []) 410 | journal_name, journal_issn, publisher = extract_journal_info(locations) 411 | 412 | # Open access info 413 | open_access = work_data.get('open_access', {}) 414 | is_open_access = open_access.get('is_oa') if open_access else None 415 | 416 | # Authorship info 417 | authorships = work_data.get('authorships', []) 418 | author_count, first_author, corresponding_author = extract_authorship_info(authorships) 419 | 420 | # Research categorization 421 | primary_topic = work_data.get('primary_topic', {}) 422 | primary_field = primary_topic.get('display_name') if primary_topic else None 423 | 424 | # Simplified concepts (top 3) 425 | concepts = work_data.get('concepts', []) 426 | concept_names = [] 427 | if concepts: 428 | sorted_concepts = sorted(concepts, key=lambda x: x.get('score', 0), reverse=True) 429 | concept_names = [c.get('display_name') for c in sorted_concepts[:3] if c.get('display_name')] 430 | 431 | return OptimizedWorkResult( 432 | id=work_id, 433 | title=title, 434 | doi=doi, 435 | publication_year=publication_year, 436 | type=work_type, 437 | ids=comprehensive_ids, 438 | cited_by_count=cited_by_count, 439 | journal_name=journal_name, 440 | journal_issn=journal_issn, 441 | publisher=publisher, 442 | is_open_access=is_open_access, 443 | author_count=author_count, 444 | first_author=first_author, 445 | corresponding_author=corresponding_author, 446 | primary_field=primary_field, 447 | concepts=concept_names if concept_names else None 448 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | OpenAlex MCP Server 3 | 4 | # OpenAlex Author Disambiguation MCP Server 5 | 6 | [![MCP](https://img.shields.io/badge/Model%20Context%20Protocol-Compatible-blue)](https://modelcontextprotocol.io/) 7 | [![Python](https://img.shields.io/badge/Python-3.10+-green)](https://python.org) 8 | [![OpenAlex](https://img.shields.io/badge/OpenAlex-API-orange)](https://openalex.org) 9 | [![License](https://img.shields.io/badge/License-MIT-yellow)](LICENSE) 10 | [![Optimized](https://img.shields.io/badge/AI%20Agent-Optimized-brightgreen)](https://github.com/drAbreu/alex-mcp) 11 |
12 | 13 | A **streamlined** Model Context Protocol (MCP) server for author disambiguation and academic research using the OpenAlex.org API. Specifically designed for AI agents with optimized data structures and enhanced functionality. 14 | 15 | --- 16 | 17 | ## 🎯 Key Features 18 | 19 | ### 🔍 **Core Capabilities** 20 | - **Advanced Author Disambiguation**: Handles complex career transitions and name variations 21 | - **Institution Resolution**: Current and past affiliations with transition tracking 22 | - **Academic Work Retrieval**: Journal articles, letters, and research papers 23 | - **Citation Analysis**: H-index, citation counts, and impact metrics 24 | - **ORCID Integration**: Highest accuracy matching with ORCID identifiers 25 | 26 | ### 🚀 **AI Agent Optimized** 27 | - **Streamlined Data**: Focused on essential information for disambiguation 28 | - **Fast Processing**: Optimized data structures for rapid analysis 29 | - **Smart Filtering**: Enhanced filtering options for targeted queries 30 | - **Clean Output**: Structured responses optimized for AI reasoning 31 | 32 | ### 🤖 **Agent Integration** 33 | - **Multiple Candidates**: Ranked results for automated decision-making 34 | - **Structured Responses**: Clean, parseable output optimized for LLMs 35 | - **Error Handling**: Graceful degradation with informative messages 36 | - **Enhanced Filtering**: Journal-only, citation thresholds, and temporal filters 37 | 38 | ### 🏛️ **Professional Grade** 39 | - **MCP Best Practices**: Built with FastMCP following official guidelines 40 | - **Tool Annotations**: Proper MCP tool annotations for optimal client integration 41 | - **Resource Management**: Efficient HTTP client management and cleanup 42 | - **Rate Limiting**: Respectful API usage with proper delays 43 | 44 | --- 45 | 46 | ## 🚀 Quick Start 47 | 48 | ### Prerequisites 49 | 50 | - Python 3.10 or higher 51 | - MCP-compatible client (e.g., Claude Desktop) 52 | - Email address (required: it is sent to OpenAlex as a courtesy identifier, and the server exits at startup if `OPENALEX_MAILTO` is unset) 53 | 54 | ### Installation 55 | 56 | For detailed installation instructions, see [INSTALL.md](INSTALL.md). 57 | 58 | 1. **Clone the repository:** 59 | ```bash 60 | git clone https://github.com/drAbreu/alex-mcp.git 61 | cd alex-mcp 62 | ``` 63 | 64 | 2. **Create a virtual environment:** 65 | ```bash 66 | python3 -m venv venv 67 | source venv/bin/activate # On Windows: venv\Scripts\activate 68 | ``` 69 | 70 | 3. **Install the package:** 71 | ```bash 72 | pip install -e . 73 | ``` 74 | 75 | 4. **Configure environment:** 76 | ```bash 77 | export OPENALEX_MAILTO=your-email@domain.com 78 | ``` 79 | 80 | 5. **Run the server:** 81 | ```bash 82 | ./alex-mcp-wrapper.sh 83 | # Or, if installed as a CLI tool: 84 | alex-mcp 85 | ``` 86 | 87 | --- 88 | 89 | ## ⚙️ MCP Configuration 90 | 91 | ### Claude Desktop Configuration 92 | 93 | Add to your Claude Desktop configuration file: 94 | 95 | ```json 96 | { 97 | "mcpServers": { 98 | "alex-mcp": { 99 | "command": "/path/to/alex-mcp/alex-mcp-wrapper.sh", 100 | "env": { 101 | "OPENALEX_MAILTO": "your-email@domain.com" 102 | } 103 | } 104 | } 105 | } 106 | ``` 107 | 108 | Replace `/path/to/alex-mcp` with the actual path to the repository on your system.
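To confirm the server starts before wiring it into a client, it can be launched directly from a shell. A quick check (assuming the editable install from the Quick Start):

```bash
# The server exits at startup with an error if OPENALEX_MAILTO is unset
# (see get_config() in src/alex_mcp/server.py).
export OPENALEX_MAILTO=your-email@domain.com
./alex-mcp-wrapper.sh   # wrapper that activates ./venv and starts the server
# or, using the console script installed by pip:
alex-mcp
```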
109 | 110 | --- 111 | 112 | ## 🤖 Using with AI Agents 113 | 114 | ### OpenAI Agents Integration 115 | 116 | You can load this MCP server in your OpenAI agent workflow using the [`agents.mcp.MCPServerStdio`](https://github.com/openai/openai-agents) interface: 117 | 118 | ```python 119 | from agents.mcp import MCPServerStdio 120 | 121 | async with MCPServerStdio( 122 | name="OpenAlex MCP For Author disambiguation and works", 123 | cache_tools_list=True, 124 | params={ 125 | "command": "uvx", 126 | "args": [ 127 | "--from", "git+https://github.com/drAbreu/alex-mcp.git@4.1.0", 128 | "alex-mcp" 129 | ], 130 | "env": { 131 | "OPENALEX_MAILTO": "your-email@domain.com" 132 | } 133 | }, 134 | client_session_timeout_seconds=10 135 | ) as alex_mcp: 136 | await alex_mcp.connect() 137 | tools = await alex_mcp.list_tools() 138 | print(f"Available tools: {[tool.name for tool in tools]}") 139 | ``` 140 | 141 | ### Academic Research Agent Integration 142 | 143 | This MCP server is specifically optimized for academic research workflows: 144 | 145 | ```python 146 | # Optimized for academic research workflows 147 | from alex_agent import run_author_research 148 | 149 | # Enhanced functionality with streamlined data 150 | result = await run_author_research( 151 | "Find J. Abreu at EMBO with recent publications" 152 | ) 153 | 154 | # Clean, structured output for AI processing 155 | print(f"Success: {result['workflow_metadata']['success']}") 156 | print(f"Quality: {result['research_result']['metadata']['result_analysis']['quality_score']}/100") 157 | ``` 158 | 159 | ### Direct Launch with uvx 160 | 161 | ```bash 162 | # Standard launch 163 | uvx --from git+https://github.com/drAbreu/alex-mcp.git@4.1.0 alex-mcp 164 | 165 | # With environment variables 166 | OPENALEX_MAILTO=your-email@domain.com uvx --from git+https://github.com/drAbreu/alex-mcp.git@4.1.0 alex-mcp 167 | ``` 168 | 169 | --- 170 | 171 | ## 🛠️ Available Tools 172 | 173 | ### 1. **autocomplete_authors** ⭐ NEW 174 | Get multiple author candidates using OpenAlex autocomplete API for intelligent disambiguation. 175 | 176 | **Parameters:** 177 | - `name` (required): Author name to search (e.g., "James Briscoe", "M. Ralser") 178 | - `context` (optional): Context for disambiguation (e.g., "Francis Crick Institute developmental biology") 179 | - `limit` (optional): Maximum candidates (1-10, default: 5) 180 | 181 | **Key Features:** 182 | - ⚡ **Fast**: ~200ms response time 183 | - 🎯 **Smart**: Multiple candidates with institutional hints 184 | - 🧠 **AI-Ready**: Perfect for context-based selection 185 | - 📊 **Rich**: Works count, citations, institution info 186 | 187 | **Streamlined Output:** 188 | ```json 189 | { 190 | "query": "James Briscoe", 191 | "context": "Francis Crick Institute", 192 | "total_candidates": 3, 193 | "candidates": [ 194 | { 195 | "openalex_id": "https://openalex.org/A5019391436", 196 | "display_name": "James Briscoe", 197 | "institution_hint": "The Francis Crick Institute, UK", 198 | "works_count": 415, 199 | "cited_by_count": 24623, 200 | "external_id": "https://orcid.org/0000-0002-1020-5240" 201 | } 202 | ] 203 | } 204 | ``` 205 | 206 | **Usage Pattern:** 207 | ```python 208 | # Get multiple candidates for disambiguation 209 | candidates = await autocomplete_authors( 210 | "James Briscoe", 211 | context="Francis Crick Institute developmental biology" 212 | ) 213 | 214 | # AI selects best match based on institutional context 215 | # Much more accurate than single search result! 216 | ``` 217 | 218 | ### 2. 
**search_authors** 219 | Search for authors with streamlined output for AI agents. 220 | 221 | **Parameters:** 222 | - `name` (required): Author name to search 223 | - `institution` (optional): Institution name filter 224 | - `topic` (optional): Research topic filter 225 | - `country_code` (optional): Country code filter (e.g., "US", "DE") 226 | - `limit` (optional): Maximum results (1-25, default: 20) 227 | 228 | **Streamlined Output:** 229 | ```json 230 | { 231 | "query": "J. Abreu", 232 | "total_count": 3, 233 | "results": [ 234 | { 235 | "id": "https://openalex.org/A123456789", 236 | "display_name": "Jorge Abreu-Vicente", 237 | "orcid": "https://orcid.org/0000-0000-0000-0000", 238 | "display_name_alternatives": ["J. Abreu-Vicente", "Jorge Abreu Vicente"], 239 | "affiliations": [ 240 | { 241 | "institution": { 242 | "display_name": "European Molecular Biology Organization", 243 | "country_code": "DE" 244 | }, 245 | "years": [2023, 2024, 2025] 246 | } 247 | ], 248 | "cited_by_count": 316, 249 | "works_count": 25, 250 | "summary_stats": { 251 | "h_index": 9, 252 | "i10_index": 5 253 | }, 254 | "x_concepts": [ 255 | { 256 | "display_name": "Astrophysics", 257 | "score": 0.8 258 | }, 259 | { 260 | "display_name": "Machine Learning", 261 | "score": 0.6 262 | } 263 | ] 264 | } 265 | ] 266 | } 267 | ``` 268 | 269 | **Features**: Clean structure optimized for AI reasoning and disambiguation 270 | 271 | --- 272 | 273 | ### 3. **retrieve_author_works** 274 | Retrieve works for a given author with enhanced filtering capabilities. 275 | 276 | **Parameters:** 277 | - `author_id` (required): OpenAlex author ID 278 | - `limit` (optional): Maximum results (1-50, default: 20) 279 | - `order_by` (optional): "date" or "citations" (default: "date") 280 | - `publication_year` (optional): Filter by specific year 281 | - `type` (optional): Work type filter (e.g., "journal-article") 282 | - `authorships_institutions_id` (optional): Filter by institution 283 | - `is_retracted` (optional): Filter retracted works 284 | - `open_access_is_oa` (optional): Filter by open access status 285 | 286 | **Enhanced Output:** 287 | ```json 288 | { 289 | "author_id": "https://openalex.org/A123456789", 290 | "total_count": 25, 291 | "results": [ 292 | { 293 | "id": "https://openalex.org/W123456789", 294 | "title": "A platform for the biomedical application of large language models", 295 | "doi": "10.1038/s41587-024-02534-3", 296 | "publication_year": 2025, 297 | "type": "journal-article", 298 | "cited_by_count": 42, 299 | "authorships": [ 300 | { 301 | "author": { 302 | "display_name": "Jorge Abreu-Vicente" 303 | }, 304 | "institutions": [ 305 | { 306 | "display_name": "European Molecular Biology Organization" 307 | } 308 | ] 309 | } 310 | ], 311 | "locations": [ 312 | { 313 | "source": { 314 | "display_name": "Nature Biotechnology", 315 | "type": "journal" 316 | } 317 | } 318 | ], 319 | "open_access": { 320 | "is_oa": true 321 | }, 322 | "primary_topic": { 323 | "display_name": "Biomedical Engineering" 324 | } 325 | } 326 | ] 327 | } 328 | ``` 329 | 330 | **Features**: Comprehensive work data with flexible filtering for targeted queries 331 | 332 | --- 333 | 334 | ## 📊 Data Optimization 335 | 336 | ### Focused Information Architecture 337 | This MCP server provides structured data specifically designed for AI agent consumption. 338 | 339 | ### Author Data Features 340 | - **Identity Resolution**: Names, ORCID, alternatives for disambiguation 341 | - **Affiliation Tracking**: Current and historical institutional
connections 342 | - **Impact Metrics**: Citation counts, h-index, and scholarly impact 343 | - **Research Context**: Fields, concepts, and domain expertise 344 | - **Career Analysis**: Temporal affiliation changes and transitions 345 | 346 | ### Work Data Features 347 | - **Publication Metadata**: Title, DOI, venue, and publication details 348 | - **Impact Assessment**: Citation counts and scholarly influence 349 | - **Access Information**: Open access status and availability 350 | - **Authorship Details**: Complete author lists and institutional affiliations 351 | - **Research Classification**: Topics, concepts, and domain categorization 352 | 353 | ### Enhanced Filtering 354 | 355 | ```python 356 | # Target high-impact journal articles 357 | works = await retrieve_author_works( 358 | author_id="https://openalex.org/A123456789", 359 | type="journal-article", # Focus on journal publications 360 | open_access_is_oa=True, # Open access only 361 | order_by="citations", # Most cited first 362 | limit=15 363 | ) 364 | 365 | # Career transition analysis 366 | authors = await search_authors( 367 | name="J. Abreu", 368 | institution="EMBO", # Current institution 369 | topic="Machine Learning", # Research focus 370 | limit=10 371 | ) 372 | ``` 373 | 374 | --- 375 | 376 | ## 🧪 Example Usage 377 | 378 | ### Author Disambiguation 379 | 380 | ```python 381 | from alex_mcp.server import search_authors_core 382 | 383 | # Comprehensive author search 384 | results = search_authors_core( 385 | name="J Abreu Vicente", 386 | institution="EMBO", 387 | topic="Machine Learning", 388 | limit=20 389 | ) 390 | 391 | print(f"Found {results.total_count} candidates") 392 | for author in results.results: 393 | print(f"- {author.display_name}") 394 | if author.current_affiliations: 395 | current_inst = author.current_affiliations[0] 396 | print(f" Institution: {current_inst}") 397 | print(f" Metrics: {author.cited_by_count} citations, h-index {author.h_index}") 398 | if author.research_fields: 399 | fields = author.research_fields[:3] 400 | print(f" Research: {', '.join(fields)}") 401 | ``` 402 | 403 | ### Academic Work Analysis 404 | 405 | ```python 406 | from alex_mcp.server import retrieve_author_works_core 407 | 408 | # Comprehensive work retrieval 409 | works = retrieve_author_works_core( 410 | author_id="https://openalex.org/A5058921480", 411 | type="journal-article", # Academic focus 412 | order_by="citations", # Impact-based ordering 413 | limit=20 414 | ) 415 | 416 | print(f"Found {works.total_count} publications") 417 | for work in works.results: 418 | print(f"- {work.title}") 419 | if work.journal_name: 420 | journal = work.journal_name 421 | print(f" Published in: {journal} ({work.publication_year})") 422 | print(f" Impact: {work.cited_by_count} citations") 423 | if work.is_open_access: 424 | print(" ✓ Open Access") 425 | ``` 426 | 427 | ### Institution and Field Analysis 428 | 429 | ```python 430 | # Analyze career transitions (optimized results expose affiliation names, not years) 431 | def analyze_career_path(author_result): 432 | if author_result.past_affiliations: 433 | print("Past affiliations:") 434 | for name in author_result.past_affiliations: 435 | print(f" {name}") 436 | if author_result.current_affiliations: 437 | print("Current affiliations:") 438 | for name in author_result.current_affiliations: 439 | print(f" {name}") 440 | 441 | # Research evolution 442 | if author_result.research_fields: 443 | print("Research areas: " + ", ".join(author_result.research_fields[:5])) 444 | 445 | # Usage 446 | results = search_authors_core("Jorge Abreu Vicente") 447 | if results.results: 448 | analyze_career_path(results.results[0]) 449 | ``` 450 | 451 | --- 452 | 453 | ## 🔧 Configuration Options 454 | 455 | ### Environment Variables 456 | 457 | ```bash 458 | # Required 459 | export OPENALEX_MAILTO=your-email@domain.com 460 | 461 | # Optional settings 462 | export OPENALEX_MAX_AUTHORS=100 # Maximum authors per query 463 | export OPENALEX_USER_AGENT=research-agent-v1.0 464 | export ALEX_MCP_VERSION=4.1.0 465 | 466 | # Rate limiting (respectful usage) 467 | export OPENALEX_RATE_PER_SEC=10 468 | export OPENALEX_RATE_PER_DAY=100000 469 | ``` 470 | 471 | ### Performance Tuning 472 | 473 | ```python 474 | # For comprehensive research applications 475 | config = { 476 | "max_authors_per_query": 25, # Detailed author analysis 477 | "max_works_per_author": 50, # Complete publication history 478 | "enable_all_filters": True, # Full filtering capabilities 479 | "detailed_affiliations": True, # Complete institutional data 480 | "research_concepts": True # Detailed concept analysis 481 | } 482 | ``` 483 | 484 | --- 485 | 486 | ## 🧑‍💻 Development & Testing 487 | 488 | ### Project Structure 489 | ``` 490 | alex-mcp/ 491 | ├── src/alex_mcp/ 492 | │ ├── __init__.py 493 | │ ├── server.py # Main MCP server 494 | │ └── data_objects.py # Data models and structures 495 | ├── examples/ 496 | │ ├── test_institution_resolution.py 497 | │ └── test_author_disambiguation.py 498 | ├── img/ # Logos used in the README 499 | ├── alex-mcp-wrapper.sh # Wrapper script (activates ./venv) 500 | ├── requirements.txt 501 | ├── setup.py 502 | ├── pyproject.toml 503 | └── INSTALL.md 504 | ``` 505 | 506 | ### Running Tests 507 | 508 | ```bash 509 | # Install test dependencies 510 | pip install -e . pytest 511 | 512 | # OPENALEX_MAILTO must be set; the tests hit the live OpenAlex API 513 | export OPENALEX_MAILTO=your-email@domain.com 514 | 515 | # Run the example-based test suite 516 | pytest examples/ -v 517 | 518 | # Run a single test module 519 | pytest examples/test_institution_resolution.py -v 520 | ``` 521 | 522 | ### Development Examples 523 | 524 | ```bash 525 | # Exercise institution resolution (EMBO, MPIA, IRAM) 526 | pytest examples/test_institution_resolution.py -v 527 | 528 | # Exercise author disambiguation (name, affiliation, and topic variants) 529 | pytest examples/test_author_disambiguation.py -v 530 | 531 | # Run everything from the repository root so `src.alex_mcp` resolves 532 | pytest examples/ -v 533 | ``` 534 | 535 | --- 536 | 537 | ## 📈 Integration Examples 538 | 539 | ### Academic Research Workflows 540 | 541 | Perfect integration with AI-powered research analysis: 542 | 543 | ```python 544 | # Enhanced academic research agent 545 | from alex_agent import AcademicResearchAgent 546 | 547 | agent = AcademicResearchAgent( 548 | mcp_servers=[alex_mcp], # Streamlined data processing 549 | model="gpt-4.1-2025-04-14" 550 | ) 551 | 552 | # Complex research queries with structured data 553 | result = await agent.research_author( 554 | "Find J.
Abreu at EMBO with machine learning publications" 555 | ) 556 | 557 | # Rich, structured output for AI reasoning 558 | print(f"Quality Score: {result.quality_score}/100") 559 | print(f"Author disambiguation: {result.confidence}") 560 | print(f"Research fields: {result.research_domains}") 561 | ``` 562 | 563 | ### Multi-Agent Systems 564 | 565 | ```python 566 | # Collaborative research analysis 567 | async def research_collaboration_network(seed_author): 568 | # Find primary author 569 | authors = await alex_mcp.search_authors(seed_author) 570 | primary = authors['results'][0] 571 | 572 | # Get their works 573 | works = await alex_mcp.retrieve_author_works( 574 | primary['id'], 575 | type="journal-article" 576 | ) 577 | 578 | # Analyze co-authors and build network 579 | collaborators = set() 580 | for work in works['results']: 581 | for authorship in work.get('authorships', []): 582 | collaborators.add(authorship['author']['display_name']) 583 | 584 | return { 585 | 'primary_author': primary, 586 | 'publication_count': len(works['results']), 587 | 'collaborator_network': list(collaborators), 588 | 'research_impact': sum(w['cited_by_count'] for w in works['results']) 589 | } 590 | ``` 591 | 592 | --- 593 | 594 | ## 🤝 Contributing 595 | 596 | We welcome contributions to improve functionality and add new features: 597 | 598 | 1. **Fork the repository** 599 | 2. **Create a feature branch**: `git checkout -b feature/enhanced-filtering` 600 | 3. **Add tests**: Ensure your changes maintain data quality and structure 601 | 4. **Submit a pull request**: Include examples and documentation 602 | 603 | ### Development Priorities 604 | 605 | - [ ] Enhanced filtering capabilities 606 | - [ ] Additional data enrichment 607 | - [ ] Performance optimizations 608 | - [ ] Integration examples 609 | - [ ] Documentation improvements 610 | 611 | --- 612 | 613 | ## 📄 License 614 | 615 | This project is licensed under the MIT License. See [LICENSE](LICENSE) for details. 616 | 617 | --- 618 | 619 | ## 🌐 Links 620 | 621 | - [OpenAlex API Documentation](https://docs.openalex.org/) 622 | - [Model Context Protocol](https://modelcontextprotocol.io/) 623 | - [FastMCP](https://github.com/jlowin/fastmcp) 624 | - [OpenAI Agents](https://github.com/openai/openai-agents) 625 | - [Academic Research Examples](examples/) 626 | -------------------------------------------------------------------------------- /src/alex_mcp/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Optimized OpenAlex Author Disambiguation MCP Server with Peer-Review Filtering 4 | 5 | Provides a FastMCP-compliant API for author disambiguation and institution resolution 6 | using the OpenAlex API with streamlined output to minimize token usage.
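Work results can be screened with is_peer_reviewed_journal() and filter_peer_reviewed_works(), which drop data catalogs (e.g. VizieR), preprints, and repository entries.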
7 | """ 8 | 9 | import logging 10 | from typing import Optional 11 | from fastmcp import FastMCP 12 | from alex_mcp.data_objects import ( 13 | OptimizedAuthorResult, 14 | OptimizedSearchResponse, 15 | OptimizedWorksSearchResponse, 16 | OptimizedGeneralWorksSearchResponse, 17 | OptimizedWorkResult, 18 | AutocompleteAuthorCandidate, 19 | AutocompleteAuthorsResponse, 20 | optimize_author_data, 21 | optimize_work_data 22 | ) 23 | import pyalex 24 | import os 25 | import sys 26 | import aiohttp 27 | import asyncio 28 | import json 29 | import re 30 | 31 | def get_config(): 32 | mailto = os.environ.get("OPENALEX_MAILTO") 33 | if not mailto: 34 | print( 35 | "ERROR: The environment variable OPENALEX_MAILTO must be set to your email address " 36 | "to use the OpenAlex MCP server. Example: export OPENALEX_MAILTO='your-email@example.com'", 37 | file=sys.stderr 38 | ) 39 | sys.exit(1) 40 | return { 41 | "OPENALEX_MAILTO": mailto, 42 | "OPENALEX_USER_AGENT": os.environ.get( 43 | "OPENALEX_USER_AGENT", 44 | f"alex-mcp (+{mailto})" 45 | ), 46 | "OPENALEX_MAX_AUTHORS": int(os.environ.get("OPENALEX_MAX_AUTHORS", 50)), # Reduced default 47 | "OPENALEX_RATE_PER_SEC": int(os.environ.get("OPENALEX_RATE_PER_SEC", 10)), 48 | "OPENALEX_RATE_PER_DAY": int(os.environ.get("OPENALEX_RATE_PER_DAY", 100000)), 49 | "OPENALEX_USE_DAILY_API": os.environ.get("OPENALEX_USE_DAILY_API", "true").lower() == "true", 50 | "OPENALEX_SNAPSHOT_INTERVAL_DAYS": int(os.environ.get("OPENALEX_SNAPSHOT_INTERVAL_DAYS", 30)), 51 | "OPENALEX_PREMIUM_UPDATES": os.environ.get("OPENALEX_PREMIUM_UPDATES", "hourly"), 52 | "OPENALEX_RETRACTION_BUG_START": os.environ.get("OPENALEX_RETRACTION_BUG_START", "2023-12-22"), 53 | "OPENALEX_RETRACTION_BUG_END": os.environ.get("OPENALEX_RETRACTION_BUG_END", "2024-03-19"), 54 | "OPENALEX_NO_FUNDING_DATA": os.environ.get("OPENALEX_NO_FUNDING_DATA", "true").lower() == "true", 55 | "OPENALEX_MISSING_CORRESPONDING_AUTHORS": os.environ.get("OPENALEX_MISSING_CORRESPONDING_AUTHORS", "true").lower() == "true", 56 | "OPENALEX_PARTIAL_ABSTRACTS": os.environ.get("OPENALEX_PARTIAL_ABSTRACTS", "true").lower() == "true", 57 | } 58 | 59 | # Configure logging 60 | logging.basicConfig(level=logging.INFO) 61 | logger = logging.getLogger(__name__) 62 | 63 | # Initialize FastMCP server 64 | mcp = FastMCP("OpenAlex Academic Research") 65 | 66 | 67 | def configure_pyalex(email: str): 68 | """ 69 | Configure pyalex for OpenAlex API usage. 70 | 71 | Args: 72 | email (str): The email to use for OpenAlex API requests. 73 | """ 74 | pyalex.config.email = email 75 | 76 | # Load configuration 77 | config = get_config() 78 | configure_pyalex(config["OPENALEX_MAILTO"]) 79 | pyalex.config.user_agent = config["OPENALEX_USER_AGENT"] 80 | 81 | 82 | def is_peer_reviewed_journal(work_data) -> bool: 83 | """ 84 | Improved filter to determine if a work is from a peer-reviewed journal. 85 | 86 | Uses a balanced approach that catches data catalogs and preprints while 87 | not being overly strict about DOIs (some legitimate papers lack them in OpenAlex). 
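Checks run in order: title-pattern exclusions, presence of a primary location and source, known data-catalog and preprint venue names, publisher/DOI presence, source and work type, a publication-year sanity check, and venue-specific quality signals (DOI, publisher, ISSN, name length).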
88 | 89 | Args: 90 | work_data: OpenAlex work object 91 | 92 | Returns: 93 | bool: True if the work appears to be from a peer-reviewed journal 94 | """ 95 | try: 96 | # Safe string extraction with None checking 97 | title = work_data.get('title') or '' 98 | if isinstance(title, str): 99 | title = title.lower() 100 | else: 101 | title = str(title).lower() if title is not None else '' 102 | 103 | # Quick exclusions based on title patterns 104 | title_exclusions = [ 105 | 'vizier online data catalog', 106 | 'online data catalog', 107 | 'data catalog', 108 | 'catalog:', 109 | 'database:', 110 | 'repository:', 111 | 'preprint', 112 | 'arxiv:', 113 | 'biorxiv', 114 | 'medrxiv', 115 | ] 116 | 117 | for exclusion in title_exclusions: 118 | if exclusion in title: 119 | logger.debug(f"Excluding based on title pattern '{exclusion}': {title[:100]}") 120 | return False 121 | 122 | # Check primary location 123 | primary_location = work_data.get('primary_location') 124 | if not primary_location: 125 | logger.debug("Excluding work without primary location") 126 | return False 127 | 128 | # Check source information 129 | source = primary_location.get('source', {}) 130 | if not source: 131 | logger.debug("Excluding work without source") 132 | return False 133 | 134 | # Get journal/source information with safe None checking 135 | journal_name_raw = source.get('display_name') or '' 136 | journal_name = journal_name_raw.lower() if isinstance(journal_name_raw, str) else str(journal_name_raw).lower() 137 | 138 | publisher = work_data.get('publisher', '') 139 | doi = work_data.get('doi') 140 | issn_l = source.get('issn_l') 141 | issn = source.get('issn') 142 | 143 | source_type_raw = source.get('type') or '' 144 | source_type = source_type_raw.lower() if isinstance(source_type_raw, str) else str(source_type_raw).lower() 145 | 146 | # CRITICAL: Exclude known data catalogs by journal name 147 | excluded_journals = [ 148 | 'vizier online data catalog', 149 | 'ycat', 150 | 'catalog', 151 | 'database', 152 | 'repository', 153 | 'arxiv', 154 | 'biorxiv', 155 | 'medrxiv', 156 | 'ssrn', 157 | 'research square', 158 | 'zenodo', 159 | 'figshare', 160 | 'dryad', 161 | 'github', 162 | 'protocols.io', 163 | 'ceur', 164 | 'conference proceedings', 165 | 'workshop proceedings', 166 | ] 167 | 168 | for excluded in excluded_journals: 169 | if excluded in journal_name: 170 | logger.debug(f"Excluding journal pattern '{excluded}': {journal_name}") 171 | return False 172 | 173 | # CRITICAL: Data catalogs typically have no publisher AND no DOI 174 | # This catches VizieR entries effectively 175 | if not publisher and not doi: 176 | logger.debug(f"Excluding work without publisher AND DOI: {title[:100]}") 177 | return False 178 | 179 | # Source type should be journal (if specified) 180 | if source_type and source_type not in ['journal', '']: 181 | logger.debug(f"Excluding non-journal source type: {source_type}") 182 | return False 183 | 184 | # Work type should be article or letter with safe None checking 185 | work_type_raw = work_data.get('type') or '' 186 | work_type = work_type_raw.lower() if isinstance(work_type_raw, str) else str(work_type_raw).lower() 187 | if work_type not in ['article', 'letter']: 188 | logger.debug(f"Excluding work type: {work_type}") 189 | return False 190 | 191 | # Should have reasonable publication year 192 | pub_year = work_data.get('publication_year') 193 | if not pub_year or pub_year < 1900 or pub_year > 2030: 194 | logger.debug(f"Excluding work with invalid publication year: {pub_year}") 195 | return 
False 196 | 197 | # For papers claiming to be from legitimate journals, check quality signals 198 | known_legitimate_journals = [ 199 | 'nature', 200 | 'science', 201 | 'cell', 202 | 'astrophysical journal', 203 | 'astronomy and astrophysics', 204 | 'monthly notices', 205 | 'physical review', 206 | 'journal of', 207 | 'proceedings of', 208 | ] 209 | 210 | is_known_journal = any(known in journal_name for known in known_legitimate_journals) 211 | 212 | if is_known_journal: 213 | # For known journals, be more lenient (don't require DOI) 214 | # But still require either publisher or ISSN 215 | if not publisher and not issn_l and not issn: 216 | logger.debug(f"Excluding known journal without publisher/ISSN: {journal_name}") 217 | return False 218 | else: 219 | # For unknown journals, require more quality signals 220 | quality_signals = sum([ 221 | bool(doi), # Has DOI 222 | bool(publisher), # Has publisher 223 | bool(issn_l or issn), # Has ISSN 224 | bool(journal_name and len(journal_name) > 5), # Reasonable journal name 225 | ]) 226 | 227 | if quality_signals < 2: # Require at least 2 quality signals 228 | logger.debug(f"Excluding unknown journal with insufficient quality signals ({quality_signals}/4): {journal_name}") 229 | return False 230 | 231 | # Additional quality checks 232 | if 'cited_by_count' not in work_data: 233 | logger.debug("Excluding work without citation data") 234 | return False 235 | 236 | # Very long titles might be data descriptions 237 | if len(title) > 250: 238 | logger.debug(f"Excluding work with very long title: {title[:100]}...") 239 | return False 240 | 241 | # If we get here, it passes all checks 242 | logger.debug(f"ACCEPTED: {title[:100]}") 243 | return True 244 | 245 | except Exception as e: 246 | logger.error(f"Error in peer review check for work: {e}") 247 | logger.error(f"Work data keys: {list(work_data.keys()) if isinstance(work_data, dict) else 'Not a dict'}") 248 | logger.error(f"Work title: {repr(work_data.get('title') if isinstance(work_data, dict) else 'N/A')}") 249 | logger.error(f"Primary location: {repr(work_data.get('primary_location') if isinstance(work_data, dict) else 'N/A')}") 250 | import traceback 251 | logger.error(f"Full traceback: {traceback.format_exc()}") 252 | return False 253 | 254 | 255 | def filter_peer_reviewed_works(works: list) -> list: 256 | """ 257 | Apply peer-review filtering to a list of works. 
258 | 259 | Args: 260 | works: List of OpenAlex work objects 261 | 262 | Returns: 263 | list: Filtered list containing only peer-reviewed journal works 264 | """ 265 | filtered_works = [] 266 | excluded_count = 0 267 | 268 | logger.info(f"Starting filtering of {len(works)} works...") 269 | 270 | for i, work in enumerate(works): 271 | # Safe handling of potentially None work or title 272 | if work is None: 273 | logger.warning(f"Skipping None work at position {i+1}") 274 | excluded_count += 1 275 | continue 276 | 277 | title_raw = work.get('title') if isinstance(work, dict) else None 278 | title = (title_raw or 'Unknown')[:60] if title_raw is not None else 'Unknown' 279 | 280 | try: 281 | if is_peer_reviewed_journal(work): 282 | filtered_works.append(work) 283 | logger.debug(f"✓ KEPT work {i+1}: {title}") 284 | else: 285 | excluded_count += 1 286 | logger.debug(f"✗ EXCLUDED work {i+1}: {title}") 287 | except Exception as e: 288 | logger.error(f"Error filtering work {i+1} (title: {title}): {e}") 289 | excluded_count += 1 290 | 291 | logger.info(f"Filtering complete: {len(filtered_works)} kept, {excluded_count} excluded from {len(works)} total") 292 | return filtered_works 293 | 294 | 295 | def search_authors_core( 296 | name: str, 297 | institution: Optional[str] = None, 298 | topic: Optional[str] = None, 299 | country_code: Optional[str] = None, 300 | limit: int = 15 # Reduced default limit 301 | ) -> OptimizedSearchResponse: 302 | """ 303 | Optimized core logic for searching authors using OpenAlex. 304 | Returns streamlined author data to minimize token usage. 305 | 306 | Args: 307 | name: Author name to search for. 308 | institution: (Optional) Institution name filter. 309 | topic: (Optional) Topic filter. 310 | country_code: (Optional) Country code filter. 311 | limit: Maximum number of results to return (default: 15). 312 | 313 | Returns: 314 | OptimizedSearchResponse: Streamlined response with essential author data. 
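    Example (illustrative name and institution; the response is a pydantic model):

        resp = search_authors_core("Jane Doe", institution="EMBL", limit=5)
        print(resp.total_count, resp.model_dump())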
315 | """ 316 | try: 317 | # Build query 318 | query = pyalex.Authors().search_filter(display_name=name) 319 | 320 | # Add filters if provided 321 | filters = {} 322 | if institution: 323 | filters['affiliations.institution.display_name.search'] = institution 324 | if topic: 325 | filters['x_concepts.display_name.search'] = topic 326 | if country_code: 327 | filters['affiliations.institution.country_code'] = country_code 328 | 329 | if filters: 330 | query = query.filter(**filters) 331 | 332 | # Execute query with limit 333 | results = query.get(per_page=min(limit, 100)) # Increased for comprehensive search 334 | authors = list(results) 335 | 336 | # Convert to optimized format 337 | optimized_authors = [] 338 | for author_data in authors: 339 | try: 340 | optimized_author = optimize_author_data(author_data) 341 | optimized_authors.append(optimized_author) 342 | except Exception as e: 343 | logger.warning(f"Error optimizing author data: {e}") 344 | # Skip problematic authors rather than failing completely 345 | continue 346 | 347 | logger.info(f"Found {len(optimized_authors)} authors for query: {name}") 348 | 349 | return OptimizedSearchResponse( 350 | query=name, 351 | total_count=len(optimized_authors), 352 | results=optimized_authors 353 | ) 354 | 355 | except Exception as e: 356 | logger.error(f"Error searching authors for query '{name}': {e}") 357 | return OptimizedSearchResponse( 358 | query=name, 359 | total_count=0, 360 | results=[] 361 | ) 362 | 363 | 364 | def autocomplete_authors_core( 365 | name: str, 366 | context: Optional[str] = None, 367 | limit: int = 10, 368 | filter_no_institution: bool = True, 369 | enable_institution_ranking: bool = True 370 | ) -> AutocompleteAuthorsResponse: 371 | """ 372 | Enhanced core function for author autocomplete with intelligent filtering and ranking. 373 | 374 | Args: 375 | name: Author name to search for 376 | context: Optional context for better matching (institution, research area, etc.) 
377 | limit: Maximum number of candidates to return (increased default to 10) 378 | filter_no_institution: If True, exclude candidates with no institutional affiliation 379 | enable_institution_ranking: If True, rank candidates by institutional context relevance 380 | 381 | Returns: 382 | AutocompleteAuthorsResponse with filtered and ranked candidate authors 383 | """ 384 | try: 385 | logger.info(f"🔍 Autocompleting authors for: '{name}' (limit: {limit})") 386 | if context: 387 | logger.info(f" 📝 Context provided: {context}") 388 | 389 | # Use PyAlex autocomplete for authors - get more results for filtering 390 | raw_limit = min(limit * 2, 20) # Get 2x candidates for filtering 391 | results = pyalex.Authors().autocomplete(name)[:raw_limit] 392 | 393 | # Convert to our data model first 394 | all_candidates = [] 395 | for result in results: 396 | candidate = AutocompleteAuthorCandidate( 397 | openalex_id=result.get('id', ''), 398 | display_name=result.get('display_name', ''), 399 | institution_hint=result.get('hint'), 400 | works_count=result.get('works_count', 0), 401 | cited_by_count=result.get('cited_by_count', 0), 402 | entity_type=result.get('entity_type', 'author'), 403 | external_id=result.get('external_id') 404 | ) 405 | all_candidates.append(candidate) 406 | 407 | # ENHANCEMENT 1: Filter out candidates with no institution 408 | if filter_no_institution: 409 | filtered_candidates = [ 410 | c for c in all_candidates 411 | if c.institution_hint and c.institution_hint not in ['No institution', 'None', ''] 412 | ] 413 | excluded_count = len(all_candidates) - len(filtered_candidates) 414 | if excluded_count > 0: 415 | logger.info(f" 🔍 Filtered out {excluded_count} candidates with no institution") 416 | else: 417 | filtered_candidates = all_candidates 418 | 419 | # ENHANCEMENT 2: Institution-aware ranking (if context provided) 420 | if enable_institution_ranking and context and filtered_candidates: 421 | scored_candidates = [] 422 | context_lower = context.lower() 423 | 424 | for candidate in filtered_candidates: 425 | relevance_score = 0 426 | matched_terms = [] 427 | 428 | inst_hint = (candidate.institution_hint or '').lower() 429 | 430 | # High-value institutional matches 431 | high_value_terms = [ 432 | 'max planck', 'harvard', 'stanford', 'mit', 'cambridge', 'oxford', 433 | 'excellence cluster', 'crick', 'wellcome', 'nih', 'cnrs', 'inserm' 434 | ] 435 | for term in high_value_terms: 436 | if term in context_lower and term in inst_hint: 437 | relevance_score += 3 438 | matched_terms.append(f"{term} (+3)") 439 | 440 | # Location-based matches 441 | location_terms = ['germany', 'uk', 'usa', 'france', 'köln', 'cologne', 'london', 'boston', 'berlin'] 442 | for term in location_terms: 443 | if term in context_lower and term in inst_hint: 444 | relevance_score += 2 445 | matched_terms.append(f"{term} (+2)") 446 | 447 | # Research field alignment (basic keyword matching) 448 | research_terms = ['biology', 'chemistry', 'biochemistry', 'physics', 'medicine'] 449 | for term in research_terms: 450 | if term in context_lower and term in inst_hint: 451 | relevance_score += 1 452 | matched_terms.append(f"{term} (+1)") 453 | 454 | # High-impact researcher bonus 455 | if candidate.cited_by_count and candidate.cited_by_count > 1000: 456 | relevance_score += 1 457 | matched_terms.append("high-impact (+1)") 458 | 459 | scored_candidates.append({ 460 | 'candidate': candidate, 461 | 'relevance_score': relevance_score, 462 | 'matched_terms': matched_terms 463 | }) 464 | 465 | # Sort by relevance score 
(descending), then by citation count 466 | scored_candidates.sort(key=lambda x: (x['relevance_score'], x['candidate'].cited_by_count), reverse=True) 467 | 468 | # Extract ranked candidates 469 | final_candidates = [sc['candidate'] for sc in scored_candidates[:limit]] 470 | 471 | # Log ranking results 472 | logger.info(f" 🏆 Institution-aware ranking applied:") 473 | for i, sc in enumerate(scored_candidates[:3], 1): # Log top 3 474 | candidate = sc['candidate'] 475 | logger.info(f" {i}. {candidate.display_name} (score: {sc['relevance_score']}, {candidate.institution_hint})") 476 | else: 477 | # No ranking, just take first N candidates 478 | final_candidates = filtered_candidates[:limit] 479 | 480 | # Log final candidates 481 | for candidate in final_candidates: 482 | logger.info(f" 👤 {candidate.display_name} ({candidate.institution_hint or 'No institution'}) - {candidate.works_count} works") 483 | 484 | response = AutocompleteAuthorsResponse( 485 | query=name, 486 | context=context, 487 | total_candidates=len(final_candidates), 488 | candidates=final_candidates, 489 | search_metadata={ 490 | 'api_used': 'openalex_autocomplete', 491 | 'has_context': context is not None, 492 | 'filtered_no_institution': filter_no_institution, 493 | 'institution_ranking_enabled': enable_institution_ranking and context is not None, 494 | 'response_time_ms': None # Could be added with timing 495 | } 496 | ) 497 | 498 | logger.info(f"✅ Found {len(final_candidates)} candidates for '{name}'") 499 | return response 500 | 501 | except Exception as e: 502 | logger.error(f"❌ Error in autocomplete_authors_core: {e}") 503 | # Return empty response on error 504 | return AutocompleteAuthorsResponse( 505 | query=name, 506 | context=context, 507 | total_candidates=0, 508 | candidates=[], 509 | search_metadata={ 510 | 'api_used': 'openalex_autocomplete', 511 | 'has_context': context is not None, 512 | 'error': str(e) 513 | } 514 | ) 515 | 516 | 517 | def search_works_core( 518 | query: str, 519 | author: Optional[str] = None, 520 | institution: Optional[str] = None, 521 | publication_year: Optional[int] = None, 522 | type: Optional[str] = None, 523 | limit: int = 25, 524 | peer_reviewed_only: bool = True, 525 | search_type: str = "general" 526 | ) -> OptimizedGeneralWorksSearchResponse: 527 | """ 528 | Core logic for searching works using OpenAlex with configurable search modes. 529 | Returns streamlined work data to minimize token usage. 530 | 531 | Args: 532 | query: Search query text 533 | author: (Optional) Author name filter 534 | institution: (Optional) Institution name filter 535 | publication_year: (Optional) Publication year filter 536 | type: (Optional) Work type filter (e.g., "article", "letter") 537 | limit: Maximum number of results (default: 25, max: 100) 538 | peer_reviewed_only: If True, apply peer-review filters (default: True) 539 | search_type: Search mode - "general" (title/abstract/fulltext), "title" (title only), 540 | or "title_and_abstract" (title and abstract only) 541 | 542 | Returns: 543 | OptimizedGeneralWorksSearchResponse: Streamlined response with work data. 
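    Example (query reused from the PubMed examples below; parameters are illustrative):

        resp = search_works_core(
            "ADP-ribosylation DNA repair",
            search_type="title_and_abstract",
            limit=10,
        )
        print(resp.total_count)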
544 | """ 545 | try: 546 | # Ensure reasonable limits to control token usage 547 | limit = min(limit, 100) 548 | 549 | # Build the search query using PyAlex based on search_type 550 | if search_type == "title": 551 | # Use title-specific search for precise title matching 552 | works_query = pyalex.Works() 553 | filters = {'title.search': query} 554 | elif search_type == "title_and_abstract": 555 | # Use title and abstract search 556 | works_query = pyalex.Works() 557 | filters = {'title_and_abstract.search': query} 558 | else: # search_type == "general" or any other value 559 | # Use general search across title, abstract, and fulltext (default behavior) 560 | works_query = pyalex.Works().search(query) 561 | filters = {} 562 | 563 | # Add author filter if provided 564 | if author: 565 | # For general work search, we can use raw_author_name.search for name-based filtering 566 | # This searches for works where the author name appears in the raw author strings 567 | filters['raw_author_name.search'] = author 568 | 569 | # Add institution filter if provided 570 | if institution: 571 | # Use the correct field for institution name filtering 572 | filters['authorships.institutions.display_name.search'] = institution 573 | 574 | # Add publication year filter 575 | if publication_year: 576 | filters['publication_year'] = publication_year 577 | 578 | # Add type filter 579 | if type: 580 | filters['type'] = type 581 | elif peer_reviewed_only: 582 | # Focus on journal articles and letters for academic work 583 | filters['type'] = 'article|letter' 584 | 585 | # Add basic quality filters 586 | if peer_reviewed_only: 587 | filters['is_retracted'] = False 588 | 589 | # Apply filters to query 590 | if filters: 591 | works_query = works_query.filter(**filters) 592 | 593 | # Execute query 594 | logger.info(f"Searching OpenAlex works with search_type='{search_type}', query: '{query[:50]}...' 
and {len(filters)} filters") 595 | results = works_query.get(per_page=limit) 596 | 597 | # Apply additional peer-review filtering if requested 598 | if peer_reviewed_only and results: 599 | logger.info(f"Applying peer-review filtering to {len(results)} results...") 600 | results = filter_peer_reviewed_works(results) 601 | logger.info(f"After peer-review filtering: {len(results)} results remain") 602 | 603 | # Convert to optimized format 604 | optimized_works = [] 605 | for work in results: 606 | try: 607 | optimized_work = optimize_work_data(work) 608 | optimized_works.append(optimized_work) 609 | except Exception as e: 610 | logger.warning(f"Error optimizing work data: {e}") 611 | continue 612 | 613 | logger.info(f"Returning {len(optimized_works)} optimized works for search query") 614 | 615 | return OptimizedGeneralWorksSearchResponse( 616 | query=query, 617 | total_count=len(optimized_works), 618 | results=optimized_works, 619 | filters=filters 620 | ) 621 | 622 | except Exception as e: 623 | logger.error(f"Error searching works for query '{query}': {e}") 624 | return OptimizedGeneralWorksSearchResponse( 625 | query=query, 626 | total_count=0, 627 | results=[], 628 | filters={} 629 | ) 630 | 631 | 632 | def retrieve_author_works_core( 633 | author_id: str, 634 | limit: int = 20_000, # High default limit for comprehensive analysis 635 | order_by: str = "date", # "date" or "citations" 636 | publication_year: Optional[int] = None, 637 | type: Optional[str] = None, 638 | journal_only: bool = True, # Default to True for peer-reviewed content 639 | min_citations: Optional[int] = None, 640 | peer_reviewed_only: bool = True, # Default to True 641 | ) -> OptimizedWorksSearchResponse: 642 | """ 643 | Enhanced core logic to retrieve peer-reviewed works for a given OpenAlex Author ID. 644 | Returns streamlined work data to minimize token usage and ensures only legitimate 645 | peer-reviewed journal articles and letters. 646 | 647 | Args: 648 | author_id: OpenAlex Author ID 649 | limit: Maximum number of results (default: 20,000 for comprehensive analysis) 650 | order_by: Sort order - "date" or "citations" 651 | publication_year: Filter by specific year 652 | type: Filter by work type (e.g., "article", "letter") 653 | journal_only: If True, only return journal articles and letters 654 | min_citations: Minimum citation count filter 655 | peer_reviewed_only: If True, apply comprehensive peer-review filters 656 | 657 | Returns: 658 | OptimizedWorksSearchResponse: Streamlined response with peer-reviewed work data.
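    Example (placeholder author ID, as in the tool docstring below):

        resp = retrieve_author_works_core(
            "https://openalex.org/A123456789", limit=50, order_by="citations"
        )
        print(resp.author_name, resp.total_count)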
659 | """ 660 | try: 661 | limit = min(limit, 20_000) 662 | 663 | # Build base filters 664 | filters = {"author.id": author_id} 665 | 666 | # Add optional filters 667 | if publication_year: 668 | filters["publication_year"] = publication_year 669 | if type: 670 | filters["type"] = type 671 | elif journal_only: 672 | # Focus on journal articles and letters for academic work 673 | filters["type"] = "article|letter" 674 | if min_citations: 675 | filters["cited_by_count"] = f">={min_citations}" 676 | 677 | # Add some basic API-level filters (but not too restrictive) 678 | if peer_reviewed_only or journal_only: 679 | # Only exclude obviously retracted papers at API level 680 | filters["is_retracted"] = "false" 681 | 682 | # Convert author_id to proper format if needed 683 | if author_id.startswith("https://openalex.org/"): 684 | author_id_short = author_id.split("/")[-1] 685 | filters["author.id"] = f"https://openalex.org/{author_id_short}" 686 | 687 | # Build query - get more results for post-filtering if needed 688 | if peer_reviewed_only: 689 | initial_limit = min(limit * 4, 20_000) # Get 4x more for filtering, much higher limit 690 | else: 691 | initial_limit = limit 692 | 693 | works_query = pyalex.Works().filter(**filters) 694 | 695 | # Apply sorting 696 | if order_by == "citations": 697 | works_query = works_query.sort(cited_by_count="desc") 698 | else: 699 | works_query = works_query.sort(publication_date="desc") 700 | 701 | # Execute query using pagination to get ALL works 702 | logger.info(f"Querying OpenAlex for up to {initial_limit} works with filters: {filters}") 703 | 704 | # Use paginate() to get all works, not just the first page 705 | all_works = [] 706 | pager = works_query.paginate(per_page=200, n_max=initial_limit) # Use 200 per page (API recommended) 707 | 708 | for page in pager: 709 | all_works.extend(page) 710 | if len(all_works) >= initial_limit: 711 | break 712 | 713 | works = all_works[:initial_limit] # Ensure we don't exceed the limit 714 | logger.info(f"Retrieved {len(works)} works from OpenAlex via pagination") 715 | 716 | # Apply peer-review filtering if requested 717 | if peer_reviewed_only: 718 | logger.info("Applying peer-review filtering...") 719 | works = filter_peer_reviewed_works(works) 720 | logger.info(f"After filtering: {len(works)} works remain") 721 | 722 | # Limit to requested number after filtering 723 | works = works[:limit] 724 | 725 | # Get author name for response (if available from first work) 726 | author_name = None 727 | if works: 728 | authorships = works[0].get('authorships', []) 729 | for authorship in authorships: 730 | author = authorship.get('author', {}) 731 | if author.get('id') == author_id: 732 | author_name = author.get('display_name') 733 | break 734 | 735 | # Convert to optimized format 736 | optimized_works = [] 737 | for work_data in works: 738 | try: 739 | optimized_work = optimize_work_data(work_data) 740 | optimized_works.append(optimized_work) 741 | except Exception as e: 742 | logger.warning(f"Error optimizing work data: {e}") 743 | continue 744 | 745 | logger.info(f"Final result: {len(optimized_works)} works for author: {author_id}") 746 | 747 | return OptimizedWorksSearchResponse( 748 | author_id=author_id, 749 | author_name=author_name, 750 | total_count=len(optimized_works), 751 | results=optimized_works, 752 | filters=filters 753 | ) 754 | 755 | except Exception as e: 756 | logger.error(f"Error retrieving works for author {author_id}: {e}") 757 | return OptimizedWorksSearchResponse( 758 | author_id=author_id, 759 | 
total_count=0, 760 | results=[], 761 | filters={} 762 | ) 763 | 764 | 765 | @mcp.tool( 766 | annotations={ 767 | "title": "Search Authors (Optimized)", 768 | "description": ( 769 | "Search for authors by name with optional filters. " 770 | "Returns streamlined author data optimized for AI agents with ~70% fewer tokens. " 771 | "Includes essential info: name, ORCID, affiliations (as strings), metrics, and research fields." 772 | ), 773 | "readOnlyHint": True, 774 | "openWorldHint": True 775 | } 776 | ) 777 | async def search_authors( 778 | name: str, 779 | institution: Optional[str] = None, 780 | topic: Optional[str] = None, 781 | country_code: Optional[str] = None, 782 | limit: int = 15 783 | ) -> dict: 784 | """ 785 | Optimized MCP tool wrapper for searching authors. 786 | 787 | Args: 788 | name: Author name to search for. 789 | institution: (Optional) Institution name filter. 790 | topic: (Optional) Topic filter. 791 | country_code: (Optional) Country code filter. 792 | limit: Maximum number of results to return (default: 15, max: 100). 793 | 794 | Returns: 795 | dict: Serialized OptimizedSearchResponse with streamlined author data. 796 | """ 797 | # Ensure reasonable limits to control token usage 798 | limit = min(limit, 100) # Increased for comprehensive author search 799 | 800 | response = search_authors_core( 801 | name=name, 802 | institution=institution, 803 | topic=topic, 804 | country_code=country_code, 805 | limit=limit 806 | ) 807 | return response.model_dump() 808 | 809 | 810 | @mcp.tool( 811 | annotations={ 812 | "title": "Retrieve Author Works (Peer-Reviewed Only)", 813 | "description": ( 814 | "Retrieve peer-reviewed journal works for a given OpenAlex Author ID. " 815 | "Automatically filters out data catalogs, preprint servers, and non-journal content. " 816 | "Returns streamlined work data optimized for AI agents with ~80% fewer tokens. " 817 | "Uses balanced filtering: excludes VizieR catalogs but allows legitimate papers without DOIs." 818 | ), 819 | "readOnlyHint": True, 820 | "openWorldHint": True 821 | } 822 | ) 823 | async def retrieve_author_works( 824 | author_id: str, 825 | limit: Optional[int] = None, 826 | order_by: str = "date", 827 | publication_year: Optional[int] = None, 828 | type: Optional[str] = None, 829 | journal_only: bool = True, 830 | min_citations: Optional[int] = None, 831 | peer_reviewed_only: bool = True, 832 | ) -> dict: 833 | """ 834 | Enhanced MCP tool wrapper for retrieving author works with flexible filtering. 835 | 836 | Args: 837 | author_id: OpenAlex Author ID (e.g., 'https://openalex.org/A123456789') 838 | limit: Maximum number of results (default: None = ALL works via pagination, max: 2000) 839 | order_by: Sort order - "date" for newest first, "citations" for most cited first 840 | publication_year: Filter by specific publication year 841 | type: Filter by work type (e.g., "journal-article", "letter") 842 | journal_only: If True, only return journal articles and letters (default: True) 843 | min_citations: Only return works with at least this many citations 844 | peer_reviewed_only: If True, apply balanced peer-review filters (default: True) 845 | 846 | Returns: 847 | dict: Serialized OptimizedWorksSearchResponse with author's works. 
848 | 849 | Usage Patterns: 850 | # For AI validation (sample of high-impact works) 851 | retrieve_author_works(author_id, limit=20, order_by="citations") 852 | 853 | # For complete benchmark evaluation (ALL works, minimal filtering) 854 | retrieve_author_works(author_id, peer_reviewed_only=False, journal_only=False) 855 | 856 | # For peer-reviewed works only (default behavior) 857 | retrieve_author_works(author_id) 858 | """ 859 | # Handle limit: None means ALL works, otherwise cap at reasonable limit 860 | logger.info(f"MCP tool received limit parameter: {limit}") 861 | if limit is None: 862 | limit = 2000 # Set a very high limit to get ALL works 863 | logger.info(f"No limit specified, setting to {limit} for comprehensive retrieval") 864 | else: 865 | limit = min(limit, 2000) # Increased max limit for comprehensive analysis 866 | logger.info(f"Explicit limit specified, capped to {limit}") 867 | 868 | response = retrieve_author_works_core( 869 | author_id=author_id, 870 | limit=limit, 871 | order_by=order_by, 872 | publication_year=publication_year, 873 | type=type, 874 | journal_only=journal_only, 875 | min_citations=min_citations, 876 | peer_reviewed_only=peer_reviewed_only, 877 | ) 878 | return response.model_dump() 879 | 880 | 881 | @mcp.tool( 882 | annotations={ 883 | "title": "Search Works (Optimized)", 884 | "description": ( 885 | "Search for academic works with configurable search modes and optional filters. " 886 | "Returns streamlined work data optimized for AI agents with ~80% fewer tokens. " 887 | "Supports different search types: 'general' (title/abstract/fulltext), 'title' (title only), " 888 | "or 'title_and_abstract' (title and abstract only). " 889 | "Supports author, institution, publication year, and type filters. " 890 | "Automatically applies peer-review filtering to exclude data catalogs and preprints." 891 | ), 892 | "readOnlyHint": True, 893 | "openWorldHint": True 894 | } 895 | ) 896 | async def search_works( 897 | query: str, 898 | author: Optional[str] = None, 899 | institution: Optional[str] = None, 900 | publication_year: Optional[int] = None, 901 | type: Optional[str] = None, 902 | limit: int = 25, 903 | peer_reviewed_only: bool = True, 904 | search_type: str = "general" 905 | ) -> dict: 906 | """ 907 | Optimized MCP tool wrapper for searching works. 908 | 909 | Args: 910 | query: Search query text 911 | author: (Optional) Author name filter 912 | institution: (Optional) Institution name filter 913 | publication_year: (Optional) Publication year filter 914 | type: (Optional) Work type filter (e.g., "article", "letter") 915 | limit: Maximum number of results (default: 25, max: 100) 916 | peer_reviewed_only: If True, apply peer-review filters (default: True) 917 | search_type: Search mode - "general" (title/abstract/fulltext), "title" (title only), 918 | or "title_and_abstract" (title and abstract only) 919 | 920 | Returns: 921 | dict: Serialized OptimizedGeneralWorksSearchResponse with streamlined work data. 
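    Example usage (illustrative queries; the names appear elsewhere in this module):
        # Title-only search for precise matching
        search_works("ADP-ribosylation DNA repair", search_type="title", limit=5)

        # General search restricted to one author's works
        search_works("DNA repair", author="Ivan Matic", limit=10)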
922 | """ 923 | # Ensure reasonable limits to control token usage 924 | limit = min(limit, 100) 925 | 926 | response = search_works_core( 927 | query=query, 928 | author=author, 929 | institution=institution, 930 | publication_year=publication_year, 931 | type=type, 932 | limit=limit, 933 | peer_reviewed_only=peer_reviewed_only, 934 | search_type=search_type 935 | ) 936 | return response.model_dump() 937 | 938 | 939 | @mcp.tool( 940 | annotations={ 941 | "title": "Autocomplete Authors (Smart Disambiguation)", 942 | "description": ( 943 | "Get multiple author candidates using OpenAlex autocomplete API for intelligent disambiguation. " 944 | "Returns a ranked list of potential author matches with institutional hints and research metrics. " 945 | "Perfect when you need to disambiguate authors and have context like institution, research area, or co-authors. " 946 | "The AI can select the best match based on the provided context. " 947 | "Much faster than full search (~200ms) and provides multiple options for better accuracy." 948 | ), 949 | "readOnlyHint": True, 950 | "openWorldHint": True 951 | } 952 | ) 953 | async def autocomplete_authors( 954 | name: str, 955 | context: Optional[str] = None, 956 | limit: int = 10, 957 | filter_no_institution: bool = True, 958 | enable_institution_ranking: bool = True 959 | ) -> dict: 960 | """ 961 | Enhanced autocomplete authors with intelligent filtering and ranking. 962 | 963 | Args: 964 | name: Author name to search for (e.g., "James Briscoe", "M. Ralser") 965 | context: Optional context to help with disambiguation (e.g., "Francis Crick Institute developmental biology", "Max Planck Institute Köln Germany") 966 | limit: Maximum number of candidates to return (default: 10, max: 15) 967 | filter_no_institution: If True, exclude candidates with no institutional affiliation (default: True) 968 | enable_institution_ranking: If True, rank candidates by institutional context relevance (default: True) 969 | 970 | Returns: 971 | dict: Serialized AutocompleteAuthorsResponse with filtered and ranked candidate authors, including: 972 | - openalex_id: Full OpenAlex author ID 973 | - display_name: Author's display name 974 | - institution_hint: Current/last known institution 975 | - works_count: Number of published works 976 | - cited_by_count: Total citation count 977 | - external_id: ORCID or other external identifiers 978 | - search_metadata: Information about filtering and ranking applied 979 | 980 | Example usage: 981 | # Get high-quality candidates with institutional filtering 982 | candidates = await autocomplete_authors("Ivan Matić", context="Max Planck Institute Biology Ageing Köln Germany") 983 | 984 | # For seasoned researchers, institution hints and ranking help disambiguation 985 | # AI can then select the best match or retrieve works for further verification 986 | 987 | Enhanced Features: 988 | - Filters out candidates with no institutional affiliation (reduces noise) 989 | - Institution-aware ranking when context is provided (improves accuracy) 990 | - Higher default limit (10 vs 5) for better candidate coverage 991 | - Detailed logging for debugging and optimization 992 | """ 993 | # Ensure reasonable limits - increased max to 15 994 | limit = min(max(limit, 1), 15) 995 | 996 | response = autocomplete_authors_core( 997 | name=name, 998 | context=context, 999 | limit=limit, 1000 | filter_no_institution=filter_no_institution, 1001 | enable_institution_ranking=enable_institution_ranking 1002 | ) 1003 | return response.model_dump() 1004 | 1005 | 1006 | # 
PubMed Integration Functions 1007 | import requests 1008 | import xml.etree.ElementTree as ET 1009 | from typing import Union 1010 | 1011 | def pubmed_search_core( 1012 | query: str, 1013 | max_results: int = 20, 1014 | search_type: str = "author" 1015 | ) -> dict: 1016 | """ 1017 | Core PubMed search functionality using E-utilities API. 1018 | 1019 | Args: 1020 | query: Search query (author name, DOI, or keywords) 1021 | max_results: Maximum number of results to return 1022 | search_type: Type of search ("author", "doi", "title", "keywords") 1023 | 1024 | Returns: 1025 | dict with search results including PMIDs, total count, and basic metadata 1026 | """ 1027 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 1028 | 1029 | try: 1030 | # Format search term based on type 1031 | if search_type == "author": 1032 | search_term = f'"{query}"[Author]' 1033 | elif search_type == "doi": 1034 | clean_doi = query.replace('https://doi.org/', '').replace('http://dx.doi.org/', '') 1035 | search_term = f'"{clean_doi}"[AID]' 1036 | elif search_type == "title": 1037 | search_term = f'"{query}"[Title]' 1038 | else: # keywords 1039 | search_term = query 1040 | 1041 | logger.info(f"🔍 PubMed search: {search_term} (max: {max_results})") 1042 | 1043 | # Search PubMed 1044 | search_url = f"{base_url}esearch.fcgi" 1045 | search_params = { 1046 | 'db': 'pubmed', 1047 | 'term': search_term, 1048 | 'retmax': max_results, 1049 | 'retmode': 'json', 1050 | 'sort': 'relevance' 1051 | } 1052 | 1053 | response = requests.get(search_url, params=search_params, timeout=10) 1054 | response.raise_for_status() 1055 | search_data = response.json() 1056 | 1057 | pmids = search_data.get('esearchresult', {}).get('idlist', []) 1058 | total_count = int(search_data.get('esearchresult', {}).get('count', 0)) 1059 | 1060 | logger.info(f"📊 Found {total_count} total results, retrieved {len(pmids)} PMIDs") 1061 | 1062 | # Get basic details for retrieved PMIDs (if any) 1063 | articles = [] 1064 | if pmids: 1065 | articles = get_pubmed_summaries(pmids[:min(len(pmids), 10)]) # Limit to 10 for performance 1066 | 1067 | return { 1068 | 'query': query, 1069 | 'search_type': search_type, 1070 | 'search_term_used': search_term, 1071 | 'total_count': total_count, 1072 | 'retrieved_count': len(pmids), 1073 | 'pmids': pmids, 1074 | 'articles': articles, 1075 | 'search_metadata': { 1076 | 'api_used': 'pubmed_esearch', 1077 | 'max_results_requested': max_results, 1078 | 'response_time_ms': None 1079 | } 1080 | } 1081 | 1082 | except Exception as e: 1083 | logger.error(f"❌ PubMed search error: {e}") 1084 | return { 1085 | 'query': query, 1086 | 'search_type': search_type, 1087 | 'total_count': 0, 1088 | 'retrieved_count': 0, 1089 | 'pmids': [], 1090 | 'articles': [], 1091 | 'error': str(e) 1092 | } 1093 | 1094 | 1095 | def get_pubmed_summaries(pmids: list) -> list: 1096 | """ 1097 | Get summary information for a list of PMIDs using esummary. 
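    All PMIDs are fetched in a single batched esummary request, roughly:
    GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=<pmid1>,<pmid2>&retmode=json
    (the PMID values are placeholders).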
1098 | 1099 | Args: 1100 | pmids: List of PubMed IDs 1101 | 1102 | Returns: 1103 | List of article summaries with basic metadata 1104 | """ 1105 | if not pmids: 1106 | return [] 1107 | 1108 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 1109 | 1110 | try: 1111 | # Get summaries 1112 | summary_url = f"{base_url}esummary.fcgi" 1113 | summary_params = { 1114 | 'db': 'pubmed', 1115 | 'id': ','.join(pmids), 1116 | 'retmode': 'json' 1117 | } 1118 | 1119 | response = requests.get(summary_url, params=summary_params, timeout=15) 1120 | response.raise_for_status() 1121 | summary_data = response.json() 1122 | 1123 | articles = [] 1124 | uids = summary_data.get('result', {}).get('uids', []) 1125 | 1126 | for uid in uids: 1127 | article_data = summary_data.get('result', {}).get(uid, {}) 1128 | if article_data: 1129 | # Extract key information 1130 | authors = article_data.get('authors', []) 1131 | author_names = [author.get('name', '') for author in authors[:5]] # First 5 authors 1132 | 1133 | article = { 1134 | 'pmid': uid, 1135 | 'title': article_data.get('title', ''), 1136 | 'authors': author_names, 1137 | 'journal': article_data.get('fulljournalname', ''), 1138 | 'pub_date': article_data.get('pubdate', ''), 1139 | 'doi': article_data.get('elocationid', ''), # Often contains DOI 1140 | 'pmcid': article_data.get('pmcid', ''), 1141 | 'publication_types': article_data.get('pubtype', []) 1142 | } 1143 | articles.append(article) 1144 | 1145 | logger.info(f"📄 Retrieved summaries for {len(articles)} articles") 1146 | return articles 1147 | 1148 | except Exception as e: 1149 | logger.error(f"❌ Error getting PubMed summaries: {e}") 1150 | return [] 1151 | 1152 | 1153 | def get_pubmed_author_sample(author_name: str, sample_size: int = 5) -> dict: 1154 | """ 1155 | Get a sample of works by an author from PubMed with institutional information. 
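    Combines pubmed_search_core() and get_detailed_pubmed_article() to collect
    affiliations, name variants, and e-mail addresses for the target author.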
1156 | 1157 | Args: 1158 | author_name: Author name to search for 1159 | sample_size: Number of sample works to analyze in detail 1160 | 1161 | Returns: 1162 | dict with author sample analysis including affiliations and name variants 1163 | """ 1164 | try: 1165 | logger.info(f"🔍 Getting PubMed author sample for: {author_name}") 1166 | 1167 | # Search for author 1168 | search_result = pubmed_search_core(author_name, max_results=sample_size, search_type="author") 1169 | 1170 | if not search_result['pmids']: 1171 | return { 1172 | 'author_name': author_name, 1173 | 'total_works': 0, 1174 | 'sample_works': [], 1175 | 'institutional_keywords': [], 1176 | 'name_variants': [], 1177 | 'email_addresses': [] 1178 | } 1179 | 1180 | # Get detailed information for sample 1181 | sample_pmids = search_result['pmids'][:sample_size] 1182 | detailed_articles = [] 1183 | all_affiliations = [] 1184 | name_variants = set() 1185 | email_addresses = set() 1186 | 1187 | for pmid in sample_pmids: 1188 | article_details = get_detailed_pubmed_article(pmid, author_name) 1189 | if article_details: 1190 | detailed_articles.append(article_details) 1191 | 1192 | # Extract affiliations and variants for target author 1193 | for author_info in article_details.get('author_details', []): 1194 | if is_target_author(author_info, author_name): 1195 | all_affiliations.extend(author_info.get('affiliations', [])) 1196 | 1197 | # Collect name variants 1198 | full_name = f"{author_info['first_name']} {author_info['last_name']}".strip() 1199 | if full_name: 1200 | name_variants.add(full_name) 1201 | 1202 | # Extract email addresses 1203 | for affil in author_info.get('affiliations', []): 1204 | emails = extract_emails_from_text(affil) 1205 | email_addresses.update(emails) 1206 | 1207 | # Extract institutional keywords 1208 | institutional_keywords = extract_institutional_keywords(all_affiliations) 1209 | 1210 | return { 1211 | 'author_name': author_name, 1212 | 'total_works': search_result['total_count'], 1213 | 'sample_works': detailed_articles, 1214 | 'institutional_keywords': institutional_keywords, 1215 | 'name_variants': list(name_variants), 1216 | 'email_addresses': list(email_addresses), 1217 | 'sample_metadata': { 1218 | 'sample_size': len(detailed_articles), 1219 | 'affiliations_found': len(all_affiliations) 1220 | } 1221 | } 1222 | 1223 | except Exception as e: 1224 | logger.error(f"❌ Error in PubMed author sample: {e}") 1225 | return { 1226 | 'author_name': author_name, 1227 | 'total_works': 0, 1228 | 'sample_works': [], 1229 | 'error': str(e) 1230 | } 1231 | 1232 | 1233 | def get_detailed_pubmed_article(pmid: str, target_author: str) -> dict: 1234 | """Get detailed article information including author affiliations""" 1235 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 1236 | 1237 | try: 1238 | fetch_url = f"{base_url}efetch.fcgi" 1239 | fetch_params = { 1240 | 'db': 'pubmed', 1241 | 'id': pmid, 1242 | 'retmode': 'xml', 1243 | 'rettype': 'abstract' 1244 | } 1245 | 1246 | response = requests.get(fetch_url, params=fetch_params, timeout=10) 1247 | response.raise_for_status() 1248 | 1249 | # Parse XML 1250 | root = ET.fromstring(response.text) 1251 | article = root.find('.//PubmedArticle') 1252 | 1253 | if article is None: 1254 | return None 1255 | 1256 | # Extract basic info 1257 | title_elem = article.find('.//ArticleTitle') 1258 | title = ''.join(title_elem.itertext()).strip() if title_elem is not None else '' 1259 | 1260 | journal_elem = article.find('.//Journal/Title') 1261 | journal = journal_elem.text if 
journal_elem is not None else '' 1262 | 1263 | # Extract authors with affiliations 1264 | author_details = [] 1265 | author_list = article.find('.//AuthorList') 1266 | if author_list is not None: 1267 | for author_elem in author_list.findall('Author'): 1268 | author_info = extract_detailed_author_info(author_elem) 1269 | author_details.append(author_info) 1270 | 1271 | return { 1272 | 'pmid': pmid, 1273 | 'title': title, 1274 | 'journal': journal, 1275 | 'author_details': author_details 1276 | } 1277 | 1278 | except Exception as e: 1279 | logger.error(f"❌ Error fetching detailed article {pmid}: {e}") 1280 | return None 1281 | 1282 | 1283 | def extract_detailed_author_info(author_elem: ET.Element) -> dict: 1284 | """Extract detailed author information from XML element""" 1285 | author_info = { 1286 | 'last_name': '', 1287 | 'first_name': '', 1288 | 'initials': '', 1289 | 'affiliations': [] 1290 | } 1291 | 1292 | try: 1293 | last_name = author_elem.find('LastName') 1294 | if last_name is not None: 1295 | author_info['last_name'] = last_name.text or '' 1296 | 1297 | first_name = author_elem.find('ForeName') 1298 | if first_name is not None: 1299 | author_info['first_name'] = first_name.text or '' 1300 | 1301 | initials = author_elem.find('Initials') 1302 | if initials is not None: 1303 | author_info['initials'] = initials.text or '' 1304 | 1305 | # Get affiliations 1306 | affil_info = author_elem.find('AffiliationInfo') 1307 | if affil_info is not None: 1308 | for affil in affil_info.findall('Affiliation'): 1309 | if affil.text: 1310 | author_info['affiliations'].append(affil.text.strip()) 1311 | 1312 | except Exception: 1313 | pass 1314 | 1315 | return author_info 1316 | 1317 | 1318 | def is_target_author(author_info: dict, target_name: str) -> bool: 1319 | """Check if author_info matches target author name""" 1320 | full_name = f"{author_info['first_name']} {author_info['last_name']}".strip().lower() 1321 | target_lower = target_name.lower() 1322 | 1323 | # Simple similarity check 1324 | return (target_lower in full_name or 1325 | full_name in target_lower or 1326 | author_info['last_name'].lower() in target_lower) 1327 | 1328 | 1329 | def extract_emails_from_text(text: str) -> list: 1330 | """Extract email addresses from text""" 1331 | import re 1332 | email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' 1333 | return re.findall(email_pattern, text) 1334 | 1335 | 1336 | def extract_institutional_keywords(affiliations: list) -> list: 1337 | """Extract common institutional keywords from affiliations""" 1338 | if not affiliations: 1339 | return [] 1340 | 1341 | # Combine all affiliations 1342 | all_text = ' '.join(affiliations).lower() 1343 | 1344 | # Common institutional keywords 1345 | keywords = [] 1346 | institutional_terms = [ 1347 | 'university', 'institute', 'college', 'school', 'center', 'centre', 1348 | 'hospital', 'laboratory', 'department', 'faculty', 'division', 1349 | 'max planck', 'harvard', 'stanford', 'mit', 'cambridge', 'oxford', 1350 | 'excellence cluster', 'cnrs', 'inserm', 'nih' 1351 | ] 1352 | 1353 | for term in institutional_terms: 1354 | if term in all_text: 1355 | keywords.append(term) 1356 | 1357 | return keywords[:10] # Return top 10 1358 | 1359 | 1360 | @mcp.tool( 1361 | annotations={ 1362 | "title": "Search PubMed", 1363 | "description": ( 1364 | "Search PubMed database for publications by author, DOI, title, or keywords. " 1365 | "Provides basic article metadata including authors, journal, and publication info. 
" 1366 | "Useful for cross-validation with OpenAlex data and discovering name variants." 1367 | ), 1368 | "readOnlyHint": True, 1369 | "openWorldHint": True 1370 | } 1371 | ) 1372 | async def search_pubmed( 1373 | query: str, 1374 | search_type: str = "author", 1375 | max_results: int = 20 1376 | ) -> dict: 1377 | """ 1378 | Search PubMed database for publications. 1379 | 1380 | Args: 1381 | query: Search query (author name, DOI, title, or keywords) 1382 | search_type: Type of search - "author", "doi", "title", or "keywords" (default: "author") 1383 | max_results: Maximum number of results to return (default: 20, max: 50) 1384 | 1385 | Returns: 1386 | dict: Search results with PMIDs, article metadata, and summary statistics 1387 | 1388 | Example usage: 1389 | # Search for author 1390 | search_pubmed("Ivan Matic", search_type="author", max_results=10) 1391 | 1392 | # Search by DOI 1393 | search_pubmed("10.1038/nprot.2009.36", search_type="doi") 1394 | 1395 | # Search by keywords 1396 | search_pubmed("ADP-ribosylation DNA repair", search_type="keywords") 1397 | """ 1398 | # Validate parameters 1399 | max_results = min(max(max_results, 1), 50) # Cap at 50 for performance 1400 | valid_types = ["author", "doi", "title", "keywords"] 1401 | if search_type not in valid_types: 1402 | search_type = "author" 1403 | 1404 | logger.info(f"🔍 PubMed search: '{query}' (type: {search_type}, max: {max_results})") 1405 | 1406 | result = pubmed_search_core(query, max_results, search_type) 1407 | return result 1408 | 1409 | 1410 | @mcp.tool( 1411 | annotations={ 1412 | "title": "PubMed Author Sample", 1413 | "description": ( 1414 | "Get a detailed sample of works by an author from PubMed including " 1415 | "institutional affiliations, name variants, and email addresses. " 1416 | "Useful for cross-validation and institutional disambiguation." 1417 | ), 1418 | "readOnlyHint": True, 1419 | "openWorldHint": True 1420 | } 1421 | ) 1422 | async def pubmed_author_sample( 1423 | author_name: str, 1424 | sample_size: int = 5 1425 | ) -> dict: 1426 | """ 1427 | Get detailed author sample from PubMed with institutional information. 1428 | 1429 | Args: 1430 | author_name: Author name to search for (e.g., "Ivan Matic", "J Smith") 1431 | sample_size: Number of recent works to analyze in detail (default: 5, max: 10) 1432 | 1433 | Returns: 1434 | dict: Author analysis including: 1435 | - total_works: Total number of works found in PubMed 1436 | - sample_works: Detailed information for sample works 1437 | - institutional_keywords: Common institutional terms found 1438 | - name_variants: Different name formats found 1439 | - email_addresses: Email addresses extracted from affiliations 1440 | 1441 | Example usage: 1442 | # Get institutional profile for author 1443 | pubmed_author_sample("Ivan Matic", sample_size=5) 1444 | """ 1445 | # Validate parameters 1446 | sample_size = min(max(sample_size, 1), 10) # Cap at 10 for performance 1447 | 1448 | logger.info(f"🔍 PubMed author sample: '{author_name}' (sample: {sample_size})") 1449 | 1450 | result = get_pubmed_author_sample(author_name, sample_size) 1451 | return result 1452 | 1453 | 1454 | # ============================================================================ 1455 | # ORCID Integration Functions 1456 | # ============================================================================ 1457 | 1458 | async def search_orcid_by_name(name: str, affiliation: str = None, max_results: int = 10) -> dict: 1459 | """ 1460 | Search ORCID by author name and optionally affiliation. 
1461 | 1462 | Args: 1463 | name: Author name to search 1464 | affiliation: Optional affiliation to help disambiguation 1465 | max_results: Maximum number of results to return 1466 | 1467 | Returns: 1468 | dict: ORCID search results with author profiles 1469 | """ 1470 | try: 1471 | # ORCID Public API search endpoint 1472 | base_url = "https://pub.orcid.org/v3.0/search" 1473 | 1474 | # Build search query 1475 | query_parts = [] 1476 | if name: 1477 | # Split name into parts for better matching 1478 | name_parts = name.replace(",", "").split() 1479 | if len(name_parts) >= 2: 1480 | # Assume last part is family name, rest are given names 1481 | family_name = name_parts[-1] 1482 | given_names = " ".join(name_parts[:-1]) 1483 | query_parts.append(f'family-name:"{family_name}"') 1484 | query_parts.append(f'given-names:"{given_names}"') 1485 | else: 1486 | query_parts.append(f'text:"{name}"') 1487 | 1488 | if affiliation: 1489 | query_parts.append(f'affiliation-org-name:"{affiliation}"') 1490 | 1491 | query = " AND ".join(query_parts) 1492 | 1493 | params = { 1494 | 'q': query, 1495 | 'rows': min(max_results, 50), # ORCID API limit 1496 | 'start': 0 1497 | } 1498 | 1499 | headers = { 1500 | 'Accept': 'application/json', 1501 | 'User-Agent': f'alex-mcp (+{get_config()["OPENALEX_MAILTO"]})' 1502 | } 1503 | 1504 | logger.info(f"🔍 ORCID search: '{query}' (max: {max_results})") 1505 | 1506 | async with aiohttp.ClientSession() as session: 1507 | async with session.get(base_url, params=params, headers=headers) as response: 1508 | if response.status == 200: 1509 | data = await response.json() 1510 | 1511 | results = [] 1512 | for result in data.get('result', []): 1513 | orcid_id = result.get('orcid-identifier', {}).get('path', '') 1514 | 1515 | # Extract name information 1516 | person = result.get('person', {}) 1517 | names = person.get('name', {}) 1518 | given_names = names.get('given-names', {}).get('value', '') if names.get('given-names') else '' 1519 | family_name = names.get('family-name', {}).get('value', '') if names.get('family-name') else '' 1520 | 1521 | # Extract employment/affiliation info 1522 | employments = [] 1523 | employment_summaries = result.get('employment-summary', []) 1524 | for emp in employment_summaries[:3]: # Limit to top 3 1525 | org_name = emp.get('organization', {}).get('name', '') 1526 | if org_name: 1527 | employments.append(org_name) 1528 | 1529 | results.append({ 1530 | 'orcid_id': orcid_id, 1531 | 'orcid_url': f'https://orcid.org/{orcid_id}' if orcid_id else '', 1532 | 'given_names': given_names, 1533 | 'family_name': family_name, 1534 | 'full_name': f"{given_names} {family_name}".strip(), 1535 | 'employments': employments, 1536 | 'relevance_score': result.get('relevance-score', {}).get('value', 0) 1537 | }) 1538 | 1539 | logger.info(f"📊 Found {len(results)} ORCID profiles") 1540 | 1541 | return { 1542 | 'total_found': data.get('num-found', 0), 1543 | 'results_returned': len(results), 1544 | 'results': results 1545 | } 1546 | else: 1547 | logger.warning(f"ORCID API error: {response.status}") 1548 | return {'total_found': 0, 'results_returned': 0, 'results': [], 'error': f'HTTP {response.status}'} 1549 | 1550 | except Exception as e: 1551 | logger.error(f"ORCID search error: {str(e)}") 1552 | return {'total_found': 0, 'results_returned': 0, 'results': [], 'error': str(e)} 1553 | 1554 | 1555 | async def get_orcid_works(orcid_id: str, max_works: int = 20) -> dict: 1556 | """ 1557 | Get works/publications for a specific ORCID ID. 
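    Example (placeholder ORCID iD, as used in the tool docstrings below):

        works = await get_orcid_works("0000-0000-0000-0000", max_works=10)
        print(works.get("total_works"))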
1558 | 1559 | Args: 1560 | orcid_id: ORCID identifier (e.g., "0000-0000-0000-0000") 1561 | max_works: Maximum number of works to retrieve 1562 | 1563 | Returns: 1564 | dict: Works information from ORCID profile 1565 | """ 1566 | try: 1567 | # Clean ORCID ID (remove URL if present) 1568 | clean_orcid = orcid_id.replace('https://orcid.org/', '').replace('http://orcid.org/', '') 1569 | if not re.match(r'^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$', clean_orcid): 1570 | return {'error': 'Invalid ORCID format', 'works': []} 1571 | 1572 | # ORCID Public API works endpoint 1573 | url = f"https://pub.orcid.org/v3.0/{clean_orcid}/works" 1574 | 1575 | headers = { 1576 | 'Accept': 'application/json', 1577 | 'User-Agent': f'alex-mcp (+{get_config()["OPENALEX_MAILTO"]})' 1578 | } 1579 | 1580 | logger.info(f"🔍 Getting ORCID works: {clean_orcid} (max: {max_works})") 1581 | 1582 | async with aiohttp.ClientSession() as session: 1583 | async with session.get(url, headers=headers) as response: 1584 | if response.status == 200: 1585 | data = await response.json() 1586 | 1587 | works = [] 1588 | work_summaries = data.get('group', [])[:max_works] 1589 | 1590 | for group in work_summaries: 1591 | for work_summary in group.get('work-summary', []): 1592 | title_info = work_summary.get('title', {}) 1593 | title = title_info.get('title', {}).get('value', '') if title_info else '' 1594 | 1595 | journal_title = work_summary.get('journal-title', {}).get('value', '') if work_summary.get('journal-title') else '' 1596 | 1597 | # Extract publication date 1598 | pub_date = work_summary.get('publication-date') 1599 | pub_year = '' 1600 | if pub_date and pub_date.get('year'): 1601 | pub_year = pub_date['year'].get('value', '') 1602 | 1603 | # Extract external IDs (DOI, PMID, etc.) 1604 | external_ids = {} 1605 | for ext_id in work_summary.get('external-ids', {}).get('external-id', []): 1606 | id_type = ext_id.get('external-id-type', '') 1607 | id_value = ext_id.get('external-id-value', '') 1608 | if id_type and id_value: 1609 | external_ids[id_type.lower()] = id_value 1610 | 1611 | works.append({ 1612 | 'title': title, 1613 | 'journal': journal_title, 1614 | 'publication_year': pub_year, 1615 | 'external_ids': external_ids, 1616 | 'doi': external_ids.get('doi', ''), 1617 | 'pmid': external_ids.get('pmid', ''), 1618 | 'type': work_summary.get('type', '') 1619 | }) 1620 | 1621 | logger.info(f"📊 Retrieved {len(works)} works from ORCID") 1622 | 1623 | return { 1624 | 'orcid_id': clean_orcid, 1625 | 'total_works': len(works), 1626 | 'works': works 1627 | } 1628 | else: 1629 | logger.warning(f"ORCID works API error: {response.status}") 1630 | return {'error': f'HTTP {response.status}', 'works': []} 1631 | 1632 | except Exception as e: 1633 | logger.error(f"ORCID works error: {str(e)}") 1634 | return {'error': str(e), 'works': []} 1635 | 1636 | 1637 | # ============================================================================ 1638 | # ORCID MCP Tools 1639 | # ============================================================================ 1640 | 1641 | @mcp.tool( 1642 | annotations={ 1643 | "title": "Search ORCID Authors", 1644 | "description": ( 1645 | "Search ORCID database for author profiles by name and optionally affiliation. " 1646 | "Provides ORCID IDs, verified names, and institutional affiliations for " 1647 | "enhanced author disambiguation and verification." 
1648 | ), 1649 | "readOnlyHint": True, 1650 | "openWorldHint": True 1651 | } 1652 | ) 1653 | async def search_orcid_authors( 1654 | name: str, 1655 | affiliation: Optional[str] = None, 1656 | max_results: int = 10 1657 | ) -> dict: 1658 | """ 1659 | Search ORCID for author profiles by name and affiliation. 1660 | 1661 | Args: 1662 | name: Author name to search (e.g., "John Smith", "Maria Garcia") 1663 | affiliation: Optional institutional affiliation for disambiguation 1664 | max_results: Maximum number of results to return (default: 10, max: 50) 1665 | 1666 | Returns: 1667 | dict: ORCID search results with: 1668 | - total_found: Total number of matches found 1669 | - results_returned: Number of results returned 1670 | - results: List of author profiles with ORCID IDs, names, and affiliations 1671 | 1672 | Example usage: 1673 | # Basic name search 1674 | search_orcid_authors("John Smith") 1675 | 1676 | # Search with affiliation for better disambiguation 1677 | search_orcid_authors("Maria Garcia", "University of Barcelona") 1678 | """ 1679 | # Validate parameters 1680 | max_results = min(max(max_results, 1), 50) # ORCID API limit 1681 | 1682 | result = await search_orcid_by_name(name, affiliation, max_results) 1683 | return result 1684 | 1685 | 1686 | @mcp.tool( 1687 | annotations={ 1688 | "title": "Get ORCID Works", 1689 | "description": ( 1690 | "Retrieve publications/works from a specific ORCID profile. " 1691 | "Useful for cross-validation with OpenAlex data and verifying " 1692 | "author publication records." 1693 | ), 1694 | "readOnlyHint": True, 1695 | "openWorldHint": True 1696 | } 1697 | ) 1698 | async def get_orcid_publications( 1699 | orcid_id: str, 1700 | max_works: int = 20 1701 | ) -> dict: 1702 | """ 1703 | Get publications/works from an ORCID profile. 1704 | 1705 | Args: 1706 | orcid_id: ORCID identifier (e.g., "0000-0000-0000-0000" or full URL) 1707 | max_works: Maximum number of works to retrieve (default: 20, max: 100) 1708 | 1709 | Returns: 1710 | dict: Publications data with: 1711 | - orcid_id: Cleaned ORCID identifier 1712 | - total_works: Number of works found 1713 | - works: List of publications with titles, journals, DOIs, PMIDs 1714 | 1715 | Example usage: 1716 | # Get works for specific ORCID 1717 | get_orcid_publications("0000-0000-0000-0000") 1718 | 1719 | # Get limited number of works 1720 | get_orcid_publications("0000-0000-0000-0000", max_works=10) 1721 | """ 1722 | # Validate parameters 1723 | max_works = min(max(max_works, 1), 100) # Reasonable limit 1724 | 1725 | result = await get_orcid_works(orcid_id, max_works) 1726 | return result 1727 | 1728 | 1729 | def main(): 1730 | """ 1731 | Entry point for the enhanced alex-mcp server with balanced peer-review filtering. 1732 | """ 1733 | # FastMCP's run() is synchronous and manages its own event loop. 1734 | logger.info("Enhanced OpenAlex Author Disambiguation MCP Server starting...") 1735 | logger.info("Features: ~70% token reduction for authors, ~80% for works") 1736 | logger.info("Balanced peer-review filtering: excludes data catalogs while preserving legitimate papers") 1737 | mcp.run() 1738 | 1739 | 1740 | if __name__ == "__main__": 1741 | main() --------------------------------------------------------------------------------