├── src └── alex_mcp │ ├── __init__.py │ ├── data_objects.py │ └── server.py ├── img ├── oam_logo_avatar.png └── oam_logo_rectangular.png ├── alex-mcp-wrapper.sh ├── requirements.txt ├── LICENSE ├── pyproject.toml ├── examples ├── test_institution_resolution.py └── test_author_disambiguation.py ├── setup.py ├── .gitignore ├── INSTALL.md └── README.md /src/alex_mcp/__init__.py: -------------------------------------------------------------------------------- 1 | """OpenAlex MCP Server.""" 2 | __version__ = "4.1.0" -------------------------------------------------------------------------------- /img/oam_logo_avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drAbreu/alex-mcp/HEAD/img/oam_logo_avatar.png -------------------------------------------------------------------------------- /img/oam_logo_rectangular.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/drAbreu/alex-mcp/HEAD/img/oam_logo_rectangular.png -------------------------------------------------------------------------------- /alex-mcp-wrapper.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Wrapper script for alex-mcp that activates the virtual environment 3 | 4 | # Get the directory where this script is located 5 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 6 | 7 | # Activate the virtual environment 8 | source "$SCRIPT_DIR/venv/bin/activate" 9 | 10 | # Run the MCP server 11 | exec python -m alex_mcp.server "$@" 12 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # OpenAlex + PubMed Hybrid Author Disambiguation MCP Server Dependencies 2 | # Following MCP best practices with FastMCP 3 | 4 | # FastMCP server framework (server.py imports `fastmcp`; pinned as in pyproject.toml) 5 | fastmcp>=2.8.1 6 | 7 | # HTTP client for OpenAlex API and ORCID integration 8 | httpx>=0.25.0 9 | aiohttp>=3.8.0 10 | 11 | # Optional: For enhanced logging and debugging 12 | rich>=13.0.0 13 | 14 | # OpenAlex API wrapper 15 | pyalex==0.18 16 | 17 | # PubMed API integration 18 | biopython>=1.83 19 | requests>=2.31.0 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Author Disambiguation MCP Server 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "alex-mcp" 7 | version = "4.8.2" 8 | description = "MCP server for OpenAlex academic research API" 9 | authors = [{name = "Jorge Abreu Vicente", email = "jorge.abreu@embo.org"}] 10 | license = {text = "MIT"} 11 | readme = "README.md" 12 | requires-python = ">=3.10" 13 | classifiers = [ 14 | "Development Status :: 3 - Alpha", 15 | "Intended Audience :: Developers", 16 | "License :: OSI Approved :: MIT License", 17 | "Programming Language :: Python :: 3", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: 3.11", 20 | "Programming Language :: Python :: 3.12", 21 | ] 22 | dependencies = [ 23 | "fastmcp>=2.8.1", 24 | "httpx>=0.28.1", 25 | "pydantic>=2.7.2", 26 | "rich>=13.9.4", 27 | "pyalex==0.18", 28 | "aiohttp>=3.8.0" 29 | ] 30 | 31 | [project.urls] 32 | Homepage = "https://github.com/drAbreu/alex-mcp" 33 | Repository = "https://github.com/drAbreu/alex-mcp" 34 | Issues = "https://github.com/drAbreu/alex-mcp/issues" 35 | 36 | [project.scripts] 37 | alex-mcp = "alex_mcp.server:main" 38 | 39 | [tool.setuptools.packages.find] 40 | where = ["src"] 41 | include = ["alex_mcp*"] 42 | 43 | -------------------------------------------------------------------------------- /examples/test_institution_resolution.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for resolve_institution using the MCP server and pyalex. 3 | Focus: EMBO, MPIA, IRAM. 
4 | """ 5 | 6 | import pytest 7 | import pyalex 8 | 9 | pyalex.config.email = "test@example.com" 10 | pyalex.config.max_retries = 2 11 | pyalex.config.retry_backoff_factor = 0.1 12 | pyalex.config.retry_http_codes = [429, 500, 503] 13 | 14 | import sys 15 | import os 16 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 17 | from src.alex_mcp.server import _resolve_institution_impl as resolve_institution 18 | 19 | def test_resolve_institution_embo(): 20 | result = resolve_institution("EMBO") 21 | assert result["best_match"] is not None 22 | assert "i1303691731" in result["best_match"]["id"].lower() 23 | 24 | def test_resolve_institution_mpia(): 25 | result = resolve_institution("MPIA") 26 | assert result["best_match"] is not None 27 | assert "i4210109156" in result["best_match"]["id"].lower() 28 | 29 | def test_resolve_institution_iram(): 30 | result = resolve_institution("IRAM") 31 | assert result["best_match"] is not None 32 | assert "i4210096876" in result["best_match"]["id"].lower() 33 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="alex-mcp", 8 | version="4.2.5", 9 | author="OpenAlex MCP Team", 10 | description="OpenAlex Author Disambiguation MCP Server", 11 | long_description=long_description, 12 | long_description_content_type="text/markdown", 13 | url="https://github.com/drAbreu/alex-mcp", 14 | package_dir={"": "src"}, 15 | packages=find_packages(where="src"), 16 | classifiers=[ 17 | "Development Status :: 4 - Beta", 18 | "Intended Audience :: Science/Research", 19 | "License :: OSI Approved :: MIT License", 20 | "Operating System :: OS Independent", 21 | "Programming Language :: Python :: 3", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | ], 26 | python_requires=">=3.10", # pyalex itself only needs 3.8+; this package targets 3.10+ 27 | install_requires=[ 28 | "fastmcp>=2.8.1", 29 | "httpx>=0.28.1", 30 | "pydantic>=2.7.2", 31 | "rich>=13.9.4", 32 | "pyalex==0.18", 33 | ], 34 | entry_points={ 35 | "console_scripts": [ 36 | "alex-mcp=alex_mcp.server:main", 37 | ], 38 | }, 39 | include_package_data=True, 40 | zip_safe=False, 41 | ) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb 2 | 3 | # Python 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | *.so 8 | .Python 9 | build/ 10 | develop-eggs/ 11 | dist/ 12 | downloads/ 13 | eggs/ 14 | .eggs/ 15 | lib/ 16 | lib64/ 17 | parts/ 18 | sdist/ 19 | var/ 20 | wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # PyInstaller 27 | *.manifest 28 | *.spec 29 | 30 | # Installer logs 31 | pip-log.txt 32 | pip-delete-this-directory.txt 33 | 34 | # Unit test / coverage reports 35 | htmlcov/ 36 | .tox/ 37 | .nox/ 38 | .coverage 39 | .coverage.* 40 | .cache 41 | nosetests.xml 42 | coverage.xml 43 | *.cover 44 | .hypothesis/ 45 | .pytest_cache/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 |
local_settings.py 54 | db.sqlite3 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # IPython 73 | profile_default/ 74 | ipython_config.py 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | .dmypy.json 107 | dmypy.json 108 | 109 | # Pyre type checker 110 | .pyre/ 111 | 112 | # IDEs 113 | .vscode/ 114 | .idea/ 115 | *.swp 116 | *.swo 117 | *~ 118 | 119 | # OS 120 | .DS_Store 121 | .DS_Store? 122 | ._* 123 | .Spotlight-V100 124 | .Trashes 125 | ehthumbs.db 126 | Thumbs.db 127 | 128 | # Project specific 129 | *.json 130 | !package.json 131 | !tsconfig.json 132 | test_results/ 133 | logs/ 134 | temp/ 135 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # OpenAlex MCP Server Installation Guide 2 | 3 | This guide provides instructions for installing and running the OpenAlex MCP server. 4 | 5 | ## Prerequisites 6 | 7 | - Python 3.10 or higher 8 | - pip (Python package installer) 9 | 10 | ## Installation 11 | 12 | 1. Clone the repository: 13 | ```bash 14 | git clone https://github.com/drAbreu/alex-mcp.git 15 | cd alex-mcp 16 | ``` 17 | 18 | 2. Create a virtual environment: 19 | ```bash 20 | python3 -m venv venv 21 | ``` 22 | 23 | 3. Activate the virtual environment: 24 | ```bash 25 | source venv/bin/activate # On Windows: venv\Scripts\activate 26 | ``` 27 | 28 | 4. Install the package in development mode: 29 | ```bash 30 | pip install -e . 31 | ``` 32 | 33 | ## Running the Server 34 | 35 | ### Option 1: Using the wrapper script 36 | 37 | The easiest way to run the server is to use the provided wrapper script (set the `OPENALEX_MAILTO` environment variable to your email address first; the server exits at startup without it): 38 | 39 | ```bash 40 | ./alex-mcp-wrapper.sh 41 | ``` 42 | 43 | This script activates the virtual environment and runs the server. 44 | 45 | ### Option 2: Manual execution 46 | 47 | 1. Activate the virtual environment: 48 | ```bash 49 | source venv/bin/activate # On Windows: venv\Scripts\activate 50 | ``` 51 | 52 | 2. Run the server: 53 | ```bash 54 | python -m alex_mcp.server 55 | ``` 56 | 57 | ## Using with Claude Desktop 58 | 59 | To use this MCP server with Claude Desktop, add the following configuration: 60 | 61 | ```json 62 | { 63 | "mcpServers": { 64 | "alex-mcp": { 65 | "command": "/path/to/alex-mcp/alex-mcp-wrapper.sh" 66 | } 67 | } 68 | } 69 | ``` 70 | 71 | Replace `/path/to/alex-mcp` with the actual path to the repository on your system. 72 | 73 | ## Available Tools 74 | 75 | The OpenAlex MCP server provides the following tools: 76 | 77 | 1. **disambiguate_author**: Disambiguate an author using OpenAlex's ML-powered disambiguation system. 78 | 2. **search_authors**: Search for authors with advanced filtering capabilities. 79 | 3. **get_author_profile**: Get detailed author profile by OpenAlex ID. 80 | 4. **resolve_institution**: Resolve institution name or abbreviation to full OpenAlex data.
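For quick experimentation outside an MCP client, the underlying tool implementations can be imported directly, exactly as the test files under `examples/` do. A minimal sketch (run from the repository root with `OPENALEX_MAILTO` set; the `_*_impl` names mirror the imports in `examples/test_institution_resolution.py` and `examples/test_author_disambiguation.py`):

```python
# Minimal sketch: call two of the tool implementations directly.
# Requires OPENALEX_MAILTO to be exported; the server module exits at
# import time if it is missing.
from src.alex_mcp.server import (
    _resolve_institution_impl as resolve_institution,
    _disambiguate_author_impl as disambiguate_author,
)

# Resolve an institution abbreviation to its OpenAlex record
institution = resolve_institution("EMBO")
if institution["best_match"] is not None:
    print(institution["best_match"]["id"])  # OpenAlex institution ID

# Disambiguate an author, optionally narrowed by affiliation
author = disambiguate_author(name="Fiona M Watt", affiliation="EMBO")
if author["most_likely"] is not None:
    print(author["most_likely"]["author"]["id"])  # OpenAlex author ID
```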
81 | 82 | ## Troubleshooting 83 | 84 | If you encounter any issues, make sure: 85 | 86 | 1. You're using Python 3.10 or higher 87 | 2. The virtual environment is activated 88 | 3. All dependencies are installed correctly 89 | 90 | For more information, see the [README.md](README.md) file. 91 | -------------------------------------------------------------------------------- /examples/test_author_disambiguation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Test suite for disambiguate_author using the MCP server and pyalex. 3 | Focus: Fiona M. Watt and Jorge Abreu Vicente. 4 | """ 5 | 6 | import pytest 7 | import pyalex 8 | 9 | pyalex.config.email = "test@example.com" 10 | pyalex.config.max_retries = 2 11 | pyalex.config.retry_backoff_factor = 0.1 12 | pyalex.config.retry_http_codes = [429, 500, 503] 13 | 14 | import sys 15 | import os 16 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) 17 | from src.alex_mcp.server import _disambiguate_author_impl as disambiguate_author 18 | 19 | def test_disambiguate_fiona_watt_name_only(): 20 | result = disambiguate_author(name="Fiona M Watt") 21 | print(f"Disambiguation result for Fiona M Watt: {result}") 22 | assert result["most_likely"] is not None 23 | assert "A5068471552" in result["most_likely"]["author"]["id"] 24 | 25 | def test_disambiguate_fiona_watt_with_institution(): 26 | result = disambiguate_author(name="Fiona M Watt", affiliation="EMBO") 27 | print(f"Disambiguation result for Fiona M Watt: {result}") 28 | assert result["most_likely"] is not None 29 | assert "A5068471552" in result["most_likely"]["author"]["id"] 30 | 31 | def test_disambiguate_fiona_watt_with_topic(): 32 | result = disambiguate_author(name="Fiona M Watt", research_field="Stem Cells") 33 | print(f"Disambiguation result for Fiona M Watt: {result}") 34 | assert result["most_likely"] is not None 35 | assert "A5068471552" in result["most_likely"]["author"]["id"] 36 | 37 | def test_disambiguate_jorge_abreu_name_only(): 38 | result = disambiguate_author(name="Jorge Abreu Vicente") 39 | print(f"Disambiguation result for J. Abreu-Vicente: {result}") 40 | assert result["most_likely"] is not None 41 | assert "A5058921480" in result["most_likely"]["author"]["id"] 42 | 43 | def test_disambiguate_jorge_abreu_with_institution(): 44 | result = disambiguate_author(name="Jorge Abreu Vicente", affiliation="MPIA") 45 | print(f"Disambiguation result for J. Abreu-Vicente: {result}") 46 | assert result["most_likely"] is not None 47 | assert "A5058921480" in result["most_likely"]["author"]["id"] 48 | 49 | def test_disambiguate_jorge_abreu_with_topic(): 50 | result = disambiguate_author(name="Jorge Abreu Vicente", research_field="molecular clouds") 51 | print(f"Disambiguation result for J. Abreu-Vicente: {result}") 52 | assert result["most_likely"] is not None 53 | assert "A5058921480" in result["most_likely"]["author"]["id"] 54 | -------------------------------------------------------------------------------- /src/alex_mcp/data_objects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Optimized data models for the OpenAlex MCP server. 4 | 5 | Streamlined versions focusing on essential information for author disambiguation 6 | and work retrieval while minimizing token usage. Enhanced to preserve comprehensive 7 | ID information (DOI, PMID, PMCID, OpenAlex, MAG). 
8 | """ 9 | 10 | from typing import List, Optional, Dict, Any 11 | from datetime import datetime 12 | from pydantic import BaseModel, Field 13 | 14 | 15 | class WorkIDs(BaseModel): 16 | """ 17 | Comprehensive work identifiers from OpenAlex. 18 | 19 | Preserves all available identifiers for cross-database linkage. 20 | """ 21 | openalex: Optional[str] = None 22 | doi: Optional[str] = None 23 | pmid: Optional[str] = None 24 | pmcid: Optional[str] = None 25 | mag: Optional[str] = None 26 | 27 | 28 | class OptimizedAuthorResult(BaseModel): 29 | """ 30 | Streamlined author representation focusing on disambiguation essentials. 31 | 32 | Reduces token usage by ~70% compared to full OpenAlex author object. 33 | """ 34 | id: str 35 | display_name: str 36 | orcid: Optional[str] = None 37 | display_name_alternatives: Optional[List[str]] = None 38 | 39 | # Simplified affiliations - just institution names as strings 40 | current_affiliations: Optional[List[str]] = None 41 | past_affiliations: Optional[List[str]] = None 42 | 43 | # Key metrics for research impact 44 | cited_by_count: int = 0 45 | works_count: int = 0 46 | h_index: Optional[int] = None 47 | i10_index: Optional[int] = None 48 | 49 | # Research fields (simplified) 50 | research_fields: Optional[List[str]] = None 51 | 52 | # Basic metadata 53 | last_known_institutions: Optional[List[str]] = None 54 | countries: Optional[List[str]] = None 55 | 56 | # For API access 57 | works_api_url: Optional[str] = None 58 | 59 | 60 | class OptimizedWorkResult(BaseModel): 61 | """ 62 | Streamlined work representation focusing on essential publication info. 63 | 64 | Reduces token usage by ~80% compared to full OpenAlex work object while 65 | preserving comprehensive identifier information. 66 | """ 67 | id: str 68 | title: Optional[str] = None 69 | doi: Optional[str] = None # Kept for backward compatibility 70 | publication_year: Optional[int] = None 71 | type: Optional[str] = None # journal-article, book-chapter, etc. 72 | 73 | # COMPREHENSIVE ID INFORMATION - This was missing! 74 | ids: Optional[WorkIDs] = None 75 | 76 | # Citation metrics 77 | cited_by_count: Optional[int] = 0 78 | 79 | # Publication venue (simplified) 80 | journal_name: Optional[str] = None 81 | journal_issn: Optional[str] = None 82 | publisher: Optional[str] = None 83 | 84 | # Open access info (simplified) 85 | is_open_access: Optional[bool] = None 86 | 87 | # Author info (minimal) 88 | author_count: Optional[int] = None 89 | first_author: Optional[str] = None 90 | corresponding_author: Optional[str] = None 91 | 92 | # Research categorization (simplified) 93 | primary_field: Optional[str] = None 94 | concepts: Optional[List[str]] = None 95 | 96 | 97 | class OptimizedSearchResponse(BaseModel): 98 | """ 99 | Streamlined search response. 100 | """ 101 | query: str 102 | total_count: int 103 | results: List[OptimizedAuthorResult] 104 | search_time: Optional[datetime] = Field(default_factory=datetime.now) 105 | 106 | 107 | class OptimizedWorksSearchResponse(BaseModel): 108 | """ 109 | Streamlined works search response for author works. 110 | """ 111 | author_id: str 112 | author_name: Optional[str] = None 113 | total_count: int 114 | results: List[OptimizedWorkResult] 115 | search_time: Optional[datetime] = Field(default_factory=datetime.now) 116 | filters: Optional[Dict[str, Any]] = None 117 | 118 | 119 | class OptimizedGeneralWorksSearchResponse(BaseModel): 120 | """ 121 | Streamlined works search response for general work searches. 
122 | """ 123 | query: str 124 | total_count: int 125 | results: List[OptimizedWorkResult] 126 | search_time: Optional[datetime] = Field(default_factory=datetime.now) 127 | filters: Optional[Dict[str, Any]] = None 128 | 129 | 130 | class AutocompleteAuthorCandidate(BaseModel): 131 | """ 132 | A single author candidate from autocomplete API. 133 | 134 | Optimized for fast disambiguation with essential context. 135 | """ 136 | openalex_id: str 137 | display_name: str 138 | institution_hint: Optional[str] = None # Current/last known institution 139 | works_count: int = 0 140 | cited_by_count: int = 0 141 | entity_type: str = "author" 142 | external_id: Optional[str] = None # ORCID or other external ID 143 | 144 | 145 | class AutocompleteAuthorsResponse(BaseModel): 146 | """ 147 | Response model for author autocomplete with multiple candidates. 148 | 149 | Enables intelligent disambiguation by providing multiple options 150 | with institutional context and research metrics. 151 | """ 152 | query: str 153 | context: Optional[str] = None 154 | total_candidates: int 155 | candidates: List[AutocompleteAuthorCandidate] 156 | search_metadata: Dict[str, Any] = Field(default_factory=dict) 157 | 158 | 159 | def extract_institution_names(affiliations: List[Dict[str, Any]]) -> tuple[List[str], List[str]]: 160 | """ 161 | Extract and categorize institution names from OpenAlex affiliation objects. 162 | 163 | Returns: 164 | tuple: (current_affiliations, past_affiliations) 165 | """ 166 | current = [] 167 | past = [] 168 | 169 | if not affiliations: 170 | return current, past 171 | 172 | for affiliation in affiliations: 173 | institution = affiliation.get('institution', {}) 174 | if not institution: 175 | continue 176 | 177 | institution_name = institution.get('display_name') 178 | if not institution_name: 179 | continue 180 | 181 | # Determine if current or past based on years 182 | years = affiliation.get('years', []) 183 | if years: 184 | current_year = datetime.now().year 185 | # Consider current if active in last 3 years 186 | if max(years) >= current_year - 3: 187 | current.append(institution_name) 188 | else: 189 | past.append(institution_name) 190 | else: 191 | # Default to current if no year info 192 | current.append(institution_name) 193 | 194 | return current, past 195 | 196 | 197 | def extract_research_fields(concepts_or_topics: List[Dict[str, Any]]) -> List[str]: 198 | """ 199 | Extract research field names from concepts or topics. 200 | 201 | Args: 202 | concepts_or_topics: List of concept/topic objects from OpenAlex 203 | 204 | Returns: 205 | List of field names, limited to top 5 most relevant 206 | """ 207 | fields = [] 208 | 209 | if not concepts_or_topics: 210 | return fields 211 | 212 | # Sort by score/level and take top fields 213 | sorted_items = sorted( 214 | concepts_or_topics, 215 | key=lambda x: x.get('score', 0) or x.get('count', 0), 216 | reverse=True 217 | ) 218 | 219 | for item in sorted_items[:5]: # Limit to top 5 220 | name = item.get('display_name') 221 | if name: 222 | fields.append(name) 223 | 224 | return fields 225 | 226 | 227 | def extract_journal_info(locations: List[Dict[str, Any]]) -> tuple[Optional[str], Optional[str], Optional[str]]: 228 | """ 229 | Extract journal information from OpenAlex locations. 
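Prefers the first location whose source has type 'journal'; if none matches, falls back to the first location's source name and publisher (the ISSN is then left as None).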
230 | 231 | Returns: 232 | tuple: (journal_name, journal_issn, publisher) 233 | """ 234 | if not locations: 235 | return None, None, None 236 | 237 | # Look for primary location (usually first) or journal location 238 | for location in locations: 239 | source = location.get('source', {}) 240 | if source and source.get('type') == 'journal': 241 | journal_name = source.get('display_name') 242 | issn = None 243 | if source.get('issn'): 244 | issn = source['issn'][0] if isinstance(source['issn'], list) else source['issn'] 245 | 246 | publisher = source.get('host_organization_name') 247 | return journal_name, issn, publisher 248 | 249 | # Fallback to first location 250 | if locations: 251 | source = locations[0].get('source', {}) 252 | if source: 253 | return source.get('display_name'), None, source.get('host_organization_name') 254 | 255 | return None, None, None 256 | 257 | 258 | def extract_authorship_info(authorships: List[Dict[str, Any]]) -> tuple[Optional[int], Optional[str], Optional[str]]: 259 | """ 260 | Extract simplified authorship information. 261 | 262 | Returns: 263 | tuple: (author_count, first_author, corresponding_author) 264 | """ 265 | if not authorships: 266 | return None, None, None 267 | 268 | author_count = len(authorships) 269 | first_author = None 270 | corresponding_author = None 271 | 272 | # Find first author (author_position == 'first') 273 | for authorship in authorships: 274 | if authorship.get('author_position') == 'first': 275 | author = authorship.get('author', {}) 276 | first_author = author.get('display_name') 277 | break 278 | 279 | # Find corresponding author 280 | for authorship in authorships: 281 | if authorship.get('is_corresponding'): 282 | author = authorship.get('author', {}) 283 | corresponding_author = author.get('display_name') 284 | break 285 | 286 | return author_count, first_author, corresponding_author 287 | 288 | 289 | def extract_comprehensive_ids(work_data: Dict[str, Any]) -> WorkIDs: 290 | """ 291 | Extract comprehensive identifier information from OpenAlex work data. 292 | 293 | This was the missing piece! OpenAlex provides comprehensive IDs in the 'ids' object. 294 | 295 | Args: 296 | work_data: Full OpenAlex work object 297 | 298 | Returns: 299 | WorkIDs object with all available identifiers 300 | """ 301 | ids_data = work_data.get('ids', {}) 302 | 303 | # Extract all available IDs 304 | openalex_id = ids_data.get('openalex') or work_data.get('id') 305 | doi = ids_data.get('doi') or work_data.get('doi') # Fallback to standalone doi 306 | pmid = ids_data.get('pmid') 307 | pmcid = ids_data.get('pmcid') 308 | mag = ids_data.get('mag') 309 | 310 | return WorkIDs( 311 | openalex=openalex_id, 312 | doi=doi, 313 | pmid=pmid, 314 | pmcid=pmcid, 315 | mag=mag 316 | ) 317 | 318 | 319 | def optimize_author_data(author_data: Dict[str, Any]) -> OptimizedAuthorResult: 320 | """ 321 | Convert full OpenAlex author object to optimized version. 
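List fields are truncated to keep responses compact: at most three name alternatives, three current and three past affiliations, five research fields, and three countries.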
322 | 323 | Args: 324 | author_data: Full OpenAlex author object 325 | 326 | Returns: 327 | OptimizedAuthorResult with essential information only 328 | """ 329 | # Extract basic info 330 | author_id = author_data.get('id', '') 331 | display_name = author_data.get('display_name', '') 332 | orcid = author_data.get('orcid') 333 | alternatives = author_data.get('display_name_alternatives', []) 334 | 335 | # Process affiliations 336 | affiliations = author_data.get('affiliations', []) 337 | current_affiliations, past_affiliations = extract_institution_names(affiliations) 338 | 339 | # Extract metrics 340 | cited_by_count = author_data.get('cited_by_count', 0) 341 | works_count = author_data.get('works_count', 0) 342 | 343 | # Extract summary stats 344 | summary_stats = author_data.get('summary_stats', {}) 345 | h_index = summary_stats.get('h_index') 346 | i10_index = summary_stats.get('i10_index') 347 | 348 | # Extract research fields from concepts or topics 349 | research_fields = [] 350 | concepts = author_data.get('x_concepts', []) or author_data.get('topics', []) 351 | research_fields = extract_research_fields(concepts) 352 | 353 | # Extract geographic info 354 | countries = [] 355 | if affiliations: 356 | for affiliation in affiliations: 357 | institution = affiliation.get('institution', {}) 358 | country = institution.get('country_code') 359 | if country and country not in countries: 360 | countries.append(country) 361 | 362 | # API URL 363 | works_api_url = author_data.get('works_api_url') 364 | 365 | return OptimizedAuthorResult( 366 | id=author_id, 367 | display_name=display_name, 368 | orcid=orcid, 369 | display_name_alternatives=alternatives[:3] if alternatives else None, # Limit alternatives 370 | current_affiliations=current_affiliations[:3] if current_affiliations else None, # Limit to 3 most recent 371 | past_affiliations=past_affiliations[:3] if past_affiliations else None, # Limit to 3 most recent 372 | cited_by_count=cited_by_count, 373 | works_count=works_count, 374 | h_index=h_index, 375 | i10_index=i10_index, 376 | research_fields=research_fields[:5] if research_fields else None, # Top 5 fields 377 | last_known_institutions=current_affiliations[:2] if current_affiliations else past_affiliations[:2], 378 | countries=countries[:3] if countries else None, # Limit countries 379 | works_api_url=works_api_url 380 | ) 381 | 382 | 383 | def optimize_work_data(work_data: Dict[str, Any]) -> OptimizedWorkResult: 384 | """ 385 | Convert full OpenAlex work object to optimized version. 386 | 387 | NOW INCLUDES COMPREHENSIVE ID EXTRACTION! 388 | 389 | Args: 390 | work_data: Full OpenAlex work object 391 | 392 | Returns: 393 | OptimizedWorkResult with essential information AND comprehensive IDs 394 | """ 395 | # Basic work info 396 | work_id = work_data.get('id', '') 397 | title = work_data.get('title') 398 | doi = work_data.get('doi') # Kept for backward compatibility 399 | publication_year = work_data.get('publication_year') 400 | work_type = work_data.get('type') 401 | 402 | # EXTRACT COMPREHENSIVE IDS - This is the fix! 
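# OpenAlex exposes every known identifier on the work's 'ids' object
# (openalex, doi, pmid, pmcid, mag); extract_comprehensive_ids() reads that
# object and falls back to the top-level 'id'/'doi' fields for missing entries.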
403 | comprehensive_ids = extract_comprehensive_ids(work_data) 404 | 405 | # Citation metrics 406 | cited_by_count = work_data.get('cited_by_count', 0) 407 | 408 | # Journal information 409 | locations = work_data.get('locations', []) 410 | journal_name, journal_issn, publisher = extract_journal_info(locations) 411 | 412 | # Open access info 413 | open_access = work_data.get('open_access', {}) 414 | is_open_access = open_access.get('is_oa') if open_access else None 415 | 416 | # Authorship info 417 | authorships = work_data.get('authorships', []) 418 | author_count, first_author, corresponding_author = extract_authorship_info(authorships) 419 | 420 | # Research categorization 421 | primary_topic = work_data.get('primary_topic', {}) 422 | primary_field = primary_topic.get('display_name') if primary_topic else None 423 | 424 | # Simplified concepts (top 3) 425 | concepts = work_data.get('concepts', []) 426 | concept_names = [] 427 | if concepts: 428 | sorted_concepts = sorted(concepts, key=lambda x: x.get('score', 0), reverse=True) 429 | concept_names = [c.get('display_name') for c in sorted_concepts[:3] if c.get('display_name')] 430 | 431 | return OptimizedWorkResult( 432 | id=work_id, 433 | title=title, 434 | doi=doi, 435 | publication_year=publication_year, 436 | type=work_type, 437 | ids=comprehensive_ids, 438 | cited_by_count=cited_by_count, 439 | journal_name=journal_name, 440 | journal_issn=journal_issn, 441 | publisher=publisher, 442 | is_open_access=is_open_access, 443 | author_count=author_count, 444 | first_author=first_author, 445 | corresponding_author=corresponding_author, 446 | primary_field=primary_field, 447 | concepts=concept_names if concept_names else None 448 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | OpenAlex MCP Server 3 | 4 | # OpenAlex Author Disambiguation MCP Server 5 | 6 | [![MCP](https://img.shields.io/badge/Model%20Context%20Protocol-Compatible-blue)](https://modelcontextprotocol.io/) 7 | [![Python](https://img.shields.io/badge/Python-3.10+-green)](https://python.org) 8 | [![OpenAlex](https://img.shields.io/badge/OpenAlex-API-orange)](https://openalex.org) 9 | [![License](https://img.shields.io/badge/License-MIT-yellow)](LICENSE) 10 | [![Optimized](https://img.shields.io/badge/AI%20Agent-Optimized-brightgreen)](https://github.com/drAbreu/alex-mcp) 11 |
12 | 13 | A **streamlined** Model Context Protocol (MCP) server for author disambiguation and academic research using the OpenAlex.org API. Specifically designed for AI agents with optimized data structures and enhanced functionality. 14 | 15 | --- 16 | 17 | ## 🎯 Key Features 18 | 19 | ### 🔍 **Core Capabilities** 20 | - **Advanced Author Disambiguation**: Handles complex career transitions and name variations 21 | - **Institution Resolution**: Current and past affiliations with transition tracking 22 | - **Academic Work Retrieval**: Journal articles, letters, and research papers 23 | - **Citation Analysis**: H-index, citation counts, and impact metrics 24 | - **ORCID Integration**: Highest accuracy matching with ORCID identifiers 25 | 26 | ### 🚀 **AI Agent Optimized** 27 | - **Streamlined Data**: Focused on essential information for disambiguation 28 | - **Fast Processing**: Optimized data structures for rapid analysis 29 | - **Smart Filtering**: Enhanced filtering options for targeted queries 30 | - **Clean Output**: Structured responses optimized for AI reasoning 31 | 32 | ### 🤖 **Agent Integration** 33 | - **Multiple Candidates**: Ranked results for automated decision-making 34 | - **Structured Responses**: Clean, parseable output optimized for LLMs 35 | - **Error Handling**: Graceful degradation with informative messages 36 | - **Enhanced Filtering**: Journal-only, citation thresholds, and temporal filters 37 | 38 | ### 🏛️ **Professional Grade** 39 | - **MCP Best Practices**: Built with FastMCP following official guidelines 40 | - **Tool Annotations**: Proper MCP tool annotations for optimal client integration 41 | - **Resource Management**: Efficient HTTP client management and cleanup 42 | - **Rate Limiting**: Respectful API usage with proper delays 43 | 44 | --- 45 | 46 | ## 🚀 Quick Start 47 | 48 | ### Prerequisites 49 | 50 | - Python 3.10 or higher 51 | - MCP-compatible client (e.g., Claude Desktop) 52 | - Email address (required: it is sent to OpenAlex as a courtesy identifier, and the server exits at startup if `OPENALEX_MAILTO` is unset) 53 | 54 | ### Installation 55 | 56 | For detailed installation instructions, see [INSTALL.md](INSTALL.md). 57 | 58 | 1. **Clone the repository:** 59 | ```bash 60 | git clone https://github.com/drAbreu/alex-mcp.git 61 | cd alex-mcp 62 | ``` 63 | 64 | 2. **Create a virtual environment:** 65 | ```bash 66 | python3 -m venv venv 67 | source venv/bin/activate # On Windows: venv\Scripts\activate 68 | ``` 69 | 70 | 3. **Install the package:** 71 | ```bash 72 | pip install -e . 73 | ``` 74 | 75 | 4. **Configure environment:** 76 | ```bash 77 | export OPENALEX_MAILTO=your-email@domain.com 78 | ``` 79 | 80 | 5. **Run the server:** 81 | ```bash 82 | ./alex-mcp-wrapper.sh 83 | # Or, if installed as a CLI tool: 84 | alex-mcp 85 | ``` 86 | 87 | --- 88 | 89 | ## ⚙️ MCP Configuration 90 | 91 | ### Claude Desktop Configuration 92 | 93 | Add to your Claude Desktop configuration file: 94 | 95 | ```json 96 | { 97 | "mcpServers": { 98 | "alex-mcp": { 99 | "command": "/path/to/alex-mcp/alex-mcp-wrapper.sh", 100 | "env": { 101 | "OPENALEX_MAILTO": "your-email@domain.com" 102 | } 103 | } 104 | } 105 | } 106 | ``` 107 | 108 | Replace `/path/to/alex-mcp` with the actual path to the repository on your system.
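To confirm the server starts before wiring it into a client, it can be launched directly from a shell. A quick check (assuming the editable install from the Quick Start):

```bash
# The server exits at startup with an error if OPENALEX_MAILTO is unset
# (see get_config() in src/alex_mcp/server.py).
export OPENALEX_MAILTO=your-email@domain.com
./alex-mcp-wrapper.sh   # wrapper that activates ./venv and starts the server
# or, using the console script installed by pip:
alex-mcp
```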
109 | 110 | --- 111 | 112 | ## 🤖 Using with AI Agents 113 | 114 | ### OpenAI Agents Integration 115 | 116 | You can load this MCP server in your OpenAI agent workflow using the [`agents.mcp.MCPServerStdio`](https://github.com/openai/openai-agents) interface: 117 | 118 | ```python 119 | from agents.mcp import MCPServerStdio 120 | 121 | async with MCPServerStdio( 122 | name="OpenAlex MCP For Author disambiguation and works", 123 | cache_tools_list=True, 124 | params={ 125 | "command": "uvx", 126 | "args": [ 127 | "--from", "git+https://github.com/drAbreu/alex-mcp.git@4.1.0", 128 | "alex-mcp" 129 | ], 130 | "env": { 131 | "OPENALEX_MAILTO": "your-email@domain.com" 132 | } 133 | }, 134 | client_session_timeout_seconds=10 135 | ) as alex_mcp: 136 | await alex_mcp.connect() 137 | tools = await alex_mcp.list_tools() 138 | print(f"Available tools: {[tool.name for tool in tools]}") 139 | ``` 140 | 141 | ### Academic Research Agent Integration 142 | 143 | This MCP server is specifically optimized for academic research workflows: 144 | 145 | ```python 146 | # Optimized for academic research workflows 147 | from alex_agent import run_author_research 148 | 149 | # Enhanced functionality with streamlined data 150 | result = await run_author_research( 151 | "Find J. Abreu at EMBO with recent publications" 152 | ) 153 | 154 | # Clean, structured output for AI processing 155 | print(f"Success: {result['workflow_metadata']['success']}") 156 | print(f"Quality: {result['research_result']['metadata']['result_analysis']['quality_score']}/100") 157 | ``` 158 | 159 | ### Direct Launch with uvx 160 | 161 | ```bash 162 | # Standard launch 163 | uvx --from git+https://github.com/drAbreu/alex-mcp.git@4.1.0 alex-mcp 164 | 165 | # With environment variables 166 | OPENALEX_MAILTO=your-email@domain.com uvx --from git+https://github.com/drAbreu/alex-mcp.git@4.1.0 alex-mcp 167 | ``` 168 | 169 | --- 170 | 171 | ## 🛠️ Available Tools 172 | 173 | ### 1. **autocomplete_authors** ⭐ NEW 174 | Get multiple author candidates using OpenAlex autocomplete API for intelligent disambiguation. 175 | 176 | **Parameters:** 177 | - `name` (required): Author name to search (e.g., "James Briscoe", "M. Ralser") 178 | - `context` (optional): Context for disambiguation (e.g., "Francis Crick Institute developmental biology") 179 | - `limit` (optional): Maximum candidates (1-10, default: 5) 180 | 181 | **Key Features:** 182 | - ⚡ **Fast**: ~200ms response time 183 | - 🎯 **Smart**: Multiple candidates with institutional hints 184 | - 🧠 **AI-Ready**: Perfect for context-based selection 185 | - 📊 **Rich**: Works count, citations, institution info 186 | 187 | **Streamlined Output:** 188 | ```json 189 | { 190 | "query": "James Briscoe", 191 | "context": "Francis Crick Institute", 192 | "total_candidates": 3, 193 | "candidates": [ 194 | { 195 | "openalex_id": "https://openalex.org/A5019391436", 196 | "display_name": "James Briscoe", 197 | "institution_hint": "The Francis Crick Institute, UK", 198 | "works_count": 415, 199 | "cited_by_count": 24623, 200 | "external_id": "https://orcid.org/0000-0002-1020-5240" 201 | } 202 | ] 203 | } 204 | ``` 205 | 206 | **Usage Pattern:** 207 | ```python 208 | # Get multiple candidates for disambiguation 209 | candidates = await autocomplete_authors( 210 | "James Briscoe", 211 | context="Francis Crick Institute developmental biology" 212 | ) 213 | 214 | # AI selects best match based on institutional context 215 | # Much more accurate than single search result! 216 | ``` 217 | 218 | ### 2. 
**search_authors** 219 | Search for authors with streamlined output for AI agents. 220 | 221 | **Parameters:** 222 | - `name` (required): Author name to search 223 | - `institution` (optional): Institution name filter 224 | - `topic` (optional): Research topic filter 225 | - `country_code` (optional): Country code filter (e.g., "US", "DE") 226 | - `limit` (optional): Maximum results (1-25, default: 20) 227 | 228 | **Streamlined Output:** 229 | ```json 230 | { 231 | "query": "J. Abreu", 232 | "total_count": 3, 233 | "results": [ 234 | { 235 | "id": "https://openalex.org/A123456789", 236 | "display_name": "Jorge Abreu-Vicente", 237 | "orcid": "https://orcid.org/0000-0000-0000-0000", 238 | "display_name_alternatives": ["J. Abreu-Vicente", "Jorge Abreu Vicente"], 239 | "affiliations": [ 240 | { 241 | "institution": { 242 | "display_name": "European Molecular Biology Organization", 243 | "country_code": "DE" 244 | }, 245 | "years": [2023, 2024, 2025] 246 | } 247 | ], 248 | "cited_by_count": 316, 249 | "works_count": 25, 250 | "summary_stats": { 251 | "h_index": 9, 252 | "i10_index": 5 253 | }, 254 | "x_concepts": [ 255 | { 256 | "display_name": "Astrophysics", 257 | "score": 0.8 258 | }, 259 | { 260 | "display_name": "Machine Learning", 261 | "score": 0.6 262 | } 263 | ] 264 | } 265 | ] 266 | } 267 | ``` 268 | 269 | **Features**: Clean structure optimized for AI reasoning and disambiguation 270 | 271 | --- 272 | 273 | ### 3. **retrieve_author_works** 274 | Retrieve works for a given author with enhanced filtering capabilities. 275 | 276 | **Parameters:** 277 | - `author_id` (required): OpenAlex author ID 278 | - `limit` (optional): Maximum results (1-50, default: 20) 279 | - `order_by` (optional): "date" or "citations" (default: "date") 280 | - `publication_year` (optional): Filter by specific year 281 | - `type` (optional): Work type filter (e.g., "journal-article") 282 | - `authorships_institutions_id` (optional): Filter by institution 283 | - `is_retracted` (optional): Filter retracted works 284 | - `open_access_is_oa` (optional): Filter by open access status 285 | 286 | **Enhanced Output:** 287 | ```json 288 | { 289 | "author_id": "https://openalex.org/A123456789", 290 | "total_count": 25, 291 | "results": [ 292 | { 293 | "id": "https://openalex.org/W123456789", 294 | "title": "A platform for the biomedical application of large language models", 295 | "doi": "10.1038/s41587-024-02534-3", 296 | "publication_year": 2025, 297 | "type": "journal-article", 298 | "cited_by_count": 42, 299 | "authorships": [ 300 | { 301 | "author": { 302 | "display_name": "Jorge Abreu-Vicente" 303 | }, 304 | "institutions": [ 305 | { 306 | "display_name": "European Molecular Biology Organization" 307 | } 308 | ] 309 | } 310 | ], 311 | "locations": [ 312 | { 313 | "source": { 314 | "display_name": "Nature Biotechnology", 315 | "type": "journal" 316 | } 317 | } 318 | ], 319 | "open_access": { 320 | "is_oa": true 321 | }, 322 | "primary_topic": { 323 | "display_name": "Biomedical Engineering" 324 | } 325 | } 326 | ] 327 | } 328 | ``` 329 | 330 | **Features**: Comprehensive work data with flexible filtering for targeted queries 331 | 332 | --- 333 | 334 | ## 📊 Data Optimization 335 | 336 | ### Focused Information Architecture 337 | This MCP server provides structured data specifically designed for AI agent consumption. 338 | 339 | ### Author Data Features 340 | - **Identity Resolution**: Names, ORCID, alternatives for disambiguation 341 | - **Affiliation Tracking**: Current and historical institutional
connections 342 | - **Impact Metrics**: Citation counts, h-index, and scholarly impact 343 | - **Research Context**: Fields, concepts, and domain expertise 344 | - **Career Analysis**: Temporal affiliation changes and transitions 345 | 346 | ### Work Data Features 347 | - **Publication Metadata**: Title, DOI, venue, and publication details 348 | - **Impact Assessment**: Citation counts and scholarly influence 349 | - **Access Information**: Open access status and availability 350 | - **Authorship Details**: Complete author lists and institutional affiliations 351 | - **Research Classification**: Topics, concepts, and domain categorization 352 | 353 | ### Enhanced Filtering 354 | 355 | ```python 356 | # Target high-impact journal articles 357 | works = await retrieve_author_works( 358 | author_id="https://openalex.org/A123456789", 359 | type="journal-article", # Focus on journal publications 360 | open_access_is_oa=True, # Open access only 361 | order_by="citations", # Most cited first 362 | limit=15 363 | ) 364 | 365 | # Career transition analysis 366 | authors = await search_authors( 367 | name="J. Abreu", 368 | institution="EMBO", # Current institution 369 | topic="Machine Learning", # Research focus 370 | limit=10 371 | ) 372 | ``` 373 | 374 | --- 375 | 376 | ## 🧪 Example Usage 377 | 378 | ### Author Disambiguation 379 | 380 | ```python 381 | from alex_mcp.server import search_authors_core 382 | 383 | # Comprehensive author search 384 | results = search_authors_core( 385 | name="J Abreu Vicente", 386 | institution="EMBO", 387 | topic="Machine Learning", 388 | limit=20 389 | ) 390 | 391 | print(f"Found {results.total_count} candidates") 392 | for author in results.results: 393 | print(f"- {author.display_name}") 394 | if author.current_affiliations: 395 | current_inst = author.current_affiliations[0] 396 | print(f" Institution: {current_inst}") 397 | print(f" Metrics: {author.cited_by_count} citations, h-index {author.h_index}") 398 | if author.research_fields: 399 | fields = author.research_fields[:3] 400 | print(f" Research: {', '.join(fields)}") 401 | ``` 402 | 403 | ### Academic Work Analysis 404 | 405 | ```python 406 | from alex_mcp.server import retrieve_author_works_core 407 | 408 | # Comprehensive work retrieval 409 | works = retrieve_author_works_core( 410 | author_id="https://openalex.org/A5058921480", 411 | type="journal-article", # Academic focus 412 | order_by="citations", # Impact-based ordering 413 | limit=20 414 | ) 415 | 416 | print(f"Found {works.total_count} publications") 417 | for work in works.results: 418 | print(f"- {work.title}") 419 | if work.journal_name: 420 | journal = work.journal_name 421 | print(f" Published in: {journal} ({work.publication_year})") 422 | print(f" Impact: {work.cited_by_count} citations") 423 | if work.is_open_access: 424 | print(" ✓ Open Access") 425 | ``` 426 | 427 | ### Institution and Field Analysis 428 | 429 | ```python 430 | # Analyze career transitions (optimized results expose affiliation names, not years) 431 | def analyze_career_path(author_result): 432 | if author_result.past_affiliations: 433 | print("Past affiliations:") 434 | for name in author_result.past_affiliations: 435 | print(f" {name}") 436 | if author_result.current_affiliations: 437 | print("Current affiliations:") 438 | for name in author_result.current_affiliations: 439 | print(f" {name}") 440 | 441 | # Research evolution 442 | if author_result.research_fields: 443 | print("Research areas: " + ", ".join(author_result.research_fields[:5])) 444 | 445 | # Usage 446 | results = search_authors_core("Jorge Abreu Vicente") 447 | if results.results: 448 | analyze_career_path(results.results[0]) 449 | ``` 450 | 451 | --- 452 | 453 | ## 🔧 Configuration Options 454 | 455 | ### Environment Variables 456 | 457 | ```bash 458 | # Required 459 | export OPENALEX_MAILTO=your-email@domain.com 460 | 461 | # Optional settings 462 | export OPENALEX_MAX_AUTHORS=100 # Maximum authors per query 463 | export OPENALEX_USER_AGENT=research-agent-v1.0 464 | export ALEX_MCP_VERSION=4.1.0 465 | 466 | # Rate limiting (respectful usage) 467 | export OPENALEX_RATE_PER_SEC=10 468 | export OPENALEX_RATE_PER_DAY=100000 469 | ``` 470 | 471 | ### Performance Tuning 472 | 473 | ```python 474 | # For comprehensive research applications 475 | config = { 476 | "max_authors_per_query": 25, # Detailed author analysis 477 | "max_works_per_author": 50, # Complete publication history 478 | "enable_all_filters": True, # Full filtering capabilities 479 | "detailed_affiliations": True, # Complete institutional data 480 | "research_concepts": True # Detailed concept analysis 481 | } 482 | ``` 483 | 484 | --- 485 | 486 | ## 🧑‍💻 Development & Testing 487 | 488 | ### Project Structure 489 | ``` 490 | alex-mcp/ 491 | ├── src/alex_mcp/ 492 | │ ├── __init__.py 493 | │ ├── server.py # Main MCP server 494 | │ └── data_objects.py # Data models and structures 495 | ├── examples/ 496 | │ ├── test_institution_resolution.py 497 | │ └── test_author_disambiguation.py 498 | ├── img/ # Logos used in the README 499 | ├── alex-mcp-wrapper.sh # Wrapper script (activates ./venv) 500 | ├── requirements.txt 501 | ├── setup.py 502 | ├── pyproject.toml 503 | └── INSTALL.md 504 | ``` 505 | 506 | ### Running Tests 507 | 508 | ```bash 509 | # Install test dependencies 510 | pip install -e . pytest 511 | 512 | # OPENALEX_MAILTO must be set; the tests hit the live OpenAlex API 513 | export OPENALEX_MAILTO=your-email@domain.com 514 | 515 | # Run the example-based test suite 516 | pytest examples/ -v 517 | 518 | # Run a single test module 519 | pytest examples/test_institution_resolution.py -v 520 | ``` 521 | 522 | ### Development Examples 523 | 524 | ```bash 525 | # Exercise institution resolution (EMBO, MPIA, IRAM) 526 | pytest examples/test_institution_resolution.py -v 527 | 528 | # Exercise author disambiguation (name, affiliation, and topic variants) 529 | pytest examples/test_author_disambiguation.py -v 530 | 531 | # Run everything from the repository root so `src.alex_mcp` resolves 532 | pytest examples/ -v 533 | ``` 534 | 535 | --- 536 | 537 | ## 📈 Integration Examples 538 | 539 | ### Academic Research Workflows 540 | 541 | Perfect integration with AI-powered research analysis: 542 | 543 | ```python 544 | # Enhanced academic research agent 545 | from alex_agent import AcademicResearchAgent 546 | 547 | agent = AcademicResearchAgent( 548 | mcp_servers=[alex_mcp], # Streamlined data processing 549 | model="gpt-4.1-2025-04-14" 550 | ) 551 | 552 | # Complex research queries with structured data 553 | result = await agent.research_author( 554 | "Find J.
Abreu at EMBO with machine learning publications" 555 | ) 556 | 557 | # Rich, structured output for AI reasoning 558 | print(f"Quality Score: {result.quality_score}/100") 559 | print(f"Author disambiguation: {result.confidence}") 560 | print(f"Research fields: {result.research_domains}") 561 | ``` 562 | 563 | ### Multi-Agent Systems 564 | 565 | ```python 566 | # Collaborative research analysis 567 | async def research_collaboration_network(seed_author): 568 | # Find primary author 569 | authors = await alex_mcp.search_authors(seed_author) 570 | primary = authors['results'][0] 571 | 572 | # Get their works 573 | works = await alex_mcp.retrieve_author_works( 574 | primary['id'], 575 | type="journal-article" 576 | ) 577 | 578 | # Analyze co-authors and build network 579 | collaborators = set() 580 | for work in works['results']: 581 | for authorship in work.get('authorships', []): 582 | collaborators.add(authorship['author']['display_name']) 583 | 584 | return { 585 | 'primary_author': primary, 586 | 'publication_count': len(works['results']), 587 | 'collaborator_network': list(collaborators), 588 | 'research_impact': sum(w['cited_by_count'] for w in works['results']) 589 | } 590 | ``` 591 | 592 | --- 593 | 594 | ## 🤝 Contributing 595 | 596 | We welcome contributions to improve functionality and add new features: 597 | 598 | 1. **Fork the repository** 599 | 2. **Create a feature branch**: `git checkout -b feature/enhanced-filtering` 600 | 3. **Add tests**: Ensure your changes maintain data quality and structure 601 | 4. **Submit a pull request**: Include examples and documentation 602 | 603 | ### Development Priorities 604 | 605 | - [ ] Enhanced filtering capabilities 606 | - [ ] Additional data enrichment 607 | - [ ] Performance optimizations 608 | - [ ] Integration examples 609 | - [ ] Documentation improvements 610 | 611 | --- 612 | 613 | ## 📄 License 614 | 615 | This project is licensed under the MIT License. See [LICENSE](LICENSE) for details. 616 | 617 | --- 618 | 619 | ## 🌐 Links 620 | 621 | - [OpenAlex API Documentation](https://docs.openalex.org/) 622 | - [Model Context Protocol](https://modelcontextprotocol.io/) 623 | - [FastMCP](https://github.com/jlowin/fastmcp) 624 | - [OpenAI Agents](https://github.com/openai/openai-agents) 625 | - [Academic Research Examples](examples/) 626 | -------------------------------------------------------------------------------- /src/alex_mcp/server.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Optimized OpenAlex Author Disambiguation MCP Server with Peer-Review Filtering 4 | 5 | Provides a FastMCP-compliant API for author disambiguation and institution resolution 6 | using the OpenAlex API with streamlined output to minimize token usage.
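Work results can be screened with is_peer_reviewed_journal() and filter_peer_reviewed_works(), which drop data catalogs (e.g. VizieR), preprints, and repository entries.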
7 | """ 8 | 9 | import logging 10 | from typing import Optional 11 | from fastmcp import FastMCP 12 | from alex_mcp.data_objects import ( 13 | OptimizedAuthorResult, 14 | OptimizedSearchResponse, 15 | OptimizedWorksSearchResponse, 16 | OptimizedGeneralWorksSearchResponse, 17 | OptimizedWorkResult, 18 | AutocompleteAuthorCandidate, 19 | AutocompleteAuthorsResponse, 20 | optimize_author_data, 21 | optimize_work_data 22 | ) 23 | import pyalex 24 | import os 25 | import sys 26 | import aiohttp 27 | import asyncio 28 | import json 29 | import re 30 | 31 | def get_config(): 32 | mailto = os.environ.get("OPENALEX_MAILTO") 33 | if not mailto: 34 | print( 35 | "ERROR: The environment variable OPENALEX_MAILTO must be set to your email address " 36 | "to use the OpenAlex MCP server. Example: export OPENALEX_MAILTO='your-email@example.com'", 37 | file=sys.stderr 38 | ) 39 | sys.exit(1) 40 | return { 41 | "OPENALEX_MAILTO": mailto, 42 | "OPENALEX_USER_AGENT": os.environ.get( 43 | "OPENALEX_USER_AGENT", 44 | f"alex-mcp (+{mailto})" 45 | ), 46 | "OPENALEX_MAX_AUTHORS": int(os.environ.get("OPENALEX_MAX_AUTHORS", 50)), # Reduced default 47 | "OPENALEX_RATE_PER_SEC": int(os.environ.get("OPENALEX_RATE_PER_SEC", 10)), 48 | "OPENALEX_RATE_PER_DAY": int(os.environ.get("OPENALEX_RATE_PER_DAY", 100000)), 49 | "OPENALEX_USE_DAILY_API": os.environ.get("OPENALEX_USE_DAILY_API", "true").lower() == "true", 50 | "OPENALEX_SNAPSHOT_INTERVAL_DAYS": int(os.environ.get("OPENALEX_SNAPSHOT_INTERVAL_DAYS", 30)), 51 | "OPENALEX_PREMIUM_UPDATES": os.environ.get("OPENALEX_PREMIUM_UPDATES", "hourly"), 52 | "OPENALEX_RETRACTION_BUG_START": os.environ.get("OPENALEX_RETRACTION_BUG_START", "2023-12-22"), 53 | "OPENALEX_RETRACTION_BUG_END": os.environ.get("OPENALEX_RETRACTION_BUG_END", "2024-03-19"), 54 | "OPENALEX_NO_FUNDING_DATA": os.environ.get("OPENALEX_NO_FUNDING_DATA", "true").lower() == "true", 55 | "OPENALEX_MISSING_CORRESPONDING_AUTHORS": os.environ.get("OPENALEX_MISSING_CORRESPONDING_AUTHORS", "true").lower() == "true", 56 | "OPENALEX_PARTIAL_ABSTRACTS": os.environ.get("OPENALEX_PARTIAL_ABSTRACTS", "true").lower() == "true", 57 | } 58 | 59 | # Configure logging 60 | logging.basicConfig(level=logging.INFO) 61 | logger = logging.getLogger(__name__) 62 | 63 | # Initialize FastMCP server 64 | mcp = FastMCP("OpenAlex Academic Research") 65 | 66 | 67 | def configure_pyalex(email: str): 68 | """ 69 | Configure pyalex for OpenAlex API usage. 70 | 71 | Args: 72 | email (str): The email to use for OpenAlex API requests. 73 | """ 74 | pyalex.config.email = email 75 | 76 | # Load configuration 77 | config = get_config() 78 | configure_pyalex(config["OPENALEX_MAILTO"]) 79 | pyalex.config.user_agent = config["OPENALEX_USER_AGENT"] 80 | 81 | 82 | def is_peer_reviewed_journal(work_data) -> bool: 83 | """ 84 | Improved filter to determine if a work is from a peer-reviewed journal. 85 | 86 | Uses a balanced approach that catches data catalogs and preprints while 87 | not being overly strict about DOIs (some legitimate papers lack them in OpenAlex). 
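Checks run in order: title-pattern exclusions, presence of a primary location and source, known data-catalog and preprint venue names, publisher/DOI presence, source and work type, a publication-year sanity check, and venue-specific quality signals (DOI, publisher, ISSN, name length).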
88 | 89 | Args: 90 | work_data: OpenAlex work object 91 | 92 | Returns: 93 | bool: True if the work appears to be from a peer-reviewed journal 94 | """ 95 | try: 96 | # Safe string extraction with None checking 97 | title = work_data.get('title') or '' 98 | if isinstance(title, str): 99 | title = title.lower() 100 | else: 101 | title = str(title).lower() if title is not None else '' 102 | 103 | # Quick exclusions based on title patterns 104 | title_exclusions = [ 105 | 'vizier online data catalog', 106 | 'online data catalog', 107 | 'data catalog', 108 | 'catalog:', 109 | 'database:', 110 | 'repository:', 111 | 'preprint', 112 | 'arxiv:', 113 | 'biorxiv', 114 | 'medrxiv', 115 | ] 116 | 117 | for exclusion in title_exclusions: 118 | if exclusion in title: 119 | logger.debug(f"Excluding based on title pattern '{exclusion}': {title[:100]}") 120 | return False 121 | 122 | # Check primary location 123 | primary_location = work_data.get('primary_location') 124 | if not primary_location: 125 | logger.debug("Excluding work without primary location") 126 | return False 127 | 128 | # Check source information 129 | source = primary_location.get('source', {}) 130 | if not source: 131 | logger.debug("Excluding work without source") 132 | return False 133 | 134 | # Get journal/source information with safe None checking 135 | journal_name_raw = source.get('display_name') or '' 136 | journal_name = journal_name_raw.lower() if isinstance(journal_name_raw, str) else str(journal_name_raw).lower() 137 | 138 | publisher = work_data.get('publisher', '') 139 | doi = work_data.get('doi') 140 | issn_l = source.get('issn_l') 141 | issn = source.get('issn') 142 | 143 | source_type_raw = source.get('type') or '' 144 | source_type = source_type_raw.lower() if isinstance(source_type_raw, str) else str(source_type_raw).lower() 145 | 146 | # CRITICAL: Exclude known data catalogs by journal name 147 | excluded_journals = [ 148 | 'vizier online data catalog', 149 | 'ycat', 150 | 'catalog', 151 | 'database', 152 | 'repository', 153 | 'arxiv', 154 | 'biorxiv', 155 | 'medrxiv', 156 | 'ssrn', 157 | 'research square', 158 | 'zenodo', 159 | 'figshare', 160 | 'dryad', 161 | 'github', 162 | 'protocols.io', 163 | 'ceur', 164 | 'conference proceedings', 165 | 'workshop proceedings', 166 | ] 167 | 168 | for excluded in excluded_journals: 169 | if excluded in journal_name: 170 | logger.debug(f"Excluding journal pattern '{excluded}': {journal_name}") 171 | return False 172 | 173 | # CRITICAL: Data catalogs typically have no publisher AND no DOI 174 | # This catches VizieR entries effectively 175 | if not publisher and not doi: 176 | logger.debug(f"Excluding work without publisher AND DOI: {title[:100]}") 177 | return False 178 | 179 | # Source type should be journal (if specified) 180 | if source_type and source_type not in ['journal', '']: 181 | logger.debug(f"Excluding non-journal source type: {source_type}") 182 | return False 183 | 184 | # Work type should be article or letter with safe None checking 185 | work_type_raw = work_data.get('type') or '' 186 | work_type = work_type_raw.lower() if isinstance(work_type_raw, str) else str(work_type_raw).lower() 187 | if work_type not in ['article', 'letter']: 188 | logger.debug(f"Excluding work type: {work_type}") 189 | return False 190 | 191 | # Should have reasonable publication year 192 | pub_year = work_data.get('publication_year') 193 | if not pub_year or pub_year < 1900 or pub_year > 2030: 194 | logger.debug(f"Excluding work with invalid publication year: {pub_year}") 195 | return 
False 196 | 197 | # For papers claiming to be from legitimate journals, check quality signals 198 | known_legitimate_journals = [ 199 | 'nature', 200 | 'science', 201 | 'cell', 202 | 'astrophysical journal', 203 | 'astronomy and astrophysics', 204 | 'monthly notices', 205 | 'physical review', 206 | 'journal of', 207 | 'proceedings of', 208 | ] 209 | 210 | is_known_journal = any(known in journal_name for known in known_legitimate_journals) 211 | 212 | if is_known_journal: 213 | # For known journals, be more lenient (don't require DOI) 214 | # But still require either publisher or ISSN 215 | if not publisher and not issn_l and not issn: 216 | logger.debug(f"Excluding known journal without publisher/ISSN: {journal_name}") 217 | return False 218 | else: 219 | # For unknown journals, require more quality signals 220 | quality_signals = sum([ 221 | bool(doi), # Has DOI 222 | bool(publisher), # Has publisher 223 | bool(issn_l or issn), # Has ISSN 224 | bool(journal_name and len(journal_name) > 5), # Reasonable journal name 225 | ]) 226 | 227 | if quality_signals < 2: # Require at least 2 quality signals 228 | logger.debug(f"Excluding unknown journal with insufficient quality signals ({quality_signals}/4): {journal_name}") 229 | return False 230 | 231 | # Additional quality checks 232 | if 'cited_by_count' not in work_data: 233 | logger.debug("Excluding work without citation data") 234 | return False 235 | 236 | # Very long titles might be data descriptions 237 | if len(title) > 250: 238 | logger.debug(f"Excluding work with very long title: {title[:100]}...") 239 | return False 240 | 241 | # If we get here, it passes all checks 242 | logger.debug(f"ACCEPTED: {title[:100]}") 243 | return True 244 | 245 | except Exception as e: 246 | logger.error(f"Error in peer review check for work: {e}") 247 | logger.error(f"Work data keys: {list(work_data.keys()) if isinstance(work_data, dict) else 'Not a dict'}") 248 | logger.error(f"Work title: {repr(work_data.get('title') if isinstance(work_data, dict) else 'N/A')}") 249 | logger.error(f"Primary location: {repr(work_data.get('primary_location') if isinstance(work_data, dict) else 'N/A')}") 250 | import traceback 251 | logger.error(f"Full traceback: {traceback.format_exc()}") 252 | return False 253 | 254 | 255 | def filter_peer_reviewed_works(works: list) -> list: 256 | """ 257 | Apply peer-review filtering to a list of works. 
258 | 259 | Args: 260 | works: List of OpenAlex work objects 261 | 262 | Returns: 263 | list: Filtered list containing only peer-reviewed journal works 264 | """ 265 | filtered_works = [] 266 | excluded_count = 0 267 | 268 | logger.info(f"Starting filtering of {len(works)} works...") 269 | 270 | for i, work in enumerate(works): 271 | # Safe handling of potentially None work or title 272 | if work is None: 273 | logger.warning(f"Skipping None work at position {i+1}") 274 | excluded_count += 1 275 | continue 276 | 277 | title_raw = work.get('title') if isinstance(work, dict) else None 278 | title = (title_raw or 'Unknown')[:60] if title_raw is not None else 'Unknown' 279 | 280 | try: 281 | if is_peer_reviewed_journal(work): 282 | filtered_works.append(work) 283 | logger.debug(f"✓ KEPT work {i+1}: {title}") 284 | else: 285 | excluded_count += 1 286 | logger.debug(f"✗ EXCLUDED work {i+1}: {title}") 287 | except Exception as e: 288 | logger.error(f"Error filtering work {i+1} (title: {title}): {e}") 289 | excluded_count += 1 290 | 291 | logger.info(f"Filtering complete: {len(filtered_works)} kept, {excluded_count} excluded from {len(works)} total") 292 | return filtered_works 293 | 294 | 295 | def search_authors_core( 296 | name: str, 297 | institution: Optional[str] = None, 298 | topic: Optional[str] = None, 299 | country_code: Optional[str] = None, 300 | limit: int = 15 # Reduced default limit 301 | ) -> OptimizedSearchResponse: 302 | """ 303 | Optimized core logic for searching authors using OpenAlex. 304 | Returns streamlined author data to minimize token usage. 305 | 306 | Args: 307 | name: Author name to search for. 308 | institution: (Optional) Institution name filter. 309 | topic: (Optional) Topic filter. 310 | country_code: (Optional) Country code filter. 311 | limit: Maximum number of results to return (default: 15). 312 | 313 | Returns: 314 | OptimizedSearchResponse: Streamlined response with essential author data. 
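    Example (illustrative name and institution; the response is a pydantic model):

        resp = search_authors_core("Jane Doe", institution="EMBL", limit=5)
        print(resp.total_count, resp.model_dump())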
315 | """ 316 | try: 317 | # Build query 318 | query = pyalex.Authors().search_filter(display_name=name) 319 | 320 | # Add filters if provided 321 | filters = {} 322 | if institution: 323 | filters['affiliations.institution.display_name.search'] = institution 324 | if topic: 325 | filters['x_concepts.display_name.search'] = topic 326 | if country_code: 327 | filters['affiliations.institution.country_code'] = country_code 328 | 329 | if filters: 330 | query = query.filter(**filters) 331 | 332 | # Execute query with limit 333 | results = query.get(per_page=min(limit, 100)) # Increased for comprehensive search 334 | authors = list(results) 335 | 336 | # Convert to optimized format 337 | optimized_authors = [] 338 | for author_data in authors: 339 | try: 340 | optimized_author = optimize_author_data(author_data) 341 | optimized_authors.append(optimized_author) 342 | except Exception as e: 343 | logger.warning(f"Error optimizing author data: {e}") 344 | # Skip problematic authors rather than failing completely 345 | continue 346 | 347 | logger.info(f"Found {len(optimized_authors)} authors for query: {name}") 348 | 349 | return OptimizedSearchResponse( 350 | query=name, 351 | total_count=len(optimized_authors), 352 | results=optimized_authors 353 | ) 354 | 355 | except Exception as e: 356 | logger.error(f"Error searching authors for query '{name}': {e}") 357 | return OptimizedSearchResponse( 358 | query=name, 359 | total_count=0, 360 | results=[] 361 | ) 362 | 363 | 364 | def autocomplete_authors_core( 365 | name: str, 366 | context: Optional[str] = None, 367 | limit: int = 10, 368 | filter_no_institution: bool = True, 369 | enable_institution_ranking: bool = True 370 | ) -> AutocompleteAuthorsResponse: 371 | """ 372 | Enhanced core function for author autocomplete with intelligent filtering and ranking. 373 | 374 | Args: 375 | name: Author name to search for 376 | context: Optional context for better matching (institution, research area, etc.) 
377 | limit: Maximum number of candidates to return (increased default to 10) 378 | filter_no_institution: If True, exclude candidates with no institutional affiliation 379 | enable_institution_ranking: If True, rank candidates by institutional context relevance 380 | 381 | Returns: 382 | AutocompleteAuthorsResponse with filtered and ranked candidate authors 383 | """ 384 | try: 385 | logger.info(f"🔍 Autocompleting authors for: '{name}' (limit: {limit})") 386 | if context: 387 | logger.info(f" 📝 Context provided: {context}") 388 | 389 | # Use PyAlex autocomplete for authors - get more results for filtering 390 | raw_limit = min(limit * 2, 20) # Get 2x candidates for filtering 391 | results = pyalex.Authors().autocomplete(name)[:raw_limit] 392 | 393 | # Convert to our data model first 394 | all_candidates = [] 395 | for result in results: 396 | candidate = AutocompleteAuthorCandidate( 397 | openalex_id=result.get('id', ''), 398 | display_name=result.get('display_name', ''), 399 | institution_hint=result.get('hint'), 400 | works_count=result.get('works_count', 0), 401 | cited_by_count=result.get('cited_by_count', 0), 402 | entity_type=result.get('entity_type', 'author'), 403 | external_id=result.get('external_id') 404 | ) 405 | all_candidates.append(candidate) 406 | 407 | # ENHANCEMENT 1: Filter out candidates with no institution 408 | if filter_no_institution: 409 | filtered_candidates = [ 410 | c for c in all_candidates 411 | if c.institution_hint and c.institution_hint not in ['No institution', 'None', ''] 412 | ] 413 | excluded_count = len(all_candidates) - len(filtered_candidates) 414 | if excluded_count > 0: 415 | logger.info(f" 🔍 Filtered out {excluded_count} candidates with no institution") 416 | else: 417 | filtered_candidates = all_candidates 418 | 419 | # ENHANCEMENT 2: Institution-aware ranking (if context provided) 420 | if enable_institution_ranking and context and filtered_candidates: 421 | scored_candidates = [] 422 | context_lower = context.lower() 423 | 424 | for candidate in filtered_candidates: 425 | relevance_score = 0 426 | matched_terms = [] 427 | 428 | inst_hint = (candidate.institution_hint or '').lower() 429 | 430 | # High-value institutional matches 431 | high_value_terms = [ 432 | 'max planck', 'harvard', 'stanford', 'mit', 'cambridge', 'oxford', 433 | 'excellence cluster', 'crick', 'wellcome', 'nih', 'cnrs', 'inserm' 434 | ] 435 | for term in high_value_terms: 436 | if term in context_lower and term in inst_hint: 437 | relevance_score += 3 438 | matched_terms.append(f"{term} (+3)") 439 | 440 | # Location-based matches 441 | location_terms = ['germany', 'uk', 'usa', 'france', 'köln', 'cologne', 'london', 'boston', 'berlin'] 442 | for term in location_terms: 443 | if term in context_lower and term in inst_hint: 444 | relevance_score += 2 445 | matched_terms.append(f"{term} (+2)") 446 | 447 | # Research field alignment (basic keyword matching) 448 | research_terms = ['biology', 'chemistry', 'biochemistry', 'physics', 'medicine'] 449 | for term in research_terms: 450 | if term in context_lower and term in inst_hint: 451 | relevance_score += 1 452 | matched_terms.append(f"{term} (+1)") 453 | 454 | # High-impact researcher bonus 455 | if candidate.cited_by_count and candidate.cited_by_count > 1000: 456 | relevance_score += 1 457 | matched_terms.append("high-impact (+1)") 458 | 459 | scored_candidates.append({ 460 | 'candidate': candidate, 461 | 'relevance_score': relevance_score, 462 | 'matched_terms': matched_terms 463 | }) 464 | 465 | # Sort by relevance score 
(descending), then by citation count 466 | scored_candidates.sort(key=lambda x: (x['relevance_score'], x['candidate'].cited_by_count), reverse=True) 467 | 468 | # Extract ranked candidates 469 | final_candidates = [sc['candidate'] for sc in scored_candidates[:limit]] 470 | 471 | # Log ranking results 472 | logger.info(f" 🏆 Institution-aware ranking applied:") 473 | for i, sc in enumerate(scored_candidates[:3], 1): # Log top 3 474 | candidate = sc['candidate'] 475 | logger.info(f" {i}. {candidate.display_name} (score: {sc['relevance_score']}, {candidate.institution_hint})") 476 | else: 477 | # No ranking, just take first N candidates 478 | final_candidates = filtered_candidates[:limit] 479 | 480 | # Log final candidates 481 | for candidate in final_candidates: 482 | logger.info(f" 👤 {candidate.display_name} ({candidate.institution_hint or 'No institution'}) - {candidate.works_count} works") 483 | 484 | response = AutocompleteAuthorsResponse( 485 | query=name, 486 | context=context, 487 | total_candidates=len(final_candidates), 488 | candidates=final_candidates, 489 | search_metadata={ 490 | 'api_used': 'openalex_autocomplete', 491 | 'has_context': context is not None, 492 | 'filtered_no_institution': filter_no_institution, 493 | 'institution_ranking_enabled': enable_institution_ranking and context is not None, 494 | 'response_time_ms': None # Could be added with timing 495 | } 496 | ) 497 | 498 | logger.info(f"✅ Found {len(final_candidates)} candidates for '{name}'") 499 | return response 500 | 501 | except Exception as e: 502 | logger.error(f"❌ Error in autocomplete_authors_core: {e}") 503 | # Return empty response on error 504 | return AutocompleteAuthorsResponse( 505 | query=name, 506 | context=context, 507 | total_candidates=0, 508 | candidates=[], 509 | search_metadata={ 510 | 'api_used': 'openalex_autocomplete', 511 | 'has_context': context is not None, 512 | 'error': str(e) 513 | } 514 | ) 515 | 516 | 517 | def search_works_core( 518 | query: str, 519 | author: Optional[str] = None, 520 | institution: Optional[str] = None, 521 | publication_year: Optional[int] = None, 522 | type: Optional[str] = None, 523 | limit: int = 25, 524 | peer_reviewed_only: bool = True, 525 | search_type: str = "general" 526 | ) -> OptimizedGeneralWorksSearchResponse: 527 | """ 528 | Core logic for searching works using OpenAlex with configurable search modes. 529 | Returns streamlined work data to minimize token usage. 530 | 531 | Args: 532 | query: Search query text 533 | author: (Optional) Author name filter 534 | institution: (Optional) Institution name filter 535 | publication_year: (Optional) Publication year filter 536 | type: (Optional) Work type filter (e.g., "article", "letter") 537 | limit: Maximum number of results (default: 25, max: 100) 538 | peer_reviewed_only: If True, apply peer-review filters (default: True) 539 | search_type: Search mode - "general" (title/abstract/fulltext), "title" (title only), 540 | or "title_and_abstract" (title and abstract only) 541 | 542 | Returns: 543 | OptimizedGeneralWorksSearchResponse: Streamlined response with work data. 
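    Example (query reused from the PubMed examples below; parameters are illustrative):

        resp = search_works_core(
            "ADP-ribosylation DNA repair",
            search_type="title_and_abstract",
            limit=10,
        )
        print(resp.total_count)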
544 | """ 545 | try: 546 | # Ensure reasonable limits to control token usage 547 | limit = min(limit, 100) 548 | 549 | # Build the search query using PyAlex based on search_type 550 | if search_type == "title": 551 | # Use title-specific search for precise title matching 552 | works_query = pyalex.Works() 553 | filters = {'title.search': query} 554 | elif search_type == "title_and_abstract": 555 | # Use title and abstract search 556 | works_query = pyalex.Works() 557 | filters = {'title_and_abstract.search': query} 558 | else: # search_type == "general" or any other value 559 | # Use general search across title, abstract, and fulltext (default behavior) 560 | works_query = pyalex.Works().search(query) 561 | filters = {} 562 | 563 | # Add author filter if provided 564 | if author: 565 | # For general work search, we can use raw_author_name.search for name-based filtering 566 | # This searches for works where the author name appears in the raw author strings 567 | filters['raw_author_name.search'] = author 568 | 569 | # Add institution filter if provided 570 | if institution: 571 | # Use the correct field for institution name filtering 572 | filters['authorships.institutions.display_name.search'] = institution 573 | 574 | # Add publication year filter 575 | if publication_year: 576 | filters['publication_year'] = publication_year 577 | 578 | # Add type filter 579 | if type: 580 | filters['type'] = type 581 | elif peer_reviewed_only: 582 | # Focus on journal articles and letters for academic work 583 | filters['type'] = 'article|letter' 584 | 585 | # Add basic quality filters 586 | if peer_reviewed_only: 587 | filters['is_retracted'] = False 588 | 589 | # Apply filters to query 590 | if filters: 591 | works_query = works_query.filter(**filters) 592 | 593 | # Execute query 594 | logger.info(f"Searching OpenAlex works with search_type='{search_type}', query: '{query[:50]}...' 
and {len(filters)} filters") 595 | results = works_query.get(per_page=limit) 596 | 597 | # Apply additional peer-review filtering if requested 598 | if peer_reviewed_only and results: 599 | logger.info(f"Applying peer-review filtering to {len(results)} results...") 600 | results = filter_peer_reviewed_works(results) 601 | logger.info(f"After peer-review filtering: {len(results)} results remain") 602 | 603 | # Convert to optimized format 604 | optimized_works = [] 605 | for work in results: 606 | try: 607 | optimized_work = optimize_work_data(work) 608 | optimized_works.append(optimized_work) 609 | except Exception as e: 610 | logger.warning(f"Error optimizing work data: {e}") 611 | continue 612 | 613 | logger.info(f"Returning {len(optimized_works)} optimized works for search query") 614 | 615 | return OptimizedGeneralWorksSearchResponse( 616 | query=query, 617 | total_count=len(optimized_works), 618 | results=optimized_works, 619 | filters=filters 620 | ) 621 | 622 | except Exception as e: 623 | logger.error(f"Error searching works for query '{query}': {e}") 624 | return OptimizedGeneralWorksSearchResponse( 625 | query=query, 626 | total_count=0, 627 | results=[], 628 | filters={} 629 | ) 630 | 631 | 632 | def retrieve_author_works_core( 633 | author_id: str, 634 | limit: int = 20_000, # High default limit for comprehensive analysis 635 | order_by: str = "date", # "date" or "citations" 636 | publication_year: Optional[int] = None, 637 | type: Optional[str] = None, 638 | journal_only: bool = True, # Default to True for peer-reviewed content 639 | min_citations: Optional[int] = None, 640 | peer_reviewed_only: bool = True, # Default to True 641 | ) -> OptimizedWorksSearchResponse: 642 | """ 643 | Enhanced core logic to retrieve peer-reviewed works for a given OpenAlex Author ID. 644 | Returns streamlined work data to minimize token usage and ensures only legitimate 645 | peer-reviewed journal articles and letters. 646 | 647 | Args: 648 | author_id: OpenAlex Author ID 649 | limit: Maximum number of results (default: 20,000 for comprehensive analysis) 650 | order_by: Sort order - "date" or "citations" 651 | publication_year: Filter by specific year 652 | type: Filter by work type (e.g., "article", "letter") 653 | journal_only: If True, only return journal articles and letters 654 | min_citations: Minimum citation count filter 655 | peer_reviewed_only: If True, apply comprehensive peer-review filters 656 | 657 | Returns: 658 | OptimizedWorksSearchResponse: Streamlined response with peer-reviewed work data.
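    Example (placeholder author ID, as in the tool docstring below):

        resp = retrieve_author_works_core(
            "https://openalex.org/A123456789", limit=50, order_by="citations"
        )
        print(resp.author_name, resp.total_count)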
659 | """ 660 | try: 661 | limit = min(limit, 20_000) 662 | 663 | # Build base filters 664 | filters = {"author.id": author_id} 665 | 666 | # Add optional filters 667 | if publication_year: 668 | filters["publication_year"] = publication_year 669 | if type: 670 | filters["type"] = type 671 | elif journal_only: 672 | # Focus on journal articles and letters for academic work 673 | filters["type"] = "article|letter" 674 | if min_citations: 675 | filters["cited_by_count"] = f">={min_citations}" 676 | 677 | # Add some basic API-level filters (but not too restrictive) 678 | if peer_reviewed_only or journal_only: 679 | # Only exclude obviously retracted papers at API level 680 | filters["is_retracted"] = "false" 681 | 682 | # Convert author_id to proper format if needed 683 | if author_id.startswith("https://openalex.org/"): 684 | author_id_short = author_id.split("/")[-1] 685 | filters["author.id"] = f"https://openalex.org/{author_id_short}" 686 | 687 | # Build query - get more results for post-filtering if needed 688 | if peer_reviewed_only: 689 | initial_limit = min(limit * 4, 20_000) # Get 4x more for filtering, much higher limit 690 | else: 691 | initial_limit = limit 692 | 693 | works_query = pyalex.Works().filter(**filters) 694 | 695 | # Apply sorting 696 | if order_by == "citations": 697 | works_query = works_query.sort(cited_by_count="desc") 698 | else: 699 | works_query = works_query.sort(publication_date="desc") 700 | 701 | # Execute query using pagination to get ALL works 702 | logger.info(f"Querying OpenAlex for up to {initial_limit} works with filters: {filters}") 703 | 704 | # Use paginate() to get all works, not just the first page 705 | all_works = [] 706 | pager = works_query.paginate(per_page=200, n_max=initial_limit) # Use 200 per page (API recommended) 707 | 708 | for page in pager: 709 | all_works.extend(page) 710 | if len(all_works) >= initial_limit: 711 | break 712 | 713 | works = all_works[:initial_limit] # Ensure we don't exceed the limit 714 | logger.info(f"Retrieved {len(works)} works from OpenAlex via pagination") 715 | 716 | # Apply peer-review filtering if requested 717 | if peer_reviewed_only: 718 | logger.info("Applying peer-review filtering...") 719 | works = filter_peer_reviewed_works(works) 720 | logger.info(f"After filtering: {len(works)} works remain") 721 | 722 | # Limit to requested number after filtering 723 | works = works[:limit] 724 | 725 | # Get author name for response (if available from first work) 726 | author_name = None 727 | if works: 728 | authorships = works[0].get('authorships', []) 729 | for authorship in authorships: 730 | author = authorship.get('author', {}) 731 | if author.get('id') == author_id: 732 | author_name = author.get('display_name') 733 | break 734 | 735 | # Convert to optimized format 736 | optimized_works = [] 737 | for work_data in works: 738 | try: 739 | optimized_work = optimize_work_data(work_data) 740 | optimized_works.append(optimized_work) 741 | except Exception as e: 742 | logger.warning(f"Error optimizing work data: {e}") 743 | continue 744 | 745 | logger.info(f"Final result: {len(optimized_works)} works for author: {author_id}") 746 | 747 | return OptimizedWorksSearchResponse( 748 | author_id=author_id, 749 | author_name=author_name, 750 | total_count=len(optimized_works), 751 | results=optimized_works, 752 | filters=filters 753 | ) 754 | 755 | except Exception as e: 756 | logger.error(f"Error retrieving works for author {author_id}: {e}") 757 | return OptimizedWorksSearchResponse( 758 | author_id=author_id, 759 | 
total_count=0, 760 | results=[], 761 | filters={} 762 | ) 763 | 764 | 765 | @mcp.tool( 766 | annotations={ 767 | "title": "Search Authors (Optimized)", 768 | "description": ( 769 | "Search for authors by name with optional filters. " 770 | "Returns streamlined author data optimized for AI agents with ~70% fewer tokens. " 771 | "Includes essential info: name, ORCID, affiliations (as strings), metrics, and research fields." 772 | ), 773 | "readOnlyHint": True, 774 | "openWorldHint": True 775 | } 776 | ) 777 | async def search_authors( 778 | name: str, 779 | institution: Optional[str] = None, 780 | topic: Optional[str] = None, 781 | country_code: Optional[str] = None, 782 | limit: int = 15 783 | ) -> dict: 784 | """ 785 | Optimized MCP tool wrapper for searching authors. 786 | 787 | Args: 788 | name: Author name to search for. 789 | institution: (Optional) Institution name filter. 790 | topic: (Optional) Topic filter. 791 | country_code: (Optional) Country code filter. 792 | limit: Maximum number of results to return (default: 15, max: 100). 793 | 794 | Returns: 795 | dict: Serialized OptimizedSearchResponse with streamlined author data. 796 | """ 797 | # Ensure reasonable limits to control token usage 798 | limit = min(limit, 100) # Increased for comprehensive author search 799 | 800 | response = search_authors_core( 801 | name=name, 802 | institution=institution, 803 | topic=topic, 804 | country_code=country_code, 805 | limit=limit 806 | ) 807 | return response.model_dump() 808 | 809 | 810 | @mcp.tool( 811 | annotations={ 812 | "title": "Retrieve Author Works (Peer-Reviewed Only)", 813 | "description": ( 814 | "Retrieve peer-reviewed journal works for a given OpenAlex Author ID. " 815 | "Automatically filters out data catalogs, preprint servers, and non-journal content. " 816 | "Returns streamlined work data optimized for AI agents with ~80% fewer tokens. " 817 | "Uses balanced filtering: excludes VizieR catalogs but allows legitimate papers without DOIs." 818 | ), 819 | "readOnlyHint": True, 820 | "openWorldHint": True 821 | } 822 | ) 823 | async def retrieve_author_works( 824 | author_id: str, 825 | limit: Optional[int] = None, 826 | order_by: str = "date", 827 | publication_year: Optional[int] = None, 828 | type: Optional[str] = None, 829 | journal_only: bool = True, 830 | min_citations: Optional[int] = None, 831 | peer_reviewed_only: bool = True, 832 | ) -> dict: 833 | """ 834 | Enhanced MCP tool wrapper for retrieving author works with flexible filtering. 835 | 836 | Args: 837 | author_id: OpenAlex Author ID (e.g., 'https://openalex.org/A123456789') 838 | limit: Maximum number of results (default: None = ALL works via pagination, max: 2000) 839 | order_by: Sort order - "date" for newest first, "citations" for most cited first 840 | publication_year: Filter by specific publication year 841 | type: Filter by work type (e.g., "journal-article", "letter") 842 | journal_only: If True, only return journal articles and letters (default: True) 843 | min_citations: Only return works with at least this many citations 844 | peer_reviewed_only: If True, apply balanced peer-review filters (default: True) 845 | 846 | Returns: 847 | dict: Serialized OptimizedWorksSearchResponse with author's works. 
848 | 849 | Usage Patterns: 850 | # For AI validation (sample of high-impact works) 851 | retrieve_author_works(author_id, limit=20, order_by="citations") 852 | 853 | # For complete benchmark evaluation (ALL works, minimal filtering) 854 | retrieve_author_works(author_id, peer_reviewed_only=False, journal_only=False) 855 | 856 | # For peer-reviewed works only (default behavior) 857 | retrieve_author_works(author_id) 858 | """ 859 | # Handle limit: None means ALL works, otherwise cap at reasonable limit 860 | logger.info(f"MCP tool received limit parameter: {limit}") 861 | if limit is None: 862 | limit = 2000 # Set a very high limit to get ALL works 863 | logger.info(f"No limit specified, setting to {limit} for comprehensive retrieval") 864 | else: 865 | limit = min(limit, 2000) # Increased max limit for comprehensive analysis 866 | logger.info(f"Explicit limit specified, capped to {limit}") 867 | 868 | response = retrieve_author_works_core( 869 | author_id=author_id, 870 | limit=limit, 871 | order_by=order_by, 872 | publication_year=publication_year, 873 | type=type, 874 | journal_only=journal_only, 875 | min_citations=min_citations, 876 | peer_reviewed_only=peer_reviewed_only, 877 | ) 878 | return response.model_dump() 879 | 880 | 881 | @mcp.tool( 882 | annotations={ 883 | "title": "Search Works (Optimized)", 884 | "description": ( 885 | "Search for academic works with configurable search modes and optional filters. " 886 | "Returns streamlined work data optimized for AI agents with ~80% fewer tokens. " 887 | "Supports different search types: 'general' (title/abstract/fulltext), 'title' (title only), " 888 | "or 'title_and_abstract' (title and abstract only). " 889 | "Supports author, institution, publication year, and type filters. " 890 | "Automatically applies peer-review filtering to exclude data catalogs and preprints." 891 | ), 892 | "readOnlyHint": True, 893 | "openWorldHint": True 894 | } 895 | ) 896 | async def search_works( 897 | query: str, 898 | author: Optional[str] = None, 899 | institution: Optional[str] = None, 900 | publication_year: Optional[int] = None, 901 | type: Optional[str] = None, 902 | limit: int = 25, 903 | peer_reviewed_only: bool = True, 904 | search_type: str = "general" 905 | ) -> dict: 906 | """ 907 | Optimized MCP tool wrapper for searching works. 908 | 909 | Args: 910 | query: Search query text 911 | author: (Optional) Author name filter 912 | institution: (Optional) Institution name filter 913 | publication_year: (Optional) Publication year filter 914 | type: (Optional) Work type filter (e.g., "article", "letter") 915 | limit: Maximum number of results (default: 25, max: 100) 916 | peer_reviewed_only: If True, apply peer-review filters (default: True) 917 | search_type: Search mode - "general" (title/abstract/fulltext), "title" (title only), 918 | or "title_and_abstract" (title and abstract only) 919 | 920 | Returns: 921 | dict: Serialized OptimizedGeneralWorksSearchResponse with streamlined work data. 
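    Example usage (illustrative queries; the names appear elsewhere in this module):
        # Title-only search for precise matching
        search_works("ADP-ribosylation DNA repair", search_type="title", limit=5)

        # General search restricted to one author's works
        search_works("DNA repair", author="Ivan Matic", limit=10)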
922 | """ 923 | # Ensure reasonable limits to control token usage 924 | limit = min(limit, 100) 925 | 926 | response = search_works_core( 927 | query=query, 928 | author=author, 929 | institution=institution, 930 | publication_year=publication_year, 931 | type=type, 932 | limit=limit, 933 | peer_reviewed_only=peer_reviewed_only, 934 | search_type=search_type 935 | ) 936 | return response.model_dump() 937 | 938 | 939 | @mcp.tool( 940 | annotations={ 941 | "title": "Autocomplete Authors (Smart Disambiguation)", 942 | "description": ( 943 | "Get multiple author candidates using OpenAlex autocomplete API for intelligent disambiguation. " 944 | "Returns a ranked list of potential author matches with institutional hints and research metrics. " 945 | "Perfect when you need to disambiguate authors and have context like institution, research area, or co-authors. " 946 | "The AI can select the best match based on the provided context. " 947 | "Much faster than full search (~200ms) and provides multiple options for better accuracy." 948 | ), 949 | "readOnlyHint": True, 950 | "openWorldHint": True 951 | } 952 | ) 953 | async def autocomplete_authors( 954 | name: str, 955 | context: Optional[str] = None, 956 | limit: int = 10, 957 | filter_no_institution: bool = True, 958 | enable_institution_ranking: bool = True 959 | ) -> dict: 960 | """ 961 | Enhanced autocomplete authors with intelligent filtering and ranking. 962 | 963 | Args: 964 | name: Author name to search for (e.g., "James Briscoe", "M. Ralser") 965 | context: Optional context to help with disambiguation (e.g., "Francis Crick Institute developmental biology", "Max Planck Institute Köln Germany") 966 | limit: Maximum number of candidates to return (default: 10, max: 15) 967 | filter_no_institution: If True, exclude candidates with no institutional affiliation (default: True) 968 | enable_institution_ranking: If True, rank candidates by institutional context relevance (default: True) 969 | 970 | Returns: 971 | dict: Serialized AutocompleteAuthorsResponse with filtered and ranked candidate authors, including: 972 | - openalex_id: Full OpenAlex author ID 973 | - display_name: Author's display name 974 | - institution_hint: Current/last known institution 975 | - works_count: Number of published works 976 | - cited_by_count: Total citation count 977 | - external_id: ORCID or other external identifiers 978 | - search_metadata: Information about filtering and ranking applied 979 | 980 | Example usage: 981 | # Get high-quality candidates with institutional filtering 982 | candidates = await autocomplete_authors("Ivan Matić", context="Max Planck Institute Biology Ageing Köln Germany") 983 | 984 | # For seasoned researchers, institution hints and ranking help disambiguation 985 | # AI can then select the best match or retrieve works for further verification 986 | 987 | Enhanced Features: 988 | - Filters out candidates with no institutional affiliation (reduces noise) 989 | - Institution-aware ranking when context is provided (improves accuracy) 990 | - Higher default limit (10 vs 5) for better candidate coverage 991 | - Detailed logging for debugging and optimization 992 | """ 993 | # Ensure reasonable limits - increased max to 15 994 | limit = min(max(limit, 1), 15) 995 | 996 | response = autocomplete_authors_core( 997 | name=name, 998 | context=context, 999 | limit=limit, 1000 | filter_no_institution=filter_no_institution, 1001 | enable_institution_ranking=enable_institution_ranking 1002 | ) 1003 | return response.model_dump() 1004 | 1005 | 1006 | # 
PubMed Integration Functions 1007 | import requests 1008 | import xml.etree.ElementTree as ET 1009 | from typing import Union 1010 | 1011 | def pubmed_search_core( 1012 | query: str, 1013 | max_results: int = 20, 1014 | search_type: str = "author" 1015 | ) -> dict: 1016 | """ 1017 | Core PubMed search functionality using E-utilities API. 1018 | 1019 | Args: 1020 | query: Search query (author name, DOI, or keywords) 1021 | max_results: Maximum number of results to return 1022 | search_type: Type of search ("author", "doi", "title", "keywords") 1023 | 1024 | Returns: 1025 | dict with search results including PMIDs, total count, and basic metadata 1026 | """ 1027 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 1028 | 1029 | try: 1030 | # Format search term based on type 1031 | if search_type == "author": 1032 | search_term = f'"{query}"[Author]' 1033 | elif search_type == "doi": 1034 | clean_doi = query.replace('https://doi.org/', '').replace('http://dx.doi.org/', '') 1035 | search_term = f'"{clean_doi}"[AID]' 1036 | elif search_type == "title": 1037 | search_term = f'"{query}"[Title]' 1038 | else: # keywords 1039 | search_term = query 1040 | 1041 | logger.info(f"🔍 PubMed search: {search_term} (max: {max_results})") 1042 | 1043 | # Search PubMed 1044 | search_url = f"{base_url}esearch.fcgi" 1045 | search_params = { 1046 | 'db': 'pubmed', 1047 | 'term': search_term, 1048 | 'retmax': max_results, 1049 | 'retmode': 'json', 1050 | 'sort': 'relevance' 1051 | } 1052 | 1053 | response = requests.get(search_url, params=search_params, timeout=10) 1054 | response.raise_for_status() 1055 | search_data = response.json() 1056 | 1057 | pmids = search_data.get('esearchresult', {}).get('idlist', []) 1058 | total_count = int(search_data.get('esearchresult', {}).get('count', 0)) 1059 | 1060 | logger.info(f"📊 Found {total_count} total results, retrieved {len(pmids)} PMIDs") 1061 | 1062 | # Get basic details for retrieved PMIDs (if any) 1063 | articles = [] 1064 | if pmids: 1065 | articles = get_pubmed_summaries(pmids[:min(len(pmids), 10)]) # Limit to 10 for performance 1066 | 1067 | return { 1068 | 'query': query, 1069 | 'search_type': search_type, 1070 | 'search_term_used': search_term, 1071 | 'total_count': total_count, 1072 | 'retrieved_count': len(pmids), 1073 | 'pmids': pmids, 1074 | 'articles': articles, 1075 | 'search_metadata': { 1076 | 'api_used': 'pubmed_esearch', 1077 | 'max_results_requested': max_results, 1078 | 'response_time_ms': None 1079 | } 1080 | } 1081 | 1082 | except Exception as e: 1083 | logger.error(f"❌ PubMed search error: {e}") 1084 | return { 1085 | 'query': query, 1086 | 'search_type': search_type, 1087 | 'total_count': 0, 1088 | 'retrieved_count': 0, 1089 | 'pmids': [], 1090 | 'articles': [], 1091 | 'error': str(e) 1092 | } 1093 | 1094 | 1095 | def get_pubmed_summaries(pmids: list) -> list: 1096 | """ 1097 | Get summary information for a list of PMIDs using esummary. 
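    All PMIDs are fetched in a single batched esummary request, roughly:
    GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=pubmed&id=<pmid1>,<pmid2>&retmode=json
    (the PMID values are placeholders).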
1098 | 1099 | Args: 1100 | pmids: List of PubMed IDs 1101 | 1102 | Returns: 1103 | List of article summaries with basic metadata 1104 | """ 1105 | if not pmids: 1106 | return [] 1107 | 1108 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 1109 | 1110 | try: 1111 | # Get summaries 1112 | summary_url = f"{base_url}esummary.fcgi" 1113 | summary_params = { 1114 | 'db': 'pubmed', 1115 | 'id': ','.join(pmids), 1116 | 'retmode': 'json' 1117 | } 1118 | 1119 | response = requests.get(summary_url, params=summary_params, timeout=15) 1120 | response.raise_for_status() 1121 | summary_data = response.json() 1122 | 1123 | articles = [] 1124 | uids = summary_data.get('result', {}).get('uids', []) 1125 | 1126 | for uid in uids: 1127 | article_data = summary_data.get('result', {}).get(uid, {}) 1128 | if article_data: 1129 | # Extract key information 1130 | authors = article_data.get('authors', []) 1131 | author_names = [author.get('name', '') for author in authors[:5]] # First 5 authors 1132 | 1133 | article = { 1134 | 'pmid': uid, 1135 | 'title': article_data.get('title', ''), 1136 | 'authors': author_names, 1137 | 'journal': article_data.get('fulljournalname', ''), 1138 | 'pub_date': article_data.get('pubdate', ''), 1139 | 'doi': article_data.get('elocationid', ''), # Often contains DOI 1140 | 'pmcid': article_data.get('pmcid', ''), 1141 | 'publication_types': article_data.get('pubtype', []) 1142 | } 1143 | articles.append(article) 1144 | 1145 | logger.info(f"📄 Retrieved summaries for {len(articles)} articles") 1146 | return articles 1147 | 1148 | except Exception as e: 1149 | logger.error(f"❌ Error getting PubMed summaries: {e}") 1150 | return [] 1151 | 1152 | 1153 | def get_pubmed_author_sample(author_name: str, sample_size: int = 5) -> dict: 1154 | """ 1155 | Get a sample of works by an author from PubMed with institutional information. 
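    Combines pubmed_search_core() and get_detailed_pubmed_article() to collect
    affiliations, name variants, and e-mail addresses for the target author.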
1156 | 1157 | Args: 1158 | author_name: Author name to search for 1159 | sample_size: Number of sample works to analyze in detail 1160 | 1161 | Returns: 1162 | dict with author sample analysis including affiliations and name variants 1163 | """ 1164 | try: 1165 | logger.info(f"🔍 Getting PubMed author sample for: {author_name}") 1166 | 1167 | # Search for author 1168 | search_result = pubmed_search_core(author_name, max_results=sample_size, search_type="author") 1169 | 1170 | if not search_result['pmids']: 1171 | return { 1172 | 'author_name': author_name, 1173 | 'total_works': 0, 1174 | 'sample_works': [], 1175 | 'institutional_keywords': [], 1176 | 'name_variants': [], 1177 | 'email_addresses': [] 1178 | } 1179 | 1180 | # Get detailed information for sample 1181 | sample_pmids = search_result['pmids'][:sample_size] 1182 | detailed_articles = [] 1183 | all_affiliations = [] 1184 | name_variants = set() 1185 | email_addresses = set() 1186 | 1187 | for pmid in sample_pmids: 1188 | article_details = get_detailed_pubmed_article(pmid, author_name) 1189 | if article_details: 1190 | detailed_articles.append(article_details) 1191 | 1192 | # Extract affiliations and variants for target author 1193 | for author_info in article_details.get('author_details', []): 1194 | if is_target_author(author_info, author_name): 1195 | all_affiliations.extend(author_info.get('affiliations', [])) 1196 | 1197 | # Collect name variants 1198 | full_name = f"{author_info['first_name']} {author_info['last_name']}".strip() 1199 | if full_name: 1200 | name_variants.add(full_name) 1201 | 1202 | # Extract email addresses 1203 | for affil in author_info.get('affiliations', []): 1204 | emails = extract_emails_from_text(affil) 1205 | email_addresses.update(emails) 1206 | 1207 | # Extract institutional keywords 1208 | institutional_keywords = extract_institutional_keywords(all_affiliations) 1209 | 1210 | return { 1211 | 'author_name': author_name, 1212 | 'total_works': search_result['total_count'], 1213 | 'sample_works': detailed_articles, 1214 | 'institutional_keywords': institutional_keywords, 1215 | 'name_variants': list(name_variants), 1216 | 'email_addresses': list(email_addresses), 1217 | 'sample_metadata': { 1218 | 'sample_size': len(detailed_articles), 1219 | 'affiliations_found': len(all_affiliations) 1220 | } 1221 | } 1222 | 1223 | except Exception as e: 1224 | logger.error(f"❌ Error in PubMed author sample: {e}") 1225 | return { 1226 | 'author_name': author_name, 1227 | 'total_works': 0, 1228 | 'sample_works': [], 1229 | 'error': str(e) 1230 | } 1231 | 1232 | 1233 | def get_detailed_pubmed_article(pmid: str, target_author: str) -> dict: 1234 | """Get detailed article information including author affiliations""" 1235 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/" 1236 | 1237 | try: 1238 | fetch_url = f"{base_url}efetch.fcgi" 1239 | fetch_params = { 1240 | 'db': 'pubmed', 1241 | 'id': pmid, 1242 | 'retmode': 'xml', 1243 | 'rettype': 'abstract' 1244 | } 1245 | 1246 | response = requests.get(fetch_url, params=fetch_params, timeout=10) 1247 | response.raise_for_status() 1248 | 1249 | # Parse XML 1250 | root = ET.fromstring(response.text) 1251 | article = root.find('.//PubmedArticle') 1252 | 1253 | if article is None: 1254 | return None 1255 | 1256 | # Extract basic info 1257 | title_elem = article.find('.//ArticleTitle') 1258 | title = ''.join(title_elem.itertext()).strip() if title_elem is not None else '' 1259 | 1260 | journal_elem = article.find('.//Journal/Title') 1261 | journal = journal_elem.text if 
journal_elem is not None else '' 1262 | 1263 | # Extract authors with affiliations 1264 | author_details = [] 1265 | author_list = article.find('.//AuthorList') 1266 | if author_list is not None: 1267 | for author_elem in author_list.findall('Author'): 1268 | author_info = extract_detailed_author_info(author_elem) 1269 | author_details.append(author_info) 1270 | 1271 | return { 1272 | 'pmid': pmid, 1273 | 'title': title, 1274 | 'journal': journal, 1275 | 'author_details': author_details 1276 | } 1277 | 1278 | except Exception as e: 1279 | logger.error(f"❌ Error fetching detailed article {pmid}: {e}") 1280 | return None 1281 | 1282 | 1283 | def extract_detailed_author_info(author_elem: ET.Element) -> dict: 1284 | """Extract detailed author information from XML element""" 1285 | author_info = { 1286 | 'last_name': '', 1287 | 'first_name': '', 1288 | 'initials': '', 1289 | 'affiliations': [] 1290 | } 1291 | 1292 | try: 1293 | last_name = author_elem.find('LastName') 1294 | if last_name is not None: 1295 | author_info['last_name'] = last_name.text or '' 1296 | 1297 | first_name = author_elem.find('ForeName') 1298 | if first_name is not None: 1299 | author_info['first_name'] = first_name.text or '' 1300 | 1301 | initials = author_elem.find('Initials') 1302 | if initials is not None: 1303 | author_info['initials'] = initials.text or '' 1304 | 1305 | # Get affiliations 1306 | affil_info = author_elem.find('AffiliationInfo') 1307 | if affil_info is not None: 1308 | for affil in affil_info.findall('Affiliation'): 1309 | if affil.text: 1310 | author_info['affiliations'].append(affil.text.strip()) 1311 | 1312 | except Exception: 1313 | pass 1314 | 1315 | return author_info 1316 | 1317 | 1318 | def is_target_author(author_info: dict, target_name: str) -> bool: 1319 | """Check if author_info matches target author name""" 1320 | full_name = f"{author_info['first_name']} {author_info['last_name']}".strip().lower() 1321 | target_lower = target_name.lower() 1322 | 1323 | # Simple similarity check 1324 | return (target_lower in full_name or 1325 | full_name in target_lower or 1326 | author_info['last_name'].lower() in target_lower) 1327 | 1328 | 1329 | def extract_emails_from_text(text: str) -> list: 1330 | """Extract email addresses from text""" 1331 | import re 1332 | email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' 1333 | return re.findall(email_pattern, text) 1334 | 1335 | 1336 | def extract_institutional_keywords(affiliations: list) -> list: 1337 | """Extract common institutional keywords from affiliations""" 1338 | if not affiliations: 1339 | return [] 1340 | 1341 | # Combine all affiliations 1342 | all_text = ' '.join(affiliations).lower() 1343 | 1344 | # Common institutional keywords 1345 | keywords = [] 1346 | institutional_terms = [ 1347 | 'university', 'institute', 'college', 'school', 'center', 'centre', 1348 | 'hospital', 'laboratory', 'department', 'faculty', 'division', 1349 | 'max planck', 'harvard', 'stanford', 'mit', 'cambridge', 'oxford', 1350 | 'excellence cluster', 'cnrs', 'inserm', 'nih' 1351 | ] 1352 | 1353 | for term in institutional_terms: 1354 | if term in all_text: 1355 | keywords.append(term) 1356 | 1357 | return keywords[:10] # Return top 10 1358 | 1359 | 1360 | @mcp.tool( 1361 | annotations={ 1362 | "title": "Search PubMed", 1363 | "description": ( 1364 | "Search PubMed database for publications by author, DOI, title, or keywords. " 1365 | "Provides basic article metadata including authors, journal, and publication info. 
" 1366 | "Useful for cross-validation with OpenAlex data and discovering name variants." 1367 | ), 1368 | "readOnlyHint": True, 1369 | "openWorldHint": True 1370 | } 1371 | ) 1372 | async def search_pubmed( 1373 | query: str, 1374 | search_type: str = "author", 1375 | max_results: int = 20 1376 | ) -> dict: 1377 | """ 1378 | Search PubMed database for publications. 1379 | 1380 | Args: 1381 | query: Search query (author name, DOI, title, or keywords) 1382 | search_type: Type of search - "author", "doi", "title", or "keywords" (default: "author") 1383 | max_results: Maximum number of results to return (default: 20, max: 50) 1384 | 1385 | Returns: 1386 | dict: Search results with PMIDs, article metadata, and summary statistics 1387 | 1388 | Example usage: 1389 | # Search for author 1390 | search_pubmed("Ivan Matic", search_type="author", max_results=10) 1391 | 1392 | # Search by DOI 1393 | search_pubmed("10.1038/nprot.2009.36", search_type="doi") 1394 | 1395 | # Search by keywords 1396 | search_pubmed("ADP-ribosylation DNA repair", search_type="keywords") 1397 | """ 1398 | # Validate parameters 1399 | max_results = min(max(max_results, 1), 50) # Cap at 50 for performance 1400 | valid_types = ["author", "doi", "title", "keywords"] 1401 | if search_type not in valid_types: 1402 | search_type = "author" 1403 | 1404 | logger.info(f"🔍 PubMed search: '{query}' (type: {search_type}, max: {max_results})") 1405 | 1406 | result = pubmed_search_core(query, max_results, search_type) 1407 | return result 1408 | 1409 | 1410 | @mcp.tool( 1411 | annotations={ 1412 | "title": "PubMed Author Sample", 1413 | "description": ( 1414 | "Get a detailed sample of works by an author from PubMed including " 1415 | "institutional affiliations, name variants, and email addresses. " 1416 | "Useful for cross-validation and institutional disambiguation." 1417 | ), 1418 | "readOnlyHint": True, 1419 | "openWorldHint": True 1420 | } 1421 | ) 1422 | async def pubmed_author_sample( 1423 | author_name: str, 1424 | sample_size: int = 5 1425 | ) -> dict: 1426 | """ 1427 | Get detailed author sample from PubMed with institutional information. 1428 | 1429 | Args: 1430 | author_name: Author name to search for (e.g., "Ivan Matic", "J Smith") 1431 | sample_size: Number of recent works to analyze in detail (default: 5, max: 10) 1432 | 1433 | Returns: 1434 | dict: Author analysis including: 1435 | - total_works: Total number of works found in PubMed 1436 | - sample_works: Detailed information for sample works 1437 | - institutional_keywords: Common institutional terms found 1438 | - name_variants: Different name formats found 1439 | - email_addresses: Email addresses extracted from affiliations 1440 | 1441 | Example usage: 1442 | # Get institutional profile for author 1443 | pubmed_author_sample("Ivan Matic", sample_size=5) 1444 | """ 1445 | # Validate parameters 1446 | sample_size = min(max(sample_size, 1), 10) # Cap at 10 for performance 1447 | 1448 | logger.info(f"🔍 PubMed author sample: '{author_name}' (sample: {sample_size})") 1449 | 1450 | result = get_pubmed_author_sample(author_name, sample_size) 1451 | return result 1452 | 1453 | 1454 | # ============================================================================ 1455 | # ORCID Integration Functions 1456 | # ============================================================================ 1457 | 1458 | async def search_orcid_by_name(name: str, affiliation: str = None, max_results: int = 10) -> dict: 1459 | """ 1460 | Search ORCID by author name and optionally affiliation. 
1461 | 1462 | Args: 1463 | name: Author name to search 1464 | affiliation: Optional affiliation to help disambiguation 1465 | max_results: Maximum number of results to return 1466 | 1467 | Returns: 1468 | dict: ORCID search results with author profiles 1469 | """ 1470 | try: 1471 | # ORCID Public API search endpoint 1472 | base_url = "https://pub.orcid.org/v3.0/search" 1473 | 1474 | # Build search query 1475 | query_parts = [] 1476 | if name: 1477 | # Split name into parts for better matching 1478 | name_parts = name.replace(",", "").split() 1479 | if len(name_parts) >= 2: 1480 | # Assume last part is family name, rest are given names 1481 | family_name = name_parts[-1] 1482 | given_names = " ".join(name_parts[:-1]) 1483 | query_parts.append(f'family-name:"{family_name}"') 1484 | query_parts.append(f'given-names:"{given_names}"') 1485 | else: 1486 | query_parts.append(f'text:"{name}"') 1487 | 1488 | if affiliation: 1489 | query_parts.append(f'affiliation-org-name:"{affiliation}"') 1490 | 1491 | query = " AND ".join(query_parts) 1492 | 1493 | params = { 1494 | 'q': query, 1495 | 'rows': min(max_results, 50), # ORCID API limit 1496 | 'start': 0 1497 | } 1498 | 1499 | headers = { 1500 | 'Accept': 'application/json', 1501 | 'User-Agent': f'alex-mcp (+{get_config()["OPENALEX_MAILTO"]})' 1502 | } 1503 | 1504 | logger.info(f"🔍 ORCID search: '{query}' (max: {max_results})") 1505 | 1506 | async with aiohttp.ClientSession() as session: 1507 | async with session.get(base_url, params=params, headers=headers) as response: 1508 | if response.status == 200: 1509 | data = await response.json() 1510 | 1511 | results = [] 1512 | for result in data.get('result', []): 1513 | orcid_id = result.get('orcid-identifier', {}).get('path', '') 1514 | 1515 | # Extract name information 1516 | person = result.get('person', {}) 1517 | names = person.get('name', {}) 1518 | given_names = names.get('given-names', {}).get('value', '') if names.get('given-names') else '' 1519 | family_name = names.get('family-name', {}).get('value', '') if names.get('family-name') else '' 1520 | 1521 | # Extract employment/affiliation info 1522 | employments = [] 1523 | employment_summaries = result.get('employment-summary', []) 1524 | for emp in employment_summaries[:3]: # Limit to top 3 1525 | org_name = emp.get('organization', {}).get('name', '') 1526 | if org_name: 1527 | employments.append(org_name) 1528 | 1529 | results.append({ 1530 | 'orcid_id': orcid_id, 1531 | 'orcid_url': f'https://orcid.org/{orcid_id}' if orcid_id else '', 1532 | 'given_names': given_names, 1533 | 'family_name': family_name, 1534 | 'full_name': f"{given_names} {family_name}".strip(), 1535 | 'employments': employments, 1536 | 'relevance_score': result.get('relevance-score', {}).get('value', 0) 1537 | }) 1538 | 1539 | logger.info(f"📊 Found {len(results)} ORCID profiles") 1540 | 1541 | return { 1542 | 'total_found': data.get('num-found', 0), 1543 | 'results_returned': len(results), 1544 | 'results': results 1545 | } 1546 | else: 1547 | logger.warning(f"ORCID API error: {response.status}") 1548 | return {'total_found': 0, 'results_returned': 0, 'results': [], 'error': f'HTTP {response.status}'} 1549 | 1550 | except Exception as e: 1551 | logger.error(f"ORCID search error: {str(e)}") 1552 | return {'total_found': 0, 'results_returned': 0, 'results': [], 'error': str(e)} 1553 | 1554 | 1555 | async def get_orcid_works(orcid_id: str, max_works: int = 20) -> dict: 1556 | """ 1557 | Get works/publications for a specific ORCID ID. 
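    Example (placeholder ORCID iD, as used in the tool docstrings below):

        works = await get_orcid_works("0000-0000-0000-0000", max_works=10)
        print(works.get("total_works"))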
1558 | 1559 | Args: 1560 | orcid_id: ORCID identifier (e.g., "0000-0000-0000-0000") 1561 | max_works: Maximum number of works to retrieve 1562 | 1563 | Returns: 1564 | dict: Works information from ORCID profile 1565 | """ 1566 | try: 1567 | # Clean ORCID ID (remove URL if present) 1568 | clean_orcid = orcid_id.replace('https://orcid.org/', '').replace('http://orcid.org/', '') 1569 | if not re.match(r'^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$', clean_orcid): 1570 | return {'error': 'Invalid ORCID format', 'works': []} 1571 | 1572 | # ORCID Public API works endpoint 1573 | url = f"https://pub.orcid.org/v3.0/{clean_orcid}/works" 1574 | 1575 | headers = { 1576 | 'Accept': 'application/json', 1577 | 'User-Agent': f'alex-mcp (+{get_config()["OPENALEX_MAILTO"]})' 1578 | } 1579 | 1580 | logger.info(f"🔍 Getting ORCID works: {clean_orcid} (max: {max_works})") 1581 | 1582 | async with aiohttp.ClientSession() as session: 1583 | async with session.get(url, headers=headers) as response: 1584 | if response.status == 200: 1585 | data = await response.json() 1586 | 1587 | works = [] 1588 | work_summaries = data.get('group', [])[:max_works] 1589 | 1590 | for group in work_summaries: 1591 | for work_summary in group.get('work-summary', []): 1592 | title_info = work_summary.get('title', {}) 1593 | title = title_info.get('title', {}).get('value', '') if title_info else '' 1594 | 1595 | journal_title = work_summary.get('journal-title', {}).get('value', '') if work_summary.get('journal-title') else '' 1596 | 1597 | # Extract publication date 1598 | pub_date = work_summary.get('publication-date') 1599 | pub_year = '' 1600 | if pub_date and pub_date.get('year'): 1601 | pub_year = pub_date['year'].get('value', '') 1602 | 1603 | # Extract external IDs (DOI, PMID, etc.) 1604 | external_ids = {} 1605 | for ext_id in work_summary.get('external-ids', {}).get('external-id', []): 1606 | id_type = ext_id.get('external-id-type', '') 1607 | id_value = ext_id.get('external-id-value', '') 1608 | if id_type and id_value: 1609 | external_ids[id_type.lower()] = id_value 1610 | 1611 | works.append({ 1612 | 'title': title, 1613 | 'journal': journal_title, 1614 | 'publication_year': pub_year, 1615 | 'external_ids': external_ids, 1616 | 'doi': external_ids.get('doi', ''), 1617 | 'pmid': external_ids.get('pmid', ''), 1618 | 'type': work_summary.get('type', '') 1619 | }) 1620 | 1621 | logger.info(f"📊 Retrieved {len(works)} works from ORCID") 1622 | 1623 | return { 1624 | 'orcid_id': clean_orcid, 1625 | 'total_works': len(works), 1626 | 'works': works 1627 | } 1628 | else: 1629 | logger.warning(f"ORCID works API error: {response.status}") 1630 | return {'error': f'HTTP {response.status}', 'works': []} 1631 | 1632 | except Exception as e: 1633 | logger.error(f"ORCID works error: {str(e)}") 1634 | return {'error': str(e), 'works': []} 1635 | 1636 | 1637 | # ============================================================================ 1638 | # ORCID MCP Tools 1639 | # ============================================================================ 1640 | 1641 | @mcp.tool( 1642 | annotations={ 1643 | "title": "Search ORCID Authors", 1644 | "description": ( 1645 | "Search ORCID database for author profiles by name and optionally affiliation. " 1646 | "Provides ORCID IDs, verified names, and institutional affiliations for " 1647 | "enhanced author disambiguation and verification." 
1648 | ), 1649 | "readOnlyHint": True, 1650 | "openWorldHint": True 1651 | } 1652 | ) 1653 | async def search_orcid_authors( 1654 | name: str, 1655 | affiliation: Optional[str] = None, 1656 | max_results: int = 10 1657 | ) -> dict: 1658 | """ 1659 | Search ORCID for author profiles by name and affiliation. 1660 | 1661 | Args: 1662 | name: Author name to search (e.g., "John Smith", "Maria Garcia") 1663 | affiliation: Optional institutional affiliation for disambiguation 1664 | max_results: Maximum number of results to return (default: 10, max: 50) 1665 | 1666 | Returns: 1667 | dict: ORCID search results with: 1668 | - total_found: Total number of matches found 1669 | - results_returned: Number of results returned 1670 | - results: List of author profiles with ORCID IDs, names, and affiliations 1671 | 1672 | Example usage: 1673 | # Basic name search 1674 | search_orcid_authors("John Smith") 1675 | 1676 | # Search with affiliation for better disambiguation 1677 | search_orcid_authors("Maria Garcia", "University of Barcelona") 1678 | """ 1679 | # Validate parameters 1680 | max_results = min(max(max_results, 1), 50) # ORCID API limit 1681 | 1682 | result = await search_orcid_by_name(name, affiliation, max_results) 1683 | return result 1684 | 1685 | 1686 | @mcp.tool( 1687 | annotations={ 1688 | "title": "Get ORCID Works", 1689 | "description": ( 1690 | "Retrieve publications/works from a specific ORCID profile. " 1691 | "Useful for cross-validation with OpenAlex data and verifying " 1692 | "author publication records." 1693 | ), 1694 | "readOnlyHint": True, 1695 | "openWorldHint": True 1696 | } 1697 | ) 1698 | async def get_orcid_publications( 1699 | orcid_id: str, 1700 | max_works: int = 20 1701 | ) -> dict: 1702 | """ 1703 | Get publications/works from an ORCID profile. 1704 | 1705 | Args: 1706 | orcid_id: ORCID identifier (e.g., "0000-0000-0000-0000" or full URL) 1707 | max_works: Maximum number of works to retrieve (default: 20, max: 100) 1708 | 1709 | Returns: 1710 | dict: Publications data with: 1711 | - orcid_id: Cleaned ORCID identifier 1712 | - total_works: Number of works found 1713 | - works: List of publications with titles, journals, DOIs, PMIDs 1714 | 1715 | Example usage: 1716 | # Get works for specific ORCID 1717 | get_orcid_publications("0000-0000-0000-0000") 1718 | 1719 | # Get limited number of works 1720 | get_orcid_publications("0000-0000-0000-0000", max_works=10) 1721 | """ 1722 | # Validate parameters 1723 | max_works = min(max(max_works, 1), 100) # Reasonable limit 1724 | 1725 | result = await get_orcid_works(orcid_id, max_works) 1726 | return result 1727 | 1728 | 1729 | def main(): 1730 | """ 1731 | Entry point for the enhanced alex-mcp server with balanced peer-review filtering. 1732 | """ 1733 | # FastMCP's run() is synchronous and manages its own event loop. 1734 | logger.info("Enhanced OpenAlex Author Disambiguation MCP Server starting...") 1735 | logger.info("Features: ~70% token reduction for authors, ~80% for works") 1736 | logger.info("Balanced peer-review filtering: excludes data catalogs while preserving legitimate papers") 1737 | mcp.run() 1738 | 1739 | 1740 | if __name__ == "__main__": 1741 | main() --------------------------------------------------------------------------------