├── src
│   └── alex_mcp
│       ├── __init__.py
│       ├── data_objects.py
│       └── server.py
├── img
│   ├── oam_logo_avatar.png
│   └── oam_logo_rectangular.png
├── alex-mcp-wrapper.sh
├── requirements.txt
├── LICENSE
├── pyproject.toml
├── examples
│   ├── test_institution_resolution.py
│   └── test_author_disambiguation.py
├── setup.py
├── .gitignore
├── INSTALL.md
└── README.md
/src/alex_mcp/__init__.py:
--------------------------------------------------------------------------------
1 | """OpenAlex MCP Server."""
2 | __version__ = "4.1.0"
--------------------------------------------------------------------------------
/img/oam_logo_avatar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drAbreu/alex-mcp/HEAD/img/oam_logo_avatar.png
--------------------------------------------------------------------------------
/img/oam_logo_rectangular.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/drAbreu/alex-mcp/HEAD/img/oam_logo_rectangular.png
--------------------------------------------------------------------------------
/alex-mcp-wrapper.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Wrapper script for alex-mcp that activates the virtual environment
3 |
4 | # Get the directory where this script is located
5 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
6 |
7 | # Activate the virtual environment
8 | source "$SCRIPT_DIR/venv/bin/activate"
9 |
10 | # Run the MCP server
11 | exec python -m alex_mcp.server "$@"
12 |
--------------------------------------------------------------------------------
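
An MCP client runs this wrapper as a stdio subprocess and speaks JSON-RPC over its stdin/stdout. A minimal sketch of such a launch from Python (the path is hypothetical; assumes the `venv/` directory the script sources actually exists next to it):

```python
import os
import subprocess

# Spawn the wrapper; MCP traffic is JSON-RPC over stdin/stdout.
proc = subprocess.Popen(
    ["/path/to/alex-mcp/alex-mcp-wrapper.sh"],  # hypothetical path
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    env={**os.environ, "OPENALEX_MAILTO": "your-email@domain.com"},
)
```
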
/requirements.txt:
--------------------------------------------------------------------------------
1 | # OpenAlex + PubMed Hybrid Author Disambiguation MCP Server Dependencies
2 | # Following MCP best practices with FastMCP
3 |
 4 | # MCP SDK and FastMCP (server.py imports `from fastmcp import FastMCP`)
 5 | mcp>=1.2.0
 6 | fastmcp>=2.8.1
6 |
7 | # HTTP client for OpenAlex API and ORCID integration
8 | httpx>=0.25.0
9 | aiohttp>=3.8.0
10 |
11 | # Optional: For enhanced logging and debugging
12 | rich>=13.0.0
13 |
14 | # OpenAlex API wrapper
15 | pyalex==0.18
16 |
17 | # PubMed API integration
18 | biopython>=1.83
19 | requests>=2.31.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Author Disambiguation MCP Server
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "alex-mcp"
7 | version = "4.8.2"
8 | description = "MCP server for OpenAlex academic research API"
9 | authors = [{name = "Jorge Abreu Vicente", email = "jorge.abreu@embo.org"}]
10 | license = {text = "MIT"}
11 | readme = "README.md"
12 | requires-python = ">=3.10"
13 | classifiers = [
14 | "Development Status :: 3 - Alpha",
15 | "Intended Audience :: Developers",
16 | "License :: OSI Approved :: MIT License",
17 | "Programming Language :: Python :: 3",
18 | "Programming Language :: Python :: 3.10",
19 | "Programming Language :: Python :: 3.11",
20 | "Programming Language :: Python :: 3.12",
21 | ]
22 | dependencies = [
23 | "fastmcp>=2.8.1",
24 | "httpx>=0.28.1",
25 | "pydantic>=2.7.2",
26 | "rich>=13.9.4",
27 | "pyalex==0.18",
28 | "aiohttp>=3.8.0"
29 | ]
30 |
31 | [project.urls]
32 | Homepage = "https://github.com/drAbreu/alex-mcp"
33 | Repository = "https://github.com/drAbreu/alex-mcp"
34 | Issues = "https://github.com/drAbreu/alex-mcp/issues"
35 |
36 | [project.scripts]
37 | alex-mcp = "alex_mcp.server:main"
38 |
39 | [tool.setuptools.packages.find]
40 | where = ["src"]
41 | include = ["alex_mcp*"]
42 |
43 |
--------------------------------------------------------------------------------
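
The `[project.scripts]` table above exposes an `alex-mcp` console command once the package is installed. A minimal sketch of the programmatic equivalent (assumes `pip install -e .` has been run and `OPENALEX_MAILTO` is exported):

```python
# Equivalent of the `alex-mcp` console script declared in [project.scripts].
from alex_mcp.server import main

main()
```
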
/examples/test_institution_resolution.py:
--------------------------------------------------------------------------------
1 | """
2 | Test suite for resolve_institution using the MCP server and pyalex.
3 | Focus: EMBO, MPIA, IRAM.
4 | """
5 |
6 | import pytest
7 | import pyalex
8 |
9 | pyalex.config.email = "test@example.com"
10 | pyalex.config.max_retries = 2
11 | pyalex.config.retry_backoff_factor = 0.1
12 | pyalex.config.retry_http_codes = [429, 500, 503]
13 |
14 | import sys
15 | import os
16 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
17 | from src.alex_mcp.server import _resolve_institution_impl as resolve_institution
18 |
19 | def test_resolve_institution_embo():
20 | result = resolve_institution("EMBO")
21 | assert result["best_match"] is not None
 22 |     assert "i1303691731" in result["best_match"]["id"].lower()
23 |
24 | def test_resolve_institution_mpia():
25 | result = resolve_institution("MPIA")
26 | assert result["best_match"] is not None
 27 |     assert "i4210109156" in result["best_match"]["id"].lower()
28 |
29 | def test_resolve_institution_iram():
30 | result = resolve_institution("IRAM")
31 | assert result["best_match"] is not None
 32 |     assert "i4210096876" in result["best_match"]["id"].lower()
33 |
--------------------------------------------------------------------------------
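
A sketch of running this suite programmatically from the repository root (assumes pytest is installed and `OPENALEX_MAILTO` is exported, since importing the server module exits without it; the tests hit the live OpenAlex API):

```python
import pytest

# Runs the three institution-resolution tests above.
pytest.main(["examples/test_institution_resolution.py", "-v"])
```
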
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r", encoding="utf-8") as fh:
4 | long_description = fh.read()
5 |
6 | setup(
7 | name="alex-mcp",
8 | version="4.2.5",
9 | author="OpenAlex MCP Team",
10 | description="OpenAlex Author Disambiguation MCP Server",
11 | long_description=long_description,
12 | long_description_content_type="text/markdown",
13 | url="https://github.com/drAbreu/alex-mcp",
14 | package_dir={"": "src"},
15 | packages=find_packages(where="src"),
16 | classifiers=[
17 | "Development Status :: 4 - Beta",
18 | "Intended Audience :: Science/Research",
19 | "License :: OSI Approved :: MIT License",
20 | "Operating System :: OS Independent",
21 | "Programming Language :: Python :: 3",
22 | "Programming Language :: Python :: 3.10",
23 | "Programming Language :: Python :: 3.11",
24 | "Programming Language :: Python :: 3.12",
25 | ],
 26 |     python_requires=">=3.10",  # Matches the Python versions in classifiers
27 | install_requires=[
28 | "fastmcp>=2.8.1",
29 | "httpx>=0.28.1",
30 | "pydantic>=2.7.2",
31 | "rich>=13.9.4",
32 | "pyalex==0.18",
33 | ],
34 | entry_points={
35 | "console_scripts": [
36 | "alex-mcp=alex_mcp.server:main",
37 | ],
38 | },
39 | include_package_data=True,
40 | zip_safe=False,
41 | )
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.ipynb
2 |
3 | # Python
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 | *.so
8 | .Python
9 | build/
10 | develop-eggs/
11 | dist/
12 | downloads/
13 | eggs/
14 | .eggs/
15 | lib/
16 | lib64/
17 | parts/
18 | sdist/
19 | var/
20 | wheels/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | MANIFEST
25 |
26 | # PyInstaller
27 | *.manifest
28 | *.spec
29 |
30 | # Installer logs
31 | pip-log.txt
32 | pip-delete-this-directory.txt
33 |
34 | # Unit test / coverage reports
35 | htmlcov/
36 | .tox/
37 | .nox/
38 | .coverage
39 | .coverage.*
40 | .cache
41 | nosetests.xml
42 | coverage.xml
43 | *.cover
44 | .hypothesis/
45 | .pytest_cache/
46 |
47 | # Translations
48 | *.mo
49 | *.pot
50 |
51 | # Django stuff:
52 | *.log
53 | local_settings.py
54 | db.sqlite3
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # Jupyter Notebook
70 | .ipynb_checkpoints
71 |
72 | # IPython
73 | profile_default/
74 | ipython_config.py
75 |
76 | # pyenv
77 | .python-version
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # SageMath parsed files
83 | *.sage.py
84 |
85 | # Environments
86 | .env
87 | .venv
88 | env/
89 | venv/
90 | ENV/
91 | env.bak/
92 | venv.bak/
93 |
94 | # Spyder project settings
95 | .spyderproject
96 | .spyproject
97 |
98 | # Rope project settings
99 | .ropeproject
100 |
101 | # mkdocs documentation
102 | /site
103 |
104 | # mypy
105 | .mypy_cache/
106 | .dmypy.json
107 | dmypy.json
108 |
109 | # Pyre type checker
110 | .pyre/
111 |
112 | # IDEs
113 | .vscode/
114 | .idea/
115 | *.swp
116 | *.swo
117 | *~
118 |
119 | # OS
120 | .DS_Store
121 | .DS_Store?
122 | ._*
123 | .Spotlight-V100
124 | .Trashes
125 | ehthumbs.db
126 | Thumbs.db
127 |
128 | # Project specific
129 | *.json
130 | !package.json
131 | !tsconfig.json
132 | test_results/
133 | logs/
134 | temp/
135 |
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
1 | # OpenAlex MCP Server Installation Guide
2 |
3 | This guide provides instructions for installing and running the OpenAlex MCP server.
4 |
5 | ## Prerequisites
6 |
7 | - Python 3.10 or higher
8 | - pip (Python package installer)
9 |
10 | ## Installation
11 |
12 | 1. Clone the repository:
13 | ```bash
14 | git clone https://github.com/drAbreu/alex-mcp.git
15 | cd alex-mcp
16 | ```
17 |
18 | 2. Create a virtual environment:
19 | ```bash
20 | python3 -m venv venv
21 | ```
22 |
23 | 3. Activate the virtual environment:
24 | ```bash
25 | source venv/bin/activate # On Windows: venv\Scripts\activate
26 | ```
27 |
28 | 4. Install the package in development mode:
29 | ```bash
30 | pip install -e .
31 | ```
32 |
33 | ## Running the Server
34 |
35 | ### Option 1: Using the wrapper script
36 |
37 | The easiest way to run the server is with the provided wrapper script:
38 |
39 | ```bash
40 | ./alex-mcp-wrapper.sh
41 | ```
42 |
43 | This script activates the virtual environment and runs the server.
44 |
45 | ### Option 2: Manual execution
46 |
47 | 1. Activate the virtual environment:
48 | ```bash
49 | source venv/bin/activate # On Windows: venv\Scripts\activate
50 | ```
51 |
52 | 2. Run the server:
53 | ```bash
54 | python -m alex_mcp.server
55 | ```
56 |
57 | ## Using with Claude Desktop
58 |
59 | To use this MCP server with Claude Desktop, add the following configuration:
60 |
61 | ```json
62 | {
63 | "mcpServers": {
64 | "alex-mcp": {
65 |       "command": "/path/to/alex-mcp/alex-mcp-wrapper.sh"
66 | }
67 | }
68 | }
69 | ```
70 |
71 | Replace `/path/to/alex-mcp` with the actual path to the repository on your system.
72 |
73 | ## Available Tools
74 |
75 | The OpenAlex MCP server provides the following tools (a usage sketch follows the list):
76 |
77 | 1. **disambiguate_author**: Disambiguate an author using OpenAlex's ML-powered disambiguation system.
78 | 2. **search_authors**: Search for authors with advanced filtering capabilities.
79 | 3. **get_author_profile**: Get detailed author profile by OpenAlex ID.
80 | 4. **resolve_institution**: Resolve institution name or abbreviation to full OpenAlex data.
81 |
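The example suites under `examples/` exercise these tools by importing their implementation functions directly. A minimal sketch following those imports (requires `OPENALEX_MAILTO` to be exported):

```python
from alex_mcp.server import _resolve_institution_impl as resolve_institution

result = resolve_institution("EMBO")
if result["best_match"] is not None:
    print(result["best_match"]["id"])  # an https://openalex.org/I... identifier
```
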
82 | ## Troubleshooting
83 |
84 | If you encounter any issues, make sure:
85 |
86 | 1. You're using Python 3.10 or higher
87 | 2. The virtual environment is activated
88 | 3. All dependencies are installed correctly
89 |
90 | For more information, see the [README.md](README.md) file.
91 |
--------------------------------------------------------------------------------
/examples/test_author_disambiguation.py:
--------------------------------------------------------------------------------
1 | """
2 | Test suite for disambiguate_author using the MCP server and pyalex.
3 | Focus: Fiona M. Watt and Jorge Abreu Vicente.
4 | """
5 |
6 | import pytest
7 | import pyalex
8 |
9 | pyalex.config.email = "test@example.com"
10 | pyalex.config.max_retries = 2
11 | pyalex.config.retry_backoff_factor = 0.1
12 | pyalex.config.retry_http_codes = [429, 500, 503]
13 |
14 | import sys
15 | import os
16 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
17 | from src.alex_mcp.server import _disambiguate_author_impl as disambiguate_author
18 |
19 | def test_disambiguate_fiona_watt_name_only():
20 | result = disambiguate_author(name="Fiona M Watt")
21 | print(f"Disambiguation result for Fiona M Watt: {result}")
22 | assert result["most_likely"] is not None
23 | assert "A5068471552" in result["most_likely"]["author"]["id"]
24 |
25 | def test_disambiguate_fiona_watt_with_institution():
26 | result = disambiguate_author(name="Fiona M Watt", affiliation="EMBO")
27 | print(f"Disambiguation result for Fiona M Watt: {result}")
28 | assert result["most_likely"] is not None
29 | assert "A5068471552" in result["most_likely"]["author"]["id"]
30 |
31 | def test_disambiguate_fiona_watt_with_topic():
32 | result = disambiguate_author(name="Fiona M Watt", research_field="Stem Cells")
33 | print(f"Disambiguation result for Fiona M Watt: {result}")
34 | assert result["most_likely"] is not None
35 | assert "A5068471552" in result["most_likely"]["author"]["id"]
36 |
37 | def test_disambiguate_jorge_abreu_name_only():
38 | result = disambiguate_author(name="Jorge Abreu Vicente")
39 | print(f"Disambiguation result for J. Abreu-Vicente: {result}")
40 | assert result["most_likely"] is not None
41 | assert "A5058921480" in result["most_likely"]["author"]["id"]
42 |
43 | def test_disambiguate_jorge_abreu_with_institution():
44 | result = disambiguate_author(name="Jorge Abreu Vicente", affiliation="MPIA")
45 | print(f"Disambiguation result for J. Abreu-Vicente: {result}")
46 | assert result["most_likely"] is not None
47 | assert "A5058921480" in result["most_likely"]["author"]["id"]
48 |
49 | def test_disambiguate_jorge_abreu_with_topic():
50 | result = disambiguate_author(name="Jorge Abreu Vicente", research_field="molecular clouds")
51 | print(f"Disambiguation result for J. Abreu-Vicente: {result}")
52 | assert result["most_likely"] is not None
53 | assert "A5058921480" in result["most_likely"]["author"]["id"]
54 |
--------------------------------------------------------------------------------
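
The assertions above imply the result shape: a dict whose `most_likely` entry, when not `None`, wraps an `author` object carrying an OpenAlex `id`. A minimal consumer sketch under that assumption, reusing the aliased import from the test file:

```python
result = disambiguate_author(name="Fiona M Watt", affiliation="EMBO")
best = result["most_likely"]
if best is not None:
    print(best["author"]["id"])  # e.g. https://openalex.org/A5068471552
```
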
/src/alex_mcp/data_objects.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Optimized data models for the OpenAlex MCP server.
4 |
5 | Streamlined versions focusing on essential information for author disambiguation
6 | and work retrieval while minimizing token usage. Enhanced to preserve comprehensive
7 | ID information (DOI, PMID, PMCID, OpenAlex, MAG).
8 | """
9 |
10 | from typing import List, Optional, Dict, Any
11 | from datetime import datetime
12 | from pydantic import BaseModel, Field
13 |
14 |
15 | class WorkIDs(BaseModel):
16 | """
17 | Comprehensive work identifiers from OpenAlex.
18 |
19 | Preserves all available identifiers for cross-database linkage.
20 | """
21 | openalex: Optional[str] = None
22 | doi: Optional[str] = None
23 | pmid: Optional[str] = None
24 | pmcid: Optional[str] = None
25 | mag: Optional[str] = None
26 |
27 |
28 | class OptimizedAuthorResult(BaseModel):
29 | """
30 | Streamlined author representation focusing on disambiguation essentials.
31 |
32 | Reduces token usage by ~70% compared to full OpenAlex author object.
33 | """
34 | id: str
35 | display_name: str
36 | orcid: Optional[str] = None
37 | display_name_alternatives: Optional[List[str]] = None
38 |
39 | # Simplified affiliations - just institution names as strings
40 | current_affiliations: Optional[List[str]] = None
41 | past_affiliations: Optional[List[str]] = None
42 |
43 | # Key metrics for research impact
44 | cited_by_count: int = 0
45 | works_count: int = 0
46 | h_index: Optional[int] = None
47 | i10_index: Optional[int] = None
48 |
49 | # Research fields (simplified)
50 | research_fields: Optional[List[str]] = None
51 |
52 | # Basic metadata
53 | last_known_institutions: Optional[List[str]] = None
54 | countries: Optional[List[str]] = None
55 |
56 | # For API access
57 | works_api_url: Optional[str] = None
58 |
59 |
60 | class OptimizedWorkResult(BaseModel):
61 | """
62 | Streamlined work representation focusing on essential publication info.
63 |
64 | Reduces token usage by ~80% compared to full OpenAlex work object while
65 | preserving comprehensive identifier information.
66 | """
67 | id: str
68 | title: Optional[str] = None
69 | doi: Optional[str] = None # Kept for backward compatibility
70 | publication_year: Optional[int] = None
71 | type: Optional[str] = None # journal-article, book-chapter, etc.
72 |
73 | # COMPREHENSIVE ID INFORMATION - This was missing!
74 | ids: Optional[WorkIDs] = None
75 |
76 | # Citation metrics
77 | cited_by_count: Optional[int] = 0
78 |
79 | # Publication venue (simplified)
80 | journal_name: Optional[str] = None
81 | journal_issn: Optional[str] = None
82 | publisher: Optional[str] = None
83 |
84 | # Open access info (simplified)
85 | is_open_access: Optional[bool] = None
86 |
87 | # Author info (minimal)
88 | author_count: Optional[int] = None
89 | first_author: Optional[str] = None
90 | corresponding_author: Optional[str] = None
91 |
92 | # Research categorization (simplified)
93 | primary_field: Optional[str] = None
94 | concepts: Optional[List[str]] = None
95 |
96 |
97 | class OptimizedSearchResponse(BaseModel):
98 | """
99 | Streamlined search response.
100 | """
101 | query: str
102 | total_count: int
103 | results: List[OptimizedAuthorResult]
104 | search_time: Optional[datetime] = Field(default_factory=datetime.now)
105 |
106 |
107 | class OptimizedWorksSearchResponse(BaseModel):
108 | """
109 | Streamlined works search response for author works.
110 | """
111 | author_id: str
112 | author_name: Optional[str] = None
113 | total_count: int
114 | results: List[OptimizedWorkResult]
115 | search_time: Optional[datetime] = Field(default_factory=datetime.now)
116 | filters: Optional[Dict[str, Any]] = None
117 |
118 |
119 | class OptimizedGeneralWorksSearchResponse(BaseModel):
120 | """
121 | Streamlined works search response for general work searches.
122 | """
123 | query: str
124 | total_count: int
125 | results: List[OptimizedWorkResult]
126 | search_time: Optional[datetime] = Field(default_factory=datetime.now)
127 | filters: Optional[Dict[str, Any]] = None
128 |
129 |
130 | class AutocompleteAuthorCandidate(BaseModel):
131 | """
132 | A single author candidate from autocomplete API.
133 |
134 | Optimized for fast disambiguation with essential context.
135 | """
136 | openalex_id: str
137 | display_name: str
138 | institution_hint: Optional[str] = None # Current/last known institution
139 | works_count: int = 0
140 | cited_by_count: int = 0
141 | entity_type: str = "author"
142 | external_id: Optional[str] = None # ORCID or other external ID
143 |
144 |
145 | class AutocompleteAuthorsResponse(BaseModel):
146 | """
147 | Response model for author autocomplete with multiple candidates.
148 |
149 | Enables intelligent disambiguation by providing multiple options
150 | with institutional context and research metrics.
151 | """
152 | query: str
153 | context: Optional[str] = None
154 | total_candidates: int
155 | candidates: List[AutocompleteAuthorCandidate]
156 | search_metadata: Dict[str, Any] = Field(default_factory=dict)
157 |
158 |
159 | def extract_institution_names(affiliations: List[Dict[str, Any]]) -> tuple[List[str], List[str]]:
160 | """
161 | Extract and categorize institution names from OpenAlex affiliation objects.
162 |
163 | Returns:
164 | tuple: (current_affiliations, past_affiliations)
165 | """
166 | current = []
167 | past = []
168 |
169 | if not affiliations:
170 | return current, past
171 |
172 | for affiliation in affiliations:
173 | institution = affiliation.get('institution', {})
174 | if not institution:
175 | continue
176 |
177 | institution_name = institution.get('display_name')
178 | if not institution_name:
179 | continue
180 |
181 | # Determine if current or past based on years
182 | years = affiliation.get('years', [])
183 | if years:
184 | current_year = datetime.now().year
185 | # Consider current if active in last 3 years
186 | if max(years) >= current_year - 3:
187 | current.append(institution_name)
188 | else:
189 | past.append(institution_name)
190 | else:
191 | # Default to current if no year info
192 | current.append(institution_name)
193 |
194 | return current, past
195 |
196 |
197 | def extract_research_fields(concepts_or_topics: List[Dict[str, Any]]) -> List[str]:
198 | """
199 | Extract research field names from concepts or topics.
200 |
201 | Args:
202 | concepts_or_topics: List of concept/topic objects from OpenAlex
203 |
204 | Returns:
205 | List of field names, limited to top 5 most relevant
206 | """
207 | fields = []
208 |
209 | if not concepts_or_topics:
210 | return fields
211 |
212 | # Sort by score/level and take top fields
213 | sorted_items = sorted(
214 | concepts_or_topics,
215 | key=lambda x: x.get('score', 0) or x.get('count', 0),
216 | reverse=True
217 | )
218 |
219 | for item in sorted_items[:5]: # Limit to top 5
220 | name = item.get('display_name')
221 | if name:
222 | fields.append(name)
223 |
224 | return fields
225 |
226 |
227 | def extract_journal_info(locations: List[Dict[str, Any]]) -> tuple[Optional[str], Optional[str], Optional[str]]:
228 | """
229 | Extract journal information from OpenAlex locations.
230 |
231 | Returns:
232 | tuple: (journal_name, journal_issn, publisher)
233 | """
234 | if not locations:
235 | return None, None, None
236 |
237 | # Look for primary location (usually first) or journal location
238 | for location in locations:
239 | source = location.get('source', {})
240 | if source and source.get('type') == 'journal':
241 | journal_name = source.get('display_name')
242 | issn = None
243 | if source.get('issn'):
244 | issn = source['issn'][0] if isinstance(source['issn'], list) else source['issn']
245 |
246 | publisher = source.get('host_organization_name')
247 | return journal_name, issn, publisher
248 |
249 | # Fallback to first location
250 | if locations:
251 | source = locations[0].get('source', {})
252 | if source:
253 | return source.get('display_name'), None, source.get('host_organization_name')
254 |
255 | return None, None, None
256 |
257 |
258 | def extract_authorship_info(authorships: List[Dict[str, Any]]) -> tuple[Optional[int], Optional[str], Optional[str]]:
259 | """
260 | Extract simplified authorship information.
261 |
262 | Returns:
263 | tuple: (author_count, first_author, corresponding_author)
264 | """
265 | if not authorships:
266 | return None, None, None
267 |
268 | author_count = len(authorships)
269 | first_author = None
270 | corresponding_author = None
271 |
272 | # Find first author (author_position == 'first')
273 | for authorship in authorships:
274 | if authorship.get('author_position') == 'first':
275 | author = authorship.get('author', {})
276 | first_author = author.get('display_name')
277 | break
278 |
279 | # Find corresponding author
280 | for authorship in authorships:
281 | if authorship.get('is_corresponding'):
282 | author = authorship.get('author', {})
283 | corresponding_author = author.get('display_name')
284 | break
285 |
286 | return author_count, first_author, corresponding_author
287 |
288 |
289 | def extract_comprehensive_ids(work_data: Dict[str, Any]) -> WorkIDs:
290 | """
291 | Extract comprehensive identifier information from OpenAlex work data.
292 |
293 | This was the missing piece! OpenAlex provides comprehensive IDs in the 'ids' object.
294 |
295 | Args:
296 | work_data: Full OpenAlex work object
297 |
298 | Returns:
299 | WorkIDs object with all available identifiers
300 | """
301 | ids_data = work_data.get('ids', {})
302 |
303 | # Extract all available IDs
304 | openalex_id = ids_data.get('openalex') or work_data.get('id')
305 | doi = ids_data.get('doi') or work_data.get('doi') # Fallback to standalone doi
306 | pmid = ids_data.get('pmid')
307 | pmcid = ids_data.get('pmcid')
308 | mag = ids_data.get('mag')
309 |
310 | return WorkIDs(
311 | openalex=openalex_id,
312 | doi=doi,
313 | pmid=pmid,
314 | pmcid=pmcid,
315 | mag=mag
316 | )
317 |
318 |
319 | def optimize_author_data(author_data: Dict[str, Any]) -> OptimizedAuthorResult:
320 | """
321 | Convert full OpenAlex author object to optimized version.
322 |
323 | Args:
324 | author_data: Full OpenAlex author object
325 |
326 | Returns:
327 | OptimizedAuthorResult with essential information only
328 | """
329 | # Extract basic info
330 | author_id = author_data.get('id', '')
331 | display_name = author_data.get('display_name', '')
332 | orcid = author_data.get('orcid')
333 | alternatives = author_data.get('display_name_alternatives', [])
334 |
335 | # Process affiliations
336 | affiliations = author_data.get('affiliations', [])
337 | current_affiliations, past_affiliations = extract_institution_names(affiliations)
338 |
339 | # Extract metrics
340 | cited_by_count = author_data.get('cited_by_count', 0)
341 | works_count = author_data.get('works_count', 0)
342 |
343 | # Extract summary stats
344 | summary_stats = author_data.get('summary_stats', {})
345 | h_index = summary_stats.get('h_index')
346 | i10_index = summary_stats.get('i10_index')
347 |
348 | # Extract research fields from concepts or topics
349 | research_fields = []
350 | concepts = author_data.get('x_concepts', []) or author_data.get('topics', [])
351 | research_fields = extract_research_fields(concepts)
352 |
353 | # Extract geographic info
354 | countries = []
355 | if affiliations:
356 | for affiliation in affiliations:
357 | institution = affiliation.get('institution', {})
358 | country = institution.get('country_code')
359 | if country and country not in countries:
360 | countries.append(country)
361 |
362 | # API URL
363 | works_api_url = author_data.get('works_api_url')
364 |
365 | return OptimizedAuthorResult(
366 | id=author_id,
367 | display_name=display_name,
368 | orcid=orcid,
369 | display_name_alternatives=alternatives[:3] if alternatives else None, # Limit alternatives
370 | current_affiliations=current_affiliations[:3] if current_affiliations else None, # Limit to 3 most recent
371 | past_affiliations=past_affiliations[:3] if past_affiliations else None, # Limit to 3 most recent
372 | cited_by_count=cited_by_count,
373 | works_count=works_count,
374 | h_index=h_index,
375 | i10_index=i10_index,
376 | research_fields=research_fields[:5] if research_fields else None, # Top 5 fields
377 | last_known_institutions=current_affiliations[:2] if current_affiliations else past_affiliations[:2],
378 | countries=countries[:3] if countries else None, # Limit countries
379 | works_api_url=works_api_url
380 | )
381 |
382 |
383 | def optimize_work_data(work_data: Dict[str, Any]) -> OptimizedWorkResult:
384 | """
385 | Convert full OpenAlex work object to optimized version.
386 |
387 | NOW INCLUDES COMPREHENSIVE ID EXTRACTION!
388 |
389 | Args:
390 | work_data: Full OpenAlex work object
391 |
392 | Returns:
393 | OptimizedWorkResult with essential information AND comprehensive IDs
394 | """
395 | # Basic work info
396 | work_id = work_data.get('id', '')
397 | title = work_data.get('title')
398 | doi = work_data.get('doi') # Kept for backward compatibility
399 | publication_year = work_data.get('publication_year')
400 | work_type = work_data.get('type')
401 |
402 | # EXTRACT COMPREHENSIVE IDS - This is the fix!
403 | comprehensive_ids = extract_comprehensive_ids(work_data)
404 |
405 | # Citation metrics
406 | cited_by_count = work_data.get('cited_by_count', 0)
407 |
408 | # Journal information
409 | locations = work_data.get('locations', [])
410 | journal_name, journal_issn, publisher = extract_journal_info(locations)
411 |
412 | # Open access info
413 | open_access = work_data.get('open_access', {})
414 | is_open_access = open_access.get('is_oa') if open_access else None
415 |
416 | # Authorship info
417 | authorships = work_data.get('authorships', [])
418 | author_count, first_author, corresponding_author = extract_authorship_info(authorships)
419 |
420 | # Research categorization
421 | primary_topic = work_data.get('primary_topic', {})
422 | primary_field = primary_topic.get('display_name') if primary_topic else None
423 |
424 | # Simplified concepts (top 3)
425 | concepts = work_data.get('concepts', [])
426 | concept_names = []
427 | if concepts:
428 | sorted_concepts = sorted(concepts, key=lambda x: x.get('score', 0), reverse=True)
429 | concept_names = [c.get('display_name') for c in sorted_concepts[:3] if c.get('display_name')]
430 |
431 | return OptimizedWorkResult(
432 | id=work_id,
433 | title=title,
434 | doi=doi,
435 | publication_year=publication_year,
436 | type=work_type,
437 | ids=comprehensive_ids,
438 | cited_by_count=cited_by_count,
439 | journal_name=journal_name,
440 | journal_issn=journal_issn,
441 | publisher=publisher,
442 | is_open_access=is_open_access,
443 | author_count=author_count,
444 | first_author=first_author,
445 | corresponding_author=corresponding_author,
446 | primary_field=primary_field,
447 | concepts=concept_names if concept_names else None
448 | )
--------------------------------------------------------------------------------
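
A minimal sketch of `optimize_work_data` on a hand-built, OpenAlex-shaped dict (field names follow the extraction helpers above; the values are illustrative):

```python
from alex_mcp.data_objects import optimize_work_data

sample = {
    "id": "https://openalex.org/W0000000000",
    "title": "An example paper",
    "type": "article",
    "publication_year": 2024,
    "cited_by_count": 3,
    "ids": {"openalex": "https://openalex.org/W0000000000",
            "doi": "https://doi.org/10.0000/example"},
    "locations": [{"source": {"type": "journal",
                              "display_name": "Example Journal",
                              "issn": ["0000-0000"],
                              "host_organization_name": "Example Press"}}],
    "open_access": {"is_oa": True},
    "authorships": [{"author_position": "first", "is_corresponding": True,
                     "author": {"display_name": "A. Author"}}],
    "concepts": [{"display_name": "Astrophysics", "score": 0.9}],
}

work = optimize_work_data(sample)
print(work.ids.doi, work.journal_name, work.first_author)
```
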
/README.md:
--------------------------------------------------------------------------------
1 |
2 | ![Logo](https://raw.githubusercontent.com/drAbreu/alex-mcp/4.0.0//img/oam_logo_rectangular.png)
3 |
4 | # OpenAlex Author Disambiguation MCP Server
5 |
6 | [](https://modelcontextprotocol.io/)
7 | [](https://python.org)
8 | [](https://openalex.org)
9 | [](LICENSE)
10 | [](https://github.com/drAbreu/alex-mcp)
11 |
12 |
13 | A **streamlined** Model Context Protocol (MCP) server for author disambiguation and academic research using the OpenAlex.org API, designed for AI agents with optimized data structures and enhanced functionality.
14 |
15 | ---
16 |
17 | ## 🎯 Key Features
18 |
19 | ### 🔍 **Core Capabilities**
20 | - **Advanced Author Disambiguation**: Handles complex career transitions and name variations
21 | - **Institution Resolution**: Current and past affiliations with transition tracking
22 | - **Academic Work Retrieval**: Journal articles, letters, and research papers
23 | - **Citation Analysis**: H-index, citation counts, and impact metrics
24 | - **ORCID Integration**: Highest accuracy matching with ORCID identifiers
25 |
26 | ### 🚀 **AI Agent Optimized**
27 | - **Streamlined Data**: Focused on essential information for disambiguation
28 | - **Fast Processing**: Optimized data structures for rapid analysis
29 | - **Smart Filtering**: Enhanced filtering options for targeted queries
30 | - **Clean Output**: Structured responses optimized for AI reasoning
31 |
32 | ### 🤖 **Agent Integration**
33 | - **Multiple Candidates**: Ranked results for automated decision-making
34 | - **Structured Responses**: Clean, parseable output optimized for LLMs
35 | - **Error Handling**: Graceful degradation with informative messages
36 | - **Enhanced Filtering**: Journal-only, citation thresholds, and temporal filters
37 |
38 | ### 🏛️ **Professional Grade**
39 | - **MCP Best Practices**: Built with FastMCP following official guidelines
40 | - **Tool Annotations**: Proper MCP tool annotations for optimal client integration
41 | - **Resource Management**: Efficient HTTP client management and cleanup
42 | - **Rate Limiting**: Respectful API usage with proper delays
43 |
44 | ---
45 |
46 | ## 🚀 Quick Start
47 |
48 | ### Prerequisites
49 |
50 | - Python 3.10 or higher
51 | - MCP-compatible client (e.g., Claude Desktop)
52 | - Email address (for OpenAlex API courtesy)
53 |
54 | ### Installation
55 |
56 | For detailed installation instructions, see [INSTALL.md](INSTALL.md).
57 |
58 | 1. **Clone the repository:**
59 | ```bash
60 | git clone https://github.com/drAbreu/alex-mcp.git
61 | cd alex-mcp
62 | ```
63 |
64 | 2. **Create a virtual environment:**
65 | ```bash
66 | python3 -m venv venv
67 | source venv/bin/activate # On Windows: venv\Scripts\activate
68 | ```
69 |
70 | 3. **Install the package:**
71 | ```bash
72 | pip install -e .
73 | ```
74 |
75 | 4. **Configure environment:**
76 | ```bash
77 | export OPENALEX_MAILTO=your-email@domain.com
78 | ```
79 |
80 | 5. **Run the server:**
81 | ```bash
82 | ./alex-mcp-wrapper.sh
83 | # Or, if installed as a CLI tool:
84 | alex-mcp
85 | ```
86 |
87 | ---
88 |
89 | ## ⚙️ MCP Configuration
90 |
91 | ### Claude Desktop Configuration
92 |
93 | Add to your Claude Desktop configuration file:
94 |
95 | ```json
96 | {
97 | "mcpServers": {
98 | "alex-mcp": {
99 |       "command": "/path/to/alex-mcp/alex-mcp-wrapper.sh",
100 | "env": {
101 | "OPENALEX_MAILTO": "your-email@domain.com"
102 | }
103 | }
104 | }
105 | }
106 | ```
107 |
108 | Replace `/path/to/alex-mcp` with the actual path to the repository on your system.
109 |
110 | ---
111 |
112 | ## 🤖 Using with AI Agents
113 |
114 | ### OpenAI Agents Integration
115 |
116 | You can load this MCP server in your OpenAI agent workflow using the [`agents.mcp.MCPServerStdio`](https://github.com/openai/openai-agents) interface:
117 |
118 | ```python
119 | from agents.mcp import MCPServerStdio
120 |
121 | async with MCPServerStdio(
122 | name="OpenAlex MCP For Author disambiguation and works",
123 | cache_tools_list=True,
124 | params={
125 | "command": "uvx",
126 | "args": [
127 | "--from", "git+https://github.com/drAbreu/alex-mcp.git@4.1.0",
128 | "alex-mcp"
129 | ],
130 | "env": {
131 | "OPENALEX_MAILTO": "your-email@domain.com"
132 | }
133 | },
134 | client_session_timeout_seconds=10
135 | ) as alex_mcp:
136 | await alex_mcp.connect()
137 | tools = await alex_mcp.list_tools()
138 | print(f"Available tools: {[tool.name for tool in tools]}")
139 | ```
140 |
141 | ### Academic Research Agent Integration
142 |
143 | This MCP server is specifically optimized for academic research workflows:
144 |
145 | ```python
146 | # Optimized for academic research workflows
147 | from alex_agent import run_author_research
148 |
149 | # Enhanced functionality with streamlined data
150 | result = await run_author_research(
151 | "Find J. Abreu at EMBO with recent publications"
152 | )
153 |
154 | # Clean, structured output for AI processing
155 | print(f"Success: {result['workflow_metadata']['success']}")
156 | print(f"Quality: {result['research_result']['metadata']['result_analysis']['quality_score']}/100")
157 | ```
158 |
159 | ### Direct Launch with uvx
160 |
161 | ```bash
162 | # Standard launch
163 | uvx --from git+https://github.com/drAbreu/alex-mcp.git@4.1.0 alex-mcp
164 |
165 | # With environment variables
166 | OPENALEX_MAILTO=your-email@domain.com uvx --from git+https://github.com/drAbreu/alex-mcp.git@4.1.0 alex-mcp
167 | ```
168 |
169 | ---
170 |
171 | ## 🛠️ Available Tools
172 |
173 | ### 1. **autocomplete_authors** ⭐ NEW
174 | Get multiple author candidates using OpenAlex autocomplete API for intelligent disambiguation.
175 |
176 | **Parameters:**
177 | - `name` (required): Author name to search (e.g., "James Briscoe", "M. Ralser")
178 | - `context` (optional): Context for disambiguation (e.g., "Francis Crick Institute developmental biology")
179 | - `limit` (optional): Maximum candidates (1-10, default: 5)
180 |
181 | **Key Features:**
182 | - ⚡ **Fast**: ~200ms response time
183 | - 🎯 **Smart**: Multiple candidates with institutional hints
184 | - 🧠 **AI-Ready**: Perfect for context-based selection
185 | - 📊 **Rich**: Works count, citations, institution info
186 |
187 | **Streamlined Output:**
188 | ```json
189 | {
190 | "query": "James Briscoe",
191 | "context": "Francis Crick Institute",
192 | "total_candidates": 3,
193 | "candidates": [
194 | {
195 | "openalex_id": "https://openalex.org/A5019391436",
196 | "display_name": "James Briscoe",
197 | "institution_hint": "The Francis Crick Institute, UK",
198 | "works_count": 415,
199 | "cited_by_count": 24623,
200 | "external_id": "https://orcid.org/0000-0002-1020-5240"
201 | }
202 | ]
203 | }
204 | ```
205 |
206 | **Usage Pattern:**
207 | ```python
208 | # Get multiple candidates for disambiguation
209 | candidates = await autocomplete_authors(
210 | "James Briscoe",
211 | context="Francis Crick Institute developmental biology"
212 | )
213 |
214 | # AI selects best match based on institutional context
215 | # Much more accurate than single search result!
216 | ```
217 |
218 | ### 2. **search_authors**
219 | Search for authors with streamlined output for AI agents.
220 |
221 | **Parameters:**
222 | - `name` (required): Author name to search
223 | - `institution` (optional): Institution name filter
224 | - `topic` (optional): Research topic filter
225 | - `country_code` (optional): Country code filter (e.g., "US", "DE")
226 | - `limit` (optional): Maximum results (1-25, default: 20)
227 |
228 | **Streamlined Output:**
229 | ```json
230 | {
231 | "query": "J. Abreu",
232 | "total_count": 3,
233 | "results": [
234 | {
235 | "id": "https://openalex.org/A123456789",
236 | "display_name": "Jorge Abreu-Vicente",
237 | "orcid": "https://orcid.org/0000-0000-0000-0000",
238 | "display_name_alternatives": ["J. Abreu-Vicente", "Jorge Abreu Vicente"],
239 |       "current_affiliations": ["European Molecular Biology Organization"],
240 |       "cited_by_count": 316,
241 |       "works_count": 25,
242 |       "h_index": 9,
243 |       "i10_index": 5,
244 |       "research_fields": ["Astrophysics", "Machine Learning"],
245 |       "countries": ["DE"]
264 | }
265 | ]
266 | }
267 | ```
268 |
269 | **Features**: Clean structure optimized for AI reasoning and disambiguation
270 |
271 | ---
272 |
273 | ### 3. **retrieve_author_works**
274 | Retrieve works for a given author with enhanced filtering capabilities.
275 |
276 | **Parameters:**
277 | - `author_id` (required): OpenAlex author ID
278 | - `limit` (optional): Maximum results (1-50, default: 20)
279 | - `order_by` (optional): "date" or "citations" (default: "date")
280 | - `publication_year` (optional): Filter by specific year
281 | - `type` (optional): Work type filter (e.g., "journal-article")
282 | - `authorships_institutions_id` (optional): Filter by institution
283 | - `is_retracted` (optional): Filter retracted works
284 | - `open_access_is_oa` (optional): Filter by open access status
285 |
286 | **Enhanced Output:**
287 | ```json
288 | {
289 | "author_id": "https://openalex.org/A123456789",
290 | "total_count": 25,
291 | "results": [
292 | {
293 | "id": "https://openalex.org/W123456789",
294 | "title": "A platform for the biomedical application of large language models",
295 | "doi": "10.1038/s41587-024-02534-3",
296 | "publication_year": 2025,
297 | "type": "journal-article",
298 | "cited_by_count": 42,
299 |       "ids": {
300 |         "openalex": "https://openalex.org/W123456789",
301 |         "doi": "https://doi.org/10.1038/s41587-024-02534-3",
302 |         "pmid": null,
303 |         "pmcid": null,
304 |         "mag": null
305 |       },
306 |       "journal_name": "Nature Biotechnology",
307 |       "is_open_access": true,
308 |       "first_author": "Jorge Abreu-Vicente",
309 |       "primary_field": "Biomedical Engineering"
325 | }
326 | ]
327 | }
328 | ```
329 |
330 | **Features**: Comprehensive work data with flexible filtering for targeted queries
331 |
332 | ---
333 |
334 | ## 📊 Data Optimization
335 |
336 | ### Focused Information Architecture
337 | This MCP server provides focused, structured data specifically designed for AI agent consumption:
338 |
339 | ### Author Data Features
340 | - **Identity Resolution**: Names, ORCID, alternatives for disambiguation
341 | - **Affiliation Tracking**: Current and historical institutional connections
342 | - **Impact Metrics**: Citation counts, h-index, and scholarly impact
343 | - **Research Context**: Fields, concepts, and domain expertise
344 | - **Career Analysis**: Temporal affiliation changes and transitions
345 |
346 | ### Work Data Features
347 | - **Publication Metadata**: Title, DOI, venue, and publication details
348 | - **Impact Assessment**: Citation counts and scholarly influence
349 | - **Access Information**: Open access status and availability
350 | - **Authorship Details**: Complete author lists and institutional affiliations
351 | - **Research Classification**: Topics, concepts, and domain categorization
352 |
353 | ### Enhanced Filtering
354 |
355 | ```python
356 | # Target high-impact journal articles
357 | works = await retrieve_author_works(
358 | author_id="https://openalex.org/A123456789",
359 | type="journal-article", # Focus on journal publications
360 | open_access_is_oa=True, # Open access only
361 | order_by="citations", # Most cited first
362 | limit=15
363 | )
364 |
365 | # Career transition analysis
366 | authors = await search_authors(
367 | name="J. Abreu",
368 | institution="EMBO", # Current institution
369 | topic="Machine Learning", # Research focus
370 | limit=10
371 | )
372 | ```
373 |
374 | ---
375 |
376 | ## 🧪 Example Usage
377 |
378 | ### Author Disambiguation
379 |
380 | ```python
381 | from alex_mcp.server import search_authors_core
382 |
383 | # Comprehensive author search
384 | results = search_authors_core(
385 | name="J Abreu Vicente",
386 | institution="EMBO",
387 | topic="Machine Learning",
388 | limit=20
389 | )
390 |
391 | print(f"Found {results.total_count} candidates")
392 | for author in results.results:
393 | print(f"- {author.display_name}")
394 |     if author.current_affiliations:
395 |         current_inst = author.current_affiliations[0]
396 |         print(f"  Institution: {current_inst}")
397 |     print(f"  Metrics: {author.cited_by_count} citations, h-index {author.h_index}")
398 |     if author.research_fields:
399 |         fields = author.research_fields[:3]
400 |         print(f"  Research: {', '.join(fields)}")
401 | ```
402 |
403 | ### Academic Work Analysis
404 |
405 | ```python
406 | from alex_mcp.server import retrieve_author_works_core
407 |
408 | # Comprehensive work retrieval
409 | works = retrieve_author_works_core(
410 | author_id="https://openalex.org/A5058921480",
411 | type="journal-article", # Academic focus
412 | order_by="citations", # Impact-based ordering
413 | limit=20
414 | )
415 |
416 | print(f"Found {works.total_count} publications")
417 | for work in works.results:
418 | print(f"- {work.title}")
419 |     if work.journal_name:
420 |         print(f"  Published in: {work.journal_name} ({work.publication_year})")
421 |     print(f"  Impact: {work.cited_by_count} citations")
422 |     if work.is_open_access:
423 |         print("  ✓ Open Access")
425 | ```
426 |
427 | ### Institution and Field Analysis
428 |
429 | ```python
430 | # Analyze career transitions
431 | def analyze_career_path(author_result):
432 |     if author_result.past_affiliations:
433 |         print("Career path:")
434 |         for inst in author_result.past_affiliations:
435 |             print(f"  past: {inst}")
436 |     for inst in (author_result.current_affiliations or []):
437 |         print(f"  current: {inst}")
438 |
439 |     # Research evolution
440 |     if author_result.research_fields:
441 |         print("Research areas:")
442 |         for field in author_result.research_fields[:5]:
443 |             print(f"  {field}")
444 |
445 | # Usage
446 | results = search_authors_core("Jorge Abreu Vicente")
447 | if results.results:
448 | analyze_career_path(results.results[0])
449 | ```
450 |
451 | ---
452 |
453 | ## 🔧 Configuration Options
454 |
455 | ### Environment Variables
456 |
457 | ```bash
458 | # Required
459 | export OPENALEX_MAILTO=your-email@domain.com
460 |
461 | # Optional settings
462 | export OPENALEX_MAX_AUTHORS=100 # Maximum authors per query
463 | export OPENALEX_USER_AGENT=research-agent-v1.0
464 | export ALEX_MCP_VERSION=4.1.0
465 |
466 | # Rate limiting (respectful usage)
467 | export OPENALEX_RATE_PER_SEC=10
468 | export OPENALEX_RATE_PER_DAY=100000
469 | ```
470 |
471 | ### Performance Tuning
472 |
473 | ```python
474 | # For comprehensive research applications
475 | config = {
476 | "max_authors_per_query": 25, # Detailed author analysis
477 | "max_works_per_author": 50, # Complete publication history
478 | "enable_all_filters": True, # Full filtering capabilities
479 | "detailed_affiliations": True, # Complete institutional data
480 | "research_concepts": True # Detailed concept analysis
481 | }
482 | ```
483 |
484 | ---
485 |
486 | ## 🧑💻 Development & Testing
487 |
488 | ### Project Structure
489 | ```
490 | alex-mcp/
491 | ├── src/alex_mcp/
492 | │   ├── __init__.py
493 | │   ├── server.py                        # Main MCP server
494 | │   └── data_objects.py                  # Data models and structures
495 | ├── examples/
496 | │   ├── test_institution_resolution.py   # Institution resolution tests
497 | │   └── test_author_disambiguation.py    # Author disambiguation tests
498 | ├── img/                                 # Project logos
499 | ├── alex-mcp-wrapper.sh                  # Wrapper script (activates venv)
500 | ├── requirements.txt
501 | ├── pyproject.toml / setup.py            # Packaging
502 | ├── INSTALL.md
503 | └── README.md
504 | ```
505 |
506 | ### Running Tests
507 |
508 | ```bash
509 | # Install the package plus pytest
510 | pip install -e .
511 | pip install pytest
512 |
513 | # Run the example test suites (they hit the live OpenAlex API)
514 | pytest examples/test_institution_resolution.py -v
515 | pytest examples/test_author_disambiguation.py -v
520 | ```
521 |
522 | ### Development Examples
523 |
524 | ```bash
525 | # Run a single disambiguation test via -k
526 | pytest examples/test_author_disambiguation.py -k "with_institution" -v
527 |
528 | # Run a single institution-resolution test
529 | pytest examples/test_institution_resolution.py -k "embo" -v
533 | ```
534 |
535 | ---
536 |
537 | ## 📈 Integration Examples
538 |
539 | ### Academic Research Workflows
540 |
541 | Perfect integration with AI-powered research analysis:
542 |
543 | ```python
544 | # Enhanced academic research agent
545 | from alex_agent import AcademicResearchAgent
546 |
547 | agent = AcademicResearchAgent(
548 | mcp_servers=[alex_mcp], # Streamlined data processing
549 | model="gpt-4.1-2025-04-14"
550 | )
551 |
552 | # Complex research queries with structured data
553 | result = await agent.research_author(
554 | "Find J. Abreu at EMBO with machine learning publications"
555 | )
556 |
557 | # Rich, structured output for AI reasoning
558 | print(f"Quality Score: {result.quality_score}/100")
559 | print(f"Author disambiguation: {result.confidence}")
560 | print(f"Research fields: {result.research_domains}")
561 | ```
562 |
563 | ### Multi-Agent Systems
564 |
565 | ```python
566 | # Collaborative research analysis
567 | async def research_collaboration_network(seed_author):
568 | # Find primary author
569 | authors = await alex_mcp.search_authors(seed_author)
570 | primary = authors['results'][0]
571 |
572 | # Get their works
573 | works = await alex_mcp.retrieve_author_works(
574 | primary['id'],
575 | type="journal-article"
576 | )
577 |
578 |     # Collect collaborator names; the streamlined works output keeps
579 |     # first/corresponding authors rather than full authorship lists
580 |     collaborators = set()
581 |     for work in works['results']:
582 |         for name in (work.get('first_author'), work.get('corresponding_author')):
583 |             if name:
584 |                 collaborators.add(name)
583 |
584 | return {
585 | 'primary_author': primary,
586 | 'publication_count': len(works['results']),
587 | 'collaborator_network': list(collaborators),
588 | 'research_impact': sum(w['cited_by_count'] for w in works['results'])
589 | }
590 | ```
591 |
592 | ---
593 |
594 | ## 🤝 Contributing
595 |
596 | We welcome contributions to improve functionality and add new features:
597 |
598 | 1. **Fork the repository**
599 | 2. **Create a feature branch**: `git checkout -b feature/enhanced-filtering`
600 | 3. **Add tests**: Ensure your changes maintain data quality and structure
601 | 4. **Submit a pull request**: Include examples and documentation
602 |
603 | ### Development Priorities
604 |
605 | - [ ] Enhanced filtering capabilities
606 | - [ ] Additional data enrichment
607 | - [ ] Performance optimizations
608 | - [ ] Integration examples
609 | - [ ] Documentation improvements
610 |
611 | ---
612 |
613 | ## 📄 License
614 |
615 | This project is licensed under the MIT License. See [LICENSE](LICENSE) for details.
616 |
617 | ---
618 |
619 | ## 🌐 Links
620 |
621 | - [OpenAlex API Documentation](https://docs.openalex.org/)
622 | - [Model Context Protocol](https://modelcontextprotocol.io/)
623 | - [FastMCP](https://github.com/jlowin/fastmcp)
624 | - [OpenAI Agents](https://github.com/openai/openai-agents)
625 | - [Academic Research Examples](examples/)
626 |
--------------------------------------------------------------------------------
/src/alex_mcp/server.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Optimized OpenAlex Author Disambiguation MCP Server with Peer-Review Filtering
4 |
5 | Provides a FastMCP-compliant API for author disambiguation and institution resolution
6 | using the OpenAlex API with streamlined output to minimize token usage.
7 | """
8 |
9 | import logging
10 | from typing import Optional
11 | from fastmcp import FastMCP
12 | from alex_mcp.data_objects import (
13 | OptimizedAuthorResult,
14 | OptimizedSearchResponse,
15 | OptimizedWorksSearchResponse,
16 | OptimizedGeneralWorksSearchResponse,
17 | OptimizedWorkResult,
18 | AutocompleteAuthorCandidate,
19 | AutocompleteAuthorsResponse,
20 | optimize_author_data,
21 | optimize_work_data
22 | )
23 | import pyalex
24 | import os
25 | import sys
26 | import aiohttp
27 | import asyncio
28 | import json
29 | import re
30 |
31 | def get_config():
32 | mailto = os.environ.get("OPENALEX_MAILTO")
33 | if not mailto:
34 | print(
35 | "ERROR: The environment variable OPENALEX_MAILTO must be set to your email address "
36 | "to use the OpenAlex MCP server. Example: export OPENALEX_MAILTO='your-email@example.com'",
37 | file=sys.stderr
38 | )
39 | sys.exit(1)
40 | return {
41 | "OPENALEX_MAILTO": mailto,
42 | "OPENALEX_USER_AGENT": os.environ.get(
43 | "OPENALEX_USER_AGENT",
44 | f"alex-mcp (+{mailto})"
45 | ),
46 | "OPENALEX_MAX_AUTHORS": int(os.environ.get("OPENALEX_MAX_AUTHORS", 50)), # Reduced default
47 | "OPENALEX_RATE_PER_SEC": int(os.environ.get("OPENALEX_RATE_PER_SEC", 10)),
48 | "OPENALEX_RATE_PER_DAY": int(os.environ.get("OPENALEX_RATE_PER_DAY", 100000)),
49 | "OPENALEX_USE_DAILY_API": os.environ.get("OPENALEX_USE_DAILY_API", "true").lower() == "true",
50 | "OPENALEX_SNAPSHOT_INTERVAL_DAYS": int(os.environ.get("OPENALEX_SNAPSHOT_INTERVAL_DAYS", 30)),
51 | "OPENALEX_PREMIUM_UPDATES": os.environ.get("OPENALEX_PREMIUM_UPDATES", "hourly"),
52 | "OPENALEX_RETRACTION_BUG_START": os.environ.get("OPENALEX_RETRACTION_BUG_START", "2023-12-22"),
53 | "OPENALEX_RETRACTION_BUG_END": os.environ.get("OPENALEX_RETRACTION_BUG_END", "2024-03-19"),
54 | "OPENALEX_NO_FUNDING_DATA": os.environ.get("OPENALEX_NO_FUNDING_DATA", "true").lower() == "true",
55 | "OPENALEX_MISSING_CORRESPONDING_AUTHORS": os.environ.get("OPENALEX_MISSING_CORRESPONDING_AUTHORS", "true").lower() == "true",
56 | "OPENALEX_PARTIAL_ABSTRACTS": os.environ.get("OPENALEX_PARTIAL_ABSTRACTS", "true").lower() == "true",
57 | }
58 |
59 | # Configure logging
60 | logging.basicConfig(level=logging.INFO)
61 | logger = logging.getLogger(__name__)
62 |
63 | # Initialize FastMCP server
64 | mcp = FastMCP("OpenAlex Academic Research")
65 |
66 |
67 | def configure_pyalex(email: str):
68 | """
69 | Configure pyalex for OpenAlex API usage.
70 |
71 | Args:
72 | email (str): The email to use for OpenAlex API requests.
73 | """
74 | pyalex.config.email = email
75 |
76 | # Load configuration
77 | config = get_config()
78 | configure_pyalex(config["OPENALEX_MAILTO"])
79 | pyalex.config.user_agent = config["OPENALEX_USER_AGENT"]
80 |
81 |
82 | def is_peer_reviewed_journal(work_data) -> bool:
83 | """
84 | Improved filter to determine if a work is from a peer-reviewed journal.
85 |
86 | Uses a balanced approach that catches data catalogs and preprints while
87 | not being overly strict about DOIs (some legitimate papers lack them in OpenAlex).
88 |
89 | Args:
90 | work_data: OpenAlex work object
91 |
92 | Returns:
93 | bool: True if the work appears to be from a peer-reviewed journal
94 | """
95 | try:
96 | # Safe string extraction with None checking
97 | title = work_data.get('title') or ''
98 | if isinstance(title, str):
99 | title = title.lower()
100 | else:
101 | title = str(title).lower() if title is not None else ''
102 |
103 | # Quick exclusions based on title patterns
104 | title_exclusions = [
105 | 'vizier online data catalog',
106 | 'online data catalog',
107 | 'data catalog',
108 | 'catalog:',
109 | 'database:',
110 | 'repository:',
111 | 'preprint',
112 | 'arxiv:',
113 | 'biorxiv',
114 | 'medrxiv',
115 | ]
116 |
117 | for exclusion in title_exclusions:
118 | if exclusion in title:
119 | logger.debug(f"Excluding based on title pattern '{exclusion}': {title[:100]}")
120 | return False
121 |
122 | # Check primary location
123 | primary_location = work_data.get('primary_location')
124 | if not primary_location:
125 | logger.debug("Excluding work without primary location")
126 | return False
127 |
128 | # Check source information
129 | source = primary_location.get('source', {})
130 | if not source:
131 | logger.debug("Excluding work without source")
132 | return False
133 |
134 | # Get journal/source information with safe None checking
135 | journal_name_raw = source.get('display_name') or ''
136 | journal_name = journal_name_raw.lower() if isinstance(journal_name_raw, str) else str(journal_name_raw).lower()
137 |
138 |     publisher = source.get('host_organization_name') or work_data.get('publisher', '')  # publisher metadata lives on the source
139 | doi = work_data.get('doi')
140 | issn_l = source.get('issn_l')
141 | issn = source.get('issn')
142 |
143 | source_type_raw = source.get('type') or ''
144 | source_type = source_type_raw.lower() if isinstance(source_type_raw, str) else str(source_type_raw).lower()
145 |
146 | # CRITICAL: Exclude known data catalogs by journal name
147 | excluded_journals = [
148 | 'vizier online data catalog',
149 | 'ycat',
150 | 'catalog',
151 | 'database',
152 | 'repository',
153 | 'arxiv',
154 | 'biorxiv',
155 | 'medrxiv',
156 | 'ssrn',
157 | 'research square',
158 | 'zenodo',
159 | 'figshare',
160 | 'dryad',
161 | 'github',
162 | 'protocols.io',
163 | 'ceur',
164 | 'conference proceedings',
165 | 'workshop proceedings',
166 | ]
167 |
168 | for excluded in excluded_journals:
169 | if excluded in journal_name:
170 | logger.debug(f"Excluding journal pattern '{excluded}': {journal_name}")
171 | return False
172 |
173 | # CRITICAL: Data catalogs typically have no publisher AND no DOI
174 | # This catches VizieR entries effectively
175 | if not publisher and not doi:
176 | logger.debug(f"Excluding work without publisher AND DOI: {title[:100]}")
177 | return False
178 |
179 | # Source type should be journal (if specified)
180 | if source_type and source_type not in ['journal', '']:
181 | logger.debug(f"Excluding non-journal source type: {source_type}")
182 | return False
183 |
184 | # Work type should be article or letter with safe None checking
185 | work_type_raw = work_data.get('type') or ''
186 | work_type = work_type_raw.lower() if isinstance(work_type_raw, str) else str(work_type_raw).lower()
187 | if work_type not in ['article', 'letter']:
188 | logger.debug(f"Excluding work type: {work_type}")
189 | return False
190 |
191 | # Should have reasonable publication year
192 | pub_year = work_data.get('publication_year')
193 | if not pub_year or pub_year < 1900 or pub_year > 2030:
194 | logger.debug(f"Excluding work with invalid publication year: {pub_year}")
195 | return False
196 |
197 | # For papers claiming to be from legitimate journals, check quality signals
198 | known_legitimate_journals = [
199 | 'nature',
200 | 'science',
201 | 'cell',
202 | 'astrophysical journal',
203 | 'astronomy and astrophysics',
204 | 'monthly notices',
205 | 'physical review',
206 | 'journal of',
207 | 'proceedings of',
208 | ]
209 |
210 | is_known_journal = any(known in journal_name for known in known_legitimate_journals)
211 |
212 | if is_known_journal:
213 | # For known journals, be more lenient (don't require DOI)
214 | # But still require either publisher or ISSN
215 | if not publisher and not issn_l and not issn:
216 | logger.debug(f"Excluding known journal without publisher/ISSN: {journal_name}")
217 | return False
218 | else:
219 | # For unknown journals, require more quality signals
220 | quality_signals = sum([
221 | bool(doi), # Has DOI
222 | bool(publisher), # Has publisher
223 | bool(issn_l or issn), # Has ISSN
224 | bool(journal_name and len(journal_name) > 5), # Reasonable journal name
225 | ])
226 |
227 | if quality_signals < 2: # Require at least 2 quality signals
228 | logger.debug(f"Excluding unknown journal with insufficient quality signals ({quality_signals}/4): {journal_name}")
229 | return False
230 |
231 | # Additional quality checks
232 | if 'cited_by_count' not in work_data:
233 | logger.debug("Excluding work without citation data")
234 | return False
235 |
236 | # Very long titles might be data descriptions
237 | if len(title) > 250:
238 | logger.debug(f"Excluding work with very long title: {title[:100]}...")
239 | return False
240 |
241 | # If we get here, it passes all checks
242 | logger.debug(f"ACCEPTED: {title[:100]}")
243 | return True
244 |
245 | except Exception as e:
246 | logger.error(f"Error in peer review check for work: {e}")
247 | logger.error(f"Work data keys: {list(work_data.keys()) if isinstance(work_data, dict) else 'Not a dict'}")
248 | logger.error(f"Work title: {repr(work_data.get('title') if isinstance(work_data, dict) else 'N/A')}")
249 | logger.error(f"Primary location: {repr(work_data.get('primary_location') if isinstance(work_data, dict) else 'N/A')}")
250 | import traceback
251 | logger.error(f"Full traceback: {traceback.format_exc()}")
252 | return False
253 |
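   | # Illustrative usage sketch for is_peer_reviewed_journal(). The work dict is
   | # hypothetical; only the key names mirror the checks above.
   | #
   | # >>> work = {
   | # ...     'title': 'A study of stellar populations',
   | # ...     'primary_location': {'source': {
   | # ...         'display_name': 'The Astrophysical Journal',
   | # ...         'type': 'journal', 'issn_l': '0004-637X', 'issn': ['0004-637X']}},
   | # ...     'publisher': 'IOP Publishing', 'doi': '10.0000/example',
   | # ...     'type': 'article', 'publication_year': 2020, 'cited_by_count': 12}
   | # >>> is_peer_reviewed_journal(work)
   | # True
   | # >>> is_peer_reviewed_journal({**work, 'title': 'VizieR Online Data Catalog: X'})
   | # False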
254 |
255 | def filter_peer_reviewed_works(works: list) -> list:
256 | """
257 | Apply peer-review filtering to a list of works.
258 |
259 | Args:
260 | works: List of OpenAlex work objects
261 |
262 | Returns:
263 | list: Filtered list containing only peer-reviewed journal works
264 | """
265 | filtered_works = []
266 | excluded_count = 0
267 |
268 | logger.info(f"Starting filtering of {len(works)} works...")
269 |
270 | for i, work in enumerate(works):
271 | # Safe handling of potentially None work or title
272 | if work is None:
273 | logger.warning(f"Skipping None work at position {i+1}")
274 | excluded_count += 1
275 | continue
276 |
277 | title_raw = work.get('title') if isinstance(work, dict) else None
278 | title = (title_raw or 'Unknown')[:60] if title_raw is not None else 'Unknown'
279 |
280 | try:
281 | if is_peer_reviewed_journal(work):
282 | filtered_works.append(work)
283 | logger.debug(f"✓ KEPT work {i+1}: {title}")
284 | else:
285 | excluded_count += 1
286 | logger.debug(f"✗ EXCLUDED work {i+1}: {title}")
287 | except Exception as e:
288 | logger.error(f"Error filtering work {i+1} (title: {title}): {e}")
289 | excluded_count += 1
290 |
291 | logger.info(f"Filtering complete: {len(filtered_works)} kept, {excluded_count} excluded from {len(works)} total")
292 | return filtered_works
293 |
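   | # Illustrative sketch: filter_peer_reviewed_works() tolerates None entries and
   | # malformed dicts without raising (hypothetical inputs; 'work' as sketched above).
   | #
   | # >>> kept = filter_peer_reviewed_works([work, None, {'title': 'A preprint survey'}])
   | # >>> work in kept
   | # True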
294 |
295 | def search_authors_core(
296 | name: str,
297 | institution: Optional[str] = None,
298 | topic: Optional[str] = None,
299 | country_code: Optional[str] = None,
300 |     limit: int = 15  # Modest default to control token usage
301 | ) -> OptimizedSearchResponse:
302 | """
303 | Optimized core logic for searching authors using OpenAlex.
304 | Returns streamlined author data to minimize token usage.
305 |
306 | Args:
307 | name: Author name to search for.
308 | institution: (Optional) Institution name filter.
309 | topic: (Optional) Topic filter.
310 | country_code: (Optional) Country code filter.
311 | limit: Maximum number of results to return (default: 15).
312 |
313 | Returns:
314 | OptimizedSearchResponse: Streamlined response with essential author data.
315 | """
316 | try:
317 | # Build query
318 | query = pyalex.Authors().search_filter(display_name=name)
319 |
320 | # Add filters if provided
321 | filters = {}
322 | if institution:
323 | filters['affiliations.institution.display_name.search'] = institution
324 | if topic:
325 | filters['x_concepts.display_name.search'] = topic
326 | if country_code:
327 | filters['affiliations.institution.country_code'] = country_code
328 |
329 | if filters:
330 | query = query.filter(**filters)
331 |
332 | # Execute query with limit
333 |         results = query.get(per_page=min(limit, 100))  # Cap page size at 100
334 | authors = list(results)
335 |
336 | # Convert to optimized format
337 | optimized_authors = []
338 | for author_data in authors:
339 | try:
340 | optimized_author = optimize_author_data(author_data)
341 | optimized_authors.append(optimized_author)
342 | except Exception as e:
343 | logger.warning(f"Error optimizing author data: {e}")
344 | # Skip problematic authors rather than failing completely
345 | continue
346 |
347 | logger.info(f"Found {len(optimized_authors)} authors for query: {name}")
348 |
349 | return OptimizedSearchResponse(
350 | query=name,
351 | total_count=len(optimized_authors),
352 | results=optimized_authors
353 | )
354 |
355 | except Exception as e:
356 | logger.error(f"Error searching authors for query '{name}': {e}")
357 | return OptimizedSearchResponse(
358 | query=name,
359 | total_count=0,
360 | results=[]
361 | )
362 |
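   | # Illustrative usage sketch for search_authors_core() (hypothetical name and
   | # filters; the OpenAlex filter keys are the ones assembled above).
   | #
   | # >>> resp = search_authors_core("Jane Doe", institution="Example University",
   | # ...                            country_code="US", limit=5)
   | # >>> resp.total_count <= 5
   | # True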
363 |
364 | def autocomplete_authors_core(
365 | name: str,
366 | context: Optional[str] = None,
367 | limit: int = 10,
368 | filter_no_institution: bool = True,
369 | enable_institution_ranking: bool = True
370 | ) -> AutocompleteAuthorsResponse:
371 | """
372 | Enhanced core function for author autocomplete with intelligent filtering and ranking.
373 |
374 | Args:
375 | name: Author name to search for
376 | context: Optional context for better matching (institution, research area, etc.)
377 | limit: Maximum number of candidates to return (increased default to 10)
378 | filter_no_institution: If True, exclude candidates with no institutional affiliation
379 | enable_institution_ranking: If True, rank candidates by institutional context relevance
380 |
381 | Returns:
382 | AutocompleteAuthorsResponse with filtered and ranked candidate authors
383 | """
384 | try:
385 | logger.info(f"🔍 Autocompleting authors for: '{name}' (limit: {limit})")
386 | if context:
387 | logger.info(f" 📝 Context provided: {context}")
388 |
389 | # Use PyAlex autocomplete for authors - get more results for filtering
390 | raw_limit = min(limit * 2, 20) # Get 2x candidates for filtering
391 | results = pyalex.Authors().autocomplete(name)[:raw_limit]
392 |
393 | # Convert to our data model first
394 | all_candidates = []
395 | for result in results:
396 | candidate = AutocompleteAuthorCandidate(
397 | openalex_id=result.get('id', ''),
398 | display_name=result.get('display_name', ''),
399 | institution_hint=result.get('hint'),
400 | works_count=result.get('works_count', 0),
401 | cited_by_count=result.get('cited_by_count', 0),
402 | entity_type=result.get('entity_type', 'author'),
403 | external_id=result.get('external_id')
404 | )
405 | all_candidates.append(candidate)
406 |
407 | # ENHANCEMENT 1: Filter out candidates with no institution
408 | if filter_no_institution:
409 | filtered_candidates = [
410 | c for c in all_candidates
411 | if c.institution_hint and c.institution_hint not in ['No institution', 'None', '']
412 | ]
413 | excluded_count = len(all_candidates) - len(filtered_candidates)
414 | if excluded_count > 0:
415 | logger.info(f" 🔍 Filtered out {excluded_count} candidates with no institution")
416 | else:
417 | filtered_candidates = all_candidates
418 |
419 | # ENHANCEMENT 2: Institution-aware ranking (if context provided)
420 | if enable_institution_ranking and context and filtered_candidates:
421 | scored_candidates = []
422 | context_lower = context.lower()
423 |
424 | for candidate in filtered_candidates:
425 | relevance_score = 0
426 | matched_terms = []
427 |
428 | inst_hint = (candidate.institution_hint or '').lower()
429 |
430 | # High-value institutional matches
431 | high_value_terms = [
432 | 'max planck', 'harvard', 'stanford', 'mit', 'cambridge', 'oxford',
433 | 'excellence cluster', 'crick', 'wellcome', 'nih', 'cnrs', 'inserm'
434 | ]
435 | for term in high_value_terms:
436 | if term in context_lower and term in inst_hint:
437 | relevance_score += 3
438 | matched_terms.append(f"{term} (+3)")
439 |
440 | # Location-based matches
441 | location_terms = ['germany', 'uk', 'usa', 'france', 'köln', 'cologne', 'london', 'boston', 'berlin']
442 | for term in location_terms:
443 | if term in context_lower and term in inst_hint:
444 | relevance_score += 2
445 | matched_terms.append(f"{term} (+2)")
446 |
447 | # Research field alignment (basic keyword matching)
448 | research_terms = ['biology', 'chemistry', 'biochemistry', 'physics', 'medicine']
449 | for term in research_terms:
450 | if term in context_lower and term in inst_hint:
451 | relevance_score += 1
452 | matched_terms.append(f"{term} (+1)")
453 |
454 | # High-impact researcher bonus
455 | if candidate.cited_by_count and candidate.cited_by_count > 1000:
456 | relevance_score += 1
457 | matched_terms.append("high-impact (+1)")
458 |
459 | scored_candidates.append({
460 | 'candidate': candidate,
461 | 'relevance_score': relevance_score,
462 | 'matched_terms': matched_terms
463 | })
464 |
465 | # Sort by relevance score (descending), then by citation count
466 |             scored_candidates.sort(key=lambda x: (x['relevance_score'], x['candidate'].cited_by_count or 0), reverse=True)
467 |
468 | # Extract ranked candidates
469 | final_candidates = [sc['candidate'] for sc in scored_candidates[:limit]]
470 |
471 | # Log ranking results
472 | logger.info(f" 🏆 Institution-aware ranking applied:")
473 | for i, sc in enumerate(scored_candidates[:3], 1): # Log top 3
474 | candidate = sc['candidate']
475 | logger.info(f" {i}. {candidate.display_name} (score: {sc['relevance_score']}, {candidate.institution_hint})")
476 | else:
477 | # No ranking, just take first N candidates
478 | final_candidates = filtered_candidates[:limit]
479 |
480 | # Log final candidates
481 | for candidate in final_candidates:
482 | logger.info(f" 👤 {candidate.display_name} ({candidate.institution_hint or 'No institution'}) - {candidate.works_count} works")
483 |
484 | response = AutocompleteAuthorsResponse(
485 | query=name,
486 | context=context,
487 | total_candidates=len(final_candidates),
488 | candidates=final_candidates,
489 | search_metadata={
490 | 'api_used': 'openalex_autocomplete',
491 | 'has_context': context is not None,
492 | 'filtered_no_institution': filter_no_institution,
493 | 'institution_ranking_enabled': enable_institution_ranking and context is not None,
494 | 'response_time_ms': None # Could be added with timing
495 | }
496 | )
497 |
498 | logger.info(f"✅ Found {len(final_candidates)} candidates for '{name}'")
499 | return response
500 |
501 | except Exception as e:
502 | logger.error(f"❌ Error in autocomplete_authors_core: {e}")
503 | # Return empty response on error
504 | return AutocompleteAuthorsResponse(
505 | query=name,
506 | context=context,
507 | total_candidates=0,
508 | candidates=[],
509 | search_metadata={
510 | 'api_used': 'openalex_autocomplete',
511 | 'has_context': context is not None,
512 | 'error': str(e)
513 | }
514 | )
515 |
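   | # Illustrative sketch of the context-aware ranking in autocomplete_authors_core():
   | # a candidate whose institution hint contains 'max planck' and 'germany' scores
   | # +3 and +2 and sorts ahead of otherwise similar candidates (hypothetical query).
   | #
   | # >>> resp = autocomplete_authors_core(
   | # ...     "I. Matic", context="Max Planck Institute Köln Germany", limit=5)
   | # >>> resp.total_candidates <= 5
   | # True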
516 |
517 | def search_works_core(
518 | query: str,
519 | author: Optional[str] = None,
520 | institution: Optional[str] = None,
521 | publication_year: Optional[int] = None,
522 | type: Optional[str] = None,
523 | limit: int = 25,
524 | peer_reviewed_only: bool = True,
525 | search_type: str = "general"
526 | ) -> OptimizedGeneralWorksSearchResponse:
527 | """
528 | Core logic for searching works using OpenAlex with configurable search modes.
529 | Returns streamlined work data to minimize token usage.
530 |
531 | Args:
532 | query: Search query text
533 | author: (Optional) Author name filter
534 | institution: (Optional) Institution name filter
535 | publication_year: (Optional) Publication year filter
536 | type: (Optional) Work type filter (e.g., "article", "letter")
537 | limit: Maximum number of results (default: 25, max: 100)
538 | peer_reviewed_only: If True, apply peer-review filters (default: True)
539 | search_type: Search mode - "general" (title/abstract/fulltext), "title" (title only),
540 | or "title_and_abstract" (title and abstract only)
541 |
542 | Returns:
543 | OptimizedGeneralWorksSearchResponse: Streamlined response with work data.
544 | """
545 | try:
546 | # Ensure reasonable limits to control token usage
547 | limit = min(limit, 100)
548 |
549 | # Build the search query using PyAlex based on search_type
550 | if search_type == "title":
551 | # Use title-specific search for precise title matching
552 | works_query = pyalex.Works()
553 | filters = {'title.search': query}
554 | elif search_type == "title_and_abstract":
555 | # Use title and abstract search
556 | works_query = pyalex.Works()
557 | filters = {'title_and_abstract.search': query}
558 | else: # search_type == "general" or any other value
559 | # Use general search across title, abstract, and fulltext (default behavior)
560 | works_query = pyalex.Works().search(query)
561 | filters = {}
562 |
563 | # Add author filter if provided
564 | if author:
565 | # For general work search, we can use raw_author_name.search for name-based filtering
566 | # This searches for works where the author name appears in the raw author strings
567 | filters['raw_author_name.search'] = author
568 |
569 | # Add institution filter if provided
570 | if institution:
571 | # Use the correct field for institution name filtering
572 | filters['authorships.institutions.display_name.search'] = institution
573 |
574 | # Add publication year filter
575 | if publication_year:
576 | filters['publication_year'] = publication_year
577 |
578 | # Add type filter
579 | if type:
580 | filters['type'] = type
581 | elif peer_reviewed_only:
582 | # Focus on journal articles and letters for academic work
583 | filters['type'] = 'article|letter'
584 |
585 | # Add basic quality filters
586 | if peer_reviewed_only:
587 | filters['is_retracted'] = False
588 |
589 | # Apply filters to query
590 | if filters:
591 | works_query = works_query.filter(**filters)
592 |
593 | # Execute query
594 | logger.info(f"Searching OpenAlex works with search_type='{search_type}', query: '{query[:50]}...' and {len(filters)} filters")
595 | results = works_query.get(per_page=limit)
596 |
597 | # Apply additional peer-review filtering if requested
598 | if peer_reviewed_only and results:
599 | logger.info(f"Applying peer-review filtering to {len(results)} results...")
600 | results = filter_peer_reviewed_works(results)
601 | logger.info(f"After peer-review filtering: {len(results)} results remain")
602 |
603 | # Convert to optimized format
604 | optimized_works = []
605 | for work in results:
606 | try:
607 | optimized_work = optimize_work_data(work)
608 | optimized_works.append(optimized_work)
609 | except Exception as e:
610 | logger.warning(f"Error optimizing work data: {e}")
611 | continue
612 |
613 | logger.info(f"Returning {len(optimized_works)} optimized works for search query")
614 |
615 | return OptimizedGeneralWorksSearchResponse(
616 | query=query,
617 | total_count=len(optimized_works),
618 | results=optimized_works,
619 | filters=filters
620 | )
621 |
622 | except Exception as e:
623 | logger.error(f"Error searching works for query '{query}': {e}")
624 | return OptimizedGeneralWorksSearchResponse(
625 | query=query,
626 | total_count=0,
627 | results=[],
628 | filters={}
629 | )
630 |
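   | # Illustrative sketch: the three search modes of search_works_core() and the
   | # query shapes they produce (hypothetical query text).
   | #
   | # >>> r1 = search_works_core("gene expression")                       # general: Works().search(...)
   | # >>> r2 = search_works_core("gene expression", search_type="title")  # filter: title.search
   | # >>> r3 = search_works_core("gene expression",
   | # ...                        search_type="title_and_abstract")        # filter: title_and_abstract.search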
631 |
632 | def retrieve_author_works_core(
633 | author_id: str,
634 | limit: int = 20_000, # High default limit for comprehensive analysis
635 | order_by: str = "date", # "date" or "citations"
636 | publication_year: Optional[int] = None,
637 | type: Optional[str] = None,
638 | journal_only: bool = True, # Default to True for peer-reviewed content
639 | min_citations: Optional[int] = None,
640 | peer_reviewed_only: bool = True, # Default to True
641 | ) -> OptimizedWorksSearchResponse:
642 | """
643 | Enhanced core logic to retrieve peer-reviewed works for a given OpenAlex Author ID.
644 | Returns streamlined work data to minimize token usage and ensures only legitimate
645 | peer-reviewed journal articles and letters.
646 |
647 | Args:
648 | author_id: OpenAlex Author ID
649 |         limit: Maximum number of results (default: 20,000 for comprehensive analysis)
650 | order_by: Sort order - "date" or "citations"
651 | publication_year: Filter by specific year
652 | type: Filter by work type (e.g., "journal-article")
653 | journal_only: If True, only return journal articles and letters
654 | min_citations: Minimum citation count filter
655 | peer_reviewed_only: If True, apply comprehensive peer-review filters
656 |
657 | Returns:
658 | OptimizedWorksSearchResponse: Streamlined response with peer-reviewed work data.
659 | """
660 | try:
661 | limit = min(limit, 20_000)
662 |
663 | # Build base filters
664 | filters = {"author.id": author_id}
665 |
666 | # Add optional filters
667 | if publication_year:
668 | filters["publication_year"] = publication_year
669 | if type:
670 | filters["type"] = type
671 | elif journal_only:
672 | # Focus on journal articles and letters for academic work
673 | filters["type"] = "article|letter"
674 | if min_citations:
675 | filters["cited_by_count"] = f">={min_citations}"
676 |
677 | # Add some basic API-level filters (but not too restrictive)
678 | if peer_reviewed_only or journal_only:
679 | # Only exclude obviously retracted papers at API level
680 | filters["is_retracted"] = "false"
681 |
682 |         # Normalize author_id: accept either a short ID ("A123456789") or the
683 |         # full OpenAlex URL, and always filter on the full URL form
684 |         if not author_id.startswith("https://openalex.org/"):
685 |             filters["author.id"] = f"https://openalex.org/{author_id}"
686 |
687 | # Build query - get more results for post-filtering if needed
688 | if peer_reviewed_only:
689 |             initial_limit = min(limit * 4, 20_000)  # Fetch up to 4x the limit so post-filtering still yields enough works
690 | else:
691 | initial_limit = limit
692 |
693 | works_query = pyalex.Works().filter(**filters)
694 |
695 | # Apply sorting
696 | if order_by == "citations":
697 | works_query = works_query.sort(cited_by_count="desc")
698 | else:
699 | works_query = works_query.sort(publication_date="desc")
700 |
701 | # Execute query using pagination to get ALL works
702 | logger.info(f"Querying OpenAlex for up to {initial_limit} works with filters: {filters}")
703 |
704 | # Use paginate() to get all works, not just the first page
705 | all_works = []
706 | pager = works_query.paginate(per_page=200, n_max=initial_limit) # Use 200 per page (API recommended)
707 |
708 | for page in pager:
709 | all_works.extend(page)
710 | if len(all_works) >= initial_limit:
711 | break
712 |
713 | works = all_works[:initial_limit] # Ensure we don't exceed the limit
714 | logger.info(f"Retrieved {len(works)} works from OpenAlex via pagination")
715 |
716 | # Apply peer-review filtering if requested
717 | if peer_reviewed_only:
718 | logger.info("Applying peer-review filtering...")
719 | works = filter_peer_reviewed_works(works)
720 | logger.info(f"After filtering: {len(works)} works remain")
721 |
722 | # Limit to requested number after filtering
723 | works = works[:limit]
724 |
725 | # Get author name for response (if available from first work)
726 | author_name = None
727 | if works:
728 | authorships = works[0].get('authorships', [])
729 | for authorship in authorships:
730 | author = authorship.get('author', {})
731 |                 if (author.get('id') or '').split('/')[-1] == author_id.split('/')[-1]:
732 | author_name = author.get('display_name')
733 | break
734 |
735 | # Convert to optimized format
736 | optimized_works = []
737 | for work_data in works:
738 | try:
739 | optimized_work = optimize_work_data(work_data)
740 | optimized_works.append(optimized_work)
741 | except Exception as e:
742 | logger.warning(f"Error optimizing work data: {e}")
743 | continue
744 |
745 | logger.info(f"Final result: {len(optimized_works)} works for author: {author_id}")
746 |
747 | return OptimizedWorksSearchResponse(
748 | author_id=author_id,
749 | author_name=author_name,
750 | total_count=len(optimized_works),
751 | results=optimized_works,
752 | filters=filters
753 | )
754 |
755 | except Exception as e:
756 | logger.error(f"Error retrieving works for author {author_id}: {e}")
757 | return OptimizedWorksSearchResponse(
758 | author_id=author_id,
759 | total_count=0,
760 | results=[],
761 | filters={}
762 | )
763 |
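   | # Illustrative usage sketch for retrieve_author_works_core() (hypothetical ID):
   | # a citation-ranked sample versus a comprehensive, minimally filtered pull.
   | #
   | # >>> sample = retrieve_author_works_core(
   | # ...     "https://openalex.org/A123456789", limit=20, order_by="citations")
   | # >>> everything = retrieve_author_works_core(
   | # ...     "https://openalex.org/A123456789",
   | # ...     journal_only=False, peer_reviewed_only=False)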
764 |
765 | @mcp.tool(
766 | annotations={
767 | "title": "Search Authors (Optimized)",
768 | "description": (
769 | "Search for authors by name with optional filters. "
770 | "Returns streamlined author data optimized for AI agents with ~70% fewer tokens. "
771 | "Includes essential info: name, ORCID, affiliations (as strings), metrics, and research fields."
772 | ),
773 | "readOnlyHint": True,
774 | "openWorldHint": True
775 | }
776 | )
777 | async def search_authors(
778 | name: str,
779 | institution: Optional[str] = None,
780 | topic: Optional[str] = None,
781 | country_code: Optional[str] = None,
782 | limit: int = 15
783 | ) -> dict:
784 | """
785 | Optimized MCP tool wrapper for searching authors.
786 |
787 | Args:
788 | name: Author name to search for.
789 | institution: (Optional) Institution name filter.
790 | topic: (Optional) Topic filter.
791 | country_code: (Optional) Country code filter.
792 | limit: Maximum number of results to return (default: 15, max: 100).
793 |
794 | Returns:
795 | dict: Serialized OptimizedSearchResponse with streamlined author data.
796 | """
797 | # Ensure reasonable limits to control token usage
798 |     limit = min(limit, 100)  # Cap at 100 to control token usage
799 |
800 | response = search_authors_core(
801 | name=name,
802 | institution=institution,
803 | topic=topic,
804 | country_code=country_code,
805 | limit=limit
806 | )
807 | return response.model_dump()
808 |
809 |
810 | @mcp.tool(
811 | annotations={
812 | "title": "Retrieve Author Works (Peer-Reviewed Only)",
813 | "description": (
814 | "Retrieve peer-reviewed journal works for a given OpenAlex Author ID. "
815 | "Automatically filters out data catalogs, preprint servers, and non-journal content. "
816 | "Returns streamlined work data optimized for AI agents with ~80% fewer tokens. "
817 | "Uses balanced filtering: excludes VizieR catalogs but allows legitimate papers without DOIs."
818 | ),
819 | "readOnlyHint": True,
820 | "openWorldHint": True
821 | }
822 | )
823 | async def retrieve_author_works(
824 | author_id: str,
825 | limit: Optional[int] = None,
826 | order_by: str = "date",
827 | publication_year: Optional[int] = None,
828 | type: Optional[str] = None,
829 | journal_only: bool = True,
830 | min_citations: Optional[int] = None,
831 | peer_reviewed_only: bool = True,
832 | ) -> dict:
833 | """
834 | Enhanced MCP tool wrapper for retrieving author works with flexible filtering.
835 |
836 | Args:
837 | author_id: OpenAlex Author ID (e.g., 'https://openalex.org/A123456789')
838 | limit: Maximum number of results (default: None = ALL works via pagination, max: 2000)
839 | order_by: Sort order - "date" for newest first, "citations" for most cited first
840 | publication_year: Filter by specific publication year
841 | type: Filter by work type (e.g., "journal-article", "letter")
842 | journal_only: If True, only return journal articles and letters (default: True)
843 | min_citations: Only return works with at least this many citations
844 | peer_reviewed_only: If True, apply balanced peer-review filters (default: True)
845 |
846 | Returns:
847 | dict: Serialized OptimizedWorksSearchResponse with author's works.
848 |
849 | Usage Patterns:
850 | # For AI validation (sample of high-impact works)
851 | retrieve_author_works(author_id, limit=20, order_by="citations")
852 |
853 | # For complete benchmark evaluation (ALL works, minimal filtering)
854 | retrieve_author_works(author_id, peer_reviewed_only=False, journal_only=False)
855 |
856 | # For peer-reviewed works only (default behavior)
857 | retrieve_author_works(author_id)
858 | """
859 | # Handle limit: None means ALL works, otherwise cap at reasonable limit
860 | logger.info(f"MCP tool received limit parameter: {limit}")
861 | if limit is None:
862 |         limit = 2000  # Cap for "all works" retrieval (matches the documented max)
863 | logger.info(f"No limit specified, setting to {limit} for comprehensive retrieval")
864 | else:
865 | limit = min(limit, 2000) # Increased max limit for comprehensive analysis
866 | logger.info(f"Explicit limit specified, capped to {limit}")
867 |
868 | response = retrieve_author_works_core(
869 | author_id=author_id,
870 | limit=limit,
871 | order_by=order_by,
872 | publication_year=publication_year,
873 | type=type,
874 | journal_only=journal_only,
875 | min_citations=min_citations,
876 | peer_reviewed_only=peer_reviewed_only,
877 | )
878 | return response.model_dump()
879 |
880 |
881 | @mcp.tool(
882 | annotations={
883 | "title": "Search Works (Optimized)",
884 | "description": (
885 | "Search for academic works with configurable search modes and optional filters. "
886 | "Returns streamlined work data optimized for AI agents with ~80% fewer tokens. "
887 | "Supports different search types: 'general' (title/abstract/fulltext), 'title' (title only), "
888 | "or 'title_and_abstract' (title and abstract only). "
889 | "Supports author, institution, publication year, and type filters. "
890 | "Automatically applies peer-review filtering to exclude data catalogs and preprints."
891 | ),
892 | "readOnlyHint": True,
893 | "openWorldHint": True
894 | }
895 | )
896 | async def search_works(
897 | query: str,
898 | author: Optional[str] = None,
899 | institution: Optional[str] = None,
900 | publication_year: Optional[int] = None,
901 | type: Optional[str] = None,
902 | limit: int = 25,
903 | peer_reviewed_only: bool = True,
904 | search_type: str = "general"
905 | ) -> dict:
906 | """
907 | Optimized MCP tool wrapper for searching works.
908 |
909 | Args:
910 | query: Search query text
911 | author: (Optional) Author name filter
912 | institution: (Optional) Institution name filter
913 | publication_year: (Optional) Publication year filter
914 | type: (Optional) Work type filter (e.g., "article", "letter")
915 | limit: Maximum number of results (default: 25, max: 100)
916 | peer_reviewed_only: If True, apply peer-review filters (default: True)
917 | search_type: Search mode - "general" (title/abstract/fulltext), "title" (title only),
918 | or "title_and_abstract" (title and abstract only)
919 |
920 | Returns:
921 | dict: Serialized OptimizedGeneralWorksSearchResponse with streamlined work data.
922 | """
923 | # Ensure reasonable limits to control token usage
924 | limit = min(limit, 100)
925 |
926 | response = search_works_core(
927 | query=query,
928 | author=author,
929 | institution=institution,
930 | publication_year=publication_year,
931 | type=type,
932 | limit=limit,
933 | peer_reviewed_only=peer_reviewed_only,
934 | search_type=search_type
935 | )
936 | return response.model_dump()
937 |
938 |
939 | @mcp.tool(
940 | annotations={
941 | "title": "Autocomplete Authors (Smart Disambiguation)",
942 | "description": (
943 | "Get multiple author candidates using OpenAlex autocomplete API for intelligent disambiguation. "
944 | "Returns a ranked list of potential author matches with institutional hints and research metrics. "
945 | "Perfect when you need to disambiguate authors and have context like institution, research area, or co-authors. "
946 | "The AI can select the best match based on the provided context. "
947 | "Much faster than full search (~200ms) and provides multiple options for better accuracy."
948 | ),
949 | "readOnlyHint": True,
950 | "openWorldHint": True
951 | }
952 | )
953 | async def autocomplete_authors(
954 | name: str,
955 | context: Optional[str] = None,
956 | limit: int = 10,
957 | filter_no_institution: bool = True,
958 | enable_institution_ranking: bool = True
959 | ) -> dict:
960 | """
961 | Enhanced autocomplete authors with intelligent filtering and ranking.
962 |
963 | Args:
964 | name: Author name to search for (e.g., "James Briscoe", "M. Ralser")
965 | context: Optional context to help with disambiguation (e.g., "Francis Crick Institute developmental biology", "Max Planck Institute Köln Germany")
966 | limit: Maximum number of candidates to return (default: 10, max: 15)
967 | filter_no_institution: If True, exclude candidates with no institutional affiliation (default: True)
968 | enable_institution_ranking: If True, rank candidates by institutional context relevance (default: True)
969 |
970 | Returns:
971 | dict: Serialized AutocompleteAuthorsResponse with filtered and ranked candidate authors, including:
972 | - openalex_id: Full OpenAlex author ID
973 | - display_name: Author's display name
974 | - institution_hint: Current/last known institution
975 | - works_count: Number of published works
976 | - cited_by_count: Total citation count
977 | - external_id: ORCID or other external identifiers
978 | - search_metadata: Information about filtering and ranking applied
979 |
980 | Example usage:
981 | # Get high-quality candidates with institutional filtering
982 | candidates = await autocomplete_authors("Ivan Matić", context="Max Planck Institute Biology Ageing Köln Germany")
983 |
984 | # For seasoned researchers, institution hints and ranking help disambiguation
985 | # AI can then select the best match or retrieve works for further verification
986 |
987 | Enhanced Features:
988 | - Filters out candidates with no institutional affiliation (reduces noise)
989 | - Institution-aware ranking when context is provided (improves accuracy)
990 | - Higher default limit (10 vs 5) for better candidate coverage
991 | - Detailed logging for debugging and optimization
992 | """
993 | # Ensure reasonable limits - increased max to 15
994 | limit = min(max(limit, 1), 15)
995 |
996 | response = autocomplete_authors_core(
997 | name=name,
998 | context=context,
999 | limit=limit,
1000 | filter_no_institution=filter_no_institution,
1001 | enable_institution_ranking=enable_institution_ranking
1002 | )
1003 | return response.model_dump()
1004 |
1005 |
1006 | # PubMed Integration Functions
1007 | import re
1008 | import requests
1009 | import xml.etree.ElementTree as ET
1010 |
1011 | def pubmed_search_core(
1012 | query: str,
1013 | max_results: int = 20,
1014 | search_type: str = "author"
1015 | ) -> dict:
1016 | """
1017 | Core PubMed search functionality using E-utilities API.
1018 |
1019 | Args:
1020 | query: Search query (author name, DOI, or keywords)
1021 | max_results: Maximum number of results to return
1022 | search_type: Type of search ("author", "doi", "title", "keywords")
1023 |
1024 | Returns:
1025 | dict with search results including PMIDs, total count, and basic metadata
1026 | """
1027 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
1028 |
1029 | try:
1030 | # Format search term based on type
1031 | if search_type == "author":
1032 | search_term = f'"{query}"[Author]'
1033 | elif search_type == "doi":
1034 | clean_doi = query.replace('https://doi.org/', '').replace('http://dx.doi.org/', '')
1035 | search_term = f'"{clean_doi}"[AID]'
1036 | elif search_type == "title":
1037 | search_term = f'"{query}"[Title]'
1038 | else: # keywords
1039 | search_term = query
1040 |
1041 | logger.info(f"🔍 PubMed search: {search_term} (max: {max_results})")
1042 |
1043 | # Search PubMed
1044 | search_url = f"{base_url}esearch.fcgi"
1045 | search_params = {
1046 | 'db': 'pubmed',
1047 | 'term': search_term,
1048 | 'retmax': max_results,
1049 | 'retmode': 'json',
1050 | 'sort': 'relevance'
1051 | }
1052 |
1053 | response = requests.get(search_url, params=search_params, timeout=10)
1054 | response.raise_for_status()
1055 | search_data = response.json()
1056 |
1057 | pmids = search_data.get('esearchresult', {}).get('idlist', [])
1058 | total_count = int(search_data.get('esearchresult', {}).get('count', 0))
1059 |
1060 | logger.info(f"📊 Found {total_count} total results, retrieved {len(pmids)} PMIDs")
1061 |
1062 | # Get basic details for retrieved PMIDs (if any)
1063 | articles = []
1064 | if pmids:
1065 |             articles = get_pubmed_summaries(pmids[:10])  # Limit to 10 for performance
1066 |
1067 | return {
1068 | 'query': query,
1069 | 'search_type': search_type,
1070 | 'search_term_used': search_term,
1071 | 'total_count': total_count,
1072 | 'retrieved_count': len(pmids),
1073 | 'pmids': pmids,
1074 | 'articles': articles,
1075 | 'search_metadata': {
1076 | 'api_used': 'pubmed_esearch',
1077 | 'max_results_requested': max_results,
1078 | 'response_time_ms': None
1079 | }
1080 | }
1081 |
1082 | except Exception as e:
1083 | logger.error(f"❌ PubMed search error: {e}")
1084 | return {
1085 | 'query': query,
1086 | 'search_type': search_type,
1087 | 'total_count': 0,
1088 | 'retrieved_count': 0,
1089 | 'pmids': [],
1090 | 'articles': [],
1091 | 'error': str(e)
1092 | }
1093 |
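   | # Illustrative sketch of the esearch terms pubmed_search_core() builds per
   | # search_type before querying esearch.fcgi (hypothetical inputs):
   | #
   | #   search_type="author"   -> '"Jane Doe"[Author]'
   | #   search_type="doi"      -> '"10.1000/example"[AID]'
   | #   search_type="title"    -> '"Some exact title"[Title]'
   | #   search_type="keywords" -> query passed through unchanged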
1094 |
1095 | def get_pubmed_summaries(pmids: list) -> list:
1096 | """
1097 | Get summary information for a list of PMIDs using esummary.
1098 |
1099 | Args:
1100 | pmids: List of PubMed IDs
1101 |
1102 | Returns:
1103 | List of article summaries with basic metadata
1104 | """
1105 | if not pmids:
1106 | return []
1107 |
1108 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
1109 |
1110 | try:
1111 | # Get summaries
1112 | summary_url = f"{base_url}esummary.fcgi"
1113 | summary_params = {
1114 | 'db': 'pubmed',
1115 | 'id': ','.join(pmids),
1116 | 'retmode': 'json'
1117 | }
1118 |
1119 | response = requests.get(summary_url, params=summary_params, timeout=15)
1120 | response.raise_for_status()
1121 | summary_data = response.json()
1122 |
1123 | articles = []
1124 | uids = summary_data.get('result', {}).get('uids', [])
1125 |
1126 | for uid in uids:
1127 | article_data = summary_data.get('result', {}).get(uid, {})
1128 | if article_data:
1129 | # Extract key information
1130 | authors = article_data.get('authors', [])
1131 | author_names = [author.get('name', '') for author in authors[:5]] # First 5 authors
1132 |
1133 | article = {
1134 | 'pmid': uid,
1135 | 'title': article_data.get('title', ''),
1136 | 'authors': author_names,
1137 | 'journal': article_data.get('fulljournalname', ''),
1138 | 'pub_date': article_data.get('pubdate', ''),
1139 | 'doi': article_data.get('elocationid', ''), # Often contains DOI
1140 | 'pmcid': article_data.get('pmcid', ''),
1141 | 'publication_types': article_data.get('pubtype', [])
1142 | }
1143 | articles.append(article)
1144 |
1145 | logger.info(f"📄 Retrieved summaries for {len(articles)} articles")
1146 | return articles
1147 |
1148 | except Exception as e:
1149 | logger.error(f"❌ Error getting PubMed summaries: {e}")
1150 | return []
1151 |
1152 |
1153 | def get_pubmed_author_sample(author_name: str, sample_size: int = 5) -> dict:
1154 | """
1155 | Get a sample of works by an author from PubMed with institutional information.
1156 |
1157 | Args:
1158 | author_name: Author name to search for
1159 | sample_size: Number of sample works to analyze in detail
1160 |
1161 | Returns:
1162 | dict with author sample analysis including affiliations and name variants
1163 | """
1164 | try:
1165 | logger.info(f"🔍 Getting PubMed author sample for: {author_name}")
1166 |
1167 | # Search for author
1168 | search_result = pubmed_search_core(author_name, max_results=sample_size, search_type="author")
1169 |
1170 | if not search_result['pmids']:
1171 | return {
1172 | 'author_name': author_name,
1173 | 'total_works': 0,
1174 | 'sample_works': [],
1175 | 'institutional_keywords': [],
1176 | 'name_variants': [],
1177 | 'email_addresses': []
1178 | }
1179 |
1180 | # Get detailed information for sample
1181 | sample_pmids = search_result['pmids'][:sample_size]
1182 | detailed_articles = []
1183 | all_affiliations = []
1184 | name_variants = set()
1185 | email_addresses = set()
1186 |
1187 | for pmid in sample_pmids:
1188 | article_details = get_detailed_pubmed_article(pmid, author_name)
1189 | if article_details:
1190 | detailed_articles.append(article_details)
1191 |
1192 | # Extract affiliations and variants for target author
1193 | for author_info in article_details.get('author_details', []):
1194 | if is_target_author(author_info, author_name):
1195 | all_affiliations.extend(author_info.get('affiliations', []))
1196 |
1197 | # Collect name variants
1198 | full_name = f"{author_info['first_name']} {author_info['last_name']}".strip()
1199 | if full_name:
1200 | name_variants.add(full_name)
1201 |
1202 | # Extract email addresses
1203 | for affil in author_info.get('affiliations', []):
1204 | emails = extract_emails_from_text(affil)
1205 | email_addresses.update(emails)
1206 |
1207 | # Extract institutional keywords
1208 | institutional_keywords = extract_institutional_keywords(all_affiliations)
1209 |
1210 | return {
1211 | 'author_name': author_name,
1212 | 'total_works': search_result['total_count'],
1213 | 'sample_works': detailed_articles,
1214 | 'institutional_keywords': institutional_keywords,
1215 | 'name_variants': list(name_variants),
1216 | 'email_addresses': list(email_addresses),
1217 | 'sample_metadata': {
1218 | 'sample_size': len(detailed_articles),
1219 | 'affiliations_found': len(all_affiliations)
1220 | }
1221 | }
1222 |
1223 | except Exception as e:
1224 | logger.error(f"❌ Error in PubMed author sample: {e}")
1225 | return {
1226 | 'author_name': author_name,
1227 | 'total_works': 0,
1228 | 'sample_works': [],
1229 | 'error': str(e)
1230 | }
1231 |
1232 |
1233 | def get_detailed_pubmed_article(pmid: str, target_author: str) -> Optional[dict]:
1234 |     """Get detailed article information including author affiliations; returns None on failure."""
1235 | base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/"
1236 |
1237 | try:
1238 | fetch_url = f"{base_url}efetch.fcgi"
1239 | fetch_params = {
1240 | 'db': 'pubmed',
1241 | 'id': pmid,
1242 | 'retmode': 'xml',
1243 | 'rettype': 'abstract'
1244 | }
1245 |
1246 | response = requests.get(fetch_url, params=fetch_params, timeout=10)
1247 | response.raise_for_status()
1248 |
1249 | # Parse XML
1250 | root = ET.fromstring(response.text)
1251 | article = root.find('.//PubmedArticle')
1252 |
1253 | if article is None:
1254 | return None
1255 |
1256 | # Extract basic info
1257 | title_elem = article.find('.//ArticleTitle')
1258 | title = ''.join(title_elem.itertext()).strip() if title_elem is not None else ''
1259 |
1260 | journal_elem = article.find('.//Journal/Title')
1261 | journal = journal_elem.text if journal_elem is not None else ''
1262 |
1263 | # Extract authors with affiliations
1264 | author_details = []
1265 | author_list = article.find('.//AuthorList')
1266 | if author_list is not None:
1267 | for author_elem in author_list.findall('Author'):
1268 | author_info = extract_detailed_author_info(author_elem)
1269 | author_details.append(author_info)
1270 |
1271 | return {
1272 | 'pmid': pmid,
1273 | 'title': title,
1274 | 'journal': journal,
1275 | 'author_details': author_details
1276 | }
1277 |
1278 | except Exception as e:
1279 | logger.error(f"❌ Error fetching detailed article {pmid}: {e}")
1280 | return None
1281 |
1282 |
1283 | def extract_detailed_author_info(author_elem: ET.Element) -> dict:
1284 | """Extract detailed author information from XML element"""
1285 | author_info = {
1286 | 'last_name': '',
1287 | 'first_name': '',
1288 | 'initials': '',
1289 | 'affiliations': []
1290 | }
1291 |
1292 | try:
1293 | last_name = author_elem.find('LastName')
1294 | if last_name is not None:
1295 | author_info['last_name'] = last_name.text or ''
1296 |
1297 | first_name = author_elem.find('ForeName')
1298 | if first_name is not None:
1299 | author_info['first_name'] = first_name.text or ''
1300 |
1301 | initials = author_elem.find('Initials')
1302 | if initials is not None:
1303 | author_info['initials'] = initials.text or ''
1304 |
1305 |         # Get affiliations (an author may carry several AffiliationInfo blocks)
1306 |         for affil_info in author_elem.findall('AffiliationInfo'):
1307 |             for affil in affil_info.findall('Affiliation'):
1308 |                 if affil.text:
1309 |                     author_info['affiliations'].append(affil.text.strip())
1310 |
1311 |     except Exception as e:
1312 |         # Tolerate malformed author entries, but leave a debugging trail
1313 |         logger.debug(f"Error parsing author element: {e}")
1314 |
1315 | return author_info
1316 |
1317 |
1318 | def is_target_author(author_info: dict, target_name: str) -> bool:
1319 | """Check if author_info matches target author name"""
1320 | full_name = f"{author_info['first_name']} {author_info['last_name']}".strip().lower()
1321 | target_lower = target_name.lower()
1322 |
1323 | # Simple similarity check
1324 | return (target_lower in full_name or
1325 | full_name in target_lower or
1326 | author_info['last_name'].lower() in target_lower)
1327 |
1328 |
1329 | def extract_emails_from_text(text: str) -> list:
1330 |     """Extract email addresses from text"""
1331 |     # Character class [A-Za-z] keeps the TLD to letters only
1332 |     email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
1333 |     return re.findall(email_pattern, text)
1334 |
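   | # Illustrative doctest for extract_emails_from_text() (made-up address):
   | #
   | # >>> extract_emails_from_text("Dept. of Biology, Example Univ.; jane.doe@example.org")
   | # ['jane.doe@example.org']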
1335 |
1336 | def extract_institutional_keywords(affiliations: list) -> list:
1337 | """Extract common institutional keywords from affiliations"""
1338 | if not affiliations:
1339 | return []
1340 |
1341 | # Combine all affiliations
1342 | all_text = ' '.join(affiliations).lower()
1343 |
1344 | # Common institutional keywords
1345 | keywords = []
1346 | institutional_terms = [
1347 | 'university', 'institute', 'college', 'school', 'center', 'centre',
1348 | 'hospital', 'laboratory', 'department', 'faculty', 'division',
1349 | 'max planck', 'harvard', 'stanford', 'mit', 'cambridge', 'oxford',
1350 | 'excellence cluster', 'cnrs', 'inserm', 'nih'
1351 | ]
1352 |
1353 | for term in institutional_terms:
1354 | if term in all_text:
1355 | keywords.append(term)
1356 |
1357 | return keywords[:10] # Return top 10
1358 |
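   | # Illustrative sketch for extract_institutional_keywords() with hypothetical
   | # affiliation strings; matches follow the order of institutional_terms above.
   | #
   | # >>> extract_institutional_keywords(
   | # ...     ["Department of Genetics, Example University, Boston",
   | # ...      "Max Planck Institute for Biology of Ageing"])
   | # ['university', 'institute', 'department', 'max planck']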
1359 |
1360 | @mcp.tool(
1361 | annotations={
1362 | "title": "Search PubMed",
1363 | "description": (
1364 | "Search PubMed database for publications by author, DOI, title, or keywords. "
1365 | "Provides basic article metadata including authors, journal, and publication info. "
1366 | "Useful for cross-validation with OpenAlex data and discovering name variants."
1367 | ),
1368 | "readOnlyHint": True,
1369 | "openWorldHint": True
1370 | }
1371 | )
1372 | async def search_pubmed(
1373 | query: str,
1374 | search_type: str = "author",
1375 | max_results: int = 20
1376 | ) -> dict:
1377 | """
1378 | Search PubMed database for publications.
1379 |
1380 | Args:
1381 | query: Search query (author name, DOI, title, or keywords)
1382 | search_type: Type of search - "author", "doi", "title", or "keywords" (default: "author")
1383 | max_results: Maximum number of results to return (default: 20, max: 50)
1384 |
1385 | Returns:
1386 | dict: Search results with PMIDs, article metadata, and summary statistics
1387 |
1388 | Example usage:
1389 | # Search for author
1390 | search_pubmed("Ivan Matic", search_type="author", max_results=10)
1391 |
1392 | # Search by DOI
1393 | search_pubmed("10.1038/nprot.2009.36", search_type="doi")
1394 |
1395 | # Search by keywords
1396 | search_pubmed("ADP-ribosylation DNA repair", search_type="keywords")
1397 | """
1398 | # Validate parameters
1399 | max_results = min(max(max_results, 1), 50) # Cap at 50 for performance
1400 | valid_types = ["author", "doi", "title", "keywords"]
1401 | if search_type not in valid_types:
1402 | search_type = "author"
1403 |
1404 | logger.info(f"🔍 PubMed search: '{query}' (type: {search_type}, max: {max_results})")
1405 |
1406 | result = pubmed_search_core(query, max_results, search_type)
1407 | return result
1408 |
1409 |
1410 | @mcp.tool(
1411 | annotations={
1412 | "title": "PubMed Author Sample",
1413 | "description": (
1414 | "Get a detailed sample of works by an author from PubMed including "
1415 | "institutional affiliations, name variants, and email addresses. "
1416 | "Useful for cross-validation and institutional disambiguation."
1417 | ),
1418 | "readOnlyHint": True,
1419 | "openWorldHint": True
1420 | }
1421 | )
1422 | async def pubmed_author_sample(
1423 | author_name: str,
1424 | sample_size: int = 5
1425 | ) -> dict:
1426 | """
1427 | Get detailed author sample from PubMed with institutional information.
1428 |
1429 | Args:
1430 | author_name: Author name to search for (e.g., "Ivan Matic", "J Smith")
1431 | sample_size: Number of recent works to analyze in detail (default: 5, max: 10)
1432 |
1433 | Returns:
1434 | dict: Author analysis including:
1435 | - total_works: Total number of works found in PubMed
1436 | - sample_works: Detailed information for sample works
1437 | - institutional_keywords: Common institutional terms found
1438 | - name_variants: Different name formats found
1439 | - email_addresses: Email addresses extracted from affiliations
1440 |
1441 | Example usage:
1442 | # Get institutional profile for author
1443 | pubmed_author_sample("Ivan Matic", sample_size=5)
1444 | """
1445 | # Validate parameters
1446 | sample_size = min(max(sample_size, 1), 10) # Cap at 10 for performance
1447 |
1448 | logger.info(f"🔍 PubMed author sample: '{author_name}' (sample: {sample_size})")
1449 |
1450 | result = get_pubmed_author_sample(author_name, sample_size)
1451 | return result
1452 |
1453 |
1454 | # ============================================================================
1455 | # ORCID Integration Functions
1456 | # ============================================================================
1457 |
1458 | async def search_orcid_by_name(name: str, affiliation: Optional[str] = None, max_results: int = 10) -> dict:
1459 | """
1460 | Search ORCID by author name and optionally affiliation.
1461 |
1462 | Args:
1463 | name: Author name to search
1464 | affiliation: Optional affiliation to help disambiguation
1465 | max_results: Maximum number of results to return
1466 |
1467 | Returns:
1468 | dict: ORCID search results with author profiles
1469 | """
1470 | try:
1471 | # ORCID Public API search endpoint
1472 | base_url = "https://pub.orcid.org/v3.0/search"
1473 |
1474 | # Build search query
1475 | query_parts = []
1476 | if name:
1477 | # Split name into parts for better matching
1478 | name_parts = name.replace(",", "").split()
1479 | if len(name_parts) >= 2:
1480 | # Assume last part is family name, rest are given names
1481 | family_name = name_parts[-1]
1482 | given_names = " ".join(name_parts[:-1])
1483 | query_parts.append(f'family-name:"{family_name}"')
1484 | query_parts.append(f'given-names:"{given_names}"')
1485 | else:
1486 | query_parts.append(f'text:"{name}"')
1487 |
1488 | if affiliation:
1489 | query_parts.append(f'affiliation-org-name:"{affiliation}"')
1490 |
1491 | query = " AND ".join(query_parts)
1492 |
1493 | params = {
1494 | 'q': query,
1495 | 'rows': min(max_results, 50), # ORCID API limit
1496 | 'start': 0
1497 | }
1498 |
1499 | headers = {
1500 | 'Accept': 'application/json',
1501 | 'User-Agent': f'alex-mcp (+{get_config()["OPENALEX_MAILTO"]})'
1502 | }
1503 |
1504 | logger.info(f"🔍 ORCID search: '{query}' (max: {max_results})")
1505 |
1506 | async with aiohttp.ClientSession() as session:
1507 | async with session.get(base_url, params=params, headers=headers) as response:
1508 | if response.status == 200:
1509 | data = await response.json()
1510 |
1511 | results = []
1512 | for result in data.get('result', []):
1513 | orcid_id = result.get('orcid-identifier', {}).get('path', '')
1514 |
1515 | # Extract name information
1516 | person = result.get('person', {})
1517 | names = person.get('name', {})
1518 | given_names = names.get('given-names', {}).get('value', '') if names.get('given-names') else ''
1519 | family_name = names.get('family-name', {}).get('value', '') if names.get('family-name') else ''
1520 |
1521 | # Extract employment/affiliation info
1522 | employments = []
1523 | employment_summaries = result.get('employment-summary', [])
1524 | for emp in employment_summaries[:3]: # Limit to top 3
1525 | org_name = emp.get('organization', {}).get('name', '')
1526 | if org_name:
1527 | employments.append(org_name)
1528 |
1529 | results.append({
1530 | 'orcid_id': orcid_id,
1531 | 'orcid_url': f'https://orcid.org/{orcid_id}' if orcid_id else '',
1532 | 'given_names': given_names,
1533 | 'family_name': family_name,
1534 | 'full_name': f"{given_names} {family_name}".strip(),
1535 | 'employments': employments,
1536 | 'relevance_score': result.get('relevance-score', {}).get('value', 0)
1537 | })
1538 |
1539 | logger.info(f"📊 Found {len(results)} ORCID profiles")
1540 |
1541 | return {
1542 | 'total_found': data.get('num-found', 0),
1543 | 'results_returned': len(results),
1544 | 'results': results
1545 | }
1546 | else:
1547 | logger.warning(f"ORCID API error: {response.status}")
1548 | return {'total_found': 0, 'results_returned': 0, 'results': [], 'error': f'HTTP {response.status}'}
1549 |
1550 | except Exception as e:
1551 | logger.error(f"ORCID search error: {str(e)}")
1552 | return {'total_found': 0, 'results_returned': 0, 'results': [], 'error': str(e)}
1553 |
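   | # Illustrative sketch of the Solr-style query assembled by search_orcid_by_name()
   | # (hypothetical names):
   | #
   | #   name="Maria Garcia", affiliation="University of Barcelona" ->
   | #     family-name:"Garcia" AND given-names:"Maria" AND affiliation-org-name:"University of Barcelona"
   | #   name="Plato" (single token) -> text:"Plato"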
1554 |
1555 | async def get_orcid_works(orcid_id: str, max_works: int = 20) -> dict:
1556 | """
1557 | Get works/publications for a specific ORCID ID.
1558 |
1559 | Args:
1560 | orcid_id: ORCID identifier (e.g., "0000-0000-0000-0000")
1561 | max_works: Maximum number of works to retrieve
1562 |
1563 | Returns:
1564 | dict: Works information from ORCID profile
1565 | """
1566 | try:
1567 | # Clean ORCID ID (remove URL if present)
1568 | clean_orcid = orcid_id.replace('https://orcid.org/', '').replace('http://orcid.org/', '')
1569 | if not re.match(r'^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$', clean_orcid):
1570 | return {'error': 'Invalid ORCID format', 'works': []}
1571 |
1572 | # ORCID Public API works endpoint
1573 | url = f"https://pub.orcid.org/v3.0/{clean_orcid}/works"
1574 |
1575 | headers = {
1576 | 'Accept': 'application/json',
1577 | 'User-Agent': f'alex-mcp (+{get_config()["OPENALEX_MAILTO"]})'
1578 | }
1579 |
1580 | logger.info(f"🔍 Getting ORCID works: {clean_orcid} (max: {max_works})")
1581 |
1582 | async with aiohttp.ClientSession() as session:
1583 | async with session.get(url, headers=headers) as response:
1584 | if response.status == 200:
1585 | data = await response.json()
1586 |
1587 | works = []
1588 | work_summaries = data.get('group', [])[:max_works]
1589 |
1590 | for group in work_summaries:
1591 | for work_summary in group.get('work-summary', []):
1592 | title_info = work_summary.get('title', {})
1593 | title = title_info.get('title', {}).get('value', '') if title_info else ''
1594 |
1595 | journal_title = work_summary.get('journal-title', {}).get('value', '') if work_summary.get('journal-title') else ''
1596 |
1597 | # Extract publication date
1598 | pub_date = work_summary.get('publication-date')
1599 | pub_year = ''
1600 | if pub_date and pub_date.get('year'):
1601 | pub_year = pub_date['year'].get('value', '')
1602 |
1603 | # Extract external IDs (DOI, PMID, etc.)
1604 | external_ids = {}
1605 | for ext_id in work_summary.get('external-ids', {}).get('external-id', []):
1606 | id_type = ext_id.get('external-id-type', '')
1607 | id_value = ext_id.get('external-id-value', '')
1608 | if id_type and id_value:
1609 | external_ids[id_type.lower()] = id_value
1610 |
1611 | works.append({
1612 | 'title': title,
1613 | 'journal': journal_title,
1614 | 'publication_year': pub_year,
1615 | 'external_ids': external_ids,
1616 | 'doi': external_ids.get('doi', ''),
1617 | 'pmid': external_ids.get('pmid', ''),
1618 | 'type': work_summary.get('type', '')
1619 | })
1620 |
1621 | logger.info(f"📊 Retrieved {len(works)} works from ORCID")
1622 |
1623 | return {
1624 | 'orcid_id': clean_orcid,
1625 | 'total_works': len(works),
1626 | 'works': works
1627 | }
1628 | else:
1629 | logger.warning(f"ORCID works API error: {response.status}")
1630 | return {'error': f'HTTP {response.status}', 'works': []}
1631 |
1632 | except Exception as e:
1633 | logger.error(f"ORCID works error: {str(e)}")
1634 | return {'error': str(e), 'works': []}
1635 |
1636 |
1637 | # ============================================================================
1638 | # ORCID MCP Tools
1639 | # ============================================================================
1640 |
1641 | @mcp.tool(
1642 | annotations={
1643 | "title": "Search ORCID Authors",
1644 | "description": (
1645 | "Search ORCID database for author profiles by name and optionally affiliation. "
1646 | "Provides ORCID IDs, verified names, and institutional affiliations for "
1647 | "enhanced author disambiguation and verification."
1648 | ),
1649 | "readOnlyHint": True,
1650 | "openWorldHint": True
1651 | }
1652 | )
1653 | async def search_orcid_authors(
1654 | name: str,
1655 |     affiliation: Optional[str] = None,
1656 | max_results: int = 10
1657 | ) -> dict:
1658 | """
1659 | Search ORCID for author profiles by name and affiliation.
1660 |
1661 | Args:
1662 | name: Author name to search (e.g., "John Smith", "Maria Garcia")
1663 | affiliation: Optional institutional affiliation for disambiguation
1664 | max_results: Maximum number of results to return (default: 10, max: 50)
1665 |
1666 | Returns:
1667 | dict: ORCID search results with:
1668 | - total_found: Total number of matches found
1669 | - results_returned: Number of results returned
1670 | - results: List of author profiles with ORCID IDs, names, and affiliations
1671 |
1672 | Example usage:
1673 | # Basic name search
1674 | search_orcid_authors("John Smith")
1675 |
1676 | # Search with affiliation for better disambiguation
1677 | search_orcid_authors("Maria Garcia", "University of Barcelona")
1678 | """
1679 | # Validate parameters
1680 | max_results = min(max(max_results, 1), 50) # ORCID API limit
1681 |
1682 | result = await search_orcid_by_name(name, affiliation, max_results)
1683 | return result
1684 |
1685 |
1686 | @mcp.tool(
1687 | annotations={
1688 | "title": "Get ORCID Works",
1689 | "description": (
1690 | "Retrieve publications/works from a specific ORCID profile. "
1691 | "Useful for cross-validation with OpenAlex data and verifying "
1692 | "author publication records."
1693 | ),
1694 | "readOnlyHint": True,
1695 | "openWorldHint": True
1696 | }
1697 | )
1698 | async def get_orcid_publications(
1699 | orcid_id: str,
1700 | max_works: int = 20
1701 | ) -> dict:
1702 | """
1703 | Get publications/works from an ORCID profile.
1704 |
1705 | Args:
1706 | orcid_id: ORCID identifier (e.g., "0000-0000-0000-0000" or full URL)
1707 | max_works: Maximum number of works to retrieve (default: 20, max: 100)
1708 |
1709 | Returns:
1710 | dict: Publications data with:
1711 | - orcid_id: Cleaned ORCID identifier
1712 | - total_works: Number of works found
1713 | - works: List of publications with titles, journals, DOIs, PMIDs
1714 |
1715 | Example usage:
1716 | # Get works for specific ORCID
1717 | get_orcid_publications("0000-0000-0000-0000")
1718 |
1719 | # Get limited number of works
1720 | get_orcid_publications("0000-0000-0000-0000", max_works=10)
1721 | """
1722 | # Validate parameters
1723 | max_works = min(max(max_works, 1), 100) # Reasonable limit
1724 |
1725 | result = await get_orcid_works(orcid_id, max_works)
1726 | return result
1727 |
1728 |
1729 | def main():
1730 | """
1731 | Entry point for the enhanced alex-mcp server with balanced peer-review filtering.
1732 | """
1733 |     logger.info("Enhanced OpenAlex Author Disambiguation MCP Server starting...")
1734 |     logger.info("Features: ~70% token reduction for authors, ~80% for works")
1735 |     logger.info("Balanced peer-review filtering: excludes data catalogs while preserving legitimate papers")
1736 |     # FastMCP's run() starts and manages its own event loop; call it directly
1737 |     mcp.run()
1738 |
1739 |
1740 | if __name__ == "__main__":
1741 | main()
--------------------------------------------------------------------------------