├── .python-version
├── src
│   └── dataset_viewer
│       ├── __init__.py
│       └── server.py
├── pyproject.toml
├── .gitignore
├── LICENSE
├── README.md
└── uv.lock

/.python-version:
--------------------------------------------------------------------------------
1 | 3.12
2 |
--------------------------------------------------------------------------------
/src/dataset_viewer/__init__.py:
--------------------------------------------------------------------------------
1 | from . import server
2 | import asyncio
3 |
4 | def main():
5 |     """Main entry point for the package."""
6 |     asyncio.run(server.main())
7 |
8 | # Expose important items at package level
9 | __all__ = ['main', 'server']
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "dataset-viewer"
3 | version = "0.1.0"
4 | description = "MCP server for interacting with Hugging Face dataset viewer API, providing dataset browsing, filtering, and statistics capabilities"
5 | readme = "README.md"
6 | requires-python = ">=3.12"
7 | dependencies = [
8 |     "mcp>=1.1.2",
9 |     "httpx>=0.28.1",
10 | ]
11 |
12 | [[project.authors]]
13 | name = "privetin"
14 | email = "81558906+privetin@users.noreply.github.com"
15 |
16 | [build-system]
17 | requires = ["hatchling"]
18 | build-backend = "hatchling.build"
19 |
20 | [project.scripts]
21 | dataset-viewer = "dataset_viewer:main"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python-generated files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | build/
6 | dist/
7 | wheels/
8 | *.egg-info/
9 | .eggs/
10 | *.so
11 | MANIFEST
12 |
13 | # Virtual environments
14 | .venv
15 | venv/
16 | ENV/
17 | env/
18 | .env/
19 | .python-version
20 |
21 | # IDE settings
22 | .idea/
23 | .vscode/
24 | *.swp
25 | *.swo
26 | .project
27 | .pydevproject
28 | .settings/
29 | *.sublime-project
30 | *.sublime-workspace
31 |
32 | # Testing
33 | .coverage
34 | coverage.xml
35 | *.cover
36 | .pytest_cache/
37 | .tox/
38 | nosetests.xml
39 | htmlcov/
40 | .hypothesis/
41 | .coverage.*
42 |
43 | # Documentation
44 | docs/_build/
45 | site/
46 | docs/generated/
47 |
48 | # Jupyter Notebook
49 | .ipynb_checkpoints
50 | *.ipynb
51 |
52 | # Environment variables
53 | .env
54 | .env.local
55 | .env*.local
56 | *.env
57 |
58 | # OS-specific files
59 | .DS_Store
60 | Thumbs.db
61 | Desktop.ini
62 | $RECYCLE.BIN/
63 |
64 | # Logs and databases
65 | *.log
66 | *.sqlite
67 | *.db
68 |
69 | # Local development
70 | local_settings.py
71 | db.sqlite3
72 | media/
73 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Dataset Viewer Contributors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or
substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataset Viewer MCP Server 2 | 3 | An MCP server for interacting with the [Hugging Face Dataset Viewer API](https://huggingface.co/docs/dataset-viewer), providing capabilities to browse and analyze datasets hosted on the Hugging Face Hub. 4 | 5 | ## Features 6 | 7 | ### Resources 8 | 9 | - Uses `dataset://` URI scheme for accessing Hugging Face datasets 10 | - Supports dataset configurations and splits 11 | - Provides paginated access to dataset contents 12 | - Handles authentication for private datasets 13 | - Supports searching and filtering dataset contents 14 | - Provides dataset statistics and analysis 15 | 16 | ### Tools 17 | 18 | The server provides the following tools: 19 | 20 | 1. **validate** 21 | - Check if a dataset exists and is accessible 22 | - Parameters: 23 | - `dataset`: Dataset identifier (e.g. 'stanfordnlp/imdb') 24 | - `auth_token` (optional): For private datasets 25 | 26 | 2. **get_info** 27 | - Get detailed information about a dataset 28 | - Parameters: 29 | - `dataset`: Dataset identifier 30 | - `auth_token` (optional): For private datasets 31 | 32 | 3. **get_rows** 33 | - Get paginated contents of a dataset 34 | - Parameters: 35 | - `dataset`: Dataset identifier 36 | - `config`: Configuration name 37 | - `split`: Split name 38 | - `page` (optional): Page number (0-based) 39 | - `auth_token` (optional): For private datasets 40 | 41 | 4. **get_first_rows** 42 | - Get first rows from a dataset split 43 | - Parameters: 44 | - `dataset`: Dataset identifier 45 | - `config`: Configuration name 46 | - `split`: Split name 47 | - `auth_token` (optional): For private datasets 48 | 49 | 5. **get_statistics** 50 | - Get statistics about a dataset split 51 | - Parameters: 52 | - `dataset`: Dataset identifier 53 | - `config`: Configuration name 54 | - `split`: Split name 55 | - `auth_token` (optional): For private datasets 56 | 57 | 6. **search_dataset** 58 | - Search for text within a dataset 59 | - Parameters: 60 | - `dataset`: Dataset identifier 61 | - `config`: Configuration name 62 | - `split`: Split name 63 | - `query`: Text to search for 64 | - `auth_token` (optional): For private datasets 65 | 66 | 7. **filter** 67 | - Filter rows using SQL-like conditions 68 | - Parameters: 69 | - `dataset`: Dataset identifier 70 | - `config`: Configuration name 71 | - `split`: Split name 72 | - `where`: SQL WHERE clause (e.g. "score > 0.5") 73 | - `orderby` (optional): SQL ORDER BY clause 74 | - `page` (optional): Page number (0-based) 75 | - `auth_token` (optional): For private datasets 76 | 77 | 8. 
**get_parquet**
78 |    - Download a dataset's Parquet export files
79 |    - Parameters:
80 |      - `dataset`: Dataset identifier
81 |      - `auth_token` (optional): For private datasets
82 |
83 | ## Installation
84 |
85 | ### Prerequisites
86 |
87 | - Python 3.12 or higher
88 | - [uv](https://github.com/astral-sh/uv) - Fast Python package installer and resolver
89 |
90 | ### Setup
91 |
92 | 1. Clone the repository:
93 | ```bash
94 | git clone https://github.com/privetin/dataset-viewer.git
95 | cd dataset-viewer
96 | ```
97 |
98 | 2. Create a virtual environment and install:
99 | ```bash
100 | # Create virtual environment
101 | uv venv
102 |
103 | # Activate virtual environment
104 | # On Unix:
105 | source .venv/bin/activate
106 | # On Windows:
107 | .venv\Scripts\activate
108 |
109 | # Install in development mode
110 | uv pip install -e .
111 | ```
112 |
113 | ## Configuration
114 |
115 | ### Environment Variables
116 |
117 | - `HUGGINGFACE_TOKEN`: Your Hugging Face API token for accessing private datasets
118 |
119 | ### Claude Desktop Integration
120 |
121 | Add the following to your Claude Desktop config file:
122 |
123 | On Windows: `%APPDATA%\Claude\claude_desktop_config.json`
124 |
125 | On macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`
126 |
127 | ```json
128 | {
129 |   "mcpServers": {
130 |     "dataset-viewer": {
131 |       "command": "uv",
132 |       "args": [
133 |         "--directory",
134 |         "path/to/dataset-viewer",
135 |         "run",
136 |         "dataset-viewer"
137 |       ]
138 |     }
139 |   }
140 | }
141 | ```
142 |
143 | ## License
144 |
145 | MIT License - see [LICENSE](LICENSE) for details
--------------------------------------------------------------------------------
/uv.lock:
--------------------------------------------------------------------------------
1 | version = 1
2 | requires-python = ">=3.12"
3 |
4 | [[package]]
5 | name = "annotated-types"
6 | version = "0.7.0"
7 | source = { registry = "https://pypi.org/simple" }
8 | sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 }
9 | wheels = [
10 |     { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 },
11 | ]
12 |
13 | [[package]]
14 | name = "anyio"
15 | version = "4.7.0"
16 | source = { registry = "https://pypi.org/simple" }
17 | dependencies = [
18 |     { name = "idna" },
19 |     { name = "sniffio" },
20 |     { name = "typing-extensions", marker = "python_full_version < '3.13'" },
21 | ]
22 | sdist = { url = "https://files.pythonhosted.org/packages/f6/40/318e58f669b1a9e00f5c4453910682e2d9dd594334539c7b7817dabb765f/anyio-4.7.0.tar.gz", hash = "sha256:2f834749c602966b7d456a7567cafcb309f96482b5081d14ac93ccd457f9dd48", size = 177076 }
23 | wheels = [
24 |     { url = "https://files.pythonhosted.org/packages/a0/7a/4daaf3b6c08ad7ceffea4634ec206faeff697526421c20f07628c7372156/anyio-4.7.0-py3-none-any.whl", hash = "sha256:ea60c3723ab42ba6fff7e8ccb0488c898ec538ff4df1f1d5e642c3601d07e352", size = 93052 },
25 | ]
26 |
27 | [[package]]
28 | name = "certifi"
29 | version = "2024.12.14"
30 | source = { registry = "https://pypi.org/simple" }
31 | sdist = { url =
"https://files.pythonhosted.org/packages/0f/bd/1d41ee578ce09523c81a15426705dd20969f5abf006d1afe8aeff0dd776a/certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db", size = 166010 } 32 | wheels = [ 33 | { url = "https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, 34 | ] 35 | 36 | [[package]] 37 | name = "dataset-viewer" 38 | version = "0.1.0" 39 | source = { editable = "." } 40 | dependencies = [ 41 | { name = "httpx" }, 42 | { name = "mcp" }, 43 | ] 44 | 45 | [package.metadata] 46 | requires-dist = [ 47 | { name = "httpx", specifier = ">=0.28.1" }, 48 | { name = "mcp", specifier = ">=1.1.2" }, 49 | ] 50 | 51 | [[package]] 52 | name = "h11" 53 | version = "0.14.0" 54 | source = { registry = "https://pypi.org/simple" } 55 | sdist = { url = "https://files.pythonhosted.org/packages/f5/38/3af3d3633a34a3316095b39c8e8fb4853a28a536e55d347bd8d8e9a14b03/h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", size = 100418 } 56 | wheels = [ 57 | { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, 58 | ] 59 | 60 | [[package]] 61 | name = "httpcore" 62 | version = "1.0.7" 63 | source = { registry = "https://pypi.org/simple" } 64 | dependencies = [ 65 | { name = "certifi" }, 66 | { name = "h11" }, 67 | ] 68 | sdist = { url = "https://files.pythonhosted.org/packages/6a/41/d7d0a89eb493922c37d343b607bc1b5da7f5be7e383740b4753ad8943e90/httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c", size = 85196 } 69 | wheels = [ 70 | { url = "https://files.pythonhosted.org/packages/87/f5/72347bc88306acb359581ac4d52f23c0ef445b57157adedb9aee0cd689d2/httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd", size = 78551 }, 71 | ] 72 | 73 | [[package]] 74 | name = "httpx" 75 | version = "0.28.1" 76 | source = { registry = "https://pypi.org/simple" } 77 | dependencies = [ 78 | { name = "anyio" }, 79 | { name = "certifi" }, 80 | { name = "httpcore" }, 81 | { name = "idna" }, 82 | ] 83 | sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 } 84 | wheels = [ 85 | { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, 86 | ] 87 | 88 | [[package]] 89 | name = "httpx-sse" 90 | version = "0.4.0" 91 | source = { registry = "https://pypi.org/simple" } 92 | sdist = { url = "https://files.pythonhosted.org/packages/4c/60/8f4281fa9bbf3c8034fd54c0e7412e66edbab6bc74c4996bd616f8d0406e/httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721", size = 12624 } 93 | wheels = [ 94 | { url = "https://files.pythonhosted.org/packages/e1/9b/a181f281f65d776426002f330c31849b86b31fc9d848db62e16f03ff739f/httpx_sse-0.4.0-py3-none-any.whl", hash = 
"sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f", size = 7819 }, 95 | ] 96 | 97 | [[package]] 98 | name = "idna" 99 | version = "3.10" 100 | source = { registry = "https://pypi.org/simple" } 101 | sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } 102 | wheels = [ 103 | { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, 104 | ] 105 | 106 | [[package]] 107 | name = "mcp" 108 | version = "1.1.2" 109 | source = { registry = "https://pypi.org/simple" } 110 | dependencies = [ 111 | { name = "anyio" }, 112 | { name = "httpx" }, 113 | { name = "httpx-sse" }, 114 | { name = "pydantic" }, 115 | { name = "sse-starlette" }, 116 | { name = "starlette" }, 117 | ] 118 | sdist = { url = "https://files.pythonhosted.org/packages/9b/f3/5cf212e60681ea6da0dbb6e0d1bc0ab2dbf5eebc749b69663d46f114fea1/mcp-1.1.2.tar.gz", hash = "sha256:694aa9df7a8641b24953c935eb72c63136dc948981021525a0add199bdfee402", size = 57628 } 119 | wheels = [ 120 | { url = "https://files.pythonhosted.org/packages/df/40/9883eac3718b860d4006eba1920bfcb628f0a1fe37fac46a4f4e391edca6/mcp-1.1.2-py3-none-any.whl", hash = "sha256:a4d32d60fd80a1702440ba4751b847a8a88957a1f7b059880953143e9759965a", size = 36652 }, 121 | ] 122 | 123 | [[package]] 124 | name = "pydantic" 125 | version = "2.10.4" 126 | source = { registry = "https://pypi.org/simple" } 127 | dependencies = [ 128 | { name = "annotated-types" }, 129 | { name = "pydantic-core" }, 130 | { name = "typing-extensions" }, 131 | ] 132 | sdist = { url = "https://files.pythonhosted.org/packages/70/7e/fb60e6fee04d0ef8f15e4e01ff187a196fa976eb0f0ab524af4599e5754c/pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06", size = 762094 } 133 | wheels = [ 134 | { url = "https://files.pythonhosted.org/packages/f3/26/3e1bbe954fde7ee22a6e7d31582c642aad9e84ffe4b5fb61e63b87cd326f/pydantic-2.10.4-py3-none-any.whl", hash = "sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d", size = 431765 }, 135 | ] 136 | 137 | [[package]] 138 | name = "pydantic-core" 139 | version = "2.27.2" 140 | source = { registry = "https://pypi.org/simple" } 141 | dependencies = [ 142 | { name = "typing-extensions" }, 143 | ] 144 | sdist = { url = "https://files.pythonhosted.org/packages/fc/01/f3e5ac5e7c25833db5eb555f7b7ab24cd6f8c322d3a3ad2d67a952dc0abc/pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39", size = 413443 } 145 | wheels = [ 146 | { url = "https://files.pythonhosted.org/packages/d6/74/51c8a5482ca447871c93e142d9d4a92ead74de6c8dc5e66733e22c9bba89/pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0", size = 1893127 }, 147 | { url = "https://files.pythonhosted.org/packages/d3/f3/c97e80721735868313c58b89d2de85fa80fe8dfeeed84dc51598b92a135e/pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef", size = 1811340 }, 148 | { url = 
"https://files.pythonhosted.org/packages/9e/91/840ec1375e686dbae1bd80a9e46c26a1e0083e1186abc610efa3d9a36180/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7", size = 1822900 }, 149 | { url = "https://files.pythonhosted.org/packages/f6/31/4240bc96025035500c18adc149aa6ffdf1a0062a4b525c932065ceb4d868/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934", size = 1869177 }, 150 | { url = "https://files.pythonhosted.org/packages/fa/20/02fbaadb7808be578317015c462655c317a77a7c8f0ef274bc016a784c54/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6", size = 2038046 }, 151 | { url = "https://files.pythonhosted.org/packages/06/86/7f306b904e6c9eccf0668248b3f272090e49c275bc488a7b88b0823444a4/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c", size = 2685386 }, 152 | { url = "https://files.pythonhosted.org/packages/8d/f0/49129b27c43396581a635d8710dae54a791b17dfc50c70164866bbf865e3/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2", size = 1997060 }, 153 | { url = "https://files.pythonhosted.org/packages/0d/0f/943b4af7cd416c477fd40b187036c4f89b416a33d3cc0ab7b82708a667aa/pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4", size = 2004870 }, 154 | { url = "https://files.pythonhosted.org/packages/35/40/aea70b5b1a63911c53a4c8117c0a828d6790483f858041f47bab0b779f44/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3", size = 1999822 }, 155 | { url = "https://files.pythonhosted.org/packages/f2/b3/807b94fd337d58effc5498fd1a7a4d9d59af4133e83e32ae39a96fddec9d/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4", size = 2130364 }, 156 | { url = "https://files.pythonhosted.org/packages/fc/df/791c827cd4ee6efd59248dca9369fb35e80a9484462c33c6649a8d02b565/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57", size = 2158303 }, 157 | { url = "https://files.pythonhosted.org/packages/9b/67/4e197c300976af185b7cef4c02203e175fb127e414125916bf1128b639a9/pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc", size = 1834064 }, 158 | { url = "https://files.pythonhosted.org/packages/1f/ea/cd7209a889163b8dcca139fe32b9687dd05249161a3edda62860430457a5/pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9", size = 1989046 }, 159 | { url = "https://files.pythonhosted.org/packages/bc/49/c54baab2f4658c26ac633d798dab66b4c3a9bbf47cff5284e9c182f4137a/pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b", size = 1885092 }, 160 | { url = 
"https://files.pythonhosted.org/packages/41/b1/9bc383f48f8002f99104e3acff6cba1231b29ef76cfa45d1506a5cad1f84/pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b", size = 1892709 }, 161 | { url = "https://files.pythonhosted.org/packages/10/6c/e62b8657b834f3eb2961b49ec8e301eb99946245e70bf42c8817350cbefc/pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154", size = 1811273 }, 162 | { url = "https://files.pythonhosted.org/packages/ba/15/52cfe49c8c986e081b863b102d6b859d9defc63446b642ccbbb3742bf371/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9", size = 1823027 }, 163 | { url = "https://files.pythonhosted.org/packages/b1/1c/b6f402cfc18ec0024120602bdbcebc7bdd5b856528c013bd4d13865ca473/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9", size = 1868888 }, 164 | { url = "https://files.pythonhosted.org/packages/bd/7b/8cb75b66ac37bc2975a3b7de99f3c6f355fcc4d89820b61dffa8f1e81677/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1", size = 2037738 }, 165 | { url = "https://files.pythonhosted.org/packages/c8/f1/786d8fe78970a06f61df22cba58e365ce304bf9b9f46cc71c8c424e0c334/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a", size = 2685138 }, 166 | { url = "https://files.pythonhosted.org/packages/a6/74/d12b2cd841d8724dc8ffb13fc5cef86566a53ed358103150209ecd5d1999/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e", size = 1997025 }, 167 | { url = "https://files.pythonhosted.org/packages/a0/6e/940bcd631bc4d9a06c9539b51f070b66e8f370ed0933f392db6ff350d873/pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4", size = 2004633 }, 168 | { url = "https://files.pythonhosted.org/packages/50/cc/a46b34f1708d82498c227d5d80ce615b2dd502ddcfd8376fc14a36655af1/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27", size = 1999404 }, 169 | { url = "https://files.pythonhosted.org/packages/ca/2d/c365cfa930ed23bc58c41463bae347d1005537dc8db79e998af8ba28d35e/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee", size = 2130130 }, 170 | { url = "https://files.pythonhosted.org/packages/f4/d7/eb64d015c350b7cdb371145b54d96c919d4db516817f31cd1c650cae3b21/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1", size = 2157946 }, 171 | { url = "https://files.pythonhosted.org/packages/a4/99/bddde3ddde76c03b65dfd5a66ab436c4e58ffc42927d4ff1198ffbf96f5f/pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130", size = 1834387 }, 172 | { url = 
"https://files.pythonhosted.org/packages/71/47/82b5e846e01b26ac6f1893d3c5f9f3a2eb6ba79be26eef0b759b4fe72946/pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee", size = 1990453 }, 173 | { url = "https://files.pythonhosted.org/packages/51/b2/b2b50d5ecf21acf870190ae5d093602d95f66c9c31f9d5de6062eb329ad1/pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b", size = 1885186 }, 174 | ] 175 | 176 | [[package]] 177 | name = "sniffio" 178 | version = "1.3.1" 179 | source = { registry = "https://pypi.org/simple" } 180 | sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372 } 181 | wheels = [ 182 | { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, 183 | ] 184 | 185 | [[package]] 186 | name = "sse-starlette" 187 | version = "2.2.1" 188 | source = { registry = "https://pypi.org/simple" } 189 | dependencies = [ 190 | { name = "anyio" }, 191 | { name = "starlette" }, 192 | ] 193 | sdist = { url = "https://files.pythonhosted.org/packages/71/a4/80d2a11af59fe75b48230846989e93979c892d3a20016b42bb44edb9e398/sse_starlette-2.2.1.tar.gz", hash = "sha256:54470d5f19274aeed6b2d473430b08b4b379ea851d953b11d7f1c4a2c118b419", size = 17376 } 194 | wheels = [ 195 | { url = "https://files.pythonhosted.org/packages/d9/e0/5b8bd393f27f4a62461c5cf2479c75a2cc2ffa330976f9f00f5f6e4f50eb/sse_starlette-2.2.1-py3-none-any.whl", hash = "sha256:6410a3d3ba0c89e7675d4c273a301d64649c03a5ef1ca101f10b47f895fd0e99", size = 10120 }, 196 | ] 197 | 198 | [[package]] 199 | name = "starlette" 200 | version = "0.45.1" 201 | source = { registry = "https://pypi.org/simple" } 202 | dependencies = [ 203 | { name = "anyio" }, 204 | ] 205 | sdist = { url = "https://files.pythonhosted.org/packages/c1/be/b398217eb35b356d2d9bb84ec67071ea2842e02950fcf38b33df9d5b24ba/starlette-0.45.1.tar.gz", hash = "sha256:a8ae1fa3b1ab7ca83a4abd77871921a13fb5aeaf4874436fb96c29dfcd4ecfa3", size = 2573953 } 206 | wheels = [ 207 | { url = "https://files.pythonhosted.org/packages/6b/2c/a50484b035ee0e13ebb7a42391e391befbfc1b6a9ad5503e83badd182ada/starlette-0.45.1-py3-none-any.whl", hash = "sha256:5656c0524f586e9148d9a3c1dd5257fb42a99892fb0dc6877dd76ef4d184aac3", size = 71488 }, 208 | ] 209 | 210 | [[package]] 211 | name = "typing-extensions" 212 | version = "4.12.2" 213 | source = { registry = "https://pypi.org/simple" } 214 | sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } 215 | wheels = [ 216 | { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, 217 | ] 218 | -------------------------------------------------------------------------------- /src/dataset_viewer/server.py: 
-------------------------------------------------------------------------------- 1 | """MCP Server for interacting with Hugging Face dataset viewer API. 2 | 3 | This server provides tools for browsing, filtering and getting statistics about datasets hosted on the 4 | Hugging Face Hub. It uses the official dataset viewer API (https://huggingface.co/docs/dataset-viewer) 5 | to provide: 6 | 7 | - Dataset validation and basic info 8 | - Paginated content viewing 9 | - Dataset statistics 10 | - Support for dataset configurations and splits 11 | 12 | Note: This only works with datasets hosted on the Hugging Face Hub. For local datasets or datasets from 13 | other sources, you'll need to upload them to Hugging Face first. 14 | """ 15 | 16 | import asyncio 17 | from typing import Optional 18 | import httpx 19 | import os 20 | import re 21 | import json 22 | 23 | from mcp.server.models import InitializationOptions 24 | import mcp.types as types 25 | from mcp.server import NotificationOptions, Server 26 | from pydantic import AnyUrl, BaseModel 27 | import mcp.server.stdio 28 | 29 | 30 | class DatasetViewerAPI: 31 | """Internal API client for dataset viewer""" 32 | def __init__(self, base_url: str = "https://datasets-server.huggingface.co", auth_token: str | None = None): 33 | self.base_url = base_url.rstrip("/") 34 | headers = {"Authorization": f"Bearer {auth_token}"} if auth_token else {} 35 | self.client = httpx.AsyncClient(base_url=self.base_url, headers=headers) 36 | 37 | async def validate_dataset(self, dataset: str) -> None: 38 | """Validate dataset ID format and check if it exists""" 39 | # Validate format (username/dataset-name) 40 | if not re.match(r"^[^/]+/[^/]+$", dataset): 41 | raise ValueError("Dataset ID must be in the format 'owner/dataset'") 42 | 43 | # Check if dataset exists and is accessible 44 | try: 45 | response = await self.client.head(f"/is-valid?dataset={dataset}") 46 | response.raise_for_status() 47 | except httpx.NetworkError as e: 48 | raise ConnectionError(f"Network error while validating dataset: {e}") 49 | except httpx.HTTPStatusError as e: 50 | if e.response.status_code == 404: 51 | raise ValueError(f"Dataset '{dataset}' not found") 52 | elif e.response.status_code == 403: 53 | raise ValueError(f"Dataset '{dataset}' exists but requires authentication") 54 | else: 55 | raise RuntimeError(f"Error validating dataset: {e}") 56 | 57 | async def get_info(self, dataset: str) -> dict: 58 | """Get detailed information about a dataset""" 59 | try: 60 | # Get detailed dataset info 61 | response = await self.client.get("/info", params={"dataset": dataset}) 62 | response.raise_for_status() 63 | return response.json() 64 | except httpx.HTTPStatusError as e: 65 | if e.response.status_code == 404: 66 | raise ValueError(f"Dataset '{dataset}' not found") 67 | raise 68 | 69 | async def get_rows(self, dataset: str, config: str, split: str, page: int = 0) -> dict: 70 | """Get paginated rows of a dataset""" 71 | params = { 72 | "dataset": dataset, 73 | "config": config, 74 | "split": split, 75 | "offset": page * 100, # 100 rows per page 76 | "length": 100 77 | } 78 | response = await self.client.get("/rows", params=params) 79 | response.raise_for_status() 80 | return response.json() 81 | 82 | async def get_statistics(self, dataset: str, config: str, split: str) -> dict: 83 | """Get statistics about a dataset""" 84 | params = { 85 | "dataset": dataset, 86 | "config": config, 87 | "split": split 88 | } 89 | response = await self.client.get("/statistics", params=params) 90 | 
response.raise_for_status()
91 |         return response.json()
92 |
93 |     async def get_first_rows(self, dataset: str, config: str, split: str) -> dict:
94 |         """Get first few rows of a dataset split"""
95 |         params = {
96 |             "dataset": dataset,
97 |             "config": config,
98 |             "split": split
99 |         }
100 |         response = await self.client.get("/first-rows", params=params)
101 |         response.raise_for_status()
102 |         return response.json()
103 |
104 |     async def search(self, dataset: str, config: str, split: str, query: str) -> dict:
105 |         """Search for text within a dataset split"""
106 |         params = {
107 |             "dataset": dataset,
108 |             "config": config,
109 |             "split": split,
110 |             "query": query
111 |         }
112 |         response = await self.client.get("/search", params=params)
113 |         response.raise_for_status()
114 |         return response.json()
115 |
116 |     async def filter(self, dataset: str, config: str, split: str, where: str, orderby: str | None = None, page: int = 0) -> dict:
117 |         """Filter dataset rows based on conditions"""
118 |         # Validate page number
119 |         if page < 0:
120 |             raise ValueError("Page number must be non-negative")
121 |
122 |         # Basic SQL clause validation
123 |         if not where.strip():
124 |             raise ValueError("WHERE clause cannot be empty")
125 |         if orderby and not orderby.strip():
126 |             raise ValueError("ORDER BY clause cannot be empty")
127 |
128 |         params = {
129 |             "dataset": dataset,
130 |             "config": config,
131 |             "split": split,
132 |             "where": where,
133 |             "offset": page * 100,  # 100 rows per page
134 |             "length": 100
135 |         }
136 |         if orderby:
137 |             params["orderby"] = orderby
138 |
139 |         try:
140 |             response = await self.client.get("/filter", params=params)
141 |             response.raise_for_status()
142 |             return response.json()
143 |         except httpx.NetworkError as e:
144 |             raise ConnectionError(f"Network error while filtering dataset: {e}")
145 |         except httpx.HTTPStatusError as e:
146 |             if e.response.status_code == 400:
147 |                 raise ValueError(f"Invalid filter query: {e.response.text}")
148 |             elif e.response.status_code == 404:
149 |                 raise ValueError(f"Dataset, config or split not found: {dataset}/{config}/{split}")
150 |             else:
151 |                 raise RuntimeError(f"Error filtering dataset: {e}")
152 |
153 |     async def get_parquet(self, dataset: str) -> list[dict]:
154 |         """List a dataset's exported Parquet files. The /parquet endpoint returns JSON metadata with download URLs, not Parquet bytes"""
155 |         response = await self.client.get("/parquet", params={"dataset": dataset})
156 |         response.raise_for_status()
157 |         return response.json().get("parquet_files", [])
158 |
159 |     async def get_splits(self, dataset: str) -> dict:
160 |         """Get list of available splits for a dataset"""
161 |         response = await self.client.get("/splits", params={"dataset": dataset})
162 |         response.raise_for_status()
163 |         return response.json()
164 |
165 |
166 | class DatasetState:
167 |     """Manages dataset state and caching"""
168 |     def __init__(self):
169 |         self.datasets: dict[str, dict] = {}  # Cache dataset info
170 |         self.current_page: dict[str, int] = {}  # Track pagination
171 |         # Get auth token from environment if available
172 |         auth_token = os.environ.get("HUGGINGFACE_TOKEN")
173 |         self.api = DatasetViewerAPI(auth_token=auth_token)
174 |
175 |     async def get_dataset(self, dataset: str) -> dict:
176 |         """Get dataset info, using cache if available"""
177 |         if dataset not in self.datasets:
178 |             self.datasets[dataset] = await self.api.get_info(dataset)
179 |         return self.datasets[dataset]
180 |
181 |
182 | # Initialize server and state
183 | server = Server("dataset-viewer")
184 | state = DatasetState()
185 |
186 |
187 | @server.list_resources()
188 | async def handle_list_resources() -> list[types.Resource]:
189 |     """List available dataset resources"""
190 |     resources = []
191 |     for dataset, info in state.datasets.items():
192 |         resources.append(
193 |             types.Resource(
194 |                 uri=AnyUrl(f"dataset://{dataset}"),
195 |                 name=dataset,
196 |                 description=info.get("description", "No description available"),
197 |                 mimeType="application/json",
198 |             )
199 |         )
200 |     return resources
201 |
202 |
203 | @server.read_resource()
204 | async def handle_read_resource(uri: AnyUrl) -> str:
205 |     """Read a specific dataset's content"""
206 |     if uri.scheme != "dataset":
207 |         raise ValueError(f"Unsupported URI scheme: {uri.scheme}")
208 |
209 |     # For dataset://owner/name the owner parses as the URI host, so rebuild the full id
210 |     dataset = f"{uri.host or ''}{uri.path or ''}".lstrip("/")
211 |     if dataset:
212 |         info = await state.get_dataset(dataset)
213 |         return str(info)  # Convert to string for display
214 |     raise ValueError(f"Dataset not found: {uri}")
215 |
216 |
217 | @server.list_tools()
218 | async def handle_list_tools() -> list[types.Tool]:
219 |     """List available dataset tools for Hugging Face datasets"""
220 |     return [
221 |         types.Tool(
222 |             name="get_info",
223 |             description="Get detailed information about a Hugging Face dataset including description, features, splits, and statistics. Run validate first to check if the dataset exists and is accessible.",
224 |             inputSchema={
225 |                 "type": "object",
226 |                 "properties": {
227 |                     "dataset": {
228 |                         "type": "string",
229 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
230 |                         "pattern": "^[^/]+/[^/]+$",
231 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
232 |                     },
233 |                     "auth_token": {
234 |                         "type": "string",
235 |                         "description": "Hugging Face auth token for private/gated datasets",
236 |                         "optional": True
237 |                     }
238 |                 },
239 |                 "required": ["dataset"],
240 |             }
241 |         ),
242 |         types.Tool(
243 |             name="get_rows",
244 |             description="Get paginated rows from a Hugging Face dataset",
245 |             inputSchema={
246 |                 "type": "object",
247 |                 "properties": {
248 |                     "dataset": {
249 |                         "type": "string",
250 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
251 |                         "pattern": "^[^/]+/[^/]+$",
252 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
253 |                     },
254 |                     "config": {
255 |                         "type": "string",
256 |                         "description": "Dataset configuration/subset name. Use get_info to list available configs",
257 |                         "examples": ["default", "en", "es"]
258 |                     },
259 |                     "split": {
260 |                         "type": "string",
261 |                         "description": "Dataset split name. Splits partition the data for training/evaluation",
262 |                         "examples": ["train", "validation", "test"]
263 |                     },
264 |                     "page": {"type": "integer", "description": "Page number (0-based), returns 100 rows per page", "default": 0},
265 |                     "auth_token": {
266 |                         "type": "string",
267 |                         "description": "Hugging Face auth token for private/gated datasets",
268 |                         "optional": True
269 |                     }
270 |                 },
271 |                 "required": ["dataset", "config", "split"],
272 |             }
273 |         ),
274 |         types.Tool(
275 |             name="get_first_rows",
276 |             description="Get first rows from a Hugging Face dataset split",
277 |             inputSchema={
278 |                 "type": "object",
279 |                 "properties": {
280 |                     "dataset": {
281 |                         "type": "string",
282 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
283 |                         "pattern": "^[^/]+/[^/]+$",
284 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
285 |                     },
286 |                     "config": {
287 |                         "type": "string",
288 |                         "description": "Dataset configuration/subset name.
Use get_info to list available configs", 289 | "examples": ["default", "en", "es"] 290 | }, 291 | "split": { 292 | "type": "string", 293 | "description": "Dataset split name. Splits partition the data for training/evaluation", 294 | "examples": ["train", "validation", "test"] 295 | }, 296 | "auth_token": { 297 | "type": "string", 298 | "description": "Hugging Face auth token for private/gated datasets", 299 | "optional": True 300 | } 301 | }, 302 | "required": ["dataset", "config", "split"], 303 | } 304 | ), 305 | types.Tool( 306 | name="search_dataset", 307 | description="Search for text within a Hugging Face dataset", 308 | inputSchema={ 309 | "type": "object", 310 | "properties": { 311 | "dataset": { 312 | "type": "string", 313 | "description": "Hugging Face dataset identifier in the format owner/dataset", 314 | "pattern": "^[^/]+/[^/]+$", 315 | "examples": ["ylecun/mnist", "stanfordnlp/imdb"] 316 | }, 317 | "config": { 318 | "type": "string", 319 | "description": "Dataset configuration/subset name. Use get_info to list available configs", 320 | "examples": ["default", "en", "es"] 321 | }, 322 | "split": { 323 | "type": "string", 324 | "description": "Dataset split name. Splits partition the data for training/evaluation", 325 | "examples": ["train", "validation", "test"] 326 | }, 327 | "query": {"type": "string", "description": "Text to search for in the dataset"}, 328 | "auth_token": { 329 | "type": "string", 330 | "description": "Hugging Face auth token for private/gated datasets", 331 | "optional": True 332 | } 333 | }, 334 | "required": ["dataset", "config", "split", "query"], 335 | } 336 | ), 337 | types.Tool( 338 | name="filter", 339 | description="Filter rows in a Hugging Face dataset using SQL-like conditions", 340 | inputSchema={ 341 | "type": "object", 342 | "properties": { 343 | "dataset": { 344 | "type": "string", 345 | "description": "Hugging Face dataset identifier in the format owner/dataset", 346 | "pattern": "^[^/]+/[^/]+$", 347 | "examples": ["ylecun/mnist", "stanfordnlp/imdb"] 348 | }, 349 | "config": { 350 | "type": "string", 351 | "description": "Dataset configuration/subset name. Use get_info to list available configs", 352 | "examples": ["default", "en", "es"] 353 | }, 354 | "split": { 355 | "type": "string", 356 | "description": "Dataset split name. 
Splits partition the data for training/evaluation",
357 |                         "examples": ["train", "validation", "test"]
358 |                     },
359 |                     "where": {
360 |                         "type": "string",
361 |                         "description": "SQL-like WHERE clause to filter rows (use single quotes for string literals)",
362 |                         "examples": ["column = 'value'", "score > 0.5", "text LIKE '%query%'"]
363 |                     },
364 |                     "orderby": {
365 |                         "type": "string",
366 |                         "description": "SQL-like ORDER BY clause to sort results",
367 |                         "optional": True,
368 |                         "examples": ["column ASC", "score DESC", "name ASC, id DESC"]
369 |                     },
370 |                     "page": {
371 |                         "type": "integer",
372 |                         "description": "Page number for paginated results (100 rows per page)",
373 |                         "default": 0,
374 |                         "minimum": 0
375 |                     },
376 |                     "auth_token": {
377 |                         "type": "string",
378 |                         "description": "Hugging Face auth token for private/gated datasets",
379 |                         "optional": True
380 |                     }
381 |                 },
382 |                 "required": ["dataset", "config", "split", "where"],
383 |             }
384 |         ),
385 |         types.Tool(
386 |             name="get_statistics",
387 |             description="Get statistics about a Hugging Face dataset",
388 |             inputSchema={
389 |                 "type": "object",
390 |                 "properties": {
391 |                     "dataset": {
392 |                         "type": "string",
393 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
394 |                         "pattern": "^[^/]+/[^/]+$",
395 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
396 |                     },
397 |                     "config": {
398 |                         "type": "string",
399 |                         "description": "Dataset configuration/subset name. Use get_info to list available configs",
400 |                         "examples": ["default", "en", "es"]
401 |                     },
402 |                     "split": {
403 |                         "type": "string",
404 |                         "description": "Dataset split name. Splits partition the data for training/evaluation",
405 |                         "examples": ["train", "validation", "test"]
406 |                     },
407 |                     "auth_token": {
408 |                         "type": "string",
409 |                         "description": "Hugging Face auth token for private/gated datasets",
410 |                         "optional": True
411 |                     }
412 |                 },
413 |                 "required": ["dataset", "config", "split"],
414 |             }
415 |         ),
416 |         types.Tool(
417 |             name="get_parquet",
418 |             description="Download a Hugging Face dataset's exported Parquet files",
419 |             inputSchema={
420 |                 "type": "object",
421 |                 "properties": {
422 |                     "dataset": {
423 |                         "type": "string",
424 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
425 |                         "pattern": "^[^/]+/[^/]+$",
426 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
427 |                     },
428 |                     "auth_token": {
429 |                         "type": "string",
430 |                         "description": "Hugging Face auth token for private/gated datasets",
431 |                         "optional": True
432 |                     }
433 |                 },
434 |                 "required": ["dataset"],
435 |             }
436 |         ),
437 |         types.Tool(
438 |             name="validate",
439 |             description="Check if a Hugging Face dataset exists and is accessible",
440 |             inputSchema={
441 |                 "type": "object",
442 |                 "properties": {
443 |                     "dataset": {
444 |                         "type": "string",
445 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
446 |                         "pattern": "^[^/]+/[^/]+$",
447 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
448 |                     },
449 |                     "auth_token": {
450 |                         "type": "string",
451 |                         "description": "Hugging Face auth token for private/gated datasets",
452 |                         "optional": True
453 |                     }
454 |                 },
455 |                 "required": ["dataset"],
456 |             }
457 |         ),
458 |     ]
459 |
460 |
461 | @server.call_tool()
462 | async def handle_call_tool(
463 |     name: str, arguments: dict | None
464 | ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
465 |     """Handle tool execution requests"""
466 |     if arguments is None:
467 |         arguments = {}
468 |
469 |     # Allow overriding env token with explicit token
470 |     auth_token = arguments.pop("auth_token", None) or os.environ.get("HUGGINGFACE_TOKEN")
471 |
472 |     if name == "get_info":
473 |         dataset = arguments["dataset"]
474 |         try:
475 |             response = await DatasetViewerAPI(auth_token=auth_token).client.get("/info", params={"dataset": dataset})
476 |             response.raise_for_status()
477 |             result = response.json()
478 |             return [
479 |                 types.TextContent(
480 |                     type="text",
481 |                     text=json.dumps(result, indent=2)
482 |                 )
483 |             ]
484 |         except httpx.HTTPStatusError as e:
485 |             if e.response.status_code == 404:
486 |                 return [
487 |                     types.TextContent(
488 |                         type="text",
489 |                         text=f"Dataset '{dataset}' not found"
490 |                     )
491 |                 ]
492 |             raise
493 |
494 |     elif name == "get_rows":
495 |         dataset = arguments["dataset"]
496 |         config = arguments["config"]
497 |         split = arguments["split"]
498 |         page = arguments.get("page", 0)
499 |         rows = await DatasetViewerAPI(auth_token=auth_token).get_rows(dataset, config=config, split=split, page=page)
500 |         return [
501 |             types.TextContent(
502 |                 type="text",
503 |                 text=json.dumps(rows, indent=2)
504 |             )
505 |         ]
506 |
507 |     elif name == "get_first_rows":
508 |         dataset = arguments["dataset"]
509 |         config = arguments["config"]
510 |         split = arguments["split"]
511 |         first_rows = await DatasetViewerAPI(auth_token=auth_token).get_first_rows(dataset, config=config, split=split)
512 |         return [
513 |             types.TextContent(
514 |                 type="text",
515 |                 text=json.dumps(first_rows, indent=2)
516 |             )
517 |         ]
518 |
519 |     elif name == "search_dataset":
520 |         dataset = arguments["dataset"]
521 |         config = arguments["config"]
522 |         split = arguments["split"]
523 |         query = arguments["query"]
524 |         search_result = await DatasetViewerAPI(auth_token=auth_token).search(dataset, config=config, split=split, query=query)
525 |         return [
526 |             types.TextContent(
527 |                 type="text",
528 |                 text=json.dumps(search_result, indent=2)
529 |             )
530 |         ]
531 |
532 |     elif name == "filter":
533 |         dataset = arguments["dataset"]
534 |         config = arguments["config"]
535 |         split = arguments["split"]
536 |         where = arguments["where"]
537 |         orderby = arguments.get("orderby")
538 |         page = arguments.get("page", 0)
539 |         filtered = await DatasetViewerAPI(auth_token=auth_token).filter(dataset, config=config, split=split, where=where, orderby=orderby, page=page)
540 |         return [
541 |             types.TextContent(
542 |                 type="text",
543 |                 text=json.dumps(filtered, indent=2)
544 |             )
545 |         ]
546 |
547 |     elif name == "get_statistics":
548 |         dataset = arguments["dataset"]
549 |         config = arguments["config"]
550 |         split = arguments["split"]
551 |         stats = await DatasetViewerAPI(auth_token=auth_token).get_statistics(dataset, config=config, split=split)
552 |         return [
553 |             types.TextContent(
554 |                 type="text",
555 |                 text=json.dumps(stats, indent=2)
556 |             )
557 |         ]
558 |
559 |     elif name == "get_parquet":
560 |         dataset = arguments["dataset"]
561 |         api = DatasetViewerAPI(auth_token=auth_token)
562 |         parquet_files = await api.get_parquet(dataset)
563 |
564 |         # Download each exported Parquet file; the listed URLs may redirect to a CDN
565 |         saved = []
566 |         for i, meta in enumerate(parquet_files):
567 |             file_response = await api.client.get(meta["url"], follow_redirects=True)
568 |             file_response.raise_for_status()
569 |             filepath = os.path.join(os.getcwd(), f"{dataset.replace('/', '_')}_{i}.parquet")
570 |             with open(filepath, "wb") as f:
571 |                 f.write(file_response.content)
572 |             saved.append(filepath)
573 |
574 |         return [types.TextContent(type="text", text="Dataset exported to:\n" + "\n".join(saved) if saved else f"No Parquet files found for '{dataset}'")]
575 |
576 |     elif name == "validate":
577 |         dataset = arguments["dataset"]
578 |         try:
579 |             # First check format
580 |             if not re.match(r"^[^/]+/[^/]+$", dataset):
581 |                 return [
582 |                     types.TextContent(
583 |                         type="text",
584 |                         text="Dataset must be in
the format 'owner/dataset'" 585 | ) 586 | ] 587 | 588 | # Then check if dataset exists and is accessible 589 | response = await DatasetViewerAPI(auth_token=auth_token).client.get("/is-valid", params={"dataset": dataset}) 590 | response.raise_for_status() 591 | result = response.json() 592 | 593 | return [ 594 | types.TextContent( 595 | type="text", 596 | text=json.dumps(result, indent=2) 597 | ) 598 | ] 599 | except httpx.NetworkError as e: 600 | return [ 601 | types.TextContent( 602 | type="text", 603 | text=str(e) 604 | ) 605 | ] 606 | except httpx.HTTPStatusError as e: 607 | if e.response.status_code == 404: 608 | return [ 609 | types.TextContent( 610 | type="text", 611 | text=f"Dataset '{dataset}' not found" 612 | ) 613 | ] 614 | elif e.response.status_code == 403: 615 | return [ 616 | types.TextContent( 617 | type="text", 618 | text=f"Dataset '{dataset}' requires authentication" 619 | ) 620 | ] 621 | else: 622 | return [ 623 | types.TextContent( 624 | type="text", 625 | text=str(e) 626 | ) 627 | ] 628 | raise ValueError(f"Unknown tool: {name}") 629 | 630 | 631 | @server.list_prompts() 632 | async def handle_list_prompts() -> list[types.Prompt]: 633 | """List available prompts for dataset analysis""" 634 | return [ 635 | types.Prompt( 636 | name="analyze-dataset", 637 | description="Analyze a dataset's content and structure", 638 | arguments=[ 639 | types.PromptArgument( 640 | name="dataset", 641 | description="Dataset identifier", 642 | required=True, 643 | ) 644 | ], 645 | ) 646 | ] 647 | 648 | @server.get_prompt() 649 | async def handle_get_prompt( 650 | name: str, arguments: dict[str, str] | None 651 | ) -> types.GetPromptResult: 652 | """Generate dataset analysis prompts""" 653 | if name != "analyze-dataset": 654 | raise ValueError(f"Unknown prompt: {name}") 655 | 656 | if not arguments or "dataset" not in arguments: 657 | raise ValueError("Missing dataset argument") 658 | 659 | dataset = arguments["dataset"] 660 | info = await state.get_dataset(dataset) 661 | 662 | return types.GetPromptResult( 663 | description=f"Analyze dataset: {dataset}", 664 | messages=[ 665 | types.PromptMessage( 666 | role="user", 667 | content=types.TextContent( 668 | type="text", 669 | text=f"Please analyze this dataset:\n\n{str(info)}", 670 | ), 671 | ) 672 | ], 673 | ) 674 | 675 | 676 | async def main(): 677 | """Run the server using stdin/stdout streams""" 678 | async with mcp.server.stdio.stdio_server() as (read_stream, write_stream): 679 | await server.run( 680 | read_stream, 681 | write_stream, 682 | InitializationOptions( 683 | server_name="dataset-viewer", 684 | server_version="0.1.0", 685 | capabilities=server.get_capabilities( 686 | notification_options=NotificationOptions(), 687 | experimental_capabilities={}, 688 | ), 689 | ), 690 | ) 691 | 692 | if __name__ == "__main__": 693 | asyncio.run(main()) --------------------------------------------------------------------------------
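
A quick way to exercise the server end-to-end is with the MCP Python client over stdio. The sketch below is not part of the repository; it assumes the client half of the same `mcp` package pinned in uv.lock (`ClientSession`, `StdioServerParameters`, `stdio_client`) and launches the server the same way the Claude Desktop config does:

```python
# try_server.py - hypothetical smoke test, not shipped with the repository
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main():
    # Spawn the dataset-viewer server as a subprocess over stdio
    params = StdioServerParameters(command="uv", args=["run", "dataset-viewer"])
    async with stdio_client(params) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()

            # List the eight tools registered by handle_list_tools()
            tools = await session.list_tools()
            print([tool.name for tool in tools.tools])

            # Validate a public dataset; the tool returns the /is-valid JSON body
            result = await session.call_tool("validate", {"dataset": "stanfordnlp/imdb"})
            print(result.content[0].text)


asyncio.run(main())
```

Running it should print the tool names followed by the validation response; swapping in a `filter` call with a single-quoted SQL literal (e.g. `where="label = 'pos'"`) exercises the query path the same way.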