├── .python-version
├── src
│   └── dataset_viewer
│       ├── __init__.py
│       └── server.py
├── pyproject.toml
├── .gitignore
├── LICENSE
├── README.md
└── uv.lock

/.python-version:
--------------------------------------------------------------------------------
1 | 3.12
2 |
--------------------------------------------------------------------------------
/src/dataset_viewer/__init__.py:
--------------------------------------------------------------------------------
1 | from . import server
2 | import asyncio
3 |
4 | def main():
5 |     """Main entry point for the package."""
6 |     asyncio.run(server.main())
7 |
8 | # Expose important items at package level
9 | __all__ = ['main', 'server']
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "dataset-viewer"
3 | version = "0.1.0"
4 | description = "MCP server for interacting with Hugging Face dataset viewer API, providing dataset browsing, filtering, and statistics capabilities"
5 | readme = "README.md"
6 | requires-python = ">=3.12"
7 | dependencies = [
8 |     "mcp>=1.1.2",
9 |     "httpx>=0.28.1",
10 | ]
11 |
12 | [[project.authors]]
13 | name = "privetin"
14 | email = "81558906+privetin@users.noreply.github.com"
15 |
16 | [build-system]
17 | requires = ["hatchling"]
18 | build-backend = "hatchling.build"
19 |
20 | [project.scripts]
21 | dataset-viewer = "dataset_viewer:main"
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python-generated files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | build/
6 | dist/
7 | wheels/
8 | *.egg-info/
9 | .eggs/
10 | *.so
11 | MANIFEST
12 |
13 | # Virtual environments
14 | .venv
15 | venv/
16 | ENV/
17 | env/
18 | .env/
19 | .python-version
20 |
21 | # IDE settings
22 | .idea/
23 | .vscode/
24 | *.swp
25 | *.swo
26 | .project
27 | .pydevproject
28 | .settings/
29 | *.sublime-project
30 | *.sublime-workspace
31 |
32 | # Testing
33 | .coverage
34 | coverage.xml
35 | *.cover
36 | .pytest_cache/
37 | .tox/
38 | nosetests.xml
39 | htmlcov/
40 | .hypothesis/
41 | .coverage.*
42 |
43 | # Documentation
44 | docs/_build/
45 | site/
46 | docs/generated/
47 |
48 | # Jupyter Notebook
49 | .ipynb_checkpoints
50 | *.ipynb
51 |
52 | # Environment variables
53 | .env
54 | .env.local
55 | .env*.local
56 | *.env
57 |
58 | # OS-specific files
59 | .DS_Store
60 | Thumbs.db
61 | Desktop.ini
62 | $RECYCLE.BIN/
63 |
64 | # Logs and databases
65 | *.log
66 | *.sqlite
67 | *.db
68 |
69 | # Local development
70 | local_settings.py
71 | db.sqlite3
72 | media/
73 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Dataset Viewer Contributors
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or
substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dataset Viewer MCP Server 2 | 3 | An MCP server for interacting with the [Hugging Face Dataset Viewer API](https://huggingface.co/docs/dataset-viewer), providing capabilities to browse and analyze datasets hosted on the Hugging Face Hub. 4 | 5 | ## Features 6 | 7 | ### Resources 8 | 9 | - Uses `dataset://` URI scheme for accessing Hugging Face datasets 10 | - Supports dataset configurations and splits 11 | - Provides paginated access to dataset contents 12 | - Handles authentication for private datasets 13 | - Supports searching and filtering dataset contents 14 | - Provides dataset statistics and analysis 15 | 16 | ### Tools 17 | 18 | The server provides the following tools: 19 | 20 | 1. **validate** 21 | - Check if a dataset exists and is accessible 22 | - Parameters: 23 | - `dataset`: Dataset identifier (e.g. 'stanfordnlp/imdb') 24 | - `auth_token` (optional): For private datasets 25 | 26 | 2. **get_info** 27 | - Get detailed information about a dataset 28 | - Parameters: 29 | - `dataset`: Dataset identifier 30 | - `auth_token` (optional): For private datasets 31 | 32 | 3. **get_rows** 33 | - Get paginated contents of a dataset 34 | - Parameters: 35 | - `dataset`: Dataset identifier 36 | - `config`: Configuration name 37 | - `split`: Split name 38 | - `page` (optional): Page number (0-based) 39 | - `auth_token` (optional): For private datasets 40 | 41 | 4. **get_first_rows** 42 | - Get first rows from a dataset split 43 | - Parameters: 44 | - `dataset`: Dataset identifier 45 | - `config`: Configuration name 46 | - `split`: Split name 47 | - `auth_token` (optional): For private datasets 48 | 49 | 5. **get_statistics** 50 | - Get statistics about a dataset split 51 | - Parameters: 52 | - `dataset`: Dataset identifier 53 | - `config`: Configuration name 54 | - `split`: Split name 55 | - `auth_token` (optional): For private datasets 56 | 57 | 6. **search_dataset** 58 | - Search for text within a dataset 59 | - Parameters: 60 | - `dataset`: Dataset identifier 61 | - `config`: Configuration name 62 | - `split`: Split name 63 | - `query`: Text to search for 64 | - `auth_token` (optional): For private datasets 65 | 66 | 7. **filter** 67 | - Filter rows using SQL-like conditions 68 | - Parameters: 69 | - `dataset`: Dataset identifier 70 | - `config`: Configuration name 71 | - `split`: Split name 72 | - `where`: SQL WHERE clause (e.g. "score > 0.5") 73 | - `orderby` (optional): SQL ORDER BY clause 74 | - `page` (optional): Page number (0-based) 75 | - `auth_token` (optional): For private datasets 76 | 77 | 8. 
**get_parquet**
78 |    - Download a dataset's Parquet export files
79 |    - Parameters:
80 |      - `dataset`: Dataset identifier
81 |      - `auth_token` (optional): For private datasets
82 |
83 | ## Installation
84 |
85 | ### Prerequisites
86 |
87 | - Python 3.12 or higher
88 | - [uv](https://github.com/astral-sh/uv) - Fast Python package installer and resolver
89 |
90 | ### Setup
91 |
92 | 1. Clone the repository:
93 | ```bash
94 | git clone https://github.com/privetin/dataset-viewer.git
95 | cd dataset-viewer
96 | ```
97 |
98 | 2. Create a virtual environment and install:
99 | ```bash
100 | # Create virtual environment
101 | uv venv
102 |
103 | # Activate virtual environment
104 | # On Unix:
105 | source .venv/bin/activate
106 | # On Windows:
107 | .venv\Scripts\activate
108 |
109 | # Install in development mode
110 | uv pip install -e .
111 | ```
112 |
113 | ## Configuration
114 |
115 | ### Environment Variables
116 |
117 | - `HUGGINGFACE_TOKEN`: Your Hugging Face API token for accessing private datasets
118 |
119 | ### Claude Desktop Integration
120 |
121 | Add the following to your Claude Desktop config file:
122 |
123 | On Windows: `%APPDATA%\Claude\claude_desktop_config.json`
124 |
125 | On macOS: `~/Library/Application Support/Claude/claude_desktop_config.json`
126 |
127 | ```json
128 | {
129 |   "mcpServers": {
130 |     "dataset-viewer": {
131 |       "command": "uv",
132 |       "args": [
133 |         "--directory",
134 |         "path/to/dataset-viewer",
135 |         "run",
136 |         "dataset-viewer"
137 |       ]
138 |     }
139 |   }
140 | }
141 | ```
142 |
143 | ## License
144 |
145 | MIT License - see [LICENSE](LICENSE) for details
--------------------------------------------------------------------------------
/uv.lock:
--------------------------------------------------------------------------------
1 | version = 1
2 | requires-python = ">=3.12"
3 |
4 | [[package]]
5 | name = "annotated-types"
6 | version = "0.7.0"
7 | source = { registry = "https://pypi.org/simple" }
8 | sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081 }
9 | wheels = [
10 |     { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643 },
11 | ]
12 |
13 | [[package]]
14 | name = "anyio"
15 | version = "4.7.0"
16 | source = { registry = "https://pypi.org/simple" }
17 | dependencies = [
18 |     { name = "idna" },
19 |     { name = "sniffio" },
20 |     { name = "typing-extensions", marker = "python_full_version < '3.13'" },
21 | ]
22 | sdist = { url = "https://files.pythonhosted.org/packages/f6/40/318e58f669b1a9e00f5c4453910682e2d9dd594334539c7b7817dabb765f/anyio-4.7.0.tar.gz", hash = "sha256:2f834749c602966b7d456a7567cafcb309f96482b5081d14ac93ccd457f9dd48", size = 177076 }
23 | wheels = [
24 |     { url = "https://files.pythonhosted.org/packages/a0/7a/4daaf3b6c08ad7ceffea4634ec206faeff697526421c20f07628c7372156/anyio-4.7.0-py3-none-any.whl", hash = "sha256:ea60c3723ab42ba6fff7e8ccb0488c898ec538ff4df1f1d5e642c3601d07e352", size = 93052 },
25 | ]
26 |
27 | [[package]]
28 | name = "certifi"
29 | version = "2024.12.14"
30 | source = { registry = "https://pypi.org/simple" }
31 | sdist = { url =
"https://files.pythonhosted.org/packages/0f/bd/1d41ee578ce09523c81a15426705dd20969f5abf006d1afe8aeff0dd776a/certifi-2024.12.14.tar.gz", hash = "sha256:b650d30f370c2b724812bee08008be0c4163b163ddaec3f2546c1caf65f191db", size = 166010 } 32 | wheels = [ 33 | { url = "https://files.pythonhosted.org/packages/a5/32/8f6669fc4798494966bf446c8c4a162e0b5d893dff088afddf76414f70e1/certifi-2024.12.14-py3-none-any.whl", hash = "sha256:1275f7a45be9464efc1173084eaa30f866fe2e47d389406136d332ed4967ec56", size = 164927 }, 34 | ] 35 | 36 | [[package]] 37 | name = "dataset-viewer" 38 | version = "0.1.0" 39 | source = { editable = "." } 40 | dependencies = [ 41 | { name = "httpx" }, 42 | { name = "mcp" }, 43 | ] 44 | 45 | [package.metadata] 46 | requires-dist = [ 47 | { name = "httpx", specifier = ">=0.28.1" }, 48 | { name = "mcp", specifier = ">=1.1.2" }, 49 | ] 50 | 51 | [[package]] 52 | name = "h11" 53 | version = "0.14.0" 54 | source = { registry = "https://pypi.org/simple" } 55 | sdist = { url = "https://files.pythonhosted.org/packages/f5/38/3af3d3633a34a3316095b39c8e8fb4853a28a536e55d347bd8d8e9a14b03/h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d", size = 100418 } 56 | wheels = [ 57 | { url = "https://files.pythonhosted.org/packages/95/04/ff642e65ad6b90db43e668d70ffb6736436c7ce41fcc549f4e9472234127/h11-0.14.0-py3-none-any.whl", hash = "sha256:e3fe4ac4b851c468cc8363d500db52c2ead036020723024a109d37346efaa761", size = 58259 }, 58 | ] 59 | 60 | [[package]] 61 | name = "httpcore" 62 | version = "1.0.7" 63 | source = { registry = "https://pypi.org/simple" } 64 | dependencies = [ 65 | { name = "certifi" }, 66 | { name = "h11" }, 67 | ] 68 | sdist = { url = "https://files.pythonhosted.org/packages/6a/41/d7d0a89eb493922c37d343b607bc1b5da7f5be7e383740b4753ad8943e90/httpcore-1.0.7.tar.gz", hash = "sha256:8551cb62a169ec7162ac7be8d4817d561f60e08eaa485234898414bb5a8a0b4c", size = 85196 } 69 | wheels = [ 70 | { url = "https://files.pythonhosted.org/packages/87/f5/72347bc88306acb359581ac4d52f23c0ef445b57157adedb9aee0cd689d2/httpcore-1.0.7-py3-none-any.whl", hash = "sha256:a3fff8f43dc260d5bd363d9f9cf1830fa3a458b332856f34282de498ed420edd", size = 78551 }, 71 | ] 72 | 73 | [[package]] 74 | name = "httpx" 75 | version = "0.28.1" 76 | source = { registry = "https://pypi.org/simple" } 77 | dependencies = [ 78 | { name = "anyio" }, 79 | { name = "certifi" }, 80 | { name = "httpcore" }, 81 | { name = "idna" }, 82 | ] 83 | sdist = { url = "https://files.pythonhosted.org/packages/b1/df/48c586a5fe32a0f01324ee087459e112ebb7224f646c0b5023f5e79e9956/httpx-0.28.1.tar.gz", hash = "sha256:75e98c5f16b0f35b567856f597f06ff2270a374470a5c2392242528e3e3e42fc", size = 141406 } 84 | wheels = [ 85 | { url = "https://files.pythonhosted.org/packages/2a/39/e50c7c3a983047577ee07d2a9e53faf5a69493943ec3f6a384bdc792deb2/httpx-0.28.1-py3-none-any.whl", hash = "sha256:d909fcccc110f8c7faf814ca82a9a4d816bc5a6dbfea25d6591d6985b8ba59ad", size = 73517 }, 86 | ] 87 | 88 | [[package]] 89 | name = "httpx-sse" 90 | version = "0.4.0" 91 | source = { registry = "https://pypi.org/simple" } 92 | sdist = { url = "https://files.pythonhosted.org/packages/4c/60/8f4281fa9bbf3c8034fd54c0e7412e66edbab6bc74c4996bd616f8d0406e/httpx-sse-0.4.0.tar.gz", hash = "sha256:1e81a3a3070ce322add1d3529ed42eb5f70817f45ed6ec915ab753f961139721", size = 12624 } 93 | wheels = [ 94 | { url = "https://files.pythonhosted.org/packages/e1/9b/a181f281f65d776426002f330c31849b86b31fc9d848db62e16f03ff739f/httpx_sse-0.4.0-py3-none-any.whl", hash = 
"sha256:f329af6eae57eaa2bdfd962b42524764af68075ea87370a2de920af5341e318f", size = 7819 }, 95 | ] 96 | 97 | [[package]] 98 | name = "idna" 99 | version = "3.10" 100 | source = { registry = "https://pypi.org/simple" } 101 | sdist = { url = "https://files.pythonhosted.org/packages/f1/70/7703c29685631f5a7590aa73f1f1d3fa9a380e654b86af429e0934a32f7d/idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9", size = 190490 } 102 | wheels = [ 103 | { url = "https://files.pythonhosted.org/packages/76/c6/c88e154df9c4e1a2a66ccf0005a88dfb2650c1dffb6f5ce603dfbd452ce3/idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3", size = 70442 }, 104 | ] 105 | 106 | [[package]] 107 | name = "mcp" 108 | version = "1.1.2" 109 | source = { registry = "https://pypi.org/simple" } 110 | dependencies = [ 111 | { name = "anyio" }, 112 | { name = "httpx" }, 113 | { name = "httpx-sse" }, 114 | { name = "pydantic" }, 115 | { name = "sse-starlette" }, 116 | { name = "starlette" }, 117 | ] 118 | sdist = { url = "https://files.pythonhosted.org/packages/9b/f3/5cf212e60681ea6da0dbb6e0d1bc0ab2dbf5eebc749b69663d46f114fea1/mcp-1.1.2.tar.gz", hash = "sha256:694aa9df7a8641b24953c935eb72c63136dc948981021525a0add199bdfee402", size = 57628 } 119 | wheels = [ 120 | { url = "https://files.pythonhosted.org/packages/df/40/9883eac3718b860d4006eba1920bfcb628f0a1fe37fac46a4f4e391edca6/mcp-1.1.2-py3-none-any.whl", hash = "sha256:a4d32d60fd80a1702440ba4751b847a8a88957a1f7b059880953143e9759965a", size = 36652 }, 121 | ] 122 | 123 | [[package]] 124 | name = "pydantic" 125 | version = "2.10.4" 126 | source = { registry = "https://pypi.org/simple" } 127 | dependencies = [ 128 | { name = "annotated-types" }, 129 | { name = "pydantic-core" }, 130 | { name = "typing-extensions" }, 131 | ] 132 | sdist = { url = "https://files.pythonhosted.org/packages/70/7e/fb60e6fee04d0ef8f15e4e01ff187a196fa976eb0f0ab524af4599e5754c/pydantic-2.10.4.tar.gz", hash = "sha256:82f12e9723da6de4fe2ba888b5971157b3be7ad914267dea8f05f82b28254f06", size = 762094 } 133 | wheels = [ 134 | { url = "https://files.pythonhosted.org/packages/f3/26/3e1bbe954fde7ee22a6e7d31582c642aad9e84ffe4b5fb61e63b87cd326f/pydantic-2.10.4-py3-none-any.whl", hash = "sha256:597e135ea68be3a37552fb524bc7d0d66dcf93d395acd93a00682f1efcb8ee3d", size = 431765 }, 135 | ] 136 | 137 | [[package]] 138 | name = "pydantic-core" 139 | version = "2.27.2" 140 | source = { registry = "https://pypi.org/simple" } 141 | dependencies = [ 142 | { name = "typing-extensions" }, 143 | ] 144 | sdist = { url = "https://files.pythonhosted.org/packages/fc/01/f3e5ac5e7c25833db5eb555f7b7ab24cd6f8c322d3a3ad2d67a952dc0abc/pydantic_core-2.27.2.tar.gz", hash = "sha256:eb026e5a4c1fee05726072337ff51d1efb6f59090b7da90d30ea58625b1ffb39", size = 413443 } 145 | wheels = [ 146 | { url = "https://files.pythonhosted.org/packages/d6/74/51c8a5482ca447871c93e142d9d4a92ead74de6c8dc5e66733e22c9bba89/pydantic_core-2.27.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9e0c8cfefa0ef83b4da9588448b6d8d2a2bf1a53c3f1ae5fca39eb3061e2f0b0", size = 1893127 }, 147 | { url = "https://files.pythonhosted.org/packages/d3/f3/c97e80721735868313c58b89d2de85fa80fe8dfeeed84dc51598b92a135e/pydantic_core-2.27.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:83097677b8e3bd7eaa6775720ec8e0405f1575015a463285a92bfdfe254529ef", size = 1811340 }, 148 | { url = 
"https://files.pythonhosted.org/packages/9e/91/840ec1375e686dbae1bd80a9e46c26a1e0083e1186abc610efa3d9a36180/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:172fce187655fece0c90d90a678424b013f8fbb0ca8b036ac266749c09438cb7", size = 1822900 }, 149 | { url = "https://files.pythonhosted.org/packages/f6/31/4240bc96025035500c18adc149aa6ffdf1a0062a4b525c932065ceb4d868/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:519f29f5213271eeeeb3093f662ba2fd512b91c5f188f3bb7b27bc5973816934", size = 1869177 }, 150 | { url = "https://files.pythonhosted.org/packages/fa/20/02fbaadb7808be578317015c462655c317a77a7c8f0ef274bc016a784c54/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:05e3a55d124407fffba0dd6b0c0cd056d10e983ceb4e5dbd10dda135c31071d6", size = 2038046 }, 151 | { url = "https://files.pythonhosted.org/packages/06/86/7f306b904e6c9eccf0668248b3f272090e49c275bc488a7b88b0823444a4/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c3ed807c7b91de05e63930188f19e921d1fe90de6b4f5cd43ee7fcc3525cb8c", size = 2685386 }, 152 | { url = "https://files.pythonhosted.org/packages/8d/f0/49129b27c43396581a635d8710dae54a791b17dfc50c70164866bbf865e3/pydantic_core-2.27.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fb4aadc0b9a0c063206846d603b92030eb6f03069151a625667f982887153e2", size = 1997060 }, 153 | { url = "https://files.pythonhosted.org/packages/0d/0f/943b4af7cd416c477fd40b187036c4f89b416a33d3cc0ab7b82708a667aa/pydantic_core-2.27.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:28ccb213807e037460326424ceb8b5245acb88f32f3d2777427476e1b32c48c4", size = 2004870 }, 154 | { url = "https://files.pythonhosted.org/packages/35/40/aea70b5b1a63911c53a4c8117c0a828d6790483f858041f47bab0b779f44/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:de3cd1899e2c279b140adde9357c4495ed9d47131b4a4eaff9052f23398076b3", size = 1999822 }, 155 | { url = "https://files.pythonhosted.org/packages/f2/b3/807b94fd337d58effc5498fd1a7a4d9d59af4133e83e32ae39a96fddec9d/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:220f892729375e2d736b97d0e51466252ad84c51857d4d15f5e9692f9ef12be4", size = 2130364 }, 156 | { url = "https://files.pythonhosted.org/packages/fc/df/791c827cd4ee6efd59248dca9369fb35e80a9484462c33c6649a8d02b565/pydantic_core-2.27.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a0fcd29cd6b4e74fe8ddd2c90330fd8edf2e30cb52acda47f06dd615ae72da57", size = 2158303 }, 157 | { url = "https://files.pythonhosted.org/packages/9b/67/4e197c300976af185b7cef4c02203e175fb127e414125916bf1128b639a9/pydantic_core-2.27.2-cp312-cp312-win32.whl", hash = "sha256:1e2cb691ed9834cd6a8be61228471d0a503731abfb42f82458ff27be7b2186fc", size = 1834064 }, 158 | { url = "https://files.pythonhosted.org/packages/1f/ea/cd7209a889163b8dcca139fe32b9687dd05249161a3edda62860430457a5/pydantic_core-2.27.2-cp312-cp312-win_amd64.whl", hash = "sha256:cc3f1a99a4f4f9dd1de4fe0312c114e740b5ddead65bb4102884b384c15d8bc9", size = 1989046 }, 159 | { url = "https://files.pythonhosted.org/packages/bc/49/c54baab2f4658c26ac633d798dab66b4c3a9bbf47cff5284e9c182f4137a/pydantic_core-2.27.2-cp312-cp312-win_arm64.whl", hash = "sha256:3911ac9284cd8a1792d3cb26a2da18f3ca26c6908cc434a18f730dc0db7bfa3b", size = 1885092 }, 160 | { url = 
"https://files.pythonhosted.org/packages/41/b1/9bc383f48f8002f99104e3acff6cba1231b29ef76cfa45d1506a5cad1f84/pydantic_core-2.27.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7d14bd329640e63852364c306f4d23eb744e0f8193148d4044dd3dacdaacbd8b", size = 1892709 }, 161 | { url = "https://files.pythonhosted.org/packages/10/6c/e62b8657b834f3eb2961b49ec8e301eb99946245e70bf42c8817350cbefc/pydantic_core-2.27.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:82f91663004eb8ed30ff478d77c4d1179b3563df6cdb15c0817cd1cdaf34d154", size = 1811273 }, 162 | { url = "https://files.pythonhosted.org/packages/ba/15/52cfe49c8c986e081b863b102d6b859d9defc63446b642ccbbb3742bf371/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71b24c7d61131bb83df10cc7e687433609963a944ccf45190cfc21e0887b08c9", size = 1823027 }, 163 | { url = "https://files.pythonhosted.org/packages/b1/1c/b6f402cfc18ec0024120602bdbcebc7bdd5b856528c013bd4d13865ca473/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fa8e459d4954f608fa26116118bb67f56b93b209c39b008277ace29937453dc9", size = 1868888 }, 164 | { url = "https://files.pythonhosted.org/packages/bd/7b/8cb75b66ac37bc2975a3b7de99f3c6f355fcc4d89820b61dffa8f1e81677/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce8918cbebc8da707ba805b7fd0b382816858728ae7fe19a942080c24e5b7cd1", size = 2037738 }, 165 | { url = "https://files.pythonhosted.org/packages/c8/f1/786d8fe78970a06f61df22cba58e365ce304bf9b9f46cc71c8c424e0c334/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eda3f5c2a021bbc5d976107bb302e0131351c2ba54343f8a496dc8783d3d3a6a", size = 2685138 }, 166 | { url = "https://files.pythonhosted.org/packages/a6/74/d12b2cd841d8724dc8ffb13fc5cef86566a53ed358103150209ecd5d1999/pydantic_core-2.27.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bd8086fa684c4775c27f03f062cbb9eaa6e17f064307e86b21b9e0abc9c0f02e", size = 1997025 }, 167 | { url = "https://files.pythonhosted.org/packages/a0/6e/940bcd631bc4d9a06c9539b51f070b66e8f370ed0933f392db6ff350d873/pydantic_core-2.27.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8d9b3388db186ba0c099a6d20f0604a44eabdeef1777ddd94786cdae158729e4", size = 2004633 }, 168 | { url = "https://files.pythonhosted.org/packages/50/cc/a46b34f1708d82498c227d5d80ce615b2dd502ddcfd8376fc14a36655af1/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7a66efda2387de898c8f38c0cf7f14fca0b51a8ef0b24bfea5849f1b3c95af27", size = 1999404 }, 169 | { url = "https://files.pythonhosted.org/packages/ca/2d/c365cfa930ed23bc58c41463bae347d1005537dc8db79e998af8ba28d35e/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:18a101c168e4e092ab40dbc2503bdc0f62010e95d292b27827871dc85450d7ee", size = 2130130 }, 170 | { url = "https://files.pythonhosted.org/packages/f4/d7/eb64d015c350b7cdb371145b54d96c919d4db516817f31cd1c650cae3b21/pydantic_core-2.27.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ba5dd002f88b78a4215ed2f8ddbdf85e8513382820ba15ad5ad8955ce0ca19a1", size = 2157946 }, 171 | { url = "https://files.pythonhosted.org/packages/a4/99/bddde3ddde76c03b65dfd5a66ab436c4e58ffc42927d4ff1198ffbf96f5f/pydantic_core-2.27.2-cp313-cp313-win32.whl", hash = "sha256:1ebaf1d0481914d004a573394f4be3a7616334be70261007e47c2a6fe7e50130", size = 1834387 }, 172 | { url = 
"https://files.pythonhosted.org/packages/71/47/82b5e846e01b26ac6f1893d3c5f9f3a2eb6ba79be26eef0b759b4fe72946/pydantic_core-2.27.2-cp313-cp313-win_amd64.whl", hash = "sha256:953101387ecf2f5652883208769a79e48db18c6df442568a0b5ccd8c2723abee", size = 1990453 }, 173 | { url = "https://files.pythonhosted.org/packages/51/b2/b2b50d5ecf21acf870190ae5d093602d95f66c9c31f9d5de6062eb329ad1/pydantic_core-2.27.2-cp313-cp313-win_arm64.whl", hash = "sha256:ac4dbfd1691affb8f48c2c13241a2e3b60ff23247cbcf981759c768b6633cf8b", size = 1885186 }, 174 | ] 175 | 176 | [[package]] 177 | name = "sniffio" 178 | version = "1.3.1" 179 | source = { registry = "https://pypi.org/simple" } 180 | sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372 } 181 | wheels = [ 182 | { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235 }, 183 | ] 184 | 185 | [[package]] 186 | name = "sse-starlette" 187 | version = "2.2.1" 188 | source = { registry = "https://pypi.org/simple" } 189 | dependencies = [ 190 | { name = "anyio" }, 191 | { name = "starlette" }, 192 | ] 193 | sdist = { url = "https://files.pythonhosted.org/packages/71/a4/80d2a11af59fe75b48230846989e93979c892d3a20016b42bb44edb9e398/sse_starlette-2.2.1.tar.gz", hash = "sha256:54470d5f19274aeed6b2d473430b08b4b379ea851d953b11d7f1c4a2c118b419", size = 17376 } 194 | wheels = [ 195 | { url = "https://files.pythonhosted.org/packages/d9/e0/5b8bd393f27f4a62461c5cf2479c75a2cc2ffa330976f9f00f5f6e4f50eb/sse_starlette-2.2.1-py3-none-any.whl", hash = "sha256:6410a3d3ba0c89e7675d4c273a301d64649c03a5ef1ca101f10b47f895fd0e99", size = 10120 }, 196 | ] 197 | 198 | [[package]] 199 | name = "starlette" 200 | version = "0.45.1" 201 | source = { registry = "https://pypi.org/simple" } 202 | dependencies = [ 203 | { name = "anyio" }, 204 | ] 205 | sdist = { url = "https://files.pythonhosted.org/packages/c1/be/b398217eb35b356d2d9bb84ec67071ea2842e02950fcf38b33df9d5b24ba/starlette-0.45.1.tar.gz", hash = "sha256:a8ae1fa3b1ab7ca83a4abd77871921a13fb5aeaf4874436fb96c29dfcd4ecfa3", size = 2573953 } 206 | wheels = [ 207 | { url = "https://files.pythonhosted.org/packages/6b/2c/a50484b035ee0e13ebb7a42391e391befbfc1b6a9ad5503e83badd182ada/starlette-0.45.1-py3-none-any.whl", hash = "sha256:5656c0524f586e9148d9a3c1dd5257fb42a99892fb0dc6877dd76ef4d184aac3", size = 71488 }, 208 | ] 209 | 210 | [[package]] 211 | name = "typing-extensions" 212 | version = "4.12.2" 213 | source = { registry = "https://pypi.org/simple" } 214 | sdist = { url = "https://files.pythonhosted.org/packages/df/db/f35a00659bc03fec321ba8bce9420de607a1d37f8342eee1863174c69557/typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8", size = 85321 } 215 | wheels = [ 216 | { url = "https://files.pythonhosted.org/packages/26/9f/ad63fc0248c5379346306f8668cda6e2e2e9c95e01216d2b8ffd9ff037d0/typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d", size = 37438 }, 217 | ] 218 | -------------------------------------------------------------------------------- /src/dataset_viewer/server.py: 
-------------------------------------------------------------------------------- 1 | """MCP Server for interacting with Hugging Face dataset viewer API. 2 | 3 | This server provides tools for browsing, filtering and getting statistics about datasets hosted on the 4 | Hugging Face Hub. It uses the official dataset viewer API (https://huggingface.co/docs/dataset-viewer) 5 | to provide: 6 | 7 | - Dataset validation and basic info 8 | - Paginated content viewing 9 | - Dataset statistics 10 | - Support for dataset configurations and splits 11 | 12 | Note: This only works with datasets hosted on the Hugging Face Hub. For local datasets or datasets from 13 | other sources, you'll need to upload them to Hugging Face first. 14 | """ 15 | 16 | import asyncio 17 | from typing import Optional 18 | import httpx 19 | import os 20 | import re 21 | import json 22 | 23 | from mcp.server.models import InitializationOptions 24 | import mcp.types as types 25 | from mcp.server import NotificationOptions, Server 26 | from pydantic import AnyUrl, BaseModel 27 | import mcp.server.stdio 28 | 29 | 30 | class DatasetViewerAPI: 31 | """Internal API client for dataset viewer""" 32 | def __init__(self, base_url: str = "https://datasets-server.huggingface.co", auth_token: str | None = None): 33 | self.base_url = base_url.rstrip("/") 34 | headers = {"Authorization": f"Bearer {auth_token}"} if auth_token else {} 35 | self.client = httpx.AsyncClient(base_url=self.base_url, headers=headers) 36 | 37 | async def validate_dataset(self, dataset: str) -> None: 38 | """Validate dataset ID format and check if it exists""" 39 | # Validate format (username/dataset-name) 40 | if not re.match(r"^[^/]+/[^/]+$", dataset): 41 | raise ValueError("Dataset ID must be in the format 'owner/dataset'") 42 | 43 | # Check if dataset exists and is accessible 44 | try: 45 | response = await self.client.head(f"/is-valid?dataset={dataset}") 46 | response.raise_for_status() 47 | except httpx.NetworkError as e: 48 | raise ConnectionError(f"Network error while validating dataset: {e}") 49 | except httpx.HTTPStatusError as e: 50 | if e.response.status_code == 404: 51 | raise ValueError(f"Dataset '{dataset}' not found") 52 | elif e.response.status_code == 403: 53 | raise ValueError(f"Dataset '{dataset}' exists but requires authentication") 54 | else: 55 | raise RuntimeError(f"Error validating dataset: {e}") 56 | 57 | async def get_info(self, dataset: str) -> dict: 58 | """Get detailed information about a dataset""" 59 | try: 60 | # Get detailed dataset info 61 | response = await self.client.get("/info", params={"dataset": dataset}) 62 | response.raise_for_status() 63 | return response.json() 64 | except httpx.HTTPStatusError as e: 65 | if e.response.status_code == 404: 66 | raise ValueError(f"Dataset '{dataset}' not found") 67 | raise 68 | 69 | async def get_rows(self, dataset: str, config: str, split: str, page: int = 0) -> dict: 70 | """Get paginated rows of a dataset""" 71 | params = { 72 | "dataset": dataset, 73 | "config": config, 74 | "split": split, 75 | "offset": page * 100, # 100 rows per page 76 | "length": 100 77 | } 78 | response = await self.client.get("/rows", params=params) 79 | response.raise_for_status() 80 | return response.json() 81 | 82 | async def get_statistics(self, dataset: str, config: str, split: str) -> dict: 83 | """Get statistics about a dataset""" 84 | params = { 85 | "dataset": dataset, 86 | "config": config, 87 | "split": split 88 | } 89 | response = await self.client.get("/statistics", params=params) 90 | 
response.raise_for_status()
91 |         return response.json()
92 |
93 |     async def get_first_rows(self, dataset: str, config: str, split: str) -> dict:
94 |         """Get first few rows of a dataset split"""
95 |         params = {
96 |             "dataset": dataset,
97 |             "config": config,
98 |             "split": split
99 |         }
100 |         response = await self.client.get("/first-rows", params=params)
101 |         response.raise_for_status()
102 |         return response.json()
103 |
104 |     async def search(self, dataset: str, config: str, split: str, query: str) -> dict:
105 |         """Search for text within a dataset split"""
106 |         params = {
107 |             "dataset": dataset,
108 |             "config": config,
109 |             "split": split,
110 |             "query": query
111 |         }
112 |         response = await self.client.get("/search", params=params)
113 |         response.raise_for_status()
114 |         return response.json()
115 |
116 |     async def filter(self, dataset: str, config: str, split: str, where: str, orderby: str | None = None, page: int = 0) -> dict:
117 |         """Filter dataset rows based on conditions"""
118 |         # Validate page number
119 |         if page < 0:
120 |             raise ValueError("Page number must be non-negative")
121 |
122 |         # Basic SQL clause validation
123 |         if not where.strip():
124 |             raise ValueError("WHERE clause cannot be empty")
125 |         if orderby and not orderby.strip():
126 |             raise ValueError("ORDER BY clause cannot be empty")
127 |
128 |         params = {
129 |             "dataset": dataset,
130 |             "config": config,
131 |             "split": split,
132 |             "where": where,
133 |             "offset": page * 100,  # 100 rows per page
134 |             "length": 100
135 |         }
136 |         if orderby:
137 |             params["orderby"] = orderby
138 |
139 |         try:
140 |             response = await self.client.get("/filter", params=params)
141 |             response.raise_for_status()
142 |             return response.json()
143 |         except httpx.NetworkError as e:
144 |             raise ConnectionError(f"Network error while filtering dataset: {e}")
145 |         except httpx.HTTPStatusError as e:
146 |             if e.response.status_code == 400:
147 |                 raise ValueError(f"Invalid filter query: {e.response.text}")
148 |             elif e.response.status_code == 404:
149 |                 raise ValueError(f"Dataset, config or split not found: {dataset}/{config}/{split}")
150 |             else:
151 |                 raise RuntimeError(f"Error filtering dataset: {e}")
152 |
153 |     async def get_parquet(self, dataset: str) -> list[dict]:
154 |         """List a dataset's exported Parquet files. The /parquet endpoint returns JSON metadata with download URLs, not Parquet bytes"""
155 |         response = await self.client.get("/parquet", params={"dataset": dataset})
156 |         response.raise_for_status()
157 |         return response.json().get("parquet_files", [])
158 |
159 |     async def get_splits(self, dataset: str) -> dict:
160 |         """Get list of available splits for a dataset"""
161 |         response = await self.client.get("/splits", params={"dataset": dataset})
162 |         response.raise_for_status()
163 |         return response.json()
164 |
165 |
166 | class DatasetState:
167 |     """Manages dataset state and caching"""
168 |     def __init__(self):
169 |         self.datasets: dict[str, dict] = {}  # Cache dataset info
170 |         self.current_page: dict[str, int] = {}  # Track pagination
171 |         # Get auth token from environment if available
172 |         auth_token = os.environ.get("HUGGINGFACE_TOKEN")
173 |         self.api = DatasetViewerAPI(auth_token=auth_token)
174 |
175 |     async def get_dataset(self, dataset: str) -> dict:
176 |         """Get dataset info, using cache if available"""
177 |         if dataset not in self.datasets:
178 |             self.datasets[dataset] = await self.api.get_info(dataset)
179 |         return self.datasets[dataset]
180 |
181 |
182 | # Initialize server and state
183 | server = Server("dataset-viewer")
184 | state = DatasetState()
185 |
186 |
187 | @server.list_resources()
188 | async def handle_list_resources() -> list[types.Resource]:
189 |     """List available dataset resources"""
190 |     resources = []
191 |     for dataset, info in state.datasets.items():
192 |         resources.append(
193 |             types.Resource(
194 |                 uri=AnyUrl(f"dataset://{dataset}"),
195 |                 name=dataset,
196 |                 description=info.get("description", "No description available"),
197 |                 mimeType="application/json",
198 |             )
199 |         )
200 |     return resources
201 |
202 |
203 | @server.read_resource()
204 | async def handle_read_resource(uri: AnyUrl) -> str:
205 |     """Read a specific dataset's content"""
206 |     if uri.scheme != "dataset":
207 |         raise ValueError(f"Unsupported URI scheme: {uri.scheme}")
208 |
209 |     # For dataset://owner/name the owner parses as the URI host, so rebuild the full id
210 |     dataset = f"{uri.host or ''}{uri.path or ''}".lstrip("/")
211 |     if dataset:
212 |         info = await state.get_dataset(dataset)
213 |         return str(info)  # Convert to string for display
214 |     raise ValueError(f"Dataset not found: {uri}")
215 |
216 |
217 | @server.list_tools()
218 | async def handle_list_tools() -> list[types.Tool]:
219 |     """List available dataset tools for Hugging Face datasets"""
220 |     return [
221 |         types.Tool(
222 |             name="get_info",
223 |             description="Get detailed information about a Hugging Face dataset including description, features, splits, and statistics. Run validate first to check if the dataset exists and is accessible.",
224 |             inputSchema={
225 |                 "type": "object",
226 |                 "properties": {
227 |                     "dataset": {
228 |                         "type": "string",
229 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
230 |                         "pattern": "^[^/]+/[^/]+$",
231 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
232 |                     },
233 |                     "auth_token": {
234 |                         "type": "string",
235 |                         "description": "Hugging Face auth token for private/gated datasets",
236 |                         "optional": True
237 |                     }
238 |                 },
239 |                 "required": ["dataset"],
240 |             }
241 |         ),
242 |         types.Tool(
243 |             name="get_rows",
244 |             description="Get paginated rows from a Hugging Face dataset",
245 |             inputSchema={
246 |                 "type": "object",
247 |                 "properties": {
248 |                     "dataset": {
249 |                         "type": "string",
250 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
251 |                         "pattern": "^[^/]+/[^/]+$",
252 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
253 |                     },
254 |                     "config": {
255 |                         "type": "string",
256 |                         "description": "Dataset configuration/subset name. Use get_info to list available configs",
257 |                         "examples": ["default", "en", "es"]
258 |                     },
259 |                     "split": {
260 |                         "type": "string",
261 |                         "description": "Dataset split name. Splits partition the data for training/evaluation",
262 |                         "examples": ["train", "validation", "test"]
263 |                     },
264 |                     "page": {"type": "integer", "description": "Page number (0-based), returns 100 rows per page", "default": 0},
265 |                     "auth_token": {
266 |                         "type": "string",
267 |                         "description": "Hugging Face auth token for private/gated datasets",
268 |                         "optional": True
269 |                     }
270 |                 },
271 |                 "required": ["dataset", "config", "split"],
272 |             }
273 |         ),
274 |         types.Tool(
275 |             name="get_first_rows",
276 |             description="Get first rows from a Hugging Face dataset split",
277 |             inputSchema={
278 |                 "type": "object",
279 |                 "properties": {
280 |                     "dataset": {
281 |                         "type": "string",
282 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
283 |                         "pattern": "^[^/]+/[^/]+$",
284 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
285 |                     },
286 |                     "config": {
287 |                         "type": "string",
288 |                         "description": "Dataset configuration/subset name.
Use get_info to list available configs", 289 | "examples": ["default", "en", "es"] 290 | }, 291 | "split": { 292 | "type": "string", 293 | "description": "Dataset split name. Splits partition the data for training/evaluation", 294 | "examples": ["train", "validation", "test"] 295 | }, 296 | "auth_token": { 297 | "type": "string", 298 | "description": "Hugging Face auth token for private/gated datasets", 299 | "optional": True 300 | } 301 | }, 302 | "required": ["dataset", "config", "split"], 303 | } 304 | ), 305 | types.Tool( 306 | name="search_dataset", 307 | description="Search for text within a Hugging Face dataset", 308 | inputSchema={ 309 | "type": "object", 310 | "properties": { 311 | "dataset": { 312 | "type": "string", 313 | "description": "Hugging Face dataset identifier in the format owner/dataset", 314 | "pattern": "^[^/]+/[^/]+$", 315 | "examples": ["ylecun/mnist", "stanfordnlp/imdb"] 316 | }, 317 | "config": { 318 | "type": "string", 319 | "description": "Dataset configuration/subset name. Use get_info to list available configs", 320 | "examples": ["default", "en", "es"] 321 | }, 322 | "split": { 323 | "type": "string", 324 | "description": "Dataset split name. Splits partition the data for training/evaluation", 325 | "examples": ["train", "validation", "test"] 326 | }, 327 | "query": {"type": "string", "description": "Text to search for in the dataset"}, 328 | "auth_token": { 329 | "type": "string", 330 | "description": "Hugging Face auth token for private/gated datasets", 331 | "optional": True 332 | } 333 | }, 334 | "required": ["dataset", "config", "split", "query"], 335 | } 336 | ), 337 | types.Tool( 338 | name="filter", 339 | description="Filter rows in a Hugging Face dataset using SQL-like conditions", 340 | inputSchema={ 341 | "type": "object", 342 | "properties": { 343 | "dataset": { 344 | "type": "string", 345 | "description": "Hugging Face dataset identifier in the format owner/dataset", 346 | "pattern": "^[^/]+/[^/]+$", 347 | "examples": ["ylecun/mnist", "stanfordnlp/imdb"] 348 | }, 349 | "config": { 350 | "type": "string", 351 | "description": "Dataset configuration/subset name. Use get_info to list available configs", 352 | "examples": ["default", "en", "es"] 353 | }, 354 | "split": { 355 | "type": "string", 356 | "description": "Dataset split name. 
Splits partition the data for training/evaluation",
357 |                         "examples": ["train", "validation", "test"]
358 |                     },
359 |                     "where": {
360 |                         "type": "string",
361 |                         "description": "SQL-like WHERE clause to filter rows (use single quotes for string literals)",
362 |                         "examples": ["column = 'value'", "score > 0.5", "text LIKE '%query%'"]
363 |                     },
364 |                     "orderby": {
365 |                         "type": "string",
366 |                         "description": "SQL-like ORDER BY clause to sort results",
367 |                         "optional": True,
368 |                         "examples": ["column ASC", "score DESC", "name ASC, id DESC"]
369 |                     },
370 |                     "page": {
371 |                         "type": "integer",
372 |                         "description": "Page number for paginated results (100 rows per page)",
373 |                         "default": 0,
374 |                         "minimum": 0
375 |                     },
376 |                     "auth_token": {
377 |                         "type": "string",
378 |                         "description": "Hugging Face auth token for private/gated datasets",
379 |                         "optional": True
380 |                     }
381 |                 },
382 |                 "required": ["dataset", "config", "split", "where"],
383 |             }
384 |         ),
385 |         types.Tool(
386 |             name="get_statistics",
387 |             description="Get statistics about a Hugging Face dataset",
388 |             inputSchema={
389 |                 "type": "object",
390 |                 "properties": {
391 |                     "dataset": {
392 |                         "type": "string",
393 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
394 |                         "pattern": "^[^/]+/[^/]+$",
395 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
396 |                     },
397 |                     "config": {
398 |                         "type": "string",
399 |                         "description": "Dataset configuration/subset name. Use get_info to list available configs",
400 |                         "examples": ["default", "en", "es"]
401 |                     },
402 |                     "split": {
403 |                         "type": "string",
404 |                         "description": "Dataset split name. Splits partition the data for training/evaluation",
405 |                         "examples": ["train", "validation", "test"]
406 |                     },
407 |                     "auth_token": {
408 |                         "type": "string",
409 |                         "description": "Hugging Face auth token for private/gated datasets",
410 |                         "optional": True
411 |                     }
412 |                 },
413 |                 "required": ["dataset", "config", "split"],
414 |             }
415 |         ),
416 |         types.Tool(
417 |             name="get_parquet",
418 |             description="Download a Hugging Face dataset's exported Parquet files",
419 |             inputSchema={
420 |                 "type": "object",
421 |                 "properties": {
422 |                     "dataset": {
423 |                         "type": "string",
424 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
425 |                         "pattern": "^[^/]+/[^/]+$",
426 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
427 |                     },
428 |                     "auth_token": {
429 |                         "type": "string",
430 |                         "description": "Hugging Face auth token for private/gated datasets",
431 |                         "optional": True
432 |                     }
433 |                 },
434 |                 "required": ["dataset"],
435 |             }
436 |         ),
437 |         types.Tool(
438 |             name="validate",
439 |             description="Check if a Hugging Face dataset exists and is accessible",
440 |             inputSchema={
441 |                 "type": "object",
442 |                 "properties": {
443 |                     "dataset": {
444 |                         "type": "string",
445 |                         "description": "Hugging Face dataset identifier in the format owner/dataset",
446 |                         "pattern": "^[^/]+/[^/]+$",
447 |                         "examples": ["ylecun/mnist", "stanfordnlp/imdb"]
448 |                     },
449 |                     "auth_token": {
450 |                         "type": "string",
451 |                         "description": "Hugging Face auth token for private/gated datasets",
452 |                         "optional": True
453 |                     }
454 |                 },
455 |                 "required": ["dataset"],
456 |             }
457 |         ),
458 |     ]
459 |
460 |
461 | @server.call_tool()
462 | async def handle_call_tool(
463 |     name: str, arguments: dict | None
464 | ) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
465 |     """Handle tool execution requests"""
466 |     if arguments is None:
467 |         arguments = {}
468 |
469 |     # Allow overriding env token with explicit token
470 |     auth_token = arguments.pop("auth_token", None) or os.environ.get("HUGGINGFACE_TOKEN")
471 |
472 |     if name == "get_info":
473 |         dataset = arguments["dataset"]
474 |         try:
475 |             response = await DatasetViewerAPI(auth_token=auth_token).client.get("/info", params={"dataset": dataset})
476 |             response.raise_for_status()
477 |             result = response.json()
478 |             return [
479 |                 types.TextContent(
480 |                     type="text",
481 |                     text=json.dumps(result, indent=2)
482 |                 )
483 |             ]
484 |         except httpx.HTTPStatusError as e:
485 |             if e.response.status_code == 404:
486 |                 return [
487 |                     types.TextContent(
488 |                         type="text",
489 |                         text=f"Dataset '{dataset}' not found"
490 |                     )
491 |                 ]
492 |             raise
493 |
494 |     elif name == "get_rows":
495 |         dataset = arguments["dataset"]
496 |         config = arguments["config"]
497 |         split = arguments["split"]
498 |         page = arguments.get("page", 0)
499 |         rows = await DatasetViewerAPI(auth_token=auth_token).get_rows(dataset, config=config, split=split, page=page)
500 |         return [
501 |             types.TextContent(
502 |                 type="text",
503 |                 text=json.dumps(rows, indent=2)
504 |             )
505 |         ]
506 |
507 |     elif name == "get_first_rows":
508 |         dataset = arguments["dataset"]
509 |         config = arguments["config"]
510 |         split = arguments["split"]
511 |         first_rows = await DatasetViewerAPI(auth_token=auth_token).get_first_rows(dataset, config=config, split=split)
512 |         return [
513 |             types.TextContent(
514 |                 type="text",
515 |                 text=json.dumps(first_rows, indent=2)
516 |             )
517 |         ]
518 |
519 |     elif name == "search_dataset":
520 |         dataset = arguments["dataset"]
521 |         config = arguments["config"]
522 |         split = arguments["split"]
523 |         query = arguments["query"]
524 |         search_result = await DatasetViewerAPI(auth_token=auth_token).search(dataset, config=config, split=split, query=query)
525 |         return [
526 |             types.TextContent(
527 |                 type="text",
528 |                 text=json.dumps(search_result, indent=2)
529 |             )
530 |         ]
531 |
532 |     elif name == "filter":
533 |         dataset = arguments["dataset"]
534 |         config = arguments["config"]
535 |         split = arguments["split"]
536 |         where = arguments["where"]
537 |         orderby = arguments.get("orderby")
538 |         page = arguments.get("page", 0)
539 |         filtered = await DatasetViewerAPI(auth_token=auth_token).filter(dataset, config=config, split=split, where=where, orderby=orderby, page=page)
540 |         return [
541 |             types.TextContent(
542 |                 type="text",
543 |                 text=json.dumps(filtered, indent=2)
544 |             )
545 |         ]
546 |
547 |     elif name == "get_statistics":
548 |         dataset = arguments["dataset"]
549 |         config = arguments["config"]
550 |         split = arguments["split"]
551 |         stats = await DatasetViewerAPI(auth_token=auth_token).get_statistics(dataset, config=config, split=split)
552 |         return [
553 |             types.TextContent(
554 |                 type="text",
555 |                 text=json.dumps(stats, indent=2)
556 |             )
557 |         ]
558 |
559 |     elif name == "get_parquet":
560 |         dataset = arguments["dataset"]
561 |         api = DatasetViewerAPI(auth_token=auth_token)
562 |         parquet_files = await api.get_parquet(dataset)
563 |
564 |         # Download each exported Parquet file; the listed URLs may redirect to a CDN
565 |         saved = []
566 |         for i, meta in enumerate(parquet_files):
567 |             file_response = await api.client.get(meta["url"], follow_redirects=True)
568 |             file_response.raise_for_status()
569 |             filepath = os.path.join(os.getcwd(), f"{dataset.replace('/', '_')}_{i}.parquet")
570 |             with open(filepath, "wb") as f:
571 |                 f.write(file_response.content)
572 |             saved.append(filepath)
573 |
574 |         return [types.TextContent(type="text", text="Dataset exported to:\n" + "\n".join(saved) if saved else f"No Parquet files found for '{dataset}'")]
575 |
576 |     elif name == "validate":
577 |         dataset = arguments["dataset"]
578 |         try:
579 |             # First check format
580 |             if not re.match(r"^[^/]+/[^/]+$", dataset):
581 |                 return [
582 |                     types.TextContent(
583 |                         type="text",
584 |                         text="Dataset must be in
the format 'owner/dataset'" 585 | ) 586 | ] 587 | 588 | # Then check if dataset exists and is accessible 589 | response = await DatasetViewerAPI(auth_token=auth_token).client.get("/is-valid", params={"dataset": dataset}) 590 | response.raise_for_status() 591 | result = response.json() 592 | 593 | return [ 594 | types.TextContent( 595 | type="text", 596 | text=json.dumps(result, indent=2) 597 | ) 598 | ] 599 | except httpx.NetworkError as e: 600 | return [ 601 | types.TextContent( 602 | type="text", 603 | text=str(e) 604 | ) 605 | ] 606 | except httpx.HTTPStatusError as e: 607 | if e.response.status_code == 404: 608 | return [ 609 | types.TextContent( 610 | type="text", 611 | text=f"Dataset '{dataset}' not found" 612 | ) 613 | ] 614 | elif e.response.status_code == 403: 615 | return [ 616 | types.TextContent( 617 | type="text", 618 | text=f"Dataset '{dataset}' requires authentication" 619 | ) 620 | ] 621 | else: 622 | return [ 623 | types.TextContent( 624 | type="text", 625 | text=str(e) 626 | ) 627 | ] 628 | raise ValueError(f"Unknown tool: {name}") 629 | 630 | 631 | @server.list_prompts() 632 | async def handle_list_prompts() -> list[types.Prompt]: 633 | """List available prompts for dataset analysis""" 634 | return [ 635 | types.Prompt( 636 | name="analyze-dataset", 637 | description="Analyze a dataset's content and structure", 638 | arguments=[ 639 | types.PromptArgument( 640 | name="dataset", 641 | description="Dataset identifier", 642 | required=True, 643 | ) 644 | ], 645 | ) 646 | ] 647 | 648 | @server.get_prompt() 649 | async def handle_get_prompt( 650 | name: str, arguments: dict[str, str] | None 651 | ) -> types.GetPromptResult: 652 | """Generate dataset analysis prompts""" 653 | if name != "analyze-dataset": 654 | raise ValueError(f"Unknown prompt: {name}") 655 | 656 | if not arguments or "dataset" not in arguments: 657 | raise ValueError("Missing dataset argument") 658 | 659 | dataset = arguments["dataset"] 660 | info = await state.get_dataset(dataset) 661 | 662 | return types.GetPromptResult( 663 | description=f"Analyze dataset: {dataset}", 664 | messages=[ 665 | types.PromptMessage( 666 | role="user", 667 | content=types.TextContent( 668 | type="text", 669 | text=f"Please analyze this dataset:\n\n{str(info)}", 670 | ), 671 | ) 672 | ], 673 | ) 674 | 675 | 676 | async def main(): 677 | """Run the server using stdin/stdout streams""" 678 | async with mcp.server.stdio.stdio_server() as (read_stream, write_stream): 679 | await server.run( 680 | read_stream, 681 | write_stream, 682 | InitializationOptions( 683 | server_name="dataset-viewer", 684 | server_version="0.1.0", 685 | capabilities=server.get_capabilities( 686 | notification_options=NotificationOptions(), 687 | experimental_capabilities={}, 688 | ), 689 | ), 690 | ) 691 | 692 | if __name__ == "__main__": 693 | asyncio.run(main()) --------------------------------------------------------------------------------
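
A quick way to exercise the server end-to-end is with the MCP Python client over stdio. The sketch below is not part of the repository; it assumes the client half of the same `mcp` package pinned in uv.lock (`ClientSession`, `StdioServerParameters`, `stdio_client`) and launches the server the same way the Claude Desktop config does:

```python
# try_server.py - hypothetical smoke test, not shipped with the repository
import asyncio

from mcp import ClientSession, StdioServerParameters
from mcp.client.stdio import stdio_client


async def main():
    # Spawn the dataset-viewer server as a subprocess over stdio
    params = StdioServerParameters(command="uv", args=["run", "dataset-viewer"])
    async with stdio_client(params) as (read_stream, write_stream):
        async with ClientSession(read_stream, write_stream) as session:
            await session.initialize()

            # List the eight tools registered by handle_list_tools()
            tools = await session.list_tools()
            print([tool.name for tool in tools.tools])

            # Validate a public dataset; the tool returns the /is-valid JSON body
            result = await session.call_tool("validate", {"dataset": "stanfordnlp/imdb"})
            print(result.content[0].text)


asyncio.run(main())
```

Running it should print the tool names followed by the validation response; swapping in a `filter` call with a single-quoted SQL literal (e.g. `where="label = 'pos'"`) exercises the query path the same way.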