├── physionet ├── auth.py ├── config.py ├── project │ ├── search.py │ ├── __init__.py │ ├── metadata.py │ └── loader.py ├── datathon │ ├── __init__.py │ └── mlhc.py ├── metrics │ ├── __init__.py │ ├── openalex.py │ └── dimensions.py ├── __main__.py ├── __init__.py ├── validate │ ├── __init__.py │ ├── checks │ │ ├── __init__.py │ │ ├── documentation.py │ │ ├── quality.py │ │ ├── integrity.py │ │ ├── filesystem.py │ │ └── privacy.py │ ├── config.py │ ├── validator.py │ └── models.py ├── api │ ├── __init__.py │ ├── exceptions.py │ ├── utils.py │ ├── models.py │ ├── client.py │ └── endpoints.py └── cli.py ├── requirements.txt ├── tests ├── api │ ├── __init__.py │ ├── test_exceptions.py │ ├── test_utils.py │ ├── test_models.py │ ├── test_client.py │ └── test_endpoints.py ├── test_loader.py ├── test_search.py ├── validate │ ├── __init__.py │ ├── test_cli.py │ ├── test_validator.py │ └── test_checks.py └── test_dataset.py ├── .gitattributes ├── LICENSE ├── .gitignore ├── .github └── workflows │ └── run-tests.yml ├── pyproject.toml └── README.md /physionet/auth.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_loader.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_search.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/project/search.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/datathon/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/metrics/openalex.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/project/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /physionet/project/metadata.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/validate/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for validation module.""" 2 | 
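The tree above shows the package's two public entry points: the API client under `physionet/api`, which the top-level `physionet/__init__.py` re-exports later in this listing, and the `physionet/validate` subpackage behind the `physionet validate` CLI. A minimal orientation sketch (the dataset path is a placeholder and the API call assumes network access):

```python
from physionet import PhysioNetClient
from physionet.validate import validate_dataset

# Query published projects through the API client; the context manager
# closes the underlying requests session on exit.
with PhysioNetClient() as client:
    projects = client.projects.list_published()
    print(f"{len(projects)} published projects")

# Run the pre-submission validator on a local dataset directory.
result = validate_dataset("/path/to/dataset", show_progress=False)
print(result.summary())
```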
-------------------------------------------------------------------------------- /physionet/metrics/dimensions.py: -------------------------------------------------------------------------------- 1 | # TODO: add tools for getting metrics from dimensions.ai 2 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | def test_hello_world(): 2 | assert "Hello, world!" == "Hello, world!" 3 | -------------------------------------------------------------------------------- /physionet/__main__.py: -------------------------------------------------------------------------------- 1 | """Allow running the CLI as a module: python -m physionet.""" 2 | 3 | import sys 4 | from physionet.cli import main 5 | 6 | if __name__ == "__main__": 7 | sys.exit(main()) 8 | -------------------------------------------------------------------------------- /physionet/__init__.py: -------------------------------------------------------------------------------- 1 | from physionet.api import PhysioNetClient 2 | 3 | try: 4 | from importlib.metadata import version 5 | __version__ = version("physionet") 6 | except Exception: 7 | __version__ = "unknown" 8 | 9 | __all__ = ["PhysioNetClient"] 10 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py diff=python 2 | 3 | *.anI binary 4 | *.atr binary 5 | *.d[0-9] binary 6 | *.dat binary 7 | *.edf binary 8 | *.gz binary 9 | *.mat binary 10 | *.qrs binary 11 | *.wabp binary 12 | *.wav binary 13 | *.wqrs binary 14 | *.xyz binary 15 | -------------------------------------------------------------------------------- /physionet/validate/__init__.py: -------------------------------------------------------------------------------- 1 | """Dataset validation module for PhysioNet submissions.""" 2 | 3 | from physionet.validate.validator import validate_dataset 4 | from physionet.validate.config import ValidationConfig 5 | from physionet.validate.models import ValidationResult 6 | 7 | __all__ = ["validate_dataset", "ValidationConfig", "ValidationResult"] 8 | -------------------------------------------------------------------------------- /physionet/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .client import PhysioNetClient 2 | from .exceptions import ( 3 | PhysioNetAPIError, 4 | BadRequestError, 5 | ForbiddenError, 6 | NotFoundError, 7 | RateLimitError, 8 | ) 9 | 10 | __all__ = [ 11 | "PhysioNetClient", 12 | "PhysioNetAPIError", 13 | "BadRequestError", 14 | "ForbiddenError", 15 | "NotFoundError", 16 | "RateLimitError", 17 | ] 18 | -------------------------------------------------------------------------------- /physionet/validate/checks/__init__.py: -------------------------------------------------------------------------------- 1 | """Validation check modules.""" 2 | 3 | from physionet.validate.checks.filesystem import check_filesystem 4 | from physionet.validate.checks.documentation import check_documentation 5 | from physionet.validate.checks.integrity import check_integrity 6 | from physionet.validate.checks.quality import check_quality 7 | from physionet.validate.checks.privacy import check_privacy 8 | 9 | __all__ = [ 10 | "check_filesystem", 11 | "check_documentation", 12 | "check_integrity", 13 | "check_quality", 14 | "check_privacy", 15 | ] 16 | 
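Every check exported above shares the same contract: it takes the dataset path and a `ValidationConfig` and returns a `CheckResult` whose `issues` list holds any findings (the validator additionally probes each check for an optional `progress_callback` keyword). A minimal sketch of running a single check in isolation, with a placeholder directory:

```python
from pathlib import Path

from physionet.validate.checks import check_documentation
from physionet.validate.config import ValidationConfig

config = ValidationConfig()  # default config requires README.md to be present
result = check_documentation(Path("/path/to/dataset"), config)

# Each ValidationIssue carries a severity, the offending file, a message,
# and a suggested fix.
for issue in result.issues:
    print(issue.severity, issue.file, issue.message)
```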
--------------------------------------------------------------------------------
/physionet/api/exceptions.py:
--------------------------------------------------------------------------------
1 | class PhysioNetAPIError(Exception):
2 |     """Base exception for PhysioNet API errors."""
3 | 
4 |     pass
5 | 
6 | 
7 | class BadRequestError(PhysioNetAPIError):
8 |     """Raised when API returns 400 Bad Request."""
9 | 
10 |     pass
11 | 
12 | 
13 | class ForbiddenError(PhysioNetAPIError):
14 |     """Raised when API returns 403 Forbidden."""
15 | 
16 |     pass
17 | 
18 | 
19 | class NotFoundError(PhysioNetAPIError):
20 |     """Raised when API returns 404 Not Found."""
21 | 
22 |     pass
23 | 
24 | 
25 | class RateLimitError(PhysioNetAPIError):
26 |     """Raised when API returns 429 Too Many Requests."""
27 | 
28 |     pass
29 | 
--------------------------------------------------------------------------------
/physionet/project/loader.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains code for loading and processing PhysioNet data.
3 | """
4 | import requests
5 | 
6 | 
7 | def hello():
8 |     print("Hello world!")
9 | 
10 | 
11 | def _get_request(root='https://physionet.org/api/v1/',
12 |                  endpoint='project/published/'):
13 |     """
14 |     Make a GET request to the PhysioNet API.
15 | 
16 |     Returns:
17 |         response (requests.models.Response): Response object from the API call.
18 |     """
19 |     url = root + endpoint
20 |     response = requests.get(url)
21 | 
22 |     if response.status_code != 200:
23 |         raise Exception(f'Error: request to {url} failed with status {response.status_code}')
24 | 
25 |     return response
26 | 
--------------------------------------------------------------------------------
/physionet/api/utils.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import os
3 | 
4 | 
5 | def get_credentials_from_env() -> tuple[Optional[str], Optional[str]]:
6 |     """
7 |     Get PhysioNet credentials from environment variables.
8 | 
9 |     Returns:
10 |         Tuple of (username, password) or (None, None)
11 |     """
12 |     username = os.getenv("PHYSIONET_USERNAME")
13 |     password = os.getenv("PHYSIONET_PASSWORD")
14 |     return username, password
15 | 
16 | 
17 | def format_size(size_bytes: int) -> str:
18 |     """
19 |     Format bytes to human-readable size.
20 | 
21 |     Args:
22 |         size_bytes: Size in bytes
23 | 
24 |     Returns:
25 |         Formatted string (e.g., "1.5 GB")
26 |     """
27 |     for unit in ["B", "KB", "MB", "GB", "TB"]:
28 |         if size_bytes < 1024.0:
29 |             return f"{size_bytes:.2f} {unit}"
30 |         size_bytes /= 1024.0
31 |     return f"{size_bytes:.2f} PB"
32 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 | 
3 | Copyright (c) Tom Pollard
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software. 
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .env
11 | .Python
12 | env/
13 | venv/
14 | .venv/
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | 
30 | # PyPI credentials
31 | .pypirc
32 | 
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 | 
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 | 
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | .hypothesis/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | 
61 | # Sphinx documentation
62 | docs/_build/
63 | 
64 | # PyBuilder
65 | target/
66 | 
67 | # Ipython Notebook
68 | .ipynb_checkpoints
69 | 
70 | # DS_Store files
71 | .DS_Store
72 | 
73 | # Local config
74 | .vscode
75 | 
--------------------------------------------------------------------------------
/.github/workflows/run-tests.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: Python package
5 | 
6 | on:
7 |   push:
8 |     branches: [ "main" ]
9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | jobs:
13 |   build:
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       fail-fast: false
17 |       matrix:
18 |         python-version: ["3.9", "3.10"]
19 |         include:
20 |           - python-version: "3.10"
21 |             coverage: 1
22 |     steps:
23 |     - uses: actions/checkout@v4
24 |     - name: Set up Python ${{ matrix.python-version }}
25 |       uses: actions/setup-python@v5
26 |       with:
27 |         python-version: ${{ matrix.python-version }}
28 |     - name: Install dependencies
29 |       run: |
30 |         python -m pip install --upgrade pip
31 |         python -m pip install flake8 pyright
32 |         python -m pip install -e ".[dev]"
33 |     - name: Lint with flake8
34 |       run: |
35 |         # stop the build if there are Python syntax errors or undefined names
36 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
37 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
38 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
39 |     - name: Test with pytest
40 |       run: |
41 |         pytest .
42 | -------------------------------------------------------------------------------- /tests/api/test_exceptions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from physionet.api.exceptions import ( 3 | PhysioNetAPIError, 4 | BadRequestError, 5 | ForbiddenError, 6 | NotFoundError, 7 | RateLimitError, 8 | ) 9 | 10 | 11 | def test_base_exception(): 12 | """Test base PhysioNetAPIError exception.""" 13 | with pytest.raises(PhysioNetAPIError): 14 | raise PhysioNetAPIError("Test error") 15 | 16 | 17 | def test_bad_request_error(): 18 | """Test BadRequestError is a subclass of PhysioNetAPIError.""" 19 | with pytest.raises(PhysioNetAPIError): 20 | raise BadRequestError("Bad request") 21 | 22 | 23 | def test_forbidden_error(): 24 | """Test ForbiddenError is a subclass of PhysioNetAPIError.""" 25 | with pytest.raises(PhysioNetAPIError): 26 | raise ForbiddenError("Forbidden") 27 | 28 | 29 | def test_not_found_error(): 30 | """Test NotFoundError is a subclass of PhysioNetAPIError.""" 31 | with pytest.raises(PhysioNetAPIError): 32 | raise NotFoundError("Not found") 33 | 34 | 35 | def test_rate_limit_error(): 36 | """Test RateLimitError is a subclass of PhysioNetAPIError.""" 37 | with pytest.raises(PhysioNetAPIError): 38 | raise RateLimitError("Rate limit exceeded") 39 | 40 | 41 | def test_exception_messages(): 42 | """Test that exception messages are preserved.""" 43 | error_msg = "Custom error message" 44 | 45 | try: 46 | raise BadRequestError(error_msg) 47 | except BadRequestError as e: 48 | assert str(e) == error_msg 49 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "physionet" 7 | version = "0.1.6" 8 | authors = [ 9 | { name="Tom Pollard", email="tpollard@mit.edu" }, 10 | ] 11 | license = {file = "LICENSE"} 12 | description = "A collection of tools for working with the PhysioNet repository." 
13 | readme = "README.md" 14 | requires-python = ">=3.9" 15 | keywords=["physionet", "mimic", "medical", "dataset"] 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ] 21 | dependencies = [ 22 | "numpy", 23 | "pandas", 24 | "openpyxl", 25 | "requests", 26 | "tqdm", 27 | ] 28 | 29 | [project.optional-dependencies] 30 | dev = [ 31 | "pytest>=7.0.0", 32 | "requests-mock>=1.9.0", 33 | ] 34 | build = [ 35 | "build>=0.10.0", 36 | "twine>=4.0.0", 37 | ] 38 | 39 | [tool.black] 40 | line-length = 119 41 | 42 | [tool.pyright] 43 | reportMissingImports = true 44 | 45 | [project.scripts] 46 | physionet = "physionet.cli:main" 47 | 48 | [project.urls] 49 | homepage = "https://github.com/MIT-LCP/physionet" 50 | repository = "https://github.com/MIT-LCP/physionet" 51 | 52 | [tool.hatch.build.targets.sdist] 53 | exclude = [ 54 | "venv/", 55 | "env/", 56 | ".venv/", 57 | "*.egg-info/", 58 | "dist/", 59 | "build/", 60 | ".pytest_cache/", 61 | ".git/", 62 | ".github/", 63 | "__pycache__/", 64 | "*.pyc", 65 | ".DS_Store", 66 | ] 67 | 68 | [tool.hatch.build.targets.wheel] 69 | packages = ["physionet"] 70 | -------------------------------------------------------------------------------- /physionet/validate/checks/documentation.py: -------------------------------------------------------------------------------- 1 | """Documentation validation checks.""" 2 | 3 | from pathlib import Path 4 | 5 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 6 | from physionet.validate.config import ValidationConfig 7 | 8 | 9 | def check_documentation(path: Path, config: ValidationConfig) -> CheckResult: 10 | """ 11 | Check documentation completeness. 12 | 13 | Validates: 14 | - Required files exist (if any are specified in config) 15 | 16 | Args: 17 | path: Path to dataset directory 18 | config: Validation configuration 19 | 20 | Returns: 21 | CheckResult with any documentation issues found 22 | """ 23 | result = CheckResult(category=CheckCategory.DOCUMENTATION) 24 | 25 | # Check for required files 26 | for required_file in config.required_files: 27 | file_path = path / required_file 28 | if not file_path.exists(): 29 | # Customize suggestion for README.md 30 | if required_file == "README.md": 31 | suggestion = ( 32 | "Add README.md to your dataset. At minimum, the file should include " 33 | "a title and a brief description of the package content." 34 | ) 35 | else: 36 | suggestion = f"Add {required_file} to your dataset" 37 | 38 | result.issues.append( 39 | ValidationIssue( 40 | severity=Severity.ERROR, 41 | category=CheckCategory.DOCUMENTATION, 42 | file=required_file, 43 | message=f"Required file not found: {required_file}", 44 | suggestion=suggestion, 45 | ) 46 | ) 47 | 48 | return result 49 | -------------------------------------------------------------------------------- /physionet/datathon/mlhc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Temporary module for the MLHC Professional Studies Class. 3 | """ 4 | from collections import Counter 5 | 6 | from google.colab import widgets 7 | import numpy as np 8 | 9 | 10 | def visualize_notes(notes, hadm_id): 11 | """ 12 | Temporary function for visualizing notes. 
13 |     """
14 |     # When did this patient arrive? Admission time anchors each note's time offset.
15 |     admittime = notes[notes.hadm_id == hadm_id].admittime.values[0]
16 | 
17 |     # Get the notes for this patient
18 |     notes_subject = notes.loc[notes.hadm_id == hadm_id]
19 | 
20 |     # How many notes for each category?
21 |     category_counts = Counter(notes_subject.category.values)
22 |     category_sorted = sorted(category_counts.keys(), key=lambda t: category_counts[t], reverse=True)
23 | 
24 |     # Outer tab is for different categories of notes
25 |     outer_tab = widgets.TabBar(category_sorted, location="top")
26 |     for category in category_sorted:
27 |         with outer_tab.output_to(category):
28 |             notes_cat = notes_subject.loc[notes_subject.category == category]
29 |             titles = []
30 |             for num, (i, row) in enumerate(notes_cat.iterrows()):
31 |                 # Hours elapsed between admission and this note's charttime
32 |                 time_offset = (row.charttime - admittime).total_seconds() / 3600.0
33 |                 time_offset = int(time_offset) if not np.isnan(time_offset) else "n/a"
34 | 
35 |                 # Tab title includes the note's offset (in hours) from admission
36 |                 titles += ["%s Note #%d (%s Hours)" % (category, num, time_offset)]
37 | 
38 |             # Inner tab is for each note in a category
39 |             inner_tab = widgets.TabBar(titles, location="start")
40 |             for i in range(len(titles)):
41 |                 with inner_tab.output_to(titles[i]):
42 |                     print(notes_cat.iloc[i]["text"])
43 | 
--------------------------------------------------------------------------------
/physionet/validate/config.py:
--------------------------------------------------------------------------------
1 | """Configuration for validation checks."""
2 | 
3 | from dataclasses import dataclass, field
4 | from typing import Dict, List, Optional, Tuple
5 | 
6 | 
7 | @dataclass
8 | class ValidationConfig:
9 |     """Configuration for dataset validation."""
10 | 
11 |     # General settings
12 |     check_filesystem: bool = True
13 |     check_documentation: bool = True
14 |     check_integrity: bool = True
15 |     check_quality: bool = True
16 |     check_phi: bool = True
17 | 
18 |     # File system settings
19 |     max_file_size_bytes: Optional[int] = None  # None = no limit
20 |     warn_small_files_threshold: int = 100  # Warn if more than this many small files
21 |     ignore_patterns: List[str] = field(default_factory=lambda: [
22 |         ".git", ".gitignore", ".DS_Store", "__pycache__", "*.pyc", ".pytest_cache"
23 |     ])
24 | 
25 |     # Documentation settings
26 |     required_files: List[str] = field(default_factory=lambda: ["README.md"])
27 |     recommended_readme_sections: List[str] = field(default_factory=list)
28 | 
29 |     # Performance settings
30 |     max_rows_to_scan: Optional[int] = 10000  # Max rows to scan per CSV for privacy/quality checks (None = all rows)
31 |     sample_large_files: bool = True  # If True, sample rows from large files instead of scanning all
32 | 
33 |     # Quality settings
34 |     missing_value_threshold: float = 1.0  # Warn if column has 100% missing values
35 |     value_ranges: Dict[str, Tuple[float, float]] = field(default_factory=dict)
36 |     # Example: {"heart_rate": (20, 300), "temperature": (32, 43)}
37 | 
38 |     # Privacy settings
39 |     allowed_age_max: int = 89
40 |     phi_patterns: List[str] = field(default_factory=lambda: [
41 |         r"\b\d{3}-\d{2}-\d{4}\b",  # SSN pattern
42 |         r"\b[\w\.-]+@[\w\.-]+\.\w+\b",  # Email pattern
43 |         r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",  # Phone pattern
44 |     ])
45 | 
--------------------------------------------------------------------------------
/tests/api/test_utils.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import os
3 | from physionet.api.utils 
import get_credentials_from_env, format_size 4 | 5 | 6 | def test_get_credentials_from_env_with_credentials(monkeypatch): 7 | """Test getting credentials from environment variables.""" 8 | monkeypatch.setenv("PHYSIONET_USERNAME", "testuser") 9 | monkeypatch.setenv("PHYSIONET_PASSWORD", "testpass") 10 | 11 | username, password = get_credentials_from_env() 12 | 13 | assert username == "testuser" 14 | assert password == "testpass" 15 | 16 | 17 | def test_get_credentials_from_env_without_credentials(monkeypatch): 18 | """Test getting credentials when environment variables are not set.""" 19 | monkeypatch.delenv("PHYSIONET_USERNAME", raising=False) 20 | monkeypatch.delenv("PHYSIONET_PASSWORD", raising=False) 21 | 22 | username, password = get_credentials_from_env() 23 | 24 | assert username is None 25 | assert password is None 26 | 27 | 28 | def test_get_credentials_from_env_partial(monkeypatch): 29 | """Test getting credentials when only one variable is set.""" 30 | monkeypatch.setenv("PHYSIONET_USERNAME", "testuser") 31 | monkeypatch.delenv("PHYSIONET_PASSWORD", raising=False) 32 | 33 | username, password = get_credentials_from_env() 34 | 35 | assert username == "testuser" 36 | assert password is None 37 | 38 | 39 | def test_format_size_bytes(): 40 | """Test formatting bytes.""" 41 | assert format_size(100) == "100.00 B" 42 | assert format_size(512) == "512.00 B" 43 | 44 | 45 | def test_format_size_kilobytes(): 46 | """Test formatting kilobytes.""" 47 | assert format_size(1024) == "1.00 KB" 48 | assert format_size(1536) == "1.50 KB" 49 | assert format_size(2048) == "2.00 KB" 50 | 51 | 52 | def test_format_size_megabytes(): 53 | """Test formatting megabytes.""" 54 | assert format_size(1024 * 1024) == "1.00 MB" 55 | assert format_size(1024 * 1024 * 5) == "5.00 MB" 56 | assert format_size(1024 * 1024 * 1.5) == "1.50 MB" 57 | 58 | 59 | def test_format_size_gigabytes(): 60 | """Test formatting gigabytes.""" 61 | assert format_size(1024 * 1024 * 1024) == "1.00 GB" 62 | assert format_size(1024 * 1024 * 1024 * 2.5) == "2.50 GB" 63 | 64 | 65 | def test_format_size_terabytes(): 66 | """Test formatting terabytes.""" 67 | assert format_size(1024 * 1024 * 1024 * 1024) == "1.00 TB" 68 | assert format_size(1024 * 1024 * 1024 * 1024 * 3) == "3.00 TB" 69 | 70 | 71 | def test_format_size_petabytes(): 72 | """Test formatting petabytes.""" 73 | assert format_size(1024 * 1024 * 1024 * 1024 * 1024) == "1.00 PB" 74 | assert format_size(1024 * 1024 * 1024 * 1024 * 1024 * 2) == "2.00 PB" 75 | 76 | 77 | def test_format_size_edge_cases(): 78 | """Test edge cases for size formatting.""" 79 | assert format_size(0) == "0.00 B" 80 | assert format_size(1) == "1.00 B" 81 | assert format_size(1023) == "1023.00 B" 82 | -------------------------------------------------------------------------------- /physionet/api/models.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from typing import Optional, List, Any 3 | 4 | 5 | @dataclass 6 | class ProjectVersion: 7 | """Represents a project version.""" 8 | 9 | slug: str 10 | title: str 11 | version: str 12 | abstract: str 13 | citation: str 14 | 15 | 16 | @dataclass 17 | class PublishedProject: 18 | """Represents a published project.""" 19 | 20 | slug: str 21 | version: str 22 | title: str 23 | short_description: str 24 | abstract: str 25 | core_doi: Optional[str] 26 | version_doi: Optional[str] 27 | is_latest_version: bool 28 | publish_date: str 29 | license: Optional[dict] 30 | dua: Optional[dict] 31 | 
main_storage_size: int 32 | compressed_storage_size: int 33 | 34 | @classmethod 35 | def from_dict(cls, data: dict) -> "PublishedProject": 36 | """Create instance from API response dictionary.""" 37 | return cls( 38 | slug=data["slug"], 39 | version=data["version"], 40 | title=data["title"], 41 | short_description=data.get("short_description", ""), 42 | abstract=data.get("abstract", ""), 43 | core_doi=data.get("core_doi"), 44 | version_doi=data.get("version_doi"), 45 | is_latest_version=data.get("is_latest_version", False), 46 | publish_date=data.get("publish_date", ""), 47 | license=data.get("license"), 48 | dua=data.get("dua"), 49 | main_storage_size=data.get("main_storage_size", 0), 50 | compressed_storage_size=data.get("compressed_storage_size", 0), 51 | ) 52 | 53 | 54 | @dataclass 55 | class ProjectDetail: 56 | """Detailed project information.""" 57 | 58 | slug: str 59 | title: str 60 | version: str 61 | abstract: str 62 | license: Optional[dict] 63 | short_description: str 64 | project_home_page: Optional[str] 65 | publish_datetime: str 66 | doi: str 67 | main_storage_size: int 68 | compressed_storage_size: int 69 | 70 | @classmethod 71 | def from_dict(cls, data: dict) -> "ProjectDetail": 72 | """Create instance from API response dictionary.""" 73 | return cls( 74 | slug=data["slug"], 75 | title=data["title"], 76 | version=data["version"], 77 | abstract=data.get("abstract", ""), 78 | license=data.get("license"), 79 | short_description=data.get("short_description", ""), 80 | project_home_page=data.get("project_home_page"), 81 | publish_datetime=data.get("publish_datetime", ""), 82 | doi=data.get("doi", ""), 83 | main_storage_size=data.get("main_storage_size", 0), 84 | compressed_storage_size=data.get("compressed_storage_size", 0), 85 | ) 86 | 87 | 88 | @dataclass 89 | class PaginatedResponse: 90 | """Paginated API response.""" 91 | 92 | count: int 93 | next: Optional[str] 94 | previous: Optional[str] 95 | results: List[Any] 96 | -------------------------------------------------------------------------------- /physionet/api/client.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from typing import Optional, Dict, Any 3 | from urllib.parse import urljoin 4 | 5 | from .exceptions import ( 6 | PhysioNetAPIError, 7 | BadRequestError, 8 | ForbiddenError, 9 | NotFoundError, 10 | RateLimitError, 11 | ) 12 | from .endpoints import ProjectsAPI 13 | 14 | 15 | class PhysioNetClient: 16 | """Main client for interacting with PhysioNet API v1.""" 17 | 18 | def __init__( 19 | self, 20 | base_url: str = "https://physionet.org", 21 | username: Optional[str] = None, 22 | password: Optional[str] = None, 23 | timeout: int = 30, 24 | ): 25 | """ 26 | Initialize PhysioNet API client. 
27 | 28 | Args: 29 | base_url: Base URL for PhysioNet (default: https://physionet.org) 30 | username: Optional username for authenticated requests 31 | password: Optional password for authenticated requests 32 | timeout: Request timeout in seconds 33 | """ 34 | self.base_url = base_url.rstrip("/") 35 | self.api_base = f"{self.base_url}/api/v1/" 36 | self.timeout = timeout 37 | self.session = requests.Session() 38 | 39 | if username and password: 40 | self.session.auth = (username, password) 41 | 42 | self.session.headers.update({"User-Agent": "PhysioNet-Python-Client/1.0", "Accept": "application/json"}) 43 | 44 | self.projects = ProjectsAPI(self) 45 | 46 | def _make_request( 47 | self, method: str, endpoint: str, params: Optional[Dict[str, Any]] = None, **kwargs 48 | ) -> requests.Response: 49 | """ 50 | Make HTTP request to API. 51 | 52 | Args: 53 | method: HTTP method (GET, POST, etc.) 54 | endpoint: API endpoint path 55 | params: Query parameters 56 | **kwargs: Additional arguments for requests 57 | 58 | Returns: 59 | Response object 60 | 61 | Raises: 62 | PhysioNetAPIError: On API errors 63 | requests.RequestException: On network errors 64 | """ 65 | url = urljoin(self.api_base, endpoint) 66 | 67 | response = self.session.request(method=method, url=url, params=params, timeout=self.timeout, **kwargs) 68 | 69 | if response.status_code >= 400: 70 | self._handle_error(response) 71 | 72 | return response 73 | 74 | def _handle_error(self, response: requests.Response): 75 | """Handle API error responses.""" 76 | try: 77 | error_data = response.json() 78 | error_msg = error_data.get("error", str(error_data)) 79 | except Exception: 80 | error_msg = response.text or response.reason 81 | 82 | if response.status_code == 400: 83 | raise BadRequestError(error_msg) 84 | elif response.status_code == 403: 85 | raise ForbiddenError(error_msg) 86 | elif response.status_code == 404: 87 | raise NotFoundError(error_msg) 88 | elif response.status_code == 429: 89 | raise RateLimitError(error_msg) 90 | else: 91 | raise PhysioNetAPIError(f"HTTP {response.status_code}: {error_msg}") 92 | 93 | def close(self): 94 | """Close the session.""" 95 | self.session.close() 96 | 97 | def __enter__(self): 98 | return self 99 | 100 | def __exit__(self, exc_type, exc_val, exc_tb): 101 | self.close() 102 | -------------------------------------------------------------------------------- /physionet/api/endpoints.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Iterator 2 | from physionet.api.models import PublishedProject, ProjectVersion, ProjectDetail 3 | 4 | 5 | class ProjectsAPI: 6 | """API methods for interacting with projects.""" 7 | 8 | def __init__(self, client): 9 | self.client = client 10 | 11 | def list_published(self) -> List[PublishedProject]: 12 | """ 13 | List all published projects. 14 | 15 | Returns: 16 | List of PublishedProject objects 17 | 18 | Note: 19 | The API returns all projects in a single response (no pagination). 20 | """ 21 | response = self.client._make_request("GET", "projects/published/") 22 | data = response.json() 23 | 24 | return [PublishedProject.from_dict(p) for p in data] 25 | 26 | def iter_published(self) -> Iterator[PublishedProject]: 27 | """ 28 | Iterator that yields all published projects. 29 | 30 | Yields: 31 | PublishedProject objects 32 | 33 | Note: 34 | This is a convenience method that iterates over list_published() results. 
35 | """ 36 | for project in self.list_published(): 37 | yield project 38 | 39 | def search(self, search_term: str, resource_type: Optional[List[str]] = None) -> List[PublishedProject]: 40 | """ 41 | Search published projects. 42 | 43 | Args: 44 | search_term: Search keywords 45 | resource_type: Filter by resource type(s), or ['all'] for all types 46 | 47 | Returns: 48 | List of matching PublishedProject objects 49 | """ 50 | params = {"search_term": search_term} 51 | 52 | if resource_type: 53 | params["resource_type"] = resource_type 54 | 55 | response = self.client._make_request("GET", "projects/search/", params=params) 56 | data = response.json() 57 | 58 | return [PublishedProject.from_dict(p) for p in data] 59 | 60 | def list_versions(self, project_slug: str) -> List[ProjectVersion]: 61 | """ 62 | List all versions of a project. 63 | 64 | Args: 65 | project_slug: Project identifier 66 | 67 | Returns: 68 | List of ProjectVersion objects 69 | """ 70 | endpoint = f"projects/{project_slug}/versions/" 71 | response = self.client._make_request("GET", endpoint) 72 | data = response.json() 73 | 74 | return [ 75 | ProjectVersion( 76 | slug=v["slug"], 77 | title=v["title"], 78 | version=v["version"], 79 | abstract=v["abstract"], 80 | citation=v["citation"], 81 | ) 82 | for v in data 83 | ] 84 | 85 | def get_details(self, project_slug: str, version: str) -> ProjectDetail: 86 | """ 87 | Get detailed information about a specific project version. 88 | 89 | Args: 90 | project_slug: Project identifier 91 | version: Version number 92 | 93 | Returns: 94 | ProjectDetail object 95 | """ 96 | endpoint = f"projects/{project_slug}/versions/{version}/" 97 | response = self.client._make_request("GET", endpoint) 98 | data = response.json() 99 | 100 | return ProjectDetail.from_dict(data) 101 | 102 | def download_checksums(self, project_slug: str, version: str, output_path: str): 103 | """ 104 | Download SHA256 checksums file for a project. 105 | 106 | Args: 107 | project_slug: Project identifier 108 | version: Version number 109 | output_path: Path to save the checksums file 110 | 111 | Note: 112 | Requires authentication and project access permissions. 113 | """ 114 | endpoint = f"projects/published/{project_slug}/{version}/sha256sums/" 115 | response = self.client._make_request("GET", endpoint) 116 | 117 | with open(output_path, "wb") as f: 118 | f.write(response.content) 119 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PhysioNet 2 | 3 | A collection of tools for working with the [PhysioNet](http://physionet.org/) repository. 
4 | 5 | ## Installation 6 | 7 | ```bash 8 | pip install physionet 9 | ``` 10 | 11 | ## Usage 12 | 13 | ### PhysioNet "Preflight" 14 | 15 | Validate your dataset before submission to PhysioNet: 16 | 17 | ```bash 18 | # Validate a dataset 19 | physionet validate /path/to/dataset 20 | 21 | # Run specific checks only 22 | physionet validate /path/to/dataset --checks filesystem,privacy 23 | 24 | # Disable sampling for complete validation (slower) 25 | physionet validate /path/to/dataset --no-sampling 26 | ``` 27 | 28 | The validator checks for: 29 | 30 | - File naming issues (spaces, special characters, long names) 31 | - Proprietary formats (suggests open alternatives) 32 | - Missing documentation (README.md) 33 | - CSV integrity (structure, encoding, duplicate columns) 34 | - Data quality (missing values, out-of-range data) 35 | - Privacy concerns (PHI patterns, sensitive files) 36 | 37 | A validation report (PHYSIONET_REPORT.md) is automatically saved in your dataset folder. 38 | 39 | ### API Client 40 | 41 | Interact with the PhysioNet API to explore and search published projects: 42 | 43 | ```python 44 | from physionet import PhysioNetClient 45 | 46 | # Create a client instance 47 | client = PhysioNetClient() 48 | 49 | # List all published projects 50 | projects = client.projects.list_published() 51 | print(f"Total projects: {len(projects)}") 52 | 53 | # Display first few projects 54 | for project in projects[:5]: 55 | print(f"{project.slug} v{project.version}: {project.title}") 56 | 57 | # Search for projects 58 | ecg_projects = client.projects.search('ECG') 59 | print(f"Found {len(ecg_projects)} ECG-related projects") 60 | 61 | # Get all versions of a project 62 | versions = client.projects.list_versions('mimic-iv-demo') 63 | for version in versions: 64 | print(f"Version {version.version}: {version.title}") 65 | 66 | # Get detailed information about a specific version 67 | details = client.projects.get_details('mimic-iv-demo', '2.2') 68 | print(f"Title: {details.title}") 69 | print(f"DOI: {details.doi}") 70 | print(f"Published: {details.publish_datetime}") 71 | print(f"Size: {details.main_storage_size} bytes") 72 | ``` 73 | 74 | ### Authenticated Requests 75 | 76 | For endpoints that require authentication (e.g., downloading checksums): 77 | 78 | ```python 79 | from physionet import PhysioNetClient 80 | 81 | # Create client with authentication 82 | client = PhysioNetClient( 83 | username='your_username', 84 | password='your_password' 85 | ) 86 | 87 | # Download checksums file 88 | client.projects.download_checksums( 89 | 'mimic-iv-demo', 90 | '2.2', 91 | 'checksums.txt' 92 | ) 93 | 94 | # Or use environment variables 95 | # Set PHYSIONET_USERNAME and PHYSIONET_PASSWORD 96 | from physionet.api.utils import get_credentials_from_env 97 | 98 | username, password = get_credentials_from_env() 99 | client = PhysioNetClient(username=username, password=password) 100 | ``` 101 | 102 | ### Using Context Manager 103 | 104 | ```python 105 | from physionet import PhysioNetClient 106 | 107 | # Automatically close session when done 108 | with PhysioNetClient() as client: 109 | projects = client.projects.list_published() 110 | print(f"Found {len(projects)} projects") 111 | ``` 112 | 113 | ### Utility Functions 114 | 115 | ```python 116 | from physionet.api.utils import format_size 117 | 118 | # Format bytes to human-readable size 119 | size = format_size(16224447) 120 | print(size) # "15.47 MB" 121 | ``` 122 | 123 | ## Error Handling 124 | 125 | ```python 126 | from physionet import PhysioNetClient 127 | 
from physionet.api.exceptions import NotFoundError, RateLimitError, ForbiddenError 128 | 129 | client = PhysioNetClient() 130 | 131 | try: 132 | details = client.projects.get_details('nonexistent-project', '1.0') 133 | except NotFoundError: 134 | print("Project not found") 135 | except RateLimitError: 136 | print("Rate limit exceeded, please wait before retrying") 137 | except ForbiddenError: 138 | print("Access denied - check credentials or project permissions") 139 | ``` 140 | 141 | ## Contributing 142 | 143 | Contributions are welcome! 144 | 145 | ## License 146 | 147 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 148 | -------------------------------------------------------------------------------- /tests/api/test_models.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from physionet.api.models import ( 3 | ProjectVersion, 4 | PublishedProject, 5 | ProjectDetail, 6 | PaginatedResponse, 7 | ) 8 | 9 | 10 | def test_project_version_creation(): 11 | """Test ProjectVersion dataclass creation.""" 12 | pv = ProjectVersion( 13 | slug="test-project", 14 | title="Test Project", 15 | version="1.0", 16 | abstract="Test abstract", 17 | citation="Test citation", 18 | ) 19 | 20 | assert pv.slug == "test-project" 21 | assert pv.title == "Test Project" 22 | assert pv.version == "1.0" 23 | assert pv.abstract == "Test abstract" 24 | assert pv.citation == "Test citation" 25 | 26 | 27 | def test_published_project_from_dict(): 28 | """Test PublishedProject creation from API response dict.""" 29 | data = { 30 | "slug": "mimic-iv-demo", 31 | "version": "2.2", 32 | "title": "MIMIC-IV Demo", 33 | "short_description": "Demo dataset", 34 | "abstract": "Abstract text", 35 | "core_doi": "10.1234/test", 36 | "version_doi": "10.1234/test.v2.2", 37 | "is_latest_version": True, 38 | "publish_date": "2023-01-01", 39 | "license": {"name": "MIT"}, 40 | "dua": None, 41 | "main_storage_size": 1000000, 42 | "compressed_storage_size": 500000, 43 | } 44 | 45 | project = PublishedProject.from_dict(data) 46 | 47 | assert project.slug == "mimic-iv-demo" 48 | assert project.version == "2.2" 49 | assert project.title == "MIMIC-IV Demo" 50 | assert project.short_description == "Demo dataset" 51 | assert project.core_doi == "10.1234/test" 52 | assert project.is_latest_version is True 53 | assert project.main_storage_size == 1000000 54 | 55 | 56 | def test_published_project_from_dict_with_missing_fields(): 57 | """Test PublishedProject handles missing optional fields.""" 58 | data = { 59 | "slug": "test-project", 60 | "version": "1.0", 61 | "title": "Test", 62 | } 63 | 64 | project = PublishedProject.from_dict(data) 65 | 66 | assert project.slug == "test-project" 67 | assert project.version == "1.0" 68 | assert project.title == "Test" 69 | assert project.short_description == "" 70 | assert project.abstract == "" 71 | assert project.core_doi is None 72 | assert project.is_latest_version is False 73 | assert project.main_storage_size == 0 74 | 75 | 76 | def test_project_detail_from_dict(): 77 | """Test ProjectDetail creation from API response dict.""" 78 | data = { 79 | "slug": "test-project", 80 | "title": "Test Project", 81 | "version": "1.0", 82 | "abstract": "Test abstract", 83 | "license": {"name": "MIT"}, 84 | "short_description": "Short desc", 85 | "project_home_page": "https://example.com", 86 | "publish_datetime": "2023-01-01T00:00:00", 87 | "doi": "10.1234/test", 88 | "main_storage_size": 1000000, 89 | 
"compressed_storage_size": 500000, 90 | } 91 | 92 | detail = ProjectDetail.from_dict(data) 93 | 94 | assert detail.slug == "test-project" 95 | assert detail.title == "Test Project" 96 | assert detail.version == "1.0" 97 | assert detail.doi == "10.1234/test" 98 | assert detail.project_home_page == "https://example.com" 99 | 100 | 101 | def test_project_detail_from_dict_with_missing_fields(): 102 | """Test ProjectDetail handles missing optional fields.""" 103 | data = { 104 | "slug": "test-project", 105 | "title": "Test", 106 | "version": "1.0", 107 | } 108 | 109 | detail = ProjectDetail.from_dict(data) 110 | 111 | assert detail.slug == "test-project" 112 | assert detail.abstract == "" 113 | assert detail.license is None 114 | assert detail.project_home_page is None 115 | assert detail.doi == "" 116 | assert detail.main_storage_size == 0 117 | 118 | 119 | def test_paginated_response_creation(): 120 | """Test PaginatedResponse creation.""" 121 | response = PaginatedResponse( 122 | count=100, 123 | next="https://api.example.com/page2", 124 | previous=None, 125 | results=["item1", "item2", "item3"], 126 | ) 127 | 128 | assert response.count == 100 129 | assert response.next == "https://api.example.com/page2" 130 | assert response.previous is None 131 | assert len(response.results) == 3 132 | -------------------------------------------------------------------------------- /tests/validate/test_cli.py: -------------------------------------------------------------------------------- 1 | """Tests for CLI interface.""" 2 | 3 | import pytest 4 | import json 5 | import subprocess 6 | import sys 7 | from pathlib import Path 8 | 9 | 10 | class TestValidateCLI: 11 | """Tests for the validate CLI command.""" 12 | 13 | def test_cli_validates_directory(self, tmp_path): 14 | """Test that CLI can validate a directory.""" 15 | # Create a minimal dataset 16 | readme = tmp_path / "README.md" 17 | readme.write_text("""# Test Dataset 18 | 19 | ## Background 20 | Test background. 21 | 22 | ## Methods 23 | Test methods. 24 | 25 | ## Data Description 26 | Test data. 27 | 28 | ## Usage Notes 29 | Test usage. 30 | 31 | ## References 32 | Test references. 
33 | """) 34 | 35 | # Run CLI 36 | result = subprocess.run( 37 | [sys.executable, "-m", "physionet", "validate", str(tmp_path)], 38 | capture_output=True, 39 | text=True, 40 | ) 41 | 42 | assert result.returncode == 0 43 | assert "PhysioNet Dataset Validation Report" in result.stdout 44 | 45 | def test_cli_handles_nonexistent_path(self): 46 | """Test that CLI handles nonexistent paths gracefully.""" 47 | result = subprocess.run( 48 | [sys.executable, "-m", "physionet", "validate", "/nonexistent/path"], 49 | capture_output=True, 50 | text=True, 51 | ) 52 | 53 | assert result.returncode == 1 54 | assert "does not exist" in result.stderr 55 | 56 | def test_cli_generates_json_report(self, tmp_path): 57 | """Test that CLI can generate JSON report.""" 58 | # Create dataset 59 | readme = tmp_path / "README.md" 60 | readme.write_text("# Test") 61 | 62 | # Run CLI with --report 63 | report_file = tmp_path / "report.json" 64 | result = subprocess.run( 65 | [sys.executable, "-m", "physionet", "validate", str(tmp_path), "--report", str(report_file)], 66 | capture_output=True, 67 | text=True, 68 | ) 69 | 70 | # Check that report was created 71 | assert report_file.exists() 72 | 73 | # Validate JSON structure 74 | with open(report_file) as f: 75 | report = json.load(f) 76 | 77 | assert "dataset_path" in report 78 | assert "timestamp" in report 79 | assert "summary" in report 80 | assert "checks" in report 81 | 82 | def test_cli_filters_by_check_category(self, tmp_path): 83 | """Test that CLI can filter checks by category.""" 84 | readme = tmp_path / "README.md" 85 | readme.write_text("# Test") 86 | 87 | result = subprocess.run( 88 | [sys.executable, "-m", "physionet", "validate", str(tmp_path), "--checks", "filesystem"], 89 | capture_output=True, 90 | text=True, 91 | ) 92 | 93 | assert result.returncode == 0 94 | # Should only show filesystem checks 95 | assert "Filesystem" in result.stdout or "filesystem" in result.stdout.lower() 96 | 97 | def test_cli_exits_with_error_on_validation_failure(self, tmp_path): 98 | """Test that CLI exits with error code when validation fails.""" 99 | # Create dataset with PHI 100 | csv_file = tmp_path / "data.csv" 101 | csv_file.write_text("patient_id,email\n1,test@example.com\n") 102 | 103 | result = subprocess.run( 104 | [sys.executable, "-m", "physionet", "validate", str(tmp_path)], 105 | capture_output=True, 106 | text=True, 107 | ) 108 | 109 | # Should exit with error code due to validation errors 110 | assert result.returncode == 1 111 | 112 | def test_cli_shows_help(self): 113 | """Test that CLI shows help message.""" 114 | result = subprocess.run( 115 | [sys.executable, "-m", "physionet", "--help"], 116 | capture_output=True, 117 | text=True, 118 | ) 119 | 120 | assert result.returncode == 0 121 | assert "validate" in result.stdout 122 | 123 | def test_validate_subcommand_help(self): 124 | """Test that validate subcommand shows help.""" 125 | result = subprocess.run( 126 | [sys.executable, "-m", "physionet", "validate", "--help"], 127 | capture_output=True, 128 | text=True, 129 | ) 130 | 131 | assert result.returncode == 0 132 | assert "path" in result.stdout 133 | assert "--report" in result.stdout 134 | assert "--checks" in result.stdout 135 | -------------------------------------------------------------------------------- /physionet/cli.py: -------------------------------------------------------------------------------- 1 | """Command-line interface for physionet package.""" 2 | 3 | import argparse 4 | import json 5 | import sys 6 | from pathlib import Path 7 | 
8 | from physionet.validate import validate_dataset, ValidationConfig 9 | 10 | 11 | def main(): 12 | """Main entry point for the CLI.""" 13 | parser = argparse.ArgumentParser( 14 | prog="physionet", 15 | description="Tools for working with PhysioNet datasets", 16 | ) 17 | 18 | subparsers = parser.add_subparsers(dest="command", help="Available commands") 19 | 20 | # Validate subcommand 21 | validate_parser = subparsers.add_parser( 22 | "validate", 23 | help="Validate a dataset before submission to PhysioNet", 24 | ) 25 | validate_parser.add_argument( 26 | "path", 27 | help="Path to the dataset directory to validate", 28 | ) 29 | validate_parser.add_argument( 30 | "--report", 31 | metavar="FILE", 32 | help="Generate detailed JSON report and save to FILE", 33 | ) 34 | validate_parser.add_argument( 35 | "--checks", 36 | metavar="CATEGORIES", 37 | help="Comma-separated list of check categories to run (filesystem,documentation,integrity,quality,privacy)", 38 | ) 39 | validate_parser.add_argument( 40 | "--level", 41 | choices=["error", "warning", "info"], 42 | default="info", 43 | help="Minimum severity level to display (default: info)", 44 | ) 45 | validate_parser.add_argument( 46 | "--no-sampling", 47 | action="store_true", 48 | help="Disable sampling for large files (scan all rows, slower but more thorough)", 49 | ) 50 | validate_parser.add_argument( 51 | "--max-rows", 52 | type=int, 53 | metavar="N", 54 | help="Maximum rows to scan per CSV file (default: 10000)", 55 | ) 56 | 57 | args = parser.parse_args() 58 | 59 | if args.command == "validate": 60 | return _handle_validate(args) 61 | elif args.command is None: 62 | parser.print_help() 63 | return 0 64 | else: 65 | print(f"Unknown command: {args.command}", file=sys.stderr) 66 | return 1 67 | 68 | 69 | def _handle_validate(args): 70 | """Handle the validate subcommand.""" 71 | # Validate path 72 | dataset_path = Path(args.path) 73 | if not dataset_path.exists(): 74 | print(f"Error: Path does not exist: {args.path}", file=sys.stderr) 75 | return 1 76 | 77 | if not dataset_path.is_dir(): 78 | print(f"Error: Path is not a directory: {args.path}", file=sys.stderr) 79 | return 1 80 | 81 | # Configure validation 82 | config = ValidationConfig() 83 | 84 | # Parse check categories if specified 85 | if args.checks: 86 | categories = [c.strip().lower() for c in args.checks.split(",")] 87 | config.check_filesystem = "filesystem" in categories 88 | config.check_documentation = "documentation" in categories 89 | config.check_integrity = "integrity" in categories 90 | config.check_quality = "quality" in categories 91 | config.check_phi = "privacy" in categories 92 | 93 | # Configure sampling options 94 | if args.no_sampling: 95 | config.sample_large_files = False 96 | if args.max_rows: 97 | config.max_rows_to_scan = args.max_rows 98 | 99 | # Run validation 100 | try: 101 | print(f"Validating dataset: {dataset_path}") 102 | result = validate_dataset(str(dataset_path), config, show_progress=True) 103 | print() 104 | 105 | print(result.summary()) 106 | 107 | # Save validation report - either to specified path or default location 108 | if args.report: 109 | report_path = Path(args.report) 110 | # Determine format based on file extension 111 | if report_path.suffix.lower() == '.json': 112 | # Save as JSON 113 | with open(report_path, "w", encoding="utf-8") as f: 114 | json.dump(result.to_dict(), f, indent=2) 115 | else: 116 | # Save as Markdown 117 | with open(report_path, "w", encoding="utf-8") as f: 118 | f.write(result.summary()) 119 | else: 120 | # Default: 
save as Markdown in the root of the dataset folder 121 | report_path = dataset_path / "PHYSIONET_REPORT.md" 122 | with open(report_path, "w", encoding="utf-8") as f: 123 | f.write(result.summary()) 124 | 125 | print() 126 | print(f"Validation report saved to: {report_path}") 127 | 128 | if result.status == "error": 129 | return 1 130 | elif result.status == "warning" and args.level == "error": 131 | return 0 # Warnings don't fail if level is error 132 | return 0 133 | 134 | except Exception as e: 135 | print(f"Error during validation: {str(e)}", file=sys.stderr) 136 | import traceback 137 | traceback.print_exc() 138 | return 1 139 | 140 | 141 | if __name__ == "__main__": 142 | sys.exit(main()) 143 | -------------------------------------------------------------------------------- /physionet/validate/validator.py: -------------------------------------------------------------------------------- 1 | """Main validation logic.""" 2 | 3 | import os 4 | from datetime import datetime, timezone 5 | from pathlib import Path 6 | from typing import Optional 7 | 8 | from tqdm import tqdm 9 | 10 | from physionet.validate.config import ValidationConfig 11 | from physionet.validate.models import ( 12 | ValidationResult, 13 | CheckResult, 14 | ValidationIssue, 15 | CheckCategory, 16 | Severity, 17 | DatasetStats, 18 | ) 19 | from physionet.validate.checks import ( 20 | check_filesystem, 21 | check_documentation, 22 | check_integrity, 23 | check_quality, 24 | check_privacy, 25 | ) 26 | 27 | 28 | def validate_dataset( 29 | dataset_path: str, 30 | config: Optional[ValidationConfig] = None, 31 | show_progress: bool = True 32 | ) -> ValidationResult: 33 | """ 34 | Validate a PhysioNet dataset before submission. 35 | 36 | Args: 37 | dataset_path: Path to the dataset directory 38 | config: Optional validation configuration. If None, uses defaults. 39 | show_progress: Whether to show progress bar. Default True. 
40 | 41 | Returns: 42 | ValidationResult containing all validation issues and statistics 43 | 44 | Raises: 45 | ValueError: If dataset_path doesn't exist or isn't a directory 46 | """ 47 | path = Path(dataset_path) 48 | if not path.exists(): 49 | raise ValueError(f"Dataset path does not exist: {dataset_path}") 50 | if not path.is_dir(): 51 | raise ValueError(f"Dataset path is not a directory: {dataset_path}") 52 | 53 | if config is None: 54 | config = ValidationConfig() 55 | 56 | # Initialize result 57 | result = ValidationResult( 58 | dataset_path=path.name, # Use just the dataset folder name, not full path 59 | timestamp=datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), 60 | ) 61 | 62 | # Calculate dataset statistics 63 | result.dataset_stats = _calculate_stats(path, config) 64 | 65 | # Determine which checks to run 66 | checks_to_run = [] 67 | if config.check_filesystem: 68 | checks_to_run.append(("Filesystem", CheckCategory.FILESYSTEM, check_filesystem)) 69 | if config.check_documentation: 70 | checks_to_run.append(("Documentation", CheckCategory.DOCUMENTATION, check_documentation)) 71 | if config.check_integrity: 72 | checks_to_run.append(("Integrity", CheckCategory.INTEGRITY, check_integrity)) 73 | if config.check_quality: 74 | checks_to_run.append(("Quality", CheckCategory.QUALITY, check_quality)) 75 | if config.check_phi: 76 | checks_to_run.append(("Privacy", CheckCategory.PRIVACY, check_privacy)) 77 | 78 | # Run validation checks with progress bar 79 | if show_progress: 80 | progress_bar = tqdm( 81 | total=100, 82 | desc="Running validation checks", 83 | unit="%", 84 | leave=False, 85 | ncols=100, 86 | bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}%" 87 | ) 88 | 89 | steps_per_check = 100 // len(checks_to_run) if checks_to_run else 100 90 | 91 | for i, (name, category, check_func) in enumerate(checks_to_run): 92 | # Create a callback to update progress during this check 93 | def update_progress(msg: str): 94 | progress_bar.set_description(f"{name}: {msg}"[:80]) 95 | 96 | progress_bar.set_description(f"{name}"[:80]) 97 | 98 | # Call check function with progress callback if it supports it 99 | try: 100 | result.check_results[category] = check_func(path, config, progress_callback=update_progress) 101 | except TypeError: 102 | # Function doesn't support progress_callback parameter 103 | result.check_results[category] = check_func(path, config) 104 | 105 | # Update progress 106 | progress_bar.update(steps_per_check) 107 | 108 | progress_bar.close() 109 | else: 110 | for name, category, check_func in checks_to_run: 111 | # Try with progress_callback first, fall back to without 112 | try: 113 | result.check_results[category] = check_func(path, config, progress_callback=None) 114 | except TypeError: 115 | result.check_results[category] = check_func(path, config) 116 | 117 | return result 118 | 119 | 120 | def _calculate_stats(path: Path, config: ValidationConfig) -> DatasetStats: 121 | """Calculate statistics about the dataset.""" 122 | stats = DatasetStats() 123 | 124 | for root, dirs, files in os.walk(path): 125 | # Filter out ignored directories 126 | dirs[:] = [d for d in dirs if not _should_ignore(d, config.ignore_patterns)] 127 | 128 | stats.directory_count += len(dirs) 129 | 130 | for file in files: 131 | if _should_ignore(file, config.ignore_patterns): 132 | continue 133 | 134 | file_path = Path(root) / file 135 | try: 136 | stats.file_count += 1 137 | stats.total_size_bytes += file_path.stat().st_size 138 | except (OSError, PermissionError): 139 | # Skip files we 
can't access 140 | pass 141 | 142 | return stats 143 | 144 | 145 | def _should_ignore(name: str, patterns: list) -> bool: 146 | """Check if a file or directory should be ignored.""" 147 | for pattern in patterns: 148 | if pattern.startswith("*"): 149 | if name.endswith(pattern[1:]): 150 | return True 151 | elif pattern in name: 152 | return True 153 | return False 154 | -------------------------------------------------------------------------------- /tests/api/test_client.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import requests_mock 3 | from physionet.api.client import PhysioNetClient 4 | from physionet.api.exceptions import ( 5 | BadRequestError, 6 | ForbiddenError, 7 | NotFoundError, 8 | RateLimitError, 9 | PhysioNetAPIError, 10 | ) 11 | 12 | 13 | def test_client_initialization(): 14 | """Test client initializes with correct defaults.""" 15 | client = PhysioNetClient() 16 | 17 | assert client.base_url == "https://physionet.org" 18 | assert client.api_base == "https://physionet.org/api/v1/" 19 | assert client.timeout == 30 20 | assert "User-Agent" in client.session.headers 21 | assert client.session.headers["Accept"] == "application/json" 22 | 23 | 24 | def test_client_initialization_with_custom_base_url(): 25 | """Test client with custom base URL.""" 26 | client = PhysioNetClient(base_url="https://test.example.com") 27 | 28 | assert client.base_url == "https://test.example.com" 29 | assert client.api_base == "https://test.example.com/api/v1/" 30 | 31 | 32 | def test_client_initialization_with_trailing_slash(): 33 | """Test that trailing slash is removed from base URL.""" 34 | client = PhysioNetClient(base_url="https://physionet.org/") 35 | 36 | assert client.base_url == "https://physionet.org" 37 | 38 | 39 | def test_client_initialization_with_auth(): 40 | """Test client initializes with authentication.""" 41 | client = PhysioNetClient(username="testuser", password="testpass") 42 | 43 | assert client.session.auth == ("testuser", "testpass") 44 | 45 | 46 | def test_client_initialization_without_auth(): 47 | """Test client initializes without authentication.""" 48 | client = PhysioNetClient() 49 | 50 | assert client.session.auth is None 51 | 52 | 53 | def test_client_has_projects_api(): 54 | """Test client has projects API endpoint.""" 55 | client = PhysioNetClient() 56 | 57 | assert hasattr(client, "projects") 58 | assert client.projects.client is client 59 | 60 | 61 | def test_make_request_success(): 62 | """Test successful API request.""" 63 | client = PhysioNetClient() 64 | 65 | with requests_mock.Mocker() as m: 66 | m.get("https://physionet.org/api/v1/test/", json={"status": "ok"}) 67 | 68 | response = client._make_request("GET", "test/") 69 | 70 | assert response.json() == {"status": "ok"} 71 | 72 | 73 | def test_make_request_with_params(): 74 | """Test API request with query parameters.""" 75 | client = PhysioNetClient() 76 | 77 | with requests_mock.Mocker() as m: 78 | m.get("https://physionet.org/api/v1/test/", json={"status": "ok"}) 79 | 80 | response = client._make_request("GET", "test/", params={"page": 1, "size": 10}) 81 | 82 | assert "page=1" in m.last_request.url 83 | assert "size=10" in m.last_request.url 84 | 85 | 86 | def test_error_handling_400(): 87 | """Test 400 Bad Request error handling.""" 88 | client = PhysioNetClient() 89 | 90 | with requests_mock.Mocker() as m: 91 | m.get("https://physionet.org/api/v1/test/", status_code=400, json={"error": "Bad request"}) 92 | 93 | with 
pytest.raises(BadRequestError) as exc_info: 94 | client._make_request("GET", "test/") 95 | 96 | assert "Bad request" in str(exc_info.value) 97 | 98 | 99 | def test_error_handling_403(): 100 | """Test 403 Forbidden error handling.""" 101 | client = PhysioNetClient() 102 | 103 | with requests_mock.Mocker() as m: 104 | m.get("https://physionet.org/api/v1/test/", status_code=403, json={"error": "Forbidden"}) 105 | 106 | with pytest.raises(ForbiddenError) as exc_info: 107 | client._make_request("GET", "test/") 108 | 109 | assert "Forbidden" in str(exc_info.value) 110 | 111 | 112 | def test_error_handling_404(): 113 | """Test 404 Not Found error handling.""" 114 | client = PhysioNetClient() 115 | 116 | with requests_mock.Mocker() as m: 117 | m.get("https://physionet.org/api/v1/test/", status_code=404, json={"error": "Not found"}) 118 | 119 | with pytest.raises(NotFoundError) as exc_info: 120 | client._make_request("GET", "test/") 121 | 122 | assert "Not found" in str(exc_info.value) 123 | 124 | 125 | def test_error_handling_429(): 126 | """Test 429 Rate Limit error handling.""" 127 | client = PhysioNetClient() 128 | 129 | with requests_mock.Mocker() as m: 130 | m.get("https://physionet.org/api/v1/test/", status_code=429, json={"error": "Rate limit exceeded"}) 131 | 132 | with pytest.raises(RateLimitError) as exc_info: 133 | client._make_request("GET", "test/") 134 | 135 | assert "Rate limit exceeded" in str(exc_info.value) 136 | 137 | 138 | def test_error_handling_500(): 139 | """Test 500 Server Error handling.""" 140 | client = PhysioNetClient() 141 | 142 | with requests_mock.Mocker() as m: 143 | m.get("https://physionet.org/api/v1/test/", status_code=500, json={"error": "Server error"}) 144 | 145 | with pytest.raises(PhysioNetAPIError) as exc_info: 146 | client._make_request("GET", "test/") 147 | 148 | assert "HTTP 500" in str(exc_info.value) 149 | 150 | 151 | def test_error_handling_non_json_response(): 152 | """Test error handling with non-JSON error response.""" 153 | client = PhysioNetClient() 154 | 155 | with requests_mock.Mocker() as m: 156 | m.get("https://physionet.org/api/v1/test/", status_code=500, text="Internal Server Error") 157 | 158 | with pytest.raises(PhysioNetAPIError) as exc_info: 159 | client._make_request("GET", "test/") 160 | 161 | assert "Internal Server Error" in str(exc_info.value) 162 | 163 | 164 | def test_context_manager(): 165 | """Test client works as context manager.""" 166 | with requests_mock.Mocker() as m: 167 | m.get("https://physionet.org/api/v1/test/", json={"status": "ok"}) 168 | 169 | with PhysioNetClient() as client: 170 | assert client.session is not None 171 | response = client._make_request("GET", "test/") 172 | assert response.json() == {"status": "ok"} 173 | 174 | 175 | def test_close_method(): 176 | """Test close method closes session.""" 177 | client = PhysioNetClient() 178 | assert client.session is not None 179 | client.close() 180 | -------------------------------------------------------------------------------- /physionet/validate/checks/quality.py: -------------------------------------------------------------------------------- 1 | """Data quality validation checks.""" 2 | 3 | import csv 4 | from pathlib import Path 5 | from typing import Optional, Callable 6 | 7 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 8 | from physionet.validate.config import ValidationConfig 9 | 10 | 11 | def check_quality(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> 
CheckResult: 12 | """ 13 | Check data quality. 14 | 15 | Validates: 16 | - Missing value thresholds 17 | - Value range plausibility 18 | - Data type consistency 19 | 20 | Args: 21 | path: Path to dataset directory 22 | config: Validation configuration 23 | progress_callback: Optional callback to report progress 24 | 25 | Returns: 26 | CheckResult with any quality issues found 27 | """ 28 | result = CheckResult(category=CheckCategory.QUALITY) 29 | 30 | # Find and validate CSV files 31 | csv_files = list(path.rglob("*.csv")) 32 | for i, csv_file in enumerate(csv_files): 33 | if progress_callback: 34 | progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") 35 | 36 | if any(p in str(csv_file) for p in config.ignore_patterns): 37 | continue 38 | 39 | _check_csv_quality(csv_file, path, config, result) 40 | 41 | return result 42 | 43 | 44 | def _check_csv_quality(csv_file: Path, base_path: Path, config: ValidationConfig, result: CheckResult) -> None: 45 | """Check quality metrics for a CSV file.""" 46 | try: 47 | with open(csv_file, "r", encoding="utf-8") as f: 48 | reader = csv.DictReader(f) 49 | 50 | # Track column statistics 51 | column_stats = {col: {"total": 0, "missing": 0, "values": []} for col in reader.fieldnames or []} 52 | 53 | # Determine if we should sample this file 54 | rows_scanned = 0 55 | max_rows = config.max_rows_to_scan 56 | 57 | # Sample if enabled and file is large 58 | if config.sample_large_files and max_rows: 59 | all_rows = list(reader) 60 | total_rows = len(all_rows) 61 | 62 | if total_rows > max_rows: 63 | # Sample evenly distributed rows 64 | # (deterministic: evenly spaced indices, so results 65 | # are reproducible across runs without an RNG) 66 | step = total_rows / max_rows 67 | sampled_indices = [int(i * step) for i in range(max_rows)] 68 | rows_to_scan = [all_rows[i] for i in sampled_indices] 69 | else: 70 | rows_to_scan = all_rows 71 | else: 72 | rows_to_scan = reader 73 | 74 | for row in rows_to_scan: 75 | # Stop if we've hit the limit (when not sampling) 76 | if max_rows and not config.sample_large_files and rows_scanned >= max_rows: 77 | break 78 | rows_scanned += 1 79 | 80 | for col, value in row.items(): 81 | column_stats[col]["total"] += 1 82 | 83 | # Check for missing values 84 | if not value or value.strip() in ("", "NA", "N/A", "NULL", "null", "None", "NaN"): 85 | column_stats[col]["missing"] += 1 86 | else: 87 | # Store value for range checking if configured 88 | if col.lower().replace("_", " ") in [k.lower().replace("_", " ") for k in config.value_ranges]: 89 | try: 90 | numeric_value = float(value.strip()) 91 | column_stats[col]["values"].append(numeric_value) 92 | except ValueError: 93 | pass 94 | 95 | # Analyze results 96 | for col, stats in column_stats.items(): 97 | if stats["total"] == 0: 98 | continue 99 | 100 | # Check missing value threshold 101 | missing_ratio = stats["missing"] / stats["total"] 102 | if missing_ratio >= config.missing_value_threshold: 103 | result.issues.append( 104 | ValidationIssue( 105 | severity=Severity.WARNING, 106 | category=CheckCategory.QUALITY, 107 | file=str(csv_file.relative_to(base_path)), 108 | column=col, 109 | message=f"Column '{col}' has {missing_ratio:.0%} missing values", 110 | suggestion=f"Consider removing column '{col}' or documenting why its values are missing", 111 | ) 112 | ) 113 | 114 | # Check value ranges 115 | for range_key, (min_val, max_val) in config.value_ranges.items(): 116 | if col.lower().replace("_", " ") == range_key.lower().replace("_", " "): 117 | for value in stats["values"]: 118 | if value < min_val or value >
max_val: 119 | result.issues.append( 120 | ValidationIssue( 121 | severity=Severity.WARNING, 122 | category=CheckCategory.QUALITY, 123 | file=str(csv_file.relative_to(base_path)), 124 | column=col, 125 | value=str(value), 126 | message=f"Value {value} in '{col}' outside expected range [{min_val}, {max_val}]", 127 | suggestion="Verify data accuracy or adjust validation ranges", 128 | ) 129 | ) 130 | # Limit warnings per column 131 | break 132 | 133 | except Exception as e: 134 | result.issues.append( 135 | ValidationIssue( 136 | severity=Severity.WARNING, 137 | category=CheckCategory.QUALITY, 138 | file=str(csv_file.relative_to(base_path)), 139 | message=f"Could not perform quality checks: {str(e)}", 140 | ) 141 | ) 142 | -------------------------------------------------------------------------------- /physionet/validate/checks/integrity.py: -------------------------------------------------------------------------------- 1 | """Data integrity validation checks.""" 2 | 3 | import csv 4 | from pathlib import Path 5 | from typing import Optional, Callable 6 | 7 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 8 | from physionet.validate.config import ValidationConfig 9 | 10 | 11 | def check_integrity(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: 12 | """ 13 | Check data integrity and format validation. 14 | 15 | Validates: 16 | - CSV file structure 17 | - File format validity 18 | - Basic structural consistency 19 | 20 | Args: 21 | path: Path to dataset directory 22 | config: Validation configuration 23 | progress_callback: Optional callback to report progress 24 | 25 | Returns: 26 | CheckResult with any integrity issues found 27 | """ 28 | result = CheckResult(category=CheckCategory.INTEGRITY) 29 | 30 | # Find and validate CSV files 31 | csv_files = list(path.rglob("*.csv")) 32 | for i, csv_file in enumerate(csv_files): 33 | if progress_callback: 34 | progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") 35 | 36 | if any(p in str(csv_file) for p in config.ignore_patterns): 37 | continue 38 | 39 | _validate_csv_structure(csv_file, path, result) 40 | 41 | return result 42 | 43 | 44 | def _validate_csv_structure(csv_file: Path, base_path: Path, result: CheckResult) -> None: 45 | """Validate CSV file structure.""" 46 | try: 47 | with open(csv_file, "r", encoding="utf-8") as f: 48 | # Try to detect dialect 49 | sample = f.read(1024) 50 | f.seek(0) 51 | 52 | try: 53 | dialect = csv.Sniffer().sniff(sample) 54 | except csv.Error: 55 | # Use default dialect if detection fails 56 | dialect = csv.excel 57 | 58 | reader = csv.reader(f, dialect) 59 | 60 | # Read header 61 | try: 62 | header = next(reader) 63 | except StopIteration: 64 | result.issues.append( 65 | ValidationIssue( 66 | severity=Severity.ERROR, 67 | category=CheckCategory.INTEGRITY, 68 | file=str(csv_file.relative_to(base_path)), 69 | message="CSV file is empty", 70 | ) 71 | ) 72 | return 73 | 74 | if not header: 75 | result.issues.append( 76 | ValidationIssue( 77 | severity=Severity.ERROR, 78 | category=CheckCategory.INTEGRITY, 79 | file=str(csv_file.relative_to(base_path)), 80 | message="CSV file has no header row", 81 | ) 82 | ) 83 | return 84 | 85 | # Check for duplicate column names 86 | if len(header) != len(set(header)): 87 | duplicates = [col for col in header if header.count(col) > 1] 88 | result.issues.append( 89 | ValidationIssue( 90 | severity=Severity.ERROR, 91 | 
category=CheckCategory.INTEGRITY, 92 | file=str(csv_file.relative_to(base_path)), 93 | message=f"Duplicate column names found: {', '.join(set(duplicates))}", 94 | ) 95 | ) 96 | 97 | # Check for empty column names 98 | if any(not col.strip() for col in header): 99 | result.issues.append( 100 | ValidationIssue( 101 | severity=Severity.ERROR, 102 | category=CheckCategory.INTEGRITY, 103 | file=str(csv_file.relative_to(base_path)), 104 | message="CSV contains empty column names", 105 | ) 106 | ) 107 | 108 | # Validate row consistency 109 | expected_cols = len(header) 110 | row_count = 0 111 | for line_num, row in enumerate(reader, start=2): # Start at 2 (after header) 112 | row_count += 1 113 | if len(row) != expected_cols: 114 | result.issues.append( 115 | ValidationIssue( 116 | severity=Severity.ERROR, 117 | category=CheckCategory.INTEGRITY, 118 | file=str(csv_file.relative_to(base_path)), 119 | line=line_num, 120 | message=f"Row has {len(row)} columns, expected {expected_cols}", 121 | ) 122 | ) 123 | # Only report first few inconsistencies to avoid spam 124 | if len([i for i in result.issues if i.file == str(csv_file.relative_to(base_path))]) >= 5: 125 | result.issues.append( 126 | ValidationIssue( 127 | severity=Severity.INFO, 128 | category=CheckCategory.INTEGRITY, 129 | file=str(csv_file.relative_to(base_path)), 130 | message=f"Additional row inconsistencies may exist (showing first 5)", 131 | ) 132 | ) 133 | break 134 | 135 | if row_count == 0: 136 | result.issues.append( 137 | ValidationIssue( 138 | severity=Severity.WARNING, 139 | category=CheckCategory.INTEGRITY, 140 | file=str(csv_file.relative_to(base_path)), 141 | message="CSV file contains only header row (no data)", 142 | ) 143 | ) 144 | 145 | except UnicodeDecodeError: 146 | result.issues.append( 147 | ValidationIssue( 148 | severity=Severity.ERROR, 149 | category=CheckCategory.INTEGRITY, 150 | file=str(csv_file.relative_to(base_path)), 151 | message="CSV file has encoding issues (not valid UTF-8)", 152 | suggestion="Convert file to UTF-8 encoding", 153 | ) 154 | ) 155 | except Exception as e: 156 | result.issues.append( 157 | ValidationIssue( 158 | severity=Severity.WARNING, 159 | category=CheckCategory.INTEGRITY, 160 | file=str(csv_file.relative_to(base_path)), 161 | message=f"Could not validate CSV file: {str(e)}", 162 | ) 163 | ) 164 | -------------------------------------------------------------------------------- /tests/api/test_endpoints.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import requests_mock 3 | from physionet.api.client import PhysioNetClient 4 | from physionet.api.models import PublishedProject, ProjectVersion, ProjectDetail 5 | 6 | 7 | @pytest.fixture 8 | def client(): 9 | """Fixture providing a PhysioNetClient instance.""" 10 | return PhysioNetClient() 11 | 12 | 13 | def test_list_published_basic(client): 14 | """Test listing published projects.""" 15 | mock_response = [ 16 | { 17 | "slug": "project-1", 18 | "version": "1.0", 19 | "title": "Project 1", 20 | "short_description": "Description 1", 21 | "abstract": "Abstract 1", 22 | "core_doi": "10.1234/p1", 23 | "version_doi": "10.1234/p1.v1", 24 | "is_latest_version": True, 25 | "publish_date": "2023-01-01", 26 | "license": {"name": "MIT"}, 27 | "dua": None, 28 | "main_storage_size": 1000, 29 | "compressed_storage_size": 500, 30 | }, 31 | { 32 | "slug": "project-2", 33 | "version": "2.0", 34 | "title": "Project 2", 35 | "short_description": "Description 2", 36 | "abstract": "Abstract 2", 37 | 
"core_doi": "10.1234/p2", 38 | "version_doi": "10.1234/p2.v2", 39 | "is_latest_version": True, 40 | "publish_date": "2023-02-01", 41 | "license": {"name": "GPL"}, 42 | "dua": None, 43 | "main_storage_size": 2000, 44 | "compressed_storage_size": 1000, 45 | }, 46 | ] 47 | 48 | with requests_mock.Mocker() as m: 49 | m.get("https://physionet.org/api/v1/projects/published/", json=mock_response) 50 | 51 | result = client.projects.list_published() 52 | 53 | assert len(result) == 2 54 | assert isinstance(result[0], PublishedProject) 55 | assert result[0].slug == "project-1" 56 | assert result[1].slug == "project-2" 57 | 58 | 59 | def test_iter_published(client): 60 | """Test iterating through published projects.""" 61 | mock_response = [ 62 | { 63 | "slug": "project-1", 64 | "version": "1.0", 65 | "title": "Project 1", 66 | "short_description": "", 67 | "abstract": "", 68 | "core_doi": None, 69 | "version_doi": None, 70 | "is_latest_version": True, 71 | "publish_date": "", 72 | "license": None, 73 | "dua": None, 74 | "main_storage_size": 0, 75 | "compressed_storage_size": 0, 76 | }, 77 | { 78 | "slug": "project-2", 79 | "version": "1.0", 80 | "title": "Project 2", 81 | "short_description": "", 82 | "abstract": "", 83 | "core_doi": None, 84 | "version_doi": None, 85 | "is_latest_version": True, 86 | "publish_date": "", 87 | "license": None, 88 | "dua": None, 89 | "main_storage_size": 0, 90 | "compressed_storage_size": 0, 91 | }, 92 | ] 93 | 94 | with requests_mock.Mocker() as m: 95 | m.get("https://physionet.org/api/v1/projects/published/", json=mock_response) 96 | 97 | projects = list(client.projects.iter_published()) 98 | 99 | assert len(projects) == 2 100 | assert projects[0].slug == "project-1" 101 | assert projects[1].slug == "project-2" 102 | 103 | 104 | def test_search_projects(client): 105 | """Test searching for projects.""" 106 | mock_response = [ 107 | { 108 | "slug": "ecg-project", 109 | "version": "1.0", 110 | "title": "ECG Database", 111 | "short_description": "ECG data", 112 | "abstract": "ECG abstract", 113 | "core_doi": None, 114 | "version_doi": None, 115 | "is_latest_version": True, 116 | "publish_date": "", 117 | "license": None, 118 | "dua": None, 119 | "main_storage_size": 0, 120 | "compressed_storage_size": 0, 121 | } 122 | ] 123 | 124 | with requests_mock.Mocker() as m: 125 | m.get("https://physionet.org/api/v1/projects/search/", json=mock_response) 126 | 127 | results = client.projects.search(search_term="ECG", resource_type=["all"]) 128 | 129 | assert "search_term=ECG" in m.last_request.url 130 | assert len(results) == 1 131 | assert isinstance(results[0], PublishedProject) 132 | assert results[0].slug == "ecg-project" 133 | 134 | 135 | def test_list_versions(client): 136 | """Test listing all versions of a project.""" 137 | mock_response = [ 138 | { 139 | "slug": "test-project", 140 | "title": "Test Project", 141 | "version": "1.0", 142 | "abstract": "Version 1.0", 143 | "citation": "Citation v1.0", 144 | }, 145 | { 146 | "slug": "test-project", 147 | "title": "Test Project", 148 | "version": "2.0", 149 | "abstract": "Version 2.0", 150 | "citation": "Citation v2.0", 151 | }, 152 | ] 153 | 154 | with requests_mock.Mocker() as m: 155 | m.get("https://physionet.org/api/v1/projects/test-project/versions/", json=mock_response) 156 | 157 | versions = client.projects.list_versions("test-project") 158 | 159 | assert len(versions) == 2 160 | assert isinstance(versions[0], ProjectVersion) 161 | assert versions[0].version == "1.0" 162 | assert versions[1].version == "2.0" 163 | 164 
| 165 | def test_get_details(client): 166 | """Test getting project details.""" 167 | mock_response = { 168 | "slug": "test-project", 169 | "title": "Test Project", 170 | "version": "1.0", 171 | "abstract": "Test abstract", 172 | "license": {"name": "MIT"}, 173 | "short_description": "Short desc", 174 | "project_home_page": "https://example.com", 175 | "publish_datetime": "2023-01-01T00:00:00", 176 | "doi": "10.1234/test", 177 | "main_storage_size": 1000, 178 | "compressed_storage_size": 500, 179 | } 180 | 181 | with requests_mock.Mocker() as m: 182 | m.get("https://physionet.org/api/v1/projects/test-project/versions/1.0/", json=mock_response) 183 | 184 | detail = client.projects.get_details("test-project", "1.0") 185 | 186 | assert isinstance(detail, ProjectDetail) 187 | assert detail.slug == "test-project" 188 | assert detail.version == "1.0" 189 | assert detail.doi == "10.1234/test" 190 | 191 | 192 | def test_download_checksums(client, tmp_path): 193 | """Test downloading checksums file.""" 194 | checksum_content = b"abc123 file1.txt\ndef456 file2.txt\n" 195 | output_file = tmp_path / "checksums.txt" 196 | 197 | with requests_mock.Mocker() as m: 198 | m.get( 199 | "https://physionet.org/api/v1/projects/published/test-project/1.0/sha256sums/", content=checksum_content 200 | ) 201 | 202 | client.projects.download_checksums("test-project", "1.0", str(output_file)) 203 | 204 | assert output_file.exists() 205 | assert output_file.read_bytes() == checksum_content 206 | -------------------------------------------------------------------------------- /tests/validate/test_validator.py: -------------------------------------------------------------------------------- 1 | """Tests for main validation functionality.""" 2 | 3 | import pytest 4 | import tempfile 5 | from pathlib import Path 6 | 7 | from physionet.validate import validate_dataset, ValidationConfig 8 | from physionet.validate.models import Severity, CheckCategory 9 | 10 | 11 | class TestValidateDataset: 12 | """Tests for validate_dataset function.""" 13 | 14 | def test_nonexistent_path_raises_error(self): 15 | """Test that validating a nonexistent path raises ValueError.""" 16 | with pytest.raises(ValueError, match="does not exist"): 17 | validate_dataset("/nonexistent/path") 18 | 19 | def test_file_instead_of_directory_raises_error(self, tmp_path): 20 | """Test that validating a file instead of directory raises ValueError.""" 21 | test_file = tmp_path / "test.txt" 22 | test_file.write_text("test") 23 | 24 | with pytest.raises(ValueError, match="not a directory"): 25 | validate_dataset(str(test_file)) 26 | 27 | def test_empty_directory_validation(self, tmp_path): 28 | """Test validation of an empty directory.""" 29 | result = validate_dataset(str(tmp_path)) 30 | 31 | assert result.dataset_path == tmp_path.name 32 | assert result.timestamp is not None 33 | assert result.dataset_stats.file_count == 0 34 | assert result.dataset_stats.total_size_bytes == 0 35 | 36 | # Should have error for missing README.md 37 | assert result.total_errors == 1 38 | assert any("README.md" in str(issue.message) for issue in result.check_results[CheckCategory.DOCUMENTATION].issues) 39 | 40 | def test_minimal_valid_dataset(self, tmp_path): 41 | """Test validation of a minimal valid dataset.""" 42 | # Create README and a simple CSV file 43 | (tmp_path / "README.md").write_text("# Test Dataset") 44 | csv_file = tmp_path / "data.csv" 45 | csv_file.write_text("id,value\n1,100\n2,200\n") 46 | 47 | result = validate_dataset(str(tmp_path)) 48 | 49 | assert 
result.dataset_stats.file_count == 2 50 | assert result.total_errors == 0 51 | 52 | def test_validation_with_custom_config(self, tmp_path): 53 | """Test validation with custom configuration.""" 54 | # Create a dataset with custom requirements 55 | readme = tmp_path / "README.md" 56 | readme.write_text("# Test") 57 | 58 | config = ValidationConfig( 59 | check_filesystem=True, 60 | check_documentation=False, # Disable documentation checks 61 | check_integrity=False, 62 | check_quality=False, 63 | check_phi=False, 64 | ) 65 | 66 | result = validate_dataset(str(tmp_path), config) 67 | 68 | # Should only have filesystem checks 69 | assert CheckCategory.FILESYSTEM in result.check_results 70 | assert CheckCategory.DOCUMENTATION not in result.check_results 71 | 72 | def test_validation_without_progress_bar(self, tmp_path): 73 | """Test validation with progress bar disabled.""" 74 | readme = tmp_path / "README.md" 75 | readme.write_text("# Test") 76 | 77 | # Should not raise any errors with show_progress=False 78 | result = validate_dataset(str(tmp_path), show_progress=False) 79 | assert result.total_errors == 0 80 | 81 | 82 | class TestValidationStats: 83 | """Tests for dataset statistics calculation.""" 84 | 85 | def test_calculates_file_count(self, tmp_path): 86 | """Test that file count is calculated correctly.""" 87 | (tmp_path / "README.md").write_text("# Test") 88 | (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") 89 | (tmp_path / "subdir").mkdir() 90 | (tmp_path / "subdir" / "data2.csv").write_text("col1\n1\n") 91 | 92 | result = validate_dataset(str(tmp_path)) 93 | 94 | assert result.dataset_stats.file_count == 3 95 | assert result.dataset_stats.directory_count == 1 96 | 97 | def test_calculates_total_size(self, tmp_path): 98 | """Test that total size is calculated correctly.""" 99 | content = "x" * 1000 100 | (tmp_path / "README.md").write_text(content) 101 | 102 | result = validate_dataset(str(tmp_path)) 103 | 104 | assert result.dataset_stats.total_size_bytes >= 1000 105 | 106 | def test_ignores_specified_patterns(self, tmp_path): 107 | """Test that ignored patterns are not counted in stats.""" 108 | (tmp_path / "README.md").write_text("# Test") 109 | (tmp_path / ".git").mkdir() 110 | (tmp_path / ".git" / "config").write_text("test") 111 | 112 | result = validate_dataset(str(tmp_path)) 113 | 114 | # .git directory and its contents should be ignored 115 | assert result.dataset_stats.file_count == 1 116 | 117 | 118 | class TestValidationResult: 119 | """Tests for ValidationResult model.""" 120 | 121 | def test_summary_format(self, tmp_path): 122 | """Test that summary is properly formatted.""" 123 | (tmp_path / "README.md").write_text("# Test") 124 | 125 | result = validate_dataset(str(tmp_path)) 126 | summary = result.summary() 127 | 128 | assert "PhysioNet Dataset Validation Report" in summary 129 | assert tmp_path.name in summary 130 | assert "Summary:" in summary 131 | assert "Metadata:" in summary 132 | assert "Validation Results:" in summary 133 | 134 | def test_to_dict_format(self, tmp_path): 135 | """Test that to_dict produces valid structure.""" 136 | (tmp_path / "README.md").write_text("# Test") 137 | 138 | result = validate_dataset(str(tmp_path)) 139 | result_dict = result.to_dict() 140 | 141 | assert "dataset_path" in result_dict 142 | assert "timestamp" in result_dict 143 | assert "dataset_stats" in result_dict 144 | assert "summary" in result_dict 145 | assert "checks" in result_dict 146 | 147 | assert result_dict["summary"]["total_errors"] == result.total_errors 148 | 
assert result_dict["summary"]["total_warnings"] == result.total_warnings 149 | 150 | def test_recommendations_section(self, tmp_path): 151 | """Test that recommendations section is included when there are issues.""" 152 | # Create files with issues to trigger recommendations 153 | (tmp_path / "file with spaces.csv").write_text("col1,col2\n1,2\n") 154 | (tmp_path / ".env").write_text("API_KEY=secret") 155 | (tmp_path / "empty.txt").write_text("") 156 | 157 | result = validate_dataset(str(tmp_path)) 158 | summary = result.summary() 159 | 160 | # Should include recommendations section 161 | assert "Recommendations:" in summary 162 | assert "Replace spaces with underscores or hyphens" in summary 163 | assert "Remove" in summary # Various remove recommendations 164 | 165 | def test_large_dataset_recommendation(self, tmp_path): 166 | """Test that large datasets get upload assistance recommendation.""" 167 | # Create README to avoid documentation errors 168 | (tmp_path / "README.md").write_text("# Large Dataset") 169 | 170 | # Create a large file (simulated - we'll modify the stats) 171 | (tmp_path / "data.csv").write_text("col1,col2\n1,2\n") 172 | 173 | result = validate_dataset(str(tmp_path)) 174 | 175 | # Manually set large size for testing (>200GB) 176 | result.dataset_stats.total_size_bytes = 250 * 1024 ** 3 # 250 GB 177 | 178 | summary = result.summary() 179 | 180 | # Should include contact recommendation for large datasets 181 | assert "contact@physionet.org" in summary 182 | assert "very large" in summary.lower() 183 | assert "250" in summary # Should show the size 184 | -------------------------------------------------------------------------------- /physionet/validate/checks/filesystem.py: -------------------------------------------------------------------------------- 1 | """File system validation checks.""" 2 | 3 | import os 4 | from pathlib import Path 5 | 6 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 7 | from physionet.validate.config import ValidationConfig 8 | 9 | # Proprietary file formats and their recommended open alternatives 10 | PROPRIETARY_FORMATS = { 11 | '.mat': 'MATLAB format; consider .csv, .zarr, .parquet, or .npy instead', 12 | '.sas7bdat': 'SAS format; consider .csv or .parquet instead', 13 | '.dta': 'Stata format; consider .csv or .parquet instead', 14 | '.sav': 'SPSS format; consider .csv or .parquet instead', 15 | '.xlsx': 'Excel format; consider .csv instead', 16 | '.xls': 'Excel format; consider .csv instead', 17 | '.rds': 'R binary format; consider .csv or .parquet instead', 18 | '.rdata': 'R binary format; consider .csv or .parquet instead', 19 | '.ppt': 'PowerPoint format; consider .pdf instead', 20 | '.pptx': 'PowerPoint format; consider .pdf instead', 21 | } 22 | 23 | 24 | def check_filesystem(path: Path, config: ValidationConfig) -> CheckResult: 25 | """ 26 | Check file system organization and structure. 
27 | 28 | Validates: 29 | - File naming conventions 30 | - Presence of version control artifacts 31 | - File sizes 32 | - Small file count 33 | 34 | Args: 35 | path: Path to dataset directory 36 | config: Validation configuration 37 | 38 | Returns: 39 | CheckResult with any filesystem issues found 40 | """ 41 | result = CheckResult(category=CheckCategory.FILESYSTEM) 42 | 43 | # Check for version control artifacts 44 | for pattern in [".git", ".svn", ".hg", "__pycache__", ".pytest_cache"]: 45 | found_paths = list(path.rglob(pattern)) 46 | if found_paths: 47 | result.issues.append( 48 | ValidationIssue( 49 | severity=Severity.WARNING, 50 | category=CheckCategory.FILESYSTEM, 51 | message=f"Found version control/build artifacts: {pattern}", 52 | suggestion=f"Remove {pattern} directories before submission", 53 | ) 54 | ) 55 | 56 | # Check for hidden and temp files 57 | for root, dirs, files in os.walk(path): 58 | # Filter ignored directories 59 | dirs[:] = [d for d in dirs if not any(p in d for p in config.ignore_patterns)] 60 | 61 | for file in files: 62 | file_path = Path(root) / file 63 | 64 | # Skip ignored files 65 | if any(p in file for p in config.ignore_patterns): 66 | continue 67 | 68 | # Check for hidden files (starting with .) 69 | if file.startswith(".") and file not in [".gitignore", ".gitattributes"]: 70 | result.issues.append( 71 | ValidationIssue( 72 | severity=Severity.WARNING, 73 | category=CheckCategory.FILESYSTEM, 74 | file=str(file_path.relative_to(path)), 75 | message=f"Hidden file found: {file}", 76 | suggestion="Remove hidden files before submission", 77 | ) 78 | ) 79 | 80 | # Check for temp files 81 | if file.endswith(("~", ".tmp", ".bak", ".swp")): 82 | result.issues.append( 83 | ValidationIssue( 84 | severity=Severity.WARNING, 85 | category=CheckCategory.FILESYSTEM, 86 | file=str(file_path.relative_to(path)), 87 | message=f"Temporary file found: {file}", 88 | suggestion="Remove temporary files before submission", 89 | ) 90 | ) 91 | 92 | # Check file size 93 | try: 94 | size = file_path.stat().st_size 95 | if size == 0: 96 | result.issues.append( 97 | ValidationIssue( 98 | severity=Severity.WARNING, 99 | category=CheckCategory.FILESYSTEM, 100 | file=str(file_path.relative_to(path)), 101 | message="Empty file (0 bytes)", 102 | suggestion="Remove empty files or add content", 103 | ) 104 | ) 105 | elif config.max_file_size_bytes and size > config.max_file_size_bytes: 106 | result.issues.append( 107 | ValidationIssue( 108 | severity=Severity.INFO, 109 | category=CheckCategory.FILESYSTEM, 110 | file=str(file_path.relative_to(path)), 111 | message=f"Large file: {_format_size(size)}", 112 | suggestion="Consider splitting or compressing large files", 113 | ) 114 | ) 115 | except (OSError, PermissionError): 116 | pass 117 | 118 | # Check for excessively long filenames 119 | # Most filesystems support 255 characters, but recommend shorter for compatibility 120 | if len(file) > 255: 121 | result.issues.append( 122 | ValidationIssue( 123 | severity=Severity.ERROR, 124 | category=CheckCategory.FILESYSTEM, 125 | file=str(file_path.relative_to(path)), 126 | message=f"Filename exceeds maximum length ({len(file)} characters): {file[:50]}...", 127 | suggestion="Shorten filename to 255 characters or less", 128 | ) 129 | ) 130 | elif len(file) > 100: 131 | result.issues.append( 132 | ValidationIssue( 133 | severity=Severity.WARNING, 134 | category=CheckCategory.FILESYSTEM, 135 | file=str(file_path.relative_to(path)), 136 | message=f"Filename is very long ({len(file)} characters): 
{file[:50]}...", 137 | suggestion="Consider shortening filename for better compatibility (recommended: under 100 characters)", 138 | ) 139 | ) 140 | 141 | # Check for spaces in filename 142 | if " " in file: 143 | result.issues.append( 144 | ValidationIssue( 145 | severity=Severity.WARNING, 146 | category=CheckCategory.FILESYSTEM, 147 | file=str(file_path.relative_to(path)), 148 | message=f"Filename contains spaces: {file}", 149 | suggestion="Replace spaces with underscores or hyphens", 150 | ) 151 | ) 152 | 153 | # Check for invalid/awkward characters in filename 154 | # Include path separators, quotes, and other problematic characters 155 | invalid_chars = set('<>:"|?*/\\\'') 156 | found_invalid = [char for char in invalid_chars if char in file] 157 | 158 | if found_invalid: 159 | char_list = ", ".join(f"'{char}'" for char in found_invalid) 160 | result.issues.append( 161 | ValidationIssue( 162 | severity=Severity.ERROR, 163 | category=CheckCategory.FILESYSTEM, 164 | file=str(file_path.relative_to(path)), 165 | message=f"Filename contains invalid characters ({char_list}): {file}", 166 | suggestion="Remove special characters from filename (use only letters, numbers, underscores, hyphens, and periods)", 167 | ) 168 | ) 169 | 170 | # Check for proprietary file formats 171 | file_ext = "." + file.split(".")[-1] if "." in file else "" 172 | file_ext_lower = file_ext.lower() 173 | 174 | if file_ext_lower in PROPRIETARY_FORMATS: 175 | result.issues.append( 176 | ValidationIssue( 177 | severity=Severity.WARNING, 178 | category=CheckCategory.FILESYSTEM, 179 | file=str(file_path.relative_to(path)), 180 | message=f"Proprietary file format detected: {file}", 181 | suggestion=f"{PROPRIETARY_FORMATS[file_ext_lower]}", 182 | ) 183 | ) 184 | 185 | return result 186 | 187 | 188 | def _format_size(size_bytes: int) -> str: 189 | """Format byte size as human-readable string.""" 190 | for unit in ["B", "KB", "MB", "GB", "TB"]: 191 | if size_bytes < 1024.0: 192 | return f"{size_bytes:.1f} {unit}" 193 | size_bytes /= 1024.0 194 | return f"{size_bytes:.1f} PB" 195 | -------------------------------------------------------------------------------- /physionet/validate/models.py: -------------------------------------------------------------------------------- 1 | """Data models for validation results.""" 2 | 3 | from dataclasses import dataclass, field 4 | from typing import List, Optional, Dict, Any 5 | from enum import Enum 6 | from datetime import datetime 7 | import textwrap 8 | 9 | 10 | class Severity(Enum): 11 | """Severity levels for validation issues.""" 12 | ERROR = "error" 13 | WARNING = "warning" 14 | INFO = "info" 15 | 16 | 17 | class CheckCategory(Enum): 18 | """Categories of validation checks.""" 19 | FILESYSTEM = "filesystem" 20 | DOCUMENTATION = "documentation" 21 | INTEGRITY = "integrity" 22 | QUALITY = "quality" 23 | PRIVACY = "privacy" 24 | 25 | 26 | @dataclass 27 | class ValidationIssue: 28 | """Represents a single validation issue.""" 29 | severity: Severity 30 | category: CheckCategory 31 | message: str 32 | file: Optional[str] = None 33 | line: Optional[int] = None 34 | column: Optional[str] = None 35 | value: Optional[str] = None 36 | suggestion: Optional[str] = None 37 | 38 | def to_dict(self) -> Dict[str, Any]: 39 | """Convert issue to dictionary format.""" 40 | result = { 41 | "severity": self.severity.value, 42 | "category": self.category.value, 43 | "message": self.message, 44 | } 45 | if self.file: 46 | result["file"] = self.file 47 | if self.line is not None: 48 | result["line"] = 
self.line 49 | if self.column: 50 | result["column"] = self.column 51 | if self.value: 52 | result["value"] = self.value 53 | if self.suggestion: 54 | result["suggestion"] = self.suggestion 55 | return result 56 | 57 | 58 | @dataclass 59 | class CheckResult: 60 | """Results from a specific category of checks.""" 61 | category: CheckCategory 62 | issues: List[ValidationIssue] = field(default_factory=list) 63 | 64 | @property 65 | def status(self) -> str: 66 | """Get overall status for this check category.""" 67 | if any(issue.severity == Severity.ERROR for issue in self.issues): 68 | return "error" 69 | elif any(issue.severity == Severity.WARNING for issue in self.issues): 70 | return "warning" 71 | return "pass" 72 | 73 | @property 74 | def error_count(self) -> int: 75 | """Count of errors in this category.""" 76 | return sum(1 for issue in self.issues if issue.severity == Severity.ERROR) 77 | 78 | @property 79 | def warning_count(self) -> int: 80 | """Count of warnings in this category.""" 81 | return sum(1 for issue in self.issues if issue.severity == Severity.WARNING) 82 | 83 | @property 84 | def info_count(self) -> int: 85 | """Count of info messages in this category.""" 86 | return sum(1 for issue in self.issues if issue.severity == Severity.INFO) 87 | 88 | 89 | @dataclass 90 | class DatasetStats: 91 | """Statistics about the dataset being validated.""" 92 | total_size_bytes: int = 0 93 | file_count: int = 0 94 | directory_count: int = 0 95 | 96 | 97 | @dataclass 98 | class ValidationResult: 99 | """Complete validation results for a dataset.""" 100 | dataset_path: str 101 | timestamp: str 102 | check_results: Dict[CheckCategory, CheckResult] = field(default_factory=dict) 103 | dataset_stats: DatasetStats = field(default_factory=DatasetStats) 104 | 105 | @property 106 | def total_errors(self) -> int: 107 | """Total count of errors across all checks.""" 108 | return sum(result.error_count for result in self.check_results.values()) 109 | 110 | @property 111 | def total_warnings(self) -> int: 112 | """Total count of warnings across all checks.""" 113 | return sum(result.warning_count for result in self.check_results.values()) 114 | 115 | @property 116 | def total_info(self) -> int: 117 | """Total count of info messages across all checks.""" 118 | return sum(result.info_count for result in self.check_results.values()) 119 | 120 | @property 121 | def status(self) -> str: 122 | """Overall validation status.""" 123 | if self.total_errors > 0: 124 | return "error" 125 | elif self.total_warnings > 0: 126 | return "warning" 127 | return "pass" 128 | 129 | def summary(self) -> str: 130 | """Generate a human-readable summary.""" 131 | # Format timestamp as human-readable 132 | try: 133 | dt = datetime.fromisoformat(self.timestamp.replace('Z', '+00:00')) 134 | formatted_timestamp = dt.strftime("%Y-%m-%d %H:%M:%S UTC") 135 | except (ValueError, AttributeError): 136 | formatted_timestamp = self.timestamp 137 | 138 | # Get package version 139 | try: 140 | import physionet 141 | validator_version = physionet.__version__ 142 | except (ImportError, AttributeError): 143 | validator_version = "unknown" 144 | 145 | lines = [] 146 | 147 | # Section 1: Metadata 148 | lines.extend([ 149 | "PhysioNet Dataset Validation Report", 150 | "=" * 50, 151 | "", 152 | "Metadata:", 153 | f" Dataset: {self.dataset_path}", 154 | f" Validator version: {validator_version}", 155 | f" Timestamp: {formatted_timestamp}", 156 | f" Total size: {self._format_size(self.dataset_stats.total_size_bytes)} " 157 | 
f"({self.dataset_stats.file_count} files)", 158 | "", 159 | ]) 160 | 161 | # Section 2: Validation Results 162 | lines.extend([ 163 | "Validation Results:", 164 | "=" * 50, 165 | ]) 166 | 167 | first_category = True 168 | for category, result in self.check_results.items(): 169 | # Add blank line before each category (except first) 170 | if not first_category: 171 | lines.append("") 172 | first_category = False 173 | 174 | # Only show ✗ for errors, ✓ for pass or warnings-only 175 | status_icon = "✗" if result.error_count > 0 else "✓" 176 | issue_summary = "" 177 | if result.error_count or result.warning_count: 178 | parts = [] 179 | if result.error_count: 180 | parts.append(f"{result.error_count} error{'s' if result.error_count != 1 else ''}") 181 | if result.warning_count: 182 | parts.append(f"{result.warning_count} warning{'s' if result.warning_count != 1 else ''}") 183 | issue_summary = f" ({', '.join(parts)})" 184 | 185 | lines.append(f"{status_icon} {category.value.replace('_', ' ').title()}{issue_summary}") 186 | 187 | for issue in result.issues: 188 | icon = "✗" if issue.severity == Severity.ERROR else "⚠" 189 | location = f" {issue.file}" 190 | if issue.line: 191 | location += f":{issue.line}" 192 | lines.append(f" {icon}{location} - {issue.message}") 193 | 194 | lines.append("") 195 | 196 | # Section 3: Summary 197 | lines.extend([ 198 | "Summary:", 199 | "=" * 50, 200 | f"{self.total_errors} error{'s' if self.total_errors != 1 else ''}, " 201 | f"{self.total_warnings} warning{'s' if self.total_warnings != 1 else ''}", 202 | "", 203 | ]) 204 | 205 | if self.status == "error": 206 | lines.append("✗ Dataset has errors that must be fixed before submission") 207 | elif self.status == "warning": 208 | lines.append("⚠ Dataset has warnings that should be addressed before submission") 209 | else: 210 | lines.append("✓ Dataset passed validation") 211 | 212 | # Add recommendations section if there are issues 213 | recommendations = self._generate_recommendations() 214 | if recommendations: 215 | lines.extend([ 216 | "", 217 | "Recommendations:", 218 | "=" * 50, 219 | ]) 220 | lines.extend(recommendations) 221 | 222 | # Add note about including validation report in submission 223 | note_text = "Note: A validation report (PHYSIONET_REPORT.md) has been saved in your dataset folder. Please include this file in your final submission." 
224 | lines.append("") 225 | lines.extend(self._wrap_text(note_text)) 226 | 227 | # Add footer with package information 228 | lines.extend([ 229 | "", 230 | "=" * 50, 231 | "This report was generated by the PhysioNet Python package.", 232 | "Install: pip install physionet", 233 | "Learn more: https://github.com/MIT-LCP/physionet", 234 | ]) 235 | 236 | return "\n".join(lines) + "\n" 237 | 238 | def to_dict(self) -> Dict[str, Any]: 239 | """Convert validation result to dictionary format.""" 240 | return { 241 | "dataset_path": self.dataset_path, 242 | "timestamp": self.timestamp, 243 | "dataset_stats": { 244 | "total_size_bytes": self.dataset_stats.total_size_bytes, 245 | "file_count": self.dataset_stats.file_count, 246 | "directory_count": self.dataset_stats.directory_count, 247 | }, 248 | "summary": { 249 | "total_errors": self.total_errors, 250 | "total_warnings": self.total_warnings, 251 | "total_info": self.total_info, 252 | "status": self.status, 253 | }, 254 | "checks": { 255 | category.value: { 256 | "status": result.status, 257 | "issues": [issue.to_dict() for issue in result.issues], 258 | } 259 | for category, result in self.check_results.items() 260 | }, 261 | } 262 | 263 | def _generate_recommendations(self) -> List[str]: 264 | """Generate actionable recommendations based on issues found.""" 265 | recommendations = [] 266 | 267 | # Check for very large datasets (>200GB) 268 | size_gb = self.dataset_stats.total_size_bytes / (1024 ** 3) 269 | if size_gb > 200: 270 | recommendations.append("\nDataset Size:") 271 | large_dataset_text = ( 272 | f" ℹ Your dataset is very large ({self._format_size(self.dataset_stats.total_size_bytes)}). " 273 | "If you need assistance uploading large files, please contact the PhysioNet team at contact@physionet.org" 274 | ) 275 | recommendations.extend(self._wrap_text(large_dataset_text, indent=" ")) 276 | 277 | # Collect unique suggestions from all issues 278 | suggestions_by_category = {} 279 | 280 | for category, result in self.check_results.items(): 281 | category_suggestions = {} 282 | 283 | for issue in result.issues: 284 | if issue.suggestion: 285 | # Group by suggestion to avoid duplicates 286 | if issue.suggestion not in category_suggestions: 287 | category_suggestions[issue.suggestion] = { 288 | 'severity': issue.severity, 289 | 'count': 0 290 | } 291 | category_suggestions[issue.suggestion]['count'] += 1 292 | 293 | if category_suggestions: 294 | suggestions_by_category[category] = category_suggestions 295 | 296 | # Generate recommendations by category 297 | for category, suggestions in suggestions_by_category.items(): 298 | if not suggestions: 299 | continue 300 | 301 | recommendations.append(f"\n{category.value.replace('_', ' ').title()}:") 302 | 303 | # Sort by severity (errors first) and then by count 304 | sorted_suggestions = sorted( 305 | suggestions.items(), 306 | key=lambda x: (x[1]['severity'] != Severity.ERROR, -x[1]['count']) 307 | ) 308 | 309 | for suggestion, info in sorted_suggestions: 310 | count = info['count'] 311 | icon = "✗" if info['severity'] == Severity.ERROR else "⚠" 312 | count_str = f" ({count} file{'s' if count != 1 else ''})" if count > 1 else "" 313 | suggestion_text = f" {icon} {suggestion}{count_str}" 314 | # Wrap long suggestions 315 | wrapped = self._wrap_text(suggestion_text, indent=" ") 316 | recommendations.extend(wrapped) 317 | 318 | return recommendations 319 | 320 | @staticmethod 321 | def _format_size(size_bytes: int) -> str: 322 | """Format byte size as human-readable string.""" 323 | for unit in 
["B", "KB", "MB", "GB", "TB"]: 324 | if size_bytes < 1024.0: 325 | return f"{size_bytes:.1f} {unit}" 326 | size_bytes /= 1024.0 327 | return f"{size_bytes:.1f} PB" 328 | 329 | @staticmethod 330 | def _wrap_text(text: str, width: int = 80, indent: str = " ") -> List[str]: 331 | """Wrap text to specified width with continuation indent.""" 332 | # Use textwrap to wrap the text 333 | wrapped = textwrap.fill(text, width=width, subsequent_indent=indent) 334 | return wrapped.split('\n') 335 | -------------------------------------------------------------------------------- /physionet/validate/checks/privacy.py: -------------------------------------------------------------------------------- 1 | """Privacy and PHI validation checks.""" 2 | 3 | import csv 4 | import os 5 | import re 6 | from pathlib import Path 7 | from typing import Optional, Callable 8 | 9 | from physionet.validate.models import CheckResult, ValidationIssue, CheckCategory, Severity 10 | from physionet.validate.config import ValidationConfig 11 | 12 | # Pattern names for better error messages 13 | PHI_PATTERN_NAMES = { 14 | r"\b\d{3}-\d{2}-\d{4}\b": "SSN", 15 | r"\b[\w\.-]+@[\w\.-]+\.\w+\b": "email address", 16 | r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b": "phone number", 17 | } 18 | 19 | # Sensitive configuration files that should not be included in datasets 20 | SENSITIVE_FILES = { 21 | # API keys and credentials 22 | ".env": "environment variables (may contain API keys)", 23 | ".env.local": "local environment variables", 24 | ".env.production": "production environment variables", 25 | "credentials.json": "credential file", 26 | "secrets.json": "secrets file", 27 | "config.json": "configuration file (may contain credentials)", 28 | ".aws/credentials": "AWS credentials", 29 | ".aws/config": "AWS configuration", 30 | 31 | # SSH and certificates 32 | "id_rsa": "SSH private key", 33 | "id_dsa": "SSH private key", 34 | "id_ecdsa": "SSH private key", 35 | "id_ed25519": "SSH private key", 36 | ".pem": "private certificate/key", 37 | ".key": "private key", 38 | ".p12": "certificate file", 39 | ".pfx": "certificate file", 40 | 41 | # Database 42 | ".pgpass": "PostgreSQL password file", 43 | ".my.cnf": "MySQL configuration (may contain passwords)", 44 | 45 | # Other sensitive files 46 | ".netrc": "authentication credentials", 47 | ".htpasswd": "HTTP authentication", 48 | "docker-compose.override.yml": "Docker override (may contain secrets)", 49 | } 50 | 51 | 52 | def check_privacy(path: Path, config: ValidationConfig, progress_callback: Optional[Callable[[str], None]] = None) -> CheckResult: 53 | """ 54 | Check for potential privacy issues and PHI. 
55 | 56 | Validates: 57 | - PHI pattern detection 58 | - Age de-identification 59 | - Sensitive configuration files (keys, credentials) 60 | - Date patterns 61 | 62 | Args: 63 | path: Path to dataset directory 64 | config: Validation configuration 65 | progress_callback: Optional callback to report progress 66 | 67 | Returns: 68 | CheckResult with any privacy issues found 69 | """ 70 | result = CheckResult(category=CheckCategory.PRIVACY) 71 | 72 | # Check for sensitive configuration files 73 | if progress_callback: 74 | progress_callback("Checking for sensitive configuration files") 75 | _check_sensitive_files(path, config, result) 76 | 77 | # Compile PHI patterns with names 78 | pattern_info = [(re.compile(pattern), PHI_PATTERN_NAMES.get(pattern, "unknown pattern")) 79 | for pattern in config.phi_patterns] 80 | 81 | # Check CSV files 82 | csv_files = list(path.rglob("*.csv")) 83 | for i, csv_file in enumerate(csv_files): 84 | if progress_callback: 85 | progress_callback(f"Checking {csv_file.name} ({i+1}/{len(csv_files)} CSV files)") 86 | 87 | if any(p in str(csv_file) for p in config.ignore_patterns): 88 | continue 89 | 90 | _check_csv_privacy(csv_file, path, config, pattern_info, result) 91 | 92 | # Check text files for PHI 93 | text_files = list(path.rglob("*.txt")) 94 | for i, text_file in enumerate(text_files): 95 | if progress_callback: 96 | progress_callback(f"Checking {text_file.name} ({i+1}/{len(text_files)} text files)") 97 | 98 | if any(p in str(text_file) for p in config.ignore_patterns): 99 | continue 100 | 101 | _check_text_file_privacy(text_file, path, pattern_info, result, config) 102 | 103 | return result 104 | 105 | 106 | def _check_sensitive_files(path: Path, config: ValidationConfig, result: CheckResult) -> None: 107 | """Check for sensitive configuration files that shouldn't be in the dataset.""" 108 | for root, dirs, files in os.walk(path): 109 | # Filter out ignored directories 110 | dirs[:] = [d for d in dirs if not any(p in d for p in config.ignore_patterns)] 111 | 112 | for file in files: 113 | file_path = Path(root) / file 114 | relative_path = str(file_path.relative_to(path)) 115 | 116 | # Skip ignored files 117 | if any(p in str(file_path) for p in config.ignore_patterns): 118 | continue 119 | 120 | # Check exact filename matches 121 | if file in SENSITIVE_FILES: 122 | result.issues.append( 123 | ValidationIssue( 124 | severity=Severity.ERROR, 125 | category=CheckCategory.PRIVACY, 126 | file=relative_path, 127 | message=f"Sensitive file detected: {SENSITIVE_FILES[file]}", 128 | suggestion=f"Remove '{file}' from the dataset before submission", 129 | ) 130 | ) 131 | continue 132 | 133 | # Check file extensions for sensitive files 134 | for sensitive_name, description in SENSITIVE_FILES.items(): 135 | # Check if it's an extension pattern (starts with .) 136 | if sensitive_name.startswith(".") and "." in file: 137 | ext = "." 
+ file.split(".")[-1] 138 | if ext == sensitive_name: 139 | result.issues.append( 140 | ValidationIssue( 141 | severity=Severity.ERROR, 142 | category=CheckCategory.PRIVACY, 143 | file=relative_path, 144 | message=f"Sensitive file detected: {description}", 145 | suggestion=f"Remove '{file}' from the dataset before submission", 146 | ) 147 | ) 148 | break 149 | 150 | # Check for common patterns in filenames 151 | lower_file = file.lower() 152 | if any(keyword in lower_file for keyword in ["password", "secret", "token", "apikey", "api_key"]): 153 | result.issues.append( 154 | ValidationIssue( 155 | severity=Severity.WARNING, 156 | category=CheckCategory.PRIVACY, 157 | file=relative_path, 158 | message=f"File name suggests sensitive content: '{file}'", 159 | suggestion="Review file contents and remove if it contains credentials or keys", 160 | ) 161 | ) 162 | 163 | 164 | def _check_csv_privacy( 165 | csv_file: Path, 166 | base_path: Path, 167 | config: ValidationConfig, 168 | pattern_info: list, 169 | result: CheckResult 170 | ) -> None: 171 | """Check a CSV file for privacy issues.""" 172 | relative_path = str(csv_file.relative_to(base_path)) 173 | 174 | # Track which columns have which types of issues (to report only once per column) 175 | # Maps column name to the pattern name that matched 176 | phi_columns = {} # {column: pattern_name} 177 | age_columns = set() # Columns with age violations 178 | 179 | try: 180 | with open(csv_file, "r", encoding="utf-8") as f: 181 | reader = csv.DictReader(f) 182 | 183 | # Determine if we should sample this file 184 | rows_scanned = 0 185 | max_rows = config.max_rows_to_scan 186 | 187 | # Count total rows first if we're sampling 188 | if config.sample_large_files and max_rows: 189 | # Read all rows into list to enable sampling 190 | all_rows = list(reader) 191 | total_rows = len(all_rows) 192 | 193 | if total_rows > max_rows: 194 | # Sample evenly distributed rows 195 | import random 196 | random.seed(42) # Deterministic sampling 197 | step = total_rows / max_rows 198 | sampled_indices = [int(i * step) for i in range(max_rows)] 199 | rows_to_scan = [all_rows[i] for i in sampled_indices] 200 | is_sampled = True 201 | else: 202 | rows_to_scan = all_rows 203 | is_sampled = False 204 | else: 205 | # No sampling, but still respect max_rows limit 206 | rows_to_scan = reader 207 | is_sampled = False 208 | 209 | for line_num, row in enumerate(rows_to_scan, start=2): # Start at 2 (after header) 210 | # Stop if we've hit the limit (when not sampling) 211 | if max_rows and not is_sampled and rows_scanned >= max_rows: 212 | break 213 | rows_scanned += 1 214 | 215 | for col, value in row.items(): 216 | if not value: 217 | continue 218 | 219 | value_str = str(value).strip() 220 | 221 | # Check for PHI patterns (only track if not already found in this column) 222 | if col not in phi_columns: 223 | for pattern, pattern_name in pattern_info: 224 | if pattern.search(value_str): 225 | phi_columns[col] = pattern_name 226 | break 227 | 228 | # Check for age violations (only track if not already found in this column) 229 | if col not in age_columns and "age" in col.lower(): 230 | try: 231 | age_value = float(value_str) 232 | if age_value > config.allowed_age_max: 233 | age_columns.add(col) 234 | except ValueError: 235 | pass 236 | 237 | # Report one issue per column type with specific pattern info 238 | for col, pattern_name in phi_columns.items(): 239 | result.issues.append( 240 | ValidationIssue( 241 | severity=Severity.WARNING, 242 | category=CheckCategory.PRIVACY, 243 | 
243 |                     file=relative_path,
244 |                     column=col,
245 |                     message=f"Potential private information detected in column '{col}' (pattern: {pattern_name})",
246 |                     suggestion="Review and remove or de-identify sensitive information",
247 |                 )
248 |             )
249 | 
250 |         for col in age_columns:
251 |             result.issues.append(
252 |                 ValidationIssue(
253 |                     severity=Severity.WARNING,
254 |                     category=CheckCategory.PRIVACY,
255 |                     file=relative_path,
256 |                     column=col,
257 |                     message=f"Ages exceeding HIPAA limit of {config.allowed_age_max} found in column '{col}'",
258 |                     suggestion=f"De-identify ages >{config.allowed_age_max} (e.g., set to {config.allowed_age_max}+)",
259 |                 )
260 |             )
261 | 
262 |     except Exception as e:
263 |         result.issues.append(
264 |             ValidationIssue(
265 |                 severity=Severity.WARNING,
266 |                 category=CheckCategory.PRIVACY,
267 |                 file=relative_path,
268 |                 message=f"Could not perform privacy checks: {e}",
269 |             )
270 |         )
271 | 
272 | 
273 | def _check_text_file_privacy(text_file: Path, base_path: Path, pattern_info: list, result: CheckResult, config: ValidationConfig) -> None:
274 |     """Check a text file for privacy issues."""
275 |     relative_path = str(text_file.relative_to(base_path))
276 |     detected_patterns = set()
277 | 
278 |     try:
279 |         with open(text_file, "r", encoding="utf-8") as f:
280 |             content = f.read()
281 | 
282 |         # Check each line for PHI patterns and track which ones are found
283 |         for line in content.splitlines():
284 |             for pattern, pattern_name in pattern_info:
285 |                 if pattern.search(line):
286 |                     detected_patterns.add(pattern_name)
287 | 
288 |         # Report once per file with specific patterns found
289 |         if detected_patterns:
290 |             patterns_str = ", ".join(sorted(detected_patterns))
291 |             result.issues.append(
292 |                 ValidationIssue(
293 |                     severity=Severity.WARNING,
294 |                     category=CheckCategory.PRIVACY,
295 |                     file=relative_path,
296 |                     message=f"Potential private information detected ({patterns_str})",
297 |                     suggestion="Review and remove or de-identify sensitive information",
298 |                 )
299 |             )
300 | 
301 |     except UnicodeDecodeError:
302 |         # Skip binary files
303 |         pass
304 |     except Exception as e:
305 |         result.issues.append(
306 |             ValidationIssue(
307 |                 severity=Severity.WARNING,
308 |                 category=CheckCategory.PRIVACY,
309 |                 file=relative_path,
310 |                 message=f"Could not perform privacy checks: {e}",
311 |             )
312 |         )
313 | 
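314 | 
315 | # Usage sketch (illustrative; commented out so the module stays import-only).
316 | # It relies only on the surface visible in this module: check_privacy takes a
317 | # dataset path, a ValidationConfig, and an optional progress callback, and
318 | # returns a CheckResult whose .issues holds ValidationIssue records. The
319 | # dataset path "my-dataset" is a placeholder.
320 | #
321 | #     from pathlib import Path
322 | #     from physionet.validate import ValidationConfig
323 | #     from physionet.validate.checks import check_privacy
324 | #
325 | #     result = check_privacy(Path("my-dataset"), ValidationConfig(),
326 | #                            progress_callback=print)
327 | #     for issue in result.issues:
328 | #         print(issue.severity, issue.file, issue.message)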
".hidden").write_text("test") 34 | 35 | config = ValidationConfig() 36 | result = check_filesystem(tmp_path, config) 37 | 38 | assert any(issue.file and ".hidden" in issue.file for issue in result.issues) 39 | 40 | def test_detects_temp_files(self, tmp_path): 41 | """Test that temporary files are detected.""" 42 | (tmp_path / "file.txt~").write_text("test") 43 | (tmp_path / "temp.tmp").write_text("test") 44 | 45 | config = ValidationConfig() 46 | result = check_filesystem(tmp_path, config) 47 | 48 | assert len(result.issues) >= 2 49 | 50 | def test_detects_empty_files(self, tmp_path): 51 | """Test that empty files are detected.""" 52 | (tmp_path / "empty.txt").write_text("") 53 | 54 | config = ValidationConfig() 55 | result = check_filesystem(tmp_path, config) 56 | 57 | assert any("Empty file" in issue.message for issue in result.issues) 58 | 59 | def test_detects_invalid_filename_characters(self, tmp_path): 60 | """Test that invalid filename characters are detected.""" 61 | # Note: This test might not work on all filesystems 62 | try: 63 | (tmp_path / "file.txt").write_text("test") 64 | config = ValidationConfig() 65 | result = check_filesystem(tmp_path, config) 66 | assert any("invalid characters" in issue.message.lower() for issue in result.issues) 67 | # Should show which character was found 68 | assert any("<" in issue.message for issue in result.issues) 69 | except OSError: 70 | # Skip test if filesystem doesn't allow these characters 71 | pytest.skip("Filesystem doesn't support invalid characters in filenames") 72 | 73 | def test_detects_path_separators_in_filenames(self, tmp_path): 74 | """Test that path separators and other awkward characters are flagged.""" 75 | # These characters should be caught even though they can't actually be in filenames on most systems 76 | # We test the validation logic by checking the character set 77 | from physionet.validate.checks.filesystem import check_filesystem 78 | 79 | # Create a file with a valid name for the actual test 80 | (tmp_path / "normalfile.txt").write_text("test") 81 | 82 | config = ValidationConfig() 83 | result = check_filesystem(tmp_path, config) 84 | 85 | # The check should flag files with /, \, quotes, etc if they could exist 86 | # Since we can't create such files, we verify the character set in the code includes them 87 | # This is tested indirectly through the previous test 88 | 89 | def test_detects_spaces_in_filenames(self, tmp_path): 90 | """Test that filenames with spaces are flagged.""" 91 | (tmp_path / "my data file.csv").write_text("col1,col2\n1,2\n") 92 | (tmp_path / "analysis results.txt").write_text("test") 93 | 94 | config = ValidationConfig() 95 | result = check_filesystem(tmp_path, config) 96 | 97 | # Should warn about both files with spaces 98 | space_warnings = [ 99 | issue for issue in result.issues 100 | if "spaces" in issue.message.lower() 101 | ] 102 | assert len(space_warnings) == 2 103 | assert any("my data file.csv" in issue.file for issue in space_warnings) 104 | assert any("analysis results.txt" in issue.file for issue in space_warnings) 105 | 106 | def test_detects_long_filenames(self, tmp_path): 107 | """Test that excessively long filenames are flagged.""" 108 | # Create a file with a very long name (120 characters total) 109 | long_name = "a" * 116 + ".csv" # 116 + 4 = 120 characters 110 | (tmp_path / long_name).write_text("col1,col2\n1,2\n") 111 | 112 | config = ValidationConfig() 113 | result = check_filesystem(tmp_path, config) 114 | 115 | # Should warn about long filename 116 | long_warnings = [ 
117 |             issue for issue in result.issues
118 |             if "very long" in issue.message.lower()
119 |         ]
120 |         assert len(long_warnings) == 1
121 |         assert "120 characters" in long_warnings[0].message
122 | 
123 |     def test_detects_extremely_long_filenames(self, tmp_path):
124 |         """Test that filenames exceeding maximum length are errors."""
125 |         # Create a file with a name exceeding 255 characters
126 |         extreme_name = "b" * 256 + ".csv"  # 256 + 4 = 260 characters
127 |         try:
128 |             (tmp_path / extreme_name).write_text("col1,col2\n1,2\n")
129 | 
130 |             config = ValidationConfig()
131 |             result = check_filesystem(tmp_path, config)
132 | 
133 |             # Should error about exceeding maximum length
134 |             length_errors = [
135 |                 issue for issue in result.issues
136 |                 if "exceeds maximum length" in issue.message.lower()
137 |             ]
138 |             assert len(length_errors) == 1
139 |             assert "260 characters" in length_errors[0].message
140 |         except OSError:
141 |             # Skip test if filesystem doesn't support such long names
142 |             pytest.skip("Filesystem doesn't support filenames over 255 characters")
143 | 
144 |     def test_detects_proprietary_formats(self, tmp_path):
145 |         """Test that proprietary file formats are flagged."""
146 |         # Create files with proprietary formats
147 |         (tmp_path / "data.xlsx").write_text("test")
148 |         (tmp_path / "analysis.mat").write_text("test")
149 |         (tmp_path / "results.sas7bdat").write_text("test")
150 | 
151 |         config = ValidationConfig()
152 |         result = check_filesystem(tmp_path, config)
153 | 
154 |         # Should warn about proprietary data formats (not .docx which is allowed)
155 |         proprietary_warnings = [
156 |             issue for issue in result.issues
157 |             if "proprietary file format" in issue.message.lower()
158 |         ]
159 |         assert len(proprietary_warnings) == 3
160 | 
161 |         # Check that suggestions include alternatives
162 |         suggestions = [issue.suggestion for issue in proprietary_warnings]
163 |         assert any(".csv" in s or ".parquet" in s for s in suggestions)
164 |         assert any(".zarr" in s for s in suggestions)
165 | 
166 |     def test_allows_open_formats(self, tmp_path):
167 |         """Test that open file formats are not flagged."""
168 |         # Create files with open formats (including .docx which is now allowed)
169 |         (tmp_path / "README.md").write_text("# Test")
170 |         (tmp_path / "data.csv").write_text("col1,col2\n1,2\n")
171 |         (tmp_path / "signal.hdf5").write_text("test")
172 |         (tmp_path / "record.json").write_text("{}")
173 |         (tmp_path / "notes.txt").write_text("notes")
174 |         (tmp_path / "protocol.docx").write_text("test")  # .docx is now allowed
175 | 
176 |         config = ValidationConfig()
177 |         result = check_filesystem(tmp_path, config)
178 | 
179 |         # Should not warn about proprietary formats
180 |         proprietary_warnings = [
181 |             issue for issue in result.issues
182 |             if "proprietary file format" in issue.message.lower()
183 |         ]
184 |         assert len(proprietary_warnings) == 0
185 | 
186 | 
187 | class TestDocumentationChecks:
188 |     """Tests for documentation validation checks."""
189 | 
190 |     def test_readme_required_by_default(self, tmp_path):
191 |         """Test that README.md is required by default."""
192 |         config = ValidationConfig()
193 |         result = check_documentation(tmp_path, config)
194 | 
195 |         # Should have error for missing README.md
196 |         assert result.error_count == 1
197 |         assert any("README.md" in issue.message for issue in result.issues)
198 | 
199 |         # Should have helpful suggestion about minimum content
200 |         readme_issue = [issue for issue in result.issues if "README.md" in issue.message][0]
201 |         assert "title and a brief description" in readme_issue.suggestion
202 
| 203 | def test_custom_required_files(self, tmp_path): 204 | """Test that custom required files are validated.""" 205 | config = ValidationConfig(required_files=["README.md", "LICENSE"]) 206 | result = check_documentation(tmp_path, config) 207 | 208 | # Should have errors for both missing files 209 | assert result.error_count == 2 210 | assert any("README.md" in issue.message for issue in result.issues) 211 | assert any("LICENSE" in issue.message for issue in result.issues) 212 | 213 | def test_required_file_exists(self, tmp_path): 214 | """Test that existing required file passes validation.""" 215 | readme = tmp_path / "README.md" 216 | readme.write_text("# Title\n\nSome content.") 217 | 218 | config = ValidationConfig(required_files=["README.md"]) 219 | result = check_documentation(tmp_path, config) 220 | 221 | # Should have no errors since README exists 222 | assert result.error_count == 0 223 | 224 | 225 | class TestIntegrityChecks: 226 | """Tests for data integrity validation checks.""" 227 | 228 | def test_validates_valid_csv(self, tmp_path): 229 | """Test that valid CSV passes validation.""" 230 | csv_file = tmp_path / "data.csv" 231 | csv_file.write_text("col1,col2,col3\n1,2,3\n4,5,6\n") 232 | 233 | config = ValidationConfig() 234 | result = check_integrity(tmp_path, config) 235 | 236 | assert result.error_count == 0 237 | 238 | def test_detects_empty_csv(self, tmp_path): 239 | """Test that empty CSV is detected.""" 240 | csv_file = tmp_path / "data.csv" 241 | csv_file.write_text("") 242 | 243 | config = ValidationConfig() 244 | result = check_integrity(tmp_path, config) 245 | 246 | assert any("empty" in issue.message.lower() for issue in result.issues) 247 | 248 | def test_detects_duplicate_column_names(self, tmp_path): 249 | """Test that duplicate column names are detected.""" 250 | csv_file = tmp_path / "data.csv" 251 | csv_file.write_text("col1,col2,col1\n1,2,3\n") 252 | 253 | config = ValidationConfig() 254 | result = check_integrity(tmp_path, config) 255 | 256 | assert any("Duplicate" in issue.message for issue in result.issues) 257 | 258 | def test_detects_inconsistent_row_length(self, tmp_path): 259 | """Test that inconsistent row lengths are detected.""" 260 | csv_file = tmp_path / "data.csv" 261 | csv_file.write_text("col1,col2,col3\n1,2,3\n4,5\n6,7,8,9\n") 262 | 263 | config = ValidationConfig() 264 | result = check_integrity(tmp_path, config) 265 | 266 | # Should detect both short and long rows 267 | assert result.error_count >= 2 268 | 269 | def test_detects_encoding_issues(self, tmp_path): 270 | """Test that encoding issues are detected.""" 271 | csv_file = tmp_path / "data.csv" 272 | # Write invalid UTF-8 273 | csv_file.write_bytes(b"col1,col2\n1,\xff\xfe\n") 274 | 275 | config = ValidationConfig() 276 | result = check_integrity(tmp_path, config) 277 | 278 | assert any("encoding" in issue.message.lower() for issue in result.issues) 279 | 280 | 281 | class TestQualityChecks: 282 | """Tests for data quality validation checks.""" 283 | 284 | def test_detects_completely_empty_columns(self, tmp_path): 285 | """Test that completely empty columns (100% missing) are detected.""" 286 | csv_file = tmp_path / "data.csv" 287 | # Create CSV with one column that's 100% empty 288 | rows = ["col1,col2,col3\n"] 289 | for i in range(10): 290 | rows.append(f"{i},data,\n") 291 | csv_file.write_text("".join(rows)) 292 | 293 | config = ValidationConfig() 294 | result = check_quality(tmp_path, config) 295 | 296 | # Should detect the empty column 297 | assert any("empty" in 
issue.message.lower() and "col3" in str(issue.column) for issue in result.issues)
298 | 
299 |     def test_partial_missing_values_not_flagged(self, tmp_path):
300 |         """Test that partially missing columns (e.g., 75%) are not flagged."""
301 |         csv_file = tmp_path / "data.csv"
302 |         # Create CSV with 75% missing values in a column
303 |         rows = ["col1,col2\n"]
304 |         for i in range(100):
305 |             if i < 75:
306 |                 rows.append("1,\n")
307 |             else:
308 |                 rows.append("1,2\n")
309 |         csv_file.write_text("".join(rows))
310 | 
311 |         config = ValidationConfig()
312 |         result = check_quality(tmp_path, config)
313 | 
314 |         # Should NOT flag col2 since it has some data (25%)
315 |         assert not any("col2" in str(issue.column) for issue in result.issues)
316 | 
317 |     def test_detects_out_of_range_values(self, tmp_path):
318 |         """Test that out-of-range values are detected."""
319 |         csv_file = tmp_path / "data.csv"
320 |         csv_file.write_text("heart_rate\n80\n350\n75\n")
321 | 
322 |         config = ValidationConfig(value_ranges={"heart_rate": (20, 300)})
323 |         result = check_quality(tmp_path, config)
324 | 
325 |         assert any("outside expected range" in issue.message for issue in result.issues)
326 | 
327 | 
328 | class TestPrivacyChecks:
329 |     """Tests for privacy validation checks."""
330 | 
331 |     def test_date_format_not_flagged(self, tmp_path):
332 |         """Test that date formats (YYYY-MM-DD) are not automatically flagged as PHI.
333 | 
334 |         Dates are commonly used in medical datasets as de-identified timestamps.
335 |         They should not be flagged without additional context.
336 |         """
337 |         csv_file = tmp_path / "data.csv"
338 |         csv_file.write_text("patient_id,admission_date\n1,2023-05-15\n2,2023-06-20\n")
339 | 
340 |         config = ValidationConfig()
341 |         result = check_privacy(tmp_path, config)
342 | 
343 |         # Dates alone should not be flagged (PHI findings surface as warnings)
344 |         assert result.warning_count == 0 and result.error_count == 0
345 | 
346 |     def test_detects_email_addresses(self, tmp_path):
347 |         """Test that email addresses are detected as PHI."""
348 |         csv_file = tmp_path / "data.csv"
349 |         csv_file.write_text("patient_id,contact\n1,patient@example.com\n2,test@test.com\n")
350 | 
351 |         config = ValidationConfig()
352 |         result = check_privacy(tmp_path, config)
353 | 
354 |         # Should have one warning for the 'contact' column with pattern type
355 |         assert result.warning_count == 1
356 |         assert any(
357 |             issue.severity == Severity.WARNING
358 |             and "contact" in str(issue.column)
359 |             and "email address" in issue.message
360 |             for issue in result.issues
361 |         )
362 | 
363 |     def test_detects_age_violations(self, tmp_path):
364 |         """Test that ages over limit are detected."""
365 |         csv_file = tmp_path / "data.csv"
366 |         csv_file.write_text("patient_id,age\n1,92\n2,95\n3,85\n")
367 | 
368 |         config = ValidationConfig(allowed_age_max=89)
369 |         result = check_privacy(tmp_path, config)
370 | 
371 |         # Should have one warning for the age column (consolidated)
372 |         age_violations = [
373 |             issue for issue in result.issues
374 |             if "age" in issue.message.lower() and issue.severity == Severity.WARNING
375 |         ]
376 |         assert len(age_violations) == 1
377 |         assert "age" in age_violations[0].column.lower()
378 | 
379 |     def test_text_files_checked_for_phi(self, tmp_path):
380 |         """Test that text files are checked for PHI patterns."""
381 |         text_file = tmp_path / "notes.txt"
382 |         text_file.write_text("Contact: test@example.com\nPhone: 555-123-4567")
383 | 
384 |         config = ValidationConfig()
385 |         result = check_privacy(tmp_path, config)
386 | 
387 |         # Should detect private information patterns in text files as a single consolidated warning with pattern types
388 |         assert result.warning_count >= 1
389 |         assert any(
390 |             "private information detected" in issue.message
391 |             and ("email address" in issue.message or "phone number" in issue.message)
392 |             for issue in result.issues
393 |         )
394 | 
395 |     def test_allows_year_only_dates(self, tmp_path):
396 |         """Test that year-only dates are allowed."""
397 |         csv_file = tmp_path / "data.csv"
398 |         csv_file.write_text("patient_id,year\n1,2023\n2,2024\n")
399 | 
400 |         config = ValidationConfig()
401 |         result = check_privacy(tmp_path, config)
402 | 
403 |         # Should not flag year-only values (PHI findings surface as warnings)
404 |         phi_issues = [
405 |             issue for issue in result.issues
406 |             if issue.severity == Severity.WARNING
407 |         ]
408 |         assert len(phi_issues) == 0
409 | 
410 |     def test_detects_sensitive_config_files(self, tmp_path):
411 |         """Test that sensitive configuration files are detected."""
412 |         # Create some sensitive files
413 |         (tmp_path / ".env").write_text("API_KEY=secret123")
414 |         (tmp_path / "credentials.json").write_text('{"key": "value"}')
415 |         (tmp_path / "id_rsa").write_text("-----BEGIN RSA PRIVATE KEY-----")
416 | 
417 |         config = ValidationConfig()
418 |         result = check_privacy(tmp_path, config)
419 | 
420 |         # Should detect all three sensitive files as errors
421 |         sensitive_file_errors = [
422 |             issue for issue in result.issues
423 |             if issue.severity == Severity.ERROR and "Sensitive file detected" in issue.message
424 |         ]
425 |         assert len(sensitive_file_errors) == 3
426 | 
427 |     def test_detects_files_with_sensitive_names(self, tmp_path):
428 |         """Test that files with sensitive keywords in names are flagged."""
429 |         (tmp_path / "my_api_key.txt").write_text("some data")
430 |         (tmp_path / "database_password.csv").write_text("col1\nval1")
431 | 
432 |         config = ValidationConfig()
433 |         result = check_privacy(tmp_path, config)
434 | 
435 |         # Should warn about files with sensitive keywords in names
436 |         keyword_warnings = [
437 |             issue for issue in result.issues
438 |             if issue.severity == Severity.WARNING and "name suggests sensitive content" in issue.message
439 |         ]
440 |         assert len(keyword_warnings) >= 2
441 | 
442 |     def test_detects_key_file_extensions(self, tmp_path):
443 |         """Test that private key file extensions are detected."""
444 |         (tmp_path / "server.pem").write_text("certificate")
445 |         (tmp_path / "private.key").write_text("key data")
446 | 
447 |         config = ValidationConfig()
448 |         result = check_privacy(tmp_path, config)
449 | 
450 |         # Should detect both key files
451 |         key_errors = [
452 |             issue for issue in result.issues
453 |             if issue.severity == Severity.ERROR
454 |         ]
455 |         assert len(key_errors) >= 2
456 | 
457 |     def test_sampling_large_files(self, tmp_path):
458 |         """Test that large files are sampled for performance."""
459 |         csv_file = tmp_path / "large.csv"
460 | 
461 |         # Create a file with more rows than the sampling limit
462 |         rows = ["patient_id,email\n"]
463 |         for i in range(15000):  # More than default max_rows_to_scan (10000)
464 |             rows.append(f"{i},test{i}@example.com\n")
465 |         csv_file.write_text("".join(rows))
466 | 
467 |         config = ValidationConfig(max_rows_to_scan=1000, sample_large_files=True)
468 |         result = check_privacy(tmp_path, config)
469 | 
470 |         # Should still detect the email pattern even with sampling
471 |         assert result.warning_count >= 1
472 |         assert any("email" in str(issue.column) for issue in result.issues)
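473 | 
474 |     # The two tests below are illustrative sketches, not part of the original
475 |     # suite: they exercise behavior visible in check_privacy's source (the
476 |     # progress callback and config.ignore_patterns) but assume ValidationConfig
477 |     # accepts the keyword arguments named in their docstrings.
478 |     def test_progress_callback_reports_status(self, tmp_path):
479 |         """Sketch: the optional progress callback receives status strings.
480 | 
481 |         Assumes check_privacy accepts a progress_callback keyword, as its
482 |         docstring describes.
483 |         """
484 |         (tmp_path / "data.csv").write_text("patient_id\n1\n")
485 | 
486 |         messages = []
487 |         config = ValidationConfig()
488 |         check_privacy(tmp_path, config, progress_callback=messages.append)
489 | 
490 |         # The sensitive-configuration-file scan reports progress first
491 |         assert any("sensitive configuration files" in m for m in messages)
492 | 
493 |     def test_ignore_patterns_skip_matching_csv_files(self, tmp_path):
494 |         """Sketch: CSV files matching ignore_patterns are not scanned.
495 | 
496 |         Assumes ValidationConfig accepts an ignore_patterns kwarg, mirroring
497 |         the config.ignore_patterns substring checks in check_privacy.
498 |         """
499 |         (tmp_path / "skipme.csv").write_text("contact\npatient@example.com\n")
500 | 
501 |         config = ValidationConfig(ignore_patterns=["skipme"])
502 |         result = check_privacy(tmp_path, config)
503 | 
504 |         # The email would be flagged if the file were scanned
505 |         assert result.warning_count == 0
--------------------------------------------------------------------------------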